1a9643ea8Slogwang /*-
2*22ce4affSfengbojiang * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3*22ce4affSfengbojiang *
4a9643ea8Slogwang * Copyright (c) 1991, 1993
5a9643ea8Slogwang * The Regents of the University of California. All rights reserved.
6a9643ea8Slogwang *
7a9643ea8Slogwang * This code is derived from software contributed to Berkeley by
8a9643ea8Slogwang * The Mach Operating System project at Carnegie-Mellon University.
9a9643ea8Slogwang *
10a9643ea8Slogwang * Redistribution and use in source and binary forms, with or without
11a9643ea8Slogwang * modification, are permitted provided that the following conditions
12a9643ea8Slogwang * are met:
13a9643ea8Slogwang * 1. Redistributions of source code must retain the above copyright
14a9643ea8Slogwang * notice, this list of conditions and the following disclaimer.
15a9643ea8Slogwang * 2. Redistributions in binary form must reproduce the above copyright
16a9643ea8Slogwang * notice, this list of conditions and the following disclaimer in the
17a9643ea8Slogwang * documentation and/or other materials provided with the distribution.
18*22ce4affSfengbojiang * 3. Neither the name of the University nor the names of its contributors
19a9643ea8Slogwang * may be used to endorse or promote products derived from this software
20a9643ea8Slogwang * without specific prior written permission.
21a9643ea8Slogwang *
22a9643ea8Slogwang * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23a9643ea8Slogwang * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24a9643ea8Slogwang * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25a9643ea8Slogwang * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26a9643ea8Slogwang * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27a9643ea8Slogwang * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28a9643ea8Slogwang * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29a9643ea8Slogwang * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30a9643ea8Slogwang * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31a9643ea8Slogwang * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32a9643ea8Slogwang * SUCH DAMAGE.
33a9643ea8Slogwang *
34a9643ea8Slogwang * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
35a9643ea8Slogwang *
36a9643ea8Slogwang *
37a9643ea8Slogwang * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38a9643ea8Slogwang * All rights reserved.
39a9643ea8Slogwang *
40a9643ea8Slogwang * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41a9643ea8Slogwang *
42a9643ea8Slogwang * Permission to use, copy, modify and distribute this software and
43a9643ea8Slogwang * its documentation is hereby granted, provided that both the copyright
44a9643ea8Slogwang * notice and this permission notice appear in all copies of the
45a9643ea8Slogwang * software, derivative works or modified versions, and any portions
46a9643ea8Slogwang * thereof, and that both notices appear in supporting documentation.
47a9643ea8Slogwang *
48a9643ea8Slogwang * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49a9643ea8Slogwang * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50a9643ea8Slogwang * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51a9643ea8Slogwang *
52a9643ea8Slogwang * Carnegie Mellon requests users of this software to return to
53a9643ea8Slogwang *
54a9643ea8Slogwang * Software Distribution Coordinator or [email protected]
55a9643ea8Slogwang * School of Computer Science
56a9643ea8Slogwang * Carnegie Mellon University
57a9643ea8Slogwang * Pittsburgh PA 15213-3890
58a9643ea8Slogwang *
59a9643ea8Slogwang * any improvements or extensions that they make and grant Carnegie the
60a9643ea8Slogwang * rights to redistribute these changes.
61a9643ea8Slogwang */
62a9643ea8Slogwang
63a9643ea8Slogwang /*
64a9643ea8Slogwang * Virtual memory mapping module.
65a9643ea8Slogwang */
66a9643ea8Slogwang
67a9643ea8Slogwang #include <sys/cdefs.h>
68a9643ea8Slogwang __FBSDID("$FreeBSD$");
69a9643ea8Slogwang
70a9643ea8Slogwang #include <sys/param.h>
71a9643ea8Slogwang #include <sys/systm.h>
72*22ce4affSfengbojiang #include <sys/elf.h>
73a9643ea8Slogwang #include <sys/kernel.h>
74a9643ea8Slogwang #include <sys/ktr.h>
75a9643ea8Slogwang #include <sys/lock.h>
76a9643ea8Slogwang #include <sys/mutex.h>
77a9643ea8Slogwang #include <sys/proc.h>
78a9643ea8Slogwang #include <sys/vmmeter.h>
79a9643ea8Slogwang #include <sys/mman.h>
80a9643ea8Slogwang #include <sys/vnode.h>
81a9643ea8Slogwang #include <sys/racct.h>
82a9643ea8Slogwang #include <sys/resourcevar.h>
83a9643ea8Slogwang #include <sys/rwlock.h>
84a9643ea8Slogwang #include <sys/file.h>
85a9643ea8Slogwang #include <sys/sysctl.h>
86a9643ea8Slogwang #include <sys/sysent.h>
87a9643ea8Slogwang #include <sys/shm.h>
88a9643ea8Slogwang
89a9643ea8Slogwang #include <vm/vm.h>
90a9643ea8Slogwang #include <vm/vm_param.h>
91a9643ea8Slogwang #include <vm/pmap.h>
92a9643ea8Slogwang #include <vm/vm_map.h>
93a9643ea8Slogwang #include <vm/vm_page.h>
94*22ce4affSfengbojiang #include <vm/vm_pageout.h>
95a9643ea8Slogwang #include <vm/vm_object.h>
96a9643ea8Slogwang #include <vm/vm_pager.h>
97a9643ea8Slogwang #include <vm/vm_kern.h>
98a9643ea8Slogwang #include <vm/vm_extern.h>
99a9643ea8Slogwang #include <vm/vnode_pager.h>
100a9643ea8Slogwang #include <vm/swap_pager.h>
101a9643ea8Slogwang #include <vm/uma.h>
102a9643ea8Slogwang
103a9643ea8Slogwang /*
104a9643ea8Slogwang * Virtual memory maps provide for the mapping, protection,
105a9643ea8Slogwang * and sharing of virtual memory objects. In addition,
106a9643ea8Slogwang * this module provides for an efficient virtual copy of
107a9643ea8Slogwang * memory from one map to another.
108a9643ea8Slogwang *
109a9643ea8Slogwang * Synchronization is required prior to most operations.
110a9643ea8Slogwang *
111a9643ea8Slogwang * Maps consist of an ordered doubly-linked list of simple
112a9643ea8Slogwang * entries; a self-adjusting binary search tree of these
113a9643ea8Slogwang * entries is used to speed up lookups.
114a9643ea8Slogwang *
115a9643ea8Slogwang * Since portions of maps are specified by start/end addresses,
116a9643ea8Slogwang * which may not align with existing map entries, all
117a9643ea8Slogwang * routines merely "clip" entries to these start/end values.
118a9643ea8Slogwang * [That is, an entry is split into two, bordering at a
119a9643ea8Slogwang * start or end value.] Note that these clippings may not
120a9643ea8Slogwang * always be necessary (as the two resulting entries are then
121a9643ea8Slogwang * not changed); however, the clipping is done for convenience.
122a9643ea8Slogwang *
123a9643ea8Slogwang * As mentioned above, virtual copy operations are performed
124a9643ea8Slogwang * by copying VM object references from one map to
125a9643ea8Slogwang * another, and then marking both regions as copy-on-write.
126a9643ea8Slogwang */
127a9643ea8Slogwang
128a9643ea8Slogwang static struct mtx map_sleep_mtx;
129a9643ea8Slogwang static uma_zone_t mapentzone;
130a9643ea8Slogwang static uma_zone_t kmapentzone;
131a9643ea8Slogwang static uma_zone_t vmspace_zone;
132a9643ea8Slogwang static int vmspace_zinit(void *mem, int size, int flags);
133a9643ea8Slogwang static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
134a9643ea8Slogwang vm_offset_t max);
135a9643ea8Slogwang static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
136a9643ea8Slogwang static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
137a9643ea8Slogwang static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
138*22ce4affSfengbojiang static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
139*22ce4affSfengbojiang vm_map_entry_t gap_entry);
140a9643ea8Slogwang static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
141a9643ea8Slogwang vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
142a9643ea8Slogwang #ifdef INVARIANTS
143a9643ea8Slogwang static void vmspace_zdtor(void *mem, int size, void *arg);
144a9643ea8Slogwang #endif
145a9643ea8Slogwang static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
146a9643ea8Slogwang vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
147a9643ea8Slogwang int cow);
148a9643ea8Slogwang static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
149a9643ea8Slogwang vm_offset_t failed_addr);
150a9643ea8Slogwang
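/*
 * Descriptive note (added): ENTRY_CHARGED(e) is true when the entry's
 * charge is already billed to a credential, either directly on the entry
 * or on its backing object when the entry does not need a private
 * copy-on-write copy of that object.
 */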
151a9643ea8Slogwang #define ENTRY_CHARGED(e) ((e)->cred != NULL || \
152a9643ea8Slogwang ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
153a9643ea8Slogwang !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
154a9643ea8Slogwang
155a9643ea8Slogwang /*
156a9643ea8Slogwang * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
157a9643ea8Slogwang * stable.
158a9643ea8Slogwang */
159a9643ea8Slogwang #define PROC_VMSPACE_LOCK(p) do { } while (0)
160a9643ea8Slogwang #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
161a9643ea8Slogwang
162a9643ea8Slogwang /*
163a9643ea8Slogwang * VM_MAP_RANGE_CHECK: [ internal use only ]
164a9643ea8Slogwang *
165a9643ea8Slogwang  * Clamps the starting and ending region
166a9643ea8Slogwang  * addresses so that they fall within the valid range of the map.
167a9643ea8Slogwang */
168a9643ea8Slogwang #define VM_MAP_RANGE_CHECK(map, start, end) \
169a9643ea8Slogwang { \
170a9643ea8Slogwang if (start < vm_map_min(map)) \
171a9643ea8Slogwang start = vm_map_min(map); \
172a9643ea8Slogwang if (end > vm_map_max(map)) \
173a9643ea8Slogwang end = vm_map_max(map); \
174a9643ea8Slogwang if (start > end) \
175a9643ea8Slogwang start = end; \
176a9643ea8Slogwang }
177a9643ea8Slogwang
178*22ce4affSfengbojiang #ifndef UMA_MD_SMALL_ALLOC
179*22ce4affSfengbojiang
180*22ce4affSfengbojiang /*
181*22ce4affSfengbojiang * Allocate a new slab for kernel map entries. The kernel map may be locked or
182*22ce4affSfengbojiang * unlocked, depending on whether the request is coming from the kernel map or a
183*22ce4affSfengbojiang * submap. This function allocates a virtual address range directly from the
184*22ce4affSfengbojiang * kernel map instead of the kmem_* layer to avoid recursion on the kernel map
185*22ce4affSfengbojiang * lock and also to avoid triggering allocator recursion in the vmem boundary
186*22ce4affSfengbojiang * tag allocator.
187*22ce4affSfengbojiang */
188*22ce4affSfengbojiang static void *
189*22ce4affSfengbojiang kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
190*22ce4affSfengbojiang int wait)
191*22ce4affSfengbojiang {
192*22ce4affSfengbojiang vm_offset_t addr;
193*22ce4affSfengbojiang int error, locked;
194*22ce4affSfengbojiang
195*22ce4affSfengbojiang *pflag = UMA_SLAB_PRIV;
196*22ce4affSfengbojiang
197*22ce4affSfengbojiang if (!(locked = vm_map_locked(kernel_map)))
198*22ce4affSfengbojiang vm_map_lock(kernel_map);
199*22ce4affSfengbojiang addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
200*22ce4affSfengbojiang if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
201*22ce4affSfengbojiang panic("%s: kernel map is exhausted", __func__);
202*22ce4affSfengbojiang error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
203*22ce4affSfengbojiang VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
204*22ce4affSfengbojiang if (error != KERN_SUCCESS)
205*22ce4affSfengbojiang panic("%s: vm_map_insert() failed: %d", __func__, error);
206*22ce4affSfengbojiang if (!locked)
207*22ce4affSfengbojiang vm_map_unlock(kernel_map);
208*22ce4affSfengbojiang error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
209*22ce4affSfengbojiang M_USE_RESERVE | (wait & M_ZERO));
210*22ce4affSfengbojiang if (error == KERN_SUCCESS) {
211*22ce4affSfengbojiang return ((void *)addr);
212*22ce4affSfengbojiang } else {
213*22ce4affSfengbojiang if (!locked)
214*22ce4affSfengbojiang vm_map_lock(kernel_map);
215*22ce4affSfengbojiang vm_map_delete(kernel_map, addr, bytes);
216*22ce4affSfengbojiang if (!locked)
217*22ce4affSfengbojiang vm_map_unlock(kernel_map);
218*22ce4affSfengbojiang return (NULL);
219*22ce4affSfengbojiang }
220*22ce4affSfengbojiang }
221*22ce4affSfengbojiang
222*22ce4affSfengbojiang static void
223*22ce4affSfengbojiang kmapent_free(void *item, vm_size_t size, uint8_t pflag)
224*22ce4affSfengbojiang {
225*22ce4affSfengbojiang vm_offset_t addr;
226*22ce4affSfengbojiang int error;
227*22ce4affSfengbojiang
228*22ce4affSfengbojiang if ((pflag & UMA_SLAB_PRIV) == 0)
229*22ce4affSfengbojiang /* XXX leaked */
230*22ce4affSfengbojiang return;
231*22ce4affSfengbojiang
232*22ce4affSfengbojiang addr = (vm_offset_t)item;
233*22ce4affSfengbojiang kmem_unback(kernel_object, addr, size);
234*22ce4affSfengbojiang error = vm_map_remove(kernel_map, addr, addr + size);
235*22ce4affSfengbojiang KASSERT(error == KERN_SUCCESS,
236*22ce4affSfengbojiang ("%s: vm_map_remove failed: %d", __func__, error));
237*22ce4affSfengbojiang }
238*22ce4affSfengbojiang
239*22ce4affSfengbojiang /*
240*22ce4affSfengbojiang * The worst-case upper bound on the number of kernel map entries that may be
241*22ce4affSfengbojiang * created before the zone must be replenished in _vm_map_unlock().
242*22ce4affSfengbojiang */
243*22ce4affSfengbojiang #define KMAPENT_RESERVE 1
244*22ce4affSfengbojiang
245*22ce4affSfengbojiang #endif /* !UMA_MD_SMALL_ALLOC */
246*22ce4affSfengbojiang
247a9643ea8Slogwang /*
248a9643ea8Slogwang * vm_map_startup:
249a9643ea8Slogwang *
250*22ce4affSfengbojiang * Initialize the vm_map module. Must be called before any other vm_map
251*22ce4affSfengbojiang * routines.
252a9643ea8Slogwang *
253*22ce4affSfengbojiang * User map and entry structures are allocated from the general purpose
254*22ce4affSfengbojiang * memory pool. Kernel maps are statically defined. Kernel map entries
255*22ce4affSfengbojiang * require special handling to avoid recursion; see the comments above
256*22ce4affSfengbojiang * kmapent_alloc() and in vm_map_entry_create().
257a9643ea8Slogwang */
258a9643ea8Slogwang void
259a9643ea8Slogwang vm_map_startup(void)
260a9643ea8Slogwang {
261a9643ea8Slogwang mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
262*22ce4affSfengbojiang
263*22ce4affSfengbojiang /*
264*22ce4affSfengbojiang * Disable the use of per-CPU buckets: map entry allocation is
265*22ce4affSfengbojiang * serialized by the kernel map lock.
266*22ce4affSfengbojiang */
267a9643ea8Slogwang kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
268a9643ea8Slogwang NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
269*22ce4affSfengbojiang UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
270*22ce4affSfengbojiang #ifndef UMA_MD_SMALL_ALLOC
271*22ce4affSfengbojiang /* Reserve an extra map entry for use when replenishing the reserve. */
272*22ce4affSfengbojiang uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
273*22ce4affSfengbojiang uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
274*22ce4affSfengbojiang uma_zone_set_allocf(kmapentzone, kmapent_alloc);
275*22ce4affSfengbojiang uma_zone_set_freef(kmapentzone, kmapent_free);
276*22ce4affSfengbojiang #endif
277*22ce4affSfengbojiang
278a9643ea8Slogwang mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
279a9643ea8Slogwang NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
280a9643ea8Slogwang vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
281a9643ea8Slogwang #ifdef INVARIANTS
282a9643ea8Slogwang vmspace_zdtor,
283a9643ea8Slogwang #else
284a9643ea8Slogwang NULL,
285a9643ea8Slogwang #endif
286a9643ea8Slogwang vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
287a9643ea8Slogwang }
288a9643ea8Slogwang
289a9643ea8Slogwang static int
290a9643ea8Slogwang vmspace_zinit(void *mem, int size, int flags)
291a9643ea8Slogwang {
292a9643ea8Slogwang struct vmspace *vm;
293a9643ea8Slogwang vm_map_t map;
294a9643ea8Slogwang
295*22ce4affSfengbojiang vm = (struct vmspace *)mem;
296*22ce4affSfengbojiang map = &vm->vm_map;
297*22ce4affSfengbojiang
298a9643ea8Slogwang memset(map, 0, sizeof(*map));
299*22ce4affSfengbojiang mtx_init(&map->system_mtx, "vm map (system)", NULL,
300*22ce4affSfengbojiang MTX_DEF | MTX_DUPOK);
301a9643ea8Slogwang sx_init(&map->lock, "vm map (user)");
302*22ce4affSfengbojiang PMAP_LOCK_INIT(vmspace_pmap(vm));
303a9643ea8Slogwang return (0);
304a9643ea8Slogwang }
305a9643ea8Slogwang
306a9643ea8Slogwang #ifdef INVARIANTS
307a9643ea8Slogwang static void
308a9643ea8Slogwang vmspace_zdtor(void *mem, int size, void *arg)
309a9643ea8Slogwang {
310a9643ea8Slogwang struct vmspace *vm;
311a9643ea8Slogwang
312a9643ea8Slogwang vm = (struct vmspace *)mem;
313*22ce4affSfengbojiang KASSERT(vm->vm_map.nentries == 0,
314*22ce4affSfengbojiang ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
315*22ce4affSfengbojiang KASSERT(vm->vm_map.size == 0,
316*22ce4affSfengbojiang ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
317a9643ea8Slogwang }
318a9643ea8Slogwang #endif /* INVARIANTS */
319a9643ea8Slogwang
320a9643ea8Slogwang /*
321a9643ea8Slogwang * Allocate a vmspace structure, including a vm_map and pmap,
322a9643ea8Slogwang * and initialize those structures. The refcnt is set to 1.
323a9643ea8Slogwang */
324a9643ea8Slogwang struct vmspace *
325a9643ea8Slogwang vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
326a9643ea8Slogwang {
327a9643ea8Slogwang struct vmspace *vm;
328a9643ea8Slogwang
329a9643ea8Slogwang vm = uma_zalloc(vmspace_zone, M_WAITOK);
330a9643ea8Slogwang KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
331a9643ea8Slogwang if (!pinit(vmspace_pmap(vm))) {
332a9643ea8Slogwang uma_zfree(vmspace_zone, vm);
333a9643ea8Slogwang return (NULL);
334a9643ea8Slogwang }
335a9643ea8Slogwang CTR1(KTR_VM, "vmspace_alloc: %p", vm);
336a9643ea8Slogwang _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
337*22ce4affSfengbojiang refcount_init(&vm->vm_refcnt, 1);
338a9643ea8Slogwang vm->vm_shm = NULL;
339a9643ea8Slogwang vm->vm_swrss = 0;
340a9643ea8Slogwang vm->vm_tsize = 0;
341a9643ea8Slogwang vm->vm_dsize = 0;
342a9643ea8Slogwang vm->vm_ssize = 0;
343a9643ea8Slogwang vm->vm_taddr = 0;
344a9643ea8Slogwang vm->vm_daddr = 0;
345a9643ea8Slogwang vm->vm_maxsaddr = 0;
346a9643ea8Slogwang return (vm);
347a9643ea8Slogwang }
348a9643ea8Slogwang
349a9643ea8Slogwang #ifdef RACCT
350a9643ea8Slogwang static void
351a9643ea8Slogwang vmspace_container_reset(struct proc *p)
352a9643ea8Slogwang {
353a9643ea8Slogwang
354a9643ea8Slogwang PROC_LOCK(p);
355a9643ea8Slogwang racct_set(p, RACCT_DATA, 0);
356a9643ea8Slogwang racct_set(p, RACCT_STACK, 0);
357a9643ea8Slogwang racct_set(p, RACCT_RSS, 0);
358a9643ea8Slogwang racct_set(p, RACCT_MEMLOCK, 0);
359a9643ea8Slogwang racct_set(p, RACCT_VMEM, 0);
360a9643ea8Slogwang PROC_UNLOCK(p);
361a9643ea8Slogwang }
362a9643ea8Slogwang #endif
363a9643ea8Slogwang
364a9643ea8Slogwang static inline void
365a9643ea8Slogwang vmspace_dofree(struct vmspace *vm)
366a9643ea8Slogwang {
367a9643ea8Slogwang
368a9643ea8Slogwang CTR1(KTR_VM, "vmspace_free: %p", vm);
369a9643ea8Slogwang
370a9643ea8Slogwang /*
371a9643ea8Slogwang * Make sure any SysV shm is freed, it might not have been in
372a9643ea8Slogwang * exit1().
373a9643ea8Slogwang */
374a9643ea8Slogwang shmexit(vm);
375a9643ea8Slogwang
376a9643ea8Slogwang /*
377a9643ea8Slogwang * Lock the map, to wait out all other references to it.
378a9643ea8Slogwang * Delete all of the mappings and pages they hold, then call
379a9643ea8Slogwang * the pmap module to reclaim anything left.
380a9643ea8Slogwang */
381*22ce4affSfengbojiang (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
382*22ce4affSfengbojiang vm_map_max(&vm->vm_map));
383a9643ea8Slogwang
384a9643ea8Slogwang pmap_release(vmspace_pmap(vm));
385a9643ea8Slogwang vm->vm_map.pmap = NULL;
386a9643ea8Slogwang uma_zfree(vmspace_zone, vm);
387a9643ea8Slogwang }
388a9643ea8Slogwang
389a9643ea8Slogwang void
390a9643ea8Slogwang vmspace_free(struct vmspace *vm)
391a9643ea8Slogwang {
392a9643ea8Slogwang
393a9643ea8Slogwang WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
394a9643ea8Slogwang "vmspace_free() called");
395a9643ea8Slogwang
396*22ce4affSfengbojiang if (refcount_release(&vm->vm_refcnt))
397a9643ea8Slogwang vmspace_dofree(vm);
398a9643ea8Slogwang }
399a9643ea8Slogwang
400a9643ea8Slogwang void
401a9643ea8Slogwang vmspace_exitfree(struct proc *p)
402a9643ea8Slogwang {
403a9643ea8Slogwang struct vmspace *vm;
404a9643ea8Slogwang
405a9643ea8Slogwang PROC_VMSPACE_LOCK(p);
406a9643ea8Slogwang vm = p->p_vmspace;
407a9643ea8Slogwang p->p_vmspace = NULL;
408a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
409a9643ea8Slogwang KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
410a9643ea8Slogwang vmspace_free(vm);
411a9643ea8Slogwang }
412a9643ea8Slogwang
413a9643ea8Slogwang void
414a9643ea8Slogwang vmspace_exit(struct thread *td)
415a9643ea8Slogwang {
416a9643ea8Slogwang struct vmspace *vm;
417a9643ea8Slogwang struct proc *p;
418*22ce4affSfengbojiang bool released;
419a9643ea8Slogwang
420a9643ea8Slogwang p = td->td_proc;
421a9643ea8Slogwang vm = p->p_vmspace;
422*22ce4affSfengbojiang
423*22ce4affSfengbojiang /*
424*22ce4affSfengbojiang * Prepare to release the vmspace reference. The thread that releases
425*22ce4affSfengbojiang * the last reference is responsible for tearing down the vmspace.
426*22ce4affSfengbojiang * However, threads not releasing the final reference must switch to the
427*22ce4affSfengbojiang * kernel's vmspace0 before the decrement so that the subsequent pmap
428*22ce4affSfengbojiang * deactivation does not modify a freed vmspace.
429*22ce4affSfengbojiang */
430*22ce4affSfengbojiang refcount_acquire(&vmspace0.vm_refcnt);
431*22ce4affSfengbojiang if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
432*22ce4affSfengbojiang if (p->p_vmspace != &vmspace0) {
433a9643ea8Slogwang PROC_VMSPACE_LOCK(p);
434a9643ea8Slogwang p->p_vmspace = &vmspace0;
435a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
436a9643ea8Slogwang pmap_activate(td);
437a9643ea8Slogwang }
438*22ce4affSfengbojiang released = refcount_release(&vm->vm_refcnt);
439*22ce4affSfengbojiang }
440*22ce4affSfengbojiang if (released) {
441*22ce4affSfengbojiang /*
442*22ce4affSfengbojiang * pmap_remove_pages() expects the pmap to be active, so switch
443*22ce4affSfengbojiang * back first if necessary.
444*22ce4affSfengbojiang */
445a9643ea8Slogwang if (p->p_vmspace != vm) {
446a9643ea8Slogwang PROC_VMSPACE_LOCK(p);
447a9643ea8Slogwang p->p_vmspace = vm;
448a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
449a9643ea8Slogwang pmap_activate(td);
450a9643ea8Slogwang }
451a9643ea8Slogwang pmap_remove_pages(vmspace_pmap(vm));
452a9643ea8Slogwang PROC_VMSPACE_LOCK(p);
453a9643ea8Slogwang p->p_vmspace = &vmspace0;
454a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
455a9643ea8Slogwang pmap_activate(td);
456a9643ea8Slogwang vmspace_dofree(vm);
457a9643ea8Slogwang }
458a9643ea8Slogwang #ifdef RACCT
459a9643ea8Slogwang if (racct_enable)
460a9643ea8Slogwang vmspace_container_reset(p);
461a9643ea8Slogwang #endif
462a9643ea8Slogwang }
463a9643ea8Slogwang
464a9643ea8Slogwang /* Acquire reference to vmspace owned by another process. */
465a9643ea8Slogwang
466a9643ea8Slogwang struct vmspace *
467a9643ea8Slogwang vmspace_acquire_ref(struct proc *p)
468a9643ea8Slogwang {
469a9643ea8Slogwang struct vmspace *vm;
470a9643ea8Slogwang
471a9643ea8Slogwang PROC_VMSPACE_LOCK(p);
472a9643ea8Slogwang vm = p->p_vmspace;
473*22ce4affSfengbojiang if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
474a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
475a9643ea8Slogwang return (NULL);
476a9643ea8Slogwang }
477a9643ea8Slogwang if (vm != p->p_vmspace) {
478a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
479a9643ea8Slogwang vmspace_free(vm);
480a9643ea8Slogwang return (NULL);
481a9643ea8Slogwang }
482a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
483a9643ea8Slogwang return (vm);
484a9643ea8Slogwang }
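/*
 * Usage sketch (illustrative only; the caller and error value are
 * hypothetical): the reference obtained above is paired with
 * vmspace_free() once the caller is done examining another process's
 * address space:
 *
 *	vm = vmspace_acquire_ref(p);
 *	if (vm == NULL)
 *		return (ESRCH);
 *	... inspect vm and vm->vm_map under the map's own locking ...
 *	vmspace_free(vm);
 */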
485a9643ea8Slogwang
486a9643ea8Slogwang /*
487a9643ea8Slogwang * Switch between vmspaces in an AIO kernel process.
488a9643ea8Slogwang *
489*22ce4affSfengbojiang * The new vmspace is either the vmspace of a user process obtained
490*22ce4affSfengbojiang * from an active AIO request or the initial vmspace of the AIO kernel
491*22ce4affSfengbojiang * process (when it is idling). Because user processes will block to
492*22ce4affSfengbojiang * drain any active AIO requests before proceeding in exit() or
493*22ce4affSfengbojiang * execve(), the reference count for vmspaces from AIO requests can
494*22ce4affSfengbojiang * never be 0. Similarly, AIO kernel processes hold an extra
495*22ce4affSfengbojiang * reference on their initial vmspace for the life of the process. As
496*22ce4affSfengbojiang * a result, the 'newvm' vmspace always has a non-zero reference
497*22ce4affSfengbojiang * count. This permits an additional reference on 'newvm' to be
498*22ce4affSfengbojiang * acquired via a simple atomic increment rather than the loop in
499*22ce4affSfengbojiang * vmspace_acquire_ref() above.
500a9643ea8Slogwang */
501a9643ea8Slogwang void
502a9643ea8Slogwang vmspace_switch_aio(struct vmspace *newvm)
503a9643ea8Slogwang {
504a9643ea8Slogwang struct vmspace *oldvm;
505a9643ea8Slogwang
506a9643ea8Slogwang /* XXX: Need some way to assert that this is an aio daemon. */
507a9643ea8Slogwang
508*22ce4affSfengbojiang KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
509a9643ea8Slogwang ("vmspace_switch_aio: newvm unreferenced"));
510a9643ea8Slogwang
511a9643ea8Slogwang oldvm = curproc->p_vmspace;
512a9643ea8Slogwang if (oldvm == newvm)
513a9643ea8Slogwang return;
514a9643ea8Slogwang
515a9643ea8Slogwang /*
516a9643ea8Slogwang * Point to the new address space and refer to it.
517a9643ea8Slogwang */
518a9643ea8Slogwang curproc->p_vmspace = newvm;
519*22ce4affSfengbojiang refcount_acquire(&newvm->vm_refcnt);
520a9643ea8Slogwang
521a9643ea8Slogwang /* Activate the new mapping. */
522a9643ea8Slogwang pmap_activate(curthread);
523a9643ea8Slogwang
524a9643ea8Slogwang vmspace_free(oldvm);
525a9643ea8Slogwang }
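/*
 * Illustrative sketch (not taken from a real caller; the names below are
 * hypothetical): an AIO kernel process switches to the vmspace of the user
 * process that owns a request and back to its own initial vmspace when it
 * goes idle:
 *
 *	vmspace_switch_aio(job_owner->p_vmspace);
 *	... service the request using the user mappings ...
 *	vmspace_switch_aio(aiop_initial_vmspace);
 */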
526a9643ea8Slogwang
527a9643ea8Slogwang void
528a9643ea8Slogwang _vm_map_lock(vm_map_t map, const char *file, int line)
529a9643ea8Slogwang {
530a9643ea8Slogwang
531a9643ea8Slogwang if (map->system_map)
532a9643ea8Slogwang mtx_lock_flags_(&map->system_mtx, 0, file, line);
533a9643ea8Slogwang else
534a9643ea8Slogwang sx_xlock_(&map->lock, file, line);
535a9643ea8Slogwang map->timestamp++;
536a9643ea8Slogwang }
537a9643ea8Slogwang
538*22ce4affSfengbojiang void
539*22ce4affSfengbojiang vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
540*22ce4affSfengbojiang {
541*22ce4affSfengbojiang vm_object_t object;
542*22ce4affSfengbojiang struct vnode *vp;
543*22ce4affSfengbojiang bool vp_held;
544*22ce4affSfengbojiang
545*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
546*22ce4affSfengbojiang return;
547*22ce4affSfengbojiang KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
548*22ce4affSfengbojiang ("Submap with execs"));
549*22ce4affSfengbojiang object = entry->object.vm_object;
550*22ce4affSfengbojiang KASSERT(object != NULL, ("No object for text, entry %p", entry));
551*22ce4affSfengbojiang if ((object->flags & OBJ_ANON) != 0)
552*22ce4affSfengbojiang object = object->handle;
553*22ce4affSfengbojiang else
554*22ce4affSfengbojiang KASSERT(object->backing_object == NULL,
555*22ce4affSfengbojiang ("non-anon object %p shadows", object));
556*22ce4affSfengbojiang KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
557*22ce4affSfengbojiang entry, entry->object.vm_object));
558*22ce4affSfengbojiang
559*22ce4affSfengbojiang /*
560*22ce4affSfengbojiang * Mostly, we do not lock the backing object. It is
561*22ce4affSfengbojiang * referenced by the entry we are processing, so it cannot go
562*22ce4affSfengbojiang * away.
563*22ce4affSfengbojiang */
564*22ce4affSfengbojiang vp = NULL;
565*22ce4affSfengbojiang vp_held = false;
566*22ce4affSfengbojiang if (object->type == OBJT_DEAD) {
567*22ce4affSfengbojiang /*
568*22ce4affSfengbojiang * For OBJT_DEAD objects, v_writecount was handled in
569*22ce4affSfengbojiang * vnode_pager_dealloc().
570*22ce4affSfengbojiang */
571*22ce4affSfengbojiang } else if (object->type == OBJT_VNODE) {
572*22ce4affSfengbojiang vp = object->handle;
573*22ce4affSfengbojiang } else if (object->type == OBJT_SWAP) {
574*22ce4affSfengbojiang KASSERT((object->flags & OBJ_TMPFS_NODE) != 0,
575*22ce4affSfengbojiang ("vm_map_entry_set_vnode_text: swap and !TMPFS "
576*22ce4affSfengbojiang "entry %p, object %p, add %d", entry, object, add));
577*22ce4affSfengbojiang /*
578*22ce4affSfengbojiang * Tmpfs VREG node, which was reclaimed, has
579*22ce4affSfengbojiang * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS. In
580*22ce4affSfengbojiang * this case there is no v_writecount to adjust.
581*22ce4affSfengbojiang */
582*22ce4affSfengbojiang VM_OBJECT_RLOCK(object);
583*22ce4affSfengbojiang if ((object->flags & OBJ_TMPFS) != 0) {
584*22ce4affSfengbojiang vp = object->un_pager.swp.swp_tmpfs;
585*22ce4affSfengbojiang if (vp != NULL) {
586*22ce4affSfengbojiang vhold(vp);
587*22ce4affSfengbojiang vp_held = true;
588*22ce4affSfengbojiang }
589*22ce4affSfengbojiang }
590*22ce4affSfengbojiang VM_OBJECT_RUNLOCK(object);
591*22ce4affSfengbojiang } else {
592*22ce4affSfengbojiang KASSERT(0,
593*22ce4affSfengbojiang ("vm_map_entry_set_vnode_text: wrong object type, "
594*22ce4affSfengbojiang "entry %p, object %p, add %d", entry, object, add));
595*22ce4affSfengbojiang }
596*22ce4affSfengbojiang if (vp != NULL) {
597*22ce4affSfengbojiang if (add) {
598*22ce4affSfengbojiang VOP_SET_TEXT_CHECKED(vp);
599*22ce4affSfengbojiang } else {
600*22ce4affSfengbojiang vn_lock(vp, LK_SHARED | LK_RETRY);
601*22ce4affSfengbojiang VOP_UNSET_TEXT_CHECKED(vp);
602*22ce4affSfengbojiang VOP_UNLOCK(vp);
603*22ce4affSfengbojiang }
604*22ce4affSfengbojiang if (vp_held)
605*22ce4affSfengbojiang vdrop(vp);
606*22ce4affSfengbojiang }
607*22ce4affSfengbojiang }
608*22ce4affSfengbojiang
609*22ce4affSfengbojiang /*
610*22ce4affSfengbojiang  * Use a different name for this vm_map_entry field when its use
611*22ce4affSfengbojiang * is not consistent with its use as part of an ordered search tree.
612*22ce4affSfengbojiang */
613*22ce4affSfengbojiang #define defer_next right
614*22ce4affSfengbojiang
615a9643ea8Slogwang static void
616a9643ea8Slogwang vm_map_process_deferred(void)
617a9643ea8Slogwang {
618a9643ea8Slogwang struct thread *td;
619a9643ea8Slogwang vm_map_entry_t entry, next;
620a9643ea8Slogwang vm_object_t object;
621a9643ea8Slogwang
622a9643ea8Slogwang td = curthread;
623a9643ea8Slogwang entry = td->td_map_def_user;
624a9643ea8Slogwang td->td_map_def_user = NULL;
625a9643ea8Slogwang while (entry != NULL) {
626*22ce4affSfengbojiang next = entry->defer_next;
627*22ce4affSfengbojiang MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
628*22ce4affSfengbojiang MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
629*22ce4affSfengbojiang MAP_ENTRY_VN_EXEC));
630*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
631a9643ea8Slogwang /*
632a9643ea8Slogwang * Decrement the object's writemappings and
633a9643ea8Slogwang * possibly the vnode's v_writecount.
634a9643ea8Slogwang */
635a9643ea8Slogwang KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
636a9643ea8Slogwang ("Submap with writecount"));
637a9643ea8Slogwang object = entry->object.vm_object;
638a9643ea8Slogwang KASSERT(object != NULL, ("No object for writecount"));
639*22ce4affSfengbojiang vm_pager_release_writecount(object, entry->start,
640a9643ea8Slogwang entry->end);
641a9643ea8Slogwang }
642*22ce4affSfengbojiang vm_map_entry_set_vnode_text(entry, false);
643a9643ea8Slogwang vm_map_entry_deallocate(entry, FALSE);
644a9643ea8Slogwang entry = next;
645a9643ea8Slogwang }
646a9643ea8Slogwang }
647a9643ea8Slogwang
648*22ce4affSfengbojiang #ifdef INVARIANTS
649*22ce4affSfengbojiang static void
650*22ce4affSfengbojiang _vm_map_assert_locked(vm_map_t map, const char *file, int line)
651*22ce4affSfengbojiang {
652*22ce4affSfengbojiang
653*22ce4affSfengbojiang if (map->system_map)
654*22ce4affSfengbojiang mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
655*22ce4affSfengbojiang else
656*22ce4affSfengbojiang sx_assert_(&map->lock, SA_XLOCKED, file, line);
657*22ce4affSfengbojiang }
658*22ce4affSfengbojiang
659*22ce4affSfengbojiang #define VM_MAP_ASSERT_LOCKED(map) \
660*22ce4affSfengbojiang _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
661*22ce4affSfengbojiang
662*22ce4affSfengbojiang enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
663*22ce4affSfengbojiang #ifdef DIAGNOSTIC
664*22ce4affSfengbojiang static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
665*22ce4affSfengbojiang #else
666*22ce4affSfengbojiang static int enable_vmmap_check = VMMAP_CHECK_NONE;
667*22ce4affSfengbojiang #endif
668*22ce4affSfengbojiang SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
669*22ce4affSfengbojiang &enable_vmmap_check, 0, "Enable vm map consistency checking");
670*22ce4affSfengbojiang
671*22ce4affSfengbojiang static void _vm_map_assert_consistent(vm_map_t map, int check);
672*22ce4affSfengbojiang
673*22ce4affSfengbojiang #define VM_MAP_ASSERT_CONSISTENT(map) \
674*22ce4affSfengbojiang _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
675*22ce4affSfengbojiang #ifdef DIAGNOSTIC
676*22ce4affSfengbojiang #define VM_MAP_UNLOCK_CONSISTENT(map) do { \
677*22ce4affSfengbojiang if (map->nupdates > map->nentries) { \
678*22ce4affSfengbojiang _vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \
679*22ce4affSfengbojiang map->nupdates = 0; \
680*22ce4affSfengbojiang } \
681*22ce4affSfengbojiang } while (0)
682*22ce4affSfengbojiang #else
683*22ce4affSfengbojiang #define VM_MAP_UNLOCK_CONSISTENT(map)
684*22ce4affSfengbojiang #endif
685*22ce4affSfengbojiang #else
686*22ce4affSfengbojiang #define VM_MAP_ASSERT_LOCKED(map)
687*22ce4affSfengbojiang #define VM_MAP_ASSERT_CONSISTENT(map)
688*22ce4affSfengbojiang #define VM_MAP_UNLOCK_CONSISTENT(map)
689*22ce4affSfengbojiang #endif /* INVARIANTS */
690*22ce4affSfengbojiang
691a9643ea8Slogwang void
692a9643ea8Slogwang _vm_map_unlock(vm_map_t map, const char *file, int line)
693a9643ea8Slogwang {
694a9643ea8Slogwang
695*22ce4affSfengbojiang VM_MAP_UNLOCK_CONSISTENT(map);
696*22ce4affSfengbojiang if (map->system_map) {
697*22ce4affSfengbojiang #ifndef UMA_MD_SMALL_ALLOC
698*22ce4affSfengbojiang if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
699*22ce4affSfengbojiang uma_prealloc(kmapentzone, 1);
700*22ce4affSfengbojiang map->flags &= ~MAP_REPLENISH;
701*22ce4affSfengbojiang }
702*22ce4affSfengbojiang #endif
703a9643ea8Slogwang mtx_unlock_flags_(&map->system_mtx, 0, file, line);
704*22ce4affSfengbojiang } else {
705a9643ea8Slogwang sx_xunlock_(&map->lock, file, line);
706a9643ea8Slogwang vm_map_process_deferred();
707a9643ea8Slogwang }
708a9643ea8Slogwang }
709a9643ea8Slogwang
710a9643ea8Slogwang void
711a9643ea8Slogwang _vm_map_lock_read(vm_map_t map, const char *file, int line)
712a9643ea8Slogwang {
713a9643ea8Slogwang
714a9643ea8Slogwang if (map->system_map)
715a9643ea8Slogwang mtx_lock_flags_(&map->system_mtx, 0, file, line);
716a9643ea8Slogwang else
717a9643ea8Slogwang sx_slock_(&map->lock, file, line);
718a9643ea8Slogwang }
719a9643ea8Slogwang
720a9643ea8Slogwang void
721a9643ea8Slogwang _vm_map_unlock_read(vm_map_t map, const char *file, int line)
722a9643ea8Slogwang {
723a9643ea8Slogwang
724*22ce4affSfengbojiang if (map->system_map) {
725*22ce4affSfengbojiang KASSERT((map->flags & MAP_REPLENISH) == 0,
726*22ce4affSfengbojiang ("%s: MAP_REPLENISH leaked", __func__));
727a9643ea8Slogwang mtx_unlock_flags_(&map->system_mtx, 0, file, line);
728*22ce4affSfengbojiang } else {
729a9643ea8Slogwang sx_sunlock_(&map->lock, file, line);
730a9643ea8Slogwang vm_map_process_deferred();
731a9643ea8Slogwang }
732a9643ea8Slogwang }
733a9643ea8Slogwang
734a9643ea8Slogwang int
735a9643ea8Slogwang _vm_map_trylock(vm_map_t map, const char *file, int line)
736a9643ea8Slogwang {
737a9643ea8Slogwang int error;
738a9643ea8Slogwang
739a9643ea8Slogwang error = map->system_map ?
740a9643ea8Slogwang !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
741a9643ea8Slogwang !sx_try_xlock_(&map->lock, file, line);
742a9643ea8Slogwang if (error == 0)
743a9643ea8Slogwang map->timestamp++;
744a9643ea8Slogwang return (error == 0);
745a9643ea8Slogwang }
746a9643ea8Slogwang
747a9643ea8Slogwang int
748a9643ea8Slogwang _vm_map_trylock_read(vm_map_t map, const char *file, int line)
749a9643ea8Slogwang {
750a9643ea8Slogwang int error;
751a9643ea8Slogwang
752a9643ea8Slogwang error = map->system_map ?
753a9643ea8Slogwang !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
754a9643ea8Slogwang !sx_try_slock_(&map->lock, file, line);
755a9643ea8Slogwang return (error == 0);
756a9643ea8Slogwang }
757a9643ea8Slogwang
758a9643ea8Slogwang /*
759a9643ea8Slogwang * _vm_map_lock_upgrade: [ internal use only ]
760a9643ea8Slogwang *
761a9643ea8Slogwang * Tries to upgrade a read (shared) lock on the specified map to a write
762a9643ea8Slogwang * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a
763a9643ea8Slogwang * non-zero value if the upgrade fails. If the upgrade fails, the map is
764a9643ea8Slogwang * returned without a read or write lock held.
765a9643ea8Slogwang *
766a9643ea8Slogwang * Requires that the map be read locked.
767a9643ea8Slogwang */
768a9643ea8Slogwang int
769a9643ea8Slogwang _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
770a9643ea8Slogwang {
771a9643ea8Slogwang unsigned int last_timestamp;
772a9643ea8Slogwang
773a9643ea8Slogwang if (map->system_map) {
774a9643ea8Slogwang mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
775a9643ea8Slogwang } else {
776a9643ea8Slogwang if (!sx_try_upgrade_(&map->lock, file, line)) {
777a9643ea8Slogwang last_timestamp = map->timestamp;
778a9643ea8Slogwang sx_sunlock_(&map->lock, file, line);
779a9643ea8Slogwang vm_map_process_deferred();
780a9643ea8Slogwang /*
781a9643ea8Slogwang * If the map's timestamp does not change while the
782a9643ea8Slogwang * map is unlocked, then the upgrade succeeds.
783a9643ea8Slogwang */
784a9643ea8Slogwang sx_xlock_(&map->lock, file, line);
785a9643ea8Slogwang if (last_timestamp != map->timestamp) {
786a9643ea8Slogwang sx_xunlock_(&map->lock, file, line);
787a9643ea8Slogwang return (1);
788a9643ea8Slogwang }
789a9643ea8Slogwang }
790a9643ea8Slogwang }
791a9643ea8Slogwang map->timestamp++;
792a9643ea8Slogwang return (0);
793a9643ea8Slogwang }
794a9643ea8Slogwang
795a9643ea8Slogwang void
796a9643ea8Slogwang _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
797a9643ea8Slogwang {
798a9643ea8Slogwang
799a9643ea8Slogwang if (map->system_map) {
800*22ce4affSfengbojiang KASSERT((map->flags & MAP_REPLENISH) == 0,
801*22ce4affSfengbojiang ("%s: MAP_REPLENISH leaked", __func__));
802a9643ea8Slogwang mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
803*22ce4affSfengbojiang } else {
804*22ce4affSfengbojiang VM_MAP_UNLOCK_CONSISTENT(map);
805a9643ea8Slogwang sx_downgrade_(&map->lock, file, line);
806a9643ea8Slogwang }
807*22ce4affSfengbojiang }
808a9643ea8Slogwang
809a9643ea8Slogwang /*
810a9643ea8Slogwang * vm_map_locked:
811a9643ea8Slogwang *
812a9643ea8Slogwang * Returns a non-zero value if the caller holds a write (exclusive) lock
813a9643ea8Slogwang * on the specified map and the value "0" otherwise.
814a9643ea8Slogwang */
815a9643ea8Slogwang int
816a9643ea8Slogwang vm_map_locked(vm_map_t map)
817a9643ea8Slogwang {
818a9643ea8Slogwang
819a9643ea8Slogwang if (map->system_map)
820a9643ea8Slogwang return (mtx_owned(&map->system_mtx));
821a9643ea8Slogwang else
822a9643ea8Slogwang return (sx_xlocked(&map->lock));
823a9643ea8Slogwang }
824a9643ea8Slogwang
825a9643ea8Slogwang /*
826a9643ea8Slogwang * _vm_map_unlock_and_wait:
827a9643ea8Slogwang *
828a9643ea8Slogwang * Atomically releases the lock on the specified map and puts the calling
829a9643ea8Slogwang * thread to sleep. The calling thread will remain asleep until either
830a9643ea8Slogwang * vm_map_wakeup() is performed on the map or the specified timeout is
831a9643ea8Slogwang * exceeded.
832a9643ea8Slogwang *
833a9643ea8Slogwang * WARNING! This function does not perform deferred deallocations of
834a9643ea8Slogwang * objects and map entries. Therefore, the calling thread is expected to
835a9643ea8Slogwang * reacquire the map lock after reawakening and later perform an ordinary
836a9643ea8Slogwang * unlock operation, such as vm_map_unlock(), before completing its
837a9643ea8Slogwang * operation on the map.
838a9643ea8Slogwang */
839a9643ea8Slogwang int
840a9643ea8Slogwang _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
841a9643ea8Slogwang {
842a9643ea8Slogwang
843*22ce4affSfengbojiang VM_MAP_UNLOCK_CONSISTENT(map);
844a9643ea8Slogwang mtx_lock(&map_sleep_mtx);
845*22ce4affSfengbojiang if (map->system_map) {
846*22ce4affSfengbojiang KASSERT((map->flags & MAP_REPLENISH) == 0,
847*22ce4affSfengbojiang ("%s: MAP_REPLENISH leaked", __func__));
848a9643ea8Slogwang mtx_unlock_flags_(&map->system_mtx, 0, file, line);
849*22ce4affSfengbojiang } else {
850a9643ea8Slogwang sx_xunlock_(&map->lock, file, line);
851*22ce4affSfengbojiang }
852a9643ea8Slogwang return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
853a9643ea8Slogwang timo));
854a9643ea8Slogwang }
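/*
 * A minimal usage sketch for the warning above (hypothetical caller): the
 * map lock must be reacquired after waking, and a normal unlock performed
 * later so that deferred entry and object deallocations are processed:
 *
 *	(void)vm_map_unlock_and_wait(map, 0);
 *	vm_map_lock(map);
 *	... revalidate state; the map may have changed while unlocked ...
 *	vm_map_unlock(map);
 */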
855a9643ea8Slogwang
856a9643ea8Slogwang /*
857a9643ea8Slogwang * vm_map_wakeup:
858a9643ea8Slogwang *
859a9643ea8Slogwang * Awaken any threads that have slept on the map using
860a9643ea8Slogwang * vm_map_unlock_and_wait().
861a9643ea8Slogwang */
862a9643ea8Slogwang void
863a9643ea8Slogwang vm_map_wakeup(vm_map_t map)
864a9643ea8Slogwang {
865a9643ea8Slogwang
866a9643ea8Slogwang /*
867a9643ea8Slogwang * Acquire and release map_sleep_mtx to prevent a wakeup()
868a9643ea8Slogwang * from being performed (and lost) between the map unlock
869a9643ea8Slogwang * and the msleep() in _vm_map_unlock_and_wait().
870a9643ea8Slogwang */
871a9643ea8Slogwang mtx_lock(&map_sleep_mtx);
872a9643ea8Slogwang mtx_unlock(&map_sleep_mtx);
873a9643ea8Slogwang wakeup(&map->root);
874a9643ea8Slogwang }
875a9643ea8Slogwang
876a9643ea8Slogwang void
877a9643ea8Slogwang vm_map_busy(vm_map_t map)
878a9643ea8Slogwang {
879a9643ea8Slogwang
880a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
881a9643ea8Slogwang map->busy++;
882a9643ea8Slogwang }
883a9643ea8Slogwang
884a9643ea8Slogwang void
885a9643ea8Slogwang vm_map_unbusy(vm_map_t map)
886a9643ea8Slogwang {
887a9643ea8Slogwang
888a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
889a9643ea8Slogwang KASSERT(map->busy, ("vm_map_unbusy: not busy"));
890a9643ea8Slogwang if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
891a9643ea8Slogwang vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
892a9643ea8Slogwang wakeup(&map->busy);
893a9643ea8Slogwang }
894a9643ea8Slogwang }
895a9643ea8Slogwang
896a9643ea8Slogwang void
897a9643ea8Slogwang vm_map_wait_busy(vm_map_t map)
898a9643ea8Slogwang {
899a9643ea8Slogwang
900a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
901a9643ea8Slogwang while (map->busy) {
902a9643ea8Slogwang vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
903a9643ea8Slogwang if (map->system_map)
904a9643ea8Slogwang msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
905a9643ea8Slogwang else
906a9643ea8Slogwang sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
907a9643ea8Slogwang }
908a9643ea8Slogwang map->timestamp++;
909a9643ea8Slogwang }
910a9643ea8Slogwang
911a9643ea8Slogwang long
912a9643ea8Slogwang vmspace_resident_count(struct vmspace *vmspace)
913a9643ea8Slogwang {
914a9643ea8Slogwang return pmap_resident_count(vmspace_pmap(vmspace));
915a9643ea8Slogwang }
916a9643ea8Slogwang
917a9643ea8Slogwang /*
918a9643ea8Slogwang * Initialize an existing vm_map structure
919a9643ea8Slogwang * such as that in the vmspace structure.
920a9643ea8Slogwang */
921a9643ea8Slogwang static void
922a9643ea8Slogwang _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
923a9643ea8Slogwang {
924a9643ea8Slogwang
925*22ce4affSfengbojiang map->header.eflags = MAP_ENTRY_HEADER;
926a9643ea8Slogwang map->needs_wakeup = FALSE;
927a9643ea8Slogwang map->system_map = 0;
928a9643ea8Slogwang map->pmap = pmap;
929*22ce4affSfengbojiang map->header.end = min;
930*22ce4affSfengbojiang map->header.start = max;
931a9643ea8Slogwang map->flags = 0;
932*22ce4affSfengbojiang map->header.left = map->header.right = &map->header;
933a9643ea8Slogwang map->root = NULL;
934a9643ea8Slogwang map->timestamp = 0;
935a9643ea8Slogwang map->busy = 0;
936*22ce4affSfengbojiang map->anon_loc = 0;
937*22ce4affSfengbojiang #ifdef DIAGNOSTIC
938*22ce4affSfengbojiang map->nupdates = 0;
939*22ce4affSfengbojiang #endif
940a9643ea8Slogwang }
941a9643ea8Slogwang
942a9643ea8Slogwang void
943a9643ea8Slogwang vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
944a9643ea8Slogwang {
945a9643ea8Slogwang
946a9643ea8Slogwang _vm_map_init(map, pmap, min, max);
947*22ce4affSfengbojiang mtx_init(&map->system_mtx, "vm map (system)", NULL,
948*22ce4affSfengbojiang MTX_DEF | MTX_DUPOK);
949*22ce4affSfengbojiang sx_init(&map->lock, "vm map (user)");
950a9643ea8Slogwang }
951a9643ea8Slogwang
952a9643ea8Slogwang /*
953a9643ea8Slogwang * vm_map_entry_dispose: [ internal use only ]
954a9643ea8Slogwang *
955a9643ea8Slogwang * Inverse of vm_map_entry_create.
956a9643ea8Slogwang */
957a9643ea8Slogwang static void
958a9643ea8Slogwang vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
959a9643ea8Slogwang {
960a9643ea8Slogwang uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
961a9643ea8Slogwang }
962a9643ea8Slogwang
963a9643ea8Slogwang /*
964a9643ea8Slogwang * vm_map_entry_create: [ internal use only ]
965a9643ea8Slogwang *
966a9643ea8Slogwang * Allocates a VM map entry for insertion.
967a9643ea8Slogwang * No entry fields are filled in.
968a9643ea8Slogwang */
969a9643ea8Slogwang static vm_map_entry_t
970a9643ea8Slogwang vm_map_entry_create(vm_map_t map)
971a9643ea8Slogwang {
972a9643ea8Slogwang vm_map_entry_t new_entry;
973a9643ea8Slogwang
974*22ce4affSfengbojiang #ifndef UMA_MD_SMALL_ALLOC
975*22ce4affSfengbojiang if (map == kernel_map) {
976*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
977*22ce4affSfengbojiang
978*22ce4affSfengbojiang /*
979*22ce4affSfengbojiang * A new slab of kernel map entries cannot be allocated at this
980*22ce4affSfengbojiang * point because the kernel map has not yet been updated to
981*22ce4affSfengbojiang * reflect the caller's request. Therefore, we allocate a new
982*22ce4affSfengbojiang * map entry, dipping into the reserve if necessary, and set a
983*22ce4affSfengbojiang * flag indicating that the reserve must be replenished before
984*22ce4affSfengbojiang * the map is unlocked.
985*22ce4affSfengbojiang */
986*22ce4affSfengbojiang new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
987*22ce4affSfengbojiang if (new_entry == NULL) {
988*22ce4affSfengbojiang new_entry = uma_zalloc(kmapentzone,
989*22ce4affSfengbojiang M_NOWAIT | M_NOVM | M_USE_RESERVE);
990*22ce4affSfengbojiang kernel_map->flags |= MAP_REPLENISH;
991*22ce4affSfengbojiang }
992*22ce4affSfengbojiang } else
993*22ce4affSfengbojiang #endif
994*22ce4affSfengbojiang if (map->system_map) {
995a9643ea8Slogwang new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
996*22ce4affSfengbojiang } else {
997a9643ea8Slogwang new_entry = uma_zalloc(mapentzone, M_WAITOK);
998*22ce4affSfengbojiang }
999*22ce4affSfengbojiang KASSERT(new_entry != NULL,
1000*22ce4affSfengbojiang ("vm_map_entry_create: kernel resources exhausted"));
1001a9643ea8Slogwang return (new_entry);
1002a9643ea8Slogwang }
1003a9643ea8Slogwang
1004a9643ea8Slogwang /*
1005a9643ea8Slogwang * vm_map_entry_set_behavior:
1006a9643ea8Slogwang *
1007a9643ea8Slogwang * Set the expected access behavior, either normal, random, or
1008a9643ea8Slogwang * sequential.
1009a9643ea8Slogwang */
1010a9643ea8Slogwang static inline void
1011a9643ea8Slogwang vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
1012a9643ea8Slogwang {
1013a9643ea8Slogwang entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
1014a9643ea8Slogwang (behavior & MAP_ENTRY_BEHAV_MASK);
1015a9643ea8Slogwang }
1016a9643ea8Slogwang
1017a9643ea8Slogwang /*
1018*22ce4affSfengbojiang * vm_map_entry_max_free_{left,right}:
1019a9643ea8Slogwang *
1020*22ce4affSfengbojiang * Compute the size of the largest free gap between two entries,
1021*22ce4affSfengbojiang * one the root of a tree and the other the ancestor of that root
1022*22ce4affSfengbojiang * that is the least or greatest ancestor found on the search path.
1023a9643ea8Slogwang */
1024*22ce4affSfengbojiang static inline vm_size_t
1025*22ce4affSfengbojiang vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
1026a9643ea8Slogwang {
1027a9643ea8Slogwang
1028*22ce4affSfengbojiang return (root->left != left_ancestor ?
1029*22ce4affSfengbojiang root->left->max_free : root->start - left_ancestor->end);
1030*22ce4affSfengbojiang }
1031*22ce4affSfengbojiang
1032*22ce4affSfengbojiang static inline vm_size_t
1033*22ce4affSfengbojiang vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
1034*22ce4affSfengbojiang {
1035*22ce4affSfengbojiang
1036*22ce4affSfengbojiang return (root->right != right_ancestor ?
1037*22ce4affSfengbojiang root->right->max_free : right_ancestor->start - root->end);
1038a9643ea8Slogwang }
1039a9643ea8Slogwang
1040a9643ea8Slogwang /*
1041*22ce4affSfengbojiang * vm_map_entry_{pred,succ}:
1042*22ce4affSfengbojiang *
1043*22ce4affSfengbojiang * Find the {predecessor, successor} of the entry by taking one step
1044*22ce4affSfengbojiang * in the appropriate direction and backtracking as much as necessary.
1045*22ce4affSfengbojiang * vm_map_entry_succ is defined in vm_map.h.
1046*22ce4affSfengbojiang */
1047*22ce4affSfengbojiang static inline vm_map_entry_t
1048*22ce4affSfengbojiang vm_map_entry_pred(vm_map_entry_t entry)
1049*22ce4affSfengbojiang {
1050*22ce4affSfengbojiang vm_map_entry_t prior;
1051*22ce4affSfengbojiang
1052*22ce4affSfengbojiang prior = entry->left;
1053*22ce4affSfengbojiang if (prior->right->start < entry->start) {
1054*22ce4affSfengbojiang do
1055*22ce4affSfengbojiang prior = prior->right;
1056*22ce4affSfengbojiang while (prior->right != entry);
1057*22ce4affSfengbojiang }
1058*22ce4affSfengbojiang return (prior);
1059*22ce4affSfengbojiang }
1060*22ce4affSfengbojiang
1061*22ce4affSfengbojiang static inline vm_size_t
1062*22ce4affSfengbojiang vm_size_max(vm_size_t a, vm_size_t b)
1063*22ce4affSfengbojiang {
1064*22ce4affSfengbojiang
1065*22ce4affSfengbojiang return (a > b ? a : b);
1066*22ce4affSfengbojiang }
1067*22ce4affSfengbojiang
1068*22ce4affSfengbojiang #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \
1069*22ce4affSfengbojiang vm_map_entry_t z; \
1070*22ce4affSfengbojiang vm_size_t max_free; \
1071*22ce4affSfengbojiang \
1072*22ce4affSfengbojiang /* \
1073*22ce4affSfengbojiang * Infer root->right->max_free == root->max_free when \
1074*22ce4affSfengbojiang * y->max_free < root->max_free || root->max_free == 0. \
1075*22ce4affSfengbojiang * Otherwise, look right to find it. \
1076*22ce4affSfengbojiang */ \
1077*22ce4affSfengbojiang y = root->left; \
1078*22ce4affSfengbojiang max_free = root->max_free; \
1079*22ce4affSfengbojiang KASSERT(max_free == vm_size_max( \
1080*22ce4affSfengbojiang vm_map_entry_max_free_left(root, llist), \
1081*22ce4affSfengbojiang vm_map_entry_max_free_right(root, rlist)), \
1082*22ce4affSfengbojiang ("%s: max_free invariant fails", __func__)); \
1083*22ce4affSfengbojiang if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \
1084*22ce4affSfengbojiang max_free = vm_map_entry_max_free_right(root, rlist); \
1085*22ce4affSfengbojiang if (y != llist && (test)) { \
1086*22ce4affSfengbojiang /* Rotate right and make y root. */ \
1087*22ce4affSfengbojiang z = y->right; \
1088*22ce4affSfengbojiang if (z != root) { \
1089*22ce4affSfengbojiang root->left = z; \
1090*22ce4affSfengbojiang y->right = root; \
1091*22ce4affSfengbojiang if (max_free < y->max_free) \
1092*22ce4affSfengbojiang root->max_free = max_free = \
1093*22ce4affSfengbojiang vm_size_max(max_free, z->max_free); \
1094*22ce4affSfengbojiang } else if (max_free < y->max_free) \
1095*22ce4affSfengbojiang root->max_free = max_free = \
1096*22ce4affSfengbojiang vm_size_max(max_free, root->start - y->end);\
1097*22ce4affSfengbojiang root = y; \
1098*22ce4affSfengbojiang y = root->left; \
1099*22ce4affSfengbojiang } \
1100*22ce4affSfengbojiang /* Copy right->max_free. Put root on rlist. */ \
1101*22ce4affSfengbojiang root->max_free = max_free; \
1102*22ce4affSfengbojiang KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \
1103*22ce4affSfengbojiang ("%s: max_free not copied from right", __func__)); \
1104*22ce4affSfengbojiang root->left = rlist; \
1105*22ce4affSfengbojiang rlist = root; \
1106*22ce4affSfengbojiang root = y != llist ? y : NULL; \
1107*22ce4affSfengbojiang } while (0)
1108*22ce4affSfengbojiang
1109*22ce4affSfengbojiang #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \
1110*22ce4affSfengbojiang vm_map_entry_t z; \
1111*22ce4affSfengbojiang vm_size_t max_free; \
1112*22ce4affSfengbojiang \
1113*22ce4affSfengbojiang /* \
1114*22ce4affSfengbojiang * Infer root->left->max_free == root->max_free when \
1115*22ce4affSfengbojiang * y->max_free < root->max_free || root->max_free == 0. \
1116*22ce4affSfengbojiang * Otherwise, look left to find it. \
1117*22ce4affSfengbojiang */ \
1118*22ce4affSfengbojiang y = root->right; \
1119*22ce4affSfengbojiang max_free = root->max_free; \
1120*22ce4affSfengbojiang KASSERT(max_free == vm_size_max( \
1121*22ce4affSfengbojiang vm_map_entry_max_free_left(root, llist), \
1122*22ce4affSfengbojiang vm_map_entry_max_free_right(root, rlist)), \
1123*22ce4affSfengbojiang ("%s: max_free invariant fails", __func__)); \
1124*22ce4affSfengbojiang if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \
1125*22ce4affSfengbojiang max_free = vm_map_entry_max_free_left(root, llist); \
1126*22ce4affSfengbojiang if (y != rlist && (test)) { \
1127*22ce4affSfengbojiang /* Rotate left and make y root. */ \
1128*22ce4affSfengbojiang z = y->left; \
1129*22ce4affSfengbojiang if (z != root) { \
1130*22ce4affSfengbojiang root->right = z; \
1131*22ce4affSfengbojiang y->left = root; \
1132*22ce4affSfengbojiang if (max_free < y->max_free) \
1133*22ce4affSfengbojiang root->max_free = max_free = \
1134*22ce4affSfengbojiang vm_size_max(max_free, z->max_free); \
1135*22ce4affSfengbojiang } else if (max_free < y->max_free) \
1136*22ce4affSfengbojiang root->max_free = max_free = \
1137*22ce4affSfengbojiang vm_size_max(max_free, y->start - root->end);\
1138*22ce4affSfengbojiang root = y; \
1139*22ce4affSfengbojiang y = root->right; \
1140*22ce4affSfengbojiang } \
1141*22ce4affSfengbojiang /* Copy left->max_free. Put root on llist. */ \
1142*22ce4affSfengbojiang root->max_free = max_free; \
1143*22ce4affSfengbojiang KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \
1144*22ce4affSfengbojiang ("%s: max_free not copied from left", __func__)); \
1145*22ce4affSfengbojiang root->right = llist; \
1146*22ce4affSfengbojiang llist = root; \
1147*22ce4affSfengbojiang root = y != rlist ? y : NULL; \
1148*22ce4affSfengbojiang } while (0)
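/*
 * A note on the "max_free - 1 <" tests in the two step macros above: since
 * root->max_free is the maximum of the left-side and right-side gaps, the
 * test is true exactly when the maximum gap lies on the descent side (left
 * for SPLAY_LEFT_STEP, right for SPLAY_RIGHT_STEP), or when max_free == 0 and
 * the unsigned subtraction wraps.  Only then must the opposite side's value
 * be looked up explicitly; otherwise it is inferred to equal root->max_free.
 * For example, if root->max_free is 8 pages and the left spine also reports
 * an 8-page gap, SPLAY_LEFT_STEP recomputes max_free from the right subtree
 * before pushing root onto rlist.
 */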
1149*22ce4affSfengbojiang
1150*22ce4affSfengbojiang /*
1151*22ce4affSfengbojiang * Walk down the tree until we find addr or a gap where addr would go, breaking
1152*22ce4affSfengbojiang * off left and right subtrees of nodes less than, or greater than addr. Treat
1153*22ce4affSfengbojiang * subtrees with root->max_free < length as empty trees. llist and rlist are
1154*22ce4affSfengbojiang * the two sides in reverse order (bottom-up), with llist linked by the right
1155*22ce4affSfengbojiang * pointer and rlist linked by the left pointer in the vm_map_entry, and both
1156*22ce4affSfengbojiang * lists terminated by &map->header. This function, and the subsequent call to
1157*22ce4affSfengbojiang * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
1158*22ce4affSfengbojiang * values in &map->header.
1159*22ce4affSfengbojiang */
1160*22ce4affSfengbojiang static __always_inline vm_map_entry_t
1161*22ce4affSfengbojiang vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1162*22ce4affSfengbojiang vm_map_entry_t *llist, vm_map_entry_t *rlist)
1163*22ce4affSfengbojiang {
1164*22ce4affSfengbojiang vm_map_entry_t left, right, root, y;
1165*22ce4affSfengbojiang
1166*22ce4affSfengbojiang left = right = &map->header;
1167*22ce4affSfengbojiang root = map->root;
1168*22ce4affSfengbojiang while (root != NULL && root->max_free >= length) {
1169*22ce4affSfengbojiang KASSERT(left->end <= root->start &&
1170*22ce4affSfengbojiang root->end <= right->start,
1171*22ce4affSfengbojiang ("%s: root not within tree bounds", __func__));
1172*22ce4affSfengbojiang if (addr < root->start) {
1173*22ce4affSfengbojiang SPLAY_LEFT_STEP(root, y, left, right,
1174*22ce4affSfengbojiang y->max_free >= length && addr < y->start);
1175*22ce4affSfengbojiang } else if (addr >= root->end) {
1176*22ce4affSfengbojiang SPLAY_RIGHT_STEP(root, y, left, right,
1177*22ce4affSfengbojiang y->max_free >= length && addr >= y->end);
1178*22ce4affSfengbojiang } else
1179*22ce4affSfengbojiang break;
1180*22ce4affSfengbojiang }
1181*22ce4affSfengbojiang *llist = left;
1182*22ce4affSfengbojiang *rlist = right;
1183*22ce4affSfengbojiang return (root);
1184*22ce4affSfengbojiang }
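/*
 * Usage sketch (this mirrors vm_map_splay() and vm_map_entry_{link,unlink,
 * resize}() below): a caller splits, then reassembles the two spines with the
 * merge helpers, e.g.
 *
 *	root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
 *	if (root != NULL) {
 *		max_free_left = vm_map_splay_merge_left(header, root, llist);
 *		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 *		root->max_free = vm_size_max(max_free_left, max_free_right);
 *		map->root = root;
 *	}
 */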
1185*22ce4affSfengbojiang
1186*22ce4affSfengbojiang static __always_inline void
1187*22ce4affSfengbojiang vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
1188*22ce4affSfengbojiang {
1189*22ce4affSfengbojiang vm_map_entry_t hi, right, y;
1190*22ce4affSfengbojiang
1191*22ce4affSfengbojiang right = *rlist;
1192*22ce4affSfengbojiang hi = root->right == right ? NULL : root->right;
1193*22ce4affSfengbojiang if (hi == NULL)
1194*22ce4affSfengbojiang return;
1195*22ce4affSfengbojiang do
1196*22ce4affSfengbojiang SPLAY_LEFT_STEP(hi, y, root, right, true);
1197*22ce4affSfengbojiang while (hi != NULL);
1198*22ce4affSfengbojiang *rlist = right;
1199*22ce4affSfengbojiang }
1200*22ce4affSfengbojiang
1201*22ce4affSfengbojiang static __always_inline void
1202*22ce4affSfengbojiang vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
1203*22ce4affSfengbojiang {
1204*22ce4affSfengbojiang vm_map_entry_t left, lo, y;
1205*22ce4affSfengbojiang
1206*22ce4affSfengbojiang left = *llist;
1207*22ce4affSfengbojiang lo = root->left == left ? NULL : root->left;
1208*22ce4affSfengbojiang if (lo == NULL)
1209*22ce4affSfengbojiang return;
1210*22ce4affSfengbojiang do
1211*22ce4affSfengbojiang SPLAY_RIGHT_STEP(lo, y, left, root, true);
1212*22ce4affSfengbojiang while (lo != NULL);
1213*22ce4affSfengbojiang *llist = left;
1214*22ce4affSfengbojiang }
1215*22ce4affSfengbojiang
1216*22ce4affSfengbojiang static inline void
1217*22ce4affSfengbojiang vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1218*22ce4affSfengbojiang {
1219*22ce4affSfengbojiang vm_map_entry_t tmp;
1220*22ce4affSfengbojiang
1221*22ce4affSfengbojiang tmp = *b;
1222*22ce4affSfengbojiang *b = *a;
1223*22ce4affSfengbojiang *a = tmp;
1224*22ce4affSfengbojiang }
1225*22ce4affSfengbojiang
1226*22ce4affSfengbojiang /*
1227*22ce4affSfengbojiang * Walk back up the two spines, flip the pointers and set max_free. The
1228*22ce4affSfengbojiang * subtrees of the root go at the bottom of llist and rlist.
1229*22ce4affSfengbojiang */
1230*22ce4affSfengbojiang static vm_size_t
1231*22ce4affSfengbojiang vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
1232*22ce4affSfengbojiang vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
1233*22ce4affSfengbojiang {
1234*22ce4affSfengbojiang do {
1235*22ce4affSfengbojiang /*
1236*22ce4affSfengbojiang * The max_free values of the children of llist are in
1237*22ce4affSfengbojiang * llist->max_free and max_free. Update with the
1238*22ce4affSfengbojiang * max value.
1239*22ce4affSfengbojiang */
1240*22ce4affSfengbojiang llist->max_free = max_free =
1241*22ce4affSfengbojiang vm_size_max(llist->max_free, max_free);
1242*22ce4affSfengbojiang vm_map_entry_swap(&llist->right, &tail);
1243*22ce4affSfengbojiang vm_map_entry_swap(&tail, &llist);
1244*22ce4affSfengbojiang } while (llist != header);
1245*22ce4affSfengbojiang root->left = tail;
1246*22ce4affSfengbojiang return (max_free);
1247*22ce4affSfengbojiang }
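/*
 * The two swaps in the loop above are equivalent to the sequence
 *
 *	child = llist->right;	(next node up the left spine)
 *	llist->right = tail;	(re-attach the subtree assembled so far)
 *	tail = llist;
 *	llist = child;
 *
 * so each iteration flips one spine pointer back to its normal orientation
 * after folding that node's gap into max_free.
 */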
1248*22ce4affSfengbojiang
1249*22ce4affSfengbojiang /*
1250*22ce4affSfengbojiang * When llist is known to be the predecessor of root.
1251*22ce4affSfengbojiang */
1252*22ce4affSfengbojiang static inline vm_size_t
1253*22ce4affSfengbojiang vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
1254*22ce4affSfengbojiang vm_map_entry_t llist)
1255*22ce4affSfengbojiang {
1256*22ce4affSfengbojiang vm_size_t max_free;
1257*22ce4affSfengbojiang
1258*22ce4affSfengbojiang max_free = root->start - llist->end;
1259*22ce4affSfengbojiang if (llist != header) {
1260*22ce4affSfengbojiang max_free = vm_map_splay_merge_left_walk(header, root,
1261*22ce4affSfengbojiang root, max_free, llist);
1262*22ce4affSfengbojiang } else {
1263*22ce4affSfengbojiang root->left = header;
1264*22ce4affSfengbojiang header->right = root;
1265*22ce4affSfengbojiang }
1266*22ce4affSfengbojiang return (max_free);
1267*22ce4affSfengbojiang }
1268*22ce4affSfengbojiang
1269*22ce4affSfengbojiang /*
1270*22ce4affSfengbojiang * When llist may or may not be the predecessor of root.
1271*22ce4affSfengbojiang */
1272*22ce4affSfengbojiang static inline vm_size_t
1273*22ce4affSfengbojiang vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
1274*22ce4affSfengbojiang vm_map_entry_t llist)
1275*22ce4affSfengbojiang {
1276*22ce4affSfengbojiang vm_size_t max_free;
1277*22ce4affSfengbojiang
1278*22ce4affSfengbojiang max_free = vm_map_entry_max_free_left(root, llist);
1279*22ce4affSfengbojiang if (llist != header) {
1280*22ce4affSfengbojiang max_free = vm_map_splay_merge_left_walk(header, root,
1281*22ce4affSfengbojiang root->left == llist ? root : root->left,
1282*22ce4affSfengbojiang max_free, llist);
1283*22ce4affSfengbojiang }
1284*22ce4affSfengbojiang return (max_free);
1285*22ce4affSfengbojiang }
1286*22ce4affSfengbojiang
1287*22ce4affSfengbojiang static vm_size_t
1288*22ce4affSfengbojiang vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
1289*22ce4affSfengbojiang vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
1290*22ce4affSfengbojiang {
1291*22ce4affSfengbojiang do {
1292*22ce4affSfengbojiang /*
1293*22ce4affSfengbojiang * The max_free values of the children of rlist are in
1294*22ce4affSfengbojiang * rlist->max_free and max_free. Update with the
1295*22ce4affSfengbojiang * max value.
1296*22ce4affSfengbojiang */
1297*22ce4affSfengbojiang rlist->max_free = max_free =
1298*22ce4affSfengbojiang vm_size_max(rlist->max_free, max_free);
1299*22ce4affSfengbojiang vm_map_entry_swap(&rlist->left, &tail);
1300*22ce4affSfengbojiang vm_map_entry_swap(&tail, &rlist);
1301*22ce4affSfengbojiang } while (rlist != header);
1302*22ce4affSfengbojiang root->right = tail;
1303*22ce4affSfengbojiang return (max_free);
1304*22ce4affSfengbojiang }
1305*22ce4affSfengbojiang
1306*22ce4affSfengbojiang /*
1307*22ce4affSfengbojiang  * When rlist is known to be the successor of root.
1308*22ce4affSfengbojiang */
1309*22ce4affSfengbojiang static inline vm_size_t
1310*22ce4affSfengbojiang vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
1311*22ce4affSfengbojiang vm_map_entry_t rlist)
1312*22ce4affSfengbojiang {
1313*22ce4affSfengbojiang vm_size_t max_free;
1314*22ce4affSfengbojiang
1315*22ce4affSfengbojiang max_free = rlist->start - root->end;
1316*22ce4affSfengbojiang if (rlist != header) {
1317*22ce4affSfengbojiang max_free = vm_map_splay_merge_right_walk(header, root,
1318*22ce4affSfengbojiang root, max_free, rlist);
1319*22ce4affSfengbojiang } else {
1320*22ce4affSfengbojiang root->right = header;
1321*22ce4affSfengbojiang header->left = root;
1322*22ce4affSfengbojiang }
1323*22ce4affSfengbojiang return (max_free);
1324*22ce4affSfengbojiang }
1325*22ce4affSfengbojiang
1326*22ce4affSfengbojiang /*
1327*22ce4affSfengbojiang  * When rlist may or may not be the successor of root.
1328*22ce4affSfengbojiang */
1329*22ce4affSfengbojiang static inline vm_size_t
1330*22ce4affSfengbojiang vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
1331*22ce4affSfengbojiang vm_map_entry_t rlist)
1332*22ce4affSfengbojiang {
1333*22ce4affSfengbojiang vm_size_t max_free;
1334*22ce4affSfengbojiang
1335*22ce4affSfengbojiang max_free = vm_map_entry_max_free_right(root, rlist);
1336*22ce4affSfengbojiang if (rlist != header) {
1337*22ce4affSfengbojiang max_free = vm_map_splay_merge_right_walk(header, root,
1338*22ce4affSfengbojiang root->right == rlist ? root : root->right,
1339*22ce4affSfengbojiang max_free, rlist);
1340*22ce4affSfengbojiang }
1341*22ce4affSfengbojiang return (max_free);
1342*22ce4affSfengbojiang }
1343*22ce4affSfengbojiang
1344*22ce4affSfengbojiang /*
1345*22ce4affSfengbojiang * vm_map_splay:
1346a9643ea8Slogwang *
1347a9643ea8Slogwang * The Sleator and Tarjan top-down splay algorithm with the
1348a9643ea8Slogwang * following variation. Max_free must be computed bottom-up, so
1349a9643ea8Slogwang * on the downward pass, maintain the left and right spines in
1350a9643ea8Slogwang * reverse order. Then, make a second pass up each side to fix
1351a9643ea8Slogwang * the pointers and compute max_free. The time bound is O(log n)
1352a9643ea8Slogwang * amortized.
1353a9643ea8Slogwang *
1354*22ce4affSfengbojiang * The tree is threaded, which means that there are no null pointers.
1355*22ce4affSfengbojiang * When a node has no left child, its left pointer points to its
1356*22ce4affSfengbojiang  * predecessor, which is the last ancestor on the search path from the root
1357*22ce4affSfengbojiang * where the search branched right. Likewise, when a node has no right
1358*22ce4affSfengbojiang * child, its right pointer points to its successor. The map header node
1359*22ce4affSfengbojiang * is the predecessor of the first map entry, and the successor of the
1360*22ce4affSfengbojiang * last.
1361*22ce4affSfengbojiang *
1362a9643ea8Slogwang * The new root is the vm_map_entry containing "addr", or else an
1363*22ce4affSfengbojiang * adjacent entry (lower if possible) if addr is not in the tree.
1364a9643ea8Slogwang *
1365a9643ea8Slogwang * The map must be locked, and leaves it so.
1366a9643ea8Slogwang *
1367a9643ea8Slogwang * Returns: the new root.
1368a9643ea8Slogwang */
1369a9643ea8Slogwang static vm_map_entry_t
1370*22ce4affSfengbojiang vm_map_splay(vm_map_t map, vm_offset_t addr)
1371a9643ea8Slogwang {
1372*22ce4affSfengbojiang vm_map_entry_t header, llist, rlist, root;
1373*22ce4affSfengbojiang vm_size_t max_free_left, max_free_right;
1374a9643ea8Slogwang
1375*22ce4affSfengbojiang header = &map->header;
1376*22ce4affSfengbojiang root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1377*22ce4affSfengbojiang if (root != NULL) {
1378*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_left(header, root, llist);
1379*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_right(header, root, rlist);
1380*22ce4affSfengbojiang } else if (llist != header) {
1381a9643ea8Slogwang /*
1382*22ce4affSfengbojiang * Recover the greatest node in the left
1383*22ce4affSfengbojiang * subtree and make it the root.
1384a9643ea8Slogwang */
1385*22ce4affSfengbojiang root = llist;
1386*22ce4affSfengbojiang llist = root->right;
1387*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_left(header, root, llist);
1388*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1389*22ce4affSfengbojiang } else if (rlist != header) {
1390*22ce4affSfengbojiang /*
1391*22ce4affSfengbojiang * Recover the least node in the right
1392*22ce4affSfengbojiang * subtree and make it the root.
1393*22ce4affSfengbojiang */
1394*22ce4affSfengbojiang root = rlist;
1395*22ce4affSfengbojiang rlist = root->left;
1396*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_pred(header, root, llist);
1397*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_right(header, root, rlist);
1398a9643ea8Slogwang } else {
1399*22ce4affSfengbojiang /* There is no root. */
1400*22ce4affSfengbojiang return (NULL);
1401a9643ea8Slogwang }
1402*22ce4affSfengbojiang root->max_free = vm_size_max(max_free_left, max_free_right);
1403*22ce4affSfengbojiang map->root = root;
1404*22ce4affSfengbojiang VM_MAP_ASSERT_CONSISTENT(map);
1405a9643ea8Slogwang return (root);
1406a9643ea8Slogwang }
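/*
 * A small example of the threading described in the comment above: with three
 * entries A < B < C and B at the root, A and C are leaves, so A->left points
 * at &map->header (A's predecessor), A->right points at B (A's successor),
 * C->left points at B, and C->right points at &map->header.  This is why the
 * read-locked lookup below can test child pointers against known bounds
 * instead of checking for NULL.
 */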
1407a9643ea8Slogwang
1408a9643ea8Slogwang /*
1409a9643ea8Slogwang * vm_map_entry_{un,}link:
1410a9643ea8Slogwang *
1411*22ce4affSfengbojiang * Insert/remove entries from maps. On linking, if new entry clips
1412*22ce4affSfengbojiang * existing entry, trim existing entry to avoid overlap, and manage
1413*22ce4affSfengbojiang * offsets. On unlinking, merge disappearing entry with neighbor, if
1414*22ce4affSfengbojiang * called for, and manage offsets. Callers should not modify fields in
1415*22ce4affSfengbojiang * entries already mapped.
1416a9643ea8Slogwang */
1417a9643ea8Slogwang static void
1418*22ce4affSfengbojiang vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1419a9643ea8Slogwang {
1420*22ce4affSfengbojiang vm_map_entry_t header, llist, rlist, root;
1421*22ce4affSfengbojiang vm_size_t max_free_left, max_free_right;
1422a9643ea8Slogwang
1423*22ce4affSfengbojiang CTR3(KTR_VM,
1424*22ce4affSfengbojiang "vm_map_entry_link: map %p, nentries %d, entry %p", map,
1425*22ce4affSfengbojiang map->nentries, entry);
1426a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
1427a9643ea8Slogwang map->nentries++;
1428*22ce4affSfengbojiang header = &map->header;
1429*22ce4affSfengbojiang root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1430*22ce4affSfengbojiang if (root == NULL) {
1431*22ce4affSfengbojiang /*
1432*22ce4affSfengbojiang * The new entry does not overlap any existing entry in the
1433*22ce4affSfengbojiang * map, so it becomes the new root of the map tree.
1434*22ce4affSfengbojiang */
1435*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1436*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1437*22ce4affSfengbojiang } else if (entry->start == root->start) {
1438*22ce4affSfengbojiang /*
1439*22ce4affSfengbojiang * The new entry is a clone of root, with only the end field
1440*22ce4affSfengbojiang * changed. The root entry will be shrunk to abut the new
1441*22ce4affSfengbojiang * entry, and will be the right child of the new root entry in
1442*22ce4affSfengbojiang * the modified map.
1443*22ce4affSfengbojiang */
1444*22ce4affSfengbojiang KASSERT(entry->end < root->end,
1445*22ce4affSfengbojiang ("%s: clip_start not within entry", __func__));
1446*22ce4affSfengbojiang vm_map_splay_findprev(root, &llist);
1447*22ce4affSfengbojiang root->offset += entry->end - root->start;
1448*22ce4affSfengbojiang root->start = entry->end;
1449*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1450*22ce4affSfengbojiang max_free_right = root->max_free = vm_size_max(
1451*22ce4affSfengbojiang vm_map_splay_merge_pred(entry, root, entry),
1452*22ce4affSfengbojiang vm_map_splay_merge_right(header, root, rlist));
1453a9643ea8Slogwang } else {
1454*22ce4affSfengbojiang /*
1455*22ce4affSfengbojiang * The new entry is a clone of root, with only the start field
1456*22ce4affSfengbojiang * changed. The root entry will be shrunk to abut the new
1457*22ce4affSfengbojiang * entry, and will be the left child of the new root entry in
1458*22ce4affSfengbojiang * the modified map.
1459*22ce4affSfengbojiang */
1460*22ce4affSfengbojiang KASSERT(entry->end == root->end,
1461*22ce4affSfengbojiang ("%s: clip_start not within entry", __func__));
1462*22ce4affSfengbojiang vm_map_splay_findnext(root, &rlist);
1463*22ce4affSfengbojiang entry->offset += entry->start - root->start;
1464*22ce4affSfengbojiang root->end = entry->start;
1465*22ce4affSfengbojiang max_free_left = root->max_free = vm_size_max(
1466*22ce4affSfengbojiang vm_map_splay_merge_left(header, root, llist),
1467*22ce4affSfengbojiang vm_map_splay_merge_succ(entry, root, entry));
1468*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1469a9643ea8Slogwang }
1470*22ce4affSfengbojiang entry->max_free = vm_size_max(max_free_left, max_free_right);
1471a9643ea8Slogwang map->root = entry;
1472*22ce4affSfengbojiang VM_MAP_ASSERT_CONSISTENT(map);
1473a9643ea8Slogwang }
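/*
 * Clipping example for the two non-trivial cases above: if the tree holds an
 * entry [2*PAGE_SIZE, 8*PAGE_SIZE) and a new entry [2*PAGE_SIZE, 4*PAGE_SIZE)
 * is linked, the existing entry is trimmed to [4*PAGE_SIZE, 8*PAGE_SIZE), its
 * offset is advanced by 2*PAGE_SIZE, and it becomes the right child of the
 * new root entry; the symmetric case trims the end of the existing entry and
 * makes it the left child.
 */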
1474a9643ea8Slogwang
1475*22ce4affSfengbojiang enum unlink_merge_type {
1476*22ce4affSfengbojiang UNLINK_MERGE_NONE,
1477*22ce4affSfengbojiang UNLINK_MERGE_NEXT
1478*22ce4affSfengbojiang };
1479*22ce4affSfengbojiang
1480a9643ea8Slogwang static void
1481*22ce4affSfengbojiang vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1482*22ce4affSfengbojiang enum unlink_merge_type op)
1483a9643ea8Slogwang {
1484*22ce4affSfengbojiang vm_map_entry_t header, llist, rlist, root;
1485*22ce4affSfengbojiang vm_size_t max_free_left, max_free_right;
1486a9643ea8Slogwang
1487a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
1488*22ce4affSfengbojiang header = &map->header;
1489*22ce4affSfengbojiang root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1490*22ce4affSfengbojiang KASSERT(root != NULL,
1491*22ce4affSfengbojiang ("vm_map_entry_unlink: unlink object not mapped"));
1492a9643ea8Slogwang
1493*22ce4affSfengbojiang vm_map_splay_findprev(root, &llist);
1494*22ce4affSfengbojiang vm_map_splay_findnext(root, &rlist);
1495*22ce4affSfengbojiang if (op == UNLINK_MERGE_NEXT) {
1496*22ce4affSfengbojiang rlist->start = root->start;
1497*22ce4affSfengbojiang rlist->offset = root->offset;
1498*22ce4affSfengbojiang }
1499*22ce4affSfengbojiang if (llist != header) {
1500*22ce4affSfengbojiang root = llist;
1501*22ce4affSfengbojiang llist = root->right;
1502*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_left(header, root, llist);
1503*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1504*22ce4affSfengbojiang } else if (rlist != header) {
1505*22ce4affSfengbojiang root = rlist;
1506*22ce4affSfengbojiang rlist = root->left;
1507*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_pred(header, root, llist);
1508*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_right(header, root, rlist);
1509*22ce4affSfengbojiang } else {
1510*22ce4affSfengbojiang header->left = header->right = header;
1511*22ce4affSfengbojiang root = NULL;
1512*22ce4affSfengbojiang }
1513*22ce4affSfengbojiang if (root != NULL)
1514*22ce4affSfengbojiang root->max_free = vm_size_max(max_free_left, max_free_right);
1515*22ce4affSfengbojiang map->root = root;
1516*22ce4affSfengbojiang VM_MAP_ASSERT_CONSISTENT(map);
1517a9643ea8Slogwang map->nentries--;
1518a9643ea8Slogwang CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1519a9643ea8Slogwang map->nentries, entry);
1520a9643ea8Slogwang }
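/*
 * UNLINK_MERGE_NEXT example: removing an entry [4*PAGE_SIZE, 8*PAGE_SIZE)
 * whose successor is [8*PAGE_SIZE, 12*PAGE_SIZE) leaves a single entry
 * covering [4*PAGE_SIZE, 12*PAGE_SIZE); the successor simply inherits the
 * removed entry's start and offset, which keeps object offsets consistent
 * when the two entries map adjacent ranges of the same object.
 */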
1521a9643ea8Slogwang
1522a9643ea8Slogwang /*
1523*22ce4affSfengbojiang * vm_map_entry_resize:
1524a9643ea8Slogwang *
1525*22ce4affSfengbojiang * Resize a vm_map_entry, recompute the amount of free space that
1526*22ce4affSfengbojiang * follows it and propagate that value up the tree.
1527a9643ea8Slogwang *
1528a9643ea8Slogwang * The map must be locked, and leaves it so.
1529a9643ea8Slogwang */
1530a9643ea8Slogwang static void
1531*22ce4affSfengbojiang vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1532a9643ea8Slogwang {
1533*22ce4affSfengbojiang vm_map_entry_t header, llist, rlist, root;
1534a9643ea8Slogwang
1535*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
1536*22ce4affSfengbojiang header = &map->header;
1537*22ce4affSfengbojiang root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1538*22ce4affSfengbojiang KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
1539*22ce4affSfengbojiang vm_map_splay_findnext(root, &rlist);
1540*22ce4affSfengbojiang entry->end += grow_amount;
1541*22ce4affSfengbojiang root->max_free = vm_size_max(
1542*22ce4affSfengbojiang vm_map_splay_merge_left(header, root, llist),
1543*22ce4affSfengbojiang vm_map_splay_merge_succ(header, root, rlist));
1544*22ce4affSfengbojiang map->root = root;
1545*22ce4affSfengbojiang VM_MAP_ASSERT_CONSISTENT(map);
1546*22ce4affSfengbojiang CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1547*22ce4affSfengbojiang __func__, map, map->nentries, entry);
1548a9643ea8Slogwang }
1549a9643ea8Slogwang
1550a9643ea8Slogwang /*
1551a9643ea8Slogwang * vm_map_lookup_entry: [ internal use only ]
1552a9643ea8Slogwang *
1553a9643ea8Slogwang * Finds the map entry containing (or
1554a9643ea8Slogwang * immediately preceding) the specified address
1555a9643ea8Slogwang * in the given map; the entry is returned
1556a9643ea8Slogwang * in the "entry" parameter. The boolean
1557a9643ea8Slogwang * result indicates whether the address is
1558a9643ea8Slogwang * actually contained in the map.
1559a9643ea8Slogwang */
1560a9643ea8Slogwang boolean_t
1561a9643ea8Slogwang vm_map_lookup_entry(
1562a9643ea8Slogwang vm_map_t map,
1563a9643ea8Slogwang vm_offset_t address,
1564a9643ea8Slogwang vm_map_entry_t *entry) /* OUT */
1565a9643ea8Slogwang {
1566*22ce4affSfengbojiang vm_map_entry_t cur, header, lbound, ubound;
1567a9643ea8Slogwang boolean_t locked;
1568a9643ea8Slogwang
1569a9643ea8Slogwang /*
1570a9643ea8Slogwang * If the map is empty, then the map entry immediately preceding
1571a9643ea8Slogwang * "address" is the map's header.
1572a9643ea8Slogwang */
1573*22ce4affSfengbojiang header = &map->header;
1574a9643ea8Slogwang cur = map->root;
1575*22ce4affSfengbojiang if (cur == NULL) {
1576*22ce4affSfengbojiang *entry = header;
1577*22ce4affSfengbojiang return (FALSE);
1578*22ce4affSfengbojiang }
1579*22ce4affSfengbojiang if (address >= cur->start && cur->end > address) {
1580a9643ea8Slogwang *entry = cur;
1581a9643ea8Slogwang return (TRUE);
1582*22ce4affSfengbojiang }
1583*22ce4affSfengbojiang if ((locked = vm_map_locked(map)) ||
1584a9643ea8Slogwang sx_try_upgrade(&map->lock)) {
1585a9643ea8Slogwang /*
1586a9643ea8Slogwang * Splay requires a write lock on the map. However, it only
1587a9643ea8Slogwang * restructures the binary search tree; it does not otherwise
1588a9643ea8Slogwang * change the map. Thus, the map's timestamp need not change
1589a9643ea8Slogwang * on a temporary upgrade.
1590a9643ea8Slogwang */
1591*22ce4affSfengbojiang cur = vm_map_splay(map, address);
1592*22ce4affSfengbojiang if (!locked) {
1593*22ce4affSfengbojiang VM_MAP_UNLOCK_CONSISTENT(map);
1594a9643ea8Slogwang sx_downgrade(&map->lock);
1595*22ce4affSfengbojiang }
1596a9643ea8Slogwang
1597a9643ea8Slogwang /*
1598a9643ea8Slogwang * If "address" is contained within a map entry, the new root
1599a9643ea8Slogwang * is that map entry. Otherwise, the new root is a map entry
1600a9643ea8Slogwang * immediately before or after "address".
1601a9643ea8Slogwang */
1602*22ce4affSfengbojiang if (address < cur->start) {
1603*22ce4affSfengbojiang *entry = header;
1604*22ce4affSfengbojiang return (FALSE);
1605*22ce4affSfengbojiang }
1606a9643ea8Slogwang *entry = cur;
1607*22ce4affSfengbojiang return (address < cur->end);
1608*22ce4affSfengbojiang }
1609a9643ea8Slogwang /*
1610a9643ea8Slogwang * Since the map is only locked for read access, perform a
1611a9643ea8Slogwang * standard binary search tree lookup for "address".
1612a9643ea8Slogwang */
1613*22ce4affSfengbojiang lbound = ubound = header;
1614a9643ea8Slogwang for (;;) {
1615a9643ea8Slogwang if (address < cur->start) {
1616*22ce4affSfengbojiang ubound = cur;
1617a9643ea8Slogwang cur = cur->left;
1618*22ce4affSfengbojiang if (cur == lbound)
1619*22ce4affSfengbojiang break;
1620*22ce4affSfengbojiang } else if (cur->end <= address) {
1621*22ce4affSfengbojiang lbound = cur;
1622*22ce4affSfengbojiang cur = cur->right;
1623*22ce4affSfengbojiang if (cur == ubound)
1624*22ce4affSfengbojiang break;
1625*22ce4affSfengbojiang } else {
1626a9643ea8Slogwang *entry = cur;
1627a9643ea8Slogwang return (TRUE);
1628a9643ea8Slogwang }
1629a9643ea8Slogwang }
1630*22ce4affSfengbojiang *entry = lbound;
1631a9643ea8Slogwang return (FALSE);
1632a9643ea8Slogwang }
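/*
 * Note on the read-locked loop above: because missing children are threaded
 * to an ancestor (see vm_map_splay()), descending left can only revisit a
 * node already recorded as the lower bound, and descending right can only
 * revisit the upper bound, so the "cur == lbound" and "cur == ubound" tests
 * play the role that NULL checks would play in an unthreaded tree.
 */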
1633a9643ea8Slogwang
1634a9643ea8Slogwang /*
1635a9643ea8Slogwang * vm_map_insert:
1636a9643ea8Slogwang *
1637a9643ea8Slogwang * Inserts the given whole VM object into the target
1638a9643ea8Slogwang * map at the specified address range. The object's
1639a9643ea8Slogwang * size should match that of the address range.
1640a9643ea8Slogwang *
1641a9643ea8Slogwang * Requires that the map be locked, and leaves it so.
1642a9643ea8Slogwang *
1643a9643ea8Slogwang * If object is non-NULL, ref count must be bumped by caller
1644a9643ea8Slogwang * prior to making call to account for the new entry.
1645a9643ea8Slogwang */
1646a9643ea8Slogwang int
1647a9643ea8Slogwang vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1648a9643ea8Slogwang vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1649a9643ea8Slogwang {
1650*22ce4affSfengbojiang vm_map_entry_t new_entry, next_entry, prev_entry;
1651a9643ea8Slogwang struct ucred *cred;
1652*22ce4affSfengbojiang vm_eflags_t protoeflags;
1653a9643ea8Slogwang vm_inherit_t inheritance;
1654*22ce4affSfengbojiang u_long bdry;
1655*22ce4affSfengbojiang u_int bidx;
1656a9643ea8Slogwang
1657a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
1658*22ce4affSfengbojiang KASSERT(object != kernel_object ||
1659a9643ea8Slogwang (cow & MAP_COPY_ON_WRITE) == 0,
1660*22ce4affSfengbojiang ("vm_map_insert: kernel object and COW"));
1661*22ce4affSfengbojiang KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
1662*22ce4affSfengbojiang (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
1663*22ce4affSfengbojiang ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
1664*22ce4affSfengbojiang object, cow));
1665*22ce4affSfengbojiang KASSERT((prot & ~max) == 0,
1666*22ce4affSfengbojiang ("prot %#x is not subset of max_prot %#x", prot, max));
1667a9643ea8Slogwang
1668a9643ea8Slogwang /*
1669a9643ea8Slogwang * Check that the start and end points are not bogus.
1670a9643ea8Slogwang */
1671*22ce4affSfengbojiang if (start == end || !vm_map_range_valid(map, start, end))
1672a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
1673a9643ea8Slogwang
1674*22ce4affSfengbojiang if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
1675*22ce4affSfengbojiang VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
1676*22ce4affSfengbojiang return (KERN_PROTECTION_FAILURE);
1677*22ce4affSfengbojiang
1678a9643ea8Slogwang /*
1679a9643ea8Slogwang * Find the entry prior to the proposed starting address; if it's part
1680a9643ea8Slogwang * of an existing entry, this range is bogus.
1681a9643ea8Slogwang */
1682*22ce4affSfengbojiang if (vm_map_lookup_entry(map, start, &prev_entry))
1683a9643ea8Slogwang return (KERN_NO_SPACE);
1684a9643ea8Slogwang
1685a9643ea8Slogwang /*
1686a9643ea8Slogwang * Assert that the next entry doesn't overlap the end point.
1687a9643ea8Slogwang */
1688*22ce4affSfengbojiang next_entry = vm_map_entry_succ(prev_entry);
1689*22ce4affSfengbojiang if (next_entry->start < end)
1690a9643ea8Slogwang return (KERN_NO_SPACE);
1691a9643ea8Slogwang
1692*22ce4affSfengbojiang if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1693*22ce4affSfengbojiang max != VM_PROT_NONE))
1694*22ce4affSfengbojiang return (KERN_INVALID_ARGUMENT);
1695*22ce4affSfengbojiang
1696a9643ea8Slogwang protoeflags = 0;
1697a9643ea8Slogwang if (cow & MAP_COPY_ON_WRITE)
1698a9643ea8Slogwang protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1699a9643ea8Slogwang if (cow & MAP_NOFAULT)
1700a9643ea8Slogwang protoeflags |= MAP_ENTRY_NOFAULT;
1701a9643ea8Slogwang if (cow & MAP_DISABLE_SYNCER)
1702a9643ea8Slogwang protoeflags |= MAP_ENTRY_NOSYNC;
1703a9643ea8Slogwang if (cow & MAP_DISABLE_COREDUMP)
1704a9643ea8Slogwang protoeflags |= MAP_ENTRY_NOCOREDUMP;
1705a9643ea8Slogwang if (cow & MAP_STACK_GROWS_DOWN)
1706a9643ea8Slogwang protoeflags |= MAP_ENTRY_GROWS_DOWN;
1707a9643ea8Slogwang if (cow & MAP_STACK_GROWS_UP)
1708a9643ea8Slogwang protoeflags |= MAP_ENTRY_GROWS_UP;
1709*22ce4affSfengbojiang if (cow & MAP_WRITECOUNT)
1710*22ce4affSfengbojiang protoeflags |= MAP_ENTRY_WRITECNT;
1711*22ce4affSfengbojiang if (cow & MAP_VN_EXEC)
1712*22ce4affSfengbojiang protoeflags |= MAP_ENTRY_VN_EXEC;
1713*22ce4affSfengbojiang if ((cow & MAP_CREATE_GUARD) != 0)
1714*22ce4affSfengbojiang protoeflags |= MAP_ENTRY_GUARD;
1715*22ce4affSfengbojiang if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1716*22ce4affSfengbojiang protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1717*22ce4affSfengbojiang if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1718*22ce4affSfengbojiang protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1719a9643ea8Slogwang if (cow & MAP_INHERIT_SHARE)
1720a9643ea8Slogwang inheritance = VM_INHERIT_SHARE;
1721a9643ea8Slogwang else
1722a9643ea8Slogwang inheritance = VM_INHERIT_DEFAULT;
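	/*
	 * Illustrative example for the boundary handling below: assuming
	 * MAXPAGESIZES > 1 and pagesizes[1] == 2MB, a caller that encodes
	 * bidx == 1 in MAP_SPLIT_BOUNDARY_MASK gets bdry == 0x1fffff, so both
	 * start and end must be 2MB-aligned or the insertion fails with
	 * KERN_INVALID_ARGUMENT.
	 */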
1723*22ce4affSfengbojiang if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
1724*22ce4affSfengbojiang /* This magically ignores index 0, for usual page size. */
1725*22ce4affSfengbojiang bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
1726*22ce4affSfengbojiang MAP_SPLIT_BOUNDARY_SHIFT;
1727*22ce4affSfengbojiang if (bidx >= MAXPAGESIZES)
1728*22ce4affSfengbojiang return (KERN_INVALID_ARGUMENT);
1729*22ce4affSfengbojiang bdry = pagesizes[bidx] - 1;
1730*22ce4affSfengbojiang if ((start & bdry) != 0 || (end & bdry) != 0)
1731*22ce4affSfengbojiang return (KERN_INVALID_ARGUMENT);
1732*22ce4affSfengbojiang protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
1733*22ce4affSfengbojiang }
1734a9643ea8Slogwang
1735a9643ea8Slogwang cred = NULL;
1736*22ce4affSfengbojiang if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1737a9643ea8Slogwang goto charged;
1738a9643ea8Slogwang if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1739a9643ea8Slogwang ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1740a9643ea8Slogwang if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1741a9643ea8Slogwang return (KERN_RESOURCE_SHORTAGE);
1742*22ce4affSfengbojiang KASSERT(object == NULL ||
1743*22ce4affSfengbojiang (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1744a9643ea8Slogwang object->cred == NULL,
1745*22ce4affSfengbojiang ("overcommit: vm_map_insert o %p", object));
1746a9643ea8Slogwang cred = curthread->td_ucred;
1747a9643ea8Slogwang }
1748a9643ea8Slogwang
1749a9643ea8Slogwang charged:
1750a9643ea8Slogwang /* Expand the kernel pmap, if necessary. */
1751a9643ea8Slogwang if (map == kernel_map && end > kernel_vm_end)
1752a9643ea8Slogwang pmap_growkernel(end);
1753a9643ea8Slogwang if (object != NULL) {
1754a9643ea8Slogwang /*
1755a9643ea8Slogwang * OBJ_ONEMAPPING must be cleared unless this mapping
1756a9643ea8Slogwang * is trivially proven to be the only mapping for any
1757a9643ea8Slogwang * of the object's pages. (Object granularity
1758a9643ea8Slogwang * reference counting is insufficient to recognize
1759a9643ea8Slogwang * aliases with precision.)
1760a9643ea8Slogwang */
1761*22ce4affSfengbojiang if ((object->flags & OBJ_ANON) != 0) {
1762a9643ea8Slogwang VM_OBJECT_WLOCK(object);
1763a9643ea8Slogwang if (object->ref_count > 1 || object->shadow_count != 0)
1764a9643ea8Slogwang vm_object_clear_flag(object, OBJ_ONEMAPPING);
1765a9643ea8Slogwang VM_OBJECT_WUNLOCK(object);
1766a9643ea8Slogwang }
1767*22ce4affSfengbojiang } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1768*22ce4affSfengbojiang protoeflags &&
1769*22ce4affSfengbojiang (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
1770*22ce4affSfengbojiang MAP_VN_EXEC)) == 0 &&
1771*22ce4affSfengbojiang prev_entry->end == start && (prev_entry->cred == cred ||
1772a9643ea8Slogwang (prev_entry->object.vm_object != NULL &&
1773*22ce4affSfengbojiang prev_entry->object.vm_object->cred == cred)) &&
1774a9643ea8Slogwang vm_object_coalesce(prev_entry->object.vm_object,
1775a9643ea8Slogwang prev_entry->offset,
1776a9643ea8Slogwang (vm_size_t)(prev_entry->end - prev_entry->start),
1777a9643ea8Slogwang (vm_size_t)(end - prev_entry->end), cred != NULL &&
1778a9643ea8Slogwang (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1779a9643ea8Slogwang /*
1780a9643ea8Slogwang * We were able to extend the object. Determine if we
1781a9643ea8Slogwang * can extend the previous map entry to include the
1782a9643ea8Slogwang * new range as well.
1783a9643ea8Slogwang */
1784*22ce4affSfengbojiang if (prev_entry->inheritance == inheritance &&
1785*22ce4affSfengbojiang prev_entry->protection == prot &&
1786*22ce4affSfengbojiang prev_entry->max_protection == max &&
1787*22ce4affSfengbojiang prev_entry->wired_count == 0) {
1788*22ce4affSfengbojiang KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1789*22ce4affSfengbojiang 0, ("prev_entry %p has incoherent wiring",
1790*22ce4affSfengbojiang prev_entry));
1791*22ce4affSfengbojiang if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1792*22ce4affSfengbojiang map->size += end - prev_entry->end;
1793*22ce4affSfengbojiang vm_map_entry_resize(map, prev_entry,
1794*22ce4affSfengbojiang end - prev_entry->end);
1795*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, next_entry);
1796a9643ea8Slogwang return (KERN_SUCCESS);
1797a9643ea8Slogwang }
1798a9643ea8Slogwang
1799a9643ea8Slogwang /*
1800a9643ea8Slogwang * If we can extend the object but cannot extend the
1801a9643ea8Slogwang * map entry, we have to create a new map entry. We
1802a9643ea8Slogwang * must bump the ref count on the extended object to
1803a9643ea8Slogwang * account for it. object may be NULL.
1804a9643ea8Slogwang */
1805a9643ea8Slogwang object = prev_entry->object.vm_object;
1806a9643ea8Slogwang offset = prev_entry->offset +
1807a9643ea8Slogwang (prev_entry->end - prev_entry->start);
1808a9643ea8Slogwang vm_object_reference(object);
1809a9643ea8Slogwang if (cred != NULL && object != NULL && object->cred != NULL &&
1810a9643ea8Slogwang !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1811a9643ea8Slogwang /* Object already accounts for this uid. */
1812a9643ea8Slogwang cred = NULL;
1813a9643ea8Slogwang }
1814a9643ea8Slogwang }
1815a9643ea8Slogwang if (cred != NULL)
1816a9643ea8Slogwang crhold(cred);
1817a9643ea8Slogwang
1818a9643ea8Slogwang /*
1819a9643ea8Slogwang * Create a new entry
1820a9643ea8Slogwang */
1821a9643ea8Slogwang new_entry = vm_map_entry_create(map);
1822a9643ea8Slogwang new_entry->start = start;
1823a9643ea8Slogwang new_entry->end = end;
1824a9643ea8Slogwang new_entry->cred = NULL;
1825a9643ea8Slogwang
1826a9643ea8Slogwang new_entry->eflags = protoeflags;
1827a9643ea8Slogwang new_entry->object.vm_object = object;
1828a9643ea8Slogwang new_entry->offset = offset;
1829a9643ea8Slogwang
1830a9643ea8Slogwang new_entry->inheritance = inheritance;
1831a9643ea8Slogwang new_entry->protection = prot;
1832a9643ea8Slogwang new_entry->max_protection = max;
1833a9643ea8Slogwang new_entry->wired_count = 0;
1834a9643ea8Slogwang new_entry->wiring_thread = NULL;
1835a9643ea8Slogwang new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1836a9643ea8Slogwang new_entry->next_read = start;
1837a9643ea8Slogwang
1838a9643ea8Slogwang KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1839*22ce4affSfengbojiang ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1840a9643ea8Slogwang new_entry->cred = cred;
1841a9643ea8Slogwang
1842a9643ea8Slogwang /*
1843a9643ea8Slogwang * Insert the new entry into the list
1844a9643ea8Slogwang */
1845*22ce4affSfengbojiang vm_map_entry_link(map, new_entry);
1846*22ce4affSfengbojiang if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1847a9643ea8Slogwang map->size += new_entry->end - new_entry->start;
1848a9643ea8Slogwang
1849a9643ea8Slogwang /*
1850a9643ea8Slogwang * Try to coalesce the new entry with both the previous and next
1851a9643ea8Slogwang * entries in the list. Previously, we only attempted to coalesce
1852a9643ea8Slogwang * with the previous entry when object is NULL. Here, we handle the
1853a9643ea8Slogwang * other cases, which are less common.
1854a9643ea8Slogwang */
1855*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, new_entry);
1856*22ce4affSfengbojiang vm_map_try_merge_entries(map, new_entry, next_entry);
1857a9643ea8Slogwang
1858*22ce4affSfengbojiang if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1859*22ce4affSfengbojiang vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1860*22ce4affSfengbojiang end - start, cow & MAP_PREFAULT_PARTIAL);
1861a9643ea8Slogwang }
1862a9643ea8Slogwang
1863a9643ea8Slogwang return (KERN_SUCCESS);
1864a9643ea8Slogwang }
1865a9643ea8Slogwang
1866a9643ea8Slogwang /*
1867a9643ea8Slogwang * vm_map_findspace:
1868a9643ea8Slogwang *
1869a9643ea8Slogwang * Find the first fit (lowest VM address) for "length" free bytes
1870a9643ea8Slogwang * beginning at address >= start in the given map.
1871a9643ea8Slogwang *
1872*22ce4affSfengbojiang * In a vm_map_entry, "max_free" is the maximum amount of
1873*22ce4affSfengbojiang * contiguous free space between an entry in its subtree and a
1874*22ce4affSfengbojiang * neighbor of that entry. This allows finding a free region in
1875*22ce4affSfengbojiang * one path down the tree, so O(log n) amortized with splay
1876*22ce4affSfengbojiang * trees.
1877a9643ea8Slogwang *
1878a9643ea8Slogwang * The map must be locked, and leaves it so.
1879a9643ea8Slogwang *
1880*22ce4affSfengbojiang * Returns: starting address if sufficient space,
1881*22ce4affSfengbojiang * vm_map_max(map)-length+1 if insufficient space.
1882a9643ea8Slogwang */
1883*22ce4affSfengbojiang vm_offset_t
1884*22ce4affSfengbojiang vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1885a9643ea8Slogwang {
1886*22ce4affSfengbojiang vm_map_entry_t header, llist, rlist, root, y;
1887*22ce4affSfengbojiang vm_size_t left_length, max_free_left, max_free_right;
1888*22ce4affSfengbojiang vm_offset_t gap_end;
1889*22ce4affSfengbojiang
1890*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
1891a9643ea8Slogwang
1892a9643ea8Slogwang /*
1893a9643ea8Slogwang * Request must fit within min/max VM address and must avoid
1894a9643ea8Slogwang * address wrap.
1895a9643ea8Slogwang */
1896*22ce4affSfengbojiang start = MAX(start, vm_map_min(map));
1897*22ce4affSfengbojiang if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
1898*22ce4affSfengbojiang return (vm_map_max(map) - length + 1);
1899a9643ea8Slogwang
1900a9643ea8Slogwang /* Empty tree means wide open address space. */
1901*22ce4affSfengbojiang if (map->root == NULL)
1902*22ce4affSfengbojiang return (start);
1903a9643ea8Slogwang
1904a9643ea8Slogwang /*
1905*22ce4affSfengbojiang * After splay_split, if start is within an entry, push it to the start
1906*22ce4affSfengbojiang * of the following gap. If rlist is at the end of the gap containing
1907*22ce4affSfengbojiang * start, save the end of that gap in gap_end to see if the gap is big
1908*22ce4affSfengbojiang  * enough; otherwise set gap_end to start to skip gap-checking and move
1909*22ce4affSfengbojiang * directly to a search of the right subtree.
1910a9643ea8Slogwang */
1911*22ce4affSfengbojiang header = &map->header;
1912*22ce4affSfengbojiang root = vm_map_splay_split(map, start, length, &llist, &rlist);
1913*22ce4affSfengbojiang gap_end = rlist->start;
1914*22ce4affSfengbojiang if (root != NULL) {
1915*22ce4affSfengbojiang start = root->end;
1916*22ce4affSfengbojiang if (root->right != rlist)
1917*22ce4affSfengbojiang gap_end = start;
1918*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_left(header, root, llist);
1919*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_right(header, root, rlist);
1920*22ce4affSfengbojiang } else if (rlist != header) {
1921*22ce4affSfengbojiang root = rlist;
1922*22ce4affSfengbojiang rlist = root->left;
1923*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_pred(header, root, llist);
1924*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_right(header, root, rlist);
1925*22ce4affSfengbojiang } else {
1926*22ce4affSfengbojiang root = llist;
1927*22ce4affSfengbojiang llist = root->right;
1928*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_left(header, root, llist);
1929*22ce4affSfengbojiang max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1930a9643ea8Slogwang }
1931*22ce4affSfengbojiang root->max_free = vm_size_max(max_free_left, max_free_right);
1932*22ce4affSfengbojiang map->root = root;
1933*22ce4affSfengbojiang VM_MAP_ASSERT_CONSISTENT(map);
1934*22ce4affSfengbojiang if (length <= gap_end - start)
1935*22ce4affSfengbojiang return (start);
1936a9643ea8Slogwang
1937a9643ea8Slogwang /* With max_free, can immediately tell if no solution. */
1938*22ce4affSfengbojiang if (root->right == header || length > root->right->max_free)
1939*22ce4affSfengbojiang return (vm_map_max(map) - length + 1);
1940a9643ea8Slogwang
1941a9643ea8Slogwang /*
1942*22ce4affSfengbojiang * Splay for the least large-enough gap in the right subtree.
1943a9643ea8Slogwang */
1944*22ce4affSfengbojiang llist = rlist = header;
1945*22ce4affSfengbojiang for (left_length = 0;;
1946*22ce4affSfengbojiang left_length = vm_map_entry_max_free_left(root, llist)) {
1947*22ce4affSfengbojiang if (length <= left_length)
1948*22ce4affSfengbojiang SPLAY_LEFT_STEP(root, y, llist, rlist,
1949*22ce4affSfengbojiang length <= vm_map_entry_max_free_left(y, llist));
1950*22ce4affSfengbojiang else
1951*22ce4affSfengbojiang SPLAY_RIGHT_STEP(root, y, llist, rlist,
1952*22ce4affSfengbojiang length > vm_map_entry_max_free_left(y, root));
1953*22ce4affSfengbojiang if (root == NULL)
1954*22ce4affSfengbojiang break;
1955a9643ea8Slogwang }
1956*22ce4affSfengbojiang root = llist;
1957*22ce4affSfengbojiang llist = root->right;
1958*22ce4affSfengbojiang max_free_left = vm_map_splay_merge_left(header, root, llist);
1959*22ce4affSfengbojiang if (rlist == header) {
1960*22ce4affSfengbojiang root->max_free = vm_size_max(max_free_left,
1961*22ce4affSfengbojiang vm_map_splay_merge_succ(header, root, rlist));
1962*22ce4affSfengbojiang } else {
1963*22ce4affSfengbojiang y = rlist;
1964*22ce4affSfengbojiang rlist = y->left;
1965*22ce4affSfengbojiang y->max_free = vm_size_max(
1966*22ce4affSfengbojiang vm_map_splay_merge_pred(root, y, root),
1967*22ce4affSfengbojiang vm_map_splay_merge_right(header, y, rlist));
1968*22ce4affSfengbojiang root->max_free = vm_size_max(max_free_left, y->max_free);
1969*22ce4affSfengbojiang }
1970*22ce4affSfengbojiang map->root = root;
1971*22ce4affSfengbojiang VM_MAP_ASSERT_CONSISTENT(map);
1972*22ce4affSfengbojiang return (root->end);
1973a9643ea8Slogwang }
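/*
 * Callers typically consume the sentinel return value like this (see
 * vm_map_find_aligned() below):
 *
 *	*addr = vm_map_findspace(map, *addr, length);
 *	if (*addr + length > vm_map_max(map) ||
 *	    (max_addr != 0 && *addr + length > max_addr))
 *		return (KERN_NO_SPACE);
 *
 * since a return value of vm_map_max(map) - length + 1 always fails that
 * test.
 */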
1974a9643ea8Slogwang
1975a9643ea8Slogwang int
1976a9643ea8Slogwang vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1977a9643ea8Slogwang vm_offset_t start, vm_size_t length, vm_prot_t prot,
1978a9643ea8Slogwang vm_prot_t max, int cow)
1979a9643ea8Slogwang {
1980a9643ea8Slogwang vm_offset_t end;
1981a9643ea8Slogwang int result;
1982a9643ea8Slogwang
1983a9643ea8Slogwang end = start + length;
1984a9643ea8Slogwang KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1985a9643ea8Slogwang object == NULL,
1986a9643ea8Slogwang ("vm_map_fixed: non-NULL backing object for stack"));
1987a9643ea8Slogwang vm_map_lock(map);
1988a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
1989*22ce4affSfengbojiang if ((cow & MAP_CHECK_EXCL) == 0) {
1990*22ce4affSfengbojiang result = vm_map_delete(map, start, end);
1991*22ce4affSfengbojiang if (result != KERN_SUCCESS)
1992*22ce4affSfengbojiang goto out;
1993*22ce4affSfengbojiang }
1994a9643ea8Slogwang if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1995a9643ea8Slogwang result = vm_map_stack_locked(map, start, length, sgrowsiz,
1996a9643ea8Slogwang prot, max, cow);
1997a9643ea8Slogwang } else {
1998a9643ea8Slogwang result = vm_map_insert(map, object, offset, start, end,
1999a9643ea8Slogwang prot, max, cow);
2000a9643ea8Slogwang }
2001*22ce4affSfengbojiang out:
2002a9643ea8Slogwang vm_map_unlock(map);
2003a9643ea8Slogwang return (result);
2004a9643ea8Slogwang }
2005a9643ea8Slogwang
2006*22ce4affSfengbojiang static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
2007*22ce4affSfengbojiang static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
2008*22ce4affSfengbojiang
2009*22ce4affSfengbojiang static int cluster_anon = 1;
2010*22ce4affSfengbojiang SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
2011*22ce4affSfengbojiang &cluster_anon, 0,
2012*22ce4affSfengbojiang "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
2013*22ce4affSfengbojiang
2014*22ce4affSfengbojiang static bool
2015*22ce4affSfengbojiang clustering_anon_allowed(vm_offset_t addr)
2016*22ce4affSfengbojiang {
2017*22ce4affSfengbojiang
2018*22ce4affSfengbojiang switch (cluster_anon) {
2019*22ce4affSfengbojiang case 0:
2020*22ce4affSfengbojiang return (false);
2021*22ce4affSfengbojiang case 1:
2022*22ce4affSfengbojiang return (addr == 0);
2023*22ce4affSfengbojiang case 2:
2024*22ce4affSfengbojiang default:
2025*22ce4affSfengbojiang return (true);
2026*22ce4affSfengbojiang }
2027*22ce4affSfengbojiang }
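/*
 * The knob above is exposed as the vm.cluster_anon sysctl; for example,
 * "sysctl vm.cluster_anon=2" clusters anonymous mappings even when the caller
 * supplies a non-zero address hint, while 0 disables clustering entirely.
 */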
2028*22ce4affSfengbojiang
2029*22ce4affSfengbojiang static long aslr_restarts;
2030*22ce4affSfengbojiang SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
2031*22ce4affSfengbojiang &aslr_restarts, 0,
2032*22ce4affSfengbojiang "Number of aslr failures");
2033*22ce4affSfengbojiang
2034*22ce4affSfengbojiang /*
2035*22ce4affSfengbojiang * Searches for the specified amount of free space in the given map with the
2036*22ce4affSfengbojiang * specified alignment. Performs an address-ordered, first-fit search from
2037*22ce4affSfengbojiang * the given address "*addr", with an optional upper bound "max_addr". If the
2038*22ce4affSfengbojiang * parameter "alignment" is zero, then the alignment is computed from the
2039*22ce4affSfengbojiang * given (object, offset) pair so as to enable the greatest possible use of
2040*22ce4affSfengbojiang * superpage mappings. Returns KERN_SUCCESS and the address of the free space
2041*22ce4affSfengbojiang * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE.
2042*22ce4affSfengbojiang *
2043*22ce4affSfengbojiang * The map must be locked. Initially, there must be at least "length" bytes
2044*22ce4affSfengbojiang * of free space at the given address.
2045*22ce4affSfengbojiang */
2046*22ce4affSfengbojiang static int
2047*22ce4affSfengbojiang vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2048*22ce4affSfengbojiang vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
2049*22ce4affSfengbojiang vm_offset_t alignment)
2050*22ce4affSfengbojiang {
2051*22ce4affSfengbojiang vm_offset_t aligned_addr, free_addr;
2052*22ce4affSfengbojiang
2053*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
2054*22ce4affSfengbojiang free_addr = *addr;
2055*22ce4affSfengbojiang KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
2056*22ce4affSfengbojiang ("caller failed to provide space %#jx at address %p",
2057*22ce4affSfengbojiang (uintmax_t)length, (void *)free_addr));
2058*22ce4affSfengbojiang for (;;) {
2059*22ce4affSfengbojiang /*
2060*22ce4affSfengbojiang * At the start of every iteration, the free space at address
2061*22ce4affSfengbojiang * "*addr" is at least "length" bytes.
2062*22ce4affSfengbojiang */
2063*22ce4affSfengbojiang if (alignment == 0)
2064*22ce4affSfengbojiang pmap_align_superpage(object, offset, addr, length);
2065*22ce4affSfengbojiang else if ((*addr & (alignment - 1)) != 0) {
2066*22ce4affSfengbojiang *addr &= ~(alignment - 1);
2067*22ce4affSfengbojiang *addr += alignment;
2068*22ce4affSfengbojiang }
2069*22ce4affSfengbojiang aligned_addr = *addr;
2070*22ce4affSfengbojiang if (aligned_addr == free_addr) {
2071*22ce4affSfengbojiang /*
2072*22ce4affSfengbojiang * Alignment did not change "*addr", so "*addr" must
2073*22ce4affSfengbojiang * still provide sufficient free space.
2074*22ce4affSfengbojiang */
2075*22ce4affSfengbojiang return (KERN_SUCCESS);
2076*22ce4affSfengbojiang }
2077*22ce4affSfengbojiang
2078*22ce4affSfengbojiang /*
2079*22ce4affSfengbojiang * Test for address wrap on "*addr". A wrapped "*addr" could
2080*22ce4affSfengbojiang * be a valid address, in which case vm_map_findspace() cannot
2081*22ce4affSfengbojiang * be relied upon to fail.
2082*22ce4affSfengbojiang */
2083*22ce4affSfengbojiang if (aligned_addr < free_addr)
2084*22ce4affSfengbojiang return (KERN_NO_SPACE);
2085*22ce4affSfengbojiang *addr = vm_map_findspace(map, aligned_addr, length);
2086*22ce4affSfengbojiang if (*addr + length > vm_map_max(map) ||
2087*22ce4affSfengbojiang (max_addr != 0 && *addr + length > max_addr))
2088*22ce4affSfengbojiang return (KERN_NO_SPACE);
2089*22ce4affSfengbojiang free_addr = *addr;
2090*22ce4affSfengbojiang if (free_addr == aligned_addr) {
2091*22ce4affSfengbojiang /*
2092*22ce4affSfengbojiang * If a successful call to vm_map_findspace() did not
2093*22ce4affSfengbojiang * change "*addr", then "*addr" must still be aligned
2094*22ce4affSfengbojiang * and provide sufficient free space.
2095*22ce4affSfengbojiang */
2096*22ce4affSfengbojiang return (KERN_SUCCESS);
2097*22ce4affSfengbojiang }
2098*22ce4affSfengbojiang }
2099*22ce4affSfengbojiang }
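/*
 * The alignment step near the top of the loop above rounds an unaligned hint
 * up to the next boundary: for example, *addr == 0x12345 with
 * alignment == 0x1000 becomes 0x13000, while an already-aligned *addr is left
 * untouched because the "else if" guard skips the adjustment.
 */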
2100*22ce4affSfengbojiang
2101*22ce4affSfengbojiang int
2102*22ce4affSfengbojiang vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
2103*22ce4affSfengbojiang vm_offset_t max_addr, vm_offset_t alignment)
2104*22ce4affSfengbojiang {
2105*22ce4affSfengbojiang /* XXXKIB ASLR eh ? */
2106*22ce4affSfengbojiang *addr = vm_map_findspace(map, *addr, length);
2107*22ce4affSfengbojiang if (*addr + length > vm_map_max(map) ||
2108*22ce4affSfengbojiang (max_addr != 0 && *addr + length > max_addr))
2109*22ce4affSfengbojiang return (KERN_NO_SPACE);
2110*22ce4affSfengbojiang return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
2111*22ce4affSfengbojiang alignment));
2112*22ce4affSfengbojiang }
2113*22ce4affSfengbojiang
2114a9643ea8Slogwang /*
2115a9643ea8Slogwang * vm_map_find finds an unallocated region in the target address
2116a9643ea8Slogwang * map with the given length. The search is defined to be
2117a9643ea8Slogwang * first-fit from the specified address; the region found is
2118a9643ea8Slogwang * returned in the same parameter.
2119a9643ea8Slogwang *
2120a9643ea8Slogwang * If object is non-NULL, ref count must be bumped by caller
2121a9643ea8Slogwang * prior to making call to account for the new entry.
2122a9643ea8Slogwang */
2123a9643ea8Slogwang int
2124a9643ea8Slogwang vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2125a9643ea8Slogwang vm_offset_t *addr, /* IN/OUT */
2126a9643ea8Slogwang vm_size_t length, vm_offset_t max_addr, int find_space,
2127a9643ea8Slogwang vm_prot_t prot, vm_prot_t max, int cow)
2128a9643ea8Slogwang {
2129*22ce4affSfengbojiang vm_offset_t alignment, curr_min_addr, min_addr;
2130*22ce4affSfengbojiang int gap, pidx, rv, try;
2131*22ce4affSfengbojiang bool cluster, en_aslr, update_anon;
2132a9643ea8Slogwang
2133a9643ea8Slogwang KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
2134a9643ea8Slogwang object == NULL,
2135a9643ea8Slogwang ("vm_map_find: non-NULL backing object for stack"));
2136*22ce4affSfengbojiang MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
2137*22ce4affSfengbojiang (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
2138a9643ea8Slogwang if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
2139a9643ea8Slogwang (object->flags & OBJ_COLORED) == 0))
2140a9643ea8Slogwang find_space = VMFS_ANY_SPACE;
2141a9643ea8Slogwang if (find_space >> 8 != 0) {
2142a9643ea8Slogwang KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
2143a9643ea8Slogwang alignment = (vm_offset_t)1 << (find_space >> 8);
2144a9643ea8Slogwang } else
2145a9643ea8Slogwang alignment = 0;
2146*22ce4affSfengbojiang en_aslr = (map->flags & MAP_ASLR) != 0;
2147*22ce4affSfengbojiang update_anon = cluster = clustering_anon_allowed(*addr) &&
2148*22ce4affSfengbojiang (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
2149*22ce4affSfengbojiang find_space != VMFS_NO_SPACE && object == NULL &&
2150*22ce4affSfengbojiang (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
2151*22ce4affSfengbojiang MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
2152*22ce4affSfengbojiang curr_min_addr = min_addr = *addr;
2153*22ce4affSfengbojiang if (en_aslr && min_addr == 0 && !cluster &&
2154*22ce4affSfengbojiang find_space != VMFS_NO_SPACE &&
2155*22ce4affSfengbojiang (map->flags & MAP_ASLR_IGNSTART) != 0)
2156*22ce4affSfengbojiang curr_min_addr = min_addr = vm_map_min(map);
2157*22ce4affSfengbojiang try = 0;
2158a9643ea8Slogwang vm_map_lock(map);
2159*22ce4affSfengbojiang if (cluster) {
2160*22ce4affSfengbojiang curr_min_addr = map->anon_loc;
2161*22ce4affSfengbojiang if (curr_min_addr == 0)
2162*22ce4affSfengbojiang cluster = false;
2163*22ce4affSfengbojiang }
2164a9643ea8Slogwang if (find_space != VMFS_NO_SPACE) {
2165*22ce4affSfengbojiang KASSERT(find_space == VMFS_ANY_SPACE ||
2166*22ce4affSfengbojiang find_space == VMFS_OPTIMAL_SPACE ||
2167*22ce4affSfengbojiang find_space == VMFS_SUPER_SPACE ||
2168*22ce4affSfengbojiang alignment != 0, ("unexpected VMFS flag"));
2169*22ce4affSfengbojiang again:
2170*22ce4affSfengbojiang /*
2171*22ce4affSfengbojiang * When creating an anonymous mapping, try clustering
2172*22ce4affSfengbojiang * with an existing anonymous mapping first.
2173*22ce4affSfengbojiang *
2174*22ce4affSfengbojiang * We make up to two attempts to find address space
2175*22ce4affSfengbojiang * for a given find_space value. The first attempt may
2176*22ce4affSfengbojiang * apply randomization or may cluster with an existing
2177*22ce4affSfengbojiang * anonymous mapping. If this first attempt fails,
2178*22ce4affSfengbojiang * perform a first-fit search of the available address
2179*22ce4affSfengbojiang * space.
2180*22ce4affSfengbojiang *
2181*22ce4affSfengbojiang * If all tries failed, and find_space is
2182*22ce4affSfengbojiang * VMFS_OPTIMAL_SPACE, fall back to VMFS_ANY_SPACE
2183*22ce4affSfengbojiang * and again enable clustering and randomization.
2184*22ce4affSfengbojiang */
2185*22ce4affSfengbojiang try++;
2186*22ce4affSfengbojiang MPASS(try <= 2);
2187*22ce4affSfengbojiang
2188*22ce4affSfengbojiang if (try == 2) {
2189*22ce4affSfengbojiang /*
2190*22ce4affSfengbojiang * Second try: we failed either to find a
2191*22ce4affSfengbojiang * suitable region for randomizing the
2192*22ce4affSfengbojiang * allocation, or to cluster with an existing
2193*22ce4affSfengbojiang * mapping. Retry with free run.
2194*22ce4affSfengbojiang */
2195*22ce4affSfengbojiang curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
2196*22ce4affSfengbojiang vm_map_min(map) : min_addr;
2197*22ce4affSfengbojiang atomic_add_long(&aslr_restarts, 1);
2198*22ce4affSfengbojiang }
2199*22ce4affSfengbojiang
2200*22ce4affSfengbojiang if (try == 1 && en_aslr && !cluster) {
2201*22ce4affSfengbojiang /*
2202*22ce4affSfengbojiang * Find space for allocation, including
2203*22ce4affSfengbojiang * gap needed for later randomization.
2204*22ce4affSfengbojiang */
2205*22ce4affSfengbojiang pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
2206*22ce4affSfengbojiang (find_space == VMFS_SUPER_SPACE || find_space ==
2207*22ce4affSfengbojiang VMFS_OPTIMAL_SPACE) ? 1 : 0;
2208*22ce4affSfengbojiang gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
2209*22ce4affSfengbojiang (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
2210*22ce4affSfengbojiang aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
2211*22ce4affSfengbojiang *addr = vm_map_findspace(map, curr_min_addr,
2212*22ce4affSfengbojiang length + gap * pagesizes[pidx]);
2213*22ce4affSfengbojiang if (*addr + length + gap * pagesizes[pidx] >
2214*22ce4affSfengbojiang vm_map_max(map))
2215*22ce4affSfengbojiang goto again;
2216*22ce4affSfengbojiang /* And randomize the start address. */
2217*22ce4affSfengbojiang *addr += (arc4random() % gap) * pagesizes[pidx];
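/*
 * Worked arithmetic (illustrative only, not from the original file): if
 * gap decoded to 0x1000 pages and pagesizes[pidx] were 4 KB, the line
 * above would displace the start address by up to 0xfff * 4096 bytes
 * (just under 16 MB), always in page-sized steps.
 */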
2218*22ce4affSfengbojiang if (max_addr != 0 && *addr + length > max_addr)
2219*22ce4affSfengbojiang goto again;
2220*22ce4affSfengbojiang } else {
2221*22ce4affSfengbojiang *addr = vm_map_findspace(map, curr_min_addr, length);
2222*22ce4affSfengbojiang if (*addr + length > vm_map_max(map) ||
2223a9643ea8Slogwang (max_addr != 0 && *addr + length > max_addr)) {
2224*22ce4affSfengbojiang if (cluster) {
2225*22ce4affSfengbojiang cluster = false;
2226*22ce4affSfengbojiang MPASS(try == 1);
2227a9643ea8Slogwang goto again;
2228a9643ea8Slogwang }
2229*22ce4affSfengbojiang rv = KERN_NO_SPACE;
2230*22ce4affSfengbojiang goto done;
2231a9643ea8Slogwang }
2232a9643ea8Slogwang }
2233a9643ea8Slogwang
2234*22ce4affSfengbojiang if (find_space != VMFS_ANY_SPACE &&
2235*22ce4affSfengbojiang (rv = vm_map_alignspace(map, object, offset, addr, length,
2236*22ce4affSfengbojiang max_addr, alignment)) != KERN_SUCCESS) {
2237*22ce4affSfengbojiang if (find_space == VMFS_OPTIMAL_SPACE) {
2238*22ce4affSfengbojiang find_space = VMFS_ANY_SPACE;
2239*22ce4affSfengbojiang curr_min_addr = min_addr;
2240*22ce4affSfengbojiang cluster = update_anon;
2241*22ce4affSfengbojiang try = 0;
2242*22ce4affSfengbojiang goto again;
2243*22ce4affSfengbojiang }
2244*22ce4affSfengbojiang goto done;
2245*22ce4affSfengbojiang }
2246*22ce4affSfengbojiang } else if ((cow & MAP_REMAP) != 0) {
2247*22ce4affSfengbojiang if (!vm_map_range_valid(map, *addr, *addr + length)) {
2248*22ce4affSfengbojiang rv = KERN_INVALID_ADDRESS;
2249*22ce4affSfengbojiang goto done;
2250*22ce4affSfengbojiang }
2251*22ce4affSfengbojiang rv = vm_map_delete(map, *addr, *addr + length);
2252*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
2253*22ce4affSfengbojiang goto done;
2254a9643ea8Slogwang }
2255a9643ea8Slogwang if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
2256*22ce4affSfengbojiang rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
2257*22ce4affSfengbojiang max, cow);
2258a9643ea8Slogwang } else {
2259*22ce4affSfengbojiang rv = vm_map_insert(map, object, offset, *addr, *addr + length,
2260*22ce4affSfengbojiang prot, max, cow);
2261a9643ea8Slogwang }
2262*22ce4affSfengbojiang if (rv == KERN_SUCCESS && update_anon)
2263*22ce4affSfengbojiang map->anon_loc = *addr + length;
2264*22ce4affSfengbojiang done:
2265a9643ea8Slogwang vm_map_unlock(map);
2266*22ce4affSfengbojiang return (rv);
2267a9643ea8Slogwang }
2268a9643ea8Slogwang
2269a9643ea8Slogwang /*
2270*22ce4affSfengbojiang * vm_map_find_min() is a variant of vm_map_find() that takes an
2271*22ce4affSfengbojiang * additional parameter (min_addr) and treats the given address
2272*22ce4affSfengbojiang * (*addr) differently. Specifically, it treats *addr as a hint
2273*22ce4affSfengbojiang * and not as the minimum address where the mapping is created.
2274a9643ea8Slogwang *
2275*22ce4affSfengbojiang * This function works in two phases. First, it tries to
2276*22ce4affSfengbojiang * allocate above the hint. If that fails and the hint is
2277*22ce4affSfengbojiang * greater than min_addr, it performs a second pass, replacing
2278*22ce4affSfengbojiang * the hint with min_addr as the minimum address for the
2279*22ce4affSfengbojiang * allocation.
2280*22ce4affSfengbojiang */
2281*22ce4affSfengbojiang int
2282*22ce4affSfengbojiang vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2283*22ce4affSfengbojiang vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
2284*22ce4affSfengbojiang vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
2285*22ce4affSfengbojiang int cow)
2286*22ce4affSfengbojiang {
2287*22ce4affSfengbojiang vm_offset_t hint;
2288*22ce4affSfengbojiang int rv;
2289*22ce4affSfengbojiang
2290*22ce4affSfengbojiang hint = *addr;
2291*22ce4affSfengbojiang for (;;) {
2292*22ce4affSfengbojiang rv = vm_map_find(map, object, offset, addr, length, max_addr,
2293*22ce4affSfengbojiang find_space, prot, max, cow);
2294*22ce4affSfengbojiang if (rv == KERN_SUCCESS || min_addr >= hint)
2295*22ce4affSfengbojiang return (rv);
2296*22ce4affSfengbojiang *addr = hint = min_addr;
2297*22ce4affSfengbojiang }
2298*22ce4affSfengbojiang }
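/*
 * Illustrative usage sketch (assumption, not taken from this file): a
 * caller that wants to honor an address hint but still fall back to the
 * map minimum might invoke vm_map_find_min() roughly as follows; "hint",
 * "size", and "rv" are hypothetical local variables.
 *
 *	vm_offset_t addr = hint;
 *	int rv = vm_map_find_min(map, NULL, 0, &addr, size,
 *	    vm_map_min(map), 0, VMFS_OPTIMAL_SPACE,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 *
 * On KERN_SUCCESS, addr holds the start of the new anonymous entry; if
 * nothing fits above the hint, the retry loop above restarts the search
 * from vm_map_min().
 */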
2299*22ce4affSfengbojiang
2300*22ce4affSfengbojiang /*
2301*22ce4affSfengbojiang * A map entry with any of the following flags set must not be merged with
2302*22ce4affSfengbojiang * another entry.
2303*22ce4affSfengbojiang */
2304*22ce4affSfengbojiang #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
2305*22ce4affSfengbojiang MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC)
2306*22ce4affSfengbojiang
2307*22ce4affSfengbojiang static bool
2308*22ce4affSfengbojiang vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2309*22ce4affSfengbojiang {
2310*22ce4affSfengbojiang
2311*22ce4affSfengbojiang KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2312*22ce4affSfengbojiang (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2313*22ce4affSfengbojiang ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2314*22ce4affSfengbojiang prev, entry));
2315*22ce4affSfengbojiang return (prev->end == entry->start &&
2316*22ce4affSfengbojiang prev->object.vm_object == entry->object.vm_object &&
2317*22ce4affSfengbojiang (prev->object.vm_object == NULL ||
2318*22ce4affSfengbojiang prev->offset + (prev->end - prev->start) == entry->offset) &&
2319*22ce4affSfengbojiang prev->eflags == entry->eflags &&
2320*22ce4affSfengbojiang prev->protection == entry->protection &&
2321*22ce4affSfengbojiang prev->max_protection == entry->max_protection &&
2322*22ce4affSfengbojiang prev->inheritance == entry->inheritance &&
2323*22ce4affSfengbojiang prev->wired_count == entry->wired_count &&
2324*22ce4affSfengbojiang prev->cred == entry->cred);
2325*22ce4affSfengbojiang }
2326*22ce4affSfengbojiang
2327*22ce4affSfengbojiang static void
2328*22ce4affSfengbojiang vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2329*22ce4affSfengbojiang {
2330*22ce4affSfengbojiang
2331*22ce4affSfengbojiang /*
2332*22ce4affSfengbojiang * If the backing object is a vnode object, vm_object_deallocate()
2333*22ce4affSfengbojiang * calls vrele(). However, vrele() does not lock the vnode because
2334*22ce4affSfengbojiang * the vnode has additional references. Thus, the map lock can be
2335*22ce4affSfengbojiang * kept without causing a lock-order reversal with the vnode lock.
2336*22ce4affSfengbojiang *
2337*22ce4affSfengbojiang * Since we count the number of virtual page mappings in
2338*22ce4affSfengbojiang * object->un_pager.vnp.writemappings, the writemappings value
2339*22ce4affSfengbojiang * should not be adjusted when the entry is disposed of.
2340*22ce4affSfengbojiang */
2341*22ce4affSfengbojiang if (entry->object.vm_object != NULL)
2342*22ce4affSfengbojiang vm_object_deallocate(entry->object.vm_object);
2343*22ce4affSfengbojiang if (entry->cred != NULL)
2344*22ce4affSfengbojiang crfree(entry->cred);
2345*22ce4affSfengbojiang vm_map_entry_dispose(map, entry);
2346*22ce4affSfengbojiang }
2347*22ce4affSfengbojiang
2348*22ce4affSfengbojiang /*
2349*22ce4affSfengbojiang * vm_map_try_merge_entries:
2350*22ce4affSfengbojiang *
2351*22ce4affSfengbojiang * Compare the given map entry to its predecessor, and merge its predecessor
2352*22ce4affSfengbojiang * into it if possible. The entry remains valid, and may be extended.
2353*22ce4affSfengbojiang * The predecessor may be deleted.
2354a9643ea8Slogwang *
2355a9643ea8Slogwang * The map must be locked.
2356a9643ea8Slogwang */
2357a9643ea8Slogwang void
2358*22ce4affSfengbojiang vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
2359*22ce4affSfengbojiang vm_map_entry_t entry)
2360a9643ea8Slogwang {
2361a9643ea8Slogwang
2362*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
2363*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2364*22ce4affSfengbojiang vm_map_mergeable_neighbors(prev_entry, entry)) {
2365*22ce4affSfengbojiang vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
2366*22ce4affSfengbojiang vm_map_merged_neighbor_dispose(map, prev_entry);
2367*22ce4affSfengbojiang }
2368*22ce4affSfengbojiang }
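/*
 * Illustrative caller pattern (sketch only, mirroring the loops used
 * later in this file, e.g. in vm_map_protect()): after modifying a run
 * of entries, a caller walks the range with a trailing predecessor and
 * attempts a merge at every step, plus one final attempt past the loop.
 *
 *	for (prev = vm_map_entry_pred(first), entry = first;
 *	    entry->start < end;
 *	    vm_map_try_merge_entries(map, prev, entry),
 *	    prev = entry, entry = vm_map_entry_succ(entry)) {
 *		... modify entry ...
 *	}
 *	vm_map_try_merge_entries(map, prev, entry);
 */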
2369a9643ea8Slogwang
2370a9643ea8Slogwang /*
2371*22ce4affSfengbojiang * vm_map_entry_back:
2372a9643ea8Slogwang *
2373*22ce4affSfengbojiang * Allocate an object to back a map entry.
2374a9643ea8Slogwang */
2375*22ce4affSfengbojiang static inline void
2376*22ce4affSfengbojiang vm_map_entry_back(vm_map_entry_t entry)
2377*22ce4affSfengbojiang {
2378*22ce4affSfengbojiang vm_object_t object;
2379a9643ea8Slogwang
2380*22ce4affSfengbojiang KASSERT(entry->object.vm_object == NULL,
2381*22ce4affSfengbojiang ("map entry %p has backing object", entry));
2382*22ce4affSfengbojiang KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2383*22ce4affSfengbojiang ("map entry %p is a submap", entry));
2384*22ce4affSfengbojiang object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
2385*22ce4affSfengbojiang entry->cred, entry->end - entry->start);
2386*22ce4affSfengbojiang entry->object.vm_object = object;
2387*22ce4affSfengbojiang entry->offset = 0;
2388*22ce4affSfengbojiang entry->cred = NULL;
2389*22ce4affSfengbojiang }
2390a9643ea8Slogwang
2391a9643ea8Slogwang /*
2392*22ce4affSfengbojiang * vm_map_entry_charge_object
2393*22ce4affSfengbojiang *
2394*22ce4affSfengbojiang * If there is no object backing this entry, create one. Otherwise, if
2395*22ce4affSfengbojiang * the entry has cred, give it to the backing object.
2396a9643ea8Slogwang */
2397*22ce4affSfengbojiang static inline void
2398*22ce4affSfengbojiang vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2399*22ce4affSfengbojiang {
2400*22ce4affSfengbojiang
2401*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
2402*22ce4affSfengbojiang KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2403*22ce4affSfengbojiang ("map entry %p is a submap", entry));
2404*22ce4affSfengbojiang if (entry->object.vm_object == NULL && !map->system_map &&
2405*22ce4affSfengbojiang (entry->eflags & MAP_ENTRY_GUARD) == 0)
2406*22ce4affSfengbojiang vm_map_entry_back(entry);
2407*22ce4affSfengbojiang else if (entry->object.vm_object != NULL &&
2408*22ce4affSfengbojiang ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2409*22ce4affSfengbojiang entry->cred != NULL) {
2410*22ce4affSfengbojiang VM_OBJECT_WLOCK(entry->object.vm_object);
2411*22ce4affSfengbojiang KASSERT(entry->object.vm_object->cred == NULL,
2412*22ce4affSfengbojiang ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2413*22ce4affSfengbojiang entry->object.vm_object->cred = entry->cred;
2414*22ce4affSfengbojiang entry->object.vm_object->charge = entry->end - entry->start;
2415*22ce4affSfengbojiang VM_OBJECT_WUNLOCK(entry->object.vm_object);
2416*22ce4affSfengbojiang entry->cred = NULL;
2417a9643ea8Slogwang }
2418a9643ea8Slogwang }
2419*22ce4affSfengbojiang
2420*22ce4affSfengbojiang /*
2421*22ce4affSfengbojiang * vm_map_entry_clone
2422*22ce4affSfengbojiang *
2423*22ce4affSfengbojiang * Create a duplicate map entry for clipping.
2424*22ce4affSfengbojiang */
2425*22ce4affSfengbojiang static vm_map_entry_t
2426*22ce4affSfengbojiang vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
2427*22ce4affSfengbojiang {
2428*22ce4affSfengbojiang vm_map_entry_t new_entry;
2429*22ce4affSfengbojiang
2430*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
2431*22ce4affSfengbojiang
2432*22ce4affSfengbojiang /*
2433*22ce4affSfengbojiang * Create a backing object now, if none exists, so that more individual
2434*22ce4affSfengbojiang * objects won't be created after the map entry is split.
2435*22ce4affSfengbojiang */
2436*22ce4affSfengbojiang vm_map_entry_charge_object(map, entry);
2437*22ce4affSfengbojiang
2438*22ce4affSfengbojiang /* Clone the entry. */
2439*22ce4affSfengbojiang new_entry = vm_map_entry_create(map);
2440*22ce4affSfengbojiang *new_entry = *entry;
2441*22ce4affSfengbojiang if (new_entry->cred != NULL)
2442*22ce4affSfengbojiang crhold(entry->cred);
2443*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2444*22ce4affSfengbojiang vm_object_reference(new_entry->object.vm_object);
2445*22ce4affSfengbojiang vm_map_entry_set_vnode_text(new_entry, true);
2446*22ce4affSfengbojiang /*
2447*22ce4affSfengbojiang * The object->un_pager.vnp.writemappings for the object of
2448*22ce4affSfengbojiang * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The
2449*22ce4affSfengbojiang * virtual pages are re-distributed among the clipped entries,
2450*22ce4affSfengbojiang * so the sum is left the same.
2451*22ce4affSfengbojiang */
2452a9643ea8Slogwang }
2453*22ce4affSfengbojiang return (new_entry);
2454*22ce4affSfengbojiang }
2455*22ce4affSfengbojiang
2456a9643ea8Slogwang /*
2457a9643ea8Slogwang * vm_map_clip_start: [ internal use only ]
2458a9643ea8Slogwang *
2459a9643ea8Slogwang * Asserts that the given entry begins at or after
2460a9643ea8Slogwang * the specified address; if necessary,
2461a9643ea8Slogwang * it splits the entry into two.
2462a9643ea8Slogwang */
2463*22ce4affSfengbojiang static int
2464*22ce4affSfengbojiang vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
2465a9643ea8Slogwang {
2466a9643ea8Slogwang vm_map_entry_t new_entry;
2467*22ce4affSfengbojiang int bdry_idx;
2468*22ce4affSfengbojiang
2469*22ce4affSfengbojiang if (!map->system_map)
2470*22ce4affSfengbojiang WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2471*22ce4affSfengbojiang "%s: map %p entry %p start 0x%jx", __func__, map, entry,
2472*22ce4affSfengbojiang (uintmax_t)startaddr);
2473*22ce4affSfengbojiang
2474*22ce4affSfengbojiang if (startaddr <= entry->start)
2475*22ce4affSfengbojiang return (KERN_SUCCESS);
2476a9643ea8Slogwang
2477a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
2478*22ce4affSfengbojiang KASSERT(entry->end > startaddr && entry->start < startaddr,
2479*22ce4affSfengbojiang ("%s: invalid clip of entry %p", __func__, entry));
2480a9643ea8Slogwang
2481*22ce4affSfengbojiang bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
2482*22ce4affSfengbojiang MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
2483*22ce4affSfengbojiang if (bdry_idx != 0) {
2484*22ce4affSfengbojiang if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
2485*22ce4affSfengbojiang return (KERN_INVALID_ARGUMENT);
2486a9643ea8Slogwang }
2487a9643ea8Slogwang
2488*22ce4affSfengbojiang new_entry = vm_map_entry_clone(map, entry);
2489a9643ea8Slogwang
2490a9643ea8Slogwang /*
2491*22ce4affSfengbojiang * Split off the front portion. Insert the new entry BEFORE this one,
2492*22ce4affSfengbojiang * so that this entry has the specified starting address.
2493a9643ea8Slogwang */
2494*22ce4affSfengbojiang new_entry->end = startaddr;
2495*22ce4affSfengbojiang vm_map_entry_link(map, new_entry);
2496*22ce4affSfengbojiang return (KERN_SUCCESS);
2497a9643ea8Slogwang }
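/*
 * Worked example (illustrative, not from the original source): clipping
 * an entry that spans [0x10000, 0x30000) at startaddr 0x14000 links a
 * new entry covering [0x10000, 0x14000) before it and leaves this entry
 * as [0x14000, 0x30000), so callers can then operate on the clipped
 * entry from exactly the requested start address.
 */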
2498*22ce4affSfengbojiang
2499*22ce4affSfengbojiang /*
2500*22ce4affSfengbojiang * vm_map_lookup_clip_start:
2501*22ce4affSfengbojiang *
2502*22ce4affSfengbojiang * Find the entry at or just after 'start', and clip it if 'start' is in
2503*22ce4affSfengbojiang * the interior of the entry.  Return the entry after 'start', and set
2504*22ce4affSfengbojiang * *prev_entry to the entry before 'start'.
2505*22ce4affSfengbojiang */
2506*22ce4affSfengbojiang static int
2507*22ce4affSfengbojiang vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
2508*22ce4affSfengbojiang vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
2509*22ce4affSfengbojiang {
2510*22ce4affSfengbojiang vm_map_entry_t entry;
2511*22ce4affSfengbojiang int rv;
2512*22ce4affSfengbojiang
2513*22ce4affSfengbojiang if (!map->system_map)
2514*22ce4affSfengbojiang WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2515*22ce4affSfengbojiang "%s: map %p start 0x%jx prev %p", __func__, map,
2516*22ce4affSfengbojiang (uintmax_t)start, prev_entry);
2517*22ce4affSfengbojiang
2518*22ce4affSfengbojiang if (vm_map_lookup_entry(map, start, prev_entry)) {
2519*22ce4affSfengbojiang entry = *prev_entry;
2520*22ce4affSfengbojiang rv = vm_map_clip_start(map, entry, start);
2521*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
2522*22ce4affSfengbojiang return (rv);
2523*22ce4affSfengbojiang *prev_entry = vm_map_entry_pred(entry);
2524*22ce4affSfengbojiang } else
2525*22ce4affSfengbojiang entry = vm_map_entry_succ(*prev_entry);
2526*22ce4affSfengbojiang *res_entry = entry;
2527*22ce4affSfengbojiang return (KERN_SUCCESS);
2528a9643ea8Slogwang }
2529a9643ea8Slogwang
2530a9643ea8Slogwang /*
2531a9643ea8Slogwang * vm_map_clip_end: [ internal use only ]
2532a9643ea8Slogwang *
2533a9643ea8Slogwang * Asserts that the given entry ends at or before
2534a9643ea8Slogwang * the specified address; if necessary,
2535a9643ea8Slogwang * it splits the entry into two.
2536a9643ea8Slogwang */
2537*22ce4affSfengbojiang static int
2538*22ce4affSfengbojiang vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
2539a9643ea8Slogwang {
2540a9643ea8Slogwang vm_map_entry_t new_entry;
2541*22ce4affSfengbojiang int bdry_idx;
2542*22ce4affSfengbojiang
2543*22ce4affSfengbojiang if (!map->system_map)
2544*22ce4affSfengbojiang WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2545*22ce4affSfengbojiang "%s: map %p entry %p end 0x%jx", __func__, map, entry,
2546*22ce4affSfengbojiang (uintmax_t)endaddr);
2547*22ce4affSfengbojiang
2548*22ce4affSfengbojiang if (endaddr >= entry->end)
2549*22ce4affSfengbojiang return (KERN_SUCCESS);
2550a9643ea8Slogwang
2551a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
2552*22ce4affSfengbojiang KASSERT(entry->start < endaddr && entry->end > endaddr,
2553*22ce4affSfengbojiang ("%s: invalid clip of entry %p", __func__, entry));
2554*22ce4affSfengbojiang
2555*22ce4affSfengbojiang bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
2556*22ce4affSfengbojiang MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
2557*22ce4affSfengbojiang if (bdry_idx != 0) {
2558*22ce4affSfengbojiang if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
2559*22ce4affSfengbojiang return (KERN_INVALID_ARGUMENT);
2560*22ce4affSfengbojiang }
2561*22ce4affSfengbojiang
2562*22ce4affSfengbojiang new_entry = vm_map_entry_clone(map, entry);
2563a9643ea8Slogwang
2564a9643ea8Slogwang /*
2565*22ce4affSfengbojiang * Split off the back portion. Insert the new entry AFTER this one,
2566*22ce4affSfengbojiang * so that this entry has the specified ending address.
2567a9643ea8Slogwang */
2568*22ce4affSfengbojiang new_entry->start = endaddr;
2569*22ce4affSfengbojiang vm_map_entry_link(map, new_entry);
2570a9643ea8Slogwang
2571*22ce4affSfengbojiang return (KERN_SUCCESS);
2572a9643ea8Slogwang }
2573a9643ea8Slogwang
2574a9643ea8Slogwang /*
2575a9643ea8Slogwang * vm_map_submap: [ kernel use only ]
2576a9643ea8Slogwang *
2577a9643ea8Slogwang * Mark the given range as handled by a subordinate map.
2578a9643ea8Slogwang *
2579a9643ea8Slogwang * This range must have been created with vm_map_find,
2580a9643ea8Slogwang * and no other operations may have been performed on this
2581a9643ea8Slogwang * range prior to calling vm_map_submap.
2582a9643ea8Slogwang *
2583a9643ea8Slogwang * Only a limited number of operations can be performed
2584a9643ea8Slogwang * within this range after calling vm_map_submap:
2585a9643ea8Slogwang * vm_fault
2586a9643ea8Slogwang * [Don't try vm_map_copy!]
2587a9643ea8Slogwang *
2588a9643ea8Slogwang * To remove a submapping, one must first remove the
2589a9643ea8Slogwang * range from the superior map, and then destroy the
2590a9643ea8Slogwang * submap (if desired). [Better yet, don't try it.]
2591a9643ea8Slogwang */
2592a9643ea8Slogwang int
2593a9643ea8Slogwang vm_map_submap(
2594a9643ea8Slogwang vm_map_t map,
2595a9643ea8Slogwang vm_offset_t start,
2596a9643ea8Slogwang vm_offset_t end,
2597a9643ea8Slogwang vm_map_t submap)
2598a9643ea8Slogwang {
2599a9643ea8Slogwang vm_map_entry_t entry;
2600*22ce4affSfengbojiang int result;
2601*22ce4affSfengbojiang
2602*22ce4affSfengbojiang result = KERN_INVALID_ARGUMENT;
2603*22ce4affSfengbojiang
2604*22ce4affSfengbojiang vm_map_lock(submap);
2605*22ce4affSfengbojiang submap->flags |= MAP_IS_SUB_MAP;
2606*22ce4affSfengbojiang vm_map_unlock(submap);
2607a9643ea8Slogwang
2608a9643ea8Slogwang vm_map_lock(map);
2609a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
2610*22ce4affSfengbojiang if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
2611*22ce4affSfengbojiang (entry->eflags & MAP_ENTRY_COW) == 0 &&
2612*22ce4affSfengbojiang entry->object.vm_object == NULL) {
2613*22ce4affSfengbojiang result = vm_map_clip_start(map, entry, start);
2614*22ce4affSfengbojiang if (result != KERN_SUCCESS)
2615*22ce4affSfengbojiang goto unlock;
2616*22ce4affSfengbojiang result = vm_map_clip_end(map, entry, end);
2617*22ce4affSfengbojiang if (result != KERN_SUCCESS)
2618*22ce4affSfengbojiang goto unlock;
2619a9643ea8Slogwang entry->object.sub_map = submap;
2620a9643ea8Slogwang entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2621a9643ea8Slogwang result = KERN_SUCCESS;
2622a9643ea8Slogwang }
2623*22ce4affSfengbojiang unlock:
2624a9643ea8Slogwang vm_map_unlock(map);
2625a9643ea8Slogwang
2626*22ce4affSfengbojiang if (result != KERN_SUCCESS) {
2627*22ce4affSfengbojiang vm_map_lock(submap);
2628*22ce4affSfengbojiang submap->flags &= ~MAP_IS_SUB_MAP;
2629*22ce4affSfengbojiang vm_map_unlock(submap);
2630*22ce4affSfengbojiang }
2631a9643ea8Slogwang return (result);
2632a9643ea8Slogwang }
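/*
 * Illustrative usage sketch (assumption, not taken from this file): the
 * usual kernel pattern is to reserve a range in the parent map first and
 * then dedicate it to a subordinate map, roughly:
 *
 *	if (vm_map_find(parent, NULL, 0, &start, size, 0, VMFS_ANY_SPACE,
 *	    VM_PROT_ALL, VM_PROT_ALL, 0) == KERN_SUCCESS)
 *		(void)vm_map_submap(parent, start, start + size, submap);
 *
 * where "parent", "submap", "start", and "size" are hypothetical;
 * compare kmem_suballoc() in vm_kern.c for the in-tree caller.
 */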
2633a9643ea8Slogwang
2634a9643ea8Slogwang /*
2635a9643ea8Slogwang * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2636a9643ea8Slogwang */
2637a9643ea8Slogwang #define MAX_INIT_PT 96
2638a9643ea8Slogwang
2639a9643ea8Slogwang /*
2640a9643ea8Slogwang * vm_map_pmap_enter:
2641a9643ea8Slogwang *
2642a9643ea8Slogwang * Preload the specified map's pmap with mappings to the specified
2643a9643ea8Slogwang * object's memory-resident pages. No further physical pages are
2644a9643ea8Slogwang * allocated, and no further virtual pages are retrieved from secondary
2645a9643ea8Slogwang * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
2646a9643ea8Slogwang * limited number of page mappings are created at the low-end of the
2647a9643ea8Slogwang * specified address range. (For this purpose, a superpage mapping
2648a9643ea8Slogwang * counts as one page mapping.) Otherwise, all resident pages within
2649*22ce4affSfengbojiang * the specified address range are mapped.
2650a9643ea8Slogwang */
2651a9643ea8Slogwang static void
2652a9643ea8Slogwang vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2653a9643ea8Slogwang vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2654a9643ea8Slogwang {
2655a9643ea8Slogwang vm_offset_t start;
2656a9643ea8Slogwang vm_page_t p, p_start;
2657a9643ea8Slogwang vm_pindex_t mask, psize, threshold, tmpidx;
2658a9643ea8Slogwang
2659a9643ea8Slogwang if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2660a9643ea8Slogwang return;
2661a9643ea8Slogwang if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2662a9643ea8Slogwang VM_OBJECT_WLOCK(object);
2663a9643ea8Slogwang if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2664a9643ea8Slogwang pmap_object_init_pt(map->pmap, addr, object, pindex,
2665a9643ea8Slogwang size);
2666a9643ea8Slogwang VM_OBJECT_WUNLOCK(object);
2667a9643ea8Slogwang return;
2668a9643ea8Slogwang }
2669a9643ea8Slogwang VM_OBJECT_LOCK_DOWNGRADE(object);
2670*22ce4affSfengbojiang } else
2671*22ce4affSfengbojiang VM_OBJECT_RLOCK(object);
2672a9643ea8Slogwang
2673a9643ea8Slogwang psize = atop(size);
2674a9643ea8Slogwang if (psize + pindex > object->size) {
2675*22ce4affSfengbojiang if (pindex >= object->size) {
2676a9643ea8Slogwang VM_OBJECT_RUNLOCK(object);
2677a9643ea8Slogwang return;
2678a9643ea8Slogwang }
2679a9643ea8Slogwang psize = object->size - pindex;
2680a9643ea8Slogwang }
2681a9643ea8Slogwang
2682a9643ea8Slogwang start = 0;
2683a9643ea8Slogwang p_start = NULL;
2684a9643ea8Slogwang threshold = MAX_INIT_PT;
2685a9643ea8Slogwang
2686a9643ea8Slogwang p = vm_page_find_least(object, pindex);
2687a9643ea8Slogwang /*
2688a9643ea8Slogwang * Assert: the variable p is either (1) the page with the
2689a9643ea8Slogwang * least pindex greater than or equal to the parameter pindex
2690a9643ea8Slogwang * or (2) NULL.
2691a9643ea8Slogwang */
2692a9643ea8Slogwang for (;
2693a9643ea8Slogwang p != NULL && (tmpidx = p->pindex - pindex) < psize;
2694a9643ea8Slogwang p = TAILQ_NEXT(p, listq)) {
2695a9643ea8Slogwang /*
2696a9643ea8Slogwang * Don't allow madvise to blow away our really free
2697a9643ea8Slogwang * pages by allocating pv entries.
2698a9643ea8Slogwang */
2699a9643ea8Slogwang if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2700*22ce4affSfengbojiang vm_page_count_severe()) ||
2701a9643ea8Slogwang ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2702a9643ea8Slogwang tmpidx >= threshold)) {
2703a9643ea8Slogwang psize = tmpidx;
2704a9643ea8Slogwang break;
2705a9643ea8Slogwang }
2706*22ce4affSfengbojiang if (vm_page_all_valid(p)) {
2707a9643ea8Slogwang if (p_start == NULL) {
2708a9643ea8Slogwang start = addr + ptoa(tmpidx);
2709a9643ea8Slogwang p_start = p;
2710a9643ea8Slogwang }
2711a9643ea8Slogwang /* Jump ahead if a superpage mapping is possible. */
2712a9643ea8Slogwang if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
2713a9643ea8Slogwang (pagesizes[p->psind] - 1)) == 0) {
2714a9643ea8Slogwang mask = atop(pagesizes[p->psind]) - 1;
2715a9643ea8Slogwang if (tmpidx + mask < psize &&
2716*22ce4affSfengbojiang vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
2717a9643ea8Slogwang p += mask;
2718a9643ea8Slogwang threshold += mask;
2719a9643ea8Slogwang }
2720a9643ea8Slogwang }
2721a9643ea8Slogwang } else if (p_start != NULL) {
2722a9643ea8Slogwang pmap_enter_object(map->pmap, start, addr +
2723a9643ea8Slogwang ptoa(tmpidx), p_start, prot);
2724a9643ea8Slogwang p_start = NULL;
2725a9643ea8Slogwang }
2726a9643ea8Slogwang }
2727a9643ea8Slogwang if (p_start != NULL)
2728a9643ea8Slogwang pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2729a9643ea8Slogwang p_start, prot);
2730a9643ea8Slogwang VM_OBJECT_RUNLOCK(object);
2731a9643ea8Slogwang }
2732a9643ea8Slogwang
2733a9643ea8Slogwang /*
2734a9643ea8Slogwang * vm_map_protect:
2735a9643ea8Slogwang *
2736*22ce4affSfengbojiang * Sets the protection and/or the maximum protection of the
2737*22ce4affSfengbojiang * specified address region in the target map.
2738a9643ea8Slogwang */
2739a9643ea8Slogwang int
2740a9643ea8Slogwang vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2741*22ce4affSfengbojiang vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
2742a9643ea8Slogwang {
2743*22ce4affSfengbojiang vm_map_entry_t entry, first_entry, in_tran, prev_entry;
2744a9643ea8Slogwang vm_object_t obj;
2745a9643ea8Slogwang struct ucred *cred;
2746a9643ea8Slogwang vm_prot_t old_prot;
2747*22ce4affSfengbojiang int rv;
2748a9643ea8Slogwang
2749a9643ea8Slogwang if (start == end)
2750a9643ea8Slogwang return (KERN_SUCCESS);
2751a9643ea8Slogwang
2752*22ce4affSfengbojiang if ((flags & (VM_MAP_PROTECT_SET_PROT | VM_MAP_PROTECT_SET_MAXPROT)) ==
2753*22ce4affSfengbojiang (VM_MAP_PROTECT_SET_PROT | VM_MAP_PROTECT_SET_MAXPROT) &&
2754*22ce4affSfengbojiang (new_prot & new_maxprot) != new_prot)
2755*22ce4affSfengbojiang return (KERN_OUT_OF_BOUNDS);
2756*22ce4affSfengbojiang
2757*22ce4affSfengbojiang again:
2758*22ce4affSfengbojiang in_tran = NULL;
2759a9643ea8Slogwang vm_map_lock(map);
2760a9643ea8Slogwang
2761*22ce4affSfengbojiang if ((map->flags & MAP_WXORX) != 0 &&
2762*22ce4affSfengbojiang (flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
2763*22ce4affSfengbojiang (new_prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE |
2764*22ce4affSfengbojiang VM_PROT_EXECUTE)) {
2765*22ce4affSfengbojiang vm_map_unlock(map);
2766*22ce4affSfengbojiang return (KERN_PROTECTION_FAILURE);
2767*22ce4affSfengbojiang }
2768*22ce4affSfengbojiang
2769*22ce4affSfengbojiang /*
2770*22ce4affSfengbojiang * Ensure that we are not concurrently wiring pages. vm_map_wire() may
2771*22ce4affSfengbojiang * need to fault pages into the map and will drop the map lock while
2772*22ce4affSfengbojiang * doing so, and the VM object may end up in an inconsistent state if we
2773*22ce4affSfengbojiang * update the protection on the map entry in between faults.
2774*22ce4affSfengbojiang */
2775*22ce4affSfengbojiang vm_map_wait_busy(map);
2776*22ce4affSfengbojiang
2777a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
2778a9643ea8Slogwang
2779*22ce4affSfengbojiang if (!vm_map_lookup_entry(map, start, &first_entry))
2780*22ce4affSfengbojiang first_entry = vm_map_entry_succ(first_entry);
2781a9643ea8Slogwang
2782a9643ea8Slogwang /*
2783a9643ea8Slogwang * Make a first pass to check for protection violations.
2784a9643ea8Slogwang */
2785*22ce4affSfengbojiang for (entry = first_entry; entry->start < end;
2786*22ce4affSfengbojiang entry = vm_map_entry_succ(entry)) {
2787*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
2788*22ce4affSfengbojiang continue;
2789*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
2790a9643ea8Slogwang vm_map_unlock(map);
2791a9643ea8Slogwang return (KERN_INVALID_ARGUMENT);
2792a9643ea8Slogwang }
2793*22ce4affSfengbojiang if ((flags & VM_MAP_PROTECT_SET_PROT) == 0)
2794*22ce4affSfengbojiang new_prot = entry->protection;
2795*22ce4affSfengbojiang if ((flags & VM_MAP_PROTECT_SET_MAXPROT) == 0)
2796*22ce4affSfengbojiang new_maxprot = entry->max_protection;
2797*22ce4affSfengbojiang if ((new_prot & entry->max_protection) != new_prot ||
2798*22ce4affSfengbojiang (new_maxprot & entry->max_protection) != new_maxprot) {
2799a9643ea8Slogwang vm_map_unlock(map);
2800a9643ea8Slogwang return (KERN_PROTECTION_FAILURE);
2801a9643ea8Slogwang }
2802*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2803*22ce4affSfengbojiang in_tran = entry;
2804a9643ea8Slogwang }
2805a9643ea8Slogwang
2806a9643ea8Slogwang /*
2807*22ce4affSfengbojiang * Postpone the operation until all in-transition map entries have
2808*22ce4affSfengbojiang * stabilized. An in-transition entry might already have its pages
2809*22ce4affSfengbojiang * wired and wired_count incremented, but not yet have its
2810*22ce4affSfengbojiang * MAP_ENTRY_USER_WIRED flag set, in which case we would fail to call
2811*22ce4affSfengbojiang * vm_fault_copy_entry() in the final loop below.
2812a9643ea8Slogwang */
2813*22ce4affSfengbojiang if (in_tran != NULL) {
2814*22ce4affSfengbojiang in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2815*22ce4affSfengbojiang vm_map_unlock_and_wait(map, 0);
2816*22ce4affSfengbojiang goto again;
2817a9643ea8Slogwang }
2818a9643ea8Slogwang
2819*22ce4affSfengbojiang /*
2820*22ce4affSfengbojiang * Before changing the protections, try to reserve swap space for any
2821*22ce4affSfengbojiang * private (i.e., copy-on-write) mappings that are transitioning from
2822*22ce4affSfengbojiang * read-only to read/write access. If a reservation fails, break out
2823*22ce4affSfengbojiang * of this loop early and let the next loop simplify the entries, since
2824*22ce4affSfengbojiang * some may now be mergeable.
2825*22ce4affSfengbojiang */
2826*22ce4affSfengbojiang rv = vm_map_clip_start(map, first_entry, start);
2827*22ce4affSfengbojiang if (rv != KERN_SUCCESS) {
2828*22ce4affSfengbojiang vm_map_unlock(map);
2829*22ce4affSfengbojiang return (rv);
2830*22ce4affSfengbojiang }
2831*22ce4affSfengbojiang for (entry = first_entry; entry->start < end;
2832*22ce4affSfengbojiang entry = vm_map_entry_succ(entry)) {
2833*22ce4affSfengbojiang rv = vm_map_clip_end(map, entry, end);
2834*22ce4affSfengbojiang if (rv != KERN_SUCCESS) {
2835*22ce4affSfengbojiang vm_map_unlock(map);
2836*22ce4affSfengbojiang return (rv);
2837*22ce4affSfengbojiang }
2838*22ce4affSfengbojiang
2839*22ce4affSfengbojiang if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
2840*22ce4affSfengbojiang ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
2841*22ce4affSfengbojiang ENTRY_CHARGED(entry) ||
2842*22ce4affSfengbojiang (entry->eflags & MAP_ENTRY_GUARD) != 0)
2843*22ce4affSfengbojiang continue;
2844*22ce4affSfengbojiang
2845a9643ea8Slogwang cred = curthread->td_ucred;
2846*22ce4affSfengbojiang obj = entry->object.vm_object;
2847a9643ea8Slogwang
2848*22ce4affSfengbojiang if (obj == NULL ||
2849*22ce4affSfengbojiang (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
2850*22ce4affSfengbojiang if (!swap_reserve(entry->end - entry->start)) {
2851*22ce4affSfengbojiang rv = KERN_RESOURCE_SHORTAGE;
2852*22ce4affSfengbojiang end = entry->end;
2853*22ce4affSfengbojiang break;
2854a9643ea8Slogwang }
2855a9643ea8Slogwang crhold(cred);
2856*22ce4affSfengbojiang entry->cred = cred;
2857a9643ea8Slogwang continue;
2858a9643ea8Slogwang }
2859a9643ea8Slogwang
2860*22ce4affSfengbojiang if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP)
2861*22ce4affSfengbojiang continue;
2862a9643ea8Slogwang VM_OBJECT_WLOCK(obj);
2863a9643ea8Slogwang if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2864a9643ea8Slogwang VM_OBJECT_WUNLOCK(obj);
2865a9643ea8Slogwang continue;
2866a9643ea8Slogwang }
2867a9643ea8Slogwang
2868a9643ea8Slogwang /*
2869a9643ea8Slogwang * Charge for the whole object allocation now, since
2870a9643ea8Slogwang * we cannot distinguish between non-charged and
2871a9643ea8Slogwang * charged clipped mapping of the same object later.
2872a9643ea8Slogwang */
2873a9643ea8Slogwang KASSERT(obj->charge == 0,
2874a9643ea8Slogwang ("vm_map_protect: object %p overcharged (entry %p)",
2875*22ce4affSfengbojiang obj, entry));
2876a9643ea8Slogwang if (!swap_reserve(ptoa(obj->size))) {
2877a9643ea8Slogwang VM_OBJECT_WUNLOCK(obj);
2878*22ce4affSfengbojiang rv = KERN_RESOURCE_SHORTAGE;
2879*22ce4affSfengbojiang end = entry->end;
2880*22ce4affSfengbojiang break;
2881a9643ea8Slogwang }
2882a9643ea8Slogwang
2883a9643ea8Slogwang crhold(cred);
2884a9643ea8Slogwang obj->cred = cred;
2885a9643ea8Slogwang obj->charge = ptoa(obj->size);
2886a9643ea8Slogwang VM_OBJECT_WUNLOCK(obj);
2887a9643ea8Slogwang }
2888a9643ea8Slogwang
2889a9643ea8Slogwang /*
2890*22ce4affSfengbojiang * If enough swap space was available, go back and fix up protections.
2891*22ce4affSfengbojiang * Otherwise, just simplify entries, since some may have been modified.
2892*22ce4affSfengbojiang * [Note that clipping is not necessary the second time.]
2893a9643ea8Slogwang */
2894*22ce4affSfengbojiang for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
2895*22ce4affSfengbojiang entry->start < end;
2896*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry),
2897*22ce4affSfengbojiang prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2898*22ce4affSfengbojiang if (rv != KERN_SUCCESS ||
2899*22ce4affSfengbojiang (entry->eflags & MAP_ENTRY_GUARD) != 0)
2900*22ce4affSfengbojiang continue;
2901a9643ea8Slogwang
2902*22ce4affSfengbojiang old_prot = entry->protection;
2903*22ce4affSfengbojiang
2904*22ce4affSfengbojiang if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2905*22ce4affSfengbojiang entry->max_protection = new_maxprot;
2906*22ce4affSfengbojiang entry->protection = new_maxprot & old_prot;
2907*22ce4affSfengbojiang }
2908*22ce4affSfengbojiang if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2909*22ce4affSfengbojiang entry->protection = new_prot;
2910a9643ea8Slogwang
2911a9643ea8Slogwang /*
2912a9643ea8Slogwang * For user wired map entries, the normal lazy evaluation of
2913a9643ea8Slogwang * write access upgrades through soft page faults is
2914a9643ea8Slogwang * undesirable. Instead, immediately copy any pages that are
2915a9643ea8Slogwang * copy-on-write and enable write access in the physical map.
2916a9643ea8Slogwang */
2917*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2918*22ce4affSfengbojiang (entry->protection & VM_PROT_WRITE) != 0 &&
2919a9643ea8Slogwang (old_prot & VM_PROT_WRITE) == 0)
2920*22ce4affSfengbojiang vm_fault_copy_entry(map, map, entry, entry, NULL);
2921a9643ea8Slogwang
2922a9643ea8Slogwang /*
2923a9643ea8Slogwang * When restricting access, update the physical map. Worry
2924a9643ea8Slogwang * about copy-on-write here.
2925a9643ea8Slogwang */
2926*22ce4affSfengbojiang if ((old_prot & ~entry->protection) != 0) {
2927a9643ea8Slogwang #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2928a9643ea8Slogwang VM_PROT_ALL)
2929*22ce4affSfengbojiang pmap_protect(map->pmap, entry->start,
2930*22ce4affSfengbojiang entry->end,
2931*22ce4affSfengbojiang entry->protection & MASK(entry));
2932a9643ea8Slogwang #undef MASK
2933a9643ea8Slogwang }
2934a9643ea8Slogwang }
2935*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
2936a9643ea8Slogwang vm_map_unlock(map);
2937*22ce4affSfengbojiang return (rv);
2938a9643ea8Slogwang }
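/*
 * Illustrative usage sketch (assumption, not from this file): an
 * mprotect()-style caller that changes only the active protection passes
 * VM_MAP_PROTECT_SET_PROT and leaves the maximum protection untouched:
 *
 *	rv = vm_map_protect(map, addr, addr + len,
 *	    VM_PROT_READ, VM_PROT_NONE, VM_MAP_PROTECT_SET_PROT);
 *
 * Because VM_MAP_PROTECT_SET_MAXPROT is not set in flags, the
 * new_maxprot argument (VM_PROT_NONE here) is ignored by the loops above.
 */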
2939a9643ea8Slogwang
2940a9643ea8Slogwang /*
2941a9643ea8Slogwang * vm_map_madvise:
2942a9643ea8Slogwang *
2943a9643ea8Slogwang * This routine traverses a process's map, handling the madvise
2944a9643ea8Slogwang * system call.  Advisories are classified as either those affecting
2945a9643ea8Slogwang * the vm_map_entry structure or those affecting the underlying
2946a9643ea8Slogwang * objects.
2947a9643ea8Slogwang */
2948a9643ea8Slogwang int
2949a9643ea8Slogwang vm_map_madvise(
2950a9643ea8Slogwang vm_map_t map,
2951a9643ea8Slogwang vm_offset_t start,
2952a9643ea8Slogwang vm_offset_t end,
2953a9643ea8Slogwang int behav)
2954a9643ea8Slogwang {
2955*22ce4affSfengbojiang vm_map_entry_t entry, prev_entry;
2956*22ce4affSfengbojiang int rv;
2957*22ce4affSfengbojiang bool modify_map;
2958a9643ea8Slogwang
2959a9643ea8Slogwang /*
2960a9643ea8Slogwang * Some madvise calls directly modify the vm_map_entry, in which case
2961a9643ea8Slogwang * we need to use an exclusive lock on the map and we need to perform
2962a9643ea8Slogwang * various clipping operations. Otherwise we only need a read-lock
2963a9643ea8Slogwang * on the map.
2964a9643ea8Slogwang */
2965a9643ea8Slogwang switch(behav) {
2966a9643ea8Slogwang case MADV_NORMAL:
2967a9643ea8Slogwang case MADV_SEQUENTIAL:
2968a9643ea8Slogwang case MADV_RANDOM:
2969a9643ea8Slogwang case MADV_NOSYNC:
2970a9643ea8Slogwang case MADV_AUTOSYNC:
2971a9643ea8Slogwang case MADV_NOCORE:
2972a9643ea8Slogwang case MADV_CORE:
2973a9643ea8Slogwang if (start == end)
2974*22ce4affSfengbojiang return (0);
2975*22ce4affSfengbojiang modify_map = true;
2976a9643ea8Slogwang vm_map_lock(map);
2977a9643ea8Slogwang break;
2978a9643ea8Slogwang case MADV_WILLNEED:
2979a9643ea8Slogwang case MADV_DONTNEED:
2980a9643ea8Slogwang case MADV_FREE:
2981a9643ea8Slogwang if (start == end)
2982*22ce4affSfengbojiang return (0);
2983*22ce4affSfengbojiang modify_map = false;
2984a9643ea8Slogwang vm_map_lock_read(map);
2985a9643ea8Slogwang break;
2986a9643ea8Slogwang default:
2987*22ce4affSfengbojiang return (EINVAL);
2988a9643ea8Slogwang }
2989a9643ea8Slogwang
2990a9643ea8Slogwang /*
2991a9643ea8Slogwang * Locate starting entry and clip if necessary.
2992a9643ea8Slogwang */
2993a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
2994a9643ea8Slogwang
2995a9643ea8Slogwang if (modify_map) {
2996a9643ea8Slogwang /*
2997a9643ea8Slogwang * madvise behaviors that are implemented in the vm_map_entry.
2998a9643ea8Slogwang *
2999a9643ea8Slogwang * We clip the vm_map_entry so that behavioral changes are
3000a9643ea8Slogwang * limited to the specified address range.
3001a9643ea8Slogwang */
3002*22ce4affSfengbojiang rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
3003*22ce4affSfengbojiang if (rv != KERN_SUCCESS) {
3004*22ce4affSfengbojiang vm_map_unlock(map);
3005*22ce4affSfengbojiang return (vm_mmap_to_errno(rv));
3006*22ce4affSfengbojiang }
3007*22ce4affSfengbojiang
3008*22ce4affSfengbojiang for (; entry->start < end; prev_entry = entry,
3009*22ce4affSfengbojiang entry = vm_map_entry_succ(entry)) {
3010*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3011a9643ea8Slogwang continue;
3012a9643ea8Slogwang
3013*22ce4affSfengbojiang rv = vm_map_clip_end(map, entry, end);
3014*22ce4affSfengbojiang if (rv != KERN_SUCCESS) {
3015*22ce4affSfengbojiang vm_map_unlock(map);
3016*22ce4affSfengbojiang return (vm_mmap_to_errno(rv));
3017*22ce4affSfengbojiang }
3018a9643ea8Slogwang
3019a9643ea8Slogwang switch (behav) {
3020a9643ea8Slogwang case MADV_NORMAL:
3021*22ce4affSfengbojiang vm_map_entry_set_behavior(entry,
3022*22ce4affSfengbojiang MAP_ENTRY_BEHAV_NORMAL);
3023a9643ea8Slogwang break;
3024a9643ea8Slogwang case MADV_SEQUENTIAL:
3025*22ce4affSfengbojiang vm_map_entry_set_behavior(entry,
3026*22ce4affSfengbojiang MAP_ENTRY_BEHAV_SEQUENTIAL);
3027a9643ea8Slogwang break;
3028a9643ea8Slogwang case MADV_RANDOM:
3029*22ce4affSfengbojiang vm_map_entry_set_behavior(entry,
3030*22ce4affSfengbojiang MAP_ENTRY_BEHAV_RANDOM);
3031a9643ea8Slogwang break;
3032a9643ea8Slogwang case MADV_NOSYNC:
3033*22ce4affSfengbojiang entry->eflags |= MAP_ENTRY_NOSYNC;
3034a9643ea8Slogwang break;
3035a9643ea8Slogwang case MADV_AUTOSYNC:
3036*22ce4affSfengbojiang entry->eflags &= ~MAP_ENTRY_NOSYNC;
3037a9643ea8Slogwang break;
3038a9643ea8Slogwang case MADV_NOCORE:
3039*22ce4affSfengbojiang entry->eflags |= MAP_ENTRY_NOCOREDUMP;
3040a9643ea8Slogwang break;
3041a9643ea8Slogwang case MADV_CORE:
3042*22ce4affSfengbojiang entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
3043a9643ea8Slogwang break;
3044a9643ea8Slogwang default:
3045a9643ea8Slogwang break;
3046a9643ea8Slogwang }
3047*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3048a9643ea8Slogwang }
3049*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3050a9643ea8Slogwang vm_map_unlock(map);
3051a9643ea8Slogwang } else {
3052a9643ea8Slogwang vm_pindex_t pstart, pend;
3053a9643ea8Slogwang
3054a9643ea8Slogwang /*
3055a9643ea8Slogwang * madvise behaviors that are implemented in the underlying
3056a9643ea8Slogwang * vm_object.
3057a9643ea8Slogwang *
3058a9643ea8Slogwang * Since we don't clip the vm_map_entry, we have to clip
3059a9643ea8Slogwang * the vm_object pindex and count.
3060a9643ea8Slogwang */
3061*22ce4affSfengbojiang if (!vm_map_lookup_entry(map, start, &entry))
3062*22ce4affSfengbojiang entry = vm_map_entry_succ(entry);
3063*22ce4affSfengbojiang for (; entry->start < end;
3064*22ce4affSfengbojiang entry = vm_map_entry_succ(entry)) {
3065a9643ea8Slogwang vm_offset_t useEnd, useStart;
3066a9643ea8Slogwang
3067*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3068a9643ea8Slogwang continue;
3069a9643ea8Slogwang
3070*22ce4affSfengbojiang /*
3071*22ce4affSfengbojiang * MADV_FREE would otherwise rewind time to
3072*22ce4affSfengbojiang * the creation of the shadow object. Because
3073*22ce4affSfengbojiang * we hold the VM map read-locked, neither the
3074*22ce4affSfengbojiang * entry's object nor the presence of a
3075*22ce4affSfengbojiang * backing object can change.
3076*22ce4affSfengbojiang */
3077*22ce4affSfengbojiang if (behav == MADV_FREE &&
3078*22ce4affSfengbojiang entry->object.vm_object != NULL &&
3079*22ce4affSfengbojiang entry->object.vm_object->backing_object != NULL)
3080*22ce4affSfengbojiang continue;
3081a9643ea8Slogwang
3082*22ce4affSfengbojiang pstart = OFF_TO_IDX(entry->offset);
3083*22ce4affSfengbojiang pend = pstart + atop(entry->end - entry->start);
3084*22ce4affSfengbojiang useStart = entry->start;
3085*22ce4affSfengbojiang useEnd = entry->end;
3086*22ce4affSfengbojiang
3087*22ce4affSfengbojiang if (entry->start < start) {
3088*22ce4affSfengbojiang pstart += atop(start - entry->start);
3089a9643ea8Slogwang useStart = start;
3090a9643ea8Slogwang }
3091*22ce4affSfengbojiang if (entry->end > end) {
3092*22ce4affSfengbojiang pend -= atop(entry->end - end);
3093a9643ea8Slogwang useEnd = end;
3094a9643ea8Slogwang }
3095a9643ea8Slogwang
3096a9643ea8Slogwang if (pstart >= pend)
3097a9643ea8Slogwang continue;
3098a9643ea8Slogwang
3099a9643ea8Slogwang /*
3100a9643ea8Slogwang * Perform the pmap_advise() before clearing
3101a9643ea8Slogwang * PGA_REFERENCED in vm_page_advise(). Otherwise, a
3102a9643ea8Slogwang * concurrent pmap operation, such as pmap_remove(),
3103a9643ea8Slogwang * could clear a reference in the pmap and set
3104a9643ea8Slogwang * PGA_REFERENCED on the page before the pmap_advise()
3105a9643ea8Slogwang * had completed. Consequently, the page would appear
3106a9643ea8Slogwang * referenced based upon an old reference that
3107a9643ea8Slogwang * occurred before this pmap_advise() ran.
3108a9643ea8Slogwang */
3109a9643ea8Slogwang if (behav == MADV_DONTNEED || behav == MADV_FREE)
3110a9643ea8Slogwang pmap_advise(map->pmap, useStart, useEnd,
3111a9643ea8Slogwang behav);
3112a9643ea8Slogwang
3113*22ce4affSfengbojiang vm_object_madvise(entry->object.vm_object, pstart,
3114a9643ea8Slogwang pend, behav);
3115a9643ea8Slogwang
3116a9643ea8Slogwang /*
3117a9643ea8Slogwang * Pre-populate paging structures in the
3118a9643ea8Slogwang * WILLNEED case. For wired entries, the
3119a9643ea8Slogwang * paging structures are already populated.
3120a9643ea8Slogwang */
3121a9643ea8Slogwang if (behav == MADV_WILLNEED &&
3122*22ce4affSfengbojiang entry->wired_count == 0) {
3123a9643ea8Slogwang vm_map_pmap_enter(map,
3124a9643ea8Slogwang useStart,
3125*22ce4affSfengbojiang entry->protection,
3126*22ce4affSfengbojiang entry->object.vm_object,
3127a9643ea8Slogwang pstart,
3128a9643ea8Slogwang ptoa(pend - pstart),
3129a9643ea8Slogwang MAP_PREFAULT_MADVISE
3130a9643ea8Slogwang );
3131a9643ea8Slogwang }
3132a9643ea8Slogwang }
3133a9643ea8Slogwang vm_map_unlock_read(map);
3134a9643ea8Slogwang }
3135a9643ea8Slogwang return (0);
3136a9643ea8Slogwang }
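/*
 * Illustrative note (sketch, not from this file): the split between the
 * two lock modes above means an entry-level advisory such as MADV_NOSYNC
 * clips and flags map entries under the exclusive lock, while an
 * object-level advisory such as MADV_WILLNEED only reads the entries,
 * e.g.
 *
 *	(void)vm_map_madvise(map, addr, addr + len, MADV_WILLNEED);
 *
 * which preloads mappings for resident pages via vm_map_pmap_enter()
 * without modifying any vm_map_entry.
 */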
3137a9643ea8Slogwang
3138a9643ea8Slogwang /*
3139a9643ea8Slogwang * vm_map_inherit:
3140a9643ea8Slogwang *
3141a9643ea8Slogwang * Sets the inheritance of the specified address
3142a9643ea8Slogwang * range in the target map. Inheritance
3143a9643ea8Slogwang * affects how the map will be shared with
3144a9643ea8Slogwang * child maps at the time of vmspace_fork.
3145a9643ea8Slogwang */
3146a9643ea8Slogwang int
3147a9643ea8Slogwang vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
3148a9643ea8Slogwang vm_inherit_t new_inheritance)
3149a9643ea8Slogwang {
3150*22ce4affSfengbojiang vm_map_entry_t entry, lentry, prev_entry, start_entry;
3151*22ce4affSfengbojiang int rv;
3152a9643ea8Slogwang
3153a9643ea8Slogwang switch (new_inheritance) {
3154a9643ea8Slogwang case VM_INHERIT_NONE:
3155a9643ea8Slogwang case VM_INHERIT_COPY:
3156a9643ea8Slogwang case VM_INHERIT_SHARE:
3157*22ce4affSfengbojiang case VM_INHERIT_ZERO:
3158a9643ea8Slogwang break;
3159a9643ea8Slogwang default:
3160a9643ea8Slogwang return (KERN_INVALID_ARGUMENT);
3161a9643ea8Slogwang }
3162a9643ea8Slogwang if (start == end)
3163a9643ea8Slogwang return (KERN_SUCCESS);
3164a9643ea8Slogwang vm_map_lock(map);
3165a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
3166*22ce4affSfengbojiang rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
3167*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3168*22ce4affSfengbojiang goto unlock;
3169*22ce4affSfengbojiang if (vm_map_lookup_entry(map, end - 1, &lentry)) {
3170*22ce4affSfengbojiang rv = vm_map_clip_end(map, lentry, end);
3171*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3172*22ce4affSfengbojiang goto unlock;
3173a9643ea8Slogwang }
3174*22ce4affSfengbojiang if (new_inheritance == VM_INHERIT_COPY) {
3175*22ce4affSfengbojiang for (entry = start_entry; entry->start < end;
3176*22ce4affSfengbojiang prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3177*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3178*22ce4affSfengbojiang != 0) {
3179*22ce4affSfengbojiang rv = KERN_INVALID_ARGUMENT;
3180*22ce4affSfengbojiang goto unlock;
3181*22ce4affSfengbojiang }
3182*22ce4affSfengbojiang }
3183*22ce4affSfengbojiang }
3184*22ce4affSfengbojiang for (entry = start_entry; entry->start < end; prev_entry = entry,
3185*22ce4affSfengbojiang entry = vm_map_entry_succ(entry)) {
3186*22ce4affSfengbojiang KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
3187*22ce4affSfengbojiang entry, (uintmax_t)entry->end, (uintmax_t)end));
3188*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
3189*22ce4affSfengbojiang new_inheritance != VM_INHERIT_ZERO)
3190*22ce4affSfengbojiang entry->inheritance = new_inheritance;
3191*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3192*22ce4affSfengbojiang }
3193*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3194*22ce4affSfengbojiang unlock:
3195a9643ea8Slogwang vm_map_unlock(map);
3196*22ce4affSfengbojiang return (rv);
3197*22ce4affSfengbojiang }
3198*22ce4affSfengbojiang
3199*22ce4affSfengbojiang /*
3200*22ce4affSfengbojiang * vm_map_entry_in_transition:
3201*22ce4affSfengbojiang *
3202*22ce4affSfengbojiang * Release the map lock, and sleep until the entry is no longer in
3203*22ce4affSfengbojiang * transition.  Awaken and reacquire the map lock.  If the map changed while
3204*22ce4affSfengbojiang * another thread held the lock, look up a possibly-changed entry at or after
3205*22ce4affSfengbojiang * the 'start' position of the old entry.
3206*22ce4affSfengbojiang */
3207*22ce4affSfengbojiang static vm_map_entry_t
3208*22ce4affSfengbojiang vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
3209*22ce4affSfengbojiang vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
3210*22ce4affSfengbojiang {
3211*22ce4affSfengbojiang vm_map_entry_t entry;
3212*22ce4affSfengbojiang vm_offset_t start;
3213*22ce4affSfengbojiang u_int last_timestamp;
3214*22ce4affSfengbojiang
3215*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
3216*22ce4affSfengbojiang KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3217*22ce4affSfengbojiang ("not in-tranition map entry %p", in_entry));
3218*22ce4affSfengbojiang /*
3219*22ce4affSfengbojiang * We have not yet clipped the entry.
3220*22ce4affSfengbojiang */
3221*22ce4affSfengbojiang start = MAX(in_start, in_entry->start);
3222*22ce4affSfengbojiang in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3223*22ce4affSfengbojiang last_timestamp = map->timestamp;
3224*22ce4affSfengbojiang if (vm_map_unlock_and_wait(map, 0)) {
3225*22ce4affSfengbojiang /*
3226*22ce4affSfengbojiang * Allow interruption of user wiring/unwiring?
3227*22ce4affSfengbojiang */
3228*22ce4affSfengbojiang }
3229*22ce4affSfengbojiang vm_map_lock(map);
3230*22ce4affSfengbojiang if (last_timestamp + 1 == map->timestamp)
3231*22ce4affSfengbojiang return (in_entry);
3232*22ce4affSfengbojiang
3233*22ce4affSfengbojiang /*
3234*22ce4affSfengbojiang * Look again for the entry because the map was modified while it was
3235*22ce4affSfengbojiang * unlocked. Specifically, the entry may have been clipped, merged, or
3236*22ce4affSfengbojiang * deleted.
3237*22ce4affSfengbojiang */
3238*22ce4affSfengbojiang if (!vm_map_lookup_entry(map, start, &entry)) {
3239*22ce4affSfengbojiang if (!holes_ok) {
3240*22ce4affSfengbojiang *io_end = start;
3241*22ce4affSfengbojiang return (NULL);
3242*22ce4affSfengbojiang }
3243*22ce4affSfengbojiang entry = vm_map_entry_succ(entry);
3244*22ce4affSfengbojiang }
3245*22ce4affSfengbojiang return (entry);
3246a9643ea8Slogwang }
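/*
 * Illustrative caller pattern (sketch, mirroring vm_map_unwire() below):
 * when a traversal encounters an in-transition entry it defers to this
 * helper and then restarts from whatever entry comes back:
 *
 *	if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) {
 *		next_entry = vm_map_entry_in_transition(map, start, &end,
 *		    holes_ok, entry);
 *		if (next_entry == NULL) {
 *			... the range vanished while sleeping; fail ...
 *		}
 *		continue;
 *	}
 */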
3247a9643ea8Slogwang
3248a9643ea8Slogwang /*
3249a9643ea8Slogwang * vm_map_unwire:
3250a9643ea8Slogwang *
3251a9643ea8Slogwang * Implements both kernel and user unwiring.
3252a9643ea8Slogwang */
3253a9643ea8Slogwang int
3254a9643ea8Slogwang vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
3255a9643ea8Slogwang int flags)
3256a9643ea8Slogwang {
3257*22ce4affSfengbojiang vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3258a9643ea8Slogwang int rv;
3259*22ce4affSfengbojiang bool holes_ok, need_wakeup, user_unwire;
3260a9643ea8Slogwang
3261a9643ea8Slogwang if (start == end)
3262a9643ea8Slogwang return (KERN_SUCCESS);
3263*22ce4affSfengbojiang holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3264*22ce4affSfengbojiang user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
3265a9643ea8Slogwang vm_map_lock(map);
3266a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
3267a9643ea8Slogwang if (!vm_map_lookup_entry(map, start, &first_entry)) {
3268*22ce4affSfengbojiang if (holes_ok)
3269*22ce4affSfengbojiang first_entry = vm_map_entry_succ(first_entry);
3270a9643ea8Slogwang else {
3271a9643ea8Slogwang vm_map_unlock(map);
3272a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
3273a9643ea8Slogwang }
3274a9643ea8Slogwang }
3275*22ce4affSfengbojiang rv = KERN_SUCCESS;
3276*22ce4affSfengbojiang for (entry = first_entry; entry->start < end; entry = next_entry) {
3277a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3278a9643ea8Slogwang /*
3279a9643ea8Slogwang * We have not yet clipped the entry.
3280a9643ea8Slogwang */
3281*22ce4affSfengbojiang next_entry = vm_map_entry_in_transition(map, start,
3282*22ce4affSfengbojiang &end, holes_ok, entry);
3283*22ce4affSfengbojiang if (next_entry == NULL) {
3284*22ce4affSfengbojiang if (entry == first_entry) {
3285a9643ea8Slogwang vm_map_unlock(map);
3286a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
3287a9643ea8Slogwang }
3288a9643ea8Slogwang rv = KERN_INVALID_ADDRESS;
3289*22ce4affSfengbojiang break;
3290a9643ea8Slogwang }
3291*22ce4affSfengbojiang first_entry = (entry == first_entry) ?
3292*22ce4affSfengbojiang next_entry : NULL;
3293a9643ea8Slogwang continue;
3294a9643ea8Slogwang }
3295*22ce4affSfengbojiang rv = vm_map_clip_start(map, entry, start);
3296*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3297*22ce4affSfengbojiang break;
3298*22ce4affSfengbojiang rv = vm_map_clip_end(map, entry, end);
3299*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3300*22ce4affSfengbojiang break;
3301*22ce4affSfengbojiang
3302a9643ea8Slogwang /*
3303a9643ea8Slogwang * Mark the entry in case the map lock is released. (See
3304a9643ea8Slogwang * above.)
3305a9643ea8Slogwang */
3306a9643ea8Slogwang KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3307a9643ea8Slogwang entry->wiring_thread == NULL,
3308a9643ea8Slogwang ("owned map entry %p", entry));
3309a9643ea8Slogwang entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3310a9643ea8Slogwang entry->wiring_thread = curthread;
3311*22ce4affSfengbojiang next_entry = vm_map_entry_succ(entry);
3312a9643ea8Slogwang /*
3313a9643ea8Slogwang * Check the map for holes in the specified region.
3314*22ce4affSfengbojiang * If holes_ok, skip this check.
3315a9643ea8Slogwang */
3316*22ce4affSfengbojiang if (!holes_ok &&
3317*22ce4affSfengbojiang entry->end < end && next_entry->start > entry->end) {
3318a9643ea8Slogwang end = entry->end;
3319a9643ea8Slogwang rv = KERN_INVALID_ADDRESS;
3320*22ce4affSfengbojiang break;
3321a9643ea8Slogwang }
3322a9643ea8Slogwang /*
3323a9643ea8Slogwang * If system unwiring, require that the entry is system wired.
3324a9643ea8Slogwang */
3325a9643ea8Slogwang if (!user_unwire &&
3326a9643ea8Slogwang vm_map_entry_system_wired_count(entry) == 0) {
3327a9643ea8Slogwang end = entry->end;
3328a9643ea8Slogwang rv = KERN_INVALID_ARGUMENT;
3329*22ce4affSfengbojiang break;
3330a9643ea8Slogwang }
3331a9643ea8Slogwang }
3332*22ce4affSfengbojiang need_wakeup = false;
3333*22ce4affSfengbojiang if (first_entry == NULL &&
3334*22ce4affSfengbojiang !vm_map_lookup_entry(map, start, &first_entry)) {
3335*22ce4affSfengbojiang KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
3336*22ce4affSfengbojiang prev_entry = first_entry;
3337*22ce4affSfengbojiang entry = vm_map_entry_succ(first_entry);
3338*22ce4affSfengbojiang } else {
3339*22ce4affSfengbojiang prev_entry = vm_map_entry_pred(first_entry);
3340*22ce4affSfengbojiang entry = first_entry;
3341a9643ea8Slogwang }
3342*22ce4affSfengbojiang for (; entry->start < end;
3343*22ce4affSfengbojiang prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3344a9643ea8Slogwang /*
3345*22ce4affSfengbojiang * If holes_ok was specified, an empty
3346a9643ea8Slogwang * space in the unwired region could have been mapped
3347a9643ea8Slogwang * while the map lock was dropped for draining
3348a9643ea8Slogwang * MAP_ENTRY_IN_TRANSITION. Moreover, another thread
3349a9643ea8Slogwang * could be simultaneously wiring this new mapping
3350a9643ea8Slogwang * entry. Detect these cases and skip any entries
3351a9643ea8Slogwang * marked as in transition by us.
3352a9643ea8Slogwang */
3353a9643ea8Slogwang if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3354a9643ea8Slogwang entry->wiring_thread != curthread) {
3355*22ce4affSfengbojiang KASSERT(holes_ok,
3356a9643ea8Slogwang ("vm_map_unwire: !HOLESOK and new/changed entry"));
3357a9643ea8Slogwang continue;
3358a9643ea8Slogwang }
3359a9643ea8Slogwang
3360a9643ea8Slogwang if (rv == KERN_SUCCESS && (!user_unwire ||
3361a9643ea8Slogwang (entry->eflags & MAP_ENTRY_USER_WIRED))) {
3362a9643ea8Slogwang if (entry->wired_count == 1)
3363a9643ea8Slogwang vm_map_entry_unwire(map, entry);
3364a9643ea8Slogwang else
3365a9643ea8Slogwang entry->wired_count--;
3366*22ce4affSfengbojiang if (user_unwire)
3367*22ce4affSfengbojiang entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3368a9643ea8Slogwang }
3369a9643ea8Slogwang KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3370a9643ea8Slogwang ("vm_map_unwire: in-transition flag missing %p", entry));
3371a9643ea8Slogwang KASSERT(entry->wiring_thread == curthread,
3372a9643ea8Slogwang ("vm_map_unwire: alien wire %p", entry));
3373a9643ea8Slogwang entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3374a9643ea8Slogwang entry->wiring_thread = NULL;
3375a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3376a9643ea8Slogwang entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3377*22ce4affSfengbojiang need_wakeup = true;
3378a9643ea8Slogwang }
3379*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3380a9643ea8Slogwang }
3381*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3382a9643ea8Slogwang vm_map_unlock(map);
3383a9643ea8Slogwang if (need_wakeup)
3384a9643ea8Slogwang vm_map_wakeup(map);
3385a9643ea8Slogwang return (rv);
3386a9643ea8Slogwang }
3387a9643ea8Slogwang
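/*
 * vm_map_wire_user_count_{sub,add}:
 *
 * Account, in the global vm_user_wire_count, for pages wired at the
 * request of user space.  The add path below uses a lock-free
 * compare-and-swap loop and fails, with no side effects, if the request
 * would push the count past vm_page_max_user_wired; the sub path simply
 * subtracts the pages unconditionally.
 */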
3388*22ce4affSfengbojiang static void
3389*22ce4affSfengbojiang vm_map_wire_user_count_sub(u_long npages)
3390*22ce4affSfengbojiang {
3391*22ce4affSfengbojiang
3392*22ce4affSfengbojiang atomic_subtract_long(&vm_user_wire_count, npages);
3393*22ce4affSfengbojiang }
3394*22ce4affSfengbojiang
3395*22ce4affSfengbojiang static bool
3396*22ce4affSfengbojiang vm_map_wire_user_count_add(u_long npages)
3397*22ce4affSfengbojiang {
3398*22ce4affSfengbojiang u_long wired;
3399*22ce4affSfengbojiang
3400*22ce4affSfengbojiang wired = vm_user_wire_count;
3401*22ce4affSfengbojiang do {
3402*22ce4affSfengbojiang if (npages + wired > vm_page_max_user_wired)
3403*22ce4affSfengbojiang return (false);
3404*22ce4affSfengbojiang } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3405*22ce4affSfengbojiang npages + wired));
3406*22ce4affSfengbojiang
3407*22ce4affSfengbojiang return (true);
3408*22ce4affSfengbojiang }
3409*22ce4affSfengbojiang
3410a9643ea8Slogwang /*
3411a9643ea8Slogwang * vm_map_wire_entry_failure:
3412a9643ea8Slogwang *
3413a9643ea8Slogwang * Handle a wiring failure on the given entry.
3414a9643ea8Slogwang *
3415a9643ea8Slogwang * The map should be locked.
3416a9643ea8Slogwang */
3417a9643ea8Slogwang static void
3418a9643ea8Slogwang vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3419a9643ea8Slogwang vm_offset_t failed_addr)
3420a9643ea8Slogwang {
3421a9643ea8Slogwang
3422a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
3423a9643ea8Slogwang KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3424a9643ea8Slogwang entry->wired_count == 1,
3425a9643ea8Slogwang ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3426a9643ea8Slogwang KASSERT(failed_addr < entry->end,
3427a9643ea8Slogwang ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3428a9643ea8Slogwang
3429a9643ea8Slogwang /*
3430a9643ea8Slogwang * If any pages at the start of this entry were successfully wired,
3431a9643ea8Slogwang * then unwire them.
3432a9643ea8Slogwang */
3433a9643ea8Slogwang if (failed_addr > entry->start) {
3434a9643ea8Slogwang pmap_unwire(map->pmap, entry->start, failed_addr);
3435a9643ea8Slogwang vm_object_unwire(entry->object.vm_object, entry->offset,
3436a9643ea8Slogwang failed_addr - entry->start, PQ_ACTIVE);
3437a9643ea8Slogwang }
3438a9643ea8Slogwang
3439a9643ea8Slogwang /*
3440a9643ea8Slogwang * Assign an out-of-range value to represent the failure to wire this
3441a9643ea8Slogwang * entry.
3442a9643ea8Slogwang */
3443a9643ea8Slogwang entry->wired_count = -1;
3444a9643ea8Slogwang }
3445a9643ea8Slogwang
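/*
 * vm_map_wire:
 *
 * Convenience wrapper that acquires the map lock around
 * vm_map_wire_locked(), which performs the actual wiring.
 */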
3446*22ce4affSfengbojiang int
3447*22ce4affSfengbojiang vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3448*22ce4affSfengbojiang {
3449*22ce4affSfengbojiang int rv;
3450*22ce4affSfengbojiang
3451*22ce4affSfengbojiang vm_map_lock(map);
3452*22ce4affSfengbojiang rv = vm_map_wire_locked(map, start, end, flags);
3453*22ce4affSfengbojiang vm_map_unlock(map);
3454*22ce4affSfengbojiang return (rv);
3455*22ce4affSfengbojiang }
3456*22ce4affSfengbojiang
3457a9643ea8Slogwang /*
3458*22ce4affSfengbojiang * vm_map_wire_locked:
3459a9643ea8Slogwang *
3460*22ce4affSfengbojiang * Implements both kernel and user wiring. Returns with the map locked;
3461*22ce4affSfengbojiang * however, the map lock may be dropped and reacquired during the call.
3462a9643ea8Slogwang */
3463a9643ea8Slogwang int
3464*22ce4affSfengbojiang vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3465a9643ea8Slogwang {
3466*22ce4affSfengbojiang vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3467a9643ea8Slogwang vm_offset_t faddr, saved_end, saved_start;
3468*22ce4affSfengbojiang u_long incr, npages;
3469*22ce4affSfengbojiang u_int bidx, last_timestamp;
3470a9643ea8Slogwang int rv;
3471*22ce4affSfengbojiang bool holes_ok, need_wakeup, user_wire;
3472a9643ea8Slogwang vm_prot_t prot;
3473a9643ea8Slogwang
3474*22ce4affSfengbojiang VM_MAP_ASSERT_LOCKED(map);
3475*22ce4affSfengbojiang
3476a9643ea8Slogwang if (start == end)
3477a9643ea8Slogwang return (KERN_SUCCESS);
3478a9643ea8Slogwang prot = 0;
3479a9643ea8Slogwang if (flags & VM_MAP_WIRE_WRITE)
3480a9643ea8Slogwang prot |= VM_PROT_WRITE;
3481*22ce4affSfengbojiang holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3482*22ce4affSfengbojiang user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3483a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
3484a9643ea8Slogwang if (!vm_map_lookup_entry(map, start, &first_entry)) {
3485*22ce4affSfengbojiang if (holes_ok)
3486*22ce4affSfengbojiang first_entry = vm_map_entry_succ(first_entry);
3487*22ce4affSfengbojiang else
3488a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
3489a9643ea8Slogwang }
3490*22ce4affSfengbojiang for (entry = first_entry; entry->start < end; entry = next_entry) {
3491a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3492a9643ea8Slogwang /*
3493a9643ea8Slogwang * We have not yet clipped the entry.
3494a9643ea8Slogwang */
3495*22ce4affSfengbojiang next_entry = vm_map_entry_in_transition(map, start,
3496*22ce4affSfengbojiang &end, holes_ok, entry);
3497*22ce4affSfengbojiang if (next_entry == NULL) {
3498*22ce4affSfengbojiang if (entry == first_entry)
3499a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
3500a9643ea8Slogwang rv = KERN_INVALID_ADDRESS;
3501a9643ea8Slogwang goto done;
3502a9643ea8Slogwang }
3503*22ce4affSfengbojiang first_entry = (entry == first_entry) ?
3504*22ce4affSfengbojiang next_entry : NULL;
3505a9643ea8Slogwang continue;
3506a9643ea8Slogwang }
3507*22ce4affSfengbojiang rv = vm_map_clip_start(map, entry, start);
3508*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3509*22ce4affSfengbojiang goto done;
3510*22ce4affSfengbojiang rv = vm_map_clip_end(map, entry, end);
3511*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3512*22ce4affSfengbojiang goto done;
3513*22ce4affSfengbojiang
3514a9643ea8Slogwang /*
3515a9643ea8Slogwang * Mark the entry in case the map lock is released. (See
3516a9643ea8Slogwang * above.)
3517a9643ea8Slogwang */
3518a9643ea8Slogwang KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3519a9643ea8Slogwang entry->wiring_thread == NULL,
3520a9643ea8Slogwang ("owned map entry %p", entry));
3521a9643ea8Slogwang entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3522a9643ea8Slogwang entry->wiring_thread = curthread;
3523a9643ea8Slogwang if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3524a9643ea8Slogwang || (entry->protection & prot) != prot) {
3525a9643ea8Slogwang entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3526*22ce4affSfengbojiang if (!holes_ok) {
3527a9643ea8Slogwang end = entry->end;
3528a9643ea8Slogwang rv = KERN_INVALID_ADDRESS;
3529a9643ea8Slogwang goto done;
3530a9643ea8Slogwang }
3531*22ce4affSfengbojiang } else if (entry->wired_count == 0) {
3532a9643ea8Slogwang entry->wired_count++;
3533*22ce4affSfengbojiang
3534*22ce4affSfengbojiang npages = atop(entry->end - entry->start);
3535*22ce4affSfengbojiang if (user_wire && !vm_map_wire_user_count_add(npages)) {
3536*22ce4affSfengbojiang vm_map_wire_entry_failure(map, entry,
3537*22ce4affSfengbojiang entry->start);
3538*22ce4affSfengbojiang end = entry->end;
3539*22ce4affSfengbojiang rv = KERN_RESOURCE_SHORTAGE;
3540*22ce4affSfengbojiang goto done;
3541*22ce4affSfengbojiang }
3542a9643ea8Slogwang
3543a9643ea8Slogwang /*
3544a9643ea8Slogwang * Release the map lock, relying on the in-transition
3545a9643ea8Slogwang * mark. Mark the map busy for fork.
3546a9643ea8Slogwang */
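/*
 * Record the entry's range and the map timestamp before dropping the
 * lock, and derive the fault stride from the entry's largepage
 * boundary index: pagesizes[0] is the base page size, while larger
 * indices select the superpage size used by largepage entries.
 */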
3547*22ce4affSfengbojiang saved_start = entry->start;
3548*22ce4affSfengbojiang saved_end = entry->end;
3549*22ce4affSfengbojiang last_timestamp = map->timestamp;
3550*22ce4affSfengbojiang bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3551*22ce4affSfengbojiang >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
3552*22ce4affSfengbojiang incr = pagesizes[bidx];
3553a9643ea8Slogwang vm_map_busy(map);
3554a9643ea8Slogwang vm_map_unlock(map);
3555a9643ea8Slogwang
3556*22ce4affSfengbojiang for (faddr = saved_start; faddr < saved_end;
3557*22ce4affSfengbojiang faddr += incr) {
3558a9643ea8Slogwang /*
3559a9643ea8Slogwang * Simulate a fault to get the page and enter
3560a9643ea8Slogwang * it into the physical map.
3561a9643ea8Slogwang */
3562*22ce4affSfengbojiang rv = vm_fault(map, faddr, VM_PROT_NONE,
3563*22ce4affSfengbojiang VM_FAULT_WIRE, NULL);
3564*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3565a9643ea8Slogwang break;
3566*22ce4affSfengbojiang }
3567a9643ea8Slogwang vm_map_lock(map);
3568a9643ea8Slogwang vm_map_unbusy(map);
3569a9643ea8Slogwang if (last_timestamp + 1 != map->timestamp) {
3570a9643ea8Slogwang /*
3571a9643ea8Slogwang * Look again for the entry because the map was
3572a9643ea8Slogwang * modified while it was unlocked. The entry
3573a9643ea8Slogwang * may have been clipped, but NOT merged or
3574a9643ea8Slogwang * deleted.
3575a9643ea8Slogwang */
3576*22ce4affSfengbojiang if (!vm_map_lookup_entry(map, saved_start,
3577*22ce4affSfengbojiang &next_entry))
3578*22ce4affSfengbojiang KASSERT(false,
3579*22ce4affSfengbojiang ("vm_map_wire: lookup failed"));
3580*22ce4affSfengbojiang first_entry = (entry == first_entry) ?
3581*22ce4affSfengbojiang next_entry : NULL;
3582*22ce4affSfengbojiang for (entry = next_entry; entry->end < saved_end;
3583*22ce4affSfengbojiang entry = vm_map_entry_succ(entry)) {
3584a9643ea8Slogwang /*
3585a9643ea8Slogwang * In case of failure, handle entries
3586a9643ea8Slogwang * that were not fully wired here;
3587a9643ea8Slogwang * fully wired entries are handled
3588a9643ea8Slogwang * later.
3589a9643ea8Slogwang */
3590a9643ea8Slogwang if (rv != KERN_SUCCESS &&
3591a9643ea8Slogwang faddr < entry->end)
3592a9643ea8Slogwang vm_map_wire_entry_failure(map,
3593a9643ea8Slogwang entry, faddr);
3594a9643ea8Slogwang }
3595a9643ea8Slogwang }
3596a9643ea8Slogwang if (rv != KERN_SUCCESS) {
3597a9643ea8Slogwang vm_map_wire_entry_failure(map, entry, faddr);
3598*22ce4affSfengbojiang if (user_wire)
3599*22ce4affSfengbojiang vm_map_wire_user_count_sub(npages);
3600a9643ea8Slogwang end = entry->end;
3601a9643ea8Slogwang goto done;
3602a9643ea8Slogwang }
3603a9643ea8Slogwang } else if (!user_wire ||
3604a9643ea8Slogwang (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3605a9643ea8Slogwang entry->wired_count++;
3606a9643ea8Slogwang }
3607a9643ea8Slogwang /*
3608a9643ea8Slogwang * Check the map for holes in the specified region.
3609*22ce4affSfengbojiang * If holes_ok was specified, skip this check.
3610a9643ea8Slogwang */
3611*22ce4affSfengbojiang next_entry = vm_map_entry_succ(entry);
3612*22ce4affSfengbojiang if (!holes_ok &&
3613*22ce4affSfengbojiang entry->end < end && next_entry->start > entry->end) {
3614a9643ea8Slogwang end = entry->end;
3615a9643ea8Slogwang rv = KERN_INVALID_ADDRESS;
3616a9643ea8Slogwang goto done;
3617a9643ea8Slogwang }
3618a9643ea8Slogwang }
3619a9643ea8Slogwang rv = KERN_SUCCESS;
3620a9643ea8Slogwang done:
3621*22ce4affSfengbojiang need_wakeup = false;
3622*22ce4affSfengbojiang if (first_entry == NULL &&
3623*22ce4affSfengbojiang !vm_map_lookup_entry(map, start, &first_entry)) {
3624*22ce4affSfengbojiang KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3625*22ce4affSfengbojiang prev_entry = first_entry;
3626*22ce4affSfengbojiang entry = vm_map_entry_succ(first_entry);
3627*22ce4affSfengbojiang } else {
3628*22ce4affSfengbojiang prev_entry = vm_map_entry_pred(first_entry);
3629*22ce4affSfengbojiang entry = first_entry;
3630a9643ea8Slogwang }
3631*22ce4affSfengbojiang for (; entry->start < end;
3632*22ce4affSfengbojiang prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3633a9643ea8Slogwang /*
3634*22ce4affSfengbojiang * If holes_ok was specified, an empty
3635a9643ea8Slogwang * space in the unwired region could have been mapped
3636a9643ea8Slogwang * while the map lock was dropped for faulting in the
3637a9643ea8Slogwang * pages or draining MAP_ENTRY_IN_TRANSITION.
3638a9643ea8Slogwang * Moreover, another thread could be simultaneously
3639a9643ea8Slogwang * wiring this new mapping entry. Detect these cases
3640*22ce4affSfengbojiang * and skip any entries marked as in transition by another thread.
3641*22ce4affSfengbojiang *
3642*22ce4affSfengbojiang * Another way to encounter an entry not marked with
3643*22ce4affSfengbojiang * MAP_ENTRY_IN_TRANSITION is after failed clipping,
3644*22ce4affSfengbojiang * which sets rv to KERN_INVALID_ARGUMENT.
3645a9643ea8Slogwang */
3646a9643ea8Slogwang if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3647a9643ea8Slogwang entry->wiring_thread != curthread) {
3648*22ce4affSfengbojiang KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
3649a9643ea8Slogwang ("vm_map_wire: !HOLESOK and new/changed entry"));
3650a9643ea8Slogwang continue;
3651a9643ea8Slogwang }
3652a9643ea8Slogwang
3653*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3654*22ce4affSfengbojiang /* do nothing */
3655*22ce4affSfengbojiang } else if (rv == KERN_SUCCESS) {
3656a9643ea8Slogwang if (user_wire)
3657a9643ea8Slogwang entry->eflags |= MAP_ENTRY_USER_WIRED;
3658a9643ea8Slogwang } else if (entry->wired_count == -1) {
3659a9643ea8Slogwang /*
3660a9643ea8Slogwang * Wiring failed on this entry. Thus, unwiring is
3661a9643ea8Slogwang * unnecessary.
3662a9643ea8Slogwang */
3663a9643ea8Slogwang entry->wired_count = 0;
3664a9643ea8Slogwang } else if (!user_wire ||
3665a9643ea8Slogwang (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3666a9643ea8Slogwang /*
3667a9643ea8Slogwang * Undo the wiring. Wiring succeeded on this entry
3668a9643ea8Slogwang * but failed on a later entry.
3669a9643ea8Slogwang */
3670*22ce4affSfengbojiang if (entry->wired_count == 1) {
3671a9643ea8Slogwang vm_map_entry_unwire(map, entry);
3672*22ce4affSfengbojiang if (user_wire)
3673*22ce4affSfengbojiang vm_map_wire_user_count_sub(
3674*22ce4affSfengbojiang atop(entry->end - entry->start));
3675*22ce4affSfengbojiang } else
3676a9643ea8Slogwang entry->wired_count--;
3677a9643ea8Slogwang }
3678a9643ea8Slogwang KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3679a9643ea8Slogwang ("vm_map_wire: in-transition flag missing %p", entry));
3680a9643ea8Slogwang KASSERT(entry->wiring_thread == curthread,
3681a9643ea8Slogwang ("vm_map_wire: alien wire %p", entry));
3682a9643ea8Slogwang entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3683a9643ea8Slogwang MAP_ENTRY_WIRE_SKIPPED);
3684a9643ea8Slogwang entry->wiring_thread = NULL;
3685a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3686a9643ea8Slogwang entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3687*22ce4affSfengbojiang need_wakeup = true;
3688a9643ea8Slogwang }
3689*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3690a9643ea8Slogwang }
3691*22ce4affSfengbojiang vm_map_try_merge_entries(map, prev_entry, entry);
3692a9643ea8Slogwang if (need_wakeup)
3693a9643ea8Slogwang vm_map_wakeup(map);
3694a9643ea8Slogwang return (rv);
3695a9643ea8Slogwang }
3696a9643ea8Slogwang
3697a9643ea8Slogwang /*
3698a9643ea8Slogwang * vm_map_sync
3699a9643ea8Slogwang *
3700a9643ea8Slogwang * Push any dirty cached pages in the address range to their pager.
3701a9643ea8Slogwang * If syncio is TRUE, dirty pages are written synchronously.
3702a9643ea8Slogwang * If invalidate is TRUE, any cached pages are freed as well.
3703a9643ea8Slogwang *
3704a9643ea8Slogwang * If the size of the region from start to end is zero, we are
3705a9643ea8Slogwang * supposed to flush all modified pages within the region containing
3706a9643ea8Slogwang * start. Unfortunately, a region can be split or coalesced with
3707a9643ea8Slogwang * neighboring regions, making it difficult to determine what the
3708a9643ea8Slogwang * original region was. Therefore, we approximate this requirement by
3709a9643ea8Slogwang * flushing the current region containing start.
3710a9643ea8Slogwang *
3711a9643ea8Slogwang * Returns an error if any part of the specified range is not mapped.
3712a9643ea8Slogwang */
3713a9643ea8Slogwang int
3714a9643ea8Slogwang vm_map_sync(
3715a9643ea8Slogwang vm_map_t map,
3716a9643ea8Slogwang vm_offset_t start,
3717a9643ea8Slogwang vm_offset_t end,
3718a9643ea8Slogwang boolean_t syncio,
3719a9643ea8Slogwang boolean_t invalidate)
3720a9643ea8Slogwang {
3721*22ce4affSfengbojiang vm_map_entry_t entry, first_entry, next_entry;
3722a9643ea8Slogwang vm_size_t size;
3723a9643ea8Slogwang vm_object_t object;
3724a9643ea8Slogwang vm_ooffset_t offset;
3725a9643ea8Slogwang unsigned int last_timestamp;
3726*22ce4affSfengbojiang int bdry_idx;
3727a9643ea8Slogwang boolean_t failed;
3728a9643ea8Slogwang
3729a9643ea8Slogwang vm_map_lock_read(map);
3730a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
3731*22ce4affSfengbojiang if (!vm_map_lookup_entry(map, start, &first_entry)) {
3732a9643ea8Slogwang vm_map_unlock_read(map);
3733a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
3734a9643ea8Slogwang } else if (start == end) {
3735*22ce4affSfengbojiang start = first_entry->start;
3736*22ce4affSfengbojiang end = first_entry->end;
3737a9643ea8Slogwang }
3738*22ce4affSfengbojiang
3739a9643ea8Slogwang /*
3740*22ce4affSfengbojiang * Make a first pass to check for user-wired memory, holes,
3741*22ce4affSfengbojiang * and partial invalidation of largepage mappings.
3742a9643ea8Slogwang */
3743*22ce4affSfengbojiang for (entry = first_entry; entry->start < end; entry = next_entry) {
3744*22ce4affSfengbojiang if (invalidate) {
3745*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
3746a9643ea8Slogwang vm_map_unlock_read(map);
3747a9643ea8Slogwang return (KERN_INVALID_ARGUMENT);
3748a9643ea8Slogwang }
3749*22ce4affSfengbojiang bdry_idx = (entry->eflags &
3750*22ce4affSfengbojiang MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
3751*22ce4affSfengbojiang MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
3752*22ce4affSfengbojiang if (bdry_idx != 0 &&
3753*22ce4affSfengbojiang ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
3754*22ce4affSfengbojiang (end & (pagesizes[bdry_idx] - 1)) != 0)) {
3755*22ce4affSfengbojiang vm_map_unlock_read(map);
3756*22ce4affSfengbojiang return (KERN_INVALID_ARGUMENT);
3757*22ce4affSfengbojiang }
3758*22ce4affSfengbojiang }
3759*22ce4affSfengbojiang next_entry = vm_map_entry_succ(entry);
3760*22ce4affSfengbojiang if (end > entry->end &&
3761*22ce4affSfengbojiang entry->end != next_entry->start) {
3762a9643ea8Slogwang vm_map_unlock_read(map);
3763a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
3764a9643ea8Slogwang }
3765a9643ea8Slogwang }
3766a9643ea8Slogwang
3767a9643ea8Slogwang if (invalidate)
3768a9643ea8Slogwang pmap_remove(map->pmap, start, end);
3769a9643ea8Slogwang failed = FALSE;
3770a9643ea8Slogwang
3771a9643ea8Slogwang /*
3772a9643ea8Slogwang * Make a second pass, cleaning/uncaching pages from the indicated
3773a9643ea8Slogwang * objects as we go.
3774a9643ea8Slogwang */
3775*22ce4affSfengbojiang for (entry = first_entry; entry->start < end;) {
3776*22ce4affSfengbojiang offset = entry->offset + (start - entry->start);
3777*22ce4affSfengbojiang size = (end <= entry->end ? end : entry->end) - start;
3778*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
3779a9643ea8Slogwang vm_map_t smap;
3780a9643ea8Slogwang vm_map_entry_t tentry;
3781a9643ea8Slogwang vm_size_t tsize;
3782a9643ea8Slogwang
3783*22ce4affSfengbojiang smap = entry->object.sub_map;
3784a9643ea8Slogwang vm_map_lock_read(smap);
3785a9643ea8Slogwang (void) vm_map_lookup_entry(smap, offset, &tentry);
3786a9643ea8Slogwang tsize = tentry->end - offset;
3787a9643ea8Slogwang if (tsize < size)
3788a9643ea8Slogwang size = tsize;
3789a9643ea8Slogwang object = tentry->object.vm_object;
3790a9643ea8Slogwang offset = tentry->offset + (offset - tentry->start);
3791a9643ea8Slogwang vm_map_unlock_read(smap);
3792a9643ea8Slogwang } else {
3793*22ce4affSfengbojiang object = entry->object.vm_object;
3794a9643ea8Slogwang }
3795a9643ea8Slogwang vm_object_reference(object);
3796a9643ea8Slogwang last_timestamp = map->timestamp;
3797a9643ea8Slogwang vm_map_unlock_read(map);
3798a9643ea8Slogwang if (!vm_object_sync(object, offset, size, syncio, invalidate))
3799a9643ea8Slogwang failed = TRUE;
3800a9643ea8Slogwang start += size;
3801a9643ea8Slogwang vm_object_deallocate(object);
3802a9643ea8Slogwang vm_map_lock_read(map);
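/*
 * If the map was not modified while unlocked, resume at the successor
 * of the entry just processed.  Otherwise re-lookup the updated start
 * offset; if no entry contains it, resume at the successor of the
 * predecessor returned by the lookup.
 */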
3803a9643ea8Slogwang if (last_timestamp == map->timestamp ||
3804*22ce4affSfengbojiang !vm_map_lookup_entry(map, start, &entry))
3805*22ce4affSfengbojiang entry = vm_map_entry_succ(entry);
3806a9643ea8Slogwang }
3807a9643ea8Slogwang
3808a9643ea8Slogwang vm_map_unlock_read(map);
3809a9643ea8Slogwang return (failed ? KERN_FAILURE : KERN_SUCCESS);
3810a9643ea8Slogwang }
3811a9643ea8Slogwang
3812a9643ea8Slogwang /*
3813a9643ea8Slogwang * vm_map_entry_unwire: [ internal use only ]
3814a9643ea8Slogwang *
3815a9643ea8Slogwang * Make the region specified by this entry pageable.
3816a9643ea8Slogwang *
3817a9643ea8Slogwang * The map in question should be locked.
3818a9643ea8Slogwang * [This is the reason for this routine's existence.]
3819a9643ea8Slogwang */
3820a9643ea8Slogwang static void
3821a9643ea8Slogwang vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3822a9643ea8Slogwang {
3823*22ce4affSfengbojiang vm_size_t size;
3824a9643ea8Slogwang
3825a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
3826a9643ea8Slogwang KASSERT(entry->wired_count > 0,
3827a9643ea8Slogwang ("vm_map_entry_unwire: entry %p isn't wired", entry));
3828*22ce4affSfengbojiang
3829*22ce4affSfengbojiang size = entry->end - entry->start;
3830*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
3831*22ce4affSfengbojiang vm_map_wire_user_count_sub(atop(size));
3832a9643ea8Slogwang pmap_unwire(map->pmap, entry->start, entry->end);
3833*22ce4affSfengbojiang vm_object_unwire(entry->object.vm_object, entry->offset, size,
3834*22ce4affSfengbojiang PQ_ACTIVE);
3835a9643ea8Slogwang entry->wired_count = 0;
3836a9643ea8Slogwang }
3837a9643ea8Slogwang
3838a9643ea8Slogwang static void
3839a9643ea8Slogwang vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3840a9643ea8Slogwang {
3841a9643ea8Slogwang
3842a9643ea8Slogwang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3843a9643ea8Slogwang vm_object_deallocate(entry->object.vm_object);
3844a9643ea8Slogwang uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3845a9643ea8Slogwang }
3846a9643ea8Slogwang
3847a9643ea8Slogwang /*
3848a9643ea8Slogwang * vm_map_entry_delete: [ internal use only ]
3849a9643ea8Slogwang *
3850a9643ea8Slogwang * Deallocate the given entry from the target map.
3851a9643ea8Slogwang */
3852a9643ea8Slogwang static void
3853a9643ea8Slogwang vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3854a9643ea8Slogwang {
3855a9643ea8Slogwang vm_object_t object;
3856*22ce4affSfengbojiang vm_pindex_t offidxstart, offidxend, size1;
3857*22ce4affSfengbojiang vm_size_t size;
3858a9643ea8Slogwang
3859*22ce4affSfengbojiang vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3860a9643ea8Slogwang object = entry->object.vm_object;
3861*22ce4affSfengbojiang
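/*
 * Guard entries have no backing object, credential, or swap charge;
 * having been unlinked above, they only need to be freed.
 */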
3862*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3863*22ce4affSfengbojiang MPASS(entry->cred == NULL);
3864*22ce4affSfengbojiang MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3865*22ce4affSfengbojiang MPASS(object == NULL);
3866*22ce4affSfengbojiang vm_map_entry_deallocate(entry, map->system_map);
3867*22ce4affSfengbojiang return;
3868*22ce4affSfengbojiang }
3869*22ce4affSfengbojiang
3870a9643ea8Slogwang size = entry->end - entry->start;
3871a9643ea8Slogwang map->size -= size;
3872a9643ea8Slogwang
3873a9643ea8Slogwang if (entry->cred != NULL) {
3874a9643ea8Slogwang swap_release_by_cred(size, entry->cred);
3875a9643ea8Slogwang crfree(entry->cred);
3876a9643ea8Slogwang }
3877a9643ea8Slogwang
3878*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
3879*22ce4affSfengbojiang entry->object.vm_object = NULL;
3880*22ce4affSfengbojiang } else if ((object->flags & OBJ_ANON) != 0 ||
3881*22ce4affSfengbojiang object == kernel_object) {
3882a9643ea8Slogwang KASSERT(entry->cred == NULL || object->cred == NULL ||
3883a9643ea8Slogwang (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3884a9643ea8Slogwang ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3885a9643ea8Slogwang offidxstart = OFF_TO_IDX(entry->offset);
3886*22ce4affSfengbojiang offidxend = offidxstart + atop(size);
3887a9643ea8Slogwang VM_OBJECT_WLOCK(object);
3888a9643ea8Slogwang if (object->ref_count != 1 &&
3889*22ce4affSfengbojiang ((object->flags & OBJ_ONEMAPPING) != 0 ||
3890*22ce4affSfengbojiang object == kernel_object)) {
3891a9643ea8Slogwang vm_object_collapse(object);
3892a9643ea8Slogwang
3893a9643ea8Slogwang /*
3894a9643ea8Slogwang * The option OBJPR_NOTMAPPED can be passed here
3895a9643ea8Slogwang * because vm_map_delete() already performed
3896a9643ea8Slogwang * pmap_remove() on the only mapping to this range
3897a9643ea8Slogwang * of pages.
3898a9643ea8Slogwang */
3899a9643ea8Slogwang vm_object_page_remove(object, offidxstart, offidxend,
3900a9643ea8Slogwang OBJPR_NOTMAPPED);
3901a9643ea8Slogwang if (offidxend >= object->size &&
3902a9643ea8Slogwang offidxstart < object->size) {
3903a9643ea8Slogwang size1 = object->size;
3904a9643ea8Slogwang object->size = offidxstart;
3905a9643ea8Slogwang if (object->cred != NULL) {
3906a9643ea8Slogwang size1 -= object->size;
3907a9643ea8Slogwang KASSERT(object->charge >= ptoa(size1),
3908*22ce4affSfengbojiang ("object %p charge < 0", object));
3909*22ce4affSfengbojiang swap_release_by_cred(ptoa(size1),
3910*22ce4affSfengbojiang object->cred);
3911a9643ea8Slogwang object->charge -= ptoa(size1);
3912a9643ea8Slogwang }
3913a9643ea8Slogwang }
3914a9643ea8Slogwang }
3915a9643ea8Slogwang VM_OBJECT_WUNLOCK(object);
3916*22ce4affSfengbojiang }
3917a9643ea8Slogwang if (map->system_map)
3918a9643ea8Slogwang vm_map_entry_deallocate(entry, TRUE);
3919a9643ea8Slogwang else {
3920*22ce4affSfengbojiang entry->defer_next = curthread->td_map_def_user;
3921a9643ea8Slogwang curthread->td_map_def_user = entry;
3922a9643ea8Slogwang }
3923a9643ea8Slogwang }
3924a9643ea8Slogwang
3925a9643ea8Slogwang /*
3926a9643ea8Slogwang * vm_map_delete: [ internal use only ]
3927a9643ea8Slogwang *
3928a9643ea8Slogwang * Deallocates the given address range from the target
3929a9643ea8Slogwang * map.
3930a9643ea8Slogwang */
3931a9643ea8Slogwang int
3932a9643ea8Slogwang vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3933a9643ea8Slogwang {
3934*22ce4affSfengbojiang vm_map_entry_t entry, next_entry, scratch_entry;
3935*22ce4affSfengbojiang int rv;
3936a9643ea8Slogwang
3937a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(map);
3938*22ce4affSfengbojiang
3939a9643ea8Slogwang if (start == end)
3940a9643ea8Slogwang return (KERN_SUCCESS);
3941a9643ea8Slogwang
3942a9643ea8Slogwang /*
3943*22ce4affSfengbojiang * Find the start of the region, and clip it.
3944*22ce4affSfengbojiang * Step through all entries in this region.
3945a9643ea8Slogwang */
3946*22ce4affSfengbojiang rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
3947*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3948*22ce4affSfengbojiang return (rv);
3949*22ce4affSfengbojiang for (; entry->start < end; entry = next_entry) {
3950a9643ea8Slogwang /*
3951a9643ea8Slogwang * Wait for wiring or unwiring of an entry to complete.
3952a9643ea8Slogwang * Also wait for any system wirings to disappear on
3953a9643ea8Slogwang * user maps.
3954a9643ea8Slogwang */
3955a9643ea8Slogwang if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3956a9643ea8Slogwang (vm_map_pmap(map) != kernel_pmap &&
3957a9643ea8Slogwang vm_map_entry_system_wired_count(entry) != 0)) {
3958a9643ea8Slogwang unsigned int last_timestamp;
3959a9643ea8Slogwang vm_offset_t saved_start;
3960a9643ea8Slogwang
3961a9643ea8Slogwang saved_start = entry->start;
3962a9643ea8Slogwang entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3963a9643ea8Slogwang last_timestamp = map->timestamp;
3964a9643ea8Slogwang (void) vm_map_unlock_and_wait(map, 0);
3965a9643ea8Slogwang vm_map_lock(map);
3966a9643ea8Slogwang if (last_timestamp + 1 != map->timestamp) {
3967a9643ea8Slogwang /*
3968a9643ea8Slogwang * Look again for the entry because the map was
3969a9643ea8Slogwang * modified while it was unlocked.
3970a9643ea8Slogwang * Specifically, the entry may have been
3971a9643ea8Slogwang * clipped, merged, or deleted.
3972a9643ea8Slogwang */
3973*22ce4affSfengbojiang rv = vm_map_lookup_clip_start(map, saved_start,
3974*22ce4affSfengbojiang &next_entry, &scratch_entry);
3975*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3976*22ce4affSfengbojiang break;
3977*22ce4affSfengbojiang } else
3978*22ce4affSfengbojiang next_entry = entry;
3979a9643ea8Slogwang continue;
3980a9643ea8Slogwang }
3981a9643ea8Slogwang
3982*22ce4affSfengbojiang /* XXXKIB or delete to the upper superpage boundary ? */
3983*22ce4affSfengbojiang rv = vm_map_clip_end(map, entry, end);
3984*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
3985*22ce4affSfengbojiang break;
3986*22ce4affSfengbojiang next_entry = vm_map_entry_succ(entry);
3987a9643ea8Slogwang
3988a9643ea8Slogwang /*
3989a9643ea8Slogwang * Unwire before removing addresses from the pmap; otherwise,
3990a9643ea8Slogwang * unwiring will put the entries back in the pmap.
3991a9643ea8Slogwang */
3992*22ce4affSfengbojiang if (entry->wired_count != 0)
3993a9643ea8Slogwang vm_map_entry_unwire(map, entry);
3994a9643ea8Slogwang
3995*22ce4affSfengbojiang /*
3996*22ce4affSfengbojiang * Remove mappings for the pages, but only if the
3997*22ce4affSfengbojiang * mappings could exist. For instance, it does not
3998*22ce4affSfengbojiang * make sense to call pmap_remove() for guard entries.
3999*22ce4affSfengbojiang */
4000*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
4001*22ce4affSfengbojiang entry->object.vm_object != NULL)
4002a9643ea8Slogwang pmap_remove(map->pmap, entry->start, entry->end);
4003a9643ea8Slogwang
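/*
 * Retract the anonymous mapping placement hint if the deleted entry
 * ends exactly at it.
 */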
4004*22ce4affSfengbojiang if (entry->end == map->anon_loc)
4005*22ce4affSfengbojiang map->anon_loc = entry->start;
4006*22ce4affSfengbojiang
4007a9643ea8Slogwang /*
4008a9643ea8Slogwang * Delete the entry only after removing all pmap
4009a9643ea8Slogwang * entries pointing to its pages. (Otherwise, its
4010a9643ea8Slogwang * page frames may be reallocated, and any modify bits
4011a9643ea8Slogwang * will be set in the wrong object!)
4012a9643ea8Slogwang */
4013a9643ea8Slogwang vm_map_entry_delete(map, entry);
4014a9643ea8Slogwang }
4015*22ce4affSfengbojiang return (rv);
4016a9643ea8Slogwang }
4017a9643ea8Slogwang
4018a9643ea8Slogwang /*
4019a9643ea8Slogwang * vm_map_remove:
4020a9643ea8Slogwang *
4021a9643ea8Slogwang * Remove the given address range from the target map.
4022a9643ea8Slogwang * This is the exported form of vm_map_delete.
4023a9643ea8Slogwang */
4024a9643ea8Slogwang int
4025a9643ea8Slogwang vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
4026a9643ea8Slogwang {
4027a9643ea8Slogwang int result;
4028a9643ea8Slogwang
4029a9643ea8Slogwang vm_map_lock(map);
4030a9643ea8Slogwang VM_MAP_RANGE_CHECK(map, start, end);
4031a9643ea8Slogwang result = vm_map_delete(map, start, end);
4032a9643ea8Slogwang vm_map_unlock(map);
4033a9643ea8Slogwang return (result);
4034a9643ea8Slogwang }
4035a9643ea8Slogwang
4036a9643ea8Slogwang /*
4037a9643ea8Slogwang * vm_map_check_protection:
4038a9643ea8Slogwang *
4039a9643ea8Slogwang * Assert that the target map allows the specified privilege on the
4040a9643ea8Slogwang * entire address region given. The entire region must be allocated.
4041a9643ea8Slogwang *
4042a9643ea8Slogwang * WARNING! This code does not and should not check whether the
4043a9643ea8Slogwang * contents of the region is accessible. For example a smaller file
4044a9643ea8Slogwang * might be mapped into a larger address space.
4045a9643ea8Slogwang *
4046a9643ea8Slogwang * NOTE! This code is also called by munmap().
4047a9643ea8Slogwang *
4048a9643ea8Slogwang * The map must be locked. A read lock is sufficient.
4049a9643ea8Slogwang */
4050a9643ea8Slogwang boolean_t
4051a9643ea8Slogwang vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
4052a9643ea8Slogwang vm_prot_t protection)
4053a9643ea8Slogwang {
4054a9643ea8Slogwang vm_map_entry_t entry;
4055a9643ea8Slogwang vm_map_entry_t tmp_entry;
4056a9643ea8Slogwang
4057a9643ea8Slogwang if (!vm_map_lookup_entry(map, start, &tmp_entry))
4058a9643ea8Slogwang return (FALSE);
4059a9643ea8Slogwang entry = tmp_entry;
4060a9643ea8Slogwang
4061a9643ea8Slogwang while (start < end) {
4062a9643ea8Slogwang /*
4063a9643ea8Slogwang * No holes allowed!
4064a9643ea8Slogwang */
4065a9643ea8Slogwang if (start < entry->start)
4066a9643ea8Slogwang return (FALSE);
4067a9643ea8Slogwang /*
4068a9643ea8Slogwang * Check protection associated with entry.
4069a9643ea8Slogwang */
4070a9643ea8Slogwang if ((entry->protection & protection) != protection)
4071a9643ea8Slogwang return (FALSE);
4072a9643ea8Slogwang /* go to next entry */
4073a9643ea8Slogwang start = entry->end;
4074*22ce4affSfengbojiang entry = vm_map_entry_succ(entry);
4075a9643ea8Slogwang }
4076a9643ea8Slogwang return (TRUE);
4077a9643ea8Slogwang }
4078a9643ea8Slogwang
4079a9643ea8Slogwang /*
4080*22ce4affSfengbojiang *
4081*22ce4affSfengbojiang * vm_map_copy_swap_object:
4082*22ce4affSfengbojiang *
4083*22ce4affSfengbojiang * Copies a swap-backed object from an existing map entry to a
4084*22ce4affSfengbojiang * new one. Carries forward the swap charge. May change the
4085*22ce4affSfengbojiang * src object on return.
4086*22ce4affSfengbojiang */
4087*22ce4affSfengbojiang static void
4088*22ce4affSfengbojiang vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
4089*22ce4affSfengbojiang vm_offset_t size, vm_ooffset_t *fork_charge)
4090*22ce4affSfengbojiang {
4091*22ce4affSfengbojiang vm_object_t src_object;
4092*22ce4affSfengbojiang struct ucred *cred;
4093*22ce4affSfengbojiang int charged;
4094*22ce4affSfengbojiang
4095*22ce4affSfengbojiang src_object = src_entry->object.vm_object;
4096*22ce4affSfengbojiang charged = ENTRY_CHARGED(src_entry);
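/*
 * For an anonymous source object, collapse its shadow chain and, if it
 * is mapped only once, split it so that src_entry references an object
 * covering just its own range.
 */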
4097*22ce4affSfengbojiang if ((src_object->flags & OBJ_ANON) != 0) {
4098*22ce4affSfengbojiang VM_OBJECT_WLOCK(src_object);
4099*22ce4affSfengbojiang vm_object_collapse(src_object);
4100*22ce4affSfengbojiang if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
4101*22ce4affSfengbojiang vm_object_split(src_entry);
4102*22ce4affSfengbojiang src_object = src_entry->object.vm_object;
4103*22ce4affSfengbojiang }
4104*22ce4affSfengbojiang vm_object_reference_locked(src_object);
4105*22ce4affSfengbojiang vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
4106*22ce4affSfengbojiang VM_OBJECT_WUNLOCK(src_object);
4107*22ce4affSfengbojiang } else
4108*22ce4affSfengbojiang vm_object_reference(src_object);
4109*22ce4affSfengbojiang if (src_entry->cred != NULL &&
4110*22ce4affSfengbojiang !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4111*22ce4affSfengbojiang KASSERT(src_object->cred == NULL,
4112*22ce4affSfengbojiang ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
4113*22ce4affSfengbojiang src_object));
4114*22ce4affSfengbojiang src_object->cred = src_entry->cred;
4115*22ce4affSfengbojiang src_object->charge = size;
4116*22ce4affSfengbojiang }
4117*22ce4affSfengbojiang dst_entry->object.vm_object = src_object;
4118*22ce4affSfengbojiang if (charged) {
4119*22ce4affSfengbojiang cred = curthread->td_ucred;
4120*22ce4affSfengbojiang crhold(cred);
4121*22ce4affSfengbojiang dst_entry->cred = cred;
4122*22ce4affSfengbojiang *fork_charge += size;
4123*22ce4affSfengbojiang if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4124*22ce4affSfengbojiang crhold(cred);
4125*22ce4affSfengbojiang src_entry->cred = cred;
4126*22ce4affSfengbojiang *fork_charge += size;
4127*22ce4affSfengbojiang }
4128*22ce4affSfengbojiang }
4129*22ce4affSfengbojiang }
4130*22ce4affSfengbojiang
4131*22ce4affSfengbojiang /*
4132a9643ea8Slogwang * vm_map_copy_entry:
4133a9643ea8Slogwang *
4134a9643ea8Slogwang * Copies the contents of the source entry to the destination
4135a9643ea8Slogwang * entry. The entries *must* be aligned properly.
4136a9643ea8Slogwang */
4137a9643ea8Slogwang static void
4138a9643ea8Slogwang vm_map_copy_entry(
4139a9643ea8Slogwang vm_map_t src_map,
4140a9643ea8Slogwang vm_map_t dst_map,
4141a9643ea8Slogwang vm_map_entry_t src_entry,
4142a9643ea8Slogwang vm_map_entry_t dst_entry,
4143a9643ea8Slogwang vm_ooffset_t *fork_charge)
4144a9643ea8Slogwang {
4145a9643ea8Slogwang vm_object_t src_object;
4146a9643ea8Slogwang vm_map_entry_t fake_entry;
4147a9643ea8Slogwang vm_offset_t size;
4148a9643ea8Slogwang
4149a9643ea8Slogwang VM_MAP_ASSERT_LOCKED(dst_map);
4150a9643ea8Slogwang
4151a9643ea8Slogwang if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
4152a9643ea8Slogwang return;
4153a9643ea8Slogwang
4154a9643ea8Slogwang if (src_entry->wired_count == 0 ||
4155a9643ea8Slogwang (src_entry->protection & VM_PROT_WRITE) == 0) {
4156a9643ea8Slogwang /*
4157a9643ea8Slogwang * If the source entry is marked needs_copy, it is already
4158a9643ea8Slogwang * write-protected.
4159a9643ea8Slogwang */
4160a9643ea8Slogwang if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
4161a9643ea8Slogwang (src_entry->protection & VM_PROT_WRITE) != 0) {
4162a9643ea8Slogwang pmap_protect(src_map->pmap,
4163a9643ea8Slogwang src_entry->start,
4164a9643ea8Slogwang src_entry->end,
4165a9643ea8Slogwang src_entry->protection & ~VM_PROT_WRITE);
4166a9643ea8Slogwang }
4167a9643ea8Slogwang
4168a9643ea8Slogwang /*
4169a9643ea8Slogwang * Make a copy of the object.
4170a9643ea8Slogwang */
4171a9643ea8Slogwang size = src_entry->end - src_entry->start;
4172a9643ea8Slogwang if ((src_object = src_entry->object.vm_object) != NULL) {
4173*22ce4affSfengbojiang if (src_object->type == OBJT_DEFAULT ||
4174*22ce4affSfengbojiang src_object->type == OBJT_SWAP) {
4175*22ce4affSfengbojiang vm_map_copy_swap_object(src_entry, dst_entry,
4176*22ce4affSfengbojiang size, fork_charge);
4177*22ce4affSfengbojiang /* May have split/collapsed, reload obj. */
4178a9643ea8Slogwang src_object = src_entry->object.vm_object;
4179*22ce4affSfengbojiang } else {
4180*22ce4affSfengbojiang vm_object_reference(src_object);
4181a9643ea8Slogwang dst_entry->object.vm_object = src_object;
4182a9643ea8Slogwang }
4183*22ce4affSfengbojiang src_entry->eflags |= MAP_ENTRY_COW |
4184*22ce4affSfengbojiang MAP_ENTRY_NEEDS_COPY;
4185*22ce4affSfengbojiang dst_entry->eflags |= MAP_ENTRY_COW |
4186*22ce4affSfengbojiang MAP_ENTRY_NEEDS_COPY;
4187a9643ea8Slogwang dst_entry->offset = src_entry->offset;
4188*22ce4affSfengbojiang if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
4189a9643ea8Slogwang /*
4190*22ce4affSfengbojiang * MAP_ENTRY_WRITECNT cannot
4191a9643ea8Slogwang * indicate write reference from
4192a9643ea8Slogwang * src_entry, since the entry is
4193a9643ea8Slogwang * marked as needs copy. Allocate a
4194a9643ea8Slogwang * fake entry that is used to
4195*22ce4affSfengbojiang * decrement object->un_pager writecount
4196a9643ea8Slogwang * at the appropriate time. Attach
4197a9643ea8Slogwang * fake_entry to the deferred list.
4198a9643ea8Slogwang */
4199a9643ea8Slogwang fake_entry = vm_map_entry_create(dst_map);
4200*22ce4affSfengbojiang fake_entry->eflags = MAP_ENTRY_WRITECNT;
4201*22ce4affSfengbojiang src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
4202a9643ea8Slogwang vm_object_reference(src_object);
4203a9643ea8Slogwang fake_entry->object.vm_object = src_object;
4204a9643ea8Slogwang fake_entry->start = src_entry->start;
4205a9643ea8Slogwang fake_entry->end = src_entry->end;
4206*22ce4affSfengbojiang fake_entry->defer_next =
4207*22ce4affSfengbojiang curthread->td_map_def_user;
4208a9643ea8Slogwang curthread->td_map_def_user = fake_entry;
4209a9643ea8Slogwang }
4210*22ce4affSfengbojiang
4211*22ce4affSfengbojiang pmap_copy(dst_map->pmap, src_map->pmap,
4212*22ce4affSfengbojiang dst_entry->start, dst_entry->end - dst_entry->start,
4213*22ce4affSfengbojiang src_entry->start);
4214a9643ea8Slogwang } else {
4215a9643ea8Slogwang dst_entry->object.vm_object = NULL;
4216a9643ea8Slogwang dst_entry->offset = 0;
4217a9643ea8Slogwang if (src_entry->cred != NULL) {
4218a9643ea8Slogwang dst_entry->cred = curthread->td_ucred;
4219a9643ea8Slogwang crhold(dst_entry->cred);
4220a9643ea8Slogwang *fork_charge += size;
4221a9643ea8Slogwang }
4222a9643ea8Slogwang }
4223a9643ea8Slogwang } else {
4224a9643ea8Slogwang /*
4225a9643ea8Slogwang * We don't want to make writeable wired pages copy-on-write.
4226a9643ea8Slogwang * Immediately copy these pages into the new map by simulating
4227a9643ea8Slogwang * page faults. The new pages are pageable.
4228a9643ea8Slogwang */
4229a9643ea8Slogwang vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
4230a9643ea8Slogwang fork_charge);
4231a9643ea8Slogwang }
4232a9643ea8Slogwang }
4233a9643ea8Slogwang
4234a9643ea8Slogwang /*
4235a9643ea8Slogwang * vmspace_map_entry_forked:
4236a9643ea8Slogwang * Update the newly-forked vmspace each time a map entry is inherited
4237a9643ea8Slogwang * or copied. The values for vm_dsize and vm_tsize are approximate
4238a9643ea8Slogwang * (and mostly-obsolete ideas in the face of mmap(2) et al.)
4239a9643ea8Slogwang */
4240a9643ea8Slogwang static void
4241a9643ea8Slogwang vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
4242a9643ea8Slogwang vm_map_entry_t entry)
4243a9643ea8Slogwang {
4244a9643ea8Slogwang vm_size_t entrysize;
4245a9643ea8Slogwang vm_offset_t newend;
4246a9643ea8Slogwang
4247*22ce4affSfengbojiang if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
4248*22ce4affSfengbojiang return;
4249a9643ea8Slogwang entrysize = entry->end - entry->start;
4250a9643ea8Slogwang vm2->vm_map.size += entrysize;
4251a9643ea8Slogwang if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
4252a9643ea8Slogwang vm2->vm_ssize += btoc(entrysize);
4253a9643ea8Slogwang } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
4254a9643ea8Slogwang entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
4255a9643ea8Slogwang newend = MIN(entry->end,
4256a9643ea8Slogwang (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
4257a9643ea8Slogwang vm2->vm_dsize += btoc(newend - entry->start);
4258a9643ea8Slogwang } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
4259a9643ea8Slogwang entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
4260a9643ea8Slogwang newend = MIN(entry->end,
4261a9643ea8Slogwang (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
4262a9643ea8Slogwang vm2->vm_tsize += btoc(newend - entry->start);
4263a9643ea8Slogwang }
4264a9643ea8Slogwang }
4265a9643ea8Slogwang
4266a9643ea8Slogwang /*
4267a9643ea8Slogwang * vmspace_fork:
4268a9643ea8Slogwang * Create a new process vmspace structure and vm_map
4269a9643ea8Slogwang * based on those of an existing process. The new map
4270a9643ea8Slogwang * is based on the old map, according to the inheritance
4271a9643ea8Slogwang * values on the regions in that map.
4272a9643ea8Slogwang *
4273a9643ea8Slogwang * XXX It might be worth coalescing the entries added to the new vmspace.
4274a9643ea8Slogwang *
4275a9643ea8Slogwang * The source map must not be locked.
4276a9643ea8Slogwang */
4277a9643ea8Slogwang struct vmspace *
4278a9643ea8Slogwang vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
4279a9643ea8Slogwang {
4280a9643ea8Slogwang struct vmspace *vm2;
4281a9643ea8Slogwang vm_map_t new_map, old_map;
4282a9643ea8Slogwang vm_map_entry_t new_entry, old_entry;
4283a9643ea8Slogwang vm_object_t object;
4284*22ce4affSfengbojiang int error, locked;
4285*22ce4affSfengbojiang vm_inherit_t inh;
4286a9643ea8Slogwang
4287a9643ea8Slogwang old_map = &vm1->vm_map;
4288a9643ea8Slogwang /* Copy immutable fields of vm1 to vm2. */
4289*22ce4affSfengbojiang vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
4290*22ce4affSfengbojiang pmap_pinit);
4291a9643ea8Slogwang if (vm2 == NULL)
4292a9643ea8Slogwang return (NULL);
4293*22ce4affSfengbojiang
4294a9643ea8Slogwang vm2->vm_taddr = vm1->vm_taddr;
4295a9643ea8Slogwang vm2->vm_daddr = vm1->vm_daddr;
4296a9643ea8Slogwang vm2->vm_maxsaddr = vm1->vm_maxsaddr;
4297a9643ea8Slogwang vm_map_lock(old_map);
4298a9643ea8Slogwang if (old_map->busy)
4299a9643ea8Slogwang vm_map_wait_busy(old_map);
4300a9643ea8Slogwang new_map = &vm2->vm_map;
4301a9643ea8Slogwang locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
4302a9643ea8Slogwang KASSERT(locked, ("vmspace_fork: lock failed"));
4303a9643ea8Slogwang
4304*22ce4affSfengbojiang error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
4305*22ce4affSfengbojiang if (error != 0) {
4306*22ce4affSfengbojiang sx_xunlock(&old_map->lock);
4307*22ce4affSfengbojiang sx_xunlock(&new_map->lock);
4308*22ce4affSfengbojiang vm_map_process_deferred();
4309*22ce4affSfengbojiang vmspace_free(vm2);
4310*22ce4affSfengbojiang return (NULL);
4311*22ce4affSfengbojiang }
4312a9643ea8Slogwang
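/*
 * The child map starts with the parent's anonymous mapping placement
 * hint and inherits its ASLR and W^X settings.
 */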
4313*22ce4affSfengbojiang new_map->anon_loc = old_map->anon_loc;
4314*22ce4affSfengbojiang new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
4315*22ce4affSfengbojiang MAP_WXORX);
4316*22ce4affSfengbojiang
4317*22ce4affSfengbojiang VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
4318*22ce4affSfengbojiang if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
4319a9643ea8Slogwang panic("vm_map_fork: encountered a submap");
4320a9643ea8Slogwang
4321*22ce4affSfengbojiang inh = old_entry->inheritance;
4322*22ce4affSfengbojiang if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4323*22ce4affSfengbojiang inh != VM_INHERIT_NONE)
4324*22ce4affSfengbojiang inh = VM_INHERIT_COPY;
4325*22ce4affSfengbojiang
4326*22ce4affSfengbojiang switch (inh) {
4327a9643ea8Slogwang case VM_INHERIT_NONE:
4328a9643ea8Slogwang break;
4329a9643ea8Slogwang
4330a9643ea8Slogwang case VM_INHERIT_SHARE:
4331a9643ea8Slogwang /*
4332*22ce4affSfengbojiang * Clone the entry, creating the shared object if
4333*22ce4affSfengbojiang * necessary.
4334a9643ea8Slogwang */
4335a9643ea8Slogwang object = old_entry->object.vm_object;
4336a9643ea8Slogwang if (object == NULL) {
4337*22ce4affSfengbojiang vm_map_entry_back(old_entry);
4338*22ce4affSfengbojiang object = old_entry->object.vm_object;
4339a9643ea8Slogwang }
4340a9643ea8Slogwang
4341a9643ea8Slogwang /*
4342a9643ea8Slogwang * Add the reference before calling vm_object_shadow
4343a9643ea8Slogwang * to ensure that a shadow object is created.
4344a9643ea8Slogwang */
4345a9643ea8Slogwang vm_object_reference(object);
4346a9643ea8Slogwang if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4347a9643ea8Slogwang vm_object_shadow(&old_entry->object.vm_object,
4348a9643ea8Slogwang &old_entry->offset,
4349*22ce4affSfengbojiang old_entry->end - old_entry->start,
4350*22ce4affSfengbojiang old_entry->cred,
4351a9643ea8Slogwang /* Transfer the second reference too. */
4352*22ce4affSfengbojiang true);
4353*22ce4affSfengbojiang old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4354*22ce4affSfengbojiang old_entry->cred = NULL;
4355a9643ea8Slogwang
4356a9643ea8Slogwang /*
4357*22ce4affSfengbojiang * As in vm_map_merged_neighbor_dispose(),
4358*22ce4affSfengbojiang * the vnode lock will not be acquired in
4359a9643ea8Slogwang * this call to vm_object_deallocate().
4360a9643ea8Slogwang */
4361a9643ea8Slogwang vm_object_deallocate(object);
4362a9643ea8Slogwang object = old_entry->object.vm_object;
4363*22ce4affSfengbojiang } else {
4364a9643ea8Slogwang VM_OBJECT_WLOCK(object);
4365a9643ea8Slogwang vm_object_clear_flag(object, OBJ_ONEMAPPING);
4366a9643ea8Slogwang if (old_entry->cred != NULL) {
4367*22ce4affSfengbojiang KASSERT(object->cred == NULL,
4368*22ce4affSfengbojiang ("vmspace_fork both cred"));
4369a9643ea8Slogwang object->cred = old_entry->cred;
4370*22ce4affSfengbojiang object->charge = old_entry->end -
4371*22ce4affSfengbojiang old_entry->start;
4372a9643ea8Slogwang old_entry->cred = NULL;
4373a9643ea8Slogwang }
4374a9643ea8Slogwang
4375a9643ea8Slogwang /*
4376a9643ea8Slogwang * Assert the correct state of the vnode
4377a9643ea8Slogwang * v_writecount while the object is locked, so
4378a9643ea8Slogwang * that it does not have to be relocked later
4379a9643ea8Slogwang * just for this assertion.
4380a9643ea8Slogwang */
4381*22ce4affSfengbojiang if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
4382a9643ea8Slogwang object->type == OBJT_VNODE) {
4383*22ce4affSfengbojiang KASSERT(((struct vnode *)object->
4384*22ce4affSfengbojiang handle)->v_writecount > 0,
4385*22ce4affSfengbojiang ("vmspace_fork: v_writecount %p",
4386*22ce4affSfengbojiang object));
4387*22ce4affSfengbojiang KASSERT(object->un_pager.vnp.
4388*22ce4affSfengbojiang writemappings > 0,
4389a9643ea8Slogwang ("vmspace_fork: vnp.writecount %p",
4390a9643ea8Slogwang object));
4391a9643ea8Slogwang }
4392a9643ea8Slogwang VM_OBJECT_WUNLOCK(object);
4393*22ce4affSfengbojiang }
4394a9643ea8Slogwang
4395a9643ea8Slogwang /*
4396a9643ea8Slogwang * Clone the entry, referencing the shared object.
4397a9643ea8Slogwang */
4398a9643ea8Slogwang new_entry = vm_map_entry_create(new_map);
4399a9643ea8Slogwang *new_entry = *old_entry;
4400a9643ea8Slogwang new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4401a9643ea8Slogwang MAP_ENTRY_IN_TRANSITION);
4402a9643ea8Slogwang new_entry->wiring_thread = NULL;
4403a9643ea8Slogwang new_entry->wired_count = 0;
4404*22ce4affSfengbojiang if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
4405*22ce4affSfengbojiang vm_pager_update_writecount(object,
4406a9643ea8Slogwang new_entry->start, new_entry->end);
4407a9643ea8Slogwang }
4408*22ce4affSfengbojiang vm_map_entry_set_vnode_text(new_entry, true);
4409a9643ea8Slogwang
4410a9643ea8Slogwang /*
4411a9643ea8Slogwang * Insert the entry into the new map -- we know we're
4412a9643ea8Slogwang * inserting at the end of the new map.
4413a9643ea8Slogwang */
4414*22ce4affSfengbojiang vm_map_entry_link(new_map, new_entry);
4415a9643ea8Slogwang vmspace_map_entry_forked(vm1, vm2, new_entry);
4416a9643ea8Slogwang
4417a9643ea8Slogwang /*
4418a9643ea8Slogwang * Update the physical map
4419a9643ea8Slogwang */
4420a9643ea8Slogwang pmap_copy(new_map->pmap, old_map->pmap,
4421a9643ea8Slogwang new_entry->start,
4422a9643ea8Slogwang (old_entry->end - old_entry->start),
4423a9643ea8Slogwang old_entry->start);
4424a9643ea8Slogwang break;
4425a9643ea8Slogwang
4426a9643ea8Slogwang case VM_INHERIT_COPY:
4427a9643ea8Slogwang /*
4428a9643ea8Slogwang * Clone the entry and link into the map.
4429a9643ea8Slogwang */
4430a9643ea8Slogwang new_entry = vm_map_entry_create(new_map);
4431a9643ea8Slogwang *new_entry = *old_entry;
4432a9643ea8Slogwang /*
4433a9643ea8Slogwang * Copied entry is COW over the old object.
4434a9643ea8Slogwang */
4435a9643ea8Slogwang new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4436*22ce4affSfengbojiang MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
4437a9643ea8Slogwang new_entry->wiring_thread = NULL;
4438a9643ea8Slogwang new_entry->wired_count = 0;
4439a9643ea8Slogwang new_entry->object.vm_object = NULL;
4440a9643ea8Slogwang new_entry->cred = NULL;
4441*22ce4affSfengbojiang vm_map_entry_link(new_map, new_entry);
4442a9643ea8Slogwang vmspace_map_entry_forked(vm1, vm2, new_entry);
4443a9643ea8Slogwang vm_map_copy_entry(old_map, new_map, old_entry,
4444a9643ea8Slogwang new_entry, fork_charge);
4445*22ce4affSfengbojiang vm_map_entry_set_vnode_text(new_entry, true);
4446*22ce4affSfengbojiang break;
4447*22ce4affSfengbojiang
4448*22ce4affSfengbojiang case VM_INHERIT_ZERO:
4449*22ce4affSfengbojiang /*
4450*22ce4affSfengbojiang * Create a new anonymous mapping entry modelled from
4451*22ce4affSfengbojiang * the old one.
4452*22ce4affSfengbojiang */
4453*22ce4affSfengbojiang new_entry = vm_map_entry_create(new_map);
4454*22ce4affSfengbojiang memset(new_entry, 0, sizeof(*new_entry));
4455*22ce4affSfengbojiang
4456*22ce4affSfengbojiang new_entry->start = old_entry->start;
4457*22ce4affSfengbojiang new_entry->end = old_entry->end;
4458*22ce4affSfengbojiang new_entry->eflags = old_entry->eflags &
4459*22ce4affSfengbojiang ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
4460*22ce4affSfengbojiang MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
4461*22ce4affSfengbojiang MAP_ENTRY_SPLIT_BOUNDARY_MASK);
4462*22ce4affSfengbojiang new_entry->protection = old_entry->protection;
4463*22ce4affSfengbojiang new_entry->max_protection = old_entry->max_protection;
4464*22ce4affSfengbojiang new_entry->inheritance = VM_INHERIT_ZERO;
4465*22ce4affSfengbojiang
4466*22ce4affSfengbojiang vm_map_entry_link(new_map, new_entry);
4467*22ce4affSfengbojiang vmspace_map_entry_forked(vm1, vm2, new_entry);
4468*22ce4affSfengbojiang
4469*22ce4affSfengbojiang new_entry->cred = curthread->td_ucred;
4470*22ce4affSfengbojiang crhold(new_entry->cred);
4471*22ce4affSfengbojiang *fork_charge += (new_entry->end - new_entry->start);
4472*22ce4affSfengbojiang
4473a9643ea8Slogwang break;
4474a9643ea8Slogwang }
4475a9643ea8Slogwang }
4476a9643ea8Slogwang /*
4477a9643ea8Slogwang * Use inlined vm_map_unlock() to postpone handling the deferred
4478a9643ea8Slogwang * map entries, which cannot be done until both old_map and
4479a9643ea8Slogwang * new_map locks are released.
4480a9643ea8Slogwang */
4481a9643ea8Slogwang sx_xunlock(&old_map->lock);
4482a9643ea8Slogwang sx_xunlock(&new_map->lock);
4483a9643ea8Slogwang vm_map_process_deferred();
4484a9643ea8Slogwang
4485a9643ea8Slogwang return (vm2);
4486a9643ea8Slogwang }
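/*
 * Illustrative sketch (not part of the original code; names are
 * placeholders): which of the VM_INHERIT_* cases above applies to a
 * map entry is selected from userspace with minherit(2), and fork(2)
 * then shares, copies, omits, or zero-fills the region in the child
 * accordingly:
 *
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	(void)minherit(p, len, INHERIT_ZERO);
 *	if (fork() == 0) {
 *		... the child sees fresh zero-filled pages at p,
 *		    handled by the VM_INHERIT_ZERO case above ...
 *	}
 *
 * INHERIT_SHARE, INHERIT_COPY and INHERIT_NONE select the other cases.
 */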
4487a9643ea8Slogwang
4488*22ce4affSfengbojiang /*
4489*22ce4affSfengbojiang * Create a process's stack for exec_new_vmspace(). This function is never
4490*22ce4affSfengbojiang * asked to wire the newly created stack.
4491*22ce4affSfengbojiang */
4492a9643ea8Slogwang int
4493a9643ea8Slogwang vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4494a9643ea8Slogwang vm_prot_t prot, vm_prot_t max, int cow)
4495a9643ea8Slogwang {
4496a9643ea8Slogwang vm_size_t growsize, init_ssize;
4497*22ce4affSfengbojiang rlim_t vmemlim;
4498a9643ea8Slogwang int rv;
4499a9643ea8Slogwang
4500*22ce4affSfengbojiang MPASS((map->flags & MAP_WIREFUTURE) == 0);
4501a9643ea8Slogwang growsize = sgrowsiz;
4502a9643ea8Slogwang init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
4503a9643ea8Slogwang vm_map_lock(map);
4504a9643ea8Slogwang vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4505a9643ea8Slogwang /* If we would blow our VMEM resource limit, no go */
4506a9643ea8Slogwang if (map->size + init_ssize > vmemlim) {
4507a9643ea8Slogwang rv = KERN_NO_SPACE;
4508a9643ea8Slogwang goto out;
4509a9643ea8Slogwang }
4510a9643ea8Slogwang rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
4511a9643ea8Slogwang max, cow);
4512a9643ea8Slogwang out:
4513a9643ea8Slogwang vm_map_unlock(map);
4514a9643ea8Slogwang return (rv);
4515a9643ea8Slogwang }
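/*
 * Usage sketch (illustrative; the exact arguments live in the exec
 * code): exec_new_vmspace() creates the initial user stack with a call
 * of roughly this shape, the stack protection coming from the
 * process's sysentvec:
 *
 *	rv = vm_map_stack(map, stack_addr, stack_size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 *	    MAP_STACK_GROWS_DOWN);
 *
 * stack_addr is the lowest address of the reserved range and
 * stack_size its maximum (not initial) size.
 */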
4516a9643ea8Slogwang
4517*22ce4affSfengbojiang static int stack_guard_page = 1;
4518*22ce4affSfengbojiang SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
4519*22ce4affSfengbojiang &stack_guard_page, 0,
4520*22ce4affSfengbojiang "Specifies the number of guard pages for a stack that grows");
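/*
 * The knob above is both a loader tunable and a sysctl,
 * security.bsd.stack_guard_page; its value is a page count, e.g.
 *
 *	sysctl security.bsd.stack_guard_page=4
 *
 * vm_map_stack_locked() converts it to bytes (sgp) and records it in
 * the gap entry's next_read field for vm_map_growstack() to honor.
 */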
4521*22ce4affSfengbojiang
4522a9643ea8Slogwang static int
4523a9643ea8Slogwang vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4524a9643ea8Slogwang vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
4525a9643ea8Slogwang {
4526a9643ea8Slogwang vm_map_entry_t new_entry, prev_entry;
4527*22ce4affSfengbojiang vm_offset_t bot, gap_bot, gap_top, top;
4528*22ce4affSfengbojiang vm_size_t init_ssize, sgp;
4529a9643ea8Slogwang int orient, rv;
4530a9643ea8Slogwang
4531a9643ea8Slogwang /*
4532a9643ea8Slogwang * The stack orientation is piggybacked with the cow argument.
4533a9643ea8Slogwang * Extract it into orient and mask the cow argument so that we
4534a9643ea8Slogwang * don't pass it around further.
4535a9643ea8Slogwang */
4536a9643ea8Slogwang orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
4537a9643ea8Slogwang KASSERT(orient != 0, ("No stack grow direction"));
4538*22ce4affSfengbojiang KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
4539*22ce4affSfengbojiang ("bi-dir stack"));
4540a9643ea8Slogwang
4541*22ce4affSfengbojiang if (max_ssize == 0 ||
4542*22ce4affSfengbojiang !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
4543*22ce4affSfengbojiang return (KERN_INVALID_ADDRESS);
4544*22ce4affSfengbojiang sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4545*22ce4affSfengbojiang (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4546*22ce4affSfengbojiang (vm_size_t)stack_guard_page * PAGE_SIZE;
4547*22ce4affSfengbojiang if (sgp >= max_ssize)
4548*22ce4affSfengbojiang return (KERN_INVALID_ARGUMENT);
4549a9643ea8Slogwang
4550*22ce4affSfengbojiang init_ssize = growsize;
4551*22ce4affSfengbojiang if (max_ssize < init_ssize + sgp)
4552*22ce4affSfengbojiang init_ssize = max_ssize - sgp;
4553a9643ea8Slogwang
4554a9643ea8Slogwang /* If addr is already mapped, no go */
4555a9643ea8Slogwang if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4556a9643ea8Slogwang return (KERN_NO_SPACE);
4557a9643ea8Slogwang
4558a9643ea8Slogwang /*
4559a9643ea8Slogwang * If we can't accommodate max_ssize in the current mapping, no go.
4560a9643ea8Slogwang */
4561*22ce4affSfengbojiang if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
4562a9643ea8Slogwang return (KERN_NO_SPACE);
4563a9643ea8Slogwang
4564a9643ea8Slogwang /*
4565a9643ea8Slogwang * We initially map a stack of only init_ssize. We will grow as
4566a9643ea8Slogwang * needed later. Depending on the orientation of the stack (i.e.
4567a9643ea8Slogwang * the grow direction) we either map at the top of the range, the
4568a9643ea8Slogwang * bottom of the range or in the middle.
4569a9643ea8Slogwang *
4570a9643ea8Slogwang * Note: we would normally expect prot and max to be VM_PROT_ALL,
4571a9643ea8Slogwang * and cow to be 0. Possibly we should eliminate these as input
4572a9643ea8Slogwang * parameters, and just pass these values here in the insert call.
4573a9643ea8Slogwang */
4574*22ce4affSfengbojiang if (orient == MAP_STACK_GROWS_DOWN) {
4575a9643ea8Slogwang bot = addrbos + max_ssize - init_ssize;
4576a9643ea8Slogwang top = bot + init_ssize;
4577*22ce4affSfengbojiang gap_bot = addrbos;
4578*22ce4affSfengbojiang gap_top = bot;
4579*22ce4affSfengbojiang } else /* if (orient == MAP_STACK_GROWS_UP) */ {
4580*22ce4affSfengbojiang bot = addrbos;
4581*22ce4affSfengbojiang top = bot + init_ssize;
4582*22ce4affSfengbojiang gap_bot = top;
4583*22ce4affSfengbojiang gap_top = addrbos + max_ssize;
4584*22ce4affSfengbojiang }
4585a9643ea8Slogwang rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
4586*22ce4affSfengbojiang if (rv != KERN_SUCCESS)
4587*22ce4affSfengbojiang return (rv);
4588*22ce4affSfengbojiang new_entry = vm_map_entry_succ(prev_entry);
4589*22ce4affSfengbojiang KASSERT(new_entry->end == top || new_entry->start == bot,
4590*22ce4affSfengbojiang ("Bad entry start/end for new stack entry"));
4591a9643ea8Slogwang KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
4592a9643ea8Slogwang (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4593a9643ea8Slogwang ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4594a9643ea8Slogwang KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
4595a9643ea8Slogwang (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
4596a9643ea8Slogwang ("new entry lacks MAP_ENTRY_GROWS_UP"));
4597*22ce4affSfengbojiang if (gap_bot == gap_top)
4598*22ce4affSfengbojiang return (KERN_SUCCESS);
4599*22ce4affSfengbojiang rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4600*22ce4affSfengbojiang VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
4601*22ce4affSfengbojiang MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
4602*22ce4affSfengbojiang if (rv == KERN_SUCCESS) {
4603*22ce4affSfengbojiang /*
4604*22ce4affSfengbojiang * Gap can never successfully handle a fault, so
4605*22ce4affSfengbojiang * read-ahead logic is never used for it. Re-use
4606*22ce4affSfengbojiang * next_read of the gap entry to store
4607*22ce4affSfengbojiang * stack_guard_page for vm_map_growstack().
4608*22ce4affSfengbojiang */
4609*22ce4affSfengbojiang if (orient == MAP_STACK_GROWS_DOWN)
4610*22ce4affSfengbojiang vm_map_entry_pred(new_entry)->next_read = sgp;
4611*22ce4affSfengbojiang else
4612*22ce4affSfengbojiang vm_map_entry_succ(new_entry)->next_read = sgp;
4613*22ce4affSfengbojiang } else {
4614*22ce4affSfengbojiang (void)vm_map_delete(map, bot, top);
4615a9643ea8Slogwang }
4616a9643ea8Slogwang return (rv);
4617a9643ea8Slogwang }
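/*
 * Worked example for the grows-down case: with addrbos = A,
 * max_ssize = M, init_ssize = I and guard size sgp = G, the code above
 * leaves the range laid out as
 *
 *	[A, A + M - I)		gap entry (MAP_CREATE_STACK_GAP_DN),
 *				next_read = G
 *	[A + M - I, A + M)	initial stack entry
 *
 * A later fault below the stack lands in the gap entry and is turned
 * into growth by vm_map_growstack(), which always keeps at least G
 * bytes of the gap unmapped as the guard.
 */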
4618a9643ea8Slogwang
4619*22ce4affSfengbojiang /*
4620*22ce4affSfengbojiang * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we
4621*22ce4affSfengbojiang * successfully grow the stack.
4622a9643ea8Slogwang */
4623*22ce4affSfengbojiang static int
4624*22ce4affSfengbojiang vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4625a9643ea8Slogwang {
4626*22ce4affSfengbojiang vm_map_entry_t stack_entry;
4627*22ce4affSfengbojiang struct proc *p;
4628*22ce4affSfengbojiang struct vmspace *vm;
4629a9643ea8Slogwang struct ucred *cred;
4630*22ce4affSfengbojiang vm_offset_t gap_end, gap_start, grow_start;
4631*22ce4affSfengbojiang vm_size_t grow_amount, guard, max_grow;
4632*22ce4affSfengbojiang rlim_t lmemlim, stacklim, vmemlim;
4633*22ce4affSfengbojiang int rv, rv1;
4634*22ce4affSfengbojiang bool gap_deleted, grow_down, is_procstack;
4635a9643ea8Slogwang #ifdef notyet
4636a9643ea8Slogwang uint64_t limit;
4637a9643ea8Slogwang #endif
4638a9643ea8Slogwang #ifdef RACCT
4639a9643ea8Slogwang int error;
4640a9643ea8Slogwang #endif
4641a9643ea8Slogwang
4642*22ce4affSfengbojiang p = curproc;
4643*22ce4affSfengbojiang vm = p->p_vmspace;
4644*22ce4affSfengbojiang
4645*22ce4affSfengbojiang /*
4646*22ce4affSfengbojiang * Disallow stack growth when the access is performed by a
4647*22ce4affSfengbojiang * debugger or AIO daemon. The reason is that the wrong
4648*22ce4affSfengbojiang * resource limits are applied.
4649*22ce4affSfengbojiang */
4650*22ce4affSfengbojiang if (p != initproc && (map != &p->p_vmspace->vm_map ||
4651*22ce4affSfengbojiang p->p_textvp == NULL))
4652*22ce4affSfengbojiang return (KERN_FAILURE);
4653*22ce4affSfengbojiang
4654*22ce4affSfengbojiang MPASS(!map->system_map);
4655*22ce4affSfengbojiang
4656a9643ea8Slogwang lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4657a9643ea8Slogwang stacklim = lim_cur(curthread, RLIMIT_STACK);
4658a9643ea8Slogwang vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4659*22ce4affSfengbojiang retry:
4660*22ce4affSfengbojiang /* If addr is not in a hole for a stack grow area, no need to grow. */
4661*22ce4affSfengbojiang if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4662*22ce4affSfengbojiang return (KERN_FAILURE);
4663*22ce4affSfengbojiang if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4664a9643ea8Slogwang return (KERN_SUCCESS);
4665*22ce4affSfengbojiang if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
4666*22ce4affSfengbojiang stack_entry = vm_map_entry_succ(gap_entry);
4667*22ce4affSfengbojiang if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4668*22ce4affSfengbojiang stack_entry->start != gap_entry->end)
4669*22ce4affSfengbojiang return (KERN_FAILURE);
4670*22ce4affSfengbojiang grow_amount = round_page(stack_entry->start - addr);
4671*22ce4affSfengbojiang grow_down = true;
4672*22ce4affSfengbojiang } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
4673*22ce4affSfengbojiang stack_entry = vm_map_entry_pred(gap_entry);
4674*22ce4affSfengbojiang if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
4675*22ce4affSfengbojiang stack_entry->end != gap_entry->start)
4676*22ce4affSfengbojiang return (KERN_FAILURE);
4677*22ce4affSfengbojiang grow_amount = round_page(addr + 1 - stack_entry->end);
4678*22ce4affSfengbojiang grow_down = false;
4679a9643ea8Slogwang } else {
4680*22ce4affSfengbojiang return (KERN_FAILURE);
4681a9643ea8Slogwang }
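/*
 * The guard is the part of the gap that must remain unmapped: the
 * byte count stashed in next_read when the gap was created, unless
 * this process opted out of the stack gap.  Growth is limited to the
 * remainder of the gap.
 */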
4682*22ce4affSfengbojiang guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4683*22ce4affSfengbojiang (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4684*22ce4affSfengbojiang gap_entry->next_read;
4685*22ce4affSfengbojiang max_grow = gap_entry->end - gap_entry->start;
4686*22ce4affSfengbojiang if (guard > max_grow)
4687a9643ea8Slogwang return (KERN_NO_SPACE);
4688*22ce4affSfengbojiang max_grow -= guard;
4689*22ce4affSfengbojiang if (grow_amount > max_grow)
4690a9643ea8Slogwang return (KERN_NO_SPACE);
4691a9643ea8Slogwang
4692a9643ea8Slogwang /*
4693a9643ea8Slogwang * If this is the main process stack, see if we're over the stack
4694a9643ea8Slogwang * limit.
4695a9643ea8Slogwang */
4696*22ce4affSfengbojiang is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4697*22ce4affSfengbojiang addr < (vm_offset_t)p->p_sysent->sv_usrstack;
4698*22ce4affSfengbojiang if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4699a9643ea8Slogwang return (KERN_NO_SPACE);
4700*22ce4affSfengbojiang
4701a9643ea8Slogwang #ifdef RACCT
4702a9643ea8Slogwang if (racct_enable) {
4703a9643ea8Slogwang PROC_LOCK(p);
4704a9643ea8Slogwang if (is_procstack && racct_set(p, RACCT_STACK,
4705a9643ea8Slogwang ctob(vm->vm_ssize) + grow_amount)) {
4706a9643ea8Slogwang PROC_UNLOCK(p);
4707a9643ea8Slogwang return (KERN_NO_SPACE);
4708a9643ea8Slogwang }
4709a9643ea8Slogwang PROC_UNLOCK(p);
4710a9643ea8Slogwang }
4711a9643ea8Slogwang #endif
4712a9643ea8Slogwang
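/*
 * Round the request up to the stack growth increment (sgrowsiz), then
 * clamp it to the room left in the gap and, for the main process
 * stack, to RLIMIT_STACK.
 */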
4713*22ce4affSfengbojiang grow_amount = roundup(grow_amount, sgrowsiz);
4714*22ce4affSfengbojiang if (grow_amount > max_grow)
4715*22ce4affSfengbojiang grow_amount = max_grow;
4716a9643ea8Slogwang if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4717a9643ea8Slogwang grow_amount = trunc_page((vm_size_t)stacklim) -
4718a9643ea8Slogwang ctob(vm->vm_ssize);
4719a9643ea8Slogwang }
4720*22ce4affSfengbojiang
4721a9643ea8Slogwang #ifdef notyet
4722a9643ea8Slogwang PROC_LOCK(p);
4723a9643ea8Slogwang limit = racct_get_available(p, RACCT_STACK);
4724a9643ea8Slogwang PROC_UNLOCK(p);
4725a9643ea8Slogwang if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4726a9643ea8Slogwang grow_amount = limit - ctob(vm->vm_ssize);
4727a9643ea8Slogwang #endif
4728*22ce4affSfengbojiang
4729*22ce4affSfengbojiang if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4730a9643ea8Slogwang if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4731a9643ea8Slogwang rv = KERN_NO_SPACE;
4732a9643ea8Slogwang goto out;
4733a9643ea8Slogwang }
4734a9643ea8Slogwang #ifdef RACCT
4735a9643ea8Slogwang if (racct_enable) {
4736a9643ea8Slogwang PROC_LOCK(p);
4737a9643ea8Slogwang if (racct_set(p, RACCT_MEMLOCK,
4738a9643ea8Slogwang ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4739a9643ea8Slogwang PROC_UNLOCK(p);
4740a9643ea8Slogwang rv = KERN_NO_SPACE;
4741a9643ea8Slogwang goto out;
4742a9643ea8Slogwang }
4743a9643ea8Slogwang PROC_UNLOCK(p);
4744a9643ea8Slogwang }
4745a9643ea8Slogwang #endif
4746a9643ea8Slogwang }
4747*22ce4affSfengbojiang
4748a9643ea8Slogwang /* If we would blow our VMEM resource limit, no go */
4749a9643ea8Slogwang if (map->size + grow_amount > vmemlim) {
4750a9643ea8Slogwang rv = KERN_NO_SPACE;
4751a9643ea8Slogwang goto out;
4752a9643ea8Slogwang }
4753a9643ea8Slogwang #ifdef RACCT
4754a9643ea8Slogwang if (racct_enable) {
4755a9643ea8Slogwang PROC_LOCK(p);
4756a9643ea8Slogwang if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4757a9643ea8Slogwang PROC_UNLOCK(p);
4758a9643ea8Slogwang rv = KERN_NO_SPACE;
4759a9643ea8Slogwang goto out;
4760a9643ea8Slogwang }
4761a9643ea8Slogwang PROC_UNLOCK(p);
4762a9643ea8Slogwang }
4763a9643ea8Slogwang #endif
4764a9643ea8Slogwang
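/*
 * The map is currently read-locked.  If upgrading to an exclusive
 * lock fails, the lock was dropped and the map may have changed, so
 * reacquire the read lock and redo the gap-entry lookup.
 */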
4765*22ce4affSfengbojiang if (vm_map_lock_upgrade(map)) {
4766*22ce4affSfengbojiang gap_entry = NULL;
4767*22ce4affSfengbojiang vm_map_lock_read(map);
4768*22ce4affSfengbojiang goto retry;
4769a9643ea8Slogwang }
4770a9643ea8Slogwang
4771*22ce4affSfengbojiang if (grow_down) {
4772*22ce4affSfengbojiang grow_start = gap_entry->end - grow_amount;
4773*22ce4affSfengbojiang if (gap_entry->start + grow_amount == gap_entry->end) {
4774*22ce4affSfengbojiang gap_start = gap_entry->start;
4775*22ce4affSfengbojiang gap_end = gap_entry->end;
4776*22ce4affSfengbojiang vm_map_entry_delete(map, gap_entry);
4777*22ce4affSfengbojiang gap_deleted = true;
4778*22ce4affSfengbojiang } else {
4779*22ce4affSfengbojiang MPASS(gap_entry->start < gap_entry->end - grow_amount);
4780*22ce4affSfengbojiang vm_map_entry_resize(map, gap_entry, -grow_amount);
4781*22ce4affSfengbojiang gap_deleted = false;
4782*22ce4affSfengbojiang }
4783*22ce4affSfengbojiang rv = vm_map_insert(map, NULL, 0, grow_start,
4784*22ce4affSfengbojiang grow_start + grow_amount,
4785*22ce4affSfengbojiang stack_entry->protection, stack_entry->max_protection,
4786a9643ea8Slogwang MAP_STACK_GROWS_DOWN);
4787*22ce4affSfengbojiang if (rv != KERN_SUCCESS) {
4788*22ce4affSfengbojiang if (gap_deleted) {
4789*22ce4affSfengbojiang rv1 = vm_map_insert(map, NULL, 0, gap_start,
4790*22ce4affSfengbojiang gap_end, VM_PROT_NONE, VM_PROT_NONE,
4791*22ce4affSfengbojiang MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
4792*22ce4affSfengbojiang MPASS(rv1 == KERN_SUCCESS);
4793*22ce4affSfengbojiang } else
4794*22ce4affSfengbojiang vm_map_entry_resize(map, gap_entry,
4795*22ce4affSfengbojiang grow_amount);
4796a9643ea8Slogwang }
4797a9643ea8Slogwang } else {
4798*22ce4affSfengbojiang grow_start = stack_entry->end;
4799a9643ea8Slogwang cred = stack_entry->cred;
4800a9643ea8Slogwang if (cred == NULL && stack_entry->object.vm_object != NULL)
4801a9643ea8Slogwang cred = stack_entry->object.vm_object->cred;
4802a9643ea8Slogwang if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
4803a9643ea8Slogwang rv = KERN_NO_SPACE;
4804a9643ea8Slogwang /* Grow the underlying object if applicable. */
4805a9643ea8Slogwang else if (stack_entry->object.vm_object == NULL ||
4806a9643ea8Slogwang vm_object_coalesce(stack_entry->object.vm_object,
4807a9643ea8Slogwang stack_entry->offset,
4808a9643ea8Slogwang (vm_size_t)(stack_entry->end - stack_entry->start),
4809*22ce4affSfengbojiang grow_amount, cred != NULL)) {
4810*22ce4affSfengbojiang if (gap_entry->start + grow_amount == gap_entry->end) {
4811*22ce4affSfengbojiang vm_map_entry_delete(map, gap_entry);
4812*22ce4affSfengbojiang vm_map_entry_resize(map, stack_entry,
4813*22ce4affSfengbojiang grow_amount);
4814*22ce4affSfengbojiang } else {
4815*22ce4affSfengbojiang gap_entry->start += grow_amount;
4816*22ce4affSfengbojiang stack_entry->end += grow_amount;
4817*22ce4affSfengbojiang }
4818*22ce4affSfengbojiang map->size += grow_amount;
4819a9643ea8Slogwang rv = KERN_SUCCESS;
4820a9643ea8Slogwang } else
4821a9643ea8Slogwang rv = KERN_FAILURE;
4822a9643ea8Slogwang }
4823a9643ea8Slogwang if (rv == KERN_SUCCESS && is_procstack)
4824a9643ea8Slogwang vm->vm_ssize += btoc(grow_amount);
4825a9643ea8Slogwang
4826a9643ea8Slogwang /*
4827a9643ea8Slogwang * Heed the MAP_WIREFUTURE flag if it was set for this process.
4828a9643ea8Slogwang */
4829*22ce4affSfengbojiang if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4830*22ce4affSfengbojiang rv = vm_map_wire_locked(map, grow_start,
4831*22ce4affSfengbojiang grow_start + grow_amount,
4832*22ce4affSfengbojiang VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4833a9643ea8Slogwang }
4834*22ce4affSfengbojiang vm_map_lock_downgrade(map);
4835a9643ea8Slogwang
4836a9643ea8Slogwang out:
4837a9643ea8Slogwang #ifdef RACCT
4838a9643ea8Slogwang if (racct_enable && rv != KERN_SUCCESS) {
4839a9643ea8Slogwang PROC_LOCK(p);
4840a9643ea8Slogwang error = racct_set(p, RACCT_VMEM, map->size);
4841a9643ea8Slogwang KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4842a9643ea8Slogwang if (!old_mlock) {
4843a9643ea8Slogwang error = racct_set(p, RACCT_MEMLOCK,
4844a9643ea8Slogwang ptoa(pmap_wired_count(map->pmap)));
4845a9643ea8Slogwang KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4846a9643ea8Slogwang }
4847a9643ea8Slogwang error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4848a9643ea8Slogwang KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4849a9643ea8Slogwang PROC_UNLOCK(p);
4850a9643ea8Slogwang }
4851a9643ea8Slogwang #endif
4852a9643ea8Slogwang
4853a9643ea8Slogwang return (rv);
4854a9643ea8Slogwang }
4855a9643ea8Slogwang
4856a9643ea8Slogwang /*
4857a9643ea8Slogwang * Unshare the specified VM space for exec. If other processes share
4858a9643ea8Slogwang * it, then create a new one. The new vmspace starts out with no mappings.
4859a9643ea8Slogwang */
4860a9643ea8Slogwang int
4861a9643ea8Slogwang vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4862a9643ea8Slogwang {
4863a9643ea8Slogwang struct vmspace *oldvmspace = p->p_vmspace;
4864a9643ea8Slogwang struct vmspace *newvmspace;
4865a9643ea8Slogwang
4866a9643ea8Slogwang KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4867a9643ea8Slogwang ("vmspace_exec recursed"));
4868*22ce4affSfengbojiang newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4869a9643ea8Slogwang if (newvmspace == NULL)
4870a9643ea8Slogwang return (ENOMEM);
4871a9643ea8Slogwang newvmspace->vm_swrss = oldvmspace->vm_swrss;
4872a9643ea8Slogwang /*
4873a9643ea8Slogwang * This code is written this way for prototype purposes. The
4874a9643ea8Slogwang * goal is to avoid running down the vmspace here, but to let the
4875a9643ea8Slogwang * other processes that are still using the vmspace finally run
4876a9643ea8Slogwang * it down. Even though there is little or no chance of blocking
4877a9643ea8Slogwang * here, it is a good idea to keep this form for future mods.
4878a9643ea8Slogwang */
4879a9643ea8Slogwang PROC_VMSPACE_LOCK(p);
4880a9643ea8Slogwang p->p_vmspace = newvmspace;
4881a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
4882a9643ea8Slogwang if (p == curthread->td_proc)
4883a9643ea8Slogwang pmap_activate(curthread);
4884a9643ea8Slogwang curthread->td_pflags |= TDP_EXECVMSPC;
4885a9643ea8Slogwang return (0);
4886a9643ea8Slogwang }
4887a9643ea8Slogwang
4888a9643ea8Slogwang /*
4889a9643ea8Slogwang * Unshare the specified VM space for forcing COW. This
4890a9643ea8Slogwang * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4891a9643ea8Slogwang */
4892a9643ea8Slogwang int
4893a9643ea8Slogwang vmspace_unshare(struct proc *p)
4894a9643ea8Slogwang {
4895a9643ea8Slogwang struct vmspace *oldvmspace = p->p_vmspace;
4896a9643ea8Slogwang struct vmspace *newvmspace;
4897a9643ea8Slogwang vm_ooffset_t fork_charge;
4898a9643ea8Slogwang
4899*22ce4affSfengbojiang if (refcount_load(&oldvmspace->vm_refcnt) == 1)
4900a9643ea8Slogwang return (0);
4901a9643ea8Slogwang fork_charge = 0;
4902a9643ea8Slogwang newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4903a9643ea8Slogwang if (newvmspace == NULL)
4904a9643ea8Slogwang return (ENOMEM);
4905a9643ea8Slogwang if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4906a9643ea8Slogwang vmspace_free(newvmspace);
4907a9643ea8Slogwang return (ENOMEM);
4908a9643ea8Slogwang }
4909a9643ea8Slogwang PROC_VMSPACE_LOCK(p);
4910a9643ea8Slogwang p->p_vmspace = newvmspace;
4911a9643ea8Slogwang PROC_VMSPACE_UNLOCK(p);
4912a9643ea8Slogwang if (p == curthread->td_proc)
4913a9643ea8Slogwang pmap_activate(curthread);
4914a9643ea8Slogwang vmspace_free(oldvmspace);
4915a9643ea8Slogwang return (0);
4916a9643ea8Slogwang }
4917a9643ea8Slogwang
4918a9643ea8Slogwang /*
4919a9643ea8Slogwang * vm_map_lookup:
4920a9643ea8Slogwang *
4921a9643ea8Slogwang * Finds the VM object, offset, and
4922a9643ea8Slogwang * protection for a given virtual address in the
4923a9643ea8Slogwang * specified map, assuming a page fault of the
4924a9643ea8Slogwang * type specified.
4925a9643ea8Slogwang *
4926a9643ea8Slogwang * Leaves the map in question locked for read; return
4927a9643ea8Slogwang * values are guaranteed until a vm_map_lookup_done
4928a9643ea8Slogwang * call is performed. Note that the map argument
4929a9643ea8Slogwang * is in/out; the returned map must be used in
4930a9643ea8Slogwang * the call to vm_map_lookup_done.
4931a9643ea8Slogwang *
4932a9643ea8Slogwang * A handle (out_entry) is returned for use in
4933a9643ea8Slogwang * vm_map_lookup_done, to make that fast.
4934a9643ea8Slogwang *
4935a9643ea8Slogwang * If a lookup is requested with "write protection"
4936a9643ea8Slogwang * specified, the map may be changed to perform virtual
4937a9643ea8Slogwang * copying operations, although the data referenced will
4938a9643ea8Slogwang * remain the same.
4939a9643ea8Slogwang */
4940a9643ea8Slogwang int
4941a9643ea8Slogwang vm_map_lookup(vm_map_t *var_map, /* IN/OUT */
4942a9643ea8Slogwang vm_offset_t vaddr,
4943a9643ea8Slogwang vm_prot_t fault_typea,
4944a9643ea8Slogwang vm_map_entry_t *out_entry, /* OUT */
4945a9643ea8Slogwang vm_object_t *object, /* OUT */
4946a9643ea8Slogwang vm_pindex_t *pindex, /* OUT */
4947a9643ea8Slogwang vm_prot_t *out_prot, /* OUT */
4948a9643ea8Slogwang boolean_t *wired) /* OUT */
4949a9643ea8Slogwang {
4950a9643ea8Slogwang vm_map_entry_t entry;
4951a9643ea8Slogwang vm_map_t map = *var_map;
4952a9643ea8Slogwang vm_prot_t prot;
4953*22ce4affSfengbojiang vm_prot_t fault_type;
4954a9643ea8Slogwang vm_object_t eobject;
4955a9643ea8Slogwang vm_size_t size;
4956a9643ea8Slogwang struct ucred *cred;
4957a9643ea8Slogwang
4958*22ce4affSfengbojiang RetryLookup:
4959a9643ea8Slogwang
4960a9643ea8Slogwang vm_map_lock_read(map);
4961a9643ea8Slogwang
4962*22ce4affSfengbojiang RetryLookupLocked:
4963a9643ea8Slogwang /*
4964a9643ea8Slogwang * Lookup the faulting address.
4965a9643ea8Slogwang */
4966a9643ea8Slogwang if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
4967a9643ea8Slogwang vm_map_unlock_read(map);
4968a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
4969a9643ea8Slogwang }
4970a9643ea8Slogwang
4971a9643ea8Slogwang entry = *out_entry;
4972a9643ea8Slogwang
4973a9643ea8Slogwang /*
4974a9643ea8Slogwang * Handle submaps.
4975a9643ea8Slogwang */
4976a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4977a9643ea8Slogwang vm_map_t old_map = map;
4978a9643ea8Slogwang
4979a9643ea8Slogwang *var_map = map = entry->object.sub_map;
4980a9643ea8Slogwang vm_map_unlock_read(old_map);
4981a9643ea8Slogwang goto RetryLookup;
4982a9643ea8Slogwang }
4983a9643ea8Slogwang
4984a9643ea8Slogwang /*
4985a9643ea8Slogwang * Check whether this task is allowed to have this page.
4986a9643ea8Slogwang */
4987a9643ea8Slogwang prot = entry->protection;
4988*22ce4affSfengbojiang if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4989*22ce4affSfengbojiang fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4990*22ce4affSfengbojiang if (prot == VM_PROT_NONE && map != kernel_map &&
4991*22ce4affSfengbojiang (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4992*22ce4affSfengbojiang (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4993*22ce4affSfengbojiang MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4994*22ce4affSfengbojiang vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4995*22ce4affSfengbojiang goto RetryLookupLocked;
4996*22ce4affSfengbojiang }
4997*22ce4affSfengbojiang fault_type = fault_typea & VM_PROT_ALL;
4998a9643ea8Slogwang if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4999a9643ea8Slogwang vm_map_unlock_read(map);
5000a9643ea8Slogwang return (KERN_PROTECTION_FAILURE);
5001a9643ea8Slogwang }
5002a9643ea8Slogwang KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
5003a9643ea8Slogwang (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
5004a9643ea8Slogwang (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
5005a9643ea8Slogwang ("entry %p flags %x", entry, entry->eflags));
5006a9643ea8Slogwang if ((fault_typea & VM_PROT_COPY) != 0 &&
5007a9643ea8Slogwang (entry->max_protection & VM_PROT_WRITE) == 0 &&
5008a9643ea8Slogwang (entry->eflags & MAP_ENTRY_COW) == 0) {
5009a9643ea8Slogwang vm_map_unlock_read(map);
5010a9643ea8Slogwang return (KERN_PROTECTION_FAILURE);
5011a9643ea8Slogwang }
5012a9643ea8Slogwang
5013a9643ea8Slogwang /*
5014a9643ea8Slogwang * If this page is not pageable, we have to get it for all possible
5015a9643ea8Slogwang * accesses.
5016a9643ea8Slogwang */
5017a9643ea8Slogwang *wired = (entry->wired_count != 0);
5018a9643ea8Slogwang if (*wired)
5019a9643ea8Slogwang fault_type = entry->protection;
5020a9643ea8Slogwang size = entry->end - entry->start;
5021*22ce4affSfengbojiang
5022a9643ea8Slogwang /*
5023a9643ea8Slogwang * If the entry was copy-on-write, we either ...
5024a9643ea8Slogwang */
5025a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5026a9643ea8Slogwang /*
5027a9643ea8Slogwang * If we want to write the page, we may as well handle that
5028a9643ea8Slogwang * now since we've got the map locked.
5029a9643ea8Slogwang *
5030a9643ea8Slogwang * If we don't need to write the page, we just demote the
5031a9643ea8Slogwang * permissions allowed.
5032a9643ea8Slogwang */
5033a9643ea8Slogwang if ((fault_type & VM_PROT_WRITE) != 0 ||
5034a9643ea8Slogwang (fault_typea & VM_PROT_COPY) != 0) {
5035a9643ea8Slogwang /*
5036a9643ea8Slogwang * Make a new object, and place it in the object
5037a9643ea8Slogwang * chain. Note that no new references have appeared
5038a9643ea8Slogwang * -- one just moved from the map to the new
5039a9643ea8Slogwang * object.
5040a9643ea8Slogwang */
5041a9643ea8Slogwang if (vm_map_lock_upgrade(map))
5042a9643ea8Slogwang goto RetryLookup;
5043a9643ea8Slogwang
5044a9643ea8Slogwang if (entry->cred == NULL) {
5045a9643ea8Slogwang /*
5046a9643ea8Slogwang * The debugger owner is charged for
5047a9643ea8Slogwang * the memory.
5048a9643ea8Slogwang */
5049a9643ea8Slogwang cred = curthread->td_ucred;
5050a9643ea8Slogwang crhold(cred);
5051a9643ea8Slogwang if (!swap_reserve_by_cred(size, cred)) {
5052a9643ea8Slogwang crfree(cred);
5053a9643ea8Slogwang vm_map_unlock(map);
5054a9643ea8Slogwang return (KERN_RESOURCE_SHORTAGE);
5055a9643ea8Slogwang }
5056a9643ea8Slogwang entry->cred = cred;
5057a9643ea8Slogwang }
5058a9643ea8Slogwang eobject = entry->object.vm_object;
5059*22ce4affSfengbojiang vm_object_shadow(&entry->object.vm_object,
5060*22ce4affSfengbojiang &entry->offset, size, entry->cred, false);
5061*22ce4affSfengbojiang if (eobject == entry->object.vm_object) {
5062a9643ea8Slogwang /*
5063a9643ea8Slogwang * The object was not shadowed.
5064a9643ea8Slogwang */
5065a9643ea8Slogwang swap_release_by_cred(size, entry->cred);
5066a9643ea8Slogwang crfree(entry->cred);
5067a9643ea8Slogwang }
5068*22ce4affSfengbojiang entry->cred = NULL;
5069*22ce4affSfengbojiang entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
5070a9643ea8Slogwang
5071a9643ea8Slogwang vm_map_lock_downgrade(map);
5072a9643ea8Slogwang } else {
5073a9643ea8Slogwang /*
5074a9643ea8Slogwang * We're attempting to read a copy-on-write page --
5075a9643ea8Slogwang * don't allow writes.
5076a9643ea8Slogwang */
5077a9643ea8Slogwang prot &= ~VM_PROT_WRITE;
5078a9643ea8Slogwang }
5079a9643ea8Slogwang }
5080a9643ea8Slogwang
5081a9643ea8Slogwang /*
5082a9643ea8Slogwang * Create an object if necessary.
5083a9643ea8Slogwang */
5084*22ce4affSfengbojiang if (entry->object.vm_object == NULL && !map->system_map) {
5085a9643ea8Slogwang if (vm_map_lock_upgrade(map))
5086a9643ea8Slogwang goto RetryLookup;
5087*22ce4affSfengbojiang entry->object.vm_object = vm_object_allocate_anon(atop(size),
5088*22ce4affSfengbojiang NULL, entry->cred, entry->cred != NULL ? size : 0);
5089a9643ea8Slogwang entry->offset = 0;
5090a9643ea8Slogwang entry->cred = NULL;
5091a9643ea8Slogwang vm_map_lock_downgrade(map);
5092a9643ea8Slogwang }
5093a9643ea8Slogwang
5094a9643ea8Slogwang /*
5095a9643ea8Slogwang * Return the object/offset from this entry. If the entry was
5096a9643ea8Slogwang * copy-on-write or empty, it has been fixed up.
5097a9643ea8Slogwang */
5098a9643ea8Slogwang *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5099a9643ea8Slogwang *object = entry->object.vm_object;
5100a9643ea8Slogwang
5101a9643ea8Slogwang *out_prot = prot;
5102a9643ea8Slogwang return (KERN_SUCCESS);
5103a9643ea8Slogwang }
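/*
 * Usage sketch (illustrative, error handling omitted): a fault-style
 * consumer pairs vm_map_lookup() with vm_map_lookup_done():
 *
 *	vm_map_t map = &curproc->p_vmspace->vm_map;
 *	vm_map_entry_t entry;
 *	vm_object_t object;
 *	vm_pindex_t pindex;
 *	vm_prot_t prot;
 *	boolean_t wired;
 *
 *	if (vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *	    &pindex, &prot, &wired) == KERN_SUCCESS) {
 *		... use object and pindex; the map stays read-locked ...
 *		vm_map_lookup_done(map, entry);
 *	}
 *
 * map is passed by reference because the lookup may descend into a
 * submap and return that map instead; the value handed back must be
 * the one given to vm_map_lookup_done().
 */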
5104a9643ea8Slogwang
5105a9643ea8Slogwang /*
5106a9643ea8Slogwang * vm_map_lookup_locked:
5107a9643ea8Slogwang *
5108a9643ea8Slogwang * Lookup the faulting address. A version of vm_map_lookup that returns
5109a9643ea8Slogwang * KERN_FAILURE instead of blocking on map lock or memory allocation.
5110a9643ea8Slogwang */
5111a9643ea8Slogwang int
5112a9643ea8Slogwang vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */
5113a9643ea8Slogwang vm_offset_t vaddr,
5114a9643ea8Slogwang vm_prot_t fault_typea,
5115a9643ea8Slogwang vm_map_entry_t *out_entry, /* OUT */
5116a9643ea8Slogwang vm_object_t *object, /* OUT */
5117a9643ea8Slogwang vm_pindex_t *pindex, /* OUT */
5118a9643ea8Slogwang vm_prot_t *out_prot, /* OUT */
5119a9643ea8Slogwang boolean_t *wired) /* OUT */
5120a9643ea8Slogwang {
5121a9643ea8Slogwang vm_map_entry_t entry;
5122a9643ea8Slogwang vm_map_t map = *var_map;
5123a9643ea8Slogwang vm_prot_t prot;
5124a9643ea8Slogwang vm_prot_t fault_type = fault_typea;
5125a9643ea8Slogwang
5126a9643ea8Slogwang /*
5127a9643ea8Slogwang * Lookup the faulting address.
5128a9643ea8Slogwang */
5129a9643ea8Slogwang if (!vm_map_lookup_entry(map, vaddr, out_entry))
5130a9643ea8Slogwang return (KERN_INVALID_ADDRESS);
5131a9643ea8Slogwang
5132a9643ea8Slogwang entry = *out_entry;
5133a9643ea8Slogwang
5134a9643ea8Slogwang /*
5135a9643ea8Slogwang * Fail if the entry refers to a submap.
5136a9643ea8Slogwang */
5137a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
5138a9643ea8Slogwang return (KERN_FAILURE);
5139a9643ea8Slogwang
5140a9643ea8Slogwang /*
5141a9643ea8Slogwang * Check whether this task is allowed to have this page.
5142a9643ea8Slogwang */
5143a9643ea8Slogwang prot = entry->protection;
5144a9643ea8Slogwang fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
5145a9643ea8Slogwang if ((fault_type & prot) != fault_type)
5146a9643ea8Slogwang return (KERN_PROTECTION_FAILURE);
5147a9643ea8Slogwang
5148a9643ea8Slogwang /*
5149a9643ea8Slogwang * If this page is not pageable, we have to get it for all possible
5150a9643ea8Slogwang * accesses.
5151a9643ea8Slogwang */
5152a9643ea8Slogwang *wired = (entry->wired_count != 0);
5153a9643ea8Slogwang if (*wired)
5154a9643ea8Slogwang fault_type = entry->protection;
5155a9643ea8Slogwang
5156a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5157a9643ea8Slogwang /*
5158a9643ea8Slogwang * Fail if the entry was copy-on-write for a write fault.
5159a9643ea8Slogwang */
5160a9643ea8Slogwang if (fault_type & VM_PROT_WRITE)
5161a9643ea8Slogwang return (KERN_FAILURE);
5162a9643ea8Slogwang /*
5163a9643ea8Slogwang * We're attempting to read a copy-on-write page --
5164a9643ea8Slogwang * don't allow writes.
5165a9643ea8Slogwang */
5166a9643ea8Slogwang prot &= ~VM_PROT_WRITE;
5167a9643ea8Slogwang }
5168a9643ea8Slogwang
5169a9643ea8Slogwang /*
5170a9643ea8Slogwang * Fail if an object should be created.
5171a9643ea8Slogwang */
5172a9643ea8Slogwang if (entry->object.vm_object == NULL && !map->system_map)
5173a9643ea8Slogwang return (KERN_FAILURE);
5174a9643ea8Slogwang
5175a9643ea8Slogwang /*
5176a9643ea8Slogwang * Return the object/offset from this entry. If the entry was
5177a9643ea8Slogwang * copy-on-write or empty, it has been fixed up.
5178a9643ea8Slogwang */
5179a9643ea8Slogwang *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5180a9643ea8Slogwang *object = entry->object.vm_object;
5181a9643ea8Slogwang
5182a9643ea8Slogwang *out_prot = prot;
5183a9643ea8Slogwang return (KERN_SUCCESS);
5184a9643ea8Slogwang }
5185a9643ea8Slogwang
5186a9643ea8Slogwang /*
5187a9643ea8Slogwang * vm_map_lookup_done:
5188a9643ea8Slogwang *
5189a9643ea8Slogwang * Releases locks acquired by a vm_map_lookup
5190a9643ea8Slogwang * (according to the handle returned by that lookup).
5191a9643ea8Slogwang */
5192a9643ea8Slogwang void
5193a9643ea8Slogwang vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
5194a9643ea8Slogwang {
5195a9643ea8Slogwang /*
5196a9643ea8Slogwang * Unlock the main-level map
5197a9643ea8Slogwang */
5198a9643ea8Slogwang vm_map_unlock_read(map);
5199a9643ea8Slogwang }
5200a9643ea8Slogwang
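/*
 * Out-of-line accessors exported for kernel modules: they provide
 * stable (KBI) entry points equivalent to the corresponding inline
 * accessors in vm_map.h, so a module need not depend on struct
 * vm_map's layout.
 */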
5201*22ce4affSfengbojiang vm_offset_t
5202*22ce4affSfengbojiang vm_map_max_KBI(const struct vm_map *map)
5203*22ce4affSfengbojiang {
5204*22ce4affSfengbojiang
5205*22ce4affSfengbojiang return (vm_map_max(map));
5206*22ce4affSfengbojiang }
5207*22ce4affSfengbojiang
5208*22ce4affSfengbojiang vm_offset_t
5209*22ce4affSfengbojiang vm_map_min_KBI(const struct vm_map *map)
5210*22ce4affSfengbojiang {
5211*22ce4affSfengbojiang
5212*22ce4affSfengbojiang return (vm_map_min(map));
5213*22ce4affSfengbojiang }
5214*22ce4affSfengbojiang
5215*22ce4affSfengbojiang pmap_t
5216*22ce4affSfengbojiang vm_map_pmap_KBI(vm_map_t map)
5217*22ce4affSfengbojiang {
5218*22ce4affSfengbojiang
5219*22ce4affSfengbojiang return (map->pmap);
5220*22ce4affSfengbojiang }
5221*22ce4affSfengbojiang
5222*22ce4affSfengbojiang bool
5223*22ce4affSfengbojiang vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
5224*22ce4affSfengbojiang {
5225*22ce4affSfengbojiang
5226*22ce4affSfengbojiang return (vm_map_range_valid(map, start, end));
5227*22ce4affSfengbojiang }
5228*22ce4affSfengbojiang
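/*
 * Consistency checker used under INVARIANTS: for every entry it
 * verifies the linear ordering of the entry list, that the entry is
 * reachable by a binary search of the tree, and that the cached
 * max_free value matches the free space computed from its subtrees.
 * It only runs when enable_vmmap_check equals the requested check
 * type.
 */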
5229*22ce4affSfengbojiang #ifdef INVARIANTS
5230*22ce4affSfengbojiang static void
5231*22ce4affSfengbojiang _vm_map_assert_consistent(vm_map_t map, int check)
5232*22ce4affSfengbojiang {
5233*22ce4affSfengbojiang vm_map_entry_t entry, prev;
5234*22ce4affSfengbojiang vm_map_entry_t cur, header, lbound, ubound;
5235*22ce4affSfengbojiang vm_size_t max_left, max_right;
5236*22ce4affSfengbojiang
5237*22ce4affSfengbojiang #ifdef DIAGNOSTIC
5238*22ce4affSfengbojiang ++map->nupdates;
5239*22ce4affSfengbojiang #endif
5240*22ce4affSfengbojiang if (enable_vmmap_check != check)
5241*22ce4affSfengbojiang return;
5242*22ce4affSfengbojiang
5243*22ce4affSfengbojiang header = prev = &map->header;
5244*22ce4affSfengbojiang VM_MAP_ENTRY_FOREACH(entry, map) {
5245*22ce4affSfengbojiang KASSERT(prev->end <= entry->start,
5246*22ce4affSfengbojiang ("map %p prev->end = %jx, start = %jx", map,
5247*22ce4affSfengbojiang (uintmax_t)prev->end, (uintmax_t)entry->start));
5248*22ce4affSfengbojiang KASSERT(entry->start < entry->end,
5249*22ce4affSfengbojiang ("map %p start = %jx, end = %jx", map,
5250*22ce4affSfengbojiang (uintmax_t)entry->start, (uintmax_t)entry->end));
5251*22ce4affSfengbojiang KASSERT(entry->left == header ||
5252*22ce4affSfengbojiang entry->left->start < entry->start,
5253*22ce4affSfengbojiang ("map %p left->start = %jx, start = %jx", map,
5254*22ce4affSfengbojiang (uintmax_t)entry->left->start, (uintmax_t)entry->start));
5255*22ce4affSfengbojiang KASSERT(entry->right == header ||
5256*22ce4affSfengbojiang entry->start < entry->right->start,
5257*22ce4affSfengbojiang ("map %p start = %jx, right->start = %jx", map,
5258*22ce4affSfengbojiang (uintmax_t)entry->start, (uintmax_t)entry->right->start));
5259*22ce4affSfengbojiang cur = map->root;
5260*22ce4affSfengbojiang lbound = ubound = header;
5261*22ce4affSfengbojiang for (;;) {
5262*22ce4affSfengbojiang if (entry->start < cur->start) {
5263*22ce4affSfengbojiang ubound = cur;
5264*22ce4affSfengbojiang cur = cur->left;
5265*22ce4affSfengbojiang KASSERT(cur != lbound,
5266*22ce4affSfengbojiang ("map %p cannot find %jx",
5267*22ce4affSfengbojiang map, (uintmax_t)entry->start));
5268*22ce4affSfengbojiang } else if (cur->end <= entry->start) {
5269*22ce4affSfengbojiang lbound = cur;
5270*22ce4affSfengbojiang cur = cur->right;
5271*22ce4affSfengbojiang KASSERT(cur != ubound,
5272*22ce4affSfengbojiang ("map %p cannot find %jx",
5273*22ce4affSfengbojiang map, (uintmax_t)entry->start));
5274*22ce4affSfengbojiang } else {
5275*22ce4affSfengbojiang KASSERT(cur == entry,
5276*22ce4affSfengbojiang ("map %p cannot find %jx",
5277*22ce4affSfengbojiang map, (uintmax_t)entry->start));
5278*22ce4affSfengbojiang break;
5279*22ce4affSfengbojiang }
5280*22ce4affSfengbojiang }
5281*22ce4affSfengbojiang max_left = vm_map_entry_max_free_left(entry, lbound);
5282*22ce4affSfengbojiang max_right = vm_map_entry_max_free_right(entry, ubound);
5283*22ce4affSfengbojiang KASSERT(entry->max_free == vm_size_max(max_left, max_right),
5284*22ce4affSfengbojiang ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
5285*22ce4affSfengbojiang (uintmax_t)entry->max_free,
5286*22ce4affSfengbojiang (uintmax_t)max_left, (uintmax_t)max_right));
5287*22ce4affSfengbojiang prev = entry;
5288*22ce4affSfengbojiang }
5289*22ce4affSfengbojiang KASSERT(prev->end <= entry->start,
5290*22ce4affSfengbojiang ("map %p prev->end = %jx, start = %jx", map,
5291*22ce4affSfengbojiang (uintmax_t)prev->end, (uintmax_t)entry->start));
5292*22ce4affSfengbojiang }
5293*22ce4affSfengbojiang #endif
5294*22ce4affSfengbojiang
5295a9643ea8Slogwang #include "opt_ddb.h"
5296a9643ea8Slogwang #ifdef DDB
5297a9643ea8Slogwang #include <sys/kernel.h>
5298a9643ea8Slogwang
5299a9643ea8Slogwang #include <ddb/ddb.h>
5300a9643ea8Slogwang
5301a9643ea8Slogwang static void
5302a9643ea8Slogwang vm_map_print(vm_map_t map)
5303a9643ea8Slogwang {
5304*22ce4affSfengbojiang vm_map_entry_t entry, prev;
5305a9643ea8Slogwang
5306a9643ea8Slogwang db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
5307a9643ea8Slogwang (void *)map,
5308a9643ea8Slogwang (void *)map->pmap, map->nentries, map->timestamp);
5309a9643ea8Slogwang
5310a9643ea8Slogwang db_indent += 2;
5311*22ce4affSfengbojiang prev = &map->header;
5312*22ce4affSfengbojiang VM_MAP_ENTRY_FOREACH(entry, map) {
5313*22ce4affSfengbojiang db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
5314*22ce4affSfengbojiang (void *)entry, (void *)entry->start, (void *)entry->end,
5315*22ce4affSfengbojiang entry->eflags);
5316a9643ea8Slogwang {
5317*22ce4affSfengbojiang static const char * const inheritance_name[4] =
5318a9643ea8Slogwang {"share", "copy", "none", "donate_copy"};
5319a9643ea8Slogwang
5320a9643ea8Slogwang db_iprintf(" prot=%x/%x/%s",
5321a9643ea8Slogwang entry->protection,
5322a9643ea8Slogwang entry->max_protection,
5323*22ce4affSfengbojiang inheritance_name[(int)(unsigned char)
5324*22ce4affSfengbojiang entry->inheritance]);
5325a9643ea8Slogwang if (entry->wired_count != 0)
5326a9643ea8Slogwang db_printf(", wired");
5327a9643ea8Slogwang }
5328a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
5329a9643ea8Slogwang db_printf(", share=%p, offset=0x%jx\n",
5330a9643ea8Slogwang (void *)entry->object.sub_map,
5331a9643ea8Slogwang (uintmax_t)entry->offset);
5332*22ce4affSfengbojiang if (prev == &map->header ||
5333*22ce4affSfengbojiang prev->object.sub_map !=
5334*22ce4affSfengbojiang entry->object.sub_map) {
5335a9643ea8Slogwang db_indent += 2;
5336a9643ea8Slogwang vm_map_print((vm_map_t)entry->object.sub_map);
5337a9643ea8Slogwang db_indent -= 2;
5338a9643ea8Slogwang }
5339a9643ea8Slogwang } else {
5340a9643ea8Slogwang if (entry->cred != NULL)
5341a9643ea8Slogwang db_printf(", ruid %d", entry->cred->cr_ruid);
5342a9643ea8Slogwang db_printf(", object=%p, offset=0x%jx",
5343a9643ea8Slogwang (void *)entry->object.vm_object,
5344a9643ea8Slogwang (uintmax_t)entry->offset);
5345a9643ea8Slogwang if (entry->object.vm_object && entry->object.vm_object->cred)
5346a9643ea8Slogwang db_printf(", obj ruid %d charge %jx",
5347a9643ea8Slogwang entry->object.vm_object->cred->cr_ruid,
5348a9643ea8Slogwang (uintmax_t)entry->object.vm_object->charge);
5349a9643ea8Slogwang if (entry->eflags & MAP_ENTRY_COW)
5350a9643ea8Slogwang db_printf(", copy (%s)",
5351a9643ea8Slogwang (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
5352a9643ea8Slogwang db_printf("\n");
5353a9643ea8Slogwang
5354*22ce4affSfengbojiang if (prev == &map->header ||
5355*22ce4affSfengbojiang prev->object.vm_object !=
5356*22ce4affSfengbojiang entry->object.vm_object) {
5357a9643ea8Slogwang db_indent += 2;
5358a9643ea8Slogwang vm_object_print((db_expr_t)(intptr_t)
5359a9643ea8Slogwang entry->object.vm_object,
5360a9643ea8Slogwang 0, 0, (char *)0);
5361a9643ea8Slogwang db_indent -= 2;
5362a9643ea8Slogwang }
5363a9643ea8Slogwang }
5364*22ce4affSfengbojiang prev = entry;
5365a9643ea8Slogwang }
5366a9643ea8Slogwang db_indent -= 2;
5367a9643ea8Slogwang }
5368a9643ea8Slogwang
5369a9643ea8Slogwang DB_SHOW_COMMAND(map, map)
5370a9643ea8Slogwang {
5371a9643ea8Slogwang
5372a9643ea8Slogwang if (!have_addr) {
5373a9643ea8Slogwang db_printf("usage: show map <addr>\n");
5374a9643ea8Slogwang return;
5375a9643ea8Slogwang }
5376a9643ea8Slogwang vm_map_print((vm_map_t)addr);
5377a9643ea8Slogwang }
5378a9643ea8Slogwang
5379a9643ea8Slogwang DB_SHOW_COMMAND(procvm, procvm)
5380a9643ea8Slogwang {
5381a9643ea8Slogwang struct proc *p;
5382a9643ea8Slogwang
5383a9643ea8Slogwang if (have_addr) {
5384*22ce4affSfengbojiang p = db_lookup_proc(addr);
5385a9643ea8Slogwang } else {
5386a9643ea8Slogwang p = curproc;
5387a9643ea8Slogwang }
5388a9643ea8Slogwang
5389a9643ea8Slogwang db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
5390a9643ea8Slogwang (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
5391a9643ea8Slogwang (void *)vmspace_pmap(p->p_vmspace));
5392a9643ea8Slogwang
5393a9643ea8Slogwang vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
5394a9643ea8Slogwang }
5395a9643ea8Slogwang
5396a9643ea8Slogwang #endif /* DDB */