1 /*-
2 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3 *
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * The Mach Operating System project at Carnegie-Mellon University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
35 *
36 *
37 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38 * All rights reserved.
39 *
40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41 *
42 * Permission to use, copy, modify and distribute this software and
43 * its documentation is hereby granted, provided that both the copyright
44 * notice and this permission notice appear in all copies of the
45 * software, derivative works or modified versions, and any portions
46 * thereof, and that both notices appear in supporting documentation.
47 *
48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51 *
52 * Carnegie Mellon requests users of this software to return to
53 *
54 * Software Distribution Coordinator or [email protected]
55 * School of Computer Science
56 * Carnegie Mellon University
57 * Pittsburgh PA 15213-3890
58 *
59 * any improvements or extensions that they make and grant Carnegie the
60 * rights to redistribute these changes.
61 */
62
63 /*
64 * Virtual memory mapping module.
65 */
66
67 #include <sys/cdefs.h>
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/elf.h>
71 #include <sys/kernel.h>
72 #include <sys/ktr.h>
73 #include <sys/lock.h>
74 #include <sys/mutex.h>
75 #include <sys/proc.h>
76 #include <sys/vmmeter.h>
77 #include <sys/mman.h>
78 #include <sys/vnode.h>
79 #include <sys/racct.h>
80 #include <sys/resourcevar.h>
81 #include <sys/rwlock.h>
82 #include <sys/file.h>
83 #include <sys/sysctl.h>
84 #include <sys/sysent.h>
85 #include <sys/shm.h>
86
87 #include <vm/vm.h>
88 #include <vm/vm_param.h>
89 #include <vm/pmap.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_page.h>
92 #include <vm/vm_pageout.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_pager.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_extern.h>
97 #include <vm/vnode_pager.h>
98 #include <vm/swap_pager.h>
99 #include <vm/uma.h>
100
101 /*
102 * Virtual memory maps provide for the mapping, protection,
103 * and sharing of virtual memory objects. In addition,
104 * this module provides for an efficient virtual copy of
105 * memory from one map to another.
106 *
107 * Synchronization is required prior to most operations.
108 *
109 * Maps consist of an ordered doubly-linked list of simple
110 * entries; a self-adjusting binary search tree of these
111 * entries is used to speed up lookups.
112 *
113 * Since portions of maps are specified by start/end addresses,
114 * which may not align with existing map entries, all
115 * routines merely "clip" entries to these start/end values.
116 * [That is, an entry is split into two, bordering at a
117 * start or end value.] Note that these clippings may not
118 * always be necessary (as the two resulting entries are then
119 * not changed); however, the clipping is done for convenience.
120 *
121 * As mentioned above, virtual copy operations are performed
122 * by copying VM object references from one map to
123 * another, and then marking both regions as copy-on-write.
124 */
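/*
 * Illustration of the clipping rule above (a sketch using a simplified
 * entry type, not struct vm_map_entry; object references and offsets
 * are deliberately ignored here).  Clipping at "addr" turns one entry
 * [start, end) into two entries that border at "addr":
 *
 *	struct toy_entry { vm_offset_t start, end; };
 *
 *	// After the split, *lo covers [e->start, addr) and *e covers
 *	// [addr, old end); together they cover exactly the old range.
 *	static void
 *	toy_clip(struct toy_entry *e, vm_offset_t addr, struct toy_entry *lo)
 *	{
 *		lo->start = e->start;
 *		lo->end = addr;
 *		e->start = addr;
 *	}
 *
 * The real clipping code additionally allocates the new entry from the
 * map entry zone and adjusts the VM object offset of the upper half.
 */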
125
126 static struct mtx map_sleep_mtx;
127 static uma_zone_t mapentzone;
128 static uma_zone_t kmapentzone;
129 static uma_zone_t vmspace_zone;
130 static int vmspace_zinit(void *mem, int size, int flags);
131 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
132 vm_offset_t max);
133 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
134 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
135 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
136 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
137 vm_map_entry_t gap_entry);
138 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
139 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
140 #ifdef INVARIANTS
141 static void vmspace_zdtor(void *mem, int size, void *arg);
142 #endif
143 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
144 vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
145 int cow);
146 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
147 vm_offset_t failed_addr);
148
149 #define CONTAINS_BITS(set, bits) ((~(set) & (bits)) == 0)
150
151 #define ENTRY_CHARGED(e) ((e)->cred != NULL || \
152 ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
153 !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
154
155 /*
156 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
157 * stable.
158 */
159 #define PROC_VMSPACE_LOCK(p) do { } while (0)
160 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
161
162 /*
163 * VM_MAP_RANGE_CHECK: [ internal use only ]
164 *
165 * Asserts that the starting and ending region
166 * addresses fall within the valid range of the map.
167 */
168 #define VM_MAP_RANGE_CHECK(map, start, end) \
169 { \
170 if (start < vm_map_min(map)) \
171 start = vm_map_min(map); \
172 if (end > vm_map_max(map)) \
173 end = vm_map_max(map); \
174 if (start > end) \
175 start = end; \
176 }
177
178 #ifndef UMA_MD_SMALL_ALLOC
179
180 /*
181 * Allocate a new slab for kernel map entries. The kernel map may be locked or
182 * unlocked, depending on whether the request is coming from the kernel map or a
183 * submap. This function allocates a virtual address range directly from the
184 * kernel map instead of the kmem_* layer to avoid recursion on the kernel map
185 * lock and also to avoid triggering allocator recursion in the vmem boundary
186 * tag allocator.
187 */
188 static void *
189 kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
190 int wait)
191 {
192 vm_offset_t addr;
193 int error, locked;
194
195 *pflag = UMA_SLAB_PRIV;
196
197 if (!(locked = vm_map_locked(kernel_map)))
198 vm_map_lock(kernel_map);
199 addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
200 if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
201 panic("%s: kernel map is exhausted", __func__);
202 error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
203 VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
204 if (error != KERN_SUCCESS)
205 panic("%s: vm_map_insert() failed: %d", __func__, error);
206 if (!locked)
207 vm_map_unlock(kernel_map);
208 error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
209 M_USE_RESERVE | (wait & M_ZERO));
210 if (error == KERN_SUCCESS) {
211 return ((void *)addr);
212 } else {
213 if (!locked)
214 vm_map_lock(kernel_map);
215 vm_map_delete(kernel_map, addr, bytes);
216 if (!locked)
217 vm_map_unlock(kernel_map);
218 return (NULL);
219 }
220 }
221
222 static void
223 kmapent_free(void *item, vm_size_t size, uint8_t pflag)
224 {
225 vm_offset_t addr;
226 int error __diagused;
227
228 if ((pflag & UMA_SLAB_PRIV) == 0)
229 /* XXX leaked */
230 return;
231
232 addr = (vm_offset_t)item;
233 kmem_unback(kernel_object, addr, size);
234 error = vm_map_remove(kernel_map, addr, addr + size);
235 KASSERT(error == KERN_SUCCESS,
236 ("%s: vm_map_remove failed: %d", __func__, error));
237 }
238
239 /*
240 * The worst-case upper bound on the number of kernel map entries that may be
241 * created before the zone must be replenished in _vm_map_unlock().
242 */
243 #define KMAPENT_RESERVE 1
244
245 #endif /* !UMA_MD_SMALL_ALLOC */
246
247 /*
248 * vm_map_startup:
249 *
250 * Initialize the vm_map module. Must be called before any other vm_map
251 * routines.
252 *
253 * User map and entry structures are allocated from the general purpose
254 * memory pool. Kernel maps are statically defined. Kernel map entries
255 * require special handling to avoid recursion; see the comments above
256 * kmapent_alloc() and in vm_map_entry_create().
257 */
258 void
259 vm_map_startup(void)
260 {
261 mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
262
263 /*
264 * Disable the use of per-CPU buckets: map entry allocation is
265 * serialized by the kernel map lock.
266 */
267 kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
268 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
269 UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
270 #ifndef UMA_MD_SMALL_ALLOC
271 /* Reserve an extra map entry for use when replenishing the reserve. */
272 uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
273 uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
274 uma_zone_set_allocf(kmapentzone, kmapent_alloc);
275 uma_zone_set_freef(kmapentzone, kmapent_free);
276 #endif
277
278 mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
279 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
280 vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
281 #ifdef INVARIANTS
282 vmspace_zdtor,
283 #else
284 NULL,
285 #endif
286 vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
287 }
288
289 static int
290 vmspace_zinit(void *mem, int size, int flags)
291 {
292 struct vmspace *vm;
293 vm_map_t map;
294
295 vm = (struct vmspace *)mem;
296 map = &vm->vm_map;
297
298 memset(map, 0, sizeof(*map));
299 mtx_init(&map->system_mtx, "vm map (system)", NULL,
300 MTX_DEF | MTX_DUPOK);
301 sx_init(&map->lock, "vm map (user)");
302 PMAP_LOCK_INIT(vmspace_pmap(vm));
303 return (0);
304 }
305
306 #ifdef INVARIANTS
307 static void
308 vmspace_zdtor(void *mem, int size, void *arg)
309 {
310 struct vmspace *vm;
311
312 vm = (struct vmspace *)mem;
313 KASSERT(vm->vm_map.nentries == 0,
314 ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
315 KASSERT(vm->vm_map.size == 0,
316 ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
317 }
318 #endif /* INVARIANTS */
319
320 /*
321 * Allocate a vmspace structure, including a vm_map and pmap,
322 * and initialize those structures. The refcnt is set to 1.
323 */
324 struct vmspace *
325 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
326 {
327 struct vmspace *vm;
328
329 vm = uma_zalloc(vmspace_zone, M_WAITOK);
330 KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
331 if (!pinit(vmspace_pmap(vm))) {
332 uma_zfree(vmspace_zone, vm);
333 return (NULL);
334 }
335 CTR1(KTR_VM, "vmspace_alloc: %p", vm);
336 _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
337 refcount_init(&vm->vm_refcnt, 1);
338 vm->vm_shm = NULL;
339 vm->vm_swrss = 0;
340 vm->vm_tsize = 0;
341 vm->vm_dsize = 0;
342 vm->vm_ssize = 0;
343 vm->vm_taddr = 0;
344 vm->vm_daddr = 0;
345 vm->vm_maxsaddr = 0;
346 return (vm);
347 }
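/*
 * Usage sketch for the allocator above (hedged: "minuser" and "maxuser"
 * are placeholders for whatever address-space bounds the caller uses,
 * e.g. values taken from the process's sysentvec):
 *
 *	struct vmspace *vm;
 *
 *	vm = vmspace_alloc(minuser, maxuser, pmap_pinit);
 *	if (vm == NULL)
 *		return (ENOMEM);	// pmap initialization failed
 *	...
 *	vmspace_free(vm);		// drops the reference set up above
 */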
348
349 #ifdef RACCT
350 static void
351 vmspace_container_reset(struct proc *p)
352 {
353
354 PROC_LOCK(p);
355 racct_set(p, RACCT_DATA, 0);
356 racct_set(p, RACCT_STACK, 0);
357 racct_set(p, RACCT_RSS, 0);
358 racct_set(p, RACCT_MEMLOCK, 0);
359 racct_set(p, RACCT_VMEM, 0);
360 PROC_UNLOCK(p);
361 }
362 #endif
363
364 static inline void
365 vmspace_dofree(struct vmspace *vm)
366 {
367
368 CTR1(KTR_VM, "vmspace_free: %p", vm);
369
370 /*
371 * Make sure any SysV shm is freed, it might not have been in
372 * exit1().
373 */
374 shmexit(vm);
375
376 /*
377 * Lock the map, to wait out all other references to it.
378 * Delete all of the mappings and pages they hold, then call
379 * the pmap module to reclaim anything left.
380 */
381 (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
382 vm_map_max(&vm->vm_map));
383
384 pmap_release(vmspace_pmap(vm));
385 vm->vm_map.pmap = NULL;
386 uma_zfree(vmspace_zone, vm);
387 }
388
389 void
390 vmspace_free(struct vmspace *vm)
391 {
392
393 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
394 "vmspace_free() called");
395
396 if (refcount_release(&vm->vm_refcnt))
397 vmspace_dofree(vm);
398 }
399
400 void
401 vmspace_exitfree(struct proc *p)
402 {
403 struct vmspace *vm;
404
405 PROC_VMSPACE_LOCK(p);
406 vm = p->p_vmspace;
407 p->p_vmspace = NULL;
408 PROC_VMSPACE_UNLOCK(p);
409 KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
410 vmspace_free(vm);
411 }
412
413 void
414 vmspace_exit(struct thread *td)
415 {
416 struct vmspace *vm;
417 struct proc *p;
418 bool released;
419
420 p = td->td_proc;
421 vm = p->p_vmspace;
422
423 /*
424 * Prepare to release the vmspace reference. The thread that releases
425 * the last reference is responsible for tearing down the vmspace.
426 * However, threads not releasing the final reference must switch to the
427 * kernel's vmspace0 before the decrement so that the subsequent pmap
428 * deactivation does not modify a freed vmspace.
429 */
430 refcount_acquire(&vmspace0.vm_refcnt);
431 if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
432 if (p->p_vmspace != &vmspace0) {
433 PROC_VMSPACE_LOCK(p);
434 p->p_vmspace = &vmspace0;
435 PROC_VMSPACE_UNLOCK(p);
436 pmap_activate(td);
437 }
438 released = refcount_release(&vm->vm_refcnt);
439 }
440 if (released) {
441 /*
442 * pmap_remove_pages() expects the pmap to be active, so switch
443 * back first if necessary.
444 */
445 if (p->p_vmspace != vm) {
446 PROC_VMSPACE_LOCK(p);
447 p->p_vmspace = vm;
448 PROC_VMSPACE_UNLOCK(p);
449 pmap_activate(td);
450 }
451 pmap_remove_pages(vmspace_pmap(vm));
452 PROC_VMSPACE_LOCK(p);
453 p->p_vmspace = &vmspace0;
454 PROC_VMSPACE_UNLOCK(p);
455 pmap_activate(td);
456 vmspace_dofree(vm);
457 }
458 #ifdef RACCT
459 if (racct_enable)
460 vmspace_container_reset(p);
461 #endif
462 }
463
464 /* Acquire reference to vmspace owned by another process. */
465
466 struct vmspace *
467 vmspace_acquire_ref(struct proc *p)
468 {
469 struct vmspace *vm;
470
471 PROC_VMSPACE_LOCK(p);
472 vm = p->p_vmspace;
473 if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
474 PROC_VMSPACE_UNLOCK(p);
475 return (NULL);
476 }
477 if (vm != p->p_vmspace) {
478 PROC_VMSPACE_UNLOCK(p);
479 vmspace_free(vm);
480 return (NULL);
481 }
482 PROC_VMSPACE_UNLOCK(p);
483 return (vm);
484 }
485
486 /*
487 * Switch between vmspaces in an AIO kernel process.
488 *
489 * The new vmspace is either the vmspace of a user process obtained
490 * from an active AIO request or the initial vmspace of the AIO kernel
491 * process (when it is idling). Because user processes will block to
492 * drain any active AIO requests before proceeding in exit() or
493 * execve(), the reference count for vmspaces from AIO requests can
494 * never be 0. Similarly, AIO kernel processes hold an extra
495 * reference on their initial vmspace for the life of the process. As
496 * a result, the 'newvm' vmspace always has a non-zero reference
497 * count. This permits an additional reference on 'newvm' to be
498 * acquired via a simple atomic increment rather than the loop in
499 * vmspace_acquire_ref() above.
500 */
501 void
502 vmspace_switch_aio(struct vmspace *newvm)
503 {
504 struct vmspace *oldvm;
505
506 /* XXX: Need some way to assert that this is an aio daemon. */
507
508 KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
509 ("vmspace_switch_aio: newvm unreferenced"));
510
511 oldvm = curproc->p_vmspace;
512 if (oldvm == newvm)
513 return;
514
515 /*
516 * Point to the new address space and refer to it.
517 */
518 curproc->p_vmspace = newvm;
519 refcount_acquire(&newvm->vm_refcnt);
520
521 /* Activate the new mapping. */
522 pmap_activate(curthread);
523
524 vmspace_free(oldvm);
525 }
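/*
 * The contrast between the two acquisition styles mentioned in the
 * comment above, in outline (sketch):
 *
 *	// Valid only when the count is known to be non-zero, as it is
 *	// for 'newvm' here:
 *	refcount_acquire(&newvm->vm_refcnt);
 *
 *	// General case (vmspace_acquire_ref()): refuse to resurrect a
 *	// vmspace whose last reference is already being dropped.
 *	if (!refcount_acquire_if_not_zero(&vm->vm_refcnt))
 *		return (NULL);
 */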
526
527 void
528 _vm_map_lock(vm_map_t map, const char *file, int line)
529 {
530
531 if (map->system_map)
532 mtx_lock_flags_(&map->system_mtx, 0, file, line);
533 else
534 sx_xlock_(&map->lock, file, line);
535 map->timestamp++;
536 }
537
538 void
539 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
540 {
541 vm_object_t object;
542 struct vnode *vp;
543 bool vp_held;
544
545 if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
546 return;
547 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
548 ("Submap with execs"));
549 object = entry->object.vm_object;
550 KASSERT(object != NULL, ("No object for text, entry %p", entry));
551 if ((object->flags & OBJ_ANON) != 0)
552 object = object->handle;
553 else
554 KASSERT(object->backing_object == NULL,
555 ("non-anon object %p shadows", object));
556 KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
557 entry, entry->object.vm_object));
558
559 /*
560 * Mostly, we do not lock the backing object. It is
561 * referenced by the entry we are processing, so it cannot go
562 * away.
563 */
564 vm_pager_getvp(object, &vp, &vp_held);
565 if (vp != NULL) {
566 if (add) {
567 VOP_SET_TEXT_CHECKED(vp);
568 } else {
569 vn_lock(vp, LK_SHARED | LK_RETRY);
570 VOP_UNSET_TEXT_CHECKED(vp);
571 VOP_UNLOCK(vp);
572 }
573 if (vp_held)
574 vdrop(vp);
575 }
576 }
577
578 /*
579  * Use a different name for this vm_map_entry field when its use
580 * is not consistent with its use as part of an ordered search tree.
581 */
582 #define defer_next right
583
584 static void
585 vm_map_process_deferred(void)
586 {
587 struct thread *td;
588 vm_map_entry_t entry, next;
589 vm_object_t object;
590
591 td = curthread;
592 entry = td->td_map_def_user;
593 td->td_map_def_user = NULL;
594 while (entry != NULL) {
595 next = entry->defer_next;
596 MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
597 MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
598 MAP_ENTRY_VN_EXEC));
599 if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
600 /*
601 * Decrement the object's writemappings and
602 * possibly the vnode's v_writecount.
603 */
604 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
605 ("Submap with writecount"));
606 object = entry->object.vm_object;
607 KASSERT(object != NULL, ("No object for writecount"));
608 vm_pager_release_writecount(object, entry->start,
609 entry->end);
610 }
611 vm_map_entry_set_vnode_text(entry, false);
612 vm_map_entry_deallocate(entry, FALSE);
613 entry = next;
614 }
615 }
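/*
 * The drain above follows the usual "detach, then walk" pattern for a
 * per-thread singly-linked work list: the head is cleared before the
 * walk so that anything queued afterwards is handled by a later call
 * rather than being processed twice.  Generic sketch (toy types;
 * free_node() is a hypothetical per-node cleanup):
 *
 *	struct node { struct node *next; };
 *
 *	static void
 *	drain(struct node **headp)
 *	{
 *		struct node *n, *next;
 *
 *		n = *headp;
 *		*headp = NULL;		// detach before walking
 *		for (; n != NULL; n = next) {
 *			next = n->next;	// read the link before freeing n
 *			free_node(n);
 *		}
 *	}
 */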
616
617 #ifdef INVARIANTS
618 static void
619 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
620 {
621
622 if (map->system_map)
623 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
624 else
625 sx_assert_(&map->lock, SA_XLOCKED, file, line);
626 }
627
628 #define VM_MAP_ASSERT_LOCKED(map) \
629 _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
630
631 enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
632 #ifdef DIAGNOSTIC
633 static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
634 #else
635 static int enable_vmmap_check = VMMAP_CHECK_NONE;
636 #endif
637 SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
638 &enable_vmmap_check, 0, "Enable vm map consistency checking");
639
640 static void _vm_map_assert_consistent(vm_map_t map, int check);
641
642 #define VM_MAP_ASSERT_CONSISTENT(map) \
643 _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
644 #ifdef DIAGNOSTIC
645 #define VM_MAP_UNLOCK_CONSISTENT(map) do { \
646 if (map->nupdates > map->nentries) { \
647 _vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \
648 map->nupdates = 0; \
649 } \
650 } while (0)
651 #else
652 #define VM_MAP_UNLOCK_CONSISTENT(map)
653 #endif
654 #else
655 #define VM_MAP_ASSERT_LOCKED(map)
656 #define VM_MAP_ASSERT_CONSISTENT(map)
657 #define VM_MAP_UNLOCK_CONSISTENT(map)
658 #endif /* INVARIANTS */
659
660 void
661 _vm_map_unlock(vm_map_t map, const char *file, int line)
662 {
663
664 VM_MAP_UNLOCK_CONSISTENT(map);
665 if (map->system_map) {
666 #ifndef UMA_MD_SMALL_ALLOC
667 if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
668 uma_prealloc(kmapentzone, 1);
669 map->flags &= ~MAP_REPLENISH;
670 }
671 #endif
672 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
673 } else {
674 sx_xunlock_(&map->lock, file, line);
675 vm_map_process_deferred();
676 }
677 }
678
679 void
680 _vm_map_lock_read(vm_map_t map, const char *file, int line)
681 {
682
683 if (map->system_map)
684 mtx_lock_flags_(&map->system_mtx, 0, file, line);
685 else
686 sx_slock_(&map->lock, file, line);
687 }
688
689 void
690 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
691 {
692
693 if (map->system_map) {
694 KASSERT((map->flags & MAP_REPLENISH) == 0,
695 ("%s: MAP_REPLENISH leaked", __func__));
696 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
697 } else {
698 sx_sunlock_(&map->lock, file, line);
699 vm_map_process_deferred();
700 }
701 }
702
703 int
704 _vm_map_trylock(vm_map_t map, const char *file, int line)
705 {
706 int error;
707
708 error = map->system_map ?
709 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
710 !sx_try_xlock_(&map->lock, file, line);
711 if (error == 0)
712 map->timestamp++;
713 return (error == 0);
714 }
715
716 int
717 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
718 {
719 int error;
720
721 error = map->system_map ?
722 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
723 !sx_try_slock_(&map->lock, file, line);
724 return (error == 0);
725 }
726
727 /*
728 * _vm_map_lock_upgrade: [ internal use only ]
729 *
730 * Tries to upgrade a read (shared) lock on the specified map to a write
731 * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a
732 * non-zero value if the upgrade fails. If the upgrade fails, the map is
733 * returned without a read or write lock held.
734 *
735 * Requires that the map be read locked.
736 */
737 int
738 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
739 {
740 unsigned int last_timestamp;
741
742 if (map->system_map) {
743 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
744 } else {
745 if (!sx_try_upgrade_(&map->lock, file, line)) {
746 last_timestamp = map->timestamp;
747 sx_sunlock_(&map->lock, file, line);
748 vm_map_process_deferred();
749 /*
750 * If the map's timestamp does not change while the
751 * map is unlocked, then the upgrade succeeds.
752 */
753 sx_xlock_(&map->lock, file, line);
754 if (last_timestamp != map->timestamp) {
755 sx_xunlock_(&map->lock, file, line);
756 return (1);
757 }
758 }
759 }
760 map->timestamp++;
761 return (0);
762 }
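/*
 * The fallback path above is the standard timestamp-validated upgrade:
 * when sx_try_upgrade() fails, drop the shared lock, take the exclusive
 * lock, and treat the upgrade as successful only if no writer ran in
 * between.  In outline (sketch of the code above):
 *
 *	saved = map->timestamp;
 *	sx_sunlock(&map->lock);
 *	sx_xlock(&map->lock);
 *	if (saved != map->timestamp) {
 *		sx_xunlock(&map->lock);	// the map changed; caller retries
 *		return (1);
 *	}
 *	return (0);			// exclusive lock held, map unchanged
 */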
763
764 void
765 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
766 {
767
768 if (map->system_map) {
769 KASSERT((map->flags & MAP_REPLENISH) == 0,
770 ("%s: MAP_REPLENISH leaked", __func__));
771 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
772 } else {
773 VM_MAP_UNLOCK_CONSISTENT(map);
774 sx_downgrade_(&map->lock, file, line);
775 }
776 }
777
778 /*
779 * vm_map_locked:
780 *
781 * Returns a non-zero value if the caller holds a write (exclusive) lock
782 * on the specified map and the value "0" otherwise.
783 */
784 int
785 vm_map_locked(vm_map_t map)
786 {
787
788 if (map->system_map)
789 return (mtx_owned(&map->system_mtx));
790 else
791 return (sx_xlocked(&map->lock));
792 }
793
794 /*
795 * _vm_map_unlock_and_wait:
796 *
797 * Atomically releases the lock on the specified map and puts the calling
798 * thread to sleep. The calling thread will remain asleep until either
799 * vm_map_wakeup() is performed on the map or the specified timeout is
800 * exceeded.
801 *
802 * WARNING! This function does not perform deferred deallocations of
803 * objects and map entries. Therefore, the calling thread is expected to
804 * reacquire the map lock after reawakening and later perform an ordinary
805 * unlock operation, such as vm_map_unlock(), before completing its
806 * operation on the map.
807 */
808 int
809 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
810 {
811
812 VM_MAP_UNLOCK_CONSISTENT(map);
813 mtx_lock(&map_sleep_mtx);
814 if (map->system_map) {
815 KASSERT((map->flags & MAP_REPLENISH) == 0,
816 ("%s: MAP_REPLENISH leaked", __func__));
817 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
818 } else {
819 sx_xunlock_(&map->lock, file, line);
820 }
821 return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
822 timo));
823 }
824
825 /*
826 * vm_map_wakeup:
827 *
828 * Awaken any threads that have slept on the map using
829 * vm_map_unlock_and_wait().
830 */
831 void
832 vm_map_wakeup(vm_map_t map)
833 {
834
835 /*
836 * Acquire and release map_sleep_mtx to prevent a wakeup()
837 * from being performed (and lost) between the map unlock
838 * and the msleep() in _vm_map_unlock_and_wait().
839 */
840 mtx_lock(&map_sleep_mtx);
841 mtx_unlock(&map_sleep_mtx);
842 wakeup(&map->root);
843 }
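/*
 * Taken together, _vm_map_unlock_and_wait() and vm_map_wakeup()
 * implement the usual lost-wakeup avoidance around map_sleep_mtx: the
 * sleeper holds the mutex from before it drops the map lock until
 * msleep() has queued it, and the waker passes through the same mutex
 * before calling wakeup().  In outline (sketch):
 *
 *	// Sleeper (_vm_map_unlock_and_wait()):
 *	mtx_lock(&map_sleep_mtx);
 *	<drop the map lock>
 *	msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", timo);
 *
 *	// Waker (vm_map_wakeup()):
 *	mtx_lock(&map_sleep_mtx);
 *	mtx_unlock(&map_sleep_mtx);
 *	wakeup(&map->root);
 *
 * A wakeup therefore cannot slip into the window between the sleeper's
 * map unlock and its msleep().
 */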
844
845 void
846 vm_map_busy(vm_map_t map)
847 {
848
849 VM_MAP_ASSERT_LOCKED(map);
850 map->busy++;
851 }
852
853 void
854 vm_map_unbusy(vm_map_t map)
855 {
856
857 VM_MAP_ASSERT_LOCKED(map);
858 KASSERT(map->busy, ("vm_map_unbusy: not busy"));
859 if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
860 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
861 wakeup(&map->busy);
862 }
863 }
864
865 void
866 vm_map_wait_busy(vm_map_t map)
867 {
868
869 VM_MAP_ASSERT_LOCKED(map);
870 while (map->busy) {
871 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
872 if (map->system_map)
873 msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
874 else
875 sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
876 }
877 map->timestamp++;
878 }
879
880 long
881 vmspace_resident_count(struct vmspace *vmspace)
882 {
883 return pmap_resident_count(vmspace_pmap(vmspace));
884 }
885
886 /*
887 * Initialize an existing vm_map structure
888 * such as that in the vmspace structure.
889 */
890 static void
891 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
892 {
893
894 map->header.eflags = MAP_ENTRY_HEADER;
895 map->needs_wakeup = FALSE;
896 map->system_map = 0;
897 map->pmap = pmap;
898 map->header.end = min;
899 map->header.start = max;
900 map->flags = 0;
901 map->header.left = map->header.right = &map->header;
902 map->root = NULL;
903 map->timestamp = 0;
904 map->busy = 0;
905 map->anon_loc = 0;
906 #ifdef DIAGNOSTIC
907 map->nupdates = 0;
908 #endif
909 }
910
911 void
912 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
913 {
914
915 _vm_map_init(map, pmap, min, max);
916 mtx_init(&map->system_mtx, "vm map (system)", NULL,
917 MTX_DEF | MTX_DUPOK);
918 sx_init(&map->lock, "vm map (user)");
919 }
920
921 /*
922 * vm_map_entry_dispose: [ internal use only ]
923 *
924 * Inverse of vm_map_entry_create.
925 */
926 static void
927 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
928 {
929 uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
930 }
931
932 /*
933 * vm_map_entry_create: [ internal use only ]
934 *
935 * Allocates a VM map entry for insertion.
936 * No entry fields are filled in.
937 */
938 static vm_map_entry_t
939 vm_map_entry_create(vm_map_t map)
940 {
941 vm_map_entry_t new_entry;
942
943 #ifndef UMA_MD_SMALL_ALLOC
944 if (map == kernel_map) {
945 VM_MAP_ASSERT_LOCKED(map);
946
947 /*
948 * A new slab of kernel map entries cannot be allocated at this
949 * point because the kernel map has not yet been updated to
950 * reflect the caller's request. Therefore, we allocate a new
951 * map entry, dipping into the reserve if necessary, and set a
952 * flag indicating that the reserve must be replenished before
953 * the map is unlocked.
954 */
955 new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
956 if (new_entry == NULL) {
957 new_entry = uma_zalloc(kmapentzone,
958 M_NOWAIT | M_NOVM | M_USE_RESERVE);
959 kernel_map->flags |= MAP_REPLENISH;
960 }
961 } else
962 #endif
963 if (map->system_map) {
964 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
965 } else {
966 new_entry = uma_zalloc(mapentzone, M_WAITOK);
967 }
968 KASSERT(new_entry != NULL,
969 ("vm_map_entry_create: kernel resources exhausted"));
970 return (new_entry);
971 }
972
973 /*
974 * vm_map_entry_set_behavior:
975 *
976 * Set the expected access behavior, either normal, random, or
977 * sequential.
978 */
979 static inline void
980 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
981 {
982 entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
983 (behavior & MAP_ENTRY_BEHAV_MASK);
984 }
985
986 /*
987 * vm_map_entry_max_free_{left,right}:
988 *
989 * Compute the size of the largest free gap between two entries,
990 * one the root of a tree and the other the ancestor of that root
991 * that is the least or greatest ancestor found on the search path.
992 */
993 static inline vm_size_t
994 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
995 {
996
997 return (root->left != left_ancestor ?
998 root->left->max_free : root->start - left_ancestor->end);
999 }
1000
1001 static inline vm_size_t
1002 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
1003 {
1004
1005 return (root->right != right_ancestor ?
1006 root->right->max_free : right_ancestor->start - root->end);
1007 }
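/*
 * max_free by example (sketch): consider neighboring entries
 * [0x1000, 0x2000), [0x3000, 0x4000) and [0x9000, 0xa000).  The free
 * gaps between them are 0x1000 and 0x5000, so a subtree spanning just
 * these entries (and ignoring the gaps to whatever brackets them)
 * caches max_free = 0x5000.  The helpers above read a child's cached
 * max_free when a real child exists, and otherwise fall back to the
 * single gap against the threaded ancestor:
 *
 *	gap_left  = root->start - left_ancestor->end;
 *	gap_right = right_ancestor->start - root->end;
 */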
1008
1009 /*
1010 * vm_map_entry_{pred,succ}:
1011 *
1012 * Find the {predecessor, successor} of the entry by taking one step
1013 * in the appropriate direction and backtracking as much as necessary.
1014 * vm_map_entry_succ is defined in vm_map.h.
1015 */
1016 static inline vm_map_entry_t
1017 vm_map_entry_pred(vm_map_entry_t entry)
1018 {
1019 vm_map_entry_t prior;
1020
1021 prior = entry->left;
1022 if (prior->right->start < entry->start) {
1023 do
1024 prior = prior->right;
1025 while (prior->right != entry);
1026 }
1027 return (prior);
1028 }
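/*
 * Worked example for the predecessor walk above (sketch): let the map
 * contain A = [0x1000, 0x2000), B = [0x2000, 0x3000) and
 * C = [0x5000, 0x6000), with C the root, A its left child, and B the
 * right child of A.  For vm_map_entry_pred(C): prior = A; A->right = B
 * is a real descendant (B->start < C->start), so the loop follows
 * right pointers until B->right threads back to C, and B is returned.
 * If C had no left subtree, C->left would already thread to the
 * predecessor and the loop body would be skipped.
 */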
1029
1030 static inline vm_size_t
1031 vm_size_max(vm_size_t a, vm_size_t b)
1032 {
1033
1034 return (a > b ? a : b);
1035 }
1036
1037 #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \
1038 vm_map_entry_t z; \
1039 vm_size_t max_free; \
1040 \
1041 /* \
1042 * Infer root->right->max_free == root->max_free when \
1043 * y->max_free < root->max_free || root->max_free == 0. \
1044 * Otherwise, look right to find it. \
1045 */ \
1046 y = root->left; \
1047 max_free = root->max_free; \
1048 KASSERT(max_free == vm_size_max( \
1049 vm_map_entry_max_free_left(root, llist), \
1050 vm_map_entry_max_free_right(root, rlist)), \
1051 ("%s: max_free invariant fails", __func__)); \
1052 if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \
1053 max_free = vm_map_entry_max_free_right(root, rlist); \
1054 if (y != llist && (test)) { \
1055 /* Rotate right and make y root. */ \
1056 z = y->right; \
1057 if (z != root) { \
1058 root->left = z; \
1059 y->right = root; \
1060 if (max_free < y->max_free) \
1061 root->max_free = max_free = \
1062 vm_size_max(max_free, z->max_free); \
1063 } else if (max_free < y->max_free) \
1064 root->max_free = max_free = \
1065 vm_size_max(max_free, root->start - y->end);\
1066 root = y; \
1067 y = root->left; \
1068 } \
1069 /* Copy right->max_free. Put root on rlist. */ \
1070 root->max_free = max_free; \
1071 KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \
1072 ("%s: max_free not copied from right", __func__)); \
1073 root->left = rlist; \
1074 rlist = root; \
1075 root = y != llist ? y : NULL; \
1076 } while (0)
1077
1078 #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \
1079 vm_map_entry_t z; \
1080 vm_size_t max_free; \
1081 \
1082 /* \
1083 * Infer root->left->max_free == root->max_free when \
1084 * y->max_free < root->max_free || root->max_free == 0. \
1085 * Otherwise, look left to find it. \
1086 */ \
1087 y = root->right; \
1088 max_free = root->max_free; \
1089 KASSERT(max_free == vm_size_max( \
1090 vm_map_entry_max_free_left(root, llist), \
1091 vm_map_entry_max_free_right(root, rlist)), \
1092 ("%s: max_free invariant fails", __func__)); \
1093 if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \
1094 max_free = vm_map_entry_max_free_left(root, llist); \
1095 if (y != rlist && (test)) { \
1096 /* Rotate left and make y root. */ \
1097 z = y->left; \
1098 if (z != root) { \
1099 root->right = z; \
1100 y->left = root; \
1101 if (max_free < y->max_free) \
1102 root->max_free = max_free = \
1103 vm_size_max(max_free, z->max_free); \
1104 } else if (max_free < y->max_free) \
1105 root->max_free = max_free = \
1106 vm_size_max(max_free, y->start - root->end);\
1107 root = y; \
1108 y = root->right; \
1109 } \
1110 /* Copy left->max_free. Put root on llist. */ \
1111 root->max_free = max_free; \
1112 KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \
1113 ("%s: max_free not copied from left", __func__)); \
1114 root->right = llist; \
1115 llist = root; \
1116 root = y != rlist ? y : NULL; \
1117 } while (0)
1118
1119 /*
1120 * Walk down the tree until we find addr or a gap where addr would go, breaking
1121 * off left and right subtrees of nodes less than, or greater than addr. Treat
1122 * subtrees with root->max_free < length as empty trees. llist and rlist are
1123 * the two sides in reverse order (bottom-up), with llist linked by the right
1124 * pointer and rlist linked by the left pointer in the vm_map_entry, and both
1125 * lists terminated by &map->header. This function, and the subsequent call to
1126 * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
1127 * values in &map->header.
1128 */
1129 static __always_inline vm_map_entry_t
1130 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1131 vm_map_entry_t *llist, vm_map_entry_t *rlist)
1132 {
1133 vm_map_entry_t left, right, root, y;
1134
1135 left = right = &map->header;
1136 root = map->root;
1137 while (root != NULL && root->max_free >= length) {
1138 KASSERT(left->end <= root->start &&
1139 root->end <= right->start,
1140 ("%s: root not within tree bounds", __func__));
1141 if (addr < root->start) {
1142 SPLAY_LEFT_STEP(root, y, left, right,
1143 y->max_free >= length && addr < y->start);
1144 } else if (addr >= root->end) {
1145 SPLAY_RIGHT_STEP(root, y, left, right,
1146 y->max_free >= length && addr >= y->end);
1147 } else
1148 break;
1149 }
1150 *llist = left;
1151 *rlist = right;
1152 return (root);
1153 }
1154
1155 static __always_inline void
1156 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
1157 {
1158 vm_map_entry_t hi, right, y;
1159
1160 right = *rlist;
1161 hi = root->right == right ? NULL : root->right;
1162 if (hi == NULL)
1163 return;
1164 do
1165 SPLAY_LEFT_STEP(hi, y, root, right, true);
1166 while (hi != NULL);
1167 *rlist = right;
1168 }
1169
1170 static __always_inline void
1171 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
1172 {
1173 vm_map_entry_t left, lo, y;
1174
1175 left = *llist;
1176 lo = root->left == left ? NULL : root->left;
1177 if (lo == NULL)
1178 return;
1179 do
1180 SPLAY_RIGHT_STEP(lo, y, left, root, true);
1181 while (lo != NULL);
1182 *llist = left;
1183 }
1184
1185 static inline void
1186 vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1187 {
1188 vm_map_entry_t tmp;
1189
1190 tmp = *b;
1191 *b = *a;
1192 *a = tmp;
1193 }
1194
1195 /*
1196 * Walk back up the two spines, flip the pointers and set max_free. The
1197 * subtrees of the root go at the bottom of llist and rlist.
1198 */
1199 static vm_size_t
1200 vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
1201 vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
1202 {
1203 do {
1204 /*
1205 * The max_free values of the children of llist are in
1206 * llist->max_free and max_free. Update with the
1207 * max value.
1208 */
1209 llist->max_free = max_free =
1210 vm_size_max(llist->max_free, max_free);
1211 vm_map_entry_swap(&llist->right, &tail);
1212 vm_map_entry_swap(&tail, &llist);
1213 } while (llist != header);
1214 root->left = tail;
1215 return (max_free);
1216 }
1217
1218 /*
1219 * When llist is known to be the predecessor of root.
1220 */
1221 static inline vm_size_t
1222 vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
1223 vm_map_entry_t llist)
1224 {
1225 vm_size_t max_free;
1226
1227 max_free = root->start - llist->end;
1228 if (llist != header) {
1229 max_free = vm_map_splay_merge_left_walk(header, root,
1230 root, max_free, llist);
1231 } else {
1232 root->left = header;
1233 header->right = root;
1234 }
1235 return (max_free);
1236 }
1237
1238 /*
1239 * When llist may or may not be the predecessor of root.
1240 */
1241 static inline vm_size_t
1242 vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
1243 vm_map_entry_t llist)
1244 {
1245 vm_size_t max_free;
1246
1247 max_free = vm_map_entry_max_free_left(root, llist);
1248 if (llist != header) {
1249 max_free = vm_map_splay_merge_left_walk(header, root,
1250 root->left == llist ? root : root->left,
1251 max_free, llist);
1252 }
1253 return (max_free);
1254 }
1255
1256 static vm_size_t
1257 vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
1258 vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
1259 {
1260 do {
1261 /*
1262 * The max_free values of the children of rlist are in
1263 * rlist->max_free and max_free. Update with the
1264 * max value.
1265 */
1266 rlist->max_free = max_free =
1267 vm_size_max(rlist->max_free, max_free);
1268 vm_map_entry_swap(&rlist->left, &tail);
1269 vm_map_entry_swap(&tail, &rlist);
1270 } while (rlist != header);
1271 root->right = tail;
1272 return (max_free);
1273 }
1274
1275 /*
1276  * When rlist is known to be the successor of root.
1277 */
1278 static inline vm_size_t
1279 vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
1280 vm_map_entry_t rlist)
1281 {
1282 vm_size_t max_free;
1283
1284 max_free = rlist->start - root->end;
1285 if (rlist != header) {
1286 max_free = vm_map_splay_merge_right_walk(header, root,
1287 root, max_free, rlist);
1288 } else {
1289 root->right = header;
1290 header->left = root;
1291 }
1292 return (max_free);
1293 }
1294
1295 /*
1296  * When rlist may or may not be the successor of root.
1297 */
1298 static inline vm_size_t
1299 vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
1300 vm_map_entry_t rlist)
1301 {
1302 vm_size_t max_free;
1303
1304 max_free = vm_map_entry_max_free_right(root, rlist);
1305 if (rlist != header) {
1306 max_free = vm_map_splay_merge_right_walk(header, root,
1307 root->right == rlist ? root : root->right,
1308 max_free, rlist);
1309 }
1310 return (max_free);
1311 }
1312
1313 /*
1314 * vm_map_splay:
1315 *
1316 * The Sleator and Tarjan top-down splay algorithm with the
1317 * following variation. Max_free must be computed bottom-up, so
1318 * on the downward pass, maintain the left and right spines in
1319 * reverse order. Then, make a second pass up each side to fix
1320 * the pointers and compute max_free. The time bound is O(log n)
1321 * amortized.
1322 *
1323 * The tree is threaded, which means that there are no null pointers.
1324 * When a node has no left child, its left pointer points to its
1325  * predecessor, which is the last ancestor on the search path from the root
1326 * where the search branched right. Likewise, when a node has no right
1327 * child, its right pointer points to its successor. The map header node
1328 * is the predecessor of the first map entry, and the successor of the
1329 * last.
1330 *
1331 * The new root is the vm_map_entry containing "addr", or else an
1332 * adjacent entry (lower if possible) if addr is not in the tree.
1333 *
1334 * The map must be locked, and leaves it so.
1335 *
1336 * Returns: the new root.
1337 */
1338 static vm_map_entry_t
1339 vm_map_splay(vm_map_t map, vm_offset_t addr)
1340 {
1341 vm_map_entry_t header, llist, rlist, root;
1342 vm_size_t max_free_left, max_free_right;
1343
1344 header = &map->header;
1345 root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1346 if (root != NULL) {
1347 max_free_left = vm_map_splay_merge_left(header, root, llist);
1348 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1349 } else if (llist != header) {
1350 /*
1351 * Recover the greatest node in the left
1352 * subtree and make it the root.
1353 */
1354 root = llist;
1355 llist = root->right;
1356 max_free_left = vm_map_splay_merge_left(header, root, llist);
1357 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1358 } else if (rlist != header) {
1359 /*
1360 * Recover the least node in the right
1361 * subtree and make it the root.
1362 */
1363 root = rlist;
1364 rlist = root->left;
1365 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1366 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1367 } else {
1368 /* There is no root. */
1369 return (NULL);
1370 }
1371 root->max_free = vm_size_max(max_free_left, max_free_right);
1372 map->root = root;
1373 VM_MAP_ASSERT_CONSISTENT(map);
1374 return (root);
1375 }
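/*
 * For reference, the shape of a plain Sleator-Tarjan top-down splay on
 * an ordinary (non-threaded) BST is sketched below; this is textbook
 * code, not kernel code, and it omits everything vm_map_splay() adds:
 * the threaded pointers, the max_free maintenance, and the second
 * bottom-up pass over the reversed spines.
 *
 *	struct node { int key; struct node *left, *right; };
 *
 *	static struct node *
 *	splay(struct node *root, int key)
 *	{
 *		struct node N, *l, *r, *y;
 *
 *		if (root == NULL)
 *			return (NULL);
 *		N.left = N.right = NULL;
 *		l = r = &N;
 *		for (;;) {
 *			if (key < root->key) {
 *				if (root->left == NULL)
 *					break;
 *				if (key < root->left->key) {
 *					y = root->left;		// rotate right
 *					root->left = y->right;
 *					y->right = root;
 *					root = y;
 *					if (root->left == NULL)
 *						break;
 *				}
 *				r->left = root;			// link right
 *				r = root;
 *				root = root->left;
 *			} else if (key > root->key) {
 *				if (root->right == NULL)
 *					break;
 *				if (key > root->right->key) {
 *					y = root->right;	// rotate left
 *					root->right = y->left;
 *					y->left = root;
 *					root = y;
 *					if (root->right == NULL)
 *						break;
 *				}
 *				l->right = root;		// link left
 *				l = root;
 *				root = root->right;
 *			} else
 *				break;
 *		}
 *		l->right = root->left;				// assemble
 *		r->left = root->right;
 *		root->left = N.right;
 *		root->right = N.left;
 *		return (root);
 *	}
 */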
1376
1377 /*
1378 * vm_map_entry_{un,}link:
1379 *
1380 * Insert/remove entries from maps. On linking, if new entry clips
1381 * existing entry, trim existing entry to avoid overlap, and manage
1382 * offsets. On unlinking, merge disappearing entry with neighbor, if
1383 * called for, and manage offsets. Callers should not modify fields in
1384 * entries already mapped.
1385 */
1386 static void
1387 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1388 {
1389 vm_map_entry_t header, llist, rlist, root;
1390 vm_size_t max_free_left, max_free_right;
1391
1392 CTR3(KTR_VM,
1393 "vm_map_entry_link: map %p, nentries %d, entry %p", map,
1394 map->nentries, entry);
1395 VM_MAP_ASSERT_LOCKED(map);
1396 map->nentries++;
1397 header = &map->header;
1398 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1399 if (root == NULL) {
1400 /*
1401 * The new entry does not overlap any existing entry in the
1402 * map, so it becomes the new root of the map tree.
1403 */
1404 max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1405 max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1406 } else if (entry->start == root->start) {
1407 /*
1408 * The new entry is a clone of root, with only the end field
1409 * changed. The root entry will be shrunk to abut the new
1410 * entry, and will be the right child of the new root entry in
1411 * the modified map.
1412 */
1413 KASSERT(entry->end < root->end,
1414 ("%s: clip_start not within entry", __func__));
1415 vm_map_splay_findprev(root, &llist);
1416 if ((root->eflags & (MAP_ENTRY_STACK_GAP_DN |
1417 MAP_ENTRY_STACK_GAP_UP)) == 0)
1418 root->offset += entry->end - root->start;
1419 root->start = entry->end;
1420 max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1421 max_free_right = root->max_free = vm_size_max(
1422 vm_map_splay_merge_pred(entry, root, entry),
1423 vm_map_splay_merge_right(header, root, rlist));
1424 } else {
1425 /*
1426 * The new entry is a clone of root, with only the start field
1427 * changed. The root entry will be shrunk to abut the new
1428 * entry, and will be the left child of the new root entry in
1429 * the modified map.
1430 */
1431 KASSERT(entry->end == root->end,
1432 ("%s: clip_start not within entry", __func__));
1433 vm_map_splay_findnext(root, &rlist);
1434 if ((entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
1435 MAP_ENTRY_STACK_GAP_UP)) == 0)
1436 entry->offset += entry->start - root->start;
1437 root->end = entry->start;
1438 max_free_left = root->max_free = vm_size_max(
1439 vm_map_splay_merge_left(header, root, llist),
1440 vm_map_splay_merge_succ(entry, root, entry));
1441 max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1442 }
1443 entry->max_free = vm_size_max(max_free_left, max_free_right);
1444 map->root = entry;
1445 VM_MAP_ASSERT_CONSISTENT(map);
1446 }
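/*
 * Worked example of the clip-at-start case above (entry->start ==
 * root->start, sketch): if root covers [0x1000, 0x4000) and the new
 * entry covers [0x1000, 0x2000), root is trimmed to [0x2000, 0x4000),
 * its offset is advanced by 0x1000 (unless it is a stack gap entry),
 * and it becomes the right child of the new entry, which takes over as
 * the root of the tree.
 */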
1447
1448 enum unlink_merge_type {
1449 UNLINK_MERGE_NONE,
1450 UNLINK_MERGE_NEXT
1451 };
1452
1453 static void
1454 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1455 enum unlink_merge_type op)
1456 {
1457 vm_map_entry_t header, llist, rlist, root;
1458 vm_size_t max_free_left, max_free_right;
1459
1460 VM_MAP_ASSERT_LOCKED(map);
1461 header = &map->header;
1462 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1463 KASSERT(root != NULL,
1464 ("vm_map_entry_unlink: unlink object not mapped"));
1465
1466 vm_map_splay_findprev(root, &llist);
1467 vm_map_splay_findnext(root, &rlist);
1468 if (op == UNLINK_MERGE_NEXT) {
1469 rlist->start = root->start;
1470 MPASS((rlist->eflags & (MAP_ENTRY_STACK_GAP_DN |
1471 MAP_ENTRY_STACK_GAP_UP)) == 0);
1472 rlist->offset = root->offset;
1473 }
1474 if (llist != header) {
1475 root = llist;
1476 llist = root->right;
1477 max_free_left = vm_map_splay_merge_left(header, root, llist);
1478 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1479 } else if (rlist != header) {
1480 root = rlist;
1481 rlist = root->left;
1482 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1483 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1484 } else {
1485 header->left = header->right = header;
1486 root = NULL;
1487 }
1488 if (root != NULL)
1489 root->max_free = vm_size_max(max_free_left, max_free_right);
1490 map->root = root;
1491 VM_MAP_ASSERT_CONSISTENT(map);
1492 map->nentries--;
1493 CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1494 map->nentries, entry);
1495 }
1496
1497 /*
1498 * vm_map_entry_resize:
1499 *
1500 * Resize a vm_map_entry, recompute the amount of free space that
1501 * follows it and propagate that value up the tree.
1502 *
1503 * The map must be locked, and leaves it so.
1504 */
1505 static void
1506 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1507 {
1508 vm_map_entry_t header, llist, rlist, root;
1509
1510 VM_MAP_ASSERT_LOCKED(map);
1511 header = &map->header;
1512 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1513 KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
1514 vm_map_splay_findnext(root, &rlist);
1515 entry->end += grow_amount;
1516 root->max_free = vm_size_max(
1517 vm_map_splay_merge_left(header, root, llist),
1518 vm_map_splay_merge_succ(header, root, rlist));
1519 map->root = root;
1520 VM_MAP_ASSERT_CONSISTENT(map);
1521 CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1522 __func__, map, map->nentries, entry);
1523 }
1524
1525 /*
1526 * vm_map_lookup_entry: [ internal use only ]
1527 *
1528 * Finds the map entry containing (or
1529 * immediately preceding) the specified address
1530 * in the given map; the entry is returned
1531 * in the "entry" parameter. The boolean
1532 * result indicates whether the address is
1533 * actually contained in the map.
1534 */
1535 boolean_t
1536 vm_map_lookup_entry(
1537 vm_map_t map,
1538 vm_offset_t address,
1539 vm_map_entry_t *entry) /* OUT */
1540 {
1541 vm_map_entry_t cur, header, lbound, ubound;
1542 boolean_t locked;
1543
1544 /*
1545 * If the map is empty, then the map entry immediately preceding
1546 * "address" is the map's header.
1547 */
1548 header = &map->header;
1549 cur = map->root;
1550 if (cur == NULL) {
1551 *entry = header;
1552 return (FALSE);
1553 }
1554 if (address >= cur->start && cur->end > address) {
1555 *entry = cur;
1556 return (TRUE);
1557 }
1558 if ((locked = vm_map_locked(map)) ||
1559 sx_try_upgrade(&map->lock)) {
1560 /*
1561 * Splay requires a write lock on the map. However, it only
1562 * restructures the binary search tree; it does not otherwise
1563 * change the map. Thus, the map's timestamp need not change
1564 * on a temporary upgrade.
1565 */
1566 cur = vm_map_splay(map, address);
1567 if (!locked) {
1568 VM_MAP_UNLOCK_CONSISTENT(map);
1569 sx_downgrade(&map->lock);
1570 }
1571
1572 /*
1573 * If "address" is contained within a map entry, the new root
1574 * is that map entry. Otherwise, the new root is a map entry
1575 * immediately before or after "address".
1576 */
1577 if (address < cur->start) {
1578 *entry = header;
1579 return (FALSE);
1580 }
1581 *entry = cur;
1582 return (address < cur->end);
1583 }
1584 /*
1585 * Since the map is only locked for read access, perform a
1586 * standard binary search tree lookup for "address".
1587 */
1588 lbound = ubound = header;
1589 for (;;) {
1590 if (address < cur->start) {
1591 ubound = cur;
1592 cur = cur->left;
1593 if (cur == lbound)
1594 break;
1595 } else if (cur->end <= address) {
1596 lbound = cur;
1597 cur = cur->right;
1598 if (cur == ubound)
1599 break;
1600 } else {
1601 *entry = cur;
1602 return (TRUE);
1603 }
1604 }
1605 *entry = lbound;
1606 return (FALSE);
1607 }
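/*
 * Trace of the read-locked lookup above (sketch): take three entries
 * A = [0x1000, 0x2000), B = [0x5000, 0x6000) and C = [0x9000, 0xa000),
 * with B the root and A, C its children, and look up address 0x7000.
 * Since 0x7000 >= B->end, lbound becomes B and cur moves to C; since
 * 0x7000 < C->start, ubound becomes C and cur follows C->left, which is
 * the thread back to C's predecessor B == lbound, so the loop stops and
 * B is returned with FALSE.  The threads thus double as termination
 * markers, which is why the loop never tests for NULL.
 */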
1608
1609 /*
1610 * vm_map_insert1() is identical to vm_map_insert() except that it
1611 * returns the newly inserted map entry in '*res'. In case the new
1612 * entry is coalesced with a neighbor or an existing entry was
1613 * resized, that entry is returned. In any case, the returned entry
1614 * covers the specified address range.
1615 */
1616 static int
1617 vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1618 vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow,
1619 vm_map_entry_t *res)
1620 {
1621 vm_map_entry_t new_entry, next_entry, prev_entry;
1622 struct ucred *cred;
1623 vm_eflags_t protoeflags;
1624 vm_inherit_t inheritance;
1625 u_long bdry;
1626 u_int bidx;
1627
1628 VM_MAP_ASSERT_LOCKED(map);
1629 KASSERT(object != kernel_object ||
1630 (cow & MAP_COPY_ON_WRITE) == 0,
1631 ("vm_map_insert: kernel object and COW"));
1632 KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
1633 (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
1634 ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
1635 object, cow));
1636 KASSERT((prot & ~max) == 0,
1637 ("prot %#x is not subset of max_prot %#x", prot, max));
1638
1639 /*
1640 * Check that the start and end points are not bogus.
1641 */
1642 if (start == end || !vm_map_range_valid(map, start, end))
1643 return (KERN_INVALID_ADDRESS);
1644
1645 if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
1646 VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
1647 return (KERN_PROTECTION_FAILURE);
1648
1649 /*
1650 * Find the entry prior to the proposed starting address; if it's part
1651 * of an existing entry, this range is bogus.
1652 */
1653 if (vm_map_lookup_entry(map, start, &prev_entry))
1654 return (KERN_NO_SPACE);
1655
1656 /*
1657 * Assert that the next entry doesn't overlap the end point.
1658 */
1659 next_entry = vm_map_entry_succ(prev_entry);
1660 if (next_entry->start < end)
1661 return (KERN_NO_SPACE);
1662
1663 if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1664 max != VM_PROT_NONE))
1665 return (KERN_INVALID_ARGUMENT);
1666
1667 protoeflags = 0;
1668 if (cow & MAP_COPY_ON_WRITE)
1669 protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1670 if (cow & MAP_NOFAULT)
1671 protoeflags |= MAP_ENTRY_NOFAULT;
1672 if (cow & MAP_DISABLE_SYNCER)
1673 protoeflags |= MAP_ENTRY_NOSYNC;
1674 if (cow & MAP_DISABLE_COREDUMP)
1675 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1676 if (cow & MAP_STACK_GROWS_DOWN)
1677 protoeflags |= MAP_ENTRY_GROWS_DOWN;
1678 if (cow & MAP_STACK_GROWS_UP)
1679 protoeflags |= MAP_ENTRY_GROWS_UP;
1680 if (cow & MAP_WRITECOUNT)
1681 protoeflags |= MAP_ENTRY_WRITECNT;
1682 if (cow & MAP_VN_EXEC)
1683 protoeflags |= MAP_ENTRY_VN_EXEC;
1684 if ((cow & MAP_CREATE_GUARD) != 0)
1685 protoeflags |= MAP_ENTRY_GUARD;
1686 if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1687 protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1688 if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1689 protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1690 if (cow & MAP_INHERIT_SHARE)
1691 inheritance = VM_INHERIT_SHARE;
1692 else
1693 inheritance = VM_INHERIT_DEFAULT;
1694 if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
1695 /* This magically ignores index 0, for usual page size. */
1696 bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
1697 MAP_SPLIT_BOUNDARY_SHIFT;
1698 if (bidx >= MAXPAGESIZES)
1699 return (KERN_INVALID_ARGUMENT);
1700 bdry = pagesizes[bidx] - 1;
1701 if ((start & bdry) != 0 || (end & bdry) != 0)
1702 return (KERN_INVALID_ARGUMENT);
1703 protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
1704 }
1705
1706 cred = NULL;
1707 if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1708 goto charged;
1709 if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1710 ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1711 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1712 return (KERN_RESOURCE_SHORTAGE);
1713 KASSERT(object == NULL ||
1714 (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1715 object->cred == NULL,
1716 ("overcommit: vm_map_insert o %p", object));
1717 cred = curthread->td_ucred;
1718 }
1719
1720 charged:
1721 /* Expand the kernel pmap, if necessary. */
1722 if (map == kernel_map && end > kernel_vm_end)
1723 pmap_growkernel(end);
1724 if (object != NULL) {
1725 /*
1726 * OBJ_ONEMAPPING must be cleared unless this mapping
1727 * is trivially proven to be the only mapping for any
1728 * of the object's pages. (Object granularity
1729 * reference counting is insufficient to recognize
1730 * aliases with precision.)
1731 */
1732 if ((object->flags & OBJ_ANON) != 0) {
1733 VM_OBJECT_WLOCK(object);
1734 if (object->ref_count > 1 || object->shadow_count != 0)
1735 vm_object_clear_flag(object, OBJ_ONEMAPPING);
1736 VM_OBJECT_WUNLOCK(object);
1737 }
1738 } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1739 protoeflags &&
1740 (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
1741 MAP_VN_EXEC)) == 0 &&
1742 prev_entry->end == start && (prev_entry->cred == cred ||
1743 (prev_entry->object.vm_object != NULL &&
1744 prev_entry->object.vm_object->cred == cred)) &&
1745 vm_object_coalesce(prev_entry->object.vm_object,
1746 prev_entry->offset,
1747 (vm_size_t)(prev_entry->end - prev_entry->start),
1748 (vm_size_t)(end - prev_entry->end), cred != NULL &&
1749 (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1750 /*
1751 * We were able to extend the object. Determine if we
1752 * can extend the previous map entry to include the
1753 * new range as well.
1754 */
1755 if (prev_entry->inheritance == inheritance &&
1756 prev_entry->protection == prot &&
1757 prev_entry->max_protection == max &&
1758 prev_entry->wired_count == 0) {
1759 KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1760 0, ("prev_entry %p has incoherent wiring",
1761 prev_entry));
1762 if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1763 map->size += end - prev_entry->end;
1764 vm_map_entry_resize(map, prev_entry,
1765 end - prev_entry->end);
1766 *res = vm_map_try_merge_entries(map, prev_entry,
1767 next_entry);
1768 return (KERN_SUCCESS);
1769 }
1770
1771 /*
1772 * If we can extend the object but cannot extend the
1773 * map entry, we have to create a new map entry. We
1774 * must bump the ref count on the extended object to
1775 * account for it. object may be NULL.
1776 */
1777 object = prev_entry->object.vm_object;
1778 offset = prev_entry->offset +
1779 (prev_entry->end - prev_entry->start);
1780 vm_object_reference(object);
1781 if (cred != NULL && object != NULL && object->cred != NULL &&
1782 !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1783 /* Object already accounts for this uid. */
1784 cred = NULL;
1785 }
1786 }
1787 if (cred != NULL)
1788 crhold(cred);
1789
1790 /*
1791 * Create a new entry
1792 */
1793 new_entry = vm_map_entry_create(map);
1794 new_entry->start = start;
1795 new_entry->end = end;
1796 new_entry->cred = NULL;
1797
1798 new_entry->eflags = protoeflags;
1799 new_entry->object.vm_object = object;
1800 new_entry->offset = offset;
1801
1802 new_entry->inheritance = inheritance;
1803 new_entry->protection = prot;
1804 new_entry->max_protection = max;
1805 new_entry->wired_count = 0;
1806 new_entry->wiring_thread = NULL;
1807 new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1808 new_entry->next_read = start;
1809
1810 KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1811 ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1812 new_entry->cred = cred;
1813
1814 /*
1815 * Insert the new entry into the list
1816 */
1817 vm_map_entry_link(map, new_entry);
1818 if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1819 map->size += new_entry->end - new_entry->start;
1820
1821 /*
1822 * Try to coalesce the new entry with both the previous and next
1823 * entries in the list. Previously, we only attempted to coalesce
1824 * with the previous entry when object is NULL. Here, we handle the
1825 * other cases, which are less common.
1826 */
1827 vm_map_try_merge_entries(map, prev_entry, new_entry);
1828 *res = vm_map_try_merge_entries(map, new_entry, next_entry);
1829
1830 if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1831 vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1832 end - start, cow & MAP_PREFAULT_PARTIAL);
1833 }
1834
1835 return (KERN_SUCCESS);
1836 }
1837
1838 /*
1839 * vm_map_insert:
1840 *
1841 * Inserts the given VM object into the target map at the
1842 * specified address range.
1843 *
1844 * Requires that the map be locked, and leaves it so.
1845 *
1846 * If object is non-NULL, ref count must be bumped by caller
1847 * prior to making call to account for the new entry.
1848 */
1849 int
1850 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1851 vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1852 {
1853 vm_map_entry_t res;
1854
1855 return (vm_map_insert1(map, object, offset, start, end, prot, max,
1856 cow, &res));
1857 }
1858
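/*
 * Illustrative sketch, not part of the original code: a hypothetical caller
 * of vm_map_insert().  Per the contract stated above, the caller holds the
 * map lock and has already bumped the object's reference count; the object,
 * range, and protections here are assumed values used only for illustration.
 *
 *	vm_object_reference(obj);
 *	vm_map_lock(map);
 *	rv = vm_map_insert(map, obj, 0, start, start + size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_COPY_ON_WRITE);
 *	vm_map_unlock(map);
 *	if (rv != KERN_SUCCESS)
 *		vm_object_deallocate(obj);
 */
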
1859 /*
1860 * vm_map_findspace:
1861 *
1862 * Find the first fit (lowest VM address) for "length" free bytes
1863 * beginning at address >= start in the given map.
1864 *
1865 * In a vm_map_entry, "max_free" is the maximum amount of
1866 * contiguous free space between an entry in its subtree and a
1867 * neighbor of that entry. This allows finding a free region in
1868 * one path down the tree, so O(log n) amortized with splay
1869 * trees.
1870 *
1871 * Requires that the map be locked, and leaves it so.
1872 *
1873 * Returns: starting address if sufficient space,
1874 * vm_map_max(map)-length+1 if insufficient space.
1875 */
1876 vm_offset_t
1877 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1878 {
1879 vm_map_entry_t header, llist, rlist, root, y;
1880 vm_size_t left_length, max_free_left, max_free_right;
1881 vm_offset_t gap_end;
1882
1883 VM_MAP_ASSERT_LOCKED(map);
1884
1885 /*
1886 * Request must fit within min/max VM address and must avoid
1887 * address wrap.
1888 */
1889 start = MAX(start, vm_map_min(map));
1890 if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
1891 return (vm_map_max(map) - length + 1);
1892
1893 /* Empty tree means wide open address space. */
1894 if (map->root == NULL)
1895 return (start);
1896
1897 /*
1898 * After splay_split, if start is within an entry, push it to the start
1899 * of the following gap. If rlist is at the end of the gap containing
1900 * start, save the end of that gap in gap_end to see if the gap is big
1901 * enough; otherwise set gap_end to start, skipping the gap check and
1902 * moving directly to a search of the right subtree.
1903 */
1904 header = &map->header;
1905 root = vm_map_splay_split(map, start, length, &llist, &rlist);
1906 gap_end = rlist->start;
1907 if (root != NULL) {
1908 start = root->end;
1909 if (root->right != rlist)
1910 gap_end = start;
1911 max_free_left = vm_map_splay_merge_left(header, root, llist);
1912 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1913 } else if (rlist != header) {
1914 root = rlist;
1915 rlist = root->left;
1916 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1917 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1918 } else {
1919 root = llist;
1920 llist = root->right;
1921 max_free_left = vm_map_splay_merge_left(header, root, llist);
1922 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1923 }
1924 root->max_free = vm_size_max(max_free_left, max_free_right);
1925 map->root = root;
1926 VM_MAP_ASSERT_CONSISTENT(map);
1927 if (length <= gap_end - start)
1928 return (start);
1929
1930 /* With max_free, can immediately tell if no solution. */
1931 if (root->right == header || length > root->right->max_free)
1932 return (vm_map_max(map) - length + 1);
1933
1934 /*
1935 * Splay for the least large-enough gap in the right subtree.
1936 */
1937 llist = rlist = header;
1938 for (left_length = 0;;
1939 left_length = vm_map_entry_max_free_left(root, llist)) {
1940 if (length <= left_length)
1941 SPLAY_LEFT_STEP(root, y, llist, rlist,
1942 length <= vm_map_entry_max_free_left(y, llist));
1943 else
1944 SPLAY_RIGHT_STEP(root, y, llist, rlist,
1945 length > vm_map_entry_max_free_left(y, root));
1946 if (root == NULL)
1947 break;
1948 }
1949 root = llist;
1950 llist = root->right;
1951 max_free_left = vm_map_splay_merge_left(header, root, llist);
1952 if (rlist == header) {
1953 root->max_free = vm_size_max(max_free_left,
1954 vm_map_splay_merge_succ(header, root, rlist));
1955 } else {
1956 y = rlist;
1957 rlist = y->left;
1958 y->max_free = vm_size_max(
1959 vm_map_splay_merge_pred(root, y, root),
1960 vm_map_splay_merge_right(header, y, rlist));
1961 root->max_free = vm_size_max(max_free_left, y->max_free);
1962 }
1963 map->root = root;
1964 VM_MAP_ASSERT_CONSISTENT(map);
1965 return (root->end);
1966 }
1967
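/*
 * Illustrative sketch, not part of the original code: because
 * vm_map_findspace() reports failure by returning an out-of-range address
 * rather than an error code, callers in this file check the result as
 * follows before using it:
 *
 *	addr = vm_map_findspace(map, start, length);
 *	if (addr + length > vm_map_max(map))
 *		return (KERN_NO_SPACE);
 */
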
1968 int
1969 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1970 vm_offset_t start, vm_size_t length, vm_prot_t prot,
1971 vm_prot_t max, int cow)
1972 {
1973 vm_offset_t end;
1974 int result;
1975
1976 end = start + length;
1977 KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1978 object == NULL,
1979 ("vm_map_fixed: non-NULL backing object for stack"));
1980 vm_map_lock(map);
1981 VM_MAP_RANGE_CHECK(map, start, end);
1982 if ((cow & MAP_CHECK_EXCL) == 0) {
1983 result = vm_map_delete(map, start, end);
1984 if (result != KERN_SUCCESS)
1985 goto out;
1986 }
1987 if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1988 result = vm_map_stack_locked(map, start, length, sgrowsiz,
1989 prot, max, cow);
1990 } else {
1991 result = vm_map_insert(map, object, offset, start, end,
1992 prot, max, cow);
1993 }
1994 out:
1995 vm_map_unlock(map);
1996 return (result);
1997 }
1998
1999 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
2000 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
2001
2002 static int cluster_anon = 1;
2003 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
2004 &cluster_anon, 0,
2005 "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
2006
2007 static bool
2008 clustering_anon_allowed(vm_offset_t addr, int cow)
2009 {
2010
2011 switch (cluster_anon) {
2012 case 0:
2013 return (false);
2014 case 1:
2015 return (addr == 0 || (cow & MAP_NO_HINT) != 0);
2016 case 2:
2017 default:
2018 return (true);
2019 }
2020 }
2021
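/*
 * Note added for illustration, not part of the original code: the clustering
 * policy above is controlled by the read-write sysctl declared earlier, e.g.
 * "sysctl vm.cluster_anon=2" requests clustering of anonymous mappings even
 * when the caller supplied an address hint.
 */
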
2022 static long aslr_restarts;
2023 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
2024 &aslr_restarts, 0,
2025 "Number of aslr failures");
2026
2027 /*
2028 * Searches for the specified amount of free space in the given map with the
2029 * specified alignment. Performs an address-ordered, first-fit search from
2030 * the given address "*addr", with an optional upper bound "max_addr". If the
2031 * parameter "alignment" is zero, then the alignment is computed from the
2032 * given (object, offset) pair so as to enable the greatest possible use of
2033 * superpage mappings. Returns KERN_SUCCESS and the address of the free space
2034 * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE.
2035 *
2036 * The map must be locked. Initially, there must be at least "length" bytes
2037 * of free space at the given address.
2038 */
2039 static int
2040 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2041 vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
2042 vm_offset_t alignment)
2043 {
2044 vm_offset_t aligned_addr, free_addr;
2045
2046 VM_MAP_ASSERT_LOCKED(map);
2047 free_addr = *addr;
2048 KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
2049 ("caller failed to provide space %#jx at address %p",
2050 (uintmax_t)length, (void *)free_addr));
2051 for (;;) {
2052 /*
2053 * At the start of every iteration, the free space at address
2054 * "*addr" is at least "length" bytes.
2055 */
2056 if (alignment == 0)
2057 pmap_align_superpage(object, offset, addr, length);
2058 else
2059 *addr = roundup2(*addr, alignment);
2060 aligned_addr = *addr;
2061 if (aligned_addr == free_addr) {
2062 /*
2063 * Alignment did not change "*addr", so "*addr" must
2064 * still provide sufficient free space.
2065 */
2066 return (KERN_SUCCESS);
2067 }
2068
2069 /*
2070 * Test for address wrap on "*addr". A wrapped "*addr" could
2071 * be a valid address, in which case vm_map_findspace() cannot
2072 * be relied upon to fail.
2073 */
2074 if (aligned_addr < free_addr)
2075 return (KERN_NO_SPACE);
2076 *addr = vm_map_findspace(map, aligned_addr, length);
2077 if (*addr + length > vm_map_max(map) ||
2078 (max_addr != 0 && *addr + length > max_addr))
2079 return (KERN_NO_SPACE);
2080 free_addr = *addr;
2081 if (free_addr == aligned_addr) {
2082 /*
2083 * If a successful call to vm_map_findspace() did not
2084 * change "*addr", then "*addr" must still be aligned
2085 * and provide sufficient free space.
2086 */
2087 return (KERN_SUCCESS);
2088 }
2089 }
2090 }
2091
2092 int
2093 vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
2094 vm_offset_t max_addr, vm_offset_t alignment)
2095 {
2096 /* XXXKIB ASLR eh ? */
2097 *addr = vm_map_findspace(map, *addr, length);
2098 if (*addr + length > vm_map_max(map) ||
2099 (max_addr != 0 && *addr + length > max_addr))
2100 return (KERN_NO_SPACE);
2101 return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
2102 alignment));
2103 }
2104
2105 /*
2106 * vm_map_find finds an unallocated region in the target address
2107 * map with the given length. The search is defined to be
2108 * first-fit from the specified address; the region found is
2109 * returned in the same parameter.
2110 *
2111 * If object is non-NULL, ref count must be bumped by caller
2112 * prior to making call to account for the new entry.
2113 */
2114 int
2115 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2116 vm_offset_t *addr, /* IN/OUT */
2117 vm_size_t length, vm_offset_t max_addr, int find_space,
2118 vm_prot_t prot, vm_prot_t max, int cow)
2119 {
2120 int rv;
2121
2122 vm_map_lock(map);
2123 rv = vm_map_find_locked(map, object, offset, addr, length, max_addr,
2124 find_space, prot, max, cow);
2125 vm_map_unlock(map);
2126 return (rv);
2127 }
2128
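/*
 * Illustrative sketch, not part of the original code: a hypothetical caller
 * requesting any sufficiently large region at or above "hint", preferring
 * superpage-friendly placement.  As documented above, the caller must
 * already hold a reference on the (assumed) object "obj".
 *
 *	addr = hint;
 *	rv = vm_map_find(map, obj, 0, &addr, size, 0, VMFS_OPTIMAL_SPACE,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 *
 * On success, "addr" holds the chosen start address; passing 0 for max_addr
 * leaves the search bounded only by vm_map_max(map).
 */
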
2129 int
2130 vm_map_find_locked(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2131 vm_offset_t *addr, /* IN/OUT */
2132 vm_size_t length, vm_offset_t max_addr, int find_space,
2133 vm_prot_t prot, vm_prot_t max, int cow)
2134 {
2135 vm_offset_t alignment, curr_min_addr, min_addr;
2136 int gap, pidx, rv, try;
2137 bool cluster, en_aslr, update_anon;
2138
2139 KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
2140 object == NULL,
2141 ("non-NULL backing object for stack"));
2142 MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
2143 (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
2144 if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
2145 (object->flags & OBJ_COLORED) == 0))
2146 find_space = VMFS_ANY_SPACE;
2147 if (find_space >> 8 != 0) {
2148 KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
2149 alignment = (vm_offset_t)1 << (find_space >> 8);
2150 } else
2151 alignment = 0;
2152 en_aslr = (map->flags & MAP_ASLR) != 0;
2153 update_anon = cluster = clustering_anon_allowed(*addr, cow) &&
2154 (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
2155 find_space != VMFS_NO_SPACE && object == NULL &&
2156 (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
2157 MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
2158 curr_min_addr = min_addr = *addr;
2159 if (en_aslr && min_addr == 0 && !cluster &&
2160 find_space != VMFS_NO_SPACE &&
2161 (map->flags & MAP_ASLR_IGNSTART) != 0)
2162 curr_min_addr = min_addr = vm_map_min(map);
2163 try = 0;
2164 if (cluster) {
2165 curr_min_addr = map->anon_loc;
2166 if (curr_min_addr == 0)
2167 cluster = false;
2168 }
2169 if (find_space != VMFS_NO_SPACE) {
2170 KASSERT(find_space == VMFS_ANY_SPACE ||
2171 find_space == VMFS_OPTIMAL_SPACE ||
2172 find_space == VMFS_SUPER_SPACE ||
2173 alignment != 0, ("unexpected VMFS flag"));
2174 again:
2175 /*
2176 * When creating an anonymous mapping, try clustering
2177 * with an existing anonymous mapping first.
2178 *
2179 * We make up to two attempts to find address space
2180 * for a given find_space value. The first attempt may
2181 * apply randomization or may cluster with an existing
2182 * anonymous mapping. If this first attempt fails,
2183 * perform a first-fit search of the available address
2184 * space.
2185 *
2186 * If both attempts fail and find_space is
2187 * VMFS_OPTIMAL_SPACE, fall back to VMFS_ANY_SPACE,
2188 * re-enabling clustering and randomization.
2189 */
2190 try++;
2191 MPASS(try <= 2);
2192
2193 if (try == 2) {
2194 /*
2195 * Second try: we failed either to find a
2196 * suitable region for randomizing the
2197 * allocation, or to cluster with an existing
2198 * mapping. Retry with free run.
2199 */
2200 curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
2201 vm_map_min(map) : min_addr;
2202 atomic_add_long(&aslr_restarts, 1);
2203 }
2204
2205 if (try == 1 && en_aslr && !cluster) {
2206 /*
2207 * Find space for allocation, including
2208 * gap needed for later randomization.
2209 */
2210 pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
2211 (find_space == VMFS_SUPER_SPACE || find_space ==
2212 VMFS_OPTIMAL_SPACE) ? 1 : 0;
2213 gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
2214 (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
2215 aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
2216 *addr = vm_map_findspace(map, curr_min_addr,
2217 length + gap * pagesizes[pidx]);
2218 if (*addr + length + gap * pagesizes[pidx] >
2219 vm_map_max(map))
2220 goto again;
2221 /* And randomize the start address. */
2222 *addr += (arc4random() % gap) * pagesizes[pidx];
2223 if (max_addr != 0 && *addr + length > max_addr)
2224 goto again;
2225 } else {
2226 *addr = vm_map_findspace(map, curr_min_addr, length);
2227 if (*addr + length > vm_map_max(map) ||
2228 (max_addr != 0 && *addr + length > max_addr)) {
2229 if (cluster) {
2230 cluster = false;
2231 MPASS(try == 1);
2232 goto again;
2233 }
2234 return (KERN_NO_SPACE);
2235 }
2236 }
2237
2238 if (find_space != VMFS_ANY_SPACE &&
2239 (rv = vm_map_alignspace(map, object, offset, addr, length,
2240 max_addr, alignment)) != KERN_SUCCESS) {
2241 if (find_space == VMFS_OPTIMAL_SPACE) {
2242 find_space = VMFS_ANY_SPACE;
2243 curr_min_addr = min_addr;
2244 cluster = update_anon;
2245 try = 0;
2246 goto again;
2247 }
2248 return (rv);
2249 }
2250 } else if ((cow & MAP_REMAP) != 0) {
2251 if (!vm_map_range_valid(map, *addr, *addr + length))
2252 return (KERN_INVALID_ADDRESS);
2253 rv = vm_map_delete(map, *addr, *addr + length);
2254 if (rv != KERN_SUCCESS)
2255 return (rv);
2256 }
2257 if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
2258 rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
2259 max, cow);
2260 } else {
2261 rv = vm_map_insert(map, object, offset, *addr, *addr + length,
2262 prot, max, cow);
2263 }
2264
2265 /*
2266 * Update the starting address for clustered anonymous memory mappings
2267 * if a starting address was not previously defined or an ASLR restart
2268 * placed an anonymous memory mapping at a lower address.
2269 */
2270 if (update_anon && rv == KERN_SUCCESS && (map->anon_loc == 0 ||
2271 *addr < map->anon_loc))
2272 map->anon_loc = *addr;
2273 return (rv);
2274 }
2275
2276 /*
2277 * vm_map_find_min() is a variant of vm_map_find() that takes an
2278 * additional parameter ("default_addr") and treats the given address
2279 * ("*addr") differently. Specifically, it treats "*addr" as a hint
2280 * and not as the minimum address where the mapping is created.
2281 *
2282 * This function works in two phases. First, it tries to
2283 * allocate above the hint. If that fails and the hint is
2284 * greater than "default_addr", it performs a second pass, replacing
2285 * the hint with "default_addr" as the minimum address for the
2286 * allocation.
2287 */
2288 int
2289 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2290 vm_offset_t *addr, vm_size_t length, vm_offset_t default_addr,
2291 vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
2292 int cow)
2293 {
2294 vm_offset_t hint;
2295 int rv;
2296
2297 hint = *addr;
2298 if (hint == 0) {
2299 cow |= MAP_NO_HINT;
2300 *addr = hint = default_addr;
2301 }
2302 for (;;) {
2303 rv = vm_map_find(map, object, offset, addr, length, max_addr,
2304 find_space, prot, max, cow);
2305 if (rv == KERN_SUCCESS || default_addr >= hint)
2306 return (rv);
2307 *addr = hint = default_addr;
2308 }
2309 }
2310
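/*
 * Worked example added for illustration, not part of the original code,
 * using assumed addresses: with *addr == 0x800000000 and default_addr ==
 * 0x10000000, the first pass searches at or above the hint; if it fails,
 * the hint is replaced by default_addr and the search is retried once.
 * A zero hint is replaced by default_addr up front and MAP_NO_HINT is set,
 * so only a single pass is made.
 */
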
2311 /*
2312 * A map entry with any of the following flags set must not be merged with
2313 * another entry.
2314 */
2315 #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
2316 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC | \
2317 MAP_ENTRY_STACK_GAP_UP | MAP_ENTRY_STACK_GAP_DN)
2318
2319 static bool
2320 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2321 {
2322
2323 KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2324 (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2325 ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2326 prev, entry));
2327 return (prev->end == entry->start &&
2328 prev->object.vm_object == entry->object.vm_object &&
2329 (prev->object.vm_object == NULL ||
2330 prev->offset + (prev->end - prev->start) == entry->offset) &&
2331 prev->eflags == entry->eflags &&
2332 prev->protection == entry->protection &&
2333 prev->max_protection == entry->max_protection &&
2334 prev->inheritance == entry->inheritance &&
2335 prev->wired_count == entry->wired_count &&
2336 prev->cred == entry->cred);
2337 }
2338
2339 static void
2340 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2341 {
2342
2343 /*
2344 * If the backing object is a vnode object, vm_object_deallocate()
2345 * calls vrele(). However, vrele() does not lock the vnode because
2346 * the vnode has additional references. Thus, the map lock can be
2347 * kept without causing a lock-order reversal with the vnode lock.
2348 *
2349 * Since we count the number of virtual page mappings in
2350 * object->un_pager.vnp.writemappings, the writemappings value
2351 * should not be adjusted when the entry is disposed of.
2352 */
2353 if (entry->object.vm_object != NULL)
2354 vm_object_deallocate(entry->object.vm_object);
2355 if (entry->cred != NULL)
2356 crfree(entry->cred);
2357 vm_map_entry_dispose(map, entry);
2358 }
2359
2360 /*
2361 * vm_map_try_merge_entries:
2362 *
2363 * Compare two map entries that represent consecutive ranges. If
2364 * the entries can be merged, expand the range of the second to
2365 * cover the range of the first and delete the first. Then return
2366 * the map entry that includes the first range.
2367 *
2368 * The map must be locked.
2369 */
2370 vm_map_entry_t
2371 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
2372 vm_map_entry_t entry)
2373 {
2374
2375 VM_MAP_ASSERT_LOCKED(map);
2376 if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2377 vm_map_mergeable_neighbors(prev_entry, entry)) {
2378 vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
2379 vm_map_merged_neighbor_dispose(map, prev_entry);
2380 return (entry);
2381 }
2382 return (prev_entry);
2383 }
2384
2385 /*
2386 * vm_map_entry_back:
2387 *
2388 * Allocate an object to back a map entry.
2389 */
2390 static inline void
2391 vm_map_entry_back(vm_map_entry_t entry)
2392 {
2393 vm_object_t object;
2394
2395 KASSERT(entry->object.vm_object == NULL,
2396 ("map entry %p has backing object", entry));
2397 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2398 ("map entry %p is a submap", entry));
2399 object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
2400 entry->cred, entry->end - entry->start);
2401 entry->object.vm_object = object;
2402 entry->offset = 0;
2403 entry->cred = NULL;
2404 }
2405
2406 /*
2407 * vm_map_entry_charge_object
2408 *
2409 * If there is no object backing this entry, create one. Otherwise, if
2410 * the entry has cred, give it to the backing object.
2411 */
2412 static inline void
2413 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2414 {
2415
2416 VM_MAP_ASSERT_LOCKED(map);
2417 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2418 ("map entry %p is a submap", entry));
2419 if (entry->object.vm_object == NULL && !map->system_map &&
2420 (entry->eflags & MAP_ENTRY_GUARD) == 0)
2421 vm_map_entry_back(entry);
2422 else if (entry->object.vm_object != NULL &&
2423 ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2424 entry->cred != NULL) {
2425 VM_OBJECT_WLOCK(entry->object.vm_object);
2426 KASSERT(entry->object.vm_object->cred == NULL,
2427 ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2428 entry->object.vm_object->cred = entry->cred;
2429 entry->object.vm_object->charge = entry->end - entry->start;
2430 VM_OBJECT_WUNLOCK(entry->object.vm_object);
2431 entry->cred = NULL;
2432 }
2433 }
2434
2435 /*
2436 * vm_map_entry_clone
2437 *
2438 * Create a duplicate map entry for clipping.
2439 */
2440 static vm_map_entry_t
2441 vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
2442 {
2443 vm_map_entry_t new_entry;
2444
2445 VM_MAP_ASSERT_LOCKED(map);
2446
2447 /*
2448 * Create a backing object now, if none exists, so that more individual
2449 * objects won't be created after the map entry is split.
2450 */
2451 vm_map_entry_charge_object(map, entry);
2452
2453 /* Clone the entry. */
2454 new_entry = vm_map_entry_create(map);
2455 *new_entry = *entry;
2456 if (new_entry->cred != NULL)
2457 crhold(entry->cred);
2458 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2459 vm_object_reference(new_entry->object.vm_object);
2460 vm_map_entry_set_vnode_text(new_entry, true);
2461 /*
2462 * The object->un_pager.vnp.writemappings for the object of
2463 * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The
2464 * virtual pages are re-distributed among the clipped entries,
2465 * so the sum is left the same.
2466 */
2467 }
2468 return (new_entry);
2469 }
2470
2471 /*
2472 * vm_map_clip_start: [ internal use only ]
2473 *
2474 * Asserts that the given entry begins at or after
2475 * the specified address; if necessary,
2476 * it splits the entry into two.
2477 */
2478 static int
2479 vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
2480 {
2481 vm_map_entry_t new_entry;
2482 int bdry_idx;
2483
2484 if (!map->system_map)
2485 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2486 "%s: map %p entry %p start 0x%jx", __func__, map, entry,
2487 (uintmax_t)startaddr);
2488
2489 if (startaddr <= entry->start)
2490 return (KERN_SUCCESS);
2491
2492 VM_MAP_ASSERT_LOCKED(map);
2493 KASSERT(entry->end > startaddr && entry->start < startaddr,
2494 ("%s: invalid clip of entry %p", __func__, entry));
2495
2496 bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2497 if (bdry_idx != 0) {
2498 if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
2499 return (KERN_INVALID_ARGUMENT);
2500 }
2501
2502 new_entry = vm_map_entry_clone(map, entry);
2503
2504 /*
2505 * Split off the front portion. Insert the new entry BEFORE this one,
2506 * so that this entry has the specified starting address.
2507 */
2508 new_entry->end = startaddr;
2509 vm_map_entry_link(map, new_entry);
2510 return (KERN_SUCCESS);
2511 }
2512
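/*
 * Worked example added for illustration, not part of the original code,
 * using assumed addresses: clipping an entry that spans [0x1000, 0x5000)
 * at startaddr 0x3000 links a cloned entry covering [0x1000, 0x3000) in
 * front, while the original entry is trimmed to [0x3000, 0x5000); both
 * halves reference the same backing object, with offsets that differ by
 * 0x2000.
 */
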
2513 /*
2514 * vm_map_lookup_clip_start:
2515 *
2516 * Find the entry at or just after 'start', and clip it if 'start' is in
2517 * the interior of the entry. Return the entry at or after 'start' in
2518 * *res_entry, and set *prev_entry to the entry before 'start'.
2519 */
2520 static int
2521 vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
2522 vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
2523 {
2524 vm_map_entry_t entry;
2525 int rv;
2526
2527 if (!map->system_map)
2528 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2529 "%s: map %p start 0x%jx prev %p", __func__, map,
2530 (uintmax_t)start, prev_entry);
2531
2532 if (vm_map_lookup_entry(map, start, prev_entry)) {
2533 entry = *prev_entry;
2534 rv = vm_map_clip_start(map, entry, start);
2535 if (rv != KERN_SUCCESS)
2536 return (rv);
2537 *prev_entry = vm_map_entry_pred(entry);
2538 } else
2539 entry = vm_map_entry_succ(*prev_entry);
2540 *res_entry = entry;
2541 return (KERN_SUCCESS);
2542 }
2543
2544 /*
2545 * vm_map_clip_end: [ internal use only ]
2546 *
2547 * Asserts that the given entry ends at or before
2548 * the specified address; if necessary,
2549 * it splits the entry into two.
2550 */
2551 static int
2552 vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
2553 {
2554 vm_map_entry_t new_entry;
2555 int bdry_idx;
2556
2557 if (!map->system_map)
2558 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2559 "%s: map %p entry %p end 0x%jx", __func__, map, entry,
2560 (uintmax_t)endaddr);
2561
2562 if (endaddr >= entry->end)
2563 return (KERN_SUCCESS);
2564
2565 VM_MAP_ASSERT_LOCKED(map);
2566 KASSERT(entry->start < endaddr && entry->end > endaddr,
2567 ("%s: invalid clip of entry %p", __func__, entry));
2568
2569 bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2570 if (bdry_idx != 0) {
2571 if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
2572 return (KERN_INVALID_ARGUMENT);
2573 }
2574
2575 new_entry = vm_map_entry_clone(map, entry);
2576
2577 /*
2578 * Split off the back portion. Insert the new entry AFTER this one,
2579 * so that this entry has the specified ending address.
2580 */
2581 new_entry->start = endaddr;
2582 vm_map_entry_link(map, new_entry);
2583
2584 return (KERN_SUCCESS);
2585 }
2586
2587 /*
2588 * vm_map_submap: [ kernel use only ]
2589 *
2590 * Mark the given range as handled by a subordinate map.
2591 *
2592 * This range must have been created with vm_map_find,
2593 * and no other operations may have been performed on this
2594 * range prior to calling vm_map_submap.
2595 *
2596 * Only a limited number of operations can be performed
2597 * within this range after calling vm_map_submap:
2598 * vm_fault
2599 * [Don't try vm_map_copy!]
2600 *
2601 * To remove a submapping, one must first remove the
2602 * range from the superior map, and then destroy the
2603 * submap (if desired). [Better yet, don't try it.]
2604 */
2605 int
2606 vm_map_submap(
2607 vm_map_t map,
2608 vm_offset_t start,
2609 vm_offset_t end,
2610 vm_map_t submap)
2611 {
2612 vm_map_entry_t entry;
2613 int result;
2614
2615 result = KERN_INVALID_ARGUMENT;
2616
2617 vm_map_lock(submap);
2618 submap->flags |= MAP_IS_SUB_MAP;
2619 vm_map_unlock(submap);
2620
2621 vm_map_lock(map);
2622 VM_MAP_RANGE_CHECK(map, start, end);
2623 if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
2624 (entry->eflags & MAP_ENTRY_COW) == 0 &&
2625 entry->object.vm_object == NULL) {
2626 result = vm_map_clip_start(map, entry, start);
2627 if (result != KERN_SUCCESS)
2628 goto unlock;
2629 result = vm_map_clip_end(map, entry, end);
2630 if (result != KERN_SUCCESS)
2631 goto unlock;
2632 entry->object.sub_map = submap;
2633 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2634 result = KERN_SUCCESS;
2635 }
2636 unlock:
2637 vm_map_unlock(map);
2638
2639 if (result != KERN_SUCCESS) {
2640 vm_map_lock(submap);
2641 submap->flags &= ~MAP_IS_SUB_MAP;
2642 vm_map_unlock(submap);
2643 }
2644 return (result);
2645 }
2646
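/*
 * Illustrative sketch, not part of the original code: the expected calling
 * pattern, with assumed sizes, is to reserve the range in the parent map
 * first and only then install the submap:
 *
 *	rv = vm_map_find(map, NULL, 0, &start, size, 0, VMFS_ANY_SPACE,
 *	    VM_PROT_ALL, VM_PROT_ALL, 0);
 *	if (rv == KERN_SUCCESS)
 *		rv = vm_map_submap(map, start, start + size, submap);
 */
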
2647 /*
2648 * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2649 */
2650 #define MAX_INIT_PT 96
2651
2652 /*
2653 * vm_map_pmap_enter:
2654 *
2655 * Preload the specified map's pmap with mappings to the specified
2656 * object's memory-resident pages. No further physical pages are
2657 * allocated, and no further virtual pages are retrieved from secondary
2658 * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
2659 * limited number of page mappings are created at the low-end of the
2660 * specified address range. (For this purpose, a superpage mapping
2661 * counts as one page mapping.) Otherwise, all resident pages within
2662 * the specified address range are mapped.
2663 */
2664 static void
2665 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2666 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2667 {
2668 vm_offset_t start;
2669 vm_page_t p, p_start;
2670 vm_pindex_t mask, psize, threshold, tmpidx;
2671
2672 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2673 return;
2674 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2675 VM_OBJECT_WLOCK(object);
2676 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2677 pmap_object_init_pt(map->pmap, addr, object, pindex,
2678 size);
2679 VM_OBJECT_WUNLOCK(object);
2680 return;
2681 }
2682 VM_OBJECT_LOCK_DOWNGRADE(object);
2683 } else
2684 VM_OBJECT_RLOCK(object);
2685
2686 psize = atop(size);
2687 if (psize + pindex > object->size) {
2688 if (pindex >= object->size) {
2689 VM_OBJECT_RUNLOCK(object);
2690 return;
2691 }
2692 psize = object->size - pindex;
2693 }
2694
2695 start = 0;
2696 p_start = NULL;
2697 threshold = MAX_INIT_PT;
2698
2699 p = vm_page_find_least(object, pindex);
2700 /*
2701 * Assert: the variable p is either (1) the page with the
2702 * least pindex greater than or equal to the parameter pindex
2703 * or (2) NULL.
2704 */
2705 for (;
2706 p != NULL && (tmpidx = p->pindex - pindex) < psize;
2707 p = TAILQ_NEXT(p, listq)) {
2708 /*
2709 * don't allow an madvise to blow away our really
2710 * free pages allocating pv entries.
2711 */
2712 if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2713 vm_page_count_severe()) ||
2714 ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2715 tmpidx >= threshold)) {
2716 psize = tmpidx;
2717 break;
2718 }
2719 if (vm_page_all_valid(p)) {
2720 if (p_start == NULL) {
2721 start = addr + ptoa(tmpidx);
2722 p_start = p;
2723 }
2724 /* Jump ahead if a superpage mapping is possible. */
2725 if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
2726 (pagesizes[p->psind] - 1)) == 0) {
2727 mask = atop(pagesizes[p->psind]) - 1;
2728 if (tmpidx + mask < psize &&
2729 vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
2730 p += mask;
2731 threshold += mask;
2732 }
2733 }
2734 } else if (p_start != NULL) {
2735 pmap_enter_object(map->pmap, start, addr +
2736 ptoa(tmpidx), p_start, prot);
2737 p_start = NULL;
2738 }
2739 }
2740 if (p_start != NULL)
2741 pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2742 p_start, prot);
2743 VM_OBJECT_RUNLOCK(object);
2744 }
2745
2746 static void
2747 vm_map_protect_guard(vm_map_entry_t entry, vm_prot_t new_prot,
2748 vm_prot_t new_maxprot, int flags)
2749 {
2750 vm_prot_t old_prot;
2751
2752 MPASS((entry->eflags & MAP_ENTRY_GUARD) != 0);
2753 if ((entry->eflags & (MAP_ENTRY_STACK_GAP_UP |
2754 MAP_ENTRY_STACK_GAP_DN)) == 0)
2755 return;
2756
2757 old_prot = PROT_EXTRACT(entry->offset);
2758 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2759 entry->offset = PROT_MAX(new_maxprot) |
2760 (new_maxprot & old_prot);
2761 }
2762 if ((flags & VM_MAP_PROTECT_SET_PROT) != 0) {
2763 entry->offset = new_prot | PROT_MAX(
2764 PROT_MAX_EXTRACT(entry->offset));
2765 }
2766 }
2767
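/*
 * Illustrative example, not part of the original code: for stack gap
 * entries the protections live in entry->offset using the PROT_MAX()
 * encoding, so an offset of
 *
 *	PROT_MAX(VM_PROT_READ | VM_PROT_WRITE) | VM_PROT_READ
 *
 * yields VM_PROT_READ from PROT_EXTRACT() and VM_PROT_READ | VM_PROT_WRITE
 * from PROT_MAX_EXTRACT(); the helper above rewrites one or both halves of
 * that encoding.
 */
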
2768 /*
2769 * vm_map_protect:
2770 *
2771 * Sets the protection and/or the maximum protection of the
2772 * specified address region in the target map.
2773 */
2774 int
2775 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2776 vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
2777 {
2778 vm_map_entry_t entry, first_entry, in_tran, prev_entry;
2779 vm_object_t obj;
2780 struct ucred *cred;
2781 vm_offset_t orig_start;
2782 vm_prot_t check_prot, max_prot, old_prot;
2783 int rv;
2784
2785 if (start == end)
2786 return (KERN_SUCCESS);
2787
2788 if (CONTAINS_BITS(flags, VM_MAP_PROTECT_SET_PROT |
2789 VM_MAP_PROTECT_SET_MAXPROT) &&
2790 !CONTAINS_BITS(new_maxprot, new_prot))
2791 return (KERN_OUT_OF_BOUNDS);
2792
2793 orig_start = start;
2794 again:
2795 in_tran = NULL;
2796 start = orig_start;
2797 vm_map_lock(map);
2798
2799 if ((map->flags & MAP_WXORX) != 0 &&
2800 (flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
2801 CONTAINS_BITS(new_prot, VM_PROT_WRITE | VM_PROT_EXECUTE)) {
2802 vm_map_unlock(map);
2803 return (KERN_PROTECTION_FAILURE);
2804 }
2805
2806 /*
2807 * Ensure that we are not concurrently wiring pages. vm_map_wire() may
2808 * need to fault pages into the map and will drop the map lock while
2809 * doing so, and the VM object may end up in an inconsistent state if we
2810 * update the protection on the map entry in between faults.
2811 */
2812 vm_map_wait_busy(map);
2813
2814 VM_MAP_RANGE_CHECK(map, start, end);
2815
2816 if (!vm_map_lookup_entry(map, start, &first_entry))
2817 first_entry = vm_map_entry_succ(first_entry);
2818
2819 if ((flags & VM_MAP_PROTECT_GROWSDOWN) != 0 &&
2820 (first_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
2821 /*
2822 * Handle Linux's PROT_GROWSDOWN flag.
2823 * It means that protection is applied down to the
2824 * whole stack, including the specified range of the
2825 * mapped region, and the grow down region (AKA
2826 * guard).
2827 */
2828 while (!CONTAINS_BITS(first_entry->eflags,
2829 MAP_ENTRY_GUARD | MAP_ENTRY_STACK_GAP_DN) &&
2830 first_entry != vm_map_entry_first(map))
2831 first_entry = vm_map_entry_pred(first_entry);
2832 start = first_entry->start;
2833 }
2834
2835 /*
2836 * Make a first pass to check for protection violations.
2837 */
2838 check_prot = 0;
2839 if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2840 check_prot |= new_prot;
2841 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0)
2842 check_prot |= new_maxprot;
2843 for (entry = first_entry; entry->start < end;
2844 entry = vm_map_entry_succ(entry)) {
2845 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
2846 vm_map_unlock(map);
2847 return (KERN_INVALID_ARGUMENT);
2848 }
2849 if ((entry->eflags & (MAP_ENTRY_GUARD |
2850 MAP_ENTRY_STACK_GAP_DN | MAP_ENTRY_STACK_GAP_UP)) ==
2851 MAP_ENTRY_GUARD)
2852 continue;
2853 max_prot = (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
2854 MAP_ENTRY_STACK_GAP_UP)) != 0 ?
2855 PROT_MAX_EXTRACT(entry->offset) : entry->max_protection;
2856 if (!CONTAINS_BITS(max_prot, check_prot)) {
2857 vm_map_unlock(map);
2858 return (KERN_PROTECTION_FAILURE);
2859 }
2860 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2861 in_tran = entry;
2862 }
2863
2864 /*
2865 * Postpone the operation until all in-transition map entries have
2866 * stabilized. An in-transition entry might already have its pages
2867 * wired and wired_count incremented, but not yet have its
2868 * MAP_ENTRY_USER_WIRED flag set, in which case we would fail to call
2869 * vm_fault_copy_entry() in the final loop below.
2870 */
2871 if (in_tran != NULL) {
2872 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2873 vm_map_unlock_and_wait(map, 0);
2874 goto again;
2875 }
2876
2877 /*
2878 * Before changing the protections, try to reserve swap space for any
2879 * private (i.e., copy-on-write) mappings that are transitioning from
2880 * read-only to read/write access. If a reservation fails, break out
2881 * of this loop early and let the next loop simplify the entries, since
2882 * some may now be mergeable.
2883 */
2884 rv = vm_map_clip_start(map, first_entry, start);
2885 if (rv != KERN_SUCCESS) {
2886 vm_map_unlock(map);
2887 return (rv);
2888 }
2889 for (entry = first_entry; entry->start < end;
2890 entry = vm_map_entry_succ(entry)) {
2891 rv = vm_map_clip_end(map, entry, end);
2892 if (rv != KERN_SUCCESS) {
2893 vm_map_unlock(map);
2894 return (rv);
2895 }
2896
2897 if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
2898 ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
2899 ENTRY_CHARGED(entry) ||
2900 (entry->eflags & MAP_ENTRY_GUARD) != 0)
2901 continue;
2902
2903 cred = curthread->td_ucred;
2904 obj = entry->object.vm_object;
2905
2906 if (obj == NULL ||
2907 (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
2908 if (!swap_reserve(entry->end - entry->start)) {
2909 rv = KERN_RESOURCE_SHORTAGE;
2910 end = entry->end;
2911 break;
2912 }
2913 crhold(cred);
2914 entry->cred = cred;
2915 continue;
2916 }
2917
2918 VM_OBJECT_WLOCK(obj);
2919 if ((obj->flags & OBJ_SWAP) == 0) {
2920 VM_OBJECT_WUNLOCK(obj);
2921 continue;
2922 }
2923
2924 /*
2925 * Charge for the whole object allocation now, since
2926 * we cannot distinguish between non-charged and
2927 * charged clipped mapping of the same object later.
2928 */
2929 KASSERT(obj->charge == 0,
2930 ("vm_map_protect: object %p overcharged (entry %p)",
2931 obj, entry));
2932 if (!swap_reserve(ptoa(obj->size))) {
2933 VM_OBJECT_WUNLOCK(obj);
2934 rv = KERN_RESOURCE_SHORTAGE;
2935 end = entry->end;
2936 break;
2937 }
2938
2939 crhold(cred);
2940 obj->cred = cred;
2941 obj->charge = ptoa(obj->size);
2942 VM_OBJECT_WUNLOCK(obj);
2943 }
2944
2945 /*
2946 * If enough swap space was available, go back and fix up protections.
2947 * Otherwise, just simplify entries, since some may have been modified.
2948 * [Note that clipping is not necessary the second time.]
2949 */
2950 for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
2951 entry->start < end;
2952 vm_map_try_merge_entries(map, prev_entry, entry),
2953 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2954 if (rv != KERN_SUCCESS)
2955 continue;
2956
2957 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
2958 vm_map_protect_guard(entry, new_prot, new_maxprot,
2959 flags);
2960 continue;
2961 }
2962
2963 old_prot = entry->protection;
2964
2965 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2966 entry->max_protection = new_maxprot;
2967 entry->protection = new_maxprot & old_prot;
2968 }
2969 if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2970 entry->protection = new_prot;
2971
2972 /*
2973 * For user wired map entries, the normal lazy evaluation of
2974 * write access upgrades through soft page faults is
2975 * undesirable. Instead, immediately copy any pages that are
2976 * copy-on-write and enable write access in the physical map.
2977 */
2978 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2979 (entry->protection & VM_PROT_WRITE) != 0 &&
2980 (old_prot & VM_PROT_WRITE) == 0)
2981 vm_fault_copy_entry(map, map, entry, entry, NULL);
2982
2983 /*
2984 * When restricting access, update the physical map. Worry
2985 * about copy-on-write here.
2986 */
2987 if ((old_prot & ~entry->protection) != 0) {
2988 #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2989 VM_PROT_ALL)
2990 pmap_protect(map->pmap, entry->start,
2991 entry->end,
2992 entry->protection & MASK(entry));
2993 #undef MASK
2994 }
2995 }
2996 vm_map_try_merge_entries(map, prev_entry, entry);
2997 vm_map_unlock(map);
2998 return (rv);
2999 }
3000
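/*
 * Illustrative sketch, not part of the original code: a hypothetical
 * mprotect()-style caller that changes only the current protection and
 * leaves max_protection alone.  When VM_MAP_PROTECT_SET_MAXPROT is not
 * passed, the new_maxprot argument is ignored.
 *
 *	rv = vm_map_protect(map, addr, addr + len, VM_PROT_READ,
 *	    VM_PROT_NONE, VM_MAP_PROTECT_SET_PROT);
 */
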
3001 /*
3002 * vm_map_madvise:
3003 *
3004 * This routine traverses a process's map handling the madvise
3005 * system call. Advisories are classified as either those affecting
3006 * the vm_map_entry structure, or those affecting the underlying
3007 * objects.
3008 */
3009 int
3010 vm_map_madvise(
3011 vm_map_t map,
3012 vm_offset_t start,
3013 vm_offset_t end,
3014 int behav)
3015 {
3016 vm_map_entry_t entry, prev_entry;
3017 int rv;
3018 bool modify_map;
3019
3020 /*
3021 * Some madvise calls directly modify the vm_map_entry, in which case
3022 * we need to use an exclusive lock on the map and we need to perform
3023 * various clipping operations. Otherwise we only need a read-lock
3024 * on the map.
3025 */
3026 switch (behav) {
3027 case MADV_NORMAL:
3028 case MADV_SEQUENTIAL:
3029 case MADV_RANDOM:
3030 case MADV_NOSYNC:
3031 case MADV_AUTOSYNC:
3032 case MADV_NOCORE:
3033 case MADV_CORE:
3034 if (start == end)
3035 return (0);
3036 modify_map = true;
3037 vm_map_lock(map);
3038 break;
3039 case MADV_WILLNEED:
3040 case MADV_DONTNEED:
3041 case MADV_FREE:
3042 if (start == end)
3043 return (0);
3044 modify_map = false;
3045 vm_map_lock_read(map);
3046 break;
3047 default:
3048 return (EINVAL);
3049 }
3050
3051 /*
3052 * Locate starting entry and clip if necessary.
3053 */
3054 VM_MAP_RANGE_CHECK(map, start, end);
3055
3056 if (modify_map) {
3057 /*
3058 * madvise behaviors that are implemented in the vm_map_entry.
3059 *
3060 * We clip the vm_map_entry so that behavioral changes are
3061 * limited to the specified address range.
3062 */
3063 rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
3064 if (rv != KERN_SUCCESS) {
3065 vm_map_unlock(map);
3066 return (vm_mmap_to_errno(rv));
3067 }
3068
3069 for (; entry->start < end; prev_entry = entry,
3070 entry = vm_map_entry_succ(entry)) {
3071 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3072 continue;
3073
3074 rv = vm_map_clip_end(map, entry, end);
3075 if (rv != KERN_SUCCESS) {
3076 vm_map_unlock(map);
3077 return (vm_mmap_to_errno(rv));
3078 }
3079
3080 switch (behav) {
3081 case MADV_NORMAL:
3082 vm_map_entry_set_behavior(entry,
3083 MAP_ENTRY_BEHAV_NORMAL);
3084 break;
3085 case MADV_SEQUENTIAL:
3086 vm_map_entry_set_behavior(entry,
3087 MAP_ENTRY_BEHAV_SEQUENTIAL);
3088 break;
3089 case MADV_RANDOM:
3090 vm_map_entry_set_behavior(entry,
3091 MAP_ENTRY_BEHAV_RANDOM);
3092 break;
3093 case MADV_NOSYNC:
3094 entry->eflags |= MAP_ENTRY_NOSYNC;
3095 break;
3096 case MADV_AUTOSYNC:
3097 entry->eflags &= ~MAP_ENTRY_NOSYNC;
3098 break;
3099 case MADV_NOCORE:
3100 entry->eflags |= MAP_ENTRY_NOCOREDUMP;
3101 break;
3102 case MADV_CORE:
3103 entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
3104 break;
3105 default:
3106 break;
3107 }
3108 vm_map_try_merge_entries(map, prev_entry, entry);
3109 }
3110 vm_map_try_merge_entries(map, prev_entry, entry);
3111 vm_map_unlock(map);
3112 } else {
3113 vm_pindex_t pstart, pend;
3114
3115 /*
3116 * madvise behaviors that are implemented in the underlying
3117 * vm_object.
3118 *
3119 * Since we don't clip the vm_map_entry, we have to clip
3120 * the vm_object pindex and count.
3121 */
3122 if (!vm_map_lookup_entry(map, start, &entry))
3123 entry = vm_map_entry_succ(entry);
3124 for (; entry->start < end;
3125 entry = vm_map_entry_succ(entry)) {
3126 vm_offset_t useEnd, useStart;
3127
3128 if ((entry->eflags & (MAP_ENTRY_IS_SUB_MAP |
3129 MAP_ENTRY_GUARD)) != 0)
3130 continue;
3131
3132 /*
3133 * MADV_FREE would otherwise rewind time to
3134 * the creation of the shadow object. Because
3135 * we hold the VM map read-locked, neither the
3136 * entry's object nor the presence of a
3137 * backing object can change.
3138 */
3139 if (behav == MADV_FREE &&
3140 entry->object.vm_object != NULL &&
3141 entry->object.vm_object->backing_object != NULL)
3142 continue;
3143
3144 pstart = OFF_TO_IDX(entry->offset);
3145 pend = pstart + atop(entry->end - entry->start);
3146 useStart = entry->start;
3147 useEnd = entry->end;
3148
3149 if (entry->start < start) {
3150 pstart += atop(start - entry->start);
3151 useStart = start;
3152 }
3153 if (entry->end > end) {
3154 pend -= atop(entry->end - end);
3155 useEnd = end;
3156 }
3157
3158 if (pstart >= pend)
3159 continue;
3160
3161 /*
3162 * Perform the pmap_advise() before clearing
3163 * PGA_REFERENCED in vm_page_advise(). Otherwise, a
3164 * concurrent pmap operation, such as pmap_remove(),
3165 * could clear a reference in the pmap and set
3166 * PGA_REFERENCED on the page before the pmap_advise()
3167 * had completed. Consequently, the page would appear
3168 * referenced based upon an old reference that
3169 * occurred before this pmap_advise() ran.
3170 */
3171 if (behav == MADV_DONTNEED || behav == MADV_FREE)
3172 pmap_advise(map->pmap, useStart, useEnd,
3173 behav);
3174
3175 vm_object_madvise(entry->object.vm_object, pstart,
3176 pend, behav);
3177
3178 /*
3179 * Pre-populate paging structures in the
3180 * WILLNEED case. For wired entries, the
3181 * paging structures are already populated.
3182 */
3183 if (behav == MADV_WILLNEED &&
3184 entry->wired_count == 0) {
3185 vm_map_pmap_enter(map,
3186 useStart,
3187 entry->protection,
3188 entry->object.vm_object,
3189 pstart,
3190 ptoa(pend - pstart),
3191 MAP_PREFAULT_MADVISE
3192 );
3193 }
3194 }
3195 vm_map_unlock_read(map);
3196 }
3197 return (0);
3198 }
3199
3200 /*
3201 * vm_map_inherit:
3202 *
3203 * Sets the inheritance of the specified address
3204 * range in the target map. Inheritance
3205 * affects how the map will be shared with
3206 * child maps at the time of vmspace_fork.
3207 */
3208 int
3209 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
3210 vm_inherit_t new_inheritance)
3211 {
3212 vm_map_entry_t entry, lentry, prev_entry, start_entry;
3213 int rv;
3214
3215 switch (new_inheritance) {
3216 case VM_INHERIT_NONE:
3217 case VM_INHERIT_COPY:
3218 case VM_INHERIT_SHARE:
3219 case VM_INHERIT_ZERO:
3220 break;
3221 default:
3222 return (KERN_INVALID_ARGUMENT);
3223 }
3224 if (start == end)
3225 return (KERN_SUCCESS);
3226 vm_map_lock(map);
3227 VM_MAP_RANGE_CHECK(map, start, end);
3228 rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
3229 if (rv != KERN_SUCCESS)
3230 goto unlock;
3231 if (vm_map_lookup_entry(map, end - 1, &lentry)) {
3232 rv = vm_map_clip_end(map, lentry, end);
3233 if (rv != KERN_SUCCESS)
3234 goto unlock;
3235 }
3236 if (new_inheritance == VM_INHERIT_COPY) {
3237 for (entry = start_entry; entry->start < end;
3238 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3239 if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3240 != 0) {
3241 rv = KERN_INVALID_ARGUMENT;
3242 goto unlock;
3243 }
3244 }
3245 }
3246 for (entry = start_entry; entry->start < end; prev_entry = entry,
3247 entry = vm_map_entry_succ(entry)) {
3248 KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
3249 entry, (uintmax_t)entry->end, (uintmax_t)end));
3250 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
3251 new_inheritance != VM_INHERIT_ZERO)
3252 entry->inheritance = new_inheritance;
3253 vm_map_try_merge_entries(map, prev_entry, entry);
3254 }
3255 vm_map_try_merge_entries(map, prev_entry, entry);
3256 unlock:
3257 vm_map_unlock(map);
3258 return (rv);
3259 }
3260
3261 /*
3262 * vm_map_entry_in_transition:
3263 *
3264 * Release the map lock, and sleep until the entry is no longer in
3265 * transition. Awake and acquire the map lock. If the map changed while
3266 * another held the lock, lookup a possibly-changed entry at or after the
3267 * 'start' position of the old entry.
3268 */
3269 static vm_map_entry_t
3270 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
3271 vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
3272 {
3273 vm_map_entry_t entry;
3274 vm_offset_t start;
3275 u_int last_timestamp;
3276
3277 VM_MAP_ASSERT_LOCKED(map);
3278 KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3279 ("not in-tranition map entry %p", in_entry));
3280 /*
3281 * We have not yet clipped the entry.
3282 */
3283 start = MAX(in_start, in_entry->start);
3284 in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3285 last_timestamp = map->timestamp;
3286 if (vm_map_unlock_and_wait(map, 0)) {
3287 /*
3288 * Allow interruption of user wiring/unwiring?
3289 */
3290 }
3291 vm_map_lock(map);
3292 if (last_timestamp + 1 == map->timestamp)
3293 return (in_entry);
3294
3295 /*
3296 * Look again for the entry because the map was modified while it was
3297 * unlocked. Specifically, the entry may have been clipped, merged, or
3298 * deleted.
3299 */
3300 if (!vm_map_lookup_entry(map, start, &entry)) {
3301 if (!holes_ok) {
3302 *io_end = start;
3303 return (NULL);
3304 }
3305 entry = vm_map_entry_succ(entry);
3306 }
3307 return (entry);
3308 }
3309
3310 /*
3311 * vm_map_unwire:
3312 *
3313 * Implements both kernel and user unwiring.
3314 */
3315 int
3316 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
3317 int flags)
3318 {
3319 vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3320 int rv;
3321 bool holes_ok, need_wakeup, user_unwire;
3322
3323 if (start == end)
3324 return (KERN_SUCCESS);
3325 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3326 user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
3327 vm_map_lock(map);
3328 VM_MAP_RANGE_CHECK(map, start, end);
3329 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3330 if (holes_ok)
3331 first_entry = vm_map_entry_succ(first_entry);
3332 else {
3333 vm_map_unlock(map);
3334 return (KERN_INVALID_ADDRESS);
3335 }
3336 }
3337 rv = KERN_SUCCESS;
3338 for (entry = first_entry; entry->start < end; entry = next_entry) {
3339 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3340 /*
3341 * We have not yet clipped the entry.
3342 */
3343 next_entry = vm_map_entry_in_transition(map, start,
3344 &end, holes_ok, entry);
3345 if (next_entry == NULL) {
3346 if (entry == first_entry) {
3347 vm_map_unlock(map);
3348 return (KERN_INVALID_ADDRESS);
3349 }
3350 rv = KERN_INVALID_ADDRESS;
3351 break;
3352 }
3353 first_entry = (entry == first_entry) ?
3354 next_entry : NULL;
3355 continue;
3356 }
3357 rv = vm_map_clip_start(map, entry, start);
3358 if (rv != KERN_SUCCESS)
3359 break;
3360 rv = vm_map_clip_end(map, entry, end);
3361 if (rv != KERN_SUCCESS)
3362 break;
3363
3364 /*
3365 * Mark the entry in case the map lock is released. (See
3366 * above.)
3367 */
3368 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3369 entry->wiring_thread == NULL,
3370 ("owned map entry %p", entry));
3371 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3372 entry->wiring_thread = curthread;
3373 next_entry = vm_map_entry_succ(entry);
3374 /*
3375 * Check the map for holes in the specified region.
3376 * If holes_ok, skip this check.
3377 */
3378 if (!holes_ok &&
3379 entry->end < end && next_entry->start > entry->end) {
3380 end = entry->end;
3381 rv = KERN_INVALID_ADDRESS;
3382 break;
3383 }
3384 /*
3385 * If system unwiring, require that the entry is system wired.
3386 */
3387 if (!user_unwire &&
3388 vm_map_entry_system_wired_count(entry) == 0) {
3389 end = entry->end;
3390 rv = KERN_INVALID_ARGUMENT;
3391 break;
3392 }
3393 }
3394 need_wakeup = false;
3395 if (first_entry == NULL &&
3396 !vm_map_lookup_entry(map, start, &first_entry)) {
3397 KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
3398 prev_entry = first_entry;
3399 entry = vm_map_entry_succ(first_entry);
3400 } else {
3401 prev_entry = vm_map_entry_pred(first_entry);
3402 entry = first_entry;
3403 }
3404 for (; entry->start < end;
3405 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3406 /*
3407 * If holes_ok was specified, an empty
3408 * space in the unwired region could have been mapped
3409 * while the map lock was dropped for draining
3410 * MAP_ENTRY_IN_TRANSITION. Moreover, another thread
3411 * could be simultaneously wiring this new mapping
3412 * entry. Detect these cases and skip any entries
3413  * not marked as in transition by us.
3414 */
3415 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3416 entry->wiring_thread != curthread) {
3417 KASSERT(holes_ok,
3418 ("vm_map_unwire: !HOLESOK and new/changed entry"));
3419 continue;
3420 }
3421
3422 if (rv == KERN_SUCCESS && (!user_unwire ||
3423 (entry->eflags & MAP_ENTRY_USER_WIRED))) {
3424 if (entry->wired_count == 1)
3425 vm_map_entry_unwire(map, entry);
3426 else
3427 entry->wired_count--;
3428 if (user_unwire)
3429 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3430 }
3431 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3432 ("vm_map_unwire: in-transition flag missing %p", entry));
3433 KASSERT(entry->wiring_thread == curthread,
3434 ("vm_map_unwire: alien wire %p", entry));
3435 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3436 entry->wiring_thread = NULL;
3437 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3438 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3439 need_wakeup = true;
3440 }
3441 vm_map_try_merge_entries(map, prev_entry, entry);
3442 }
3443 vm_map_try_merge_entries(map, prev_entry, entry);
3444 vm_map_unlock(map);
3445 if (need_wakeup)
3446 vm_map_wakeup(map);
3447 return (rv);
3448 }
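
/*
 * A minimal usage sketch for vm_map_unwire() (the caller and its error
 * handling are hypothetical; the flags are the VM_MAP_WIRE_* values
 * already used in this file).  A caller undoing a user wiring while
 * tolerating unmapped gaps in the range would do:
 *
 *	rv = vm_map_unwire(map, start, end,
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK);
 *	if (rv != KERN_SUCCESS)
 *		... translate rv to an errno as the caller requires ...
 */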
3449
3450 static void
3451 vm_map_wire_user_count_sub(u_long npages)
3452 {
3453
3454 atomic_subtract_long(&vm_user_wire_count, npages);
3455 }
3456
3457 static bool
3458 vm_map_wire_user_count_add(u_long npages)
3459 {
3460 u_long wired;
3461
3462 wired = vm_user_wire_count;
3463 do {
3464 if (npages + wired > vm_page_max_user_wired)
3465 return (false);
3466 } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3467 npages + wired));
3468
3469 return (true);
3470 }
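
/*
 * Taken together, the two helpers above implement a lock-free, bounded
 * counter of user-wired pages: vm_map_wire_user_count_add() charges
 * pages against the vm_page_max_user_wired limit with an atomic
 * fcmpset loop before any pages are faulted in, and
 * vm_map_wire_user_count_sub() releases the charge when wiring fails
 * or is undone (see the callers in vm_map_wire_locked() and
 * vm_map_entry_unwire() below).
 */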
3471
3472 /*
3473 * vm_map_wire_entry_failure:
3474 *
3475 * Handle a wiring failure on the given entry.
3476 *
3477 * The map should be locked.
3478 */
3479 static void
3480 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3481 vm_offset_t failed_addr)
3482 {
3483
3484 VM_MAP_ASSERT_LOCKED(map);
3485 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3486 entry->wired_count == 1,
3487 ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3488 KASSERT(failed_addr < entry->end,
3489 ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3490
3491 /*
3492 * If any pages at the start of this entry were successfully wired,
3493 * then unwire them.
3494 */
3495 if (failed_addr > entry->start) {
3496 pmap_unwire(map->pmap, entry->start, failed_addr);
3497 vm_object_unwire(entry->object.vm_object, entry->offset,
3498 failed_addr - entry->start, PQ_ACTIVE);
3499 }
3500
3501 /*
3502 * Assign an out-of-range value to represent the failure to wire this
3503 * entry.
3504 */
3505 entry->wired_count = -1;
3506 }
3507
3508 int
3509 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3510 {
3511 int rv;
3512
3513 vm_map_lock(map);
3514 rv = vm_map_wire_locked(map, start, end, flags);
3515 vm_map_unlock(map);
3516 return (rv);
3517 }
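
/*
 * A minimal usage sketch for the wrapper above (the caller and its
 * error handling are hypothetical).  Wiring a range for writing, with
 * no holes permitted, on behalf of a user request:
 *
 *	rv = vm_map_wire(map, start, end,
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_WRITE | VM_MAP_WIRE_NOHOLES);
 *	if (rv == KERN_RESOURCE_SHORTAGE)
 *		... the vm_page_max_user_wired limit was hit ...
 */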
3518
3519 /*
3520 * vm_map_wire_locked:
3521 *
3522  *	Implements both kernel and user wiring.  Returns with the map locked;
3523  *	the map lock may be dropped and reacquired during the operation.
3524 */
3525 int
3526 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3527 {
3528 vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3529 vm_offset_t faddr, saved_end, saved_start;
3530 u_long incr, npages;
3531 u_int bidx, last_timestamp;
3532 int rv;
3533 bool holes_ok, need_wakeup, user_wire;
3534 vm_prot_t prot;
3535
3536 VM_MAP_ASSERT_LOCKED(map);
3537
3538 if (start == end)
3539 return (KERN_SUCCESS);
3540 prot = 0;
3541 if (flags & VM_MAP_WIRE_WRITE)
3542 prot |= VM_PROT_WRITE;
3543 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3544 user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3545 VM_MAP_RANGE_CHECK(map, start, end);
3546 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3547 if (holes_ok)
3548 first_entry = vm_map_entry_succ(first_entry);
3549 else
3550 return (KERN_INVALID_ADDRESS);
3551 }
3552 for (entry = first_entry; entry->start < end; entry = next_entry) {
3553 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3554 /*
3555 * We have not yet clipped the entry.
3556 */
3557 next_entry = vm_map_entry_in_transition(map, start,
3558 &end, holes_ok, entry);
3559 if (next_entry == NULL) {
3560 if (entry == first_entry)
3561 return (KERN_INVALID_ADDRESS);
3562 rv = KERN_INVALID_ADDRESS;
3563 goto done;
3564 }
3565 first_entry = (entry == first_entry) ?
3566 next_entry : NULL;
3567 continue;
3568 }
3569 rv = vm_map_clip_start(map, entry, start);
3570 if (rv != KERN_SUCCESS)
3571 goto done;
3572 rv = vm_map_clip_end(map, entry, end);
3573 if (rv != KERN_SUCCESS)
3574 goto done;
3575
3576 /*
3577 * Mark the entry in case the map lock is released. (See
3578 * above.)
3579 */
3580 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3581 entry->wiring_thread == NULL,
3582 ("owned map entry %p", entry));
3583 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3584 entry->wiring_thread = curthread;
3585 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3586 || (entry->protection & prot) != prot) {
3587 entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3588 if (!holes_ok) {
3589 end = entry->end;
3590 rv = KERN_INVALID_ADDRESS;
3591 goto done;
3592 }
3593 } else if (entry->wired_count == 0) {
3594 entry->wired_count++;
3595
3596 npages = atop(entry->end - entry->start);
3597 if (user_wire && !vm_map_wire_user_count_add(npages)) {
3598 vm_map_wire_entry_failure(map, entry,
3599 entry->start);
3600 end = entry->end;
3601 rv = KERN_RESOURCE_SHORTAGE;
3602 goto done;
3603 }
3604
3605 /*
3606 * Release the map lock, relying on the in-transition
3607 * mark. Mark the map busy for fork.
3608 */
3609 saved_start = entry->start;
3610 saved_end = entry->end;
3611 last_timestamp = map->timestamp;
3612 bidx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
3613 incr = pagesizes[bidx];
3614 vm_map_busy(map);
3615 vm_map_unlock(map);
3616
3617 for (faddr = saved_start; faddr < saved_end;
3618 faddr += incr) {
3619 /*
3620 * Simulate a fault to get the page and enter
3621 * it into the physical map.
3622 */
3623 rv = vm_fault(map, faddr, VM_PROT_NONE,
3624 VM_FAULT_WIRE, NULL);
3625 if (rv != KERN_SUCCESS)
3626 break;
3627 }
3628 vm_map_lock(map);
3629 vm_map_unbusy(map);
3630 if (last_timestamp + 1 != map->timestamp) {
3631 /*
3632 * Look again for the entry because the map was
3633 * modified while it was unlocked. The entry
3634 * may have been clipped, but NOT merged or
3635 * deleted.
3636 */
3637 if (!vm_map_lookup_entry(map, saved_start,
3638 &next_entry))
3639 KASSERT(false,
3640 ("vm_map_wire: lookup failed"));
3641 first_entry = (entry == first_entry) ?
3642 next_entry : NULL;
3643 for (entry = next_entry; entry->end < saved_end;
3644 entry = vm_map_entry_succ(entry)) {
3645 /*
3646 * In case of failure, handle entries
3647 * that were not fully wired here;
3648 * fully wired entries are handled
3649 * later.
3650 */
3651 if (rv != KERN_SUCCESS &&
3652 faddr < entry->end)
3653 vm_map_wire_entry_failure(map,
3654 entry, faddr);
3655 }
3656 }
3657 if (rv != KERN_SUCCESS) {
3658 vm_map_wire_entry_failure(map, entry, faddr);
3659 if (user_wire)
3660 vm_map_wire_user_count_sub(npages);
3661 end = entry->end;
3662 goto done;
3663 }
3664 } else if (!user_wire ||
3665 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3666 entry->wired_count++;
3667 }
3668 /*
3669 * Check the map for holes in the specified region.
3670 * If holes_ok was specified, skip this check.
3671 */
3672 next_entry = vm_map_entry_succ(entry);
3673 if (!holes_ok &&
3674 entry->end < end && next_entry->start > entry->end) {
3675 end = entry->end;
3676 rv = KERN_INVALID_ADDRESS;
3677 goto done;
3678 }
3679 }
3680 rv = KERN_SUCCESS;
3681 done:
3682 need_wakeup = false;
3683 if (first_entry == NULL &&
3684 !vm_map_lookup_entry(map, start, &first_entry)) {
3685 KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3686 prev_entry = first_entry;
3687 entry = vm_map_entry_succ(first_entry);
3688 } else {
3689 prev_entry = vm_map_entry_pred(first_entry);
3690 entry = first_entry;
3691 }
3692 for (; entry->start < end;
3693 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3694 /*
3695 * If holes_ok was specified, an empty
3696 * space in the unwired region could have been mapped
3697 * while the map lock was dropped for faulting in the
3698 * pages or draining MAP_ENTRY_IN_TRANSITION.
3699 * Moreover, another thread could be simultaneously
3700 * wiring this new mapping entry. Detect these cases
3701  * and skip any entries not marked as in transition by us.
3702 *
3703 * Another way to get an entry not marked with
3704 * MAP_ENTRY_IN_TRANSITION is after failed clipping,
3705  * which sets rv to KERN_INVALID_ARGUMENT.
3706 */
3707 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3708 entry->wiring_thread != curthread) {
3709 KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
3710 ("vm_map_wire: !HOLESOK and new/changed entry"));
3711 continue;
3712 }
3713
3714 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3715 /* do nothing */
3716 } else if (rv == KERN_SUCCESS) {
3717 if (user_wire)
3718 entry->eflags |= MAP_ENTRY_USER_WIRED;
3719 } else if (entry->wired_count == -1) {
3720 /*
3721 * Wiring failed on this entry. Thus, unwiring is
3722 * unnecessary.
3723 */
3724 entry->wired_count = 0;
3725 } else if (!user_wire ||
3726 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3727 /*
3728 * Undo the wiring. Wiring succeeded on this entry
3729 * but failed on a later entry.
3730 */
3731 if (entry->wired_count == 1) {
3732 vm_map_entry_unwire(map, entry);
3733 if (user_wire)
3734 vm_map_wire_user_count_sub(
3735 atop(entry->end - entry->start));
3736 } else
3737 entry->wired_count--;
3738 }
3739 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3740 ("vm_map_wire: in-transition flag missing %p", entry));
3741 KASSERT(entry->wiring_thread == curthread,
3742 ("vm_map_wire: alien wire %p", entry));
3743 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3744 MAP_ENTRY_WIRE_SKIPPED);
3745 entry->wiring_thread = NULL;
3746 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3747 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3748 need_wakeup = true;
3749 }
3750 vm_map_try_merge_entries(map, prev_entry, entry);
3751 }
3752 vm_map_try_merge_entries(map, prev_entry, entry);
3753 if (need_wakeup)
3754 vm_map_wakeup(map);
3755 return (rv);
3756 }
3757
3758 /*
3759  * vm_map_sync:
3760 *
3761 * Push any dirty cached pages in the address range to their pager.
3762 * If syncio is TRUE, dirty pages are written synchronously.
3763 * If invalidate is TRUE, any cached pages are freed as well.
3764 *
3765 * If the size of the region from start to end is zero, we are
3766 * supposed to flush all modified pages within the region containing
3767 * start. Unfortunately, a region can be split or coalesced with
3768 * neighboring regions, making it difficult to determine what the
3769 * original region was. Therefore, we approximate this requirement by
3770 * flushing the current region containing start.
3771 *
3772 * Returns an error if any part of the specified range is not mapped.
3773 */
3774 int
3775 vm_map_sync(
3776 vm_map_t map,
3777 vm_offset_t start,
3778 vm_offset_t end,
3779 boolean_t syncio,
3780 boolean_t invalidate)
3781 {
3782 vm_map_entry_t entry, first_entry, next_entry;
3783 vm_size_t size;
3784 vm_object_t object;
3785 vm_ooffset_t offset;
3786 unsigned int last_timestamp;
3787 int bdry_idx;
3788 boolean_t failed;
3789
3790 vm_map_lock_read(map);
3791 VM_MAP_RANGE_CHECK(map, start, end);
3792 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3793 vm_map_unlock_read(map);
3794 return (KERN_INVALID_ADDRESS);
3795 } else if (start == end) {
3796 start = first_entry->start;
3797 end = first_entry->end;
3798 }
3799
3800 /*
3801 * Make a first pass to check for user-wired memory, holes,
3802 * and partial invalidation of largepage mappings.
3803 */
3804 for (entry = first_entry; entry->start < end; entry = next_entry) {
3805 if (invalidate) {
3806 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
3807 vm_map_unlock_read(map);
3808 return (KERN_INVALID_ARGUMENT);
3809 }
3810 bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
3811 if (bdry_idx != 0 &&
3812 ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
3813 (end & (pagesizes[bdry_idx] - 1)) != 0)) {
3814 vm_map_unlock_read(map);
3815 return (KERN_INVALID_ARGUMENT);
3816 }
3817 }
3818 next_entry = vm_map_entry_succ(entry);
3819 if (end > entry->end &&
3820 entry->end != next_entry->start) {
3821 vm_map_unlock_read(map);
3822 return (KERN_INVALID_ADDRESS);
3823 }
3824 }
3825
3826 if (invalidate)
3827 pmap_remove(map->pmap, start, end);
3828 failed = FALSE;
3829
3830 /*
3831 * Make a second pass, cleaning/uncaching pages from the indicated
3832 * objects as we go.
3833 */
3834 for (entry = first_entry; entry->start < end;) {
3835 offset = entry->offset + (start - entry->start);
3836 size = (end <= entry->end ? end : entry->end) - start;
3837 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
3838 vm_map_t smap;
3839 vm_map_entry_t tentry;
3840 vm_size_t tsize;
3841
3842 smap = entry->object.sub_map;
3843 vm_map_lock_read(smap);
3844 (void) vm_map_lookup_entry(smap, offset, &tentry);
3845 tsize = tentry->end - offset;
3846 if (tsize < size)
3847 size = tsize;
3848 object = tentry->object.vm_object;
3849 offset = tentry->offset + (offset - tentry->start);
3850 vm_map_unlock_read(smap);
3851 } else {
3852 object = entry->object.vm_object;
3853 }
3854 vm_object_reference(object);
3855 last_timestamp = map->timestamp;
3856 vm_map_unlock_read(map);
3857 if (!vm_object_sync(object, offset, size, syncio, invalidate))
3858 failed = TRUE;
3859 start += size;
3860 vm_object_deallocate(object);
3861 vm_map_lock_read(map);
3862 if (last_timestamp == map->timestamp ||
3863 !vm_map_lookup_entry(map, start, &entry))
3864 entry = vm_map_entry_succ(entry);
3865 }
3866
3867 vm_map_unlock_read(map);
3868 return (failed ? KERN_FAILURE : KERN_SUCCESS);
3869 }
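
/*
 * Sketch of a typical vm_map_sync() call (the caller is hypothetical).
 * An msync(2)-style flush of a dirty range without invalidation is
 *
 *	rv = vm_map_sync(map, addr, addr + len, TRUE, FALSE);
 *
 * whereas passing invalidate == TRUE additionally frees cached pages,
 * which the first pass above rejects for user-wired entries.
 */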
3870
3871 /*
3872 * vm_map_entry_unwire: [ internal use only ]
3873 *
3874 * Make the region specified by this entry pageable.
3875 *
3876 * The map in question should be locked.
3877 * [This is the reason for this routine's existence.]
3878 */
3879 static void
3880 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3881 {
3882 vm_size_t size;
3883
3884 VM_MAP_ASSERT_LOCKED(map);
3885 KASSERT(entry->wired_count > 0,
3886 ("vm_map_entry_unwire: entry %p isn't wired", entry));
3887
3888 size = entry->end - entry->start;
3889 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
3890 vm_map_wire_user_count_sub(atop(size));
3891 pmap_unwire(map->pmap, entry->start, entry->end);
3892 vm_object_unwire(entry->object.vm_object, entry->offset, size,
3893 PQ_ACTIVE);
3894 entry->wired_count = 0;
3895 }
3896
3897 static void
3898 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3899 {
3900
3901 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3902 vm_object_deallocate(entry->object.vm_object);
3903 uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3904 }
3905
3906 /*
3907 * vm_map_entry_delete: [ internal use only ]
3908 *
3909 * Deallocate the given entry from the target map.
3910 */
3911 static void
3912 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3913 {
3914 vm_object_t object;
3915 vm_pindex_t offidxstart, offidxend, size1;
3916 vm_size_t size;
3917
3918 vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3919 object = entry->object.vm_object;
3920
3921 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3922 MPASS(entry->cred == NULL);
3923 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3924 MPASS(object == NULL);
3925 vm_map_entry_deallocate(entry, map->system_map);
3926 return;
3927 }
3928
3929 size = entry->end - entry->start;
3930 map->size -= size;
3931
3932 if (entry->cred != NULL) {
3933 swap_release_by_cred(size, entry->cred);
3934 crfree(entry->cred);
3935 }
3936
3937 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
3938 entry->object.vm_object = NULL;
3939 } else if ((object->flags & OBJ_ANON) != 0 ||
3940 object == kernel_object) {
3941 KASSERT(entry->cred == NULL || object->cred == NULL ||
3942 (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3943 ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3944 offidxstart = OFF_TO_IDX(entry->offset);
3945 offidxend = offidxstart + atop(size);
3946 VM_OBJECT_WLOCK(object);
3947 if (object->ref_count != 1 &&
3948 ((object->flags & OBJ_ONEMAPPING) != 0 ||
3949 object == kernel_object)) {
3950 vm_object_collapse(object);
3951
3952 /*
3953 * The option OBJPR_NOTMAPPED can be passed here
3954 * because vm_map_delete() already performed
3955 * pmap_remove() on the only mapping to this range
3956 * of pages.
3957 */
3958 vm_object_page_remove(object, offidxstart, offidxend,
3959 OBJPR_NOTMAPPED);
3960 if (offidxend >= object->size &&
3961 offidxstart < object->size) {
3962 size1 = object->size;
3963 object->size = offidxstart;
3964 if (object->cred != NULL) {
3965 size1 -= object->size;
3966 KASSERT(object->charge >= ptoa(size1),
3967 ("object %p charge < 0", object));
3968 swap_release_by_cred(ptoa(size1),
3969 object->cred);
3970 object->charge -= ptoa(size1);
3971 }
3972 }
3973 }
3974 VM_OBJECT_WUNLOCK(object);
3975 }
3976 if (map->system_map)
3977 vm_map_entry_deallocate(entry, TRUE);
3978 else {
3979 entry->defer_next = curthread->td_map_def_user;
3980 curthread->td_map_def_user = entry;
3981 }
3982 }
3983
3984 /*
3985 * vm_map_delete: [ internal use only ]
3986 *
3987 * Deallocates the given address range from the target
3988 * map.
3989 */
3990 int
3991 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3992 {
3993 vm_map_entry_t entry, next_entry, scratch_entry;
3994 int rv;
3995
3996 VM_MAP_ASSERT_LOCKED(map);
3997
3998 if (start == end)
3999 return (KERN_SUCCESS);
4000
4001 /*
4002 * Find the start of the region, and clip it.
4003 * Step through all entries in this region.
4004 */
4005 rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
4006 if (rv != KERN_SUCCESS)
4007 return (rv);
4008 for (; entry->start < end; entry = next_entry) {
4009 /*
4010 * Wait for wiring or unwiring of an entry to complete.
4011 * Also wait for any system wirings to disappear on
4012 * user maps.
4013 */
4014 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
4015 (vm_map_pmap(map) != kernel_pmap &&
4016 vm_map_entry_system_wired_count(entry) != 0)) {
4017 unsigned int last_timestamp;
4018 vm_offset_t saved_start;
4019
4020 saved_start = entry->start;
4021 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4022 last_timestamp = map->timestamp;
4023 (void) vm_map_unlock_and_wait(map, 0);
4024 vm_map_lock(map);
4025 if (last_timestamp + 1 != map->timestamp) {
4026 /*
4027 * Look again for the entry because the map was
4028 * modified while it was unlocked.
4029 * Specifically, the entry may have been
4030 * clipped, merged, or deleted.
4031 */
4032 rv = vm_map_lookup_clip_start(map, saved_start,
4033 &next_entry, &scratch_entry);
4034 if (rv != KERN_SUCCESS)
4035 break;
4036 } else
4037 next_entry = entry;
4038 continue;
4039 }
4040
4041 /* XXXKIB or delete to the upper superpage boundary ? */
4042 rv = vm_map_clip_end(map, entry, end);
4043 if (rv != KERN_SUCCESS)
4044 break;
4045 next_entry = vm_map_entry_succ(entry);
4046
4047 /*
4048 * Unwire before removing addresses from the pmap; otherwise,
4049 * unwiring will put the entries back in the pmap.
4050 */
4051 if (entry->wired_count != 0)
4052 vm_map_entry_unwire(map, entry);
4053
4054 /*
4055 * Remove mappings for the pages, but only if the
4056 * mappings could exist. For instance, it does not
4057 * make sense to call pmap_remove() for guard entries.
4058 */
4059 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
4060 entry->object.vm_object != NULL)
4061 pmap_map_delete(map->pmap, entry->start, entry->end);
4062
4063 if (entry->end == map->anon_loc)
4064 map->anon_loc = entry->start;
4065
4066 /*
4067 * Delete the entry only after removing all pmap
4068 * entries pointing to its pages. (Otherwise, its
4069 * page frames may be reallocated, and any modify bits
4070 * will be set in the wrong object!)
4071 */
4072 vm_map_entry_delete(map, entry);
4073 }
4074 return (rv);
4075 }
4076
4077 /*
4078 * vm_map_remove:
4079 *
4080 * Remove the given address range from the target map.
4081 * This is the exported form of vm_map_delete.
4082 */
4083 int
4084 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
4085 {
4086 int result;
4087
4088 vm_map_lock(map);
4089 VM_MAP_RANGE_CHECK(map, start, end);
4090 result = vm_map_delete(map, start, end);
4091 vm_map_unlock(map);
4092 return (result);
4093 }
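
/*
 * Sketch of a munmap(2)-like caller of vm_map_remove() (hypothetical;
 * shown only to illustrate that the wrapper performs its own locking
 * and range check):
 *
 *	(void)vm_map_remove(map, trunc_page(addr),
 *	    round_page(addr + len));
 */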
4094
4095 /*
4096 * vm_map_check_protection:
4097 *
4098 * Assert that the target map allows the specified privilege on the
4099 * entire address region given. The entire region must be allocated.
4100 *
4101 * WARNING! This code does not and should not check whether the
4102  *	contents of the region are accessible.  For example, a smaller file
4103 * might be mapped into a larger address space.
4104 *
4105 * NOTE! This code is also called by munmap().
4106 *
4107 * The map must be locked. A read lock is sufficient.
4108 */
4109 boolean_t
4110 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
4111 vm_prot_t protection)
4112 {
4113 vm_map_entry_t entry;
4114 vm_map_entry_t tmp_entry;
4115
4116 if (!vm_map_lookup_entry(map, start, &tmp_entry))
4117 return (FALSE);
4118 entry = tmp_entry;
4119
4120 while (start < end) {
4121 /*
4122 * No holes allowed!
4123 */
4124 if (start < entry->start)
4125 return (FALSE);
4126 /*
4127 * Check protection associated with entry.
4128 */
4129 if ((entry->protection & protection) != protection)
4130 return (FALSE);
4131 /* go to next entry */
4132 start = entry->end;
4133 entry = vm_map_entry_succ(entry);
4134 }
4135 return (TRUE);
4136 }
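
/*
 * Sketch of a caller of vm_map_check_protection() (hypothetical; the
 * routine itself does not acquire the lock, so the caller holds at
 * least a read lock across the call):
 *
 *	vm_map_lock_read(map);
 *	ok = vm_map_check_protection(map, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	vm_map_unlock_read(map);
 */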
4137
4138 /*
4139 *
4140 * vm_map_copy_swap_object:
4141 *
4142 * Copies a swap-backed object from an existing map entry to a
4143 * new one. Carries forward the swap charge. May change the
4144 * src object on return.
4145 */
4146 static void
4147 vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
4148 vm_offset_t size, vm_ooffset_t *fork_charge)
4149 {
4150 vm_object_t src_object;
4151 struct ucred *cred;
4152 int charged;
4153
4154 src_object = src_entry->object.vm_object;
4155 charged = ENTRY_CHARGED(src_entry);
4156 if ((src_object->flags & OBJ_ANON) != 0) {
4157 VM_OBJECT_WLOCK(src_object);
4158 vm_object_collapse(src_object);
4159 if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
4160 vm_object_split(src_entry);
4161 src_object = src_entry->object.vm_object;
4162 }
4163 vm_object_reference_locked(src_object);
4164 vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
4165 VM_OBJECT_WUNLOCK(src_object);
4166 } else
4167 vm_object_reference(src_object);
4168 if (src_entry->cred != NULL &&
4169 !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4170 KASSERT(src_object->cred == NULL,
4171 		    ("OVERCOMMIT: vm_map_copy_swap_object: cred %p",
4172 src_object));
4173 src_object->cred = src_entry->cred;
4174 src_object->charge = size;
4175 }
4176 dst_entry->object.vm_object = src_object;
4177 if (charged) {
4178 cred = curthread->td_ucred;
4179 crhold(cred);
4180 dst_entry->cred = cred;
4181 *fork_charge += size;
4182 if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4183 crhold(cred);
4184 src_entry->cred = cred;
4185 *fork_charge += size;
4186 }
4187 }
4188 }
4189
4190 /*
4191 * vm_map_copy_entry:
4192 *
4193 * Copies the contents of the source entry to the destination
4194 * entry. The entries *must* be aligned properly.
4195 */
4196 static void
4197 vm_map_copy_entry(
4198 vm_map_t src_map,
4199 vm_map_t dst_map,
4200 vm_map_entry_t src_entry,
4201 vm_map_entry_t dst_entry,
4202 vm_ooffset_t *fork_charge)
4203 {
4204 vm_object_t src_object;
4205 vm_map_entry_t fake_entry;
4206 vm_offset_t size;
4207
4208 VM_MAP_ASSERT_LOCKED(dst_map);
4209
4210 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
4211 return;
4212
4213 if (src_entry->wired_count == 0 ||
4214 (src_entry->protection & VM_PROT_WRITE) == 0) {
4215 /*
4216 * If the source entry is marked needs_copy, it is already
4217 * write-protected.
4218 */
4219 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
4220 (src_entry->protection & VM_PROT_WRITE) != 0) {
4221 pmap_protect(src_map->pmap,
4222 src_entry->start,
4223 src_entry->end,
4224 src_entry->protection & ~VM_PROT_WRITE);
4225 }
4226
4227 /*
4228 * Make a copy of the object.
4229 */
4230 size = src_entry->end - src_entry->start;
4231 if ((src_object = src_entry->object.vm_object) != NULL) {
4232 if ((src_object->flags & OBJ_SWAP) != 0) {
4233 vm_map_copy_swap_object(src_entry, dst_entry,
4234 size, fork_charge);
4235 /* May have split/collapsed, reload obj. */
4236 src_object = src_entry->object.vm_object;
4237 } else {
4238 vm_object_reference(src_object);
4239 dst_entry->object.vm_object = src_object;
4240 }
4241 src_entry->eflags |= MAP_ENTRY_COW |
4242 MAP_ENTRY_NEEDS_COPY;
4243 dst_entry->eflags |= MAP_ENTRY_COW |
4244 MAP_ENTRY_NEEDS_COPY;
4245 dst_entry->offset = src_entry->offset;
4246 if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
4247 /*
4248 * MAP_ENTRY_WRITECNT cannot
4249 * indicate write reference from
4250 * src_entry, since the entry is
4251 * marked as needs copy. Allocate a
4252 * fake entry that is used to
4253 * decrement object->un_pager writecount
4254 * at the appropriate time. Attach
4255 * fake_entry to the deferred list.
4256 */
4257 fake_entry = vm_map_entry_create(dst_map);
4258 fake_entry->eflags = MAP_ENTRY_WRITECNT;
4259 src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
4260 vm_object_reference(src_object);
4261 fake_entry->object.vm_object = src_object;
4262 fake_entry->start = src_entry->start;
4263 fake_entry->end = src_entry->end;
4264 fake_entry->defer_next =
4265 curthread->td_map_def_user;
4266 curthread->td_map_def_user = fake_entry;
4267 }
4268
4269 pmap_copy(dst_map->pmap, src_map->pmap,
4270 dst_entry->start, dst_entry->end - dst_entry->start,
4271 src_entry->start);
4272 } else {
4273 dst_entry->object.vm_object = NULL;
4274 if ((dst_entry->eflags & MAP_ENTRY_GUARD) == 0)
4275 dst_entry->offset = 0;
4276 if (src_entry->cred != NULL) {
4277 dst_entry->cred = curthread->td_ucred;
4278 crhold(dst_entry->cred);
4279 *fork_charge += size;
4280 }
4281 }
4282 } else {
4283 /*
4284 * We don't want to make writeable wired pages copy-on-write.
4285 * Immediately copy these pages into the new map by simulating
4286 * page faults. The new pages are pageable.
4287 */
4288 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
4289 fork_charge);
4290 }
4291 }
4292
4293 /*
4294 * vmspace_map_entry_forked:
4295 * Update the newly-forked vmspace each time a map entry is inherited
4296 * or copied. The values for vm_dsize and vm_tsize are approximate
4297 * (and mostly-obsolete ideas in the face of mmap(2) et al.)
4298 */
4299 static void
4300 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
4301 vm_map_entry_t entry)
4302 {
4303 vm_size_t entrysize;
4304 vm_offset_t newend;
4305
4306 if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
4307 return;
4308 entrysize = entry->end - entry->start;
4309 vm2->vm_map.size += entrysize;
4310 if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
4311 vm2->vm_ssize += btoc(entrysize);
4312 } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
4313 entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
4314 newend = MIN(entry->end,
4315 (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
4316 vm2->vm_dsize += btoc(newend - entry->start);
4317 } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
4318 entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
4319 newend = MIN(entry->end,
4320 (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
4321 vm2->vm_tsize += btoc(newend - entry->start);
4322 }
4323 }
4324
4325 /*
4326 * vmspace_fork:
4327 * Create a new process vmspace structure and vm_map
4328 * based on those of an existing process. The new map
4329 * is based on the old map, according to the inheritance
4330 * values on the regions in that map.
4331 *
4332 * XXX It might be worth coalescing the entries added to the new vmspace.
4333 *
4334 * The source map must not be locked.
4335 */
4336 struct vmspace *
4337 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
4338 {
4339 struct vmspace *vm2;
4340 vm_map_t new_map, old_map;
4341 vm_map_entry_t new_entry, old_entry;
4342 vm_object_t object;
4343 int error, locked __diagused;
4344 vm_inherit_t inh;
4345
4346 old_map = &vm1->vm_map;
4347 /* Copy immutable fields of vm1 to vm2. */
4348 vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
4349 pmap_pinit);
4350 if (vm2 == NULL)
4351 return (NULL);
4352
4353 vm2->vm_taddr = vm1->vm_taddr;
4354 vm2->vm_daddr = vm1->vm_daddr;
4355 vm2->vm_maxsaddr = vm1->vm_maxsaddr;
4356 vm2->vm_stacktop = vm1->vm_stacktop;
4357 vm2->vm_shp_base = vm1->vm_shp_base;
4358 vm_map_lock(old_map);
4359 if (old_map->busy)
4360 vm_map_wait_busy(old_map);
4361 new_map = &vm2->vm_map;
4362 locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
4363 KASSERT(locked, ("vmspace_fork: lock failed"));
4364
4365 error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
4366 if (error != 0) {
4367 sx_xunlock(&old_map->lock);
4368 sx_xunlock(&new_map->lock);
4369 vm_map_process_deferred();
4370 vmspace_free(vm2);
4371 return (NULL);
4372 }
4373
4374 new_map->anon_loc = old_map->anon_loc;
4375 new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
4376 MAP_ASLR_STACK | MAP_WXORX);
4377
4378 VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
4379 if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
4380 panic("vm_map_fork: encountered a submap");
4381
4382 inh = old_entry->inheritance;
4383 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4384 inh != VM_INHERIT_NONE)
4385 inh = VM_INHERIT_COPY;
4386
4387 switch (inh) {
4388 case VM_INHERIT_NONE:
4389 break;
4390
4391 case VM_INHERIT_SHARE:
4392 /*
4393 * Clone the entry, creating the shared object if
4394 * necessary.
4395 */
4396 object = old_entry->object.vm_object;
4397 if (object == NULL) {
4398 vm_map_entry_back(old_entry);
4399 object = old_entry->object.vm_object;
4400 }
4401
4402 /*
4403 * Add the reference before calling vm_object_shadow
4404 			 * to ensure that a shadow object is created.
4405 */
4406 vm_object_reference(object);
4407 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4408 vm_object_shadow(&old_entry->object.vm_object,
4409 &old_entry->offset,
4410 old_entry->end - old_entry->start,
4411 old_entry->cred,
4412 /* Transfer the second reference too. */
4413 true);
4414 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4415 old_entry->cred = NULL;
4416
4417 /*
4418 * As in vm_map_merged_neighbor_dispose(),
4419 * the vnode lock will not be acquired in
4420 * this call to vm_object_deallocate().
4421 */
4422 vm_object_deallocate(object);
4423 object = old_entry->object.vm_object;
4424 } else {
4425 VM_OBJECT_WLOCK(object);
4426 vm_object_clear_flag(object, OBJ_ONEMAPPING);
4427 if (old_entry->cred != NULL) {
4428 KASSERT(object->cred == NULL,
4429 ("vmspace_fork both cred"));
4430 object->cred = old_entry->cred;
4431 object->charge = old_entry->end -
4432 old_entry->start;
4433 old_entry->cred = NULL;
4434 }
4435
4436 /*
4437 * Assert the correct state of the vnode
4438 				 * v_writecount while the object is locked, so that
4439 				 * it need not be relocked later just to check the
4440 				 * assertion.
4441 */
4442 if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
4443 object->type == OBJT_VNODE) {
4444 KASSERT(((struct vnode *)object->
4445 handle)->v_writecount > 0,
4446 ("vmspace_fork: v_writecount %p",
4447 object));
4448 KASSERT(object->un_pager.vnp.
4449 writemappings > 0,
4450 ("vmspace_fork: vnp.writecount %p",
4451 object));
4452 }
4453 VM_OBJECT_WUNLOCK(object);
4454 }
4455
4456 /*
4457 * Clone the entry, referencing the shared object.
4458 */
4459 new_entry = vm_map_entry_create(new_map);
4460 *new_entry = *old_entry;
4461 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4462 MAP_ENTRY_IN_TRANSITION);
4463 new_entry->wiring_thread = NULL;
4464 new_entry->wired_count = 0;
4465 if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
4466 vm_pager_update_writecount(object,
4467 new_entry->start, new_entry->end);
4468 }
4469 vm_map_entry_set_vnode_text(new_entry, true);
4470
4471 /*
4472 * Insert the entry into the new map -- we know we're
4473 * inserting at the end of the new map.
4474 */
4475 vm_map_entry_link(new_map, new_entry);
4476 vmspace_map_entry_forked(vm1, vm2, new_entry);
4477
4478 /*
4479 * Update the physical map
4480 */
4481 pmap_copy(new_map->pmap, old_map->pmap,
4482 new_entry->start,
4483 (old_entry->end - old_entry->start),
4484 old_entry->start);
4485 break;
4486
4487 case VM_INHERIT_COPY:
4488 /*
4489 * Clone the entry and link into the map.
4490 */
4491 new_entry = vm_map_entry_create(new_map);
4492 *new_entry = *old_entry;
4493 /*
4494 * Copied entry is COW over the old object.
4495 */
4496 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4497 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
4498 new_entry->wiring_thread = NULL;
4499 new_entry->wired_count = 0;
4500 new_entry->object.vm_object = NULL;
4501 new_entry->cred = NULL;
4502 vm_map_entry_link(new_map, new_entry);
4503 vmspace_map_entry_forked(vm1, vm2, new_entry);
4504 vm_map_copy_entry(old_map, new_map, old_entry,
4505 new_entry, fork_charge);
4506 vm_map_entry_set_vnode_text(new_entry, true);
4507 break;
4508
4509 case VM_INHERIT_ZERO:
4510 /*
4511 * Create a new anonymous mapping entry modelled from
4512 * the old one.
4513 */
4514 new_entry = vm_map_entry_create(new_map);
4515 memset(new_entry, 0, sizeof(*new_entry));
4516
4517 new_entry->start = old_entry->start;
4518 new_entry->end = old_entry->end;
4519 new_entry->eflags = old_entry->eflags &
4520 ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
4521 MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
4522 MAP_ENTRY_SPLIT_BOUNDARY_MASK);
4523 new_entry->protection = old_entry->protection;
4524 new_entry->max_protection = old_entry->max_protection;
4525 new_entry->inheritance = VM_INHERIT_ZERO;
4526
4527 vm_map_entry_link(new_map, new_entry);
4528 vmspace_map_entry_forked(vm1, vm2, new_entry);
4529
4530 new_entry->cred = curthread->td_ucred;
4531 crhold(new_entry->cred);
4532 *fork_charge += (new_entry->end - new_entry->start);
4533
4534 break;
4535 }
4536 }
4537 /*
4538 * Use inlined vm_map_unlock() to postpone handling the deferred
4539 * map entries, which cannot be done until both old_map and
4540 * new_map locks are released.
4541 */
4542 sx_xunlock(&old_map->lock);
4543 sx_xunlock(&new_map->lock);
4544 vm_map_process_deferred();
4545
4546 return (vm2);
4547 }
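
/*
 * Sketch of the caller-side charge handling around vmspace_fork() (the
 * caller is hypothetical; the pattern mirrors vmspace_unshare() below):
 *
 *	fork_charge = 0;
 *	vm2 = vmspace_fork(vm1, &fork_charge);
 *	if (vm2 == NULL)
 *		return (ENOMEM);
 *	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
 *		vmspace_free(vm2);
 *		return (ENOMEM);
 *	}
 */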
4548
4549 /*
4550 * Create a process's stack for exec_new_vmspace(). This function is never
4551 * asked to wire the newly created stack.
4552 */
4553 int
4554 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4555 vm_prot_t prot, vm_prot_t max, int cow)
4556 {
4557 vm_size_t growsize, init_ssize;
4558 rlim_t vmemlim;
4559 int rv;
4560
4561 MPASS((map->flags & MAP_WIREFUTURE) == 0);
4562 growsize = sgrowsiz;
4563 init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
4564 vm_map_lock(map);
4565 vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4566 /* If we would blow our VMEM resource limit, no go */
4567 if (map->size + init_ssize > vmemlim) {
4568 rv = KERN_NO_SPACE;
4569 goto out;
4570 }
4571 rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
4572 max, cow);
4573 out:
4574 vm_map_unlock(map);
4575 return (rv);
4576 }
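
/*
 * Sketch of a vm_map_stack() call (the caller, stack_bottom, and
 * stack_size are hypothetical).  The grow direction travels in the cow
 * argument, from which vm_map_stack_locked() below extracts it:
 *
 *	rv = vm_map_stack(map, stack_bottom, stack_size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 *	    MAP_STACK_GROWS_DOWN);
 */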
4577
4578 static int stack_guard_page = 1;
4579 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
4580 &stack_guard_page, 0,
4581 "Specifies the number of guard pages for a stack that grows");
4582
4583 static int
4584 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4585 vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
4586 {
4587 vm_map_entry_t gap_entry, new_entry, prev_entry;
4588 vm_offset_t bot, gap_bot, gap_top, top;
4589 vm_size_t init_ssize, sgp;
4590 int orient, rv;
4591
4592 /*
4593 * The stack orientation is piggybacked with the cow argument.
4594 * Extract it into orient and mask the cow argument so that we
4595 * don't pass it around further.
4596 */
4597 orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
4598 KASSERT(orient != 0, ("No stack grow direction"));
4599 KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
4600 ("bi-dir stack"));
4601
4602 if (max_ssize == 0 ||
4603 !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
4604 return (KERN_INVALID_ADDRESS);
4605 sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4606 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4607 (vm_size_t)stack_guard_page * PAGE_SIZE;
4608 if (sgp >= max_ssize)
4609 return (KERN_INVALID_ARGUMENT);
4610
4611 init_ssize = growsize;
4612 if (max_ssize < init_ssize + sgp)
4613 init_ssize = max_ssize - sgp;
4614
4615 /* If addr is already mapped, no go */
4616 if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4617 return (KERN_NO_SPACE);
4618
4619 /*
4620 * If we can't accommodate max_ssize in the current mapping, no go.
4621 */
4622 if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
4623 return (KERN_NO_SPACE);
4624
4625 /*
4626 * We initially map a stack of only init_ssize. We will grow as
4627 * needed later. Depending on the orientation of the stack (i.e.
4628 * the grow direction) we either map at the top of the range, the
4629 * bottom of the range or in the middle.
4630 *
4631 * Note: we would normally expect prot and max to be VM_PROT_ALL,
4632 * and cow to be 0. Possibly we should eliminate these as input
4633 * parameters, and just pass these values here in the insert call.
4634 */
4635 if (orient == MAP_STACK_GROWS_DOWN) {
4636 bot = addrbos + max_ssize - init_ssize;
4637 top = bot + init_ssize;
4638 gap_bot = addrbos;
4639 gap_top = bot;
4640 } else /* if (orient == MAP_STACK_GROWS_UP) */ {
4641 bot = addrbos;
4642 top = bot + init_ssize;
4643 gap_bot = top;
4644 gap_top = addrbos + max_ssize;
4645 }
4646 rv = vm_map_insert1(map, NULL, 0, bot, top, prot, max, cow,
4647 &new_entry);
4648 if (rv != KERN_SUCCESS)
4649 return (rv);
4650 KASSERT(new_entry->end == top || new_entry->start == bot,
4651 ("Bad entry start/end for new stack entry"));
4652 KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
4653 (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4654 ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4655 KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
4656 (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
4657 ("new entry lacks MAP_ENTRY_GROWS_UP"));
4658 if (gap_bot == gap_top)
4659 return (KERN_SUCCESS);
4660 rv = vm_map_insert1(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4661 VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
4662 MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP), &gap_entry);
4663 if (rv == KERN_SUCCESS) {
4664 KASSERT((gap_entry->eflags & MAP_ENTRY_GUARD) != 0,
4665 ("entry %p not gap %#x", gap_entry, gap_entry->eflags));
4666 KASSERT((gap_entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4667 MAP_ENTRY_STACK_GAP_UP)) != 0,
4668 ("entry %p not stack gap %#x", gap_entry,
4669 gap_entry->eflags));
4670
4671 /*
4672 * Gap can never successfully handle a fault, so
4673 * read-ahead logic is never used for it. Re-use
4674 * next_read of the gap entry to store
4675 * stack_guard_page for vm_map_growstack().
4676 * Similarly, since a gap cannot have a backing object,
4677 * store the original stack protections in the
4678 * object offset.
4679 */
4680 gap_entry->next_read = sgp;
4681 gap_entry->offset = prot | PROT_MAX(max);
4682 } else {
4683 (void)vm_map_delete(map, bot, top);
4684 }
4685 return (rv);
4686 }
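
/*
 * To summarize the gap-entry encoding established above and consumed
 * by vm_map_growstack(): gap_entry->next_read holds the guard size
 * (sgp), and gap_entry->offset holds the stack protections packed as
 * prot | PROT_MAX(max), unpacked again with PROT_EXTRACT() and
 * PROT_MAX_EXTRACT().
 */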
4687
4688 /*
4689 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we
4690 * successfully grow the stack.
4691 */
4692 static int
4693 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4694 {
4695 vm_map_entry_t stack_entry;
4696 struct proc *p;
4697 struct vmspace *vm;
4698 struct ucred *cred;
4699 vm_offset_t gap_end, gap_start, grow_start;
4700 vm_size_t grow_amount, guard, max_grow, sgp;
4701 vm_prot_t prot, max;
4702 rlim_t lmemlim, stacklim, vmemlim;
4703 int rv, rv1 __diagused;
4704 bool gap_deleted, grow_down, is_procstack;
4705 #ifdef notyet
4706 uint64_t limit;
4707 #endif
4708 #ifdef RACCT
4709 int error __diagused;
4710 #endif
4711
4712 p = curproc;
4713 vm = p->p_vmspace;
4714
4715 /*
4716 * Disallow stack growth when the access is performed by a
4717 * debugger or AIO daemon. The reason is that the wrong
4718 * resource limits are applied.
4719 */
4720 if (p != initproc && (map != &p->p_vmspace->vm_map ||
4721 p->p_textvp == NULL))
4722 return (KERN_FAILURE);
4723
4724 MPASS(!map->system_map);
4725
4726 lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4727 stacklim = lim_cur(curthread, RLIMIT_STACK);
4728 vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4729 retry:
4730 /* If addr is not in a hole for a stack grow area, no need to grow. */
4731 if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4732 return (KERN_FAILURE);
4733 if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4734 return (KERN_SUCCESS);
4735 if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
4736 stack_entry = vm_map_entry_succ(gap_entry);
4737 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4738 stack_entry->start != gap_entry->end)
4739 return (KERN_FAILURE);
4740 grow_amount = round_page(stack_entry->start - addr);
4741 grow_down = true;
4742 } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
4743 stack_entry = vm_map_entry_pred(gap_entry);
4744 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
4745 stack_entry->end != gap_entry->start)
4746 return (KERN_FAILURE);
4747 grow_amount = round_page(addr + 1 - stack_entry->end);
4748 grow_down = false;
4749 } else {
4750 return (KERN_FAILURE);
4751 }
4752 guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4753 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4754 gap_entry->next_read;
4755 max_grow = gap_entry->end - gap_entry->start;
4756 if (guard > max_grow)
4757 return (KERN_NO_SPACE);
4758 max_grow -= guard;
4759 if (grow_amount > max_grow)
4760 return (KERN_NO_SPACE);
4761
4762 /*
4763 * If this is the main process stack, see if we're over the stack
4764 * limit.
4765 */
4766 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4767 addr < (vm_offset_t)vm->vm_stacktop;
4768 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4769 return (KERN_NO_SPACE);
4770
4771 #ifdef RACCT
4772 if (racct_enable) {
4773 PROC_LOCK(p);
4774 if (is_procstack && racct_set(p, RACCT_STACK,
4775 ctob(vm->vm_ssize) + grow_amount)) {
4776 PROC_UNLOCK(p);
4777 return (KERN_NO_SPACE);
4778 }
4779 PROC_UNLOCK(p);
4780 }
4781 #endif
4782
4783 grow_amount = roundup(grow_amount, sgrowsiz);
4784 if (grow_amount > max_grow)
4785 grow_amount = max_grow;
4786 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4787 grow_amount = trunc_page((vm_size_t)stacklim) -
4788 ctob(vm->vm_ssize);
4789 }
4790
4791 #ifdef notyet
4792 PROC_LOCK(p);
4793 limit = racct_get_available(p, RACCT_STACK);
4794 PROC_UNLOCK(p);
4795 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4796 grow_amount = limit - ctob(vm->vm_ssize);
4797 #endif
4798
4799 if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4800 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4801 rv = KERN_NO_SPACE;
4802 goto out;
4803 }
4804 #ifdef RACCT
4805 if (racct_enable) {
4806 PROC_LOCK(p);
4807 if (racct_set(p, RACCT_MEMLOCK,
4808 ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4809 PROC_UNLOCK(p);
4810 rv = KERN_NO_SPACE;
4811 goto out;
4812 }
4813 PROC_UNLOCK(p);
4814 }
4815 #endif
4816 }
4817
4818 /* If we would blow our VMEM resource limit, no go */
4819 if (map->size + grow_amount > vmemlim) {
4820 rv = KERN_NO_SPACE;
4821 goto out;
4822 }
4823 #ifdef RACCT
4824 if (racct_enable) {
4825 PROC_LOCK(p);
4826 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4827 PROC_UNLOCK(p);
4828 rv = KERN_NO_SPACE;
4829 goto out;
4830 }
4831 PROC_UNLOCK(p);
4832 }
4833 #endif
4834
4835 if (vm_map_lock_upgrade(map)) {
4836 gap_entry = NULL;
4837 vm_map_lock_read(map);
4838 goto retry;
4839 }
4840
4841 if (grow_down) {
4842 /*
4843 * The gap_entry "offset" field is overloaded. See
4844 * vm_map_stack_locked().
4845 */
4846 prot = PROT_EXTRACT(gap_entry->offset);
4847 max = PROT_MAX_EXTRACT(gap_entry->offset);
4848 sgp = gap_entry->next_read;
4849
4850 grow_start = gap_entry->end - grow_amount;
4851 if (gap_entry->start + grow_amount == gap_entry->end) {
4852 gap_start = gap_entry->start;
4853 gap_end = gap_entry->end;
4854 vm_map_entry_delete(map, gap_entry);
4855 gap_deleted = true;
4856 } else {
4857 MPASS(gap_entry->start < gap_entry->end - grow_amount);
4858 vm_map_entry_resize(map, gap_entry, -grow_amount);
4859 gap_deleted = false;
4860 }
4861 rv = vm_map_insert(map, NULL, 0, grow_start,
4862 grow_start + grow_amount, prot, max, MAP_STACK_GROWS_DOWN);
4863 if (rv != KERN_SUCCESS) {
4864 if (gap_deleted) {
4865 rv1 = vm_map_insert1(map, NULL, 0, gap_start,
4866 gap_end, VM_PROT_NONE, VM_PROT_NONE,
4867 MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN,
4868 &gap_entry);
4869 MPASS(rv1 == KERN_SUCCESS);
4870 gap_entry->next_read = sgp;
4871 gap_entry->offset = prot | PROT_MAX(max);
4872 } else
4873 vm_map_entry_resize(map, gap_entry,
4874 grow_amount);
4875 }
4876 } else {
4877 grow_start = stack_entry->end;
4878 cred = stack_entry->cred;
4879 if (cred == NULL && stack_entry->object.vm_object != NULL)
4880 cred = stack_entry->object.vm_object->cred;
4881 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
4882 rv = KERN_NO_SPACE;
4883 /* Grow the underlying object if applicable. */
4884 else if (stack_entry->object.vm_object == NULL ||
4885 vm_object_coalesce(stack_entry->object.vm_object,
4886 stack_entry->offset,
4887 (vm_size_t)(stack_entry->end - stack_entry->start),
4888 grow_amount, cred != NULL)) {
4889 if (gap_entry->start + grow_amount == gap_entry->end) {
4890 vm_map_entry_delete(map, gap_entry);
4891 vm_map_entry_resize(map, stack_entry,
4892 grow_amount);
4893 } else {
4894 gap_entry->start += grow_amount;
4895 stack_entry->end += grow_amount;
4896 }
4897 map->size += grow_amount;
4898 rv = KERN_SUCCESS;
4899 } else
4900 rv = KERN_FAILURE;
4901 }
4902 if (rv == KERN_SUCCESS && is_procstack)
4903 vm->vm_ssize += btoc(grow_amount);
4904
4905 /*
4906 * Heed the MAP_WIREFUTURE flag if it was set for this process.
4907 */
4908 if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4909 rv = vm_map_wire_locked(map, grow_start,
4910 grow_start + grow_amount,
4911 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4912 }
4913 vm_map_lock_downgrade(map);
4914
4915 out:
4916 #ifdef RACCT
4917 if (racct_enable && rv != KERN_SUCCESS) {
4918 PROC_LOCK(p);
4919 error = racct_set(p, RACCT_VMEM, map->size);
4920 KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4921 if (!old_mlock) {
4922 error = racct_set(p, RACCT_MEMLOCK,
4923 ptoa(pmap_wired_count(map->pmap)));
4924 KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4925 }
4926 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4927 KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4928 PROC_UNLOCK(p);
4929 }
4930 #endif
4931
4932 return (rv);
4933 }
4934
4935 /*
4936 * Unshare the specified VM space for exec. If other processes are
4937 * mapped to it, then create a new one. The new vmspace is null.
4938  * mapped to it, then create a new one.  The new vmspace contains no mappings.
4939 int
4940 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4941 {
4942 struct vmspace *oldvmspace = p->p_vmspace;
4943 struct vmspace *newvmspace;
4944
4945 KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4946 ("vmspace_exec recursed"));
4947 newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4948 if (newvmspace == NULL)
4949 return (ENOMEM);
4950 newvmspace->vm_swrss = oldvmspace->vm_swrss;
4951 /*
4952 * This code is written like this for prototype purposes. The
4953 * goal is to avoid running down the vmspace here, but let the
4954 	 * other processes that are still using the vmspace finally
4955 * run it down. Even though there is little or no chance of blocking
4956 * here, it is a good idea to keep this form for future mods.
4957 */
4958 PROC_VMSPACE_LOCK(p);
4959 p->p_vmspace = newvmspace;
4960 PROC_VMSPACE_UNLOCK(p);
4961 if (p == curthread->td_proc)
4962 pmap_activate(curthread);
4963 curthread->td_pflags |= TDP_EXECVMSPC;
4964 return (0);
4965 }
4966
4967 /*
4968 * Unshare the specified VM space for forcing COW. This
4969 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4970 */
4971 int
4972 vmspace_unshare(struct proc *p)
4973 {
4974 struct vmspace *oldvmspace = p->p_vmspace;
4975 struct vmspace *newvmspace;
4976 vm_ooffset_t fork_charge;
4977
4978 /*
4979 * The caller is responsible for ensuring that the reference count
4980 * cannot concurrently transition 1 -> 2.
4981 */
4982 if (refcount_load(&oldvmspace->vm_refcnt) == 1)
4983 return (0);
4984 fork_charge = 0;
4985 newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4986 if (newvmspace == NULL)
4987 return (ENOMEM);
4988 if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4989 vmspace_free(newvmspace);
4990 return (ENOMEM);
4991 }
4992 PROC_VMSPACE_LOCK(p);
4993 p->p_vmspace = newvmspace;
4994 PROC_VMSPACE_UNLOCK(p);
4995 if (p == curthread->td_proc)
4996 pmap_activate(curthread);
4997 vmspace_free(oldvmspace);
4998 return (0);
4999 }
5000
5001 /*
5002 * vm_map_lookup:
5003 *
5004 * Finds the VM object, offset, and
5005 * protection for a given virtual address in the
5006 * specified map, assuming a page fault of the
5007 * type specified.
5008 *
5009 * Leaves the map in question locked for read; return
5010 * values are guaranteed until a vm_map_lookup_done
5011 * call is performed. Note that the map argument
5012 * is in/out; the returned map must be used in
5013 * the call to vm_map_lookup_done.
5014 *
5015 * A handle (out_entry) is returned for use in
5016 * vm_map_lookup_done, to make that fast.
5017 *
5018 * If a lookup is requested with "write protection"
5019 * specified, the map may be changed to perform virtual
5020 * copying operations, although the data referenced will
5021 * remain the same.
5022 */
5023 int
5024 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
5025 vm_offset_t vaddr,
5026 vm_prot_t fault_typea,
5027 vm_map_entry_t *out_entry, /* OUT */
5028 vm_object_t *object, /* OUT */
5029 vm_pindex_t *pindex, /* OUT */
5030 vm_prot_t *out_prot, /* OUT */
5031 boolean_t *wired) /* OUT */
5032 {
5033 vm_map_entry_t entry;
5034 vm_map_t map = *var_map;
5035 vm_prot_t prot;
5036 vm_prot_t fault_type;
5037 vm_object_t eobject;
5038 vm_size_t size;
5039 struct ucred *cred;
5040
5041 RetryLookup:
5042
5043 vm_map_lock_read(map);
5044
5045 RetryLookupLocked:
5046 /*
5047 * Lookup the faulting address.
5048 */
5049 if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
5050 vm_map_unlock_read(map);
5051 return (KERN_INVALID_ADDRESS);
5052 }
5053
5054 entry = *out_entry;
5055
5056 /*
5057 * Handle submaps.
5058 */
5059 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
5060 vm_map_t old_map = map;
5061
5062 *var_map = map = entry->object.sub_map;
5063 vm_map_unlock_read(old_map);
5064 goto RetryLookup;
5065 }
5066
5067 /*
5068 * Check whether this task is allowed to have this page.
5069 */
5070 prot = entry->protection;
5071 if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
5072 fault_typea &= ~VM_PROT_FAULT_LOOKUP;
5073 if (prot == VM_PROT_NONE && map != kernel_map &&
5074 (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
5075 (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
5076 MAP_ENTRY_STACK_GAP_UP)) != 0 &&
5077 vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
5078 goto RetryLookupLocked;
5079 }
5080 fault_type = fault_typea & VM_PROT_ALL;
5081 if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
5082 vm_map_unlock_read(map);
5083 return (KERN_PROTECTION_FAILURE);
5084 }
5085 KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
5086 (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
5087 (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
5088 ("entry %p flags %x", entry, entry->eflags));
5089 if ((fault_typea & VM_PROT_COPY) != 0 &&
5090 (entry->max_protection & VM_PROT_WRITE) == 0 &&
5091 (entry->eflags & MAP_ENTRY_COW) == 0) {
5092 vm_map_unlock_read(map);
5093 return (KERN_PROTECTION_FAILURE);
5094 }
5095
5096 /*
5097 * If this page is not pageable, we have to get it for all possible
5098 * accesses.
5099 */
5100 *wired = (entry->wired_count != 0);
5101 if (*wired)
5102 fault_type = entry->protection;
5103 size = entry->end - entry->start;
5104
5105 /*
5106 	 * If the entry was copy-on-write, we either shadow it now or demote the allowed access.
5107 */
5108 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5109 /*
5110 * If we want to write the page, we may as well handle that
5111 * now since we've got the map locked.
5112 *
5113 * If we don't need to write the page, we just demote the
5114 * permissions allowed.
5115 */
5116 if ((fault_type & VM_PROT_WRITE) != 0 ||
5117 (fault_typea & VM_PROT_COPY) != 0) {
5118 /*
5119 * Make a new object, and place it in the object
5120 * chain. Note that no new references have appeared
5121 * -- one just moved from the map to the new
5122 * object.
5123 */
5124 if (vm_map_lock_upgrade(map))
5125 goto RetryLookup;
5126
5127 if (entry->cred == NULL) {
5128 /*
5129 * The debugger owner is charged for
5130 * the memory.
5131 */
5132 cred = curthread->td_ucred;
5133 crhold(cred);
5134 if (!swap_reserve_by_cred(size, cred)) {
5135 crfree(cred);
5136 vm_map_unlock(map);
5137 return (KERN_RESOURCE_SHORTAGE);
5138 }
5139 entry->cred = cred;
5140 }
5141 eobject = entry->object.vm_object;
5142 vm_object_shadow(&entry->object.vm_object,
5143 &entry->offset, size, entry->cred, false);
5144 if (eobject == entry->object.vm_object) {
5145 /*
5146 * The object was not shadowed.
5147 */
5148 swap_release_by_cred(size, entry->cred);
5149 crfree(entry->cred);
5150 }
5151 entry->cred = NULL;
5152 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
5153
5154 vm_map_lock_downgrade(map);
5155 } else {
5156 /*
5157 * We're attempting to read a copy-on-write page --
5158 * don't allow writes.
5159 */
5160 prot &= ~VM_PROT_WRITE;
5161 }
5162 }
5163
5164 /*
5165 * Create an object if necessary.
5166 */
5167 if (entry->object.vm_object == NULL && !map->system_map) {
5168 if (vm_map_lock_upgrade(map))
5169 goto RetryLookup;
5170 entry->object.vm_object = vm_object_allocate_anon(atop(size),
5171 NULL, entry->cred, size);
5172 entry->offset = 0;
5173 entry->cred = NULL;
5174 vm_map_lock_downgrade(map);
5175 }
5176
5177 /*
5178 * Return the object/offset from this entry. If the entry was
5179 * copy-on-write or empty, it has been fixed up.
5180 */
5181 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5182 *object = entry->object.vm_object;
5183
5184 *out_prot = prot;
5185 return (KERN_SUCCESS);
5186 }
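/*
 * Example (illustrative sketch): the usual calling pattern pairs
 * vm_map_lookup() with vm_map_lookup_done().  The lookup acquires the map
 * read lock and, on success, returns with it held; because the lookup may
 * descend into a submap, the map handed back through the first argument is
 * the one that must be passed to vm_map_lookup_done().  The locals and the
 * user address "uva" below are hypothetical.
 *
 *	vm_map_t map = &curproc->p_vmspace->vm_map;
 *	vm_map_entry_t entry;
 *	vm_object_t object;
 *	vm_pindex_t pindex;
 *	vm_prot_t prot;
 *	boolean_t wired;
 *	int rv;
 *
 *	rv = vm_map_lookup(&map, trunc_page(uva), VM_PROT_READ, &entry,
 *	    &object, &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	... use object and pindex while the map remains read-locked ...
 *	vm_map_lookup_done(map, entry);
 */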
5187
5188 /*
5189 * vm_map_lookup_locked:
5190 *
5191 * Lookup the faulting address. A version of vm_map_lookup that returns
5192 * KERN_FAILURE instead of blocking on map lock or memory allocation.
5193 */
5194 int
5195 vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */
5196 vm_offset_t vaddr,
5197 vm_prot_t fault_typea,
5198 vm_map_entry_t *out_entry, /* OUT */
5199 vm_object_t *object, /* OUT */
5200 vm_pindex_t *pindex, /* OUT */
5201 vm_prot_t *out_prot, /* OUT */
5202 boolean_t *wired) /* OUT */
5203 {
5204 vm_map_entry_t entry;
5205 vm_map_t map = *var_map;
5206 vm_prot_t prot;
5207 vm_prot_t fault_type = fault_typea;
5208
5209 /*
5210 * Lookup the faulting address.
5211 */
5212 if (!vm_map_lookup_entry(map, vaddr, out_entry))
5213 return (KERN_INVALID_ADDRESS);
5214
5215 entry = *out_entry;
5216
5217 /*
5218 * Fail if the entry refers to a submap.
5219 */
5220 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
5221 return (KERN_FAILURE);
5222
5223 /*
5224 * Check whether this task is allowed to have this page.
5225 */
5226 prot = entry->protection;
5227 fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
5228 if ((fault_type & prot) != fault_type)
5229 return (KERN_PROTECTION_FAILURE);
5230
5231 /*
5232 * If this page is not pageable, we have to get it for all possible
5233 * accesses.
5234 */
5235 *wired = (entry->wired_count != 0);
5236 if (*wired)
5237 fault_type = entry->protection;
5238
5239 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5240 /*
5241 * Fail if the entry was copy-on-write for a write fault.
5242 */
5243 if (fault_type & VM_PROT_WRITE)
5244 return (KERN_FAILURE);
5245 /*
5246 * We're attempting to read a copy-on-write page --
5247 * don't allow writes.
5248 */
5249 prot &= ~VM_PROT_WRITE;
5250 }
5251
5252 /*
5253 * Fail if a backing object would have to be created.
5254 */
5255 if (entry->object.vm_object == NULL && !map->system_map)
5256 return (KERN_FAILURE);
5257
5258 /*
5259 * Return the object/offset from this entry. If the entry was
5260 * copy-on-write or empty, it has been fixed up.
5261 */
5262 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5263 *object = entry->object.vm_object;
5264
5265 *out_prot = prot;
5266 return (KERN_SUCCESS);
5267 }
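/*
 * Example (illustrative sketch): unlike vm_map_lookup(), this variant
 * expects the caller to hold the map lock already and never sleeps; it
 * returns KERN_FAILURE rather than performing copy-on-write fix-up,
 * allocating a backing object, or descending into a submap.  The locals
 * below are hypothetical.
 *
 *	vm_map_lock_read(map);
 *	rv = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ, &entry,
 *	    &object, &pindex, &prot, &wired);
 *	if (rv == KERN_SUCCESS) {
 *		... non-sleeping use of object and pindex ...
 *	}
 *	vm_map_unlock_read(map);
 */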
5268
5269 /*
5270 * vm_map_lookup_done:
5271 *
5272 * Releases locks acquired by a vm_map_lookup
5273 * (according to the handle returned by that lookup).
5274 */
5275 void
5276 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
5277 {
5278 /*
5279 * Unlock the main-level map
5280 */
5281 vm_map_unlock_read(map);
5282 }
5283
5284 vm_offset_t
5285 vm_map_max_KBI(const struct vm_map *map)
5286 {
5287
5288 return (vm_map_max(map));
5289 }
5290
5291 vm_offset_t
5292 vm_map_min_KBI(const struct vm_map *map)
5293 {
5294
5295 return (vm_map_min(map));
5296 }
5297
5298 pmap_t
5299 vm_map_pmap_KBI(vm_map_t map)
5300 {
5301
5302 return (map->pmap);
5303 }
5304
5305 bool
5306 vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
5307 {
5308
5309 return (vm_map_range_valid(map, start, end));
5310 }
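/*
 * The *_KBI wrappers above expose the map bounds, the pmap, and range
 * validation as ordinary function calls, so that consumers outside this
 * file need not rely on the inline accessors (a guess at intent, based on
 * the _KBI naming).  A minimal, hypothetical consumer might check a range
 * before operating on it:
 *
 *	if (!vm_map_range_valid_KBI(map, start, end))
 *		return (EINVAL);
 */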
5311
5312 #ifdef INVARIANTS
5313 static void
5314 _vm_map_assert_consistent(vm_map_t map, int check)
5315 {
5316 vm_map_entry_t entry, prev;
5317 vm_map_entry_t cur, header, lbound, ubound;
5318 vm_size_t max_left, max_right;
5319
5320 #ifdef DIAGNOSTIC
5321 ++map->nupdates;
5322 #endif
5323 if (enable_vmmap_check != check)
5324 return;
5325
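	/*
	 * For every entry, check that entries are sorted by address and do
	 * not overlap, that the left and right children respect the search
	 * tree order on start addresses, that the entry can be found from
	 * the root by a standard search, and that the cached max_free value
	 * equals the larger of the recomputed free space on its left and
	 * right.
	 */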
5326 header = prev = &map->header;
5327 VM_MAP_ENTRY_FOREACH(entry, map) {
5328 KASSERT(prev->end <= entry->start,
5329 ("map %p prev->end = %jx, start = %jx", map,
5330 (uintmax_t)prev->end, (uintmax_t)entry->start));
5331 KASSERT(entry->start < entry->end,
5332 ("map %p start = %jx, end = %jx", map,
5333 (uintmax_t)entry->start, (uintmax_t)entry->end));
5334 KASSERT(entry->left == header ||
5335 entry->left->start < entry->start,
5336 ("map %p left->start = %jx, start = %jx", map,
5337 (uintmax_t)entry->left->start, (uintmax_t)entry->start));
5338 KASSERT(entry->right == header ||
5339 entry->start < entry->right->start,
5340 ("map %p start = %jx, right->start = %jx", map,
5341 (uintmax_t)entry->start, (uintmax_t)entry->right->start));
5342 cur = map->root;
5343 lbound = ubound = header;
5344 for (;;) {
5345 if (entry->start < cur->start) {
5346 ubound = cur;
5347 cur = cur->left;
5348 KASSERT(cur != lbound,
5349 ("map %p cannot find %jx",
5350 map, (uintmax_t)entry->start));
5351 } else if (cur->end <= entry->start) {
5352 lbound = cur;
5353 cur = cur->right;
5354 KASSERT(cur != ubound,
5355 ("map %p cannot find %jx",
5356 map, (uintmax_t)entry->start));
5357 } else {
5358 KASSERT(cur == entry,
5359 ("map %p cannot find %jx",
5360 map, (uintmax_t)entry->start));
5361 break;
5362 }
5363 }
5364 max_left = vm_map_entry_max_free_left(entry, lbound);
5365 max_right = vm_map_entry_max_free_right(entry, ubound);
5366 KASSERT(entry->max_free == vm_size_max(max_left, max_right),
5367 ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
5368 (uintmax_t)entry->max_free,
5369 (uintmax_t)max_left, (uintmax_t)max_right));
5370 prev = entry;
5371 }
5372 KASSERT(prev->end <= entry->start,
5373 ("map %p prev->end = %jx, start = %jx", map,
5374 (uintmax_t)prev->end, (uintmax_t)entry->start));
5375 }
5376 #endif
5377
5378 #include "opt_ddb.h"
5379 #ifdef DDB
5380 #include <sys/kernel.h>
5381
5382 #include <ddb/ddb.h>
5383
5384 static void
5385 vm_map_print(vm_map_t map)
5386 {
5387 vm_map_entry_t entry, prev;
5388
5389 db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
5390 (void *)map,
5391 (void *)map->pmap, map->nentries, map->timestamp);
5392
5393 db_indent += 2;
5394 prev = &map->header;
5395 VM_MAP_ENTRY_FOREACH(entry, map) {
5396 db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
5397 (void *)entry, (void *)entry->start, (void *)entry->end,
5398 entry->eflags);
5399 {
5400 static const char * const inheritance_name[4] =
5401 {"share", "copy", "none", "donate_copy"};
5402
5403 db_iprintf(" prot=%x/%x/%s",
5404 entry->protection,
5405 entry->max_protection,
5406 inheritance_name[(int)(unsigned char)
5407 entry->inheritance]);
5408 if (entry->wired_count != 0)
5409 db_printf(", wired");
5410 }
5411 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
5412 db_printf(", share=%p, offset=0x%jx\n",
5413 (void *)entry->object.sub_map,
5414 (uintmax_t)entry->offset);
5415 if (prev == &map->header ||
5416 prev->object.sub_map !=
5417 entry->object.sub_map) {
5418 db_indent += 2;
5419 vm_map_print((vm_map_t)entry->object.sub_map);
5420 db_indent -= 2;
5421 }
5422 } else {
5423 if (entry->cred != NULL)
5424 db_printf(", ruid %d", entry->cred->cr_ruid);
5425 db_printf(", object=%p, offset=0x%jx",
5426 (void *)entry->object.vm_object,
5427 (uintmax_t)entry->offset);
5428 if (entry->object.vm_object && entry->object.vm_object->cred)
5429 db_printf(", obj ruid %d charge %jx",
5430 entry->object.vm_object->cred->cr_ruid,
5431 (uintmax_t)entry->object.vm_object->charge);
5432 if (entry->eflags & MAP_ENTRY_COW)
5433 db_printf(", copy (%s)",
5434 (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
5435 db_printf("\n");
5436
5437 if (prev == &map->header ||
5438 prev->object.vm_object !=
5439 entry->object.vm_object) {
5440 db_indent += 2;
5441 vm_object_print((db_expr_t)(intptr_t)
5442 entry->object.vm_object,
5443 0, 0, (char *)0);
5444 db_indent -= 2;
5445 }
5446 }
5447 prev = entry;
5448 }
5449 db_indent -= 2;
5450 }
5451
5452 DB_SHOW_COMMAND(map, map)
5453 {
5454
5455 if (!have_addr) {
5456 db_printf("usage: show map <addr>\n");
5457 return;
5458 }
5459 vm_map_print((vm_map_t)addr);
5460 }
5461
5462 DB_SHOW_COMMAND(procvm, procvm)
5463 {
5464 struct proc *p;
5465
5466 if (have_addr) {
5467 p = db_lookup_proc(addr);
5468 } else {
5469 p = curproc;
5470 }
5471
5472 db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
5473 (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
5474 (void *)vmspace_pmap(p->p_vmspace));
5475
5476 vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
5477 }
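/*
 * Example (illustrative; the map address is a placeholder): from the ddb
 * prompt the commands above are invoked as
 *
 *	db> show procvm
 *	db> show map 0xfffff80003d5a000
 *
 * where "show map" takes the address of a struct vm_map, as its usage
 * message indicates, and "show procvm" defaults to curproc when no
 * address is given.
 */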
5478
5479 #endif /* DDB */
5480