xref: /freebsd-13.1/sys/arm64/arm64/pmap.c (revision 33a625b1)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <[email protected]>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53  */
54 /*-
55  * Copyright (c) 2003 Networks Associates Technology, Inc.
56  * All rights reserved.
57  *
58  * This software was developed for the FreeBSD Project by Jake Burkholder,
59  * Safeport Network Services, and Network Associates Laboratories, the
60  * Security Research Division of Network Associates, Inc. under
61  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62  * CHATS research program.
63  *
64  * Redistribution and use in source and binary forms, with or without
65  * modification, are permitted provided that the following conditions
66  * are met:
67  * 1. Redistributions of source code must retain the above copyright
68  *    notice, this list of conditions and the following disclaimer.
69  * 2. Redistributions in binary form must reproduce the above copyright
70  *    notice, this list of conditions and the following disclaimer in the
71  *    documentation and/or other materials provided with the distribution.
72  *
73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83  * SUCH DAMAGE.
84  */
85 
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88 
89 /*
90  *	Manages physical address maps.
91  *
92  *	Since the information managed by this module is
93  *	also stored by the logical address mapping module,
94  *	this module may throw away valid virtual-to-physical
95  *	mappings at almost any time.  However, invalidations
96  *	of virtual-to-physical mappings must be done as
97  *	requested.
98  *
99  *	In order to cope with hardware architectures which
100  *	make virtual-to-physical map invalidates expensive,
101  *	this module may delay invalidate or reduced protection
102  *	operations until such time as they are actually
103  *	necessary.  This module is given full information as
104  *	to which processors are currently using which maps,
105  *	and to when physical maps must be made correct.
106  */
107 
108 #include "opt_vm.h"
109 
110 #include <sys/param.h>
111 #include <sys/bitstring.h>
112 #include <sys/bus.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/ktr.h>
116 #include <sys/limits.h>
117 #include <sys/lock.h>
118 #include <sys/malloc.h>
119 #include <sys/mman.h>
120 #include <sys/msgbuf.h>
121 #include <sys/mutex.h>
122 #include <sys/physmem.h>
123 #include <sys/proc.h>
124 #include <sys/rwlock.h>
125 #include <sys/sbuf.h>
126 #include <sys/sx.h>
127 #include <sys/vmem.h>
128 #include <sys/vmmeter.h>
129 #include <sys/sched.h>
130 #include <sys/sysctl.h>
131 #include <sys/_unrhdr.h>
132 #include <sys/smp.h>
133 
134 #include <vm/vm.h>
135 #include <vm/vm_param.h>
136 #include <vm/vm_kern.h>
137 #include <vm/vm_page.h>
138 #include <vm/vm_map.h>
139 #include <vm/vm_object.h>
140 #include <vm/vm_extern.h>
141 #include <vm/vm_pageout.h>
142 #include <vm/vm_pager.h>
143 #include <vm/vm_phys.h>
144 #include <vm/vm_radix.h>
145 #include <vm/vm_reserv.h>
146 #include <vm/vm_dumpset.h>
147 #include <vm/uma.h>
148 
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152 
/* Assert (via MPASS) that 'pmap' has the expected pm_stage value. */
#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

/* Number of table entries that fit in one page, per translation level. */
#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

/*
 * Entry counts built up level by level: L0_ENTRIES L0 entries, each L0 entry
 * contributing NL1PG L1 entries, each of those contributing NL2PG L2 entries.
 */
#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

/*
 * PMAP_INLINE expands to an inline specifier unless DIAGNOSTIC is defined,
 * in which case it expands to nothing.
 */
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

/* PV_STAT(x) evaluates the statement 'x' only when PV_STATS is defined. */
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

/*
 * Page index helpers.  The L2 index is the address shifted by L2_SHIFT; the
 * L1 index is offset by NUL2E and the L0 index by NUL2E + NUL1E.
 */
#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
184 
185 static struct md_page *
pa_to_pvh(vm_paddr_t pa)186 pa_to_pvh(vm_paddr_t pa)
187 {
188 	struct vm_phys_seg *seg;
189 	int segind;
190 
191 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
192 		seg = &vm_phys_segs[segind];
193 		if (pa >= seg->start && pa < seg->end)
194 			return ((struct md_page *)seg->md_first +
195 			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
196 	}
197 	panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
198 }
199 
200 static struct md_page *
page_to_pvh(vm_page_t m)201 page_to_pvh(vm_page_t m)
202 {
203 	struct vm_phys_seg *seg;
204 
205 	seg = &vm_phys_segs[m->segind];
206 	return ((struct md_page *)seg->md_first +
207 	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
208 }
209 
/* Size of the pv_list_locks[] array. */
#define	NPV_LIST_LOCKS	MAXCPU

/* Select the PV list lock for 'pa': pa_index(pa) modulo the lock count. */
#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

/*
 * Make '*lockp' the write-locked PV list lock for physical address 'pa'.
 * Nothing happens if '*lockp' already is that lock.  Otherwise the lock
 * currently held through '*lockp' (if any) is write-unlocked first, and the
 * new lock is then stored in '*lockp' and write-locked.
 */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

/* As CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by the page's physical address. */
#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

/* Write-unlock the lock held through '*lockp', if any, and reset it to NULL. */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

/* Select the PV list lock for page 'm'. */
#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
242 
243 /*
244  * The presence of this flag indicates that the mapping is writeable.
245  * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
246  * it is dirty.  This flag may only be set on managed mappings.
247  *
248  * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
249  * as a software managed bit.
250  */
251 #define	ATTR_SW_DBM	ATTR_DBM
252 
253 struct pmap kernel_pmap_store;
254 
255 /* Used for mapping ACPI memory before VM is initialized */
256 #define	PMAP_PREINIT_MAPPING_COUNT	32
257 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
258 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
259 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
260 
261 /*
262  * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
263  * Always map entire L2 block for simplicity.
264  * VA of L2 block = preinit_map_va + i * L2_SIZE
265  */
266 static struct pmap_preinit_mapping {
267 	vm_paddr_t	pa;
268 	vm_offset_t	va;
269 	vm_size_t	size;
270 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
271 
272 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
273 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
274 vm_offset_t kernel_vm_end = 0;
275 
276 /*
277  * Data for the pv entry allocation mechanism.
278  */
279 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
280 static struct mtx pv_chunks_mutex;
281 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
282 static struct md_page *pv_table;
283 static struct md_page pv_dummy;
284 
285 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
286 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
287 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
288 
289 /* This code assumes all L1 DMAP entries will be used */
290 CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
291 CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
292 
293 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
294 extern pt_entry_t pagetable_dmap[];
295 
296 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
297 static vm_paddr_t physmap[PHYSMAP_SIZE];
298 static u_int physmap_idx;
299 
300 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
301     "VM/pmap parameters");
302 
303 /*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to
306  * optimize its search for a free ASID in the bit vector, and an epoch number
307  * ("asid_epoch") to indicate when it has reclaimed all previously allocated
308  * ASIDs that are not currently active on a processor.
309  *
310  * The current epoch number is always in the range [0, INT_MAX).  Negative
311  * numbers and INT_MAX are reserved for special cases that are described
312  * below.
313  */
struct asid_set {
	/* Number of bits in an ID (exported by the "bits" sysctl). */
	int asid_bits;
	/* Bit vector recording which IDs are currently allocated. */
	bitstr_t *asid_set;
	/* NOTE(review): presumably the number of bits in asid_set -- confirm. */
	int asid_set_size;
	/* Search cursor; the sysctl describes it as last allocated ID + 1. */
	int asid_next;
	/* Current epoch number. */
	int asid_epoch;
	/* NOTE(review): presumably serializes the allocator state -- confirm. */
	struct mtx asid_set_mutex;
};
322 
323 static struct asid_set asids;
324 static struct asid_set vmids;
325 
326 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
327     "ASID allocator");
328 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
329     "The number of bits in an ASID");
330 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
331     "The last allocated ASID plus one");
332 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
333     "The current epoch number");
334 
335 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
336 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
337     "The number of bits in an VMID");
338 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
339     "The last allocated VMID plus one");
340 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
341     "The current epoch number");
342 
343 void (*pmap_clean_stage2_tlbi)(void);
344 void (*pmap_invalidate_vpipt_icache)(void);
345 
346 /*
347  * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
348  * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
349  * dynamically allocated ASIDs have a non-negative epoch number.
350  *
351  * An invalid ASID is represented by -1.
352  *
353  * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
354  * which indicates that an ASID should never be allocated to the pmap, and
355  * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
356  * allocated when the pmap is next activated.
357  */
/* Pack 'asid' into the low 32 bits and 'epoch' into the high 32 bits. */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
/* Extract the low 32 bits of a cookie as a signed ASID. */
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
/* Extract the high 32 bits of a cookie as a signed epoch number. */
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

/*
 * TLBI_VA() drops the low TLBI_VA_SHIFT bits of an address and keeps the
 * next 44 bits.  TLBI_VA_L3_INCR is L3_SIZE expressed in the same units.
 * NOTE(review): presumably the address field of a TLBI operand -- confirm.
 */
#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
#define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)
367 
368 static int superpages_enabled = 1;
369 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
370     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
371     "Are large page mappings enabled?");
372 
373 /*
374  * Internal flags for pmap_enter()'s helper functions.
375  */
376 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
377 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
378 
379 static void	free_pv_chunk(struct pv_chunk *pc);
380 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
381 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
382 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
383 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
384 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
385 		    vm_offset_t va);
386 
387 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
388 static bool pmap_activate_int(pmap_t pmap);
389 static void pmap_alloc_asid(pmap_t pmap);
390 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
391     vm_prot_t prot, int mode, bool skip_unmapped);
392 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
393 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
394     vm_offset_t va, struct rwlock **lockp);
395 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
396 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
397     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
398 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
399     u_int flags, vm_page_t m, struct rwlock **lockp);
400 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
401     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
402 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
403     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
404 static void pmap_reset_asid_set(pmap_t pmap);
405 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
406     vm_page_t m, struct rwlock **lockp);
407 
408 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
409 		struct rwlock **lockp);
410 
411 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
412     struct spglist *free);
413 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
414 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
415 
/*
 * Page table entry accessors.
 *
 * The operations that modify an entry use 64-bit atomics because the System
 * MMU may write to the table at the same time as the CPU.  The swap-based
 * variants return the old entry while storing the new value.  pmap_load()
 * is a plain dereference of the entry pointer.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
/* The argument is parenthesized so that expressions like "l3 + i" work. */
#define	pmap_load(table)		(*(table))
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
428 
429 /********************/
430 /* Inline functions */
431 /********************/
432 
433 static __inline void
pagecopy(void * s,void * d)434 pagecopy(void *s, void *d)
435 {
436 
437 	memcpy(d, s, PAGE_SIZE);
438 }
439 
440 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)441 pmap_l0(pmap_t pmap, vm_offset_t va)
442 {
443 
444 	return (&pmap->pm_l0[pmap_l0_index(va)]);
445 }
446 
447 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)448 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
449 {
450 	pd_entry_t *l1;
451 
452 	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
453 	return (&l1[pmap_l1_index(va)]);
454 }
455 
456 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)457 pmap_l1(pmap_t pmap, vm_offset_t va)
458 {
459 	pd_entry_t *l0;
460 
461 	l0 = pmap_l0(pmap, va);
462 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
463 		return (NULL);
464 
465 	return (pmap_l0_to_l1(l0, va));
466 }
467 
468 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)469 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
470 {
471 	pd_entry_t l1, *l2p;
472 
473 	l1 = pmap_load(l1p);
474 
475 	KASSERT(ADDR_IS_CANONICAL(va),
476 	    ("%s: Address not in canonical form: %lx", __func__, va));
477 	/*
478 	 * The valid bit may be clear if pmap_update_entry() is concurrently
479 	 * modifying the entry, so for KVA only the entry type may be checked.
480 	 */
481 	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
482 	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
483 	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
484 	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
485 	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
486 	return (&l2p[pmap_l2_index(va)]);
487 }
488 
489 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)490 pmap_l2(pmap_t pmap, vm_offset_t va)
491 {
492 	pd_entry_t *l1;
493 
494 	l1 = pmap_l1(pmap, va);
495 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
496 		return (NULL);
497 
498 	return (pmap_l1_to_l2(l1, va));
499 }
500 
501 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)502 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
503 {
504 	pd_entry_t l2;
505 	pt_entry_t *l3p;
506 
507 	l2 = pmap_load(l2p);
508 
509 	KASSERT(ADDR_IS_CANONICAL(va),
510 	    ("%s: Address not in canonical form: %lx", __func__, va));
511 	/*
512 	 * The valid bit may be clear if pmap_update_entry() is concurrently
513 	 * modifying the entry, so for KVA only the entry type may be checked.
514 	 */
515 	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
516 	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
517 	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
518 	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
519 	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
520 	return (&l3p[pmap_l3_index(va)]);
521 }
522 
523 /*
524  * Returns the lowest valid pde for a given virtual address.
525  * The next level may or may not point to a valid page or block.
526  */
527 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)528 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
529 {
530 	pd_entry_t *l0, *l1, *l2, desc;
531 
532 	l0 = pmap_l0(pmap, va);
533 	desc = pmap_load(l0) & ATTR_DESCR_MASK;
534 	if (desc != L0_TABLE) {
535 		*level = -1;
536 		return (NULL);
537 	}
538 
539 	l1 = pmap_l0_to_l1(l0, va);
540 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
541 	if (desc != L1_TABLE) {
542 		*level = 0;
543 		return (l0);
544 	}
545 
546 	l2 = pmap_l1_to_l2(l1, va);
547 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
548 	if (desc != L2_TABLE) {
549 		*level = 1;
550 		return (l1);
551 	}
552 
553 	*level = 2;
554 	return (l2);
555 }
556 
557 /*
558  * Returns the lowest valid pte block or table entry for a given virtual
559  * address. If there are no valid entries return NULL and set the level to
560  * the first invalid level.
561  */
562 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)563 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
564 {
565 	pd_entry_t *l1, *l2, desc;
566 	pt_entry_t *l3;
567 
568 	l1 = pmap_l1(pmap, va);
569 	if (l1 == NULL) {
570 		*level = 0;
571 		return (NULL);
572 	}
573 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
574 	if (desc == L1_BLOCK) {
575 		*level = 1;
576 		return (l1);
577 	}
578 
579 	if (desc != L1_TABLE) {
580 		*level = 1;
581 		return (NULL);
582 	}
583 
584 	l2 = pmap_l1_to_l2(l1, va);
585 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
586 	if (desc == L2_BLOCK) {
587 		*level = 2;
588 		return (l2);
589 	}
590 
591 	if (desc != L2_TABLE) {
592 		*level = 2;
593 		return (NULL);
594 	}
595 
596 	*level = 3;
597 	l3 = pmap_l2_to_l3(l2, va);
598 	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
599 		return (NULL);
600 
601 	return (l3);
602 }
603 
604 bool
pmap_ps_enabled(pmap_t pmap __unused)605 pmap_ps_enabled(pmap_t pmap __unused)
606 {
607 
608 	return (superpages_enabled != 0);
609 }
610 
611 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)612 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
613     pd_entry_t **l2, pt_entry_t **l3)
614 {
615 	pd_entry_t *l0p, *l1p, *l2p;
616 
617 	if (pmap->pm_l0 == NULL)
618 		return (false);
619 
620 	l0p = pmap_l0(pmap, va);
621 	*l0 = l0p;
622 
623 	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
624 		return (false);
625 
626 	l1p = pmap_l0_to_l1(l0p, va);
627 	*l1 = l1p;
628 
629 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
630 		*l2 = NULL;
631 		*l3 = NULL;
632 		return (true);
633 	}
634 
635 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
636 		return (false);
637 
638 	l2p = pmap_l1_to_l2(l1p, va);
639 	*l2 = l2p;
640 
641 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
642 		*l3 = NULL;
643 		return (true);
644 	}
645 
646 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
647 		return (false);
648 
649 	*l3 = pmap_l2_to_l3(l2p, va);
650 
651 	return (true);
652 }
653 
654 static __inline int
pmap_l3_valid(pt_entry_t l3)655 pmap_l3_valid(pt_entry_t l3)
656 {
657 
658 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
659 }
660 
661 CTASSERT(L1_BLOCK == L2_BLOCK);
662 
663 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)664 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
665 {
666 	pt_entry_t val;
667 
668 	if (pmap->pm_stage == PM_STAGE1) {
669 		val = ATTR_S1_IDX(memattr);
670 		if (memattr == VM_MEMATTR_DEVICE)
671 			val |= ATTR_S1_XN;
672 		return (val);
673 	}
674 
675 	val = 0;
676 
677 	switch (memattr) {
678 	case VM_MEMATTR_DEVICE:
679 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
680 		    ATTR_S2_XN(ATTR_S2_XN_ALL));
681 	case VM_MEMATTR_UNCACHEABLE:
682 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
683 	case VM_MEMATTR_WRITE_BACK:
684 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
685 	case VM_MEMATTR_WRITE_THROUGH:
686 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
687 	default:
688 		panic("%s: invalid memory attribute %x", __func__, memattr);
689 	}
690 }
691 
692 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)693 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
694 {
695 	pt_entry_t val;
696 
697 	val = 0;
698 	if (pmap->pm_stage == PM_STAGE1) {
699 		if ((prot & VM_PROT_EXECUTE) == 0)
700 			val |= ATTR_S1_XN;
701 		if ((prot & VM_PROT_WRITE) == 0)
702 			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
703 	} else {
704 		if ((prot & VM_PROT_WRITE) != 0)
705 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
706 		if ((prot & VM_PROT_READ) != 0)
707 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
708 		if ((prot & VM_PROT_EXECUTE) == 0)
709 			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
710 	}
711 
712 	return (val);
713 }
714 
715 /*
716  * Checks if the PTE is dirty.
717  */
718 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)719 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
720 {
721 
722 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
723 
724 	if (pmap->pm_stage == PM_STAGE1) {
725 		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
726 		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
727 
728 		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
729 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
730 	}
731 
732 	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
733 	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
734 }
735 
736 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)737 pmap_resident_count_inc(pmap_t pmap, int count)
738 {
739 
740 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
741 	pmap->pm_stats.resident_count += count;
742 }
743 
744 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)745 pmap_resident_count_dec(pmap_t pmap, int count)
746 {
747 
748 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
749 	KASSERT(pmap->pm_stats.resident_count >= count,
750 	    ("pmap %p resident count underflow %ld %d", pmap,
751 	    pmap->pm_stats.resident_count, count));
752 	pmap->pm_stats.resident_count -= count;
753 }
754 
755 static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt,vm_offset_t va)756 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
757 {
758 	vm_paddr_t pa_page;
759 
760 	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
761 	return (pa_page | (va & PAR_LOW_MASK));
762 }
763 
764 static vm_offset_t
pmap_bootstrap_dmap(vm_offset_t kern_l1,vm_paddr_t min_pa,vm_offset_t freemempos)765 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
766     vm_offset_t freemempos)
767 {
768 	pt_entry_t *l2;
769 	vm_offset_t va;
770 	vm_paddr_t l2_pa, pa;
771 	u_int l1_slot, l2_slot, prev_l1_slot;
772 	int i;
773 
774 	dmap_phys_base = min_pa & ~L1_OFFSET;
775 	dmap_phys_max = 0;
776 	dmap_max_addr = 0;
777 	l2 = NULL;
778 	prev_l1_slot = -1;
779 
780 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
781 	memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);
782 
783 	for (i = 0; i < (physmap_idx * 2); i += 2) {
784 		pa = physmap[i] & ~L2_OFFSET;
785 		va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;
786 
787 		/* Create L2 mappings at the start of the region */
788 		if ((pa & L1_OFFSET) != 0) {
789 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
790 			if (l1_slot != prev_l1_slot) {
791 				prev_l1_slot = l1_slot;
792 				l2 = (pt_entry_t *)freemempos;
793 				l2_pa = pmap_early_vtophys(kern_l1,
794 				    (vm_offset_t)l2);
795 				freemempos += PAGE_SIZE;
796 
797 				pmap_store(&pagetable_dmap[l1_slot],
798 				    (l2_pa & ~Ln_TABLE_MASK) |
799 				    TATTR_PXN_TABLE | L1_TABLE);
800 
801 				memset(l2, 0, PAGE_SIZE);
802 			}
803 			KASSERT(l2 != NULL,
804 			    ("pmap_bootstrap_dmap: NULL l2 map"));
805 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
806 			    pa += L2_SIZE, va += L2_SIZE) {
807 				/*
808 				 * We are on a boundary, stop to
809 				 * create a level 1 block
810 				 */
811 				if ((pa & L1_OFFSET) == 0)
812 					break;
813 
814 				l2_slot = pmap_l2_index(va);
815 				KASSERT(l2_slot != 0, ("..."));
816 				pmap_store(&l2[l2_slot],
817 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT |
818 				    ATTR_S1_XN |
819 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
820 				    L2_BLOCK);
821 			}
822 			KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
823 			    ("..."));
824 		}
825 
826 		for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
827 		    (physmap[i + 1] - pa) >= L1_SIZE;
828 		    pa += L1_SIZE, va += L1_SIZE) {
829 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
830 			pmap_store(&pagetable_dmap[l1_slot],
831 			    (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN |
832 			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK);
833 		}
834 
835 		/* Create L2 mappings at the end of the region */
836 		if (pa < physmap[i + 1]) {
837 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
838 			if (l1_slot != prev_l1_slot) {
839 				prev_l1_slot = l1_slot;
840 				l2 = (pt_entry_t *)freemempos;
841 				l2_pa = pmap_early_vtophys(kern_l1,
842 				    (vm_offset_t)l2);
843 				freemempos += PAGE_SIZE;
844 
845 				pmap_store(&pagetable_dmap[l1_slot],
846 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
847 
848 				memset(l2, 0, PAGE_SIZE);
849 			}
850 			KASSERT(l2 != NULL,
851 			    ("pmap_bootstrap_dmap: NULL l2 map"));
852 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
853 			    pa += L2_SIZE, va += L2_SIZE) {
854 				l2_slot = pmap_l2_index(va);
855 				pmap_store(&l2[l2_slot],
856 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT |
857 				    ATTR_S1_XN |
858 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
859 				    L2_BLOCK);
860 			}
861 		}
862 
863 		if (pa > dmap_phys_max) {
864 			dmap_phys_max = pa;
865 			dmap_max_addr = va;
866 		}
867 	}
868 
869 	cpu_tlb_flushID();
870 
871 	return (freemempos);
872 }
873 
874 static vm_offset_t
pmap_bootstrap_l2(vm_offset_t l1pt,vm_offset_t va,vm_offset_t l2_start)875 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
876 {
877 	vm_offset_t l2pt;
878 	vm_paddr_t pa;
879 	pd_entry_t *l1;
880 	u_int l1_slot;
881 
882 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
883 
884 	l1 = (pd_entry_t *)l1pt;
885 	l1_slot = pmap_l1_index(va);
886 	l2pt = l2_start;
887 
888 	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
889 		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
890 
891 		pa = pmap_early_vtophys(l1pt, l2pt);
892 		pmap_store(&l1[l1_slot],
893 		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
894 		l2pt += PAGE_SIZE;
895 	}
896 
897 	/* Clean the L2 page table */
898 	memset((void *)l2_start, 0, l2pt - l2_start);
899 
900 	return l2pt;
901 }
902 
903 static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt,vm_offset_t va,vm_offset_t l3_start)904 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
905 {
906 	vm_offset_t l3pt;
907 	vm_paddr_t pa;
908 	pd_entry_t *l2;
909 	u_int l2_slot;
910 
911 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
912 
913 	l2 = pmap_l2(kernel_pmap, va);
914 	l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
915 	l2_slot = pmap_l2_index(va);
916 	l3pt = l3_start;
917 
918 	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
919 		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
920 
921 		pa = pmap_early_vtophys(l1pt, l3pt);
922 		pmap_store(&l2[l2_slot],
923 		    (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE);
924 		l3pt += PAGE_SIZE;
925 	}
926 
927 	/* Clean the L2 page table */
928 	memset((void *)l3_start, 0, l3pt - l3_start);
929 
930 	return l3pt;
931 }
932 
933 /*
934  *	Bootstrap the system enough to run with virtual memory.
935  */
936 void
pmap_bootstrap(vm_offset_t l0pt,vm_offset_t l1pt,vm_paddr_t kernstart,vm_size_t kernlen)937 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
938     vm_size_t kernlen)
939 {
940 	vm_offset_t freemempos;
941 	vm_offset_t dpcpu, msgbufpv;
942 	vm_paddr_t start_pa, pa, min_pa;
943 	uint64_t kern_delta;
944 	int i;
945 
946 	/* Verify that the ASID is set through TTBR0. */
947 	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
948 	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));
949 
950 	kern_delta = KERNBASE - kernstart;
951 
952 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
953 	printf("%lx\n", l1pt);
954 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
955 
956 	/* Set this early so we can use the pagetable walking functions */
957 	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
958 	PMAP_LOCK_INIT(kernel_pmap);
959 	kernel_pmap->pm_l0_paddr = l0pt - kern_delta;
960 	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
961 	kernel_pmap->pm_stage = PM_STAGE1;
962 	kernel_pmap->pm_levels = 4;
963 	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
964 	kernel_pmap->pm_asid_set = &asids;
965 
966 	/* Assume the address we were loaded to is a valid physical address */
967 	min_pa = KERNBASE - kern_delta;
968 
969 	physmap_idx = physmem_avail(physmap, nitems(physmap));
970 	physmap_idx /= 2;
971 
972 	/*
973 	 * Find the minimum physical address. physmap is sorted,
974 	 * but may contain empty ranges.
975 	 */
976 	for (i = 0; i < physmap_idx * 2; i += 2) {
977 		if (physmap[i] == physmap[i + 1])
978 			continue;
979 		if (physmap[i] <= min_pa)
980 			min_pa = physmap[i];
981 	}
982 
983 	freemempos = KERNBASE + kernlen;
984 	freemempos = roundup2(freemempos, PAGE_SIZE);
985 
986 	/* Create a direct map region early so we can use it for pa -> va */
987 	freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);
988 
989 	start_pa = pa = KERNBASE - kern_delta;
990 
991 	/*
992 	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
993 	 * loader allocated the first and only l2 page table page used to map
994 	 * the kernel, preloaded files and module metadata.
995 	 */
996 	freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos);
997 	/* And the l3 tables for the early devmap */
998 	freemempos = pmap_bootstrap_l3(l1pt,
999 	    VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);
1000 
1001 	cpu_tlb_flushID();
1002 
1003 #define alloc_pages(var, np)						\
1004 	(var) = freemempos;						\
1005 	freemempos += (np * PAGE_SIZE);					\
1006 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
1007 
1008 	/* Allocate dynamic per-cpu area. */
1009 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1010 	dpcpu_init((void *)dpcpu, 0);
1011 
1012 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1013 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1014 	msgbufp = (void *)msgbufpv;
1015 
1016 	/* Reserve some VA space for early BIOS/ACPI mapping */
1017 	preinit_map_va = roundup2(freemempos, L2_SIZE);
1018 
1019 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1020 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1021 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1022 	kernel_vm_end = virtual_avail;
1023 
1024 	pa = pmap_early_vtophys(l1pt, freemempos);
1025 
1026 	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1027 
1028 	cpu_tlb_flushID();
1029 }
1030 
1031 /*
1032  *	Initialize a vm_page's machine-dependent fields.
1033  */
1034 void
pmap_page_init(vm_page_t m)1035 pmap_page_init(vm_page_t m)
1036 {
1037 
1038 	TAILQ_INIT(&m->md.pv_list);
1039 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1040 }
1041 
1042 static void
pmap_init_asids(struct asid_set * set,int bits)1043 pmap_init_asids(struct asid_set *set, int bits)
1044 {
1045 	int i;
1046 
1047 	set->asid_bits = bits;
1048 
1049 	/*
1050 	 * We may be too early in the overall initialization process to use
1051 	 * bit_alloc().
1052 	 */
1053 	set->asid_set_size = 1 << set->asid_bits;
1054 	set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size),
1055 	    M_WAITOK | M_ZERO);
1056 	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1057 		bit_set(set->asid_set, i);
1058 	set->asid_next = ASID_FIRST_AVAILABLE;
1059 	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1060 }
1061 
1062 /*
1063  *	Initialize the pmap module.
1064  *	Called by vm_init, to initialize any structures that the pmap
1065  *	system needs to map virtual memory.
1066  */
1067 void
pmap_init(void)1068 pmap_init(void)
1069 {
1070 	struct vm_phys_seg *seg, *next_seg;
1071 	struct md_page *pvh;
1072 	vm_size_t s;
1073 	uint64_t mmfr1;
1074 	int i, pv_npg, vmid_bits;
1075 
1076 	/*
1077 	 * Are large page mappings enabled?
1078 	 */
1079 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1080 	if (superpages_enabled) {
1081 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1082 		    ("pmap_init: can't assign to pagesizes[1]"));
1083 		pagesizes[1] = L2_SIZE;
1084 		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1085 		    ("pmap_init: can't assign to pagesizes[2]"));
1086 		pagesizes[2] = L1_SIZE;
1087 	}
1088 
1089 	/*
1090 	 * Initialize the ASID allocator.
1091 	 */
1092 	pmap_init_asids(&asids,
1093 	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1094 
1095 	if (has_hyp()) {
1096 		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1097 		vmid_bits = 8;
1098 
1099 		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1100 		    ID_AA64MMFR1_VMIDBits_16)
1101 			vmid_bits = 16;
1102 		pmap_init_asids(&vmids, vmid_bits);
1103 	}
1104 
1105 	/*
1106 	 * Initialize the pv chunk list mutex.
1107 	 */
1108 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1109 
1110 	/*
1111 	 * Initialize the pool of pv list locks.
1112 	 */
1113 	for (i = 0; i < NPV_LIST_LOCKS; i++)
1114 		rw_init(&pv_list_locks[i], "pmap pv list");
1115 
1116 	/*
1117 	 * Calculate the size of the pv head table for superpages.
1118 	 */
1119 	pv_npg = 0;
1120 	for (i = 0; i < vm_phys_nsegs; i++) {
1121 		seg = &vm_phys_segs[i];
1122 		pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1123 		    pmap_l2_pindex(seg->start);
1124 	}
1125 
1126 	/*
1127 	 * Allocate memory for the pv head table for superpages.
1128 	 */
1129 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1130 	s = round_page(s);
1131 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
1132 	for (i = 0; i < pv_npg; i++)
1133 		TAILQ_INIT(&pv_table[i].pv_list);
1134 	TAILQ_INIT(&pv_dummy.pv_list);
1135 
1136 	/*
1137 	 * Set pointers from vm_phys_segs to pv_table.
1138 	 */
1139 	for (i = 0, pvh = pv_table; i < vm_phys_nsegs; i++) {
1140 		seg = &vm_phys_segs[i];
1141 		seg->md_first = pvh;
1142 		pvh += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1143 		    pmap_l2_pindex(seg->start);
1144 
1145 		/*
1146 		 * If there is a following segment, and the final
1147 		 * superpage of this segment and the initial superpage
1148 		 * of the next segment are the same then adjust the
1149 		 * pv_table entry for that next segment down by one so
1150 		 * that the pv_table entries will be shared.
1151 		 */
1152 		if (i + 1 < vm_phys_nsegs) {
1153 			next_seg = &vm_phys_segs[i + 1];
1154 			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1155 			    pmap_l2_pindex(next_seg->start)) {
1156 				pvh--;
1157 			}
1158 		}
1159 	}
1160 
1161 	vm_initialized = 1;
1162 }
1163 
/*
 * Read-only sysctl node and counters describing 2MB (L2) mappings.
 * Each counter below is exported as a child of this node.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

/* Number of 2MB page demotions. */
static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0, "2MB page demotions");

/* Number of 2MB page mappings. */
static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0, "2MB page mappings");

/* Number of failed 2MB page promotions. */
static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0, "2MB page promotion failures");

/* Number of 2MB page promotions. */
static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0, "2MB page promotions");
1182 
1183 /*
1184  * Invalidate a single TLB entry.
1185  */
1186 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va)1187 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1188 {
1189 	uint64_t r;
1190 
1191 	PMAP_ASSERT_STAGE1(pmap);
1192 
1193 	dsb(ishst);
1194 	r = TLBI_VA(va);
1195 	if (pmap == kernel_pmap) {
1196 		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1197 	} else {
1198 		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1199 		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1200 	}
1201 	dsb(ish);
1202 	isb();
1203 }
1204 
1205 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)1206 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1207 {
1208 	uint64_t end, r, start;
1209 
1210 	PMAP_ASSERT_STAGE1(pmap);
1211 
1212 	dsb(ishst);
1213 	if (pmap == kernel_pmap) {
1214 		start = TLBI_VA(sva);
1215 		end = TLBI_VA(eva);
1216 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1217 			__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1218 	} else {
1219 		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1220 		start |= TLBI_VA(sva);
1221 		end |= TLBI_VA(eva);
1222 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1223 			__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1224 	}
1225 	dsb(ish);
1226 	isb();
1227 }
1228 
1229 static __inline void
pmap_invalidate_all(pmap_t pmap)1230 pmap_invalidate_all(pmap_t pmap)
1231 {
1232 	uint64_t r;
1233 
1234 	PMAP_ASSERT_STAGE1(pmap);
1235 
1236 	dsb(ishst);
1237 	if (pmap == kernel_pmap) {
1238 		__asm __volatile("tlbi vmalle1is");
1239 	} else {
1240 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1241 		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
1242 	}
1243 	dsb(ish);
1244 	isb();
1245 }
1246 
1247 /*
1248  *	Routine:	pmap_extract
1249  *	Function:
1250  *		Extract the physical page address associated
1251  *		with the given map/virtual_address pair.
1252  */
1253 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)1254 pmap_extract(pmap_t pmap, vm_offset_t va)
1255 {
1256 	pt_entry_t *pte, tpte;
1257 	vm_paddr_t pa;
1258 	int lvl;
1259 
1260 	pa = 0;
1261 	PMAP_LOCK(pmap);
1262 	/*
1263 	 * Find the block or page map for this virtual address. pmap_pte
1264 	 * will return either a valid block/page entry, or NULL.
1265 	 */
1266 	pte = pmap_pte(pmap, va, &lvl);
1267 	if (pte != NULL) {
1268 		tpte = pmap_load(pte);
1269 		pa = tpte & ~ATTR_MASK;
1270 		switch(lvl) {
1271 		case 1:
1272 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1273 			    ("pmap_extract: Invalid L1 pte found: %lx",
1274 			    tpte & ATTR_DESCR_MASK));
1275 			pa |= (va & L1_OFFSET);
1276 			break;
1277 		case 2:
1278 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1279 			    ("pmap_extract: Invalid L2 pte found: %lx",
1280 			    tpte & ATTR_DESCR_MASK));
1281 			pa |= (va & L2_OFFSET);
1282 			break;
1283 		case 3:
1284 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1285 			    ("pmap_extract: Invalid L3 pte found: %lx",
1286 			    tpte & ATTR_DESCR_MASK));
1287 			pa |= (va & L3_OFFSET);
1288 			break;
1289 		}
1290 	}
1291 	PMAP_UNLOCK(pmap);
1292 	return (pa);
1293 }
1294 
1295 /*
1296  *	Routine:	pmap_extract_and_hold
1297  *	Function:
1298  *		Atomically extract and hold the physical page
1299  *		with the given pmap and virtual address pair
1300  *		if that mapping permits the given protection.
1301  */
1302 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)1303 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1304 {
1305 	pt_entry_t *pte, tpte;
1306 	vm_offset_t off;
1307 	vm_page_t m;
1308 	int lvl;
1309 	bool use;
1310 
1311 	m = NULL;
1312 	PMAP_LOCK(pmap);
1313 	pte = pmap_pte(pmap, va, &lvl);
1314 	if (pte != NULL) {
1315 		tpte = pmap_load(pte);
1316 
1317 		KASSERT(lvl > 0 && lvl <= 3,
1318 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1319 		CTASSERT(L1_BLOCK == L2_BLOCK);
1320 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1321 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1322 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1323 		     tpte & ATTR_DESCR_MASK));
1324 
1325 		use = false;
1326 		if ((prot & VM_PROT_WRITE) == 0)
1327 			use = true;
1328 		else if (pmap->pm_stage == PM_STAGE1 &&
1329 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1330 			use = true;
1331 		else if (pmap->pm_stage == PM_STAGE2 &&
1332 		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1333 		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1334 			use = true;
1335 
1336 		if (use) {
1337 			switch (lvl) {
1338 			case 1:
1339 				off = va & L1_OFFSET;
1340 				break;
1341 			case 2:
1342 				off = va & L2_OFFSET;
1343 				break;
1344 			case 3:
1345 			default:
1346 				off = 0;
1347 			}
1348 			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1349 			if (m != NULL && !vm_page_wire_mapped(m))
1350 				m = NULL;
1351 		}
1352 	}
1353 	PMAP_UNLOCK(pmap);
1354 	return (m);
1355 }
1356 
1357 /*
1358  * Walks the page tables to translate a kernel virtual address to a
1359  * physical address. Returns true if the kva is valid and stores the
1360  * physical address in pa if it is not NULL.
1361  */
1362 bool
pmap_klookup(vm_offset_t va,vm_paddr_t * pa)1363 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1364 {
1365 	pt_entry_t *pte, tpte;
1366 	register_t intr;
1367 	uint64_t par;
1368 
1369 	/*
1370 	 * Disable interrupts so we don't get interrupted between asking
1371 	 * for address translation, and getting the result back.
1372 	 */
1373 	intr = intr_disable();
1374 	par = arm64_address_translate_s1e1r(va);
1375 	intr_restore(intr);
1376 
1377 	if (PAR_SUCCESS(par)) {
1378 		if (pa != NULL)
1379 			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
1380 		return (true);
1381 	}
1382 
1383 	/*
1384 	 * Fall back to walking the page table. The address translation
1385 	 * instruction may fail when the page is in a break-before-make
1386 	 * sequence. As we only clear the valid bit in said sequence we
1387 	 * can walk the page table to find the physical address.
1388 	 */
1389 
1390 	pte = pmap_l1(kernel_pmap, va);
1391 	if (pte == NULL)
1392 		return (false);
1393 
1394 	/*
1395 	 * A concurrent pmap_update_entry() will clear the entry's valid bit
1396 	 * but leave the rest of the entry unchanged.  Therefore, we treat a
1397 	 * non-zero entry as being valid, and we ignore the valid bit when
1398 	 * determining whether the entry maps a block, page, or table.
1399 	 */
1400 	tpte = pmap_load(pte);
1401 	if (tpte == 0)
1402 		return (false);
1403 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1404 		if (pa != NULL)
1405 			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
1406 		return (true);
1407 	}
1408 	pte = pmap_l1_to_l2(&tpte, va);
1409 	tpte = pmap_load(pte);
1410 	if (tpte == 0)
1411 		return (false);
1412 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1413 		if (pa != NULL)
1414 			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
1415 		return (true);
1416 	}
1417 	pte = pmap_l2_to_l3(&tpte, va);
1418 	tpte = pmap_load(pte);
1419 	if (tpte == 0)
1420 		return (false);
1421 	if (pa != NULL)
1422 		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
1423 	return (true);
1424 }
1425 
1426 vm_paddr_t
pmap_kextract(vm_offset_t va)1427 pmap_kextract(vm_offset_t va)
1428 {
1429 	vm_paddr_t pa;
1430 
1431 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
1432 		return (DMAP_TO_PHYS(va));
1433 
1434 	if (pmap_klookup(va, &pa) == false)
1435 		return (0);
1436 	return (pa);
1437 }
1438 
1439 /***************************************************
1440  * Low level mapping routines.....
1441  ***************************************************/
1442 
1443 void
pmap_kenter(vm_offset_t sva,vm_size_t size,vm_paddr_t pa,int mode)1444 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1445 {
1446 	pd_entry_t *pde;
1447 	pt_entry_t *pte, attr;
1448 	vm_offset_t va;
1449 	int lvl;
1450 
1451 	KASSERT((pa & L3_OFFSET) == 0,
1452 	   ("pmap_kenter: Invalid physical address"));
1453 	KASSERT((sva & L3_OFFSET) == 0,
1454 	   ("pmap_kenter: Invalid virtual address"));
1455 	KASSERT((size & PAGE_MASK) == 0,
1456 	    ("pmap_kenter: Mapping is not page-sized"));
1457 
1458 	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1459 	    ATTR_S1_IDX(mode) | L3_PAGE;
1460 	va = sva;
1461 	while (size != 0) {
1462 		pde = pmap_pde(kernel_pmap, va, &lvl);
1463 		KASSERT(pde != NULL,
1464 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1465 		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1466 
1467 		pte = pmap_l2_to_l3(pde, va);
1468 		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
1469 
1470 		va += PAGE_SIZE;
1471 		pa += PAGE_SIZE;
1472 		size -= PAGE_SIZE;
1473 	}
1474 	pmap_invalidate_range(kernel_pmap, sva, va);
1475 }
1476 
/*
 * Map "size" bytes of physical memory starting at "pa" into the kernel
 * address space at "sva" with the device memory attribute.  This is
 * pmap_kenter() with the mode fixed to VM_MEMATTR_DEVICE.
 */
void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{

	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
}
1483 
1484 /*
1485  * Remove a page from the kernel pagetables.
1486  */
1487 PMAP_INLINE void
pmap_kremove(vm_offset_t va)1488 pmap_kremove(vm_offset_t va)
1489 {
1490 	pt_entry_t *pte;
1491 	int lvl;
1492 
1493 	pte = pmap_pte(kernel_pmap, va, &lvl);
1494 	KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
1495 	KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
1496 
1497 	pmap_clear(pte);
1498 	pmap_invalidate_page(kernel_pmap, va);
1499 }
1500 
1501 void
pmap_kremove_device(vm_offset_t sva,vm_size_t size)1502 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1503 {
1504 	pt_entry_t *pte;
1505 	vm_offset_t va;
1506 	int lvl;
1507 
1508 	KASSERT((sva & L3_OFFSET) == 0,
1509 	   ("pmap_kremove_device: Invalid virtual address"));
1510 	KASSERT((size & PAGE_MASK) == 0,
1511 	    ("pmap_kremove_device: Mapping is not page-sized"));
1512 
1513 	va = sva;
1514 	while (size != 0) {
1515 		pte = pmap_pte(kernel_pmap, va, &lvl);
1516 		KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
1517 		KASSERT(lvl == 3,
1518 		    ("Invalid device pagetable level: %d != 3", lvl));
1519 		pmap_clear(pte);
1520 
1521 		va += PAGE_SIZE;
1522 		size -= PAGE_SIZE;
1523 	}
1524 	pmap_invalidate_range(kernel_pmap, sva, va);
1525 }
1526 
1527 /*
1528  *	Used to map a range of physical addresses into kernel
1529  *	virtual address space.
1530  *
1531  *	The value passed in '*virt' is a suggested virtual address for
1532  *	the mapping. Architectures which can support a direct-mapped
1533  *	physical to virtual region can return the appropriate address
1534  *	within that region, leaving '*virt' unchanged. Other
1535  *	architectures should map the pages starting at '*virt' and
1536  *	update '*virt' with the first usable address after the mapped
1537  *	region.
1538  */
1539 vm_offset_t
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)1540 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1541 {
1542 	return PHYS_TO_DMAP(start);
1543 }
1544 
1545 /*
1546  * Add a list of wired pages to the kva
1547  * this routine is only used for temporary
1548  * kernel mappings that do not need to have
1549  * page modification or references recorded.
1550  * Note that old mappings are simply written
1551  * over.  The page *must* be wired.
1552  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1553  */
1554 void
pmap_qenter(vm_offset_t sva,vm_page_t * ma,int count)1555 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1556 {
1557 	pd_entry_t *pde;
1558 	pt_entry_t *pte, pa;
1559 	vm_offset_t va;
1560 	vm_page_t m;
1561 	int i, lvl;
1562 
1563 	va = sva;
1564 	for (i = 0; i < count; i++) {
1565 		pde = pmap_pde(kernel_pmap, va, &lvl);
1566 		KASSERT(pde != NULL,
1567 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
1568 		KASSERT(lvl == 2,
1569 		    ("pmap_qenter: Invalid level %d", lvl));
1570 
1571 		m = ma[i];
1572 		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
1573 		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1574 		    ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
1575 		pte = pmap_l2_to_l3(pde, va);
1576 		pmap_load_store(pte, pa);
1577 
1578 		va += L3_SIZE;
1579 	}
1580 	pmap_invalidate_range(kernel_pmap, sva, va);
1581 }
1582 
1583 /*
1584  * This routine tears out page mappings from the
1585  * kernel -- it is meant only for temporary mappings.
1586  */
1587 void
pmap_qremove(vm_offset_t sva,int count)1588 pmap_qremove(vm_offset_t sva, int count)
1589 {
1590 	pt_entry_t *pte;
1591 	vm_offset_t va;
1592 	int lvl;
1593 
1594 	KASSERT(ADDR_IS_CANONICAL(sva),
1595 	    ("%s: Address not in canonical form: %lx", __func__, sva));
1596 	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
1597 
1598 	va = sva;
1599 	while (count-- > 0) {
1600 		pte = pmap_pte(kernel_pmap, va, &lvl);
1601 		KASSERT(lvl == 3,
1602 		    ("Invalid device pagetable level: %d != 3", lvl));
1603 		if (pte != NULL) {
1604 			pmap_clear(pte);
1605 		}
1606 
1607 		va += PAGE_SIZE;
1608 	}
1609 	pmap_invalidate_range(kernel_pmap, sva, va);
1610 }
1611 
1612 /***************************************************
1613  * Page table page management routines.....
1614  ***************************************************/
1615 /*
1616  * Schedule the specified unused page table page to be freed.  Specifically,
1617  * add the page to the specified list of pages that will be released to the
1618  * physical memory manager after the TLB has been updated.
1619  */
1620 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,boolean_t set_PG_ZERO)1621 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1622     boolean_t set_PG_ZERO)
1623 {
1624 
1625 	if (set_PG_ZERO)
1626 		m->flags |= PG_ZERO;
1627 	else
1628 		m->flags &= ~PG_ZERO;
1629 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1630 }
1631 
1632 /*
1633  * Decrements a page table page's reference count, which is used to record the
1634  * number of valid page table entries within the page.  If the reference count
1635  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1636  * page table page was unmapped and FALSE otherwise.
1637  */
1638 static inline boolean_t
pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)1639 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1640 {
1641 
1642 	--m->ref_count;
1643 	if (m->ref_count == 0) {
1644 		_pmap_unwire_l3(pmap, va, m, free);
1645 		return (TRUE);
1646 	} else
1647 		return (FALSE);
1648 }
1649 
/*
 * Unmap the page table page "m", whose reference count has dropped to
 * zero, and queue it on "free" for release after TLB shootdown.
 *
 * The level of the page is derived from m->pindex: indices of at least
 * NUL2E + NUL1E denote an L1 page, indices of at least NUL2E denote an
 * L2 page, and smaller indices denote an L3 page.  After clearing the
 * entry that pointed at "m", one reference is dropped from the parent
 * page table page, which may recursively unmap the parent as well.
 *
 * The pmap lock must be held.
 */
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUL2E + NUL1E)) {
		/* l1 page */
		pd_entry_t *l0;

		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		/* l2 page */
		pd_entry_t *l1;

		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
	} else {
		/* l3 page */
		pd_entry_t *l2;

		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	/*
	 * Drop the reference that the page just unmapped held on its
	 * parent.  An L1 page has no parent page to unwire here.
	 */
	if (m->pindex < NUL2E) {
		/* We just released an l3, unhold the matching l2 */
		pd_entry_t *l1, tl1;
		vm_page_t l2pg;

		l1 = pmap_l1(pmap, va);
		tl1 = pmap_load(l1);
		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l2pg, free);
	} else if (m->pindex < (NUL2E + NUL1E)) {
		/* We just released an l2, unhold the matching l1 */
		pd_entry_t *l0, tl0;
		vm_page_t l1pg;

		l0 = pmap_l0(pmap, va);
		tl0 = pmap_load(l0);
		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l1pg, free);
	}
	pmap_invalidate_page(pmap, va);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}
1705 
1706 /*
1707  * After removing a page table entry, this routine is used to
1708  * conditionally free the page, and manage the reference count.
1709  */
1710 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)1711 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1712     struct spglist *free)
1713 {
1714 	vm_page_t mpte;
1715 
1716 	KASSERT(ADDR_IS_CANONICAL(va),
1717 	    ("%s: Address not in canonical form: %lx", __func__, va));
1718 	if (ADDR_IS_KERNEL(va))
1719 		return (0);
1720 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1721 	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1722 	return (pmap_unwire_l3(pmap, va, mpte, free));
1723 }
1724 
1725 /*
1726  * Release a page table page reference after a failed attempt to create a
1727  * mapping.
1728  */
1729 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t mpte)1730 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1731 {
1732 	struct spglist free;
1733 
1734 	SLIST_INIT(&free);
1735 	if (pmap_unwire_l3(pmap, va, mpte, &free)) {
1736 		/*
1737 		 * Although "va" was never mapped, the TLB could nonetheless
1738 		 * have intermediate entries that refer to the freed page
1739 		 * table pages.  Invalidate those entries.
1740 		 *
1741 		 * XXX redundant invalidation (See _pmap_unwire_l3().)
1742 		 */
1743 		pmap_invalidate_page(pmap, va);
1744 		vm_page_free_pages_toq(&free, true);
1745 	}
1746 }
1747 
1748 void
pmap_pinit0(pmap_t pmap)1749 pmap_pinit0(pmap_t pmap)
1750 {
1751 
1752 	PMAP_LOCK_INIT(pmap);
1753 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1754 	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
1755 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1756 	vm_radix_init(&pmap->pm_root);
1757 	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
1758 	pmap->pm_stage = PM_STAGE1;
1759 	pmap->pm_levels = 4;
1760 	pmap->pm_ttbr = pmap->pm_l0_paddr;
1761 	pmap->pm_asid_set = &asids;
1762 
1763 	PCPU_SET(curpmap, pmap);
1764 }
1765 
1766 int
pmap_pinit_stage(pmap_t pmap,enum pmap_stage stage,int levels)1767 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
1768 {
1769 	vm_page_t m;
1770 
1771 	/*
1772 	 * allocate the l0 page
1773 	 */
1774 	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
1775 	    VM_ALLOC_ZERO);
1776 	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
1777 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1778 
1779 	vm_radix_init(&pmap->pm_root);
1780 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1781 	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
1782 
1783 	MPASS(levels == 3 || levels == 4);
1784 	pmap->pm_levels = levels;
1785 	pmap->pm_stage = stage;
1786 	switch (stage) {
1787 	case PM_STAGE1:
1788 		pmap->pm_asid_set = &asids;
1789 		break;
1790 	case PM_STAGE2:
1791 		pmap->pm_asid_set = &vmids;
1792 		break;
1793 	default:
1794 		panic("%s: Invalid pmap type %d", __func__, stage);
1795 		break;
1796 	}
1797 
1798 	/* XXX Temporarily disable deferred ASID allocation. */
1799 	pmap_alloc_asid(pmap);
1800 
1801 	/*
1802 	 * Allocate the level 1 entry to use as the root. This will increase
1803 	 * the refcount on the level 1 page so it won't be removed until
1804 	 * pmap_release() is called.
1805 	 */
1806 	if (pmap->pm_levels == 3) {
1807 		PMAP_LOCK(pmap);
1808 		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
1809 		PMAP_UNLOCK(pmap);
1810 	}
1811 	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
1812 
1813 	return (1);
1814 }
1815 
/*
 * Initialize a pmap as a 4-level stage 1 pmap.  Returns the result of
 * pmap_pinit_stage().
 */
int
pmap_pinit(pmap_t pmap)
{

	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
}
1822 
/*
 * This routine is called if the desired page table page does not exist.
 *
 * "ptepindex" selects both the level and the position of the page to
 * allocate: indices of at least NUL2E + NUL1E name an L1 page (entered
 * into the L0 table), indices of at least NUL2E name an L2 page
 * (entered into an L1 table), and smaller indices name an L3 page
 * (entered into an L2 table).  Missing parent page table pages are
 * allocated by recursion; existing parents gain one reference.
 *
 * Returns the new page table page, or NULL if a page could not be
 * allocated, in which case the caller is expected to retry.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 *
 * The pmap lock must be held.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, l1pg, l2pg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			/* Drop the locks while waiting for free pages. */
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			vm_wait(NULL);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	m->pindex = ptepindex;

	/*
	 * Because of AArch64's weak memory consistency model, we must have a
	 * barrier here to ensure that the stores for zeroing "m", whether by
	 * pmap_zero_page() or an earlier function, are visible before adding
	 * "m" to the page table.  Otherwise, a page table walk by another
	 * processor's MMU could see the mapping to "m" and a stale, non-zero
	 * PTE within "m".
	 */
	dmb(ishst);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= (NUL2E + NUL1E)) {
		/* "m" is an L1 page: enter it into the L0 table. */
		pd_entry_t *l0p, l0e;
		vm_pindex_t l0index;

		l0index = ptepindex - (NUL2E + NUL1E);
		l0p = &pmap->pm_l0[l0index];
		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
		l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE;

		/*
		 * Mark all kernel memory as not accessible from userspace
		 * and userspace memory as not executable from the kernel.
		 * This has been done for the bootstrap L0 entries in
		 * locore.S.
		 */
		if (pmap == kernel_pmap)
			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
		else
			l0e |= TATTR_PXN_TABLE;
		pmap_store(l0p, l0e);
	} else if (ptepindex >= NUL2E) {
		/* "m" is an L2 page: enter it into its parent L1 table. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1;
		pd_entry_t tl0;

		l1index = ptepindex - NUL2E;
		l0index = l1index >> L0_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
			    lockp) == NULL) {
				/* Undo the allocation of "m" and retry. */
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			/* The L1 page exists; it gains one more entry. */
			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
			l1pg->ref_count++;
		}

		/* Reload the L0 entry: the recursion may have filled it. */
		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
		l1 = &l1[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
	} else {
		/* "m" is an L3 page: enter it into its parent L2 table. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1, *l2;
		pd_entry_t tl0, tl1;

		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
		l0index = l1index >> L0_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/*
			 * recurse for allocating page dir
			 *
			 * Requesting the L2 page also allocates the
			 * missing L1 page through the branch above.
			 */
			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
			tl0 = pmap_load(l0);
			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
			l1 = &l1[l1index & Ln_ADDR_MASK];
		} else {
			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
			l1 = &l1[l1index & Ln_ADDR_MASK];
			tl1 = pmap_load(l1);
			if (tl1 == 0) {
				/* recurse for allocating page dir */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL) {
					vm_page_unwire_noq(m);
					vm_page_free_zero(m);
					return (NULL);
				}
			} else {
				/* The L2 page exists; it gains one more entry. */
				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
				l2pg->ref_count++;
			}
		}

		/* Reload the L1 entry: the recursion may have filled it. */
		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
		pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}
1974 
1975 static pd_entry_t *
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,vm_page_t * l2pgp,struct rwlock ** lockp)1976 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
1977     struct rwlock **lockp)
1978 {
1979 	pd_entry_t *l1, *l2;
1980 	vm_page_t l2pg;
1981 	vm_pindex_t l2pindex;
1982 
1983 	KASSERT(ADDR_IS_CANONICAL(va),
1984 	    ("%s: Address not in canonical form: %lx", __func__, va));
1985 
1986 retry:
1987 	l1 = pmap_l1(pmap, va);
1988 	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
1989 		l2 = pmap_l1_to_l2(l1, va);
1990 		if (!ADDR_IS_KERNEL(va)) {
1991 			/* Add a reference to the L2 page. */
1992 			l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK);
1993 			l2pg->ref_count++;
1994 		} else
1995 			l2pg = NULL;
1996 	} else if (!ADDR_IS_KERNEL(va)) {
1997 		/* Allocate a L2 page. */
1998 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
1999 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2000 		if (l2pg == NULL) {
2001 			if (lockp != NULL)
2002 				goto retry;
2003 			else
2004 				return (NULL);
2005 		}
2006 		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2007 		l2 = &l2[pmap_l2_index(va)];
2008 	} else
2009 		panic("pmap_alloc_l2: missing page table page for va %#lx",
2010 		    va);
2011 	*l2pgp = l2pg;
2012 	return (l2);
2013 }
2014 
2015 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)2016 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2017 {
2018 	vm_pindex_t ptepindex;
2019 	pd_entry_t *pde, tpde;
2020 #ifdef INVARIANTS
2021 	pt_entry_t *pte;
2022 #endif
2023 	vm_page_t m;
2024 	int lvl;
2025 
2026 	/*
2027 	 * Calculate pagetable page index
2028 	 */
2029 	ptepindex = pmap_l2_pindex(va);
2030 retry:
2031 	/*
2032 	 * Get the page directory entry
2033 	 */
2034 	pde = pmap_pde(pmap, va, &lvl);
2035 
2036 	/*
2037 	 * If the page table page is mapped, we just increment the hold count,
2038 	 * and activate it. If we get a level 2 pde it will point to a level 3
2039 	 * table.
2040 	 */
2041 	switch (lvl) {
2042 	case -1:
2043 		break;
2044 	case 0:
2045 #ifdef INVARIANTS
2046 		pte = pmap_l0_to_l1(pde, va);
2047 		KASSERT(pmap_load(pte) == 0,
2048 		    ("pmap_alloc_l3: TODO: l0 superpages"));
2049 #endif
2050 		break;
2051 	case 1:
2052 #ifdef INVARIANTS
2053 		pte = pmap_l1_to_l2(pde, va);
2054 		KASSERT(pmap_load(pte) == 0,
2055 		    ("pmap_alloc_l3: TODO: l1 superpages"));
2056 #endif
2057 		break;
2058 	case 2:
2059 		tpde = pmap_load(pde);
2060 		if (tpde != 0) {
2061 			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
2062 			m->ref_count++;
2063 			return (m);
2064 		}
2065 		break;
2066 	default:
2067 		panic("pmap_alloc_l3: Invalid level %d", lvl);
2068 	}
2069 
2070 	/*
2071 	 * Here if the pte page isn't mapped, or if it has been deallocated.
2072 	 */
2073 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2074 	if (m == NULL && lockp != NULL)
2075 		goto retry;
2076 
2077 	return (m);
2078 }
2079 
2080 /***************************************************
2081  * Pmap allocation/deallocation routines.
2082  ***************************************************/
2083 
2084 /*
2085  * Release any resources held by the given physical map.
2086  * Called when a pmap initialized by pmap_pinit is being released.
2087  * Should only be called if the map contains no valid mappings.
2088  */
2089 void
pmap_release(pmap_t pmap)2090 pmap_release(pmap_t pmap)
2091 {
2092 	boolean_t rv;
2093 	struct spglist free;
2094 	struct asid_set *set;
2095 	vm_page_t m;
2096 	int asid;
2097 
2098 	if (pmap->pm_levels != 4) {
2099 		PMAP_ASSERT_STAGE2(pmap);
2100 		KASSERT(pmap->pm_stats.resident_count == 1,
2101 		    ("pmap_release: pmap resident count %ld != 0",
2102 		    pmap->pm_stats.resident_count));
2103 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2104 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2105 
2106 		SLIST_INIT(&free);
2107 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2108 		PMAP_LOCK(pmap);
2109 		rv = pmap_unwire_l3(pmap, 0, m, &free);
2110 		PMAP_UNLOCK(pmap);
2111 		MPASS(rv == TRUE);
2112 		vm_page_free_pages_toq(&free, true);
2113 	}
2114 
2115 	KASSERT(pmap->pm_stats.resident_count == 0,
2116 	    ("pmap_release: pmap resident count %ld != 0",
2117 	    pmap->pm_stats.resident_count));
2118 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2119 	    ("pmap_release: pmap has reserved page table page(s)"));
2120 
2121 	set = pmap->pm_asid_set;
2122 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2123 
2124 	/*
2125 	 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
2126 	 * the entries when removing them so rely on a later tlb invalidation.
2127 	 * this will happen when updating the VMID generation. Because of this
2128 	 * we don't reuse VMIDs within a generation.
2129 	 */
2130 	if (pmap->pm_stage == PM_STAGE1) {
2131 		mtx_lock_spin(&set->asid_set_mutex);
2132 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2133 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2134 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2135 			    asid < set->asid_set_size,
2136 			    ("pmap_release: pmap cookie has out-of-range asid"));
2137 			bit_clear(set->asid_set, asid);
2138 		}
2139 		mtx_unlock_spin(&set->asid_set_mutex);
2140 	}
2141 
2142 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2143 	vm_page_unwire_noq(m);
2144 	vm_page_free_zero(m);
2145 }
2146 
2147 static int
kvm_size(SYSCTL_HANDLER_ARGS)2148 kvm_size(SYSCTL_HANDLER_ARGS)
2149 {
2150 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2151 
2152 	return sysctl_handle_long(oidp, &ksize, 0, req);
2153 }
2154 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2155     0, 0, kvm_size, "LU",
2156     "Size of KVM");
2157 
2158 static int
kvm_free(SYSCTL_HANDLER_ARGS)2159 kvm_free(SYSCTL_HANDLER_ARGS)
2160 {
2161 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2162 
2163 	return sysctl_handle_long(oidp, &kfree, 0, req);
2164 }
2165 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2166     0, 0, kvm_free, "LU",
2167     "Amount of KVM free");
2168 
2169 /*
2170  * grow the number of kernel page table entries, if needed
2171  */
2172 void
pmap_growkernel(vm_offset_t addr)2173 pmap_growkernel(vm_offset_t addr)
2174 {
2175 	vm_paddr_t paddr;
2176 	vm_page_t nkpg;
2177 	pd_entry_t *l0, *l1, *l2;
2178 
2179 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2180 
2181 	addr = roundup2(addr, L2_SIZE);
2182 	if (addr - 1 >= vm_map_max(kernel_map))
2183 		addr = vm_map_max(kernel_map);
2184 	while (kernel_vm_end < addr) {
2185 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2186 		KASSERT(pmap_load(l0) != 0,
2187 		    ("pmap_growkernel: No level 0 kernel entry"));
2188 
2189 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2190 		if (pmap_load(l1) == 0) {
2191 			/* We need a new PDP entry */
2192 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2193 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2194 			if (nkpg == NULL)
2195 				panic("pmap_growkernel: no memory to grow kernel");
2196 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2197 			/* See the dmb() in _pmap_alloc_l3(). */
2198 			dmb(ishst);
2199 			paddr = VM_PAGE_TO_PHYS(nkpg);
2200 			pmap_store(l1, paddr | L1_TABLE);
2201 			continue; /* try again */
2202 		}
2203 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2204 		if (pmap_load(l2) != 0) {
2205 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2206 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2207 				kernel_vm_end = vm_map_max(kernel_map);
2208 				break;
2209 			}
2210 			continue;
2211 		}
2212 
2213 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2214 		    VM_ALLOC_ZERO);
2215 		if (nkpg == NULL)
2216 			panic("pmap_growkernel: no memory to grow kernel");
2217 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2218 		/* See the dmb() in _pmap_alloc_l3(). */
2219 		dmb(ishst);
2220 		paddr = VM_PAGE_TO_PHYS(nkpg);
2221 		pmap_store(l2, paddr | L2_TABLE);
2222 
2223 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2224 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2225 			kernel_vm_end = vm_map_max(kernel_map);
2226 			break;
2227 		}
2228 	}
2229 }
2230 
2231 /***************************************************
2232  * page management routines.
2233  ***************************************************/
2234 
/* pv_to_chunk() masks a pv entry address with PAGE_MASK to find its chunk. */
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
/* The PC_FREE* masks and pc_freemask[] are written for 3 words / 168 bits. */
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);
2238 
2239 static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)2240 pv_to_chunk(pv_entry_t pv)
2241 {
2242 
2243 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2244 }
2245 
/* The pmap recorded in the chunk that contains the given pv entry. */
#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

/*
 * Value of each pc_map[] word when every pv entry it tracks is free.  The
 * first two words use all 64 bits and the third only its low 40 bits,
 * 168 bits in total.
 */
#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

/* The same masks, indexable by pc_map[] word. */
static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2253 
/*
 * Optional pv entry / pv chunk statistics, compiled only when PV_STATS is
 * defined and exported as read-only sysctls under the _vm_pmap node.
 */
#ifdef PV_STATS
/* Chunk-level counters. */
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

/* Entry-level counters. */
static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
2278 
/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 *
 * The global pv_chunks list is walked between two local marker chunks.
 * For each chunk, the non-wired 4 KB mappings it tracks are destroyed.
 * The returned page is the one backing a chunk that became entirely free,
 * or else the first page table page that was freed as a side effect, or
 * NULL when neither exists.  "locked_pmap" must be locked by the caller
 * and "lockp" must not be NULL.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t next_pmap, pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed, lvl;
	/* Number of reclaim_pv_chunk() calls currently scanning pv_chunks. */
	static int active_reclaims = 0;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));

	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	/* Markers are zeroed headers, so their pc_pmap is NULL. */
	bzero(&pc_marker_b, sizeof(pc_marker_b));
	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
	pc_marker = (struct pv_chunk *)&pc_marker_b;
	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;

	mtx_lock(&pv_chunks_mutex);
	active_reclaims++;
	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
	/* Scan until the end marker is reached or a page has been freed. */
	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
	    SLIST_EMPTY(&free)) {
		next_pmap = pc->pc_pmap;
		if (next_pmap == NULL) {
			/*
			 * The next chunk is a marker.  However, it is
			 * not our marker, so active_reclaims must be
			 * > 1.  Consequently, the next_chunk code
			 * will not rotate the pv_chunks list.
			 */
			goto next_chunk;
		}
		mtx_unlock(&pv_chunks_mutex);

		/*
		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pv_chunks_mutex is owned and the
		 * corresponding pmap is locked.
		 */
		if (pmap != next_pmap) {
			if (pmap != NULL && pmap != locked_pmap)
				PMAP_UNLOCK(pmap);
			pmap = next_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
				mtx_lock(&pv_chunks_mutex);
				/* Re-read the chunk after the marker. */
				continue;
			} else if (pmap != locked_pmap) {
				if (PMAP_TRYLOCK(pmap)) {
					mtx_lock(&pv_chunks_mutex);
					continue;
				} else {
					pmap = NULL; /* pmap is not locked */
					mtx_lock(&pv_chunks_mutex);
					pc = TAILQ_NEXT(pc_marker, pc_lru);
					if (pc == NULL ||
					    pc->pc_pmap != next_pmap)
						continue;
					goto next_chunk;
				}
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* A clear pc_map bit within the mask is an entry in use. */
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = ffsl(inuse) - 1;
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va, &lvl);
				/* Only mappings reached through an L2 entry are handled. */
				if (lvl != 2)
					continue;
				pte = pmap_l2_to_l3(pde, va);
				tpte = pmap_load(pte);
				if ((tpte & ATTR_SW_WIRED) != 0)
					continue;
				tpte = pmap_load_clear(pte);
				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
				if (pmap_pte_dirty(pmap, tpte))
					vm_page_dirty(m);
				if ((tpte & ATTR_AF) != 0) {
					pmap_invalidate_page(pmap, va);
					vm_page_aflag_set(m, PGA_REFERENCED);
				}
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = page_to_pvh(m);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				/* Mark the pv entry free in the chunk's bitmap. */
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
				freed++;
			}
		}
		if (freed == 0) {
			mtx_lock(&pv_chunks_mutex);
			goto next_chunk;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
		    pc->pc_map[2] == PC_FREE2) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pv_chunks_mutex);
			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
			break;
		}
		/* The chunk now has spare entries; put it at the head. */
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		mtx_lock(&pv_chunks_mutex);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;

next_chunk:
		/* Move our marker past the chunk that was just examined. */
		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
		if (active_reclaims == 1 && pmap != NULL) {
			/*
			 * Rotate the pv chunks list so that we do not
			 * scan the same pv chunks that could not be
			 * freed (because they contained a wired
			 * and/or superpage mapping) on every
			 * invocation of reclaim_pv_chunk().
			 */
			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
				MPASS(pc->pc_pmap != NULL);
				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
			}
		}
	}
	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
	active_reclaims--;
	mtx_unlock(&pv_chunks_mutex);
	if (pmap != NULL && pmap != locked_pmap)
		PMAP_UNLOCK(pmap);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->ref_count = 1;
	}
	/* Free whatever page table pages were not recycled above. */
	vm_page_free_pages_toq(&free, true);
	return (m_pc);
}
2469 
2470 /*
2471  * free the pv_entry back to the free list
2472  */
2473 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)2474 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2475 {
2476 	struct pv_chunk *pc;
2477 	int idx, field, bit;
2478 
2479 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2480 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2481 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2482 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2483 	pc = pv_to_chunk(pv);
2484 	idx = pv - &pc->pc_pventry[0];
2485 	field = idx / 64;
2486 	bit = idx % 64;
2487 	pc->pc_map[field] |= 1ul << bit;
2488 	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2489 	    pc->pc_map[2] != PC_FREE2) {
2490 		/* 98% of the time, pc is already at the head of the list. */
2491 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2492 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2493 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2494 		}
2495 		return;
2496 	}
2497 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2498 	free_pv_chunk(pc);
2499 }
2500 
2501 static void
free_pv_chunk(struct pv_chunk * pc)2502 free_pv_chunk(struct pv_chunk *pc)
2503 {
2504 	vm_page_t m;
2505 
2506 	mtx_lock(&pv_chunks_mutex);
2507  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2508 	mtx_unlock(&pv_chunks_mutex);
2509 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2510 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2511 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2512 	/* entire chunk is free, return it */
2513 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2514 	dump_drop_page(m->phys_addr);
2515 	vm_page_unwire_noq(m);
2516 	vm_page_free(m);
2517 }
2518 
2519 /*
2520  * Returns a new PV entry, allocating a new PV chunk from the system when
2521  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2522  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2523  * returned.
2524  *
2525  * The given PV list lock may be released.
2526  */
2527 static pv_entry_t
get_pv_entry(pmap_t pmap,struct rwlock ** lockp)2528 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2529 {
2530 	int bit, field;
2531 	pv_entry_t pv;
2532 	struct pv_chunk *pc;
2533 	vm_page_t m;
2534 
2535 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2536 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2537 retry:
2538 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2539 	if (pc != NULL) {
2540 		for (field = 0; field < _NPCM; field++) {
2541 			if (pc->pc_map[field]) {
2542 				bit = ffsl(pc->pc_map[field]) - 1;
2543 				break;
2544 			}
2545 		}
2546 		if (field < _NPCM) {
2547 			pv = &pc->pc_pventry[field * 64 + bit];
2548 			pc->pc_map[field] &= ~(1ul << bit);
2549 			/* If this was the last item, move it to tail */
2550 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2551 			    pc->pc_map[2] == 0) {
2552 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2553 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2554 				    pc_list);
2555 			}
2556 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2557 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2558 			return (pv);
2559 		}
2560 	}
2561 	/* No free items, allocate another chunk */
2562 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2563 	if (m == NULL) {
2564 		if (lockp == NULL) {
2565 			PV_STAT(pc_chunk_tryfail++);
2566 			return (NULL);
2567 		}
2568 		m = reclaim_pv_chunk(pmap, lockp);
2569 		if (m == NULL)
2570 			goto retry;
2571 	}
2572 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2573 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2574 	dump_add_page(m->phys_addr);
2575 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2576 	pc->pc_pmap = pmap;
2577 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2578 	pc->pc_map[1] = PC_FREE1;
2579 	pc->pc_map[2] = PC_FREE2;
2580 	mtx_lock(&pv_chunks_mutex);
2581 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2582 	mtx_unlock(&pv_chunks_mutex);
2583 	pv = &pc->pc_pventry[0];
2584 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2585 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2586 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2587 	return (pv);
2588 }
2589 
2590 /*
2591  * Ensure that the number of spare PV entries in the specified pmap meets or
2592  * exceeds the given count, "needed".
2593  *
2594  * The given PV list lock may be released.
2595  */
2596 static void
reserve_pv_entries(pmap_t pmap,int needed,struct rwlock ** lockp)2597 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2598 {
2599 	struct pch new_tail;
2600 	struct pv_chunk *pc;
2601 	vm_page_t m;
2602 	int avail, free;
2603 	bool reclaimed;
2604 
2605 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2606 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2607 
2608 	/*
2609 	 * Newly allocated PV chunks must be stored in a private list until
2610 	 * the required number of PV chunks have been allocated.  Otherwise,
2611 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
2612 	 * contrast, these chunks must be added to the pmap upon allocation.
2613 	 */
2614 	TAILQ_INIT(&new_tail);
2615 retry:
2616 	avail = 0;
2617 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2618 		bit_count((bitstr_t *)pc->pc_map, 0,
2619 		    sizeof(pc->pc_map) * NBBY, &free);
2620 		if (free == 0)
2621 			break;
2622 		avail += free;
2623 		if (avail >= needed)
2624 			break;
2625 	}
2626 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
2627 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2628 		if (m == NULL) {
2629 			m = reclaim_pv_chunk(pmap, lockp);
2630 			if (m == NULL)
2631 				goto retry;
2632 			reclaimed = true;
2633 		}
2634 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2635 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2636 		dump_add_page(m->phys_addr);
2637 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2638 		pc->pc_pmap = pmap;
2639 		pc->pc_map[0] = PC_FREE0;
2640 		pc->pc_map[1] = PC_FREE1;
2641 		pc->pc_map[2] = PC_FREE2;
2642 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2643 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2644 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2645 
2646 		/*
2647 		 * The reclaim might have freed a chunk from the current pmap.
2648 		 * If that chunk contained available entries, we need to
2649 		 * re-count the number of available entries.
2650 		 */
2651 		if (reclaimed)
2652 			goto retry;
2653 	}
2654 	if (!TAILQ_EMPTY(&new_tail)) {
2655 		mtx_lock(&pv_chunks_mutex);
2656 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2657 		mtx_unlock(&pv_chunks_mutex);
2658 	}
2659 }
2660 
2661 /*
2662  * First find and then remove the pv entry for the specified pmap and virtual
2663  * address from the specified pv list.  Returns the pv entry if found and NULL
2664  * otherwise.  This operation can be performed on pv lists for either 4KB or
2665  * 2MB page mappings.
2666  */
2667 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2668 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2669 {
2670 	pv_entry_t pv;
2671 
2672 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2673 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2674 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2675 			pvh->pv_gen++;
2676 			break;
2677 		}
2678 	}
2679 	return (pv);
2680 }
2681 
/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 *
 * "va" and "pa" must be 2MB aligned and the pmap must be locked.  The
 * existing 2MB pv entry is reused for the first 4KB page; the remaining
 * entries are taken from spare slots in the pmap's pv chunks, which are
 * asserted to exist.
 */
static void
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
	KASSERT((pa & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
	/* Address of the last 4KB page within the 2MB range. */
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		/* Consume spare entries from the chunk at the list head. */
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				/* Advance to the next 4KB page and its vm_page. */
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
			    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		/* This chunk is exhausted; move it to the tail. */
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	/* If the last chunk used is now full, move it to the tail as well. */
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
}
2750 
2751 /*
2752  * First find and then destroy the pv entry for the specified pmap and virtual
2753  * address.  This operation can be performed on pv lists for either 4KB or 2MB
2754  * page mappings.
2755  */
2756 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2757 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2758 {
2759 	pv_entry_t pv;
2760 
2761 	pv = pmap_pvh_remove(pvh, pmap, va);
2762 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2763 	free_pv_entry(pmap, pv);
2764 }
2765 
2766 /*
2767  * Conditionally create the PV entry for a 4KB page mapping if the required
2768  * memory can be allocated without resorting to reclamation.
2769  */
2770 static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)2771 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2772     struct rwlock **lockp)
2773 {
2774 	pv_entry_t pv;
2775 
2776 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2777 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2778 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2779 		pv->pv_va = va;
2780 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2781 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2782 		m->md.pv_gen++;
2783 		return (TRUE);
2784 	} else
2785 		return (FALSE);
2786 }
2787 
2788 /*
2789  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
2790  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
2791  * false if the PV entry cannot be allocated without resorting to reclamation.
2792  */
2793 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)2794 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2795     struct rwlock **lockp)
2796 {
2797 	struct md_page *pvh;
2798 	pv_entry_t pv;
2799 	vm_paddr_t pa;
2800 
2801 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2802 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2803 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2804 	    NULL : lockp)) == NULL)
2805 		return (false);
2806 	pv->pv_va = va;
2807 	pa = l2e & ~ATTR_MASK;
2808 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2809 	pvh = pa_to_pvh(pa);
2810 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2811 	pvh->pv_gen++;
2812 	return (true);
2813 }
2814 
2815 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)2816 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2817 {
2818 	pt_entry_t newl2, oldl2;
2819 	vm_page_t ml3;
2820 	vm_paddr_t ml3pa;
2821 
2822 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2823 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2824 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2825 
2826 	ml3 = pmap_remove_pt_page(pmap, va);
2827 	if (ml3 == NULL)
2828 		panic("pmap_remove_kernel_l2: Missing pt page");
2829 
2830 	ml3pa = VM_PAGE_TO_PHYS(ml3);
2831 	newl2 = ml3pa | L2_TABLE;
2832 
2833 	/*
2834 	 * If this page table page was unmapped by a promotion, then it
2835 	 * contains valid mappings.  Zero it to invalidate those mappings.
2836 	 */
2837 	if (ml3->valid != 0)
2838 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
2839 
2840 	/*
2841 	 * Demote the mapping.  The caller must have already invalidated the
2842 	 * mapping (i.e., the "break" in break-before-make).
2843 	 */
2844 	oldl2 = pmap_load_store(l2, newl2);
2845 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2846 	    __func__, l2, oldl2));
2847 }
2848 
2849 /*
2850  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
2851  */
2852 static int
pmap_remove_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pd_entry_t l1e,struct spglist * free,struct rwlock ** lockp)2853 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2854     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2855 {
2856 	struct md_page *pvh;
2857 	pt_entry_t old_l2;
2858 	vm_page_t m, ml3, mt;
2859 
2860 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2861 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2862 	old_l2 = pmap_load_clear(l2);
2863 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
2864 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
2865 
2866 	/*
2867 	 * Since a promotion must break the 4KB page mappings before making
2868 	 * the 2MB page mapping, a pmap_invalidate_page() suffices.
2869 	 */
2870 	pmap_invalidate_page(pmap, sva);
2871 
2872 	if (old_l2 & ATTR_SW_WIRED)
2873 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2874 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2875 	if (old_l2 & ATTR_SW_MANAGED) {
2876 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
2877 		pvh = page_to_pvh(m);
2878 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
2879 		pmap_pvh_free(pvh, pmap, sva);
2880 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
2881 			if (pmap_pte_dirty(pmap, old_l2))
2882 				vm_page_dirty(mt);
2883 			if (old_l2 & ATTR_AF)
2884 				vm_page_aflag_set(mt, PGA_REFERENCED);
2885 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
2886 			    TAILQ_EMPTY(&pvh->pv_list))
2887 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
2888 		}
2889 	}
2890 	if (pmap == kernel_pmap) {
2891 		pmap_remove_kernel_l2(pmap, l2, sva);
2892 	} else {
2893 		ml3 = pmap_remove_pt_page(pmap, sva);
2894 		if (ml3 != NULL) {
2895 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
2896 			    ("pmap_remove_l2: l3 page not promoted"));
2897 			pmap_resident_count_dec(pmap, 1);
2898 			KASSERT(ml3->ref_count == NL3PG,
2899 			    ("pmap_remove_l2: l3 page ref count error"));
2900 			ml3->ref_count = 0;
2901 			pmap_add_delayed_free_list(ml3, free, FALSE);
2902 		}
2903 	}
2904 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2905 }
2906 
2907 /*
2908  * pmap_remove_l3: do the things to unmap a page in a process
2909  */
2910 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)2911 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2912     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2913 {
2914 	struct md_page *pvh;
2915 	pt_entry_t old_l3;
2916 	vm_page_t m;
2917 
2918 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2919 	old_l3 = pmap_load_clear(l3);
2920 	pmap_invalidate_page(pmap, va);
2921 	if (old_l3 & ATTR_SW_WIRED)
2922 		pmap->pm_stats.wired_count -= 1;
2923 	pmap_resident_count_dec(pmap, 1);
2924 	if (old_l3 & ATTR_SW_MANAGED) {
2925 		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
2926 		if (pmap_pte_dirty(pmap, old_l3))
2927 			vm_page_dirty(m);
2928 		if (old_l3 & ATTR_AF)
2929 			vm_page_aflag_set(m, PGA_REFERENCED);
2930 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2931 		pmap_pvh_free(&m->md, pmap, va);
2932 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2933 		    (m->flags & PG_FICTITIOUS) == 0) {
2934 			pvh = page_to_pvh(m);
2935 			if (TAILQ_EMPTY(&pvh->pv_list))
2936 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2937 		}
2938 	}
2939 	return (pmap_unuse_pt(pmap, va, l2e, free));
2940 }
2941 
2942 /*
2943  * Remove the specified range of addresses from the L3 page table that is
2944  * identified by the given L2 entry.
2945  */
2946 static void
pmap_remove_l3_range(pmap_t pmap,pd_entry_t l2e,vm_offset_t sva,vm_offset_t eva,struct spglist * free,struct rwlock ** lockp)2947 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
2948     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
2949 {
2950 	struct md_page *pvh;
2951 	struct rwlock *new_lock;
2952 	pt_entry_t *l3, old_l3;
2953 	vm_offset_t va;
2954 	vm_page_t l3pg, m;
2955 
2956 	KASSERT(ADDR_IS_CANONICAL(sva),
2957 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
2958 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
2959 	    ("%s: End address not in canonical form: %lx", __func__, eva));
2960 
2961 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2962 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
2963 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
2964 	l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL;
2965 	va = eva;
2966 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
2967 		if (!pmap_l3_valid(pmap_load(l3))) {
2968 			if (va != eva) {
2969 				pmap_invalidate_range(pmap, va, sva);
2970 				va = eva;
2971 			}
2972 			continue;
2973 		}
2974 		old_l3 = pmap_load_clear(l3);
2975 		if ((old_l3 & ATTR_SW_WIRED) != 0)
2976 			pmap->pm_stats.wired_count--;
2977 		pmap_resident_count_dec(pmap, 1);
2978 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
2979 			m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
2980 			if (pmap_pte_dirty(pmap, old_l3))
2981 				vm_page_dirty(m);
2982 			if ((old_l3 & ATTR_AF) != 0)
2983 				vm_page_aflag_set(m, PGA_REFERENCED);
2984 			new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m));
2985 			if (new_lock != *lockp) {
2986 				if (*lockp != NULL) {
2987 					/*
2988 					 * Pending TLB invalidations must be
2989 					 * performed before the PV list lock is
2990 					 * released.  Otherwise, a concurrent
2991 					 * pmap_remove_all() on a physical page
2992 					 * could return while a stale TLB entry
2993 					 * still provides access to that page.
2994 					 */
2995 					if (va != eva) {
2996 						pmap_invalidate_range(pmap, va,
2997 						    sva);
2998 						va = eva;
2999 					}
3000 					rw_wunlock(*lockp);
3001 				}
3002 				*lockp = new_lock;
3003 				rw_wlock(*lockp);
3004 			}
3005 			pmap_pvh_free(&m->md, pmap, sva);
3006 			if (TAILQ_EMPTY(&m->md.pv_list) &&
3007 			    (m->flags & PG_FICTITIOUS) == 0) {
3008 				pvh = page_to_pvh(m);
3009 				if (TAILQ_EMPTY(&pvh->pv_list))
3010 					vm_page_aflag_clear(m, PGA_WRITEABLE);
3011 			}
3012 		}
3013 		if (va == eva)
3014 			va = sva;
3015 		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3016 			sva += L3_SIZE;
3017 			break;
3018 		}
3019 	}
3020 	if (va != eva)
3021 		pmap_invalidate_range(pmap, va, sva);
3022 }
3023 
3024 /*
3025  *	Remove the given range of addresses from the specified map.
3026  *
3027  *	It is assumed that the start and end are properly
3028  *	rounded to the page size.
3029  */
3030 void
pmap_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)3031 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3032 {
3033 	struct rwlock *lock;
3034 	vm_offset_t va_next;
3035 	pd_entry_t *l0, *l1, *l2;
3036 	pt_entry_t l3_paddr;
3037 	struct spglist free;
3038 
3039 	/*
3040 	 * Perform an unsynchronized read.  This is, however, safe.
3041 	 */
3042 	if (pmap->pm_stats.resident_count == 0)
3043 		return;
3044 
3045 	SLIST_INIT(&free);
3046 
3047 	PMAP_LOCK(pmap);
3048 
3049 	lock = NULL;
3050 	for (; sva < eva; sva = va_next) {
3051 		if (pmap->pm_stats.resident_count == 0)
3052 			break;
3053 
3054 		l0 = pmap_l0(pmap, sva);
3055 		if (pmap_load(l0) == 0) {
3056 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3057 			if (va_next < sva)
3058 				va_next = eva;
3059 			continue;
3060 		}
3061 
3062 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3063 		if (va_next < sva)
3064 			va_next = eva;
3065 		l1 = pmap_l0_to_l1(l0, sva);
3066 		if (pmap_load(l1) == 0)
3067 			continue;
3068 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3069 			KASSERT(va_next <= eva,
3070 			    ("partial update of non-transparent 1G page "
3071 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3072 			    pmap_load(l1), sva, eva, va_next));
3073 			MPASS(pmap != kernel_pmap);
3074 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3075 			pmap_clear(l1);
3076 			pmap_invalidate_page(pmap, sva);
3077 			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
3078 			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
3079 			continue;
3080 		}
3081 
3082 		/*
3083 		 * Calculate index for next page table.
3084 		 */
3085 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3086 		if (va_next < sva)
3087 			va_next = eva;
3088 
3089 		l2 = pmap_l1_to_l2(l1, sva);
3090 		if (l2 == NULL)
3091 			continue;
3092 
3093 		l3_paddr = pmap_load(l2);
3094 
3095 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
3096 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3097 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
3098 				    &free, &lock);
3099 				continue;
3100 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
3101 			    &lock) == NULL)
3102 				continue;
3103 			l3_paddr = pmap_load(l2);
3104 		}
3105 
3106 		/*
3107 		 * Weed out invalid mappings.
3108 		 */
3109 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
3110 			continue;
3111 
3112 		/*
3113 		 * Limit our scan to either the end of the va represented
3114 		 * by the current page table page, or to the end of the
3115 		 * range being removed.
3116 		 */
3117 		if (va_next > eva)
3118 			va_next = eva;
3119 
3120 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
3121 		    &lock);
3122 	}
3123 	if (lock != NULL)
3124 		rw_wunlock(lock);
3125 	PMAP_UNLOCK(pmap);
3126 	vm_page_free_pages_toq(&free, true);
3127 }
3128 
3129 /*
3130  *	Routine:	pmap_remove_all
3131  *	Function:
3132  *		Removes this physical page from
3133  *		all physical maps in which it resides.
3134  *		Reflects back modify bits to the pager.
3135  *
3136  *	Notes:
3137  *		Original versions of this routine were very
3138  *		inefficient because they iteratively called
3139  *		pmap_remove (slow...)
3140  */
3141 
3142 void
pmap_remove_all(vm_page_t m)3143 pmap_remove_all(vm_page_t m)
3144 {
3145 	struct md_page *pvh;
3146 	pv_entry_t pv;
3147 	pmap_t pmap;
3148 	struct rwlock *lock;
3149 	pd_entry_t *pde, tpde;
3150 	pt_entry_t *pte, tpte;
3151 	vm_offset_t va;
3152 	struct spglist free;
3153 	int lvl, pvh_gen, md_gen;
3154 
3155 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3156 	    ("pmap_remove_all: page %p is not managed", m));
3157 	SLIST_INIT(&free);
3158 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3159 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
3160 	rw_wlock(lock);
3161 retry:
3162 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3163 		pmap = PV_PMAP(pv);
3164 		if (!PMAP_TRYLOCK(pmap)) {
3165 			pvh_gen = pvh->pv_gen;
3166 			rw_wunlock(lock);
3167 			PMAP_LOCK(pmap);
3168 			rw_wlock(lock);
3169 			if (pvh_gen != pvh->pv_gen) {
3170 				PMAP_UNLOCK(pmap);
3171 				goto retry;
3172 			}
3173 		}
3174 		va = pv->pv_va;
3175 		pte = pmap_pte(pmap, va, &lvl);
3176 		KASSERT(pte != NULL,
3177 		    ("pmap_remove_all: no page table entry found"));
3178 		KASSERT(lvl == 2,
3179 		    ("pmap_remove_all: invalid pte level %d", lvl));
3180 		pmap_demote_l2_locked(pmap, pte, va, &lock);
3181 		PMAP_UNLOCK(pmap);
3182 	}
3183 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3184 		pmap = PV_PMAP(pv);
3185 		PMAP_ASSERT_STAGE1(pmap);
3186 		if (!PMAP_TRYLOCK(pmap)) {
3187 			pvh_gen = pvh->pv_gen;
3188 			md_gen = m->md.pv_gen;
3189 			rw_wunlock(lock);
3190 			PMAP_LOCK(pmap);
3191 			rw_wlock(lock);
3192 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3193 				PMAP_UNLOCK(pmap);
3194 				goto retry;
3195 			}
3196 		}
3197 		pmap_resident_count_dec(pmap, 1);
3198 
3199 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
3200 		KASSERT(pde != NULL,
3201 		    ("pmap_remove_all: no page directory entry found"));
3202 		KASSERT(lvl == 2,
3203 		    ("pmap_remove_all: invalid pde level %d", lvl));
3204 		tpde = pmap_load(pde);
3205 
3206 		pte = pmap_l2_to_l3(pde, pv->pv_va);
3207 		tpte = pmap_load_clear(pte);
3208 		if (tpte & ATTR_SW_WIRED)
3209 			pmap->pm_stats.wired_count--;
3210 		if ((tpte & ATTR_AF) != 0) {
3211 			pmap_invalidate_page(pmap, pv->pv_va);
3212 			vm_page_aflag_set(m, PGA_REFERENCED);
3213 		}
3214 
3215 		/*
3216 		 * Update the vm_page_t clean and reference bits.
3217 		 */
3218 		if (pmap_pte_dirty(pmap, tpte))
3219 			vm_page_dirty(m);
3220 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
3221 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3222 		m->md.pv_gen++;
3223 		free_pv_entry(pmap, pv);
3224 		PMAP_UNLOCK(pmap);
3225 	}
3226 	vm_page_aflag_clear(m, PGA_WRITEABLE);
3227 	rw_wunlock(lock);
3228 	vm_page_free_pages_toq(&free, true);
3229 }
3230 
3231 /*
3232  * pmap_protect_l2: do the things to protect a 2MB page in a pmap
3233  */
3234 static void
pmap_protect_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pt_entry_t mask,pt_entry_t nbits)3235 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
3236     pt_entry_t nbits)
3237 {
3238 	pd_entry_t old_l2;
3239 	vm_page_t m, mt;
3240 
3241 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3242 	PMAP_ASSERT_STAGE1(pmap);
3243 	KASSERT((sva & L2_OFFSET) == 0,
3244 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
3245 	old_l2 = pmap_load(l2);
3246 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3247 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
3248 
3249 	/*
3250 	 * Return if the L2 entry already has the desired access restrictions
3251 	 * in place.
3252 	 */
3253 	if ((old_l2 & mask) == nbits)
3254 		return;
3255 
3256 	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
3257 		cpu_spinwait();
3258 
3259 	/*
3260 	 * When a dirty read/write superpage mapping is write protected,
3261 	 * update the dirty field of each of the superpage's constituent 4KB
3262 	 * pages.
3263 	 */
3264 	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
3265 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3266 	    pmap_pte_dirty(pmap, old_l2)) {
3267 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3268 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3269 			vm_page_dirty(mt);
3270 	}
3271 
3272 	/*
3273 	 * Since a promotion must break the 4KB page mappings before making
3274 	 * the 2MB page mapping, a pmap_invalidate_page() suffices.
3275 	 */
3276 	pmap_invalidate_page(pmap, sva);
3277 }
3278 
3279 /*
3280  *	Set the physical protection on the
3281  *	specified range of this map as requested.
3282  */
3283 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)3284 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3285 {
3286 	vm_offset_t va, va_next;
3287 	pd_entry_t *l0, *l1, *l2;
3288 	pt_entry_t *l3p, l3, mask, nbits;
3289 
3290 	PMAP_ASSERT_STAGE1(pmap);
3291 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3292 	if (prot == VM_PROT_NONE) {
3293 		pmap_remove(pmap, sva, eva);
3294 		return;
3295 	}
3296 
3297 	mask = nbits = 0;
3298 	if ((prot & VM_PROT_WRITE) == 0) {
3299 		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
3300 		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
3301 	}
3302 	if ((prot & VM_PROT_EXECUTE) == 0) {
3303 		mask |= ATTR_S1_XN;
3304 		nbits |= ATTR_S1_XN;
3305 	}
3306 	if (mask == 0)
3307 		return;
3308 
3309 	PMAP_LOCK(pmap);
3310 	for (; sva < eva; sva = va_next) {
3311 		l0 = pmap_l0(pmap, sva);
3312 		if (pmap_load(l0) == 0) {
3313 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3314 			if (va_next < sva)
3315 				va_next = eva;
3316 			continue;
3317 		}
3318 
3319 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3320 		if (va_next < sva)
3321 			va_next = eva;
3322 		l1 = pmap_l0_to_l1(l0, sva);
3323 		if (pmap_load(l1) == 0)
3324 			continue;
3325 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3326 			KASSERT(va_next <= eva,
3327 			    ("partial update of non-transparent 1G page "
3328 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3329 			    pmap_load(l1), sva, eva, va_next));
3330 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3331 			if ((pmap_load(l1) & mask) != nbits) {
3332 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
3333 				pmap_invalidate_page(pmap, sva);
3334 			}
3335 			continue;
3336 		}
3337 
3338 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3339 		if (va_next < sva)
3340 			va_next = eva;
3341 
3342 		l2 = pmap_l1_to_l2(l1, sva);
3343 		if (pmap_load(l2) == 0)
3344 			continue;
3345 
3346 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
3347 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3348 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
3349 				continue;
3350 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
3351 				continue;
3352 		}
3353 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
3354 		    ("pmap_protect: Invalid L2 entry after demotion"));
3355 
3356 		if (va_next > eva)
3357 			va_next = eva;
3358 
3359 		va = va_next;
3360 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
3361 		    sva += L3_SIZE) {
3362 			l3 = pmap_load(l3p);
3363 
3364 			/*
3365 			 * Go to the next L3 entry if the current one is
3366 			 * invalid or already has the desired access
3367 			 * restrictions in place.  (The latter case occurs
3368 			 * frequently.  For example, in a "buildworld"
3369 			 * workload, almost 1 out of 4 L3 entries already
3370 			 * have the desired restrictions.)
3371 			 */
3372 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
3373 				if (va != va_next) {
3374 					pmap_invalidate_range(pmap, va, sva);
3375 					va = va_next;
3376 				}
3377 				continue;
3378 			}
3379 
3380 			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
3381 			    nbits))
3382 				cpu_spinwait();
3383 
3384 			/*
3385 			 * When a dirty read/write mapping is write protected,
3386 			 * update the page's dirty field.
3387 			 */
3388 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
3389 			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3390 			    pmap_pte_dirty(pmap, l3))
3391 				vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK));
3392 
3393 			if (va == va_next)
3394 				va = sva;
3395 		}
3396 		if (va != va_next)
3397 			pmap_invalidate_range(pmap, va, sva);
3398 	}
3399 	PMAP_UNLOCK(pmap);
3400 }
3401 
3402 /*
3403  * Inserts the specified page table page into the specified pmap's collection
3404  * of idle page table pages.  Each of a pmap's page table pages is responsible
3405  * for mapping a distinct range of virtual addresses.  The pmap's collection is
3406  * ordered by this virtual address range.
3407  *
3408  * If "promoted" is false, then the page table page "mpte" must be zero filled.
3409  */
3410 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted)3411 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
3412 {
3413 
3414 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3415 	mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
3416 	return (vm_radix_insert(&pmap->pm_root, mpte));
3417 }
3418 
3419 /*
3420  * Removes the page table page mapping the specified virtual address from the
3421  * specified pmap's collection of idle page table pages, and returns it.
3422  * Otherwise, returns NULL if there is no page table page corresponding to the
3423  * specified virtual address.
3424  */
3425 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)3426 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
3427 {
3428 
3429 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3430 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
3431 }
3432 
3433 /*
3434  * Performs a break-before-make update of a pmap entry. This is needed when
3435  * either promoting or demoting pages to ensure the TLB doesn't get into an
3436  * inconsistent state.
3437  */
3438 static void
pmap_update_entry(pmap_t pmap,pd_entry_t * pte,pd_entry_t newpte,vm_offset_t va,vm_size_t size)3439 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
3440     vm_offset_t va, vm_size_t size)
3441 {
3442 	register_t intr;
3443 
3444 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3445 
3446 	/*
3447 	 * Ensure we don't get switched out with the page table in an
3448 	 * inconsistent state. We also need to ensure no interrupts fire
3449 	 * as they may make use of an address we are about to invalidate.
3450 	 */
3451 	intr = intr_disable();
3452 
3453 	/*
3454 	 * Clear the old mapping's valid bit, but leave the rest of the entry
3455 	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
3456 	 * lookup the physical address.
3457 	 */
3458 	pmap_clear_bits(pte, ATTR_DESCR_VALID);
3459 	pmap_invalidate_range(pmap, va, va + size);
3460 
3461 	/* Create the new mapping */
3462 	pmap_store(pte, newpte);
3463 	dsb(ishst);
3464 
3465 	intr_restore(intr);
3466 }
3467 
3468 #if VM_NRESERVLEVEL > 0
3469 /*
3470  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3471  * replace the many pv entries for the 4KB page mappings by a single pv entry
3472  * for the 2MB page mapping.
3473  */
3474 static void
pmap_pv_promote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)3475 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3476     struct rwlock **lockp)
3477 {
3478 	struct md_page *pvh;
3479 	pv_entry_t pv;
3480 	vm_offset_t va_last;
3481 	vm_page_t m;
3482 
3483 	KASSERT((pa & L2_OFFSET) == 0,
3484 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
3485 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3486 
3487 	/*
3488 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3489 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3490 	 * a transfer avoids the possibility that get_pv_entry() calls
3491 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3492 	 * mappings that is being promoted.
3493 	 */
3494 	m = PHYS_TO_VM_PAGE(pa);
3495 	va = va & ~L2_OFFSET;
3496 	pv = pmap_pvh_remove(&m->md, pmap, va);
3497 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
3498 	pvh = page_to_pvh(m);
3499 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3500 	pvh->pv_gen++;
3501 	/* Free the remaining NPTEPG - 1 pv entries. */
3502 	va_last = va + L2_SIZE - PAGE_SIZE;
3503 	do {
3504 		m++;
3505 		va += PAGE_SIZE;
3506 		pmap_pvh_free(&m->md, pmap, va);
3507 	} while (va < va_last);
3508 }
3509 
3510 /*
3511  * Tries to promote the 512, contiguous 4KB page mappings that are within a
3512  * single level 2 table entry to a single 2MB page mapping.  For promotion
3513  * to occur, two conditions must be met: (1) the 4KB page mappings must map
3514  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
3515  * identical characteristics.
3516  */
3517 static void
pmap_promote_l2(pmap_t pmap,pd_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)3518 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
3519     struct rwlock **lockp)
3520 {
3521 	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
3522 	vm_page_t mpte;
3523 	vm_offset_t sva;
3524 
3525 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3526 	PMAP_ASSERT_STAGE1(pmap);
3527 
3528 	sva = va & ~L2_OFFSET;
3529 	firstl3 = pmap_l2_to_l3(l2, sva);
3530 	newl2 = pmap_load(firstl3);
3531 
3532 	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) {
3533 		atomic_add_long(&pmap_l2_p_failures, 1);
3534 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
3535 		    " in pmap %p", va, pmap);
3536 		return;
3537 	}
3538 
3539 setl2:
3540 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
3541 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
3542 		/*
3543 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
3544 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
3545 		 */
3546 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
3547 			goto setl2;
3548 		newl2 &= ~ATTR_SW_DBM;
3549 	}
3550 
3551 	pa = newl2 + L2_SIZE - PAGE_SIZE;
3552 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
3553 		oldl3 = pmap_load(l3);
3554 setl3:
3555 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
3556 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
3557 			/*
3558 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
3559 			 * set, ATTR_SW_DBM can be cleared without a TLB
3560 			 * invalidation.
3561 			 */
3562 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
3563 			    ~ATTR_SW_DBM))
3564 				goto setl3;
3565 			oldl3 &= ~ATTR_SW_DBM;
3566 		}
3567 		if (oldl3 != pa) {
3568 			atomic_add_long(&pmap_l2_p_failures, 1);
3569 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
3570 			    " in pmap %p", va, pmap);
3571 			return;
3572 		}
3573 		pa -= PAGE_SIZE;
3574 	}
3575 
3576 	/*
3577 	 * Save the page table page in its current state until the L2
3578 	 * mapping the superpage is demoted by pmap_demote_l2() or
3579 	 * destroyed by pmap_remove_l3().
3580 	 */
3581 	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
3582 	KASSERT(mpte >= vm_page_array &&
3583 	    mpte < &vm_page_array[vm_page_array_size],
3584 	    ("pmap_promote_l2: page table page is out of range"));
3585 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
3586 	    ("pmap_promote_l2: page table page's pindex is wrong"));
3587 	if (pmap_insert_pt_page(pmap, mpte, true)) {
3588 		atomic_add_long(&pmap_l2_p_failures, 1);
3589 		CTR2(KTR_PMAP,
3590 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
3591 		    pmap);
3592 		return;
3593 	}
3594 
3595 	if ((newl2 & ATTR_SW_MANAGED) != 0)
3596 		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
3597 
3598 	newl2 &= ~ATTR_DESCR_MASK;
3599 	newl2 |= L2_BLOCK;
3600 
3601 	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
3602 
3603 	atomic_add_long(&pmap_l2_promotions, 1);
3604 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
3605 		    pmap);
3606 }
3607 #endif /* VM_NRESERVLEVEL > 0 */
3608 
3609 static int
pmap_enter_largepage(pmap_t pmap,vm_offset_t va,pt_entry_t newpte,int flags,int psind)3610 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
3611     int psind)
3612 {
3613 	pd_entry_t *l0p, *l1p, *l2p, origpte;
3614 	vm_page_t mp;
3615 
3616 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3617 	KASSERT(psind > 0 && psind < MAXPAGESIZES,
3618 	    ("psind %d unexpected", psind));
3619 	KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0,
3620 	    ("unaligned phys address %#lx newpte %#lx psind %d",
3621 	    (newpte & ~ATTR_MASK), newpte, psind));
3622 
3623 restart:
3624 	if (psind == 2) {
3625 		l0p = pmap_l0(pmap, va);
3626 		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
3627 			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
3628 			if (mp == NULL) {
3629 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
3630 					return (KERN_RESOURCE_SHORTAGE);
3631 				PMAP_UNLOCK(pmap);
3632 				vm_wait(NULL);
3633 				PMAP_LOCK(pmap);
3634 				goto restart;
3635 			}
3636 			l1p = pmap_l0_to_l1(l0p, va);
3637 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
3638 			origpte = pmap_load(l1p);
3639 		} else {
3640 			l1p = pmap_l0_to_l1(l0p, va);
3641 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
3642 			origpte = pmap_load(l1p);
3643 			if ((origpte & ATTR_DESCR_VALID) == 0) {
3644 				mp = PHYS_TO_VM_PAGE(pmap_load(l0p) &
3645 				    ~ATTR_MASK);
3646 				mp->ref_count++;
3647 			}
3648 		}
3649 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
3650 		    ((origpte & ATTR_DESCR_MASK) == L1_BLOCK &&
3651 		     (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
3652 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
3653 		    va, origpte, newpte));
3654 		pmap_store(l1p, newpte);
3655 	} else /* (psind == 1) */ {
3656 		l2p = pmap_l2(pmap, va);
3657 		if (l2p == NULL) {
3658 			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
3659 			if (mp == NULL) {
3660 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
3661 					return (KERN_RESOURCE_SHORTAGE);
3662 				PMAP_UNLOCK(pmap);
3663 				vm_wait(NULL);
3664 				PMAP_LOCK(pmap);
3665 				goto restart;
3666 			}
3667 			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
3668 			l2p = &l2p[pmap_l2_index(va)];
3669 			origpte = pmap_load(l2p);
3670 		} else {
3671 			l1p = pmap_l1(pmap, va);
3672 			origpte = pmap_load(l2p);
3673 			if ((origpte & ATTR_DESCR_VALID) == 0) {
3674 				mp = PHYS_TO_VM_PAGE(pmap_load(l1p) &
3675 				    ~ATTR_MASK);
3676 				mp->ref_count++;
3677 			}
3678 		}
3679 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
3680 		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
3681 		     (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
3682 		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
3683 		    va, origpte, newpte));
3684 		pmap_store(l2p, newpte);
3685 	}
3686 	dsb(ishst);
3687 
3688 	if ((origpte & ATTR_DESCR_VALID) == 0)
3689 		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
3690 	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
3691 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
3692 	else if ((newpte & ATTR_SW_WIRED) == 0 &&
3693 	    (origpte & ATTR_SW_WIRED) != 0)
3694 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
3695 
3696 	return (KERN_SUCCESS);
3697 }
3698 
3699 /*
3700  * Add a single SMMU entry. This function does not sleep.
3701  */
3702 int
pmap_senter(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,vm_prot_t prot,u_int flags)3703 pmap_senter(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3704     vm_prot_t prot, u_int flags)
3705 {
3706 	pd_entry_t *pde;
3707 	pt_entry_t new_l3, orig_l3;
3708 	pt_entry_t *l3;
3709 	vm_page_t mpte;
3710 	int lvl;
3711 	int rv;
3712 
3713 	PMAP_ASSERT_STAGE1(pmap);
3714 	KASSERT(va < VM_MAXUSER_ADDRESS, ("wrong address space"));
3715 
3716 	va = trunc_page(va);
3717 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT |
3718 	    ATTR_S1_IDX(VM_MEMATTR_DEVICE) | L3_PAGE);
3719 	if ((prot & VM_PROT_WRITE) == 0)
3720 		new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
3721 	new_l3 |= ATTR_S1_XN; /* Execute never. */
3722 	new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER);
3723 	new_l3 |= ATTR_S1_nG; /* Non global. */
3724 
3725 	CTR2(KTR_PMAP, "pmap_senter: %.16lx -> %.16lx", va, pa);
3726 
3727 	PMAP_LOCK(pmap);
3728 
3729 	/*
3730 	 * In the case that a page table page is not
3731 	 * resident, we are creating it here.
3732 	 */
3733 retry:
3734 	pde = pmap_pde(pmap, va, &lvl);
3735 	if (pde != NULL && lvl == 2) {
3736 		l3 = pmap_l2_to_l3(pde, va);
3737 	} else {
3738 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
3739 		if (mpte == NULL) {
3740 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3741 			rv = KERN_RESOURCE_SHORTAGE;
3742 			goto out;
3743 		}
3744 		goto retry;
3745 	}
3746 
3747 	orig_l3 = pmap_load(l3);
3748 	KASSERT(!pmap_l3_valid(orig_l3), ("l3 is valid"));
3749 
3750 	/* New mapping */
3751 	pmap_store(l3, new_l3);
3752 	pmap_resident_count_inc(pmap, 1);
3753 	dsb(ishst);
3754 
3755 	rv = KERN_SUCCESS;
3756 out:
3757 	PMAP_UNLOCK(pmap);
3758 
3759 	return (rv);
3760 }
3761 
3762 /*
3763  * Remove a single SMMU entry.
3764  */
3765 int
pmap_sremove(pmap_t pmap,vm_offset_t va)3766 pmap_sremove(pmap_t pmap, vm_offset_t va)
3767 {
3768 	pt_entry_t *pte;
3769 	int lvl;
3770 	int rc;
3771 
3772 	PMAP_LOCK(pmap);
3773 
3774 	pte = pmap_pte(pmap, va, &lvl);
3775 	KASSERT(lvl == 3,
3776 	    ("Invalid SMMU pagetable level: %d != 3", lvl));
3777 
3778 	if (pte != NULL) {
3779 		pmap_resident_count_dec(pmap, 1);
3780 		pmap_clear(pte);
3781 		rc = KERN_SUCCESS;
3782 	} else
3783 		rc = KERN_FAILURE;
3784 
3785 	PMAP_UNLOCK(pmap);
3786 
3787 	return (rc);
3788 }
3789 
3790 /*
3791  * Remove all the allocated L1, L2 pages from SMMU pmap.
3792  * All the L3 entries must be cleared in advance, otherwise
3793  * this function panics.
3794  */
3795 void
pmap_sremove_pages(pmap_t pmap)3796 pmap_sremove_pages(pmap_t pmap)
3797 {
3798 	pd_entry_t l0e, *l1, l1e, *l2, l2e;
3799 	pt_entry_t *l3, l3e;
3800 	vm_page_t m, m0, m1;
3801 	vm_offset_t sva;
3802 	vm_paddr_t pa;
3803 	vm_paddr_t pa0;
3804 	vm_paddr_t pa1;
3805 	int i, j, k, l;
3806 
3807 	PMAP_LOCK(pmap);
3808 
3809 	for (sva = VM_MINUSER_ADDRESS, i = pmap_l0_index(sva);
3810 	    (i < Ln_ENTRIES && sva < VM_MAXUSER_ADDRESS); i++) {
3811 		l0e = pmap->pm_l0[i];
3812 		if ((l0e & ATTR_DESCR_VALID) == 0) {
3813 			sva += L0_SIZE;
3814 			continue;
3815 		}
3816 		pa0 = l0e & ~ATTR_MASK;
3817 		m0 = PHYS_TO_VM_PAGE(pa0);
3818 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa0);
3819 
3820 		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
3821 			l1e = l1[j];
3822 			if ((l1e & ATTR_DESCR_VALID) == 0) {
3823 				sva += L1_SIZE;
3824 				continue;
3825 			}
3826 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
3827 				sva += L1_SIZE;
3828 				continue;
3829 			}
3830 			pa1 = l1e & ~ATTR_MASK;
3831 			m1 = PHYS_TO_VM_PAGE(pa1);
3832 			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1);
3833 
3834 			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
3835 				l2e = l2[k];
3836 				if ((l2e & ATTR_DESCR_VALID) == 0) {
3837 					sva += L2_SIZE;
3838 					continue;
3839 				}
3840 				pa = l2e & ~ATTR_MASK;
3841 				m = PHYS_TO_VM_PAGE(pa);
3842 				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
3843 
3844 				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
3845 				    l++, sva += L3_SIZE) {
3846 					l3e = l3[l];
3847 					if ((l3e & ATTR_DESCR_VALID) == 0)
3848 						continue;
3849 					panic("%s: l3e found for va %jx\n",
3850 					    __func__, sva);
3851 				}
3852 
3853 				vm_page_unwire_noq(m1);
3854 				vm_page_unwire_noq(m);
3855 				pmap_resident_count_dec(pmap, 1);
3856 				vm_page_free(m);
3857 				pmap_clear(&l2[k]);
3858 			}
3859 
3860 			vm_page_unwire_noq(m0);
3861 			pmap_resident_count_dec(pmap, 1);
3862 			vm_page_free(m1);
3863 			pmap_clear(&l1[j]);
3864 		}
3865 
3866 		pmap_resident_count_dec(pmap, 1);
3867 		vm_page_free(m0);
3868 		pmap_clear(&pmap->pm_l0[i]);
3869 	}
3870 
3871 	KASSERT(pmap->pm_stats.resident_count == 0,
3872 	    ("Invalid resident count %jd", pmap->pm_stats.resident_count));
3873 
3874 	PMAP_UNLOCK(pmap);
3875 }
3876 
3877 /*
3878  *	Insert the given physical page (p) at
3879  *	the specified virtual address (v) in the
3880  *	target physical map with the protection requested.
3881  *
3882  *	If specified, the page will be wired down, meaning
3883  *	that the related pte can not be reclaimed.
3884  *
3885  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3886  *	or lose information.  That is, this routine must actually
3887  *	insert this page into the given map NOW.
3888  */
3889 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)3890 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3891     u_int flags, int8_t psind)
3892 {
3893 	struct rwlock *lock;
3894 	pd_entry_t *pde;
3895 	pt_entry_t new_l3, orig_l3;
3896 	pt_entry_t *l2, *l3;
3897 	pv_entry_t pv;
3898 	vm_paddr_t opa, pa;
3899 	vm_page_t mpte, om;
3900 	boolean_t nosleep;
3901 	int lvl, rv;
3902 
3903 	KASSERT(ADDR_IS_CANONICAL(va),
3904 	    ("%s: Address not in canonical form: %lx", __func__, va));
3905 
3906 	va = trunc_page(va);
3907 	if ((m->oflags & VPO_UNMANAGED) == 0)
3908 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
3909 	pa = VM_PAGE_TO_PHYS(m);
3910 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE);
3911 	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
3912 	new_l3 |= pmap_pte_prot(pmap, prot);
3913 
3914 	if ((flags & PMAP_ENTER_WIRED) != 0)
3915 		new_l3 |= ATTR_SW_WIRED;
3916 	if (pmap->pm_stage == PM_STAGE1) {
3917 		if (!ADDR_IS_KERNEL(va))
3918 			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
3919 		else
3920 			new_l3 |= ATTR_S1_UXN;
3921 		if (pmap != kernel_pmap)
3922 			new_l3 |= ATTR_S1_nG;
3923 	} else {
3924 		/*
3925 		 * Clear the access flag on executable mappings, this will be
3926 		 * set later when the page is accessed. The fault handler is
3927 		 * required to invalidate the I-cache.
3928 		 *
3929 		 * TODO: Switch to the valid flag to allow hardware management
3930 		 * of the access flag. Much of the pmap code assumes the
3931 		 * valid flag is set and fails to destroy the old page tables
3932 		 * correctly if it is clear.
3933 		 */
3934 		if (prot & VM_PROT_EXECUTE)
3935 			new_l3 &= ~ATTR_AF;
3936 	}
3937 	if ((m->oflags & VPO_UNMANAGED) == 0) {
3938 		new_l3 |= ATTR_SW_MANAGED;
3939 		if ((prot & VM_PROT_WRITE) != 0) {
3940 			new_l3 |= ATTR_SW_DBM;
3941 			if ((flags & VM_PROT_WRITE) == 0) {
3942 				if (pmap->pm_stage == PM_STAGE1)
3943 					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
3944 				else
3945 					new_l3 &=
3946 					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
3947 			}
3948 		}
3949 	}
3950 
3951 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3952 
3953 	lock = NULL;
3954 	PMAP_LOCK(pmap);
3955 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
3956 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
3957 		    ("managed largepage va %#lx flags %#x", va, flags));
3958 		new_l3 &= ~L3_PAGE;
3959 		if (psind == 2)
3960 			new_l3 |= L1_BLOCK;
3961 		else /* (psind == 1) */
3962 			new_l3 |= L2_BLOCK;
3963 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
3964 		goto out;
3965 	}
3966 	if (psind == 1) {
3967 		/* Assert the required virtual and physical alignment. */
3968 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
3969 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3970 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
3971 		    flags, m, &lock);
3972 		goto out;
3973 	}
3974 	mpte = NULL;
3975 
3976 	/*
3977 	 * In the case that a page table page is not
3978 	 * resident, we are creating it here.
3979 	 */
3980 retry:
3981 	pde = pmap_pde(pmap, va, &lvl);
3982 	if (pde != NULL && lvl == 2) {
3983 		l3 = pmap_l2_to_l3(pde, va);
3984 		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
3985 			mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
3986 			mpte->ref_count++;
3987 		}
3988 		goto havel3;
3989 	} else if (pde != NULL && lvl == 1) {
3990 		l2 = pmap_l1_to_l2(pde, va);
3991 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
3992 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
3993 			l3 = &l3[pmap_l3_index(va)];
3994 			if (!ADDR_IS_KERNEL(va)) {
3995 				mpte = PHYS_TO_VM_PAGE(
3996 				    pmap_load(l2) & ~ATTR_MASK);
3997 				mpte->ref_count++;
3998 			}
3999 			goto havel3;
4000 		}
4001 		/* We need to allocate an L3 table. */
4002 	}
4003 	if (!ADDR_IS_KERNEL(va)) {
4004 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4005 
4006 		/*
4007 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
4008 		 * to handle the possibility that a superpage mapping for "va"
4009 		 * was created while we slept.
4010 		 */
4011 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
4012 		    nosleep ? NULL : &lock);
4013 		if (mpte == NULL && nosleep) {
4014 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
4015 			rv = KERN_RESOURCE_SHORTAGE;
4016 			goto out;
4017 		}
4018 		goto retry;
4019 	} else
4020 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
4021 
4022 havel3:
4023 	orig_l3 = pmap_load(l3);
4024 	opa = orig_l3 & ~ATTR_MASK;
4025 	pv = NULL;
4026 
4027 	/*
4028 	 * Is the specified virtual address already mapped?
4029 	 */
4030 	if (pmap_l3_valid(orig_l3)) {
4031 		/*
4032 		 * Only allow adding new entries on stage 2 tables for now.
4033 		 * This simplifies cache invalidation as we may need to call
4034 		 * into EL2 to perform such actions.
4035 		 */
4036 		PMAP_ASSERT_STAGE1(pmap);
4037 		/*
4038 		 * Wiring change, just update stats. We don't worry about
4039 		 * wiring PT pages as they remain resident as long as there
4040 		 * are valid mappings in them. Hence, if a user page is wired,
4041 		 * the PT page will be also.
4042 		 */
4043 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
4044 		    (orig_l3 & ATTR_SW_WIRED) == 0)
4045 			pmap->pm_stats.wired_count++;
4046 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
4047 		    (orig_l3 & ATTR_SW_WIRED) != 0)
4048 			pmap->pm_stats.wired_count--;
4049 
4050 		/*
4051 		 * Remove the extra PT page reference.
4052 		 */
4053 		if (mpte != NULL) {
4054 			mpte->ref_count--;
4055 			KASSERT(mpte->ref_count > 0,
4056 			    ("pmap_enter: missing reference to page table page,"
4057 			     " va: 0x%lx", va));
4058 		}
4059 
4060 		/*
4061 		 * Has the physical page changed?
4062 		 */
4063 		if (opa == pa) {
4064 			/*
4065 			 * No, might be a protection or wiring change.
4066 			 */
4067 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4068 			    (new_l3 & ATTR_SW_DBM) != 0)
4069 				vm_page_aflag_set(m, PGA_WRITEABLE);
4070 			goto validate;
4071 		}
4072 
4073 		/*
4074 		 * The physical page has changed.  Temporarily invalidate
4075 		 * the mapping.
4076 		 */
4077 		orig_l3 = pmap_load_clear(l3);
4078 		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
4079 		    ("pmap_enter: unexpected pa update for %#lx", va));
4080 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
4081 			om = PHYS_TO_VM_PAGE(opa);
4082 
4083 			/*
4084 			 * The pmap lock is sufficient to synchronize with
4085 			 * concurrent calls to pmap_page_test_mappings() and
4086 			 * pmap_ts_referenced().
4087 			 */
4088 			if (pmap_pte_dirty(pmap, orig_l3))
4089 				vm_page_dirty(om);
4090 			if ((orig_l3 & ATTR_AF) != 0) {
4091 				pmap_invalidate_page(pmap, va);
4092 				vm_page_aflag_set(om, PGA_REFERENCED);
4093 			}
4094 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4095 			pv = pmap_pvh_remove(&om->md, pmap, va);
4096 			if ((m->oflags & VPO_UNMANAGED) != 0)
4097 				free_pv_entry(pmap, pv);
4098 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
4099 			    TAILQ_EMPTY(&om->md.pv_list) &&
4100 			    ((om->flags & PG_FICTITIOUS) != 0 ||
4101 			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
4102 				vm_page_aflag_clear(om, PGA_WRITEABLE);
4103 		} else {
4104 			KASSERT((orig_l3 & ATTR_AF) != 0,
4105 			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
4106 			pmap_invalidate_page(pmap, va);
4107 		}
4108 		orig_l3 = 0;
4109 	} else {
4110 		/*
4111 		 * Increment the counters.
4112 		 */
4113 		if ((new_l3 & ATTR_SW_WIRED) != 0)
4114 			pmap->pm_stats.wired_count++;
4115 		pmap_resident_count_inc(pmap, 1);
4116 	}
4117 	/*
4118 	 * Enter on the PV list if part of our managed memory.
4119 	 */
4120 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4121 		if (pv == NULL) {
4122 			pv = get_pv_entry(pmap, &lock);
4123 			pv->pv_va = va;
4124 		}
4125 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4126 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4127 		m->md.pv_gen++;
4128 		if ((new_l3 & ATTR_SW_DBM) != 0)
4129 			vm_page_aflag_set(m, PGA_WRITEABLE);
4130 	}
4131 
4132 validate:
4133 	if (pmap->pm_stage == PM_STAGE1) {
4134 		/*
4135 		 * Sync icache if exec permission and attribute
4136 		 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
4137 		 * is stored and made valid for hardware table walk. If done
4138 		 * later, then other can access this page before caches are
4139 		 * properly synced. Don't do it for kernel memory which is
4140 		 * mapped with exec permission even if the memory isn't going
4141 		 * to hold executable code. The only time when icache sync is
4142 		 * needed is after kernel module is loaded and the relocation
4143 		 * info is processed. And it's done in elf_cpu_load_file().
4144 		*/
4145 		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
4146 		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
4147 		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
4148 			PMAP_ASSERT_STAGE1(pmap);
4149 			cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4150 		}
4151 	} else {
4152 		cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4153 	}
4154 
4155 	/*
4156 	 * Update the L3 entry
4157 	 */
4158 	if (pmap_l3_valid(orig_l3)) {
4159 		PMAP_ASSERT_STAGE1(pmap);
4160 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
4161 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
4162 			/* same PA, different attributes */
4163 			orig_l3 = pmap_load_store(l3, new_l3);
4164 			pmap_invalidate_page(pmap, va);
4165 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4166 			    pmap_pte_dirty(pmap, orig_l3))
4167 				vm_page_dirty(m);
4168 		} else {
4169 			/*
4170 			 * orig_l3 == new_l3
4171 			 * This can happens if multiple threads simultaneously
4172 			 * access not yet mapped page. This bad for performance
4173 			 * since this can cause full demotion-NOP-promotion
4174 			 * cycle.
4175 			 * Another possible reasons are:
4176 			 * - VM and pmap memory layout are diverged
4177 			 * - tlb flush is missing somewhere and CPU doesn't see
4178 			 *   actual mapping.
4179 			 */
4180 			CTR4(KTR_PMAP, "%s: already mapped page - "
4181 			    "pmap %p va 0x%#lx pte 0x%lx",
4182 			    __func__, pmap, va, new_l3);
4183 		}
4184 	} else {
4185 		/* New mapping */
4186 		pmap_store(l3, new_l3);
4187 		dsb(ishst);
4188 	}
4189 
4190 #if VM_NRESERVLEVEL > 0
4191 	/*
4192 	 * Try to promote from level 3 pages to a level 2 superpage. This
4193 	 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at
4194 	 * stage 1 specific fields and performs a break-before-make sequence
4195 	 * that is incorrect a stage 2 pmap.
4196 	 */
4197 	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
4198 	    pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 &&
4199 	    (m->flags & PG_FICTITIOUS) == 0 &&
4200 	    vm_reserv_level_iffullpop(m) == 0) {
4201 		pmap_promote_l2(pmap, pde, va, &lock);
4202 	}
4203 #endif
4204 
4205 	rv = KERN_SUCCESS;
4206 out:
4207 	if (lock != NULL)
4208 		rw_wunlock(lock);
4209 	PMAP_UNLOCK(pmap);
4210 	return (rv);
4211 }
4212 
4213 /*
4214  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
4215  * if successful.  Returns false if (1) a page table page cannot be allocated
4216  * without sleeping, (2) a mapping already exists at the specified virtual
4217  * address, or (3) a PV entry cannot be allocated without reclaiming another
4218  * PV entry.
4219  */
4220 static bool
pmap_enter_2mpage(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)4221 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4222     struct rwlock **lockp)
4223 {
4224 	pd_entry_t new_l2;
4225 
4226 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4227 	PMAP_ASSERT_STAGE1(pmap);
4228 	KASSERT(ADDR_IS_CANONICAL(va),
4229 	    ("%s: Address not in canonical form: %lx", __func__, va));
4230 
4231 	new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
4232 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
4233 	    L2_BLOCK);
4234 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4235 		new_l2 |= ATTR_SW_MANAGED;
4236 		new_l2 &= ~ATTR_AF;
4237 	}
4238 	if ((prot & VM_PROT_EXECUTE) == 0 ||
4239 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
4240 		new_l2 |= ATTR_S1_XN;
4241 	if (!ADDR_IS_KERNEL(va))
4242 		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4243 	else
4244 		new_l2 |= ATTR_S1_UXN;
4245 	if (pmap != kernel_pmap)
4246 		new_l2 |= ATTR_S1_nG;
4247 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
4248 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp) ==
4249 	    KERN_SUCCESS);
4250 }
4251 
4252 /*
4253  * Returns true if every page table entry in the specified page table is
4254  * zero.
4255  */
4256 static bool
pmap_every_pte_zero(vm_paddr_t pa)4257 pmap_every_pte_zero(vm_paddr_t pa)
4258 {
4259 	pt_entry_t *pt_end, *pte;
4260 
4261 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
4262 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
4263 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
4264 		if (*pte != 0)
4265 			return (false);
4266 	}
4267 	return (true);
4268 }
4269 
4270 /*
4271  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
4272  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
4273  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
4274  * a mapping already exists at the specified virtual address.  Returns
4275  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
4276  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
4277  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
4278  */
4279 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)4280 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
4281     vm_page_t m, struct rwlock **lockp)
4282 {
4283 	struct spglist free;
4284 	pd_entry_t *l2, old_l2;
4285 	vm_page_t l2pg, mt;
4286 
4287 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4288 	KASSERT(ADDR_IS_CANONICAL(va),
4289 	    ("%s: Address not in canonical form: %lx", __func__, va));
4290 
4291 	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
4292 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
4293 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
4294 		    va, pmap);
4295 		return (KERN_RESOURCE_SHORTAGE);
4296 	}
4297 
4298 	/*
4299 	 * If there are existing mappings, either abort or remove them.
4300 	 */
4301 	if ((old_l2 = pmap_load(l2)) != 0) {
4302 		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
4303 		    ("pmap_enter_l2: l2pg's ref count is too low"));
4304 		if ((flags & PMAP_ENTER_NOREPLACE) != 0 &&
4305 		    (!ADDR_IS_KERNEL(va) ||
4306 		    (old_l2 & ATTR_DESCR_MASK) == L2_BLOCK ||
4307 		    !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) {
4308 			if (l2pg != NULL)
4309 				l2pg->ref_count--;
4310 			CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx"
4311 			    " in pmap %p", va, pmap);
4312 			return (KERN_FAILURE);
4313 		}
4314 		SLIST_INIT(&free);
4315 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
4316 			(void)pmap_remove_l2(pmap, l2, va,
4317 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
4318 		else
4319 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
4320 			    &free, lockp);
4321 		if (!ADDR_IS_KERNEL(va)) {
4322 			vm_page_free_pages_toq(&free, true);
4323 			KASSERT(pmap_load(l2) == 0,
4324 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
4325 		} else {
4326 			KASSERT(SLIST_EMPTY(&free),
4327 			    ("pmap_enter_l2: freed kernel page table page"));
4328 
4329 			/*
4330 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
4331 			 * will leave the kernel page table page zero filled.
4332 			 * Nonetheless, the TLB could have an intermediate
4333 			 * entry for the kernel page table page.
4334 			 */
4335 			mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
4336 			if (pmap_insert_pt_page(pmap, mt, false))
4337 				panic("pmap_enter_l2: trie insert failed");
4338 			pmap_clear(l2);
4339 			pmap_invalidate_page(pmap, va);
4340 		}
4341 	}
4342 
4343 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
4344 		/*
4345 		 * Abort this mapping if its PV entry could not be created.
4346 		 */
4347 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
4348 			if (l2pg != NULL)
4349 				pmap_abort_ptp(pmap, va, l2pg);
4350 			CTR2(KTR_PMAP,
4351 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
4352 			    va, pmap);
4353 			return (KERN_RESOURCE_SHORTAGE);
4354 		}
4355 		if ((new_l2 & ATTR_SW_DBM) != 0)
4356 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4357 				vm_page_aflag_set(mt, PGA_WRITEABLE);
4358 	}
4359 
4360 	/*
4361 	 * Increment counters.
4362 	 */
4363 	if ((new_l2 & ATTR_SW_WIRED) != 0)
4364 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
4365 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
4366 
4367 	/*
4368 	 * Conditionally sync the icache.  See pmap_enter() for details.
4369 	 */
4370 	if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) !=
4371 	    (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) &&
4372 	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
4373 		cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK),
4374 		    L2_SIZE);
4375 	}
4376 
4377 	/*
4378 	 * Map the superpage.
4379 	 */
4380 	pmap_store(l2, new_l2);
4381 	dsb(ishst);
4382 
4383 	atomic_add_long(&pmap_l2_mappings, 1);
4384 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
4385 	    va, pmap);
4386 
4387 	return (KERN_SUCCESS);
4388 }
4389 
4390 /*
4391  * Maps a sequence of resident pages belonging to the same object.
4392  * The sequence begins with the given page m_start.  This page is
4393  * mapped at the given virtual address start.  Each subsequent page is
4394  * mapped at a virtual address that is offset from start by the same
4395  * amount as the page is offset from m_start within the object.  The
4396  * last page in the sequence is the page with the largest offset from
4397  * m_start that can be mapped at a virtual address less than the given
4398  * virtual address end.  Not every virtual page between start and end
4399  * is mapped; only those for which a resident page exists with the
4400  * corresponding offset from m_start are mapped.
4401  */
4402 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)4403 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4404     vm_page_t m_start, vm_prot_t prot)
4405 {
4406 	struct rwlock *lock;
4407 	vm_offset_t va;
4408 	vm_page_t m, mpte;
4409 	vm_pindex_t diff, psize;
4410 
4411 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4412 
4413 	psize = atop(end - start);
4414 	mpte = NULL;
4415 	m = m_start;
4416 	lock = NULL;
4417 	PMAP_LOCK(pmap);
4418 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4419 		va = start + ptoa(diff);
4420 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
4421 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4422 		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
4423 			m = &m[L2_SIZE / PAGE_SIZE - 1];
4424 		else
4425 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
4426 			    &lock);
4427 		m = TAILQ_NEXT(m, listq);
4428 	}
4429 	if (lock != NULL)
4430 		rw_wunlock(lock);
4431 	PMAP_UNLOCK(pmap);
4432 }
4433 
4434 /*
4435  * this code makes some *MAJOR* assumptions:
4436  * 1. Current pmap & pmap exists.
4437  * 2. Not wired.
4438  * 3. Read access.
4439  * 4. No page table pages.
4440  * but is *MUCH* faster than pmap_enter...
4441  */
4442 
4443 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)4444 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4445 {
4446 	struct rwlock *lock;
4447 
4448 	lock = NULL;
4449 	PMAP_LOCK(pmap);
4450 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4451 	if (lock != NULL)
4452 		rw_wunlock(lock);
4453 	PMAP_UNLOCK(pmap);
4454 }
4455 
4456 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)4457 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4458     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4459 {
4460 	pd_entry_t *pde;
4461 	pt_entry_t *l2, *l3, l3_val;
4462 	vm_paddr_t pa;
4463 	int lvl;
4464 
4465 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4466 	    (m->oflags & VPO_UNMANAGED) != 0,
4467 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4468 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4469 	PMAP_ASSERT_STAGE1(pmap);
4470 	KASSERT(ADDR_IS_CANONICAL(va),
4471 	    ("%s: Address not in canonical form: %lx", __func__, va));
4472 
4473 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
4474 	/*
4475 	 * In the case that a page table page is not
4476 	 * resident, we are creating it here.
4477 	 */
4478 	if (!ADDR_IS_KERNEL(va)) {
4479 		vm_pindex_t l2pindex;
4480 
4481 		/*
4482 		 * Calculate pagetable page index
4483 		 */
4484 		l2pindex = pmap_l2_pindex(va);
4485 		if (mpte && (mpte->pindex == l2pindex)) {
4486 			mpte->ref_count++;
4487 		} else {
4488 			/*
4489 			 * Get the l2 entry
4490 			 */
4491 			pde = pmap_pde(pmap, va, &lvl);
4492 
4493 			/*
4494 			 * If the page table page is mapped, we just increment
4495 			 * the hold count, and activate it.  Otherwise, we
4496 			 * attempt to allocate a page table page.  If this
4497 			 * attempt fails, we don't retry.  Instead, we give up.
4498 			 */
4499 			if (lvl == 1) {
4500 				l2 = pmap_l1_to_l2(pde, va);
4501 				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
4502 				    L2_BLOCK)
4503 					return (NULL);
4504 			}
4505 			if (lvl == 2 && pmap_load(pde) != 0) {
4506 				mpte =
4507 				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
4508 				mpte->ref_count++;
4509 			} else {
4510 				/*
4511 				 * Pass NULL instead of the PV list lock
4512 				 * pointer, because we don't intend to sleep.
4513 				 */
4514 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
4515 				if (mpte == NULL)
4516 					return (mpte);
4517 			}
4518 		}
4519 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4520 		l3 = &l3[pmap_l3_index(va)];
4521 	} else {
4522 		mpte = NULL;
4523 		pde = pmap_pde(kernel_pmap, va, &lvl);
4524 		KASSERT(pde != NULL,
4525 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
4526 		     va));
4527 		KASSERT(lvl == 2,
4528 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
4529 		l3 = pmap_l2_to_l3(pde, va);
4530 	}
4531 
4532 	/*
4533 	 * Abort if a mapping already exists.
4534 	 */
4535 	if (pmap_load(l3) != 0) {
4536 		if (mpte != NULL)
4537 			mpte->ref_count--;
4538 		return (NULL);
4539 	}
4540 
4541 	/*
4542 	 * Enter on the PV list if part of our managed memory.
4543 	 */
4544 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4545 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4546 		if (mpte != NULL)
4547 			pmap_abort_ptp(pmap, va, mpte);
4548 		return (NULL);
4549 	}
4550 
4551 	/*
4552 	 * Increment counters
4553 	 */
4554 	pmap_resident_count_inc(pmap, 1);
4555 
4556 	pa = VM_PAGE_TO_PHYS(m);
4557 	l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
4558 	    ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
4559 	if ((prot & VM_PROT_EXECUTE) == 0 ||
4560 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
4561 		l3_val |= ATTR_S1_XN;
4562 	if (!ADDR_IS_KERNEL(va))
4563 		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4564 	else
4565 		l3_val |= ATTR_S1_UXN;
4566 	if (pmap != kernel_pmap)
4567 		l3_val |= ATTR_S1_nG;
4568 
4569 	/*
4570 	 * Now validate mapping with RO protection
4571 	 */
4572 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4573 		l3_val |= ATTR_SW_MANAGED;
4574 		l3_val &= ~ATTR_AF;
4575 	}
4576 
4577 	/* Sync icache before the mapping is stored to PTE */
4578 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
4579 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
4580 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4581 
4582 	pmap_store(l3, l3_val);
4583 	dsb(ishst);
4584 
4585 	return (mpte);
4586 }
4587 
4588 /*
4589  * This code maps large physical mmap regions into the
4590  * processor address space.  Note that some shortcuts
4591  * are taken, but the code works.
4592  */
4593 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)4594 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4595     vm_pindex_t pindex, vm_size_t size)
4596 {
4597 
4598 	VM_OBJECT_ASSERT_WLOCKED(object);
4599 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4600 	    ("pmap_object_init_pt: non-device object"));
4601 }
4602 
4603 /*
4604  *	Clear the wired attribute from the mappings for the specified range of
4605  *	addresses in the given pmap.  Every valid mapping within that range
4606  *	must have the wired attribute set.  In contrast, invalid mappings
4607  *	cannot have the wired attribute set, so they are ignored.
4608  *
4609  *	The wired attribute of the page table entry is not a hardware feature,
4610  *	so there is no need to invalidate any TLB entries.
4611  */
4612 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4613 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4614 {
4615 	vm_offset_t va_next;
4616 	pd_entry_t *l0, *l1, *l2;
4617 	pt_entry_t *l3;
4618 
4619 	PMAP_LOCK(pmap);
4620 	for (; sva < eva; sva = va_next) {
4621 		l0 = pmap_l0(pmap, sva);
4622 		if (pmap_load(l0) == 0) {
4623 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4624 			if (va_next < sva)
4625 				va_next = eva;
4626 			continue;
4627 		}
4628 
4629 		l1 = pmap_l0_to_l1(l0, sva);
4630 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4631 		if (va_next < sva)
4632 			va_next = eva;
4633 		if (pmap_load(l1) == 0)
4634 			continue;
4635 
4636 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4637 			KASSERT(va_next <= eva,
4638 			    ("partial update of non-transparent 1G page "
4639 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4640 			    pmap_load(l1), sva, eva, va_next));
4641 			MPASS(pmap != kernel_pmap);
4642 			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
4643 			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
4644 			pmap_clear_bits(l1, ATTR_SW_WIRED);
4645 			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
4646 			continue;
4647 		}
4648 
4649 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4650 		if (va_next < sva)
4651 			va_next = eva;
4652 
4653 		l2 = pmap_l1_to_l2(l1, sva);
4654 		if (pmap_load(l2) == 0)
4655 			continue;
4656 
4657 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4658 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
4659 				panic("pmap_unwire: l2 %#jx is missing "
4660 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
4661 
4662 			/*
4663 			 * Are we unwiring the entire large page?  If not,
4664 			 * demote the mapping and fall through.
4665 			 */
4666 			if (sva + L2_SIZE == va_next && eva >= va_next) {
4667 				pmap_clear_bits(l2, ATTR_SW_WIRED);
4668 				pmap->pm_stats.wired_count -= L2_SIZE /
4669 				    PAGE_SIZE;
4670 				continue;
4671 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4672 				panic("pmap_unwire: demotion failed");
4673 		}
4674 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4675 		    ("pmap_unwire: Invalid l2 entry after demotion"));
4676 
4677 		if (va_next > eva)
4678 			va_next = eva;
4679 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
4680 		    sva += L3_SIZE) {
4681 			if (pmap_load(l3) == 0)
4682 				continue;
4683 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
4684 				panic("pmap_unwire: l3 %#jx is missing "
4685 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
4686 
4687 			/*
4688 			 * ATTR_SW_WIRED must be cleared atomically.  Although
4689 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
4690 			 * the System MMU may write to the entry concurrently.
4691 			 */
4692 			pmap_clear_bits(l3, ATTR_SW_WIRED);
4693 			pmap->pm_stats.wired_count--;
4694 		}
4695 	}
4696 	PMAP_UNLOCK(pmap);
4697 }
4698 
/*
 *	Copy the range specified by src_addr/len
 *	from the source map to the range dst_addr/len
 *	in the destination map.
 *
 *	This routine is only advisory and need not do anything.
 *
 *	Because the executable mappings created by this routine are copied,
 *	it should not have to flush the instruction cache.
 *
 *	Both pmaps must be stage 1 pmaps.  Nothing is copied unless dst_addr
 *	equals src_addr.  Both pmap locks are held for the whole walk.  The
 *	walk copies 1G (L1) block mappings, whole 2MB (L2) block mappings and
 *	managed 4KB (L3) page mappings; ATTR_SW_WIRED is always cleared in the
 *	copy.  The copy is abandoned, leaving the rest of the range untouched,
 *	when a destination page table page cannot be allocated or an L3
 *	mapping cannot be installed.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
	struct rwlock *lock;
	pd_entry_t *l0, *l1, *l2, srcptepaddr;
	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
	vm_offset_t addr, end_addr, va_next;
	vm_page_t dst_m, dstmpte, srcmpte;

	PMAP_ASSERT_STAGE1(dst_pmap);
	PMAP_ASSERT_STAGE1(src_pmap);

	/* Only copies to the same virtual address are attempted. */
	if (dst_addr != src_addr)
		return;
	end_addr = src_addr + len;
	/*
	 * No PV list lock is held yet.  The PV insertion helpers below are
	 * given &lock and whatever they leave in it is released at "out".
	 */
	lock = NULL;
	/*
	 * Take the two pmap locks in ascending pointer order (presumably so
	 * that every caller agrees on the order in which they are acquired).
	 */
	if (dst_pmap < src_pmap) {
		PMAP_LOCK(dst_pmap);
		PMAP_LOCK(src_pmap);
	} else {
		PMAP_LOCK(src_pmap);
		PMAP_LOCK(dst_pmap);
	}
	for (addr = src_addr; addr < end_addr; addr = va_next) {
		l0 = pmap_l0(src_pmap, addr);
		if (pmap_load(l0) == 0) {
			/* Nothing mapped under this L0 entry; skip it. */
			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
			/* The addition wrapped around; stop at the end. */
			if (va_next < addr)
				va_next = end_addr;
			continue;
		}

		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
		if (va_next < addr)
			va_next = end_addr;
		l1 = pmap_l0_to_l1(l0, addr);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/* The source maps this region with a single L1 block. */
			KASSERT(va_next <= end_addr,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
			    pmap_load(l1), addr, end_addr, va_next));
			srcptepaddr = pmap_load(l1);
			/* From here on l1 refers to the destination pmap. */
			l1 = pmap_l1(dst_pmap, addr);
			if (l1 == NULL) {
				/* No destination L1 table yet; allocate it. */
				if (_pmap_alloc_l3(dst_pmap,
				    pmap_l0_pindex(addr), NULL) == NULL)
					break;
				l1 = pmap_l1(dst_pmap, addr);
			} else {
				/*
				 * The L1 table already exists; account for
				 * the new entry with an extra reference on
				 * the page that the L0 entry points to.
				 */
				l0 = pmap_l0(dst_pmap, addr);
				dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) &
				    ~ATTR_MASK);
				dst_m->ref_count++;
			}
			KASSERT(pmap_load(l1) == 0,
			    ("1G mapping present in dst pmap "
			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
			    pmap_load(l1), addr, end_addr, va_next));
			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
			continue;
		}

		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
		if (va_next < addr)
			va_next = end_addr;
		l2 = pmap_l1_to_l2(l1, addr);
		srcptepaddr = pmap_load(l2);
		if (srcptepaddr == 0)
			continue;
		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * We can only virtual copy whole superpages.
			 */
			if ((addr & L2_OFFSET) != 0 ||
			    addr + L2_SIZE > end_addr)
				continue;
			/* From here on l2 refers to the destination pmap. */
			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
			if (l2 == NULL)
				break;
			/*
			 * Install the copy only if the destination slot is
			 * empty and, for a managed mapping, a PV entry could
			 * be inserted; otherwise undo the allocation above.
			 */
			if (pmap_load(l2) == 0 &&
			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
			    PMAP_ENTER_NORECLAIM, &lock))) {
				/*
				 * We leave the dirty bit unchanged because
				 * managed read/write superpage mappings are
				 * required to be dirty.  However, managed
				 * superpage mappings are not required to
				 * have their accessed bit set, so we clear
				 * it because we don't know if this mapping
				 * will be used.
				 */
				srcptepaddr &= ~ATTR_SW_WIRED;
				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
					srcptepaddr &= ~ATTR_AF;
				pmap_store(l2, srcptepaddr);
				pmap_resident_count_inc(dst_pmap, L2_SIZE /
				    PAGE_SIZE);
				atomic_add_long(&pmap_l2_mappings, 1);
			} else
				pmap_abort_ptp(dst_pmap, addr, dst_m);
			continue;
		}
		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_copy: invalid L2 entry"));
		/* Strip the attributes to get the source L3 table's address. */
		srcptepaddr &= ~ATTR_MASK;
		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
		KASSERT(srcmpte->ref_count > 0,
		    ("pmap_copy: source page table page is unused"));
		/* Do not walk L3 entries beyond the requested range. */
		if (va_next > end_addr)
			va_next = end_addr;
		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
		src_pte = &src_pte[pmap_l3_index(addr)];
		/*
		 * The destination L3 table page is looked up or allocated on
		 * the first managed mapping found and then gains one
		 * reference for each further mapping copied into it.
		 */
		dstmpte = NULL;
		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
			ptetemp = pmap_load(src_pte);

			/*
			 * We only virtual copy managed pages.
			 */
			if ((ptetemp & ATTR_SW_MANAGED) == 0)
				continue;

			if (dstmpte != NULL) {
				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
				    ("dstmpte pindex/addr mismatch"));
				dstmpte->ref_count++;
			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
			    NULL)) == NULL)
				goto out;
			dst_pte = (pt_entry_t *)
			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
			dst_pte = &dst_pte[pmap_l3_index(addr)];
			if (pmap_load(dst_pte) == 0 &&
			    pmap_try_insert_pv_entry(dst_pmap, addr,
			    PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) {
				/*
				 * Clear the wired, modified, and accessed
				 * (referenced) bits during the copy.
				 *
				 * For a mapping with ATTR_SW_DBM set, the
				 * modified state is cleared by setting
				 * ATTR_S1_AP_RW_BIT, i.e. by making the copy
				 * read-only (the same bit manipulation is
				 * used by pmap_remove_write()).
				 */
				mask = ATTR_AF | ATTR_SW_WIRED;
				nbits = 0;
				if ((ptetemp & ATTR_SW_DBM) != 0)
					nbits |= ATTR_S1_AP_RW_BIT;
				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
				pmap_resident_count_inc(dst_pmap, 1);
			} else {
				/*
				 * The slot is occupied or no PV entry was
				 * available: drop the reference taken above
				 * and give up on the rest of the range.
				 */
				pmap_abort_ptp(dst_pmap, addr, dstmpte);
				goto out;
			}
			/* Have we copied all of the valid mappings? */
			if (dstmpte->ref_count >= srcmpte->ref_count)
				break;
		}
	}
out:
	/*
	 * XXX This barrier may not be needed because the destination pmap is
	 * not active.
	 */
	dsb(ishst);

	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(src_pmap);
	PMAP_UNLOCK(dst_pmap);
}
4880 
4881 /*
4882  *	pmap_zero_page zeros the specified hardware page by mapping
4883  *	the page into KVM and using bzero to clear its contents.
4884  */
4885 void
pmap_zero_page(vm_page_t m)4886 pmap_zero_page(vm_page_t m)
4887 {
4888 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4889 
4890 	pagezero((void *)va);
4891 }
4892 
4893 /*
4894  *	pmap_zero_page_area zeros the specified hardware page by mapping
4895  *	the page into KVM and using bzero to clear its contents.
4896  *
4897  *	off and size may not cover an area beyond a single hardware page.
4898  */
4899 void
pmap_zero_page_area(vm_page_t m,int off,int size)4900 pmap_zero_page_area(vm_page_t m, int off, int size)
4901 {
4902 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4903 
4904 	if (off == 0 && size == PAGE_SIZE)
4905 		pagezero((void *)va);
4906 	else
4907 		bzero((char *)va + off, size);
4908 }
4909 
4910 /*
4911  *	pmap_copy_page copies the specified (machine independent)
4912  *	page by mapping the page into virtual memory and using
4913  *	bcopy to copy the page, one machine dependent page at a
4914  *	time.
4915  */
4916 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)4917 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4918 {
4919 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4920 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4921 
4922 	pagecopy((void *)src, (void *)dst);
4923 }
4924 
/*
 * Global flag, initialised to 1.
 * NOTE(review): presumably read by the buffer cache to decide whether
 * unmapped buffers may be used on this platform; pmap_copy_pages() below
 * copies between arbitrary pages through the direct map -- confirm against
 * the consumers of this variable.
 */
int unmapped_buf_allowed = 1;
4926 
4927 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)4928 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4929     vm_offset_t b_offset, int xfersize)
4930 {
4931 	void *a_cp, *b_cp;
4932 	vm_page_t m_a, m_b;
4933 	vm_paddr_t p_a, p_b;
4934 	vm_offset_t a_pg_offset, b_pg_offset;
4935 	int cnt;
4936 
4937 	while (xfersize > 0) {
4938 		a_pg_offset = a_offset & PAGE_MASK;
4939 		m_a = ma[a_offset >> PAGE_SHIFT];
4940 		p_a = m_a->phys_addr;
4941 		b_pg_offset = b_offset & PAGE_MASK;
4942 		m_b = mb[b_offset >> PAGE_SHIFT];
4943 		p_b = m_b->phys_addr;
4944 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4945 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4946 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
4947 			panic("!DMAP a %lx", p_a);
4948 		} else {
4949 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
4950 		}
4951 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
4952 			panic("!DMAP b %lx", p_b);
4953 		} else {
4954 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4955 		}
4956 		bcopy(a_cp, b_cp, cnt);
4957 		a_offset += cnt;
4958 		b_offset += cnt;
4959 		xfersize -= cnt;
4960 	}
4961 }
4962 
4963 vm_offset_t
pmap_quick_enter_page(vm_page_t m)4964 pmap_quick_enter_page(vm_page_t m)
4965 {
4966 
4967 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
4968 }
4969 
/*
 * Counterpart of pmap_quick_enter_page().  That function returns a direct
 * map address and creates no mapping, so there is nothing to tear down
 * here and "addr" is unused.
 */
void
pmap_quick_remove_page(vm_offset_t addr)
{
}
4974 
4975 /*
4976  * Returns true if the pmap's pv is one of the first
4977  * 16 pvs linked to from this page.  This count may
4978  * be changed upwards or downwards in the future; it
4979  * is only necessary that true be returned for a small
4980  * subset of pmaps for proper page aging.
4981  */
4982 boolean_t
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)4983 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4984 {
4985 	struct md_page *pvh;
4986 	struct rwlock *lock;
4987 	pv_entry_t pv;
4988 	int loops = 0;
4989 	boolean_t rv;
4990 
4991 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4992 	    ("pmap_page_exists_quick: page %p is not managed", m));
4993 	rv = FALSE;
4994 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4995 	rw_rlock(lock);
4996 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4997 		if (PV_PMAP(pv) == pmap) {
4998 			rv = TRUE;
4999 			break;
5000 		}
5001 		loops++;
5002 		if (loops >= 16)
5003 			break;
5004 	}
5005 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5006 		pvh = page_to_pvh(m);
5007 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5008 			if (PV_PMAP(pv) == pmap) {
5009 				rv = TRUE;
5010 				break;
5011 			}
5012 			loops++;
5013 			if (loops >= 16)
5014 				break;
5015 		}
5016 	}
5017 	rw_runlock(lock);
5018 	return (rv);
5019 }
5020 
5021 /*
5022  *	pmap_page_wired_mappings:
5023  *
5024  *	Return the number of managed mappings to the given physical page
5025  *	that are wired.
5026  */
5027 int
pmap_page_wired_mappings(vm_page_t m)5028 pmap_page_wired_mappings(vm_page_t m)
5029 {
5030 	struct rwlock *lock;
5031 	struct md_page *pvh;
5032 	pmap_t pmap;
5033 	pt_entry_t *pte;
5034 	pv_entry_t pv;
5035 	int count, lvl, md_gen, pvh_gen;
5036 
5037 	if ((m->oflags & VPO_UNMANAGED) != 0)
5038 		return (0);
5039 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5040 	rw_rlock(lock);
5041 restart:
5042 	count = 0;
5043 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5044 		pmap = PV_PMAP(pv);
5045 		if (!PMAP_TRYLOCK(pmap)) {
5046 			md_gen = m->md.pv_gen;
5047 			rw_runlock(lock);
5048 			PMAP_LOCK(pmap);
5049 			rw_rlock(lock);
5050 			if (md_gen != m->md.pv_gen) {
5051 				PMAP_UNLOCK(pmap);
5052 				goto restart;
5053 			}
5054 		}
5055 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
5056 		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
5057 			count++;
5058 		PMAP_UNLOCK(pmap);
5059 	}
5060 	if ((m->flags & PG_FICTITIOUS) == 0) {
5061 		pvh = page_to_pvh(m);
5062 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5063 			pmap = PV_PMAP(pv);
5064 			if (!PMAP_TRYLOCK(pmap)) {
5065 				md_gen = m->md.pv_gen;
5066 				pvh_gen = pvh->pv_gen;
5067 				rw_runlock(lock);
5068 				PMAP_LOCK(pmap);
5069 				rw_rlock(lock);
5070 				if (md_gen != m->md.pv_gen ||
5071 				    pvh_gen != pvh->pv_gen) {
5072 					PMAP_UNLOCK(pmap);
5073 					goto restart;
5074 				}
5075 			}
5076 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
5077 			if (pte != NULL &&
5078 			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
5079 				count++;
5080 			PMAP_UNLOCK(pmap);
5081 		}
5082 	}
5083 	rw_runlock(lock);
5084 	return (count);
5085 }
5086 
5087 /*
5088  * Returns true if the given page is mapped individually or as part of
5089  * a 2mpage.  Otherwise, returns false.
5090  */
5091 bool
pmap_page_is_mapped(vm_page_t m)5092 pmap_page_is_mapped(vm_page_t m)
5093 {
5094 	struct rwlock *lock;
5095 	bool rv;
5096 
5097 	if ((m->oflags & VPO_UNMANAGED) != 0)
5098 		return (false);
5099 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5100 	rw_rlock(lock);
5101 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5102 	    ((m->flags & PG_FICTITIOUS) == 0 &&
5103 	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
5104 	rw_runlock(lock);
5105 	return (rv);
5106 }
5107 
/*
 * Destroy all managed, non-wired mappings in the given user-space
 * pmap.  This pmap cannot be active on any processor besides the
 * caller.
 *
 * This function cannot be applied to the kernel pmap.  Moreover, it
 * is not intended for general use.  It is only to be used during
 * process termination.  Consequently, it can be implemented in ways
 * that make it faster than pmap_remove().  First, it can more quickly
 * destroy mappings by iterating over the pmap's collection of PV
 * entries, rather than searching the page table.  Second, it doesn't
 * have to test and clear the page table entries atomically, because
 * no processor is currently accessing the user address space.  In
 * particular, a page table entry's dirty bit won't change state once
 * this function starts.
 *
 * Wired mappings are left in place together with their PV entries.  Page
 * table pages released along the way are collected on a local list and
 * freed only after the pmap lock has been dropped.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, freed, idx, lvl;
	vm_paddr_t pa;

	/* PV list lock currently held; switched per page, dropped at the end. */
	lock = NULL;

	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		/* Cleared below if any entry of this chunk has to be kept. */
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/*
			 * A set bit in pc_map marks a free slot (see "Mark
			 * free" below), so the complement, restricted by
			 * pc_freemask, is the set of PV entries in use.
			 */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				/*
				 * lvl is the level of the table entry that
				 * pmap_pde() returned: 1 means the mapping
				 * is an L2 block, 2 means it is an L3 page.
				 */
				switch(lvl) {
				case 1:
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					     "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired pages from a
				 * process' mapping at this time.  The PV
				 * entry stays allocated, so the chunk cannot
				 * be freed either.
				 */
				if (tpte & ATTR_SW_WIRED) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = tpte & ~ATTR_MASK;

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 * A dirty L2 block dirties every 4KB page
				 * that it covers.
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				switch (lvl) {
				case 1:
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
					pvh->pv_gen++;
					/*
					 * Once the superpage has no mappings
					 * left, each constituent page that
					 * has no 4KB mapping either is no
					 * longer writeable through any pmap.
					 */
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/*
					 * If an L3 page table page is
					 * recorded for this address, release
					 * it as well.  NOTE(review):
					 * presumably the page that was kept
					 * when the mapping was promoted --
					 * confirm in pmap_remove_pt_page().
					 */
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
						    ("pmap_remove_pages: l3 page not promoted"));
						pmap_resident_count_dec(pmap,1);
						KASSERT(ml3->ref_count == NL3PG,
						    ("pmap_remove_pages: l3 page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, FALSE);
					}
					break;
				case 2:
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					/*
					 * Clear PGA_WRITEABLE only when
					 * neither a 4KB nor a superpage
					 * mapping of the page remains.
					 */
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				/*
				 * Drop the reference that the removed mapping
				 * held on its page table page.
				 */
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		/* Every entry of the chunk was released; free the chunk. */
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	/* One invalidation covers every entry that was cleared above. */
	pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}
5291 
5292 /*
5293  * This is used to check if a page has been accessed or modified.
5294  */
5295 static boolean_t
pmap_page_test_mappings(vm_page_t m,boolean_t accessed,boolean_t modified)5296 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5297 {
5298 	struct rwlock *lock;
5299 	pv_entry_t pv;
5300 	struct md_page *pvh;
5301 	pt_entry_t *pte, mask, value;
5302 	pmap_t pmap;
5303 	int lvl, md_gen, pvh_gen;
5304 	boolean_t rv;
5305 
5306 	rv = FALSE;
5307 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5308 	rw_rlock(lock);
5309 restart:
5310 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5311 		pmap = PV_PMAP(pv);
5312 		PMAP_ASSERT_STAGE1(pmap);
5313 		if (!PMAP_TRYLOCK(pmap)) {
5314 			md_gen = m->md.pv_gen;
5315 			rw_runlock(lock);
5316 			PMAP_LOCK(pmap);
5317 			rw_rlock(lock);
5318 			if (md_gen != m->md.pv_gen) {
5319 				PMAP_UNLOCK(pmap);
5320 				goto restart;
5321 			}
5322 		}
5323 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
5324 		KASSERT(lvl == 3,
5325 		    ("pmap_page_test_mappings: Invalid level %d", lvl));
5326 		mask = 0;
5327 		value = 0;
5328 		if (modified) {
5329 			mask |= ATTR_S1_AP_RW_BIT;
5330 			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5331 		}
5332 		if (accessed) {
5333 			mask |= ATTR_AF | ATTR_DESCR_MASK;
5334 			value |= ATTR_AF | L3_PAGE;
5335 		}
5336 		rv = (pmap_load(pte) & mask) == value;
5337 		PMAP_UNLOCK(pmap);
5338 		if (rv)
5339 			goto out;
5340 	}
5341 	if ((m->flags & PG_FICTITIOUS) == 0) {
5342 		pvh = page_to_pvh(m);
5343 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5344 			pmap = PV_PMAP(pv);
5345 			PMAP_ASSERT_STAGE1(pmap);
5346 			if (!PMAP_TRYLOCK(pmap)) {
5347 				md_gen = m->md.pv_gen;
5348 				pvh_gen = pvh->pv_gen;
5349 				rw_runlock(lock);
5350 				PMAP_LOCK(pmap);
5351 				rw_rlock(lock);
5352 				if (md_gen != m->md.pv_gen ||
5353 				    pvh_gen != pvh->pv_gen) {
5354 					PMAP_UNLOCK(pmap);
5355 					goto restart;
5356 				}
5357 			}
5358 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
5359 			KASSERT(lvl == 2,
5360 			    ("pmap_page_test_mappings: Invalid level %d", lvl));
5361 			mask = 0;
5362 			value = 0;
5363 			if (modified) {
5364 				mask |= ATTR_S1_AP_RW_BIT;
5365 				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5366 			}
5367 			if (accessed) {
5368 				mask |= ATTR_AF | ATTR_DESCR_MASK;
5369 				value |= ATTR_AF | L2_BLOCK;
5370 			}
5371 			rv = (pmap_load(pte) & mask) == value;
5372 			PMAP_UNLOCK(pmap);
5373 			if (rv)
5374 				goto out;
5375 		}
5376 	}
5377 out:
5378 	rw_runlock(lock);
5379 	return (rv);
5380 }
5381 
5382 /*
5383  *	pmap_is_modified:
5384  *
5385  *	Return whether or not the specified physical page was modified
5386  *	in any physical maps.
5387  */
5388 boolean_t
pmap_is_modified(vm_page_t m)5389 pmap_is_modified(vm_page_t m)
5390 {
5391 
5392 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5393 	    ("pmap_is_modified: page %p is not managed", m));
5394 
5395 	/*
5396 	 * If the page is not busied then this check is racy.
5397 	 */
5398 	if (!pmap_page_is_write_mapped(m))
5399 		return (FALSE);
5400 	return (pmap_page_test_mappings(m, FALSE, TRUE));
5401 }
5402 
5403 /*
5404  *	pmap_is_prefaultable:
5405  *
5406  *	Return whether or not the specified virtual address is eligible
5407  *	for prefault.
5408  */
5409 boolean_t
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)5410 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5411 {
5412 	pt_entry_t *pte;
5413 	boolean_t rv;
5414 	int lvl;
5415 
5416 	rv = FALSE;
5417 	PMAP_LOCK(pmap);
5418 	pte = pmap_pte(pmap, addr, &lvl);
5419 	if (pte != NULL && pmap_load(pte) != 0) {
5420 		rv = TRUE;
5421 	}
5422 	PMAP_UNLOCK(pmap);
5423 	return (rv);
5424 }
5425 
5426 /*
5427  *	pmap_is_referenced:
5428  *
5429  *	Return whether or not the specified physical page was referenced
5430  *	in any physical maps.
5431  */
5432 boolean_t
pmap_is_referenced(vm_page_t m)5433 pmap_is_referenced(vm_page_t m)
5434 {
5435 
5436 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5437 	    ("pmap_is_referenced: page %p is not managed", m));
5438 	return (pmap_page_test_mappings(m, TRUE, FALSE));
5439 }
5440 
/*
 * Clear the write and modified bits in each of the given page's mappings.
 *
 * The page must be managed and busied.  Nothing is done when the page has
 * no write mapping.  Writeable superpage mappings are demoted first so that
 * every remaining mapping of the page can be handled as a 4KB mapping.  A
 * mapping that was writeable in hardware marks the page dirty before its
 * write permission is removed.  PGA_WRITEABLE is cleared at the end.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;
	int lvl, md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/*
	 * A fictitious page uses pv_dummy in place of a superpage PV head.
	 * NOTE(review): presumably an always-empty list -- confirm.
	 */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * Pass 1: demote every superpage mapping that has ATTR_SW_DBM set.
	 * The _SAFE iterator is used because pv may leave this list during
	 * the demotion (NOTE(review): confirm in pmap_demote_l2_locked()).
	 */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock could not be taken without blocking.
			 * Release the PV list lock, block on the pmap lock,
			 * retake the PV list lock and start over if the list
			 * changed in the meantime.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte(pmap, va, &lvl);
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		/* The demotion must not have switched to another PV lock. */
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	/* Pass 2: remove write access from every 4KB mapping. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Same hand-off as above, watching both lists. */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va, &lvl);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			/*
			 * Atomically set the read-only AP bit and clear
			 * ATTR_SW_DBM.  On failure atomic_fcmpset_64()
			 * refreshes oldpte, so each retry is computed from
			 * the current contents of the entry.
			 */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM))
				cpu_spinwait();
			/*
			 * The entry was writeable in hardware just before
			 * the update, so the page may have been modified.
			 */
			if ((oldpte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RW))
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}
5518 
/*
 *	pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	The returned count includes reference bits that were found set but
 *	deliberately left set (wired mappings, and superpage mappings for
 *	which this 4KB page is not the one selected to clear the shared bit).
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, lvl, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	/*
	 * NOTE(review): nothing in this function adds a page to "free", so
	 * the vm_page_free_pages_toq() call at the end appears to operate on
	 * an empty list.
	 */
	SLIST_INIT(&free);
	cleared = 0;
	pa = VM_PAGE_TO_PHYS(m);
	lock = PHYS_TO_PV_LIST_LOCK(pa);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * Only not_cleared starts over on a retry; cleared keeps counting
	 * the reference bits that have already been cleared.
	 */
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	/*
	 * Superpage mappings.  Each entry examined is moved to the tail of
	 * the list, so the loop ends when the entry that was first at the
	 * start (pvf) is at the head again.
	 */
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock could not be taken without blocking.
			 * Release the PV list lock, block on the pmap lock,
			 * retake the PV list lock and start over if the list
			 * changed in the meantime.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pde = pmap_pde(pmap, va, &lvl);
		KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
		KASSERT(lvl == 1,
		    ("pmap_ts_referenced: invalid pde level %d", lvl));
		/* tpde is only inspected by the assertion below. */
		tpde = pmap_load(pde);
		KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
		    ("pmap_ts_referenced: found an invalid l1 table"));
		pte = pmap_l1_to_l2(pde, va);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}

		if ((tpte & ATTR_AF) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	/* 4KB mappings, walked with the same list rotation as above. */
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Same hand-off as above, watching both lists. */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
		KASSERT(lvl == 2,
		    ("pmap_ts_referenced: invalid pde level %d", lvl));
		/* tpde is only inspected by the assertion below. */
		tpde = pmap_load(pde);
		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_ts_referenced: found an invalid l2 table"));
		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			/* A wired mapping keeps its reference bit. */
			if ((tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}
5685 
5686 /*
5687  *	Apply the given advice to the specified range of addresses within the
5688  *	given pmap.  Depending on the advice, clear the referenced and/or
5689  *	modified flags in each mapping and set the mapped page's dirty field.
5690  */
5691 void
pmap_advise(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,int advice)5692 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5693 {
5694 	struct rwlock *lock;
5695 	vm_offset_t va, va_next;
5696 	vm_page_t m;
5697 	pd_entry_t *l0, *l1, *l2, oldl2;
5698 	pt_entry_t *l3, oldl3;
5699 
5700 	PMAP_ASSERT_STAGE1(pmap);
5701 
5702 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5703 		return;
5704 
5705 	PMAP_LOCK(pmap);
5706 	for (; sva < eva; sva = va_next) {
5707 		l0 = pmap_l0(pmap, sva);
5708 		if (pmap_load(l0) == 0) {
5709 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
5710 			if (va_next < sva)
5711 				va_next = eva;
5712 			continue;
5713 		}
5714 
5715 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
5716 		if (va_next < sva)
5717 			va_next = eva;
5718 		l1 = pmap_l0_to_l1(l0, sva);
5719 		if (pmap_load(l1) == 0)
5720 			continue;
5721 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5722 			KASSERT(va_next <= eva,
5723 			    ("partial update of non-transparent 1G page "
5724 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
5725 			    pmap_load(l1), sva, eva, va_next));
5726 			continue;
5727 		}
5728 
5729 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
5730 		if (va_next < sva)
5731 			va_next = eva;
5732 		l2 = pmap_l1_to_l2(l1, sva);
5733 		oldl2 = pmap_load(l2);
5734 		if (oldl2 == 0)
5735 			continue;
5736 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5737 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
5738 				continue;
5739 			lock = NULL;
5740 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
5741 				if (lock != NULL)
5742 					rw_wunlock(lock);
5743 
5744 				/*
5745 				 * The 2MB page mapping was destroyed.
5746 				 */
5747 				continue;
5748 			}
5749 
5750 			/*
5751 			 * Unless the page mappings are wired, remove the
5752 			 * mapping to a single page so that a subsequent
5753 			 * access may repromote.  Choosing the last page
5754 			 * within the address range [sva, min(va_next, eva))
5755 			 * generally results in more repromotions.  Since the
5756 			 * underlying page table page is fully populated, this
5757 			 * removal never frees a page table page.
5758 			 */
5759 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
5760 				va = eva;
5761 				if (va > va_next)
5762 					va = va_next;
5763 				va -= PAGE_SIZE;
5764 				KASSERT(va >= sva,
5765 				    ("pmap_advise: no address gap"));
5766 				l3 = pmap_l2_to_l3(l2, va);
5767 				KASSERT(pmap_load(l3) != 0,
5768 				    ("pmap_advise: invalid PTE"));
5769 				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
5770 				    NULL, &lock);
5771 			}
5772 			if (lock != NULL)
5773 				rw_wunlock(lock);
5774 		}
5775 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
5776 		    ("pmap_advise: invalid L2 entry after demotion"));
5777 		if (va_next > eva)
5778 			va_next = eva;
5779 		va = va_next;
5780 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
5781 		    sva += L3_SIZE) {
5782 			oldl3 = pmap_load(l3);
5783 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
5784 			    (ATTR_SW_MANAGED | L3_PAGE))
5785 				goto maybe_invlrng;
5786 			else if (pmap_pte_dirty(pmap, oldl3)) {
5787 				if (advice == MADV_DONTNEED) {
5788 					/*
5789 					 * Future calls to pmap_is_modified()
5790 					 * can be avoided by making the page
5791 					 * dirty now.
5792 					 */
5793 					m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK);
5794 					vm_page_dirty(m);
5795 				}
5796 				while (!atomic_fcmpset_long(l3, &oldl3,
5797 				    (oldl3 & ~ATTR_AF) |
5798 				    ATTR_S1_AP(ATTR_S1_AP_RO)))
5799 					cpu_spinwait();
5800 			} else if ((oldl3 & ATTR_AF) != 0)
5801 				pmap_clear_bits(l3, ATTR_AF);
5802 			else
5803 				goto maybe_invlrng;
5804 			if (va == va_next)
5805 				va = sva;
5806 			continue;
5807 maybe_invlrng:
5808 			if (va != va_next) {
5809 				pmap_invalidate_range(pmap, va, sva);
5810 				va = va_next;
5811 			}
5812 		}
5813 		if (va != va_next)
5814 			pmap_invalidate_range(pmap, va, sva);
5815 	}
5816 	PMAP_UNLOCK(pmap);
5817 }
5818 
5819 /*
5820  *	Clear the modify bits on the specified physical page.
5821  */
5822 void
pmap_clear_modify(vm_page_t m)5823 pmap_clear_modify(vm_page_t m)
5824 {
5825 	struct md_page *pvh;
5826 	struct rwlock *lock;
5827 	pmap_t pmap;
5828 	pv_entry_t next_pv, pv;
5829 	pd_entry_t *l2, oldl2;
5830 	pt_entry_t *l3, oldl3;
5831 	vm_offset_t va;
5832 	int md_gen, pvh_gen;
5833 
5834 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5835 	    ("pmap_clear_modify: page %p is not managed", m));
5836 	vm_page_assert_busied(m);
5837 
5838 	if (!pmap_page_is_write_mapped(m))
5839 		return;
5840 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
5841 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5842 	rw_wlock(lock);
5843 restart:
5844 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5845 		pmap = PV_PMAP(pv);
5846 		PMAP_ASSERT_STAGE1(pmap);
5847 		if (!PMAP_TRYLOCK(pmap)) {
5848 			pvh_gen = pvh->pv_gen;
5849 			rw_wunlock(lock);
5850 			PMAP_LOCK(pmap);
5851 			rw_wlock(lock);
5852 			if (pvh_gen != pvh->pv_gen) {
5853 				PMAP_UNLOCK(pmap);
5854 				goto restart;
5855 			}
5856 		}
5857 		va = pv->pv_va;
5858 		l2 = pmap_l2(pmap, va);
5859 		oldl2 = pmap_load(l2);
5860 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
5861 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
5862 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
5863 		    (oldl2 & ATTR_SW_WIRED) == 0) {
5864 			/*
5865 			 * Write protect the mapping to a single page so that
5866 			 * a subsequent write access may repromote.
5867 			 */
5868 			va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK);
5869 			l3 = pmap_l2_to_l3(l2, va);
5870 			oldl3 = pmap_load(l3);
5871 			while (!atomic_fcmpset_long(l3, &oldl3,
5872 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
5873 				cpu_spinwait();
5874 			vm_page_dirty(m);
5875 			pmap_invalidate_page(pmap, va);
5876 		}
5877 		PMAP_UNLOCK(pmap);
5878 	}
5879 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5880 		pmap = PV_PMAP(pv);
5881 		PMAP_ASSERT_STAGE1(pmap);
5882 		if (!PMAP_TRYLOCK(pmap)) {
5883 			md_gen = m->md.pv_gen;
5884 			pvh_gen = pvh->pv_gen;
5885 			rw_wunlock(lock);
5886 			PMAP_LOCK(pmap);
5887 			rw_wlock(lock);
5888 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5889 				PMAP_UNLOCK(pmap);
5890 				goto restart;
5891 			}
5892 		}
5893 		l2 = pmap_l2(pmap, pv->pv_va);
5894 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
5895 		oldl3 = pmap_load(l3);
5896 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){
5897 			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
5898 			pmap_invalidate_page(pmap, pv->pv_va);
5899 		}
5900 		PMAP_UNLOCK(pmap);
5901 	}
5902 	rw_wunlock(lock);
5903 }
5904 
5905 void *
pmap_mapbios(vm_paddr_t pa,vm_size_t size)5906 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5907 {
5908 	struct pmap_preinit_mapping *ppim;
5909 	vm_offset_t va, offset;
5910 	pd_entry_t *pde;
5911 	pt_entry_t *l2;
5912 	int i, lvl, l2_blocks, free_l2_count, start_idx;
5913 
5914 	if (!vm_initialized) {
5915 		/*
5916 		 * No L3 ptables so map entire L2 blocks where start VA is:
5917 		 * 	preinit_map_va + start_idx * L2_SIZE
5918 		 * There may be duplicate mappings (multiple VA -> same PA) but
5919 		 * ARM64 dcache is always PIPT so that's acceptable.
5920 		 */
5921 		 if (size == 0)
5922 			 return (NULL);
5923 
5924 		 /* Calculate how many L2 blocks are needed for the mapping */
5925 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
5926 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
5927 
5928 		offset = pa & L2_OFFSET;
5929 
5930 		if (preinit_map_va == 0)
5931 			return (NULL);
5932 
5933 		/* Map 2MiB L2 blocks from reserved VA space */
5934 
5935 		free_l2_count = 0;
5936 		start_idx = -1;
5937 		/* Find enough free contiguous VA space */
5938 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5939 			ppim = pmap_preinit_mapping + i;
5940 			if (free_l2_count > 0 && ppim->pa != 0) {
5941 				/* Not enough space here */
5942 				free_l2_count = 0;
5943 				start_idx = -1;
5944 				continue;
5945 			}
5946 
5947 			if (ppim->pa == 0) {
5948 				/* Free L2 block */
5949 				if (start_idx == -1)
5950 					start_idx = i;
5951 				free_l2_count++;
5952 				if (free_l2_count == l2_blocks)
5953 					break;
5954 			}
5955 		}
5956 		if (free_l2_count != l2_blocks)
5957 			panic("%s: too many preinit mappings", __func__);
5958 
5959 		va = preinit_map_va + (start_idx * L2_SIZE);
5960 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
5961 			/* Mark entries as allocated */
5962 			ppim = pmap_preinit_mapping + i;
5963 			ppim->pa = pa;
5964 			ppim->va = va + offset;
5965 			ppim->size = size;
5966 		}
5967 
5968 		/* Map L2 blocks */
5969 		pa = rounddown2(pa, L2_SIZE);
5970 		for (i = 0; i < l2_blocks; i++) {
5971 			pde = pmap_pde(kernel_pmap, va, &lvl);
5972 			KASSERT(pde != NULL,
5973 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
5974 			    va));
5975 			KASSERT(lvl == 1,
5976 			    ("pmap_mapbios: Invalid level %d", lvl));
5977 
5978 			/* Insert L2_BLOCK */
5979 			l2 = pmap_l1_to_l2(pde, va);
5980 			pmap_load_store(l2,
5981 			    pa | ATTR_DEFAULT | ATTR_S1_XN |
5982 			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
5983 
5984 			va += L2_SIZE;
5985 			pa += L2_SIZE;
5986 		}
5987 		pmap_invalidate_all(kernel_pmap);
5988 
5989 		va = preinit_map_va + (start_idx * L2_SIZE);
5990 
5991 	} else {
5992 		/* kva_alloc may be used to map the pages */
5993 		offset = pa & PAGE_MASK;
5994 		size = round_page(offset + size);
5995 
5996 		va = kva_alloc(size);
5997 		if (va == 0)
5998 			panic("%s: Couldn't allocate KVA", __func__);
5999 
6000 		pde = pmap_pde(kernel_pmap, va, &lvl);
6001 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
6002 
6003 		/* L3 table is linked */
6004 		va = trunc_page(va);
6005 		pa = trunc_page(pa);
6006 		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
6007 	}
6008 
6009 	return ((void *)(va + offset));
6010 }
6011 
6012 void
pmap_unmapbios(vm_offset_t va,vm_size_t size)6013 pmap_unmapbios(vm_offset_t va, vm_size_t size)
6014 {
6015 	struct pmap_preinit_mapping *ppim;
6016 	vm_offset_t offset, tmpsize, va_trunc;
6017 	pd_entry_t *pde;
6018 	pt_entry_t *l2;
6019 	int i, lvl, l2_blocks, block;
6020 	bool preinit_map;
6021 
6022 	l2_blocks =
6023 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
6024 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
6025 
6026 	/* Remove preinit mapping */
6027 	preinit_map = false;
6028 	block = 0;
6029 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6030 		ppim = pmap_preinit_mapping + i;
6031 		if (ppim->va == va) {
6032 			KASSERT(ppim->size == size,
6033 			    ("pmap_unmapbios: size mismatch"));
6034 			ppim->va = 0;
6035 			ppim->pa = 0;
6036 			ppim->size = 0;
6037 			preinit_map = true;
6038 			offset = block * L2_SIZE;
6039 			va_trunc = rounddown2(va, L2_SIZE) + offset;
6040 
6041 			/* Remove L2_BLOCK */
6042 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
6043 			KASSERT(pde != NULL,
6044 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
6045 			    va_trunc));
6046 			l2 = pmap_l1_to_l2(pde, va_trunc);
6047 			pmap_clear(l2);
6048 
6049 			if (block == (l2_blocks - 1))
6050 				break;
6051 			block++;
6052 		}
6053 	}
6054 	if (preinit_map) {
6055 		pmap_invalidate_all(kernel_pmap);
6056 		return;
6057 	}
6058 
6059 	/* Unmap the pages reserved with kva_alloc. */
6060 	if (vm_initialized) {
6061 		offset = va & PAGE_MASK;
6062 		size = round_page(offset + size);
6063 		va = trunc_page(va);
6064 
6065 		pde = pmap_pde(kernel_pmap, va, &lvl);
6066 		KASSERT(pde != NULL,
6067 		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
6068 		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
6069 
6070 		/* Unmap and invalidate the pages */
6071                 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6072 			pmap_kremove(va + tmpsize);
6073 
6074 		kva_free(va, size);
6075 	}
6076 }
6077 
6078 /*
6079  * Sets the memory attribute for the specified page.
6080  */
6081 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)6082 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6083 {
6084 
6085 	m->md.pv_memattr = ma;
6086 
6087 	/*
6088 	 * If "m" is a normal page, update its direct mapping.  This update
6089 	 * can be relied upon to perform any cache operations that are
6090 	 * required for data coherence.
6091 	 */
6092 	if ((m->flags & PG_FICTITIOUS) == 0 &&
6093 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6094 	    m->md.pv_memattr) != 0)
6095 		panic("memory attribute change on the direct map failed");
6096 }
6097 
6098 /*
6099  * Changes the specified virtual address range's memory type to that given by
6100  * the parameter "mode".  The specified virtual address range must be
6101  * completely contained within either the direct map or the kernel map.  If
6102  * the virtual address range is contained within the kernel map, then the
6103  * memory type for each of the corresponding ranges of the direct map is also
6104  * changed.  (The corresponding ranges of the direct map are those ranges that
6105  * map the same physical pages as the specified virtual address range.)  These
6106  * changes to the direct map are necessary because Intel describes the
6107  * behavior of their processors as "undefined" if two or more mappings to the
6108  * same physical page have different memory types.
6109  *
6110  * Returns zero if the change completed successfully, and either EINVAL or
6111  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6112  * of the virtual address range was not mapped, and ENOMEM is returned if
6113  * there was insufficient memory available to complete the change.  In the
6114  * latter case, the memory type may have been changed on some part of the
6115  * virtual address range or the direct map.
6116  */
6117 int
pmap_change_attr(vm_offset_t va,vm_size_t size,int mode)6118 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6119 {
6120 	int error;
6121 
6122 	PMAP_LOCK(kernel_pmap);
6123 	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
6124 	PMAP_UNLOCK(kernel_pmap);
6125 	return (error);
6126 }
6127 
6128 /*
6129  * Changes the specified virtual address range's protections to those
6130  * specified by "prot".  Like pmap_change_attr(), protections for aliases
6131  * in the direct map are updated as well.  Protections on aliasing mappings may
6132  * be a subset of the requested protections; for example, mappings in the direct
6133  * map are never executable.
6134  */
6135 int
pmap_change_prot(vm_offset_t va,vm_size_t size,vm_prot_t prot)6136 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
6137 {
6138 	int error;
6139 
6140 	/* Only supported within the kernel map. */
6141 	if (va < VM_MIN_KERNEL_ADDRESS)
6142 		return (EINVAL);
6143 
6144 	PMAP_LOCK(kernel_pmap);
6145 	error = pmap_change_props_locked(va, size, prot, -1, false);
6146 	PMAP_UNLOCK(kernel_pmap);
6147 	return (error);
6148 }
6149 
6150 static int
pmap_change_props_locked(vm_offset_t va,vm_size_t size,vm_prot_t prot,int mode,bool skip_unmapped)6151 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
6152     int mode, bool skip_unmapped)
6153 {
6154 	vm_offset_t base, offset, tmpva;
6155 	vm_size_t pte_size;
6156 	vm_paddr_t pa;
6157 	pt_entry_t pte, *ptep, *newpte;
6158 	pt_entry_t bits, mask;
6159 	int lvl, rv;
6160 
6161 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6162 	base = trunc_page(va);
6163 	offset = va & PAGE_MASK;
6164 	size = round_page(offset + size);
6165 
6166 	if (!VIRT_IN_DMAP(base) &&
6167 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
6168 		return (EINVAL);
6169 
6170 	bits = 0;
6171 	mask = 0;
6172 	if (mode != -1) {
6173 		bits = ATTR_S1_IDX(mode);
6174 		mask = ATTR_S1_IDX_MASK;
6175 		if (mode == VM_MEMATTR_DEVICE) {
6176 			mask |= ATTR_S1_XN;
6177 			bits |= ATTR_S1_XN;
6178 		}
6179 	}
6180 	if (prot != VM_PROT_NONE) {
6181 		/* Don't mark the DMAP as executable. It never is on arm64. */
6182 		if (VIRT_IN_DMAP(base)) {
6183 			prot &= ~VM_PROT_EXECUTE;
6184 			/*
6185 			 * XXX Mark the DMAP as writable for now. We rely
6186 			 * on this in ddb & dtrace to insert breakpoint
6187 			 * instructions.
6188 			 */
6189 			prot |= VM_PROT_WRITE;
6190 		}
6191 
6192 		if ((prot & VM_PROT_WRITE) == 0) {
6193 			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
6194 		}
6195 		if ((prot & VM_PROT_EXECUTE) == 0) {
6196 			bits |= ATTR_S1_PXN;
6197 		}
6198 		bits |= ATTR_S1_UXN;
6199 		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
6200 	}
6201 
6202 	for (tmpva = base; tmpva < base + size; ) {
6203 		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
6204 		if (ptep == NULL && !skip_unmapped) {
6205 			return (EINVAL);
6206 		} else if ((ptep == NULL && skip_unmapped) ||
6207 		    (pmap_load(ptep) & mask) == bits) {
6208 			/*
6209 			 * We already have the correct attribute or there
6210 			 * is no memory mapped at this address and we are
6211 			 * skipping unmapped memory.
6212 			 */
6213 			switch (lvl) {
6214 			default:
6215 				panic("Invalid DMAP table level: %d\n", lvl);
6216 			case 1:
6217 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
6218 				break;
6219 			case 2:
6220 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
6221 				break;
6222 			case 3:
6223 				tmpva += PAGE_SIZE;
6224 				break;
6225 			}
6226 		} else {
6227 			/*
6228 			 * Split the entry to an level 3 table, then
6229 			 * set the new attribute.
6230 			 */
6231 			switch (lvl) {
6232 			default:
6233 				panic("Invalid DMAP table level: %d\n", lvl);
6234 			case 1:
6235 				if ((tmpva & L1_OFFSET) == 0 &&
6236 				    (base + size - tmpva) >= L1_SIZE) {
6237 					pte_size = L1_SIZE;
6238 					break;
6239 				}
6240 				newpte = pmap_demote_l1(kernel_pmap, ptep,
6241 				    tmpva & ~L1_OFFSET);
6242 				if (newpte == NULL)
6243 					return (EINVAL);
6244 				ptep = pmap_l1_to_l2(ptep, tmpva);
6245 				/* FALLTHROUGH */
6246 			case 2:
6247 				if ((tmpva & L2_OFFSET) == 0 &&
6248 				    (base + size - tmpva) >= L2_SIZE) {
6249 					pte_size = L2_SIZE;
6250 					break;
6251 				}
6252 				newpte = pmap_demote_l2(kernel_pmap, ptep,
6253 				    tmpva);
6254 				if (newpte == NULL)
6255 					return (EINVAL);
6256 				ptep = pmap_l2_to_l3(ptep, tmpva);
6257 				/* FALLTHROUGH */
6258 			case 3:
6259 				pte_size = PAGE_SIZE;
6260 				break;
6261 			}
6262 
6263 			/* Update the entry */
6264 			pte = pmap_load(ptep);
6265 			pte &= ~mask;
6266 			pte |= bits;
6267 
6268 			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
6269 			    pte_size);
6270 
6271 			pa = pte & ~ATTR_MASK;
6272 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
6273 				/*
6274 				 * Keep the DMAP memory in sync.
6275 				 */
6276 				rv = pmap_change_props_locked(
6277 				    PHYS_TO_DMAP(pa), pte_size,
6278 				    prot, mode, true);
6279 				if (rv != 0)
6280 					return (rv);
6281 			}
6282 
6283 			/*
6284 			 * If moving to a non-cacheable entry flush
6285 			 * the cache.
6286 			 */
6287 			if (mode == VM_MEMATTR_UNCACHEABLE)
6288 				cpu_dcache_wbinv_range(tmpva, pte_size);
6289 			tmpva += pte_size;
6290 		}
6291 	}
6292 
6293 	return (0);
6294 }
6295 
6296 /*
6297  * Create an L2 table to map all addresses within an L1 mapping.
6298  */
6299 static pt_entry_t *
pmap_demote_l1(pmap_t pmap,pt_entry_t * l1,vm_offset_t va)6300 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
6301 {
6302 	pt_entry_t *l2, newl2, oldl1;
6303 	vm_offset_t tmpl1;
6304 	vm_paddr_t l2phys, phys;
6305 	vm_page_t ml2;
6306 	int i;
6307 
6308 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6309 	oldl1 = pmap_load(l1);
6310 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
6311 	    ("pmap_demote_l1: Demoting a non-block entry"));
6312 	KASSERT((va & L1_OFFSET) == 0,
6313 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
6314 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
6315 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
6316 
6317 	tmpl1 = 0;
6318 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
6319 		tmpl1 = kva_alloc(PAGE_SIZE);
6320 		if (tmpl1 == 0)
6321 			return (NULL);
6322 	}
6323 
6324 	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
6325 	    NULL) {
6326 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
6327 		    " in pmap %p", va, pmap);
6328 		l2 = NULL;
6329 		goto fail;
6330 	}
6331 
6332 	l2phys = VM_PAGE_TO_PHYS(ml2);
6333 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
6334 
6335 	/* Address the range points at */
6336 	phys = oldl1 & ~ATTR_MASK;
6337 	/* The attributed from the old l1 table to be copied */
6338 	newl2 = oldl1 & ATTR_MASK;
6339 
6340 	/* Create the new entries */
6341 	for (i = 0; i < Ln_ENTRIES; i++) {
6342 		l2[i] = newl2 | phys;
6343 		phys += L2_SIZE;
6344 	}
6345 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
6346 	    ("Invalid l2 page (%lx != %lx)", l2[0],
6347 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
6348 
6349 	if (tmpl1 != 0) {
6350 		pmap_kenter(tmpl1, PAGE_SIZE,
6351 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
6352 		    VM_MEMATTR_WRITE_BACK);
6353 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
6354 	}
6355 
6356 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
6357 
6358 fail:
6359 	if (tmpl1 != 0) {
6360 		pmap_kremove(tmpl1);
6361 		kva_free(tmpl1, PAGE_SIZE);
6362 	}
6363 
6364 	return (l2);
6365 }
6366 
6367 static void
pmap_fill_l3(pt_entry_t * firstl3,pt_entry_t newl3)6368 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
6369 {
6370 	pt_entry_t *l3;
6371 
6372 	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
6373 		*l3 = newl3;
6374 		newl3 += L3_SIZE;
6375 	}
6376 }
6377 
6378 static void
pmap_demote_l2_abort(pmap_t pmap,vm_offset_t va,pt_entry_t * l2,struct rwlock ** lockp)6379 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
6380     struct rwlock **lockp)
6381 {
6382 	struct spglist free;
6383 
6384 	SLIST_INIT(&free);
6385 	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
6386 	    lockp);
6387 	vm_page_free_pages_toq(&free, true);
6388 }
6389 
6390 /*
6391  * Create an L3 table to map all addresses within an L2 mapping.
6392  */
6393 static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap,pt_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)6394 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
6395     struct rwlock **lockp)
6396 {
6397 	pt_entry_t *l3, newl3, oldl2;
6398 	vm_offset_t tmpl2;
6399 	vm_paddr_t l3phys;
6400 	vm_page_t ml3;
6401 
6402 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6403 	PMAP_ASSERT_STAGE1(pmap);
6404 	KASSERT(ADDR_IS_CANONICAL(va),
6405 	    ("%s: Address not in canonical form: %lx", __func__, va));
6406 
6407 	l3 = NULL;
6408 	oldl2 = pmap_load(l2);
6409 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
6410 	    ("pmap_demote_l2: Demoting a non-block entry"));
6411 	va &= ~L2_OFFSET;
6412 
6413 	tmpl2 = 0;
6414 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
6415 		tmpl2 = kva_alloc(PAGE_SIZE);
6416 		if (tmpl2 == 0)
6417 			return (NULL);
6418 	}
6419 
6420 	/*
6421 	 * Invalidate the 2MB page mapping and return "failure" if the
6422 	 * mapping was never accessed.
6423 	 */
6424 	if ((oldl2 & ATTR_AF) == 0) {
6425 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6426 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
6427 		pmap_demote_l2_abort(pmap, va, l2, lockp);
6428 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
6429 		    va, pmap);
6430 		goto fail;
6431 	}
6432 
6433 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
6434 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6435 		    ("pmap_demote_l2: page table page for a wired mapping"
6436 		    " is missing"));
6437 
6438 		/*
6439 		 * If the page table page is missing and the mapping
6440 		 * is for a kernel address, the mapping must belong to
6441 		 * either the direct map or the early kernel memory.
6442 		 * Page table pages are preallocated for every other
6443 		 * part of the kernel address space, so the direct map
6444 		 * region and early kernel memory are the only parts of the
6445 		 * kernel address space that must be handled here.
6446 		 */
6447 		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
6448 		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
6449 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
6450 
6451 		/*
6452 		 * If the 2MB page mapping belongs to the direct map
6453 		 * region of the kernel's address space, then the page
6454 		 * allocation request specifies the highest possible
6455 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
6456 		 * priority is normal.
6457 		 */
6458 		ml3 = vm_page_alloc_noobj(
6459 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
6460 		    VM_ALLOC_WIRED);
6461 
6462 		/*
6463 		 * If the allocation of the new page table page fails,
6464 		 * invalidate the 2MB page mapping and return "failure".
6465 		 */
6466 		if (ml3 == NULL) {
6467 			pmap_demote_l2_abort(pmap, va, l2, lockp);
6468 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
6469 			    " in pmap %p", va, pmap);
6470 			goto fail;
6471 		}
6472 		ml3->pindex = pmap_l2_pindex(va);
6473 
6474 		if (!ADDR_IS_KERNEL(va)) {
6475 			ml3->ref_count = NL3PG;
6476 			pmap_resident_count_inc(pmap, 1);
6477 		}
6478 	}
6479 	l3phys = VM_PAGE_TO_PHYS(ml3);
6480 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
6481 	newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
6482 	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
6483 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
6484 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
6485 
6486 	/*
6487 	 * If the page table page is not leftover from an earlier promotion,
6488 	 * or the mapping attributes have changed, (re)initialize the L3 table.
6489 	 *
6490 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
6491 	 * performs a dsb().  That dsb() ensures that the stores for filling
6492 	 * "l3" are visible before "l3" is added to the page table.
6493 	 */
6494 	if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK))
6495 		pmap_fill_l3(l3, newl3);
6496 
6497 	/*
6498 	 * Map the temporary page so we don't lose access to the l2 table.
6499 	 */
6500 	if (tmpl2 != 0) {
6501 		pmap_kenter(tmpl2, PAGE_SIZE,
6502 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
6503 		    VM_MEMATTR_WRITE_BACK);
6504 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
6505 	}
6506 
6507 	/*
6508 	 * The spare PV entries must be reserved prior to demoting the
6509 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
6510 	 * of the L2 and the PV lists will be inconsistent, which can result
6511 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
6512 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
6513 	 * PV entry for the 2MB page mapping that is being demoted.
6514 	 */
6515 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
6516 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
6517 
6518 	/*
6519 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
6520 	 * the 2MB page mapping.
6521 	 */
6522 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
6523 
6524 	/*
6525 	 * Demote the PV entry.
6526 	 */
6527 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
6528 		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
6529 
6530 	atomic_add_long(&pmap_l2_demotions, 1);
6531 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
6532 	    " in pmap %p %lx", va, pmap, l3[0]);
6533 
6534 fail:
6535 	if (tmpl2 != 0) {
6536 		pmap_kremove(tmpl2);
6537 		kva_free(tmpl2, PAGE_SIZE);
6538 	}
6539 
6540 	return (l3);
6541 
6542 }
6543 
6544 static pt_entry_t *
pmap_demote_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)6545 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
6546 {
6547 	struct rwlock *lock;
6548 	pt_entry_t *l3;
6549 
6550 	lock = NULL;
6551 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
6552 	if (lock != NULL)
6553 		rw_wunlock(lock);
6554 	return (l3);
6555 }
6556 
6557 /*
6558  * Perform the pmap work for mincore(2).  If the page is not both referenced and
6559  * modified by this pmap, returns its physical address so that the caller can
6560  * find other mappings.
6561  */
6562 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)6563 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
6564 {
6565 	pt_entry_t *pte, tpte;
6566 	vm_paddr_t mask, pa;
6567 	int lvl, val;
6568 	bool managed;
6569 
6570 	PMAP_ASSERT_STAGE1(pmap);
6571 	PMAP_LOCK(pmap);
6572 	pte = pmap_pte(pmap, addr, &lvl);
6573 	if (pte != NULL) {
6574 		tpte = pmap_load(pte);
6575 
6576 		switch (lvl) {
6577 		case 3:
6578 			mask = L3_OFFSET;
6579 			break;
6580 		case 2:
6581 			mask = L2_OFFSET;
6582 			break;
6583 		case 1:
6584 			mask = L1_OFFSET;
6585 			break;
6586 		default:
6587 			panic("pmap_mincore: invalid level %d", lvl);
6588 		}
6589 
6590 		managed = (tpte & ATTR_SW_MANAGED) != 0;
6591 		val = MINCORE_INCORE;
6592 		if (lvl != 3)
6593 			val |= MINCORE_PSIND(3 - lvl);
6594 		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
6595 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
6596 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6597 		if ((tpte & ATTR_AF) == ATTR_AF)
6598 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6599 
6600 		pa = (tpte & ~ATTR_MASK) | (addr & mask);
6601 	} else {
6602 		managed = false;
6603 		val = 0;
6604 	}
6605 
6606 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6607 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
6608 		*pap = pa;
6609 	}
6610 	PMAP_UNLOCK(pmap);
6611 	return (val);
6612 }
6613 
6614 /*
6615  * Garbage collect every ASID that is neither active on a processor nor
6616  * reserved.
6617  */
6618 static void
pmap_reset_asid_set(pmap_t pmap)6619 pmap_reset_asid_set(pmap_t pmap)
6620 {
6621 	pmap_t curpmap;
6622 	int asid, cpuid, epoch;
6623 	struct asid_set *set;
6624 	enum pmap_stage stage;
6625 
6626 	set = pmap->pm_asid_set;
6627 	stage = pmap->pm_stage;
6628 
6629 	set = pmap->pm_asid_set;
6630 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6631 	mtx_assert(&set->asid_set_mutex, MA_OWNED);
6632 
6633 	/*
6634 	 * Ensure that the store to asid_epoch is globally visible before the
6635 	 * loads from pc_curpmap are performed.
6636 	 */
6637 	epoch = set->asid_epoch + 1;
6638 	if (epoch == INT_MAX)
6639 		epoch = 0;
6640 	set->asid_epoch = epoch;
6641 	dsb(ishst);
6642 	if (stage == PM_STAGE1) {
6643 		__asm __volatile("tlbi vmalle1is");
6644 	} else {
6645 		KASSERT(pmap_clean_stage2_tlbi != NULL,
6646 		    ("%s: Unset stage 2 tlb invalidation callback\n",
6647 		    __func__));
6648 		pmap_clean_stage2_tlbi();
6649 	}
6650 	dsb(ish);
6651 	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
6652 	    set->asid_set_size - 1);
6653 	CPU_FOREACH(cpuid) {
6654 		if (cpuid == curcpu)
6655 			continue;
6656 		if (stage == PM_STAGE1) {
6657 			curpmap = pcpu_find(cpuid)->pc_curpmap;
6658 			PMAP_ASSERT_STAGE1(pmap);
6659 		} else {
6660 			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
6661 			if (curpmap == NULL)
6662 				continue;
6663 			PMAP_ASSERT_STAGE2(pmap);
6664 		}
6665 		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
6666 		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
6667 		if (asid == -1)
6668 			continue;
6669 		bit_set(set->asid_set, asid);
6670 		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
6671 	}
6672 }
6673 
6674 /*
6675  * Allocate a new ASID for the specified pmap.
6676  */
6677 static void
pmap_alloc_asid(pmap_t pmap)6678 pmap_alloc_asid(pmap_t pmap)
6679 {
6680 	struct asid_set *set;
6681 	int new_asid;
6682 
6683 	set = pmap->pm_asid_set;
6684 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6685 
6686 	mtx_lock_spin(&set->asid_set_mutex);
6687 
6688 	/*
6689 	 * While this processor was waiting to acquire the asid set mutex,
6690 	 * pmap_reset_asid_set() running on another processor might have
6691 	 * updated this pmap's cookie to the current epoch.  In which case, we
6692 	 * don't need to allocate a new ASID.
6693 	 */
6694 	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
6695 		goto out;
6696 
6697 	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
6698 	    &new_asid);
6699 	if (new_asid == -1) {
6700 		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
6701 		    set->asid_next, &new_asid);
6702 		if (new_asid == -1) {
6703 			pmap_reset_asid_set(pmap);
6704 			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
6705 			    set->asid_set_size, &new_asid);
6706 			KASSERT(new_asid != -1, ("ASID allocation failure"));
6707 		}
6708 	}
6709 	bit_set(set->asid_set, new_asid);
6710 	set->asid_next = new_asid + 1;
6711 	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
6712 out:
6713 	mtx_unlock_spin(&set->asid_set_mutex);
6714 }
6715 
6716 /*
6717  * Compute the value that should be stored in ttbr0 to activate the specified
6718  * pmap.  This value may change from time to time.
6719  */
6720 uint64_t
pmap_to_ttbr0(pmap_t pmap)6721 pmap_to_ttbr0(pmap_t pmap)
6722 {
6723 
6724 	return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) |
6725 	    pmap->pm_ttbr);
6726 }
6727 
6728 static bool
pmap_activate_int(pmap_t pmap)6729 pmap_activate_int(pmap_t pmap)
6730 {
6731 	struct asid_set *set;
6732 	int epoch;
6733 
6734 	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
6735 	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
6736 
6737 	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
6738 	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
6739 		/*
6740 		 * Handle the possibility that the old thread was preempted
6741 		 * after an "ic" or "tlbi" instruction but before it performed
6742 		 * a "dsb" instruction.  If the old thread migrates to a new
6743 		 * processor, its completion of a "dsb" instruction on that
6744 		 * new processor does not guarantee that the "ic" or "tlbi"
6745 		 * instructions performed on the old processor have completed.
6746 		 */
6747 		dsb(ish);
6748 		return (false);
6749 	}
6750 
6751 	set = pmap->pm_asid_set;
6752 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6753 
6754 	/*
6755 	 * Ensure that the store to curpmap is globally visible before the
6756 	 * load from asid_epoch is performed.
6757 	 */
6758 	if (pmap->pm_stage == PM_STAGE1)
6759 		PCPU_SET(curpmap, pmap);
6760 	else
6761 		PCPU_SET(curvmpmap, pmap);
6762 	dsb(ish);
6763 	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
6764 	if (epoch >= 0 && epoch != set->asid_epoch)
6765 		pmap_alloc_asid(pmap);
6766 
6767 	if (pmap->pm_stage == PM_STAGE1) {
6768 		set_ttbr0(pmap_to_ttbr0(pmap));
6769 		if (PCPU_GET(bcast_tlbi_workaround) != 0)
6770 			invalidate_local_icache();
6771 	}
6772 	return (true);
6773 }
6774 
6775 void
pmap_activate_vm(pmap_t pmap)6776 pmap_activate_vm(pmap_t pmap)
6777 {
6778 
6779 	PMAP_ASSERT_STAGE2(pmap);
6780 
6781 	(void)pmap_activate_int(pmap);
6782 }
6783 
6784 void
pmap_activate(struct thread * td)6785 pmap_activate(struct thread *td)
6786 {
6787 	pmap_t	pmap;
6788 
6789 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6790 	PMAP_ASSERT_STAGE1(pmap);
6791 	critical_enter();
6792 	(void)pmap_activate_int(pmap);
6793 	critical_exit();
6794 }
6795 
6796 /*
6797  * To eliminate the unused parameter "old", we would have to add an instruction
6798  * to cpu_switch().
6799  */
6800 struct pcb *
pmap_switch(struct thread * old __unused,struct thread * new)6801 pmap_switch(struct thread *old __unused, struct thread *new)
6802 {
6803 	pcpu_bp_harden bp_harden;
6804 	struct pcb *pcb;
6805 
6806 	/* Store the new curthread */
6807 	PCPU_SET(curthread, new);
6808 #if defined(PERTHREAD_SSP)
6809 	/* Set the new threads SSP canary */
6810 	__asm("msr	sp_el0, %0" :: "r"(&new->td_md.md_canary));
6811 #endif
6812 
6813 	/* And the new pcb */
6814 	pcb = new->td_pcb;
6815 	PCPU_SET(curpcb, pcb);
6816 
6817 	/*
6818 	 * TODO: We may need to flush the cache here if switching
6819 	 * to a user process.
6820 	 */
6821 
6822 	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
6823 		/*
6824 		 * Stop userspace from training the branch predictor against
6825 		 * other processes. This will call into a CPU specific
6826 		 * function that clears the branch predictor state.
6827 		 */
6828 		bp_harden = PCPU_GET(bp_harden);
6829 		if (bp_harden != NULL)
6830 			bp_harden();
6831 	}
6832 
6833 	return (pcb);
6834 }
6835 
6836 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)6837 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
6838 {
6839 
6840 	PMAP_ASSERT_STAGE1(pmap);
6841 	KASSERT(ADDR_IS_CANONICAL(va),
6842 	    ("%s: Address not in canonical form: %lx", __func__, va));
6843 
6844 	if (ADDR_IS_KERNEL(va)) {
6845 		cpu_icache_sync_range(va, sz);
6846 	} else {
6847 		u_int len, offset;
6848 		vm_paddr_t pa;
6849 
6850 		/* Find the length of data in this page to flush */
6851 		offset = va & PAGE_MASK;
6852 		len = imin(PAGE_SIZE - offset, sz);
6853 
6854 		while (sz != 0) {
6855 			/* Extract the physical address & find it in the DMAP */
6856 			pa = pmap_extract(pmap, va);
6857 			if (pa != 0)
6858 				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
6859 
6860 			/* Move to the next page */
6861 			sz -= len;
6862 			va += len;
6863 			/* Set the length for the next iteration */
6864 			len = imin(PAGE_SIZE, sz);
6865 		}
6866 	}
6867 }
6868 
6869 static int
pmap_stage2_fault(pmap_t pmap,uint64_t esr,uint64_t far)6870 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
6871 {
6872 	pd_entry_t *pdep;
6873 	pt_entry_t *ptep, pte;
6874 	int rv, lvl, dfsc;
6875 
6876 	PMAP_ASSERT_STAGE2(pmap);
6877 	rv = KERN_FAILURE;
6878 
6879 	/* Data and insn aborts use same encoding for FSC field. */
6880 	dfsc = esr & ISS_DATA_DFSC_MASK;
6881 	switch (dfsc) {
6882 	case ISS_DATA_DFSC_TF_L0:
6883 	case ISS_DATA_DFSC_TF_L1:
6884 	case ISS_DATA_DFSC_TF_L2:
6885 	case ISS_DATA_DFSC_TF_L3:
6886 		PMAP_LOCK(pmap);
6887 		pdep = pmap_pde(pmap, far, &lvl);
6888 		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
6889 			PMAP_LOCK(pmap);
6890 			break;
6891 		}
6892 
6893 		switch (lvl) {
6894 		case 0:
6895 			ptep = pmap_l0_to_l1(pdep, far);
6896 			break;
6897 		case 1:
6898 			ptep = pmap_l1_to_l2(pdep, far);
6899 			break;
6900 		case 2:
6901 			ptep = pmap_l2_to_l3(pdep, far);
6902 			break;
6903 		default:
6904 			panic("%s: Invalid pde level %d", __func__,lvl);
6905 		}
6906 		goto fault_exec;
6907 
6908 	case ISS_DATA_DFSC_AFF_L1:
6909 	case ISS_DATA_DFSC_AFF_L2:
6910 	case ISS_DATA_DFSC_AFF_L3:
6911 		PMAP_LOCK(pmap);
6912 		ptep = pmap_pte(pmap, far, &lvl);
6913 fault_exec:
6914 		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
6915 			if (icache_vmid) {
6916 				pmap_invalidate_vpipt_icache();
6917 			} else {
6918 				/*
6919 				 * If accessing an executable page invalidate
6920 				 * the I-cache so it will be valid when we
6921 				 * continue execution in the guest. The D-cache
6922 				 * is assumed to already be clean to the Point
6923 				 * of Coherency.
6924 				 */
6925 				if ((pte & ATTR_S2_XN_MASK) !=
6926 				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
6927 					invalidate_icache();
6928 				}
6929 			}
6930 			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
6931 			rv = KERN_SUCCESS;
6932 		}
6933 		PMAP_UNLOCK(pmap);
6934 		break;
6935 	}
6936 
6937 	return (rv);
6938 }
6939 
6940 int
pmap_fault(pmap_t pmap,uint64_t esr,uint64_t far)6941 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
6942 {
6943 	pt_entry_t pte, *ptep;
6944 	register_t intr;
6945 	uint64_t ec, par;
6946 	int lvl, rv;
6947 
6948 	rv = KERN_FAILURE;
6949 
6950 	ec = ESR_ELx_EXCEPTION(esr);
6951 	switch (ec) {
6952 	case EXCP_INSN_ABORT_L:
6953 	case EXCP_INSN_ABORT:
6954 	case EXCP_DATA_ABORT_L:
6955 	case EXCP_DATA_ABORT:
6956 		break;
6957 	default:
6958 		return (rv);
6959 	}
6960 
6961 	if (pmap->pm_stage == PM_STAGE2)
6962 		return (pmap_stage2_fault(pmap, esr, far));
6963 
6964 	/* Data and insn aborts use same encoding for FSC field. */
6965 	switch (esr & ISS_DATA_DFSC_MASK) {
6966 	case ISS_DATA_DFSC_AFF_L1:
6967 	case ISS_DATA_DFSC_AFF_L2:
6968 	case ISS_DATA_DFSC_AFF_L3:
6969 		PMAP_LOCK(pmap);
6970 		ptep = pmap_pte(pmap, far, &lvl);
6971 		if (ptep != NULL) {
6972 			pmap_set_bits(ptep, ATTR_AF);
6973 			rv = KERN_SUCCESS;
6974 			/*
6975 			 * XXXMJ as an optimization we could mark the entry
6976 			 * dirty if this is a write fault.
6977 			 */
6978 		}
6979 		PMAP_UNLOCK(pmap);
6980 		break;
6981 	case ISS_DATA_DFSC_PF_L1:
6982 	case ISS_DATA_DFSC_PF_L2:
6983 	case ISS_DATA_DFSC_PF_L3:
6984 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
6985 		    (esr & ISS_DATA_WnR) == 0)
6986 			return (rv);
6987 		PMAP_LOCK(pmap);
6988 		ptep = pmap_pte(pmap, far, &lvl);
6989 		if (ptep != NULL &&
6990 		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
6991 			if ((pte & ATTR_S1_AP_RW_BIT) ==
6992 			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
6993 				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
6994 				pmap_invalidate_page(pmap, far);
6995 			}
6996 			rv = KERN_SUCCESS;
6997 		}
6998 		PMAP_UNLOCK(pmap);
6999 		break;
7000 	case ISS_DATA_DFSC_TF_L0:
7001 	case ISS_DATA_DFSC_TF_L1:
7002 	case ISS_DATA_DFSC_TF_L2:
7003 	case ISS_DATA_DFSC_TF_L3:
7004 		/*
7005 		 * Retry the translation.  A break-before-make sequence can
7006 		 * produce a transient fault.
7007 		 */
7008 		if (pmap == kernel_pmap) {
7009 			/*
7010 			 * The translation fault may have occurred within a
7011 			 * critical section.  Therefore, we must check the
7012 			 * address without acquiring the kernel pmap's lock.
7013 			 */
7014 			if (pmap_klookup(far, NULL))
7015 				rv = KERN_SUCCESS;
7016 		} else {
7017 			PMAP_LOCK(pmap);
7018 			/* Ask the MMU to check the address. */
7019 			intr = intr_disable();
7020 			par = arm64_address_translate_s1e0r(far);
7021 			intr_restore(intr);
7022 			PMAP_UNLOCK(pmap);
7023 
7024 			/*
7025 			 * If the translation was successful, then we can
7026 			 * return success to the trap handler.
7027 			 */
7028 			if (PAR_SUCCESS(par))
7029 				rv = KERN_SUCCESS;
7030 		}
7031 		break;
7032 	}
7033 
7034 	return (rv);
7035 }
7036 
7037 /*
7038  *	Increase the starting virtual address of the given mapping if a
7039  *	different alignment might result in more superpage mappings.
7040  */
7041 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)7042 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7043     vm_offset_t *addr, vm_size_t size)
7044 {
7045 	vm_offset_t superpage_offset;
7046 
7047 	if (size < L2_SIZE)
7048 		return;
7049 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7050 		offset += ptoa(object->pg_color);
7051 	superpage_offset = offset & L2_OFFSET;
7052 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
7053 	    (*addr & L2_OFFSET) == superpage_offset)
7054 		return;
7055 	if ((*addr & L2_OFFSET) < superpage_offset)
7056 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
7057 	else
7058 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
7059 }
7060 
7061 /**
7062  * Get the kernel virtual address of a set of physical pages. If there are
7063  * physical addresses not covered by the DMAP perform a transient mapping
7064  * that will be removed when calling pmap_unmap_io_transient.
7065  *
7066  * \param page        The pages the caller wishes to obtain the virtual
7067  *                    address on the kernel memory map.
7068  * \param vaddr       On return contains the kernel virtual memory address
7069  *                    of the pages passed in the page parameter.
7070  * \param count       Number of pages passed in.
7071  * \param can_fault   TRUE if the thread using the mapped pages can take
7072  *                    page faults, FALSE otherwise.
7073  *
7074  * \returns TRUE if the caller must call pmap_unmap_io_transient when
7075  *          finished or FALSE otherwise.
7076  *
7077  */
7078 boolean_t
pmap_map_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,boolean_t can_fault)7079 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7080     boolean_t can_fault)
7081 {
7082 	vm_paddr_t paddr;
7083 	boolean_t needs_mapping;
7084 	int error, i;
7085 
7086 	/*
7087 	 * Allocate any KVA space that we need, this is done in a separate
7088 	 * loop to prevent calling vmem_alloc while pinned.
7089 	 */
7090 	needs_mapping = FALSE;
7091 	for (i = 0; i < count; i++) {
7092 		paddr = VM_PAGE_TO_PHYS(page[i]);
7093 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
7094 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
7095 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
7096 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7097 			needs_mapping = TRUE;
7098 		} else {
7099 			vaddr[i] = PHYS_TO_DMAP(paddr);
7100 		}
7101 	}
7102 
7103 	/* Exit early if everything is covered by the DMAP */
7104 	if (!needs_mapping)
7105 		return (FALSE);
7106 
7107 	if (!can_fault)
7108 		sched_pin();
7109 	for (i = 0; i < count; i++) {
7110 		paddr = VM_PAGE_TO_PHYS(page[i]);
7111 		if (!PHYS_IN_DMAP(paddr)) {
7112 			panic(
7113 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
7114 		}
7115 	}
7116 
7117 	return (needs_mapping);
7118 }
7119 
7120 void
pmap_unmap_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,boolean_t can_fault)7121 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7122     boolean_t can_fault)
7123 {
7124 	vm_paddr_t paddr;
7125 	int i;
7126 
7127 	if (!can_fault)
7128 		sched_unpin();
7129 	for (i = 0; i < count; i++) {
7130 		paddr = VM_PAGE_TO_PHYS(page[i]);
7131 		if (!PHYS_IN_DMAP(paddr)) {
7132 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
7133 		}
7134 	}
7135 }
7136 
7137 boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)7138 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
7139 {
7140 
7141 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
7142 }
7143 
7144 /*
7145  * Track a range of the kernel's virtual address space that is contiguous
7146  * in various mapping attributes.
7147  */
7148 struct pmap_kernel_map_range {
7149 	vm_offset_t sva;
7150 	pt_entry_t attrs;
7151 	int l3pages;
7152 	int l3contig;
7153 	int l2blocks;
7154 	int l1blocks;
7155 };
7156 
7157 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)7158 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
7159     vm_offset_t eva)
7160 {
7161 	const char *mode;
7162 	int index;
7163 
7164 	if (eva <= range->sva)
7165 		return;
7166 
7167 	index = range->attrs & ATTR_S1_IDX_MASK;
7168 	switch (index) {
7169 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
7170 		mode = "DEV";
7171 		break;
7172 	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
7173 		mode = "UC";
7174 		break;
7175 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
7176 		mode = "WB";
7177 		break;
7178 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
7179 		mode = "WT";
7180 		break;
7181 	default:
7182 		printf(
7183 		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
7184 		    __func__, index, range->sva, eva);
7185 		mode = "??";
7186 		break;
7187 	}
7188 
7189 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %3s %d %d %d %d\n",
7190 	    range->sva, eva,
7191 	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
7192 	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
7193 	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
7194 	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
7195 	    mode, range->l1blocks, range->l2blocks, range->l3contig,
7196 	    range->l3pages);
7197 
7198 	/* Reset to sentinel value. */
7199 	range->sva = 0xfffffffffffffffful;
7200 }
7201 
7202 /*
7203  * Determine whether the attributes specified by a page table entry match those
7204  * being tracked by the current range.
7205  */
7206 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)7207 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
7208 {
7209 
7210 	return (range->attrs == attrs);
7211 }
7212 
7213 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)7214 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
7215     pt_entry_t attrs)
7216 {
7217 
7218 	memset(range, 0, sizeof(*range));
7219 	range->sva = va;
7220 	range->attrs = attrs;
7221 }
7222 
7223 /* Get the block/page attributes that correspond to the table attributes */
7224 static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)7225 sysctl_kmaps_table_attrs(pd_entry_t table)
7226 {
7227 	pt_entry_t attrs;
7228 
7229 	attrs = 0;
7230 	if ((table & TATTR_UXN_TABLE) != 0)
7231 		attrs |= ATTR_S1_UXN;
7232 	if ((table & TATTR_PXN_TABLE) != 0)
7233 		attrs |= ATTR_S1_PXN;
7234 	if ((table & TATTR_AP_TABLE_RO) != 0)
7235 		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
7236 
7237 	return (attrs);
7238 }
7239 
7240 /* Read the block/page attributes we care about */
7241 static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)7242 sysctl_kmaps_block_attrs(pt_entry_t block)
7243 {
7244 	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK));
7245 }
7246 
7247 /*
7248  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
7249  * those of the current run, dump the address range and its attributes, and
7250  * begin a new run.
7251  */
7252 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l0e,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)7253 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
7254     vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
7255     pt_entry_t l3e)
7256 {
7257 	pt_entry_t attrs;
7258 
7259 	attrs = sysctl_kmaps_table_attrs(l0e);
7260 
7261 	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7262 		attrs |= sysctl_kmaps_block_attrs(l1e);
7263 		goto done;
7264 	}
7265 	attrs |= sysctl_kmaps_table_attrs(l1e);
7266 
7267 	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7268 		attrs |= sysctl_kmaps_block_attrs(l2e);
7269 		goto done;
7270 	}
7271 	attrs |= sysctl_kmaps_table_attrs(l2e);
7272 	attrs |= sysctl_kmaps_block_attrs(l3e);
7273 
7274 done:
7275 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
7276 		sysctl_kmaps_dump(sb, range, va);
7277 		sysctl_kmaps_reinit(range, va, attrs);
7278 	}
7279 }
7280 
7281 static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)7282 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
7283 {
7284 	struct pmap_kernel_map_range range;
7285 	struct sbuf sbuf, *sb;
7286 	pd_entry_t l0e, *l1, l1e, *l2, l2e;
7287 	pt_entry_t *l3, l3e;
7288 	vm_offset_t sva;
7289 	vm_paddr_t pa;
7290 	int error, i, j, k, l;
7291 
7292 	error = sysctl_wire_old_buffer(req, 0);
7293 	if (error != 0)
7294 		return (error);
7295 	sb = &sbuf;
7296 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
7297 
7298 	/* Sentinel value. */
7299 	range.sva = 0xfffffffffffffffful;
7300 
7301 	/*
7302 	 * Iterate over the kernel page tables without holding the kernel pmap
7303 	 * lock.  Kernel page table pages are never freed, so at worst we will
7304 	 * observe inconsistencies in the output.
7305 	 */
7306 	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
7307 	    i++) {
7308 		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
7309 			sbuf_printf(sb, "\nDirect map:\n");
7310 		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
7311 			sbuf_printf(sb, "\nKernel map:\n");
7312 
7313 		l0e = kernel_pmap->pm_l0[i];
7314 		if ((l0e & ATTR_DESCR_VALID) == 0) {
7315 			sysctl_kmaps_dump(sb, &range, sva);
7316 			sva += L0_SIZE;
7317 			continue;
7318 		}
7319 		pa = l0e & ~ATTR_MASK;
7320 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
7321 
7322 		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
7323 			l1e = l1[j];
7324 			if ((l1e & ATTR_DESCR_VALID) == 0) {
7325 				sysctl_kmaps_dump(sb, &range, sva);
7326 				sva += L1_SIZE;
7327 				continue;
7328 			}
7329 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
7330 				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
7331 				    0, 0);
7332 				range.l1blocks++;
7333 				sva += L1_SIZE;
7334 				continue;
7335 			}
7336 			pa = l1e & ~ATTR_MASK;
7337 			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
7338 
7339 			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
7340 				l2e = l2[k];
7341 				if ((l2e & ATTR_DESCR_VALID) == 0) {
7342 					sysctl_kmaps_dump(sb, &range, sva);
7343 					sva += L2_SIZE;
7344 					continue;
7345 				}
7346 				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
7347 					sysctl_kmaps_check(sb, &range, sva,
7348 					    l0e, l1e, l2e, 0);
7349 					range.l2blocks++;
7350 					sva += L2_SIZE;
7351 					continue;
7352 				}
7353 				pa = l2e & ~ATTR_MASK;
7354 				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
7355 
7356 				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
7357 				    l++, sva += L3_SIZE) {
7358 					l3e = l3[l];
7359 					if ((l3e & ATTR_DESCR_VALID) == 0) {
7360 						sysctl_kmaps_dump(sb, &range,
7361 						    sva);
7362 						continue;
7363 					}
7364 					sysctl_kmaps_check(sb, &range, sva,
7365 					    l0e, l1e, l2e, l3e);
7366 					if ((l3e & ATTR_CONTIGUOUS) != 0)
7367 						range.l3contig += l % 16 == 0 ?
7368 						    1 : 0;
7369 					else
7370 						range.l3pages++;
7371 				}
7372 			}
7373 		}
7374 	}
7375 
7376 	error = sbuf_finish(sb);
7377 	sbuf_delete(sb);
7378 	return (error);
7379 }
7380 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
7381     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
7382     NULL, 0, sysctl_kmaps, "A",
7383     "Dump kernel address layout");
7384