1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <[email protected]>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
53 */
54 /*-
55 * Copyright (c) 2003 Networks Associates Technology, Inc.
56 * All rights reserved.
57 *
58 * This software was developed for the FreeBSD Project by Jake Burkholder,
59 * Safeport Network Services, and Network Associates Laboratories, the
60 * Security Research Division of Network Associates, Inc. under
61 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62 * CHATS research program.
63 *
64 * Redistribution and use in source and binary forms, with or without
65 * modification, are permitted provided that the following conditions
66 * are met:
67 * 1. Redistributions of source code must retain the above copyright
68 * notice, this list of conditions and the following disclaimer.
69 * 2. Redistributions in binary form must reproduce the above copyright
70 * notice, this list of conditions and the following disclaimer in the
71 * documentation and/or other materials provided with the distribution.
72 *
73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 */
85
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88
89 /*
90 * Manages physical address maps.
91 *
92 * Since the information managed by this module is
93 * also stored by the logical address mapping module,
94 * this module may throw away valid virtual-to-physical
95 * mappings at almost any time. However, invalidations
96 * of virtual-to-physical mappings must be done as
97 * requested.
98 *
99 * In order to cope with hardware architectures which
100 * make virtual-to-physical map invalidates expensive,
101 * this module may delay invalidate or reduced protection
102 * operations until such time as they are actually
103 * necessary. This module is given full information as
104 * to which processors are currently using which maps,
105 * and to when physical maps must be made correct.
106 */
107
108 #include "opt_vm.h"
109
110 #include <sys/param.h>
111 #include <sys/bitstring.h>
112 #include <sys/bus.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/ktr.h>
116 #include <sys/limits.h>
117 #include <sys/lock.h>
118 #include <sys/malloc.h>
119 #include <sys/mman.h>
120 #include <sys/msgbuf.h>
121 #include <sys/mutex.h>
122 #include <sys/physmem.h>
123 #include <sys/proc.h>
124 #include <sys/rwlock.h>
125 #include <sys/sbuf.h>
126 #include <sys/sx.h>
127 #include <sys/vmem.h>
128 #include <sys/vmmeter.h>
129 #include <sys/sched.h>
130 #include <sys/sysctl.h>
131 #include <sys/_unrhdr.h>
132 #include <sys/smp.h>
133
134 #include <vm/vm.h>
135 #include <vm/vm_param.h>
136 #include <vm/vm_kern.h>
137 #include <vm/vm_page.h>
138 #include <vm/vm_map.h>
139 #include <vm/vm_object.h>
140 #include <vm/vm_extern.h>
141 #include <vm/vm_pageout.h>
142 #include <vm/vm_pager.h>
143 #include <vm/vm_phys.h>
144 #include <vm/vm_radix.h>
145 #include <vm/vm_reserv.h>
146 #include <vm/vm_dumpset.h>
147 #include <vm/uma.h>
148
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152
/* Assert (via MPASS) that a pmap is a stage 1 or, respectively, stage 2 pmap. */
#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

/* Number of entries that fit in a single page of each page table level. */
#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

/*
 * Cumulative entry counts: NUL1E is the number of L1 entries reachable
 * from NUL0E L0 entries, NUL2E the number of L2 entries reachable from
 * those.  They are used as bases by the pmap_l*_pindex() macros.
 */
#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)
164
/*
 * PMAP_INLINE expands to an inline specifier unless DIAGNOSTIC is defined,
 * in which case it expands to nothing.
 */
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

/*
 * PV_STAT(x) executes the statement "x" only when PV_STATS is defined;
 * otherwise it expands to an empty statement and "x" is not evaluated.
 */
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif
180
/*
 * Map an address to an index by shifting it by the level's shift and adding
 * a per-level base, so that the three index ranges do not overlap:
 * [0, NUL2E) for L2, then NUL1E L1 indices, then the L0 indices.
 * NOTE(review): presumably these are the pindex values of page table pages
 * (see the "ptepindex" argument of _pmap_alloc_l3()) -- confirm.
 */
#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
184
185 static struct md_page *
pa_to_pvh(vm_paddr_t pa)186 pa_to_pvh(vm_paddr_t pa)
187 {
188 struct vm_phys_seg *seg;
189 int segind;
190
191 for (segind = 0; segind < vm_phys_nsegs; segind++) {
192 seg = &vm_phys_segs[segind];
193 if (pa >= seg->start && pa < seg->end)
194 return ((struct md_page *)seg->md_first +
195 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
196 }
197 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
198 }
199
200 static struct md_page *
page_to_pvh(vm_page_t m)201 page_to_pvh(vm_page_t m)
202 {
203 struct vm_phys_seg *seg;
204
205 seg = &vm_phys_segs[m->segind];
206 return ((struct md_page *)seg->md_first +
207 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
208 }
209
/* Size of the pv_list_locks[] pool. */
#define	NPV_LIST_LOCKS	MAXCPU

/* Select the PV list lock for physical address "pa" (pa_index modulo pool). */
#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

/*
 * Make "*lockp" refer to the write-locked PV list lock for "pa".  If
 * "*lockp" already is that lock nothing is done; otherwise any lock
 * currently recorded in "*lockp" is write-unlocked before the new lock is
 * recorded and write-locked.
 */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

/* As CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by the page's physical address. */
#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

/* Write-unlock the lock recorded in "*lockp", if any, and reset it to NULL. */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

/* Select the PV list lock for page "m". */
#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
242
/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

/* Storage for the kernel's pmap; its fields are set up in pmap_bootstrap(). */
struct pmap kernel_pmap_store;
254
/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
/* Total VA space reserved for pre-init mappings: one L2 block per slot. */
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;		/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;	/* physical address; presumably of the mapped range -- confirm */
	vm_offset_t	va;	/* virtual address; presumably where it is mapped -- confirm */
	vm_size_t	size;	/* size; presumably of the mapped range -- confirm */
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;
275
/*
 * Data for the pv entry allocation mechanism.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
/* Pool of locks indexed through PHYS_TO_PV_LIST_LOCK(). */
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/* These three are (re)computed by pmap_bootstrap_dmap(). */
vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);

/* Number of L0-sized regions, and so pages of pagetable_dmap[], in the DMAP. */
#define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
extern pt_entry_t pagetable_dmap[];

/*
 * physmap[] holds (start, end) physical address pairs filled in by
 * physmem_avail() in pmap_bootstrap(); physmap_idx is the number of pairs.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;
299
/* Root of the vm.pmap sysctl tree. */
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * that it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 */
struct asid_set {
	int asid_bits;			/* number of bits in an ID */
	bitstr_t *asid_set;		/* bit vector of allocated IDs */
	int asid_set_size;		/* presumably the bit vector's size -- confirm */
	int asid_next;			/* last allocated ID plus one */
	int asid_epoch;			/* current epoch number */
	struct mtx asid_set_mutex;	/* presumably guards this set -- confirm */
};

/* One allocator instance for ASIDs and one for VMIDs. */
static struct asid_set asids;
static struct asid_set vmids;

/* Read-only vm.pmap.asid.* sysctls exposing the ASID allocator state. */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");
334
335 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
336 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
337 "The number of bits in an VMID");
338 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
339 "The last allocated VMID plus one");
340 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
341 "The current epoch number");
342
/*
 * Global hook pointers; nothing in this part of the file assigns them.
 * NOTE(review): judging by the names these are stage 2 TLB and VPIPT
 * instruction cache maintenance callbacks installed by other code --
 * confirm who sets them and whether they may be NULL when called.
 */
void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_invalidate_vpipt_icache)(void);
345
/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 *
 * Layout: the ASID occupies the low 32 bits and the epoch the high 32 bits.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

/*
 * TLBI_VA() converts an address into a 44-bit field by dropping its low
 * TLBI_VA_SHIFT bits; TLBI_VA_L3_INCR is the step, in those units, between
 * consecutive L3 pages.
 */
#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
#define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)
367
/* Non-zero when large page mappings are enabled; see pmap_ps_enabled(). */
static int superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
378
/* Forward declarations: PV entry and PV chunk management. */
static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

/* Forward declarations: mapping, demotion, removal and ASID helpers. */
static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

/* Forward declarations: page table page allocation and release. */
static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
415
/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 *
 * "table" is a pointer to a 64-bit page table entry.  pmap_load() is a plain
 * dereference; its argument is parenthesized so that an expression such as
 * "base + idx" is dereferenced as a whole.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*(table))
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
428
429 /********************/
430 /* Inline functions */
431 /********************/
432
433 static __inline void
pagecopy(void * s,void * d)434 pagecopy(void *s, void *d)
435 {
436
437 memcpy(d, s, PAGE_SIZE);
438 }
439
440 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)441 pmap_l0(pmap_t pmap, vm_offset_t va)
442 {
443
444 return (&pmap->pm_l0[pmap_l0_index(va)]);
445 }
446
447 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)448 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
449 {
450 pd_entry_t *l1;
451
452 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
453 return (&l1[pmap_l1_index(va)]);
454 }
455
456 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)457 pmap_l1(pmap_t pmap, vm_offset_t va)
458 {
459 pd_entry_t *l0;
460
461 l0 = pmap_l0(pmap, va);
462 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
463 return (NULL);
464
465 return (pmap_l0_to_l1(l0, va));
466 }
467
468 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)469 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
470 {
471 pd_entry_t l1, *l2p;
472
473 l1 = pmap_load(l1p);
474
475 KASSERT(ADDR_IS_CANONICAL(va),
476 ("%s: Address not in canonical form: %lx", __func__, va));
477 /*
478 * The valid bit may be clear if pmap_update_entry() is concurrently
479 * modifying the entry, so for KVA only the entry type may be checked.
480 */
481 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
482 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
483 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
484 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
485 l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
486 return (&l2p[pmap_l2_index(va)]);
487 }
488
489 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)490 pmap_l2(pmap_t pmap, vm_offset_t va)
491 {
492 pd_entry_t *l1;
493
494 l1 = pmap_l1(pmap, va);
495 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
496 return (NULL);
497
498 return (pmap_l1_to_l2(l1, va));
499 }
500
501 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)502 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
503 {
504 pd_entry_t l2;
505 pt_entry_t *l3p;
506
507 l2 = pmap_load(l2p);
508
509 KASSERT(ADDR_IS_CANONICAL(va),
510 ("%s: Address not in canonical form: %lx", __func__, va));
511 /*
512 * The valid bit may be clear if pmap_update_entry() is concurrently
513 * modifying the entry, so for KVA only the entry type may be checked.
514 */
515 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
516 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
517 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
518 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
519 l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
520 return (&l3p[pmap_l3_index(va)]);
521 }
522
523 /*
524 * Returns the lowest valid pde for a given virtual address.
525 * The next level may or may not point to a valid page or block.
526 */
527 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)528 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
529 {
530 pd_entry_t *l0, *l1, *l2, desc;
531
532 l0 = pmap_l0(pmap, va);
533 desc = pmap_load(l0) & ATTR_DESCR_MASK;
534 if (desc != L0_TABLE) {
535 *level = -1;
536 return (NULL);
537 }
538
539 l1 = pmap_l0_to_l1(l0, va);
540 desc = pmap_load(l1) & ATTR_DESCR_MASK;
541 if (desc != L1_TABLE) {
542 *level = 0;
543 return (l0);
544 }
545
546 l2 = pmap_l1_to_l2(l1, va);
547 desc = pmap_load(l2) & ATTR_DESCR_MASK;
548 if (desc != L2_TABLE) {
549 *level = 1;
550 return (l1);
551 }
552
553 *level = 2;
554 return (l2);
555 }
556
557 /*
558 * Returns the lowest valid pte block or table entry for a given virtual
559 * address. If there are no valid entries return NULL and set the level to
560 * the first invalid level.
561 */
562 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)563 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
564 {
565 pd_entry_t *l1, *l2, desc;
566 pt_entry_t *l3;
567
568 l1 = pmap_l1(pmap, va);
569 if (l1 == NULL) {
570 *level = 0;
571 return (NULL);
572 }
573 desc = pmap_load(l1) & ATTR_DESCR_MASK;
574 if (desc == L1_BLOCK) {
575 *level = 1;
576 return (l1);
577 }
578
579 if (desc != L1_TABLE) {
580 *level = 1;
581 return (NULL);
582 }
583
584 l2 = pmap_l1_to_l2(l1, va);
585 desc = pmap_load(l2) & ATTR_DESCR_MASK;
586 if (desc == L2_BLOCK) {
587 *level = 2;
588 return (l2);
589 }
590
591 if (desc != L2_TABLE) {
592 *level = 2;
593 return (NULL);
594 }
595
596 *level = 3;
597 l3 = pmap_l2_to_l3(l2, va);
598 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
599 return (NULL);
600
601 return (l3);
602 }
603
604 bool
pmap_ps_enabled(pmap_t pmap __unused)605 pmap_ps_enabled(pmap_t pmap __unused)
606 {
607
608 return (superpages_enabled != 0);
609 }
610
611 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)612 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
613 pd_entry_t **l2, pt_entry_t **l3)
614 {
615 pd_entry_t *l0p, *l1p, *l2p;
616
617 if (pmap->pm_l0 == NULL)
618 return (false);
619
620 l0p = pmap_l0(pmap, va);
621 *l0 = l0p;
622
623 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
624 return (false);
625
626 l1p = pmap_l0_to_l1(l0p, va);
627 *l1 = l1p;
628
629 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
630 *l2 = NULL;
631 *l3 = NULL;
632 return (true);
633 }
634
635 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
636 return (false);
637
638 l2p = pmap_l1_to_l2(l1p, va);
639 *l2 = l2p;
640
641 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
642 *l3 = NULL;
643 return (true);
644 }
645
646 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
647 return (false);
648
649 *l3 = pmap_l2_to_l3(l2p, va);
650
651 return (true);
652 }
653
654 static __inline int
pmap_l3_valid(pt_entry_t l3)655 pmap_l3_valid(pt_entry_t l3)
656 {
657
658 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
659 }
660
/*
 * Compile-time check that L1 and L2 block descriptors use the same type
 * encoding.
 * NOTE(review): presumably relied upon by code that handles both block
 * sizes with one constant -- confirm which callers depend on it.
 */
CTASSERT(L1_BLOCK == L2_BLOCK);
662
663 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)664 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
665 {
666 pt_entry_t val;
667
668 if (pmap->pm_stage == PM_STAGE1) {
669 val = ATTR_S1_IDX(memattr);
670 if (memattr == VM_MEMATTR_DEVICE)
671 val |= ATTR_S1_XN;
672 return (val);
673 }
674
675 val = 0;
676
677 switch (memattr) {
678 case VM_MEMATTR_DEVICE:
679 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
680 ATTR_S2_XN(ATTR_S2_XN_ALL));
681 case VM_MEMATTR_UNCACHEABLE:
682 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
683 case VM_MEMATTR_WRITE_BACK:
684 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
685 case VM_MEMATTR_WRITE_THROUGH:
686 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
687 default:
688 panic("%s: invalid memory attribute %x", __func__, memattr);
689 }
690 }
691
692 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)693 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
694 {
695 pt_entry_t val;
696
697 val = 0;
698 if (pmap->pm_stage == PM_STAGE1) {
699 if ((prot & VM_PROT_EXECUTE) == 0)
700 val |= ATTR_S1_XN;
701 if ((prot & VM_PROT_WRITE) == 0)
702 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
703 } else {
704 if ((prot & VM_PROT_WRITE) != 0)
705 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
706 if ((prot & VM_PROT_READ) != 0)
707 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
708 if ((prot & VM_PROT_EXECUTE) == 0)
709 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
710 }
711
712 return (val);
713 }
714
715 /*
716 * Checks if the PTE is dirty.
717 */
718 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)719 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
720 {
721
722 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
723
724 if (pmap->pm_stage == PM_STAGE1) {
725 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
726 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
727
728 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
729 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
730 }
731
732 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
733 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
734 }
735
736 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)737 pmap_resident_count_inc(pmap_t pmap, int count)
738 {
739
740 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
741 pmap->pm_stats.resident_count += count;
742 }
743
744 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)745 pmap_resident_count_dec(pmap_t pmap, int count)
746 {
747
748 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
749 KASSERT(pmap->pm_stats.resident_count >= count,
750 ("pmap %p resident count underflow %ld %d", pmap,
751 pmap->pm_stats.resident_count, count));
752 pmap->pm_stats.resident_count -= count;
753 }
754
755 static vm_paddr_t
pmap_early_vtophys(vm_offset_t l1pt,vm_offset_t va)756 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
757 {
758 vm_paddr_t pa_page;
759
760 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
761 return (pa_page | (va & PAR_LOW_MASK));
762 }
763
764 static vm_offset_t
pmap_bootstrap_dmap(vm_offset_t kern_l1,vm_paddr_t min_pa,vm_offset_t freemempos)765 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
766 vm_offset_t freemempos)
767 {
768 pt_entry_t *l2;
769 vm_offset_t va;
770 vm_paddr_t l2_pa, pa;
771 u_int l1_slot, l2_slot, prev_l1_slot;
772 int i;
773
774 dmap_phys_base = min_pa & ~L1_OFFSET;
775 dmap_phys_max = 0;
776 dmap_max_addr = 0;
777 l2 = NULL;
778 prev_l1_slot = -1;
779
780 #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
781 memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);
782
783 for (i = 0; i < (physmap_idx * 2); i += 2) {
784 pa = physmap[i] & ~L2_OFFSET;
785 va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;
786
787 /* Create L2 mappings at the start of the region */
788 if ((pa & L1_OFFSET) != 0) {
789 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
790 if (l1_slot != prev_l1_slot) {
791 prev_l1_slot = l1_slot;
792 l2 = (pt_entry_t *)freemempos;
793 l2_pa = pmap_early_vtophys(kern_l1,
794 (vm_offset_t)l2);
795 freemempos += PAGE_SIZE;
796
797 pmap_store(&pagetable_dmap[l1_slot],
798 (l2_pa & ~Ln_TABLE_MASK) |
799 TATTR_PXN_TABLE | L1_TABLE);
800
801 memset(l2, 0, PAGE_SIZE);
802 }
803 KASSERT(l2 != NULL,
804 ("pmap_bootstrap_dmap: NULL l2 map"));
805 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
806 pa += L2_SIZE, va += L2_SIZE) {
807 /*
808 * We are on a boundary, stop to
809 * create a level 1 block
810 */
811 if ((pa & L1_OFFSET) == 0)
812 break;
813
814 l2_slot = pmap_l2_index(va);
815 KASSERT(l2_slot != 0, ("..."));
816 pmap_store(&l2[l2_slot],
817 (pa & ~L2_OFFSET) | ATTR_DEFAULT |
818 ATTR_S1_XN |
819 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
820 L2_BLOCK);
821 }
822 KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
823 ("..."));
824 }
825
826 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
827 (physmap[i + 1] - pa) >= L1_SIZE;
828 pa += L1_SIZE, va += L1_SIZE) {
829 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
830 pmap_store(&pagetable_dmap[l1_slot],
831 (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN |
832 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK);
833 }
834
835 /* Create L2 mappings at the end of the region */
836 if (pa < physmap[i + 1]) {
837 l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
838 if (l1_slot != prev_l1_slot) {
839 prev_l1_slot = l1_slot;
840 l2 = (pt_entry_t *)freemempos;
841 l2_pa = pmap_early_vtophys(kern_l1,
842 (vm_offset_t)l2);
843 freemempos += PAGE_SIZE;
844
845 pmap_store(&pagetable_dmap[l1_slot],
846 (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
847
848 memset(l2, 0, PAGE_SIZE);
849 }
850 KASSERT(l2 != NULL,
851 ("pmap_bootstrap_dmap: NULL l2 map"));
852 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
853 pa += L2_SIZE, va += L2_SIZE) {
854 l2_slot = pmap_l2_index(va);
855 pmap_store(&l2[l2_slot],
856 (pa & ~L2_OFFSET) | ATTR_DEFAULT |
857 ATTR_S1_XN |
858 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
859 L2_BLOCK);
860 }
861 }
862
863 if (pa > dmap_phys_max) {
864 dmap_phys_max = pa;
865 dmap_max_addr = va;
866 }
867 }
868
869 cpu_tlb_flushID();
870
871 return (freemempos);
872 }
873
874 static vm_offset_t
pmap_bootstrap_l2(vm_offset_t l1pt,vm_offset_t va,vm_offset_t l2_start)875 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
876 {
877 vm_offset_t l2pt;
878 vm_paddr_t pa;
879 pd_entry_t *l1;
880 u_int l1_slot;
881
882 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
883
884 l1 = (pd_entry_t *)l1pt;
885 l1_slot = pmap_l1_index(va);
886 l2pt = l2_start;
887
888 for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
889 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
890
891 pa = pmap_early_vtophys(l1pt, l2pt);
892 pmap_store(&l1[l1_slot],
893 (pa & ~Ln_TABLE_MASK) | L1_TABLE);
894 l2pt += PAGE_SIZE;
895 }
896
897 /* Clean the L2 page table */
898 memset((void *)l2_start, 0, l2pt - l2_start);
899
900 return l2pt;
901 }
902
903 static vm_offset_t
pmap_bootstrap_l3(vm_offset_t l1pt,vm_offset_t va,vm_offset_t l3_start)904 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
905 {
906 vm_offset_t l3pt;
907 vm_paddr_t pa;
908 pd_entry_t *l2;
909 u_int l2_slot;
910
911 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
912
913 l2 = pmap_l2(kernel_pmap, va);
914 l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
915 l2_slot = pmap_l2_index(va);
916 l3pt = l3_start;
917
918 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
919 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
920
921 pa = pmap_early_vtophys(l1pt, l3pt);
922 pmap_store(&l2[l2_slot],
923 (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE);
924 l3pt += PAGE_SIZE;
925 }
926
927 /* Clean the L2 page table */
928 memset((void *)l3_start, 0, l3pt - l3_start);
929
930 return l3pt;
931 }
932
933 /*
934 * Bootstrap the system enough to run with virtual memory.
935 */
936 void
pmap_bootstrap(vm_offset_t l0pt,vm_offset_t l1pt,vm_paddr_t kernstart,vm_size_t kernlen)937 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
938 vm_size_t kernlen)
939 {
940 vm_offset_t freemempos;
941 vm_offset_t dpcpu, msgbufpv;
942 vm_paddr_t start_pa, pa, min_pa;
943 uint64_t kern_delta;
944 int i;
945
946 /* Verify that the ASID is set through TTBR0. */
947 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
948 ("pmap_bootstrap: TCR_EL1.A1 != 0"));
949
950 kern_delta = KERNBASE - kernstart;
951
952 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
953 printf("%lx\n", l1pt);
954 printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
955
956 /* Set this early so we can use the pagetable walking functions */
957 kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
958 PMAP_LOCK_INIT(kernel_pmap);
959 kernel_pmap->pm_l0_paddr = l0pt - kern_delta;
960 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
961 kernel_pmap->pm_stage = PM_STAGE1;
962 kernel_pmap->pm_levels = 4;
963 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
964 kernel_pmap->pm_asid_set = &asids;
965
966 /* Assume the address we were loaded to is a valid physical address */
967 min_pa = KERNBASE - kern_delta;
968
969 physmap_idx = physmem_avail(physmap, nitems(physmap));
970 physmap_idx /= 2;
971
972 /*
973 * Find the minimum physical address. physmap is sorted,
974 * but may contain empty ranges.
975 */
976 for (i = 0; i < physmap_idx * 2; i += 2) {
977 if (physmap[i] == physmap[i + 1])
978 continue;
979 if (physmap[i] <= min_pa)
980 min_pa = physmap[i];
981 }
982
983 freemempos = KERNBASE + kernlen;
984 freemempos = roundup2(freemempos, PAGE_SIZE);
985
986 /* Create a direct map region early so we can use it for pa -> va */
987 freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);
988
989 start_pa = pa = KERNBASE - kern_delta;
990
991 /*
992 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
993 * loader allocated the first and only l2 page table page used to map
994 * the kernel, preloaded files and module metadata.
995 */
996 freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos);
997 /* And the l3 tables for the early devmap */
998 freemempos = pmap_bootstrap_l3(l1pt,
999 VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);
1000
1001 cpu_tlb_flushID();
1002
1003 #define alloc_pages(var, np) \
1004 (var) = freemempos; \
1005 freemempos += (np * PAGE_SIZE); \
1006 memset((char *)(var), 0, ((np) * PAGE_SIZE));
1007
1008 /* Allocate dynamic per-cpu area. */
1009 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1010 dpcpu_init((void *)dpcpu, 0);
1011
1012 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1013 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1014 msgbufp = (void *)msgbufpv;
1015
1016 /* Reserve some VA space for early BIOS/ACPI mapping */
1017 preinit_map_va = roundup2(freemempos, L2_SIZE);
1018
1019 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1020 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1021 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1022 kernel_vm_end = virtual_avail;
1023
1024 pa = pmap_early_vtophys(l1pt, freemempos);
1025
1026 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1027
1028 cpu_tlb_flushID();
1029 }
1030
1031 /*
1032 * Initialize a vm_page's machine-dependent fields.
1033 */
1034 void
pmap_page_init(vm_page_t m)1035 pmap_page_init(vm_page_t m)
1036 {
1037
1038 TAILQ_INIT(&m->md.pv_list);
1039 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1040 }
1041
1042 static void
pmap_init_asids(struct asid_set * set,int bits)1043 pmap_init_asids(struct asid_set *set, int bits)
1044 {
1045 int i;
1046
1047 set->asid_bits = bits;
1048
1049 /*
1050 * We may be too early in the overall initialization process to use
1051 * bit_alloc().
1052 */
1053 set->asid_set_size = 1 << set->asid_bits;
1054 set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size),
1055 M_WAITOK | M_ZERO);
1056 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1057 bit_set(set->asid_set, i);
1058 set->asid_next = ASID_FIRST_AVAILABLE;
1059 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1060 }
1061
1062 /*
1063 * Initialize the pmap module.
1064 * Called by vm_init, to initialize any structures that the pmap
1065 * system needs to map virtual memory.
1066 */
1067 void
pmap_init(void)1068 pmap_init(void)
1069 {
1070 struct vm_phys_seg *seg, *next_seg;
1071 struct md_page *pvh;
1072 vm_size_t s;
1073 uint64_t mmfr1;
1074 int i, pv_npg, vmid_bits;
1075
1076 /*
1077 * Are large page mappings enabled?
1078 */
1079 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1080 if (superpages_enabled) {
1081 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1082 ("pmap_init: can't assign to pagesizes[1]"));
1083 pagesizes[1] = L2_SIZE;
1084 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1085 ("pmap_init: can't assign to pagesizes[2]"));
1086 pagesizes[2] = L1_SIZE;
1087 }
1088
1089 /*
1090 * Initialize the ASID allocator.
1091 */
1092 pmap_init_asids(&asids,
1093 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1094
1095 if (has_hyp()) {
1096 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1097 vmid_bits = 8;
1098
1099 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1100 ID_AA64MMFR1_VMIDBits_16)
1101 vmid_bits = 16;
1102 pmap_init_asids(&vmids, vmid_bits);
1103 }
1104
1105 /*
1106 * Initialize the pv chunk list mutex.
1107 */
1108 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1109
1110 /*
1111 * Initialize the pool of pv list locks.
1112 */
1113 for (i = 0; i < NPV_LIST_LOCKS; i++)
1114 rw_init(&pv_list_locks[i], "pmap pv list");
1115
1116 /*
1117 * Calculate the size of the pv head table for superpages.
1118 */
1119 pv_npg = 0;
1120 for (i = 0; i < vm_phys_nsegs; i++) {
1121 seg = &vm_phys_segs[i];
1122 pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1123 pmap_l2_pindex(seg->start);
1124 }
1125
1126 /*
1127 * Allocate memory for the pv head table for superpages.
1128 */
1129 s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1130 s = round_page(s);
1131 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
1132 for (i = 0; i < pv_npg; i++)
1133 TAILQ_INIT(&pv_table[i].pv_list);
1134 TAILQ_INIT(&pv_dummy.pv_list);
1135
1136 /*
1137 * Set pointers from vm_phys_segs to pv_table.
1138 */
1139 for (i = 0, pvh = pv_table; i < vm_phys_nsegs; i++) {
1140 seg = &vm_phys_segs[i];
1141 seg->md_first = pvh;
1142 pvh += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1143 pmap_l2_pindex(seg->start);
1144
1145 /*
1146 * If there is a following segment, and the final
1147 * superpage of this segment and the initial superpage
1148 * of the next segment are the same then adjust the
1149 * pv_table entry for that next segment down by one so
1150 * that the pv_table entries will be shared.
1151 */
1152 if (i + 1 < vm_phys_nsegs) {
1153 next_seg = &vm_phys_segs[i + 1];
1154 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1155 pmap_l2_pindex(next_seg->start)) {
1156 pvh--;
1157 }
1158 }
1159 }
1160
1161 vm_initialized = 1;
1162 }
1163
/*
 * Read-only statistics about 2MB (L2) mappings, declared under the "l2"
 * sysctl node whose parent is _vm_pmap.  Only the counters and their
 * sysctl bindings are declared here; the code that updates them lies
 * outside this part of the file.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

/* Exported as "demotions". */
static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0, "2MB page demotions");

/* Exported as "mappings". */
static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0, "2MB page mappings");

/* Exported as "p_failures". */
static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0, "2MB page promotion failures");

/* Exported as "promotions". */
static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0, "2MB page promotions");
1182
1183 /*
1184 * Invalidate a single TLB entry.
1185 */
1186 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va)1187 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1188 {
1189 uint64_t r;
1190
1191 PMAP_ASSERT_STAGE1(pmap);
1192
1193 dsb(ishst);
1194 r = TLBI_VA(va);
1195 if (pmap == kernel_pmap) {
1196 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1197 } else {
1198 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1199 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1200 }
1201 dsb(ish);
1202 isb();
1203 }
1204
1205 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)1206 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1207 {
1208 uint64_t end, r, start;
1209
1210 PMAP_ASSERT_STAGE1(pmap);
1211
1212 dsb(ishst);
1213 if (pmap == kernel_pmap) {
1214 start = TLBI_VA(sva);
1215 end = TLBI_VA(eva);
1216 for (r = start; r < end; r += TLBI_VA_L3_INCR)
1217 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1218 } else {
1219 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1220 start |= TLBI_VA(sva);
1221 end |= TLBI_VA(eva);
1222 for (r = start; r < end; r += TLBI_VA_L3_INCR)
1223 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1224 }
1225 dsb(ish);
1226 isb();
1227 }
1228
1229 static __inline void
pmap_invalidate_all(pmap_t pmap)1230 pmap_invalidate_all(pmap_t pmap)
1231 {
1232 uint64_t r;
1233
1234 PMAP_ASSERT_STAGE1(pmap);
1235
1236 dsb(ishst);
1237 if (pmap == kernel_pmap) {
1238 __asm __volatile("tlbi vmalle1is");
1239 } else {
1240 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1241 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
1242 }
1243 dsb(ish);
1244 isb();
1245 }
1246
1247 /*
1248 * Routine: pmap_extract
1249 * Function:
1250 * Extract the physical page address associated
1251 * with the given map/virtual_address pair.
1252 */
1253 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)1254 pmap_extract(pmap_t pmap, vm_offset_t va)
1255 {
1256 pt_entry_t *pte, tpte;
1257 vm_paddr_t pa;
1258 int lvl;
1259
1260 pa = 0;
1261 PMAP_LOCK(pmap);
1262 /*
1263 * Find the block or page map for this virtual address. pmap_pte
1264 * will return either a valid block/page entry, or NULL.
1265 */
1266 pte = pmap_pte(pmap, va, &lvl);
1267 if (pte != NULL) {
1268 tpte = pmap_load(pte);
1269 pa = tpte & ~ATTR_MASK;
1270 switch(lvl) {
1271 case 1:
1272 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1273 ("pmap_extract: Invalid L1 pte found: %lx",
1274 tpte & ATTR_DESCR_MASK));
1275 pa |= (va & L1_OFFSET);
1276 break;
1277 case 2:
1278 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1279 ("pmap_extract: Invalid L2 pte found: %lx",
1280 tpte & ATTR_DESCR_MASK));
1281 pa |= (va & L2_OFFSET);
1282 break;
1283 case 3:
1284 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1285 ("pmap_extract: Invalid L3 pte found: %lx",
1286 tpte & ATTR_DESCR_MASK));
1287 pa |= (va & L3_OFFSET);
1288 break;
1289 }
1290 }
1291 PMAP_UNLOCK(pmap);
1292 return (pa);
1293 }
1294
1295 /*
1296 * Routine: pmap_extract_and_hold
1297 * Function:
1298 * Atomically extract and hold the physical page
1299 * with the given pmap and virtual address pair
1300 * if that mapping permits the given protection.
1301 */
1302 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)1303 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1304 {
1305 pt_entry_t *pte, tpte;
1306 vm_offset_t off;
1307 vm_page_t m;
1308 int lvl;
1309 bool use;
1310
1311 m = NULL;
1312 PMAP_LOCK(pmap);
1313 pte = pmap_pte(pmap, va, &lvl);
1314 if (pte != NULL) {
1315 tpte = pmap_load(pte);
1316
1317 KASSERT(lvl > 0 && lvl <= 3,
1318 ("pmap_extract_and_hold: Invalid level %d", lvl));
1319 CTASSERT(L1_BLOCK == L2_BLOCK);
1320 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1321 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1322 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1323 tpte & ATTR_DESCR_MASK));
1324
1325 use = false;
1326 if ((prot & VM_PROT_WRITE) == 0)
1327 use = true;
1328 else if (pmap->pm_stage == PM_STAGE1 &&
1329 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1330 use = true;
1331 else if (pmap->pm_stage == PM_STAGE2 &&
1332 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1333 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1334 use = true;
1335
1336 if (use) {
1337 switch (lvl) {
1338 case 1:
1339 off = va & L1_OFFSET;
1340 break;
1341 case 2:
1342 off = va & L2_OFFSET;
1343 break;
1344 case 3:
1345 default:
1346 off = 0;
1347 }
1348 m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1349 if (m != NULL && !vm_page_wire_mapped(m))
1350 m = NULL;
1351 }
1352 }
1353 PMAP_UNLOCK(pmap);
1354 return (m);
1355 }
1356
/*
 * Translate a kernel virtual address to a physical address.
 *
 * The hardware address translation instruction is tried first; if it
 * reports failure the kernel page tables are walked by hand, starting at
 * the L1 entry.
 *
 * va:	kernel virtual address to look up.
 * pa:	where to store the physical address; may be NULL when the caller
 *	only wants to know whether the address is mapped.
 *
 * Returns true if the kva is valid (and stores the physical address in
 * *pa if pa is not NULL), false otherwise.  No pmap lock is taken here.
 */
bool
pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
{
	pt_entry_t *pte, tpte;
	register_t intr;
	uint64_t par;

	/*
	 * Disable interrupts so we don't get interrupted between asking
	 * for address translation, and getting the result back.
	 */
	intr = intr_disable();
	par = arm64_address_translate_s1e1r(va);
	intr_restore(intr);

	if (PAR_SUCCESS(par)) {
		/* Combine the translated frame with the low bits of va. */
		if (pa != NULL)
			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
		return (true);
	}

	/*
	 * Fall back to walking the page table. The address translation
	 * instruction may fail when the page is in a break-before-make
	 * sequence. As we only clear the valid bit in said sequence we
	 * can walk the page table to find the physical address.
	 */

	pte = pmap_l1(kernel_pmap, va);
	if (pte == NULL)
		return (false);

	/*
	 * A concurrent pmap_update_entry() will clear the entry's valid bit
	 * but leave the rest of the entry unchanged. Therefore, we treat a
	 * non-zero entry as being valid, and we ignore the valid bit when
	 * determining whether the entry maps a block, page, or table.
	 */
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L1 block mapping. */
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
		return (true);
	}
	/*
	 * Descend using the local copy of the L1 entry (&tpte) rather than
	 * the in-memory slot, so the value that was just checked is the one
	 * that is followed.
	 */
	pte = pmap_l1_to_l2(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		/* L2 block mapping. */
		if (pa != NULL)
			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
		return (true);
	}
	/* Same approach one level down: follow the copied L2 entry. */
	pte = pmap_l2_to_l3(&tpte, va);
	tpte = pmap_load(pte);
	if (tpte == 0)
		return (false);
	/* Any non-zero L3 entry is taken to be a page mapping. */
	if (pa != NULL)
		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
	return (true);
}
1425
1426 vm_paddr_t
pmap_kextract(vm_offset_t va)1427 pmap_kextract(vm_offset_t va)
1428 {
1429 vm_paddr_t pa;
1430
1431 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
1432 return (DMAP_TO_PHYS(va));
1433
1434 if (pmap_klookup(va, &pa) == false)
1435 return (0);
1436 return (pa);
1437 }
1438
1439 /***************************************************
1440 * Low level mapping routines.....
1441 ***************************************************/
1442
1443 void
pmap_kenter(vm_offset_t sva,vm_size_t size,vm_paddr_t pa,int mode)1444 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1445 {
1446 pd_entry_t *pde;
1447 pt_entry_t *pte, attr;
1448 vm_offset_t va;
1449 int lvl;
1450
1451 KASSERT((pa & L3_OFFSET) == 0,
1452 ("pmap_kenter: Invalid physical address"));
1453 KASSERT((sva & L3_OFFSET) == 0,
1454 ("pmap_kenter: Invalid virtual address"));
1455 KASSERT((size & PAGE_MASK) == 0,
1456 ("pmap_kenter: Mapping is not page-sized"));
1457
1458 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1459 ATTR_S1_IDX(mode) | L3_PAGE;
1460 va = sva;
1461 while (size != 0) {
1462 pde = pmap_pde(kernel_pmap, va, &lvl);
1463 KASSERT(pde != NULL,
1464 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1465 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1466
1467 pte = pmap_l2_to_l3(pde, va);
1468 pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
1469
1470 va += PAGE_SIZE;
1471 pa += PAGE_SIZE;
1472 size -= PAGE_SIZE;
1473 }
1474 pmap_invalidate_range(kernel_pmap, sva, va);
1475 }
1476
1477 void
pmap_kenter_device(vm_offset_t sva,vm_size_t size,vm_paddr_t pa)1478 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1479 {
1480
1481 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1482 }
1483
1484 /*
1485 * Remove a page from the kernel pagetables.
1486 */
1487 PMAP_INLINE void
pmap_kremove(vm_offset_t va)1488 pmap_kremove(vm_offset_t va)
1489 {
1490 pt_entry_t *pte;
1491 int lvl;
1492
1493 pte = pmap_pte(kernel_pmap, va, &lvl);
1494 KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
1495 KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
1496
1497 pmap_clear(pte);
1498 pmap_invalidate_page(kernel_pmap, va);
1499 }
1500
1501 void
pmap_kremove_device(vm_offset_t sva,vm_size_t size)1502 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1503 {
1504 pt_entry_t *pte;
1505 vm_offset_t va;
1506 int lvl;
1507
1508 KASSERT((sva & L3_OFFSET) == 0,
1509 ("pmap_kremove_device: Invalid virtual address"));
1510 KASSERT((size & PAGE_MASK) == 0,
1511 ("pmap_kremove_device: Mapping is not page-sized"));
1512
1513 va = sva;
1514 while (size != 0) {
1515 pte = pmap_pte(kernel_pmap, va, &lvl);
1516 KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
1517 KASSERT(lvl == 3,
1518 ("Invalid device pagetable level: %d != 3", lvl));
1519 pmap_clear(pte);
1520
1521 va += PAGE_SIZE;
1522 size -= PAGE_SIZE;
1523 }
1524 pmap_invalidate_range(kernel_pmap, sva, va);
1525 }
1526
1527 /*
1528 * Used to map a range of physical addresses into kernel
1529 * virtual address space.
1530 *
1531 * The value passed in '*virt' is a suggested virtual address for
1532 * the mapping. Architectures which can support a direct-mapped
1533 * physical to virtual region can return the appropriate address
1534 * within that region, leaving '*virt' unchanged. Other
1535 * architectures should map the pages starting at '*virt' and
1536 * update '*virt' with the first usable address after the mapped
1537 * region.
1538 */
1539 vm_offset_t
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)1540 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1541 {
1542 return PHYS_TO_DMAP(start);
1543 }
1544
1545 /*
1546 * Add a list of wired pages to the kva
1547 * this routine is only used for temporary
1548 * kernel mappings that do not need to have
1549 * page modification or references recorded.
1550 * Note that old mappings are simply written
1551 * over. The page *must* be wired.
1552 * Note: SMP coherent. Uses a ranged shootdown IPI.
1553 */
1554 void
pmap_qenter(vm_offset_t sva,vm_page_t * ma,int count)1555 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1556 {
1557 pd_entry_t *pde;
1558 pt_entry_t *pte, pa;
1559 vm_offset_t va;
1560 vm_page_t m;
1561 int i, lvl;
1562
1563 va = sva;
1564 for (i = 0; i < count; i++) {
1565 pde = pmap_pde(kernel_pmap, va, &lvl);
1566 KASSERT(pde != NULL,
1567 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
1568 KASSERT(lvl == 2,
1569 ("pmap_qenter: Invalid level %d", lvl));
1570
1571 m = ma[i];
1572 pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
1573 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1574 ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
1575 pte = pmap_l2_to_l3(pde, va);
1576 pmap_load_store(pte, pa);
1577
1578 va += L3_SIZE;
1579 }
1580 pmap_invalidate_range(kernel_pmap, sva, va);
1581 }
1582
1583 /*
1584 * This routine tears out page mappings from the
1585 * kernel -- it is meant only for temporary mappings.
1586 */
1587 void
pmap_qremove(vm_offset_t sva,int count)1588 pmap_qremove(vm_offset_t sva, int count)
1589 {
1590 pt_entry_t *pte;
1591 vm_offset_t va;
1592 int lvl;
1593
1594 KASSERT(ADDR_IS_CANONICAL(sva),
1595 ("%s: Address not in canonical form: %lx", __func__, sva));
1596 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
1597
1598 va = sva;
1599 while (count-- > 0) {
1600 pte = pmap_pte(kernel_pmap, va, &lvl);
1601 KASSERT(lvl == 3,
1602 ("Invalid device pagetable level: %d != 3", lvl));
1603 if (pte != NULL) {
1604 pmap_clear(pte);
1605 }
1606
1607 va += PAGE_SIZE;
1608 }
1609 pmap_invalidate_range(kernel_pmap, sva, va);
1610 }
1611
1612 /***************************************************
1613 * Page table page management routines.....
1614 ***************************************************/
1615 /*
1616 * Schedule the specified unused page table page to be freed. Specifically,
1617 * add the page to the specified list of pages that will be released to the
1618 * physical memory manager after the TLB has been updated.
1619 */
1620 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,boolean_t set_PG_ZERO)1621 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1622 boolean_t set_PG_ZERO)
1623 {
1624
1625 if (set_PG_ZERO)
1626 m->flags |= PG_ZERO;
1627 else
1628 m->flags &= ~PG_ZERO;
1629 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1630 }
1631
1632 /*
1633 * Decrements a page table page's reference count, which is used to record the
1634 * number of valid page table entries within the page. If the reference count
1635 * drops to zero, then the page table page is unmapped. Returns TRUE if the
1636 * page table page was unmapped and FALSE otherwise.
1637 */
1638 static inline boolean_t
pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)1639 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1640 {
1641
1642 --m->ref_count;
1643 if (m->ref_count == 0) {
1644 _pmap_unwire_l3(pmap, va, m, free);
1645 return (TRUE);
1646 } else
1647 return (FALSE);
1648 }
1649
/*
 * Unmap a page table page whose reference count has dropped to zero.
 *
 * The level of the page is derived from its pindex: values of at least
 * NUL2E + NUL1E denote an L1 page, values in [NUL2E, NUL2E + NUL1E) an L2
 * page, and anything lower an L3 page.  The entry in the parent table
 * that points to "m" is cleared, the pmap's resident count is reduced,
 * and one reference is dropped from the parent page table page (which may
 * in turn be unmapped through pmap_unwire_l3()).  Finally the TLB entry
 * for "va" is invalidated and "m" is queued on "free".
 *
 * The pmap lock must be held by the caller.
 */
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUL2E + NUL1E)) {
		/* l1 page */
		pd_entry_t *l0;

		l0 = pmap_l0(pmap, va);
		pmap_clear(l0);
	} else if (m->pindex >= NUL2E) {
		/* l2 page */
		pd_entry_t *l1;

		l1 = pmap_l1(pmap, va);
		pmap_clear(l1);
	} else {
		/* l3 page */
		pd_entry_t *l2;

		l2 = pmap_l2(pmap, va);
		pmap_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	/*
	 * Release the reference this page held on its parent.  An L1 page
	 * hangs directly off the L0 table, so nothing is released for it
	 * here.
	 */
	if (m->pindex < NUL2E) {
		/* We just released an l3, unhold the matching l2 */
		pd_entry_t *l1, tl1;
		vm_page_t l2pg;

		l1 = pmap_l1(pmap, va);
		tl1 = pmap_load(l1);
		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l2pg, free);
	} else if (m->pindex < (NUL2E + NUL1E)) {
		/* We just released an l2, unhold the matching l1 */
		pd_entry_t *l0, tl0;
		vm_page_t l1pg;

		l0 = pmap_l0(pmap, va);
		tl0 = pmap_load(l0);
		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l1pg, free);
	}
	pmap_invalidate_page(pmap, va);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}
1705
1706 /*
1707 * After removing a page table entry, this routine is used to
1708 * conditionally free the page, and manage the reference count.
1709 */
1710 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)1711 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1712 struct spglist *free)
1713 {
1714 vm_page_t mpte;
1715
1716 KASSERT(ADDR_IS_CANONICAL(va),
1717 ("%s: Address not in canonical form: %lx", __func__, va));
1718 if (ADDR_IS_KERNEL(va))
1719 return (0);
1720 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1721 mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1722 return (pmap_unwire_l3(pmap, va, mpte, free));
1723 }
1724
1725 /*
1726 * Release a page table page reference after a failed attempt to create a
1727 * mapping.
1728 */
1729 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t mpte)1730 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1731 {
1732 struct spglist free;
1733
1734 SLIST_INIT(&free);
1735 if (pmap_unwire_l3(pmap, va, mpte, &free)) {
1736 /*
1737 * Although "va" was never mapped, the TLB could nonetheless
1738 * have intermediate entries that refer to the freed page
1739 * table pages. Invalidate those entries.
1740 *
1741 * XXX redundant invalidation (See _pmap_unwire_l3().)
1742 */
1743 pmap_invalidate_page(pmap, va);
1744 vm_page_free_pages_toq(&free, true);
1745 }
1746 }
1747
1748 void
pmap_pinit0(pmap_t pmap)1749 pmap_pinit0(pmap_t pmap)
1750 {
1751
1752 PMAP_LOCK_INIT(pmap);
1753 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1754 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
1755 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1756 vm_radix_init(&pmap->pm_root);
1757 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
1758 pmap->pm_stage = PM_STAGE1;
1759 pmap->pm_levels = 4;
1760 pmap->pm_ttbr = pmap->pm_l0_paddr;
1761 pmap->pm_asid_set = &asids;
1762
1763 PCPU_SET(curpmap, pmap);
1764 }
1765
1766 int
pmap_pinit_stage(pmap_t pmap,enum pmap_stage stage,int levels)1767 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
1768 {
1769 vm_page_t m;
1770
1771 /*
1772 * allocate the l0 page
1773 */
1774 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
1775 VM_ALLOC_ZERO);
1776 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
1777 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1778
1779 vm_radix_init(&pmap->pm_root);
1780 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1781 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
1782
1783 MPASS(levels == 3 || levels == 4);
1784 pmap->pm_levels = levels;
1785 pmap->pm_stage = stage;
1786 switch (stage) {
1787 case PM_STAGE1:
1788 pmap->pm_asid_set = &asids;
1789 break;
1790 case PM_STAGE2:
1791 pmap->pm_asid_set = &vmids;
1792 break;
1793 default:
1794 panic("%s: Invalid pmap type %d", __func__, stage);
1795 break;
1796 }
1797
1798 /* XXX Temporarily disable deferred ASID allocation. */
1799 pmap_alloc_asid(pmap);
1800
1801 /*
1802 * Allocate the level 1 entry to use as the root. This will increase
1803 * the refcount on the level 1 page so it won't be removed until
1804 * pmap_release() is called.
1805 */
1806 if (pmap->pm_levels == 3) {
1807 PMAP_LOCK(pmap);
1808 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
1809 PMAP_UNLOCK(pmap);
1810 }
1811 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
1812
1813 return (1);
1814 }
1815
1816 int
pmap_pinit(pmap_t pmap)1817 pmap_pinit(pmap_t pmap)
1818 {
1819
1820 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
1821 }
1822
/*
 * This routine is called if the desired page table page does not exist.
 *
 * pmap:	pmap to add the page table page to; its lock must be held.
 * ptepindex:	identifies the page to create.  Values below NUL2E name an
 *		L3 page, values in [NUL2E, NUL2E + NUL1E) an L2 page, and
 *		values from NUL2E + NUL1E upwards an L1 page.
 * lockp:	optional pv list lock pointer.  If it is non-NULL and the
 *		allocation fails, the pv list lock and the pmap lock are
 *		released while waiting for memory.
 *
 * Returns the new, zeroed and wired page table page after linking it into
 * its parent table, or NULL if a page could not be allocated.  Missing
 * parent tables are created by recursion.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, l1pg, l2pg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			vm_wait(NULL);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	/* Record which table this page is; _pmap_unwire_l3() keys off it. */
	m->pindex = ptepindex;

	/*
	 * Because of AArch64's weak memory consistency model, we must have a
	 * barrier here to ensure that the stores for zeroing "m", whether by
	 * pmap_zero_page() or an earlier function, are visible before adding
	 * "m" to the page table.  Otherwise, a page table walk by another
	 * processor's MMU could see the mapping to "m" and a stale, non-zero
	 * PTE within "m".
	 */
	dmb(ishst);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= (NUL2E + NUL1E)) {
		/* "m" is an L1 page: link it directly into the L0 table. */
		pd_entry_t *l0p, l0e;
		vm_pindex_t l0index;

		l0index = ptepindex - (NUL2E + NUL1E);
		l0p = &pmap->pm_l0[l0index];
		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
		l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE;

		/*
		 * Mark all kernel memory as not accessible from userspace
		 * and userspace memory as not executable from the kernel.
		 * This has been done for the bootstrap L0 entries in
		 * locore.S.
		 */
		if (pmap == kernel_pmap)
			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
		else
			l0e |= TATTR_PXN_TABLE;
		pmap_store(l0p, l0e);
	} else if (ptepindex >= NUL2E) {
		/* "m" is an L2 page: its parent is an L1 page. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1;
		pd_entry_t tl0;

		l1index = ptepindex - NUL2E;
		l0index = l1index >> L0_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
			    lockp) == NULL) {
				/* Undo our own allocation before failing. */
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			/* The L1 page exists; it gains one more entry. */
			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
			l1pg->ref_count++;
		}

		/* Reload the L0 entry: the recursion may have just set it. */
		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
		l1 = &l1[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
	} else {
		/* "m" is an L3 page: its parent is an L2 page. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1, *l2;
		pd_entry_t tl0, tl1;

		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
		l0index = l1index >> L0_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/*
			 * Neither the L1 nor the L2 page exists.  Asking for
			 * the L2 page is enough: that call creates the
			 * missing L1 page through its own recursion.
			 */
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
			    lockp) == NULL) {
				vm_page_unwire_noq(m);
				vm_page_free_zero(m);
				return (NULL);
			}
			tl0 = pmap_load(l0);
			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
			l1 = &l1[l1index & Ln_ADDR_MASK];
		} else {
			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
			l1 = &l1[l1index & Ln_ADDR_MASK];
			tl1 = pmap_load(l1);
			if (tl1 == 0) {
				/* recurse for allocating page dir */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL) {
					vm_page_unwire_noq(m);
					vm_page_free_zero(m);
					return (NULL);
				}
			} else {
				/* The L2 page exists; it gains one more entry. */
				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
				l2pg->ref_count++;
			}
		}

		/* Reload the L1 entry: the recursion may have just set it. */
		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];
		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
		pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}
1974
1975 static pd_entry_t *
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,vm_page_t * l2pgp,struct rwlock ** lockp)1976 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
1977 struct rwlock **lockp)
1978 {
1979 pd_entry_t *l1, *l2;
1980 vm_page_t l2pg;
1981 vm_pindex_t l2pindex;
1982
1983 KASSERT(ADDR_IS_CANONICAL(va),
1984 ("%s: Address not in canonical form: %lx", __func__, va));
1985
1986 retry:
1987 l1 = pmap_l1(pmap, va);
1988 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
1989 l2 = pmap_l1_to_l2(l1, va);
1990 if (!ADDR_IS_KERNEL(va)) {
1991 /* Add a reference to the L2 page. */
1992 l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK);
1993 l2pg->ref_count++;
1994 } else
1995 l2pg = NULL;
1996 } else if (!ADDR_IS_KERNEL(va)) {
1997 /* Allocate a L2 page. */
1998 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
1999 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2000 if (l2pg == NULL) {
2001 if (lockp != NULL)
2002 goto retry;
2003 else
2004 return (NULL);
2005 }
2006 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2007 l2 = &l2[pmap_l2_index(va)];
2008 } else
2009 panic("pmap_alloc_l2: missing page table page for va %#lx",
2010 va);
2011 *l2pgp = l2pg;
2012 return (l2);
2013 }
2014
2015 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)2016 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2017 {
2018 vm_pindex_t ptepindex;
2019 pd_entry_t *pde, tpde;
2020 #ifdef INVARIANTS
2021 pt_entry_t *pte;
2022 #endif
2023 vm_page_t m;
2024 int lvl;
2025
2026 /*
2027 * Calculate pagetable page index
2028 */
2029 ptepindex = pmap_l2_pindex(va);
2030 retry:
2031 /*
2032 * Get the page directory entry
2033 */
2034 pde = pmap_pde(pmap, va, &lvl);
2035
2036 /*
2037 * If the page table page is mapped, we just increment the hold count,
2038 * and activate it. If we get a level 2 pde it will point to a level 3
2039 * table.
2040 */
2041 switch (lvl) {
2042 case -1:
2043 break;
2044 case 0:
2045 #ifdef INVARIANTS
2046 pte = pmap_l0_to_l1(pde, va);
2047 KASSERT(pmap_load(pte) == 0,
2048 ("pmap_alloc_l3: TODO: l0 superpages"));
2049 #endif
2050 break;
2051 case 1:
2052 #ifdef INVARIANTS
2053 pte = pmap_l1_to_l2(pde, va);
2054 KASSERT(pmap_load(pte) == 0,
2055 ("pmap_alloc_l3: TODO: l1 superpages"));
2056 #endif
2057 break;
2058 case 2:
2059 tpde = pmap_load(pde);
2060 if (tpde != 0) {
2061 m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
2062 m->ref_count++;
2063 return (m);
2064 }
2065 break;
2066 default:
2067 panic("pmap_alloc_l3: Invalid level %d", lvl);
2068 }
2069
2070 /*
2071 * Here if the pte page isn't mapped, or if it has been deallocated.
2072 */
2073 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2074 if (m == NULL && lockp != NULL)
2075 goto retry;
2076
2077 return (m);
2078 }
2079
2080 /***************************************************
2081 * Pmap allocation/deallocation routines.
2082 ***************************************************/
2083
2084 /*
2085 * Release any resources held by the given physical map.
2086 * Called when a pmap initialized by pmap_pinit is being released.
2087 * Should only be called if the map contains no valid mappings.
2088 */
2089 void
pmap_release(pmap_t pmap)2090 pmap_release(pmap_t pmap)
2091 {
2092 boolean_t rv;
2093 struct spglist free;
2094 struct asid_set *set;
2095 vm_page_t m;
2096 int asid;
2097
2098 if (pmap->pm_levels != 4) {
2099 PMAP_ASSERT_STAGE2(pmap);
2100 KASSERT(pmap->pm_stats.resident_count == 1,
2101 ("pmap_release: pmap resident count %ld != 0",
2102 pmap->pm_stats.resident_count));
2103 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2104 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2105
2106 SLIST_INIT(&free);
2107 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2108 PMAP_LOCK(pmap);
2109 rv = pmap_unwire_l3(pmap, 0, m, &free);
2110 PMAP_UNLOCK(pmap);
2111 MPASS(rv == TRUE);
2112 vm_page_free_pages_toq(&free, true);
2113 }
2114
2115 KASSERT(pmap->pm_stats.resident_count == 0,
2116 ("pmap_release: pmap resident count %ld != 0",
2117 pmap->pm_stats.resident_count));
2118 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2119 ("pmap_release: pmap has reserved page table page(s)"));
2120
2121 set = pmap->pm_asid_set;
2122 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2123
2124 /*
2125 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
2126 * the entries when removing them so rely on a later tlb invalidation.
2127 * this will happen when updating the VMID generation. Because of this
2128 * we don't reuse VMIDs within a generation.
2129 */
2130 if (pmap->pm_stage == PM_STAGE1) {
2131 mtx_lock_spin(&set->asid_set_mutex);
2132 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2133 asid = COOKIE_TO_ASID(pmap->pm_cookie);
2134 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2135 asid < set->asid_set_size,
2136 ("pmap_release: pmap cookie has out-of-range asid"));
2137 bit_clear(set->asid_set, asid);
2138 }
2139 mtx_unlock_spin(&set->asid_set_mutex);
2140 }
2141
2142 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2143 vm_page_unwire_noq(m);
2144 vm_page_free_zero(m);
2145 }
2146
2147 static int
kvm_size(SYSCTL_HANDLER_ARGS)2148 kvm_size(SYSCTL_HANDLER_ARGS)
2149 {
2150 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2151
2152 return sysctl_handle_long(oidp, &ksize, 0, req);
2153 }
2154 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2155 0, 0, kvm_size, "LU",
2156 "Size of KVM");
2157
2158 static int
kvm_free(SYSCTL_HANDLER_ARGS)2159 kvm_free(SYSCTL_HANDLER_ARGS)
2160 {
2161 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2162
2163 return sysctl_handle_long(oidp, &kfree, 0, req);
2164 }
2165 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2166 0, 0, kvm_free, "LU",
2167 "Amount of KVM free");
2168
2169 /*
2170 * grow the number of kernel page table entries, if needed
2171 */
2172 void
pmap_growkernel(vm_offset_t addr)2173 pmap_growkernel(vm_offset_t addr)
2174 {
2175 vm_paddr_t paddr;
2176 vm_page_t nkpg;
2177 pd_entry_t *l0, *l1, *l2;
2178
2179 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2180
2181 addr = roundup2(addr, L2_SIZE);
2182 if (addr - 1 >= vm_map_max(kernel_map))
2183 addr = vm_map_max(kernel_map);
2184 while (kernel_vm_end < addr) {
2185 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2186 KASSERT(pmap_load(l0) != 0,
2187 ("pmap_growkernel: No level 0 kernel entry"));
2188
2189 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2190 if (pmap_load(l1) == 0) {
2191 /* We need a new PDP entry */
2192 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2193 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2194 if (nkpg == NULL)
2195 panic("pmap_growkernel: no memory to grow kernel");
2196 nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2197 /* See the dmb() in _pmap_alloc_l3(). */
2198 dmb(ishst);
2199 paddr = VM_PAGE_TO_PHYS(nkpg);
2200 pmap_store(l1, paddr | L1_TABLE);
2201 continue; /* try again */
2202 }
2203 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2204 if (pmap_load(l2) != 0) {
2205 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2206 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2207 kernel_vm_end = vm_map_max(kernel_map);
2208 break;
2209 }
2210 continue;
2211 }
2212
2213 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2214 VM_ALLOC_ZERO);
2215 if (nkpg == NULL)
2216 panic("pmap_growkernel: no memory to grow kernel");
2217 nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2218 /* See the dmb() in _pmap_alloc_l3(). */
2219 dmb(ishst);
2220 paddr = VM_PAGE_TO_PHYS(nkpg);
2221 pmap_store(l2, paddr | L2_TABLE);
2222
2223 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2224 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2225 kernel_vm_end = vm_map_max(kernel_map);
2226 break;
2227 }
2228 }
2229 }
2230
2231 /***************************************************
2232 * page management routines.
2233 ***************************************************/
2234
/*
 * Compile-time checks on the pv chunk layout assumed by the code below:
 * a chunk is exactly one page, its free bitmap has three words, and it
 * holds 168 pv entries.
 */
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);
2238
2239 static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)2240 pv_to_chunk(pv_entry_t pv)
2241 {
2242
2243 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2244 }
2245
/* The pmap recorded in the chunk that contains the given pv entry. */
#define	PV_PMAP(pv)	(pv_to_chunk(pv)->pc_pmap)

/*
 * Values of the three pc_map[] words when every pv entry of a chunk is
 * free (a set bit marks a free entry).  PC_FREE2 has only its low 40 bits
 * set, so the words together describe 64 + 64 + 40 = 168 entries.
 */
#define	PC_FREE0	0xfffffffffffffffful
#define	PC_FREE1	0xfffffffffffffffful
#define	PC_FREE2	0x000000fffffffffful

/* The same masks, indexed by pc_map[] word. */
static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
2253
/*
 * Optional pv entry/chunk statistics, compiled in only with PV_STATS and
 * exported read-only under the vm.pmap sysctl node.  The counters are
 * updated through the PV_STAT() wrapper in the routines below.
 */
#ifdef PV_STATS
/* Counters for pv chunks. */
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
    "Number of times tried to get a chunk page but failed.");

/* Counters for individual pv entries. */
static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
    "Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
    "Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif
2278
/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * The global pv_chunks LRU list is scanned between two local marker
 * entries.  For each chunk, every non-wired 4 KB mapping that it tracks is
 * destroyed and the corresponding pv entry is released.
 *
 * "locked_pmap" must be locked by the caller and "lockp" must not be NULL;
 * the PV list lock that "lockp" refers to may be changed or released.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 * Otherwise, on success, returns a page for the caller to reuse: either the
 * page of a pv chunk that became entirely free, or a page table page that
 * was freed while removing mappings.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t next_pmap, pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed, lvl;
	/* Number of reclaim_pv_chunk() calls currently scanning the list. */
	static int active_reclaims = 0;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));

	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	/*
	 * The markers are zeroed chunk headers, so their pc_pmap is NULL;
	 * that is how a marker is told apart from a real chunk below.
	 */
	bzero(&pc_marker_b, sizeof(pc_marker_b));
	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
	pc_marker = (struct pv_chunk *)&pc_marker_b;
	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;

	mtx_lock(&pv_chunks_mutex);
	active_reclaims++;
	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
	/* Scan until the end marker is reached or a page has been freed. */
	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
	    SLIST_EMPTY(&free)) {
		next_pmap = pc->pc_pmap;
		if (next_pmap == NULL) {
			/*
			 * The next chunk is a marker.  However, it is
			 * not our marker, so active_reclaims must be
			 * > 1.  Consequently, the next_chunk code
			 * will not rotate the pv_chunks list.
			 */
			goto next_chunk;
		}
		mtx_unlock(&pv_chunks_mutex);

		/*
		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pv_chunks_mutex is owned and the
		 * corresponding pmap is locked.
		 */
		if (pmap != next_pmap) {
			if (pmap != NULL && pmap != locked_pmap)
				PMAP_UNLOCK(pmap);
			pmap = next_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
				mtx_lock(&pv_chunks_mutex);
				/* Re-read the chunk after the marker. */
				continue;
			} else if (pmap != locked_pmap) {
				if (PMAP_TRYLOCK(pmap)) {
					mtx_lock(&pv_chunks_mutex);
					/* Re-read the chunk after the marker. */
					continue;
				} else {
					pmap = NULL; /* pmap is not locked */
					mtx_lock(&pv_chunks_mutex);
					pc = TAILQ_NEXT(pc_marker, pc_lru);
					if (pc == NULL ||
					    pc->pc_pmap != next_pmap)
						continue;
					/* Skip the chunk of the busy pmap. */
					goto next_chunk;
				}
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* A clear pc_map bit marks a pv entry that is in use. */
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = ffsl(inuse) - 1;
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va, &lvl);
				/* Only mappings reached through an L2 table. */
				if (lvl != 2)
					continue;
				pte = pmap_l2_to_l3(pde, va);
				tpte = pmap_load(pte);
				if ((tpte & ATTR_SW_WIRED) != 0)
					continue;
				tpte = pmap_load_clear(pte);
				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
				if (pmap_pte_dirty(pmap, tpte))
					vm_page_dirty(m);
				if ((tpte & ATTR_AF) != 0) {
					pmap_invalidate_page(pmap, va);
					vm_page_aflag_set(m, PGA_REFERENCED);
				}
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = page_to_pvh(m);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				/* Mark the pv entry free in the chunk. */
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
				freed++;
			}
		}
		if (freed == 0) {
			mtx_lock(&pv_chunks_mutex);
			goto next_chunk;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
		    pc->pc_map[2] == PC_FREE2) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pv_chunks_mutex);
			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
			break;
		}
		/* The chunk now has free entries; put it at the pmap's head. */
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		mtx_lock(&pv_chunks_mutex);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;

next_chunk:
		/* Advance our marker past the chunk just examined. */
		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
		if (active_reclaims == 1 && pmap != NULL) {
			/*
			 * Rotate the pv chunks list so that we do not
			 * scan the same pv chunks that could not be
			 * freed (because they contained a wired
			 * and/or superpage mapping) on every
			 * invocation of reclaim_pv_chunk().
			 */
			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
				MPASS(pc->pc_pmap != NULL);
				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
			}
		}
	}
	/* pv_chunks_mutex is held on every path that reaches this point. */
	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
	active_reclaims--;
	mtx_unlock(&pv_chunks_mutex);
	if (pmap != NULL && pmap != locked_pmap)
		PMAP_UNLOCK(pmap);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->ref_count = 1;
	}
	/* Free whatever other pages were collected. */
	vm_page_free_pages_toq(&free, true);
	return (m_pc);
}
2469
2470 /*
2471 * free the pv_entry back to the free list
2472 */
2473 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)2474 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2475 {
2476 struct pv_chunk *pc;
2477 int idx, field, bit;
2478
2479 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2480 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2481 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2482 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2483 pc = pv_to_chunk(pv);
2484 idx = pv - &pc->pc_pventry[0];
2485 field = idx / 64;
2486 bit = idx % 64;
2487 pc->pc_map[field] |= 1ul << bit;
2488 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2489 pc->pc_map[2] != PC_FREE2) {
2490 /* 98% of the time, pc is already at the head of the list. */
2491 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2492 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2493 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2494 }
2495 return;
2496 }
2497 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2498 free_pv_chunk(pc);
2499 }
2500
2501 static void
free_pv_chunk(struct pv_chunk * pc)2502 free_pv_chunk(struct pv_chunk *pc)
2503 {
2504 vm_page_t m;
2505
2506 mtx_lock(&pv_chunks_mutex);
2507 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2508 mtx_unlock(&pv_chunks_mutex);
2509 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2510 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2511 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2512 /* entire chunk is free, return it */
2513 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2514 dump_drop_page(m->phys_addr);
2515 vm_page_unwire_noq(m);
2516 vm_page_free(m);
2517 }
2518
2519 /*
2520 * Returns a new PV entry, allocating a new PV chunk from the system when
2521 * needed. If this PV chunk allocation fails and a PV list lock pointer was
2522 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
2523 * returned.
2524 *
2525 * The given PV list lock may be released.
2526 */
2527 static pv_entry_t
get_pv_entry(pmap_t pmap,struct rwlock ** lockp)2528 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2529 {
2530 int bit, field;
2531 pv_entry_t pv;
2532 struct pv_chunk *pc;
2533 vm_page_t m;
2534
2535 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2536 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2537 retry:
2538 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2539 if (pc != NULL) {
2540 for (field = 0; field < _NPCM; field++) {
2541 if (pc->pc_map[field]) {
2542 bit = ffsl(pc->pc_map[field]) - 1;
2543 break;
2544 }
2545 }
2546 if (field < _NPCM) {
2547 pv = &pc->pc_pventry[field * 64 + bit];
2548 pc->pc_map[field] &= ~(1ul << bit);
2549 /* If this was the last item, move it to tail */
2550 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2551 pc->pc_map[2] == 0) {
2552 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2553 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2554 pc_list);
2555 }
2556 PV_STAT(atomic_add_long(&pv_entry_count, 1));
2557 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2558 return (pv);
2559 }
2560 }
2561 /* No free items, allocate another chunk */
2562 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2563 if (m == NULL) {
2564 if (lockp == NULL) {
2565 PV_STAT(pc_chunk_tryfail++);
2566 return (NULL);
2567 }
2568 m = reclaim_pv_chunk(pmap, lockp);
2569 if (m == NULL)
2570 goto retry;
2571 }
2572 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2573 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2574 dump_add_page(m->phys_addr);
2575 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2576 pc->pc_pmap = pmap;
2577 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
2578 pc->pc_map[1] = PC_FREE1;
2579 pc->pc_map[2] = PC_FREE2;
2580 mtx_lock(&pv_chunks_mutex);
2581 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2582 mtx_unlock(&pv_chunks_mutex);
2583 pv = &pc->pc_pventry[0];
2584 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2585 PV_STAT(atomic_add_long(&pv_entry_count, 1));
2586 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2587 return (pv);
2588 }
2589
2590 /*
2591 * Ensure that the number of spare PV entries in the specified pmap meets or
2592 * exceeds the given count, "needed".
2593 *
2594 * The given PV list lock may be released.
2595 */
2596 static void
reserve_pv_entries(pmap_t pmap,int needed,struct rwlock ** lockp)2597 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2598 {
2599 struct pch new_tail;
2600 struct pv_chunk *pc;
2601 vm_page_t m;
2602 int avail, free;
2603 bool reclaimed;
2604
2605 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2606 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2607
2608 /*
2609 * Newly allocated PV chunks must be stored in a private list until
2610 * the required number of PV chunks have been allocated. Otherwise,
2611 * reclaim_pv_chunk() could recycle one of these chunks. In
2612 * contrast, these chunks must be added to the pmap upon allocation.
2613 */
2614 TAILQ_INIT(&new_tail);
2615 retry:
2616 avail = 0;
2617 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2618 bit_count((bitstr_t *)pc->pc_map, 0,
2619 sizeof(pc->pc_map) * NBBY, &free);
2620 if (free == 0)
2621 break;
2622 avail += free;
2623 if (avail >= needed)
2624 break;
2625 }
2626 for (reclaimed = false; avail < needed; avail += _NPCPV) {
2627 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2628 if (m == NULL) {
2629 m = reclaim_pv_chunk(pmap, lockp);
2630 if (m == NULL)
2631 goto retry;
2632 reclaimed = true;
2633 }
2634 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2635 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2636 dump_add_page(m->phys_addr);
2637 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2638 pc->pc_pmap = pmap;
2639 pc->pc_map[0] = PC_FREE0;
2640 pc->pc_map[1] = PC_FREE1;
2641 pc->pc_map[2] = PC_FREE2;
2642 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2643 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2644 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2645
2646 /*
2647 * The reclaim might have freed a chunk from the current pmap.
2648 * If that chunk contained available entries, we need to
2649 * re-count the number of available entries.
2650 */
2651 if (reclaimed)
2652 goto retry;
2653 }
2654 if (!TAILQ_EMPTY(&new_tail)) {
2655 mtx_lock(&pv_chunks_mutex);
2656 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2657 mtx_unlock(&pv_chunks_mutex);
2658 }
2659 }
2660
2661 /*
2662 * First find and then remove the pv entry for the specified pmap and virtual
2663 * address from the specified pv list. Returns the pv entry if found and NULL
2664 * otherwise. This operation can be performed on pv lists for either 4KB or
2665 * 2MB page mappings.
2666 */
2667 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2668 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2669 {
2670 pv_entry_t pv;
2671
2672 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2673 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2674 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2675 pvh->pv_gen++;
2676 break;
2677 }
2678 }
2679 return (pv);
2680 }
2681
2682 /*
2683 * After demotion from a 2MB page mapping to 512 4KB page mappings,
2684 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2685 * entries for each of the 4KB page mappings.
2686 */
2687 static void
pmap_pv_demote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)2688 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2689 struct rwlock **lockp)
2690 {
2691 struct md_page *pvh;
2692 struct pv_chunk *pc;
2693 pv_entry_t pv;
2694 vm_offset_t va_last;
2695 vm_page_t m;
2696 int bit, field;
2697
2698 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2699 KASSERT((va & L2_OFFSET) == 0,
2700 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
2701 KASSERT((pa & L2_OFFSET) == 0,
2702 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
2703 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2704
2705 /*
2706 * Transfer the 2mpage's pv entry for this mapping to the first
2707 * page's pv list. Once this transfer begins, the pv list lock
2708 * must not be released until the last pv entry is reinstantiated.
2709 */
2710 pvh = pa_to_pvh(pa);
2711 pv = pmap_pvh_remove(pvh, pmap, va);
2712 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2713 m = PHYS_TO_VM_PAGE(pa);
2714 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2715 m->md.pv_gen++;
2716 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
2717 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2718 va_last = va + L2_SIZE - PAGE_SIZE;
2719 for (;;) {
2720 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2721 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
2722 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
2723 for (field = 0; field < _NPCM; field++) {
2724 while (pc->pc_map[field]) {
2725 bit = ffsl(pc->pc_map[field]) - 1;
2726 pc->pc_map[field] &= ~(1ul << bit);
2727 pv = &pc->pc_pventry[field * 64 + bit];
2728 va += PAGE_SIZE;
2729 pv->pv_va = va;
2730 m++;
2731 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2732 ("pmap_pv_demote_l2: page %p is not managed", m));
2733 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2734 m->md.pv_gen++;
2735 if (va == va_last)
2736 goto out;
2737 }
2738 }
2739 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2740 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2741 }
2742 out:
2743 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
2744 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2745 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2746 }
2747 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2748 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
2749 }
2750
2751 /*
2752 * First find and then destroy the pv entry for the specified pmap and virtual
2753 * address. This operation can be performed on pv lists for either 4KB or 2MB
2754 * page mappings.
2755 */
2756 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2757 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2758 {
2759 pv_entry_t pv;
2760
2761 pv = pmap_pvh_remove(pvh, pmap, va);
2762 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2763 free_pv_entry(pmap, pv);
2764 }
2765
2766 /*
2767 * Conditionally create the PV entry for a 4KB page mapping if the required
2768 * memory can be allocated without resorting to reclamation.
2769 */
2770 static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)2771 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2772 struct rwlock **lockp)
2773 {
2774 pv_entry_t pv;
2775
2776 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2777 /* Pass NULL instead of the lock pointer to disable reclamation. */
2778 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2779 pv->pv_va = va;
2780 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2781 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2782 m->md.pv_gen++;
2783 return (TRUE);
2784 } else
2785 return (FALSE);
2786 }
2787
2788 /*
2789 * Create the PV entry for a 2MB page mapping. Always returns true unless the
2790 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
2791 * false if the PV entry cannot be allocated without resorting to reclamation.
2792 */
2793 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)2794 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2795 struct rwlock **lockp)
2796 {
2797 struct md_page *pvh;
2798 pv_entry_t pv;
2799 vm_paddr_t pa;
2800
2801 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2802 /* Pass NULL instead of the lock pointer to disable reclamation. */
2803 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2804 NULL : lockp)) == NULL)
2805 return (false);
2806 pv->pv_va = va;
2807 pa = l2e & ~ATTR_MASK;
2808 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2809 pvh = pa_to_pvh(pa);
2810 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2811 pvh->pv_gen++;
2812 return (true);
2813 }
2814
2815 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)2816 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2817 {
2818 pt_entry_t newl2, oldl2;
2819 vm_page_t ml3;
2820 vm_paddr_t ml3pa;
2821
2822 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2823 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2824 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2825
2826 ml3 = pmap_remove_pt_page(pmap, va);
2827 if (ml3 == NULL)
2828 panic("pmap_remove_kernel_l2: Missing pt page");
2829
2830 ml3pa = VM_PAGE_TO_PHYS(ml3);
2831 newl2 = ml3pa | L2_TABLE;
2832
2833 /*
2834 * If this page table page was unmapped by a promotion, then it
2835 * contains valid mappings. Zero it to invalidate those mappings.
2836 */
2837 if (ml3->valid != 0)
2838 pagezero((void *)PHYS_TO_DMAP(ml3pa));
2839
2840 /*
2841 * Demote the mapping. The caller must have already invalidated the
2842 * mapping (i.e., the "break" in break-before-make).
2843 */
2844 oldl2 = pmap_load_store(l2, newl2);
2845 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2846 __func__, l2, oldl2));
2847 }
2848
2849 /*
2850 * pmap_remove_l2: Do the things to unmap a level 2 superpage.
2851 */
2852 static int
pmap_remove_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pd_entry_t l1e,struct spglist * free,struct rwlock ** lockp)2853 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2854 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2855 {
2856 struct md_page *pvh;
2857 pt_entry_t old_l2;
2858 vm_page_t m, ml3, mt;
2859
2860 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2861 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2862 old_l2 = pmap_load_clear(l2);
2863 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
2864 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
2865
2866 /*
2867 * Since a promotion must break the 4KB page mappings before making
2868 * the 2MB page mapping, a pmap_invalidate_page() suffices.
2869 */
2870 pmap_invalidate_page(pmap, sva);
2871
2872 if (old_l2 & ATTR_SW_WIRED)
2873 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2874 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2875 if (old_l2 & ATTR_SW_MANAGED) {
2876 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
2877 pvh = page_to_pvh(m);
2878 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
2879 pmap_pvh_free(pvh, pmap, sva);
2880 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
2881 if (pmap_pte_dirty(pmap, old_l2))
2882 vm_page_dirty(mt);
2883 if (old_l2 & ATTR_AF)
2884 vm_page_aflag_set(mt, PGA_REFERENCED);
2885 if (TAILQ_EMPTY(&mt->md.pv_list) &&
2886 TAILQ_EMPTY(&pvh->pv_list))
2887 vm_page_aflag_clear(mt, PGA_WRITEABLE);
2888 }
2889 }
2890 if (pmap == kernel_pmap) {
2891 pmap_remove_kernel_l2(pmap, l2, sva);
2892 } else {
2893 ml3 = pmap_remove_pt_page(pmap, sva);
2894 if (ml3 != NULL) {
2895 KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
2896 ("pmap_remove_l2: l3 page not promoted"));
2897 pmap_resident_count_dec(pmap, 1);
2898 KASSERT(ml3->ref_count == NL3PG,
2899 ("pmap_remove_l2: l3 page ref count error"));
2900 ml3->ref_count = 0;
2901 pmap_add_delayed_free_list(ml3, free, FALSE);
2902 }
2903 }
2904 return (pmap_unuse_pt(pmap, sva, l1e, free));
2905 }
2906
2907 /*
2908 * pmap_remove_l3: do the things to unmap a page in a process
2909 */
2910 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)2911 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2912 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2913 {
2914 struct md_page *pvh;
2915 pt_entry_t old_l3;
2916 vm_page_t m;
2917
2918 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2919 old_l3 = pmap_load_clear(l3);
2920 pmap_invalidate_page(pmap, va);
2921 if (old_l3 & ATTR_SW_WIRED)
2922 pmap->pm_stats.wired_count -= 1;
2923 pmap_resident_count_dec(pmap, 1);
2924 if (old_l3 & ATTR_SW_MANAGED) {
2925 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
2926 if (pmap_pte_dirty(pmap, old_l3))
2927 vm_page_dirty(m);
2928 if (old_l3 & ATTR_AF)
2929 vm_page_aflag_set(m, PGA_REFERENCED);
2930 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2931 pmap_pvh_free(&m->md, pmap, va);
2932 if (TAILQ_EMPTY(&m->md.pv_list) &&
2933 (m->flags & PG_FICTITIOUS) == 0) {
2934 pvh = page_to_pvh(m);
2935 if (TAILQ_EMPTY(&pvh->pv_list))
2936 vm_page_aflag_clear(m, PGA_WRITEABLE);
2937 }
2938 }
2939 return (pmap_unuse_pt(pmap, va, l2e, free));
2940 }
2941
/*
 * Remove the specified range of addresses from the L3 page table that is
 * identified by the given L2 entry.
 *
 * The range [sva, eva) must lie within a single L3 page table (asserted)
 * and the pmap must be locked.  TLB invalidations are batched: "va" holds
 * the start of the run of removed mappings that still needs invalidating,
 * or "eva" when there is no such run.  Page table pages that become unused
 * are added to "free", and "*lockp" is switched to the PV list lock of each
 * managed page as needed.
 */
static void
pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
    vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	struct rwlock *new_lock;
	pt_entry_t *l3, old_l3;
	vm_offset_t va;
	vm_page_t l3pg, m;

	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Start address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
	    ("%s: End address not in canonical form: %lx", __func__, eva));

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
	/* The L3 page table page is only tracked for user addresses. */
	l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL;
	/* va == eva means that no invalidation is pending. */
	va = eva;
	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
		if (!pmap_l3_valid(pmap_load(l3))) {
			/* An invalid entry ends the current run; flush it. */
			if (va != eva) {
				pmap_invalidate_range(pmap, va, sva);
				va = eva;
			}
			continue;
		}
		old_l3 = pmap_load_clear(l3);
		if ((old_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count--;
		pmap_resident_count_dec(pmap, 1);
		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
			m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
			if (pmap_pte_dirty(pmap, old_l3))
				vm_page_dirty(m);
			if ((old_l3 & ATTR_AF) != 0)
				vm_page_aflag_set(m, PGA_REFERENCED);
			new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m));
			if (new_lock != *lockp) {
				if (*lockp != NULL) {
					/*
					 * Pending TLB invalidations must be
					 * performed before the PV list lock is
					 * released.  Otherwise, a concurrent
					 * pmap_remove_all() on a physical page
					 * could return while a stale TLB entry
					 * still provides access to that page.
					 */
					if (va != eva) {
						pmap_invalidate_range(pmap, va,
						    sva);
						va = eva;
					}
					rw_wunlock(*lockp);
				}
				*lockp = new_lock;
				rw_wlock(*lockp);
			}
			pmap_pvh_free(&m->md, pmap, sva);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    (m->flags & PG_FICTITIOUS) == 0) {
				pvh = page_to_pvh(m);
				if (TAILQ_EMPTY(&pvh->pv_list))
					vm_page_aflag_clear(m, PGA_WRITEABLE);
			}
		}
		/* Start a new run of pending invalidations if none is open. */
		if (va == eva)
			va = sva;
		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
			/*
			 * pmap_unwire_l3() reported success, so stop here;
			 * sva is advanced so that the final invalidation
			 * below covers this last page.
			 */
			sva += L3_SIZE;
			break;
		}
	}
	/* Flush whatever run is still pending. */
	if (va != eva)
		pmap_invalidate_range(pmap, va, sva);
}
3023
/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 *
 *	The page tables are walked one L0/L1/L2 entry at a time.  1GB (L1)
 *	block mappings are cleared directly, 2MB (L2) block mappings are
 *	either removed whole or demoted first, and 4KB mappings are removed
 *	by pmap_remove_l3_range().  The entries collected on the local
 *	"free" list are handed to vm_page_free_pages_toq() after the pmap
 *	lock has been dropped.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rwlock *lock;
	vm_offset_t va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t l3_paddr;
	struct spglist free;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	SLIST_INIT(&free);

	PMAP_LOCK(pmap);

	/* No PV list lock is held yet; callees acquire one through &lock. */
	lock = NULL;
	for (; sva < eva; sva = va_next) {
		/* Nothing is left to remove once the pmap is empty. */
		if (pmap->pm_stats.resident_count == 0)
			break;

		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			/*
			 * Skip to the next L0 boundary; if the addition
			 * wrapped around, stop at "eva" instead.
			 */
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/*
			 * A 1GB block mapping is only ever removed whole; it
			 * must be unmanaged and must not belong to the kernel
			 * pmap.
			 */
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS(pmap != kernel_pmap);
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			pmap_clear(l1);
			pmap_invalidate_page(pmap, sva);
			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (l2 == NULL)
			continue;

		l3_paddr = pmap_load(l2);

		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * Remove the 2MB mapping whole when the range covers
			 * all of it; otherwise demote it to 4KB mappings and
			 * fall through, or move on if the demotion fails.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
				    &free, &lock);
				continue;
			} else if (pmap_demote_l2_locked(pmap, l2, sva,
			    &lock) == NULL)
				continue;
			/* Reload the entry changed by the demotion. */
			l3_paddr = pmap_load(l2);
		}

		/*
		 * Weed out invalid mappings.
		 */
		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
		    &lock);
	}
	/* Release the PV list lock left held by the callees, if any. */
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}
3128
/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 *
 *		The page must be managed (asserted).  The first loop
 *		demotes every mapping found on the "pvh" pv list (for
 *		non-fictitious pages, the list returned by page_to_pvh());
 *		the second loop then destroys each mapping found on the
 *		page's own pv list.
 *
 *		The PV list lock is held while a pmap lock is only
 *		try-locked.  When the try-lock fails, the PV list lock is
 *		dropped, the pmap lock is acquired, the PV list lock is
 *		reacquired, and the pv list generation counts are compared
 *		to detect changes made in between; on a change the whole
 *		scan restarts.
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	struct spglist free;
	int lvl, pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/* Fictitious pages use the dummy pv list head instead. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/* Pass 1: demote every mapping recorded on the "pvh" list. */
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Take both locks in order, then check for changes. */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte(pmap, va, &lvl);
		KASSERT(pte != NULL,
		    ("pmap_remove_all: no page table entry found"));
		KASSERT(lvl == 2,
		    ("pmap_remove_all: invalid pte level %d", lvl));
		pmap_demote_l2_locked(pmap, pte, va, &lock);
		PMAP_UNLOCK(pmap);
	}
	/* Pass 2: destroy every mapping on the page's own pv list. */
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/* As above, but either pv list may have changed. */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pmap_resident_count_dec(pmap, 1);

		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_remove_all: no page directory entry found"));
		KASSERT(lvl == 2,
		    ("pmap_remove_all: invalid pde level %d", lvl));
		/* Saved for pmap_unuse_pt() below. */
		tpde = pmap_load(pde);

		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load_clear(pte);
		if (tpte & ATTR_SW_WIRED)
			pmap->pm_stats.wired_count--;
		/*
		 * The TLB is only invalidated when the access flag was set
		 * in the old entry.
		 */
		if ((tpte & ATTR_AF) != 0) {
			pmap_invalidate_page(pmap, pv->pv_va);
			vm_page_aflag_set(m, PGA_REFERENCED);
		}

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
		/* Unlink the pv entry and bump the list's generation count. */
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	/* No mappings remain, so the page cannot be writeable anymore. */
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
}
3230
3231 /*
3232 * pmap_protect_l2: do the things to protect a 2MB page in a pmap
3233 */
3234 static void
pmap_protect_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pt_entry_t mask,pt_entry_t nbits)3235 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
3236 pt_entry_t nbits)
3237 {
3238 pd_entry_t old_l2;
3239 vm_page_t m, mt;
3240
3241 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3242 PMAP_ASSERT_STAGE1(pmap);
3243 KASSERT((sva & L2_OFFSET) == 0,
3244 ("pmap_protect_l2: sva is not 2mpage aligned"));
3245 old_l2 = pmap_load(l2);
3246 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3247 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
3248
3249 /*
3250 * Return if the L2 entry already has the desired access restrictions
3251 * in place.
3252 */
3253 if ((old_l2 & mask) == nbits)
3254 return;
3255
3256 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
3257 cpu_spinwait();
3258
3259 /*
3260 * When a dirty read/write superpage mapping is write protected,
3261 * update the dirty field of each of the superpage's constituent 4KB
3262 * pages.
3263 */
3264 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
3265 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3266 pmap_pte_dirty(pmap, old_l2)) {
3267 m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3268 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3269 vm_page_dirty(mt);
3270 }
3271
3272 /*
3273 * Since a promotion must break the 4KB page mappings before making
3274 * the 2MB page mapping, a pmap_invalidate_page() suffices.
3275 */
3276 pmap_invalidate_page(pmap, sva);
3277 }
3278
/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 *
 *	VM_PROT_NONE is handled by removing the range with pmap_remove().
 *	Otherwise "mask" selects the entry bits to rewrite and "nbits" holds
 *	their new values: write access is revoked when "prot" lacks
 *	VM_PROT_WRITE, and execute access is revoked when it lacks
 *	VM_PROT_EXECUTE.  If neither applies there is nothing to do.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_offset_t va, va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t *l3p, l3, mask, nbits;

	PMAP_ASSERT_STAGE1(pmap);
	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
	if (prot == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	mask = nbits = 0;
	if ((prot & VM_PROT_WRITE) == 0) {
		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
	}
	if ((prot & VM_PROT_EXECUTE) == 0) {
		mask |= ATTR_S1_XN;
		nbits |= ATTR_S1_XN;
	}
	/* Neither write nor execute access is being revoked. */
	if (mask == 0)
		return;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			/*
			 * Skip to the next L0 boundary; if the addition
			 * wrapped around, stop at "eva" instead.
			 */
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/*
			 * A 1GB block mapping is only ever updated whole and
			 * must be unmanaged.
			 */
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			if ((pmap_load(l1) & mask) != nbits) {
				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
				pmap_invalidate_page(pmap, sva);
			}
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (pmap_load(l2) == 0)
			continue;

		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * Update the 2MB mapping whole when the range covers
			 * all of it; otherwise demote it to 4KB mappings and
			 * fall through, or move on if the demotion fails.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_protect_l2(pmap, l2, sva, mask, nbits);
				continue;
			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
				continue;
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_protect: Invalid L2 entry after demotion"));

		if (va_next > eva)
			va_next = eva;

		/*
		 * "va" is the start of the current run of modified entries
		 * whose TLB invalidation is still pending; "va == va_next"
		 * means that nothing is pending.
		 */
		va = va_next;
		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
		    sva += L3_SIZE) {
			l3 = pmap_load(l3p);

			/*
			 * Go to the next L3 entry if the current one is
			 * invalid or already has the desired access
			 * restrictions in place.  (The latter case occurs
			 * frequently.  For example, in a "buildworld"
			 * workload, almost 1 out of 4 L3 entries already
			 * have the desired restrictions.)
			 */
			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
				/* Flush the run that ends here, if any. */
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
				continue;
			}

			/*
			 * A failed compare-and-set refreshes "l3" with the
			 * entry's current contents before the next attempt.
			 */
			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
			    nbits))
				cpu_spinwait();

			/*
			 * When a dirty read/write mapping is write protected,
			 * update the page's dirty field.
			 */
			if ((l3 & ATTR_SW_MANAGED) != 0 &&
			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
			    pmap_pte_dirty(pmap, l3))
				vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK));

			/* Start a new pending run at this address. */
			if (va == va_next)
				va = sva;
		}
		/* Flush whatever run is still pending. */
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
	PMAP_UNLOCK(pmap);
}
3401
3402 /*
3403 * Inserts the specified page table page into the specified pmap's collection
3404 * of idle page table pages. Each of a pmap's page table pages is responsible
3405 * for mapping a distinct range of virtual addresses. The pmap's collection is
3406 * ordered by this virtual address range.
3407 *
3408 * If "promoted" is false, then the page table page "mpte" must be zero filled.
3409 */
3410 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted)3411 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
3412 {
3413
3414 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3415 mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
3416 return (vm_radix_insert(&pmap->pm_root, mpte));
3417 }
3418
3419 /*
3420 * Removes the page table page mapping the specified virtual address from the
3421 * specified pmap's collection of idle page table pages, and returns it.
3422 * Otherwise, returns NULL if there is no page table page corresponding to the
3423 * specified virtual address.
3424 */
3425 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)3426 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
3427 {
3428
3429 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3430 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
3431 }
3432
/*
 * Performs a break-before-make update of a pmap entry.  This is needed when
 * either promoting or demoting pages to ensure the TLB doesn't get into an
 * inconsistent state.
 *
 * "pte" is the entry to rewrite and "newpte" its new contents.  The range
 * [va, va + size) is invalidated in the TLB between clearing the old
 * entry's valid bit and storing the new entry.  The pmap lock must be held.
 */
static void
pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size)
{
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Ensure we don't get switched out with the page table in an
	 * inconsistent state.  We also need to ensure no interrupts fire
	 * as they may make use of an address we are about to invalidate.
	 */
	intr = intr_disable();

	/*
	 * Clear the old mapping's valid bit, but leave the rest of the entry
	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
	 * lookup the physical address.
	 */
	pmap_clear_bits(pte, ATTR_DESCR_VALID);
	pmap_invalidate_range(pmap, va, va + size);

	/* Create the new mapping */
	pmap_store(pte, newpte);
	/* Barrier after the store, issued before interrupts are re-enabled. */
	dsb(ishst);

	intr_restore(intr);
}
3467
3468 #if VM_NRESERVLEVEL > 0
3469 /*
3470 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3471 * replace the many pv entries for the 4KB page mappings by a single pv entry
3472 * for the 2MB page mapping.
3473 */
3474 static void
pmap_pv_promote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)3475 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3476 struct rwlock **lockp)
3477 {
3478 struct md_page *pvh;
3479 pv_entry_t pv;
3480 vm_offset_t va_last;
3481 vm_page_t m;
3482
3483 KASSERT((pa & L2_OFFSET) == 0,
3484 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
3485 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3486
3487 /*
3488 * Transfer the first page's pv entry for this mapping to the 2mpage's
3489 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
3490 * a transfer avoids the possibility that get_pv_entry() calls
3491 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3492 * mappings that is being promoted.
3493 */
3494 m = PHYS_TO_VM_PAGE(pa);
3495 va = va & ~L2_OFFSET;
3496 pv = pmap_pvh_remove(&m->md, pmap, va);
3497 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
3498 pvh = page_to_pvh(m);
3499 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3500 pvh->pv_gen++;
3501 /* Free the remaining NPTEPG - 1 pv entries. */
3502 va_last = va + L2_SIZE - PAGE_SIZE;
3503 do {
3504 m++;
3505 va += PAGE_SIZE;
3506 pmap_pvh_free(&m->md, pmap, va);
3507 } while (va < va_last);
3508 }
3509
/*
 * Tries to promote the 512, contiguous 4KB page mappings that are within a
 * single level 2 table entry to a single 2MB page mapping.  For promotion
 * to occur, two conditions must be met: (1) the 4KB page mappings must map
 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 * identical characteristics.
 *
 * "l2" is the L2 entry that currently points at the L3 table, and "va" is
 * any address within the candidate 2MB range.  The pmap lock must be held.
 * Every failed attempt increments pmap_l2_p_failures and a successful one
 * increments pmap_l2_promotions; nothing is returned to the caller.
 */
static void
pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
    struct rwlock **lockp)
{
	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
	vm_page_t mpte;
	vm_offset_t sva;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);

	sva = va & ~L2_OFFSET;
	firstl3 = pmap_l2_to_l3(l2, sva);
	newl2 = pmap_load(firstl3);

	/*
	 * Fail unless the first L3 entry has ATTR_AF set and maps a physical
	 * address with no bits set below the 2MB boundary.
	 */
	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) {
		atomic_add_long(&pmap_l2_p_failures, 1);
		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return;
	}

setl2:
	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
		/*
		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
		 */
		/* A failed fcmpset reloads "newl2"; re-evaluate and retry. */
		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
			goto setl2;
		newl2 &= ~ATTR_SW_DBM;
	}

	/*
	 * Walk the table from the last entry down to the second.  "pa" is
	 * the value each entry must have: the first entry's contents with
	 * the physical address advanced to the entry's position.  A
	 * mismatch in either the address or any other bit fails the
	 * promotion.
	 */
	pa = newl2 + L2_SIZE - PAGE_SIZE;
	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
		oldl3 = pmap_load(l3);
setl3:
		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
			/*
			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
			 * set, ATTR_SW_DBM can be cleared without a TLB
			 * invalidation.
			 */
			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
			    ~ATTR_SW_DBM))
				goto setl3;
			oldl3 &= ~ATTR_SW_DBM;
		}
		if (oldl3 != pa) {
			atomic_add_long(&pmap_l2_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return;
		}
		pa -= PAGE_SIZE;
	}

	/*
	 * Save the page table page in its current state until the L2
	 * mapping the superpage is demoted by pmap_demote_l2() or
	 * destroyed by pmap_remove_l3().
	 */
	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_l2: page table page is out of range"));
	KASSERT(mpte->pindex == pmap_l2_pindex(va),
	    ("pmap_promote_l2: page table page's pindex is wrong"));
	/* A nonzero return from the insertion aborts the promotion. */
	if (pmap_insert_pt_page(pmap, mpte, true)) {
		atomic_add_long(&pmap_l2_p_failures, 1);
		CTR2(KTR_PMAP,
		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
		    pmap);
		return;
	}

	/* Managed mappings also need their pv entries consolidated. */
	if ((newl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);

	/* Turn the first L3 entry's contents into an L2 block descriptor. */
	newl2 &= ~ATTR_DESCR_MASK;
	newl2 |= L2_BLOCK;

	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);

	atomic_add_long(&pmap_l2_promotions, 1);
	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
	    pmap);
}
3607 #endif /* VM_NRESERVLEVEL > 0 */
3608
/*
 * Stores the block mapping "newpte" for "va" directly into the page tables:
 * an L1 entry (1GB page) when "psind" is 2, and an L2 entry (2MB page) when
 * "psind" is 1.  The pmap lock must be held, and the physical address in
 * "newpte" must be aligned to pagesizes[psind] (asserted).
 *
 * An existing valid entry may only be replaced by a block mapping of the
 * same physical address (asserted).  When a page table page has to be
 * allocated and the allocation fails, KERN_RESOURCE_SHORTAGE is returned if
 * PMAP_ENTER_NOSLEEP is set in "flags"; otherwise the pmap lock is dropped
 * around vm_wait() and the operation restarts.  Returns KERN_SUCCESS once
 * the entry has been stored and the pmap statistics have been updated.
 */
static int
pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
    int psind)
{
	pd_entry_t *l0p, *l1p, *l2p, origpte;
	vm_page_t mp;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(psind > 0 && psind < MAXPAGESIZES,
	    ("psind %d unexpected", psind));
	KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0,
	    ("unaligned phys address %#lx newpte %#lx psind %d",
	    (newpte & ~ATTR_MASK), newpte, psind));

restart:
	if (psind == 2) {
		l0p = pmap_l0(pmap, va);
		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
			/* No L1 table yet: allocate one. */
			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
			if (mp == NULL) {
				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
					return (KERN_RESOURCE_SHORTAGE);
				PMAP_UNLOCK(pmap);
				vm_wait(NULL);
				PMAP_LOCK(pmap);
				goto restart;
			}
			l1p = pmap_l0_to_l1(l0p, va);
			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
			origpte = pmap_load(l1p);
		} else {
			l1p = pmap_l0_to_l1(l0p, va);
			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
			origpte = pmap_load(l1p);
			/*
			 * A new entry in an existing L1 table adds a
			 * reference to that table's page.
			 */
			if ((origpte & ATTR_DESCR_VALID) == 0) {
				mp = PHYS_TO_VM_PAGE(pmap_load(l0p) &
				    ~ATTR_MASK);
				mp->ref_count++;
			}
		}
		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
		    ((origpte & ATTR_DESCR_MASK) == L1_BLOCK &&
		    (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
		    va, origpte, newpte));
		pmap_store(l1p, newpte);
	} else /* (psind == 1) */ {
		l2p = pmap_l2(pmap, va);
		if (l2p == NULL) {
			/* No L2 table yet: allocate one. */
			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
			if (mp == NULL) {
				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
					return (KERN_RESOURCE_SHORTAGE);
				PMAP_UNLOCK(pmap);
				vm_wait(NULL);
				PMAP_LOCK(pmap);
				goto restart;
			}
			/* Locate the entry through the direct map. */
			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
			l2p = &l2p[pmap_l2_index(va)];
			origpte = pmap_load(l2p);
		} else {
			l1p = pmap_l1(pmap, va);
			origpte = pmap_load(l2p);
			/*
			 * A new entry in an existing L2 table adds a
			 * reference to that table's page.
			 */
			if ((origpte & ATTR_DESCR_VALID) == 0) {
				mp = PHYS_TO_VM_PAGE(pmap_load(l1p) &
				    ~ATTR_MASK);
				mp->ref_count++;
			}
		}
		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
		    (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
		    va, origpte, newpte));
		pmap_store(l2p, newpte);
	}
	dsb(ishst);

	/*
	 * Statistics are kept in units of PAGE_SIZE: the resident count
	 * grows only for a new mapping, and the wired count follows any
	 * change of ATTR_SW_WIRED between the old and the new entry.
	 */
	if ((origpte & ATTR_DESCR_VALID) == 0)
		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
	else if ((newpte & ATTR_SW_WIRED) == 0 &&
	    (origpte & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;

	return (KERN_SUCCESS);
}
3698
3699 /*
3700 * Add a single SMMU entry. This function does not sleep.
3701 */
3702 int
pmap_senter(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,vm_prot_t prot,u_int flags)3703 pmap_senter(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3704 vm_prot_t prot, u_int flags)
3705 {
3706 pd_entry_t *pde;
3707 pt_entry_t new_l3, orig_l3;
3708 pt_entry_t *l3;
3709 vm_page_t mpte;
3710 int lvl;
3711 int rv;
3712
3713 PMAP_ASSERT_STAGE1(pmap);
3714 KASSERT(va < VM_MAXUSER_ADDRESS, ("wrong address space"));
3715
3716 va = trunc_page(va);
3717 new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT |
3718 ATTR_S1_IDX(VM_MEMATTR_DEVICE) | L3_PAGE);
3719 if ((prot & VM_PROT_WRITE) == 0)
3720 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
3721 new_l3 |= ATTR_S1_XN; /* Execute never. */
3722 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER);
3723 new_l3 |= ATTR_S1_nG; /* Non global. */
3724
3725 CTR2(KTR_PMAP, "pmap_senter: %.16lx -> %.16lx", va, pa);
3726
3727 PMAP_LOCK(pmap);
3728
3729 /*
3730 * In the case that a page table page is not
3731 * resident, we are creating it here.
3732 */
3733 retry:
3734 pde = pmap_pde(pmap, va, &lvl);
3735 if (pde != NULL && lvl == 2) {
3736 l3 = pmap_l2_to_l3(pde, va);
3737 } else {
3738 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
3739 if (mpte == NULL) {
3740 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3741 rv = KERN_RESOURCE_SHORTAGE;
3742 goto out;
3743 }
3744 goto retry;
3745 }
3746
3747 orig_l3 = pmap_load(l3);
3748 KASSERT(!pmap_l3_valid(orig_l3), ("l3 is valid"));
3749
3750 /* New mapping */
3751 pmap_store(l3, new_l3);
3752 pmap_resident_count_inc(pmap, 1);
3753 dsb(ishst);
3754
3755 rv = KERN_SUCCESS;
3756 out:
3757 PMAP_UNLOCK(pmap);
3758
3759 return (rv);
3760 }
3761
3762 /*
3763 * Remove a single SMMU entry.
3764 */
3765 int
pmap_sremove(pmap_t pmap,vm_offset_t va)3766 pmap_sremove(pmap_t pmap, vm_offset_t va)
3767 {
3768 pt_entry_t *pte;
3769 int lvl;
3770 int rc;
3771
3772 PMAP_LOCK(pmap);
3773
3774 pte = pmap_pte(pmap, va, &lvl);
3775 KASSERT(lvl == 3,
3776 ("Invalid SMMU pagetable level: %d != 3", lvl));
3777
3778 if (pte != NULL) {
3779 pmap_resident_count_dec(pmap, 1);
3780 pmap_clear(pte);
3781 rc = KERN_SUCCESS;
3782 } else
3783 rc = KERN_FAILURE;
3784
3785 PMAP_UNLOCK(pmap);
3786
3787 return (rc);
3788 }
3789
/*
 * Remove all the allocated L1, L2 pages from SMMU pmap.
 * All the L3 entries must be cleared in advance, otherwise
 * this function panics.
 *
 * The walk starts at VM_MINUSER_ADDRESS and visits the L0, L1 and L2
 * tables through the direct map.  "sva" tracks the virtual address that
 * corresponds to the entry being visited; it is used to pick the starting
 * index at each level and in the panic message.  On return the pmap's
 * resident count is asserted to be zero.
 */
void
pmap_sremove_pages(pmap_t pmap)
{
	pd_entry_t l0e, *l1, l1e, *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_page_t m, m0, m1;
	vm_offset_t sva;
	vm_paddr_t pa;
	vm_paddr_t pa0;
	vm_paddr_t pa1;
	int i, j, k, l;

	PMAP_LOCK(pmap);

	for (sva = VM_MINUSER_ADDRESS, i = pmap_l0_index(sva);
	    (i < Ln_ENTRIES && sva < VM_MAXUSER_ADDRESS); i++) {
		l0e = pmap->pm_l0[i];
		if ((l0e & ATTR_DESCR_VALID) == 0) {
			sva += L0_SIZE;
			continue;
		}
		/* m0 is the page holding the L1 table. */
		pa0 = l0e & ~ATTR_MASK;
		m0 = PHYS_TO_VM_PAGE(pa0);
		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa0);

		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
			l1e = l1[j];
			if ((l1e & ATTR_DESCR_VALID) == 0) {
				sva += L1_SIZE;
				continue;
			}
			/* L1 block mappings are skipped, not removed. */
			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
				sva += L1_SIZE;
				continue;
			}
			/* m1 is the page holding the L2 table. */
			pa1 = l1e & ~ATTR_MASK;
			m1 = PHYS_TO_VM_PAGE(pa1);
			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1);

			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
				l2e = l2[k];
				if ((l2e & ATTR_DESCR_VALID) == 0) {
					sva += L2_SIZE;
					continue;
				}
				/* m is the page holding the L3 table. */
				pa = l2e & ~ATTR_MASK;
				m = PHYS_TO_VM_PAGE(pa);
				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);

				/* Any valid L3 entry left behind is fatal. */
				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
				    l++, sva += L3_SIZE) {
					l3e = l3[l];
					if ((l3e & ATTR_DESCR_VALID) == 0)
						continue;
					panic("%s: l3e found for va %jx\n",
					    __func__, sva);
				}

				/*
				 * Release the L3 table page: unwire it and
				 * the L2 table page that points at it, then
				 * free it and clear the L2 entry.
				 */
				vm_page_unwire_noq(m1);
				vm_page_unwire_noq(m);
				pmap_resident_count_dec(pmap, 1);
				vm_page_free(m);
				pmap_clear(&l2[k]);
			}

			/* Release the L2 table page in the same manner. */
			vm_page_unwire_noq(m0);
			pmap_resident_count_dec(pmap, 1);
			vm_page_free(m1);
			pmap_clear(&l1[j]);
		}

		/* Finally release the L1 table page. */
		pmap_resident_count_dec(pmap, 1);
		vm_page_free(m0);
		pmap_clear(&pmap->pm_l0[i]);
	}

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("Invalid resident count %jd", pmap->pm_stats.resident_count));

	PMAP_UNLOCK(pmap);
}
3876
3877 /*
3878 * Insert the given physical page (p) at
3879 * the specified virtual address (v) in the
3880 * target physical map with the protection requested.
3881 *
3882 * If specified, the page will be wired down, meaning
3883 * that the related pte can not be reclaimed.
3884 *
3885 * NB: This is the only routine which MAY NOT lazy-evaluate
3886 * or lose information. That is, this routine must actually
3887 * insert this page into the given map NOW.
3888 */
3889 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)3890 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3891 u_int flags, int8_t psind)
3892 {
3893 struct rwlock *lock;
3894 pd_entry_t *pde;
3895 pt_entry_t new_l3, orig_l3;
3896 pt_entry_t *l2, *l3;
3897 pv_entry_t pv;
3898 vm_paddr_t opa, pa;
3899 vm_page_t mpte, om;
3900 boolean_t nosleep;
3901 int lvl, rv;
3902
3903 KASSERT(ADDR_IS_CANONICAL(va),
3904 ("%s: Address not in canonical form: %lx", __func__, va));
3905
3906 va = trunc_page(va);
3907 if ((m->oflags & VPO_UNMANAGED) == 0)
3908 VM_PAGE_OBJECT_BUSY_ASSERT(m);
3909 pa = VM_PAGE_TO_PHYS(m);
3910 new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE);
3911 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
3912 new_l3 |= pmap_pte_prot(pmap, prot);
3913
3914 if ((flags & PMAP_ENTER_WIRED) != 0)
3915 new_l3 |= ATTR_SW_WIRED;
3916 if (pmap->pm_stage == PM_STAGE1) {
3917 if (!ADDR_IS_KERNEL(va))
3918 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
3919 else
3920 new_l3 |= ATTR_S1_UXN;
3921 if (pmap != kernel_pmap)
3922 new_l3 |= ATTR_S1_nG;
3923 } else {
3924 /*
3925 * Clear the access flag on executable mappings, this will be
3926 * set later when the page is accessed. The fault handler is
3927 * required to invalidate the I-cache.
3928 *
3929 * TODO: Switch to the valid flag to allow hardware management
3930 * of the access flag. Much of the pmap code assumes the
3931 * valid flag is set and fails to destroy the old page tables
3932 * correctly if it is clear.
3933 */
3934 if (prot & VM_PROT_EXECUTE)
3935 new_l3 &= ~ATTR_AF;
3936 }
3937 if ((m->oflags & VPO_UNMANAGED) == 0) {
3938 new_l3 |= ATTR_SW_MANAGED;
3939 if ((prot & VM_PROT_WRITE) != 0) {
3940 new_l3 |= ATTR_SW_DBM;
3941 if ((flags & VM_PROT_WRITE) == 0) {
3942 if (pmap->pm_stage == PM_STAGE1)
3943 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
3944 else
3945 new_l3 &=
3946 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
3947 }
3948 }
3949 }
3950
3951 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3952
3953 lock = NULL;
3954 PMAP_LOCK(pmap);
3955 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
3956 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
3957 ("managed largepage va %#lx flags %#x", va, flags));
3958 new_l3 &= ~L3_PAGE;
3959 if (psind == 2)
3960 new_l3 |= L1_BLOCK;
3961 else /* (psind == 1) */
3962 new_l3 |= L2_BLOCK;
3963 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
3964 goto out;
3965 }
3966 if (psind == 1) {
3967 /* Assert the required virtual and physical alignment. */
3968 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
3969 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3970 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
3971 flags, m, &lock);
3972 goto out;
3973 }
3974 mpte = NULL;
3975
3976 /*
3977 * In the case that a page table page is not
3978 * resident, we are creating it here.
3979 */
3980 retry:
3981 pde = pmap_pde(pmap, va, &lvl);
3982 if (pde != NULL && lvl == 2) {
3983 l3 = pmap_l2_to_l3(pde, va);
3984 if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
3985 mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
3986 mpte->ref_count++;
3987 }
3988 goto havel3;
3989 } else if (pde != NULL && lvl == 1) {
3990 l2 = pmap_l1_to_l2(pde, va);
3991 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
3992 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
3993 l3 = &l3[pmap_l3_index(va)];
3994 if (!ADDR_IS_KERNEL(va)) {
3995 mpte = PHYS_TO_VM_PAGE(
3996 pmap_load(l2) & ~ATTR_MASK);
3997 mpte->ref_count++;
3998 }
3999 goto havel3;
4000 }
4001 /* We need to allocate an L3 table. */
4002 }
4003 if (!ADDR_IS_KERNEL(va)) {
4004 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4005
4006 /*
4007 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
4008 * to handle the possibility that a superpage mapping for "va"
4009 * was created while we slept.
4010 */
4011 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
4012 nosleep ? NULL : &lock);
4013 if (mpte == NULL && nosleep) {
4014 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
4015 rv = KERN_RESOURCE_SHORTAGE;
4016 goto out;
4017 }
4018 goto retry;
4019 } else
4020 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
4021
4022 havel3:
4023 orig_l3 = pmap_load(l3);
4024 opa = orig_l3 & ~ATTR_MASK;
4025 pv = NULL;
4026
4027 /*
4028 * Is the specified virtual address already mapped?
4029 */
4030 if (pmap_l3_valid(orig_l3)) {
4031 /*
4032 * Only allow adding new entries on stage 2 tables for now.
4033 * This simplifies cache invalidation as we may need to call
4034 * into EL2 to perform such actions.
4035 */
4036 PMAP_ASSERT_STAGE1(pmap);
4037 /*
4038 * Wiring change, just update stats. We don't worry about
4039 * wiring PT pages as they remain resident as long as there
4040 * are valid mappings in them. Hence, if a user page is wired,
4041 * the PT page will be also.
4042 */
4043 if ((flags & PMAP_ENTER_WIRED) != 0 &&
4044 (orig_l3 & ATTR_SW_WIRED) == 0)
4045 pmap->pm_stats.wired_count++;
4046 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
4047 (orig_l3 & ATTR_SW_WIRED) != 0)
4048 pmap->pm_stats.wired_count--;
4049
4050 /*
4051 * Remove the extra PT page reference.
4052 */
4053 if (mpte != NULL) {
4054 mpte->ref_count--;
4055 KASSERT(mpte->ref_count > 0,
4056 ("pmap_enter: missing reference to page table page,"
4057 " va: 0x%lx", va));
4058 }
4059
4060 /*
4061 * Has the physical page changed?
4062 */
4063 if (opa == pa) {
4064 /*
4065 * No, might be a protection or wiring change.
4066 */
4067 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4068 (new_l3 & ATTR_SW_DBM) != 0)
4069 vm_page_aflag_set(m, PGA_WRITEABLE);
4070 goto validate;
4071 }
4072
4073 /*
4074 * The physical page has changed. Temporarily invalidate
4075 * the mapping.
4076 */
4077 orig_l3 = pmap_load_clear(l3);
4078 KASSERT((orig_l3 & ~ATTR_MASK) == opa,
4079 ("pmap_enter: unexpected pa update for %#lx", va));
4080 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
4081 om = PHYS_TO_VM_PAGE(opa);
4082
4083 /*
4084 * The pmap lock is sufficient to synchronize with
4085 * concurrent calls to pmap_page_test_mappings() and
4086 * pmap_ts_referenced().
4087 */
4088 if (pmap_pte_dirty(pmap, orig_l3))
4089 vm_page_dirty(om);
4090 if ((orig_l3 & ATTR_AF) != 0) {
4091 pmap_invalidate_page(pmap, va);
4092 vm_page_aflag_set(om, PGA_REFERENCED);
4093 }
4094 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4095 pv = pmap_pvh_remove(&om->md, pmap, va);
4096 if ((m->oflags & VPO_UNMANAGED) != 0)
4097 free_pv_entry(pmap, pv);
4098 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
4099 TAILQ_EMPTY(&om->md.pv_list) &&
4100 ((om->flags & PG_FICTITIOUS) != 0 ||
4101 TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
4102 vm_page_aflag_clear(om, PGA_WRITEABLE);
4103 } else {
4104 KASSERT((orig_l3 & ATTR_AF) != 0,
4105 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
4106 pmap_invalidate_page(pmap, va);
4107 }
4108 orig_l3 = 0;
4109 } else {
4110 /*
4111 * Increment the counters.
4112 */
4113 if ((new_l3 & ATTR_SW_WIRED) != 0)
4114 pmap->pm_stats.wired_count++;
4115 pmap_resident_count_inc(pmap, 1);
4116 }
4117 /*
4118 * Enter on the PV list if part of our managed memory.
4119 */
4120 if ((m->oflags & VPO_UNMANAGED) == 0) {
4121 if (pv == NULL) {
4122 pv = get_pv_entry(pmap, &lock);
4123 pv->pv_va = va;
4124 }
4125 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4126 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4127 m->md.pv_gen++;
4128 if ((new_l3 & ATTR_SW_DBM) != 0)
4129 vm_page_aflag_set(m, PGA_WRITEABLE);
4130 }
4131
4132 validate:
4133 if (pmap->pm_stage == PM_STAGE1) {
4134 /*
4135 * Sync icache if exec permission and attribute
4136 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
4137 * is stored and made valid for hardware table walk. If done
4138 * later, then others can access this page before caches are
4139 * properly synced. Don't do it for kernel memory which is
4140 * mapped with exec permission even if the memory isn't going
4141 * to hold executable code. The only time when icache sync is
4142 * needed is after kernel module is loaded and the relocation
4143 * info is processed. And it's done in elf_cpu_load_file().
4144 */
4145 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
4146 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
4147 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
4148 PMAP_ASSERT_STAGE1(pmap);
4149 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4150 }
4151 } else {
4152 cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4153 }
4154
4155 /*
4156 * Update the L3 entry
4157 */
4158 if (pmap_l3_valid(orig_l3)) {
4159 PMAP_ASSERT_STAGE1(pmap);
4160 KASSERT(opa == pa, ("pmap_enter: invalid update"));
4161 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
4162 /* same PA, different attributes */
4163 orig_l3 = pmap_load_store(l3, new_l3);
4164 pmap_invalidate_page(pmap, va);
4165 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4166 pmap_pte_dirty(pmap, orig_l3))
4167 vm_page_dirty(m);
4168 } else {
4169 /*
4170 * orig_l3 == new_l3
4171 * This can happen if multiple threads simultaneously
4172 * access a not-yet-mapped page. This is bad for performance
4173 * since it can cause a full demotion-NOP-promotion
4174 * cycle.
4175 * Other possible reasons are:
4176 * - the VM and pmap memory layouts have diverged
4177 * - a TLB flush is missing somewhere and the CPU doesn't see
4178 * the actual mapping.
4179 */
4180 CTR4(KTR_PMAP, "%s: already mapped page - "
4181 "pmap %p va 0x%#lx pte 0x%lx",
4182 __func__, pmap, va, new_l3);
4183 }
4184 } else {
4185 /* New mapping */
4186 pmap_store(l3, new_l3);
4187 dsb(ishst);
4188 }
4189
4190 #if VM_NRESERVLEVEL > 0
4191 /*
4192 * Try to promote from level 3 pages to a level 2 superpage. This
4193 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at
4194 * stage 1 specific fields and performs a break-before-make sequence
4195 * that is incorrect for a stage 2 pmap.
4196 */
4197 if ((mpte == NULL || mpte->ref_count == NL3PG) &&
4198 pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 &&
4199 (m->flags & PG_FICTITIOUS) == 0 &&
4200 vm_reserv_level_iffullpop(m) == 0) {
4201 pmap_promote_l2(pmap, pde, va, &lock);
4202 }
4203 #endif
4204
4205 rv = KERN_SUCCESS;
4206 out:
4207 if (lock != NULL)
4208 rw_wunlock(lock);
4209 PMAP_UNLOCK(pmap);
4210 return (rv);
4211 }
4212
4213 /*
4214 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true
4215 * if successful. Returns false if (1) a page table page cannot be allocated
4216 * without sleeping, (2) a mapping already exists at the specified virtual
4217 * address, or (3) a PV entry cannot be allocated without reclaiming another
4218 * PV entry.
4219 */
4220 static bool
pmap_enter_2mpage(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)4221 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4222 struct rwlock **lockp)
4223 {
4224 pd_entry_t new_l2;
4225
4226 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4227 PMAP_ASSERT_STAGE1(pmap);
4228 KASSERT(ADDR_IS_CANONICAL(va),
4229 ("%s: Address not in canonical form: %lx", __func__, va));
4230
4231 new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
4232 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
4233 L2_BLOCK);
4234 if ((m->oflags & VPO_UNMANAGED) == 0) {
4235 new_l2 |= ATTR_SW_MANAGED;
4236 new_l2 &= ~ATTR_AF;
4237 }
4238 if ((prot & VM_PROT_EXECUTE) == 0 ||
4239 m->md.pv_memattr == VM_MEMATTR_DEVICE)
4240 new_l2 |= ATTR_S1_XN;
4241 if (!ADDR_IS_KERNEL(va))
4242 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4243 else
4244 new_l2 |= ATTR_S1_UXN;
4245 if (pmap != kernel_pmap)
4246 new_l2 |= ATTR_S1_nG;
4247 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
4248 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp) ==
4249 KERN_SUCCESS);
4250 }
4251
4252 /*
4253 * Returns true if every page table entry in the specified page table is
4254 * zero.
4255 */
4256 static bool
pmap_every_pte_zero(vm_paddr_t pa)4257 pmap_every_pte_zero(vm_paddr_t pa)
4258 {
4259 pt_entry_t *pt_end, *pte;
4260
4261 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
4262 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
4263 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
4264 if (*pte != 0)
4265 return (false);
4266 }
4267 return (true);
4268 }
4269
4270 /*
4271 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
4272 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
4273 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
4274 * a mapping already exists at the specified virtual address. Returns
4275 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
4276 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if
4277 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
4278 */
4279 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)4280 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
4281 vm_page_t m, struct rwlock **lockp)
4282 {
4283 struct spglist free;
4284 pd_entry_t *l2, old_l2;
4285 vm_page_t l2pg, mt;
4286
4287 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4288 KASSERT(ADDR_IS_CANONICAL(va),
4289 ("%s: Address not in canonical form: %lx", __func__, va));
4290
4291 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
4292 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
4293 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
4294 va, pmap);
4295 return (KERN_RESOURCE_SHORTAGE);
4296 }
4297
4298 /*
4299 * If there are existing mappings, either abort or remove them.
4300 */
4301 if ((old_l2 = pmap_load(l2)) != 0) {
4302 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
4303 ("pmap_enter_l2: l2pg's ref count is too low"));
4304 if ((flags & PMAP_ENTER_NOREPLACE) != 0 &&
4305 (!ADDR_IS_KERNEL(va) ||
4306 (old_l2 & ATTR_DESCR_MASK) == L2_BLOCK ||
4307 !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) {
4308 if (l2pg != NULL)
4309 l2pg->ref_count--;
4310 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx"
4311 " in pmap %p", va, pmap);
4312 return (KERN_FAILURE);
4313 }
4314 SLIST_INIT(&free);
4315 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
4316 (void)pmap_remove_l2(pmap, l2, va,
4317 pmap_load(pmap_l1(pmap, va)), &free, lockp);
4318 else
4319 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
4320 &free, lockp);
4321 if (!ADDR_IS_KERNEL(va)) {
4322 vm_page_free_pages_toq(&free, true);
4323 KASSERT(pmap_load(l2) == 0,
4324 ("pmap_enter_l2: non-zero L2 entry %p", l2));
4325 } else {
4326 KASSERT(SLIST_EMPTY(&free),
4327 ("pmap_enter_l2: freed kernel page table page"));
4328
4329 /*
4330 * Both pmap_remove_l2() and pmap_remove_l3_range()
4331 * will leave the kernel page table page zero filled.
4332 * Nonetheless, the TLB could have an intermediate
4333 * entry for the kernel page table page.
4334 */
4335 mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
4336 if (pmap_insert_pt_page(pmap, mt, false))
4337 panic("pmap_enter_l2: trie insert failed");
4338 pmap_clear(l2);
4339 pmap_invalidate_page(pmap, va);
4340 }
4341 }
4342
4343 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
4344 /*
4345 * Abort this mapping if its PV entry could not be created.
4346 */
4347 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
4348 if (l2pg != NULL)
4349 pmap_abort_ptp(pmap, va, l2pg);
4350 CTR2(KTR_PMAP,
4351 "pmap_enter_l2: failure for va %#lx in pmap %p",
4352 va, pmap);
4353 return (KERN_RESOURCE_SHORTAGE);
4354 }
4355 if ((new_l2 & ATTR_SW_DBM) != 0)
4356 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4357 vm_page_aflag_set(mt, PGA_WRITEABLE);
4358 }
4359
4360 /*
4361 * Increment counters.
4362 */
4363 if ((new_l2 & ATTR_SW_WIRED) != 0)
4364 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
4365 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
4366
4367 /*
4368 * Conditionally sync the icache. See pmap_enter() for details.
4369 */
4370 if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) !=
4371 (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) &&
4372 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
4373 cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK),
4374 L2_SIZE);
4375 }
4376
4377 /*
4378 * Map the superpage.
4379 */
4380 pmap_store(l2, new_l2);
4381 dsb(ishst);
4382
4383 atomic_add_long(&pmap_l2_mappings, 1);
4384 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
4385 va, pmap);
4386
4387 return (KERN_SUCCESS);
4388 }
4389
4390 /*
4391 * Maps a sequence of resident pages belonging to the same object.
4392 * The sequence begins with the given page m_start. This page is
4393 * mapped at the given virtual address start. Each subsequent page is
4394 * mapped at a virtual address that is offset from start by the same
4395 * amount as the page is offset from m_start within the object. The
4396 * last page in the sequence is the page with the largest offset from
4397 * m_start that can be mapped at a virtual address less than the given
4398 * virtual address end. Not every virtual page between start and end
4399 * is mapped; only those for which a resident page exists with the
4400 * corresponding offset from m_start are mapped.
4401 */
4402 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)4403 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4404 vm_page_t m_start, vm_prot_t prot)
4405 {
4406 struct rwlock *lock;
4407 vm_offset_t va;
4408 vm_page_t m, mpte;
4409 vm_pindex_t diff, psize;
4410
4411 VM_OBJECT_ASSERT_LOCKED(m_start->object);
4412
4413 psize = atop(end - start);
4414 mpte = NULL;
4415 m = m_start;
4416 lock = NULL;
4417 PMAP_LOCK(pmap);
4418 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4419 va = start + ptoa(diff);
4420 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
4421 m->psind == 1 && pmap_ps_enabled(pmap) &&
4422 pmap_enter_2mpage(pmap, va, m, prot, &lock))
4423 m = &m[L2_SIZE / PAGE_SIZE - 1];
4424 else
4425 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
4426 &lock);
4427 m = TAILQ_NEXT(m, listq);
4428 }
4429 if (lock != NULL)
4430 rw_wunlock(lock);
4431 PMAP_UNLOCK(pmap);
4432 }
4433
4434 /*
4435 * this code makes some *MAJOR* assumptions:
4436 * 1. Current pmap & pmap exists.
4437 * 2. Not wired.
4438 * 3. Read access.
4439 * 4. No page table pages.
4440 * but is *MUCH* faster than pmap_enter...
4441 */
4442
4443 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)4444 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4445 {
4446 struct rwlock *lock;
4447
4448 lock = NULL;
4449 PMAP_LOCK(pmap);
4450 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4451 if (lock != NULL)
4452 rw_wunlock(lock);
4453 PMAP_UNLOCK(pmap);
4454 }
4455
4456 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)4457 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4458 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4459 {
4460 pd_entry_t *pde;
4461 pt_entry_t *l2, *l3, l3_val;
4462 vm_paddr_t pa;
4463 int lvl;
4464
4465 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
4466 (m->oflags & VPO_UNMANAGED) != 0,
4467 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4468 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4469 PMAP_ASSERT_STAGE1(pmap);
4470 KASSERT(ADDR_IS_CANONICAL(va),
4471 ("%s: Address not in canonical form: %lx", __func__, va));
4472
4473 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
4474 /*
4475 * In the case that a page table page is not
4476 * resident, we are creating it here.
4477 */
4478 if (!ADDR_IS_KERNEL(va)) {
4479 vm_pindex_t l2pindex;
4480
4481 /*
4482 * Calculate pagetable page index
4483 */
4484 l2pindex = pmap_l2_pindex(va);
4485 if (mpte && (mpte->pindex == l2pindex)) {
4486 mpte->ref_count++;
4487 } else {
4488 /*
4489 * Get the l2 entry
4490 */
4491 pde = pmap_pde(pmap, va, &lvl);
4492
4493 /*
4494 * If the page table page is mapped, we just increment
4495 * the hold count, and activate it. Otherwise, we
4496 * attempt to allocate a page table page. If this
4497 * attempt fails, we don't retry. Instead, we give up.
4498 */
4499 if (lvl == 1) {
4500 l2 = pmap_l1_to_l2(pde, va);
4501 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
4502 L2_BLOCK)
4503 return (NULL);
4504 }
4505 if (lvl == 2 && pmap_load(pde) != 0) {
4506 mpte =
4507 PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
4508 mpte->ref_count++;
4509 } else {
4510 /*
4511 * Pass NULL instead of the PV list lock
4512 * pointer, because we don't intend to sleep.
4513 */
4514 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
4515 if (mpte == NULL)
4516 return (mpte);
4517 }
4518 }
4519 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4520 l3 = &l3[pmap_l3_index(va)];
4521 } else {
4522 mpte = NULL;
4523 pde = pmap_pde(kernel_pmap, va, &lvl);
4524 KASSERT(pde != NULL,
4525 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
4526 va));
4527 KASSERT(lvl == 2,
4528 ("pmap_enter_quick_locked: Invalid level %d", lvl));
4529 l3 = pmap_l2_to_l3(pde, va);
4530 }
4531
4532 /*
4533 * Abort if a mapping already exists.
4534 */
4535 if (pmap_load(l3) != 0) {
4536 if (mpte != NULL)
4537 mpte->ref_count--;
4538 return (NULL);
4539 }
4540
4541 /*
4542 * Enter on the PV list if part of our managed memory.
4543 */
4544 if ((m->oflags & VPO_UNMANAGED) == 0 &&
4545 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4546 if (mpte != NULL)
4547 pmap_abort_ptp(pmap, va, mpte);
4548 return (NULL);
4549 }
4550
4551 /*
4552 * Increment counters
4553 */
4554 pmap_resident_count_inc(pmap, 1);
4555
4556 pa = VM_PAGE_TO_PHYS(m);
4557 l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
4558 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
4559 if ((prot & VM_PROT_EXECUTE) == 0 ||
4560 m->md.pv_memattr == VM_MEMATTR_DEVICE)
4561 l3_val |= ATTR_S1_XN;
4562 if (!ADDR_IS_KERNEL(va))
4563 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4564 else
4565 l3_val |= ATTR_S1_UXN;
4566 if (pmap != kernel_pmap)
4567 l3_val |= ATTR_S1_nG;
4568
4569 /*
4570 * Now validate mapping with RO protection
4571 */
4572 if ((m->oflags & VPO_UNMANAGED) == 0) {
4573 l3_val |= ATTR_SW_MANAGED;
4574 l3_val &= ~ATTR_AF;
4575 }
4576
4577 /* Sync icache before the mapping is stored to PTE */
4578 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
4579 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
4580 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4581
4582 pmap_store(l3, l3_val);
4583 dsb(ishst);
4584
4585 return (mpte);
4586 }
4587
4588 /*
4589 * This code maps large physical mmap regions into the
4590 * processor address space. Note that some shortcuts
4591 * are taken, but the code works.
4592 */
4593 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)4594 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4595 vm_pindex_t pindex, vm_size_t size)
4596 {
4597
4598 VM_OBJECT_ASSERT_WLOCKED(object);
4599 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4600 ("pmap_object_init_pt: non-device object"));
4601 }
4602
4603 /*
4604 * Clear the wired attribute from the mappings for the specified range of
4605 * addresses in the given pmap. Every valid mapping within that range
4606 * must have the wired attribute set. In contrast, invalid mappings
4607 * cannot have the wired attribute set, so they are ignored.
4608 *
4609 * The wired attribute of the page table entry is not a hardware feature,
4610 * so there is no need to invalidate any TLB entries.
4611 */
4612 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4613 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4614 {
4615 vm_offset_t va_next;
4616 pd_entry_t *l0, *l1, *l2;
4617 pt_entry_t *l3;
4618
4619 PMAP_LOCK(pmap);
4620 for (; sva < eva; sva = va_next) {
4621 l0 = pmap_l0(pmap, sva);
4622 if (pmap_load(l0) == 0) {
4623 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4624 if (va_next < sva)
4625 va_next = eva;
4626 continue;
4627 }
4628
4629 l1 = pmap_l0_to_l1(l0, sva);
4630 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4631 if (va_next < sva)
4632 va_next = eva;
4633 if (pmap_load(l1) == 0)
4634 continue;
4635
4636 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4637 KASSERT(va_next <= eva,
4638 ("partial update of non-transparent 1G page "
4639 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4640 pmap_load(l1), sva, eva, va_next));
4641 MPASS(pmap != kernel_pmap);
4642 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
4643 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
4644 pmap_clear_bits(l1, ATTR_SW_WIRED);
4645 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
4646 continue;
4647 }
4648
4649 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4650 if (va_next < sva)
4651 va_next = eva;
4652
4653 l2 = pmap_l1_to_l2(l1, sva);
4654 if (pmap_load(l2) == 0)
4655 continue;
4656
4657 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4658 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
4659 panic("pmap_unwire: l2 %#jx is missing "
4660 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
4661
4662 /*
4663 * Are we unwiring the entire large page? If not,
4664 * demote the mapping and fall through.
4665 */
4666 if (sva + L2_SIZE == va_next && eva >= va_next) {
4667 pmap_clear_bits(l2, ATTR_SW_WIRED);
4668 pmap->pm_stats.wired_count -= L2_SIZE /
4669 PAGE_SIZE;
4670 continue;
4671 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4672 panic("pmap_unwire: demotion failed");
4673 }
4674 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4675 ("pmap_unwire: Invalid l2 entry after demotion"));
4676
4677 if (va_next > eva)
4678 va_next = eva;
4679 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
4680 sva += L3_SIZE) {
4681 if (pmap_load(l3) == 0)
4682 continue;
4683 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
4684 panic("pmap_unwire: l3 %#jx is missing "
4685 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
4686
4687 /*
4688 * ATTR_SW_WIRED must be cleared atomically. Although
4689 * the pmap lock synchronizes access to ATTR_SW_WIRED,
4690 * the System MMU may write to the entry concurrently.
4691 */
4692 pmap_clear_bits(l3, ATTR_SW_WIRED);
4693 pmap->pm_stats.wired_count--;
4694 }
4695 }
4696 PMAP_UNLOCK(pmap);
4697 }
4698
4699 /*
4700 * Copy the range specified by src_addr/len
4701 * from the source map to the range dst_addr/len
4702 * in the destination map.
4703 *
4704 * This routine is only advisory and need not do anything.
4705 *
4706 * Because the executable mappings created by this routine are copied,
4707 * it should not have to flush the instruction cache.
4708 */
4709 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)4710 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4711 vm_offset_t src_addr)
4712 {
4713 struct rwlock *lock;
4714 pd_entry_t *l0, *l1, *l2, srcptepaddr;
4715 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
4716 vm_offset_t addr, end_addr, va_next;
4717 vm_page_t dst_m, dstmpte, srcmpte;
4718
4719 PMAP_ASSERT_STAGE1(dst_pmap);
4720 PMAP_ASSERT_STAGE1(src_pmap);
4721
4722 if (dst_addr != src_addr)
4723 return;
4724 end_addr = src_addr + len;
4725 lock = NULL;
4726 if (dst_pmap < src_pmap) {
4727 PMAP_LOCK(dst_pmap);
4728 PMAP_LOCK(src_pmap);
4729 } else {
4730 PMAP_LOCK(src_pmap);
4731 PMAP_LOCK(dst_pmap);
4732 }
4733 for (addr = src_addr; addr < end_addr; addr = va_next) {
4734 l0 = pmap_l0(src_pmap, addr);
4735 if (pmap_load(l0) == 0) {
4736 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
4737 if (va_next < addr)
4738 va_next = end_addr;
4739 continue;
4740 }
4741
4742 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
4743 if (va_next < addr)
4744 va_next = end_addr;
4745 l1 = pmap_l0_to_l1(l0, addr);
4746 if (pmap_load(l1) == 0)
4747 continue;
4748 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4749 KASSERT(va_next <= end_addr,
4750 ("partial update of non-transparent 1G page "
4751 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
4752 pmap_load(l1), addr, end_addr, va_next));
4753 srcptepaddr = pmap_load(l1);
4754 l1 = pmap_l1(dst_pmap, addr);
4755 if (l1 == NULL) {
4756 if (_pmap_alloc_l3(dst_pmap,
4757 pmap_l0_pindex(addr), NULL) == NULL)
4758 break;
4759 l1 = pmap_l1(dst_pmap, addr);
4760 } else {
4761 l0 = pmap_l0(dst_pmap, addr);
4762 dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) &
4763 ~ATTR_MASK);
4764 dst_m->ref_count++;
4765 }
4766 KASSERT(pmap_load(l1) == 0,
4767 ("1G mapping present in dst pmap "
4768 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
4769 pmap_load(l1), addr, end_addr, va_next));
4770 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
4771 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
4772 continue;
4773 }
4774
4775 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
4776 if (va_next < addr)
4777 va_next = end_addr;
4778 l2 = pmap_l1_to_l2(l1, addr);
4779 srcptepaddr = pmap_load(l2);
4780 if (srcptepaddr == 0)
4781 continue;
4782 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4783 /*
4784 * We can only virtual copy whole superpages.
4785 */
4786 if ((addr & L2_OFFSET) != 0 ||
4787 addr + L2_SIZE > end_addr)
4788 continue;
4789 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
4790 if (l2 == NULL)
4791 break;
4792 if (pmap_load(l2) == 0 &&
4793 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
4794 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
4795 PMAP_ENTER_NORECLAIM, &lock))) {
4796 /*
4797 * We leave the dirty bit unchanged because
4798 * managed read/write superpage mappings are
4799 * required to be dirty. However, managed
4800 * superpage mappings are not required to
4801 * have their accessed bit set, so we clear
4802 * it because we don't know if this mapping
4803 * will be used.
4804 */
4805 srcptepaddr &= ~ATTR_SW_WIRED;
4806 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
4807 srcptepaddr &= ~ATTR_AF;
4808 pmap_store(l2, srcptepaddr);
4809 pmap_resident_count_inc(dst_pmap, L2_SIZE /
4810 PAGE_SIZE);
4811 atomic_add_long(&pmap_l2_mappings, 1);
4812 } else
4813 pmap_abort_ptp(dst_pmap, addr, dst_m);
4814 continue;
4815 }
4816 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
4817 ("pmap_copy: invalid L2 entry"));
4818 srcptepaddr &= ~ATTR_MASK;
4819 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4820 KASSERT(srcmpte->ref_count > 0,
4821 ("pmap_copy: source page table page is unused"));
4822 if (va_next > end_addr)
4823 va_next = end_addr;
4824 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4825 src_pte = &src_pte[pmap_l3_index(addr)];
4826 dstmpte = NULL;
4827 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
4828 ptetemp = pmap_load(src_pte);
4829
4830 /*
4831 * We only virtual copy managed pages.
4832 */
4833 if ((ptetemp & ATTR_SW_MANAGED) == 0)
4834 continue;
4835
4836 if (dstmpte != NULL) {
4837 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
4838 ("dstmpte pindex/addr mismatch"));
4839 dstmpte->ref_count++;
4840 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
4841 NULL)) == NULL)
4842 goto out;
4843 dst_pte = (pt_entry_t *)
4844 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4845 dst_pte = &dst_pte[pmap_l3_index(addr)];
4846 if (pmap_load(dst_pte) == 0 &&
4847 pmap_try_insert_pv_entry(dst_pmap, addr,
4848 PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) {
4849 /*
4850 * Clear the wired, modified, and accessed
4851 * (referenced) bits during the copy.
4852 */
4853 mask = ATTR_AF | ATTR_SW_WIRED;
4854 nbits = 0;
4855 if ((ptetemp & ATTR_SW_DBM) != 0)
4856 nbits |= ATTR_S1_AP_RW_BIT;
4857 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
4858 pmap_resident_count_inc(dst_pmap, 1);
4859 } else {
4860 pmap_abort_ptp(dst_pmap, addr, dstmpte);
4861 goto out;
4862 }
4863 /* Have we copied all of the valid mappings? */
4864 if (dstmpte->ref_count >= srcmpte->ref_count)
4865 break;
4866 }
4867 }
4868 out:
4869 /*
4870 * XXX This barrier may not be needed because the destination pmap is
4871 * not active.
4872 */
4873 dsb(ishst);
4874
4875 if (lock != NULL)
4876 rw_wunlock(lock);
4877 PMAP_UNLOCK(src_pmap);
4878 PMAP_UNLOCK(dst_pmap);
4879 }
4880
4881 /*
4882 * pmap_zero_page zeros the specified hardware page by mapping
4883 * the page into KVM and using bzero to clear its contents.
4884 */
4885 void
pmap_zero_page(vm_page_t m)4886 pmap_zero_page(vm_page_t m)
4887 {
4888 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4889
4890 pagezero((void *)va);
4891 }
4892
4893 /*
4894 * pmap_zero_page_area zeros the specified hardware page by mapping
4895 * the page into KVM and using bzero to clear its contents.
4896 *
4897 * off and size may not cover an area beyond a single hardware page.
4898 */
4899 void
pmap_zero_page_area(vm_page_t m,int off,int size)4900 pmap_zero_page_area(vm_page_t m, int off, int size)
4901 {
4902 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4903
4904 if (off == 0 && size == PAGE_SIZE)
4905 pagezero((void *)va);
4906 else
4907 bzero((char *)va + off, size);
4908 }
4909
4910 /*
4911 * pmap_copy_page copies the specified (machine independent)
4912 * page by mapping the page into virtual memory and using
4913 * bcopy to copy the page, one machine dependent page at a
4914 * time.
4915 */
4916 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)4917 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4918 {
4919 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4920 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4921
4922 pagecopy((void *)src, (void *)dst);
4923 }
4924
4925 int unmapped_buf_allowed = 1;
4926
4927 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)4928 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4929 vm_offset_t b_offset, int xfersize)
4930 {
4931 void *a_cp, *b_cp;
4932 vm_page_t m_a, m_b;
4933 vm_paddr_t p_a, p_b;
4934 vm_offset_t a_pg_offset, b_pg_offset;
4935 int cnt;
4936
4937 while (xfersize > 0) {
4938 a_pg_offset = a_offset & PAGE_MASK;
4939 m_a = ma[a_offset >> PAGE_SHIFT];
4940 p_a = m_a->phys_addr;
4941 b_pg_offset = b_offset & PAGE_MASK;
4942 m_b = mb[b_offset >> PAGE_SHIFT];
4943 p_b = m_b->phys_addr;
4944 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4945 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4946 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
4947 panic("!DMAP a %lx", p_a);
4948 } else {
4949 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
4950 }
4951 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
4952 panic("!DMAP b %lx", p_b);
4953 } else {
4954 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4955 }
4956 bcopy(a_cp, b_cp, cnt);
4957 a_offset += cnt;
4958 b_offset += cnt;
4959 xfersize -= cnt;
4960 }
4961 }
4962
4963 vm_offset_t
pmap_quick_enter_page(vm_page_t m)4964 pmap_quick_enter_page(vm_page_t m)
4965 {
4966
4967 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
4968 }
4969
4970 void
pmap_quick_remove_page(vm_offset_t addr)4971 pmap_quick_remove_page(vm_offset_t addr)
4972 {
4973 }
4974
4975 /*
4976 * Returns true if the pmap's pv is one of the first
4977 * 16 pvs linked to from this page. This count may
4978 * be changed upwards or downwards in the future; it
4979 * is only necessary that true be returned for a small
4980 * subset of pmaps for proper page aging.
4981 */
4982 boolean_t
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)4983 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4984 {
4985 struct md_page *pvh;
4986 struct rwlock *lock;
4987 pv_entry_t pv;
4988 int loops = 0;
4989 boolean_t rv;
4990
4991 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4992 ("pmap_page_exists_quick: page %p is not managed", m));
4993 rv = FALSE;
4994 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4995 rw_rlock(lock);
4996 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4997 if (PV_PMAP(pv) == pmap) {
4998 rv = TRUE;
4999 break;
5000 }
5001 loops++;
5002 if (loops >= 16)
5003 break;
5004 }
5005 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5006 pvh = page_to_pvh(m);
5007 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5008 if (PV_PMAP(pv) == pmap) {
5009 rv = TRUE;
5010 break;
5011 }
5012 loops++;
5013 if (loops >= 16)
5014 break;
5015 }
5016 }
5017 rw_runlock(lock);
5018 return (rv);
5019 }
5020
/*
 *	pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 *
 *	An unmanaged page always yields zero.  Otherwise the page's own pv
 *	list is scanned and, for a non-fictitious page, so is the pv list
 *	returned by page_to_pvh().  A mapping is counted when its page table
 *	entry exists and has ATTR_SW_WIRED set.
 *
 *	The pv list lock is held for reading.  Each pmap is locked while its
 *	entry is examined; see the comments below for how lock contention is
 *	handled.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	struct rwlock *lock;
	struct md_page *pvh;
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;
	int count, lvl, md_gen, pvh_gen;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (0);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	/* The count is reset on every restart so nothing is counted twice. */
	count = 0;
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock is contended.  Record the list's
			 * generation, drop the pv list lock, block on the
			 * pmap lock, and retake the pv list lock.  If the
			 * generation changed while the pv list lock was
			 * dropped, start over.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va, &lvl);
		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			if (!PMAP_TRYLOCK(pmap)) {
				/*
				 * Same protocol as above, but here a change
				 * to either list's generation forces a
				 * restart.
				 */
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte(pmap, pv->pv_va, &lvl);
			if (pte != NULL &&
			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
				count++;
			PMAP_UNLOCK(pmap);
		}
	}
	rw_runlock(lock);
	return (count);
}
5086
5087 /*
5088 * Returns true if the given page is mapped individually or as part of
5089 * a 2mpage. Otherwise, returns false.
5090 */
5091 bool
pmap_page_is_mapped(vm_page_t m)5092 pmap_page_is_mapped(vm_page_t m)
5093 {
5094 struct rwlock *lock;
5095 bool rv;
5096
5097 if ((m->oflags & VPO_UNMANAGED) != 0)
5098 return (false);
5099 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5100 rw_rlock(lock);
5101 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5102 ((m->flags & PG_FICTITIOUS) == 0 &&
5103 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
5104 rw_runlock(lock);
5105 return (rv);
5106 }
5107
/*
 * Destroy all managed, non-wired mappings in the given user-space
 * pmap.  This pmap cannot be active on any processor besides the
 * caller.
 *
 * This function cannot be applied to the kernel pmap.  Moreover, it
 * is not intended for general use.  It is only to be used during
 * process termination.  Consequently, it can be implemented in ways
 * that make it faster than pmap_remove().  First, it can more quickly
 * destroy mappings by iterating over the pmap's collection of PV
 * entries, rather than searching the page table.  Second, it doesn't
 * have to test and clear the page table entries atomically, because
 * no processor is currently accessing the user address space.  In
 * particular, a page table entry's dirty bit won't change state once
 * this function starts.
 *
 * Outline: for every pv chunk of the pmap, every allocated pv entry is
 * visited.  Wired mappings are skipped (and keep their chunk alive); every
 * other mapping has its page table entry cleared, its dirty state
 * propagated to the vm_page, and its pv entry unlinked.  Page table pages
 * that become free are collected on a local list and released after the
 * pmap lock has been dropped.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	struct spglist free;
	vm_page_t m, ml3, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct rwlock *lock;
	int64_t bit;
	uint64_t inuse, bitmask;
	int allfree, field, freed, idx, lvl;
	vm_paddr_t pa;

	/* No pv list lock is held yet; one is acquired lazily below. */
	lock = NULL;

	SLIST_INIT(&free);
	PMAP_LOCK(pmap);
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		/* Cleared if any (wired) entry of this chunk is kept. */
		allfree = 1;
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/*
			 * A set bit in pc_map marks a free entry, so the
			 * allocated entries are the clear bits within the
			 * valid mask for this field.
			 */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				/* Take the lowest allocated entry. */
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * 64 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va, &lvl);
				KASSERT(pde != NULL,
				    ("Attempting to remove an unmapped page"));

				/*
				 * Locate the mapping's entry: an L2 block
				 * when the walk stopped at level 1, an L3
				 * page when it stopped at level 2.
				 */
				switch(lvl) {
				case 1:
					pte = pmap_l1_to_l2(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L2_BLOCK,
					    ("Attempting to remove an invalid "
					    "block: %lx", tpte));
					break;
				case 2:
					pte = pmap_l2_to_l3(pde, pv->pv_va);
					tpte = pmap_load(pte);
					KASSERT((tpte & ATTR_DESCR_MASK) ==
					    L3_PAGE,
					    ("Attempting to remove an invalid "
					    "page: %lx", tpte));
					break;
				default:
					panic(
					    "Invalid page directory level: %d",
					    lvl);
				}

				/*
				 * We cannot remove wired pages from a
				 * process' mapping at this time.  The pv
				 * entry stays allocated, so the chunk must
				 * not be freed.
				 */
				if (tpte & ATTR_SW_WIRED) {
					allfree = 0;
					continue;
				}

				/* Mark free */
				pc->pc_map[field] |= bitmask;

				/*
				 * Because this pmap is not active on other
				 * processors, the dirty bit cannot have
				 * changed state since we last loaded pte.
				 */
				pmap_clear(pte);

				pa = tpte & ~ATTR_MASK;

				m = PHYS_TO_VM_PAGE(pa);
				KASSERT(m->phys_addr == pa,
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad pte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean/reference bits.
				 * A dirty block mapping dirties every 4KB
				 * page that it covers.
				 */
				if (pmap_pte_dirty(pmap, tpte)) {
					switch (lvl) {
					case 1:
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
						break;
					case 2:
						vm_page_dirty(m);
						break;
					}
				}

				/* Hold the pv list lock that covers "m". */
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

				switch (lvl) {
				case 1:
					/*
					 * Block mapping: unlink the pv entry
					 * from the pvh list.  Once that list
					 * is empty, PGA_WRITEABLE is cleared
					 * on each covered page that has no
					 * 4KB mappings left either.
					 */
					pmap_resident_count_dec(pmap,
					    L2_SIZE / PAGE_SIZE);
					pvh = page_to_pvh(m);
					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
					pvh->pv_gen++;
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
							    TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/*
					 * If a page table page was saved for
					 * this address, queue it for freeing.
					 */
					ml3 = pmap_remove_pt_page(pmap,
					    pv->pv_va);
					if (ml3 != NULL) {
						KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
						    ("pmap_remove_pages: l3 page not promoted"));
						pmap_resident_count_dec(pmap,1);
						KASSERT(ml3->ref_count == NL3PG,
						    ("pmap_remove_pages: l3 page ref count error"));
						ml3->ref_count = 0;
						pmap_add_delayed_free_list(ml3,
						    &free, FALSE);
					}
					break;
				case 2:
					/*
					 * 4KB mapping: unlink the pv entry
					 * from the page's own list.
					 * PGA_WRITEABLE is cleared only when
					 * no mapping of either size remains.
					 */
					pmap_resident_count_dec(pmap, 1);
					TAILQ_REMOVE(&m->md.pv_list, pv,
					    pv_next);
					m->md.pv_gen++;
					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
					    TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = page_to_pvh(m);
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m,
							    PGA_WRITEABLE);
					}
					break;
				}
				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
				    &free);
				freed++;
			}
		}
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		/* Release the chunk only if none of its entries was kept. */
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	if (lock != NULL)
		rw_wunlock(lock);
	pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
	/* Free the collected page table pages after dropping the pmap lock. */
	vm_page_free_pages_toq(&free, true);
}
5291
/*
 * This is used to check if a page has been accessed or modified.
 *
 * "accessed" and "modified" select which tests are applied to each mapping
 * of "m":
 *   - modified: the entry's ATTR_S1_AP_RW_BIT field must equal
 *     ATTR_S1_AP(ATTR_S1_AP_RW);
 *   - accessed: ATTR_AF must be set and the descriptor type must be the
 *     expected one (L3_PAGE for entries on the page's own pv list, L2_BLOCK
 *     for entries on the page_to_pvh() list).
 * When both are requested, a single mapping must satisfy both.  When
 * neither is requested the mask is empty, so the first mapping examined
 * matches.
 *
 * Returns TRUE as soon as one mapping passes, and FALSE if none does
 * (including when the page has no mappings).  The pv list lock is held for
 * reading; each pmap is locked while its entry is read.
 */
static boolean_t
pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
{
	struct rwlock *lock;
	pv_entry_t pv;
	struct md_page *pvh;
	pt_entry_t *pte, mask, value;
	pmap_t pmap;
	int lvl, md_gen, pvh_gen;
	boolean_t rv;

	rv = FALSE;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	/* Pass 1: mappings on the page's own pv list (level 3 entries). */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Contended pmap lock: drop the pv list lock, block
			 * on the pmap lock, retake the pv list lock, and
			 * restart if the list generation changed meanwhile.
			 */
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va, &lvl);
		KASSERT(lvl == 3,
		    ("pmap_page_test_mappings: Invalid level %d", lvl));
		/* Build the bit pattern that this entry must match. */
		mask = 0;
		value = 0;
		if (modified) {
			mask |= ATTR_S1_AP_RW_BIT;
			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
		}
		if (accessed) {
			mask |= ATTR_AF | ATTR_DESCR_MASK;
			value |= ATTR_AF | L3_PAGE;
		}
		rv = (pmap_load(pte) & mask) == value;
		PMAP_UNLOCK(pmap);
		if (rv)
			goto out;
	}
	/* Pass 2: mappings on the page_to_pvh() list (level 2 entries). */
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = page_to_pvh(m);
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			PMAP_ASSERT_STAGE1(pmap);
			if (!PMAP_TRYLOCK(pmap)) {
				/* As above, but both generations count. */
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte(pmap, pv->pv_va, &lvl);
			KASSERT(lvl == 2,
			    ("pmap_page_test_mappings: Invalid level %d", lvl));
			mask = 0;
			value = 0;
			if (modified) {
				mask |= ATTR_S1_AP_RW_BIT;
				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
			}
			if (accessed) {
				mask |= ATTR_AF | ATTR_DESCR_MASK;
				value |= ATTR_AF | L2_BLOCK;
			}
			rv = (pmap_load(pte) & mask) == value;
			PMAP_UNLOCK(pmap);
			if (rv)
				goto out;
		}
	}
out:
	rw_runlock(lock);
	return (rv);
}
5381
5382 /*
5383 * pmap_is_modified:
5384 *
5385 * Return whether or not the specified physical page was modified
5386 * in any physical maps.
5387 */
5388 boolean_t
pmap_is_modified(vm_page_t m)5389 pmap_is_modified(vm_page_t m)
5390 {
5391
5392 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5393 ("pmap_is_modified: page %p is not managed", m));
5394
5395 /*
5396 * If the page is not busied then this check is racy.
5397 */
5398 if (!pmap_page_is_write_mapped(m))
5399 return (FALSE);
5400 return (pmap_page_test_mappings(m, FALSE, TRUE));
5401 }
5402
5403 /*
5404 * pmap_is_prefaultable:
5405 *
5406 * Return whether or not the specified virtual address is eligible
5407 * for prefault.
5408 */
5409 boolean_t
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)5410 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5411 {
5412 pt_entry_t *pte;
5413 boolean_t rv;
5414 int lvl;
5415
5416 rv = FALSE;
5417 PMAP_LOCK(pmap);
5418 pte = pmap_pte(pmap, addr, &lvl);
5419 if (pte != NULL && pmap_load(pte) != 0) {
5420 rv = TRUE;
5421 }
5422 PMAP_UNLOCK(pmap);
5423 return (rv);
5424 }
5425
5426 /*
5427 * pmap_is_referenced:
5428 *
5429 * Return whether or not the specified physical page was referenced
5430 * in any physical maps.
5431 */
5432 boolean_t
pmap_is_referenced(vm_page_t m)5433 pmap_is_referenced(vm_page_t m)
5434 {
5435
5436 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5437 ("pmap_is_referenced: page %p is not managed", m));
5438 return (pmap_page_test_mappings(m, TRUE, FALSE));
5439 }
5440
/*
 * Clear the write and modified bits in each of the given page's mappings.
 *
 * The page must be managed and busied.  If the page is not write mapped
 * the function returns immediately.  Otherwise:
 *   1. every mapping on the page_to_pvh() list whose entry has ATTR_SW_DBM
 *      set is demoted, so that it can be handled as 4KB mappings below;
 *   2. every mapping on the page's own pv list whose entry has ATTR_SW_DBM
 *      set has ATTR_SW_DBM cleared and ATTR_S1_AP_RW_BIT set, and its TLB
 *      entry is invalidated.  If the old entry's ATTR_S1_AP_RW_BIT field
 *      equalled ATTR_S1_AP(ATTR_S1_AP_RW), the page is marked dirty first.
 * Finally PGA_WRITEABLE is cleared on the page.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;
	int lvl, md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/* Fictitious pages use the pv_dummy list head instead. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/* The _SAFE iterator is needed because demotion may unlink "pv". */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Contended pmap lock: drop the pv list lock, block
			 * on the pmap lock, retake the pv list lock, and
			 * retry if the list generation changed meanwhile.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte(pmap, va, &lvl);
		/* The demotion result is deliberately ignored. */
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/* As above, but both generations count. */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va, &lvl);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			/*
			 * Update the entry with a compare-and-set loop.  On
			 * failure atomic_fcmpset_64() refreshes "oldpte", so
			 * after the loop it holds the value that was
			 * actually replaced.
			 */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM))
				cpu_spinwait();
			if ((oldpte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RW))
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}
5518
/*
 *	pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	The returned value is cleared + not_cleared: reference bits that were
 *	found and cleared plus those that were found but deliberately left
 *	set.  The page must be managed.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, lvl, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	cleared = 0;
	pa = VM_PAGE_TO_PHYS(m);
	lock = PHYS_TO_PV_LIST_LOCK(pa);
	/* Fictitious pages use the pv_dummy list head instead. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/*
	 * "cleared" survives a retry because those bits really were
	 * cleared; "not_cleared" is recounted from scratch.
	 */
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	/*
	 * Walk the 2MB mappings.  Each visited entry is rotated to the tail
	 * of the list, so the loop always examines the list head and stops
	 * when the first entry visited ("pvf") is at the head again.
	 */
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Contended pmap lock: drop the pv list lock, block
			 * on the pmap lock, retake the pv list lock, and
			 * retry if the list generation changed meanwhile.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pde = pmap_pde(pmap, va, &lvl);
		KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
		KASSERT(lvl == 1,
		    ("pmap_ts_referenced: invalid pde level %d", lvl));
		tpde = pmap_load(pde);
		KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
		    ("pmap_ts_referenced: found an invalid l1 table"));
		pte = pmap_l1_to_l2(pde, va);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}

		if ((tpte & ATTR_AF) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	/* Walk the 4KB mappings using the same rotate-and-stop scheme. */
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* As above, but both generations count. */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
		KASSERT(lvl == 2,
		    ("pmap_ts_referenced: invalid pde level %d", lvl));
		tpde = pmap_load(pde);
		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_ts_referenced: found an invalid l2 table"));
		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			/* A wired mapping keeps its reference bit. */
			if ((tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	/* NOTE(review): nothing in this function adds pages to "free". */
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}
5685
/*
 *	Apply the given advice to the specified range of addresses within the
 *	given pmap.  Depending on the advice, clear the referenced and/or
 *	modified flags in each mapping and set the mapped page's dirty field.
 *
 *	Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice
 *	returns immediately.  The range [sva, eva) is walked one L2 region at
 *	a time with the pmap lock held.  Managed 2MB block mappings are
 *	demoted first so that the work can be done on 4KB entries.
 *	Unmanaged mappings and L1 block mappings are left untouched.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	vm_offset_t va, va_next;
	vm_page_t m;
	pd_entry_t *l0, *l1, *l2, oldl2;
	pt_entry_t *l3, oldl3;

	PMAP_ASSERT_STAGE1(pmap);

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		/*
		 * At each level, "va_next" is set to the start of the next
		 * region of that level's size; if that computation wraps
		 * around, it is clamped to "eva".
		 */
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/* L1 block mappings are skipped as a whole. */
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l2 = pmap_l1_to_l2(l1, sva);
		oldl2 = pmap_load(l2);
		if (oldl2 == 0)
			continue;
		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			if ((oldl2 & ATTR_SW_MANAGED) == 0)
				continue;
			lock = NULL;
			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The 2MB page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Choosing the last page
			 * within the address range [sva, min(va_next, eva))
			 * generally results in more repromotions.  Since the
			 * underlying page table page is fully populated, this
			 * removal never frees a page table page.
			 */
			if ((oldl2 & ATTR_SW_WIRED) == 0) {
				va = eva;
				if (va > va_next)
					va = va_next;
				va -= PAGE_SIZE;
				KASSERT(va >= sva,
				    ("pmap_advise: no address gap"));
				l3 = pmap_l2_to_l3(l2, va);
				KASSERT(pmap_load(l3) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
				    NULL, &lock);
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_advise: invalid L2 entry after demotion"));
		if (va_next > eva)
			va_next = eva;
		/*
		 * "va" is the start of the run of modified entries that still
		 * needs a TLB invalidation; va == va_next means that no run
		 * is pending.
		 */
		va = va_next;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			oldl3 = pmap_load(l3);
			/* Only managed, valid L3 page entries are changed. */
			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
			    (ATTR_SW_MANAGED | L3_PAGE))
				goto maybe_invlrng;
			else if (pmap_pte_dirty(pmap, oldl3)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK);
					vm_page_dirty(m);
				}
				/*
				 * Clear ATTR_AF and apply
				 * ATTR_S1_AP(ATTR_S1_AP_RO) in one atomic
				 * update, retrying on concurrent change.
				 */
				while (!atomic_fcmpset_long(l3, &oldl3,
				    (oldl3 & ~ATTR_AF) |
				    ATTR_S1_AP(ATTR_S1_AP_RO)))
					cpu_spinwait();
			} else if ((oldl3 & ATTR_AF) != 0)
				pmap_clear_bits(l3, ATTR_AF);
			else
				goto maybe_invlrng;
			/* This entry changed: open a run if none is pending. */
			if (va == va_next)
				va = sva;
			continue;
maybe_invlrng:
			/* This entry is unchanged: flush any pending run. */
			if (va != va_next) {
				pmap_invalidate_range(pmap, va, sva);
				va = va_next;
			}
		}
		/* Flush a run that extends to the end of this L2 region. */
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
	PMAP_UNLOCK(pmap);
}
5818
/*
 *	Clear the modify bits on the specified physical page.
 *
 *	The page must be managed and busied.  If the page is not write mapped
 *	the function returns immediately.  2MB mappings with ATTR_SW_DBM set
 *	are demoted and, unless wired, the 4KB entry for this page is write
 *	protected.  4KB mappings that have ATTR_SW_DBM set and
 *	ATTR_S1_AP_RW_BIT clear get ATTR_S1_AP(ATTR_S1_AP_RO) applied.  Every
 *	changed entry is followed by a TLB invalidation of its address.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3, oldl3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	/* Fictitious pages use the pv_dummy list head instead. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
restart:
	/* The _SAFE iterator is needed because demotion may unlink "pv". */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Contended pmap lock: drop the pv list lock, block
			 * on the pmap lock, retake the pv list lock, and
			 * restart if the list generation changed meanwhile.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
		if ((oldl2 & ATTR_SW_DBM) != 0 &&
		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
		    (oldl2 & ATTR_SW_WIRED) == 0) {
			/*
			 * Write protect the mapping to a single page so that
			 * a subsequent write access may repromote.
			 *
			 * "va" is advanced by this page's offset within the
			 * physical range that the old block entry mapped.
			 */
			va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK);
			l3 = pmap_l2_to_l3(l2, va);
			oldl3 = pmap_load(l3);
			while (!atomic_fcmpset_long(l3, &oldl3,
			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
				cpu_spinwait();
			vm_page_dirty(m);
			pmap_invalidate_page(pmap, va);
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/* As above, but both generations count. */
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		oldl3 = pmap_load(l3);
		/*
		 * Act only on entries with ATTR_SW_DBM set and
		 * ATTR_S1_AP_RW_BIT clear.
		 */
		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){
			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
}
5904
/*
 * Map the physical range [pa, pa + size) into the kernel's address space
 * and return a pointer to the virtual address that corresponds to "pa".
 *
 * Before the VM system is initialized (!vm_initialized) the range is mapped
 * with whole L2 blocks taken from the reserved area starting at
 * preinit_map_va, and the allocation is recorded in pmap_preinit_mapping[].
 * In that mode NULL is returned if "size" is zero or if preinit_map_va is
 * zero, and the function panics if not enough contiguous free slots exist.
 *
 * Afterwards the range is mapped page by page into KVA obtained from
 * kva_alloc(); the function panics if that allocation fails.
 */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	pd_entry_t *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, free_l2_count, start_idx;

	if (!vm_initialized) {
		/*
		 * No L3 ptables so map entire L2 blocks where start VA is:
		 * 	preinit_map_va + start_idx * L2_SIZE
		 * There may be duplicate mappings (multiple VA -> same PA) but
		 * ARM64 dcache is always PIPT so that's acceptable.
		 */
		if (size == 0)
			return (NULL);

		/* Calculate how many L2 blocks are needed for the mapping */
		l2_blocks = (roundup2(pa + size, L2_SIZE) -
		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;

		/* Offset of "pa" within its L2 block. */
		offset = pa & L2_OFFSET;

		if (preinit_map_va == 0)
			return (NULL);

		/* Map 2MiB L2 blocks from reserved VA space */

		free_l2_count = 0;
		start_idx = -1;
		/* Find enough free contiguous VA space */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (free_l2_count > 0 && ppim->pa != 0) {
				/* Not enough space here */
				free_l2_count = 0;
				start_idx = -1;
				continue;
			}

			/* A slot whose "pa" is zero is treated as free. */
			if (ppim->pa == 0) {
				/* Free L2 block */
				if (start_idx == -1)
					start_idx = i;
				free_l2_count++;
				if (free_l2_count == l2_blocks)
					break;
			}
		}
		if (free_l2_count != l2_blocks)
			panic("%s: too many preinit mappings", __func__);

		va = preinit_map_va + (start_idx * L2_SIZE);
		/*
		 * Every slot of the run records the same pa/va/size triple.
		 */
		for (i = start_idx; i < start_idx + l2_blocks; i++) {
			/* Mark entries as allocated */
			ppim = pmap_preinit_mapping + i;
			ppim->pa = pa;
			ppim->va = va + offset;
			ppim->size = size;
		}

		/* Map L2 blocks */
		pa = rounddown2(pa, L2_SIZE);
		for (i = 0; i < l2_blocks; i++) {
			pde = pmap_pde(kernel_pmap, va, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
			    va));
			KASSERT(lvl == 1,
			    ("pmap_mapbios: Invalid level %d", lvl));

			/* Insert L2_BLOCK */
			l2 = pmap_l1_to_l2(pde, va);
			pmap_load_store(l2,
			    pa | ATTR_DEFAULT | ATTR_S1_XN |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);

			va += L2_SIZE;
			pa += L2_SIZE;
		}
		pmap_invalidate_all(kernel_pmap);

		/* The loop advanced "va"; reset it to the start of the run. */
		va = preinit_map_va + (start_idx * L2_SIZE);

	} else {
		/* kva_alloc may be used to map the pages */
		offset = pa & PAGE_MASK;
		size = round_page(offset + size);

		va = kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);

		/* The lookup is only used for the level assertion. */
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));

		/* L3 table is linked */
		va = trunc_page(va);
		pa = trunc_page(pa);
		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
	}

	/* Re-apply the sub-block or sub-page offset of the original "pa". */
	return ((void *)(va + offset));
}
6011
/*
 * Remove a mapping of "size" bytes at "va" that was set up the way
 * pmap_mapbios() sets mappings up.
 *
 * The pmap_preinit_mapping[] table is searched first.  Each slot whose
 * recorded va equals "va" is released and the corresponding L2 block entry
 * is cleared.  If at least one slot matched, the kernel TLB is invalidated
 * and the function returns.  Otherwise, provided the VM system is
 * initialized, the range is treated as KVA: each page is unmapped with
 * pmap_kremove() and the address range is returned with kva_free().
 */
void
pmap_unmapbios(vm_offset_t va, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t offset, tmpsize, va_trunc;
	pd_entry_t *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, block;
	bool preinit_map;

	/* Number of L2 blocks spanned by [va, va + size). */
	l2_blocks =
	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));

	/* Remove preinit mapping */
	preinit_map = false;
	/* Index of the next L2 block of this mapping to clear. */
	block = 0;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va) {
			KASSERT(ppim->size == size,
			    ("pmap_unmapbios: size mismatch"));
			ppim->va = 0;
			ppim->pa = 0;
			ppim->size = 0;
			preinit_map = true;
			offset = block * L2_SIZE;
			va_trunc = rounddown2(va, L2_SIZE) + offset;

			/* Remove L2_BLOCK */
			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
			    va_trunc));
			l2 = pmap_l1_to_l2(pde, va_trunc);
			pmap_clear(l2);

			/* Stop once every block of the mapping is cleared. */
			if (block == (l2_blocks - 1))
				break;
			block++;
		}
	}
	if (preinit_map) {
		pmap_invalidate_all(kernel_pmap);
		return;
	}

	/* Unmap the pages reserved with kva_alloc. */
	if (vm_initialized) {
		offset = va & PAGE_MASK;
		size = round_page(offset + size);
		va = trunc_page(va);

		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));

		/* Unmap and invalidate the pages */
		for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
			pmap_kremove(va + tmpsize);

		kva_free(va, size);
	}
}
6077
6078 /*
6079 * Sets the memory attribute for the specified page.
6080 */
6081 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)6082 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6083 {
6084
6085 m->md.pv_memattr = ma;
6086
6087 /*
6088 * If "m" is a normal page, update its direct mapping. This update
6089 * can be relied upon to perform any cache operations that are
6090 * required for data coherence.
6091 */
6092 if ((m->flags & PG_FICTITIOUS) == 0 &&
6093 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6094 m->md.pv_memattr) != 0)
6095 panic("memory attribute change on the direct map failed");
6096 }
6097
6098 /*
6099 * Changes the specified virtual address range's memory type to that given by
6100 * the parameter "mode". The specified virtual address range must be
6101 * completely contained within either the direct map or the kernel map. If
6102 * the virtual address range is contained within the kernel map, then the
6103 * memory type for each of the corresponding ranges of the direct map is also
6104 * changed. (The corresponding ranges of the direct map are those ranges that
6105 * map the same physical pages as the specified virtual address range.) These
6106 * changes to the direct map are necessary because Intel describes the
6107 * behavior of their processors as "undefined" if two or more mappings to the
6108 * same physical page have different memory types.
6109 *
6110 * Returns zero if the change completed successfully, and either EINVAL or
6111 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
6112 * of the virtual address range was not mapped, and ENOMEM is returned if
6113 * there was insufficient memory available to complete the change. In the
6114 * latter case, the memory type may have been changed on some part of the
6115 * virtual address range or the direct map.
6116 */
6117 int
pmap_change_attr(vm_offset_t va,vm_size_t size,int mode)6118 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6119 {
6120 int error;
6121
6122 PMAP_LOCK(kernel_pmap);
6123 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
6124 PMAP_UNLOCK(kernel_pmap);
6125 return (error);
6126 }
6127
6128 /*
6129 * Changes the specified virtual address range's protections to those
6130 * specified by "prot". Like pmap_change_attr(), protections for aliases
6131 * in the direct map are updated as well. Protections on aliasing mappings may
6132 * be a subset of the requested protections; for example, mappings in the direct
6133 * map are never executable.
6134 */
6135 int
pmap_change_prot(vm_offset_t va,vm_size_t size,vm_prot_t prot)6136 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
6137 {
6138 int error;
6139
6140 /* Only supported within the kernel map. */
6141 if (va < VM_MIN_KERNEL_ADDRESS)
6142 return (EINVAL);
6143
6144 PMAP_LOCK(kernel_pmap);
6145 error = pmap_change_props_locked(va, size, prot, -1, false);
6146 PMAP_UNLOCK(kernel_pmap);
6147 return (error);
6148 }
6149
/*
 * Worker for pmap_change_attr() and pmap_change_prot().  Rewrites the memory
 * type and/or protection bits of every kernel mapping in the page-rounded
 * range covering [va, va + size).
 *
 *   va, size       Range to change; it is expanded to whole pages.
 *   prot           New protections, or VM_PROT_NONE to leave them unchanged.
 *   mode           New memory attribute, or -1 to leave it unchanged.
 *   skip_unmapped  If true, holes in the range are stepped over; if false,
 *                  an unmapped address fails the call with EINVAL.
 *
 * The kernel pmap lock must be held by the caller.
 *
 * Returns 0 on success and EINVAL if the range starts outside both the
 * direct map and the kernel address range, if an unmapped address is found
 * while skip_unmapped is false, or if a block mapping cannot be demoted.
 * Nothing is rolled back on failure, so earlier parts of the range (and
 * their direct map aliases) may already carry the new properties.
 */
static int
pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
    int mode, bool skip_unmapped)
{
	vm_offset_t base, offset, tmpva;
	vm_size_t pte_size;
	vm_paddr_t pa;
	pt_entry_t pte, *ptep, *newpte;
	pt_entry_t bits, mask;
	int lvl, rv;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	/* Expand the request to cover whole pages. */
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = round_page(offset + size);

	if (!VIRT_IN_DMAP(base) &&
	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
		return (EINVAL);

	/*
	 * Build "mask" (the descriptor bits this call owns) and "bits" (the
	 * values those bits must take).
	 */
	bits = 0;
	mask = 0;
	if (mode != -1) {
		bits = ATTR_S1_IDX(mode);
		mask = ATTR_S1_IDX_MASK;
		/* Device memory is additionally marked execute-never. */
		if (mode == VM_MEMATTR_DEVICE) {
			mask |= ATTR_S1_XN;
			bits |= ATTR_S1_XN;
		}
	}
	if (prot != VM_PROT_NONE) {
		/* Don't mark the DMAP as executable. It never is on arm64. */
		if (VIRT_IN_DMAP(base)) {
			prot &= ~VM_PROT_EXECUTE;
			/*
			 * XXX Mark the DMAP as writable for now. We rely
			 * on this in ddb & dtrace to insert breakpoint
			 * instructions.
			 */
			prot |= VM_PROT_WRITE;
		}

		if ((prot & VM_PROT_WRITE) == 0) {
			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
		}
		if ((prot & VM_PROT_EXECUTE) == 0) {
			bits |= ATTR_S1_PXN;
		}
		/* User execution is always disallowed for these mappings. */
		bits |= ATTR_S1_UXN;
		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
	}

	/* The loop advances "tmpva" itself, by the size of each mapping. */
	for (tmpva = base; tmpva < base + size; ) {
		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
		if (ptep == NULL && !skip_unmapped) {
			return (EINVAL);
		} else if ((ptep == NULL && skip_unmapped) ||
		    (pmap_load(ptep) & mask) == bits) {
			/*
			 * We already have the correct attribute or there
			 * is no memory mapped at this address and we are
			 * skipping unmapped memory.
			 *
			 * Step to the start of the next entry at the level
			 * pmap_pte() reported.
			 * NOTE(review): a reported level outside 1..3 (for
			 * example for a hole at a higher level) panics here;
			 * confirm what pmap_pte() stores in "lvl" on failure.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
				break;
			case 2:
				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
				break;
			case 3:
				tmpva += PAGE_SIZE;
				break;
			}
		} else {
			/*
			 * Pick the entry to rewrite.  A block mapping is
			 * changed in place when "tmpva" is aligned to it and
			 * the remaining range covers all of it; otherwise it
			 * is demoted one level and the test is repeated, ending
			 * at a level 3 page in the worst case.
			 */
			switch (lvl) {
			default:
				panic("Invalid DMAP table level: %d\n", lvl);
			case 1:
				if ((tmpva & L1_OFFSET) == 0 &&
				    (base + size - tmpva) >= L1_SIZE) {
					pte_size = L1_SIZE;
					break;
				}
				newpte = pmap_demote_l1(kernel_pmap, ptep,
				    tmpva & ~L1_OFFSET);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l1_to_l2(ptep, tmpva);
				/* FALLTHROUGH */
			case 2:
				if ((tmpva & L2_OFFSET) == 0 &&
				    (base + size - tmpva) >= L2_SIZE) {
					pte_size = L2_SIZE;
					break;
				}
				newpte = pmap_demote_l2(kernel_pmap, ptep,
				    tmpva);
				if (newpte == NULL)
					return (EINVAL);
				ptep = pmap_l2_to_l3(ptep, tmpva);
				/* FALLTHROUGH */
			case 3:
				pte_size = PAGE_SIZE;
				break;
			}

			/* Update the entry */
			pte = pmap_load(ptep);
			pte &= ~mask;
			pte |= bits;

			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
			    pte_size);

			pa = pte & ~ATTR_MASK;
			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
				/*
				 * Keep the DMAP memory in sync.  Holes in the
				 * direct map are tolerated (skip_unmapped is
				 * passed as true).
				 */
				rv = pmap_change_props_locked(
				    PHYS_TO_DMAP(pa), pte_size,
				    prot, mode, true);
				if (rv != 0)
					return (rv);
			}

			/*
			 * If moving to a non-cacheable entry flush
			 * the cache.
			 */
			if (mode == VM_MEMATTR_UNCACHEABLE)
				cpu_dcache_wbinv_range(tmpva, pte_size);
			tmpva += pte_size;
		}
	}

	return (0);
}
6295
/*
 * Create an L2 table to map all addresses within an L1 mapping.
 *
 * "l1" must point at an unmanaged L1 block entry and "va" must be aligned to
 * L1_SIZE; both are asserted.  The pmap lock must be held.  A new page table
 * page is allocated and filled with Ln_ENTRIES L2 block entries that carry
 * the attributes of the old L1 entry and cover the same physical range, and
 * the L1 entry is then replaced with a table entry pointing at that page.
 *
 * Returns a pointer to the new L2 table, or NULL if either the temporary
 * KVA or the page table page could not be allocated; in the failure cases
 * the L1 entry is left as it was.
 */
static pt_entry_t *
pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
{
	pt_entry_t *l2, newl2, oldl1;
	vm_offset_t tmpl1;
	vm_paddr_t l2phys, phys;
	vm_page_t ml2;
	int i;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldl1 = pmap_load(l1);
	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
	    ("pmap_demote_l1: Demoting a non-block entry"));
	KASSERT((va & L1_OFFSET) == 0,
	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));

	/*
	 * If the L1 entry is itself addressed through the mapping being
	 * demoted, reserve a page of KVA so it can be reached through a
	 * temporary mapping while the entry is being replaced.
	 */
	tmpl1 = 0;
	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
		tmpl1 = kva_alloc(PAGE_SIZE);
		if (tmpl1 == 0)
			return (NULL);
	}

	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
	    NULL) {
		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
		    " in pmap %p", va, pmap);
		l2 = NULL;
		goto fail;
	}

	l2phys = VM_PAGE_TO_PHYS(ml2);
	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);

	/* Address the range points at */
	phys = oldl1 & ~ATTR_MASK;
	/* The attributes from the old l1 entry to be copied */
	newl2 = oldl1 & ATTR_MASK;

	/* Create the new entries */
	for (i = 0; i < Ln_ENTRIES; i++) {
		l2[i] = newl2 | phys;
		phys += L2_SIZE;
	}
	/* The first new entry must be the old entry expressed as an L2 block. */
	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
	    ("Invalid l2 page (%lx != %lx)", l2[0],
	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));

	/* Switch "l1" over to the temporary mapping of its page. */
	if (tmpl1 != 0) {
		pmap_kenter(tmpl1, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
	}

	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);

fail:
	/* Tear down the temporary mapping on both success and failure. */
	if (tmpl1 != 0) {
		pmap_kremove(tmpl1);
		kva_free(tmpl1, PAGE_SIZE);
	}

	return (l2);
}
6366
6367 static void
pmap_fill_l3(pt_entry_t * firstl3,pt_entry_t newl3)6368 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
6369 {
6370 pt_entry_t *l3;
6371
6372 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
6373 *l3 = newl3;
6374 newl3 += L3_SIZE;
6375 }
6376 }
6377
6378 static void
pmap_demote_l2_abort(pmap_t pmap,vm_offset_t va,pt_entry_t * l2,struct rwlock ** lockp)6379 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
6380 struct rwlock **lockp)
6381 {
6382 struct spglist free;
6383
6384 SLIST_INIT(&free);
6385 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
6386 lockp);
6387 vm_page_free_pages_toq(&free, true);
6388 }
6389
/*
 * Create an L3 table to map all addresses within an L2 mapping.
 *
 * "l2" must point at an L2 block entry of the stage 1 pmap "pmap", whose lock
 * must be held; "va" is truncated to an L2 boundary.  "lockp" is handed to
 * the PV helpers (reserve_pv_entries(), pmap_pv_demote_l2(),
 * pmap_remove_l2()); any lock left in "*lockp" is for the caller to release
 * (see pmap_demote_l2()).
 *
 * Returns a pointer to the L3 table on success and NULL on failure.  If the
 * mapping was never accessed, or no page table page can be obtained, the
 * 2MB mapping is removed before NULL is returned.  If only the temporary
 * KVA allocation fails, NULL is returned with the mapping left intact.
 */
static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
    struct rwlock **lockp)
{
	pt_entry_t *l3, newl3, oldl2;
	vm_offset_t tmpl2;
	vm_paddr_t l3phys;
	vm_page_t ml3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	l3 = NULL;
	oldl2 = pmap_load(l2);
	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
	    ("pmap_demote_l2: Demoting a non-block entry"));
	va &= ~L2_OFFSET;

	/*
	 * If the L2 entry is itself addressed through the mapping being
	 * demoted, reserve a page of KVA for a temporary mapping of it.
	 */
	tmpl2 = 0;
	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
		tmpl2 = kva_alloc(PAGE_SIZE);
		if (tmpl2 == 0)
			return (NULL);
	}

	/*
	 * Invalidate the 2MB page mapping and return "failure" if the
	 * mapping was never accessed.
	 */
	if ((oldl2 & ATTR_AF) == 0) {
		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
		pmap_demote_l2_abort(pmap, va, l2, lockp);
		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
		    va, pmap);
		goto fail;
	}

	/* Prefer a page table page saved for this address, if there is one. */
	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
		    ("pmap_demote_l2: page table page for a wired mapping"
		    " is missing"));

		/*
		 * If the page table page is missing and the mapping
		 * is for a kernel address, the mapping must belong to
		 * either the direct map or the early kernel memory.
		 * Page table pages are preallocated for every other
		 * part of the kernel address space, so the direct map
		 * region and early kernel memory are the only parts of the
		 * kernel address space that must be handled here.
		 */
		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
		    ("pmap_demote_l2: No saved mpte for va %#lx", va));

		/*
		 * If the 2MB page mapping belongs to the direct map
		 * region of the kernel's address space, then the page
		 * allocation request specifies the highest possible
		 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
		 * priority is normal.
		 */
		ml3 = vm_page_alloc_noobj(
		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
		    VM_ALLOC_WIRED);

		/*
		 * If the allocation of the new page table page fails,
		 * invalidate the 2MB page mapping and return "failure".
		 */
		if (ml3 == NULL) {
			pmap_demote_l2_abort(pmap, va, l2, lockp);
			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
			    " in pmap %p", va, pmap);
			goto fail;
		}
		ml3->pindex = pmap_l2_pindex(va);

		/*
		 * For user addresses only: set the new page's reference
		 * count to NL3PG and add it to the pmap's resident count.
		 */
		if (!ADDR_IS_KERNEL(va)) {
			ml3->ref_count = NL3PG;
			pmap_resident_count_inc(pmap, 1);
		}
	}
	l3phys = VM_PAGE_TO_PHYS(ml3);
	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
	/* Template L3 entry: the old L2 entry with a page descriptor type. */
	newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));

	/*
	 * If the page table page is not leftover from an earlier promotion,
	 * or the mapping attributes have changed, (re)initialize the L3 table.
	 *
	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
	 * performs a dsb(). That dsb() ensures that the stores for filling
	 * "l3" are visible before "l3" is added to the page table.
	 */
	if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK))
		pmap_fill_l3(l3, newl3);

	/*
	 * Map the temporary page so we don't lose access to the l2 table.
	 */
	if (tmpl2 != 0) {
		pmap_kenter(tmpl2, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
	}

	/*
	 * The spare PV entries must be reserved prior to demoting the
	 * mapping, that is, prior to changing the PDE. Otherwise, the state
	 * of the L2 and the PV lists will be inconsistent, which can result
	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
	 * PV entry for the 2MB page mapping that is being demoted.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);

	/*
	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
	 * the 2MB page mapping.
	 */
	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);

	/*
	 * Demote the PV entry.
	 */
	if ((oldl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);

	atomic_add_long(&pmap_l2_demotions, 1);
	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
	    " in pmap %p %lx", va, pmap, l3[0]);

fail:
	/* Shared exit: drop the temporary mapping; "l3" is NULL on failure. */
	if (tmpl2 != 0) {
		pmap_kremove(tmpl2);
		kva_free(tmpl2, PAGE_SIZE);
	}

	return (l3);

}
6543
6544 static pt_entry_t *
pmap_demote_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)6545 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
6546 {
6547 struct rwlock *lock;
6548 pt_entry_t *l3;
6549
6550 lock = NULL;
6551 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
6552 if (lock != NULL)
6553 rw_wunlock(lock);
6554 return (l3);
6555 }
6556
6557 /*
6558 * Perform the pmap work for mincore(2). If the page is not both referenced and
6559 * modified by this pmap, returns its physical address so that the caller can
6560 * find other mappings.
6561 */
6562 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)6563 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
6564 {
6565 pt_entry_t *pte, tpte;
6566 vm_paddr_t mask, pa;
6567 int lvl, val;
6568 bool managed;
6569
6570 PMAP_ASSERT_STAGE1(pmap);
6571 PMAP_LOCK(pmap);
6572 pte = pmap_pte(pmap, addr, &lvl);
6573 if (pte != NULL) {
6574 tpte = pmap_load(pte);
6575
6576 switch (lvl) {
6577 case 3:
6578 mask = L3_OFFSET;
6579 break;
6580 case 2:
6581 mask = L2_OFFSET;
6582 break;
6583 case 1:
6584 mask = L1_OFFSET;
6585 break;
6586 default:
6587 panic("pmap_mincore: invalid level %d", lvl);
6588 }
6589
6590 managed = (tpte & ATTR_SW_MANAGED) != 0;
6591 val = MINCORE_INCORE;
6592 if (lvl != 3)
6593 val |= MINCORE_PSIND(3 - lvl);
6594 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
6595 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
6596 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6597 if ((tpte & ATTR_AF) == ATTR_AF)
6598 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6599
6600 pa = (tpte & ~ATTR_MASK) | (addr & mask);
6601 } else {
6602 managed = false;
6603 val = 0;
6604 }
6605
6606 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6607 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
6608 *pap = pa;
6609 }
6610 PMAP_UNLOCK(pmap);
6611 return (val);
6612 }
6613
6614 /*
6615 * Garbage collect every ASID that is neither active on a processor nor
6616 * reserved.
6617 */
6618 static void
pmap_reset_asid_set(pmap_t pmap)6619 pmap_reset_asid_set(pmap_t pmap)
6620 {
6621 pmap_t curpmap;
6622 int asid, cpuid, epoch;
6623 struct asid_set *set;
6624 enum pmap_stage stage;
6625
6626 set = pmap->pm_asid_set;
6627 stage = pmap->pm_stage;
6628
6629 set = pmap->pm_asid_set;
6630 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6631 mtx_assert(&set->asid_set_mutex, MA_OWNED);
6632
6633 /*
6634 * Ensure that the store to asid_epoch is globally visible before the
6635 * loads from pc_curpmap are performed.
6636 */
6637 epoch = set->asid_epoch + 1;
6638 if (epoch == INT_MAX)
6639 epoch = 0;
6640 set->asid_epoch = epoch;
6641 dsb(ishst);
6642 if (stage == PM_STAGE1) {
6643 __asm __volatile("tlbi vmalle1is");
6644 } else {
6645 KASSERT(pmap_clean_stage2_tlbi != NULL,
6646 ("%s: Unset stage 2 tlb invalidation callback\n",
6647 __func__));
6648 pmap_clean_stage2_tlbi();
6649 }
6650 dsb(ish);
6651 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
6652 set->asid_set_size - 1);
6653 CPU_FOREACH(cpuid) {
6654 if (cpuid == curcpu)
6655 continue;
6656 if (stage == PM_STAGE1) {
6657 curpmap = pcpu_find(cpuid)->pc_curpmap;
6658 PMAP_ASSERT_STAGE1(pmap);
6659 } else {
6660 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
6661 if (curpmap == NULL)
6662 continue;
6663 PMAP_ASSERT_STAGE2(pmap);
6664 }
6665 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
6666 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
6667 if (asid == -1)
6668 continue;
6669 bit_set(set->asid_set, asid);
6670 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
6671 }
6672 }
6673
6674 /*
6675 * Allocate a new ASID for the specified pmap.
6676 */
6677 static void
pmap_alloc_asid(pmap_t pmap)6678 pmap_alloc_asid(pmap_t pmap)
6679 {
6680 struct asid_set *set;
6681 int new_asid;
6682
6683 set = pmap->pm_asid_set;
6684 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6685
6686 mtx_lock_spin(&set->asid_set_mutex);
6687
6688 /*
6689 * While this processor was waiting to acquire the asid set mutex,
6690 * pmap_reset_asid_set() running on another processor might have
6691 * updated this pmap's cookie to the current epoch. In which case, we
6692 * don't need to allocate a new ASID.
6693 */
6694 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
6695 goto out;
6696
6697 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
6698 &new_asid);
6699 if (new_asid == -1) {
6700 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
6701 set->asid_next, &new_asid);
6702 if (new_asid == -1) {
6703 pmap_reset_asid_set(pmap);
6704 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
6705 set->asid_set_size, &new_asid);
6706 KASSERT(new_asid != -1, ("ASID allocation failure"));
6707 }
6708 }
6709 bit_set(set->asid_set, new_asid);
6710 set->asid_next = new_asid + 1;
6711 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
6712 out:
6713 mtx_unlock_spin(&set->asid_set_mutex);
6714 }
6715
6716 /*
6717 * Compute the value that should be stored in ttbr0 to activate the specified
6718 * pmap. This value may change from time to time.
6719 */
6720 uint64_t
pmap_to_ttbr0(pmap_t pmap)6721 pmap_to_ttbr0(pmap_t pmap)
6722 {
6723
6724 return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) |
6725 pmap->pm_ttbr);
6726 }
6727
/*
 * Make "pmap" the active pmap of its stage on the current CPU.
 *
 * Returns false, after a barrier, if "pmap" is already the CPU's current
 * pmap for its stage.  Otherwise records "pmap" in the per-CPU curpmap
 * (stage 1) or curvmpmap (stage 2), allocates a fresh ASID if the pmap's
 * cookie has a non-negative epoch that differs from the set's current
 * epoch, loads ttbr0 for stage 1 pmaps, and returns true.
 *
 * The kernel pmap must never be passed in.
 */
static bool
pmap_activate_int(pmap_t pmap)
{
	struct asid_set *set;
	int epoch;

	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));

	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
		/*
		 * Handle the possibility that the old thread was preempted
		 * after an "ic" or "tlbi" instruction but before it performed
		 * a "dsb" instruction. If the old thread migrates to a new
		 * processor, its completion of a "dsb" instruction on that
		 * new processor does not guarantee that the "ic" or "tlbi"
		 * instructions performed on the old processor have completed.
		 */
		dsb(ish);
		return (false);
	}

	set = pmap->pm_asid_set;
	KASSERT(set != NULL, ("%s: NULL asid set", __func__));

	/*
	 * Ensure that the store to curpmap is globally visible before the
	 * load from asid_epoch is performed.
	 */
	if (pmap->pm_stage == PM_STAGE1)
		PCPU_SET(curpmap, pmap);
	else
		PCPU_SET(curvmpmap, pmap);
	dsb(ish);
	/* A negative epoch is never treated as stale. */
	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
	if (epoch >= 0 && epoch != set->asid_epoch)
		pmap_alloc_asid(pmap);

	/* Only stage 1 pmaps are installed in ttbr0 here. */
	if (pmap->pm_stage == PM_STAGE1) {
		set_ttbr0(pmap_to_ttbr0(pmap));
		if (PCPU_GET(bcast_tlbi_workaround) != 0)
			invalidate_local_icache();
	}
	return (true);
}
6774
6775 void
pmap_activate_vm(pmap_t pmap)6776 pmap_activate_vm(pmap_t pmap)
6777 {
6778
6779 PMAP_ASSERT_STAGE2(pmap);
6780
6781 (void)pmap_activate_int(pmap);
6782 }
6783
6784 void
pmap_activate(struct thread * td)6785 pmap_activate(struct thread *td)
6786 {
6787 pmap_t pmap;
6788
6789 pmap = vmspace_pmap(td->td_proc->p_vmspace);
6790 PMAP_ASSERT_STAGE1(pmap);
6791 critical_enter();
6792 (void)pmap_activate_int(pmap);
6793 critical_exit();
6794 }
6795
/*
 * Switch the current CPU's per-CPU state over to thread "new": record it as
 * curthread, install its pcb as curpcb, and activate the pmap of its address
 * space.  Returns the new thread's pcb.
 *
 * To eliminate the unused parameter "old", we would have to add an instruction
 * to cpu_switch().
 */
struct pcb *
pmap_switch(struct thread *old __unused, struct thread *new)
{
	pcpu_bp_harden bp_harden;
	struct pcb *pcb;

	/* Store the new curthread */
	PCPU_SET(curthread, new);
#if defined(PERTHREAD_SSP)
	/* Set the new threads SSP canary */
	__asm("msr sp_el0, %0" :: "r"(&new->td_md.md_canary));
#endif

	/* And the new pcb */
	pcb = new->td_pcb;
	PCPU_SET(curpcb, pcb);

	/*
	 * TODO: We may need to flush the cache here if switching
	 * to a user process.
	 */

	/* pmap_activate_int() returns true only if the pmap changed. */
	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
		/*
		 * Stop userspace from training the branch predictor against
		 * other processes. This will call into a CPU specific
		 * function that clears the branch predictor state.
		 */
		bp_harden = PCPU_GET(bp_harden);
		if (bp_harden != NULL)
			bp_harden();
	}

	return (pcb);
}
6835
6836 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)6837 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
6838 {
6839
6840 PMAP_ASSERT_STAGE1(pmap);
6841 KASSERT(ADDR_IS_CANONICAL(va),
6842 ("%s: Address not in canonical form: %lx", __func__, va));
6843
6844 if (ADDR_IS_KERNEL(va)) {
6845 cpu_icache_sync_range(va, sz);
6846 } else {
6847 u_int len, offset;
6848 vm_paddr_t pa;
6849
6850 /* Find the length of data in this page to flush */
6851 offset = va & PAGE_MASK;
6852 len = imin(PAGE_SIZE - offset, sz);
6853
6854 while (sz != 0) {
6855 /* Extract the physical address & find it in the DMAP */
6856 pa = pmap_extract(pmap, va);
6857 if (pa != 0)
6858 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
6859
6860 /* Move to the next page */
6861 sz -= len;
6862 va += len;
6863 /* Set the length for the next iteration */
6864 len = imin(PAGE_SIZE, sz);
6865 }
6866 }
6867 }
6868
6869 static int
pmap_stage2_fault(pmap_t pmap,uint64_t esr,uint64_t far)6870 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
6871 {
6872 pd_entry_t *pdep;
6873 pt_entry_t *ptep, pte;
6874 int rv, lvl, dfsc;
6875
6876 PMAP_ASSERT_STAGE2(pmap);
6877 rv = KERN_FAILURE;
6878
6879 /* Data and insn aborts use same encoding for FSC field. */
6880 dfsc = esr & ISS_DATA_DFSC_MASK;
6881 switch (dfsc) {
6882 case ISS_DATA_DFSC_TF_L0:
6883 case ISS_DATA_DFSC_TF_L1:
6884 case ISS_DATA_DFSC_TF_L2:
6885 case ISS_DATA_DFSC_TF_L3:
6886 PMAP_LOCK(pmap);
6887 pdep = pmap_pde(pmap, far, &lvl);
6888 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
6889 PMAP_LOCK(pmap);
6890 break;
6891 }
6892
6893 switch (lvl) {
6894 case 0:
6895 ptep = pmap_l0_to_l1(pdep, far);
6896 break;
6897 case 1:
6898 ptep = pmap_l1_to_l2(pdep, far);
6899 break;
6900 case 2:
6901 ptep = pmap_l2_to_l3(pdep, far);
6902 break;
6903 default:
6904 panic("%s: Invalid pde level %d", __func__,lvl);
6905 }
6906 goto fault_exec;
6907
6908 case ISS_DATA_DFSC_AFF_L1:
6909 case ISS_DATA_DFSC_AFF_L2:
6910 case ISS_DATA_DFSC_AFF_L3:
6911 PMAP_LOCK(pmap);
6912 ptep = pmap_pte(pmap, far, &lvl);
6913 fault_exec:
6914 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
6915 if (icache_vmid) {
6916 pmap_invalidate_vpipt_icache();
6917 } else {
6918 /*
6919 * If accessing an executable page invalidate
6920 * the I-cache so it will be valid when we
6921 * continue execution in the guest. The D-cache
6922 * is assumed to already be clean to the Point
6923 * of Coherency.
6924 */
6925 if ((pte & ATTR_S2_XN_MASK) !=
6926 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
6927 invalidate_icache();
6928 }
6929 }
6930 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
6931 rv = KERN_SUCCESS;
6932 }
6933 PMAP_UNLOCK(pmap);
6934 break;
6935 }
6936
6937 return (rv);
6938 }
6939
/*
 * Try to resolve a fault on "pmap" purely at the pmap level.
 *
 *   esr  Exception syndrome register value for the fault.
 *   far  Faulting address.
 *
 * Only instruction and data aborts are considered.  Faults on a stage 2
 * pmap are passed to pmap_stage2_fault().  For stage 1:
 *   - access flag faults set ATTR_AF on the entry;
 *   - permission faults caused by a data write to an entry that has
 *     ATTR_SW_DBM set clear the entry's ATTR_S1_AP_RW_BIT (if it was
 *     read-only) and invalidate the page's TLB entry;
 *   - translation faults are re-checked, since they can be transient.
 *
 * Returns KERN_SUCCESS if the access can simply be retried, and KERN_FAILURE
 * if the fault has to be handled elsewhere.
 */
int
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
{
	pt_entry_t pte, *ptep;
	register_t intr;
	uint64_t ec, par;
	int lvl, rv;

	rv = KERN_FAILURE;

	/* Ignore every exception class other than insn/data aborts. */
	ec = ESR_ELx_EXCEPTION(esr);
	switch (ec) {
	case EXCP_INSN_ABORT_L:
	case EXCP_INSN_ABORT:
	case EXCP_DATA_ABORT_L:
	case EXCP_DATA_ABORT:
		break;
	default:
		return (rv);
	}

	if (pmap->pm_stage == PM_STAGE2)
		return (pmap_stage2_fault(pmap, esr, far));

	/* Data and insn aborts use same encoding for FSC field. */
	switch (esr & ISS_DATA_DFSC_MASK) {
	case ISS_DATA_DFSC_AFF_L1:
	case ISS_DATA_DFSC_AFF_L2:
	case ISS_DATA_DFSC_AFF_L3:
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL) {
			pmap_set_bits(ptep, ATTR_AF);
			rv = KERN_SUCCESS;
			/*
			 * XXXMJ as an optimization we could mark the entry
			 * dirty if this is a write fault.
			 */
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_PF_L1:
	case ISS_DATA_DFSC_PF_L2:
	case ISS_DATA_DFSC_PF_L3:
		/* Only data aborts caused by a write are handled here. */
		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
		    (esr & ISS_DATA_WnR) == 0)
			return (rv);
		PMAP_LOCK(pmap);
		ptep = pmap_pte(pmap, far, &lvl);
		if (ptep != NULL &&
		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
			/*
			 * The entry has ATTR_SW_DBM set; if it is currently
			 * read-only, clear the read-only bit and drop the
			 * stale TLB entry.
			 */
			if ((pte & ATTR_S1_AP_RW_BIT) ==
			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
				pmap_invalidate_page(pmap, far);
			}
			rv = KERN_SUCCESS;
		}
		PMAP_UNLOCK(pmap);
		break;
	case ISS_DATA_DFSC_TF_L0:
	case ISS_DATA_DFSC_TF_L1:
	case ISS_DATA_DFSC_TF_L2:
	case ISS_DATA_DFSC_TF_L3:
		/*
		 * Retry the translation. A break-before-make sequence can
		 * produce a transient fault.
		 */
		if (pmap == kernel_pmap) {
			/*
			 * The translation fault may have occurred within a
			 * critical section. Therefore, we must check the
			 * address without acquiring the kernel pmap's lock.
			 */
			if (pmap_klookup(far, NULL))
				rv = KERN_SUCCESS;
		} else {
			PMAP_LOCK(pmap);
			/* Ask the MMU to check the address. */
			intr = intr_disable();
			par = arm64_address_translate_s1e0r(far);
			intr_restore(intr);
			PMAP_UNLOCK(pmap);

			/*
			 * If the translation was successful, then we can
			 * return success to the trap handler.
			 */
			if (PAR_SUCCESS(par))
				rv = KERN_SUCCESS;
		}
		break;
	}

	return (rv);
}
7036
7037 /*
7038 * Increase the starting virtual address of the given mapping if a
7039 * different alignment might result in more superpage mappings.
7040 */
7041 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)7042 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7043 vm_offset_t *addr, vm_size_t size)
7044 {
7045 vm_offset_t superpage_offset;
7046
7047 if (size < L2_SIZE)
7048 return;
7049 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7050 offset += ptoa(object->pg_color);
7051 superpage_offset = offset & L2_OFFSET;
7052 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
7053 (*addr & L2_OFFSET) == superpage_offset)
7054 return;
7055 if ((*addr & L2_OFFSET) < superpage_offset)
7056 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
7057 else
7058 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
7059 }
7060
7061 /**
7062 * Get the kernel virtual address of a set of physical pages. If there are
7063 * physical addresses not covered by the DMAP perform a transient mapping
7064 * that will be removed when calling pmap_unmap_io_transient.
7065 *
7066 * \param page The pages the caller wishes to obtain the virtual
7067 * address on the kernel memory map.
7068 * \param vaddr On return contains the kernel virtual memory address
7069 * of the pages passed in the page parameter.
7070 * \param count Number of pages passed in.
7071 * \param can_fault TRUE if the thread using the mapped pages can take
7072 * page faults, FALSE otherwise.
7073 *
7074 * \returns TRUE if the caller must call pmap_unmap_io_transient when
7075 * finished or FALSE otherwise.
7076 *
7077 */
7078 boolean_t
pmap_map_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,boolean_t can_fault)7079 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7080 boolean_t can_fault)
7081 {
7082 vm_paddr_t paddr;
7083 boolean_t needs_mapping;
7084 int error, i;
7085
7086 /*
7087 * Allocate any KVA space that we need, this is done in a separate
7088 * loop to prevent calling vmem_alloc while pinned.
7089 */
7090 needs_mapping = FALSE;
7091 for (i = 0; i < count; i++) {
7092 paddr = VM_PAGE_TO_PHYS(page[i]);
7093 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
7094 error = vmem_alloc(kernel_arena, PAGE_SIZE,
7095 M_BESTFIT | M_WAITOK, &vaddr[i]);
7096 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7097 needs_mapping = TRUE;
7098 } else {
7099 vaddr[i] = PHYS_TO_DMAP(paddr);
7100 }
7101 }
7102
7103 /* Exit early if everything is covered by the DMAP */
7104 if (!needs_mapping)
7105 return (FALSE);
7106
7107 if (!can_fault)
7108 sched_pin();
7109 for (i = 0; i < count; i++) {
7110 paddr = VM_PAGE_TO_PHYS(page[i]);
7111 if (!PHYS_IN_DMAP(paddr)) {
7112 panic(
7113 "pmap_map_io_transient: TODO: Map out of DMAP data");
7114 }
7115 }
7116
7117 return (needs_mapping);
7118 }
7119
7120 void
pmap_unmap_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,boolean_t can_fault)7121 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7122 boolean_t can_fault)
7123 {
7124 vm_paddr_t paddr;
7125 int i;
7126
7127 if (!can_fault)
7128 sched_unpin();
7129 for (i = 0; i < count; i++) {
7130 paddr = VM_PAGE_TO_PHYS(page[i]);
7131 if (!PHYS_IN_DMAP(paddr)) {
7132 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
7133 }
7134 }
7135 }
7136
7137 boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)7138 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
7139 {
7140
7141 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
7142 }
7143
/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 *
 * sysctl_kmaps_reinit() zeroes the structure and sets "sva" and "attrs";
 * sysctl_kmaps_dump() prints one line per range and then sets "sva" to the
 * all-ones sentinel.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;	/* start virtual address of the range */
	pt_entry_t attrs;	/* attribute bits shared by the range */
	/*
	 * Mapping counters printed by sysctl_kmaps_dump().  Presumably each
	 * counts the mappings of the named kind within the range; they are
	 * maintained by code outside this excerpt -- TODO confirm.
	 */
	int l3pages;
	int l3contig;
	int l2blocks;
	int l1blocks;
};
7156
7157 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)7158 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
7159 vm_offset_t eva)
7160 {
7161 const char *mode;
7162 int index;
7163
7164 if (eva <= range->sva)
7165 return;
7166
7167 index = range->attrs & ATTR_S1_IDX_MASK;
7168 switch (index) {
7169 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
7170 mode = "DEV";
7171 break;
7172 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
7173 mode = "UC";
7174 break;
7175 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
7176 mode = "WB";
7177 break;
7178 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
7179 mode = "WT";
7180 break;
7181 default:
7182 printf(
7183 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
7184 __func__, index, range->sva, eva);
7185 mode = "??";
7186 break;
7187 }
7188
7189 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %3s %d %d %d %d\n",
7190 range->sva, eva,
7191 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
7192 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
7193 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
7194 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
7195 mode, range->l1blocks, range->l2blocks, range->l3contig,
7196 range->l3pages);
7197
7198 /* Reset to sentinel value. */
7199 range->sva = 0xfffffffffffffffful;
7200 }
7201
7202 /*
7203 * Determine whether the attributes specified by a page table entry match those
7204 * being tracked by the current range.
7205 */
7206 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)7207 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
7208 {
7209
7210 return (range->attrs == attrs);
7211 }
7212
7213 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)7214 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
7215 pt_entry_t attrs)
7216 {
7217
7218 memset(range, 0, sizeof(*range));
7219 range->sva = va;
7220 range->attrs = attrs;
7221 }
7222
7223 /* Get the block/page attributes that correspond to the table attributes */
7224 static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)7225 sysctl_kmaps_table_attrs(pd_entry_t table)
7226 {
7227 pt_entry_t attrs;
7228
7229 attrs = 0;
7230 if ((table & TATTR_UXN_TABLE) != 0)
7231 attrs |= ATTR_S1_UXN;
7232 if ((table & TATTR_PXN_TABLE) != 0)
7233 attrs |= ATTR_S1_PXN;
7234 if ((table & TATTR_AP_TABLE_RO) != 0)
7235 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
7236
7237 return (attrs);
7238 }
7239
7240 /* Read the block/page attributes we care about */
7241 static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)7242 sysctl_kmaps_block_attrs(pt_entry_t block)
7243 {
7244 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK));
7245 }
7246
7247 /*
7248 * Given a leaf PTE, derive the mapping's attributes. If they do not match
7249 * those of the current run, dump the address range and its attributes, and
7250 * begin a new run.
7251 */
7252 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l0e,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)7253 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
7254 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
7255 pt_entry_t l3e)
7256 {
7257 pt_entry_t attrs;
7258
7259 attrs = sysctl_kmaps_table_attrs(l0e);
7260
7261 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7262 attrs |= sysctl_kmaps_block_attrs(l1e);
7263 goto done;
7264 }
7265 attrs |= sysctl_kmaps_table_attrs(l1e);
7266
7267 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7268 attrs |= sysctl_kmaps_block_attrs(l2e);
7269 goto done;
7270 }
7271 attrs |= sysctl_kmaps_table_attrs(l2e);
7272 attrs |= sysctl_kmaps_block_attrs(l3e);
7273
7274 done:
7275 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
7276 sysctl_kmaps_dump(sb, range, va);
7277 sysctl_kmaps_reinit(range, va, attrs);
7278 }
7279 }
7280
7281 static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)7282 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
7283 {
7284 struct pmap_kernel_map_range range;
7285 struct sbuf sbuf, *sb;
7286 pd_entry_t l0e, *l1, l1e, *l2, l2e;
7287 pt_entry_t *l3, l3e;
7288 vm_offset_t sva;
7289 vm_paddr_t pa;
7290 int error, i, j, k, l;
7291
7292 error = sysctl_wire_old_buffer(req, 0);
7293 if (error != 0)
7294 return (error);
7295 sb = &sbuf;
7296 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
7297
7298 /* Sentinel value. */
7299 range.sva = 0xfffffffffffffffful;
7300
7301 /*
7302 * Iterate over the kernel page tables without holding the kernel pmap
7303 * lock. Kernel page table pages are never freed, so at worst we will
7304 * observe inconsistencies in the output.
7305 */
7306 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
7307 i++) {
7308 if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
7309 sbuf_printf(sb, "\nDirect map:\n");
7310 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
7311 sbuf_printf(sb, "\nKernel map:\n");
7312
7313 l0e = kernel_pmap->pm_l0[i];
7314 if ((l0e & ATTR_DESCR_VALID) == 0) {
7315 sysctl_kmaps_dump(sb, &range, sva);
7316 sva += L0_SIZE;
7317 continue;
7318 }
7319 pa = l0e & ~ATTR_MASK;
7320 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
7321
7322 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
7323 l1e = l1[j];
7324 if ((l1e & ATTR_DESCR_VALID) == 0) {
7325 sysctl_kmaps_dump(sb, &range, sva);
7326 sva += L1_SIZE;
7327 continue;
7328 }
7329 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
7330 sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
7331 0, 0);
7332 range.l1blocks++;
7333 sva += L1_SIZE;
7334 continue;
7335 }
7336 pa = l1e & ~ATTR_MASK;
7337 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
7338
7339 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
7340 l2e = l2[k];
7341 if ((l2e & ATTR_DESCR_VALID) == 0) {
7342 sysctl_kmaps_dump(sb, &range, sva);
7343 sva += L2_SIZE;
7344 continue;
7345 }
7346 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
7347 sysctl_kmaps_check(sb, &range, sva,
7348 l0e, l1e, l2e, 0);
7349 range.l2blocks++;
7350 sva += L2_SIZE;
7351 continue;
7352 }
7353 pa = l2e & ~ATTR_MASK;
7354 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
7355
7356 for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
7357 l++, sva += L3_SIZE) {
7358 l3e = l3[l];
7359 if ((l3e & ATTR_DESCR_VALID) == 0) {
7360 sysctl_kmaps_dump(sb, &range,
7361 sva);
7362 continue;
7363 }
7364 sysctl_kmaps_check(sb, &range, sva,
7365 l0e, l1e, l2e, l3e);
7366 if ((l3e & ATTR_CONTIGUOUS) != 0)
7367 range.l3contig += l % 16 == 0 ?
7368 1 : 0;
7369 else
7370 range.l3pages++;
7371 }
7372 }
7373 }
7374 }
7375
7376 error = sbuf_finish(sb);
7377 sbuf_delete(sb);
7378 return (error);
7379 }
7380 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
7381 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
7382 NULL, 0, sysctl_kmaps, "A",
7383 "Dump kernel address layout");
7384