1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <[email protected]>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
53 */
54 /*-
55 * Copyright (c) 2003 Networks Associates Technology, Inc.
56 * All rights reserved.
57 *
58 * This software was developed for the FreeBSD Project by Jake Burkholder,
59 * Safeport Network Services, and Network Associates Laboratories, the
60 * Security Research Division of Network Associates, Inc. under
61 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62 * CHATS research program.
63 *
64 * Redistribution and use in source and binary forms, with or without
65 * modification, are permitted provided that the following conditions
66 * are met:
67 * 1. Redistributions of source code must retain the above copyright
68 * notice, this list of conditions and the following disclaimer.
69 * 2. Redistributions in binary form must reproduce the above copyright
70 * notice, this list of conditions and the following disclaimer in the
71 * documentation and/or other materials provided with the distribution.
72 *
73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 */
85
86 #include <sys/cdefs.h>
87 /*
88 * Manages physical address maps.
89 *
90 * Since the information managed by this module is
91 * also stored by the logical address mapping module,
92 * this module may throw away valid virtual-to-physical
93 * mappings at almost any time. However, invalidations
94 * of virtual-to-physical mappings must be done as
95 * requested.
96 *
97 * In order to cope with hardware architectures which
98 * make virtual-to-physical map invalidates expensive,
99 * this module may delay invalidate or reduced protection
100 * operations until such time as they are actually
101 * necessary. This module is given full information as
102 * to which processors are currently using which maps,
103 * and to when physical maps must be made correct.
104 */
105
106 #include "opt_vm.h"
107
108 #include <sys/param.h>
109 #include <sys/asan.h>
110 #include <sys/bitstring.h>
111 #include <sys/bus.h>
112 #include <sys/systm.h>
113 #include <sys/kernel.h>
114 #include <sys/ktr.h>
115 #include <sys/limits.h>
116 #include <sys/lock.h>
117 #include <sys/malloc.h>
118 #include <sys/mman.h>
119 #include <sys/msgbuf.h>
120 #include <sys/mutex.h>
121 #include <sys/physmem.h>
122 #include <sys/proc.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147
148 #include <machine/asan.h>
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152
/*
 * Number of memory domains for which per-domain pmap state is kept (it
 * sizes the pv_chunks[] array): MAXMEMDOM on NUMA kernels, otherwise one.
 */
#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif
158
159 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1)
160 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2)
161
/* Number of entries that fit in one page-sized table at each level. */
#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

/*
 * Cumulative entry counts: NUL0E is L0_ENTRIES, and each following value
 * multiplies the previous one by the per-page entry count of the next level.
 */
#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)
170
171 #ifdef PV_STATS
172 #define PV_STAT(x) do { x ; } while (0)
173 #define __pvused
174 #else
175 #define PV_STAT(x) do { } while (0)
176 #define __pvused __unused
177 #endif
178
/*
 * Convert an address into an index ("pindex") at each level.  The L2 form is
 * simply (v) >> L2_SHIFT; the L1 and L0 forms shift by their own level's
 * shift and then add NUL2E and NUL2E + NUL1E respectively.
 * NOTE(review): the added offsets presumably keep the three index ranges
 * disjoint so that a single pindex namespace can identify page table pages
 * of any level -- confirm against the users of these macros.
 */
#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
182
183 #define PMAP_SAN_PTE_BITS (ATTR_DEFAULT | ATTR_S1_XN | \
184 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
185
/*
 * PV bookkeeping for one L2-sized region of physical memory: entries are
 * located by the L2 page index of a physical address relative to the start
 * of its vm_phys segment (see _pa_to_pmdp()).  Each entry pairs a PV list
 * lock with the md_page that the pa_to_pvh()/page_to_pvh() macros return.
 */
struct pmap_large_md_page {
	struct rwlock   pv_lock;	/* lock returned by the *_TO_PV_LIST_LOCK helpers */
	struct md_page  pv_page;	/* md_page handed out by pa_to_pvh()/page_to_pvh() */
	/* Pad to a power of 2, see pmap_init_pv_table(). */
	int pv_pad[2];
};
192
193 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
194 #define pv_dummy pv_dummy_large.pv_page
195 __read_mostly static struct pmap_large_md_page *pv_table;
196
197 static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)198 _pa_to_pmdp(vm_paddr_t pa)
199 {
200 struct vm_phys_seg *seg;
201
202 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
203 return ((struct pmap_large_md_page *)seg->md_first +
204 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
205 return (NULL);
206 }
207
208 static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)209 pa_to_pmdp(vm_paddr_t pa)
210 {
211 struct pmap_large_md_page *pvd;
212
213 pvd = _pa_to_pmdp(pa);
214 if (pvd == NULL)
215 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
216 return (pvd);
217 }
218
219 static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)220 page_to_pmdp(vm_page_t m)
221 {
222 struct vm_phys_seg *seg;
223
224 seg = &vm_phys_segs[m->segind];
225 return ((struct pmap_large_md_page *)seg->md_first +
226 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
227 }
228
/*
 * Yield a pointer to the md_page (the pv_page member) of the large-page
 * entry for a physical address or a vm_page.  pa_to_pvh() goes through
 * pa_to_pmdp() and therefore panics if "pa" is outside every segment.
 */
#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))
231
/*
 * Evaluate to the PV list lock for physical address "pa".  Addresses that
 * do not fall inside any vm_phys segment share the lock embedded in
 * pv_dummy_large.
 */
#define	PHYS_TO_PV_LIST_LOCK(pa)	({				\
	struct pmap_large_md_page *_pvd = _pa_to_pmdp(pa);		\
									\
	__predict_false(_pvd == NULL) ?					\
	    &pv_dummy_large.pv_lock : &(_pvd->pv_lock);			\
})
242
243 static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)244 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
245 {
246 if ((m->flags & PG_FICTITIOUS) == 0)
247 return (&page_to_pmdp(m)->pv_lock);
248 else
249 return (&pv_dummy_large.pv_lock);
250 }
251
/*
 * Switch the PV list lock tracked through "lockp" to "new_lock".
 *
 * If *lockp already equals new_lock nothing happens.  Otherwise any lock
 * currently held (*lockp != NULL) is write-unlocked first, *lockp is
 * updated, and the new lock is then write-locked.  Both arguments are
 * evaluated exactly once.
 */
#define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock = (new_lock);		\
							\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

/* As above, with the target lock derived from a physical address. */
#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))

/* As above, with the target lock derived from a vm_page. */
#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
269
/*
 * Drop the PV list lock tracked through "lockp", if one is held: the lock
 * is write-unlocked and *lockp is reset to NULL.  A NULL *lockp is left
 * untouched, so the macro may be invoked unconditionally.
 */
#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)
278
279 /*
280 * The presence of this flag indicates that the mapping is writeable.
281 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
282 * it is dirty. This flag may only be set on managed mappings.
283 *
284 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
285 * as a software managed bit.
286 */
287 #define ATTR_SW_DBM ATTR_DBM
288
289 struct pmap kernel_pmap_store;
290
291 /* Used for mapping ACPI memory before VM is initialized */
292 #define PMAP_PREINIT_MAPPING_COUNT 32
293 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
294 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */
295 static int vm_initialized = 0; /* No need to use pre-init maps when set */
296
297 /*
298 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
299 * Always map entire L2 block for simplicity.
300 * VA of L2 block = preinit_map_va + i * L2_SIZE
301 */
302 static struct pmap_preinit_mapping {
303 vm_paddr_t pa;
304 vm_offset_t va;
305 vm_size_t size;
306 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
307
308 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
309 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
310 vm_offset_t kernel_vm_end = 0;
311
312 /*
313 * Data for the pv entry allocation mechanism.
314 */
315 #ifdef NUMA
316 static __inline int
pc_to_domain(struct pv_chunk * pc)317 pc_to_domain(struct pv_chunk *pc)
318 {
319 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
320 }
321 #else
322 static __inline int
pc_to_domain(struct pv_chunk * pc __unused)323 pc_to_domain(struct pv_chunk *pc __unused)
324 {
325 return (0);
326 }
327 #endif
328
/*
 * A list of pv chunks together with its mutex.  One instance exists per
 * memory domain (see the pv_chunks[] array), and the structure is aligned
 * to CACHE_LINE_SIZE.
 */
struct pv_chunks_list {
	struct mtx pvc_lock;			/* NOTE(review): presumably protects pvc_list -- confirm */
	TAILQ_HEAD(pch, pv_chunk) pvc_list;	/* the chunks themselves */
	int active_reclaims;			/* NOTE(review): presumably counts in-progress reclaims -- confirm */
} __aligned(CACHE_LINE_SIZE);
334
335 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
336
337 vm_paddr_t dmap_phys_base; /* The start of the dmap region */
338 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
339 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
340
341 extern pt_entry_t pagetable_l0_ttbr1[];
342
343 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
344 static vm_paddr_t physmap[PHYSMAP_SIZE];
345 static u_int physmap_idx;
346
347 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
348 "VM/pmap parameters");
349
350 #if PAGE_SIZE == PAGE_SIZE_4K
351 #define L1_BLOCKS_SUPPORTED 1
352 #else
353 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
354 #define L1_BLOCKS_SUPPORTED 0
355 #endif
356
357 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED)
358
359 /*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to optimize
 * its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
365 *
366 * The current epoch number is always in the range [0, INT_MAX). Negative
367 * numbers and INT_MAX are reserved for special cases that are described
368 * below.
369 */
/*
 * State of one identifier allocator.  Two instances exist in this file:
 * "asids" and "vmids".
 */
struct asid_set {
	int asid_bits;			/* number of bits in an identifier */
	bitstr_t *asid_set;		/* bit vector of allocated identifiers */
	int asid_set_size;		/* NOTE(review): presumably the bit count of asid_set -- confirm */
	int asid_next;			/* search cursor: last allocated identifier plus one */
	int asid_epoch;			/* current epoch number */
	struct mtx asid_set_mutex;	/* NOTE(review): presumably serializes allocator updates -- confirm */
};
378
379 static struct asid_set asids;
380 static struct asid_set vmids;
381
382 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
383 "ASID allocator");
384 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
385 "The number of bits in an ASID");
386 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
387 "The last allocated ASID plus one");
388 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
389 "The current epoch number");
390
391 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
392 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
393 "The number of bits in an VMID");
394 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
395 "The last allocated VMID plus one");
396 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
397 "The current epoch number");
398
399 void (*pmap_clean_stage2_tlbi)(void);
400 void (*pmap_invalidate_vpipt_icache)(void);
401 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
402 void (*pmap_stage2_invalidate_all)(uint64_t);
403
404 /*
405 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved
406 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for
407 * dynamically allocated ASIDs have a non-negative epoch number.
408 *
409 * An invalid ASID is represented by -1.
410 *
411 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
412 * which indicates that an ASID should never be allocated to the pmap, and
413 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
414 * allocated when the pmap is next activated.
415 */
/*
 * Pack/unpack helpers for the cookie: the ASID occupies the low 32 bits
 * and the epoch is shifted into the bits above them.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
/* Low half of the cookie, reinterpreted as a signed int. */
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
/* Bits 32 and up of the cookie, reinterpreted as a signed int. */
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
420
/*
 * TLBI_VA() converts an address into the form used as a TLB invalidation
 * operand here: the address is shifted right by TLBI_VA_SHIFT (12) and
 * only the low 44 bits of the result are kept.  TLBI_VA_L3_INCR is L3_SIZE
 * expressed in those shifted units.
 */
#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
#define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)
425
426 static int __read_frequently superpages_enabled = 1;
427 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
428 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
429 "Are large page mappings enabled?");
430
431 /*
432 * Internal flags for pmap_enter()'s helper functions.
433 */
434 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
435 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
436
437 TAILQ_HEAD(pv_chunklist, pv_chunk);
438
439 static void free_pv_chunk(struct pv_chunk *pc);
440 static void free_pv_chunk_batch(struct pv_chunklist *batch);
441 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
442 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
443 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
444 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
445 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
446 vm_offset_t va);
447
448 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
449 static bool pmap_activate_int(pmap_t pmap);
450 static void pmap_alloc_asid(pmap_t pmap);
451 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
452 vm_prot_t prot, int mode, bool skip_unmapped);
453 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
454 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
455 vm_offset_t va, struct rwlock **lockp);
456 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
457 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
458 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
459 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
460 u_int flags, vm_page_t m, struct rwlock **lockp);
461 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
462 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
463 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
464 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
465 static void pmap_reset_asid_set(pmap_t pmap);
466 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
467 vm_page_t m, struct rwlock **lockp);
468
469 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
470 struct rwlock **lockp);
471
472 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
473 struct spglist *free);
474 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
475 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
476
/*
 * Accessors for page table entries.
 *
 * The store and read-modify-write forms go through the 64-bit atomic
 * primitives because the System MMU may write to the table at the same
 * time as the CPU.  pmap_load() is a plain dereference of the entry; its
 * argument is parenthesized so that an expression such as "l3 + 1" reads
 * the entry the expression points at.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*(table))
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
489
490 /********************/
491 /* Inline functions */
492 /********************/
493
494 static __inline void
pagecopy(void * s,void * d)495 pagecopy(void *s, void *d)
496 {
497
498 memcpy(d, s, PAGE_SIZE);
499 }
500
501 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)502 pmap_l0(pmap_t pmap, vm_offset_t va)
503 {
504
505 return (&pmap->pm_l0[pmap_l0_index(va)]);
506 }
507
508 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)509 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
510 {
511 pd_entry_t *l1;
512
513 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
514 return (&l1[pmap_l1_index(va)]);
515 }
516
517 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)518 pmap_l1(pmap_t pmap, vm_offset_t va)
519 {
520 pd_entry_t *l0;
521
522 l0 = pmap_l0(pmap, va);
523 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
524 return (NULL);
525
526 return (pmap_l0_to_l1(l0, va));
527 }
528
529 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)530 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
531 {
532 pd_entry_t l1, *l2p;
533
534 l1 = pmap_load(l1p);
535
536 KASSERT(ADDR_IS_CANONICAL(va),
537 ("%s: Address not in canonical form: %lx", __func__, va));
538 /*
539 * The valid bit may be clear if pmap_update_entry() is concurrently
540 * modifying the entry, so for KVA only the entry type may be checked.
541 */
542 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
543 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
544 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
545 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
546 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
547 return (&l2p[pmap_l2_index(va)]);
548 }
549
550 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)551 pmap_l2(pmap_t pmap, vm_offset_t va)
552 {
553 pd_entry_t *l1;
554
555 l1 = pmap_l1(pmap, va);
556 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
557 return (NULL);
558
559 return (pmap_l1_to_l2(l1, va));
560 }
561
562 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)563 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
564 {
565 pd_entry_t l2;
566 pt_entry_t *l3p;
567
568 l2 = pmap_load(l2p);
569
570 KASSERT(ADDR_IS_CANONICAL(va),
571 ("%s: Address not in canonical form: %lx", __func__, va));
572 /*
573 * The valid bit may be clear if pmap_update_entry() is concurrently
574 * modifying the entry, so for KVA only the entry type may be checked.
575 */
576 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
577 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
578 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
579 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
580 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
581 return (&l3p[pmap_l3_index(va)]);
582 }
583
584 /*
585 * Returns the lowest valid pde for a given virtual address.
586 * The next level may or may not point to a valid page or block.
587 */
588 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)589 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
590 {
591 pd_entry_t *l0, *l1, *l2, desc;
592
593 l0 = pmap_l0(pmap, va);
594 desc = pmap_load(l0) & ATTR_DESCR_MASK;
595 if (desc != L0_TABLE) {
596 *level = -1;
597 return (NULL);
598 }
599
600 l1 = pmap_l0_to_l1(l0, va);
601 desc = pmap_load(l1) & ATTR_DESCR_MASK;
602 if (desc != L1_TABLE) {
603 *level = 0;
604 return (l0);
605 }
606
607 l2 = pmap_l1_to_l2(l1, va);
608 desc = pmap_load(l2) & ATTR_DESCR_MASK;
609 if (desc != L2_TABLE) {
610 *level = 1;
611 return (l1);
612 }
613
614 *level = 2;
615 return (l2);
616 }
617
618 /*
619 * Returns the lowest valid pte block or table entry for a given virtual
620 * address. If there are no valid entries return NULL and set the level to
621 * the first invalid level.
622 */
623 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)624 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
625 {
626 pd_entry_t *l1, *l2, desc;
627 pt_entry_t *l3;
628
629 l1 = pmap_l1(pmap, va);
630 if (l1 == NULL) {
631 *level = 0;
632 return (NULL);
633 }
634 desc = pmap_load(l1) & ATTR_DESCR_MASK;
635 if (desc == L1_BLOCK) {
636 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
637 *level = 1;
638 return (l1);
639 }
640
641 if (desc != L1_TABLE) {
642 *level = 1;
643 return (NULL);
644 }
645
646 l2 = pmap_l1_to_l2(l1, va);
647 desc = pmap_load(l2) & ATTR_DESCR_MASK;
648 if (desc == L2_BLOCK) {
649 *level = 2;
650 return (l2);
651 }
652
653 if (desc != L2_TABLE) {
654 *level = 2;
655 return (NULL);
656 }
657
658 *level = 3;
659 l3 = pmap_l2_to_l3(l2, va);
660 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
661 return (NULL);
662
663 return (l3);
664 }
665
666 /*
667 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
668 * level that maps the specified virtual address, then a pointer to that entry
669 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled
670 * and a diagnostic message is provided, in which case this function panics.
671 */
672 static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap,vm_offset_t va,int level,const char * diag)673 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
674 {
675 pd_entry_t *l0p, *l1p, *l2p;
676 pt_entry_t desc, *l3p;
677 int walk_level __diagused;
678
679 KASSERT(level >= 0 && level < 4,
680 ("%s: %s passed an out-of-range level (%d)", __func__, diag,
681 level));
682 l0p = pmap_l0(pmap, va);
683 desc = pmap_load(l0p) & ATTR_DESCR_MASK;
684 if (desc == L0_TABLE && level > 0) {
685 l1p = pmap_l0_to_l1(l0p, va);
686 desc = pmap_load(l1p) & ATTR_DESCR_MASK;
687 if (desc == L1_BLOCK && level == 1) {
688 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
689 return (l1p);
690 }
691 if (desc == L1_TABLE && level > 1) {
692 l2p = pmap_l1_to_l2(l1p, va);
693 desc = pmap_load(l2p) & ATTR_DESCR_MASK;
694 if (desc == L2_BLOCK && level == 2)
695 return (l2p);
696 else if (desc == L2_TABLE && level > 2) {
697 l3p = pmap_l2_to_l3(l2p, va);
698 desc = pmap_load(l3p) & ATTR_DESCR_MASK;
699 if (desc == L3_PAGE && level == 3)
700 return (l3p);
701 else
702 walk_level = 3;
703 } else
704 walk_level = 2;
705 } else
706 walk_level = 1;
707 } else
708 walk_level = 0;
709 KASSERT(diag == NULL,
710 ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
711 diag, va, level, desc, walk_level));
712 return (NULL);
713 }
714
715 bool
pmap_ps_enabled(pmap_t pmap)716 pmap_ps_enabled(pmap_t pmap)
717 {
718 /*
719 * Promotion requires a hypervisor call when the kernel is running
720 * in EL1. To stop this disable superpage support on non-stage 1
721 * pmaps for now.
722 */
723 if (pmap->pm_stage != PM_STAGE1)
724 return (false);
725
726 return (superpages_enabled != 0);
727 }
728
729 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)730 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
731 pd_entry_t **l2, pt_entry_t **l3)
732 {
733 pd_entry_t *l0p, *l1p, *l2p;
734
735 if (pmap->pm_l0 == NULL)
736 return (false);
737
738 l0p = pmap_l0(pmap, va);
739 *l0 = l0p;
740
741 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
742 return (false);
743
744 l1p = pmap_l0_to_l1(l0p, va);
745 *l1 = l1p;
746
747 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
748 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
749 *l2 = NULL;
750 *l3 = NULL;
751 return (true);
752 }
753
754 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
755 return (false);
756
757 l2p = pmap_l1_to_l2(l1p, va);
758 *l2 = l2p;
759
760 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
761 *l3 = NULL;
762 return (true);
763 }
764
765 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
766 return (false);
767
768 *l3 = pmap_l2_to_l3(l2p, va);
769
770 return (true);
771 }
772
773 static __inline int
pmap_l3_valid(pt_entry_t l3)774 pmap_l3_valid(pt_entry_t l3)
775 {
776
777 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
778 }
779
780 CTASSERT(L1_BLOCK == L2_BLOCK);
781
782 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)783 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
784 {
785 pt_entry_t val;
786
787 if (pmap->pm_stage == PM_STAGE1) {
788 val = ATTR_S1_IDX(memattr);
789 if (memattr == VM_MEMATTR_DEVICE)
790 val |= ATTR_S1_XN;
791 return (val);
792 }
793
794 val = 0;
795
796 switch (memattr) {
797 case VM_MEMATTR_DEVICE:
798 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
799 ATTR_S2_XN(ATTR_S2_XN_ALL));
800 case VM_MEMATTR_UNCACHEABLE:
801 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
802 case VM_MEMATTR_WRITE_BACK:
803 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
804 case VM_MEMATTR_WRITE_THROUGH:
805 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
806 default:
807 panic("%s: invalid memory attribute %x", __func__, memattr);
808 }
809 }
810
811 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)812 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
813 {
814 pt_entry_t val;
815
816 val = 0;
817 if (pmap->pm_stage == PM_STAGE1) {
818 if ((prot & VM_PROT_EXECUTE) == 0)
819 val |= ATTR_S1_XN;
820 if ((prot & VM_PROT_WRITE) == 0)
821 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
822 } else {
823 if ((prot & VM_PROT_WRITE) != 0)
824 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
825 if ((prot & VM_PROT_READ) != 0)
826 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
827 if ((prot & VM_PROT_EXECUTE) == 0)
828 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
829 }
830
831 return (val);
832 }
833
834 /*
835 * Checks if the PTE is dirty.
836 */
837 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)838 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
839 {
840
841 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
842
843 if (pmap->pm_stage == PM_STAGE1) {
844 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
845 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
846
847 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
848 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
849 }
850
851 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
852 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
853 }
854
855 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)856 pmap_resident_count_inc(pmap_t pmap, int count)
857 {
858
859 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
860 pmap->pm_stats.resident_count += count;
861 }
862
863 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)864 pmap_resident_count_dec(pmap_t pmap, int count)
865 {
866
867 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
868 KASSERT(pmap->pm_stats.resident_count >= count,
869 ("pmap %p resident count underflow %ld %d", pmap,
870 pmap->pm_stats.resident_count, count));
871 pmap->pm_stats.resident_count -= count;
872 }
873
874 static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)875 pmap_early_vtophys(vm_offset_t va)
876 {
877 vm_paddr_t pa_page;
878
879 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
880 return (pa_page | (va & PAR_LOW_MASK));
881 }
882
/*
 * State of the bootstrapped DMAP page tables.
 *
 * The pmap_bootstrap_l{0,1}_table() helpers use this as a cursor: they
 * derive table slots from "va", cache the table pointers and slot numbers
 * they resolved, and take zeroed pages for new tables from "freemempos".
 */
struct pmap_bootstrap_state {
	pt_entry_t	*l1;		/* L1 table covering va */
	pt_entry_t	*l2;		/* L2 table covering va, NULL after an L0 slot change */
	pt_entry_t	*l3;		/* reset to NULL on an L0/L1 slot change; NOTE(review): presumably the L3 table -- confirm */
	vm_offset_t	freemempos;	/* next free page for a new table; advanced by PAGE_SIZE per table */
	vm_offset_t	va;		/* virtual address the table slots are derived from */
	vm_paddr_t	pa;		/* NOTE(review): presumably the physical address being mapped -- confirm */
	pt_entry_t	table_attrs;	/* attribute bits OR'd into new L1 table entries */
	u_int		l0_slot;	/* last L0 slot resolved; L0_ENTRIES means none yet */
	u_int		l1_slot;	/* last L1 slot resolved; Ln_ENTRIES means none yet */
	u_int		l2_slot;	/* reset to Ln_ENTRIES on an L0/L1 slot change */
	bool		dmap_valid;	/* when set, existing valid entries are followed via PHYS_TO_DMAP() */
};
897
/*
 * The bootstrap state.
 *
 * The slot fields start at L0_ENTRIES / Ln_ENTRIES, values the bootstrap
 * helpers accept as "no slot resolved yet" in their ordering assertions,
 * and the table pointers start out NULL.  Fields without an initializer
 * (freemempos, va, pa) are zero-initialized.
 */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	.table_attrs = TATTR_PXN_TABLE,
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};
909
/*
 * Make state->l1 refer to the L1 table that covers state->va.
 *
 * Nothing changes when the L0 slot for state->va is the one already cached
 * in state->l0_slot.  Otherwise the cached lower-level state is reset and
 * one of two things happens:
 *  - if the DMAP is usable (state->dmap_valid) and the L0 entry in
 *    pagetable_l0_ttbr1 is already valid, its L1 table is reused through
 *    the direct map and the function returns immediately;
 *  - otherwise a page is taken from state->freemempos, zeroed, and linked
 *    into pagetable_l0_ttbr1 as a new L0 table entry.
 */
static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready. This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
		 */
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				/* Reuse the table the entry already points at. */
				l1_pa = PTE_TO_PHYS(l0e);
				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		/* The new table must be table-aligned and the slot still empty. */
		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}
961
962 static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state * state)963 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
964 {
965 vm_paddr_t l2_pa;
966 pd_entry_t l1e;
967 u_int l1_slot;
968
969 /* Make sure there is a valid L0 -> L1 table */
970 pmap_bootstrap_l0_table(state);
971
972 /* Link the level 1 table to a level 2 table */
973 l1_slot = pmap_l1_index(state->va);
974 if (l1_slot != state->l1_slot) {
975 /* See pmap_bootstrap_l0_table for a description */
976 MPASS(state->l1_slot < l1_slot ||
977 state->l1_slot == Ln_ENTRIES ||
978 state->dmap_valid);
979
980 /* Reset lower levels */
981 state->l3 = NULL;
982 state->l2_slot = Ln_ENTRIES;
983
984 /* Check the existing L1 entry */
985 state->l1_slot = l1_slot;
986 if (state->dmap_valid) {
987 l1e = state->l1[l1_slot];
988 if ((l1e & ATTR_DESCR_VALID) != 0) {
989 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
990 l2_pa = PTE_TO_PHYS(l1e);
991 state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
992 return;
993 }
994 }
995
996 /* Create a new L1 table entry */
997 state->l2 = (pt_entry_t *)state->freemempos;
998 memset(state->l2, 0, PAGE_SIZE);
999 state->freemempos += PAGE_SIZE;
1000
1001 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1002 MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1003 MPASS(state->l1[l1_slot] == 0);
1004 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1005 state->table_attrs | L1_TABLE);
1006 }
1007 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1008 }
1009
1010 static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state * state)1011 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1012 {
1013 vm_paddr_t l3_pa;
1014 pd_entry_t l2e;
1015 u_int l2_slot;
1016
1017 /* Make sure there is a valid L1 -> L2 table */
1018 pmap_bootstrap_l1_table(state);
1019
1020 /* Link the level 2 table to a level 3 table */
1021 l2_slot = pmap_l2_index(state->va);
1022 if (l2_slot != state->l2_slot) {
1023 /* See pmap_bootstrap_l0_table for a description */
1024 MPASS(state->l2_slot < l2_slot ||
1025 state->l2_slot == Ln_ENTRIES ||
1026 state->dmap_valid);
1027
1028 /* Check the existing L2 entry */
1029 state->l2_slot = l2_slot;
1030 if (state->dmap_valid) {
1031 l2e = state->l2[l2_slot];
1032 if ((l2e & ATTR_DESCR_VALID) != 0) {
1033 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1034 l3_pa = PTE_TO_PHYS(l2e);
1035 state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1036 return;
1037 }
1038 }
1039
1040 /* Create a new L2 table entry */
1041 state->l3 = (pt_entry_t *)state->freemempos;
1042 memset(state->l3, 0, PAGE_SIZE);
1043 state->freemempos += PAGE_SIZE;
1044
1045 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1046 MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1047 MPASS(state->l2[l2_slot] == 0);
1048 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1049 state->table_attrs | L2_TABLE);
1050 }
1051 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1052 }
1053
1054 static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state * state,int i)1055 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1056 {
1057 u_int l2_slot;
1058 bool first;
1059
1060 if ((physmap[i + 1] - state->pa) < L2_SIZE)
1061 return;
1062
1063 /* Make sure there is a valid L1 table */
1064 pmap_bootstrap_l1_table(state);
1065
1066 MPASS((state->va & L2_OFFSET) == 0);
1067 for (first = true;
1068 state->va < DMAP_MAX_ADDRESS &&
1069 (physmap[i + 1] - state->pa) >= L2_SIZE;
1070 state->va += L2_SIZE, state->pa += L2_SIZE) {
1071 /*
1072 * Stop if we are about to walk off the end of what the
1073 * current L1 slot can address.
1074 */
1075 if (!first && (state->pa & L1_OFFSET) == 0)
1076 break;
1077
1078 first = false;
1079 l2_slot = pmap_l2_index(state->va);
1080 MPASS((state->pa & L2_OFFSET) == 0);
1081 MPASS(state->l2[l2_slot] == 0);
1082 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1083 ATTR_DEFAULT | ATTR_S1_XN |
1084 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
1085 }
1086 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1087 }
1088
1089 static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state * state,int i)1090 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1091 {
1092 u_int l3_slot;
1093 bool first;
1094
1095 if ((physmap[i + 1] - state->pa) < L3_SIZE)
1096 return;
1097
1098 /* Make sure there is a valid L2 table */
1099 pmap_bootstrap_l2_table(state);
1100
1101 MPASS((state->va & L3_OFFSET) == 0);
1102 for (first = true;
1103 state->va < DMAP_MAX_ADDRESS &&
1104 (physmap[i + 1] - state->pa) >= L3_SIZE;
1105 state->va += L3_SIZE, state->pa += L3_SIZE) {
1106 /*
1107 * Stop if we are about to walk off the end of what the
1108 * current L2 slot can address.
1109 */
1110 if (!first && (state->pa & L2_OFFSET) == 0)
1111 break;
1112
1113 first = false;
1114 l3_slot = pmap_l3_index(state->va);
1115 MPASS((state->pa & L3_OFFSET) == 0);
1116 MPASS(state->l3[l3_slot] == 0);
1117 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1118 ATTR_DEFAULT | ATTR_S1_XN |
1119 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L3_PAGE);
1120 }
1121 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1122 }
1123
/*
 * Build the direct map (DMAP) for every range described by physmap[].
 *
 * min_pa, rounded down to an L1 boundary, becomes dmap_phys_base; each
 * physical address is mapped at (pa - dmap_phys_base + DMAP_MIN_ADDRESS).
 * Every range is covered with the largest mappings its alignment allows:
 * L3 pages up to the first L2 boundary, then L2 blocks (and, when
 * L1_BLOCKS_SUPPORTED, L1 blocks) through the middle, and finally L2
 * blocks and L3 pages for the tail.  On return dmap_phys_max and
 * dmap_max_addr describe the highest physical/virtual address reached.
 */
static void
pmap_bootstrap_dmap(vm_paddr_t min_pa)
{
	int i;

	dmap_phys_base = min_pa & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	/* physmap[] holds (start, end) pairs, hence the stride of two. */
	for (i = 0; i < (physmap_idx * 2); i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    ATTR_S1_XN | L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			/*
			 * Without L1 blocks, keep calling the L2 helper:
			 * each call stops at an L1 boundary, so several
			 * calls may be needed to cover the region.
			 */
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		/* Track the highest address the DMAP now covers. */
		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	cpu_tlb_flushID();
}
1185
1186 static void
pmap_bootstrap_l2(vm_offset_t va)1187 pmap_bootstrap_l2(vm_offset_t va)
1188 {
1189 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1190
1191 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1192 bs_state.va = va;
1193
1194 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1195 pmap_bootstrap_l1_table(&bs_state);
1196 }
1197
1198 static void
pmap_bootstrap_l3(vm_offset_t va)1199 pmap_bootstrap_l3(vm_offset_t va)
1200 {
1201 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1202
1203 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1204 bs_state.va = va;
1205
1206 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1207 pmap_bootstrap_l2_table(&bs_state);
1208 }
1209
#ifdef KASAN
/*
 * Back part of the KASAN shadow map with L2 blocks taken from the physical
 * range [start_pa, end_pa).
 *
 * Physical memory is consumed from the top of the range downwards while
 * the shadow virtual address advances upwards from *start_va.  At most
 * *nkasan_l2 L2 slots are processed.  On return *nkasan_l2 has been reduced
 * by the number of slots handled and *start_va points at the next shadow
 * address to populate.  The physical memory used is zeroed through the
 * DMAP and excluded from later allocation.
 */
static void
pmap_bootstrap_allocate_kasan_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
    vm_offset_t *start_va, int *nkasan_l2)
{
	int i;
	vm_paddr_t pa;
	vm_offset_t va;
	pd_entry_t *l2;

	va = *start_va;
	/* Start with the highest L2-aligned block that fits below end_pa. */
	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
	l2 = pmap_l2(kernel_pmap, va);

	for (i = 0; pa >= start_pa && i < *nkasan_l2;
	    i++, va += L2_SIZE, pa -= L2_SIZE, l2++) {
		/*
		 * KASAN stack checking results in us having already allocated
		 * part of our shadow map, so we can just skip those segments.
		 * Bumping pa here cancels the decrement in the loop header,
		 * so no physical block is consumed for a skipped slot.
		 */
		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
			pa += L2_SIZE;
			continue;
		}

		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
	}

	/*
	 * Ended the allocation due to start_pa constraint, rather than because
	 * we allocated everything.  Adjust back up to the start_pa and remove
	 * the invalid L2 block from our accounting.
	 */
	if (pa < start_pa) {
		va += L2_SIZE;
		i--;
		pa = start_pa;
	}

	/* Clear the new shadow memory and keep the allocator away from it. */
	bzero((void *)PHYS_TO_DMAP(pa), i * L2_SIZE);
	physmem_exclude_region(pa, i * L2_SIZE, EXFLAG_NOALLOC);

	*nkasan_l2 -= i;
	*start_va = va;
}
#endif
1256
1257 /*
1258 * Bootstrap the system enough to run with virtual memory.
1259 */
1260 void
pmap_bootstrap(vm_size_t kernlen)1261 pmap_bootstrap(vm_size_t kernlen)
1262 {
1263 vm_offset_t dpcpu, msgbufpv;
1264 vm_paddr_t start_pa, pa, min_pa;
1265 int i;
1266
1267 /* Verify that the ASID is set through TTBR0. */
1268 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1269 ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1270
1271 /* Set this early so we can use the pagetable walking functions */
1272 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1273 PMAP_LOCK_INIT(kernel_pmap);
1274 kernel_pmap->pm_l0_paddr =
1275 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1276 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1277 vm_radix_init(&kernel_pmap->pm_root);
1278 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1279 kernel_pmap->pm_stage = PM_STAGE1;
1280 kernel_pmap->pm_levels = 4;
1281 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1282 kernel_pmap->pm_asid_set = &asids;
1283
1284 /* Assume the address we were loaded to is a valid physical address */
1285 min_pa = pmap_early_vtophys(KERNBASE);
1286
1287 physmap_idx = physmem_avail(physmap, nitems(physmap));
1288 physmap_idx /= 2;
1289
1290 /*
1291 * Find the minimum physical address. physmap is sorted,
1292 * but may contain empty ranges.
1293 */
1294 for (i = 0; i < physmap_idx * 2; i += 2) {
1295 if (physmap[i] == physmap[i + 1])
1296 continue;
1297 if (physmap[i] <= min_pa)
1298 min_pa = physmap[i];
1299 }
1300
1301 bs_state.freemempos = KERNBASE + kernlen;
1302 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1303
1304 /* Create a direct map region early so we can use it for pa -> va */
1305 pmap_bootstrap_dmap(min_pa);
1306 bs_state.dmap_valid = true;
1307 /*
1308 * We only use PXN when we know nothing will be executed from it, e.g.
1309 * the DMAP region.
1310 */
1311 bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1312
1313 start_pa = pa = pmap_early_vtophys(KERNBASE);
1314
1315 /*
1316 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
1317 * loader allocated the first and only l2 page table page used to map
1318 * the kernel, preloaded files and module metadata.
1319 */
1320 pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1321 /* And the l3 tables for the early devmap */
1322 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1323
1324 cpu_tlb_flushID();
1325
1326 #define alloc_pages(var, np) \
1327 (var) = bs_state.freemempos; \
1328 bs_state.freemempos += (np * PAGE_SIZE); \
1329 memset((char *)(var), 0, ((np) * PAGE_SIZE));
1330
1331 /* Allocate dynamic per-cpu area. */
1332 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1333 dpcpu_init((void *)dpcpu, 0);
1334
1335 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1336 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1337 msgbufp = (void *)msgbufpv;
1338
1339 /* Reserve some VA space for early BIOS/ACPI mapping */
1340 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1341
1342 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1343 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1344 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1345 kernel_vm_end = virtual_avail;
1346
1347 pa = pmap_early_vtophys(bs_state.freemempos);
1348
1349 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1350
1351 cpu_tlb_flushID();
1352 }
1353
#if defined(KASAN)
/*
 * Finish constructing the initial shadow map:
 * - Work out how many shadow pages are needed for the KVA between
 *   VM_MIN_KERNEL_ADDRESS and virtual_avail (scaled for the shadow map).
 * - Back that whole range with L2 superpages.
 *
 * Panics if physmap does not provide enough suitable memory.
 */
void
pmap_bootstrap_san(void)
{
	vm_offset_t va;
	vm_paddr_t kernstart, phigh, plow;
	int i, nkasan_l2, shadow_npages;

	kernstart = pmap_early_vtophys(KERNBASE);

	/*
	 * More regions may have been excluded from allocation since
	 * pmap_bootstrap(), so physmap is rebuilt from scratch.
	 */
	bzero(physmap, sizeof(physmap));
	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	shadow_npages = (virtual_avail - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE;
	shadow_npages = howmany(shadow_npages, KASAN_SHADOW_SCALE);
	nkasan_l2 = howmany(shadow_npages, Ln_ENTRIES);

	/* Map the valid KVA up to this point. */
	va = KASAN_MIN_ADDRESS;

	/*
	 * Walk physmap from its highest range downwards.  Placing the
	 * shadow map as high as possible avoids depleting the lower 4GB,
	 * which may be needed by, e.g., an xhci controller that can only
	 * do 32-bit DMA.
	 */
	for (i = (physmap_idx * 2) - 2; i >= 0 && nkasan_l2 > 0; i -= 2) {
		/* L2 mappings must be backed by memory that is L2-aligned */
		plow = roundup2(physmap[i], L2_SIZE);
		phigh = physmap[i + 1];
		if (plow < phigh) {
			/* Stay below the kernel if it lives in this range. */
			if (kernstart >= plow && kernstart < phigh)
				phigh = kernstart;
			if (phigh - plow >= L2_SIZE)
				pmap_bootstrap_allocate_kasan_l2(plow, phigh,
				    &va, &nkasan_l2);
		}
	}

	if (nkasan_l2 != 0)
		panic("Could not find phys region for shadow map");

	/*
	 * Done.  We should now have a valid shadow address mapped for all KVA
	 * that has been mapped so far, i.e., KERNBASE to virtual_avail.  Thus,
	 * shadow accesses by the kasan(9) runtime will succeed for this range.
	 * When the kernel virtual address range is later expanded, as will
	 * happen in vm_mem_init(), the shadow map will be grown as well.  This
	 * is handled by pmap_san_enter().
	 */
}
#endif
1418
1419 /*
1420 * Initialize a vm_page's machine-dependent fields.
1421 */
1422 void
pmap_page_init(vm_page_t m)1423 pmap_page_init(vm_page_t m)
1424 {
1425
1426 TAILQ_INIT(&m->md.pv_list);
1427 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1428 }
1429
1430 static void
pmap_init_asids(struct asid_set * set,int bits)1431 pmap_init_asids(struct asid_set *set, int bits)
1432 {
1433 int i;
1434
1435 set->asid_bits = bits;
1436
1437 /*
1438 * We may be too early in the overall initialization process to use
1439 * bit_alloc().
1440 */
1441 set->asid_set_size = 1 << set->asid_bits;
1442 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1443 M_WAITOK | M_ZERO);
1444 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1445 bit_set(set->asid_set, i);
1446 set->asid_next = ASID_FIRST_AVAILABLE;
1447 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1448 }
1449
/*
 * Allocate and initialise pv_table, the array of per-superpage PV list
 * headers, and point each vm_phys segment at its slice of the array.
 *
 * The table is sized from the number of L2-sized superpages each segment
 * in vm_phys_segs[] spans, is backed by pages allocated from the segment's
 * own domain, and every entry gets its lock and PV list initialised.
 * pv_dummy_large is initialised the same way.
 */
static void
pmap_init_pv_table(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	int domain, i, j, pages;

	/*
	 * We strongly depend on the size being a power of two, so the assert
	 * is overzealous. However, should the struct be resized to a
	 * different power of two, the code below needs to be revisited.
	 */
	CTASSERT((sizeof(*pvd) == 64));

	/*
	 * Calculate the size of the array.  Each segment contributes a
	 * page-rounded number of bytes.
	 */
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		s += round_page(pages * sizeof(*pvd));
	}
	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate domain-local memory for PV
	 * list headers.
	 */
	pvd = pv_table;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		domain = seg->domain;

		s = round_page(pages * sizeof(*pvd));

		/* Back this segment's slice of the table, a page at a time. */
		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain,
			    VM_ALLOC_ZERO);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
		}

		/* Initialise every entry that fits in the pages just mapped. */
		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd++;
		}
	}
	/* Set up the standalone dummy entry the same way. */
	pvd = &pv_dummy_large;
	memset(pvd, 0, sizeof(*pvd));
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvd;
		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
		 */
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvd--;
			}
		}
	}
}
1536
1537 /*
1538 * Initialize the pmap module.
1539 *
1540 * Called by vm_mem_init(), to initialize any structures that the pmap
1541 * system needs to map virtual memory.
1542 */
1543 void
pmap_init(void)1544 pmap_init(void)
1545 {
1546 uint64_t mmfr1;
1547 int i, vmid_bits;
1548
1549 /*
1550 * Are large page mappings enabled?
1551 */
1552 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1553 if (superpages_enabled) {
1554 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1555 ("pmap_init: can't assign to pagesizes[1]"));
1556 pagesizes[1] = L2_SIZE;
1557 if (L1_BLOCKS_SUPPORTED) {
1558 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1559 ("pmap_init: can't assign to pagesizes[2]"));
1560 pagesizes[2] = L1_SIZE;
1561 }
1562 }
1563
1564 /*
1565 * Initialize the ASID allocator.
1566 */
1567 pmap_init_asids(&asids,
1568 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1569
1570 if (has_hyp()) {
1571 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1572 vmid_bits = 8;
1573
1574 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1575 ID_AA64MMFR1_VMIDBits_16)
1576 vmid_bits = 16;
1577 pmap_init_asids(&vmids, vmid_bits);
1578 }
1579
1580 /*
1581 * Initialize pv chunk lists.
1582 */
1583 for (i = 0; i < PMAP_MEMDOM; i++) {
1584 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1585 MTX_DEF);
1586 TAILQ_INIT(&pv_chunks[i].pvc_list);
1587 }
1588 pmap_init_pv_table();
1589
1590 vm_initialized = 1;
1591 }
1592
/*
 * Read-only statistics about 2MB (L2) mappings, exported as children of
 * the "l2" sysctl node declared here under _vm_pmap.  The counters are
 * updated elsewhere in this file.
 */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

/* Number of 2MB page demotions. */
static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0, "2MB page demotions");

/* Number of 2MB page mappings. */
static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0, "2MB page mappings");

/* Number of failed 2MB page promotions. */
static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0, "2MB page promotion failures");

/* Number of 2MB page promotions. */
static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0, "2MB page promotions");
1611
1612 /*
1613 * If the given value for "final_only" is false, then any cached intermediate-
1614 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1615 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1616 * Otherwise, just the cached final-level entry is invalidated.
1617 */
1618 static __inline void
pmap_s1_invalidate_kernel(uint64_t r,bool final_only)1619 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1620 {
1621 if (final_only)
1622 __asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1623 else
1624 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1625 }
1626
1627 static __inline void
pmap_s1_invalidate_user(uint64_t r,bool final_only)1628 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1629 {
1630 if (final_only)
1631 __asm __volatile("tlbi vale1is, %0" : : "r" (r));
1632 else
1633 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1634 }
1635
1636 /*
1637 * Invalidates any cached final- and optionally intermediate-level TLB entries
1638 * for the specified virtual address in the given virtual address space.
1639 */
1640 static __inline void
pmap_s1_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1641 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1642 {
1643 uint64_t r;
1644
1645 PMAP_ASSERT_STAGE1(pmap);
1646
1647 dsb(ishst);
1648 r = TLBI_VA(va);
1649 if (pmap == kernel_pmap) {
1650 pmap_s1_invalidate_kernel(r, final_only);
1651 } else {
1652 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1653 pmap_s1_invalidate_user(r, final_only);
1654 }
1655 dsb(ish);
1656 isb();
1657 }
1658
1659 static __inline void
pmap_s2_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1660 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1661 {
1662 PMAP_ASSERT_STAGE2(pmap);
1663 MPASS(pmap_stage2_invalidate_range != NULL);
1664 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1665 final_only);
1666 }
1667
1668 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1669 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1670 {
1671 if (pmap->pm_stage == PM_STAGE1)
1672 pmap_s1_invalidate_page(pmap, va, final_only);
1673 else
1674 pmap_s2_invalidate_page(pmap, va, final_only);
1675 }
1676
1677 /*
1678 * Invalidates any cached final- and optionally intermediate-level TLB entries
1679 * for the specified virtual address range in the given virtual address space.
1680 */
1681 static __inline void
pmap_s1_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1682 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1683 bool final_only)
1684 {
1685 uint64_t end, r, start;
1686
1687 PMAP_ASSERT_STAGE1(pmap);
1688
1689 dsb(ishst);
1690 if (pmap == kernel_pmap) {
1691 start = TLBI_VA(sva);
1692 end = TLBI_VA(eva);
1693 for (r = start; r < end; r += TLBI_VA_L3_INCR)
1694 pmap_s1_invalidate_kernel(r, final_only);
1695 } else {
1696 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1697 start |= TLBI_VA(sva);
1698 end |= TLBI_VA(eva);
1699 for (r = start; r < end; r += TLBI_VA_L3_INCR)
1700 pmap_s1_invalidate_user(r, final_only);
1701 }
1702 dsb(ish);
1703 isb();
1704 }
1705
1706 static __inline void
pmap_s2_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1707 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1708 bool final_only)
1709 {
1710 PMAP_ASSERT_STAGE2(pmap);
1711 MPASS(pmap_stage2_invalidate_range != NULL);
1712 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1713 }
1714
1715 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1716 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1717 bool final_only)
1718 {
1719 if (pmap->pm_stage == PM_STAGE1)
1720 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1721 else
1722 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1723 }
1724
1725 /*
1726 * Invalidates all cached intermediate- and final-level TLB entries for the
1727 * given virtual address space.
1728 */
1729 static __inline void
pmap_s1_invalidate_all(pmap_t pmap)1730 pmap_s1_invalidate_all(pmap_t pmap)
1731 {
1732 uint64_t r;
1733
1734 PMAP_ASSERT_STAGE1(pmap);
1735
1736 dsb(ishst);
1737 if (pmap == kernel_pmap) {
1738 __asm __volatile("tlbi vmalle1is");
1739 } else {
1740 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1741 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
1742 }
1743 dsb(ish);
1744 isb();
1745 }
1746
1747 static __inline void
pmap_s2_invalidate_all(pmap_t pmap)1748 pmap_s2_invalidate_all(pmap_t pmap)
1749 {
1750 PMAP_ASSERT_STAGE2(pmap);
1751 MPASS(pmap_stage2_invalidate_all != NULL);
1752 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1753 }
1754
1755 static __inline void
pmap_invalidate_all(pmap_t pmap)1756 pmap_invalidate_all(pmap_t pmap)
1757 {
1758 if (pmap->pm_stage == PM_STAGE1)
1759 pmap_s1_invalidate_all(pmap);
1760 else
1761 pmap_s2_invalidate_all(pmap);
1762 }
1763
1764 /*
1765 * Routine: pmap_extract
1766 * Function:
1767 * Extract the physical page address associated
1768 * with the given map/virtual_address pair.
1769 */
1770 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)1771 pmap_extract(pmap_t pmap, vm_offset_t va)
1772 {
1773 pt_entry_t *pte, tpte;
1774 vm_paddr_t pa;
1775 int lvl;
1776
1777 pa = 0;
1778 PMAP_LOCK(pmap);
1779 /*
1780 * Find the block or page map for this virtual address. pmap_pte
1781 * will return either a valid block/page entry, or NULL.
1782 */
1783 pte = pmap_pte(pmap, va, &lvl);
1784 if (pte != NULL) {
1785 tpte = pmap_load(pte);
1786 pa = PTE_TO_PHYS(tpte);
1787 switch(lvl) {
1788 case 1:
1789 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1790 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1791 ("pmap_extract: Invalid L1 pte found: %lx",
1792 tpte & ATTR_DESCR_MASK));
1793 pa |= (va & L1_OFFSET);
1794 break;
1795 case 2:
1796 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1797 ("pmap_extract: Invalid L2 pte found: %lx",
1798 tpte & ATTR_DESCR_MASK));
1799 pa |= (va & L2_OFFSET);
1800 break;
1801 case 3:
1802 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1803 ("pmap_extract: Invalid L3 pte found: %lx",
1804 tpte & ATTR_DESCR_MASK));
1805 pa |= (va & L3_OFFSET);
1806 break;
1807 }
1808 }
1809 PMAP_UNLOCK(pmap);
1810 return (pa);
1811 }
1812
1813 /*
1814 * Routine: pmap_extract_and_hold
1815 * Function:
1816 * Atomically extract and hold the physical page
1817 * with the given pmap and virtual address pair
1818 * if that mapping permits the given protection.
1819 */
1820 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)1821 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1822 {
1823 pt_entry_t *pte, tpte;
1824 vm_offset_t off;
1825 vm_page_t m;
1826 int lvl;
1827 bool use;
1828
1829 m = NULL;
1830 PMAP_LOCK(pmap);
1831 pte = pmap_pte(pmap, va, &lvl);
1832 if (pte != NULL) {
1833 tpte = pmap_load(pte);
1834
1835 KASSERT(lvl > 0 && lvl <= 3,
1836 ("pmap_extract_and_hold: Invalid level %d", lvl));
1837 /*
1838 * Check that the pte is either a L3 page, or a L1 or L2 block
1839 * entry. We can assume L1_BLOCK == L2_BLOCK.
1840 */
1841 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1842 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1843 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1844 tpte & ATTR_DESCR_MASK));
1845
1846 use = false;
1847 if ((prot & VM_PROT_WRITE) == 0)
1848 use = true;
1849 else if (pmap->pm_stage == PM_STAGE1 &&
1850 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1851 use = true;
1852 else if (pmap->pm_stage == PM_STAGE2 &&
1853 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1854 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1855 use = true;
1856
1857 if (use) {
1858 switch (lvl) {
1859 case 1:
1860 off = va & L1_OFFSET;
1861 break;
1862 case 2:
1863 off = va & L2_OFFSET;
1864 break;
1865 case 3:
1866 default:
1867 off = 0;
1868 }
1869 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
1870 if (m != NULL && !vm_page_wire_mapped(m))
1871 m = NULL;
1872 }
1873 }
1874 PMAP_UNLOCK(pmap);
1875 return (m);
1876 }
1877
1878 /*
1879 * Walks the page tables to translate a kernel virtual address to a
1880 * physical address. Returns true if the kva is valid and stores the
1881 * physical address in pa if it is not NULL.
1882 *
1883 * See the comment above data_abort() for the rationale for specifying
1884 * NO_PERTHREAD_SSP here.
1885 */
1886 bool NO_PERTHREAD_SSP
pmap_klookup(vm_offset_t va,vm_paddr_t * pa)1887 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1888 {
1889 pt_entry_t *pte, tpte;
1890 register_t intr;
1891 uint64_t par;
1892
1893 /*
1894 * Disable interrupts so we don't get interrupted between asking
1895 * for address translation, and getting the result back.
1896 */
1897 intr = intr_disable();
1898 par = arm64_address_translate_s1e1r(va);
1899 intr_restore(intr);
1900
1901 if (PAR_SUCCESS(par)) {
1902 if (pa != NULL)
1903 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
1904 return (true);
1905 }
1906
1907 /*
1908 * Fall back to walking the page table. The address translation
1909 * instruction may fail when the page is in a break-before-make
1910 * sequence. As we only clear the valid bit in said sequence we
1911 * can walk the page table to find the physical address.
1912 */
1913
1914 pte = pmap_l1(kernel_pmap, va);
1915 if (pte == NULL)
1916 return (false);
1917
1918 /*
1919 * A concurrent pmap_update_entry() will clear the entry's valid bit
1920 * but leave the rest of the entry unchanged. Therefore, we treat a
1921 * non-zero entry as being valid, and we ignore the valid bit when
1922 * determining whether the entry maps a block, page, or table.
1923 */
1924 tpte = pmap_load(pte);
1925 if (tpte == 0)
1926 return (false);
1927 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1928 if (pa != NULL)
1929 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
1930 return (true);
1931 }
1932 pte = pmap_l1_to_l2(&tpte, va);
1933 tpte = pmap_load(pte);
1934 if (tpte == 0)
1935 return (false);
1936 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1937 if (pa != NULL)
1938 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
1939 return (true);
1940 }
1941 pte = pmap_l2_to_l3(&tpte, va);
1942 tpte = pmap_load(pte);
1943 if (tpte == 0)
1944 return (false);
1945 if (pa != NULL)
1946 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
1947 return (true);
1948 }
1949
1950 vm_paddr_t
pmap_kextract(vm_offset_t va)1951 pmap_kextract(vm_offset_t va)
1952 {
1953 vm_paddr_t pa;
1954
1955 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
1956 return (DMAP_TO_PHYS(va));
1957
1958 if (pmap_klookup(va, &pa) == false)
1959 return (0);
1960 return (pa);
1961 }
1962
1963 /***************************************************
1964 * Low level mapping routines.....
1965 ***************************************************/
1966
1967 void
pmap_kenter(vm_offset_t sva,vm_size_t size,vm_paddr_t pa,int mode)1968 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1969 {
1970 pd_entry_t *pde;
1971 pt_entry_t attr, old_l3e, *pte;
1972 vm_offset_t va;
1973 int lvl;
1974
1975 KASSERT((pa & L3_OFFSET) == 0,
1976 ("pmap_kenter: Invalid physical address"));
1977 KASSERT((sva & L3_OFFSET) == 0,
1978 ("pmap_kenter: Invalid virtual address"));
1979 KASSERT((size & PAGE_MASK) == 0,
1980 ("pmap_kenter: Mapping is not page-sized"));
1981
1982 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1983 ATTR_S1_IDX(mode) | L3_PAGE;
1984 old_l3e = 0;
1985 va = sva;
1986 while (size != 0) {
1987 pde = pmap_pde(kernel_pmap, va, &lvl);
1988 KASSERT(pde != NULL,
1989 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1990 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1991
1992 pte = pmap_l2_to_l3(pde, va);
1993 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr);
1994
1995 va += PAGE_SIZE;
1996 pa += PAGE_SIZE;
1997 size -= PAGE_SIZE;
1998 }
1999 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2000 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2001 else {
2002 /*
2003 * Because the old entries were invalid and the new mappings
2004 * are not executable, an isb is not required.
2005 */
2006 dsb(ishst);
2007 }
2008 }
2009
2010 void
pmap_kenter_device(vm_offset_t sva,vm_size_t size,vm_paddr_t pa)2011 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2012 {
2013
2014 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2015 }
2016
2017 /*
2018 * Remove a page from the kernel pagetables.
2019 */
2020 void
pmap_kremove(vm_offset_t va)2021 pmap_kremove(vm_offset_t va)
2022 {
2023 pt_entry_t *pte;
2024
2025 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2026 pmap_clear(pte);
2027 pmap_s1_invalidate_page(kernel_pmap, va, true);
2028 }
2029
2030 /*
2031 * Remove the specified range of mappings from the kernel address space.
2032 *
2033 * Should only be applied to mappings that were created by pmap_kenter() or
2034 * pmap_kenter_device(). Nothing about this function is actually specific
2035 * to device mappings.
2036 */
2037 void
pmap_kremove_device(vm_offset_t sva,vm_size_t size)2038 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2039 {
2040 pt_entry_t *pte;
2041 vm_offset_t va;
2042
2043 KASSERT((sva & L3_OFFSET) == 0,
2044 ("pmap_kremove_device: Invalid virtual address"));
2045 KASSERT((size & PAGE_MASK) == 0,
2046 ("pmap_kremove_device: Mapping is not page-sized"));
2047
2048 va = sva;
2049 while (size != 0) {
2050 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2051 pmap_clear(pte);
2052
2053 va += PAGE_SIZE;
2054 size -= PAGE_SIZE;
2055 }
2056 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2057 }
2058
2059 /*
2060 * Used to map a range of physical addresses into kernel
2061 * virtual address space.
2062 *
2063 * The value passed in '*virt' is a suggested virtual address for
2064 * the mapping. Architectures which can support a direct-mapped
2065 * physical to virtual region can return the appropriate address
2066 * within that region, leaving '*virt' unchanged. Other
2067 * architectures should map the pages starting at '*virt' and
2068 * update '*virt' with the first usable address after the mapped
2069 * region.
2070 */
2071 vm_offset_t
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)2072 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2073 {
2074 return PHYS_TO_DMAP(start);
2075 }
2076
2077 /*
2078 * Add a list of wired pages to the kva
2079 * this routine is only used for temporary
2080 * kernel mappings that do not need to have
2081 * page modification or references recorded.
2082 * Note that old mappings are simply written
2083 * over. The page *must* be wired.
2084 * Note: SMP coherent. Uses a ranged shootdown IPI.
2085 */
2086 void
pmap_qenter(vm_offset_t sva,vm_page_t * ma,int count)2087 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2088 {
2089 pd_entry_t *pde;
2090 pt_entry_t attr, old_l3e, pa, *pte;
2091 vm_offset_t va;
2092 vm_page_t m;
2093 int i, lvl;
2094
2095 old_l3e = 0;
2096 va = sva;
2097 for (i = 0; i < count; i++) {
2098 pde = pmap_pde(kernel_pmap, va, &lvl);
2099 KASSERT(pde != NULL,
2100 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2101 KASSERT(lvl == 2,
2102 ("pmap_qenter: Invalid level %d", lvl));
2103
2104 m = ma[i];
2105 pa = VM_PAGE_TO_PHYS(m);
2106 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2107 ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2108 pte = pmap_l2_to_l3(pde, va);
2109 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr);
2110
2111 va += L3_SIZE;
2112 }
2113 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2114 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2115 else {
2116 /*
2117 * Because the old entries were invalid and the new mappings
2118 * are not executable, an isb is not required.
2119 */
2120 dsb(ishst);
2121 }
2122 }
2123
2124 /*
2125 * This routine tears out page mappings from the
2126 * kernel -- it is meant only for temporary mappings.
2127 */
2128 void
pmap_qremove(vm_offset_t sva,int count)2129 pmap_qremove(vm_offset_t sva, int count)
2130 {
2131 pt_entry_t *pte;
2132 vm_offset_t va;
2133
2134 KASSERT(ADDR_IS_CANONICAL(sva),
2135 ("%s: Address not in canonical form: %lx", __func__, sva));
2136 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2137
2138 va = sva;
2139 while (count-- > 0) {
2140 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2141 if (pte != NULL) {
2142 pmap_clear(pte);
2143 }
2144
2145 va += PAGE_SIZE;
2146 }
2147 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2148 }
2149
2150 /***************************************************
2151 * Page table page management routines.....
2152 ***************************************************/
2153 /*
2154 * Schedule the specified unused page table page to be freed. Specifically,
2155 * add the page to the specified list of pages that will be released to the
2156 * physical memory manager after the TLB has been updated.
2157 */
2158 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,boolean_t set_PG_ZERO)2159 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2160 boolean_t set_PG_ZERO)
2161 {
2162
2163 if (set_PG_ZERO)
2164 m->flags |= PG_ZERO;
2165 else
2166 m->flags &= ~PG_ZERO;
2167 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2168 }
2169
2170 /*
2171 * Decrements a page table page's reference count, which is used to record the
2172 * number of valid page table entries within the page. If the reference count
2173 * drops to zero, then the page table page is unmapped. Returns TRUE if the
2174 * page table page was unmapped and FALSE otherwise.
2175 */
2176 static inline boolean_t
pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2177 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2178 {
2179
2180 --m->ref_count;
2181 if (m->ref_count == 0) {
2182 _pmap_unwire_l3(pmap, va, m, free);
2183 return (TRUE);
2184 } else
2185 return (FALSE);
2186 }
2187
2188 static void
_pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2189 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2190 {
2191
2192 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2193 /*
2194 * unmap the page table page
2195 */
2196 if (m->pindex >= (NUL2E + NUL1E)) {
2197 /* l1 page */
2198 pd_entry_t *l0;
2199
2200 l0 = pmap_l0(pmap, va);
2201 pmap_clear(l0);
2202 } else if (m->pindex >= NUL2E) {
2203 /* l2 page */
2204 pd_entry_t *l1;
2205
2206 l1 = pmap_l1(pmap, va);
2207 pmap_clear(l1);
2208 } else {
2209 /* l3 page */
2210 pd_entry_t *l2;
2211
2212 l2 = pmap_l2(pmap, va);
2213 pmap_clear(l2);
2214 }
2215 pmap_resident_count_dec(pmap, 1);
2216 if (m->pindex < NUL2E) {
2217 /* We just released an l3, unhold the matching l2 */
2218 pd_entry_t *l1, tl1;
2219 vm_page_t l2pg;
2220
2221 l1 = pmap_l1(pmap, va);
2222 tl1 = pmap_load(l1);
2223 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl1));
2224 pmap_unwire_l3(pmap, va, l2pg, free);
2225 } else if (m->pindex < (NUL2E + NUL1E)) {
2226 /* We just released an l2, unhold the matching l1 */
2227 pd_entry_t *l0, tl0;
2228 vm_page_t l1pg;
2229
2230 l0 = pmap_l0(pmap, va);
2231 tl0 = pmap_load(l0);
2232 l1pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl0));
2233 pmap_unwire_l3(pmap, va, l1pg, free);
2234 }
2235 pmap_invalidate_page(pmap, va, false);
2236
2237 /*
2238 * Put page on a list so that it is released after
2239 * *ALL* TLB shootdown is done
2240 */
2241 pmap_add_delayed_free_list(m, free, TRUE);
2242 }
2243
2244 /*
2245 * After removing a page table entry, this routine is used to
2246 * conditionally free the page, and manage the reference count.
2247 */
2248 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)2249 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2250 struct spglist *free)
2251 {
2252 vm_page_t mpte;
2253
2254 KASSERT(ADDR_IS_CANONICAL(va),
2255 ("%s: Address not in canonical form: %lx", __func__, va));
2256 if (ADDR_IS_KERNEL(va))
2257 return (0);
2258 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2259 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
2260 return (pmap_unwire_l3(pmap, va, mpte, free));
2261 }
2262
2263 /*
2264 * Release a page table page reference after a failed attempt to create a
2265 * mapping.
2266 */
2267 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t mpte)2268 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2269 {
2270 struct spglist free;
2271
2272 SLIST_INIT(&free);
2273 if (pmap_unwire_l3(pmap, va, mpte, &free))
2274 vm_page_free_pages_toq(&free, true);
2275 }
2276
2277 void
pmap_pinit0(pmap_t pmap)2278 pmap_pinit0(pmap_t pmap)
2279 {
2280
2281 PMAP_LOCK_INIT(pmap);
2282 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2283 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2284 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2285 TAILQ_INIT(&pmap->pm_pvchunk);
2286 vm_radix_init(&pmap->pm_root);
2287 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2288 pmap->pm_stage = PM_STAGE1;
2289 pmap->pm_levels = 4;
2290 pmap->pm_ttbr = pmap->pm_l0_paddr;
2291 pmap->pm_asid_set = &asids;
2292
2293 PCPU_SET(curpmap, pmap);
2294 }
2295
2296 int
pmap_pinit_stage(pmap_t pmap,enum pmap_stage stage,int levels)2297 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2298 {
2299 vm_page_t m;
2300
2301 /*
2302 * allocate the l0 page
2303 */
2304 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2305 VM_ALLOC_ZERO);
2306 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2307 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2308
2309 TAILQ_INIT(&pmap->pm_pvchunk);
2310 vm_radix_init(&pmap->pm_root);
2311 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2312 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2313
2314 MPASS(levels == 3 || levels == 4);
2315 pmap->pm_levels = levels;
2316 pmap->pm_stage = stage;
2317 switch (stage) {
2318 case PM_STAGE1:
2319 pmap->pm_asid_set = &asids;
2320 break;
2321 case PM_STAGE2:
2322 pmap->pm_asid_set = &vmids;
2323 break;
2324 default:
2325 panic("%s: Invalid pmap type %d", __func__, stage);
2326 break;
2327 }
2328
2329 /* XXX Temporarily disable deferred ASID allocation. */
2330 pmap_alloc_asid(pmap);
2331
2332 /*
2333 * Allocate the level 1 entry to use as the root. This will increase
2334 * the refcount on the level 1 page so it won't be removed until
2335 * pmap_release() is called.
2336 */
2337 if (pmap->pm_levels == 3) {
2338 PMAP_LOCK(pmap);
2339 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2340 PMAP_UNLOCK(pmap);
2341 }
2342 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2343
2344 return (1);
2345 }
2346
2347 int
pmap_pinit(pmap_t pmap)2348 pmap_pinit(pmap_t pmap)
2349 {
2350
2351 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2352 }
2353
2354 /*
2355 * This routine is called if the desired page table page does not exist.
2356 *
2357 * If page table page allocation fails, this routine may sleep before
2358 * returning NULL. It sleeps only if a lock pointer was given.
2359 *
2360 * Note: If a page allocation fails at page table level two or three,
2361 * one or two pages may be held during the wait, only to be released
2362 * afterwards. This conservative approach is easily argued to avoid
2363 * race conditions.
2364 */
2365 static vm_page_t
_pmap_alloc_l3(pmap_t pmap,vm_pindex_t ptepindex,struct rwlock ** lockp)2366 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2367 {
2368 vm_page_t m, l1pg, l2pg;
2369
2370 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2371
2372 /*
2373 * Allocate a page table page.
2374 */
2375 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2376 if (lockp != NULL) {
2377 RELEASE_PV_LIST_LOCK(lockp);
2378 PMAP_UNLOCK(pmap);
2379 vm_wait(NULL);
2380 PMAP_LOCK(pmap);
2381 }
2382
2383 /*
2384 * Indicate the need to retry. While waiting, the page table
2385 * page may have been allocated.
2386 */
2387 return (NULL);
2388 }
2389 m->pindex = ptepindex;
2390
2391 /*
2392 * Because of AArch64's weak memory consistency model, we must have a
2393 * barrier here to ensure that the stores for zeroing "m", whether by
2394 * pmap_zero_page() or an earlier function, are visible before adding
2395 * "m" to the page table. Otherwise, a page table walk by another
2396 * processor's MMU could see the mapping to "m" and a stale, non-zero
2397 * PTE within "m".
2398 */
2399 dmb(ishst);
2400
2401 /*
2402 * Map the pagetable page into the process address space, if
2403 * it isn't already there.
2404 */
2405
2406 if (ptepindex >= (NUL2E + NUL1E)) {
2407 pd_entry_t *l0p, l0e;
2408 vm_pindex_t l0index;
2409
2410 l0index = ptepindex - (NUL2E + NUL1E);
2411 l0p = &pmap->pm_l0[l0index];
2412 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2413 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2414 l0e = PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L0_TABLE;
2415
2416 /*
2417 * Mark all kernel memory as not accessible from userspace
2418 * and userspace memory as not executable from the kernel.
2419 * This has been done for the bootstrap L0 entries in
2420 * locore.S.
2421 */
2422 if (pmap == kernel_pmap)
2423 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2424 else
2425 l0e |= TATTR_PXN_TABLE;
2426 pmap_store(l0p, l0e);
2427 } else if (ptepindex >= NUL2E) {
2428 vm_pindex_t l0index, l1index;
2429 pd_entry_t *l0, *l1;
2430 pd_entry_t tl0;
2431
2432 l1index = ptepindex - NUL2E;
2433 l0index = l1index >> Ln_ENTRIES_SHIFT;
2434
2435 l0 = &pmap->pm_l0[l0index];
2436 tl0 = pmap_load(l0);
2437 if (tl0 == 0) {
2438 /* recurse for allocating page dir */
2439 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2440 lockp) == NULL) {
2441 vm_page_unwire_noq(m);
2442 vm_page_free_zero(m);
2443 return (NULL);
2444 }
2445 } else {
2446 l1pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl0));
2447 l1pg->ref_count++;
2448 }
2449
2450 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2451 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2452 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2453 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2454 pmap_store(l1, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L1_TABLE);
2455 } else {
2456 vm_pindex_t l0index, l1index;
2457 pd_entry_t *l0, *l1, *l2;
2458 pd_entry_t tl0, tl1;
2459
2460 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2461 l0index = l1index >> Ln_ENTRIES_SHIFT;
2462
2463 l0 = &pmap->pm_l0[l0index];
2464 tl0 = pmap_load(l0);
2465 if (tl0 == 0) {
2466 /* recurse for allocating page dir */
2467 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2468 lockp) == NULL) {
2469 vm_page_unwire_noq(m);
2470 vm_page_free_zero(m);
2471 return (NULL);
2472 }
2473 tl0 = pmap_load(l0);
2474 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2475 l1 = &l1[l1index & Ln_ADDR_MASK];
2476 } else {
2477 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2478 l1 = &l1[l1index & Ln_ADDR_MASK];
2479 tl1 = pmap_load(l1);
2480 if (tl1 == 0) {
2481 /* recurse for allocating page dir */
2482 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2483 lockp) == NULL) {
2484 vm_page_unwire_noq(m);
2485 vm_page_free_zero(m);
2486 return (NULL);
2487 }
2488 } else {
2489 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tl1));
2490 l2pg->ref_count++;
2491 }
2492 }
2493
2494 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2495 l2 = &l2[ptepindex & Ln_ADDR_MASK];
2496 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2497 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2498 pmap_store(l2, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L2_TABLE);
2499 }
2500
2501 pmap_resident_count_inc(pmap, 1);
2502
2503 return (m);
2504 }
2505
2506 static pd_entry_t *
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,vm_page_t * l2pgp,struct rwlock ** lockp)2507 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2508 struct rwlock **lockp)
2509 {
2510 pd_entry_t *l1, *l2;
2511 vm_page_t l2pg;
2512 vm_pindex_t l2pindex;
2513
2514 KASSERT(ADDR_IS_CANONICAL(va),
2515 ("%s: Address not in canonical form: %lx", __func__, va));
2516
2517 retry:
2518 l1 = pmap_l1(pmap, va);
2519 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2520 l2 = pmap_l1_to_l2(l1, va);
2521 if (!ADDR_IS_KERNEL(va)) {
2522 /* Add a reference to the L2 page. */
2523 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
2524 l2pg->ref_count++;
2525 } else
2526 l2pg = NULL;
2527 } else if (!ADDR_IS_KERNEL(va)) {
2528 /* Allocate a L2 page. */
2529 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2530 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2531 if (l2pg == NULL) {
2532 if (lockp != NULL)
2533 goto retry;
2534 else
2535 return (NULL);
2536 }
2537 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2538 l2 = &l2[pmap_l2_index(va)];
2539 } else
2540 panic("pmap_alloc_l2: missing page table page for va %#lx",
2541 va);
2542 *l2pgp = l2pg;
2543 return (l2);
2544 }
2545
2546 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)2547 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2548 {
2549 vm_pindex_t ptepindex;
2550 pd_entry_t *pde, tpde;
2551 #ifdef INVARIANTS
2552 pt_entry_t *pte;
2553 #endif
2554 vm_page_t m;
2555 int lvl;
2556
2557 /*
2558 * Calculate pagetable page index
2559 */
2560 ptepindex = pmap_l2_pindex(va);
2561 retry:
2562 /*
2563 * Get the page directory entry
2564 */
2565 pde = pmap_pde(pmap, va, &lvl);
2566
2567 /*
2568 * If the page table page is mapped, we just increment the hold count,
2569 * and activate it. If we get a level 2 pde it will point to a level 3
2570 * table.
2571 */
2572 switch (lvl) {
2573 case -1:
2574 break;
2575 case 0:
2576 #ifdef INVARIANTS
2577 pte = pmap_l0_to_l1(pde, va);
2578 KASSERT(pmap_load(pte) == 0,
2579 ("pmap_alloc_l3: TODO: l0 superpages"));
2580 #endif
2581 break;
2582 case 1:
2583 #ifdef INVARIANTS
2584 pte = pmap_l1_to_l2(pde, va);
2585 KASSERT(pmap_load(pte) == 0,
2586 ("pmap_alloc_l3: TODO: l1 superpages"));
2587 #endif
2588 break;
2589 case 2:
2590 tpde = pmap_load(pde);
2591 if (tpde != 0) {
2592 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpde));
2593 m->ref_count++;
2594 return (m);
2595 }
2596 break;
2597 default:
2598 panic("pmap_alloc_l3: Invalid level %d", lvl);
2599 }
2600
2601 /*
2602 * Here if the pte page isn't mapped, or if it has been deallocated.
2603 */
2604 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2605 if (m == NULL && lockp != NULL)
2606 goto retry;
2607
2608 return (m);
2609 }
2610
2611 /***************************************************
2612 * Pmap allocation/deallocation routines.
2613 ***************************************************/
2614
2615 /*
2616 * Release any resources held by the given physical map.
2617 * Called when a pmap initialized by pmap_pinit is being released.
2618 * Should only be called if the map contains no valid mappings.
2619 */
2620 void
pmap_release(pmap_t pmap)2621 pmap_release(pmap_t pmap)
2622 {
2623 boolean_t rv __diagused;
2624 struct spglist free;
2625 struct asid_set *set;
2626 vm_page_t m;
2627 int asid;
2628
2629 if (pmap->pm_levels != 4) {
2630 PMAP_ASSERT_STAGE2(pmap);
2631 KASSERT(pmap->pm_stats.resident_count == 1,
2632 ("pmap_release: pmap resident count %ld != 0",
2633 pmap->pm_stats.resident_count));
2634 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2635 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2636
2637 SLIST_INIT(&free);
2638 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2639 PMAP_LOCK(pmap);
2640 rv = pmap_unwire_l3(pmap, 0, m, &free);
2641 PMAP_UNLOCK(pmap);
2642 MPASS(rv == TRUE);
2643 vm_page_free_pages_toq(&free, true);
2644 }
2645
2646 KASSERT(pmap->pm_stats.resident_count == 0,
2647 ("pmap_release: pmap resident count %ld != 0",
2648 pmap->pm_stats.resident_count));
2649 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2650 ("pmap_release: pmap has reserved page table page(s)"));
2651
2652 set = pmap->pm_asid_set;
2653 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2654
2655 /*
2656 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
2657 * the entries when removing them so rely on a later tlb invalidation.
2658 * this will happen when updating the VMID generation. Because of this
2659 * we don't reuse VMIDs within a generation.
2660 */
2661 if (pmap->pm_stage == PM_STAGE1) {
2662 mtx_lock_spin(&set->asid_set_mutex);
2663 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2664 asid = COOKIE_TO_ASID(pmap->pm_cookie);
2665 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2666 asid < set->asid_set_size,
2667 ("pmap_release: pmap cookie has out-of-range asid"));
2668 bit_clear(set->asid_set, asid);
2669 }
2670 mtx_unlock_spin(&set->asid_set_mutex);
2671 }
2672
2673 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2674 vm_page_unwire_noq(m);
2675 vm_page_free_zero(m);
2676 }
2677
2678 static int
kvm_size(SYSCTL_HANDLER_ARGS)2679 kvm_size(SYSCTL_HANDLER_ARGS)
2680 {
2681 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2682
2683 return sysctl_handle_long(oidp, &ksize, 0, req);
2684 }
2685 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2686 0, 0, kvm_size, "LU",
2687 "Size of KVM");
2688
2689 static int
kvm_free(SYSCTL_HANDLER_ARGS)2690 kvm_free(SYSCTL_HANDLER_ARGS)
2691 {
2692 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2693
2694 return sysctl_handle_long(oidp, &kfree, 0, req);
2695 }
2696 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2697 0, 0, kvm_free, "LU",
2698 "Amount of KVM free");
2699
2700 /*
2701 * grow the number of kernel page table entries, if needed
2702 */
2703 void
pmap_growkernel(vm_offset_t addr)2704 pmap_growkernel(vm_offset_t addr)
2705 {
2706 vm_paddr_t paddr;
2707 vm_page_t nkpg;
2708 pd_entry_t *l0, *l1, *l2;
2709
2710 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2711
2712 addr = roundup2(addr, L2_SIZE);
2713 if (addr - 1 >= vm_map_max(kernel_map))
2714 addr = vm_map_max(kernel_map);
2715 if (kernel_vm_end < addr)
2716 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2717 while (kernel_vm_end < addr) {
2718 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2719 KASSERT(pmap_load(l0) != 0,
2720 ("pmap_growkernel: No level 0 kernel entry"));
2721
2722 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2723 if (pmap_load(l1) == 0) {
2724 /* We need a new PDP entry */
2725 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2726 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2727 if (nkpg == NULL)
2728 panic("pmap_growkernel: no memory to grow kernel");
2729 nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2730 /* See the dmb() in _pmap_alloc_l3(). */
2731 dmb(ishst);
2732 paddr = VM_PAGE_TO_PHYS(nkpg);
2733 pmap_store(l1, PHYS_TO_PTE(paddr) | L1_TABLE);
2734 continue; /* try again */
2735 }
2736 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2737 if (pmap_load(l2) != 0) {
2738 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2739 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2740 kernel_vm_end = vm_map_max(kernel_map);
2741 break;
2742 }
2743 continue;
2744 }
2745
2746 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2747 VM_ALLOC_ZERO);
2748 if (nkpg == NULL)
2749 panic("pmap_growkernel: no memory to grow kernel");
2750 nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2751 /* See the dmb() in _pmap_alloc_l3(). */
2752 dmb(ishst);
2753 paddr = VM_PAGE_TO_PHYS(nkpg);
2754 pmap_store(l2, PHYS_TO_PTE(paddr) | L2_TABLE);
2755
2756 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2757 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2758 kernel_vm_end = vm_map_max(kernel_map);
2759 break;
2760 }
2761 }
2762 }
2763
2764 /***************************************************
2765 * page management routines.
2766 ***************************************************/
2767
/*
 * Per-word masks for the pc_map[] bitmap of a pv chunk: every word except
 * the last is masked with PC_FREEN, and the last word with PC_FREEL.
 * The initializer relies on the GNU range designator extension
 * ("[first ... last] = value").
 */
static const uint64_t pc_freemask[_NPCM] = {
	[0 ... _NPCM - 2] = PC_FREEN,
	[_NPCM - 1] = PC_FREEL
};
2772
/*
 * Optional pv chunk and pv entry statistics.  They are compiled in only
 * when PV_STATS is defined and are exported read-only under the _vm_pmap
 * sysctl node.
 */
#ifdef PV_STATS
/* Counters for whole pv entry chunks. */
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
    "Number of times tried to get a chunk page but failed.");

/* Counters for individual pv entries. */
static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
    "Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
    "Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif
2797
/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Walks the pv chunk list selected by "domain" and destroys the non-wired
 * 4 KB page mappings recorded in each chunk it can lock, until a page
 * becomes available or pv entries have been freed in "locked_pmap".
 *
 * "locked_pmap" must be locked by the caller and "lockp" must not be NULL.
 * The pv list lock held through "lockp" may be released or switched to a
 * different page's lock along the way.
 *
 * Returns a page that the caller may reuse: either the page that backed a
 * pv chunk which became entirely free, or a page table page that was
 * freed while removing mappings.  Returns NULL if no page was obtained; in
 * particular, returns NULL if PV entries were reclaimed from the specified
 * pmap without freeing a page.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static vm_page_t
reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
{
	struct pv_chunks_list *pvc;
	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t next_pmap, pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed, lvl;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));

	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	/*
	 * The two markers are zeroed chunk headers, so their pc_pmap is
	 * NULL; that is how the scan below recognizes a marker.
	 */
	bzero(&pc_marker_b, sizeof(pc_marker_b));
	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
	pc_marker = (struct pv_chunk *)&pc_marker_b;
	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;

	pvc = &pv_chunks[domain];
	mtx_lock(&pvc->pvc_lock);
	pvc->active_reclaims++;
	/* Bracket the chunks that are on the list right now. */
	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
	/*
	 * Examine the chunk following our marker until the end marker is
	 * reached or a page table page has been put on the free list.
	 */
	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
	    SLIST_EMPTY(&free)) {
		next_pmap = pc->pc_pmap;
		if (next_pmap == NULL) {
			/*
			 * The next chunk is a marker.  However, it is
			 * not our marker, so active_reclaims must be
			 * > 1.  Consequently, the next_chunk code
			 * will not rotate the pv_chunks list.
			 */
			goto next_chunk;
		}
		mtx_unlock(&pvc->pvc_lock);

		/*
		 * A pv_chunk can only be removed from the pc_lru list
		 * when both pvc->pvc_lock is owned and the
		 * corresponding pmap is locked.
		 */
		if (pmap != next_pmap) {
			if (pmap != NULL && pmap != locked_pmap)
				PMAP_UNLOCK(pmap);
			pmap = next_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				/*
				 * Block on the other pmap's lock, after
				 * giving up the pv list lock, then re-read
				 * the chunk that follows the marker.
				 */
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
				mtx_lock(&pvc->pvc_lock);
				continue;
			} else if (pmap != locked_pmap) {
				/* Only try the lock; never block here. */
				if (PMAP_TRYLOCK(pmap)) {
					mtx_lock(&pvc->pvc_lock);
					continue;
				} else {
					pmap = NULL; /* pmap is not locked */
					mtx_lock(&pvc->pvc_lock);
					/*
					 * Skip the chunk only if it still
					 * follows the marker and still
					 * belongs to the same pmap.
					 */
					pc = TAILQ_NEXT(pc_marker, pc_lru);
					if (pc == NULL ||
					    pc->pc_pmap != next_pmap)
						continue;
					goto next_chunk;
				}
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* Visit each allocated pv entry in this map word. */
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = ffsl(inuse) - 1;
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va, &lvl);
				/* Only mappings under an L2 table entry. */
				if (lvl != 2)
					continue;
				pte = pmap_l2_to_l3(pde, va);
				tpte = pmap_load(pte);
				if ((tpte & ATTR_SW_WIRED) != 0)
					continue;
				tpte = pmap_load_clear(pte);
				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte));
				/* Carry dirty/referenced state to the page. */
				if (pmap_pte_dirty(pmap, tpte))
					vm_page_dirty(m);
				if ((tpte & ATTR_AF) != 0) {
					pmap_s1_invalidate_page(pmap, va, true);
					vm_page_aflag_set(m, PGA_REFERENCED);
				}
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = page_to_pvh(m);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				/* Mark the pv entry free in the chunk. */
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
				freed++;
			}
		}
		if (freed == 0) {
			mtx_lock(&pvc->pvc_lock);
			goto next_chunk;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc_is_free(pc)) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pvc->pvc_lock);
			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
			break;
		}
		/* The chunk now has free entries; move it to the front. */
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		mtx_lock(&pvc->pvc_lock);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;

next_chunk:
		/* Advance our marker past the chunk just examined. */
		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
		if (pvc->active_reclaims == 1 && pmap != NULL) {
			/*
			 * Rotate the pv chunks list so that we do not
			 * scan the same pv chunks that could not be
			 * freed (because they contained a wired
			 * and/or superpage mapping) on every
			 * invocation of reclaim_pv_chunk().
			 */
			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
				MPASS(pc->pc_pmap != NULL);
				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
			}
		}
	}
	/* Done scanning: take both markers back off the list. */
	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
	pvc->active_reclaims--;
	mtx_unlock(&pvc->pvc_lock);
	if (pmap != NULL && pmap != locked_pmap)
		PMAP_UNLOCK(pmap);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->ref_count = 1;
	}
	/* Release whatever page table pages are still on the list. */
	vm_page_free_pages_toq(&free, true);
	return (m_pc);
}
2988
2989 static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap,struct rwlock ** lockp)2990 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2991 {
2992 vm_page_t m;
2993 int i, domain;
2994
2995 domain = PCPU_GET(domain);
2996 for (i = 0; i < vm_ndomains; i++) {
2997 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
2998 if (m != NULL)
2999 break;
3000 domain = (domain + 1) % vm_ndomains;
3001 }
3002
3003 return (m);
3004 }
3005
3006 /*
3007 * free the pv_entry back to the free list
3008 */
3009 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)3010 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3011 {
3012 struct pv_chunk *pc;
3013 int idx, field, bit;
3014
3015 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3016 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3017 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3018 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3019 pc = pv_to_chunk(pv);
3020 idx = pv - &pc->pc_pventry[0];
3021 field = idx / 64;
3022 bit = idx % 64;
3023 pc->pc_map[field] |= 1ul << bit;
3024 if (!pc_is_free(pc)) {
3025 /* 98% of the time, pc is already at the head of the list. */
3026 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3027 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3028 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3029 }
3030 return;
3031 }
3032 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3033 free_pv_chunk(pc);
3034 }
3035
3036 static void
free_pv_chunk_dequeued(struct pv_chunk * pc)3037 free_pv_chunk_dequeued(struct pv_chunk *pc)
3038 {
3039 vm_page_t m;
3040
3041 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3042 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3043 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3044 /* entire chunk is free, return it */
3045 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3046 dump_drop_page(m->phys_addr);
3047 vm_page_unwire_noq(m);
3048 vm_page_free(m);
3049 }
3050
3051 static void
free_pv_chunk(struct pv_chunk * pc)3052 free_pv_chunk(struct pv_chunk *pc)
3053 {
3054 struct pv_chunks_list *pvc;
3055
3056 pvc = &pv_chunks[pc_to_domain(pc)];
3057 mtx_lock(&pvc->pvc_lock);
3058 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3059 mtx_unlock(&pvc->pvc_lock);
3060 free_pv_chunk_dequeued(pc);
3061 }
3062
3063 static void
free_pv_chunk_batch(struct pv_chunklist * batch)3064 free_pv_chunk_batch(struct pv_chunklist *batch)
3065 {
3066 struct pv_chunks_list *pvc;
3067 struct pv_chunk *pc, *npc;
3068 int i;
3069
3070 for (i = 0; i < vm_ndomains; i++) {
3071 if (TAILQ_EMPTY(&batch[i]))
3072 continue;
3073 pvc = &pv_chunks[i];
3074 mtx_lock(&pvc->pvc_lock);
3075 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3076 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3077 }
3078 mtx_unlock(&pvc->pvc_lock);
3079 }
3080
3081 for (i = 0; i < vm_ndomains; i++) {
3082 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3083 free_pv_chunk_dequeued(pc);
3084 }
3085 }
3086 }
3087
/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
 *
 * The given PV list lock may be released.
 *
 * The pmap lock must be held.  Passing a NULL "lockp" disables reclamation,
 * which is the only case in which NULL can be returned.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
retry:
	/* Only the chunk at the head of the pmap's list is examined. */
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		/* A set bit in pc_map marks a free entry; find the first. */
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			/* Claim the entry by clearing its bit. */
			pv = &pc->pc_pventry[field * 64 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			if (pc_is_full(pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			PV_STAT(atomic_add_long(&pv_entry_count, 1));
			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
	if (m == NULL) {
		/* Without a lock pointer the caller asked for no reclaim. */
		if (lockp == NULL) {
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		m = reclaim_pv_chunk(pmap, lockp);
		/*
		 * If reclamation produced no page, start over: re-check the
		 * head chunk and retry the page allocation.
		 */
		if (m == NULL)
			goto retry;
	}
	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
	dump_add_page(m->phys_addr);
	/* The new chunk lives in the page, accessed via the direct map. */
	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
	pc->pc_pmap = pmap;
	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
	/* Publish the chunk on the list of the domain owning the page. */
	pvc = &pv_chunks[vm_page_domain(m)];
	mtx_lock(&pvc->pvc_lock);
	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
	mtx_unlock(&pvc->pvc_lock);
	/* Entry 0 (whose bit was cleared above) is the one returned. */
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(atomic_add_long(&pv_entry_count, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
	return (pv);
}
3158
/*
 * Ensure that the number of spare PV entries in the specified pmap meets or
 * exceeds the given count, "needed".
 *
 * The given PV list lock may be released.
 *
 * The pmap lock must be held and "lockp" must not be NULL.
 */
static void
reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
{
	struct pv_chunks_list *pvc;
	struct pch new_tail[PMAP_MEMDOM];
	struct pv_chunk *pc;
	vm_page_t m;
	int avail, free, i;
	bool reclaimed;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));

	/*
	 * Newly allocated PV chunks must be stored in a private list until
	 * the required number of PV chunks have been allocated.  Otherwise,
	 * reclaim_pv_chunk() could recycle one of these chunks.  In
	 * contrast, these chunks must be added to the pmap upon allocation.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++)
		TAILQ_INIT(&new_tail[i]);
retry:
	/*
	 * Count the free entries in the pmap's chunks, starting at the head
	 * of the list.  The scan stops at the first chunk with no free
	 * entries or as soon as "needed" has been reached.
	 */
	avail = 0;
	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
		bit_count((bitstr_t *)pc->pc_map, 0,
		    sizeof(pc->pc_map) * NBBY, &free);
		if (free == 0)
			break;
		avail += free;
		if (avail >= needed)
			break;
	}
	/* Each new chunk contributes _NPCPV free entries. */
	for (reclaimed = false; avail < needed; avail += _NPCPV) {
		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
		if (m == NULL) {
			m = reclaim_pv_chunk(pmap, lockp);
			if (m == NULL)
				goto retry;
			reclaimed = true;
		}
		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
		dump_add_page(m->phys_addr);
		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
		pc->pc_pmap = pmap;
		/* Unlike get_pv_entry(), no entry is preallocated here. */
		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		/* Park the chunk on the private per-domain list for now. */
		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));

		/*
		 * The reclaim might have freed a chunk from the current pmap.
		 * If that chunk contained available entries, we need to
		 * re-count the number of available entries.
		 */
		if (reclaimed)
			goto retry;
	}
	/*
	 * The reservation is complete: publish the privately held chunks on
	 * their domains' global lists, taking each domain's lock in turn.
	 */
	for (i = 0; i < vm_ndomains; i++) {
		if (TAILQ_EMPTY(&new_tail[i]))
			continue;
		pvc = &pv_chunks[i];
		mtx_lock(&pvc->pvc_lock);
		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
		mtx_unlock(&pvc->pvc_lock);
	}
}
3232
3233 /*
3234 * First find and then remove the pv entry for the specified pmap and virtual
3235 * address from the specified pv list. Returns the pv entry if found and NULL
3236 * otherwise. This operation can be performed on pv lists for either 4KB or
3237 * 2MB page mappings.
3238 */
3239 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3240 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3241 {
3242 pv_entry_t pv;
3243
3244 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3245 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3246 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3247 pvh->pv_gen++;
3248 break;
3249 }
3250 }
3251 return (pv);
3252 }
3253
/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 *
 * "va" and "pa" must both be 2MB aligned and the pmap lock must be held.
 * No allocation is performed here: the Ln_ENTRIES - 1 additional entries
 * are taken from spare entries already present in the pmap's chunks
 * (presumably reserved beforehand by the caller -- confirm at call sites).
 */
static void
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((va & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
	KASSERT((pa & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		/* Always draw from the chunk at the head of the list. */
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			/* Consume every free entry in this bitmap word. */
			while (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				/* Advance to the next 4KB page and its va. */
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
			    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		/* This chunk is exhausted; rotate it to the tail. */
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	/* The last chunk used may have become full on the final entry. */
	if (pc_is_full(pc)) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
}
3321
3322 /*
3323 * First find and then destroy the pv entry for the specified pmap and virtual
3324 * address. This operation can be performed on pv lists for either 4KB or 2MB
3325 * page mappings.
3326 */
3327 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3328 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3329 {
3330 pv_entry_t pv;
3331
3332 pv = pmap_pvh_remove(pvh, pmap, va);
3333 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3334 free_pv_entry(pmap, pv);
3335 }
3336
3337 /*
3338 * Conditionally create the PV entry for a 4KB page mapping if the required
3339 * memory can be allocated without resorting to reclamation.
3340 */
3341 static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3342 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3343 struct rwlock **lockp)
3344 {
3345 pv_entry_t pv;
3346
3347 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3348 /* Pass NULL instead of the lock pointer to disable reclamation. */
3349 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3350 pv->pv_va = va;
3351 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3352 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3353 m->md.pv_gen++;
3354 return (TRUE);
3355 } else
3356 return (FALSE);
3357 }
3358
3359 /*
3360 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3361 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3362 * false if the PV entry cannot be allocated without resorting to reclamation.
3363 */
3364 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)3365 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3366 struct rwlock **lockp)
3367 {
3368 struct md_page *pvh;
3369 pv_entry_t pv;
3370 vm_paddr_t pa;
3371
3372 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3373 /* Pass NULL instead of the lock pointer to disable reclamation. */
3374 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3375 NULL : lockp)) == NULL)
3376 return (false);
3377 pv->pv_va = va;
3378 pa = PTE_TO_PHYS(l2e);
3379 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3380 pvh = pa_to_pvh(pa);
3381 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3382 pvh->pv_gen++;
3383 return (true);
3384 }
3385
3386 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)3387 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3388 {
3389 pt_entry_t newl2, oldl2 __diagused;
3390 vm_page_t ml3;
3391 vm_paddr_t ml3pa;
3392
3393 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3394 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3395 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3396
3397 ml3 = pmap_remove_pt_page(pmap, va);
3398 if (ml3 == NULL)
3399 panic("pmap_remove_kernel_l2: Missing pt page");
3400
3401 ml3pa = VM_PAGE_TO_PHYS(ml3);
3402 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3403
3404 /*
3405 * If this page table page was unmapped by a promotion, then it
3406 * contains valid mappings. Zero it to invalidate those mappings.
3407 */
3408 if (vm_page_any_valid(ml3))
3409 pagezero((void *)PHYS_TO_DMAP(ml3pa));
3410
3411 /*
3412 * Demote the mapping. The caller must have already invalidated the
3413 * mapping (i.e., the "break" in break-before-make).
3414 */
3415 oldl2 = pmap_load_store(l2, newl2);
3416 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3417 __func__, l2, oldl2));
3418 }
3419
/*
 * pmap_remove_l2: Do the things to unmap a level 2 superpage.
 *
 * "l2" points at the L2 block entry mapping the 2MB-aligned address "sva",
 * and "l1e" is the L1 entry that is passed on to pmap_unuse_pt().  Page
 * table pages that become unused are added to "free".  The PV list lock
 * referenced by "lockp" may be switched.  The pmap lock must be held.
 *
 * Returns the result of pmap_unuse_pt().
 */
static int
pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	pt_entry_t old_l2;
	vm_page_t m, ml3, mt;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
	old_l2 = pmap_load_clear(l2);
	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));

	/*
	 * Since a promotion must break the 4KB page mappings before making
	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
	 */
	pmap_s1_invalidate_page(pmap, sva, true);

	/* Account for all of the 4KB pages covered by the superpage. */
	if (old_l2 & ATTR_SW_WIRED)
		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
	if (old_l2 & ATTR_SW_MANAGED) {
		m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l2));
		pvh = page_to_pvh(m);
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		pmap_pvh_free(pvh, pmap, sva);
		/*
		 * Propagate the superpage's dirty and referenced state to
		 * each constituent page, and clear PGA_WRITEABLE on pages
		 * that no longer have any 4KB or 2MB pv entries.
		 */
		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
			if (pmap_pte_dirty(pmap, old_l2))
				vm_page_dirty(mt);
			if (old_l2 & ATTR_AF)
				vm_page_aflag_set(mt, PGA_REFERENCED);
			if (TAILQ_EMPTY(&mt->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(mt, PGA_WRITEABLE);
		}
	}
	if (pmap == kernel_pmap) {
		/* The kernel pmap gets its L3 table page reinstalled. */
		pmap_remove_kernel_l2(pmap, l2, sva);
	} else {
		/*
		 * For other pmaps, a page table page retained for this
		 * address (if any) is no longer needed and is queued for
		 * freeing.
		 */
		ml3 = pmap_remove_pt_page(pmap, sva);
		if (ml3 != NULL) {
			KASSERT(vm_page_any_valid(ml3),
			    ("pmap_remove_l2: l3 page not promoted"));
			pmap_resident_count_dec(pmap, 1);
			KASSERT(ml3->ref_count == NL3PG,
			    ("pmap_remove_l2: l3 page ref count error"));
			ml3->ref_count = 0;
			pmap_add_delayed_free_list(ml3, free, FALSE);
		}
	}
	return (pmap_unuse_pt(pmap, sva, l1e, free));
}
3477
3478 /*
3479 * pmap_remove_l3: do the things to unmap a page in a process
3480 */
3481 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)3482 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3483 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3484 {
3485 struct md_page *pvh;
3486 pt_entry_t old_l3;
3487 vm_page_t m;
3488
3489 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3490 old_l3 = pmap_load_clear(l3);
3491 pmap_s1_invalidate_page(pmap, va, true);
3492 if (old_l3 & ATTR_SW_WIRED)
3493 pmap->pm_stats.wired_count -= 1;
3494 pmap_resident_count_dec(pmap, 1);
3495 if (old_l3 & ATTR_SW_MANAGED) {
3496 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l3));
3497 if (pmap_pte_dirty(pmap, old_l3))
3498 vm_page_dirty(m);
3499 if (old_l3 & ATTR_AF)
3500 vm_page_aflag_set(m, PGA_REFERENCED);
3501 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3502 pmap_pvh_free(&m->md, pmap, va);
3503 if (TAILQ_EMPTY(&m->md.pv_list) &&
3504 (m->flags & PG_FICTITIOUS) == 0) {
3505 pvh = page_to_pvh(m);
3506 if (TAILQ_EMPTY(&pvh->pv_list))
3507 vm_page_aflag_clear(m, PGA_WRITEABLE);
3508 }
3509 }
3510 return (pmap_unuse_pt(pmap, va, l2e, free));
3511 }
3512
/*
 * Remove the specified range of addresses from the L3 page table that is
 * identified by the given L2 entry.
 *
 * The range [sva, eva) must lie within a single L3 page table.  TLB
 * invalidations are batched: "va" holds the start of the current run of
 * removed pages that still awaits invalidation, and "va == eva" means that
 * no invalidation is pending.  The PV list lock referenced by "lockp" is
 * switched as needed to match each managed page.  The pmap lock must be
 * held.
 */
static void
pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
    vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	struct rwlock *new_lock;
	pt_entry_t *l3, old_l3;
	vm_offset_t va;
	vm_page_t l3pg, m;

	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Start address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
	    ("%s: End address not in canonical form: %lx", __func__, eva));

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
	/* The L3 page's reference count is only tracked for user addresses. */
	l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(PTE_TO_PHYS(l2e)) : NULL;
	va = eva;
	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
		if (!pmap_l3_valid(pmap_load(l3))) {
			/* A hole ends the current run; flush what is pending. */
			if (va != eva) {
				pmap_invalidate_range(pmap, va, sva, true);
				va = eva;
			}
			continue;
		}
		old_l3 = pmap_load_clear(l3);
		if ((old_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count--;
		pmap_resident_count_dec(pmap, 1);
		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
			m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l3));
			if (pmap_pte_dirty(pmap, old_l3))
				vm_page_dirty(m);
			if ((old_l3 & ATTR_AF) != 0)
				vm_page_aflag_set(m, PGA_REFERENCED);
			new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
			if (new_lock != *lockp) {
				if (*lockp != NULL) {
					/*
					 * Pending TLB invalidations must be
					 * performed before the PV list lock is
					 * released.  Otherwise, a concurrent
					 * pmap_remove_all() on a physical page
					 * could return while a stale TLB entry
					 * still provides access to that page.
					 */
					if (va != eva) {
						pmap_invalidate_range(pmap, va,
						    sva, true);
						va = eva;
					}
					rw_wunlock(*lockp);
				}
				*lockp = new_lock;
				rw_wlock(*lockp);
			}
			pmap_pvh_free(&m->md, pmap, sva);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    (m->flags & PG_FICTITIOUS) == 0) {
				pvh = page_to_pvh(m);
				if (TAILQ_EMPTY(&pvh->pv_list))
					vm_page_aflag_clear(m, PGA_WRITEABLE);
			}
		}
		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
			/*
			 * _pmap_unwire_l3() has already invalidated the TLB
			 * entries at all levels for "sva".  So, we need not
			 * perform "sva += L3_SIZE;" here.  Moreover, we need
			 * not perform "va = sva;" if "sva" is at the start
			 * of a new valid range consisting of a single page.
			 */
			break;
		}
		/* Begin a new pending run at this page if none is open. */
		if (va == eva)
			va = sva;
	}
	/* Flush whatever run is still pending at loop exit. */
	if (va != eva)
		pmap_invalidate_range(pmap, va, sva, true);
}
3600
/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 *
 *	The page tables are walked from L0 downwards.  "va_next" is the
 *	start of the next region to examine at the level currently being
 *	inspected; on address wrap-around it is clamped to "eva".  Page
 *	table pages that become unused are collected on a local list and
 *	freed after the pmap lock has been dropped.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rwlock *lock;
	vm_offset_t va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t l3_paddr;
	struct spglist free;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	SLIST_INIT(&free);

	PMAP_LOCK(pmap);

	lock = NULL;
	for (; sva < eva; sva = va_next) {
		/* Nothing is left mapped in this pmap; stop early. */
		if (pmap->pm_stats.resident_count == 0)
			break;

		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/*
			 * An L1 block mapping is only ever removed whole; it
			 * must be unmanaged and must not belong to the
			 * kernel pmap.
			 */
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS(pmap != kernel_pmap);
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			pmap_clear(l1);
			pmap_s1_invalidate_page(pmap, sva, true);
			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (l2 == NULL)
			continue;

		l3_paddr = pmap_load(l2);

		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * Remove the superpage outright when the range
			 * covers all of it; otherwise demote it so that only
			 * the requested 4KB pages are removed below.  If the
			 * demotion fails, move on to the next L2 entry.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
				    &free, &lock);
				continue;
			} else if (pmap_demote_l2_locked(pmap, l2, sva,
			    &lock) == NULL)
				continue;
			l3_paddr = pmap_load(l2);
		}

		/*
		 * Weed out invalid mappings.
		 */
		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
		    &lock);
	}
	/* Drop whichever PV list lock the helpers left held. */
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	vm_page_free_pages_toq(&free, true);
}
3706
/*
 *	Remove the given range of addresses as part of a logical unmap
 *	operation.  This has the effect of calling pmap_remove(), but
 *	also clears any metadata that should persist for the lifetime
 *	of a logical mapping.
 *
 *	In this implementation the function does nothing beyond calling
 *	pmap_remove() on the range.
 */
void
pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	pmap_remove(pmap, sva, eva);
}
3718
/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 *
 *		The page must be managed.  The page's PV list lock is held
 *		for writing throughout, except while blocking on a pmap
 *		lock.  Because the pmap lock cannot be acquired with
 *		blocking while the PV list lock is held, a failed trylock
 *		drops the PV list lock, takes both locks in the other order,
 *		and restarts the scan if the pv list generation counts show
 *		that the lists changed in the meantime.
 */

void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	struct spglist free;
	int lvl, pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/* Fictitious pages have no superpage pv list; use the dummy one. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/* Phase 1: demote every 2MB mapping that includes this page. */
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		pmap_demote_l2_locked(pmap, pte, va, &lock);
		PMAP_UNLOCK(pmap);
	}
	/* Phase 2: tear down each remaining 4KB mapping of the page. */
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pmap_resident_count_dec(pmap, 1);

		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_remove_all: no page directory entry found"));
		KASSERT(lvl == 2,
		    ("pmap_remove_all: invalid pde level %d", lvl));
		tpde = pmap_load(pde);

		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load_clear(pte);
		if (tpte & ATTR_SW_WIRED)
			pmap->pm_stats.wired_count--;
		/*
		 * The TLB is only invalidated when the access flag was set
		 * (presumably because an entry that was never accessed
		 * cannot have been cached -- confirm against the
		 * architecture manual).
		 */
		if ((tpte & ATTR_AF) != 0) {
			pmap_invalidate_page(pmap, pv->pv_va, true);
			vm_page_aflag_set(m, PGA_REFERENCED);
		}

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	/* No mappings remain, so the page cannot be written via any pmap. */
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
}
3815
3816 /*
3817 * Masks and sets bits in a level 2 page table entries in the specified pmap
3818 */
3819 static void
pmap_protect_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pt_entry_t mask,pt_entry_t nbits)3820 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
3821 pt_entry_t nbits)
3822 {
3823 pd_entry_t old_l2;
3824 vm_page_t m, mt;
3825
3826 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3827 PMAP_ASSERT_STAGE1(pmap);
3828 KASSERT((sva & L2_OFFSET) == 0,
3829 ("pmap_protect_l2: sva is not 2mpage aligned"));
3830 old_l2 = pmap_load(l2);
3831 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3832 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
3833
3834 /*
3835 * Return if the L2 entry already has the desired access restrictions
3836 * in place.
3837 */
3838 if ((old_l2 & mask) == nbits)
3839 return;
3840
3841 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
3842 cpu_spinwait();
3843
3844 /*
3845 * When a dirty read/write superpage mapping is write protected,
3846 * update the dirty field of each of the superpage's constituent 4KB
3847 * pages.
3848 */
3849 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
3850 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3851 pmap_pte_dirty(pmap, old_l2)) {
3852 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l2));
3853 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3854 vm_page_dirty(mt);
3855 }
3856
3857 /*
3858 * Since a promotion must break the 4KB page mappings before making
3859 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3860 */
3861 pmap_s1_invalidate_page(pmap, sva, true);
3862 }
3863
/*
 * Masks and sets bits in last level page table entries in the specified
 * pmap and range
 *
 * For every valid leaf entry in [sva, eva), the bits selected by "mask" are
 * replaced with "nbits".  L1 blocks must be covered entirely; an L2 block
 * is updated in place when covered entirely and demoted otherwise.  TLB
 * invalidations are issued only when "invalidate" is true.  The pmap lock
 * must be held.
 */
static void
pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
    pt_entry_t nbits, bool invalidate)
{
	vm_offset_t va, va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t *l3p, l3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	for (; sva < eva; sva = va_next) {
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			/* L1 blocks are unmanaged and updated as a whole. */
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			KASSERT(va_next <= eva,
			    ("partial update of non-transparent 1G page "
			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
			    pmap_load(l1), sva, eva, va_next));
			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
			if ((pmap_load(l1) & mask) != nbits) {
				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
				if (invalidate)
					pmap_s1_invalidate_page(pmap, sva, true);
			}
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (pmap_load(l2) == 0)
			continue;

		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
			/*
			 * Update a fully covered superpage in place;
			 * otherwise demote it, skipping it if that fails.
			 */
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_protect_l2(pmap, l2, sva, mask, nbits);
				continue;
			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
				continue;
		}
		/*
		 * NOTE(review): the assertion message below names
		 * "pmap_protect" rather than this function.
		 */
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_protect: Invalid L2 entry after demotion"));

		if (va_next > eva)
			va_next = eva;

		/*
		 * "va" is the start of the run of modified entries that
		 * still awaits invalidation; "va == va_next" means that
		 * none is pending.
		 */
		va = va_next;
		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
		    sva += L3_SIZE) {
			l3 = pmap_load(l3p);

			/*
			 * Go to the next L3 entry if the current one is
			 * invalid or already has the desired access
			 * restrictions in place.  (The latter case occurs
			 * frequently.  For example, in a "buildworld"
			 * workload, almost 1 out of 4 L3 entries already
			 * have the desired restrictions.)
			 */
			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
				if (va != va_next) {
					if (invalidate)
						pmap_s1_invalidate_range(pmap,
						    va, sva, true);
					va = va_next;
				}
				continue;
			}

			/*
			 * On failure atomic_fcmpset_64() reloads "l3", so
			 * the new value is recomputed on each attempt.
			 */
			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
			    nbits))
				cpu_spinwait();

			/*
			 * When a dirty read/write mapping is write protected,
			 * update the page's dirty field.
			 */
			if ((l3 & ATTR_SW_MANAGED) != 0 &&
			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
			    pmap_pte_dirty(pmap, l3))
				vm_page_dirty(PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3)));

			if (va == va_next)
				va = sva;
		}
		/* Flush the run that is still pending for this L3 table. */
		if (va != va_next && invalidate)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
}
3971
/*
 * Locking wrapper around pmap_mask_set_locked(): acquires the pmap lock,
 * applies the mask/set operation to [sva, eva), and releases the lock.
 */
static void
pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
    pt_entry_t nbits, bool invalidate)
{
	PMAP_LOCK(pmap);
	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
	PMAP_UNLOCK(pmap);
}
3980
3981 /*
3982 * Set the physical protection on the
3983 * specified range of this map as requested.
3984 */
3985 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)3986 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3987 {
3988 pt_entry_t mask, nbits;
3989
3990 PMAP_ASSERT_STAGE1(pmap);
3991 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3992 if (prot == VM_PROT_NONE) {
3993 pmap_remove(pmap, sva, eva);
3994 return;
3995 }
3996
3997 mask = nbits = 0;
3998 if ((prot & VM_PROT_WRITE) == 0) {
3999 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4000 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4001 }
4002 if ((prot & VM_PROT_EXECUTE) == 0) {
4003 mask |= ATTR_S1_XN;
4004 nbits |= ATTR_S1_XN;
4005 }
4006 if (mask == 0)
4007 return;
4008
4009 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4010 }
4011
/*
 * Sets ATTR_SW_NO_PROMOTE in the kernel pmap's page table entries covering
 * the range that starts at "sva" and is "size" bytes long.  Both the start
 * and the end of the range must be L3 page aligned.  No TLB invalidation is
 * requested, as only a software-defined bit is being changed.
 */
void
pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
{

	MPASS((sva & L3_OFFSET) == 0);
	MPASS(((sva + size) & L3_OFFSET) == 0);

	/* The mask and the new bits are the same: only set the one flag. */
	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
	    ATTR_SW_NO_PROMOTE, false);
}
4022
4023 /*
4024 * Inserts the specified page table page into the specified pmap's collection
4025 * of idle page table pages. Each of a pmap's page table pages is responsible
4026 * for mapping a distinct range of virtual addresses. The pmap's collection is
4027 * ordered by this virtual address range.
4028 *
4029 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4030 * "mpte"'s valid field will be set to 0.
4031 *
4032 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4033 * contain valid mappings with identical attributes except for ATTR_AF;
4034 * "mpte"'s valid field will be set to 1.
4035 *
4036 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4037 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4038 * field will be set to VM_PAGE_BITS_ALL.
4039 */
4040 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted,bool all_l3e_AF_set)4041 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4042 bool all_l3e_AF_set)
4043 {
4044
4045 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4046 KASSERT(promoted || !all_l3e_AF_set,
4047 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4048 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4049 return (vm_radix_insert(&pmap->pm_root, mpte));
4050 }
4051
/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 *
 * The collection is keyed by pmap_l2_pindex(va).  The pmap lock must be held.
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}
4065
/*
 * Performs a break-before-make update of a pmap entry.  This is needed when
 * either promoting or demoting pages to ensure the TLB doesn't get into an
 * inconsistent state.
 *
 * "pte" points at the entry to be rewritten and "newpte" is its new value.
 * The TLB is invalidated for the range [va, va + size) between clearing the
 * old entry's valid bit and storing the new entry.  The pmap lock must be
 * held, and "newpte" must not have ATTR_SW_NO_PROMOTE set; otherwise, this
 * function panics.
 */
static void
pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size)
{
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	if ((newpte & ATTR_SW_NO_PROMOTE) != 0)
		panic("%s: Updating non-promote pte", __func__);

	/*
	 * Ensure we don't get switched out with the page table in an
	 * inconsistent state. We also need to ensure no interrupts fire
	 * as they may make use of an address we are about to invalidate.
	 */
	intr = intr_disable();

	/*
	 * Clear the old mapping's valid bit, but leave the rest of the entry
	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
	 * lookup the physical address.
	 */
	pmap_clear_bits(pte, ATTR_DESCR_VALID);

	/*
	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
	 * be cached, so we invalidate intermediate entries as well as final
	 * entries.
	 */
	pmap_s1_invalidate_range(pmap, va, va + size, false);

	/* Create the new mapping */
	pmap_store(pte, newpte);
	dsb(ishst);

	/* The entry is consistent again; re-enable interrupts. */
	intr_restore(intr);
}
4109
4110 #if VM_NRESERVLEVEL > 0
4111 /*
4112 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4113 * replace the many pv entries for the 4KB page mappings by a single pv entry
4114 * for the 2MB page mapping.
4115 */
4116 static void
pmap_pv_promote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)4117 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4118 struct rwlock **lockp)
4119 {
4120 struct md_page *pvh;
4121 pv_entry_t pv;
4122 vm_offset_t va_last;
4123 vm_page_t m;
4124
4125 KASSERT((pa & L2_OFFSET) == 0,
4126 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4127 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4128
4129 /*
4130 * Transfer the first page's pv entry for this mapping to the 2mpage's
4131 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4132 * a transfer avoids the possibility that get_pv_entry() calls
4133 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4134 * mappings that is being promoted.
4135 */
4136 m = PHYS_TO_VM_PAGE(pa);
4137 va = va & ~L2_OFFSET;
4138 pv = pmap_pvh_remove(&m->md, pmap, va);
4139 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4140 pvh = page_to_pvh(m);
4141 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4142 pvh->pv_gen++;
4143 /* Free the remaining NPTEPG - 1 pv entries. */
4144 va_last = va + L2_SIZE - PAGE_SIZE;
4145 do {
4146 m++;
4147 va += PAGE_SIZE;
4148 pmap_pvh_free(&m->md, pmap, va);
4149 } while (va < va_last);
4150 }
4151
/*
 * Tries to promote the 512, contiguous 4KB page mappings that are within a
 * single level 2 table entry to a single 2MB page mapping.  For promotion
 * to occur, two conditions must be met: (1) the 4KB page mappings must map
 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 * identical characteristics.
 *
 * "l2" points at the L2 table entry whose L3 page table page is examined,
 * and "va" is an address within the corresponding 2MB range.  "mpte" is the
 * L3 page table page; if it is NULL, it is looked up from the L2 entry.
 * "lockp" is handed to pmap_pv_promote_l2() for managed mappings.
 *
 * Returns true if the mappings were promoted and false otherwise.  The pmap
 * lock must be held.
 */
static bool
pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
    struct rwlock **lockp)
{
	pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Currently, this function only supports promotion on stage 1 pmaps
	 * because it tests stage 1 specific fields and performs a break-
	 * before-make sequence that is incorrect for stage 2 pmaps.
	 */
	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
		return (false);

	/*
	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
	 * ineligible for promotion...
	 */
	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
	newl2 = pmap_load(firstl3);
	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
		return (false);
	/* ... is not the first physical page within an L2 block */
	if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
	    ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
		atomic_add_long(&pmap_l2_p_failures, 1);
		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (false);
	}

	/*
	 * Both here and in the below "for" loop, to allow for repromotion
	 * after MADV_FREE, conditionally write protect a clean L3E before
	 * possibly aborting the promotion due to other L3E attributes.  Why?
	 * Suppose that MADV_FREE is applied to a part of a superpage, the
	 * address range [S, E).  pmap_advise() will demote the superpage
	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
	 * imagine that the memory in [S, E) is recycled, but the last 4KB
	 * page in [S, E) is not the last to be rewritten, or simply accessed.
	 * In other words, there is still a 4KB page in [S, E), call it P,
	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
	 * Unless we write protect P before aborting the promotion, if and
	 * when P is finally rewritten, there won't be a page fault to trigger
	 * repromotion.
	 */
setl2:
	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
		/*
		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
		 */
		/* Retry with the reloaded value if the entry changed. */
		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
			goto setl2;
		newl2 &= ~ATTR_SW_DBM;
		CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
		    " in pmap %p", va & ~L2_OFFSET, pmap);
	}

	/*
	 * Examine each of the other L3Es in the specified PTP.  Abort if this
	 * L3E maps an unexpected 4KB physical page or does not have identical
	 * characteristics to the first L3E.  If ATTR_AF is not set in every
	 * PTE, then request that the PTP be refilled on demotion.
	 */
	all_l3e_AF = newl2 & ATTR_AF;
	/*
	 * "pa" is the physical address plus descriptor type expected in the
	 * last L3E; the loop walks backward, stepping it down by one page
	 * per entry.
	 */
	pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
	    + L2_SIZE - PAGE_SIZE;
	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
		oldl3 = pmap_load(l3);
		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
			atomic_add_long(&pmap_l2_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (false);
		}
setl3:
		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
			/*
			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
			 * set, ATTR_SW_DBM can be cleared without a TLB
			 * invalidation.
			 */
			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
			    ~ATTR_SW_DBM))
				goto setl3;
			oldl3 &= ~ATTR_SW_DBM;
		}
		/* All attributes other than ATTR_AF must match the first L3E. */
		if ((oldl3 & (ATTR_MASK & ~ATTR_AF)) != (newl2 & (ATTR_MASK &
		    ~ATTR_AF))) {
			atomic_add_long(&pmap_l2_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return (false);
		}
		all_l3e_AF &= oldl3;
		pa -= PAGE_SIZE;
	}

	/*
	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
	 * mapping, so that promotions triggered by speculative mappings,
	 * such as pmap_enter_quick(), don't automatically mark the
	 * underlying pages as referenced.
	 */
	newl2 &= ~ATTR_AF | all_l3e_AF;

	/*
	 * Save the page table page in its current state until the L2
	 * mapping the superpage is demoted by pmap_demote_l2() or
	 * destroyed by pmap_remove_l3().
	 */
	if (mpte == NULL)
		mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_l2: page table page is out of range"));
	KASSERT(mpte->pindex == pmap_l2_pindex(va),
	    ("pmap_promote_l2: page table page's pindex is wrong"));
	if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
		atomic_add_long(&pmap_l2_p_failures, 1);
		CTR2(KTR_PMAP,
		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
		    pmap);
		return (false);
	}

	/* Managed mappings also need their pv entries consolidated. */
	if ((newl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);

	/* Turn the L3 page descriptor into an L2 block descriptor. */
	newl2 &= ~ATTR_DESCR_MASK;
	newl2 |= L2_BLOCK;

	pmap_update_entry(pmap, l2, newl2, va & ~L2_OFFSET, L2_SIZE);

	atomic_add_long(&pmap_l2_promotions, 1);
	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
	    pmap);
	return (true);
}
4304 #endif /* VM_NRESERVLEVEL > 0 */
4305
/*
 * Installs the block mapping "newpte" for "va" directly into the page table:
 * an L1 entry when "psind" is 2 and an L2 entry when "psind" is 1.  The
 * physical address in "newpte" must be aligned to pagesizes[psind].
 *
 * A missing page table page on the path to the entry is allocated with
 * _pmap_alloc_l3().  If that allocation fails, KERN_RESOURCE_SHORTAGE is
 * returned when PMAP_ENTER_NOSLEEP is set in "flags"; otherwise, the pmap
 * lock is dropped, the thread waits in vm_wait(), and the lookup restarts.
 *
 * An existing valid entry may only be replaced by a block mapping of the same
 * physical address.  The pmap's resident and wired counts are adjusted to
 * reflect the change.  Returns KERN_SUCCESS once the entry has been stored.
 * The pmap lock must be held.
 */
static int
pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
    int psind)
{
	pd_entry_t *l0p, *l1p, *l2p, origpte;
	vm_page_t mp;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(psind > 0 && psind < MAXPAGESIZES,
	    ("psind %d unexpected", psind));
	KASSERT((PTE_TO_PHYS(newpte) & (pagesizes[psind] - 1)) == 0,
	    ("unaligned phys address %#lx newpte %#lx psind %d",
	    PTE_TO_PHYS(newpte), newpte, psind));

restart:
	if (psind == 2) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;

		l0p = pmap_l0(pmap, va);
		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
			/* No L1 table yet: allocate the page that holds it. */
			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
			if (mp == NULL) {
				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
					return (KERN_RESOURCE_SHORTAGE);
				PMAP_UNLOCK(pmap);
				vm_wait(NULL);
				PMAP_LOCK(pmap);
				goto restart;
			}
			l1p = pmap_l0_to_l1(l0p, va);
			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
			origpte = pmap_load(l1p);
		} else {
			l1p = pmap_l0_to_l1(l0p, va);
			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
			origpte = pmap_load(l1p);
			/*
			 * A new entry is being added to an existing L1 table
			 * page, so take a reference on that page.
			 */
			if ((origpte & ATTR_DESCR_VALID) == 0) {
				mp = PHYS_TO_VM_PAGE(
				    PTE_TO_PHYS(pmap_load(l0p)));
				mp->ref_count++;
			}
		}
		KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
		    (origpte & ATTR_DESCR_VALID) == 0,
		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
		    va, origpte, newpte));
		pmap_store(l1p, newpte);
	} else /* (psind == 1) */ {
		l2p = pmap_l2(pmap, va);
		if (l2p == NULL) {
			/* No L2 table yet: allocate the page that holds it. */
			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
			if (mp == NULL) {
				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
					return (KERN_RESOURCE_SHORTAGE);
				PMAP_UNLOCK(pmap);
				vm_wait(NULL);
				PMAP_LOCK(pmap);
				goto restart;
			}
			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
			l2p = &l2p[pmap_l2_index(va)];
			origpte = pmap_load(l2p);
		} else {
			l1p = pmap_l1(pmap, va);
			origpte = pmap_load(l2p);
			/*
			 * A new entry is being added to an existing L2 table
			 * page, so take a reference on that page.
			 */
			if ((origpte & ATTR_DESCR_VALID) == 0) {
				mp = PHYS_TO_VM_PAGE(
				    PTE_TO_PHYS(pmap_load(l1p)));
				mp->ref_count++;
			}
		}
		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
		    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
		    va, origpte, newpte));
		pmap_store(l2p, newpte);
	}
	dsb(ishst);

	/* Account for the mapping in units of base pages. */
	if ((origpte & ATTR_DESCR_VALID) == 0)
		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
	else if ((newpte & ATTR_SW_WIRED) == 0 &&
	    (origpte & ATTR_SW_WIRED) != 0)
		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;

	return (KERN_SUCCESS);
}
4397
/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 *
 *	"flags" carries the PMAP_ENTER_* options and is also tested against
 *	VM_PROT_WRITE to decide whether a writeable managed mapping is
 *	initially entered read-only.  "psind" selects the mapping size:
 *	with PMAP_ENTER_LARGEPAGE the work is delegated to
 *	pmap_enter_largepage(), and with psind == 1 to pmap_enter_l2().
 *
 *	Returns KERN_SUCCESS when a 4KB mapping was entered,
 *	KERN_RESOURCE_SHORTAGE when PMAP_ENTER_NOSLEEP was specified and a
 *	page table page could not be allocated, or the value returned by
 *	pmap_enter_largepage() or pmap_enter_l2() on those paths.
 */
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    u_int flags, int8_t psind)
{
	struct rwlock *lock;
	pd_entry_t *pde;
	pt_entry_t new_l3, orig_l3;
	pt_entry_t *l2, *l3;
	pv_entry_t pv;
	vm_paddr_t opa, pa;
	vm_page_t mpte, om;
	boolean_t nosleep;
	int lvl, rv;

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));

	/* Build the new L3 entry before taking any locks. */
	va = trunc_page(va);
	if ((m->oflags & VPO_UNMANAGED) == 0)
		VM_PAGE_OBJECT_BUSY_ASSERT(m);
	pa = VM_PAGE_TO_PHYS(m);
	new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE);
	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
	new_l3 |= pmap_pte_prot(pmap, prot);

	if ((flags & PMAP_ENTER_WIRED) != 0)
		new_l3 |= ATTR_SW_WIRED;
	if (pmap->pm_stage == PM_STAGE1) {
		if (!ADDR_IS_KERNEL(va))
			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
		else
			new_l3 |= ATTR_S1_UXN;
		if (pmap != kernel_pmap)
			new_l3 |= ATTR_S1_nG;
	} else {
		/*
		 * Clear the access flag on executable mappings, this will be
		 * set later when the page is accessed. The fault handler is
		 * required to invalidate the I-cache.
		 *
		 * TODO: Switch to the valid flag to allow hardware management
		 * of the access flag. Much of the pmap code assumes the
		 * valid flag is set and fails to destroy the old page tables
		 * correctly if it is clear.
		 */
		if (prot & VM_PROT_EXECUTE)
			new_l3 &= ~ATTR_AF;
	}
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		new_l3 |= ATTR_SW_MANAGED;
		if ((prot & VM_PROT_WRITE) != 0) {
			new_l3 |= ATTR_SW_DBM;
			/*
			 * If "flags" does not include VM_PROT_WRITE, enter
			 * the mapping read-only for now.
			 */
			if ((flags & VM_PROT_WRITE) == 0) {
				if (pmap->pm_stage == PM_STAGE1)
					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
				else
					new_l3 &=
					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
		}
	}

	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);

	lock = NULL;
	PMAP_LOCK(pmap);
	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
		    ("managed largepage va %#lx flags %#x", va, flags));
		new_l3 &= ~L3_PAGE;
		if (psind == 2) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			new_l3 |= L1_BLOCK;
		} else /* (psind == 1) */
			new_l3 |= L2_BLOCK;
		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
		goto out;
	}
	if (psind == 1) {
		/* Assert the required virtual and physical alignment. */
		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
		    flags, m, &lock);
		goto out;
	}
	mpte = NULL;

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
retry:
	pde = pmap_pde(pmap, va, &lvl);
	if (pde != NULL && lvl == 2) {
		l3 = pmap_l2_to_l3(pde, va);
		/*
		 * For a user address, take a reference on the L3 page table
		 * page unless one is already held from an earlier pass.
		 */
		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
			mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(pde)));
			mpte->ref_count++;
		}
		goto havel3;
	} else if (pde != NULL && lvl == 1) {
		/* An existing 2MB block mapping must be demoted first. */
		l2 = pmap_l1_to_l2(pde, va);
		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
			l3 = &l3[pmap_l3_index(va)];
			if (!ADDR_IS_KERNEL(va)) {
				mpte = PHYS_TO_VM_PAGE(
				    PTE_TO_PHYS(pmap_load(l2)));
				mpte->ref_count++;
			}
			goto havel3;
		}
		/* We need to allocate an L3 table. */
	}
	if (!ADDR_IS_KERNEL(va)) {
		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;

		/*
		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
		 * to handle the possibility that a superpage mapping for "va"
		 * was created while we slept.
		 */
		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
		    nosleep ? NULL : &lock);
		if (mpte == NULL && nosleep) {
			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
			rv = KERN_RESOURCE_SHORTAGE;
			goto out;
		}
		goto retry;
	} else
		panic("pmap_enter: missing L3 table for kernel va %#lx", va);

havel3:
	orig_l3 = pmap_load(l3);
	opa = PTE_TO_PHYS(orig_l3);
	pv = NULL;

	/*
	 * Is the specified virtual address already mapped?
	 */
	if (pmap_l3_valid(orig_l3)) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if ((flags & PMAP_ENTER_WIRED) != 0 &&
		    (orig_l3 & ATTR_SW_WIRED) == 0)
			pmap->pm_stats.wired_count++;
		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
		    (orig_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count--;

		/*
		 * Remove the extra PT page reference.
		 */
		if (mpte != NULL) {
			mpte->ref_count--;
			KASSERT(mpte->ref_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			     " va: 0x%lx", va));
		}

		/*
		 * Has the physical page changed?
		 */
		if (opa == pa) {
			/*
			 * No, might be a protection or wiring change.
			 */
			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
			    (new_l3 & ATTR_SW_DBM) != 0)
				vm_page_aflag_set(m, PGA_WRITEABLE);
			goto validate;
		}

		/*
		 * The physical page has changed.  Temporarily invalidate
		 * the mapping.
		 */
		orig_l3 = pmap_load_clear(l3);
		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
		    ("pmap_enter: unexpected pa update for %#lx", va));
		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
			om = PHYS_TO_VM_PAGE(opa);

			/*
			 * The pmap lock is sufficient to synchronize with
			 * concurrent calls to pmap_page_test_mappings() and
			 * pmap_ts_referenced().
			 */
			if (pmap_pte_dirty(pmap, orig_l3))
				vm_page_dirty(om);
			if ((orig_l3 & ATTR_AF) != 0) {
				pmap_invalidate_page(pmap, va, true);
				vm_page_aflag_set(om, PGA_REFERENCED);
			}
			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
			pv = pmap_pvh_remove(&om->md, pmap, va);
			/*
			 * The old page's pv entry is reused below for a
			 * managed new page; otherwise, it is freed here.
			 */
			if ((m->oflags & VPO_UNMANAGED) != 0)
				free_pv_entry(pmap, pv);
			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
			    TAILQ_EMPTY(&om->md.pv_list) &&
			    ((om->flags & PG_FICTITIOUS) != 0 ||
			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
		} else {
			KASSERT((orig_l3 & ATTR_AF) != 0,
			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
			pmap_invalidate_page(pmap, va, true);
		}
		/* From here on, treat this as the creation of a new mapping. */
		orig_l3 = 0;
	} else {
		/*
		 * Increment the counters.
		 */
		if ((new_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count++;
		pmap_resident_count_inc(pmap, 1);
	}
	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		if (pv == NULL) {
			pv = get_pv_entry(pmap, &lock);
			pv->pv_va = va;
		}
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		if ((new_l3 & ATTR_SW_DBM) != 0)
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}

validate:
	if (pmap->pm_stage == PM_STAGE1) {
		/*
		 * Sync icache if exec permission and attribute
		 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
		 * is stored and made valid for hardware table walk. If done
		 * later, then others can access this page before the caches
		 * are properly synced. Don't do it for kernel memory which is
		 * mapped with exec permission even if the memory isn't going
		 * to hold executable code. The only time when icache sync is
		 * needed is after a kernel module is loaded and the relocation
		 * info is processed. And it's done in elf_cpu_load_file().
		 */
		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
			PMAP_ASSERT_STAGE1(pmap);
			cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
		}
	} else {
		cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
	}

	/*
	 * Update the L3 entry
	 */
	if (pmap_l3_valid(orig_l3)) {
		KASSERT(opa == pa, ("pmap_enter: invalid update"));
		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
			/* same PA, different attributes */
			orig_l3 = pmap_load_store(l3, new_l3);
			pmap_invalidate_page(pmap, va, true);
			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
			    pmap_pte_dirty(pmap, orig_l3))
				vm_page_dirty(m);
		} else {
			/*
			 * orig_l3 == new_l3
			 * This can happen if multiple threads simultaneously
			 * access a not yet mapped page. This is bad for
			 * performance since it can cause a full
			 * demotion-NOP-promotion cycle.
			 * Other possible reasons are:
			 * - VM and pmap memory layout have diverged
			 * - a TLB flush is missing somewhere and the CPU
			 *   doesn't see the actual mapping.
			 */
			CTR4(KTR_PMAP, "%s: already mapped page - "
			    "pmap %p va 0x%#lx pte 0x%lx",
			    __func__, pmap, va, new_l3);
		}
	} else {
		/* New mapping */
		pmap_store(l3, new_l3);
		dsb(ishst);
	}

#if VM_NRESERVLEVEL > 0
	/*
	 * If both the page table page and the reservation are fully
	 * populated, then attempt promotion.
	 */
	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
	    (m->flags & PG_FICTITIOUS) == 0 &&
	    vm_reserv_level_iffullpop(m) == 0)
		(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
#endif

	rv = KERN_SUCCESS;
out:
	/* Release the pv list lock, if one was acquired, then the pmap lock. */
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	return (rv);
}
4723
4724 /*
4725 * Tries to create a read- and/or execute-only 2MB page mapping. Returns
4726 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
4727 * value. See pmap_enter_l2() for the possible error values when "no sleep",
4728 * "no replace", and "no reclaim" are specified.
4729 */
4730 static int
pmap_enter_2mpage(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)4731 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4732 struct rwlock **lockp)
4733 {
4734 pd_entry_t new_l2;
4735
4736 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4737 PMAP_ASSERT_STAGE1(pmap);
4738 KASSERT(ADDR_IS_CANONICAL(va),
4739 ("%s: Address not in canonical form: %lx", __func__, va));
4740
4741 new_l2 = (pd_entry_t)(PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | ATTR_DEFAULT |
4742 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
4743 L2_BLOCK);
4744 if ((m->oflags & VPO_UNMANAGED) == 0) {
4745 new_l2 |= ATTR_SW_MANAGED;
4746 new_l2 &= ~ATTR_AF;
4747 }
4748 if ((prot & VM_PROT_EXECUTE) == 0 ||
4749 m->md.pv_memattr == VM_MEMATTR_DEVICE)
4750 new_l2 |= ATTR_S1_XN;
4751 if (!ADDR_IS_KERNEL(va))
4752 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4753 else
4754 new_l2 |= ATTR_S1_UXN;
4755 if (pmap != kernel_pmap)
4756 new_l2 |= ATTR_S1_nG;
4757 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
4758 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
4759 }
4760
4761 /*
4762 * Returns true if every page table entry in the specified page table is
4763 * zero.
4764 */
4765 static bool
pmap_every_pte_zero(vm_paddr_t pa)4766 pmap_every_pte_zero(vm_paddr_t pa)
4767 {
4768 pt_entry_t *pt_end, *pte;
4769
4770 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
4771 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
4772 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
4773 if (*pte != 0)
4774 return (false);
4775 }
4776 return (true);
4777 }
4778
4779 /*
4780 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
4781 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
4782 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
4783 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
4784 * within the 2MB virtual address range starting at the specified virtual
4785 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
4786 * 2MB page mapping already exists at the specified virtual address. Returns
4787 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
4788 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
4789 * and a PV entry allocation failed.
4790 */
4791 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)4792 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
4793 vm_page_t m, struct rwlock **lockp)
4794 {
4795 struct spglist free;
4796 pd_entry_t *l2, old_l2;
4797 vm_page_t l2pg, mt;
4798 vm_page_t uwptpg;
4799
4800 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4801 KASSERT(ADDR_IS_CANONICAL(va),
4802 ("%s: Address not in canonical form: %lx", __func__, va));
4803
4804 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
4805 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
4806 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
4807 va, pmap);
4808 return (KERN_RESOURCE_SHORTAGE);
4809 }
4810
4811 /*
4812 * If there are existing mappings, either abort or remove them.
4813 */
4814 if ((old_l2 = pmap_load(l2)) != 0) {
4815 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
4816 ("pmap_enter_l2: l2pg's ref count is too low"));
4817 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
4818 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
4819 if (l2pg != NULL)
4820 l2pg->ref_count--;
4821 CTR2(KTR_PMAP,
4822 "pmap_enter_l2: no space for va %#lx"
4823 " in pmap %p", va, pmap);
4824 return (KERN_NO_SPACE);
4825 } else if (!ADDR_IS_KERNEL(va) ||
4826 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
4827 if (l2pg != NULL)
4828 l2pg->ref_count--;
4829 CTR2(KTR_PMAP,
4830 "pmap_enter_l2: failure for va %#lx"
4831 " in pmap %p", va, pmap);
4832 return (KERN_FAILURE);
4833 }
4834 }
4835 SLIST_INIT(&free);
4836 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
4837 (void)pmap_remove_l2(pmap, l2, va,
4838 pmap_load(pmap_l1(pmap, va)), &free, lockp);
4839 else
4840 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
4841 &free, lockp);
4842 if (!ADDR_IS_KERNEL(va)) {
4843 vm_page_free_pages_toq(&free, true);
4844 KASSERT(pmap_load(l2) == 0,
4845 ("pmap_enter_l2: non-zero L2 entry %p", l2));
4846 } else {
4847 KASSERT(SLIST_EMPTY(&free),
4848 ("pmap_enter_l2: freed kernel page table page"));
4849
4850 /*
4851 * Both pmap_remove_l2() and pmap_remove_l3_range()
4852 * will leave the kernel page table page zero filled.
4853 * Nonetheless, the TLB could have an intermediate
4854 * entry for the kernel page table page, so request
4855 * an invalidation at all levels after clearing
4856 * the L2_TABLE entry.
4857 */
4858 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
4859 if (pmap_insert_pt_page(pmap, mt, false, false))
4860 panic("pmap_enter_l2: trie insert failed");
4861 pmap_clear(l2);
4862 pmap_s1_invalidate_page(pmap, va, false);
4863 }
4864 }
4865
4866 /*
4867 * Allocate leaf ptpage for wired userspace pages.
4868 */
4869 uwptpg = NULL;
4870 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
4871 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
4872 if (uwptpg == NULL) {
4873 return (KERN_RESOURCE_SHORTAGE);
4874 }
4875 uwptpg->pindex = pmap_l2_pindex(va);
4876 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
4877 vm_page_unwire_noq(uwptpg);
4878 vm_page_free(uwptpg);
4879 return (KERN_RESOURCE_SHORTAGE);
4880 }
4881 pmap_resident_count_inc(pmap, 1);
4882 uwptpg->ref_count = NL3PG;
4883 }
4884 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
4885 /*
4886 * Abort this mapping if its PV entry could not be created.
4887 */
4888 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
4889 if (l2pg != NULL)
4890 pmap_abort_ptp(pmap, va, l2pg);
4891 if (uwptpg != NULL) {
4892 mt = pmap_remove_pt_page(pmap, va);
4893 KASSERT(mt == uwptpg,
4894 ("removed pt page %p, expected %p", mt,
4895 uwptpg));
4896 pmap_resident_count_dec(pmap, 1);
4897 uwptpg->ref_count = 1;
4898 vm_page_unwire_noq(uwptpg);
4899 vm_page_free(uwptpg);
4900 }
4901 CTR2(KTR_PMAP,
4902 "pmap_enter_l2: failure for va %#lx in pmap %p",
4903 va, pmap);
4904 return (KERN_RESOURCE_SHORTAGE);
4905 }
4906 if ((new_l2 & ATTR_SW_DBM) != 0)
4907 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4908 vm_page_aflag_set(mt, PGA_WRITEABLE);
4909 }
4910
4911 /*
4912 * Increment counters.
4913 */
4914 if ((new_l2 & ATTR_SW_WIRED) != 0)
4915 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
4916 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
4917
4918 /*
4919 * Conditionally sync the icache. See pmap_enter() for details.
4920 */
4921 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
4922 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
4923 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
4924 cpu_icache_sync_range(PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
4925 L2_SIZE);
4926 }
4927
4928 /*
4929 * Map the superpage.
4930 */
4931 pmap_store(l2, new_l2);
4932 dsb(ishst);
4933
4934 atomic_add_long(&pmap_l2_mappings, 1);
4935 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
4936 va, pmap);
4937
4938 return (KERN_SUCCESS);
4939 }
4940
4941 /*
4942 * Maps a sequence of resident pages belonging to the same object.
4943 * The sequence begins with the given page m_start. This page is
4944 * mapped at the given virtual address start. Each subsequent page is
4945 * mapped at a virtual address that is offset from start by the same
4946 * amount as the page is offset from m_start within the object. The
4947 * last page in the sequence is the page with the largest offset from
4948 * m_start that can be mapped at a virtual address less than the given
4949 * virtual address end. Not every virtual page between start and end
4950 * is mapped; only those for which a resident page exists with the
4951 * corresponding offset from m_start are mapped.
4952 */
4953 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)4954 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4955 vm_page_t m_start, vm_prot_t prot)
4956 {
4957 struct rwlock *lock;
4958 vm_offset_t va;
4959 vm_page_t m, mpte;
4960 vm_pindex_t diff, psize;
4961 int rv;
4962
4963 VM_OBJECT_ASSERT_LOCKED(m_start->object);
4964
4965 psize = atop(end - start);
4966 mpte = NULL;
4967 m = m_start;
4968 lock = NULL;
4969 PMAP_LOCK(pmap);
4970 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4971 va = start + ptoa(diff);
4972 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
4973 m->psind == 1 && pmap_ps_enabled(pmap) &&
4974 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
4975 KERN_SUCCESS || rv == KERN_NO_SPACE))
4976 m = &m[L2_SIZE / PAGE_SIZE - 1];
4977 else
4978 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
4979 &lock);
4980 m = TAILQ_NEXT(m, listq);
4981 }
4982 if (lock != NULL)
4983 rw_wunlock(lock);
4984 PMAP_UNLOCK(pmap);
4985 }
4986
4987 /*
4988 * this code makes some *MAJOR* assumptions:
4989 * 1. Current pmap & pmap exists.
4990 * 2. Not wired.
4991 * 3. Read access.
4992 * 4. No page table pages.
4993 * but is *MUCH* faster than pmap_enter...
4994 */
4995
4996 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)4997 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4998 {
4999 struct rwlock *lock;
5000
5001 lock = NULL;
5002 PMAP_LOCK(pmap);
5003 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5004 if (lock != NULL)
5005 rw_wunlock(lock);
5006 PMAP_UNLOCK(pmap);
5007 }
5008
5009 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)5010 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5011 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5012 {
5013 pd_entry_t *pde;
5014 pt_entry_t *l1, *l2, *l3, l3_val;
5015 vm_paddr_t pa;
5016 int lvl;
5017
5018 KASSERT(!VA_IS_CLEANMAP(va) ||
5019 (m->oflags & VPO_UNMANAGED) != 0,
5020 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5021 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5022 PMAP_ASSERT_STAGE1(pmap);
5023 KASSERT(ADDR_IS_CANONICAL(va),
5024 ("%s: Address not in canonical form: %lx", __func__, va));
5025 l2 = NULL;
5026
5027 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
5028 /*
5029 * In the case that a page table page is not
5030 * resident, we are creating it here.
5031 */
5032 if (!ADDR_IS_KERNEL(va)) {
5033 vm_pindex_t l2pindex;
5034
5035 /*
5036 * Calculate pagetable page index
5037 */
5038 l2pindex = pmap_l2_pindex(va);
5039 if (mpte && (mpte->pindex == l2pindex)) {
5040 mpte->ref_count++;
5041 } else {
5042 /*
5043 * If the page table page is mapped, we just increment
5044 * the hold count, and activate it. Otherwise, we
5045 * attempt to allocate a page table page, passing NULL
5046 * instead of the PV list lock pointer because we don't
5047 * intend to sleep. If this attempt fails, we don't
5048 * retry. Instead, we give up.
5049 */
5050 l1 = pmap_l1(pmap, va);
5051 if (l1 != NULL && pmap_load(l1) != 0) {
5052 if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
5053 L1_BLOCK)
5054 return (NULL);
5055 l2 = pmap_l1_to_l2(l1, va);
5056 if (pmap_load(l2) != 0) {
5057 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
5058 L2_BLOCK)
5059 return (NULL);
5060 mpte = PHYS_TO_VM_PAGE(
5061 PTE_TO_PHYS(pmap_load(l2)));
5062 mpte->ref_count++;
5063 } else {
5064 mpte = _pmap_alloc_l3(pmap, l2pindex,
5065 NULL);
5066 if (mpte == NULL)
5067 return (mpte);
5068 }
5069 } else {
5070 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
5071 if (mpte == NULL)
5072 return (mpte);
5073 }
5074 }
5075 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5076 l3 = &l3[pmap_l3_index(va)];
5077 } else {
5078 mpte = NULL;
5079 pde = pmap_pde(kernel_pmap, va, &lvl);
5080 KASSERT(pde != NULL,
5081 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
5082 va));
5083 KASSERT(lvl == 2,
5084 ("pmap_enter_quick_locked: Invalid level %d", lvl));
5085 l3 = pmap_l2_to_l3(pde, va);
5086 }
5087
5088 /*
5089 * Abort if a mapping already exists.
5090 */
5091 if (pmap_load(l3) != 0) {
5092 if (mpte != NULL)
5093 mpte->ref_count--;
5094 return (NULL);
5095 }
5096
5097 /*
5098 * Enter on the PV list if part of our managed memory.
5099 */
5100 if ((m->oflags & VPO_UNMANAGED) == 0 &&
5101 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5102 if (mpte != NULL)
5103 pmap_abort_ptp(pmap, va, mpte);
5104 return (NULL);
5105 }
5106
5107 /*
5108 * Increment counters
5109 */
5110 pmap_resident_count_inc(pmap, 1);
5111
5112 pa = VM_PAGE_TO_PHYS(m);
5113 l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
5114 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
5115 if ((prot & VM_PROT_EXECUTE) == 0 ||
5116 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5117 l3_val |= ATTR_S1_XN;
5118 if (!ADDR_IS_KERNEL(va))
5119 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5120 else
5121 l3_val |= ATTR_S1_UXN;
5122 if (pmap != kernel_pmap)
5123 l3_val |= ATTR_S1_nG;
5124
5125 /*
5126 * Now validate mapping with RO protection
5127 */
5128 if ((m->oflags & VPO_UNMANAGED) == 0) {
5129 l3_val |= ATTR_SW_MANAGED;
5130 l3_val &= ~ATTR_AF;
5131 }
5132
5133 /* Sync icache before the mapping is stored to PTE */
5134 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5135 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5136 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
5137
5138 pmap_store(l3, l3_val);
5139 dsb(ishst);
5140
5141 #if VM_NRESERVLEVEL > 0
5142 /*
5143 * If both the PTP and the reservation are fully populated, then
5144 * attempt promotion.
5145 */
5146 if ((mpte == NULL || mpte->ref_count == NL3PG) &&
5147 (m->flags & PG_FICTITIOUS) == 0 &&
5148 vm_reserv_level_iffullpop(m) == 0) {
5149 if (l2 == NULL)
5150 l2 = pmap_pde(pmap, va, &lvl);
5151
5152 /*
5153 * If promotion succeeds, then the next call to this function
5154 * should not be given the unmapped PTP as a hint.
5155 */
5156 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
5157 mpte = NULL;
5158 }
5159 #endif
5160
5161 return (mpte);
5162 }
5163
5164 /*
5165 * This code maps large physical mmap regions into the
5166 * processor address space. Note that some shortcuts
5167 * are taken, but the code works.
5168 */
5169 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)5170 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
5171 vm_pindex_t pindex, vm_size_t size)
5172 {
5173
5174 VM_OBJECT_ASSERT_WLOCKED(object);
5175 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
5176 ("pmap_object_init_pt: non-device object"));
5177 }
5178
5179 /*
5180 * Clear the wired attribute from the mappings for the specified range of
5181 * addresses in the given pmap. Every valid mapping within that range
5182 * must have the wired attribute set. In contrast, invalid mappings
5183 * cannot have the wired attribute set, so they are ignored.
5184 *
5185 * The wired attribute of the page table entry is not a hardware feature,
5186 * so there is no need to invalidate any TLB entries.
5187 */
5188 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)5189 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5190 {
5191 vm_offset_t va_next;
5192 pd_entry_t *l0, *l1, *l2;
5193 pt_entry_t *l3;
5194
5195 PMAP_LOCK(pmap);
5196 for (; sva < eva; sva = va_next) {
5197 l0 = pmap_l0(pmap, sva);
5198 if (pmap_load(l0) == 0) {
5199 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
5200 if (va_next < sva)
5201 va_next = eva;
5202 continue;
5203 }
5204
5205 l1 = pmap_l0_to_l1(l0, sva);
5206 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
5207 if (va_next < sva)
5208 va_next = eva;
5209 if (pmap_load(l1) == 0)
5210 continue;
5211
5212 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5213 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5214 KASSERT(va_next <= eva,
5215 ("partial update of non-transparent 1G page "
5216 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
5217 pmap_load(l1), sva, eva, va_next));
5218 MPASS(pmap != kernel_pmap);
5219 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
5220 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
5221 pmap_clear_bits(l1, ATTR_SW_WIRED);
5222 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
5223 continue;
5224 }
5225
5226 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
5227 if (va_next < sva)
5228 va_next = eva;
5229
5230 l2 = pmap_l1_to_l2(l1, sva);
5231 if (pmap_load(l2) == 0)
5232 continue;
5233
5234 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
5235 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
5236 panic("pmap_unwire: l2 %#jx is missing "
5237 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
5238
5239 /*
5240 * Are we unwiring the entire large page? If not,
5241 * demote the mapping and fall through.
5242 */
5243 if (sva + L2_SIZE == va_next && eva >= va_next) {
5244 pmap_clear_bits(l2, ATTR_SW_WIRED);
5245 pmap->pm_stats.wired_count -= L2_SIZE /
5246 PAGE_SIZE;
5247 continue;
5248 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
5249 panic("pmap_unwire: demotion failed");
5250 }
5251 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
5252 ("pmap_unwire: Invalid l2 entry after demotion"));
5253
5254 if (va_next > eva)
5255 va_next = eva;
5256 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
5257 sva += L3_SIZE) {
5258 if (pmap_load(l3) == 0)
5259 continue;
5260 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
5261 panic("pmap_unwire: l3 %#jx is missing "
5262 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
5263
5264 /*
5265 * ATTR_SW_WIRED must be cleared atomically. Although
5266 * the pmap lock synchronizes access to ATTR_SW_WIRED,
5267 * the System MMU may write to the entry concurrently.
5268 */
5269 pmap_clear_bits(l3, ATTR_SW_WIRED);
5270 pmap->pm_stats.wired_count--;
5271 }
5272 }
5273 PMAP_UNLOCK(pmap);
5274 }
5275
5276 /*
5277 * Copy the range specified by src_addr/len
5278 * from the source map to the range dst_addr/len
5279 * in the destination map.
5280 *
5281 * This routine is only advisory and need not do anything.
5282 *
5283 * Because the executable mappings created by this routine are copied,
5284 * it should not have to flush the instruction cache.
5285 */
5286 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)5287 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
5288 vm_offset_t src_addr)
5289 {
5290 struct rwlock *lock;
5291 pd_entry_t *l0, *l1, *l2, srcptepaddr;
5292 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
5293 vm_offset_t addr, end_addr, va_next;
5294 vm_page_t dst_m, dstmpte, srcmpte;
5295
5296 PMAP_ASSERT_STAGE1(dst_pmap);
5297 PMAP_ASSERT_STAGE1(src_pmap);
5298
5299 if (dst_addr != src_addr)
5300 return;
5301 end_addr = src_addr + len;
5302 lock = NULL;
5303 if (dst_pmap < src_pmap) {
5304 PMAP_LOCK(dst_pmap);
5305 PMAP_LOCK(src_pmap);
5306 } else {
5307 PMAP_LOCK(src_pmap);
5308 PMAP_LOCK(dst_pmap);
5309 }
5310 for (addr = src_addr; addr < end_addr; addr = va_next) {
5311 l0 = pmap_l0(src_pmap, addr);
5312 if (pmap_load(l0) == 0) {
5313 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
5314 if (va_next < addr)
5315 va_next = end_addr;
5316 continue;
5317 }
5318
5319 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
5320 if (va_next < addr)
5321 va_next = end_addr;
5322 l1 = pmap_l0_to_l1(l0, addr);
5323 if (pmap_load(l1) == 0)
5324 continue;
5325 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5326 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5327 KASSERT(va_next <= end_addr,
5328 ("partial update of non-transparent 1G page "
5329 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
5330 pmap_load(l1), addr, end_addr, va_next));
5331 srcptepaddr = pmap_load(l1);
5332 l1 = pmap_l1(dst_pmap, addr);
5333 if (l1 == NULL) {
5334 if (_pmap_alloc_l3(dst_pmap,
5335 pmap_l0_pindex(addr), NULL) == NULL)
5336 break;
5337 l1 = pmap_l1(dst_pmap, addr);
5338 } else {
5339 l0 = pmap_l0(dst_pmap, addr);
5340 dst_m = PHYS_TO_VM_PAGE(
5341 PTE_TO_PHYS(pmap_load(l0)));
5342 dst_m->ref_count++;
5343 }
5344 KASSERT(pmap_load(l1) == 0,
5345 ("1G mapping present in dst pmap "
5346 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
5347 pmap_load(l1), addr, end_addr, va_next));
5348 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
5349 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
5350 continue;
5351 }
5352
5353 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
5354 if (va_next < addr)
5355 va_next = end_addr;
5356 l2 = pmap_l1_to_l2(l1, addr);
5357 srcptepaddr = pmap_load(l2);
5358 if (srcptepaddr == 0)
5359 continue;
5360 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
5361 /*
5362 * We can only virtual copy whole superpages.
5363 */
5364 if ((addr & L2_OFFSET) != 0 ||
5365 addr + L2_SIZE > end_addr)
5366 continue;
5367 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
5368 if (l2 == NULL)
5369 break;
5370 if (pmap_load(l2) == 0 &&
5371 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
5372 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
5373 PMAP_ENTER_NORECLAIM, &lock))) {
5374 /*
5375 * We leave the dirty bit unchanged because
5376 * managed read/write superpage mappings are
5377 * required to be dirty. However, managed
5378 * superpage mappings are not required to
5379 * have their accessed bit set, so we clear
5380 * it because we don't know if this mapping
5381 * will be used.
5382 */
5383 srcptepaddr &= ~ATTR_SW_WIRED;
5384 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
5385 srcptepaddr &= ~ATTR_AF;
5386 pmap_store(l2, srcptepaddr);
5387 pmap_resident_count_inc(dst_pmap, L2_SIZE /
5388 PAGE_SIZE);
5389 atomic_add_long(&pmap_l2_mappings, 1);
5390 } else
5391 pmap_abort_ptp(dst_pmap, addr, dst_m);
5392 continue;
5393 }
5394 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
5395 ("pmap_copy: invalid L2 entry"));
5396 srcmpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(srcptepaddr));
5397 KASSERT(srcmpte->ref_count > 0,
5398 ("pmap_copy: source page table page is unused"));
5399 if (va_next > end_addr)
5400 va_next = end_addr;
5401 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
5402 src_pte = &src_pte[pmap_l3_index(addr)];
5403 dstmpte = NULL;
5404 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
5405 ptetemp = pmap_load(src_pte);
5406
5407 /*
5408 * We only virtual copy managed pages.
5409 */
5410 if ((ptetemp & ATTR_SW_MANAGED) == 0)
5411 continue;
5412
5413 if (dstmpte != NULL) {
5414 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
5415 ("dstmpte pindex/addr mismatch"));
5416 dstmpte->ref_count++;
5417 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
5418 NULL)) == NULL)
5419 goto out;
5420 dst_pte = (pt_entry_t *)
5421 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5422 dst_pte = &dst_pte[pmap_l3_index(addr)];
5423 if (pmap_load(dst_pte) == 0 &&
5424 pmap_try_insert_pv_entry(dst_pmap, addr,
5425 PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptetemp)), &lock)) {
5426 /*
5427 * Clear the wired, modified, and accessed
5428 * (referenced) bits during the copy.
5429 */
5430 mask = ATTR_AF | ATTR_SW_WIRED;
5431 nbits = 0;
5432 if ((ptetemp & ATTR_SW_DBM) != 0)
5433 nbits |= ATTR_S1_AP_RW_BIT;
5434 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
5435 pmap_resident_count_inc(dst_pmap, 1);
5436 } else {
5437 pmap_abort_ptp(dst_pmap, addr, dstmpte);
5438 goto out;
5439 }
5440 /* Have we copied all of the valid mappings? */
5441 if (dstmpte->ref_count >= srcmpte->ref_count)
5442 break;
5443 }
5444 }
5445 out:
5446 /*
5447 * XXX This barrier may not be needed because the destination pmap is
5448 * not active.
5449 */
5450 dsb(ishst);
5451
5452 if (lock != NULL)
5453 rw_wunlock(lock);
5454 PMAP_UNLOCK(src_pmap);
5455 PMAP_UNLOCK(dst_pmap);
5456 }
5457
5458 /*
5459 * pmap_zero_page zeros the specified hardware page by mapping
5460 * the page into KVM and using bzero to clear its contents.
5461 */
5462 void
pmap_zero_page(vm_page_t m)5463 pmap_zero_page(vm_page_t m)
5464 {
5465 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5466
5467 pagezero((void *)va);
5468 }
5469
5470 /*
5471 * pmap_zero_page_area zeros the specified hardware page by mapping
5472 * the page into KVM and using bzero to clear its contents.
5473 *
5474 * off and size may not cover an area beyond a single hardware page.
5475 */
5476 void
pmap_zero_page_area(vm_page_t m,int off,int size)5477 pmap_zero_page_area(vm_page_t m, int off, int size)
5478 {
5479 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5480
5481 if (off == 0 && size == PAGE_SIZE)
5482 pagezero((void *)va);
5483 else
5484 bzero((char *)va + off, size);
5485 }
5486
5487 /*
5488 * pmap_copy_page copies the specified (machine independent)
5489 * page by mapping the page into virtual memory and using
5490 * bcopy to copy the page, one machine dependent page at a
5491 * time.
5492 */
5493 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)5494 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5495 {
5496 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5497 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5498
5499 pagecopy((void *)src, (void *)dst);
5500 }
5501
/*
 * Non-zero by default.
 * NOTE(review): presumably read by machine-independent buffer code to
 * learn whether unmapped buffers may be used on this platform -- confirm
 * against its consumers.
 */
int unmapped_buf_allowed = 1;
5503
5504 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)5505 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5506 vm_offset_t b_offset, int xfersize)
5507 {
5508 void *a_cp, *b_cp;
5509 vm_page_t m_a, m_b;
5510 vm_paddr_t p_a, p_b;
5511 vm_offset_t a_pg_offset, b_pg_offset;
5512 int cnt;
5513
5514 while (xfersize > 0) {
5515 a_pg_offset = a_offset & PAGE_MASK;
5516 m_a = ma[a_offset >> PAGE_SHIFT];
5517 p_a = m_a->phys_addr;
5518 b_pg_offset = b_offset & PAGE_MASK;
5519 m_b = mb[b_offset >> PAGE_SHIFT];
5520 p_b = m_b->phys_addr;
5521 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5522 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5523 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
5524 panic("!DMAP a %lx", p_a);
5525 } else {
5526 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5527 }
5528 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
5529 panic("!DMAP b %lx", p_b);
5530 } else {
5531 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5532 }
5533 bcopy(a_cp, b_cp, cnt);
5534 a_offset += cnt;
5535 b_offset += cnt;
5536 xfersize -= cnt;
5537 }
5538 }
5539
5540 vm_offset_t
pmap_quick_enter_page(vm_page_t m)5541 pmap_quick_enter_page(vm_page_t m)
5542 {
5543
5544 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
5545 }
5546
5547 void
pmap_quick_remove_page(vm_offset_t addr)5548 pmap_quick_remove_page(vm_offset_t addr)
5549 {
5550 }
5551
5552 /*
5553 * Returns true if the pmap's pv is one of the first
5554 * 16 pvs linked to from this page. This count may
5555 * be changed upwards or downwards in the future; it
5556 * is only necessary that true be returned for a small
5557 * subset of pmaps for proper page aging.
5558 */
5559 boolean_t
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)5560 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5561 {
5562 struct md_page *pvh;
5563 struct rwlock *lock;
5564 pv_entry_t pv;
5565 int loops = 0;
5566 boolean_t rv;
5567
5568 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5569 ("pmap_page_exists_quick: page %p is not managed", m));
5570 rv = FALSE;
5571 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5572 rw_rlock(lock);
5573 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5574 if (PV_PMAP(pv) == pmap) {
5575 rv = TRUE;
5576 break;
5577 }
5578 loops++;
5579 if (loops >= 16)
5580 break;
5581 }
5582 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5583 pvh = page_to_pvh(m);
5584 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5585 if (PV_PMAP(pv) == pmap) {
5586 rv = TRUE;
5587 break;
5588 }
5589 loops++;
5590 if (loops >= 16)
5591 break;
5592 }
5593 }
5594 rw_runlock(lock);
5595 return (rv);
5596 }
5597
5598 /*
5599 * pmap_page_wired_mappings:
5600 *
5601 * Return the number of managed mappings to the given physical page
5602 * that are wired.
5603 */
5604 int
pmap_page_wired_mappings(vm_page_t m)5605 pmap_page_wired_mappings(vm_page_t m)
5606 {
5607 struct rwlock *lock;
5608 struct md_page *pvh;
5609 pmap_t pmap;
5610 pt_entry_t *pte;
5611 pv_entry_t pv;
5612 int count, md_gen, pvh_gen;
5613
5614 if ((m->oflags & VPO_UNMANAGED) != 0)
5615 return (0);
5616 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5617 rw_rlock(lock);
5618 restart:
5619 count = 0;
5620 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5621 pmap = PV_PMAP(pv);
5622 if (!PMAP_TRYLOCK(pmap)) {
5623 md_gen = m->md.pv_gen;
5624 rw_runlock(lock);
5625 PMAP_LOCK(pmap);
5626 rw_rlock(lock);
5627 if (md_gen != m->md.pv_gen) {
5628 PMAP_UNLOCK(pmap);
5629 goto restart;
5630 }
5631 }
5632 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5633 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
5634 count++;
5635 PMAP_UNLOCK(pmap);
5636 }
5637 if ((m->flags & PG_FICTITIOUS) == 0) {
5638 pvh = page_to_pvh(m);
5639 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5640 pmap = PV_PMAP(pv);
5641 if (!PMAP_TRYLOCK(pmap)) {
5642 md_gen = m->md.pv_gen;
5643 pvh_gen = pvh->pv_gen;
5644 rw_runlock(lock);
5645 PMAP_LOCK(pmap);
5646 rw_rlock(lock);
5647 if (md_gen != m->md.pv_gen ||
5648 pvh_gen != pvh->pv_gen) {
5649 PMAP_UNLOCK(pmap);
5650 goto restart;
5651 }
5652 }
5653 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
5654 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
5655 count++;
5656 PMAP_UNLOCK(pmap);
5657 }
5658 }
5659 rw_runlock(lock);
5660 return (count);
5661 }
5662
5663 /*
5664 * Returns true if the given page is mapped individually or as part of
5665 * a 2mpage. Otherwise, returns false.
5666 */
5667 bool
pmap_page_is_mapped(vm_page_t m)5668 pmap_page_is_mapped(vm_page_t m)
5669 {
5670 struct rwlock *lock;
5671 bool rv;
5672
5673 if ((m->oflags & VPO_UNMANAGED) != 0)
5674 return (false);
5675 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5676 rw_rlock(lock);
5677 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5678 ((m->flags & PG_FICTITIOUS) == 0 &&
5679 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
5680 rw_runlock(lock);
5681 return (rv);
5682 }
5683
5684 /*
5685 * Destroy all managed, non-wired mappings in the given user-space
5686 * pmap. This pmap cannot be active on any processor besides the
5687 * caller.
5688 *
5689 * This function cannot be applied to the kernel pmap. Moreover, it
5690 * is not intended for general use. It is only to be used during
5691 * process termination. Consequently, it can be implemented in ways
5692 * that make it faster than pmap_remove(). First, it can more quickly
5693 * destroy mappings by iterating over the pmap's collection of PV
5694 * entries, rather than searching the page table. Second, it doesn't
5695 * have to test and clear the page table entries atomically, because
5696 * no processor is currently accessing the user address space. In
5697 * particular, a page table entry's dirty bit won't change state once
5698 * this function starts.
5699 */
5700 void
pmap_remove_pages(pmap_t pmap)5701 pmap_remove_pages(pmap_t pmap)
5702 {
5703 pd_entry_t *pde;
5704 pt_entry_t *pte, tpte;
5705 struct spglist free;
5706 struct pv_chunklist free_chunks[PMAP_MEMDOM];
5707 vm_page_t m, ml3, mt;
5708 pv_entry_t pv;
5709 struct md_page *pvh;
5710 struct pv_chunk *pc, *npc;
5711 struct rwlock *lock;
5712 int64_t bit;
5713 uint64_t inuse, bitmask;
5714 int allfree, field, i, idx, lvl;
5715 int freed __pvused;
5716 vm_paddr_t pa;
5717
5718 lock = NULL;
5719
5720 for (i = 0; i < PMAP_MEMDOM; i++)
5721 TAILQ_INIT(&free_chunks[i]);
5722 SLIST_INIT(&free);
5723 PMAP_LOCK(pmap);
5724 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5725 allfree = 1;
5726 freed = 0;
5727 for (field = 0; field < _NPCM; field++) {
5728 inuse = ~pc->pc_map[field] & pc_freemask[field];
5729 while (inuse != 0) {
5730 bit = ffsl(inuse) - 1;
5731 bitmask = 1UL << bit;
5732 idx = field * 64 + bit;
5733 pv = &pc->pc_pventry[idx];
5734 inuse &= ~bitmask;
5735
5736 pde = pmap_pde(pmap, pv->pv_va, &lvl);
5737 KASSERT(pde != NULL,
5738 ("Attempting to remove an unmapped page"));
5739
5740 switch(lvl) {
5741 case 1:
5742 pte = pmap_l1_to_l2(pde, pv->pv_va);
5743 tpte = pmap_load(pte);
5744 KASSERT((tpte & ATTR_DESCR_MASK) ==
5745 L2_BLOCK,
5746 ("Attempting to remove an invalid "
5747 "block: %lx", tpte));
5748 break;
5749 case 2:
5750 pte = pmap_l2_to_l3(pde, pv->pv_va);
5751 tpte = pmap_load(pte);
5752 KASSERT((tpte & ATTR_DESCR_MASK) ==
5753 L3_PAGE,
5754 ("Attempting to remove an invalid "
5755 "page: %lx", tpte));
5756 break;
5757 default:
5758 panic(
5759 "Invalid page directory level: %d",
5760 lvl);
5761 }
5762
5763 /*
5764 * We cannot remove wired pages from a process' mapping at this time
5765 */
5766 if (tpte & ATTR_SW_WIRED) {
5767 allfree = 0;
5768 continue;
5769 }
5770
5771 /* Mark free */
5772 pc->pc_map[field] |= bitmask;
5773
5774 /*
5775 * Because this pmap is not active on other
5776 * processors, the dirty bit cannot have
5777 * changed state since we last loaded pte.
5778 */
5779 pmap_clear(pte);
5780
5781 pa = PTE_TO_PHYS(tpte);
5782
5783 m = PHYS_TO_VM_PAGE(pa);
5784 KASSERT(m->phys_addr == pa,
5785 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5786 m, (uintmax_t)m->phys_addr,
5787 (uintmax_t)tpte));
5788
5789 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5790 m < &vm_page_array[vm_page_array_size],
5791 ("pmap_remove_pages: bad pte %#jx",
5792 (uintmax_t)tpte));
5793
5794 /*
5795 * Update the vm_page_t clean/reference bits.
5796 */
5797 if (pmap_pte_dirty(pmap, tpte)) {
5798 switch (lvl) {
5799 case 1:
5800 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5801 vm_page_dirty(mt);
5802 break;
5803 case 2:
5804 vm_page_dirty(m);
5805 break;
5806 }
5807 }
5808
5809 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5810
5811 switch (lvl) {
5812 case 1:
5813 pmap_resident_count_dec(pmap,
5814 L2_SIZE / PAGE_SIZE);
5815 pvh = page_to_pvh(m);
5816 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
5817 pvh->pv_gen++;
5818 if (TAILQ_EMPTY(&pvh->pv_list)) {
5819 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5820 if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5821 TAILQ_EMPTY(&mt->md.pv_list))
5822 vm_page_aflag_clear(mt, PGA_WRITEABLE);
5823 }
5824 ml3 = pmap_remove_pt_page(pmap,
5825 pv->pv_va);
5826 if (ml3 != NULL) {
5827 KASSERT(vm_page_any_valid(ml3),
5828 ("pmap_remove_pages: l3 page not promoted"));
5829 pmap_resident_count_dec(pmap,1);
5830 KASSERT(ml3->ref_count == NL3PG,
5831 ("pmap_remove_pages: l3 page ref count error"));
5832 ml3->ref_count = 0;
5833 pmap_add_delayed_free_list(ml3,
5834 &free, FALSE);
5835 }
5836 break;
5837 case 2:
5838 pmap_resident_count_dec(pmap, 1);
5839 TAILQ_REMOVE(&m->md.pv_list, pv,
5840 pv_next);
5841 m->md.pv_gen++;
5842 if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5843 TAILQ_EMPTY(&m->md.pv_list) &&
5844 (m->flags & PG_FICTITIOUS) == 0) {
5845 pvh = page_to_pvh(m);
5846 if (TAILQ_EMPTY(&pvh->pv_list))
5847 vm_page_aflag_clear(m,
5848 PGA_WRITEABLE);
5849 }
5850 break;
5851 }
5852 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
5853 &free);
5854 freed++;
5855 }
5856 }
5857 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5858 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5859 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5860 if (allfree) {
5861 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5862 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
5863 pc_list);
5864 }
5865 }
5866 if (lock != NULL)
5867 rw_wunlock(lock);
5868 pmap_invalidate_all(pmap);
5869 free_pv_chunk_batch(free_chunks);
5870 PMAP_UNLOCK(pmap);
5871 vm_page_free_pages_toq(&free, true);
5872 }
5873
5874 /*
5875 * This is used to check if a page has been accessed or modified.
5876 */
5877 static boolean_t
pmap_page_test_mappings(vm_page_t m,boolean_t accessed,boolean_t modified)5878 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5879 {
5880 struct rwlock *lock;
5881 pv_entry_t pv;
5882 struct md_page *pvh;
5883 pt_entry_t *pte, mask, value;
5884 pmap_t pmap;
5885 int md_gen, pvh_gen;
5886 boolean_t rv;
5887
5888 rv = FALSE;
5889 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5890 rw_rlock(lock);
5891 restart:
5892 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5893 pmap = PV_PMAP(pv);
5894 PMAP_ASSERT_STAGE1(pmap);
5895 if (!PMAP_TRYLOCK(pmap)) {
5896 md_gen = m->md.pv_gen;
5897 rw_runlock(lock);
5898 PMAP_LOCK(pmap);
5899 rw_rlock(lock);
5900 if (md_gen != m->md.pv_gen) {
5901 PMAP_UNLOCK(pmap);
5902 goto restart;
5903 }
5904 }
5905 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5906 mask = 0;
5907 value = 0;
5908 if (modified) {
5909 mask |= ATTR_S1_AP_RW_BIT;
5910 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5911 }
5912 if (accessed) {
5913 mask |= ATTR_AF | ATTR_DESCR_MASK;
5914 value |= ATTR_AF | L3_PAGE;
5915 }
5916 rv = (pmap_load(pte) & mask) == value;
5917 PMAP_UNLOCK(pmap);
5918 if (rv)
5919 goto out;
5920 }
5921 if ((m->flags & PG_FICTITIOUS) == 0) {
5922 pvh = page_to_pvh(m);
5923 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5924 pmap = PV_PMAP(pv);
5925 PMAP_ASSERT_STAGE1(pmap);
5926 if (!PMAP_TRYLOCK(pmap)) {
5927 md_gen = m->md.pv_gen;
5928 pvh_gen = pvh->pv_gen;
5929 rw_runlock(lock);
5930 PMAP_LOCK(pmap);
5931 rw_rlock(lock);
5932 if (md_gen != m->md.pv_gen ||
5933 pvh_gen != pvh->pv_gen) {
5934 PMAP_UNLOCK(pmap);
5935 goto restart;
5936 }
5937 }
5938 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
5939 mask = 0;
5940 value = 0;
5941 if (modified) {
5942 mask |= ATTR_S1_AP_RW_BIT;
5943 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5944 }
5945 if (accessed) {
5946 mask |= ATTR_AF | ATTR_DESCR_MASK;
5947 value |= ATTR_AF | L2_BLOCK;
5948 }
5949 rv = (pmap_load(pte) & mask) == value;
5950 PMAP_UNLOCK(pmap);
5951 if (rv)
5952 goto out;
5953 }
5954 }
5955 out:
5956 rw_runlock(lock);
5957 return (rv);
5958 }
5959
5960 /*
5961 * pmap_is_modified:
5962 *
5963 * Return whether or not the specified physical page was modified
5964 * in any physical maps.
5965 */
5966 boolean_t
pmap_is_modified(vm_page_t m)5967 pmap_is_modified(vm_page_t m)
5968 {
5969
5970 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5971 ("pmap_is_modified: page %p is not managed", m));
5972
5973 /*
5974 * If the page is not busied then this check is racy.
5975 */
5976 if (!pmap_page_is_write_mapped(m))
5977 return (FALSE);
5978 return (pmap_page_test_mappings(m, FALSE, TRUE));
5979 }
5980
5981 /*
5982 * pmap_is_prefaultable:
5983 *
5984 * Return whether or not the specified virtual address is eligible
5985 * for prefault.
5986 */
5987 boolean_t
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)5988 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5989 {
5990 pd_entry_t *pde;
5991 pt_entry_t *pte;
5992 boolean_t rv;
5993 int lvl;
5994
5995 /*
5996 * Return TRUE if and only if the L3 entry for the specified virtual
5997 * address is allocated but invalid.
5998 */
5999 rv = FALSE;
6000 PMAP_LOCK(pmap);
6001 pde = pmap_pde(pmap, addr, &lvl);
6002 if (pde != NULL && lvl == 2) {
6003 pte = pmap_l2_to_l3(pde, addr);
6004 rv = pmap_load(pte) == 0;
6005 }
6006 PMAP_UNLOCK(pmap);
6007 return (rv);
6008 }
6009
6010 /*
6011 * pmap_is_referenced:
6012 *
6013 * Return whether or not the specified physical page was referenced
6014 * in any physical maps.
6015 */
6016 boolean_t
pmap_is_referenced(vm_page_t m)6017 pmap_is_referenced(vm_page_t m)
6018 {
6019
6020 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6021 ("pmap_is_referenced: page %p is not managed", m));
6022 return (pmap_page_test_mappings(m, TRUE, FALSE));
6023 }
6024
/*
 * Clear the write and modified bits in each of the given page's mappings.
 *
 * The page must be managed and busied (both are asserted).  If the page
 * has no writable mapping the function returns immediately.  Otherwise
 * every level 2 mapping on the page's superpage PV list that carries
 * ATTR_SW_DBM is demoted, and then every level 3 mapping that carries
 * ATTR_SW_DBM has that bit removed and its access permission bits
 * rewritten.  The page is marked dirty when the old level 3 entry matched
 * the writable pattern.  PGA_WRITEABLE is cleared on the page at the end.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	struct rwlock *lock;
	pv_entry_t next_pv, pv;
	pt_entry_t oldpte, *pte, set, clear, mask, val;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	/* Fictitious pages use the shared dummy header instead of a real one. */
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	rw_wlock(lock);
retry:
	/* Pass 1: level 2 mappings recorded on the superpage PV list. */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * The pmap lock could not be taken without blocking.
			 * Remember the list generation, drop the PV list
			 * lock, block on the pmap lock, and retake the PV
			 * list lock.  If the list changed in the meantime,
			 * start over from the top.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		/* Only mappings tagged ATTR_SW_DBM are demoted. */
		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
		/* The demotion receives &lock; verify it did not swap locks. */
		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
		    ("inconsistent pv lock %p %p for page %p",
		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
		PMAP_UNLOCK(pmap);
	}
	/* Pass 2: level 3 mappings recorded on the page's own PV list. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Same lock hand-off as above, checking both lists. */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen ||
			    md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		oldpte = pmap_load(pte);
		if ((oldpte & ATTR_SW_DBM) != 0) {
			/*
			 * Pick the bits to set and clear in the entry, and
			 * the mask/value pair that identifies an entry which
			 * was writable, according to the pmap's stage.
			 */
			if (pmap->pm_stage == PM_STAGE1) {
				set = ATTR_S1_AP_RW_BIT;
				clear = 0;
				mask = ATTR_S1_AP_RW_BIT;
				val = ATTR_S1_AP(ATTR_S1_AP_RW);
			} else {
				set = 0;
				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
			}
			clear |= ATTR_SW_DBM;
			/*
			 * Retry the compare-and-set until it succeeds; on
			 * failure "oldpte" is refreshed with the current
			 * entry, so the final "oldpte" is the value replaced.
			 */
			while (!atomic_fcmpset_64(pte, &oldpte,
			    (oldpte | set) & ~clear))
				cpu_spinwait();

			/* The replaced entry was writable: record the dirt. */
			if ((oldpte & mask) == val)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}
6113
/*
 *	pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	As an optimization, update the page's dirty field if a modified bit is
 *	found while counting reference bits.  This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	The return value is "cleared + not_cleared": reference bits that were
 *	cleared plus reference bits that were seen but deliberately left set.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, pvh_gen;
	struct spglist free;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	/*
	 * Nothing in this function adds a page to "free"; it is handed to
	 * vm_page_free_pages_toq() as initialized here.
	 */
	SLIST_INIT(&free);
	cleared = 0;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
retry:
	/* "cleared" survives a retry; "not_cleared" is recounted. */
	not_cleared = 0;
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	/*
	 * Walk the superpage PV list.  Each visited entry is rotated to the
	 * tail, so the walk always examines the current head and stops when
	 * the first entry seen ("pvf") comes around to the head again.
	 */
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the PV list lock to block on the pmap lock,
			 * then restart if the list changed while unlocked.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte_exists(pmap, va, 2, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((tpte & ATTR_AF) != 0) {
			pa = VM_PAGE_TO_PHYS(m);

			/*
			 * Since this reference bit is shared by 512 4KB pages,
			 * it should not be cleared every time it is tested.
			 * Apply a simple "hash" function on the physical page
			 * number, the virtual superpage number, and the pmap
			 * address to select one 4KB page out of the 512 on
			 * which testing the reference bit will result in
			 * clearing that reference bit.  This function is
			 * designed to avoid the selection of the same 4KB page
			 * for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, va, true);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		/* Enough reference bits found: skip the 4KB mappings too. */
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	/* Same rotating walk over the page's level 3 mappings. */
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Either list changing while unlocked forces a retry. */
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
		tpte = pmap_load(pte);
		if (pmap_pte_dirty(pmap, tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			/*
			 * A wired mapping keeps its reference bit; it is
			 * counted but not cleared.
			 */
			if ((tpte & ATTR_SW_WIRED) == 0) {
				pmap_clear_bits(pte, ATTR_AF);
				pmap_invalidate_page(pmap, pv->pv_va, true);
				cleared++;
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	vm_page_free_pages_toq(&free, true);
	return (cleared + not_cleared);
}
6265
/*
 *	Apply the given advice to the specified range of addresses within the
 *	given pmap.  Depending on the advice, clear the referenced and/or
 *	modified flags in each mapping and set the mapped page's dirty field.
 *
 *	Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice
 *	value returns without touching the pmap.  The pmap must be a stage 1
 *	pmap (asserted).  The range [sva, eva) is walked one level 2 entry at
 *	a time with the pmap lock held for the whole walk.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	struct rwlock *lock;
	vm_offset_t va, va_next;
	vm_page_t m;
	pd_entry_t *l0, *l1, *l2, oldl2;
	pt_entry_t *l3, oldl3;

	PMAP_ASSERT_STAGE1(pmap);

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		/*
		 * At each level, "va_next" is the start of the next region
		 * of that level's size.  If the addition wrapped around,
		 * clamp to "eva" so that the outer loop terminates.
		 */
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0)
			continue;
		/* Level 1 block mappings are skipped entirely. */
		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;
		l2 = pmap_l1_to_l2(l1, sva);
		oldl2 = pmap_load(l2);
		if (oldl2 == 0)
			continue;
		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
			/* Unmanaged 2MB mappings are left alone. */
			if ((oldl2 & ATTR_SW_MANAGED) == 0)
				continue;
			/*
			 * The demotion may hand back a held PV list lock
			 * through "lock"; release it on every path below.
			 */
			lock = NULL;
			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
				if (lock != NULL)
					rw_wunlock(lock);

				/*
				 * The 2MB page mapping was destroyed.
				 */
				continue;
			}

			/*
			 * Unless the page mappings are wired, remove the
			 * mapping to a single page so that a subsequent
			 * access may repromote.  Choosing the last page
			 * within the address range [sva, min(va_next, eva))
			 * generally results in more repromotions.  Since the
			 * underlying page table page is fully populated, this
			 * removal never frees a page table page.
			 */
			if ((oldl2 & ATTR_SW_WIRED) == 0) {
				va = eva;
				if (va > va_next)
					va = va_next;
				va -= PAGE_SIZE;
				KASSERT(va >= sva,
				    ("pmap_advise: no address gap"));
				l3 = pmap_l2_to_l3(l2, va);
				KASSERT(pmap_load(l3) != 0,
				    ("pmap_advise: invalid PTE"));
				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
				    NULL, &lock);
			}
			if (lock != NULL)
				rw_wunlock(lock);
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_advise: invalid L2 entry after demotion"));
		if (va_next > eva)
			va_next = eva;
		/*
		 * "va" is the start of a run of modified entries whose TLB
		 * invalidation is still pending.  "va == va_next" means no
		 * run is pending.
		 */
		va = va_next;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			oldl3 = pmap_load(l3);
			/* Skip entries that are not managed L3 pages. */
			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
			    (ATTR_SW_MANAGED | L3_PAGE))
				goto maybe_invlrng;
			else if (pmap_pte_dirty(pmap, oldl3)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl3));
					vm_page_dirty(m);
				}
				/*
				 * Clear ATTR_AF and set the read-only access
				 * permission in one atomic update, retrying
				 * until the compare-and-set succeeds.
				 */
				while (!atomic_fcmpset_long(l3, &oldl3,
				    (oldl3 & ~ATTR_AF) |
				    ATTR_S1_AP(ATTR_S1_AP_RO)))
					cpu_spinwait();
			} else if ((oldl3 & ATTR_AF) != 0)
				pmap_clear_bits(l3, ATTR_AF);
			else
				goto maybe_invlrng;
			/* This entry changed: open a pending run if none. */
			if (va == va_next)
				va = sva;
			continue;
maybe_invlrng:
			/* Unchanged entry: flush and close any pending run. */
			if (va != va_next) {
				pmap_s1_invalidate_range(pmap, va, sva, true);
				va = va_next;
			}
		}
		/* Flush a run that extends to the end of this L3 table walk. */
		if (va != va_next)
			pmap_s1_invalidate_range(pmap, va, sva, true);
	}
	PMAP_UNLOCK(pmap);
}
6395
/*
 *	Clear the modify bits on the specified physical page.
 *
 *	The page must be managed and busied (both are asserted).  If the page
 *	has no writable mapping the function returns immediately.  2MB
 *	mappings carrying ATTR_SW_DBM are demoted first; afterwards each 4KB
 *	mapping that has ATTR_SW_DBM set and ATTR_S1_AP_RW_BIT clear gets the
 *	read-only access permission set and its TLB entry invalidated.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *l2, oldl2;
	pt_entry_t *l3, oldl3;
	vm_offset_t va;
	int md_gen, pvh_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	vm_page_assert_busied(m);

	if (!pmap_page_is_write_mapped(m))
		return;
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
restart:
	/* Pass 1: 2MB mappings recorded on the superpage PV list. */
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the PV list lock to block on the pmap lock,
			 * then restart if the list changed while unlocked.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		va = pv->pv_va;
		l2 = pmap_l2(pmap, va);
		oldl2 = pmap_load(l2);
		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
		if ((oldl2 & ATTR_SW_DBM) != 0 &&
		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
		    (oldl2 & ATTR_SW_WIRED) == 0) {
			/*
			 * Write protect the mapping to a single page so that
			 * a subsequent write access may repromote.
			 */
			/* Advance "va" to the 4KB page within the 2MB run. */
			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
			l3 = pmap_l2_to_l3(l2, va);
			oldl3 = pmap_load(l3);
			/* Drop ATTR_SW_DBM and set read-only atomically. */
			while (!atomic_fcmpset_long(l3, &oldl3,
			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
				cpu_spinwait();
			vm_page_dirty(m);
			pmap_s1_invalidate_page(pmap, va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	/* Pass 2: 4KB mappings recorded on the page's own PV list. */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		PMAP_ASSERT_STAGE1(pmap);
		if (!PMAP_TRYLOCK(pmap)) {
			/* Either list changing while unlocked forces a restart. */
			md_gen = m->md.pv_gen;
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		l2 = pmap_l2(pmap, pv->pv_va);
		l3 = pmap_l2_to_l3(l2, pv->pv_va);
		oldl3 = pmap_load(l3);
		/*
		 * Act only on entries with ATTR_SW_DBM set and
		 * ATTR_S1_AP_RW_BIT clear; ATTR_SW_DBM itself is kept.
		 */
		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){
			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(lock);
}
6481
/*
 * Map the physical range [pa, pa + size) into the kernel's address space
 * and return a pointer to the byte that corresponds to "pa".
 *
 * Before vm_initialized is set, the mapping is built from whole L2 blocks
 * taken from the pmap_preinit_mapping slots; NULL is returned if "size" is
 * zero or preinit_map_va is zero, and the function panics if no run of
 * free slots is long enough.  Afterwards, the range is mapped into KVA
 * obtained from kva_alloc() with pmap_kenter(); failure to allocate KVA
 * is a panic.
 */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t va, offset;
	pd_entry_t old_l2e, *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, free_l2_count, start_idx;

	if (!vm_initialized) {
		/*
		 * No L3 ptables so map entire L2 blocks where start VA is:
		 * 	preinit_map_va + start_idx * L2_SIZE
		 * There may be duplicate mappings (multiple VA -> same PA) but
		 * ARM64 dcache is always PIPT so that's acceptable.
		 */
		if (size == 0)
			return (NULL);

		/* Calculate how many L2 blocks are needed for the mapping */
		l2_blocks = (roundup2(pa + size, L2_SIZE) -
		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;

		/* Position of "pa" inside its first L2 block. */
		offset = pa & L2_OFFSET;

		if (preinit_map_va == 0)
			return (NULL);

		/* Map 2MiB L2 blocks from reserved VA space */

		free_l2_count = 0;
		start_idx = -1;
		/* Find enough free contiguous VA space */
		/* A slot whose "pa" field is zero is treated as free. */
		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
			ppim = pmap_preinit_mapping + i;
			if (free_l2_count > 0 && ppim->pa != 0) {
				/* Not enough space here */
				free_l2_count = 0;
				start_idx = -1;
				continue;
			}

			if (ppim->pa == 0) {
				/* Free L2 block */
				if (start_idx == -1)
					start_idx = i;
				free_l2_count++;
				if (free_l2_count == l2_blocks)
					break;
			}
		}
		if (free_l2_count != l2_blocks)
			panic("%s: too many preinit mappings", __func__);

		va = preinit_map_va + (start_idx * L2_SIZE);
		/*
		 * Every slot of the run records the same pa, va and size:
		 * the values describing the mapping as a whole.
		 */
		for (i = start_idx; i < start_idx + l2_blocks; i++) {
			/* Mark entries as allocated */
			ppim = pmap_preinit_mapping + i;
			ppim->pa = pa;
			ppim->va = va + offset;
			ppim->size = size;
		}

		/* Map L2 blocks */
		pa = rounddown2(pa, L2_SIZE);
		/* Accumulates the OR of all entries that were overwritten. */
		old_l2e = 0;
		for (i = 0; i < l2_blocks; i++) {
			pde = pmap_pde(kernel_pmap, va, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
			    va));
			KASSERT(lvl == 1,
			    ("pmap_mapbios: Invalid level %d", lvl));

			/* Insert L2_BLOCK */
			l2 = pmap_l1_to_l2(pde, va);
			old_l2e |= pmap_load_store(l2,
			    PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN |
			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);

			va += L2_SIZE;
			pa += L2_SIZE;
		}
		/* A full invalidation is needed if any old entry was valid. */
		if ((old_l2e & ATTR_DESCR_VALID) != 0)
			pmap_s1_invalidate_all(kernel_pmap);
		else {
			/*
			 * Because the old entries were invalid and the new
			 * mappings are not executable, an isb is not required.
			 */
			dsb(ishst);
		}

		/* The loop above advanced "va"; reset it to the run's start. */
		va = preinit_map_va + (start_idx * L2_SIZE);

	} else {
		/* kva_alloc may be used to map the pages */
		offset = pa & PAGE_MASK;
		size = round_page(offset + size);

		va = kva_alloc(size);
		if (va == 0)
			panic("%s: Couldn't allocate KVA", __func__);

		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));

		/* L3 table is linked */
		va = trunc_page(va);
		pa = trunc_page(pa);
		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
	}

	return ((void *)(va + offset));
}
6597
/*
 * Undo a mapping created by pmap_mapbios().  "p" and "size" are compared
 * against the va and size recorded in the pmap_preinit_mapping slots; for
 * each matching slot the slot is cleared and the corresponding L2 entry
 * is removed.  If no slot matched and vm_initialized is set, the range is
 * treated as a kva_alloc() mapping: it is unmapped and its KVA is freed.
 */
void
pmap_unmapbios(void *p, vm_size_t size)
{
	struct pmap_preinit_mapping *ppim;
	vm_offset_t offset, va, va_trunc;
	pd_entry_t *pde;
	pt_entry_t *l2;
	int i, lvl, l2_blocks, block;
	bool preinit_map;

	va = (vm_offset_t)p;
	/* Number of L2 blocks spanned by [va, va + size). */
	l2_blocks =
	    (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));

	/* Remove preinit mapping */
	preinit_map = false;
	/* "block" counts the matching slots handled so far. */
	block = 0;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == va) {
			KASSERT(ppim->size == size,
			    ("pmap_unmapbios: size mismatch"));
			ppim->va = 0;
			ppim->pa = 0;
			ppim->size = 0;
			preinit_map = true;
			/* Address of the "block"-th L2 block of the mapping. */
			offset = block * L2_SIZE;
			va_trunc = rounddown2(va, L2_SIZE) + offset;

			/* Remove L2_BLOCK */
			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
			KASSERT(pde != NULL,
			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
			    va_trunc));
			l2 = pmap_l1_to_l2(pde, va_trunc);
			pmap_clear(l2);

			/* Stop once every block of the mapping is removed. */
			if (block == (l2_blocks - 1))
				break;
			block++;
		}
	}
	if (preinit_map) {
		pmap_s1_invalidate_all(kernel_pmap);
		return;
	}

	/* Unmap the pages reserved with kva_alloc. */
	if (vm_initialized) {
		offset = va & PAGE_MASK;
		size = round_page(offset + size);
		va = trunc_page(va);

		/* Unmap and invalidate the pages */
		pmap_kremove_device(va, size);

		kva_free(va, size);
	}
}
6658
6659 /*
6660 * Sets the memory attribute for the specified page.
6661 */
6662 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)6663 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6664 {
6665
6666 m->md.pv_memattr = ma;
6667
6668 /*
6669 * If "m" is a normal page, update its direct mapping. This update
6670 * can be relied upon to perform any cache operations that are
6671 * required for data coherence.
6672 */
6673 if ((m->flags & PG_FICTITIOUS) == 0 &&
6674 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6675 m->md.pv_memattr) != 0)
6676 panic("memory attribute change on the direct map failed");
6677 }
6678
6679 /*
6680 * Changes the specified virtual address range's memory type to that given by
6681 * the parameter "mode". The specified virtual address range must be
6682 * completely contained within either the direct map or the kernel map. If
6683 * the virtual address range is contained within the kernel map, then the
6684 * memory type for each of the corresponding ranges of the direct map is also
6685 * changed. (The corresponding ranges of the direct map are those ranges that
6686 * map the same physical pages as the specified virtual address range.) These
6687 * changes to the direct map are necessary because Intel describes the
6688 * behavior of their processors as "undefined" if two or more mappings to the
6689 * same physical page have different memory types.
6690 *
6691 * Returns zero if the change completed successfully, and either EINVAL or
6692 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
6693 * of the virtual address range was not mapped, and ENOMEM is returned if
6694 * there was insufficient memory available to complete the change. In the
6695 * latter case, the memory type may have been changed on some part of the
6696 * virtual address range or the direct map.
6697 */
6698 int
pmap_change_attr(vm_offset_t va,vm_size_t size,int mode)6699 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6700 {
6701 int error;
6702
6703 PMAP_LOCK(kernel_pmap);
6704 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
6705 PMAP_UNLOCK(kernel_pmap);
6706 return (error);
6707 }
6708
6709 /*
6710 * Changes the specified virtual address range's protections to those
6711 * specified by "prot". Like pmap_change_attr(), protections for aliases
6712 * in the direct map are updated as well. Protections on aliasing mappings may
6713 * be a subset of the requested protections; for example, mappings in the direct
6714 * map are never executable.
6715 */
6716 int
pmap_change_prot(vm_offset_t va,vm_size_t size,vm_prot_t prot)6717 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
6718 {
6719 int error;
6720
6721 /* Only supported within the kernel map. */
6722 if (va < VM_MIN_KERNEL_ADDRESS)
6723 return (EINVAL);
6724
6725 PMAP_LOCK(kernel_pmap);
6726 error = pmap_change_props_locked(va, size, prot, -1, false);
6727 PMAP_UNLOCK(kernel_pmap);
6728 return (error);
6729 }
6730
6731 static int
pmap_change_props_locked(vm_offset_t va,vm_size_t size,vm_prot_t prot,int mode,bool skip_unmapped)6732 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
6733 int mode, bool skip_unmapped)
6734 {
6735 vm_offset_t base, offset, tmpva;
6736 vm_size_t pte_size;
6737 vm_paddr_t pa;
6738 pt_entry_t pte, *ptep, *newpte;
6739 pt_entry_t bits, mask;
6740 int lvl, rv;
6741
6742 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6743 base = trunc_page(va);
6744 offset = va & PAGE_MASK;
6745 size = round_page(offset + size);
6746
6747 if (!VIRT_IN_DMAP(base) &&
6748 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
6749 return (EINVAL);
6750
6751 bits = 0;
6752 mask = 0;
6753 if (mode != -1) {
6754 bits = ATTR_S1_IDX(mode);
6755 mask = ATTR_S1_IDX_MASK;
6756 if (mode == VM_MEMATTR_DEVICE) {
6757 mask |= ATTR_S1_XN;
6758 bits |= ATTR_S1_XN;
6759 }
6760 }
6761 if (prot != VM_PROT_NONE) {
6762 /* Don't mark the DMAP as executable. It never is on arm64. */
6763 if (VIRT_IN_DMAP(base)) {
6764 prot &= ~VM_PROT_EXECUTE;
6765 /*
6766 * XXX Mark the DMAP as writable for now. We rely
6767 * on this in ddb & dtrace to insert breakpoint
6768 * instructions.
6769 */
6770 prot |= VM_PROT_WRITE;
6771 }
6772
6773 if ((prot & VM_PROT_WRITE) == 0) {
6774 bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
6775 }
6776 if ((prot & VM_PROT_EXECUTE) == 0) {
6777 bits |= ATTR_S1_PXN;
6778 }
6779 bits |= ATTR_S1_UXN;
6780 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
6781 }
6782
6783 for (tmpva = base; tmpva < base + size; ) {
6784 ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
6785 if (ptep == NULL && !skip_unmapped) {
6786 return (EINVAL);
6787 } else if ((ptep == NULL && skip_unmapped) ||
6788 (pmap_load(ptep) & mask) == bits) {
6789 /*
6790 * We already have the correct attribute or there
6791 * is no memory mapped at this address and we are
6792 * skipping unmapped memory.
6793 */
6794 switch (lvl) {
6795 default:
6796 panic("Invalid DMAP table level: %d\n", lvl);
6797 case 1:
6798 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
6799 break;
6800 case 2:
6801 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
6802 break;
6803 case 3:
6804 tmpva += PAGE_SIZE;
6805 break;
6806 }
6807 } else {
6808 /* We can't demote/promote this entry */
6809 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
6810
6811 /*
6812 * Split the entry to an level 3 table, then
6813 * set the new attribute.
6814 */
6815 switch (lvl) {
6816 default:
6817 panic("Invalid DMAP table level: %d\n", lvl);
6818 case 1:
6819 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6820 if ((tmpva & L1_OFFSET) == 0 &&
6821 (base + size - tmpva) >= L1_SIZE) {
6822 pte_size = L1_SIZE;
6823 break;
6824 }
6825 newpte = pmap_demote_l1(kernel_pmap, ptep,
6826 tmpva & ~L1_OFFSET);
6827 if (newpte == NULL)
6828 return (EINVAL);
6829 ptep = pmap_l1_to_l2(ptep, tmpva);
6830 /* FALLTHROUGH */
6831 case 2:
6832 if ((tmpva & L2_OFFSET) == 0 &&
6833 (base + size - tmpva) >= L2_SIZE) {
6834 pte_size = L2_SIZE;
6835 break;
6836 }
6837 newpte = pmap_demote_l2(kernel_pmap, ptep,
6838 tmpva);
6839 if (newpte == NULL)
6840 return (EINVAL);
6841 ptep = pmap_l2_to_l3(ptep, tmpva);
6842 /* FALLTHROUGH */
6843 case 3:
6844 pte_size = PAGE_SIZE;
6845 break;
6846 }
6847
6848 /* Update the entry */
6849 pte = pmap_load(ptep);
6850 pte &= ~mask;
6851 pte |= bits;
6852
6853 pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
6854 pte_size);
6855
6856 pa = PTE_TO_PHYS(pte);
6857 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
6858 /*
6859 * Keep the DMAP memory in sync.
6860 */
6861 rv = pmap_change_props_locked(
6862 PHYS_TO_DMAP(pa), pte_size,
6863 prot, mode, true);
6864 if (rv != 0)
6865 return (rv);
6866 }
6867
6868 /*
6869 * If moving to a non-cacheable entry flush
6870 * the cache.
6871 */
6872 if (mode == VM_MEMATTR_UNCACHEABLE)
6873 cpu_dcache_wbinv_range(tmpva, pte_size);
6874 tmpva += pte_size;
6875 }
6876 }
6877
6878 return (0);
6879 }
6880
/*
 * Create an L2 table to map all addresses within an L1 mapping.
 *
 * "l1" points to the L1 block entry to replace and "va" is the L1-aligned
 * virtual address it maps; the pmap lock must be held (asserted).  A new
 * page table page is allocated and filled with Ln_ENTRIES L2 entries that
 * cover the same physical range with the attributes of the old entry, and
 * the L1 entry is then replaced by a table entry pointing at the new page.
 *
 * Returns a pointer to the new L2 table, or NULL if the temporary KVA or
 * the page table page could not be allocated; in that case the L1 entry
 * is left as it was.
 */
static pt_entry_t *
pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
{
	pt_entry_t *l2, newl2, oldl1;
	vm_offset_t tmpl1;
	vm_paddr_t l2phys, phys;
	vm_page_t ml2;
	int i;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldl1 = pmap_load(l1);
	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
	    ("pmap_demote_l1: Demoting a non-block entry"));
	KASSERT((va & L1_OFFSET) == 0,
	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));

	/*
	 * If the L1 entry itself is addressed through the very range being
	 * demoted, reserve a page of KVA so the entry can be reached through
	 * a separate temporary mapping while it is rewritten.
	 */
	tmpl1 = 0;
	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
		tmpl1 = kva_alloc(PAGE_SIZE);
		if (tmpl1 == 0)
			return (NULL);
	}

	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
	    NULL) {
		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
		    " in pmap %p", va, pmap);
		l2 = NULL;
		goto fail;
	}

	/* The new table page is written through its direct map address. */
	l2phys = VM_PAGE_TO_PHYS(ml2);
	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);

	/* Address the range points at */
	phys = PTE_TO_PHYS(oldl1);
	/* The attributes from the old l1 entry to be copied */
	newl2 = oldl1 & ATTR_MASK;

	/* Create the new entries */
	for (i = 0; i < Ln_ENTRIES; i++) {
		l2[i] = newl2 | phys;
		phys += L2_SIZE;
	}
	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
	    ("Invalid l2 page (%lx != %lx)", l2[0],
	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));

	if (tmpl1 != 0) {
		/*
		 * Map the page holding the L1 entry at the temporary KVA
		 * and redirect "l1" to the entry's address in that mapping.
		 */
		pmap_kenter(tmpl1, PAGE_SIZE,
		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
		    VM_MEMATTR_WRITE_BACK);
		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
	}

	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);

	/*
	 * Reached on success as well as on allocation failure: release the
	 * temporary KVA, if any, and return "l2" (NULL on failure).
	 */
fail:
	if (tmpl1 != 0) {
		pmap_kremove(tmpl1);
		kva_free(tmpl1, PAGE_SIZE);
	}

	return (l2);
}
6954
6955 static void
pmap_fill_l3(pt_entry_t * firstl3,pt_entry_t newl3)6956 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
6957 {
6958 pt_entry_t *l3;
6959
6960 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
6961 *l3 = newl3;
6962 newl3 += L3_SIZE;
6963 }
6964 }
6965
6966 static void
pmap_demote_l2_check(pt_entry_t * firstl3p __unused,pt_entry_t newl3e __unused)6967 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
6968 {
6969 #ifdef INVARIANTS
6970 #ifdef DIAGNOSTIC
6971 pt_entry_t *xl3p, *yl3p;
6972
6973 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
6974 xl3p++, newl3e += PAGE_SIZE) {
6975 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
6976 printf("pmap_demote_l2: xl3e %zd and newl3e map "
6977 "different pages: found %#lx, expected %#lx\n",
6978 xl3p - firstl3p, pmap_load(xl3p), newl3e);
6979 printf("page table dump\n");
6980 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
6981 yl3p++) {
6982 printf("%zd %#lx\n", yl3p - firstl3p,
6983 pmap_load(yl3p));
6984 }
6985 panic("firstpte");
6986 }
6987 }
6988 #else
6989 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
6990 ("pmap_demote_l2: firstl3 and newl3e map different physical"
6991 " addresses"));
6992 #endif
6993 #endif
6994 }
6995
6996 static void
pmap_demote_l2_abort(pmap_t pmap,vm_offset_t va,pt_entry_t * l2,struct rwlock ** lockp)6997 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
6998 struct rwlock **lockp)
6999 {
7000 struct spglist free;
7001
7002 SLIST_INIT(&free);
7003 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
7004 lockp);
7005 vm_page_free_pages_toq(&free, true);
7006 }
7007
7008 /*
7009 * Create an L3 table to map all addresses within an L2 mapping.
7010 */
7011 static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap,pt_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)7012 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
7013 struct rwlock **lockp)
7014 {
7015 pt_entry_t *l3, newl3, oldl2;
7016 vm_offset_t tmpl2;
7017 vm_paddr_t l3phys;
7018 vm_page_t ml3;
7019
7020 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7021 PMAP_ASSERT_STAGE1(pmap);
7022 KASSERT(ADDR_IS_CANONICAL(va),
7023 ("%s: Address not in canonical form: %lx", __func__, va));
7024
7025 l3 = NULL;
7026 oldl2 = pmap_load(l2);
7027 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
7028 ("pmap_demote_l2: Demoting a non-block entry"));
7029 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
7030 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
7031 va &= ~L2_OFFSET;
7032
7033 tmpl2 = 0;
7034 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
7035 tmpl2 = kva_alloc(PAGE_SIZE);
7036 if (tmpl2 == 0)
7037 return (NULL);
7038 }
7039
7040 /*
7041 * Invalidate the 2MB page mapping and return "failure" if the
7042 * mapping was never accessed.
7043 */
7044 if ((oldl2 & ATTR_AF) == 0) {
7045 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
7046 ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
7047 pmap_demote_l2_abort(pmap, va, l2, lockp);
7048 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
7049 va, pmap);
7050 goto fail;
7051 }
7052
7053 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
7054 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
7055 ("pmap_demote_l2: page table page for a wired mapping"
7056 " is missing"));
7057
7058 /*
7059 * If the page table page is missing and the mapping
7060 * is for a kernel address, the mapping must belong to
7061 * either the direct map or the early kernel memory.
7062 * Page table pages are preallocated for every other
7063 * part of the kernel address space, so the direct map
7064 * region and early kernel memory are the only parts of the
7065 * kernel address space that must be handled here.
7066 */
7067 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
7068 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
7069 ("pmap_demote_l2: No saved mpte for va %#lx", va));
7070
7071 /*
7072 * If the 2MB page mapping belongs to the direct map
7073 * region of the kernel's address space, then the page
7074 * allocation request specifies the highest possible
7075 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
7076 * priority is normal.
7077 */
7078 ml3 = vm_page_alloc_noobj(
7079 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
7080 VM_ALLOC_WIRED);
7081
7082 /*
7083 * If the allocation of the new page table page fails,
7084 * invalidate the 2MB page mapping and return "failure".
7085 */
7086 if (ml3 == NULL) {
7087 pmap_demote_l2_abort(pmap, va, l2, lockp);
7088 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
7089 " in pmap %p", va, pmap);
7090 goto fail;
7091 }
7092 ml3->pindex = pmap_l2_pindex(va);
7093
7094 if (!ADDR_IS_KERNEL(va)) {
7095 ml3->ref_count = NL3PG;
7096 pmap_resident_count_inc(pmap, 1);
7097 }
7098 }
7099 l3phys = VM_PAGE_TO_PHYS(ml3);
7100 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
7101 newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
7102 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
7103 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
7104 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
7105
7106 /*
7107 * If the PTP is not leftover from an earlier promotion or it does not
7108 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
7109 * have ATTR_AF set.
7110 *
7111 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
7112 * performs a dsb(). That dsb() ensures that the stores for filling
7113 * "l3" are visible before "l3" is added to the page table.
7114 */
7115 if (!vm_page_all_valid(ml3))
7116 pmap_fill_l3(l3, newl3);
7117
7118 pmap_demote_l2_check(l3, newl3);
7119
7120 /*
7121 * If the mapping has changed attributes, update the L3Es.
7122 */
7123 if ((pmap_load(l3) & (ATTR_MASK & ~ATTR_AF)) != (newl3 & (ATTR_MASK &
7124 ~ATTR_AF)))
7125 pmap_fill_l3(l3, newl3);
7126
7127 /*
7128 * Map the temporary page so we don't lose access to the l2 table.
7129 */
7130 if (tmpl2 != 0) {
7131 pmap_kenter(tmpl2, PAGE_SIZE,
7132 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
7133 VM_MEMATTR_WRITE_BACK);
7134 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
7135 }
7136
7137 /*
7138 * The spare PV entries must be reserved prior to demoting the
7139 * mapping, that is, prior to changing the PDE. Otherwise, the state
7140 * of the L2 and the PV lists will be inconsistent, which can result
7141 * in reclaim_pv_chunk() attempting to remove a PV entry from the
7142 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
7143 * PV entry for the 2MB page mapping that is being demoted.
7144 */
7145 if ((oldl2 & ATTR_SW_MANAGED) != 0)
7146 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
7147
7148 /*
7149 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
7150 * the 2MB page mapping.
7151 */
7152 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
7153
7154 /*
7155 * Demote the PV entry.
7156 */
7157 if ((oldl2 & ATTR_SW_MANAGED) != 0)
7158 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
7159
7160 atomic_add_long(&pmap_l2_demotions, 1);
7161 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
7162 " in pmap %p %lx", va, pmap, l3[0]);
7163
7164 fail:
7165 if (tmpl2 != 0) {
7166 pmap_kremove(tmpl2);
7167 kva_free(tmpl2, PAGE_SIZE);
7168 }
7169
7170 return (l3);
7171
7172 }
7173
7174 static pt_entry_t *
pmap_demote_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)7175 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
7176 {
7177 struct rwlock *lock;
7178 pt_entry_t *l3;
7179
7180 lock = NULL;
7181 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
7182 if (lock != NULL)
7183 rw_wunlock(lock);
7184 return (l3);
7185 }
7186
7187 /*
7188 * Perform the pmap work for mincore(2). If the page is not both referenced and
7189 * modified by this pmap, returns its physical address so that the caller can
7190 * find other mappings.
7191 */
7192 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)7193 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
7194 {
7195 pt_entry_t *pte, tpte;
7196 vm_paddr_t mask, pa;
7197 int lvl, val;
7198 bool managed;
7199
7200 PMAP_ASSERT_STAGE1(pmap);
7201 PMAP_LOCK(pmap);
7202 pte = pmap_pte(pmap, addr, &lvl);
7203 if (pte != NULL) {
7204 tpte = pmap_load(pte);
7205
7206 switch (lvl) {
7207 case 3:
7208 mask = L3_OFFSET;
7209 break;
7210 case 2:
7211 mask = L2_OFFSET;
7212 break;
7213 case 1:
7214 mask = L1_OFFSET;
7215 break;
7216 default:
7217 panic("pmap_mincore: invalid level %d", lvl);
7218 }
7219
7220 managed = (tpte & ATTR_SW_MANAGED) != 0;
7221 val = MINCORE_INCORE;
7222 if (lvl != 3)
7223 val |= MINCORE_PSIND(3 - lvl);
7224 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
7225 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
7226 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
7227 if ((tpte & ATTR_AF) == ATTR_AF)
7228 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
7229
7230 pa = PTE_TO_PHYS(tpte) | (addr & mask);
7231 } else {
7232 managed = false;
7233 val = 0;
7234 }
7235
7236 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
7237 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
7238 *pap = pa;
7239 }
7240 PMAP_UNLOCK(pmap);
7241 return (val);
7242 }
7243
7244 /*
7245 * Garbage collect every ASID that is neither active on a processor nor
7246 * reserved.
7247 */
7248 static void
pmap_reset_asid_set(pmap_t pmap)7249 pmap_reset_asid_set(pmap_t pmap)
7250 {
7251 pmap_t curpmap;
7252 int asid, cpuid, epoch;
7253 struct asid_set *set;
7254 enum pmap_stage stage;
7255
7256 set = pmap->pm_asid_set;
7257 stage = pmap->pm_stage;
7258
7259 set = pmap->pm_asid_set;
7260 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7261 mtx_assert(&set->asid_set_mutex, MA_OWNED);
7262
7263 /*
7264 * Ensure that the store to asid_epoch is globally visible before the
7265 * loads from pc_curpmap are performed.
7266 */
7267 epoch = set->asid_epoch + 1;
7268 if (epoch == INT_MAX)
7269 epoch = 0;
7270 set->asid_epoch = epoch;
7271 dsb(ishst);
7272 if (stage == PM_STAGE1) {
7273 __asm __volatile("tlbi vmalle1is");
7274 } else {
7275 KASSERT(pmap_clean_stage2_tlbi != NULL,
7276 ("%s: Unset stage 2 tlb invalidation callback\n",
7277 __func__));
7278 pmap_clean_stage2_tlbi();
7279 }
7280 dsb(ish);
7281 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
7282 set->asid_set_size - 1);
7283 CPU_FOREACH(cpuid) {
7284 if (cpuid == curcpu)
7285 continue;
7286 if (stage == PM_STAGE1) {
7287 curpmap = pcpu_find(cpuid)->pc_curpmap;
7288 PMAP_ASSERT_STAGE1(pmap);
7289 } else {
7290 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
7291 if (curpmap == NULL)
7292 continue;
7293 PMAP_ASSERT_STAGE2(pmap);
7294 }
7295 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
7296 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
7297 if (asid == -1)
7298 continue;
7299 bit_set(set->asid_set, asid);
7300 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
7301 }
7302 }
7303
7304 /*
7305 * Allocate a new ASID for the specified pmap.
7306 */
7307 static void
pmap_alloc_asid(pmap_t pmap)7308 pmap_alloc_asid(pmap_t pmap)
7309 {
7310 struct asid_set *set;
7311 int new_asid;
7312
7313 set = pmap->pm_asid_set;
7314 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7315
7316 mtx_lock_spin(&set->asid_set_mutex);
7317
7318 /*
7319 * While this processor was waiting to acquire the asid set mutex,
7320 * pmap_reset_asid_set() running on another processor might have
7321 * updated this pmap's cookie to the current epoch. In which case, we
7322 * don't need to allocate a new ASID.
7323 */
7324 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
7325 goto out;
7326
7327 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
7328 &new_asid);
7329 if (new_asid == -1) {
7330 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
7331 set->asid_next, &new_asid);
7332 if (new_asid == -1) {
7333 pmap_reset_asid_set(pmap);
7334 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
7335 set->asid_set_size, &new_asid);
7336 KASSERT(new_asid != -1, ("ASID allocation failure"));
7337 }
7338 }
7339 bit_set(set->asid_set, new_asid);
7340 set->asid_next = new_asid + 1;
7341 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
7342 out:
7343 mtx_unlock_spin(&set->asid_set_mutex);
7344 }
7345
7346 static uint64_t __read_mostly ttbr_flags;
7347
7348 /*
7349 * Compute the value that should be stored in ttbr0 to activate the specified
7350 * pmap. This value may change from time to time.
7351 */
7352 uint64_t
pmap_to_ttbr0(pmap_t pmap)7353 pmap_to_ttbr0(pmap_t pmap)
7354 {
7355 uint64_t ttbr;
7356
7357 ttbr = pmap->pm_ttbr;
7358 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
7359 ttbr |= ttbr_flags;
7360
7361 return (ttbr);
7362 }
7363
7364 static void
pmap_set_cnp(void * arg)7365 pmap_set_cnp(void *arg)
7366 {
7367 uint64_t ttbr0, ttbr1;
7368 u_int cpuid;
7369
7370 cpuid = *(u_int *)arg;
7371 if (cpuid == curcpu) {
7372 /*
7373 * Set the flags while all CPUs are handling the
7374 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
7375 * to pmap_to_ttbr0 after this will have the CnP flag set.
7376 * The dsb after invalidating the TLB will act as a barrier
7377 * to ensure all CPUs can observe this change.
7378 */
7379 ttbr_flags |= TTBR_CnP;
7380 }
7381
7382 ttbr0 = READ_SPECIALREG(ttbr0_el1);
7383 ttbr0 |= TTBR_CnP;
7384
7385 ttbr1 = READ_SPECIALREG(ttbr1_el1);
7386 ttbr1 |= TTBR_CnP;
7387
7388 /* Update ttbr{0,1}_el1 with the CnP flag */
7389 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
7390 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
7391 isb();
7392 __asm __volatile("tlbi vmalle1is");
7393 dsb(ish);
7394 isb();
7395 }
7396
7397 /*
7398 * Defer enabling CnP until we have read the ID registers to know if it's
7399 * supported on all CPUs.
7400 */
7401 static void
pmap_init_cnp(void * dummy __unused)7402 pmap_init_cnp(void *dummy __unused)
7403 {
7404 uint64_t reg;
7405 u_int cpuid;
7406
7407 if (!get_kernel_reg(ID_AA64MMFR2_EL1, ®))
7408 return;
7409
7410 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
7411 if (bootverbose)
7412 printf("Enabling CnP\n");
7413 cpuid = curcpu;
7414 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
7415 }
7416
7417 }
7418 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
7419
7420 static bool
pmap_activate_int(pmap_t pmap)7421 pmap_activate_int(pmap_t pmap)
7422 {
7423 struct asid_set *set;
7424 int epoch;
7425
7426 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
7427 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
7428
7429 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
7430 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
7431 /*
7432 * Handle the possibility that the old thread was preempted
7433 * after an "ic" or "tlbi" instruction but before it performed
7434 * a "dsb" instruction. If the old thread migrates to a new
7435 * processor, its completion of a "dsb" instruction on that
7436 * new processor does not guarantee that the "ic" or "tlbi"
7437 * instructions performed on the old processor have completed.
7438 */
7439 dsb(ish);
7440 return (false);
7441 }
7442
7443 set = pmap->pm_asid_set;
7444 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7445
7446 /*
7447 * Ensure that the store to curpmap is globally visible before the
7448 * load from asid_epoch is performed.
7449 */
7450 if (pmap->pm_stage == PM_STAGE1)
7451 PCPU_SET(curpmap, pmap);
7452 else
7453 PCPU_SET(curvmpmap, pmap);
7454 dsb(ish);
7455 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
7456 if (epoch >= 0 && epoch != set->asid_epoch)
7457 pmap_alloc_asid(pmap);
7458
7459 if (pmap->pm_stage == PM_STAGE1) {
7460 set_ttbr0(pmap_to_ttbr0(pmap));
7461 if (PCPU_GET(bcast_tlbi_workaround) != 0)
7462 invalidate_local_icache();
7463 }
7464 return (true);
7465 }
7466
7467 void
pmap_activate_vm(pmap_t pmap)7468 pmap_activate_vm(pmap_t pmap)
7469 {
7470
7471 PMAP_ASSERT_STAGE2(pmap);
7472
7473 (void)pmap_activate_int(pmap);
7474 }
7475
7476 void
pmap_activate(struct thread * td)7477 pmap_activate(struct thread *td)
7478 {
7479 pmap_t pmap;
7480
7481 pmap = vmspace_pmap(td->td_proc->p_vmspace);
7482 PMAP_ASSERT_STAGE1(pmap);
7483 critical_enter();
7484 (void)pmap_activate_int(pmap);
7485 critical_exit();
7486 }
7487
7488 /*
7489 * Activate the thread we are switching to.
7490 * To simplify the assembly in cpu_throw return the new threads pcb.
7491 */
7492 struct pcb *
pmap_switch(struct thread * new)7493 pmap_switch(struct thread *new)
7494 {
7495 pcpu_bp_harden bp_harden;
7496 struct pcb *pcb;
7497
7498 /* Store the new curthread */
7499 PCPU_SET(curthread, new);
7500
7501 /* And the new pcb */
7502 pcb = new->td_pcb;
7503 PCPU_SET(curpcb, pcb);
7504
7505 /*
7506 * TODO: We may need to flush the cache here if switching
7507 * to a user process.
7508 */
7509
7510 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
7511 /*
7512 * Stop userspace from training the branch predictor against
7513 * other processes. This will call into a CPU specific
7514 * function that clears the branch predictor state.
7515 */
7516 bp_harden = PCPU_GET(bp_harden);
7517 if (bp_harden != NULL)
7518 bp_harden();
7519 }
7520
7521 return (pcb);
7522 }
7523
7524 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)7525 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
7526 {
7527
7528 PMAP_ASSERT_STAGE1(pmap);
7529 KASSERT(ADDR_IS_CANONICAL(va),
7530 ("%s: Address not in canonical form: %lx", __func__, va));
7531
7532 if (ADDR_IS_KERNEL(va)) {
7533 cpu_icache_sync_range(va, sz);
7534 } else {
7535 u_int len, offset;
7536 vm_paddr_t pa;
7537
7538 /* Find the length of data in this page to flush */
7539 offset = va & PAGE_MASK;
7540 len = imin(PAGE_SIZE - offset, sz);
7541
7542 while (sz != 0) {
7543 /* Extract the physical address & find it in the DMAP */
7544 pa = pmap_extract(pmap, va);
7545 if (pa != 0)
7546 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
7547
7548 /* Move to the next page */
7549 sz -= len;
7550 va += len;
7551 /* Set the length for the next iteration */
7552 len = imin(PAGE_SIZE, sz);
7553 }
7554 }
7555 }
7556
7557 static int
pmap_stage2_fault(pmap_t pmap,uint64_t esr,uint64_t far)7558 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
7559 {
7560 pd_entry_t *pdep;
7561 pt_entry_t *ptep, pte;
7562 int rv, lvl, dfsc;
7563
7564 PMAP_ASSERT_STAGE2(pmap);
7565 rv = KERN_FAILURE;
7566
7567 /* Data and insn aborts use same encoding for FSC field. */
7568 dfsc = esr & ISS_DATA_DFSC_MASK;
7569 switch (dfsc) {
7570 case ISS_DATA_DFSC_TF_L0:
7571 case ISS_DATA_DFSC_TF_L1:
7572 case ISS_DATA_DFSC_TF_L2:
7573 case ISS_DATA_DFSC_TF_L3:
7574 PMAP_LOCK(pmap);
7575 pdep = pmap_pde(pmap, far, &lvl);
7576 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
7577 PMAP_UNLOCK(pmap);
7578 break;
7579 }
7580
7581 switch (lvl) {
7582 case 0:
7583 ptep = pmap_l0_to_l1(pdep, far);
7584 break;
7585 case 1:
7586 ptep = pmap_l1_to_l2(pdep, far);
7587 break;
7588 case 2:
7589 ptep = pmap_l2_to_l3(pdep, far);
7590 break;
7591 default:
7592 panic("%s: Invalid pde level %d", __func__,lvl);
7593 }
7594 goto fault_exec;
7595
7596 case ISS_DATA_DFSC_AFF_L1:
7597 case ISS_DATA_DFSC_AFF_L2:
7598 case ISS_DATA_DFSC_AFF_L3:
7599 PMAP_LOCK(pmap);
7600 ptep = pmap_pte(pmap, far, &lvl);
7601 fault_exec:
7602 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
7603 if (icache_vmid) {
7604 pmap_invalidate_vpipt_icache();
7605 } else {
7606 /*
7607 * If accessing an executable page invalidate
7608 * the I-cache so it will be valid when we
7609 * continue execution in the guest. The D-cache
7610 * is assumed to already be clean to the Point
7611 * of Coherency.
7612 */
7613 if ((pte & ATTR_S2_XN_MASK) !=
7614 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
7615 invalidate_icache();
7616 }
7617 }
7618 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
7619 rv = KERN_SUCCESS;
7620 }
7621 PMAP_UNLOCK(pmap);
7622 break;
7623 }
7624
7625 return (rv);
7626 }
7627
7628 int
pmap_fault(pmap_t pmap,uint64_t esr,uint64_t far)7629 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
7630 {
7631 pt_entry_t pte, *ptep;
7632 register_t intr;
7633 uint64_t ec, par;
7634 int lvl, rv;
7635
7636 rv = KERN_FAILURE;
7637
7638 ec = ESR_ELx_EXCEPTION(esr);
7639 switch (ec) {
7640 case EXCP_INSN_ABORT_L:
7641 case EXCP_INSN_ABORT:
7642 case EXCP_DATA_ABORT_L:
7643 case EXCP_DATA_ABORT:
7644 break;
7645 default:
7646 return (rv);
7647 }
7648
7649 if (pmap->pm_stage == PM_STAGE2)
7650 return (pmap_stage2_fault(pmap, esr, far));
7651
7652 /* Data and insn aborts use same encoding for FSC field. */
7653 switch (esr & ISS_DATA_DFSC_MASK) {
7654 case ISS_DATA_DFSC_AFF_L1:
7655 case ISS_DATA_DFSC_AFF_L2:
7656 case ISS_DATA_DFSC_AFF_L3:
7657 PMAP_LOCK(pmap);
7658 ptep = pmap_pte(pmap, far, &lvl);
7659 if (ptep != NULL) {
7660 pmap_set_bits(ptep, ATTR_AF);
7661 rv = KERN_SUCCESS;
7662 /*
7663 * XXXMJ as an optimization we could mark the entry
7664 * dirty if this is a write fault.
7665 */
7666 }
7667 PMAP_UNLOCK(pmap);
7668 break;
7669 case ISS_DATA_DFSC_PF_L1:
7670 case ISS_DATA_DFSC_PF_L2:
7671 case ISS_DATA_DFSC_PF_L3:
7672 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
7673 (esr & ISS_DATA_WnR) == 0)
7674 return (rv);
7675 PMAP_LOCK(pmap);
7676 ptep = pmap_pte(pmap, far, &lvl);
7677 if (ptep != NULL &&
7678 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
7679 if ((pte & ATTR_S1_AP_RW_BIT) ==
7680 ATTR_S1_AP(ATTR_S1_AP_RO)) {
7681 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
7682 pmap_s1_invalidate_page(pmap, far, true);
7683 }
7684 rv = KERN_SUCCESS;
7685 }
7686 PMAP_UNLOCK(pmap);
7687 break;
7688 case ISS_DATA_DFSC_TF_L0:
7689 case ISS_DATA_DFSC_TF_L1:
7690 case ISS_DATA_DFSC_TF_L2:
7691 case ISS_DATA_DFSC_TF_L3:
7692 /*
7693 * Retry the translation. A break-before-make sequence can
7694 * produce a transient fault.
7695 */
7696 if (pmap == kernel_pmap) {
7697 /*
7698 * The translation fault may have occurred within a
7699 * critical section. Therefore, we must check the
7700 * address without acquiring the kernel pmap's lock.
7701 */
7702 if (pmap_klookup(far, NULL))
7703 rv = KERN_SUCCESS;
7704 } else {
7705 PMAP_LOCK(pmap);
7706 /* Ask the MMU to check the address. */
7707 intr = intr_disable();
7708 par = arm64_address_translate_s1e0r(far);
7709 intr_restore(intr);
7710 PMAP_UNLOCK(pmap);
7711
7712 /*
7713 * If the translation was successful, then we can
7714 * return success to the trap handler.
7715 */
7716 if (PAR_SUCCESS(par))
7717 rv = KERN_SUCCESS;
7718 }
7719 break;
7720 }
7721
7722 return (rv);
7723 }
7724
7725 /*
7726 * Increase the starting virtual address of the given mapping if a
7727 * different alignment might result in more superpage mappings.
7728 */
7729 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)7730 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7731 vm_offset_t *addr, vm_size_t size)
7732 {
7733 vm_offset_t superpage_offset;
7734
7735 if (size < L2_SIZE)
7736 return;
7737 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7738 offset += ptoa(object->pg_color);
7739 superpage_offset = offset & L2_OFFSET;
7740 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
7741 (*addr & L2_OFFSET) == superpage_offset)
7742 return;
7743 if ((*addr & L2_OFFSET) < superpage_offset)
7744 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
7745 else
7746 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
7747 }
7748
7749 /**
7750 * Get the kernel virtual address of a set of physical pages. If there are
7751 * physical addresses not covered by the DMAP perform a transient mapping
7752 * that will be removed when calling pmap_unmap_io_transient.
7753 *
7754 * \param page The pages the caller wishes to obtain the virtual
7755 * address on the kernel memory map.
7756 * \param vaddr On return contains the kernel virtual memory address
7757 * of the pages passed in the page parameter.
7758 * \param count Number of pages passed in.
7759 * \param can_fault true if the thread using the mapped pages can take
7760 * page faults, false otherwise.
7761 *
7762 * \returns true if the caller must call pmap_unmap_io_transient when
7763 * finished or false otherwise.
7764 *
7765 */
7766 bool
pmap_map_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,bool can_fault)7767 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7768 bool can_fault)
7769 {
7770 vm_paddr_t paddr;
7771 bool needs_mapping;
7772 int error __diagused, i;
7773
7774 /*
7775 * Allocate any KVA space that we need, this is done in a separate
7776 * loop to prevent calling vmem_alloc while pinned.
7777 */
7778 needs_mapping = false;
7779 for (i = 0; i < count; i++) {
7780 paddr = VM_PAGE_TO_PHYS(page[i]);
7781 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
7782 error = vmem_alloc(kernel_arena, PAGE_SIZE,
7783 M_BESTFIT | M_WAITOK, &vaddr[i]);
7784 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7785 needs_mapping = true;
7786 } else {
7787 vaddr[i] = PHYS_TO_DMAP(paddr);
7788 }
7789 }
7790
7791 /* Exit early if everything is covered by the DMAP */
7792 if (!needs_mapping)
7793 return (false);
7794
7795 if (!can_fault)
7796 sched_pin();
7797 for (i = 0; i < count; i++) {
7798 paddr = VM_PAGE_TO_PHYS(page[i]);
7799 if (!PHYS_IN_DMAP(paddr)) {
7800 panic(
7801 "pmap_map_io_transient: TODO: Map out of DMAP data");
7802 }
7803 }
7804
7805 return (needs_mapping);
7806 }
7807
7808 void
pmap_unmap_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,bool can_fault)7809 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7810 bool can_fault)
7811 {
7812 vm_paddr_t paddr;
7813 int i;
7814
7815 if (!can_fault)
7816 sched_unpin();
7817 for (i = 0; i < count; i++) {
7818 paddr = VM_PAGE_TO_PHYS(page[i]);
7819 if (!PHYS_IN_DMAP(paddr)) {
7820 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
7821 }
7822 }
7823 }
7824
7825 boolean_t
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)7826 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
7827 {
7828
7829 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
7830 }
7831
7832 #if defined(KASAN)
7833 static pd_entry_t *pmap_san_early_l2;
7834
7835 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE)
7836 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE)
7837 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)7838 pmap_san_enter_bootstrap_alloc_l2(void)
7839 {
7840 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
7841 static size_t offset = 0;
7842 vm_offset_t addr;
7843
7844 if (offset + L2_SIZE > sizeof(bootstrap_data)) {
7845 panic("%s: out of memory for the bootstrap shadow map L2 entries",
7846 __func__);
7847 }
7848
7849 addr = (uintptr_t)&bootstrap_data[offset];
7850 offset += L2_SIZE;
7851 return (addr);
7852 }
7853
7854 /*
7855 * SAN L1 + L2 pages, maybe L3 entries later?
7856 */
7857 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)7858 pmap_san_enter_bootstrap_alloc_pages(int npages)
7859 {
7860 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
7861 static size_t offset = 0;
7862 vm_offset_t addr;
7863
7864 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
7865 panic("%s: out of memory for the bootstrap shadow map",
7866 __func__);
7867 }
7868
7869 addr = (uintptr_t)&bootstrap_data[offset];
7870 offset += (npages * PAGE_SIZE);
7871 return (addr);
7872 }
7873
7874 static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)7875 pmap_san_enter_bootstrap(void)
7876 {
7877 vm_offset_t freemempos;
7878
7879 /* L1, L2 */
7880 freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
7881 bs_state.freemempos = freemempos;
7882 bs_state.va = KASAN_MIN_ADDRESS;
7883 pmap_bootstrap_l1_table(&bs_state);
7884 pmap_san_early_l2 = bs_state.l2;
7885 }
7886
7887 static vm_page_t
pmap_san_enter_alloc_l3(void)7888 pmap_san_enter_alloc_l3(void)
7889 {
7890 vm_page_t m;
7891
7892 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
7893 VM_ALLOC_ZERO);
7894 if (m == NULL)
7895 panic("%s: no memory to grow shadow map", __func__);
7896 return (m);
7897 }
7898
7899 static vm_page_t
pmap_san_enter_alloc_l2(void)7900 pmap_san_enter_alloc_l2(void)
7901 {
7902 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
7903 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
7904 }
7905
7906 void __nosanitizeaddress
pmap_san_enter(vm_offset_t va)7907 pmap_san_enter(vm_offset_t va)
7908 {
7909 pd_entry_t *l1, *l2;
7910 pt_entry_t *l3;
7911 vm_page_t m;
7912
7913 if (virtual_avail == 0) {
7914 vm_offset_t block;
7915 int slot;
7916 bool first;
7917
7918 /* Temporary shadow map prior to pmap_bootstrap(). */
7919 first = pmap_san_early_l2 == NULL;
7920 if (first)
7921 pmap_san_enter_bootstrap();
7922
7923 l2 = pmap_san_early_l2;
7924 slot = pmap_l2_index(va);
7925
7926 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
7927 MPASS(first);
7928 block = pmap_san_enter_bootstrap_alloc_l2();
7929 pmap_store(&l2[slot],
7930 PHYS_TO_PTE(pmap_early_vtophys(block)) |
7931 PMAP_SAN_PTE_BITS | L2_BLOCK);
7932 dmb(ishst);
7933 }
7934
7935 return;
7936 }
7937
7938 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
7939 l1 = pmap_l1(kernel_pmap, va);
7940 MPASS(l1 != NULL);
7941 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
7942 m = pmap_san_enter_alloc_l3();
7943 pmap_store(l1, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) | L1_TABLE);
7944 }
7945 l2 = pmap_l1_to_l2(l1, va);
7946 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
7947 m = pmap_san_enter_alloc_l2();
7948 if (m != NULL) {
7949 pmap_store(l2, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) |
7950 PMAP_SAN_PTE_BITS | L2_BLOCK);
7951 } else {
7952 m = pmap_san_enter_alloc_l3();
7953 pmap_store(l2, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) |
7954 L2_TABLE);
7955 }
7956 dmb(ishst);
7957 }
7958 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
7959 return;
7960 l3 = pmap_l2_to_l3(l2, va);
7961 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
7962 return;
7963 m = pmap_san_enter_alloc_l3();
7964 pmap_store(l3, PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) |
7965 PMAP_SAN_PTE_BITS | L3_PAGE);
7966 dmb(ishst);
7967 }
7968 #endif /* KASAN */
7969
7970 /*
7971 * Track a range of the kernel's virtual address space that is contiguous
7972 * in various mapping attributes.
7973 */
7974 struct pmap_kernel_map_range {
7975 vm_offset_t sva;
7976 pt_entry_t attrs;
7977 int l3pages;
7978 int l3contig;
7979 int l2blocks;
7980 int l1blocks;
7981 };
7982
7983 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)7984 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
7985 vm_offset_t eva)
7986 {
7987 const char *mode;
7988 int index;
7989
7990 if (eva <= range->sva)
7991 return;
7992
7993 index = range->attrs & ATTR_S1_IDX_MASK;
7994 switch (index) {
7995 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
7996 mode = "DEV-NP";
7997 break;
7998 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
7999 mode = "DEV";
8000 break;
8001 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
8002 mode = "UC";
8003 break;
8004 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
8005 mode = "WB";
8006 break;
8007 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
8008 mode = "WT";
8009 break;
8010 default:
8011 printf(
8012 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
8013 __func__, index, range->sva, eva);
8014 mode = "??";
8015 break;
8016 }
8017
8018 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %6s %d %d %d %d\n",
8019 range->sva, eva,
8020 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
8021 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
8022 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
8023 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
8024 mode, range->l1blocks, range->l2blocks, range->l3contig,
8025 range->l3pages);
8026
8027 /* Reset to sentinel value. */
8028 range->sva = 0xfffffffffffffffful;
8029 }
8030
8031 /*
8032 * Determine whether the attributes specified by a page table entry match those
8033 * being tracked by the current range.
8034 */
8035 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)8036 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
8037 {
8038
8039 return (range->attrs == attrs);
8040 }
8041
8042 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)8043 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
8044 pt_entry_t attrs)
8045 {
8046
8047 memset(range, 0, sizeof(*range));
8048 range->sva = va;
8049 range->attrs = attrs;
8050 }
8051
8052 /* Get the block/page attributes that correspond to the table attributes */
8053 static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)8054 sysctl_kmaps_table_attrs(pd_entry_t table)
8055 {
8056 pt_entry_t attrs;
8057
8058 attrs = 0;
8059 if ((table & TATTR_UXN_TABLE) != 0)
8060 attrs |= ATTR_S1_UXN;
8061 if ((table & TATTR_PXN_TABLE) != 0)
8062 attrs |= ATTR_S1_PXN;
8063 if ((table & TATTR_AP_TABLE_RO) != 0)
8064 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
8065
8066 return (attrs);
8067 }
8068
8069 /* Read the block/page attributes we care about */
8070 static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)8071 sysctl_kmaps_block_attrs(pt_entry_t block)
8072 {
8073 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK));
8074 }
8075
8076 /*
8077 * Given a leaf PTE, derive the mapping's attributes. If they do not match
8078 * those of the current run, dump the address range and its attributes, and
8079 * begin a new run.
8080 */
8081 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l0e,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)8082 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
8083 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
8084 pt_entry_t l3e)
8085 {
8086 pt_entry_t attrs;
8087
8088 attrs = sysctl_kmaps_table_attrs(l0e);
8089
8090 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
8091 attrs |= sysctl_kmaps_block_attrs(l1e);
8092 goto done;
8093 }
8094 attrs |= sysctl_kmaps_table_attrs(l1e);
8095
8096 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
8097 attrs |= sysctl_kmaps_block_attrs(l2e);
8098 goto done;
8099 }
8100 attrs |= sysctl_kmaps_table_attrs(l2e);
8101 attrs |= sysctl_kmaps_block_attrs(l3e);
8102
8103 done:
8104 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
8105 sysctl_kmaps_dump(sb, range, va);
8106 sysctl_kmaps_reinit(range, va, attrs);
8107 }
8108 }
8109
8110 static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)8111 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
8112 {
8113 struct pmap_kernel_map_range range;
8114 struct sbuf sbuf, *sb;
8115 pd_entry_t l0e, *l1, l1e, *l2, l2e;
8116 pt_entry_t *l3, l3e;
8117 vm_offset_t sva;
8118 vm_paddr_t pa;
8119 int error, i, j, k, l;
8120
8121 error = sysctl_wire_old_buffer(req, 0);
8122 if (error != 0)
8123 return (error);
8124 sb = &sbuf;
8125 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
8126
8127 /* Sentinel value. */
8128 range.sva = 0xfffffffffffffffful;
8129
8130 /*
8131 * Iterate over the kernel page tables without holding the kernel pmap
8132 * lock. Kernel page table pages are never freed, so at worst we will
8133 * observe inconsistencies in the output.
8134 */
8135 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
8136 i++) {
8137 if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
8138 sbuf_printf(sb, "\nDirect map:\n");
8139 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
8140 sbuf_printf(sb, "\nKernel map:\n");
8141 #ifdef KASAN
8142 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
8143 sbuf_printf(sb, "\nKASAN shadow map:\n");
8144 #endif
8145
8146 l0e = kernel_pmap->pm_l0[i];
8147 if ((l0e & ATTR_DESCR_VALID) == 0) {
8148 sysctl_kmaps_dump(sb, &range, sva);
8149 sva += L0_SIZE;
8150 continue;
8151 }
8152 pa = PTE_TO_PHYS(l0e);
8153 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
8154
8155 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
8156 l1e = l1[j];
8157 if ((l1e & ATTR_DESCR_VALID) == 0) {
8158 sysctl_kmaps_dump(sb, &range, sva);
8159 sva += L1_SIZE;
8160 continue;
8161 }
8162 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
8163 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8164 sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
8165 0, 0);
8166 range.l1blocks++;
8167 sva += L1_SIZE;
8168 continue;
8169 }
8170 pa = PTE_TO_PHYS(l1e);
8171 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
8172
8173 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
8174 l2e = l2[k];
8175 if ((l2e & ATTR_DESCR_VALID) == 0) {
8176 sysctl_kmaps_dump(sb, &range, sva);
8177 sva += L2_SIZE;
8178 continue;
8179 }
8180 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
8181 sysctl_kmaps_check(sb, &range, sva,
8182 l0e, l1e, l2e, 0);
8183 range.l2blocks++;
8184 sva += L2_SIZE;
8185 continue;
8186 }
8187 pa = PTE_TO_PHYS(l2e);
8188 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
8189
8190 for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
8191 l++, sva += L3_SIZE) {
8192 l3e = l3[l];
8193 if ((l3e & ATTR_DESCR_VALID) == 0) {
8194 sysctl_kmaps_dump(sb, &range,
8195 sva);
8196 continue;
8197 }
8198 sysctl_kmaps_check(sb, &range, sva,
8199 l0e, l1e, l2e, l3e);
8200 if ((l3e & ATTR_CONTIGUOUS) != 0)
8201 range.l3contig += l % 16 == 0 ?
8202 1 : 0;
8203 else
8204 range.l3pages++;
8205 }
8206 }
8207 }
8208 }
8209
8210 error = sbuf_finish(sb);
8211 sbuf_delete(sb);
8212 return (error);
8213 }
8214 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
8215 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
8216 NULL, 0, sysctl_kmaps, "A",
8217 "Dump kernel address layout");
8218