1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 1991 Regents of the University of California.
5 * All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 * Copyright (c) 2003 Peter Wemm
11 * All rights reserved.
12 * Copyright (c) 2005-2010 Alan L. Cox <[email protected]>
13 * All rights reserved.
14 *
15 * This code is derived from software contributed to Berkeley by
16 * the Systems Programming Group of the University of Utah Computer
17 * Science Department and William Jolitz of UUNET Technologies Inc.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 * 3. All advertising materials mentioning features or use of this software
28 * must display the following acknowledgement:
29 * This product includes software developed by the University of
30 * California, Berkeley and its contributors.
31 * 4. Neither the name of the University nor the names of its contributors
32 * may be used to endorse or promote products derived from this software
33 * without specific prior written permission.
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45 * SUCH DAMAGE.
46 *
47 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
48 */
49 /*-
50 * Copyright (c) 2003 Networks Associates Technology, Inc.
51 * Copyright (c) 2014-2020 The FreeBSD Foundation
52 * All rights reserved.
53 *
54 * This software was developed for the FreeBSD Project by Jake Burkholder,
55 * Safeport Network Services, and Network Associates Laboratories, the
56 * Security Research Division of Network Associates, Inc. under
57 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
58 * CHATS research program.
59 *
60 * Portions of this software were developed by
61 * Konstantin Belousov <[email protected]> under sponsorship from
62 * the FreeBSD Foundation.
63 *
64 * Redistribution and use in source and binary forms, with or without
65 * modification, are permitted provided that the following conditions
66 * are met:
67 * 1. Redistributions of source code must retain the above copyright
68 * notice, this list of conditions and the following disclaimer.
69 * 2. Redistributions in binary form must reproduce the above copyright
70 * notice, this list of conditions and the following disclaimer in the
71 * documentation and/or other materials provided with the distribution.
72 *
73 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 */
85
86 #define AMD64_NPT_AWARE
87
88 #include <sys/cdefs.h>
89 /*
90 * Manages physical address maps.
91 *
92 * Since the information managed by this module is
93 * also stored by the logical address mapping module,
94 * this module may throw away valid virtual-to-physical
95 * mappings at almost any time. However, invalidations
96 * of virtual-to-physical mappings must be done as
97 * requested.
98 *
99 * In order to cope with hardware architectures which
 100  * make virtual-to-physical map invalidations expensive,
 101  * this module may delay invalidation or reduced-protection
102 * operations until such time as they are actually
103 * necessary. This module is given full information as
104 * to which processors are currently using which maps,
105 * and to when physical maps must be made correct.
106 */
107
108 #include "opt_ddb.h"
109 #include "opt_pmap.h"
110 #include "opt_vm.h"
111
112 #include <sys/param.h>
113 #include <sys/asan.h>
114 #include <sys/bitstring.h>
115 #include <sys/bus.h>
116 #include <sys/systm.h>
117 #include <sys/counter.h>
118 #include <sys/kernel.h>
119 #include <sys/ktr.h>
120 #include <sys/lock.h>
121 #include <sys/malloc.h>
122 #include <sys/mman.h>
123 #include <sys/msan.h>
124 #include <sys/mutex.h>
125 #include <sys/proc.h>
126 #include <sys/rangeset.h>
127 #include <sys/rwlock.h>
128 #include <sys/sbuf.h>
129 #include <sys/smr.h>
130 #include <sys/sx.h>
131 #include <sys/turnstile.h>
132 #include <sys/vmem.h>
133 #include <sys/vmmeter.h>
134 #include <sys/sched.h>
135 #include <sys/sysctl.h>
136 #include <sys/smp.h>
137 #ifdef DDB
138 #include <sys/kdb.h>
139 #include <ddb/ddb.h>
140 #endif
141
142 #include <vm/vm.h>
143 #include <vm/vm_param.h>
144 #include <vm/vm_kern.h>
145 #include <vm/vm_page.h>
146 #include <vm/vm_map.h>
147 #include <vm/vm_object.h>
148 #include <vm/vm_extern.h>
149 #include <vm/vm_pageout.h>
150 #include <vm/vm_pager.h>
151 #include <vm/vm_phys.h>
152 #include <vm/vm_radix.h>
153 #include <vm/vm_reserv.h>
154 #include <vm/vm_dumpset.h>
155 #include <vm/uma.h>
156
157 #include <machine/asan.h>
158 #include <machine/intr_machdep.h>
159 #include <x86/apicvar.h>
160 #include <x86/ifunc.h>
161 #include <machine/cpu.h>
162 #include <machine/cputypes.h>
163 #include <machine/md_var.h>
164 #include <machine/msan.h>
165 #include <machine/pcb.h>
166 #include <machine/specialreg.h>
167 #ifdef SMP
168 #include <machine/smp.h>
169 #endif
170 #include <machine/sysarch.h>
171 #include <machine/tss.h>
172
173 #ifdef NUMA
174 #define PMAP_MEMDOM MAXMEMDOM
175 #else
176 #define PMAP_MEMDOM 1
177 #endif
178
179 static __inline boolean_t
 180 pmap_type_guest(pmap_t pmap)
181 {
182
183 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
184 }
185
186 static __inline boolean_t
 187 pmap_emulate_ad_bits(pmap_t pmap)
188 {
189
190 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
191 }
192
193 static __inline pt_entry_t
 194 pmap_valid_bit(pmap_t pmap)
195 {
196 pt_entry_t mask;
197
198 switch (pmap->pm_type) {
199 case PT_X86:
200 case PT_RVI:
201 mask = X86_PG_V;
202 break;
203 case PT_EPT:
204 if (pmap_emulate_ad_bits(pmap))
205 mask = EPT_PG_EMUL_V;
206 else
207 mask = EPT_PG_READ;
208 break;
209 default:
210 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
211 }
212
213 return (mask);
214 }
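
/*
 * Illustrative usage sketch (a paraphrase, not copied verbatim from any one
 * caller): code elsewhere in this file caches the per-pmap bit masks once
 * and then tests page table entries against them, e.g.:
 *
 *	pt_entry_t PG_V;
 *
 *	PG_V = pmap_valid_bit(pmap);
 *	if ((*pte & PG_V) == 0)
 *		return (NULL);
 *
 * This keeps the bulk of the pmap code independent of whether the pmap uses
 * native x86 page table encodings, AMD RVI, or Intel EPT encodings.
 */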
215
216 static __inline pt_entry_t
 217 pmap_rw_bit(pmap_t pmap)
218 {
219 pt_entry_t mask;
220
221 switch (pmap->pm_type) {
222 case PT_X86:
223 case PT_RVI:
224 mask = X86_PG_RW;
225 break;
226 case PT_EPT:
227 if (pmap_emulate_ad_bits(pmap))
228 mask = EPT_PG_EMUL_RW;
229 else
230 mask = EPT_PG_WRITE;
231 break;
232 default:
233 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
234 }
235
236 return (mask);
237 }
238
239 static pt_entry_t pg_g;
240
241 static __inline pt_entry_t
 242 pmap_global_bit(pmap_t pmap)
243 {
244 pt_entry_t mask;
245
246 switch (pmap->pm_type) {
247 case PT_X86:
248 mask = pg_g;
249 break;
250 case PT_RVI:
251 case PT_EPT:
252 mask = 0;
253 break;
254 default:
255 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
256 }
257
258 return (mask);
259 }
260
261 static __inline pt_entry_t
 262 pmap_accessed_bit(pmap_t pmap)
263 {
264 pt_entry_t mask;
265
266 switch (pmap->pm_type) {
267 case PT_X86:
268 case PT_RVI:
269 mask = X86_PG_A;
270 break;
271 case PT_EPT:
272 if (pmap_emulate_ad_bits(pmap))
273 mask = EPT_PG_READ;
274 else
275 mask = EPT_PG_A;
276 break;
277 default:
278 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
279 }
280
281 return (mask);
282 }
283
284 static __inline pt_entry_t
 285 pmap_modified_bit(pmap_t pmap)
286 {
287 pt_entry_t mask;
288
289 switch (pmap->pm_type) {
290 case PT_X86:
291 case PT_RVI:
292 mask = X86_PG_M;
293 break;
294 case PT_EPT:
295 if (pmap_emulate_ad_bits(pmap))
296 mask = EPT_PG_WRITE;
297 else
298 mask = EPT_PG_M;
299 break;
300 default:
301 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
302 }
303
304 return (mask);
305 }
306
307 static __inline pt_entry_t
 308 pmap_pku_mask_bit(pmap_t pmap)
309 {
310
311 return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
312 }
313
314 static __inline boolean_t
 315 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
316 {
317
318 if (!pmap_emulate_ad_bits(pmap))
319 return (TRUE);
320
321 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
322
323 /*
324 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
 325  * so we don't allow the referenced (aka EPT_PG_READ) bit to be cleared
326 * if the EPT_PG_WRITE bit is set.
327 */
328 if ((pte & EPT_PG_WRITE) != 0)
329 return (FALSE);
330
331 /*
 332  * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
333 */
334 if ((pte & EPT_PG_EXECUTE) == 0 ||
335 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
336 return (TRUE);
337 else
338 return (FALSE);
339 }
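
/*
 * Summary of the EPT accessed/dirty (A/D) bit emulation used above, for
 * pmaps with PMAP_EMULATE_AD_BITS set (a paraphrase of the accessor
 * functions in this file):
 *
 *	purpose		native x86	EPT with A/D emulation
 *	valid		X86_PG_V	EPT_PG_EMUL_V
 *	read/write	X86_PG_RW	EPT_PG_EMUL_RW
 *	accessed	X86_PG_A	EPT_PG_READ
 *	modified	X86_PG_M	EPT_PG_WRITE
 *
 * Because "accessed" is represented by EPT_PG_READ, clearing it while
 * EPT_PG_WRITE remains set would create the XWR = 010/110 combinations,
 * which cause an EPT misconfiguration; XWR = 100 (execute-only) is legal
 * only when the pmap advertises PMAP_SUPPORTS_EXEC_ONLY, which is what
 * safe_to_clear_referenced() checks.
 */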
340
341 #ifdef PV_STATS
342 #define PV_STAT(x) do { x ; } while (0)
343 #else
344 #define PV_STAT(x) do { } while (0)
345 #endif
346
347 #undef pa_index
348 #ifdef NUMA
349 #define pa_index(pa) ({ \
350 KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \
351 ("address %lx beyond the last segment", (pa))); \
352 (pa) >> PDRSHIFT; \
353 })
354 #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)])
355 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page))
356 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \
357 struct rwlock *_lock; \
358 if (__predict_false((pa) > pmap_last_pa)) \
359 _lock = &pv_dummy_large.pv_lock; \
360 else \
361 _lock = &(pa_to_pmdp(pa)->pv_lock); \
362 _lock; \
363 })
364 #else
365 #define pa_index(pa) ((pa) >> PDRSHIFT)
366 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
367
368 #define NPV_LIST_LOCKS MAXCPU
369
370 #define PHYS_TO_PV_LIST_LOCK(pa) \
371 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
372 #endif
373
374 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
375 struct rwlock **_lockp = (lockp); \
376 struct rwlock *_new_lock; \
377 \
378 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
379 if (_new_lock != *_lockp) { \
380 if (*_lockp != NULL) \
381 rw_wunlock(*_lockp); \
382 *_lockp = _new_lock; \
383 rw_wlock(*_lockp); \
384 } \
385 } while (0)
386
387 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
388 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
389
390 #define RELEASE_PV_LIST_LOCK(lockp) do { \
391 struct rwlock **_lockp = (lockp); \
392 \
393 if (*_lockp != NULL) { \
394 rw_wunlock(*_lockp); \
395 *_lockp = NULL; \
396 } \
397 } while (0)
398
399 #define VM_PAGE_TO_PV_LIST_LOCK(m) \
400 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
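
/*
 * Illustrative sketch of how the PV list lock macros above are typically
 * used (hypothetical caller; not a definitive pattern):
 *
 *	struct rwlock *lock;
 *
 *	lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... manipulate m's PV list while the lock is write-held ...
 *	RELEASE_PV_LIST_LOCK(&lock);
 *
 * pa_index() groups physical addresses by 2MB superpage (PDRSHIFT), so all
 * 4KB pages within the same 2MB frame share PV metadata and the same lock.
 */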
401
402 /*
403 * Statically allocate kernel pmap memory. However, memory for
404 * pm_pcids is obtained after the dynamic allocator is operational.
405 * Initialize it with a non-canonical pointer to catch early accesses
406 * regardless of the active mapping.
407 */
408 struct pmap kernel_pmap_store = {
409 .pm_pcidp = (void *)0xdeadbeefdeadbeef,
410 };
411
412 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
413 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
414
415 int nkpt;
416 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
417 "Number of kernel page table pages allocated on bootup");
418
419 static int ndmpdp;
420 vm_paddr_t dmaplimit;
421 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
422 pt_entry_t pg_nx;
423
424 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
425 "VM/pmap parameters");
426
427 static int __read_frequently pg_ps_enabled = 1;
428 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
429 &pg_ps_enabled, 0, "Are large page mappings enabled?");
430
431 int __read_frequently la57 = 0;
432 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
433 &la57, 0,
434 "5-level paging for host is enabled");
435
436 static bool
 437 pmap_is_la57(pmap_t pmap)
438 {
439 if (pmap->pm_type == PT_X86)
440 return (la57);
441 return (false); /* XXXKIB handle EPT */
442 }
443
444 #define PAT_INDEX_SIZE 8
445 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
446
447 static u_int64_t KPTphys; /* phys addr of kernel level 1 */
448 static u_int64_t KPDphys; /* phys addr of kernel level 2 */
449 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */
450 u_int64_t KPML4phys; /* phys addr of kernel level 4 */
451 u_int64_t KPML5phys; /* phys addr of kernel level 5,
452 if supported */
453
454 #ifdef KASAN
455 static uint64_t KASANPDPphys;
456 #endif
457 #ifdef KMSAN
458 static uint64_t KMSANSHADPDPphys;
459 static uint64_t KMSANORIGPDPphys;
460
461 /*
462 * To support systems with large amounts of memory, it is necessary to extend
463 * the maximum size of the direct map. This could eat into the space reserved
464 * for the shadow map.
465 */
466 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow");
467 #endif
468
469 static pml4_entry_t *kernel_pml4;
470 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
471 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
472 static int ndmpdpphys; /* number of DMPDPphys pages */
473
474 vm_paddr_t kernphys; /* phys addr of start of bootstrap data */
475 vm_paddr_t KERNend; /* and the end */
476
477 /*
 478  * pmap_mapdev() support prior to pmap initialization (e.g., the console)
479 */
480 #define PMAP_PREINIT_MAPPING_COUNT 8
481 static struct pmap_preinit_mapping {
482 vm_paddr_t pa;
483 vm_offset_t va;
484 vm_size_t sz;
485 int mode;
486 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
487 static int pmap_initialized;
488
489 /*
490 * Data for the pv entry allocation mechanism.
491 * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
492 */
493 #ifdef NUMA
494 static __inline int
 495 pc_to_domain(struct pv_chunk *pc)
496 {
497
498 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
499 }
500 #else
501 static __inline int
 502 pc_to_domain(struct pv_chunk *pc __unused)
503 {
504
505 return (0);
506 }
507 #endif
508
509 struct pv_chunks_list {
510 struct mtx pvc_lock;
511 TAILQ_HEAD(pch, pv_chunk) pvc_list;
512 int active_reclaims;
513 } __aligned(CACHE_LINE_SIZE);
514
515 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
516
517 #ifdef NUMA
518 struct pmap_large_md_page {
519 struct rwlock pv_lock;
520 struct md_page pv_page;
521 u_long pv_invl_gen;
522 };
523 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
524 #define pv_dummy pv_dummy_large.pv_page
525 __read_mostly static struct pmap_large_md_page *pv_table;
526 __read_mostly vm_paddr_t pmap_last_pa;
527 #else
528 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
529 static u_long pv_invl_gen[NPV_LIST_LOCKS];
530 static struct md_page *pv_table;
531 static struct md_page pv_dummy;
532 #endif
533
534 /*
535 * All those kernel PT submaps that BSD is so fond of
536 */
537 pt_entry_t *CMAP1 = NULL;
538 caddr_t CADDR1 = 0;
539 static vm_offset_t qframe = 0;
540 static struct mtx qframe_mtx;
541
542 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
543
544 static vmem_t *large_vmem;
545 static u_int lm_ents;
546 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \
547 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
548
549 int pmap_pcid_enabled = 1;
550 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 551     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled?");
552 int invpcid_works = 0;
553 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
554 "Is the invpcid instruction available ?");
555 int invlpgb_works;
556 SYSCTL_INT(_vm_pmap, OID_AUTO, invlpgb_works, CTLFLAG_RD, &invlpgb_works, 0,
557 "Is the invlpgb instruction available?");
558 int invlpgb_maxcnt;
559 int pmap_pcid_invlpg_workaround = 0;
560 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
561 CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
562 &pmap_pcid_invlpg_workaround, 0,
563 "Enable small core PCID/INVLPG workaround");
564 int pmap_pcid_invlpg_workaround_uena = 1;
565
566 int __read_frequently pti = 0;
567 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
568 &pti, 0,
569 "Page Table Isolation enabled");
570 static vm_object_t pti_obj;
571 static pml4_entry_t *pti_pml4;
572 static vm_pindex_t pti_pg_idx;
573 static bool pti_finalized;
574
575 struct pmap_pkru_range {
576 struct rs_el pkru_rs_el;
577 u_int pkru_keyidx;
578 int pkru_flags;
579 };
580
581 static uma_zone_t pmap_pkru_ranges_zone;
582 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
583 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
584 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
585 static void *pkru_dup_range(void *ctx, void *data);
586 static void pkru_free_range(void *ctx, void *node);
587 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
588 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
589 static void pmap_pkru_deassign_all(pmap_t pmap);
590
591 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt);
592 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD,
593 &pcid_save_cnt, "Count of saved TLB context on switch");
594
595 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
596 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
597 static struct mtx invl_gen_mtx;
598 /* Fake lock object to satisfy turnstiles interface. */
599 static struct lock_object invl_gen_ts = {
600 .lo_name = "invlts",
601 };
602 static struct pmap_invl_gen pmap_invl_gen_head = {
603 .gen = 1,
604 .next = NULL,
605 };
606 static u_long pmap_invl_gen = 1;
607 static int pmap_invl_waiters;
608 static struct callout pmap_invl_callout;
609 static bool pmap_invl_callout_inited;
610
611 #define PMAP_ASSERT_NOT_IN_DI() \
612 KASSERT(pmap_not_in_di(), ("DI already started"))
613
614 static bool
 615 pmap_di_locked(void)
616 {
617 int tun;
618
619 if ((cpu_feature2 & CPUID2_CX16) == 0)
620 return (true);
621 tun = 0;
622 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
623 return (tun != 0);
624 }
625
626 static int
 627 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
628 {
629 int locked;
630
631 locked = pmap_di_locked();
632 return (sysctl_handle_int(oidp, &locked, 0, req));
633 }
634 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
635 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
636 "Locked delayed invalidation");
637
638 static bool pmap_not_in_di_l(void);
639 static bool pmap_not_in_di_u(void);
640 DEFINE_IFUNC(, bool, pmap_not_in_di, (void))
641 {
642
643 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u);
644 }
645
646 static bool
 647 pmap_not_in_di_l(void)
648 {
649 struct pmap_invl_gen *invl_gen;
650
651 invl_gen = &curthread->td_md.md_invl_gen;
652 return (invl_gen->gen == 0);
653 }
654
655 static void
 656 pmap_thread_init_invl_gen_l(struct thread *td)
657 {
658 struct pmap_invl_gen *invl_gen;
659
660 invl_gen = &td->td_md.md_invl_gen;
661 invl_gen->gen = 0;
662 }
663
664 static void
 665 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
666 {
667 struct turnstile *ts;
668
669 ts = turnstile_trywait(&invl_gen_ts);
670 if (*m_gen > atomic_load_long(invl_gen))
671 turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
672 else
673 turnstile_cancel(ts);
674 }
675
676 static void
 677 pmap_delayed_invl_finish_unblock(u_long new_gen)
678 {
679 struct turnstile *ts;
680
681 turnstile_chain_lock(&invl_gen_ts);
682 ts = turnstile_lookup(&invl_gen_ts);
683 if (new_gen != 0)
684 pmap_invl_gen = new_gen;
685 if (ts != NULL) {
686 turnstile_broadcast(ts, TS_SHARED_QUEUE);
687 turnstile_unpend(ts);
688 }
689 turnstile_chain_unlock(&invl_gen_ts);
690 }
691
692 /*
693 * Start a new Delayed Invalidation (DI) block of code, executed by
694 * the current thread. Within a DI block, the current thread may
695 * destroy both the page table and PV list entries for a mapping and
696 * then release the corresponding PV list lock before ensuring that
697 * the mapping is flushed from the TLBs of any processors with the
698 * pmap active.
699 */
700 static void
 701 pmap_delayed_invl_start_l(void)
702 {
703 struct pmap_invl_gen *invl_gen;
704 u_long currgen;
705
706 invl_gen = &curthread->td_md.md_invl_gen;
707 PMAP_ASSERT_NOT_IN_DI();
708 mtx_lock(&invl_gen_mtx);
709 if (LIST_EMPTY(&pmap_invl_gen_tracker))
710 currgen = pmap_invl_gen;
711 else
712 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
713 invl_gen->gen = currgen + 1;
714 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
715 mtx_unlock(&invl_gen_mtx);
716 }
717
718 /*
719 * Finish the DI block, previously started by the current thread. All
720 * required TLB flushes for the pages marked by
721 * pmap_delayed_invl_page() must be finished before this function is
722 * called.
723 *
724 * This function works by bumping the global DI generation number to
725 * the generation number of the current thread's DI, unless there is a
726 * pending DI that started earlier. In the latter case, bumping the
727 * global DI generation number would incorrectly signal that the
728 * earlier DI had finished. Instead, this function bumps the earlier
729 * DI's generation number to match the generation number of the
730 * current thread's DI.
731 */
732 static void
 733 pmap_delayed_invl_finish_l(void)
734 {
735 struct pmap_invl_gen *invl_gen, *next;
736
737 invl_gen = &curthread->td_md.md_invl_gen;
738 KASSERT(invl_gen->gen != 0, ("missed invl_start"));
739 mtx_lock(&invl_gen_mtx);
740 next = LIST_NEXT(invl_gen, link);
741 if (next == NULL)
742 pmap_delayed_invl_finish_unblock(invl_gen->gen);
743 else
744 next->gen = invl_gen->gen;
745 LIST_REMOVE(invl_gen, link);
746 mtx_unlock(&invl_gen_mtx);
747 invl_gen->gen = 0;
748 }
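
/*
 * Illustrative sketch of a delayed invalidation (DI) block, tying together
 * the functions above with pmap_delayed_invl_page() and
 * pmap_delayed_invl_wait() defined later in this file (hypothetical caller):
 *
 *	pmap_delayed_invl_start();
 *	... remove the PTE and PV entry for page m ...
 *	pmap_delayed_invl_page(m);	(record m's PV list in this DI block)
 *	... release the PV list lock, perform TLB shootdowns ...
 *	pmap_delayed_invl_finish();
 *
 * A thread that must ensure no stale TLB entries remain for m (for example,
 * after observing an empty PV list) calls pmap_delayed_invl_wait(m), which
 * blocks until every DI block that covered m has finished.
 */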
749
750 static bool
 751 pmap_not_in_di_u(void)
752 {
753 struct pmap_invl_gen *invl_gen;
754
755 invl_gen = &curthread->td_md.md_invl_gen;
756 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
757 }
758
759 static void
 760 pmap_thread_init_invl_gen_u(struct thread *td)
761 {
762 struct pmap_invl_gen *invl_gen;
763
764 invl_gen = &td->td_md.md_invl_gen;
765 invl_gen->gen = 0;
766 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
767 }
768
769 static bool
 770 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
771 {
772 uint64_t new_high, new_low, old_high, old_low;
773 char res;
774
775 old_low = new_low = 0;
776 old_high = new_high = (uintptr_t)0;
777
778 __asm volatile("lock;cmpxchg16b\t%1"
779 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
780 : "b"(new_low), "c" (new_high)
781 : "memory", "cc");
782 if (res == 0) {
783 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
784 return (false);
785 out->gen = old_low;
786 out->next = (void *)old_high;
787 } else {
788 out->gen = new_low;
789 out->next = (void *)new_high;
790 }
791 return (true);
792 }
793
794 static bool
 795 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
796 struct pmap_invl_gen *new_val)
797 {
798 uint64_t new_high, new_low, old_high, old_low;
799 char res;
800
801 new_low = new_val->gen;
802 new_high = (uintptr_t)new_val->next;
803 old_low = old_val->gen;
804 old_high = (uintptr_t)old_val->next;
805
806 __asm volatile("lock;cmpxchg16b\t%1"
807 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
808 : "b"(new_low), "c" (new_high)
809 : "memory", "cc");
810 return (res);
811 }
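
/*
 * Note on the two helpers above, which serve the lockless ("_u") DI
 * implementation: struct pmap_invl_gen packs the generation number and the
 * next pointer into 16 contiguous bytes so that both fields can be read and
 * updated as a single unit with cmpxchg16b.  pmap_di_load_invl() is in
 * effect an atomic 16-byte load (a compare-and-swap against {0, 0} that
 * either succeeds harmlessly or returns the current value), and it
 * additionally fails if the entry is marked with PMAP_INVL_GEN_NEXT_INVALID.
 * pmap_di_store_invl() is a plain 16-byte compare-and-swap that reports
 * whether the update took effect.
 */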
812
813 static COUNTER_U64_DEFINE_EARLY(pv_page_count);
814 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD,
815 &pv_page_count, "Current number of allocated pv pages");
816
817 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count);
818 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD,
819 &user_pt_page_count,
820 "Current number of allocated page table pages for userspace");
821
822 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count);
823 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD,
824 &kernel_pt_page_count,
825 "Current number of allocated page table pages for the kernel");
826
827 #ifdef PV_STATS
828
829 static COUNTER_U64_DEFINE_EARLY(invl_start_restart);
830 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart,
831 CTLFLAG_RD, &invl_start_restart,
832 "Number of delayed TLB invalidation request restarts");
833
834 static COUNTER_U64_DEFINE_EARLY(invl_finish_restart);
835 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
836 &invl_finish_restart,
837 "Number of delayed TLB invalidation completion restarts");
838
839 static int invl_max_qlen;
840 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
841 &invl_max_qlen, 0,
842 "Maximum delayed TLB invalidation request queue length");
843 #endif
844
845 #define di_delay locks_delay
846
847 static void
 848 pmap_delayed_invl_start_u(void)
849 {
850 struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
851 struct thread *td;
852 struct lock_delay_arg lda;
853 uintptr_t prevl;
854 u_char pri;
855 #ifdef PV_STATS
856 int i, ii;
857 #endif
858
859 td = curthread;
860 invl_gen = &td->td_md.md_invl_gen;
861 PMAP_ASSERT_NOT_IN_DI();
862 lock_delay_arg_init(&lda, &di_delay);
863 invl_gen->saved_pri = 0;
864 pri = td->td_base_pri;
865 if (pri > PVM) {
866 thread_lock(td);
867 pri = td->td_base_pri;
868 if (pri > PVM) {
869 invl_gen->saved_pri = pri;
870 sched_prio(td, PVM);
871 }
872 thread_unlock(td);
873 }
874 again:
875 PV_STAT(i = 0);
876 for (p = &pmap_invl_gen_head;; p = prev.next) {
877 PV_STAT(i++);
878 prevl = (uintptr_t)atomic_load_ptr(&p->next);
879 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
880 PV_STAT(counter_u64_add(invl_start_restart, 1));
881 lock_delay(&lda);
882 goto again;
883 }
884 if (prevl == 0)
885 break;
886 prev.next = (void *)prevl;
887 }
888 #ifdef PV_STATS
889 if ((ii = invl_max_qlen) < i)
890 atomic_cmpset_int(&invl_max_qlen, ii, i);
891 #endif
892
893 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
894 PV_STAT(counter_u64_add(invl_start_restart, 1));
895 lock_delay(&lda);
896 goto again;
897 }
898
899 new_prev.gen = prev.gen;
900 new_prev.next = invl_gen;
901 invl_gen->gen = prev.gen + 1;
902
903 /* Formal fence between store to invl->gen and updating *p. */
904 atomic_thread_fence_rel();
905
906 /*
907 * After inserting an invl_gen element with invalid bit set,
908 * this thread blocks any other thread trying to enter the
 909 	 * delayed invalidation block. Do not allow this thread to be removed
 910 	 * from the CPU, because that would cause starvation for other threads.
911 */
912 critical_enter();
913
914 /*
 915 	 * ABA for *p is not possible here, since p->gen can only
 916 	 * increase. So if the *p thread finished its DI, then
917 * started a new one and got inserted into the list at the
918 * same place, its gen will appear greater than the previously
919 * read gen.
920 */
921 if (!pmap_di_store_invl(p, &prev, &new_prev)) {
922 critical_exit();
923 PV_STAT(counter_u64_add(invl_start_restart, 1));
924 lock_delay(&lda);
925 goto again;
926 }
927
928 /*
 929 	 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
930 * invl_gen->next, allowing other threads to iterate past us.
931 * pmap_di_store_invl() provides fence between the generation
932 * write and the update of next.
933 */
934 invl_gen->next = NULL;
935 critical_exit();
936 }
937
938 static bool
 939 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
940 struct pmap_invl_gen *p)
941 {
942 struct pmap_invl_gen prev, new_prev;
943 u_long mygen;
944
945 /*
 946 	 * Load invl_gen->gen after setting PMAP_INVL_GEN_NEXT_INVALID in
 947 	 * invl_gen->next. This prevents larger generations from
 948 	 * propagating to our invl_gen->gen. The lock prefix in
 949 	 * atomic_set_ptr() works as a seq_cst fence.
950 */
951 mygen = atomic_load_long(&invl_gen->gen);
952
953 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
954 return (false);
955
956 KASSERT(prev.gen < mygen,
957 ("invalid di gen sequence %lu %lu", prev.gen, mygen));
958 new_prev.gen = mygen;
959 new_prev.next = (void *)((uintptr_t)invl_gen->next &
960 ~PMAP_INVL_GEN_NEXT_INVALID);
961
962 /* Formal fence between load of prev and storing update to it. */
963 atomic_thread_fence_rel();
964
965 return (pmap_di_store_invl(p, &prev, &new_prev));
966 }
967
968 static void
 969 pmap_delayed_invl_finish_u(void)
970 {
971 struct pmap_invl_gen *invl_gen, *p;
972 struct thread *td;
973 struct lock_delay_arg lda;
974 uintptr_t prevl;
975
976 td = curthread;
977 invl_gen = &td->td_md.md_invl_gen;
978 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
979 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
980 ("missed invl_start: INVALID"));
981 lock_delay_arg_init(&lda, &di_delay);
982
983 again:
984 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
985 prevl = (uintptr_t)atomic_load_ptr(&p->next);
986 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
987 PV_STAT(counter_u64_add(invl_finish_restart, 1));
988 lock_delay(&lda);
989 goto again;
990 }
991 if ((void *)prevl == invl_gen)
992 break;
993 }
994
995 /*
 996 	 * It is legitimate not to find ourselves on the list if a
997 * thread before us finished its DI and started it again.
998 */
999 if (__predict_false(p == NULL)) {
1000 PV_STAT(counter_u64_add(invl_finish_restart, 1));
1001 lock_delay(&lda);
1002 goto again;
1003 }
1004
1005 critical_enter();
1006 atomic_set_ptr((uintptr_t *)&invl_gen->next,
1007 PMAP_INVL_GEN_NEXT_INVALID);
1008 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
1009 atomic_clear_ptr((uintptr_t *)&invl_gen->next,
1010 PMAP_INVL_GEN_NEXT_INVALID);
1011 critical_exit();
1012 PV_STAT(counter_u64_add(invl_finish_restart, 1));
1013 lock_delay(&lda);
1014 goto again;
1015 }
1016 critical_exit();
1017 if (atomic_load_int(&pmap_invl_waiters) > 0)
1018 pmap_delayed_invl_finish_unblock(0);
1019 if (invl_gen->saved_pri != 0) {
1020 thread_lock(td);
1021 sched_prio(td, invl_gen->saved_pri);
1022 thread_unlock(td);
1023 }
1024 }
1025
1026 #ifdef DDB
1027 DB_SHOW_COMMAND(di_queue, pmap_di_queue)
1028 {
1029 struct pmap_invl_gen *p, *pn;
1030 struct thread *td;
1031 uintptr_t nextl;
1032 bool first;
1033
1034 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
1035 first = false) {
1036 nextl = (uintptr_t)atomic_load_ptr(&p->next);
1037 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
1038 td = first ? NULL : __containerof(p, struct thread,
1039 td_md.md_invl_gen);
1040 db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
1041 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
1042 td != NULL ? td->td_tid : -1);
1043 }
1044 }
1045 #endif
1046
1047 #ifdef PV_STATS
1048 static COUNTER_U64_DEFINE_EARLY(invl_wait);
1049 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait,
1050 CTLFLAG_RD, &invl_wait,
1051 "Number of times DI invalidation blocked pmap_remove_all/write");
1052
1053 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow);
1054 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD,
1055 &invl_wait_slow, "Number of slow invalidation waits for lockless DI");
1056
1057 #endif
1058
1059 #ifdef NUMA
1060 static u_long *
1061 pmap_delayed_invl_genp(vm_page_t m)
1062 {
1063 vm_paddr_t pa;
1064 u_long *gen;
1065
1066 pa = VM_PAGE_TO_PHYS(m);
1067 if (__predict_false((pa) > pmap_last_pa))
1068 gen = &pv_dummy_large.pv_invl_gen;
1069 else
1070 gen = &(pa_to_pmdp(pa)->pv_invl_gen);
1071
1072 return (gen);
1073 }
1074 #else
1075 static u_long *
1076 pmap_delayed_invl_genp(vm_page_t m)
1077 {
1078
1079 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
1080 }
1081 #endif
1082
1083 static void
1084 pmap_delayed_invl_callout_func(void *arg __unused)
1085 {
1086
1087 if (atomic_load_int(&pmap_invl_waiters) == 0)
1088 return;
1089 pmap_delayed_invl_finish_unblock(0);
1090 }
1091
1092 static void
1093 pmap_delayed_invl_callout_init(void *arg __unused)
1094 {
1095
1096 if (pmap_di_locked())
1097 return;
1098 callout_init(&pmap_invl_callout, 1);
1099 pmap_invl_callout_inited = true;
1100 }
1101 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
1102 pmap_delayed_invl_callout_init, NULL);
1103
1104 /*
1105 * Ensure that all currently executing DI blocks, that need to flush
1106 * TLB for the given page m, actually flushed the TLB at the time the
1107 * function returned. If the page m has an empty PV list and we call
1108 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
1109 * valid mapping for the page m in either its page table or TLB.
1110 *
1111 * This function works by blocking until the global DI generation
1112 * number catches up with the generation number associated with the
1113 * given page m and its PV list. Since this function's callers
1114 * typically own an object lock and sometimes own a page lock, it
1115 * cannot sleep. Instead, it blocks on a turnstile to relinquish the
1116 * processor.
1117 */
1118 static void
1119 pmap_delayed_invl_wait_l(vm_page_t m)
1120 {
1121 u_long *m_gen;
1122 #ifdef PV_STATS
1123 bool accounted = false;
1124 #endif
1125
1126 m_gen = pmap_delayed_invl_genp(m);
1127 while (*m_gen > pmap_invl_gen) {
1128 #ifdef PV_STATS
1129 if (!accounted) {
1130 counter_u64_add(invl_wait, 1);
1131 accounted = true;
1132 }
1133 #endif
1134 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
1135 }
1136 }
1137
1138 static void
1139 pmap_delayed_invl_wait_u(vm_page_t m)
1140 {
1141 u_long *m_gen;
1142 struct lock_delay_arg lda;
1143 bool fast;
1144
1145 fast = true;
1146 m_gen = pmap_delayed_invl_genp(m);
1147 lock_delay_arg_init(&lda, &di_delay);
1148 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
1149 if (fast || !pmap_invl_callout_inited) {
1150 PV_STAT(counter_u64_add(invl_wait, 1));
1151 lock_delay(&lda);
1152 fast = false;
1153 } else {
1154 /*
1155 * The page's invalidation generation number
1156 * is still below the current thread's number.
1157 * Prepare to block so that we do not waste
1158 * CPU cycles or worse, suffer livelock.
1159 *
1160 * Since it is impossible to block without
1161 * racing with pmap_delayed_invl_finish_u(),
1162 * prepare for the race by incrementing
1163 * pmap_invl_waiters and arming a 1-tick
1164 * callout which will unblock us if we lose
1165 * the race.
1166 */
1167 atomic_add_int(&pmap_invl_waiters, 1);
1168
1169 /*
1170 * Re-check the current thread's invalidation
1171 * generation after incrementing
1172 * pmap_invl_waiters, so that there is no race
1173 * with pmap_delayed_invl_finish_u() setting
1174 * the page generation and checking
1175 * pmap_invl_waiters. The only race allowed
1176 * is for a missed unblock, which is handled
1177 * by the callout.
1178 */
1179 if (*m_gen >
1180 atomic_load_long(&pmap_invl_gen_head.gen)) {
1181 callout_reset(&pmap_invl_callout, 1,
1182 pmap_delayed_invl_callout_func, NULL);
1183 PV_STAT(counter_u64_add(invl_wait_slow, 1));
1184 pmap_delayed_invl_wait_block(m_gen,
1185 &pmap_invl_gen_head.gen);
1186 }
1187 atomic_add_int(&pmap_invl_waiters, -1);
1188 }
1189 }
1190 }
1191
1192 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *))
1193 {
1194
1195 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l :
1196 pmap_thread_init_invl_gen_u);
1197 }
1198
1199 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void))
1200 {
1201
1202 return (pmap_di_locked() ? pmap_delayed_invl_start_l :
1203 pmap_delayed_invl_start_u);
1204 }
1205
1206 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void))
1207 {
1208
1209 return (pmap_di_locked() ? pmap_delayed_invl_finish_l :
1210 pmap_delayed_invl_finish_u);
1211 }
1212
1213 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t))
1214 {
1215
1216 return (pmap_di_locked() ? pmap_delayed_invl_wait_l :
1217 pmap_delayed_invl_wait_u);
1218 }
1219
1220 /*
1221 * Mark the page m's PV list as participating in the current thread's
1222 * DI block. Any threads concurrently using m's PV list to remove or
1223 * restrict all mappings to m will wait for the current thread's DI
1224 * block to complete before proceeding.
1225 *
1226 * The function works by setting the DI generation number for m's PV
1227 * list to at least the DI generation number of the current thread.
1228 * This forces a caller of pmap_delayed_invl_wait() to block until
1229  * the current thread calls pmap_delayed_invl_finish().
1230 */
1231 static void
1232 pmap_delayed_invl_page(vm_page_t m)
1233 {
1234 u_long gen, *m_gen;
1235
1236 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
1237 gen = curthread->td_md.md_invl_gen.gen;
1238 if (gen == 0)
1239 return;
1240 m_gen = pmap_delayed_invl_genp(m);
1241 if (*m_gen < gen)
1242 *m_gen = gen;
1243 }
1244
1245 /*
1246 * Crashdump maps.
1247 */
1248 static caddr_t crashdumpmap;
1249
1250 /*
1251 * Internal flags for pmap_enter()'s helper functions.
1252 */
1253 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
1254 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
1255
1256 /*
1257 * Internal flags for pmap_mapdev_internal() and
1258 * pmap_change_props_locked().
1259 */
1260 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */
1261 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */
1262 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. */
1263
1264 TAILQ_HEAD(pv_chunklist, pv_chunk);
1265
1266 static void free_pv_chunk(struct pv_chunk *pc);
1267 static void free_pv_chunk_batch(struct pv_chunklist *batch);
1268 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
1269 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
1270 static int popcnt_pc_map_pq(uint64_t *map);
1271 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
1272 static void reserve_pv_entries(pmap_t pmap, int needed,
1273 struct rwlock **lockp);
1274 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1275 struct rwlock **lockp);
1276 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
1277 u_int flags, struct rwlock **lockp);
1278 #if VM_NRESERVLEVEL > 0
1279 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1280 struct rwlock **lockp);
1281 #endif
1282 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
1283 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
1284 vm_offset_t va);
1285
1286 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
1287 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
1288 vm_prot_t prot, int mode, int flags);
1289 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
1290 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
1291 vm_offset_t va, struct rwlock **lockp);
1292 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
1293 vm_offset_t va);
1294 static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
1295 vm_prot_t prot, struct rwlock **lockp);
1296 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
1297 u_int flags, vm_page_t m, struct rwlock **lockp);
1298 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
1299 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
1300 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
1301 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
1302 bool allpte_PG_A_set);
1303 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
1304 vm_offset_t eva);
1305 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
1306 vm_offset_t eva);
1307 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
1308 pd_entry_t pde);
1309 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
1310 static vm_page_t pmap_large_map_getptp_unlocked(void);
1311 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
1312 #if VM_NRESERVLEVEL > 0
1313 static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
1314 vm_page_t mpte, struct rwlock **lockp);
1315 #endif
1316 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
1317 vm_prot_t prot);
1318 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
1319 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
1320 bool exec);
1321 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
1322 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
1323 static void pmap_pti_wire_pte(void *pte);
1324 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
1325 struct spglist *free, struct rwlock **lockp);
1326 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
1327 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
1328 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
1329 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
1330 struct spglist *free);
1331 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1332 pd_entry_t *pde, struct spglist *free,
1333 struct rwlock **lockp);
1334 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
1335 vm_page_t m, struct rwlock **lockp);
1336 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
1337 pd_entry_t newpde);
1338 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
1339
1340 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
1341 struct rwlock **lockp);
1342 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex,
1343 struct rwlock **lockp, vm_offset_t va);
1344 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex,
1345 struct rwlock **lockp, vm_offset_t va);
1346 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
1347 struct rwlock **lockp);
1348
1349 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
1350 struct spglist *free);
1351 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
1352
1353 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int);
1354 static void pmap_free_pt_page(pmap_t, vm_page_t, bool);
1355
1356 /********************/
1357 /* Inline functions */
1358 /********************/
1359
1360 /*
1361  * Return non-clipped indexes for a given VA; these are the page
1362  * table page indexes at the corresponding level.
1363 */
1364 static __inline vm_pindex_t
1365 pmap_pde_pindex(vm_offset_t va)
1366 {
1367 return (va >> PDRSHIFT);
1368 }
1369
1370 static __inline vm_pindex_t
1371 pmap_pdpe_pindex(vm_offset_t va)
1372 {
1373 return (NUPDE + (va >> PDPSHIFT));
1374 }
1375
1376 static __inline vm_pindex_t
1377 pmap_pml4e_pindex(vm_offset_t va)
1378 {
1379 return (NUPDE + NUPDPE + (va >> PML4SHIFT));
1380 }
1381
1382 static __inline vm_pindex_t
1383 pmap_pml5e_pindex(vm_offset_t va)
1384 {
1385 return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
1386 }
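
/*
 * Worked example of the pindex layout above (4KB base pages, standard
 * amd64 shifts): the page table page covering the 2MB region that contains
 * va gets pindex va >> PDRSHIFT (PDRSHIFT == 21); the page directory page
 * covering the surrounding 1GB region gets NUPDE + (va >> PDPSHIFT)
 * (PDPSHIFT == 30), and so on up the hierarchy.  Offsetting each level by
 * the total number of pages at the lower levels (NUPDE, NUPDPE, NUPML4E)
 * keeps the index ranges of the levels disjoint, so every page table page
 * of a pmap has a unique pindex.
 */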
1387
1388 static __inline pml4_entry_t *
1389 pmap_pml5e(pmap_t pmap, vm_offset_t va)
1390 {
1391
1392 MPASS(pmap_is_la57(pmap));
1393 return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
1394 }
1395
1396 static __inline pml4_entry_t *
1397 pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
1398 {
1399
1400 MPASS(pmap_is_la57(pmap));
1401 return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
1402 }
1403
1404 static __inline pml4_entry_t *
1405 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
1406 {
1407 pml4_entry_t *pml4e;
1408
1409 	/* XXX MPASS(pmap_is_la57(pmap)); */
1410 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
1411 return (&pml4e[pmap_pml4e_index(va)]);
1412 }
1413
1414 /* Return a pointer to the PML4 slot that corresponds to a VA */
1415 static __inline pml4_entry_t *
1416 pmap_pml4e(pmap_t pmap, vm_offset_t va)
1417 {
1418 pml5_entry_t *pml5e;
1419 pml4_entry_t *pml4e;
1420 pt_entry_t PG_V;
1421
1422 if (pmap_is_la57(pmap)) {
1423 pml5e = pmap_pml5e(pmap, va);
1424 PG_V = pmap_valid_bit(pmap);
1425 if ((*pml5e & PG_V) == 0)
1426 return (NULL);
1427 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
1428 } else {
1429 pml4e = pmap->pm_pmltop;
1430 }
1431 return (&pml4e[pmap_pml4e_index(va)]);
1432 }
1433
1434 static __inline pml4_entry_t *
1435 pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
1436 {
1437 MPASS(!pmap_is_la57(pmap));
1438 return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
1439 }
1440
1441 /* Return a pointer to the PDP slot that corresponds to a VA */
1442 static __inline pdp_entry_t *
1443 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
1444 {
1445 pdp_entry_t *pdpe;
1446
1447 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
1448 return (&pdpe[pmap_pdpe_index(va)]);
1449 }
1450
1451 /* Return a pointer to the PDP slot that corresponds to a VA */
1452 static __inline pdp_entry_t *
1453 pmap_pdpe(pmap_t pmap, vm_offset_t va)
1454 {
1455 pml4_entry_t *pml4e;
1456 pt_entry_t PG_V;
1457
1458 PG_V = pmap_valid_bit(pmap);
1459 pml4e = pmap_pml4e(pmap, va);
1460 if (pml4e == NULL || (*pml4e & PG_V) == 0)
1461 return (NULL);
1462 return (pmap_pml4e_to_pdpe(pml4e, va));
1463 }
1464
1465 /* Return a pointer to the PD slot that corresponds to a VA */
1466 static __inline pd_entry_t *
1467 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
1468 {
1469 pd_entry_t *pde;
1470
1471 KASSERT((*pdpe & PG_PS) == 0,
1472 ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
1473 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
1474 return (&pde[pmap_pde_index(va)]);
1475 }
1476
1477 /* Return a pointer to the PD slot that corresponds to a VA */
1478 static __inline pd_entry_t *
1479 pmap_pde(pmap_t pmap, vm_offset_t va)
1480 {
1481 pdp_entry_t *pdpe;
1482 pt_entry_t PG_V;
1483
1484 PG_V = pmap_valid_bit(pmap);
1485 pdpe = pmap_pdpe(pmap, va);
1486 if (pdpe == NULL || (*pdpe & PG_V) == 0)
1487 return (NULL);
1488 KASSERT((*pdpe & PG_PS) == 0,
1489 ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va));
1490 return (pmap_pdpe_to_pde(pdpe, va));
1491 }
1492
1493 /* Return a pointer to the PT slot that corresponds to a VA */
1494 static __inline pt_entry_t *
1495 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
1496 {
1497 pt_entry_t *pte;
1498
1499 KASSERT((*pde & PG_PS) == 0,
1500 ("%s: pde %#lx is a leaf", __func__, *pde));
1501 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
1502 return (&pte[pmap_pte_index(va)]);
1503 }
1504
1505 /* Return a pointer to the PT slot that corresponds to a VA */
1506 static __inline pt_entry_t *
1507 pmap_pte(pmap_t pmap, vm_offset_t va)
1508 {
1509 pd_entry_t *pde;
1510 pt_entry_t PG_V;
1511
1512 PG_V = pmap_valid_bit(pmap);
1513 pde = pmap_pde(pmap, va);
1514 if (pde == NULL || (*pde & PG_V) == 0)
1515 return (NULL);
1516 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
1517 return ((pt_entry_t *)pde);
1518 return (pmap_pde_to_pte(pde, va));
1519 }
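
/*
 * Illustrative sketch of the lookup chain implemented by the helpers above
 * (hypothetical caller; validity checks elided for brevity):
 *
 *	pml4_entry_t *pml4e = pmap_pml4e(pmap, va);
 *	pdp_entry_t *pdpe = pmap_pml4e_to_pdpe(pml4e, va);
 *	pd_entry_t *pde = pmap_pdpe_to_pde(pdpe, va);
 *	pt_entry_t *pte = pmap_pde_to_pte(pde, va);
 *
 * Each *_to_* step reads the physical frame from the higher-level entry and
 * converts it to a kernel virtual address through the direct map
 * (PHYS_TO_DMAP).  Callers must check the valid bit at each level and watch
 * for PG_PS leaves; the pmap_pdpe()/pmap_pde()/pmap_pte() wrappers perform
 * the validity checks and handle (or assert against) superpage leaves.
 */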
1520
1521 static __inline void
1522 pmap_resident_count_adj(pmap_t pmap, int count)
1523 {
1524
1525 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1526 KASSERT(pmap->pm_stats.resident_count + count >= 0,
1527 ("pmap %p resident count underflow %ld %d", pmap,
1528 pmap->pm_stats.resident_count, count));
1529 pmap->pm_stats.resident_count += count;
1530 }
1531
1532 static __inline void
1533 pmap_pt_page_count_pinit(pmap_t pmap, int count)
1534 {
1535 KASSERT(pmap->pm_stats.resident_count + count >= 0,
1536 ("pmap %p resident count underflow %ld %d", pmap,
1537 pmap->pm_stats.resident_count, count));
1538 pmap->pm_stats.resident_count += count;
1539 }
1540
1541 static __inline void
1542 pmap_pt_page_count_adj(pmap_t pmap, int count)
1543 {
1544 if (pmap == kernel_pmap)
1545 counter_u64_add(kernel_pt_page_count, count);
1546 else {
1547 if (pmap != NULL)
1548 pmap_resident_count_adj(pmap, count);
1549 counter_u64_add(user_pt_page_count, count);
1550 }
1551 }
1552
1553 pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
1554 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3;
1555 vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap;
1556
1557 pt_entry_t *
1558 vtopte(vm_offset_t va)
1559 {
1560 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
1561
1562 return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem)));
1563 }
1564
1565 pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
1566 NPML4EPGSHIFT)) - 1) << 3;
1567 vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap;
1568
1569 static __inline pd_entry_t *
1570 vtopde(vm_offset_t va)
1571 {
1572 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
1573
1574 return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem)));
1575 }
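
/*
 * Arithmetic behind vtopte()/vtopde() above: the page tables are visible
 * through the recursive (self-referencing) mapping rooted at PTmap/PDmap,
 * where every 4KB of kernel VA corresponds to one 8-byte PTE and every
 * 2MB to one 8-byte PDE.  Hence the byte offset of the PTE for va is
 * (va >> (PAGE_SHIFT - 3)) masked by vtoptem, i.e. roughly (va / 4096) * 8
 * within the PTmap window, and the PDE offset is (va >> (PDRSHIFT - 3))
 * masked by vtopdem.  The masks clip the index to the size of the window
 * and clear the low three bits so the result is 8-byte aligned.
 */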
1576
1577 static u_int64_t
1578 allocpages(vm_paddr_t *firstaddr, int n)
1579 {
1580 u_int64_t ret;
1581
1582 ret = *firstaddr;
1583 bzero((void *)ret, n * PAGE_SIZE);
1584 *firstaddr += n * PAGE_SIZE;
1585 return (ret);
1586 }
1587
1588 CTASSERT(powerof2(NDMPML4E));
1589
1590 /* number of kernel PDP slots */
1591 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG)
1592
1593 static void
1594 nkpt_init(vm_paddr_t addr)
1595 {
1596 int pt_pages;
1597
1598 #ifdef NKPT
1599 pt_pages = NKPT;
1600 #else
1601 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
1602 pt_pages += NKPDPE(pt_pages);
1603
1604 /*
1605 * Add some slop beyond the bare minimum required for bootstrapping
1606 * the kernel.
1607 *
1608 * This is quite important when allocating KVA for kernel modules.
1609 * The modules are required to be linked in the negative 2GB of
1610 * the address space. If we run out of KVA in this region then
1611 * pmap_growkernel() will need to allocate page table pages to map
1612 * the entire 512GB of KVA space which is an unnecessary tax on
1613 * physical memory.
1614 *
1615 * Secondly, device memory mapped as part of setting up the low-
1616 * level console(s) is taken from KVA, starting at virtual_avail.
1617 * This is because cninit() is called after pmap_bootstrap() but
1618 * before vm_mem_init() and pmap_init(). 20MB for a frame buffer
1619 * is not uncommon.
1620 */
1621 pt_pages += 32; /* 64MB additional slop. */
1622 #endif
1623 nkpt = pt_pages;
1624 }
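
/*
 * Worked example for the estimate above (hypothetical numbers): with the
 * kernel plus loader-preallocated data spanning, say, 100MB, the non-NKPT
 * path computes howmany(100MB, NBPDR) + 1 = 51 page table pages
 * (NBPDR == 2MB), adds NKPDPE(51) = 1 page directory page, and then 32
 * pages of slop, for a total of 84 bootstrap page table pages, i.e. about
 * 336KB of physical memory.
 */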
1625
1626 /*
1627 * Returns the proper write/execute permission for a physical page that is
1628 * part of the initial boot allocations.
1629 *
1630 * If the page has kernel text, it is marked as read-only. If the page has
1631 * kernel read-only data, it is marked as read-only/not-executable. If the
1632 * page has only read-write data, it is marked as read-write/not-executable.
1633 * If the page is below/above the kernel range, it is marked as read-write.
1634 *
1635 * This function operates on 2M pages, since we map the kernel space that
1636 * way.
1637 */
1638 static inline pt_entry_t
1639 bootaddr_rwx(vm_paddr_t pa)
1640 {
1641 /*
1642 * The kernel is loaded at a 2MB-aligned address, and memory below that
1643 * need not be executable. The .bss section is padded to a 2MB
1644 * boundary, so memory following the kernel need not be executable
1645 * either. Preloaded kernel modules have their mapping permissions
1646 * fixed up by the linker.
1647 */
1648 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
1649 pa >= trunc_2mpage(kernphys + _end - KERNSTART))
1650 return (X86_PG_RW | pg_nx);
1651
1652 /*
1653 * The linker should ensure that the read-only and read-write
1654 * portions don't share the same 2M page, so this shouldn't
1655 * impact read-only data. However, in any case, any page with
1656 * read-write data needs to be read-write.
1657 */
1658 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
1659 return (X86_PG_RW | pg_nx);
1660
1661 /*
1662 * Mark any 2M page containing kernel text as read-only. Mark
1663 * other pages with read-only data as read-only and not executable.
1664 * (It is likely a small portion of the read-only data section will
1665 * be marked as read-only, but executable. This should be acceptable
1666 * since the read-only protection will keep the data from changing.)
1667 * Note that fixups to the .text section will still work until we
1668 * set CR0.WP.
1669 */
1670 if (pa < round_2mpage(kernphys + etext - KERNSTART))
1671 return (0);
1672 return (pg_nx);
1673 }
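
/*
 * Concrete outcomes of bootaddr_rwx(), derived directly from the checks
 * above, for a 2MB-aligned physical page pa:
 *
 *	below the 2MB page holding btext, or at/above _end
 *					-> X86_PG_RW | pg_nx	(RW, NX)
 *	within the read-write section	-> X86_PG_RW | pg_nx	(RW, NX)
 *	within a page containing .text	-> 0			(RO, executable)
 *	remaining read-only data pages	-> pg_nx		(RO, NX)
 */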
1674
1675 static void
1676 create_pagetables(vm_paddr_t *firstaddr)
1677 {
1678 pd_entry_t *pd_p;
1679 pdp_entry_t *pdp_p;
1680 pml4_entry_t *p4_p;
1681 uint64_t DMPDkernphys;
1682 vm_paddr_t pax;
1683 #ifdef KASAN
1684 pt_entry_t *pt_p;
1685 uint64_t KASANPDphys, KASANPTphys, KASANphys;
1686 vm_offset_t kasankernbase;
1687 int kasankpdpi, kasankpdi, nkasanpte;
1688 #endif
1689 int i, j, ndm1g, nkpdpe, nkdmpde;
1690
1691 TSENTER();
1692 /* Allocate page table pages for the direct map */
1693 ndmpdp = howmany(ptoa(Maxmem), NBPDP);
1694 if (ndmpdp < 4) /* Minimum 4GB of dirmap */
1695 ndmpdp = 4;
1696 ndmpdpphys = howmany(ndmpdp, NPDPEPG);
1697 if (ndmpdpphys > NDMPML4E) {
1698 /*
1699 * Each NDMPML4E allows 512 GB, so limit to that,
1700 * and then readjust ndmpdp and ndmpdpphys.
1701 */
1702 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
1703 Maxmem = atop(NDMPML4E * NBPML4);
1704 ndmpdpphys = NDMPML4E;
1705 ndmpdp = NDMPML4E * NPDEPG;
1706 }
1707 DMPDPphys = allocpages(firstaddr, ndmpdpphys);
1708 ndm1g = 0;
1709 if ((amd_feature & AMDID_PAGE1GB) != 0) {
1710 /*
1711 * Calculate the number of 1G pages that will fully fit in
1712 * Maxmem.
1713 */
1714 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
1715
1716 /*
1717 * Allocate 2M pages for the kernel. These will be used in
1718 		 * place of the one or more 1G pages from ndm1g that map
1719 		 * kernel memory into the DMAP.
1720 */
1721 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
1722 kernphys - rounddown2(kernphys, NBPDP), NBPDP);
1723 DMPDkernphys = allocpages(firstaddr, nkdmpde);
1724 }
1725 if (ndm1g < ndmpdp)
1726 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
1727 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
1728
1729 /* Allocate pages. */
1730 KPML4phys = allocpages(firstaddr, 1);
1731 KPDPphys = allocpages(firstaddr, NKPML4E);
1732 #ifdef KASAN
1733 KASANPDPphys = allocpages(firstaddr, NKASANPML4E);
1734 KASANPDphys = allocpages(firstaddr, 1);
1735 #endif
1736 #ifdef KMSAN
1737 /*
1738 * The KMSAN shadow maps are initially left unpopulated, since there is
1739 * no need to shadow memory above KERNBASE.
1740 */
1741 KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E);
1742 KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E);
1743 #endif
1744
1745 /*
1746 * Allocate the initial number of kernel page table pages required to
1747 * bootstrap. We defer this until after all memory-size dependent
1748 * allocations are done (e.g. direct map), so that we don't have to
1749 * build in too much slop in our estimate.
1750 *
1751 * Note that when NKPML4E > 1, we have an empty page underneath
1752 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
1753 * pages. (pmap_enter requires a PD page to exist for each KPML4E.)
1754 */
1755 nkpt_init(*firstaddr);
1756 nkpdpe = NKPDPE(nkpt);
1757
1758 KPTphys = allocpages(firstaddr, nkpt);
1759 KPDphys = allocpages(firstaddr, nkpdpe);
1760
1761 #ifdef KASAN
1762 nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE);
1763 KASANPTphys = allocpages(firstaddr, nkasanpte);
1764 KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG);
1765 #endif
1766
1767 /*
1768 * Connect the zero-filled PT pages to their PD entries. This
1769 * implicitly maps the PT pages at their correct locations within
1770 * the PTmap.
1771 */
1772 pd_p = (pd_entry_t *)KPDphys;
1773 for (i = 0; i < nkpt; i++)
1774 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1775
1776 /*
1777 * Map from start of the kernel in physical memory (staging
1778 * area) to the end of loader preallocated memory using 2MB
1779 * pages. This replaces some of the PD entries created above.
1780 * For compatibility, identity map 2M at the start.
1781 */
1782 pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
1783 X86_PG_RW | pg_nx;
1784 for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
1785 /* Preset PG_M and PG_A because demotion expects it. */
1786 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
1787 X86_PG_A | bootaddr_rwx(pax);
1788 }
1789
1790 /*
1791 * Because we map the physical blocks in 2M pages, adjust firstaddr
1792 * to record the physical blocks we've actually mapped into kernel
1793 * virtual address space.
1794 */
1795 if (*firstaddr < round_2mpage(KERNend))
1796 *firstaddr = round_2mpage(KERNend);
1797
1798 /* And connect up the PD to the PDP (leaving room for L4 pages) */
1799 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
1800 for (i = 0; i < nkpdpe; i++)
1801 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1802
1803 #ifdef KASAN
1804 kasankernbase = kasan_md_addr_to_shad(KERNBASE);
1805 kasankpdpi = pmap_pdpe_index(kasankernbase);
1806 kasankpdi = pmap_pde_index(kasankernbase);
1807
1808 pdp_p = (pdp_entry_t *)KASANPDPphys;
1809 pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx);
1810
1811 pd_p = (pd_entry_t *)KASANPDphys;
1812 for (i = 0; i < nkasanpte; i++)
1813 pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW |
1814 X86_PG_V | pg_nx;
1815
1816 pt_p = (pt_entry_t *)KASANPTphys;
1817 for (i = 0; i < nkasanpte * NPTEPG; i++)
1818 pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
1819 X86_PG_M | X86_PG_A | pg_nx;
1820 #endif
1821
1822 /*
1823 * Now, set up the direct map region using 2MB and/or 1GB pages. If
1824 * the end of physical memory is not aligned to a 1GB page boundary,
1825 * then the residual physical memory is mapped with 2MB pages. Later,
1826 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
1827 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
1828 * that are partially used.
1829 */
1830 pd_p = (pd_entry_t *)DMPDphys;
1831 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
1832 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
1833 /* Preset PG_M and PG_A because demotion expects it. */
1834 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1835 X86_PG_M | X86_PG_A | pg_nx;
1836 }
1837 pdp_p = (pdp_entry_t *)DMPDPphys;
1838 for (i = 0; i < ndm1g; i++) {
1839 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
1840 /* Preset PG_M and PG_A because demotion expects it. */
1841 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1842 X86_PG_M | X86_PG_A | pg_nx;
1843 }
1844 for (j = 0; i < ndmpdp; i++, j++) {
1845 pdp_p[i] = DMPDphys + ptoa(j);
1846 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx;
1847 }
1848
1849 /*
1850 * Instead of using a 1G page for the memory containing the kernel,
1851 * use 2M pages with read-only and no-execute permissions. (If using 1G
1852 * pages, this will partially overwrite the PDPEs above.)
1853 */
1854 if (ndm1g > 0) {
1855 pd_p = (pd_entry_t *)DMPDkernphys;
1856 for (i = 0, pax = rounddown2(kernphys, NBPDP);
1857 i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
1858 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
1859 X86_PG_A | pg_nx | bootaddr_rwx(pax);
1860 }
1861 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
1862 for (i = 0; i < nkdmpde; i++) {
1863 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
1864 X86_PG_RW | X86_PG_V | pg_nx;
1865 }
1866 }
1867
1868 /* And recursively map PML4 to itself in order to get PTmap */
1869 p4_p = (pml4_entry_t *)KPML4phys;
1870 p4_p[PML4PML4I] = KPML4phys;
1871 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
1872
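	/*
	 * A reminder of why the self-reference matters: a walk whose PML4
	 * index equals PML4PML4I re-enters the PML4 page as the next level
	 * of the walk, which exposes every page-table entry at a fixed
	 * virtual window (PTmap).  Conceptually,
	 *
	 *	pte = PTmap + 8 * (va >> PAGE_SHIFT)	(modulo masking)
	 *
	 * which is what vtopte() implements, so kernel PTEs can be edited
	 * without temporary mappings.
	 */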
1873 #ifdef KASAN
1874 /* Connect the KASAN shadow map slots up to the PML4. */
1875 for (i = 0; i < NKASANPML4E; i++) {
1876 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i);
1877 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
1878 }
1879 #endif
1880
1881 #ifdef KMSAN
1882 /* Connect the KMSAN shadow map slots up to the PML4. */
1883 for (i = 0; i < NKMSANSHADPML4E; i++) {
1884 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i);
1885 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
1886 }
1887
1888 /* Connect the KMSAN origin map slots up to the PML4. */
1889 for (i = 0; i < NKMSANORIGPML4E; i++) {
1890 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i);
1891 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
1892 }
1893 #endif
1894
1895 /* Connect the Direct Map slots up to the PML4. */
1896 for (i = 0; i < ndmpdpphys; i++) {
1897 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
1898 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
1899 }
1900
1901 /* Connect the KVA slots up to the PML4 */
1902 for (i = 0; i < NKPML4E; i++) {
1903 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
1904 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
1905 }
1906
1907 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
1908 TSEXIT();
1909 }
1910
1911 /*
1912 * Bootstrap the system enough to run with virtual memory.
1913 *
1914 * On amd64 this is called after mapping has already been enabled
1915 * and just syncs the pmap module with what has already been done.
1916 * [We can't call it easily with mapping off since the kernel is not
1917 * mapped with PA == VA, hence we would have to relocate every address
1918 * from the linked base (virtual) address "KERNBASE" to the actual
1919 * (physical) address starting relative to 0]
1920 */
1921 void
1922 pmap_bootstrap(vm_paddr_t *firstaddr)
1923 {
1924 vm_offset_t va;
1925 pt_entry_t *pte, *pcpu_pte;
1926 struct region_descriptor r_gdt;
1927 uint64_t cr4, pcpu0_phys;
1928 u_long res;
1929 int i;
1930
1931 TSENTER();
1932 KERNend = *firstaddr;
1933 res = atop(KERNend - (vm_paddr_t)kernphys);
1934
1935 if (!pti)
1936 pg_g = X86_PG_G;
1937
1938 /*
1939 * Create an initial set of page tables to run the kernel in.
1940 */
1941 create_pagetables(firstaddr);
1942
1943 pcpu0_phys = allocpages(firstaddr, 1);
1944
1945 /*
1946 * Add a physical memory segment (vm_phys_seg) corresponding to the
1947 * preallocated kernel page table pages so that vm_page structures
1948 * representing these pages will be created. The vm_page structures
1949 * are required for promotion of the corresponding kernel virtual
1950 * addresses to superpage mappings.
1951 */
1952 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1953
1954 /*
1955 * Account for the virtual addresses mapped by create_pagetables().
1956 */
1957 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
1958 (vm_paddr_t)kernphys);
1959 virtual_end = VM_MAX_KERNEL_ADDRESS;
1960
1961 /*
1962 * Enable PG_G global pages, then switch to the kernel page
1963 * table from the bootstrap page table. After the switch, it
1964 * is possible to enable SMEP and SMAP since PG_U bits are
1965 * correct now.
1966 */
1967 cr4 = rcr4();
1968 cr4 |= CR4_PGE;
1969 load_cr4(cr4);
1970 load_cr3(KPML4phys);
1971 if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
1972 cr4 |= CR4_SMEP;
1973 if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
1974 cr4 |= CR4_SMAP;
1975 load_cr4(cr4);
1976
1977 /*
1978 * Initialize the kernel pmap (which is statically allocated).
1979 * Count bootstrap data as being resident in case any of this data is
1980 * later unmapped (using pmap_remove()) and freed.
1981 */
1982 PMAP_LOCK_INIT(kernel_pmap);
1983 kernel_pmap->pm_pmltop = kernel_pml4;
1984 kernel_pmap->pm_cr3 = KPML4phys;
1985 kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
1986 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1987 kernel_pmap->pm_stats.resident_count = res;
1988 vm_radix_init(&kernel_pmap->pm_root);
1989 kernel_pmap->pm_flags = pmap_flags;
1990 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
1991 rangeset_init(&kernel_pmap->pm_pkru, pkru_dup_range,
1992 pkru_free_range, kernel_pmap, M_NOWAIT);
1993 }
1994
1995 /*
1996 * The kernel pmap is always active on all CPUs. Once CPUs are
1997 * enumerated, the mask will be set equal to all_cpus.
1998 */
1999 CPU_FILL(&kernel_pmap->pm_active);
2000
2001 /*
2002 * Initialize the TLB invalidations generation number lock.
2003 */
2004 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
2005
2006 /*
2007 * Reserve some special page table entries/VA space for temporary
2008 * mapping of pages.
2009 */
2010 #define SYSMAP(c, p, v, n) \
2011 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
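	/*
	 * For reference, SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
	 * below expands to roughly
	 *
	 *	crashdumpmap = (caddr_t)va; va += MAXDUMPPGS * PAGE_SIZE;
	 *	CMAP1 = pte; pte += MAXDUMPPGS;
	 *
	 * i.e. it carves out VA space and remembers the first PTE that
	 * backs it.
	 */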
2012
2013 va = virtual_avail;
2014 pte = vtopte(va);
2015
2016 /*
2017 * Crashdump maps. The first page is reused as CMAP1 for the
2018 * memory test.
2019 */
2020 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
2021 CADDR1 = crashdumpmap;
2022
2023 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
2024 virtual_avail = va;
2025
2026 /*
2027 	 * Map the BSP PCPU now; the rest of the PCPUs are mapped by
2028 	 * amd64_mp_alloc_pcpu()/start_all_aps() once we know the
2029 	 * number of CPUs and NUMA affinity.
2030 */
2031 pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx |
2032 X86_PG_M | X86_PG_A;
2033 for (i = 1; i < MAXCPU; i++)
2034 pcpu_pte[i] = 0;
2035
2036 /*
2037 * Re-initialize PCPU area for BSP after switching.
2038 * Make hardware use gdt and common_tss from the new PCPU.
2039 */
2040 STAILQ_INIT(&cpuhead);
2041 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
2042 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
2043 amd64_bsp_pcpu_init1(&__pcpu[0]);
2044 amd64_bsp_ist_init(&__pcpu[0]);
2045 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
2046 IOPERM_BITMAP_SIZE;
2047 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT *
2048 sizeof(struct user_segment_descriptor));
2049 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss;
2050 ssdtosyssd(&gdt_segs[GPROC0_SEL],
2051 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
2052 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
2053 r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
2054 lgdt(&r_gdt);
2055 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
2056 ltr(GSEL(GPROC0_SEL, SEL_KPL));
2057 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
2058 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
2059
2060 /*
2061 * Initialize the PAT MSR.
2062 * pmap_init_pat() clears and sets CR4_PGE, which, as a
2063 * side-effect, invalidates stale PG_G TLB entries that might
2064 * have been created in our pre-boot environment.
2065 */
2066 pmap_init_pat();
2067
2068 /* Initialize TLB Context Id. */
2069 if (pmap_pcid_enabled) {
2070 kernel_pmap->pm_pcidp = (void *)(uintptr_t)
2071 offsetof(struct pcpu, pc_kpmap_store);
2072
2073 PCPU_SET(kpmap_store.pm_pcid, PMAP_PCID_KERN);
2074 PCPU_SET(kpmap_store.pm_gen, 1);
2075
2076 /*
2077 * PMAP_PCID_KERN + 1 is used for initialization of
2078 		 * proc0 pmap. The pmap's PCID state might be used by
2079 * EFIRT entry before first context switch, so it
2080 * needs to be valid.
2081 */
2082 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
2083 PCPU_SET(pcid_gen, 1);
2084
2085 /*
2086 * pcpu area for APs is zeroed during AP startup.
2087 * pc_pcid_next and pc_pcid_gen are initialized by AP
2088 * during pcpu setup.
2089 */
2090 load_cr4(rcr4() | CR4_PCIDE);
2091 }
2092 TSEXIT();
2093 }
2094
2095 /*
2096 * Setup the PAT MSR.
2097 */
2098 void
2099 pmap_init_pat(void)
2100 {
2101 uint64_t pat_msr;
2102 u_long cr0, cr4;
2103 int i;
2104
2105 /* Bail if this CPU doesn't implement PAT. */
2106 if ((cpu_feature & CPUID_PAT) == 0)
2107 panic("no PAT??");
2108
2109 /* Set default PAT index table. */
2110 for (i = 0; i < PAT_INDEX_SIZE; i++)
2111 pat_index[i] = -1;
2112 pat_index[PAT_WRITE_BACK] = 0;
2113 pat_index[PAT_WRITE_THROUGH] = 1;
2114 pat_index[PAT_UNCACHEABLE] = 3;
2115 pat_index[PAT_WRITE_COMBINING] = 6;
2116 pat_index[PAT_WRITE_PROTECTED] = 5;
2117 pat_index[PAT_UNCACHED] = 2;
2118
2119 /*
2120 * Initialize default PAT entries.
2121 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
2122 * Program 5 and 6 as WP and WC.
2123 *
2124 * Leave 4 and 7 as WB and UC. Note that a recursive page table
2125 * mapping for a 2M page uses a PAT value with the bit 3 set due
2126 * to its overload with PG_PS.
2127 */
2128 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
2129 PAT_VALUE(1, PAT_WRITE_THROUGH) |
2130 PAT_VALUE(2, PAT_UNCACHED) |
2131 PAT_VALUE(3, PAT_UNCACHEABLE) |
2132 PAT_VALUE(4, PAT_WRITE_BACK) |
2133 PAT_VALUE(5, PAT_WRITE_PROTECTED) |
2134 PAT_VALUE(6, PAT_WRITE_COMBINING) |
2135 PAT_VALUE(7, PAT_UNCACHEABLE);
2136
2137 /* Disable PGE. */
2138 cr4 = rcr4();
2139 load_cr4(cr4 & ~CR4_PGE);
2140
2141 /* Disable caches (CD = 1, NW = 0). */
2142 cr0 = rcr0();
2143 load_cr0((cr0 & ~CR0_NW) | CR0_CD);
2144
2145 /* Flushes caches and TLBs. */
2146 wbinvd();
2147 invltlb();
2148
2149 /* Update PAT and index table. */
2150 wrmsr(MSR_PAT, pat_msr);
2151
2152 /* Flush caches and TLBs again. */
2153 wbinvd();
2154 invltlb();
2155
2156 /* Restore caches and PGE. */
2157 load_cr0(cr0);
2158 load_cr4(cr4);
2159 }
2160
2161 vm_page_t
2162 pmap_page_alloc_below_4g(bool zeroed)
2163 {
2164 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0),
2165 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT));
2166 }
2167
2168 extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
2169 la57_trampoline_gdt[], la57_trampoline_end[];
2170
2171 static void
2172 pmap_bootstrap_la57(void *arg __unused)
2173 {
2174 char *v_code;
2175 pml5_entry_t *v_pml5;
2176 pml4_entry_t *v_pml4;
2177 pdp_entry_t *v_pdp;
2178 pd_entry_t *v_pd;
2179 pt_entry_t *v_pt;
2180 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5;
2181 void (*la57_tramp)(uint64_t pml5);
2182 struct region_descriptor r_gdt;
2183
2184 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
2185 return;
2186 TUNABLE_INT_FETCH("vm.pmap.la57", &la57);
2187 if (!la57)
2188 return;
2189
2190 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
2191 r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
2192
2193 m_code = pmap_page_alloc_below_4g(true);
2194 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
2195 m_pml5 = pmap_page_alloc_below_4g(true);
2196 KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
2197 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
2198 m_pml4 = pmap_page_alloc_below_4g(true);
2199 v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
2200 m_pdp = pmap_page_alloc_below_4g(true);
2201 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
2202 m_pd = pmap_page_alloc_below_4g(true);
2203 v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
2204 m_pt = pmap_page_alloc_below_4g(true);
2205 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
2206
2207 /*
2208 	 * Map m_code 1:1. It appears below 4G in KVA because its
2209 	 * physical address is below 4G. Since kernel KVA is in the
2210 	 * upper half, the pml4e should be zero and free for temporary use.
2211 */
2212 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
2213 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
2214 X86_PG_M;
2215 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
2216 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
2217 X86_PG_M;
2218 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
2219 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
2220 X86_PG_M;
2221 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
2222 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
2223 X86_PG_M;
2224
2225 /*
2226 	 * Add a pml5 entry at the top of KVA pointing to the existing pml4
2227 	 * table, carrying all existing kernel mappings into the level 5 table.
2228 */
2229 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
2230 X86_PG_RW | X86_PG_A | X86_PG_M;
2231
2232 /*
2233 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on.
2234 */
2235 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
2236 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
2237 X86_PG_M;
2238 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
2239 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
2240 X86_PG_M;
2241
2242 /*
2243 	 * Copy and call the 48->57 trampoline; with luck, we return from it alive.
2244 */
2245 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
2246 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
2247 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
2248 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
2249 pmap_invalidate_all(kernel_pmap);
2250 if (bootverbose) {
2251 printf("entering LA57 trampoline at %#lx\n",
2252 (vm_offset_t)la57_tramp);
2253 }
2254 la57_tramp(KPML5phys);
2255
2256 /*
2257 	 * The trampoline necessarily reset the GDT; switch back to our GDT.
2258 */
2259 lgdt(&r_gdt);
2260 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
2261 load_ds(_udatasel);
2262 load_es(_udatasel);
2263 load_fs(_ufssel);
2264 ssdtosyssd(&gdt_segs[GPROC0_SEL],
2265 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
2266 ltr(GSEL(GPROC0_SEL, SEL_KPL));
2267 lidt(&r_idt);
2268
2269 if (bootverbose)
2270 printf("LA57 trampoline returned, CR4 %#lx\n", rcr4());
2271
2272 /*
2273 * Now unmap the trampoline, and free the pages.
2274 * Clear pml5 entry used for 1:1 trampoline mapping.
2275 */
2276 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
2277 invlpg((vm_offset_t)v_code);
2278 vm_page_free(m_code);
2279 vm_page_free(m_pdp);
2280 vm_page_free(m_pd);
2281 vm_page_free(m_pt);
2282
2283 /*
2284 * Recursively map PML5 to itself in order to get PTmap and
2285 * PDmap.
2286 */
2287 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;
2288
2289 vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
2290 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
2291 PTmap = (vm_offset_t)P5Tmap;
2292 vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
2293 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3;
2294 PDmap = (vm_offset_t)P5Dmap;
2295
2296 kernel_pmap->pm_cr3 = KPML5phys;
2297 kernel_pmap->pm_pmltop = v_pml5;
2298 pmap_pt_page_count_adj(kernel_pmap, 1);
2299 }
2300 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);
2301
2302 /*
2303 * Initialize a vm_page's machine-dependent fields.
2304 */
2305 void
2306 pmap_page_init(vm_page_t m)
2307 {
2308
2309 TAILQ_INIT(&m->md.pv_list);
2310 m->md.pat_mode = PAT_WRITE_BACK;
2311 }
2312
2313 static int pmap_allow_2m_x_ept;
2314 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
2315 &pmap_allow_2m_x_ept, 0,
2316 "Allow executable superpage mappings in EPT");
2317
2318 void
2319 pmap_allow_2m_x_ept_recalculate(void)
2320 {
2321 /*
2322 * SKL002, SKL012S. Since the EPT format is only used by
2323 * Intel CPUs, the vendor check is merely a formality.
2324 */
2325 if (!(cpu_vendor_id != CPU_VENDOR_INTEL ||
2326 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 ||
2327 (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
2328 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */
2329 CPUID_TO_MODEL(cpu_id) == 0x27 ||
2330 CPUID_TO_MODEL(cpu_id) == 0x35 ||
2331 CPUID_TO_MODEL(cpu_id) == 0x36 ||
2332 CPUID_TO_MODEL(cpu_id) == 0x37 ||
2333 CPUID_TO_MODEL(cpu_id) == 0x86 ||
2334 CPUID_TO_MODEL(cpu_id) == 0x1c ||
2335 CPUID_TO_MODEL(cpu_id) == 0x4a ||
2336 CPUID_TO_MODEL(cpu_id) == 0x4c ||
2337 CPUID_TO_MODEL(cpu_id) == 0x4d ||
2338 CPUID_TO_MODEL(cpu_id) == 0x5a ||
2339 CPUID_TO_MODEL(cpu_id) == 0x5c ||
2340 CPUID_TO_MODEL(cpu_id) == 0x5d ||
2341 CPUID_TO_MODEL(cpu_id) == 0x5f ||
2342 CPUID_TO_MODEL(cpu_id) == 0x6e ||
2343 CPUID_TO_MODEL(cpu_id) == 0x7a ||
2344 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */
2345 CPUID_TO_MODEL(cpu_id) == 0x85))))
2346 pmap_allow_2m_x_ept = 1;
2347 #ifndef BURN_BRIDGES
2348 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept);
2349 #endif
2350 TUNABLE_INT_FETCH("vm.pmap.allow_2m_x_ept", &pmap_allow_2m_x_ept);
2351 }
2352
2353 static bool
2354 pmap_allow_2m_x_page(pmap_t pmap, bool executable)
2355 {
2356
2357 return (pmap->pm_type != PT_EPT || !executable ||
2358 !pmap_allow_2m_x_ept);
2359 }
2360
2361 #ifdef NUMA
2362 static void
2363 pmap_init_pv_table(void)
2364 {
2365 struct pmap_large_md_page *pvd;
2366 vm_size_t s;
2367 long start, end, highest, pv_npg;
2368 int domain, i, j, pages;
2369
2370 /*
2371 * For correctness we depend on the size being evenly divisible into a
2372 * page. As a tradeoff between performance and total memory use, the
2373 * entry is 64 bytes (aka one cacheline) in size. Not being smaller
2374 	 * avoids false sharing, while not being 128 bytes may still incur
2375 	 * avoidable traffic from the adjacent-cacheline prefetcher.
2376 *
2377 * Assert the size so that accidental changes fail to compile.
2378 */
2379 CTASSERT((sizeof(*pvd) == 64));
2380
2381 /*
2382 * Calculate the size of the array.
2383 */
2384 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end;
2385 pv_npg = howmany(pmap_last_pa, NBPDR);
2386 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page);
2387 s = round_page(s);
2388 pv_table = (struct pmap_large_md_page *)kva_alloc(s);
2389 if (pv_table == NULL)
2390 panic("%s: kva_alloc failed\n", __func__);
2391
2392 /*
2393 * Iterate physical segments to allocate space for respective pages.
2394 */
2395 highest = -1;
2396 s = 0;
2397 for (i = 0; i < vm_phys_nsegs; i++) {
2398 end = vm_phys_segs[i].end / NBPDR;
2399 domain = vm_phys_segs[i].domain;
2400
2401 if (highest >= end)
2402 continue;
2403
2404 start = highest + 1;
2405 pvd = &pv_table[start];
2406
2407 pages = end - start + 1;
2408 s = round_page(pages * sizeof(*pvd));
2409 highest = start + (s / sizeof(*pvd)) - 1;
2410
2411 for (j = 0; j < s; j += PAGE_SIZE) {
2412 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0);
2413 if (m == NULL)
2414 panic("failed to allocate PV table page");
2415 pmap_qenter((vm_offset_t)pvd + j, &m, 1);
2416 }
2417
2418 for (j = 0; j < s / sizeof(*pvd); j++) {
2419 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
2420 TAILQ_INIT(&pvd->pv_page.pv_list);
2421 pvd->pv_page.pv_gen = 0;
2422 pvd->pv_page.pat_mode = 0;
2423 pvd->pv_invl_gen = 0;
2424 pvd++;
2425 }
2426 }
2427 pvd = &pv_dummy_large;
2428 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
2429 TAILQ_INIT(&pvd->pv_page.pv_list);
2430 pvd->pv_page.pv_gen = 0;
2431 pvd->pv_page.pat_mode = 0;
2432 pvd->pv_invl_gen = 0;
2433 }
2434 #else
2435 static void
2436 pmap_init_pv_table(void)
2437 {
2438 vm_size_t s;
2439 long i, pv_npg;
2440
2441 /*
2442 * Initialize the pool of pv list locks.
2443 */
2444 for (i = 0; i < NPV_LIST_LOCKS; i++)
2445 rw_init(&pv_list_locks[i], "pmap pv list");
2446
2447 /*
2448 * Calculate the size of the pv head table for superpages.
2449 */
2450 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
2451
2452 /*
2453 * Allocate memory for the pv head table for superpages.
2454 */
2455 s = (vm_size_t)pv_npg * sizeof(struct md_page);
2456 s = round_page(s);
2457 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
2458 for (i = 0; i < pv_npg; i++)
2459 TAILQ_INIT(&pv_table[i].pv_list);
2460 TAILQ_INIT(&pv_dummy.pv_list);
2461 }
2462 #endif
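/*
 * In both variants above, pv_table is indexed by 2MB superpage frame:
 * the entry for a physical address pa lives (conceptually) at
 *
 *	pv_table[pa >> PDRSHIFT]
 *
 * which is why its size is computed from howmany(last_pa, NBPDR).
 */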
2463
2464 /*
2465 * Initialize the pmap module.
2466 *
2467 * Called by vm_mem_init(), to initialize any structures that the pmap
2468 * system needs to map virtual memory.
2469 */
2470 void
2471 pmap_init(void)
2472 {
2473 struct pmap_preinit_mapping *ppim;
2474 vm_page_t m, mpte;
2475 int error, i, ret, skz63;
2476
2477 /* L1TF, reserve page @0 unconditionally */
2478 vm_page_blacklist_add(0, bootverbose);
2479
2480 /* Detect bare-metal Skylake Server and Skylake-X. */
2481 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
2482 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
2483 /*
2484 * Skylake-X errata SKZ63. Processor May Hang When
2485 * Executing Code In an HLE Transaction Region between
2486 * 40000000H and 403FFFFFH.
2487 *
2488 * Mark the pages in the range as preallocated. It
2489 * seems to be impossible to distinguish between
2490 * Skylake Server and Skylake X.
2491 */
2492 skz63 = 1;
2493 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
2494 if (skz63 != 0) {
2495 if (bootverbose)
2496 printf("SKZ63: skipping 4M RAM starting "
2497 "at physical 1G\n");
2498 for (i = 0; i < atop(0x400000); i++) {
2499 ret = vm_page_blacklist_add(0x40000000 +
2500 ptoa(i), FALSE);
2501 if (!ret && bootverbose)
2502 printf("page at %#lx already used\n",
2503 0x40000000 + ptoa(i));
2504 }
2505 }
2506 }
2507
2508 /* IFU */
2509 pmap_allow_2m_x_ept_recalculate();
2510
2511 /*
2512 * Initialize the vm page array entries for the kernel pmap's
2513 * page table pages.
2514 */
2515 PMAP_LOCK(kernel_pmap);
2516 for (i = 0; i < nkpt; i++) {
2517 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
2518 KASSERT(mpte >= vm_page_array &&
2519 mpte < &vm_page_array[vm_page_array_size],
2520 ("pmap_init: page table page is out of range"));
2521 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
2522 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
2523 mpte->ref_count = 1;
2524
2525 /*
2526 * Collect the page table pages that were replaced by a 2MB
2527 * page in create_pagetables(). They are zero filled.
2528 */
2529 if ((i == 0 ||
2530 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
2531 pmap_insert_pt_page(kernel_pmap, mpte, false, false))
2532 panic("pmap_init: pmap_insert_pt_page failed");
2533 }
2534 PMAP_UNLOCK(kernel_pmap);
2535 vm_wire_add(nkpt);
2536
2537 /*
2538 * If the kernel is running on a virtual machine, then it must assume
2539 * that MCA is enabled by the hypervisor. Moreover, the kernel must
2540 * be prepared for the hypervisor changing the vendor and family that
2541 * are reported by CPUID. Consequently, the workaround for AMD Family
2542 * 10h Erratum 383 is enabled if the processor's feature set does not
2543 * include at least one feature that is only supported by older Intel
2544 * or newer AMD processors.
2545 */
2546 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
2547 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
2548 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
2549 AMDID2_FMA4)) == 0)
2550 workaround_erratum383 = 1;
2551
2552 /*
2553 * Are large page mappings enabled?
2554 */
2555 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
2556 if (pg_ps_enabled) {
2557 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
2558 ("pmap_init: can't assign to pagesizes[1]"));
2559 pagesizes[1] = NBPDR;
2560 if ((amd_feature & AMDID_PAGE1GB) != 0) {
2561 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
2562 ("pmap_init: can't assign to pagesizes[2]"));
2563 pagesizes[2] = NBPDP;
2564 }
2565 }
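	/*
	 * After this, pagesizes[] advertises 4KB plus 2MB (NBPDR) and,
	 * when the CPU reports AMDID_PAGE1GB, 1GB (NBPDP) page sizes to
	 * the rest of the VM system.
	 */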
2566
2567 /*
2568 * Initialize pv chunk lists.
2569 */
2570 for (i = 0; i < PMAP_MEMDOM; i++) {
2571 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF);
2572 TAILQ_INIT(&pv_chunks[i].pvc_list);
2573 }
2574 pmap_init_pv_table();
2575
2576 pmap_initialized = 1;
2577 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
2578 ppim = pmap_preinit_mapping + i;
2579 if (ppim->va == 0)
2580 continue;
2581 /* Make the direct map consistent */
2582 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
2583 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
2584 ppim->sz, ppim->mode);
2585 }
2586 if (!bootverbose)
2587 continue;
2588 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
2589 ppim->pa, ppim->va, ppim->sz, ppim->mode);
2590 }
2591
2592 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
2593 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
2594 (vmem_addr_t *)&qframe);
2595 if (error != 0)
2596 panic("qframe allocation failed");
2597
2598 lm_ents = 8;
2599 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
2600 if (lm_ents > LMEPML4I - LMSPML4I + 1)
2601 lm_ents = LMEPML4I - LMSPML4I + 1;
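	/*
	 * The default of 8 slots reserves 8 * NBPML4 = 4TB of KVA for the
	 * large map; the clamp above keeps lm_ents within the
	 * LMSPML4I..LMEPML4I range provided by the address-space layout.
	 */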
2602 #ifdef KMSAN
2603 if (lm_ents > KMSANORIGPML4I - LMSPML4I) {
2604 printf(
2605 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n",
2606 lm_ents, KMSANORIGPML4I - LMSPML4I);
2607 lm_ents = KMSANORIGPML4I - LMSPML4I;
2608 }
2609 #endif
2610 if (bootverbose)
2611 printf("pmap: large map %u PML4 slots (%lu GB)\n",
2612 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
2613 if (lm_ents != 0) {
2614 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
2615 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
2616 if (large_vmem == NULL) {
2617 printf("pmap: cannot create large map\n");
2618 lm_ents = 0;
2619 }
2620 for (i = 0; i < lm_ents; i++) {
2621 m = pmap_large_map_getptp_unlocked();
2622 /* XXXKIB la57 */
2623 kernel_pml4[LMSPML4I + i] = X86_PG_V |
2624 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
2625 VM_PAGE_TO_PHYS(m);
2626 }
2627 }
2628 }
2629
2630 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
2631 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0,
2632 "Maximum number of PML4 entries for use by large map (tunable). "
2633 "Each entry corresponds to 512GB of address space.");
2634
2635 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
2636 "2MB page mapping counters");
2637
2638 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions);
2639 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions,
2640 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions");
2641
2642 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings);
2643 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
2644 &pmap_pde_mappings, "2MB page mappings");
2645
2646 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures);
2647 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
2648 &pmap_pde_p_failures, "2MB page promotion failures");
2649
2650 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions);
2651 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
2652 &pmap_pde_promotions, "2MB page promotions");
2653
2654 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
2655 "1GB page mapping counters");
2656
2657 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions);
2658 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
2659 &pmap_pdpe_demotions, "1GB page demotions");
2660
2661 /***************************************************
2662 * Low level helper routines.....
2663 ***************************************************/
2664
2665 static pt_entry_t
2666 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
2667 {
2668 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
2669
2670 switch (pmap->pm_type) {
2671 case PT_X86:
2672 case PT_RVI:
2673 /* Verify that both PAT bits are not set at the same time */
2674 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
2675 ("Invalid PAT bits in entry %#lx", entry));
2676
2677 /* Swap the PAT bits if one of them is set */
2678 if ((entry & x86_pat_bits) != 0)
2679 entry ^= x86_pat_bits;
2680 break;
2681 case PT_EPT:
2682 /*
2683 * Nothing to do - the memory attributes are represented
2684 * the same way for regular pages and superpages.
2685 */
2686 break;
2687 default:
2688 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
2689 }
2690
2691 return (entry);
2692 }
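/*
 * Worked example for pmap_swap_pat() above: x86_pat_bits contains exactly
 * the 4KB PAT bit (X86_PG_PTE_PAT) and the 2MB PAT bit (X86_PG_PDE_PAT).
 * For an entry with only X86_PG_PTE_PAT set, the XOR clears it and sets
 * X86_PG_PDE_PAT, yielding the equivalent encoding for the other page
 * size; entries with neither bit set pass through unchanged.
 */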
2693
2694 boolean_t
2695 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
2696 {
2697
2698 return (mode >= 0 && mode < PAT_INDEX_SIZE &&
2699 pat_index[(int)mode] >= 0);
2700 }
2701
2702 /*
2703 * Determine the appropriate bits to set in a PTE or PDE for a specified
2704 * caching mode.
2705 */
2706 int
2707 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
2708 {
2709 int cache_bits, pat_flag, pat_idx;
2710
2711 if (!pmap_is_valid_memattr(pmap, mode))
2712 panic("Unknown caching mode %d\n", mode);
2713
2714 switch (pmap->pm_type) {
2715 case PT_X86:
2716 case PT_RVI:
2717 /* The PAT bit is different for PTE's and PDE's. */
2718 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
2719
2720 /* Map the caching mode to a PAT index. */
2721 pat_idx = pat_index[mode];
2722
2723 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
2724 cache_bits = 0;
2725 if (pat_idx & 0x4)
2726 cache_bits |= pat_flag;
2727 if (pat_idx & 0x2)
2728 cache_bits |= PG_NC_PCD;
2729 if (pat_idx & 0x1)
2730 cache_bits |= PG_NC_PWT;
2731 break;
2732
2733 case PT_EPT:
2734 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
2735 break;
2736
2737 default:
2738 panic("unsupported pmap type %d", pmap->pm_type);
2739 }
2740
2741 return (cache_bits);
2742 }
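/*
 * Worked example for pmap_cache_bits() above (PT_X86): PAT_WRITE_COMBINING
 * maps to pat_index[] value 6 (binary 110, see pmap_init_pat()), so the
 * returned bits are the PAT flag (X86_PG_PTE_PAT or X86_PG_PDE_PAT,
 * depending on is_pde) plus PG_NC_PCD, with PG_NC_PWT left clear.
 */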
2743
2744 static int
2745 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
2746 {
2747 int mask;
2748
2749 switch (pmap->pm_type) {
2750 case PT_X86:
2751 case PT_RVI:
2752 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
2753 break;
2754 case PT_EPT:
2755 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
2756 break;
2757 default:
2758 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
2759 }
2760
2761 return (mask);
2762 }
2763
2764 static int
2765 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
2766 {
2767 int pat_flag, pat_idx;
2768
2769 pat_idx = 0;
2770 switch (pmap->pm_type) {
2771 case PT_X86:
2772 case PT_RVI:
2773 /* The PAT bit is different for PTE's and PDE's. */
2774 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
2775
2776 if ((pte & pat_flag) != 0)
2777 pat_idx |= 0x4;
2778 if ((pte & PG_NC_PCD) != 0)
2779 pat_idx |= 0x2;
2780 if ((pte & PG_NC_PWT) != 0)
2781 pat_idx |= 0x1;
2782 break;
2783 case PT_EPT:
2784 if ((pte & EPT_PG_IGNORE_PAT) != 0)
2785 panic("EPT PTE %#lx has no PAT memory type", pte);
2786 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
2787 break;
2788 }
2789
2790 /* See pmap_init_pat(). */
2791 if (pat_idx == 4)
2792 pat_idx = 0;
2793 if (pat_idx == 7)
2794 pat_idx = 3;
2795
2796 return (pat_idx);
2797 }
2798
2799 bool
2800 pmap_ps_enabled(pmap_t pmap)
2801 {
2802
2803 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
2804 }
2805
2806 static void
2807 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
2808 {
2809
2810 switch (pmap->pm_type) {
2811 case PT_X86:
2812 break;
2813 case PT_RVI:
2814 case PT_EPT:
2815 /*
2816 * XXX
2817 * This is a little bogus since the generation number is
2818 * supposed to be bumped up when a region of the address
2819 * space is invalidated in the page tables.
2820 *
2821 		 * In this case the old PDE entry is valid, but we still want
2822 * to make sure that any mappings using the old entry are
2823 * invalidated in the TLB.
2824 *
2825 * The reason this works as expected is because we rendezvous
2826 * "all" host cpus and force any vcpu context to exit as a
2827 * side-effect.
2828 */
2829 atomic_add_long(&pmap->pm_eptgen, 1);
2830 break;
2831 default:
2832 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
2833 }
2834 pde_store(pde, newpde);
2835 }
2836
2837 /*
2838 * After changing the page size for the specified virtual address in the page
2839 * table, flush the corresponding entries from the processor's TLB. Only the
2840 * calling processor's TLB is affected.
2841 *
2842 * The calling thread must be pinned to a processor.
2843 */
2844 static void
2845 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
2846 {
2847 pt_entry_t PG_G;
2848
2849 if (pmap_type_guest(pmap))
2850 return;
2851
2852 KASSERT(pmap->pm_type == PT_X86,
2853 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
2854
2855 PG_G = pmap_global_bit(pmap);
2856
2857 if ((newpde & PG_PS) == 0)
2858 /* Demotion: flush a specific 2MB page mapping. */
2859 pmap_invlpg(pmap, va);
2860 else if ((newpde & PG_G) == 0)
2861 /*
2862 * Promotion: flush every 4KB page mapping from the TLB
2863 * because there are too many to flush individually.
2864 */
2865 invltlb();
2866 else {
2867 /*
2868 * Promotion: flush every 4KB page mapping from the TLB,
2869 * including any global (PG_G) mappings.
2870 */
2871 invltlb_glob();
2872 }
2873 }
2874
2875 /*
2876 * The amd64 pmap uses different approaches to TLB invalidation
2877 * depending on the kernel configuration, available hardware features,
2878 * and known hardware errata. The kernel configuration option that
2879 * has the greatest operational impact on TLB invalidation is PTI,
2880 * which is enabled automatically on affected Intel CPUs. The most
2881 * impactful hardware features are first PCID, and then INVPCID
2882 * instruction presence. PCID usage is quite different for PTI
2883 * vs. non-PTI.
2884 *
2885 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate
2886 * the Meltdown bug in some Intel CPUs. Under PTI, each user address
2887 * space is served by two page tables, user and kernel. The user
2888 * page table only maps user space and a kernel trampoline. The
2889 * kernel trampoline includes the entirety of the kernel text but
2890 * only the kernel data that is needed to switch from user to kernel
2891 * mode. The kernel page table maps the user and kernel address
2892 * spaces in their entirety. It is identical to the per-process
2893 * page table used in non-PTI mode.
2894 *
2895 * User page tables are only used when the CPU is in user mode.
2896 * Consequently, some TLB invalidations can be postponed until the
2897 * switch from kernel to user mode. In contrast, the user
2898 * space part of the kernel page table is used for copyout(9), so
2899 * TLB invalidations on this page table cannot be similarly postponed.
2900 *
2901 * The existence of a user mode page table for the given pmap is
2902 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in
2903 * which case pm_ucr3 contains the %cr3 register value for the user
2904 * mode page table's root.
2905 *
2906 * * The pm_active bitmask indicates which CPUs currently have the
2907 * pmap active. A CPU's bit is set on context switch to the pmap, and
2908 * cleared on switching off this CPU. For the kernel page table,
2909 * the pm_active field is immutable and contains all CPUs. The
2910 * kernel page table is always logically active on every processor,
2911 * but not necessarily in use by the hardware, e.g., in PTI mode.
2912 *
2913 * When requesting invalidation of virtual addresses with
2914 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to
2915 * all CPUs recorded as active in pm_active. Updates to and reads
2916 * from pm_active are not synchronized, and so they may race with
2917 * each other. Shootdown handlers are prepared to handle the race.
2918 *
2919 * * PCID is an optional feature of the long mode x86 MMU where TLB
2920 * entries are tagged with the 'Process ID' of the address space
2921 * they belong to. This feature provides a limited namespace for
2922 * process identifiers, 12 bits, supporting 4095 simultaneous IDs
2923 * total.
2924 *
2925 * Allocation of a PCID to a pmap is done by an algorithm described
2926 * in section 15.12, "Other TLB Consistency Algorithms", of
2927 * Vahalia's book "Unix Internals". A PCID cannot be allocated for
2928 * the whole lifetime of a pmap in pmap_pinit() due to the limited
2929 * namespace. Instead, a per-CPU, per-pmap PCID is assigned when
2930 * the CPU is about to start caching TLB entries from a pmap,
2931 * i.e., on the context switch that activates the pmap on the CPU.
2932 *
2933 * The PCID allocator maintains a per-CPU, per-pmap generation
2934 * count, pm_gen, which is incremented each time a new PCID is
2935 * allocated. On TLB invalidation, the generation counters for the
2936 * pmap are zeroed, which signals the context switch code that the
2937 * previously allocated PCID is no longer valid. Effectively,
2938 * zeroing any of these counters triggers a TLB shootdown for the
2939 * given CPU/address space, due to the allocation of a new PCID.
2940 *
2941 * Zeroing can be performed remotely. Consequently, if a pmap is
2942 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can
2943 * be initiated by an ordinary memory access to reset the target
2944 * CPU's generation count within the pmap. The CPU initiating the
2945 * TLB shootdown does not need to send an IPI to the target CPU.
2946 *
2947 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs
2948 * for complete (kernel) page tables, and PCIDs for user mode page
2949 * tables. A user PCID value is obtained from the kernel PCID value
2950 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT).
2951 *
2952 * User space page tables are activated on return to user mode, by
2953 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests
2954 * clearing bit 63 of the loaded ucr3, this effectively causes
2955 * complete invalidation of the user mode TLB entries for the
2956  * current pmap, in which case local invalidations of individual
2957 * pages in the user page table are skipped.
2958 *
2959 * * Local invalidation, all modes. If the requested invalidation is
2960 * for a specific address or the total invalidation of a currently
2961 * active pmap, then the TLB is flushed using INVLPG for a kernel
2962  * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for
2963  * user space page tables.
2964 *
2965 * If the INVPCID instruction is available, it is used to flush user
2966 * entries from the kernel page table.
2967 *
2968 * When PCID is enabled, the INVLPG instruction invalidates all TLB
2969 * entries for the given page that either match the current PCID or
2970 * are global. Since TLB entries for the same page under different
2971 * PCIDs are unaffected, kernel pages which reside in all address
2972 * spaces could be problematic. We avoid the problem by creating
2973 * all kernel PTEs with the global flag (PG_G) set, when PTI is
2974 * disabled.
2975 *
2976 * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its
2977 * address space, all other 4095 PCIDs are used for user mode spaces
2978 * as described above. A context switch allocates a new PCID if
2979 * the recorded PCID is zero or the recorded generation does not match
2980 * the CPU's generation, effectively flushing the TLB for this address space.
2981 * Total remote invalidation is performed by zeroing pm_gen for all CPUs.
2982 * local user page: INVLPG
2983 * local kernel page: INVLPG
2984 * local user total: INVPCID(CTX)
2985 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
2986 * remote user page, inactive pmap: zero pm_gen
2987 * remote user page, active pmap: zero pm_gen + IPI:INVLPG
2988 * (Both actions are required to handle the aforementioned pm_active races.)
2989 * remote kernel page: IPI:INVLPG
2990 * remote user total, inactive pmap: zero pm_gen
2991 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or
2992 * reload %cr3)
2993 * (See note above about pm_active races.)
2994 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
2995 *
2996 * PTI enabled, PCID present.
2997 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3)
2998 * for upt
2999 * local kernel page: INVLPG
3000 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE
3001 * on loading UCR3 into %cr3 for upt
3002 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob()
3003 * remote user page, inactive pmap: zero pm_gen
3004 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt,
3005 * INVPCID(ADDR) for upt)
3006 * remote kernel page: IPI:INVLPG
3007 * remote user total, inactive pmap: zero pm_gen
3008 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt,
3009 * clear PCID_SAVE on loading UCR3 into $cr3 for upt)
3010 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob())
3011 *
3012 * No PCID.
3013 * local user page: INVLPG
3014 * local kernel page: INVLPG
3015 * local user total: reload %cr3
3016 * local kernel total: invltlb_glob()
3017 * remote user page, inactive pmap: -
3018 * remote user page, active pmap: IPI:INVLPG
3019 * remote kernel page: IPI:INVLPG
3020 * remote user total, inactive pmap: -
3021 * remote user total, active pmap: IPI:(reload %cr3)
3022 * remote kernel total: IPI:invltlb_glob()
3023 * Since on return to user mode, the reload of %cr3 with ucr3 causes
3024  * TLB invalidation, no specific action is required for the user page table.
3025 *
3026 * EPT. EPT pmaps do not map KVA, all mappings are userspace.
3027 * XXX TODO
3028 */
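/*
 * A simplified sketch of the per-CPU PCID generation check described
 * above, as seen by the context-switch path (the real logic lives in
 * pmap_activate_sw() and the PCID allocation helpers; the exact steps
 * here are only illustrative):
 *
 *	pcidp = zpcpu_get(pmap->pm_pcidp);
 *	if (pcidp->pm_gen == PCPU_GET(pcid_gen)) {
 *		reuse pcidp->pm_pcid and set CR3_PCID_SAVE, keeping
 *		cached TLB entries for this address space;
 *	} else {
 *		allocate a fresh PCID, record the current generation,
 *		and load %cr3 without CR3_PCID_SAVE, flushing this
 *		address space's stale TLB entries;
 *	}
 */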
3029
3030 #ifdef SMP
3031 /*
3032 * Interrupt the cpus that are executing in the guest context.
3033 * This will force the vcpu to exit and the cached EPT mappings
3034 * will be invalidated by the host before the next vmresume.
3035 */
3036 static __inline void
3037 pmap_invalidate_ept(pmap_t pmap)
3038 {
3039 smr_seq_t goal;
3040 int ipinum;
3041
3042 sched_pin();
3043 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
3044 ("pmap_invalidate_ept: absurd pm_active"));
3045
3046 /*
3047 * The TLB mappings associated with a vcpu context are not
3048 * flushed each time a different vcpu is chosen to execute.
3049 *
3050 * This is in contrast with a process's vtop mappings that
3051 * are flushed from the TLB on each context switch.
3052 *
3053 * Therefore we need to do more than just a TLB shootdown on
3054 * the active cpus in 'pmap->pm_active'. To do this we keep
3055 * track of the number of invalidations performed on this pmap.
3056 *
3057 * Each vcpu keeps a cache of this counter and compares it
3058 * just before a vmresume. If the counter is out-of-date an
3059 * invept will be done to flush stale mappings from the TLB.
3060 *
3061 * To ensure that all vCPU threads have observed the new counter
3062 * value before returning, we use SMR. Ordering is important here:
3063 * the VMM enters an SMR read section before loading the counter
3064 * and after updating the pm_active bit set. Thus, pm_active is
3065 * a superset of active readers, and any reader that has observed
3066 * the goal has observed the new counter value.
3067 */
3068 atomic_add_long(&pmap->pm_eptgen, 1);
3069
3070 goal = smr_advance(pmap->pm_eptsmr);
3071
3072 /*
3073 * Force the vcpu to exit and trap back into the hypervisor.
3074 */
3075 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
3076 ipi_selected(pmap->pm_active, ipinum);
3077 sched_unpin();
3078
3079 /*
3080 * Ensure that all active vCPUs will observe the new generation counter
3081 * value before executing any more guest instructions.
3082 */
3083 smr_wait(pmap->pm_eptsmr, goal);
3084 }
3085
3086 static inline void
3087 pmap_invalidate_preipi_pcid(pmap_t pmap)
3088 {
3089 struct pmap_pcid *pcidp;
3090 u_int cpuid, i;
3091
3092 sched_pin();
3093
3094 cpuid = PCPU_GET(cpuid);
3095 if (pmap != PCPU_GET(curpmap))
3096 cpuid = 0xffffffff; /* An impossible value */
3097
3098 CPU_FOREACH(i) {
3099 if (cpuid != i) {
3100 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i);
3101 pcidp->pm_gen = 0;
3102 }
3103 }
3104
3105 /*
3106 * The fence is between stores to pm_gen and the read of the
3107 * pm_active mask. We need to ensure that it is impossible
3108 * for us to miss the bit update in pm_active and
3109 * simultaneously observe a non-zero pm_gen in
3110 	 * pmap_activate_sw(); otherwise a TLB update is missed.
3111 * Without the fence, IA32 allows such an outcome. Note that
3112 * pm_active is updated by a locked operation, which provides
3113 * the reciprocal fence.
3114 */
3115 atomic_thread_fence_seq_cst();
3116 }
3117
3118 static void
3119 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
3120 {
3121 sched_pin();
3122 }
3123
3124 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t))
3125 {
3126 return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
3127 pmap_invalidate_preipi_nopcid);
3128 }
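/*
 * The resolver above runs once during boot, so calls to
 * pmap_invalidate_preipi() bind directly to the PCID or non-PCID variant
 * without a per-call branch.  The same pattern is used by the other
 * pmap_invalidate_*_cb ifuncs below.
 */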
3129
3130 static inline void
3131 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
3132 const bool invpcid_works1)
3133 {
3134 struct invpcid_descr d;
3135 uint64_t kcr3, ucr3;
3136 uint32_t pcid;
3137
3138 /*
3139 * Because pm_pcid is recalculated on a context switch, we
3140 * must ensure there is no preemption, not just pinning.
3141 * Otherwise, we might use a stale value below.
3142 */
3143 CRITICAL_ASSERT(curthread);
3144
3145 /*
3146 * No need to do anything with user page tables invalidation
3147 * if there is no user page table, or invalidation is deferred
3148 * until the return to userspace. ucr3_load_mask is stable
3149 * because we have preemption disabled.
3150 */
3151 if (pmap->pm_ucr3 == PMAP_NO_CR3 ||
3152 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
3153 return;
3154
3155 pcid = pmap_get_pcid(pmap);
3156 if (invpcid_works1) {
3157 d.pcid = pcid | PMAP_PCID_USER_PT;
3158 d.pad = 0;
3159 d.addr = va;
3160 invpcid(&d, INVPCID_ADDR);
3161 } else {
3162 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
3163 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
3164 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
3165 }
3166 }
3167
3168 static void
3169 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
3170 {
3171 pmap_invalidate_page_pcid_cb(pmap, va, true);
3172 }
3173
3174 static void
3175 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
3176 {
3177 pmap_invalidate_page_pcid_cb(pmap, va, false);
3178 }
3179
3180 static void
3181 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
3182 {
3183 }
3184
3185 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t))
3186 {
3187 if (pmap_pcid_enabled)
3188 return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
3189 pmap_invalidate_page_pcid_noinvpcid_cb);
3190 return (pmap_invalidate_page_nopcid_cb);
3191 }
3192
3193 static void
3194 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
3195 vm_offset_t addr2 __unused)
3196 {
3197 if (pmap == kernel_pmap) {
3198 pmap_invlpg(kernel_pmap, va);
3199 } else if (pmap == PCPU_GET(curpmap)) {
3200 invlpg(va);
3201 pmap_invalidate_page_cb(pmap, va);
3202 }
3203 }
3204
3205 void
3206 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
3207 {
3208 if (pmap_type_guest(pmap)) {
3209 pmap_invalidate_ept(pmap);
3210 return;
3211 }
3212
3213 KASSERT(pmap->pm_type == PT_X86,
3214 ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
3215
3216 pmap_invalidate_preipi(pmap);
3217 smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb);
3218 }
3219
3220 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
3221 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE)
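/*
 * With 4KB pages the threshold above is 4096 pages, i.e. 16MB of VA;
 * pmap_invalidate_range() below falls back to a full TLB flush for
 * ranges at least that large rather than issuing per-page INVLPGs.
 */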
3222
3223 static void
3224 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
3225 const bool invpcid_works1)
3226 {
3227 struct invpcid_descr d;
3228 uint64_t kcr3, ucr3;
3229 uint32_t pcid;
3230
3231 CRITICAL_ASSERT(curthread);
3232
3233 if (pmap != PCPU_GET(curpmap) ||
3234 pmap->pm_ucr3 == PMAP_NO_CR3 ||
3235 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK)
3236 return;
3237
3238 pcid = pmap_get_pcid(pmap);
3239 if (invpcid_works1) {
3240 d.pcid = pcid | PMAP_PCID_USER_PT;
3241 d.pad = 0;
3242 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
3243 invpcid(&d, INVPCID_ADDR);
3244 } else {
3245 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
3246 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
3247 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
3248 }
3249 }
3250
3251 static void
3252 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
3253 vm_offset_t eva)
3254 {
3255 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
3256 }
3257
3258 static void
3259 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
3260 vm_offset_t eva)
3261 {
3262 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
3263 }
3264
3265 static void
3266 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
3267 vm_offset_t eva __unused)
3268 {
3269 }
3270
3271 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
3272 vm_offset_t))
3273 {
3274 if (pmap_pcid_enabled)
3275 return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
3276 pmap_invalidate_range_pcid_noinvpcid_cb);
3277 return (pmap_invalidate_range_nopcid_cb);
3278 }
3279
3280 static void
3281 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3282 {
3283 vm_offset_t addr;
3284
3285 if (pmap == kernel_pmap) {
3286 if (PCPU_GET(pcid_invlpg_workaround)) {
3287 struct invpcid_descr d = { 0 };
3288
3289 invpcid(&d, INVPCID_CTXGLOB);
3290 } else {
3291 for (addr = sva; addr < eva; addr += PAGE_SIZE)
3292 invlpg(addr);
3293 }
3294 } else if (pmap == PCPU_GET(curpmap)) {
3295 for (addr = sva; addr < eva; addr += PAGE_SIZE)
3296 invlpg(addr);
3297 pmap_invalidate_range_cb(pmap, sva, eva);
3298 }
3299 }
3300
3301 void
3302 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3303 {
3304 if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
3305 pmap_invalidate_all(pmap);
3306 return;
3307 }
3308
3309 if (pmap_type_guest(pmap)) {
3310 pmap_invalidate_ept(pmap);
3311 return;
3312 }
3313
3314 KASSERT(pmap->pm_type == PT_X86,
3315 ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
3316
3317 pmap_invalidate_preipi(pmap);
3318 smp_masked_invlpg_range(sva, eva, pmap,
3319 pmap_invalidate_range_curcpu_cb);
3320 }
3321
3322 static inline void
3323 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
3324 {
3325 struct invpcid_descr d;
3326 uint64_t kcr3;
3327 uint32_t pcid;
3328
3329 if (pmap == kernel_pmap) {
3330 if (invpcid_works1) {
3331 bzero(&d, sizeof(d));
3332 invpcid(&d, INVPCID_CTXGLOB);
3333 } else {
3334 invltlb_glob();
3335 }
3336 } else if (pmap == PCPU_GET(curpmap)) {
3337 CRITICAL_ASSERT(curthread);
3338
3339 pcid = pmap_get_pcid(pmap);
3340 if (invpcid_works1) {
3341 d.pcid = pcid;
3342 d.pad = 0;
3343 d.addr = 0;
3344 invpcid(&d, INVPCID_CTX);
3345 } else {
3346 kcr3 = pmap->pm_cr3 | pcid;
3347 load_cr3(kcr3);
3348 }
3349 if (pmap->pm_ucr3 != PMAP_NO_CR3)
3350 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
3351 }
3352 }
3353
3354 static void
3355 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
3356 {
3357 pmap_invalidate_all_pcid_cb(pmap, true);
3358 }
3359
3360 static void
3361 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
3362 {
3363 pmap_invalidate_all_pcid_cb(pmap, false);
3364 }
3365
3366 static void
3367 pmap_invalidate_all_nopcid_cb(pmap_t pmap)
3368 {
3369 if (pmap == kernel_pmap)
3370 invltlb_glob();
3371 else if (pmap == PCPU_GET(curpmap))
3372 invltlb();
3373 }
3374
3375 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t))
3376 {
3377 if (pmap_pcid_enabled)
3378 return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
3379 pmap_invalidate_all_pcid_noinvpcid_cb);
3380 return (pmap_invalidate_all_nopcid_cb);
3381 }
3382
3383 static void
3384 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
3385 vm_offset_t addr2 __unused)
3386 {
3387 pmap_invalidate_all_cb(pmap);
3388 }
3389
3390 void
3391 pmap_invalidate_all(pmap_t pmap)
3392 {
3393 if (pmap_type_guest(pmap)) {
3394 pmap_invalidate_ept(pmap);
3395 return;
3396 }
3397
3398 KASSERT(pmap->pm_type == PT_X86,
3399 ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
3400
3401 pmap_invalidate_preipi(pmap);
3402 smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb);
3403 }
3404
3405 static void
3406 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
3407 vm_offset_t addr2 __unused)
3408 {
3409 wbinvd();
3410 }
3411
3412 void
3413 pmap_invalidate_cache(void)
3414 {
3415 sched_pin();
3416 smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
3417 }
3418
3419 struct pde_action {
3420 cpuset_t invalidate; /* processors that invalidate their TLB */
3421 pmap_t pmap;
3422 vm_offset_t va;
3423 pd_entry_t *pde;
3424 pd_entry_t newpde;
3425 u_int store; /* processor that updates the PDE */
3426 };
3427
3428 static void
3429 pmap_update_pde_action(void *arg)
3430 {
3431 struct pde_action *act = arg;
3432
3433 if (act->store == PCPU_GET(cpuid))
3434 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
3435 }
3436
3437 static void
3438 pmap_update_pde_teardown(void *arg)
3439 {
3440 struct pde_action *act = arg;
3441
3442 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
3443 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
3444 }
3445
3446 /*
3447 * Change the page size for the specified virtual address in a way that
3448 * prevents any possibility of the TLB ever having two entries that map the
3449 * same virtual address using different page sizes. This is the recommended
3450 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
3451 * machine check exception for a TLB state that is improperly diagnosed as a
3452 * hardware error.
3453 */
3454 static void
3455 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
3456 {
3457 struct pde_action act;
3458 cpuset_t active, other_cpus;
3459 u_int cpuid;
3460
3461 sched_pin();
3462 cpuid = PCPU_GET(cpuid);
3463 other_cpus = all_cpus;
3464 CPU_CLR(cpuid, &other_cpus);
3465 if (pmap == kernel_pmap || pmap_type_guest(pmap))
3466 active = all_cpus;
3467 else {
3468 active = pmap->pm_active;
3469 }
3470 if (CPU_OVERLAP(&active, &other_cpus)) {
3471 act.store = cpuid;
3472 act.invalidate = active;
3473 act.va = va;
3474 act.pmap = pmap;
3475 act.pde = pde;
3476 act.newpde = newpde;
3477 CPU_SET(cpuid, &active);
3478 smp_rendezvous_cpus(active,
3479 smp_no_rendezvous_barrier, pmap_update_pde_action,
3480 pmap_update_pde_teardown, &act);
3481 } else {
3482 pmap_update_pde_store(pmap, pde, newpde);
3483 if (CPU_ISSET(cpuid, &active))
3484 pmap_update_pde_invalidate(pmap, va, newpde);
3485 }
3486 sched_unpin();
3487 }
3488 #else /* !SMP */
3489 /*
3490 * Normal, non-SMP, invalidation functions.
3491 */
3492 void
3493 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
3494 {
3495 struct invpcid_descr d;
3496 struct pmap_pcid *pcidp;
3497 uint64_t kcr3, ucr3;
3498 uint32_t pcid;
3499
3500 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
3501 pmap->pm_eptgen++;
3502 return;
3503 }
3504 KASSERT(pmap->pm_type == PT_X86,
3505 	    ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
3506
3507 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
3508 invlpg(va);
3509 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
3510 pmap->pm_ucr3 != PMAP_NO_CR3) {
3511 critical_enter();
3512 pcid = pmap_get_pcid(pmap);
3513 if (invpcid_works) {
3514 d.pcid = pcid | PMAP_PCID_USER_PT;
3515 d.pad = 0;
3516 d.addr = va;
3517 invpcid(&d, INVPCID_ADDR);
3518 } else {
3519 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
3520 ucr3 = pmap->pm_ucr3 | pcid |
3521 PMAP_PCID_USER_PT | CR3_PCID_SAVE;
3522 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
3523 }
3524 critical_exit();
3525 }
3526 } else if (pmap_pcid_enabled) {
3527 pcidp = zpcpu_get(pmap->pm_pcidp);
3528 pcidp->pm_gen = 0;
3529 }
3530 }
3531
3532 void
3533 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3534 {
3535 struct invpcid_descr d;
3536 struct pmap_pcid *pcidp;
3537 vm_offset_t addr;
3538 uint64_t kcr3, ucr3;
3539 uint32_t pcid;
3540
3541 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
3542 pmap->pm_eptgen++;
3543 return;
3544 }
3545 KASSERT(pmap->pm_type == PT_X86,
3546 ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
3547
3548 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
3549 for (addr = sva; addr < eva; addr += PAGE_SIZE)
3550 invlpg(addr);
3551 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
3552 pmap->pm_ucr3 != PMAP_NO_CR3) {
3553 critical_enter();
3554 pcid = pmap_get_pcid(pmap);
3555 if (invpcid_works) {
3556 d.pcid = pcid | PMAP_PCID_USER_PT;
3557 d.pad = 0;
3558 d.addr = sva;
3559 for (; d.addr < eva; d.addr += PAGE_SIZE)
3560 invpcid(&d, INVPCID_ADDR);
3561 } else {
3562 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
3563 ucr3 = pmap->pm_ucr3 | pcid |
3564 PMAP_PCID_USER_PT | CR3_PCID_SAVE;
3565 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
3566 }
3567 critical_exit();
3568 }
3569 } else if (pmap_pcid_enabled) {
3570 pcidp = zpcpu_get(pmap->pm_pcidp);
3571 pcidp->pm_gen = 0;
3572 }
3573 }
3574
3575 void
3576 pmap_invalidate_all(pmap_t pmap)
3577 {
3578 struct invpcid_descr d;
3579 struct pmap_pcid *pcidp;
3580 uint64_t kcr3, ucr3;
3581 uint32_t pcid;
3582
3583 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
3584 pmap->pm_eptgen++;
3585 return;
3586 }
3587 KASSERT(pmap->pm_type == PT_X86,
3588 ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
3589
3590 if (pmap == kernel_pmap) {
3591 if (pmap_pcid_enabled && invpcid_works) {
3592 bzero(&d, sizeof(d));
3593 invpcid(&d, INVPCID_CTXGLOB);
3594 } else {
3595 invltlb_glob();
3596 }
3597 } else if (pmap == PCPU_GET(curpmap)) {
3598 if (pmap_pcid_enabled) {
3599 critical_enter();
3600 pcid = pmap_get_pcid(pmap);
3601 if (invpcid_works) {
3602 d.pcid = pcid;
3603 d.pad = 0;
3604 d.addr = 0;
3605 invpcid(&d, INVPCID_CTX);
3606 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
3607 d.pcid |= PMAP_PCID_USER_PT;
3608 invpcid(&d, INVPCID_CTX);
3609 }
3610 } else {
3611 kcr3 = pmap->pm_cr3 | pcid;
3612 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
3613 ucr3 = pmap->pm_ucr3 | pcid |
3614 PMAP_PCID_USER_PT;
3615 pmap_pti_pcid_invalidate(ucr3, kcr3);
3616 } else
3617 load_cr3(kcr3);
3618 }
3619 critical_exit();
3620 } else {
3621 invltlb();
3622 }
3623 } else if (pmap_pcid_enabled) {
3624 pcidp = zpcpu_get(pmap->pm_pcidp);
3625 pcidp->pm_gen = 0;
3626 }
3627 }
3628
3629 void
3630 pmap_invalidate_cache(void)
3631 {
3632
3633 wbinvd();
3634 }
3635
3636 static void
3637 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
3638 {
3639 struct pmap_pcid *pcidp;
3640
3641 pmap_update_pde_store(pmap, pde, newpde);
3642 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
3643 pmap_update_pde_invalidate(pmap, va, newpde);
3644 else {
3645 pcidp = zpcpu_get(pmap->pm_pcidp);
3646 pcidp->pm_gen = 0;
3647 }
3648 }
3649 #endif /* !SMP */
3650
3651 static void
3652 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
3653 {
3654
3655 /*
3656 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
3657 * by a promotion that did not invalidate the 512 4KB page mappings
3658 * that might exist in the TLB. Consequently, at this point, the TLB
3659 * may hold both 4KB and 2MB page mappings for the address range [va,
3660 * va + NBPDR). Therefore, the entire range must be invalidated here.
3661 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
3662 * 4KB page mappings for the address range [va, va + NBPDR), and so a
3663 * single INVLPG suffices to invalidate the 2MB page mapping from the
3664 * TLB.
3665 */
3666 if ((pde & PG_PROMOTED) != 0)
3667 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
3668 else
3669 pmap_invalidate_page(pmap, va);
3670 }
3671
3672 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
3673 (vm_offset_t sva, vm_offset_t eva))
3674 {
3675
3676 if ((cpu_feature & CPUID_SS) != 0)
3677 return (pmap_invalidate_cache_range_selfsnoop);
3678 if ((cpu_feature & CPUID_CLFSH) != 0)
3679 return (pmap_force_invalidate_cache_range);
3680 return (pmap_invalidate_cache_range_all);
3681 }
3682
3683 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
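
/*
 * Illustrative arithmetic: PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE is
 * 2 MB / 4 KB = 512 pages.  pmap_invalidate_cache_pages() below falls back
 * to a full WBINVD once a request reaches that many pages, rather than
 * flushing cache line by cache line.
 */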
3684
3685 static void
3686 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
3687 {
3688
3689 KASSERT((sva & PAGE_MASK) == 0,
3690 ("pmap_invalidate_cache_range: sva not page-aligned"));
3691 KASSERT((eva & PAGE_MASK) == 0,
3692 ("pmap_invalidate_cache_range: eva not page-aligned"));
3693 }
3694
3695 static void
3696 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
3697 {
3698
3699 pmap_invalidate_cache_range_check_align(sva, eva);
3700 }
3701
3702 void
3703 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
3704 {
3705
3706 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
3707
3708 /*
3709 * XXX: Some CPUs fault, hang, or trash the local APIC
3710 * registers if we use CLFLUSH on the local APIC range. The
3711 * local APIC is always uncached, so we don't need to flush
3712 * for that range anyway.
3713 */
3714 if (pmap_kextract(sva) == lapic_paddr)
3715 return;
3716
3717 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
3718 /*
3719 * Do per-cache line flush. Use a locked
3720 		 * instruction to ensure that previous stores are
3721 * included in the write-back. The processor
3722 * propagates flush to other processors in the cache
3723 * coherence domain.
3724 */
3725 atomic_thread_fence_seq_cst();
3726 for (; sva < eva; sva += cpu_clflush_line_size)
3727 clflushopt(sva);
3728 atomic_thread_fence_seq_cst();
3729 } else {
3730 /*
3731 * Writes are ordered by CLFLUSH on Intel CPUs.
3732 */
3733 if (cpu_vendor_id != CPU_VENDOR_INTEL)
3734 mfence();
3735 for (; sva < eva; sva += cpu_clflush_line_size)
3736 clflush(sva);
3737 if (cpu_vendor_id != CPU_VENDOR_INTEL)
3738 mfence();
3739 }
3740 }
3741
3742 static void
3743 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
3744 {
3745
3746 pmap_invalidate_cache_range_check_align(sva, eva);
3747 pmap_invalidate_cache();
3748 }
3749
3750 /*
3751 * Remove the specified set of pages from the data and instruction caches.
3752 *
3753 * In contrast to pmap_invalidate_cache_range(), this function does not
3754 * rely on the CPU's self-snoop feature, because it is intended for use
3755 * when moving pages into a different cache domain.
3756 */
3757 void
3758 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
3759 {
3760 vm_offset_t daddr, eva;
3761 int i;
3762 bool useclflushopt;
3763
3764 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
3765 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
3766 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
3767 pmap_invalidate_cache();
3768 else {
3769 if (useclflushopt)
3770 atomic_thread_fence_seq_cst();
3771 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
3772 mfence();
3773 for (i = 0; i < count; i++) {
3774 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
3775 eva = daddr + PAGE_SIZE;
3776 for (; daddr < eva; daddr += cpu_clflush_line_size) {
3777 if (useclflushopt)
3778 clflushopt(daddr);
3779 else
3780 clflush(daddr);
3781 }
3782 }
3783 if (useclflushopt)
3784 atomic_thread_fence_seq_cst();
3785 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
3786 mfence();
3787 }
3788 }
3789
3790 void
3791 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
3792 {
3793
3794 pmap_invalidate_cache_range_check_align(sva, eva);
3795
3796 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
3797 pmap_force_invalidate_cache_range(sva, eva);
3798 return;
3799 }
3800
3801 /* See comment in pmap_force_invalidate_cache_range(). */
3802 if (pmap_kextract(sva) == lapic_paddr)
3803 return;
3804
3805 atomic_thread_fence_seq_cst();
3806 for (; sva < eva; sva += cpu_clflush_line_size)
3807 clwb(sva);
3808 atomic_thread_fence_seq_cst();
3809 }
3810
3811 void
3812 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
3813 {
3814 pt_entry_t *pte;
3815 vm_offset_t vaddr;
3816 int error __diagused;
3817 int pte_bits;
3818
3819 KASSERT((spa & PAGE_MASK) == 0,
3820 ("pmap_flush_cache_phys_range: spa not page-aligned"));
3821 KASSERT((epa & PAGE_MASK) == 0,
3822 ("pmap_flush_cache_phys_range: epa not page-aligned"));
3823
3824 if (spa < dmaplimit) {
3825 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
3826 dmaplimit, epa)));
3827 if (dmaplimit >= epa)
3828 return;
3829 spa = dmaplimit;
3830 }
3831
3832 pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
3833 X86_PG_V;
3834 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3835 &vaddr);
3836 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
3837 pte = vtopte(vaddr);
3838 for (; spa < epa; spa += PAGE_SIZE) {
3839 sched_pin();
3840 pte_store(pte, spa | pte_bits);
3841 pmap_invlpg(kernel_pmap, vaddr);
3842 		/* XXXKIB: the atomics inside pmap_flush_cache_range() are excessive */
3843 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
3844 sched_unpin();
3845 }
3846 vmem_free(kernel_arena, vaddr, PAGE_SIZE);
3847 }
3848
3849 /*
3850 * Routine: pmap_extract
3851 * Function:
3852 * Extract the physical page address associated
3853 * with the given map/virtual_address pair.
3854 */
3855 vm_paddr_t
3856 pmap_extract(pmap_t pmap, vm_offset_t va)
3857 {
3858 pdp_entry_t *pdpe;
3859 pd_entry_t *pde;
3860 pt_entry_t *pte, PG_V;
3861 vm_paddr_t pa;
3862
3863 pa = 0;
3864 PG_V = pmap_valid_bit(pmap);
3865 PMAP_LOCK(pmap);
3866 pdpe = pmap_pdpe(pmap, va);
3867 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
3868 if ((*pdpe & PG_PS) != 0)
3869 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
3870 else {
3871 pde = pmap_pdpe_to_pde(pdpe, va);
3872 if ((*pde & PG_V) != 0) {
3873 if ((*pde & PG_PS) != 0) {
3874 pa = (*pde & PG_PS_FRAME) |
3875 (va & PDRMASK);
3876 } else {
3877 pte = pmap_pde_to_pte(pde, va);
3878 pa = (*pte & PG_FRAME) |
3879 (va & PAGE_MASK);
3880 }
3881 }
3882 }
3883 }
3884 PMAP_UNLOCK(pmap);
3885 return (pa);
3886 }
3887
3888 /*
3889 * Routine: pmap_extract_and_hold
3890 * Function:
3891 * Atomically extract and hold the physical page
3892 * with the given pmap and virtual address pair
3893 * if that mapping permits the given protection.
3894 */
3895 vm_page_t
3896 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3897 {
3898 pdp_entry_t pdpe, *pdpep;
3899 pd_entry_t pde, *pdep;
3900 pt_entry_t pte, PG_RW, PG_V;
3901 vm_page_t m;
3902
3903 m = NULL;
3904 PG_RW = pmap_rw_bit(pmap);
3905 PG_V = pmap_valid_bit(pmap);
3906 PMAP_LOCK(pmap);
3907
3908 pdpep = pmap_pdpe(pmap, va);
3909 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0)
3910 goto out;
3911 if ((pdpe & PG_PS) != 0) {
3912 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
3913 goto out;
3914 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK));
3915 goto check_page;
3916 }
3917
3918 pdep = pmap_pdpe_to_pde(pdpep, va);
3919 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0)
3920 goto out;
3921 if ((pde & PG_PS) != 0) {
3922 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)
3923 goto out;
3924 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK));
3925 goto check_page;
3926 }
3927
3928 pte = *pmap_pde_to_pte(pdep, va);
3929 if ((pte & PG_V) == 0 ||
3930 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0))
3931 goto out;
3932 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
3933
3934 check_page:
3935 if (m != NULL && !vm_page_wire_mapped(m))
3936 m = NULL;
3937 out:
3938 PMAP_UNLOCK(pmap);
3939 return (m);
3940 }
3941
3942 vm_paddr_t
3943 pmap_kextract(vm_offset_t va)
3944 {
3945 pd_entry_t pde;
3946 vm_paddr_t pa;
3947
3948 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
3949 pa = DMAP_TO_PHYS(va);
3950 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
3951 pa = pmap_large_map_kextract(va);
3952 } else {
3953 pde = *vtopde(va);
3954 if (pde & PG_PS) {
3955 pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
3956 } else {
3957 /*
3958 * Beware of a concurrent promotion that changes the
3959 * PDE at this point! For example, vtopte() must not
3960 * be used to access the PTE because it would use the
3961 * new PDE. It is, however, safe to use the old PDE
3962 * because the page table page is preserved by the
3963 * promotion.
3964 */
3965 pa = *pmap_pde_to_pte(&pde, va);
3966 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
3967 }
3968 }
3969 return (pa);
3970 }
3971
3972 /***************************************************
3973 * Low level mapping routines.....
3974 ***************************************************/
3975
3976 /*
3977 * Add a wired page to the kva.
3978 * Note: not SMP coherent.
3979 */
3980 void
3981 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
3982 {
3983 pt_entry_t *pte;
3984
3985 pte = vtopte(va);
3986 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M |
3987 X86_PG_RW | X86_PG_V);
3988 }
3989
3990 static __inline void
3991 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
3992 {
3993 pt_entry_t *pte;
3994 int cache_bits;
3995
3996 pte = vtopte(va);
3997 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
3998 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M |
3999 X86_PG_RW | X86_PG_V | cache_bits);
4000 }
4001
4002 /*
4003 * Remove a page from the kernel pagetables.
4004 * Note: not SMP coherent.
4005 */
4006 void
4007 pmap_kremove(vm_offset_t va)
4008 {
4009 pt_entry_t *pte;
4010
4011 pte = vtopte(va);
4012 pte_clear(pte);
4013 }
4014
4015 /*
4016 * Used to map a range of physical addresses into kernel
4017 * virtual address space.
4018 *
4019 * The value passed in '*virt' is a suggested virtual address for
4020 * the mapping. Architectures which can support a direct-mapped
4021 * physical to virtual region can return the appropriate address
4022 * within that region, leaving '*virt' unchanged. Other
4023 * architectures should map the pages starting at '*virt' and
4024 * update '*virt' with the first usable address after the mapped
4025 * region.
4026 */
4027 vm_offset_t
4028 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
4029 {
4030 return PHYS_TO_DMAP(start);
4031 }
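
/*
 * For example, a hypothetical caller passing start = 0x100000 and
 * end = 0x200000 simply gets PHYS_TO_DMAP(0x100000) back and "*virt" is
 * left untouched, since amd64 can service the request entirely from the
 * direct map.
 */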
4032
4033 /*
4034  * Add a list of wired pages to the kva.
4035  * This routine is only used for temporary
4036 * kernel mappings that do not need to have
4037 * page modification or references recorded.
4038 * Note that old mappings are simply written
4039 * over. The page *must* be wired.
4040 * Note: SMP coherent. Uses a ranged shootdown IPI.
4041 */
4042 void
4043 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
4044 {
4045 pt_entry_t *endpte, oldpte, pa, *pte;
4046 vm_page_t m;
4047 int cache_bits;
4048
4049 oldpte = 0;
4050 pte = vtopte(sva);
4051 endpte = pte + count;
4052 while (pte < endpte) {
4053 m = *ma++;
4054 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
4055 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
4056 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
4057 oldpte |= *pte;
4058 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A |
4059 X86_PG_M | X86_PG_RW | X86_PG_V);
4060 }
4061 pte++;
4062 }
4063 if (__predict_false((oldpte & X86_PG_V) != 0))
4064 pmap_invalidate_range(kernel_pmap, sva, sva + count *
4065 PAGE_SIZE);
4066 }
4067
4068 /*
4069 * This routine tears out page mappings from the
4070 * kernel -- it is meant only for temporary mappings.
4071 * Note: SMP coherent. Uses a ranged shootdown IPI.
4072 */
4073 void
4074 pmap_qremove(vm_offset_t sva, int count)
4075 {
4076 vm_offset_t va;
4077
4078 va = sva;
4079 while (count-- > 0) {
4080 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
4081 pmap_kremove(va);
4082 va += PAGE_SIZE;
4083 }
4084 pmap_invalidate_range(kernel_pmap, sva, va);
4085 }
4086
4087 /***************************************************
4088 * Page table page management routines.....
4089 ***************************************************/
4090 /*
4091 * Schedule the specified unused page table page to be freed. Specifically,
4092 * add the page to the specified list of pages that will be released to the
4093 * physical memory manager after the TLB has been updated.
4094 */
4095 static __inline void
4096 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
4097 boolean_t set_PG_ZERO)
4098 {
4099
4100 if (set_PG_ZERO)
4101 m->flags |= PG_ZERO;
4102 else
4103 m->flags &= ~PG_ZERO;
4104 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
4105 }
4106
4107 /*
4108 * Inserts the specified page table page into the specified pmap's collection
4109 * of idle page table pages. Each of a pmap's page table pages is responsible
4110 * for mapping a distinct range of virtual addresses. The pmap's collection is
4111 * ordered by this virtual address range.
4112 *
4113 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4114 * "mpte"'s valid field will be set to 0.
4115 *
4116 * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must
4117 * contain valid mappings with identical attributes except for PG_A; "mpte"'s
4118 * valid field will be set to 1.
4119 *
4120 * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain
4121 * valid mappings with identical attributes including PG_A; "mpte"'s valid
4122 * field will be set to VM_PAGE_BITS_ALL.
4123 */
4124 static __inline int
4125 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4126 bool allpte_PG_A_set)
4127 {
4128
4129 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4130 KASSERT(promoted || !allpte_PG_A_set,
4131 ("a zero-filled PTP can't have PG_A set in every PTE"));
4132 mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0;
4133 return (vm_radix_insert(&pmap->pm_root, mpte));
4134 }
4135
4136 /*
4137 * Removes the page table page mapping the specified virtual address from the
4138 * specified pmap's collection of idle page table pages, and returns it.
4139 * Otherwise, returns NULL if there is no page table page corresponding to the
4140 * specified virtual address.
4141 */
4142 static __inline vm_page_t
4143 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4144 {
4145
4146 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4147 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
4148 }
4149
4150 /*
4151 * Decrements a page table page's reference count, which is used to record the
4152 * number of valid page table entries within the page. If the reference count
4153 * drops to zero, then the page table page is unmapped. Returns TRUE if the
4154 * page table page was unmapped and FALSE otherwise.
4155 */
4156 static inline boolean_t
4157 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4158 {
4159
4160 --m->ref_count;
4161 if (m->ref_count == 0) {
4162 _pmap_unwire_ptp(pmap, va, m, free);
4163 return (TRUE);
4164 } else
4165 return (FALSE);
4166 }
4167
4168 static void
4169 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4170 {
4171 pml5_entry_t *pml5;
4172 pml4_entry_t *pml4;
4173 pdp_entry_t *pdp;
4174 pd_entry_t *pd;
4175 vm_page_t pdpg, pdppg, pml4pg;
4176
4177 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4178
4179 /*
4180 * unmap the page table page
4181 */
4182 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
4183 /* PML4 page */
4184 MPASS(pmap_is_la57(pmap));
4185 pml5 = pmap_pml5e(pmap, va);
4186 *pml5 = 0;
4187 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
4188 pml5 = pmap_pml5e_u(pmap, va);
4189 *pml5 = 0;
4190 }
4191 } else if (m->pindex >= NUPDE + NUPDPE) {
4192 /* PDP page */
4193 pml4 = pmap_pml4e(pmap, va);
4194 *pml4 = 0;
4195 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
4196 va <= VM_MAXUSER_ADDRESS) {
4197 pml4 = pmap_pml4e_u(pmap, va);
4198 *pml4 = 0;
4199 }
4200 } else if (m->pindex >= NUPDE) {
4201 /* PD page */
4202 pdp = pmap_pdpe(pmap, va);
4203 *pdp = 0;
4204 } else {
4205 /* PTE page */
4206 pd = pmap_pde(pmap, va);
4207 *pd = 0;
4208 }
4209 if (m->pindex < NUPDE) {
4210 /* We just released a PT, unhold the matching PD */
4211 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
4212 pmap_unwire_ptp(pmap, va, pdpg, free);
4213 } else if (m->pindex < NUPDE + NUPDPE) {
4214 /* We just released a PD, unhold the matching PDP */
4215 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
4216 pmap_unwire_ptp(pmap, va, pdppg, free);
4217 } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
4218 /* We just released a PDP, unhold the matching PML4 */
4219 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
4220 pmap_unwire_ptp(pmap, va, pml4pg, free);
4221 }
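	/*
	 * Note that the pmap_unwire_ptp() calls above can recurse back
	 * into this function: dropping the last reference on a parent
	 * page frees it in turn, so a whole chain of now-empty paging
	 * structures may be torn down by a single call.
	 */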
4222
4223 pmap_pt_page_count_adj(pmap, -1);
4224
4225 /*
4226 * Put page on a list so that it is released after
4227 * *ALL* TLB shootdown is done
4228 */
4229 pmap_add_delayed_free_list(m, free, TRUE);
4230 }
4231
4232 /*
4233 * After removing a page table entry, this routine is used to
4234 * conditionally free the page, and manage the reference count.
4235 */
4236 static int
4237 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
4238 struct spglist *free)
4239 {
4240 vm_page_t mpte;
4241
4242 if (va >= VM_MAXUSER_ADDRESS)
4243 return (0);
4244 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
4245 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
4246 return (pmap_unwire_ptp(pmap, va, mpte, free));
4247 }
4248
4249 /*
4250 * Release a page table page reference after a failed attempt to create a
4251 * mapping.
4252 */
4253 static void
4254 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
4255 {
4256 struct spglist free;
4257
4258 SLIST_INIT(&free);
4259 if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
4260 /*
4261 * Although "va" was never mapped, paging-structure caches
4262 * could nonetheless have entries that refer to the freed
4263 * page table pages. Invalidate those entries.
4264 */
4265 pmap_invalidate_page(pmap, va);
4266 vm_page_free_pages_toq(&free, true);
4267 }
4268 }
4269
4270 static void
4271 pmap_pinit_pcids(pmap_t pmap, uint32_t pcid, int gen)
4272 {
4273 struct pmap_pcid *pcidp;
4274 int i;
4275
4276 CPU_FOREACH(i) {
4277 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i);
4278 pcidp->pm_pcid = pcid;
4279 pcidp->pm_gen = gen;
4280 }
4281 }
4282
4283 void
4284 pmap_pinit0(pmap_t pmap)
4285 {
4286 struct proc *p;
4287 struct thread *td;
4288
4289 PMAP_LOCK_INIT(pmap);
4290 pmap->pm_pmltop = kernel_pmap->pm_pmltop;
4291 pmap->pm_pmltopu = NULL;
4292 pmap->pm_cr3 = kernel_pmap->pm_cr3;
4293 /* hack to keep pmap_pti_pcid_invalidate() alive */
4294 pmap->pm_ucr3 = PMAP_NO_CR3;
4295 vm_radix_init(&pmap->pm_root);
4296 CPU_ZERO(&pmap->pm_active);
4297 TAILQ_INIT(&pmap->pm_pvchunk);
4298 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4299 pmap->pm_flags = pmap_flags;
4300 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK);
4301 pmap_pinit_pcids(pmap, PMAP_PCID_KERN + 1, 1);
4302 pmap_activate_boot(pmap);
4303 td = curthread;
4304 if (pti) {
4305 p = td->td_proc;
4306 PROC_LOCK(p);
4307 p->p_md.md_flags |= P_MD_KPTI;
4308 PROC_UNLOCK(p);
4309 }
4310 pmap_thread_init_invl_gen(td);
4311
4312 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
4313 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
4314 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
4315 UMA_ALIGN_PTR, 0);
4316 }
4317 }
4318
4319 void
4320 pmap_pinit_pml4(vm_page_t pml4pg)
4321 {
4322 pml4_entry_t *pm_pml4;
4323 int i;
4324
4325 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
4326
4327 /* Wire in kernel global address entries. */
4328 for (i = 0; i < NKPML4E; i++) {
4329 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
4330 X86_PG_V;
4331 }
4332 #ifdef KASAN
4333 for (i = 0; i < NKASANPML4E; i++) {
4334 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW |
4335 X86_PG_V | pg_nx;
4336 }
4337 #endif
4338 #ifdef KMSAN
4339 for (i = 0; i < NKMSANSHADPML4E; i++) {
4340 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) |
4341 X86_PG_RW | X86_PG_V | pg_nx;
4342 }
4343 for (i = 0; i < NKMSANORIGPML4E; i++) {
4344 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) |
4345 X86_PG_RW | X86_PG_V | pg_nx;
4346 }
4347 #endif
4348 for (i = 0; i < ndmpdpphys; i++) {
4349 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
4350 X86_PG_V;
4351 }
4352
4353 /* install self-referential address mapping entry(s) */
4354 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
4355 X86_PG_A | X86_PG_M;
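
	/*
	 * The recursive slot installed above maps the page-table pages
	 * themselves into KVA, which is what lets helpers such as
	 * vtopte() and vtopde() locate any PTE or PDE with simple
	 * address arithmetic instead of walking the tree.
	 */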
4356
4357 /* install large map entries if configured */
4358 for (i = 0; i < lm_ents; i++)
4359 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i];
4360 }
4361
4362 void
4363 pmap_pinit_pml5(vm_page_t pml5pg)
4364 {
4365 pml5_entry_t *pm_pml5;
4366
4367 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
4368
4369 /*
4370 * Add pml5 entry at top of KVA pointing to existing pml4 table,
4371 * entering all existing kernel mappings into level 5 table.
4372 */
4373 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
4374 X86_PG_RW | X86_PG_A | X86_PG_M;
4375
4376 /*
4377 * Install self-referential address mapping entry.
4378 */
4379 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
4380 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A;
4381 }
4382
4383 static void
4384 pmap_pinit_pml4_pti(vm_page_t pml4pgu)
4385 {
4386 pml4_entry_t *pm_pml4u;
4387 int i;
4388
4389 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu));
4390 for (i = 0; i < NPML4EPG; i++)
4391 pm_pml4u[i] = pti_pml4[i];
4392 }
4393
4394 static void
4395 pmap_pinit_pml5_pti(vm_page_t pml5pgu)
4396 {
4397 pml5_entry_t *pm_pml5u;
4398
4399 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu));
4400 pagezero(pm_pml5u);
4401
4402 /*
4403 * Add pml5 entry at top of KVA pointing to existing pml4 pti
4404 * table, entering all kernel mappings needed for usermode
4405 * into level 5 table.
4406 */
4407 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] =
4408 pmap_kextract((vm_offset_t)pti_pml4) |
4409 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
4410 }
4411
4412 /* Allocate a page table page and do related bookkeeping */
4413 static vm_page_t
4414 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags)
4415 {
4416 vm_page_t m;
4417
4418 m = vm_page_alloc_noobj(flags);
4419 if (__predict_false(m == NULL))
4420 return (NULL);
4421 m->pindex = pindex;
4422 pmap_pt_page_count_adj(pmap, 1);
4423 return (m);
4424 }
4425
4426 static void
4427 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled)
4428 {
4429 /*
4430 * This function assumes the page will need to be unwired,
4431 * even though the counterpart allocation in pmap_alloc_pt_page()
4432 * doesn't enforce VM_ALLOC_WIRED. However, all current uses
4433 * of pmap_free_pt_page() require unwiring. The case in which
4434 * a PT page doesn't require unwiring because its ref_count has
4435 * naturally reached 0 is handled through _pmap_unwire_ptp().
4436 */
4437 vm_page_unwire_noq(m);
4438 if (zerofilled)
4439 vm_page_free_zero(m);
4440 else
4441 vm_page_free(m);
4442
4443 pmap_pt_page_count_adj(pmap, -1);
4444 }
4445
4446 _Static_assert(sizeof(struct pmap_pcid) == 8, "Fix pcpu zone for pm_pcidp");
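
/*
 * The assertion above ties the layout of struct pmap_pcid to the
 * pcpu_zone_8 allocations used for pm_pcidp in pmap_pinit0() and
 * pmap_pinit_type(): if the structure ever grows past 8 bytes, a
 * differently sized per-CPU zone must be used instead.
 */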
4447
4448 /*
4449 * Initialize a preallocated and zeroed pmap structure,
4450 * such as one in a vmspace structure.
4451 */
4452 int
4453 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
4454 {
4455 vm_page_t pmltop_pg, pmltop_pgu;
4456 vm_paddr_t pmltop_phys;
4457
4458 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4459
4460 /*
4461 * Allocate the page directory page. Pass NULL instead of a
4462 * pointer to the pmap here to avoid calling
4463 * pmap_resident_count_adj() through pmap_pt_page_count_adj(),
4464 * since that requires pmap lock. Instead do the accounting
4465 * manually.
4466 *
4467 	 * Note that the optimization in the final pmap_remove() call,
4468 	 * which checks for a zero resident_count, is effectively
4469 	 * disabled by accounting for the top-level page.  That
4470 	 * optimization has not been effective anyway since we started
4471 	 * using a non-managed mapping of the shared page.
4472 */
4473 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO |
4474 VM_ALLOC_WAITOK);
4475 pmap_pt_page_count_pinit(pmap, 1);
4476
4477 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
4478 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
4479
4480 if (pmap_pcid_enabled) {
4481 if (pmap->pm_pcidp == NULL)
4482 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8,
4483 M_WAITOK);
4484 pmap_pinit_pcids(pmap, PMAP_PCID_NONE, 0);
4485 }
4486 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */
4487 pmap->pm_ucr3 = PMAP_NO_CR3;
4488 pmap->pm_pmltopu = NULL;
4489
4490 pmap->pm_type = pm_type;
4491
4492 /*
4493 * Do not install the host kernel mappings in the nested page
4494 * tables. These mappings are meaningless in the guest physical
4495 * address space.
4496 * Install minimal kernel mappings in PTI case.
4497 */
4498 switch (pm_type) {
4499 case PT_X86:
4500 pmap->pm_cr3 = pmltop_phys;
4501 if (pmap_is_la57(pmap))
4502 pmap_pinit_pml5(pmltop_pg);
4503 else
4504 pmap_pinit_pml4(pmltop_pg);
4505 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
4506 /*
4507 * As with pmltop_pg, pass NULL instead of a
4508 * pointer to the pmap to ensure that the PTI
4509 			 * page is counted explicitly.
4510 */
4511 pmltop_pgu = pmap_alloc_pt_page(NULL, 0,
4512 VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
4513 pmap_pt_page_count_pinit(pmap, 1);
4514 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
4515 VM_PAGE_TO_PHYS(pmltop_pgu));
4516 if (pmap_is_la57(pmap))
4517 pmap_pinit_pml5_pti(pmltop_pgu);
4518 else
4519 pmap_pinit_pml4_pti(pmltop_pgu);
4520 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu);
4521 }
4522 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
4523 rangeset_init(&pmap->pm_pkru, pkru_dup_range,
4524 pkru_free_range, pmap, M_NOWAIT);
4525 }
4526 break;
4527 case PT_EPT:
4528 case PT_RVI:
4529 pmap->pm_eptsmr = smr_create("pmap", 0, 0);
4530 break;
4531 }
4532
4533 vm_radix_init(&pmap->pm_root);
4534 CPU_ZERO(&pmap->pm_active);
4535 TAILQ_INIT(&pmap->pm_pvchunk);
4536 pmap->pm_flags = flags;
4537 pmap->pm_eptgen = 0;
4538
4539 return (1);
4540 }
4541
4542 int
4543 pmap_pinit(pmap_t pmap)
4544 {
4545
4546 return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
4547 }
4548
4549 static void
4550 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte)
4551 {
4552 vm_page_t mpg;
4553 struct spglist free;
4554
4555 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
4556 if (mpg->ref_count != 0)
4557 return;
4558 SLIST_INIT(&free);
4559 _pmap_unwire_ptp(pmap, va, mpg, &free);
4560 pmap_invalidate_page(pmap, va);
4561 vm_page_free_pages_toq(&free, true);
4562 }
4563
4564 static pml4_entry_t *
4565 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
4566 bool addref)
4567 {
4568 vm_pindex_t pml5index;
4569 pml5_entry_t *pml5;
4570 pml4_entry_t *pml4;
4571 vm_page_t pml4pg;
4572 pt_entry_t PG_V;
4573 bool allocated;
4574
4575 if (!pmap_is_la57(pmap))
4576 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);
4577
4578 PG_V = pmap_valid_bit(pmap);
4579 pml5index = pmap_pml5e_index(va);
4580 pml5 = &pmap->pm_pmltop[pml5index];
4581 if ((*pml5 & PG_V) == 0) {
4582 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp,
4583 va) == NULL)
4584 return (NULL);
4585 allocated = true;
4586 } else {
4587 allocated = false;
4588 }
4589 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME);
4590 pml4 = &pml4[pmap_pml4e_index(va)];
4591 if ((*pml4 & PG_V) == 0) {
4592 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME);
4593 if (allocated && !addref)
4594 pml4pg->ref_count--;
4595 else if (!allocated && addref)
4596 pml4pg->ref_count++;
4597 }
4598 return (pml4);
4599 }
4600
4601 static pdp_entry_t *
4602 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
4603 bool addref)
4604 {
4605 vm_page_t pdppg;
4606 pml4_entry_t *pml4;
4607 pdp_entry_t *pdp;
4608 pt_entry_t PG_V;
4609 bool allocated;
4610
4611 PG_V = pmap_valid_bit(pmap);
4612
4613 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false);
4614 if (pml4 == NULL)
4615 return (NULL);
4616
4617 if ((*pml4 & PG_V) == 0) {
4618 /* Have to allocate a new pdp, recurse */
4619 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp,
4620 va) == NULL) {
4621 if (pmap_is_la57(pmap))
4622 pmap_allocpte_free_unref(pmap, va,
4623 pmap_pml5e(pmap, va));
4624 return (NULL);
4625 }
4626 allocated = true;
4627 } else {
4628 allocated = false;
4629 }
4630 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
4631 pdp = &pdp[pmap_pdpe_index(va)];
4632 if ((*pdp & PG_V) == 0) {
4633 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
4634 if (allocated && !addref)
4635 pdppg->ref_count--;
4636 else if (!allocated && addref)
4637 pdppg->ref_count++;
4638 }
4639 return (pdp);
4640 }
4641
4642 /*
4643 * The ptepindexes, i.e. page indices, of the page table pages encountered
4644 * while translating virtual address va are defined as follows:
4645 * - for the page table page (last level),
4646 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT,
4647 * in other words, it is just the index of the PDE that maps the page
4648 * table page.
4649 * - for the page directory page,
4650 * ptepindex = NUPDE (number of userland PD entries) +
4651 * (pmap_pde_index(va) >> NPDEPGSHIFT)
4652 * i.e. index of PDPE is put after the last index of PDE,
4653 * - for the page directory pointer page,
4654 * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
4655  *          NPML4EPGSHIFT)),
4656 * i.e. index of pml4e is put after the last index of PDPE,
4657 * - for the PML4 page (if LA57 mode is enabled),
4658 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
4659  *          (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)),
4660 * i.e. index of pml5e is put after the last index of PML4E.
4661 *
4662 * Define an order on the paging entries, where all entries of the
4663 * same height are put together, then heights are put from deepest to
4664  * root.  Then ptepindex is the sequential number of the
4665 * corresponding paging entry in this order.
4666 *
4667 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of
4668 * LA57 paging structures even in LA48 paging mode. Moreover, the
4669 * ptepindexes are calculated as if the paging structures were 5-level
4670 * regardless of the actual mode of operation.
4671 *
4672 * The root page at PML4/PML5 does not participate in this indexing scheme,
4673 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte().
4674 */
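/*
 * For instance, for a hypothetical va of 0x40000000 (1 GB) and the
 * conventional amd64 shifts (PDRSHIFT = 21, PDPSHIFT = 30, PML4SHIFT = 39),
 * the scheme above yields:
 *	last-level page table page:	va >> PDRSHIFT = 512
 *	page directory page:		NUPDE + (va >> PDPSHIFT) = NUPDE + 1
 *	page directory pointer page:	NUPDE + NUPDPE + (va >> PML4SHIFT)
 *					    = NUPDE + NUPDPE + 0
 */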
4675 static vm_page_t
4676 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
4677 vm_offset_t va)
4678 {
4679 vm_pindex_t pml5index, pml4index;
4680 pml5_entry_t *pml5, *pml5u;
4681 pml4_entry_t *pml4, *pml4u;
4682 pdp_entry_t *pdp;
4683 pd_entry_t *pd;
4684 vm_page_t m, pdpg;
4685 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
4686
4687 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4688
4689 PG_A = pmap_accessed_bit(pmap);
4690 PG_M = pmap_modified_bit(pmap);
4691 PG_V = pmap_valid_bit(pmap);
4692 PG_RW = pmap_rw_bit(pmap);
4693
4694 /*
4695 * Allocate a page table page.
4696 */
4697 m = pmap_alloc_pt_page(pmap, ptepindex,
4698 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
4699 if (m == NULL)
4700 return (NULL);
4701
4702 /*
4703 * Map the pagetable page into the process address space, if
4704 * it isn't already there.
4705 */
4706 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) {
4707 MPASS(pmap_is_la57(pmap));
4708
4709 pml5index = pmap_pml5e_index(va);
4710 pml5 = &pmap->pm_pmltop[pml5index];
4711 KASSERT((*pml5 & PG_V) == 0,
4712 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5));
4713 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4714
4715 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) {
4716 MPASS(pmap->pm_ucr3 != PMAP_NO_CR3);
4717 *pml5 |= pg_nx;
4718
4719 pml5u = &pmap->pm_pmltopu[pml5index];
4720 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
4721 PG_A | PG_M;
4722 }
4723 } else if (ptepindex >= NUPDE + NUPDPE) {
4724 pml4index = pmap_pml4e_index(va);
4725 /* Wire up a new PDPE page */
4726 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true);
4727 if (pml4 == NULL) {
4728 pmap_free_pt_page(pmap, m, true);
4729 return (NULL);
4730 }
4731 KASSERT((*pml4 & PG_V) == 0,
4732 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4));
4733 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4734
4735 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
4736 pml4index < NUPML4E) {
4737 MPASS(pmap->pm_ucr3 != PMAP_NO_CR3);
4738
4739 /*
4740 * PTI: Make all user-space mappings in the
4741 * kernel-mode page table no-execute so that
4742 * we detect any programming errors that leave
4743 * the kernel-mode page table active on return
4744 * to user space.
4745 */
4746 *pml4 |= pg_nx;
4747
4748 pml4u = &pmap->pm_pmltopu[pml4index];
4749 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
4750 PG_A | PG_M;
4751 }
4752 } else if (ptepindex >= NUPDE) {
4753 /* Wire up a new PDE page */
4754 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true);
4755 if (pdp == NULL) {
4756 pmap_free_pt_page(pmap, m, true);
4757 return (NULL);
4758 }
4759 KASSERT((*pdp & PG_V) == 0,
4760 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp));
4761 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4762 } else {
4763 /* Wire up a new PTE page */
4764 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false);
4765 if (pdp == NULL) {
4766 pmap_free_pt_page(pmap, m, true);
4767 return (NULL);
4768 }
4769 if ((*pdp & PG_V) == 0) {
4770 /* Have to allocate a new pd, recurse */
4771 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va),
4772 lockp, va) == NULL) {
4773 pmap_allocpte_free_unref(pmap, va,
4774 pmap_pml4e(pmap, va));
4775 pmap_free_pt_page(pmap, m, true);
4776 return (NULL);
4777 }
4778 } else {
4779 /* Add reference to the pd page */
4780 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
4781 pdpg->ref_count++;
4782 }
4783 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
4784
4785 /* Now we know where the page directory page is */
4786 pd = &pd[pmap_pde_index(va)];
4787 KASSERT((*pd & PG_V) == 0,
4788 ("pmap %p va %#lx pd %#lx", pmap, va, *pd));
4789 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
4790 }
4791
4792 return (m);
4793 }
4794
4795 /*
4796 * This routine is called if the desired page table page does not exist.
4797 *
4798 * If page table page allocation fails, this routine may sleep before
4799 * returning NULL. It sleeps only if a lock pointer was given. Sleep
4800 * occurs right before returning to the caller. This way, we never
4801 * drop pmap lock to sleep while a page table page has ref_count == 0,
4802 * which prevents the page from being freed under us.
4803 */
4804 static vm_page_t
4805 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
4806 vm_offset_t va)
4807 {
4808 vm_page_t m;
4809
4810 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va);
4811 if (m == NULL && lockp != NULL) {
4812 RELEASE_PV_LIST_LOCK(lockp);
4813 PMAP_UNLOCK(pmap);
4814 PMAP_ASSERT_NOT_IN_DI();
4815 vm_wait(NULL);
4816 PMAP_LOCK(pmap);
4817 }
4818 return (m);
4819 }
4820
4821 static pd_entry_t *
4822 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
4823 struct rwlock **lockp)
4824 {
4825 pdp_entry_t *pdpe, PG_V;
4826 pd_entry_t *pde;
4827 vm_page_t pdpg;
4828 vm_pindex_t pdpindex;
4829
4830 PG_V = pmap_valid_bit(pmap);
4831
4832 retry:
4833 pdpe = pmap_pdpe(pmap, va);
4834 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
4835 pde = pmap_pdpe_to_pde(pdpe, va);
4836 if (va < VM_MAXUSER_ADDRESS) {
4837 /* Add a reference to the pd page. */
4838 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
4839 pdpg->ref_count++;
4840 } else
4841 pdpg = NULL;
4842 } else if (va < VM_MAXUSER_ADDRESS) {
4843 /* Allocate a pd page. */
4844 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
4845 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va);
4846 if (pdpg == NULL) {
4847 if (lockp != NULL)
4848 goto retry;
4849 else
4850 return (NULL);
4851 }
4852 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4853 pde = &pde[pmap_pde_index(va)];
4854 } else
4855 panic("pmap_alloc_pde: missing page table page for va %#lx",
4856 va);
4857 *pdpgp = pdpg;
4858 return (pde);
4859 }
4860
4861 static vm_page_t
4862 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4863 {
4864 vm_pindex_t ptepindex;
4865 pd_entry_t *pd, PG_V;
4866 vm_page_t m;
4867
4868 PG_V = pmap_valid_bit(pmap);
4869
4870 /*
4871 * Calculate pagetable page index
4872 */
4873 ptepindex = pmap_pde_pindex(va);
4874 retry:
4875 /*
4876 * Get the page directory entry
4877 */
4878 pd = pmap_pde(pmap, va);
4879
4880 /*
4881 * This supports switching from a 2MB page to a
4882 * normal 4K page.
4883 */
4884 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
4885 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
4886 /*
4887 * Invalidation of the 2MB page mapping may have caused
4888 * the deallocation of the underlying PD page.
4889 */
4890 pd = NULL;
4891 }
4892 }
4893
4894 /*
4895 * If the page table page is mapped, we just increment the
4896 * hold count, and activate it.
4897 */
4898 if (pd != NULL && (*pd & PG_V) != 0) {
4899 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
4900 m->ref_count++;
4901 } else {
4902 /*
4903 * Here if the pte page isn't mapped, or if it has been
4904 * deallocated.
4905 */
4906 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va);
4907 if (m == NULL && lockp != NULL)
4908 goto retry;
4909 }
4910 return (m);
4911 }
4912
4913 /***************************************************
4914 * Pmap allocation/deallocation routines.
4915 ***************************************************/
4916
4917 /*
4918 * Release any resources held by the given physical map.
4919 * Called when a pmap initialized by pmap_pinit is being released.
4920 * Should only be called if the map contains no valid mappings.
4921 */
4922 void
4923 pmap_release(pmap_t pmap)
4924 {
4925 vm_page_t m;
4926 int i;
4927
4928 KASSERT(vm_radix_is_empty(&pmap->pm_root),
4929 ("pmap_release: pmap %p has reserved page table page(s)",
4930 pmap));
4931 KASSERT(CPU_EMPTY(&pmap->pm_active),
4932 ("releasing active pmap %p", pmap));
4933
4934 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
4935
4936 if (pmap_is_la57(pmap)) {
4937 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
4938 pmap->pm_pmltop[PML5PML5I] = 0;
4939 } else {
4940 for (i = 0; i < NKPML4E; i++) /* KVA */
4941 pmap->pm_pmltop[KPML4BASE + i] = 0;
4942 #ifdef KASAN
4943 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */
4944 pmap->pm_pmltop[KASANPML4I + i] = 0;
4945 #endif
4946 #ifdef KMSAN
4947 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */
4948 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0;
4949 		for (i = 0; i < NKMSANORIGPML4E; i++)	/* KMSAN origin map */
4950 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0;
4951 #endif
4952 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
4953 pmap->pm_pmltop[DMPML4I + i] = 0;
4954 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */
4955 for (i = 0; i < lm_ents; i++) /* Large Map */
4956 pmap->pm_pmltop[LMSPML4I + i] = 0;
4957 }
4958
4959 pmap_free_pt_page(NULL, m, true);
4960 pmap_pt_page_count_pinit(pmap, -1);
4961
4962 if (pmap->pm_pmltopu != NULL) {
4963 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
4964 pm_pmltopu));
4965 pmap_free_pt_page(NULL, m, false);
4966 pmap_pt_page_count_pinit(pmap, -1);
4967 }
4968 if (pmap->pm_type == PT_X86 &&
4969 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
4970 rangeset_fini(&pmap->pm_pkru);
4971
4972 KASSERT(pmap->pm_stats.resident_count == 0,
4973 ("pmap_release: pmap %p resident count %ld != 0",
4974 pmap, pmap->pm_stats.resident_count));
4975 }
4976
4977 static int
4978 kvm_size(SYSCTL_HANDLER_ARGS)
4979 {
4980 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
4981
4982 return sysctl_handle_long(oidp, &ksize, 0, req);
4983 }
4984 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
4985 0, 0, kvm_size, "LU",
4986 "Size of KVM");
4987
4988 static int
4989 kvm_free(SYSCTL_HANDLER_ARGS)
4990 {
4991 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
4992
4993 return sysctl_handle_long(oidp, &kfree, 0, req);
4994 }
4995 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
4996 0, 0, kvm_free, "LU",
4997 "Amount of KVM free");
4998
4999 #ifdef KMSAN
5000 static void
5001 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size)
5002 {
5003 pdp_entry_t *pdpe;
5004 pd_entry_t *pde;
5005 pt_entry_t *pte;
5006 vm_paddr_t dummypa, dummypd, dummypt;
5007 int i, npde, npdpg;
5008
5009 npdpg = howmany(size, NBPDP);
5010 npde = size / NBPDR;
5011
5012 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE);
5013 pagezero((void *)PHYS_TO_DMAP(dummypa));
5014
5015 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE);
5016 pagezero((void *)PHYS_TO_DMAP(dummypt));
5017 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg);
5018 for (i = 0; i < npdpg; i++)
5019 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i)));
5020
5021 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt);
5022 for (i = 0; i < NPTEPG; i++)
5023 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW |
5024 X86_PG_A | X86_PG_M | pg_nx);
5025
5026 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd);
5027 for (i = 0; i < npde; i++)
5028 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx);
5029
5030 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(pdppa);
5031 for (i = 0; i < npdpg; i++)
5032 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V |
5033 X86_PG_RW | pg_nx);
5034 }
5035
5036 static void
5037 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end)
5038 {
5039 vm_size_t size;
5040
5041 KASSERT(start % NBPDP == 0, ("unaligned page array start address"));
5042
5043 /*
5044 * The end of the page array's KVA region is 2MB aligned, see
5045 * kmem_init().
5046 */
5047 size = round_2mpage(end) - start;
5048 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size);
5049 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size);
5050 }
5051 #endif
5052
5053 /*
5054 * Allocate physical memory for the vm_page array and map it into KVA,
5055 * attempting to back the vm_pages with domain-local memory.
5056 */
5057 void
5058 pmap_page_array_startup(long pages)
5059 {
5060 pdp_entry_t *pdpe;
5061 pd_entry_t *pde, newpdir;
5062 vm_offset_t va, start, end;
5063 vm_paddr_t pa;
5064 long pfn;
5065 int domain, i;
5066
5067 vm_page_array_size = pages;
5068
5069 start = VM_MIN_KERNEL_ADDRESS;
5070 end = start + pages * sizeof(struct vm_page);
5071 for (va = start; va < end; va += NBPDR) {
5072 pfn = first_page + (va - start) / sizeof(struct vm_page);
5073 domain = vm_phys_domain(ptoa(pfn));
5074 pdpe = pmap_pdpe(kernel_pmap, va);
5075 if ((*pdpe & X86_PG_V) == 0) {
5076 pa = vm_phys_early_alloc(domain, PAGE_SIZE);
5077 dump_add_page(pa);
5078 pagezero((void *)PHYS_TO_DMAP(pa));
5079 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW |
5080 X86_PG_A | X86_PG_M);
5081 }
5082 pde = pmap_pdpe_to_pde(pdpe, va);
5083 if ((*pde & X86_PG_V) != 0)
5084 panic("Unexpected pde");
5085 pa = vm_phys_early_alloc(domain, NBPDR);
5086 for (i = 0; i < NPDEPG; i++)
5087 dump_add_page(pa + i * PAGE_SIZE);
5088 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A |
5089 X86_PG_M | PG_PS | pg_g | pg_nx);
5090 pde_store(pde, newpdir);
5091 }
5092 vm_page_array = (vm_page_t)start;
5093
5094 #ifdef KMSAN
5095 pmap_kmsan_page_array_startup(start, end);
5096 #endif
5097 }
5098
5099 /*
5100 * grow the number of kernel page table entries, if needed
5101 */
5102 void
5103 pmap_growkernel(vm_offset_t addr)
5104 {
5105 vm_paddr_t paddr;
5106 vm_page_t nkpg;
5107 pd_entry_t *pde, newpdir;
5108 pdp_entry_t *pdpe;
5109 vm_offset_t end;
5110
5111 TSENTER();
5112 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
5113
5114 /*
5115 * The kernel map covers two distinct regions of KVA: that used
5116 * for dynamic kernel memory allocations, and the uppermost 2GB
5117 * of the virtual address space. The latter is used to map the
5118 * kernel and loadable kernel modules. This scheme enables the
5119 * use of a special code generation model for kernel code which
5120 * takes advantage of compact addressing modes in machine code.
5121 *
5122 * Both regions grow upwards; to avoid wasting memory, the gap
5123 * in between is unmapped. If "addr" is above "KERNBASE", the
5124 * kernel's region is grown, otherwise the kmem region is grown.
5125 *
5126 * The correctness of this action is based on the following
5127 * argument: vm_map_insert() allocates contiguous ranges of the
5128 * kernel virtual address space. It calls this function if a range
5129 * ends after "kernel_vm_end". If the kernel is mapped between
5130 * "kernel_vm_end" and "addr", then the range cannot begin at
5131 * "kernel_vm_end". In fact, its beginning address cannot be less
5132 * than the kernel. Thus, there is no immediate need to allocate
5133 * any new kernel page table pages between "kernel_vm_end" and
5134 * "KERNBASE".
5135 */
5136 if (KERNBASE < addr) {
5137 end = KERNBASE + nkpt * NBPDR;
5138 if (end == 0) {
5139 TSEXIT();
5140 return;
5141 }
5142 } else {
5143 end = kernel_vm_end;
5144 }
5145
5146 addr = roundup2(addr, NBPDR);
5147 if (addr - 1 >= vm_map_max(kernel_map))
5148 addr = vm_map_max(kernel_map);
5149 if (addr <= end) {
5150 /*
5151 * The grown region is already mapped, so there is
5152 * nothing to do.
5153 */
5154 TSEXIT();
5155 return;
5156 }
5157
5158 kasan_shadow_map(end, addr - end);
5159 kmsan_shadow_map(end, addr - end);
5160 while (end < addr) {
5161 pdpe = pmap_pdpe(kernel_pmap, end);
5162 if ((*pdpe & X86_PG_V) == 0) {
5163 nkpg = pmap_alloc_pt_page(kernel_pmap,
5164 pmap_pdpe_pindex(end), VM_ALLOC_WIRED |
5165 VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO);
5166 if (nkpg == NULL)
5167 panic("pmap_growkernel: no memory to grow kernel");
5168 paddr = VM_PAGE_TO_PHYS(nkpg);
5169 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
5170 X86_PG_A | X86_PG_M);
5171 continue; /* try again */
5172 }
5173 pde = pmap_pdpe_to_pde(pdpe, end);
5174 if ((*pde & X86_PG_V) != 0) {
5175 end = (end + NBPDR) & ~PDRMASK;
5176 if (end - 1 >= vm_map_max(kernel_map)) {
5177 end = vm_map_max(kernel_map);
5178 break;
5179 }
5180 continue;
5181 }
5182
5183 nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end),
5184 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT | VM_ALLOC_ZERO);
5185 if (nkpg == NULL)
5186 panic("pmap_growkernel: no memory to grow kernel");
5187 paddr = VM_PAGE_TO_PHYS(nkpg);
5188 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
5189 pde_store(pde, newpdir);
5190
5191 end = (end + NBPDR) & ~PDRMASK;
5192 if (end - 1 >= vm_map_max(kernel_map)) {
5193 end = vm_map_max(kernel_map);
5194 break;
5195 }
5196 }
5197
5198 if (end <= KERNBASE)
5199 kernel_vm_end = end;
5200 else
5201 nkpt = howmany(end - KERNBASE, NBPDR);
5202 TSEXIT();
5203 }
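
/*
 * A minimal sketch of the 2MB rounding used above, assuming NBPDR == 2MB
 * and PDRMASK == NBPDR - 1 as on amd64.  roundup2() rounds up to a power
 * of two; "(end + NBPDR) & ~PDRMASK" advances to the next 2MB boundary.
 */
#if 0	/* illustrative sketch, not part of the pmap */
#include <stdio.h>

#define	SKETCH_NBPDR	(1UL << 21)		/* 2MB */
#define	SKETCH_PDRMASK	(SKETCH_NBPDR - 1)
#define	SKETCH_ROUNDUP2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	unsigned long addr = 0xffffffff80301234UL;

	printf("roundup2(addr, 2MB): %#lx\n",
	    SKETCH_ROUNDUP2(addr, SKETCH_NBPDR));
	printf("next 2MB boundary:   %#lx\n",
	    (addr + SKETCH_NBPDR) & ~SKETCH_PDRMASK);
	return (0);
}
#endif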
5204
5205 /***************************************************
5206 * page management routines.
5207 ***************************************************/
5208
5209 static const uint64_t pc_freemask[_NPCM] = {
5210 [0 ... _NPCM - 2] = PC_FREEN,
5211 [_NPCM - 1] = PC_FREEL
5212 };
5213
5214 #ifdef PV_STATS
5215
5216 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count);
5217 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD,
5218 &pc_chunk_count, "Current number of pv entry chunks");
5219
5220 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs);
5221 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD,
5222 &pc_chunk_allocs, "Total number of pv entry chunks allocated");
5223
5224 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees);
5225 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD,
5226 &pc_chunk_frees, "Total number of pv entry chunks freed");
5227
5228 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail);
5229 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD,
5230 &pc_chunk_tryfail,
5231 "Number of failed attempts to get a pv entry chunk page");
5232
5233 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees);
5234 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
5235 &pv_entry_frees, "Total number of pv entries freed");
5236
5237 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs);
5238 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD,
5239 &pv_entry_allocs, "Total number of pv entries allocated");
5240
5241 static COUNTER_U64_DEFINE_EARLY(pv_entry_count);
5242 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD,
5243 &pv_entry_count, "Current number of pv entries");
5244
5245 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare);
5246 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD,
5247 &pv_entry_spare, "Current number of spare pv entries");
5248 #endif
5249
5250 static void
5251 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
5252 {
5253
5254 if (pmap == NULL)
5255 return;
5256 pmap_invalidate_all(pmap);
5257 if (pmap != locked_pmap)
5258 PMAP_UNLOCK(pmap);
5259 if (start_di)
5260 pmap_delayed_invl_finish();
5261 }
5262
5263 /*
5264 * We are in a serious low memory condition. Resort to
5265 * drastic measures to free some pages so we can allocate
5266 * another pv entry chunk.
5267 *
5268 * Returns NULL if PV entries were reclaimed from the specified pmap.
5269 *
5270 * We do not, however, unmap 2mpages because subsequent accesses will
5271 * allocate per-page pv entries until repromotion occurs, thereby
5272 * exacerbating the shortage of free pv entries.
5273 */
5274 static vm_page_t
5275 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
5276 {
5277 struct pv_chunks_list *pvc;
5278 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
5279 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
5280 struct md_page *pvh;
5281 pd_entry_t *pde;
5282 pmap_t next_pmap, pmap;
5283 pt_entry_t *pte, tpte;
5284 pt_entry_t PG_G, PG_A, PG_M, PG_RW;
5285 pv_entry_t pv;
5286 vm_offset_t va;
5287 vm_page_t m, m_pc;
5288 struct spglist free;
5289 uint64_t inuse;
5290 int bit, field, freed;
5291 bool start_di, restart;
5292
5293 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
5294 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
5295 pmap = NULL;
5296 m_pc = NULL;
5297 PG_G = PG_A = PG_M = PG_RW = 0;
5298 SLIST_INIT(&free);
5299 bzero(&pc_marker_b, sizeof(pc_marker_b));
5300 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
5301 pc_marker = (struct pv_chunk *)&pc_marker_b;
5302 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
5303
5304 /*
5305 * A delayed invalidation block should already be active if
5306 * pmap_advise() or pmap_remove() called this function by way
5307 * of pmap_demote_pde_locked().
5308 */
5309 start_di = pmap_not_in_di();
5310
5311 pvc = &pv_chunks[domain];
5312 mtx_lock(&pvc->pvc_lock);
5313 pvc->active_reclaims++;
5314 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
5315 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
5316 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
5317 SLIST_EMPTY(&free)) {
5318 next_pmap = pc->pc_pmap;
5319 if (next_pmap == NULL) {
5320 /*
5321 * The next chunk is a marker. However, it is
5322 * not our marker, so active_reclaims must be
5323 * > 1. Consequently, the next_chunk code
5324 * will not rotate the pv_chunks list.
5325 */
5326 goto next_chunk;
5327 }
5328 mtx_unlock(&pvc->pvc_lock);
5329
5330 /*
5331 * A pv_chunk can only be removed from the pc_lru list
5332 * when both pc_chunks_mutex is owned and the
5333 * corresponding pmap is locked.
5334 */
5335 if (pmap != next_pmap) {
5336 restart = false;
5337 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
5338 start_di);
5339 pmap = next_pmap;
5340 /* Avoid deadlock and lock recursion. */
5341 if (pmap > locked_pmap) {
5342 RELEASE_PV_LIST_LOCK(lockp);
5343 PMAP_LOCK(pmap);
5344 if (start_di)
5345 pmap_delayed_invl_start();
5346 mtx_lock(&pvc->pvc_lock);
5347 restart = true;
5348 } else if (pmap != locked_pmap) {
5349 if (PMAP_TRYLOCK(pmap)) {
5350 if (start_di)
5351 pmap_delayed_invl_start();
5352 mtx_lock(&pvc->pvc_lock);
5353 restart = true;
5354 } else {
5355 pmap = NULL; /* pmap is not locked */
5356 mtx_lock(&pvc->pvc_lock);
5357 pc = TAILQ_NEXT(pc_marker, pc_lru);
5358 if (pc == NULL ||
5359 pc->pc_pmap != next_pmap)
5360 continue;
5361 goto next_chunk;
5362 }
5363 } else if (start_di)
5364 pmap_delayed_invl_start();
5365 PG_G = pmap_global_bit(pmap);
5366 PG_A = pmap_accessed_bit(pmap);
5367 PG_M = pmap_modified_bit(pmap);
5368 PG_RW = pmap_rw_bit(pmap);
5369 if (restart)
5370 continue;
5371 }
5372
5373 /*
5374 * Destroy every non-wired, 4 KB page mapping in the chunk.
5375 */
5376 freed = 0;
5377 for (field = 0; field < _NPCM; field++) {
5378 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
5379 inuse != 0; inuse &= ~(1UL << bit)) {
5380 bit = bsfq(inuse);
5381 pv = &pc->pc_pventry[field * 64 + bit];
5382 va = pv->pv_va;
5383 pde = pmap_pde(pmap, va);
5384 if ((*pde & PG_PS) != 0)
5385 continue;
5386 pte = pmap_pde_to_pte(pde, va);
5387 if ((*pte & PG_W) != 0)
5388 continue;
5389 tpte = pte_load_clear(pte);
5390 if ((tpte & PG_G) != 0)
5391 pmap_invalidate_page(pmap, va);
5392 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
5393 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5394 vm_page_dirty(m);
5395 if ((tpte & PG_A) != 0)
5396 vm_page_aflag_set(m, PGA_REFERENCED);
5397 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5398 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5399 m->md.pv_gen++;
5400 if (TAILQ_EMPTY(&m->md.pv_list) &&
5401 (m->flags & PG_FICTITIOUS) == 0) {
5402 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5403 if (TAILQ_EMPTY(&pvh->pv_list)) {
5404 vm_page_aflag_clear(m,
5405 PGA_WRITEABLE);
5406 }
5407 }
5408 pmap_delayed_invl_page(m);
5409 pc->pc_map[field] |= 1UL << bit;
5410 pmap_unuse_pt(pmap, va, *pde, &free);
5411 freed++;
5412 }
5413 }
5414 if (freed == 0) {
5415 mtx_lock(&pvc->pvc_lock);
5416 goto next_chunk;
5417 }
5418 /* Every freed mapping is for a 4 KB page. */
5419 pmap_resident_count_adj(pmap, -freed);
5420 PV_STAT(counter_u64_add(pv_entry_frees, freed));
5421 PV_STAT(counter_u64_add(pv_entry_spare, freed));
5422 PV_STAT(counter_u64_add(pv_entry_count, -freed));
5423 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5424 if (pc_is_free(pc)) {
5425 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
5426 PV_STAT(counter_u64_add(pc_chunk_count, -1));
5427 PV_STAT(counter_u64_add(pc_chunk_frees, 1));
5428 /* Entire chunk is free; return it. */
5429 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
5430 dump_drop_page(m_pc->phys_addr);
5431 mtx_lock(&pvc->pvc_lock);
5432 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5433 break;
5434 }
5435 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5436 mtx_lock(&pvc->pvc_lock);
5437 /* One freed pv entry in locked_pmap is sufficient. */
5438 if (pmap == locked_pmap)
5439 break;
5440 next_chunk:
5441 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
5442 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
5443 if (pvc->active_reclaims == 1 && pmap != NULL) {
5444 /*
5445 * Rotate the pv chunks list so that we do not
5446 * scan the same pv chunks that could not be
5447 * freed (because they contained a wired
5448 * and/or superpage mapping) on every
5449 * invocation of reclaim_pv_chunk().
5450 */
5451 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) {
5452 MPASS(pc->pc_pmap != NULL);
5453 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5454 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
5455 }
5456 }
5457 }
5458 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
5459 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
5460 pvc->active_reclaims--;
5461 mtx_unlock(&pvc->pvc_lock);
5462 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
5463 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
5464 m_pc = SLIST_FIRST(&free);
5465 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
5466 /* Recycle a freed page table page. */
5467 m_pc->ref_count = 1;
5468 }
5469 vm_page_free_pages_toq(&free, true);
5470 return (m_pc);
5471 }
5472
5473 static vm_page_t
5474 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
5475 {
5476 vm_page_t m;
5477 int i, domain;
5478
5479 domain = PCPU_GET(domain);
5480 for (i = 0; i < vm_ndomains; i++) {
5481 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
5482 if (m != NULL)
5483 break;
5484 domain = (domain + 1) % vm_ndomains;
5485 }
5486
5487 return (m);
5488 }
5489
5490 /*
5491 * free the pv_entry back to the free list
5492 */
5493 static void
5494 free_pv_entry(pmap_t pmap, pv_entry_t pv)
5495 {
5496 struct pv_chunk *pc;
5497 int idx, field, bit;
5498
5499 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5500 PV_STAT(counter_u64_add(pv_entry_frees, 1));
5501 PV_STAT(counter_u64_add(pv_entry_spare, 1));
5502 PV_STAT(counter_u64_add(pv_entry_count, -1));
5503 pc = pv_to_chunk(pv);
5504 idx = pv - &pc->pc_pventry[0];
5505 field = idx / 64;
5506 bit = idx % 64;
5507 pc->pc_map[field] |= 1ul << bit;
5508 if (!pc_is_free(pc)) {
5509 /* 98% of the time, pc is already at the head of the list. */
5510 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
5511 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5512 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5513 }
5514 return;
5515 }
5516 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5517 free_pv_chunk(pc);
5518 }
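
/*
 * A standalone sketch of the chunk bitmap arithmetic used above: a pv
 * entry's index within its chunk splits into a 64-bit map word ("field")
 * and a bit within that word, and freeing the entry sets that bit.  The
 * three-word map mirrors _NPCM but is hard-coded here for illustration.
 */
#if 0	/* illustrative sketch, not part of the pmap */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t pc_map[3] = { 0, 0, 0 };	/* all entries in use */
	int idx = 130;				/* arbitrary pv entry index */
	int field = idx / 64;
	int bit = idx % 64;

	pc_map[field] |= 1UL << bit;		/* mark the entry free */
	printf("idx %d -> field %d bit %d, map[%d] = %#lx\n",
	    idx, field, bit, field, (unsigned long)pc_map[field]);
	return (0);
}
#endif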
5519
5520 static void
5521 free_pv_chunk_dequeued(struct pv_chunk *pc)
5522 {
5523 vm_page_t m;
5524
5525 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV));
5526 PV_STAT(counter_u64_add(pc_chunk_count, -1));
5527 PV_STAT(counter_u64_add(pc_chunk_frees, 1));
5528 counter_u64_add(pv_page_count, -1);
5529 /* entire chunk is free, return it */
5530 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
5531 dump_drop_page(m->phys_addr);
5532 vm_page_unwire_noq(m);
5533 vm_page_free(m);
5534 }
5535
5536 static void
5537 free_pv_chunk(struct pv_chunk *pc)
5538 {
5539 struct pv_chunks_list *pvc;
5540
5541 pvc = &pv_chunks[pc_to_domain(pc)];
5542 mtx_lock(&pvc->pvc_lock);
5543 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5544 mtx_unlock(&pvc->pvc_lock);
5545 free_pv_chunk_dequeued(pc);
5546 }
5547
5548 static void
5549 free_pv_chunk_batch(struct pv_chunklist *batch)
5550 {
5551 struct pv_chunks_list *pvc;
5552 struct pv_chunk *pc, *npc;
5553 int i;
5554
5555 for (i = 0; i < vm_ndomains; i++) {
5556 if (TAILQ_EMPTY(&batch[i]))
5557 continue;
5558 pvc = &pv_chunks[i];
5559 mtx_lock(&pvc->pvc_lock);
5560 TAILQ_FOREACH(pc, &batch[i], pc_list) {
5561 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
5562 }
5563 mtx_unlock(&pvc->pvc_lock);
5564 }
5565
5566 for (i = 0; i < vm_ndomains; i++) {
5567 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
5568 free_pv_chunk_dequeued(pc);
5569 }
5570 }
5571 }
5572
5573 /*
5574 * Returns a new PV entry, allocating a new PV chunk from the system when
5575 * needed. If this PV chunk allocation fails and a PV list lock pointer was
5576 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
5577 * returned.
5578 *
5579 * The given PV list lock may be released.
5580 */
5581 static pv_entry_t
5582 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
5583 {
5584 struct pv_chunks_list *pvc;
5585 int bit, field;
5586 pv_entry_t pv;
5587 struct pv_chunk *pc;
5588 vm_page_t m;
5589
5590 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5591 PV_STAT(counter_u64_add(pv_entry_allocs, 1));
5592 retry:
5593 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
5594 if (pc != NULL) {
5595 for (field = 0; field < _NPCM; field++) {
5596 if (pc->pc_map[field]) {
5597 bit = bsfq(pc->pc_map[field]);
5598 break;
5599 }
5600 }
5601 if (field < _NPCM) {
5602 pv = &pc->pc_pventry[field * 64 + bit];
5603 pc->pc_map[field] &= ~(1ul << bit);
5604 /* If this was the last item, move it to tail */
5605 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
5606 pc->pc_map[2] == 0) {
5607 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5608 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
5609 pc_list);
5610 }
5611 PV_STAT(counter_u64_add(pv_entry_count, 1));
5612 PV_STAT(counter_u64_add(pv_entry_spare, -1));
5613 return (pv);
5614 }
5615 }
5616 /* No free items, allocate another chunk */
5617 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5618 if (m == NULL) {
5619 if (lockp == NULL) {
5620 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1));
5621 return (NULL);
5622 }
5623 m = reclaim_pv_chunk(pmap, lockp);
5624 if (m == NULL)
5625 goto retry;
5626 } else
5627 counter_u64_add(pv_page_count, 1);
5628 PV_STAT(counter_u64_add(pc_chunk_count, 1));
5629 PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
5630 dump_add_page(m->phys_addr);
5631 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
5632 pc->pc_pmap = pmap;
5633 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */
5634 pc->pc_map[1] = PC_FREEN;
5635 pc->pc_map[2] = PC_FREEL;
5636 pvc = &pv_chunks[vm_page_domain(m)];
5637 mtx_lock(&pvc->pvc_lock);
5638 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
5639 mtx_unlock(&pvc->pvc_lock);
5640 pv = &pc->pc_pventry[0];
5641 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5642 PV_STAT(counter_u64_add(pv_entry_count, 1));
5643 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1));
5644 return (pv);
5645 }
5646
5647 /*
5648 * Returns the number of one bits within the given PV chunk map.
5649 *
5650 * The errata for Intel processors state that "POPCNT Instruction May
5651 * Take Longer to Execute Than Expected". It is believed that the
5652 * issue is the spurious dependency on the destination register.
5653 * Provide a hint to the register rename logic that the destination
5654 * value is overwritten, by clearing it, as suggested in the
5655 * optimization manual. It should be cheap for unaffected processors
5656 * as well.
5657 *
5658 * Reference numbers for the errata are
5659 * 4th Gen Core: HSD146
5660 * 5th Gen Core: BDM85
5661 * 6th Gen Core: SKL029
5662 */
5663 static int
5664 popcnt_pc_map_pq(uint64_t *map)
5665 {
5666 u_long result, tmp;
5667
5668 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
5669 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
5670 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
5671 : "=&r" (result), "=&r" (tmp)
5672 : "m" (map[0]), "m" (map[1]), "m" (map[2]));
5673 return (result);
5674 }
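
/*
 * Functionally, the inline assembly above is just a population count over
 * the three map words; the xorl of each destination register only breaks
 * the false output dependency described by the errata.  A portable
 * equivalent, similar to what the bit_count() fallback in
 * reserve_pv_entries() computes, is sketched below.
 */
#if 0	/* illustrative sketch, not part of the pmap */
#include <stdint.h>

static int
popcnt_pc_map_portable(const uint64_t *map)
{
	int count, i;

	count = 0;
	for (i = 0; i < 3; i++)		/* 3 == _NPCM */
		count += __builtin_popcountll(map[i]);
	return (count);
}
#endif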
5675
5676 /*
5677 * Ensure that the number of spare PV entries in the specified pmap meets or
5678 * exceeds the given count, "needed".
5679 *
5680 * The given PV list lock may be released.
5681 */
5682 static void
5683 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
5684 {
5685 struct pv_chunks_list *pvc;
5686 struct pch new_tail[PMAP_MEMDOM];
5687 struct pv_chunk *pc;
5688 vm_page_t m;
5689 int avail, free, i;
5690 bool reclaimed;
5691
5692 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5693 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
5694
5695 /*
5696 * Newly allocated PV chunks must be stored in a private list until
5697 * the required number of PV chunks have been allocated. Otherwise,
5698 * reclaim_pv_chunk() could recycle one of these chunks. In
5699 * contrast, these chunks must be added to the pmap upon allocation.
5700 */
5701 for (i = 0; i < PMAP_MEMDOM; i++)
5702 TAILQ_INIT(&new_tail[i]);
5703 retry:
5704 avail = 0;
5705 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
5706 #ifndef __POPCNT__
5707 if ((cpu_feature2 & CPUID2_POPCNT) == 0)
5708 bit_count((bitstr_t *)pc->pc_map, 0,
5709 sizeof(pc->pc_map) * NBBY, &free);
5710 else
5711 #endif
5712 free = popcnt_pc_map_pq(pc->pc_map);
5713 if (free == 0)
5714 break;
5715 avail += free;
5716 if (avail >= needed)
5717 break;
5718 }
5719 for (reclaimed = false; avail < needed; avail += _NPCPV) {
5720 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5721 if (m == NULL) {
5722 m = reclaim_pv_chunk(pmap, lockp);
5723 if (m == NULL)
5724 goto retry;
5725 reclaimed = true;
5726 } else
5727 counter_u64_add(pv_page_count, 1);
5728 PV_STAT(counter_u64_add(pc_chunk_count, 1));
5729 PV_STAT(counter_u64_add(pc_chunk_allocs, 1));
5730 dump_add_page(m->phys_addr);
5731 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
5732 pc->pc_pmap = pmap;
5733 pc->pc_map[0] = PC_FREEN;
5734 pc->pc_map[1] = PC_FREEN;
5735 pc->pc_map[2] = PC_FREEL;
5736 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
5737 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
5738 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV));
5739
5740 /*
5741 * The reclaim might have freed a chunk from the current pmap.
5742 * If that chunk contained available entries, we need to
5743 * re-count the number of available entries.
5744 */
5745 if (reclaimed)
5746 goto retry;
5747 }
5748 for (i = 0; i < vm_ndomains; i++) {
5749 if (TAILQ_EMPTY(&new_tail[i]))
5750 continue;
5751 pvc = &pv_chunks[i];
5752 mtx_lock(&pvc->pvc_lock);
5753 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
5754 mtx_unlock(&pvc->pvc_lock);
5755 }
5756 }
5757
5758 /*
5759 * First find and then remove the pv entry for the specified pmap and virtual
5760 * address from the specified pv list. Returns the pv entry if found and NULL
5761 * otherwise. This operation can be performed on pv lists for either 4KB or
5762 * 2MB page mappings.
5763 */
5764 static __inline pv_entry_t
5765 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
5766 {
5767 pv_entry_t pv;
5768
5769 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5770 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
5771 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5772 pvh->pv_gen++;
5773 break;
5774 }
5775 }
5776 return (pv);
5777 }
5778
5779 /*
5780 * After demotion from a 2MB page mapping to 512 4KB page mappings,
5781 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
5782 * entries for each of the 4KB page mappings.
5783 */
5784 static void
5785 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
5786 struct rwlock **lockp)
5787 {
5788 struct md_page *pvh;
5789 struct pv_chunk *pc;
5790 pv_entry_t pv;
5791 vm_offset_t va_last;
5792 vm_page_t m;
5793 int bit, field;
5794
5795 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5796 KASSERT((pa & PDRMASK) == 0,
5797 ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
5798 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
5799
5800 /*
5801 * Transfer the 2mpage's pv entry for this mapping to the first
5802 * page's pv list. Once this transfer begins, the pv list lock
5803 * must not be released until the last pv entry is reinstantiated.
5804 */
5805 pvh = pa_to_pvh(pa);
5806 va = trunc_2mpage(va);
5807 pv = pmap_pvh_remove(pvh, pmap, va);
5808 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
5809 m = PHYS_TO_VM_PAGE(pa);
5810 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5811 m->md.pv_gen++;
5812 /* Instantiate the remaining NPTEPG - 1 pv entries. */
5813 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1));
5814 va_last = va + NBPDR - PAGE_SIZE;
5815 for (;;) {
5816 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
5817 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
5818 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
5819 for (field = 0; field < _NPCM; field++) {
5820 while (pc->pc_map[field]) {
5821 bit = bsfq(pc->pc_map[field]);
5822 pc->pc_map[field] &= ~(1ul << bit);
5823 pv = &pc->pc_pventry[field * 64 + bit];
5824 va += PAGE_SIZE;
5825 pv->pv_va = va;
5826 m++;
5827 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5828 ("pmap_pv_demote_pde: page %p is not managed", m));
5829 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5830 m->md.pv_gen++;
5831 if (va == va_last)
5832 goto out;
5833 }
5834 }
5835 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5836 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
5837 }
5838 out:
5839 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
5840 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5841 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
5842 }
5843 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1));
5844 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1)));
5845 }
5846
5847 #if VM_NRESERVLEVEL > 0
5848 /*
5849 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
5850 * replace the many pv entries for the 4KB page mappings by a single pv entry
5851 * for the 2MB page mapping.
5852 */
5853 static void
5854 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
5855 struct rwlock **lockp)
5856 {
5857 struct md_page *pvh;
5858 pv_entry_t pv;
5859 vm_offset_t va_last;
5860 vm_page_t m;
5861
5862 KASSERT((pa & PDRMASK) == 0,
5863 ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
5864 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
5865
5866 /*
5867 * Transfer the first page's pv entry for this mapping to the 2mpage's
5868 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
5869 * a transfer avoids the possibility that get_pv_entry() calls
5870 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
5871 * mappings that is being promoted.
5872 */
5873 m = PHYS_TO_VM_PAGE(pa);
5874 va = trunc_2mpage(va);
5875 pv = pmap_pvh_remove(&m->md, pmap, va);
5876 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
5877 pvh = pa_to_pvh(pa);
5878 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5879 pvh->pv_gen++;
5880 /* Free the remaining NPTEPG - 1 pv entries. */
5881 va_last = va + NBPDR - PAGE_SIZE;
5882 do {
5883 m++;
5884 va += PAGE_SIZE;
5885 pmap_pvh_free(&m->md, pmap, va);
5886 } while (va < va_last);
5887 }
5888 #endif /* VM_NRESERVLEVEL > 0 */
5889
5890 /*
5891 * First find and then destroy the pv entry for the specified pmap and virtual
5892 * address. This operation can be performed on pv lists for either 4KB or 2MB
5893 * page mappings.
5894 */
5895 static void
5896 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
5897 {
5898 pv_entry_t pv;
5899
5900 pv = pmap_pvh_remove(pvh, pmap, va);
5901 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
5902 free_pv_entry(pmap, pv);
5903 }
5904
5905 /*
5906 * Conditionally create the PV entry for a 4KB page mapping if the required
5907 * memory can be allocated without resorting to reclamation.
5908 */
5909 static boolean_t
5910 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
5911 struct rwlock **lockp)
5912 {
5913 pv_entry_t pv;
5914
5915 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5916 /* Pass NULL instead of the lock pointer to disable reclamation. */
5917 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
5918 pv->pv_va = va;
5919 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5920 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5921 m->md.pv_gen++;
5922 return (TRUE);
5923 } else
5924 return (FALSE);
5925 }
5926
5927 /*
5928 * Create the PV entry for a 2MB page mapping. Always returns true unless the
5929 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
5930 * false if the PV entry cannot be allocated without resorting to reclamation.
5931 */
5932 static bool
5933 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
5934 struct rwlock **lockp)
5935 {
5936 struct md_page *pvh;
5937 pv_entry_t pv;
5938 vm_paddr_t pa;
5939
5940 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5941 /* Pass NULL instead of the lock pointer to disable reclamation. */
5942 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
5943 NULL : lockp)) == NULL)
5944 return (false);
5945 pv->pv_va = va;
5946 pa = pde & PG_PS_FRAME;
5947 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
5948 pvh = pa_to_pvh(pa);
5949 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5950 pvh->pv_gen++;
5951 return (true);
5952 }
5953
5954 /*
5955 * Fills a page table page with mappings to consecutive physical pages.
5956 */
5957 static void
5958 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
5959 {
5960 pt_entry_t *pte;
5961
5962 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
5963 *pte = newpte;
5964 newpte += PAGE_SIZE;
5965 }
5966 }
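
/*
 * A sketch of what the fill above produces during demotion: 512 PTEs,
 * each mapping the next 4KB physical page, together covering the 2MB
 * frame of the old PDE with identical attribute bits.  The 4KB and 512
 * constants below are assumed here only for illustration.
 */
#if 0	/* illustrative sketch, not part of the pmap */
#include <stdint.h>

#define	SKETCH_PAGE_SIZE	4096UL
#define	SKETCH_NPTEPG		512

static void
fill_ptp_sketch(uint64_t *ptp, uint64_t first_pte)
{
	int i;

	for (i = 0; i < SKETCH_NPTEPG; i++)
		ptp[i] = first_pte + (uint64_t)i * SKETCH_PAGE_SIZE;
}
#endif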
5967
5968 /*
5969 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
5970 * mapping is invalidated.
5971 */
5972 static boolean_t
5973 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
5974 {
5975 struct rwlock *lock;
5976 boolean_t rv;
5977
5978 lock = NULL;
5979 rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
5980 if (lock != NULL)
5981 rw_wunlock(lock);
5982 return (rv);
5983 }
5984
5985 static void
5986 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
5987 {
5988 #ifdef INVARIANTS
5989 #ifdef DIAGNOSTIC
5990 pt_entry_t *xpte, *ypte;
5991
5992 for (xpte = firstpte; xpte < firstpte + NPTEPG;
5993 xpte++, newpte += PAGE_SIZE) {
5994 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) {
5995 printf("pmap_demote_pde: xpte %zd and newpte map "
5996 "different pages: found %#lx, expected %#lx\n",
5997 xpte - firstpte, *xpte, newpte);
5998 printf("page table dump\n");
5999 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++)
6000 printf("%zd %#lx\n", ypte - firstpte, *ypte);
6001 panic("firstpte");
6002 }
6003 }
6004 #else
6005 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
6006 ("pmap_demote_pde: firstpte and newpte map different physical"
6007 " addresses"));
6008 #endif
6009 #endif
6010 }
6011
6012 static void
6013 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
6014 pd_entry_t oldpde, struct rwlock **lockp)
6015 {
6016 struct spglist free;
6017 vm_offset_t sva;
6018
6019 SLIST_INIT(&free);
6020 sva = trunc_2mpage(va);
6021 pmap_remove_pde(pmap, pde, sva, &free, lockp);
6022 if ((oldpde & pmap_global_bit(pmap)) == 0)
6023 pmap_invalidate_pde_page(pmap, sva, oldpde);
6024 vm_page_free_pages_toq(&free, true);
6025 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
6026 va, pmap);
6027 }
6028
6029 static boolean_t
6030 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
6031 struct rwlock **lockp)
6032 {
6033 pd_entry_t newpde, oldpde;
6034 pt_entry_t *firstpte, newpte;
6035 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
6036 vm_paddr_t mptepa;
6037 vm_page_t mpte;
6038 int PG_PTE_CACHE;
6039 bool in_kernel;
6040
6041 PG_A = pmap_accessed_bit(pmap);
6042 PG_G = pmap_global_bit(pmap);
6043 PG_M = pmap_modified_bit(pmap);
6044 PG_RW = pmap_rw_bit(pmap);
6045 PG_V = pmap_valid_bit(pmap);
6046 PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
6047 PG_PKU_MASK = pmap_pku_mask_bit(pmap);
6048
6049 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6050 in_kernel = va >= VM_MAXUSER_ADDRESS;
6051 oldpde = *pde;
6052 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
6053 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
6054
6055 /*
6056 * Invalidate the 2MB page mapping and return "failure" if the
6057 * mapping was never accessed.
6058 */
6059 if ((oldpde & PG_A) == 0) {
6060 KASSERT((oldpde & PG_W) == 0,
6061 ("pmap_demote_pde: a wired mapping is missing PG_A"));
6062 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
6063 return (FALSE);
6064 }
6065
6066 mpte = pmap_remove_pt_page(pmap, va);
6067 if (mpte == NULL) {
6068 KASSERT((oldpde & PG_W) == 0,
6069 ("pmap_demote_pde: page table page for a wired mapping"
6070 " is missing"));
6071
6072 /*
6073 * If the page table page is missing and the mapping
6074 * is for a kernel address, the mapping must belong to
6075 * the direct map. Page table pages are preallocated
6076 * for every other part of the kernel address space,
6077 * so the direct map region is the only part of the
6078 * kernel address space that must be handled here.
6079 */
6080 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
6081 va < DMAP_MAX_ADDRESS),
6082 ("pmap_demote_pde: No saved mpte for va %#lx", va));
6083
6084 /*
6085 * If the 2MB page mapping belongs to the direct map
6086 * region of the kernel's address space, then the page
6087 * allocation request specifies the highest possible
6088 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
6089 * priority is normal.
6090 */
6091 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
6092 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED);
6093
6094 /*
6095 * If the allocation of the new page table page fails,
6096 * invalidate the 2MB page mapping and return "failure".
6097 */
6098 if (mpte == NULL) {
6099 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
6100 return (FALSE);
6101 }
6102
6103 if (!in_kernel)
6104 mpte->ref_count = NPTEPG;
6105 }
6106 mptepa = VM_PAGE_TO_PHYS(mpte);
6107 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
6108 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
6109 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
6110 ("pmap_demote_pde: oldpde is missing PG_M"));
6111 newpte = oldpde & ~PG_PS;
6112 newpte = pmap_swap_pat(pmap, newpte);
6113
6114 /*
6115 * If the PTP is not leftover from an earlier promotion or it does not
6116 * have PG_A set in every PTE, then fill it. The new PTEs will all
6117 * have PG_A set.
6118 */
6119 if (!vm_page_all_valid(mpte))
6120 pmap_fill_ptp(firstpte, newpte);
6121
6122 pmap_demote_pde_check(firstpte, newpte);
6123
6124 /*
6125 * If the mapping has changed attributes, update the PTEs.
6126 */
6127 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
6128 pmap_fill_ptp(firstpte, newpte);
6129
6130 /*
6131 * The spare PV entries must be reserved prior to demoting the
6132 * mapping, that is, prior to changing the PDE. Otherwise, the state
6133 * of the PDE and the PV lists will be inconsistent, which can result
6134 * in reclaim_pv_chunk() attempting to remove a PV entry from the
6135 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
6136 * PV entry for the 2MB page mapping that is being demoted.
6137 */
6138 if ((oldpde & PG_MANAGED) != 0)
6139 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
6140
6141 /*
6142 * Demote the mapping. This pmap is locked. The old PDE has
6143 * PG_A set. If the old PDE has PG_RW set, it also has PG_M
6144 * set. Thus, there is no danger of a race with another
6145 * processor changing the setting of PG_A and/or PG_M between
6146 * the read above and the store below.
6147 */
6148 if (workaround_erratum383)
6149 pmap_update_pde(pmap, va, pde, newpde);
6150 else
6151 pde_store(pde, newpde);
6152
6153 /*
6154 * Invalidate a stale recursive mapping of the page table page.
6155 */
6156 if (in_kernel)
6157 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
6158
6159 /*
6160 * Demote the PV entry.
6161 */
6162 if ((oldpde & PG_MANAGED) != 0)
6163 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
6164
6165 counter_u64_add(pmap_pde_demotions, 1);
6166 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
6167 va, pmap);
6168 return (TRUE);
6169 }
6170
6171 /*
6172 * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
6173 */
6174 static void
6175 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
6176 {
6177 pd_entry_t newpde;
6178 vm_paddr_t mptepa;
6179 vm_page_t mpte;
6180
6181 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
6182 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6183 mpte = pmap_remove_pt_page(pmap, va);
6184 if (mpte == NULL)
6185 panic("pmap_remove_kernel_pde: Missing pt page.");
6186
6187 mptepa = VM_PAGE_TO_PHYS(mpte);
6188 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
6189
6190 /*
6191 * If this page table page was unmapped by a promotion, then it
6192 * contains valid mappings. Zero it to invalidate those mappings.
6193 */
6194 if (vm_page_any_valid(mpte))
6195 pagezero((void *)PHYS_TO_DMAP(mptepa));
6196
6197 /*
6198 * Demote the mapping.
6199 */
6200 if (workaround_erratum383)
6201 pmap_update_pde(pmap, va, pde, newpde);
6202 else
6203 pde_store(pde, newpde);
6204
6205 /*
6206 * Invalidate a stale recursive mapping of the page table page.
6207 */
6208 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
6209 }
6210
6211 /*
6212 * pmap_remove_pde: do the things to unmap a superpage in a process
6213 */
6214 static int
6215 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
6216 struct spglist *free, struct rwlock **lockp)
6217 {
6218 struct md_page *pvh;
6219 pd_entry_t oldpde;
6220 vm_offset_t eva, va;
6221 vm_page_t m, mpte;
6222 pt_entry_t PG_G, PG_A, PG_M, PG_RW;
6223
6224 PG_G = pmap_global_bit(pmap);
6225 PG_A = pmap_accessed_bit(pmap);
6226 PG_M = pmap_modified_bit(pmap);
6227 PG_RW = pmap_rw_bit(pmap);
6228
6229 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6230 KASSERT((sva & PDRMASK) == 0,
6231 ("pmap_remove_pde: sva is not 2mpage aligned"));
6232 oldpde = pte_load_clear(pdq);
6233 if (oldpde & PG_W)
6234 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
6235 if ((oldpde & PG_G) != 0)
6236 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
6237 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
6238 if (oldpde & PG_MANAGED) {
6239 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
6240 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
6241 pmap_pvh_free(pvh, pmap, sva);
6242 eva = sva + NBPDR;
6243 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
6244 va < eva; va += PAGE_SIZE, m++) {
6245 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
6246 vm_page_dirty(m);
6247 if (oldpde & PG_A)
6248 vm_page_aflag_set(m, PGA_REFERENCED);
6249 if (TAILQ_EMPTY(&m->md.pv_list) &&
6250 TAILQ_EMPTY(&pvh->pv_list))
6251 vm_page_aflag_clear(m, PGA_WRITEABLE);
6252 pmap_delayed_invl_page(m);
6253 }
6254 }
6255 if (pmap == kernel_pmap) {
6256 pmap_remove_kernel_pde(pmap, pdq, sva);
6257 } else {
6258 mpte = pmap_remove_pt_page(pmap, sva);
6259 if (mpte != NULL) {
6260 KASSERT(vm_page_any_valid(mpte),
6261 ("pmap_remove_pde: pte page not promoted"));
6262 pmap_pt_page_count_adj(pmap, -1);
6263 KASSERT(mpte->ref_count == NPTEPG,
6264 ("pmap_remove_pde: pte page ref count error"));
6265 mpte->ref_count = 0;
6266 pmap_add_delayed_free_list(mpte, free, FALSE);
6267 }
6268 }
6269 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
6270 }
6271
6272 /*
6273 * pmap_remove_pte: do the things to unmap a page in a process
6274 */
6275 static int
6276 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
6277 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
6278 {
6279 struct md_page *pvh;
6280 pt_entry_t oldpte, PG_A, PG_M, PG_RW;
6281 vm_page_t m;
6282
6283 PG_A = pmap_accessed_bit(pmap);
6284 PG_M = pmap_modified_bit(pmap);
6285 PG_RW = pmap_rw_bit(pmap);
6286
6287 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6288 oldpte = pte_load_clear(ptq);
6289 if (oldpte & PG_W)
6290 pmap->pm_stats.wired_count -= 1;
6291 pmap_resident_count_adj(pmap, -1);
6292 if (oldpte & PG_MANAGED) {
6293 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
6294 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6295 vm_page_dirty(m);
6296 if (oldpte & PG_A)
6297 vm_page_aflag_set(m, PGA_REFERENCED);
6298 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
6299 pmap_pvh_free(&m->md, pmap, va);
6300 if (TAILQ_EMPTY(&m->md.pv_list) &&
6301 (m->flags & PG_FICTITIOUS) == 0) {
6302 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6303 if (TAILQ_EMPTY(&pvh->pv_list))
6304 vm_page_aflag_clear(m, PGA_WRITEABLE);
6305 }
6306 pmap_delayed_invl_page(m);
6307 }
6308 return (pmap_unuse_pt(pmap, va, ptepde, free));
6309 }
6310
6311 /*
6312 * Remove a single page from a process address space
6313 */
6314 static void
6315 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
6316 struct spglist *free)
6317 {
6318 struct rwlock *lock;
6319 pt_entry_t *pte, PG_V;
6320
6321 PG_V = pmap_valid_bit(pmap);
6322 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6323 if ((*pde & PG_V) == 0)
6324 return;
6325 pte = pmap_pde_to_pte(pde, va);
6326 if ((*pte & PG_V) == 0)
6327 return;
6328 lock = NULL;
6329 pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
6330 if (lock != NULL)
6331 rw_wunlock(lock);
6332 pmap_invalidate_page(pmap, va);
6333 }
6334
6335 /*
6336 * Removes the specified range of addresses from the page table page.
6337 */
6338 static bool
6339 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
6340 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
6341 {
6342 pt_entry_t PG_G, *pte;
6343 vm_offset_t va;
6344 bool anyvalid;
6345
6346 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6347 PG_G = pmap_global_bit(pmap);
6348 anyvalid = false;
6349 va = eva;
6350 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
6351 sva += PAGE_SIZE) {
6352 if (*pte == 0) {
6353 if (va != eva) {
6354 pmap_invalidate_range(pmap, va, sva);
6355 va = eva;
6356 }
6357 continue;
6358 }
6359 if ((*pte & PG_G) == 0)
6360 anyvalid = true;
6361 else if (va == eva)
6362 va = sva;
6363 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
6364 sva += PAGE_SIZE;
6365 break;
6366 }
6367 }
6368 if (va != eva)
6369 pmap_invalidate_range(pmap, va, sva);
6370 return (anyvalid);
6371 }
6372
6373 static void
6374 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
6375 {
6376 struct rwlock *lock;
6377 vm_page_t mt;
6378 vm_offset_t va_next;
6379 pml5_entry_t *pml5e;
6380 pml4_entry_t *pml4e;
6381 pdp_entry_t *pdpe;
6382 pd_entry_t ptpaddr, *pde;
6383 pt_entry_t PG_G, PG_V;
6384 struct spglist free;
6385 int anyvalid;
6386
6387 PG_G = pmap_global_bit(pmap);
6388 PG_V = pmap_valid_bit(pmap);
6389
6390 /*
6391 * If there are no resident pages besides the top level page
6392 * table page(s), there is nothing to do. Kernel pmap always
6393 * accounts whole preloaded area as resident, which makes its
6394 * resident count > 2.
6395 * Perform an unsynchronized read. This is, however, safe.
6396 */
6397 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ?
6398 1 : 0))
6399 return;
6400
6401 anyvalid = 0;
6402 SLIST_INIT(&free);
6403
6404 pmap_delayed_invl_start();
6405 PMAP_LOCK(pmap);
6406 if (map_delete)
6407 pmap_pkru_on_remove(pmap, sva, eva);
6408
6409 /*
6410 * Special handling for removing a single page: this is a very
6411 * common operation, so it is worth short-circuiting the general
6412 * code below.
6413 */
6414 if (sva + PAGE_SIZE == eva) {
6415 pde = pmap_pde(pmap, sva);
6416 if (pde && (*pde & PG_PS) == 0) {
6417 pmap_remove_page(pmap, sva, pde, &free);
6418 goto out;
6419 }
6420 }
6421
6422 lock = NULL;
6423 for (; sva < eva; sva = va_next) {
6424 if (pmap->pm_stats.resident_count == 0)
6425 break;
6426
6427 if (pmap_is_la57(pmap)) {
6428 pml5e = pmap_pml5e(pmap, sva);
6429 if ((*pml5e & PG_V) == 0) {
6430 va_next = (sva + NBPML5) & ~PML5MASK;
6431 if (va_next < sva)
6432 va_next = eva;
6433 continue;
6434 }
6435 pml4e = pmap_pml5e_to_pml4e(pml5e, sva);
6436 } else {
6437 pml4e = pmap_pml4e(pmap, sva);
6438 }
6439 if ((*pml4e & PG_V) == 0) {
6440 va_next = (sva + NBPML4) & ~PML4MASK;
6441 if (va_next < sva)
6442 va_next = eva;
6443 continue;
6444 }
6445
6446 va_next = (sva + NBPDP) & ~PDPMASK;
6447 if (va_next < sva)
6448 va_next = eva;
6449 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6450 if ((*pdpe & PG_V) == 0)
6451 continue;
6452 if ((*pdpe & PG_PS) != 0) {
6453 KASSERT(va_next <= eva,
6454 ("partial update of non-transparent 1G mapping "
6455 "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
6456 *pdpe, sva, eva, va_next));
6457 MPASS(pmap != kernel_pmap); /* XXXKIB */
6458 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
6459 anyvalid = 1;
6460 *pdpe = 0;
6461 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE);
6462 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
6463 pmap_unwire_ptp(pmap, sva, mt, &free);
6464 continue;
6465 }
6466
6467 /*
6468 * Calculate index for next page table.
6469 */
6470 va_next = (sva + NBPDR) & ~PDRMASK;
6471 if (va_next < sva)
6472 va_next = eva;
6473
6474 pde = pmap_pdpe_to_pde(pdpe, sva);
6475 ptpaddr = *pde;
6476
6477 /*
6478 * Weed out invalid mappings.
6479 */
6480 if (ptpaddr == 0)
6481 continue;
6482
6483 /*
6484 * Check for large page.
6485 */
6486 if ((ptpaddr & PG_PS) != 0) {
6487 /*
6488 * Are we removing the entire large page? If not,
6489 * demote the mapping and fall through.
6490 */
6491 if (sva + NBPDR == va_next && eva >= va_next) {
6492 /*
6493 * The TLB entry for a PG_G mapping is
6494 * invalidated by pmap_remove_pde().
6495 */
6496 if ((ptpaddr & PG_G) == 0)
6497 anyvalid = 1;
6498 pmap_remove_pde(pmap, pde, sva, &free, &lock);
6499 continue;
6500 } else if (!pmap_demote_pde_locked(pmap, pde, sva,
6501 &lock)) {
6502 /* The large page mapping was destroyed. */
6503 continue;
6504 } else
6505 ptpaddr = *pde;
6506 }
6507
6508 /*
6509 * Limit our scan to either the end of the va represented
6510 * by the current page table page, or to the end of the
6511 * range being removed.
6512 */
6513 if (va_next > eva)
6514 va_next = eva;
6515
6516 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
6517 anyvalid = 1;
6518 }
6519 if (lock != NULL)
6520 rw_wunlock(lock);
6521 out:
6522 if (anyvalid)
6523 pmap_invalidate_all(pmap);
6524 PMAP_UNLOCK(pmap);
6525 pmap_delayed_invl_finish();
6526 vm_page_free_pages_toq(&free, true);
6527 }
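
/*
 * A sketch of the per-level stepping pattern used in the loop above:
 * va_next is rounded up to the next boundary of the current paging level,
 * and the "va_next < sva" test catches wraparound at the top of the
 * address space.  The 2MB and 1GB sizes are assumed amd64 values, shown
 * only for illustration.
 */
#if 0	/* illustrative sketch, not part of the pmap */
#include <stdint.h>

#define	SKETCH_NBPDR	(1UL << 21)	/* 2MB, one PDE */
#define	SKETCH_NBPDP	(1UL << 30)	/* 1GB, one PDPE */

static uint64_t
next_boundary(uint64_t sva, uint64_t eva, uint64_t levelsize)
{
	uint64_t va_next;

	va_next = (sva + levelsize) & ~(levelsize - 1);
	if (va_next < sva)		/* wrapped past the end of VA */
		va_next = eva;
	return (va_next);
}
#endif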
6528
6529 /*
6530 * Remove the given range of addresses from the specified map.
6531 *
6532 * It is assumed that the start and end are properly
6533 * rounded to the page size.
6534 */
6535 void
6536 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6537 {
6538 pmap_remove1(pmap, sva, eva, false);
6539 }
6540
6541 /*
6542 * Remove the given range of addresses as part of a logical unmap
6543 * operation. This has the effect of calling pmap_remove(), but
6544 * also clears any metadata that should persist for the lifetime
6545 * of a logical mapping.
6546 */
6547 void
6548 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6549 {
6550 pmap_remove1(pmap, sva, eva, true);
6551 }
6552
6553 /*
6554 * Routine: pmap_remove_all
6555 * Function:
6556 * Removes this physical page from
6557 * all physical maps in which it resides.
6558 * Reflects back modify bits to the pager.
6559 *
6560 * Notes:
6561 * Original versions of this routine were very
6562 * inefficient because they iteratively called
6563 * pmap_remove (slow...)
6564 */
6565
6566 void
6567 pmap_remove_all(vm_page_t m)
6568 {
6569 struct md_page *pvh;
6570 pv_entry_t pv;
6571 pmap_t pmap;
6572 struct rwlock *lock;
6573 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
6574 pd_entry_t *pde;
6575 vm_offset_t va;
6576 struct spglist free;
6577 int pvh_gen, md_gen;
6578
6579 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6580 ("pmap_remove_all: page %p is not managed", m));
6581 SLIST_INIT(&free);
6582 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6583 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6584 pa_to_pvh(VM_PAGE_TO_PHYS(m));
6585 rw_wlock(lock);
6586 retry:
6587 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
6588 pmap = PV_PMAP(pv);
6589 if (!PMAP_TRYLOCK(pmap)) {
6590 pvh_gen = pvh->pv_gen;
6591 rw_wunlock(lock);
6592 PMAP_LOCK(pmap);
6593 rw_wlock(lock);
6594 if (pvh_gen != pvh->pv_gen) {
6595 PMAP_UNLOCK(pmap);
6596 goto retry;
6597 }
6598 }
6599 va = pv->pv_va;
6600 pde = pmap_pde(pmap, va);
6601 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
6602 PMAP_UNLOCK(pmap);
6603 }
6604 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
6605 pmap = PV_PMAP(pv);
6606 if (!PMAP_TRYLOCK(pmap)) {
6607 pvh_gen = pvh->pv_gen;
6608 md_gen = m->md.pv_gen;
6609 rw_wunlock(lock);
6610 PMAP_LOCK(pmap);
6611 rw_wlock(lock);
6612 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6613 PMAP_UNLOCK(pmap);
6614 goto retry;
6615 }
6616 }
6617 PG_A = pmap_accessed_bit(pmap);
6618 PG_M = pmap_modified_bit(pmap);
6619 PG_RW = pmap_rw_bit(pmap);
6620 pmap_resident_count_adj(pmap, -1);
6621 pde = pmap_pde(pmap, pv->pv_va);
6622 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
6623 " a 2mpage in page %p's pv list", m));
6624 pte = pmap_pde_to_pte(pde, pv->pv_va);
6625 tpte = pte_load_clear(pte);
6626 if (tpte & PG_W)
6627 pmap->pm_stats.wired_count--;
6628 if (tpte & PG_A)
6629 vm_page_aflag_set(m, PGA_REFERENCED);
6630
6631 /*
6632 * Update the vm_page_t clean and reference bits.
6633 */
6634 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6635 vm_page_dirty(m);
6636 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
6637 pmap_invalidate_page(pmap, pv->pv_va);
6638 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6639 m->md.pv_gen++;
6640 free_pv_entry(pmap, pv);
6641 PMAP_UNLOCK(pmap);
6642 }
6643 vm_page_aflag_clear(m, PGA_WRITEABLE);
6644 rw_wunlock(lock);
6645 pmap_delayed_invl_wait(m);
6646 vm_page_free_pages_toq(&free, true);
6647 }
6648
6649 /*
6650 * pmap_protect_pde: do the things to protect a 2mpage in a process
6651 */
6652 static boolean_t
6653 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
6654 {
6655 pd_entry_t newpde, oldpde;
6656 vm_page_t m, mt;
6657 boolean_t anychanged;
6658 pt_entry_t PG_G, PG_M, PG_RW;
6659
6660 PG_G = pmap_global_bit(pmap);
6661 PG_M = pmap_modified_bit(pmap);
6662 PG_RW = pmap_rw_bit(pmap);
6663
6664 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6665 KASSERT((sva & PDRMASK) == 0,
6666 ("pmap_protect_pde: sva is not 2mpage aligned"));
6667 anychanged = FALSE;
6668 retry:
6669 oldpde = newpde = *pde;
6670 if ((prot & VM_PROT_WRITE) == 0) {
6671 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
6672 (PG_MANAGED | PG_M | PG_RW)) {
6673 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
6674 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6675 vm_page_dirty(mt);
6676 }
6677 newpde &= ~(PG_RW | PG_M);
6678 }
6679 if ((prot & VM_PROT_EXECUTE) == 0)
6680 newpde |= pg_nx;
6681 if (newpde != oldpde) {
6682 /*
6683 * As an optimization to future operations on this PDE, clear
6684 * PG_PROMOTED. The impending invalidation will remove any
6685 * lingering 4KB page mappings from the TLB.
6686 */
6687 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
6688 goto retry;
6689 if ((oldpde & PG_G) != 0)
6690 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
6691 else
6692 anychanged = TRUE;
6693 }
6694 return (anychanged);
6695 }
6696
6697 /*
6698 * Set the physical protection on the
6699 * specified range of this map as requested.
6700 */
6701 void
6702 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
6703 {
6704 vm_page_t m;
6705 vm_offset_t va_next;
6706 pml4_entry_t *pml4e;
6707 pdp_entry_t *pdpe;
6708 pd_entry_t ptpaddr, *pde;
6709 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
6710 pt_entry_t obits, pbits;
6711 boolean_t anychanged;
6712
6713 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
6714 if (prot == VM_PROT_NONE) {
6715 pmap_remove(pmap, sva, eva);
6716 return;
6717 }
6718
6719 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
6720 (VM_PROT_WRITE|VM_PROT_EXECUTE))
6721 return;
6722
6723 PG_G = pmap_global_bit(pmap);
6724 PG_M = pmap_modified_bit(pmap);
6725 PG_V = pmap_valid_bit(pmap);
6726 PG_RW = pmap_rw_bit(pmap);
6727 anychanged = FALSE;
6728
6729 /*
6730 * Although this function delays and batches the invalidation
6731 * of stale TLB entries, it does not need to call
6732 * pmap_delayed_invl_start() and
6733 * pmap_delayed_invl_finish(), because it does not
6734 * ordinarily destroy mappings. Stale TLB entries from
6735 * protection-only changes need only be invalidated before the
6736 * pmap lock is released, because protection-only changes do
6737 * not destroy PV entries. Even operations that iterate over
6738 * a physical page's PV list of mappings, like
6739 * pmap_remove_write(), acquire the pmap lock for each
6740 * mapping. Consequently, for protection-only changes, the
6741 * pmap lock suffices to synchronize both page table and TLB
6742 * updates.
6743 *
6744 * This function only destroys a mapping if pmap_demote_pde()
6745 * fails. In that case, stale TLB entries are immediately
6746 * invalidated.
6747 */
6748
6749 PMAP_LOCK(pmap);
6750 for (; sva < eva; sva = va_next) {
6751 pml4e = pmap_pml4e(pmap, sva);
6752 if (pml4e == NULL || (*pml4e & PG_V) == 0) {
6753 va_next = (sva + NBPML4) & ~PML4MASK;
6754 if (va_next < sva)
6755 va_next = eva;
6756 continue;
6757 }
6758
6759 va_next = (sva + NBPDP) & ~PDPMASK;
6760 if (va_next < sva)
6761 va_next = eva;
6762 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6763 if ((*pdpe & PG_V) == 0)
6764 continue;
6765 if ((*pdpe & PG_PS) != 0) {
6766 KASSERT(va_next <= eva,
6767 ("partial update of non-transparent 1G mapping "
6768 "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
6769 *pdpe, sva, eva, va_next));
6770 retry_pdpe:
6771 obits = pbits = *pdpe;
6772 MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
6773 MPASS(pmap != kernel_pmap); /* XXXKIB */
6774 if ((prot & VM_PROT_WRITE) == 0)
6775 pbits &= ~(PG_RW | PG_M);
6776 if ((prot & VM_PROT_EXECUTE) == 0)
6777 pbits |= pg_nx;
6778
6779 if (pbits != obits) {
6780 if (!atomic_cmpset_long(pdpe, obits, pbits))
6781 /* PG_PS cannot be cleared under us, */
6782 goto retry_pdpe;
6783 anychanged = TRUE;
6784 }
6785 continue;
6786 }
6787
6788 va_next = (sva + NBPDR) & ~PDRMASK;
6789 if (va_next < sva)
6790 va_next = eva;
6791
6792 pde = pmap_pdpe_to_pde(pdpe, sva);
6793 ptpaddr = *pde;
6794
6795 /*
6796 * Weed out invalid mappings.
6797 */
6798 if (ptpaddr == 0)
6799 continue;
6800
6801 /*
6802 * Check for large page.
6803 */
6804 if ((ptpaddr & PG_PS) != 0) {
6805 /*
6806 * Are we protecting the entire large page? If not,
6807 * demote the mapping and fall through.
6808 */
6809 if (sva + NBPDR == va_next && eva >= va_next) {
6810 /*
6811 * The TLB entry for a PG_G mapping is
6812 * invalidated by pmap_protect_pde().
6813 */
6814 if (pmap_protect_pde(pmap, pde, sva, prot))
6815 anychanged = TRUE;
6816 continue;
6817 } else if (!pmap_demote_pde(pmap, pde, sva)) {
6818 /*
6819 * The large page mapping was destroyed.
6820 */
6821 continue;
6822 }
6823 }
6824
6825 if (va_next > eva)
6826 va_next = eva;
6827
6828 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6829 sva += PAGE_SIZE) {
6830 retry:
6831 obits = pbits = *pte;
6832 if ((pbits & PG_V) == 0)
6833 continue;
6834
6835 if ((prot & VM_PROT_WRITE) == 0) {
6836 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
6837 (PG_MANAGED | PG_M | PG_RW)) {
6838 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
6839 vm_page_dirty(m);
6840 }
6841 pbits &= ~(PG_RW | PG_M);
6842 }
6843 if ((prot & VM_PROT_EXECUTE) == 0)
6844 pbits |= pg_nx;
6845
6846 if (pbits != obits) {
6847 if (!atomic_cmpset_long(pte, obits, pbits))
6848 goto retry;
6849 if (obits & PG_G)
6850 pmap_invalidate_page(pmap, sva);
6851 else
6852 anychanged = TRUE;
6853 }
6854 }
6855 }
6856 if (anychanged)
6857 pmap_invalidate_all(pmap);
6858 PMAP_UNLOCK(pmap);
6859 }
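
/*
 * The PTE updates above follow a compare-and-swap retry pattern: read the
 * entry, compute the new bits, and retry if another CPU changed the entry
 * in the meantime.  A generic sketch of that pattern, using a C11 atomic
 * in place of a real PTE and illustrative RW/M bit positions:
 */
#if 0	/* illustrative sketch, not part of the pmap */
#include <stdatomic.h>
#include <stdint.h>

#define	SKETCH_PG_RW	(1UL << 1)
#define	SKETCH_PG_M	(1UL << 6)

static void
write_protect_sketch(_Atomic uint64_t *pte)
{
	uint64_t obits, pbits;

	obits = atomic_load(pte);
	do {
		pbits = obits & ~(SKETCH_PG_RW | SKETCH_PG_M);
	} while (!atomic_compare_exchange_weak(pte, &obits, pbits));
}
#endif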
6860
6861 static bool
6862 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde)
6863 {
6864
6865 if (pmap->pm_type != PT_EPT)
6866 return (false);
6867 return ((pde & EPT_PG_EXECUTE) != 0);
6868 }
6869
6870 #if VM_NRESERVLEVEL > 0
6871 /*
6872 * Tries to promote the 512, contiguous 4KB page mappings that are within a
6873 * single page table page (PTP) to a single 2MB page mapping. For promotion
6874 * to occur, two conditions must be met: (1) the 4KB page mappings must map
6875 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
6876 * identical characteristics.
6877 */
6878 static bool
6879 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte,
6880 struct rwlock **lockp)
6881 {
6882 pd_entry_t newpde;
6883 pt_entry_t *firstpte, oldpte, pa, *pte;
6884 pt_entry_t allpte_PG_A, PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
6885 int PG_PTE_CACHE;
6886
6887 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6888 if (!pmap_ps_enabled(pmap))
6889 return (false);
6890
6891 PG_A = pmap_accessed_bit(pmap);
6892 PG_G = pmap_global_bit(pmap);
6893 PG_M = pmap_modified_bit(pmap);
6894 PG_V = pmap_valid_bit(pmap);
6895 PG_RW = pmap_rw_bit(pmap);
6896 PG_PKU_MASK = pmap_pku_mask_bit(pmap);
6897 PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
6898
6899 /*
6900 * Examine the first PTE in the specified PTP. Abort if this PTE is
6901 * ineligible for promotion due to hardware errata, invalid, or does
6902 * not map the first 4KB physical page within a 2MB page.
6903 */
6904 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
6905 newpde = *firstpte;
6906 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde)))
6907 return (false);
6908 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) {
6909 counter_u64_add(pmap_pde_p_failures, 1);
6910 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
6911 " in pmap %p", va, pmap);
6912 return (false);
6913 }
6914
6915 /*
6916 * Both here and in the below "for" loop, to allow for repromotion
6917 * after MADV_FREE, conditionally write protect a clean PTE before
6918 * possibly aborting the promotion due to other PTE attributes. Why?
6919 * Suppose that MADV_FREE is applied to a part of a superpage, the
6920 * address range [S, E). pmap_advise() will demote the superpage
6921 * mapping, destroy the 4KB page mapping at the end of [S, E), and
6922 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later,
6923 * imagine that the memory in [S, E) is recycled, but the last 4KB
6924 * page in [S, E) is not the last to be rewritten, or simply accessed.
6925 * In other words, there is still a 4KB page in [S, E), call it P,
6926 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless
6927 * we write protect P before aborting the promotion, if and when P is
6928 * finally rewritten, there won't be a page fault to trigger
6929 * repromotion.
6930 */
6931 setpde:
6932 if ((newpde & (PG_M | PG_RW)) == PG_RW) {
6933 /*
6934 * When PG_M is already clear, PG_RW can be cleared without
6935 * a TLB invalidation.
6936 */
6937 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW))
6938 goto setpde;
6939 newpde &= ~PG_RW;
6940 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
6941 " in pmap %p", va & ~PDRMASK, pmap);
6942 }
6943
6944 /*
6945 * Examine each of the other PTEs in the specified PTP. Abort if this
6946 * PTE maps an unexpected 4KB physical page or does not have identical
6947 * characteristics to the first PTE.
6948 */
6949 allpte_PG_A = newpde & PG_A;
6950 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE;
6951 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
6952 oldpte = *pte;
6953 if ((oldpte & (PG_FRAME | PG_V)) != pa) {
6954 counter_u64_add(pmap_pde_p_failures, 1);
6955 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
6956 " in pmap %p", va, pmap);
6957 return (false);
6958 }
6959 setpte:
6960 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
6961 /*
6962 * When PG_M is already clear, PG_RW can be cleared
6963 * without a TLB invalidation.
6964 */
6965 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW))
6966 goto setpte;
6967 oldpte &= ~PG_RW;
6968 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
6969 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
6970 (va & ~PDRMASK), pmap);
6971 }
6972 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
6973 counter_u64_add(pmap_pde_p_failures, 1);
6974 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
6975 " in pmap %p", va, pmap);
6976 return (false);
6977 }
6978 allpte_PG_A &= oldpte;
6979 pa -= PAGE_SIZE;
6980 }
6981
6982 /*
6983 * Unless all PTEs have PG_A set, clear it from the superpage mapping,
6984 * so that promotions triggered by speculative mappings, such as
6985 * pmap_enter_quick(), don't automatically mark the underlying pages
6986 * as referenced.
6987 */
6988 newpde &= ~PG_A | allpte_PG_A;
6989
6990 /*
6991 * EPT PTEs with PG_M set and PG_A clear are not supported by early
6992 * MMUs supporting EPT.
6993 */
6994 KASSERT((newpde & PG_A) != 0 || safe_to_clear_referenced(pmap, newpde),
6995 ("unsupported EPT PTE"));
6996
6997 /*
6998 * Save the PTP in its current state until the PDE mapping the
6999 * superpage is demoted by pmap_demote_pde() or destroyed by
7000 * pmap_remove_pde(). If PG_A is not set in every PTE, then request
7001 * that the PTP be refilled on demotion.
7002 */
7003 if (mpte == NULL)
7004 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7005 KASSERT(mpte >= vm_page_array &&
7006 mpte < &vm_page_array[vm_page_array_size],
7007 ("pmap_promote_pde: page table page is out of range"));
7008 KASSERT(mpte->pindex == pmap_pde_pindex(va),
7009 ("pmap_promote_pde: page table page's pindex is wrong "
7010 "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
7011 mpte, mpte->pindex, va, pmap_pde_pindex(va)));
7012 if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) {
7013 counter_u64_add(pmap_pde_p_failures, 1);
7014 CTR2(KTR_PMAP,
7015 "pmap_promote_pde: failure for va %#lx in pmap %p", va,
7016 pmap);
7017 return (false);
7018 }
7019
7020 /*
7021 * Promote the pv entries.
7022 */
7023 if ((newpde & PG_MANAGED) != 0)
7024 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
7025
7026 /*
7027 * Propagate the PAT index to its proper position.
7028 */
7029 newpde = pmap_swap_pat(pmap, newpde);
7030
7031 /*
7032 * Map the superpage.
7033 */
7034 if (workaround_erratum383)
7035 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
7036 else
7037 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
7038
7039 counter_u64_add(pmap_pde_promotions, 1);
7040 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
7041 " in pmap %p", va, pmap);
7042 return (true);
7043 }
7044 #endif /* VM_NRESERVLEVEL > 0 */
7045
7046 static int
7047 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
7048 int psind)
7049 {
7050 vm_page_t mp;
7051 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V;
7052
7053 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7054 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0,
7055 ("psind %d unexpected", psind));
7056 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0,
7057 ("unaligned phys address %#lx newpte %#lx psind %d",
7058 newpte & PG_FRAME, newpte, psind));
7059 KASSERT((va & (pagesizes[psind] - 1)) == 0,
7060 ("unaligned va %#lx psind %d", va, psind));
7061 KASSERT(va < VM_MAXUSER_ADDRESS,
7062 ("kernel mode non-transparent superpage")); /* XXXKIB */
7063 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS,
7064 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */
7065
7066 PG_V = pmap_valid_bit(pmap);
7067
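/*
 * Restart point: if a page table page allocation below fails and
 * sleeping is allowed, the pmap lock is dropped, the thread waits for
 * free pages, and the lookup is redone from here.
 */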
7068 restart:
7069 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind]))
7070 return (KERN_PROTECTION_FAILURE);
7071 pten = newpte;
7072 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
7073 pten |= pmap_pkru_get(pmap, va);
7074
7075 if (psind == 2) { /* 1G */
7076 pml4e = pmap_pml4e(pmap, va);
7077 if (pml4e == NULL || (*pml4e & PG_V) == 0) {
7078 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va),
7079 NULL, va);
7080 if (mp == NULL)
7081 goto allocf;
7082 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
7083 pdpe = &pdpe[pmap_pdpe_index(va)];
7084 origpte = *pdpe;
7085 MPASS(origpte == 0);
7086 } else {
7087 pdpe = pmap_pml4e_to_pdpe(pml4e, va);
7088 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va));
7089 origpte = *pdpe;
7090 if ((origpte & PG_V) == 0) {
7091 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
7092 mp->ref_count++;
7093 }
7094 }
7095 *pdpe = pten;
7096 } else /* (psind == 1) */ { /* 2M */
7097 pde = pmap_pde(pmap, va);
7098 if (pde == NULL) {
7099 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va),
7100 NULL, va);
7101 if (mp == NULL)
7102 goto allocf;
7103 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
7104 pde = &pde[pmap_pde_index(va)];
7105 origpte = *pde;
7106 MPASS(origpte == 0);
7107 } else {
7108 origpte = *pde;
7109 if ((origpte & PG_V) == 0) {
7110 pdpe = pmap_pdpe(pmap, va);
7111 MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
7112 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
7113 mp->ref_count++;
7114 }
7115 }
7116 *pde = pten;
7117 }
7118 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
7119 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)),
7120 ("va %#lx changing %s phys page origpte %#lx pten %#lx",
7121 va, psind == 2 ? "1G" : "2M", origpte, pten));
7122 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0)
7123 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
7124 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0)
7125 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
7126 if ((origpte & PG_V) == 0)
7127 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE);
7128
7129 return (KERN_SUCCESS);
7130
7131 allocf:
7132 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
7133 return (KERN_RESOURCE_SHORTAGE);
7134 PMAP_UNLOCK(pmap);
7135 vm_wait(NULL);
7136 PMAP_LOCK(pmap);
7137 goto restart;
7138 }
7139
7140 /*
7141 * Insert the given physical page (p) at
7142 * the specified virtual address (v) in the
7143 * target physical map with the protection requested.
7144 *
7145 * If specified, the page will be wired down, meaning
7146 * that the related pte can not be reclaimed.
7147 *
7148 * NB: This is the only routine which MAY NOT lazy-evaluate
7149 * or lose information. That is, this routine must actually
7150 * insert this page into the given map NOW.
7151 *
7152 * When destroying both a page table and PV entry, this function
7153 * performs the TLB invalidation before releasing the PV list
7154 * lock, so we do not need pmap_delayed_invl_page() calls here.
7155 */
7156 int
7157 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
7158 u_int flags, int8_t psind)
7159 {
7160 struct rwlock *lock;
7161 pd_entry_t *pde;
7162 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
7163 pt_entry_t newpte, origpte;
7164 pv_entry_t pv;
7165 vm_paddr_t opa, pa;
7166 vm_page_t mpte, om;
7167 int rv;
7168 boolean_t nosleep;
7169
7170 PG_A = pmap_accessed_bit(pmap);
7171 PG_G = pmap_global_bit(pmap);
7172 PG_M = pmap_modified_bit(pmap);
7173 PG_V = pmap_valid_bit(pmap);
7174 PG_RW = pmap_rw_bit(pmap);
7175
7176 va = trunc_page(va);
7177 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
7178 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
7179 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
7180 va));
7181 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
7182 ("pmap_enter: managed mapping within the clean submap"));
7183 if ((m->oflags & VPO_UNMANAGED) == 0)
7184 VM_PAGE_OBJECT_BUSY_ASSERT(m);
7185 KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
7186 ("pmap_enter: flags %u has reserved bits set", flags));
7187 pa = VM_PAGE_TO_PHYS(m);
7188 newpte = (pt_entry_t)(pa | PG_A | PG_V);
7189 if ((flags & VM_PROT_WRITE) != 0)
7190 newpte |= PG_M;
7191 if ((prot & VM_PROT_WRITE) != 0)
7192 newpte |= PG_RW;
7193 KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
7194 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
7195 if ((prot & VM_PROT_EXECUTE) == 0)
7196 newpte |= pg_nx;
7197 if ((flags & PMAP_ENTER_WIRED) != 0)
7198 newpte |= PG_W;
7199 if (va < VM_MAXUSER_ADDRESS)
7200 newpte |= PG_U;
7201 if (pmap == kernel_pmap)
7202 newpte |= PG_G;
7203 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
7204
7205 /*
7206 * Set modified bit gratuitously for writeable mappings if
7207 * the page is unmanaged. We do not want to take a fault
7208 * to do the dirty bit accounting for these mappings.
7209 */
7210 if ((m->oflags & VPO_UNMANAGED) != 0) {
7211 if ((newpte & PG_RW) != 0)
7212 newpte |= PG_M;
7213 } else
7214 newpte |= PG_MANAGED;
7215
7216 lock = NULL;
7217 PMAP_LOCK(pmap);
7218 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
7219 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
7220 ("managed largepage va %#lx flags %#x", va, flags));
7221 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags,
7222 psind);
7223 goto out;
7224 }
7225 if (psind == 1) {
7226 /* Assert the required virtual and physical alignment. */
7227 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
7228 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
7229 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
7230 goto out;
7231 }
7232 mpte = NULL;
7233
7234 /*
7235 * In the case that a page table page is not
7236 * resident, we are creating it here.
7237 */
7238 retry:
7239 pde = pmap_pde(pmap, va);
7240 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
7241 pmap_demote_pde_locked(pmap, pde, va, &lock))) {
7242 pte = pmap_pde_to_pte(pde, va);
7243 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
7244 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7245 mpte->ref_count++;
7246 }
7247 } else if (va < VM_MAXUSER_ADDRESS) {
7248 /*
7249 * Here if the pte page isn't mapped, or if it has been
7250 * deallocated.
7251 */
7252 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
7253 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va),
7254 nosleep ? NULL : &lock, va);
7255 if (mpte == NULL && nosleep) {
7256 rv = KERN_RESOURCE_SHORTAGE;
7257 goto out;
7258 }
7259 goto retry;
7260 } else
7261 panic("pmap_enter: invalid page directory va=%#lx", va);
7262
7263 origpte = *pte;
7264 pv = NULL;
7265 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
7266 newpte |= pmap_pkru_get(pmap, va);
7267
7268 /*
7269 * Is the specified virtual address already mapped?
7270 */
7271 if ((origpte & PG_V) != 0) {
7272 /*
7273 * Wiring change, just update stats. We don't worry about
7274 * wiring PT pages as they remain resident as long as there
7275 * are valid mappings in them. Hence, if a user page is wired,
7276 * the PT page will be also.
7277 */
7278 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
7279 pmap->pm_stats.wired_count++;
7280 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
7281 pmap->pm_stats.wired_count--;
7282
7283 /*
7284 * Remove the extra PT page reference.
7285 */
7286 if (mpte != NULL) {
7287 mpte->ref_count--;
7288 KASSERT(mpte->ref_count > 0,
7289 ("pmap_enter: missing reference to page table page,"
7290 " va: 0x%lx", va));
7291 }
7292
7293 /*
7294 * Has the physical page changed?
7295 */
7296 opa = origpte & PG_FRAME;
7297 if (opa == pa) {
7298 /*
7299 * No, might be a protection or wiring change.
7300 */
7301 if ((origpte & PG_MANAGED) != 0 &&
7302 (newpte & PG_RW) != 0)
7303 vm_page_aflag_set(m, PGA_WRITEABLE);
7304 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
7305 goto unchanged;
7306 goto validate;
7307 }
7308
7309 /*
7310 * The physical page has changed. Temporarily invalidate
7311 * the mapping. This ensures that all threads sharing the
7312 * pmap keep a consistent view of the mapping, which is
7313 * necessary for the correct handling of COW faults. It
7314 * also permits reuse of the old mapping's PV entry,
7315 * avoiding an allocation.
7316 *
7317 * For consistency, handle unmanaged mappings the same way.
7318 */
7319 origpte = pte_load_clear(pte);
7320 KASSERT((origpte & PG_FRAME) == opa,
7321 ("pmap_enter: unexpected pa update for %#lx", va));
7322 if ((origpte & PG_MANAGED) != 0) {
7323 om = PHYS_TO_VM_PAGE(opa);
7324
7325 /*
7326 * The pmap lock is sufficient to synchronize with
7327 * concurrent calls to pmap_page_test_mappings() and
7328 * pmap_ts_referenced().
7329 */
7330 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
7331 vm_page_dirty(om);
7332 if ((origpte & PG_A) != 0) {
7333 pmap_invalidate_page(pmap, va);
7334 vm_page_aflag_set(om, PGA_REFERENCED);
7335 }
7336 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
7337 pv = pmap_pvh_remove(&om->md, pmap, va);
7338 KASSERT(pv != NULL,
7339 ("pmap_enter: no PV entry for %#lx", va));
7340 if ((newpte & PG_MANAGED) == 0)
7341 free_pv_entry(pmap, pv);
7342 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
7343 TAILQ_EMPTY(&om->md.pv_list) &&
7344 ((om->flags & PG_FICTITIOUS) != 0 ||
7345 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
7346 vm_page_aflag_clear(om, PGA_WRITEABLE);
7347 } else {
7348 /*
7349 * Since this mapping is unmanaged, assume that PG_A
7350 * is set.
7351 */
7352 pmap_invalidate_page(pmap, va);
7353 }
7354 origpte = 0;
7355 } else {
7356 /*
7357 * Increment the counters.
7358 */
7359 if ((newpte & PG_W) != 0)
7360 pmap->pm_stats.wired_count++;
7361 pmap_resident_count_adj(pmap, 1);
7362 }
7363
7364 /*
7365 * Enter on the PV list if part of our managed memory.
7366 */
7367 if ((newpte & PG_MANAGED) != 0) {
7368 if (pv == NULL) {
7369 pv = get_pv_entry(pmap, &lock);
7370 pv->pv_va = va;
7371 }
7372 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
7373 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7374 m->md.pv_gen++;
7375 if ((newpte & PG_RW) != 0)
7376 vm_page_aflag_set(m, PGA_WRITEABLE);
7377 }
7378
7379 /*
7380 * Update the PTE.
7381 */
7382 if ((origpte & PG_V) != 0) {
7383 validate:
7384 origpte = pte_load_store(pte, newpte);
7385 KASSERT((origpte & PG_FRAME) == pa,
7386 ("pmap_enter: unexpected pa update for %#lx", va));
7387 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
7388 (PG_M | PG_RW)) {
7389 if ((origpte & PG_MANAGED) != 0)
7390 vm_page_dirty(m);
7391
7392 /*
7393 * Although the PTE may still have PG_RW set, TLB
7394 * invalidation may nonetheless be required because
7395 * the PTE no longer has PG_M set.
7396 */
7397 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
7398 /*
7399 * This PTE change does not require TLB invalidation.
7400 */
7401 goto unchanged;
7402 }
7403 if ((origpte & PG_A) != 0)
7404 pmap_invalidate_page(pmap, va);
7405 } else
7406 pte_store(pte, newpte);
7407
7408 unchanged:
7409
7410 #if VM_NRESERVLEVEL > 0
7411 /*
7412 * If both the page table page and the reservation are fully
7413 * populated, then attempt promotion.
7414 */
7415 if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
7416 (m->flags & PG_FICTITIOUS) == 0 &&
7417 vm_reserv_level_iffullpop(m) == 0)
7418 (void)pmap_promote_pde(pmap, pde, va, mpte, &lock);
7419 #endif
7420
7421 rv = KERN_SUCCESS;
7422 out:
7423 if (lock != NULL)
7424 rw_wunlock(lock);
7425 PMAP_UNLOCK(pmap);
7426 return (rv);
7427 }
7428
7429 /*
7430 * Tries to create a read- and/or execute-only 2MB page mapping. Returns
7431 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
7432 * value. See pmap_enter_pde() for the possible error values when "no sleep",
7433 * "no replace", and "no reclaim" are specified.
7434 */
7435 static int
7436 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
7437 struct rwlock **lockp)
7438 {
7439 pd_entry_t newpde;
7440 pt_entry_t PG_V;
7441
7442 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7443 PG_V = pmap_valid_bit(pmap);
7444 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
7445 PG_PS | PG_V;
7446 if ((m->oflags & VPO_UNMANAGED) == 0)
7447 newpde |= PG_MANAGED;
7448 if ((prot & VM_PROT_EXECUTE) == 0)
7449 newpde |= pg_nx;
7450 if (va < VM_MAXUSER_ADDRESS)
7451 newpde |= PG_U;
7452 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
7453 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
7454 }
7455
7456 /*
7457 * Returns true if every page table entry in the specified page table page is
7458 * zero.
7459 */
7460 static bool
7461 pmap_every_pte_zero(vm_paddr_t pa)
7462 {
7463 pt_entry_t *pt_end, *pte;
7464
7465 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
7466 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
7467 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) {
7468 if (*pte != 0)
7469 return (false);
7470 }
7471 return (true);
7472 }
7473
7474 /*
7475 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
7476 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE,
7477 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. Returns
7478 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB
7479 * page mapping already exists within the 2MB virtual address range starting
7480 * at the specified virtual address or (2) the requested 2MB page mapping is
7481 * not supported due to hardware errata. Returns KERN_NO_SPACE if
7482 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at
7483 * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU
7484 * settings are not the same across the 2MB virtual address range starting at
7485 * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either
7486 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation
7487 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation
7488 * failed.
7489 *
7490 * The parameter "m" is only used when creating a managed, writeable mapping.
7491 */
7492 static int
7493 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
7494 vm_page_t m, struct rwlock **lockp)
7495 {
7496 struct spglist free;
7497 pd_entry_t oldpde, *pde;
7498 pt_entry_t PG_G, PG_RW, PG_V;
7499 vm_page_t mt, pdpg;
7500 vm_page_t uwptpg;
7501
7502 PG_G = pmap_global_bit(pmap);
7503 PG_RW = pmap_rw_bit(pmap);
7504 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
7505 ("pmap_enter_pde: newpde is missing PG_M"));
7506 PG_V = pmap_valid_bit(pmap);
7507 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7508
7509 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
7510 newpde))) {
7511 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx"
7512 " in pmap %p", va, pmap);
7513 return (KERN_FAILURE);
7514 }
7515 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags &
7516 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
7517 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
7518 " in pmap %p", va, pmap);
7519 return (KERN_RESOURCE_SHORTAGE);
7520 }
7521
7522 /*
7523 * If pkru is not same for the whole pde range, return failure
7524 * and let vm_fault() cope. Check after pde allocation, since
7525 * it could sleep.
7526 */
7527 if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
7528 pmap_abort_ptp(pmap, va, pdpg);
7529 return (KERN_PROTECTION_FAILURE);
7530 }
7531 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
7532 newpde &= ~X86_PG_PKU_MASK;
7533 newpde |= pmap_pkru_get(pmap, va);
7534 }
7535
7536 /*
7537 * If there are existing mappings, either abort or remove them.
7538 */
7539 oldpde = *pde;
7540 if ((oldpde & PG_V) != 0) {
7541 KASSERT(pdpg == NULL || pdpg->ref_count > 1,
7542 ("pmap_enter_pde: pdpg's reference count is too low"));
7543 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
7544 if ((oldpde & PG_PS) != 0) {
7545 if (pdpg != NULL)
7546 pdpg->ref_count--;
7547 CTR2(KTR_PMAP,
7548 "pmap_enter_pde: no space for va %#lx"
7549 " in pmap %p", va, pmap);
7550 return (KERN_NO_SPACE);
7551 } else if (va < VM_MAXUSER_ADDRESS ||
7552 !pmap_every_pte_zero(oldpde & PG_FRAME)) {
7553 if (pdpg != NULL)
7554 pdpg->ref_count--;
7555 CTR2(KTR_PMAP,
7556 "pmap_enter_pde: failure for va %#lx"
7557 " in pmap %p", va, pmap);
7558 return (KERN_FAILURE);
7559 }
7560 }
7561 /* Break the existing mapping(s). */
7562 SLIST_INIT(&free);
7563 if ((oldpde & PG_PS) != 0) {
7564 /*
7565 * The reference to the PD page that was acquired by
7566 * pmap_alloc_pde() ensures that it won't be freed.
7567 * However, if the PDE resulted from a promotion, then
7568 * a reserved PT page could be freed.
7569 */
7570 (void)pmap_remove_pde(pmap, pde, va, &free, lockp);
7571 if ((oldpde & PG_G) == 0)
7572 pmap_invalidate_pde_page(pmap, va, oldpde);
7573 } else {
7574 pmap_delayed_invl_start();
7575 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
7576 lockp))
7577 pmap_invalidate_all(pmap);
7578 pmap_delayed_invl_finish();
7579 }
7580 if (va < VM_MAXUSER_ADDRESS) {
7581 vm_page_free_pages_toq(&free, true);
7582 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
7583 pde));
7584 } else {
7585 KASSERT(SLIST_EMPTY(&free),
7586 ("pmap_enter_pde: freed kernel page table page"));
7587
7588 /*
7589 * Both pmap_remove_pde() and pmap_remove_ptes() will
7590 * leave the kernel page table page zero filled.
7591 */
7592 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7593 if (pmap_insert_pt_page(pmap, mt, false, false))
7594 panic("pmap_enter_pde: trie insert failed");
7595 }
7596 }
7597
7598 /*
7599 * Allocate leaf ptpage for wired userspace pages.
7600 */
7601 uwptpg = NULL;
7602 if ((newpde & PG_W) != 0 && pmap != kernel_pmap) {
7603 uwptpg = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va),
7604 VM_ALLOC_WIRED);
7605 if (uwptpg == NULL)
7606 return (KERN_RESOURCE_SHORTAGE);
7607 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
7608 pmap_free_pt_page(pmap, uwptpg, false);
7609 return (KERN_RESOURCE_SHORTAGE);
7610 }
7611
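/*
 * The preallocated leaf PTP will back NPTEPG 4KB mappings once this
 * wired superpage is demoted, so set its reference count accordingly
 * now.
 */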
7612 uwptpg->ref_count = NPTEPG;
7613 }
7614 if ((newpde & PG_MANAGED) != 0) {
7615 /*
7616 * Abort this mapping if its PV entry could not be created.
7617 */
7618 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
7619 if (pdpg != NULL)
7620 pmap_abort_ptp(pmap, va, pdpg);
7621 if (uwptpg != NULL) {
7622 mt = pmap_remove_pt_page(pmap, va);
7623 KASSERT(mt == uwptpg,
7624 ("removed pt page %p, expected %p", mt,
7625 uwptpg));
7626 uwptpg->ref_count = 1;
7627 pmap_free_pt_page(pmap, uwptpg, false);
7628 }
7629 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
7630 " in pmap %p", va, pmap);
7631 return (KERN_RESOURCE_SHORTAGE);
7632 }
7633 if ((newpde & PG_RW) != 0) {
7634 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
7635 vm_page_aflag_set(mt, PGA_WRITEABLE);
7636 }
7637 }
7638
7639 /*
7640 * Increment counters.
7641 */
7642 if ((newpde & PG_W) != 0)
7643 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
7644 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);
7645
7646 /*
7647 * Map the superpage. (This is not a promoted mapping; there will not
7648 * be any lingering 4KB page mappings in the TLB.)
7649 */
7650 pde_store(pde, newpde);
7651
7652 counter_u64_add(pmap_pde_mappings, 1);
7653 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p",
7654 va, pmap);
7655 return (KERN_SUCCESS);
7656 }
7657
7658 /*
7659 * Maps a sequence of resident pages belonging to the same object.
7660 * The sequence begins with the given page m_start. This page is
7661 * mapped at the given virtual address start. Each subsequent page is
7662 * mapped at a virtual address that is offset from start by the same
7663 * amount as the page is offset from m_start within the object. The
7664 * last page in the sequence is the page with the largest offset from
7665 * m_start that can be mapped at a virtual address less than the given
7666 * virtual address end. Not every virtual page between start and end
7667 * is mapped; only those for which a resident page exists with the
7668 * corresponding offset from m_start are mapped.
7669 */
7670 void
7671 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
7672 vm_page_t m_start, vm_prot_t prot)
7673 {
7674 struct rwlock *lock;
7675 vm_offset_t va;
7676 vm_page_t m, mpte;
7677 vm_pindex_t diff, psize;
7678 int rv;
7679
7680 VM_OBJECT_ASSERT_LOCKED(m_start->object);
7681
7682 psize = atop(end - start);
7683 mpte = NULL;
7684 m = m_start;
7685 lock = NULL;
7686 PMAP_LOCK(pmap);
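/*
 * For each resident page, create a 2MB page mapping when superpages
 * are enabled, the virtual address is 2MB aligned, a full 2MB of the
 * range remains, and the page begins a fully populated superpage-sized
 * run of physical memory (psind == 1); otherwise fall back to a 4KB
 * mapping via pmap_enter_quick_locked().
 */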
7687 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
7688 va = start + ptoa(diff);
7689 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
7690 m->psind == 1 && pmap_ps_enabled(pmap) &&
7691 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
7692 KERN_SUCCESS || rv == KERN_NO_SPACE))
7693 m = &m[NBPDR / PAGE_SIZE - 1];
7694 else
7695 mpte = pmap_enter_quick_locked(pmap, va, m, prot,
7696 mpte, &lock);
7697 m = TAILQ_NEXT(m, listq);
7698 }
7699 if (lock != NULL)
7700 rw_wunlock(lock);
7701 PMAP_UNLOCK(pmap);
7702 }
7703
7704 /*
7705 * This code makes some *MAJOR* assumptions:
7706 * 1. The current pmap and the target pmap exist.
7707 * 2. The mapping is not wired.
7708 * 3. Read access only.
7709 * 4. No page table pages are needed.
7710 * but it is *MUCH* faster than pmap_enter...
7711 */
7712
7713 void
7714 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
7715 {
7716 struct rwlock *lock;
7717
7718 lock = NULL;
7719 PMAP_LOCK(pmap);
7720 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
7721 if (lock != NULL)
7722 rw_wunlock(lock);
7723 PMAP_UNLOCK(pmap);
7724 }
7725
7726 static vm_page_t
7727 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
7728 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
7729 {
7730 pd_entry_t *pde;
7731 pt_entry_t newpte, *pte, PG_V;
7732
7733 KASSERT(!VA_IS_CLEANMAP(va) ||
7734 (m->oflags & VPO_UNMANAGED) != 0,
7735 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
7736 PG_V = pmap_valid_bit(pmap);
7737 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7738 pde = NULL;
7739
7740 /*
7741 * In the case that a page table page is not
7742 * resident, we are creating it here.
7743 */
7744 if (va < VM_MAXUSER_ADDRESS) {
7745 pdp_entry_t *pdpe;
7746 vm_pindex_t ptepindex;
7747
7748 /*
7749 * Calculate pagetable page index
7750 */
7751 ptepindex = pmap_pde_pindex(va);
7752 if (mpte && (mpte->pindex == ptepindex)) {
7753 mpte->ref_count++;
7754 } else {
7755 /*
7756 * If the page table page is mapped, we just increment
7757 * the hold count, and activate it. Otherwise, we
7758 * attempt to allocate a page table page, passing NULL
7759 * instead of the PV list lock pointer because we don't
7760 * intend to sleep. If this attempt fails, we don't
7761 * retry. Instead, we give up.
7762 */
7763 pdpe = pmap_pdpe(pmap, va);
7764 if (pdpe != NULL && (*pdpe & PG_V) != 0) {
7765 if ((*pdpe & PG_PS) != 0)
7766 return (NULL);
7767 pde = pmap_pdpe_to_pde(pdpe, va);
7768 if ((*pde & PG_V) != 0) {
7769 if ((*pde & PG_PS) != 0)
7770 return (NULL);
7771 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7772 mpte->ref_count++;
7773 } else {
7774 mpte = pmap_allocpte_alloc(pmap,
7775 ptepindex, NULL, va);
7776 if (mpte == NULL)
7777 return (NULL);
7778 }
7779 } else {
7780 mpte = pmap_allocpte_alloc(pmap, ptepindex,
7781 NULL, va);
7782 if (mpte == NULL)
7783 return (NULL);
7784 }
7785 }
7786 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
7787 pte = &pte[pmap_pte_index(va)];
7788 } else {
7789 mpte = NULL;
7790 pte = vtopte(va);
7791 }
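/*
 * If a mapping already exists at this address, leave it in place;
 * this routine never replaces an existing mapping.
 */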
7792 if (*pte) {
7793 if (mpte != NULL)
7794 mpte->ref_count--;
7795 return (NULL);
7796 }
7797
7798 /*
7799 * Enter on the PV list if part of our managed memory.
7800 */
7801 if ((m->oflags & VPO_UNMANAGED) == 0 &&
7802 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
7803 if (mpte != NULL)
7804 pmap_abort_ptp(pmap, va, mpte);
7805 return (NULL);
7806 }
7807
7808 /*
7809 * Increment counters
7810 */
7811 pmap_resident_count_adj(pmap, 1);
7812
7813 newpte = VM_PAGE_TO_PHYS(m) | PG_V |
7814 pmap_cache_bits(pmap, m->md.pat_mode, 0);
7815 if ((m->oflags & VPO_UNMANAGED) == 0)
7816 newpte |= PG_MANAGED;
7817 if ((prot & VM_PROT_EXECUTE) == 0)
7818 newpte |= pg_nx;
7819 if (va < VM_MAXUSER_ADDRESS)
7820 newpte |= PG_U | pmap_pkru_get(pmap, va);
7821 pte_store(pte, newpte);
7822
7823 #if VM_NRESERVLEVEL > 0
7824 /*
7825 * If both the PTP and the reservation are fully populated, then
7826 * attempt promotion.
7827 */
7828 if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
7829 (m->flags & PG_FICTITIOUS) == 0 &&
7830 vm_reserv_level_iffullpop(m) == 0) {
7831 if (pde == NULL)
7832 pde = pmap_pde(pmap, va);
7833
7834 /*
7835 * If promotion succeeds, then the next call to this function
7836 * should not be given the unmapped PTP as a hint.
7837 */
7838 if (pmap_promote_pde(pmap, pde, va, mpte, lockp))
7839 mpte = NULL;
7840 }
7841 #endif
7842
7843 return (mpte);
7844 }
7845
7846 /*
7847 * Make a temporary mapping for a physical address. This is only intended
7848 * to be used for panic dumps.
7849 */
7850 void *
7851 pmap_kenter_temporary(vm_paddr_t pa, int i)
7852 {
7853 vm_offset_t va;
7854
7855 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
7856 pmap_kenter(va, pa);
7857 pmap_invlpg(kernel_pmap, va);
7858 return ((void *)crashdumpmap);
7859 }
7860
7861 /*
7862 * This code maps large physical mmap regions into the
7863 * processor address space. Note that some shortcuts
7864 * are taken, but the code works.
7865 */
7866 void
7867 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
7868 vm_pindex_t pindex, vm_size_t size)
7869 {
7870 pd_entry_t *pde;
7871 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
7872 vm_paddr_t pa, ptepa;
7873 vm_page_t p, pdpg;
7874 int pat_mode;
7875
7876 PG_A = pmap_accessed_bit(pmap);
7877 PG_M = pmap_modified_bit(pmap);
7878 PG_V = pmap_valid_bit(pmap);
7879 PG_RW = pmap_rw_bit(pmap);
7880
7881 VM_OBJECT_ASSERT_WLOCKED(object);
7882 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
7883 ("pmap_object_init_pt: non-device object"));
7884 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
7885 if (!pmap_ps_enabled(pmap))
7886 return;
7887 if (!vm_object_populate(object, pindex, pindex + atop(size)))
7888 return;
7889 p = vm_page_lookup(object, pindex);
7890 KASSERT(vm_page_all_valid(p),
7891 ("pmap_object_init_pt: invalid page %p", p));
7892 pat_mode = p->md.pat_mode;
7893
7894 /*
7895 * Abort the mapping if the first page is not physically
7896 * aligned to a 2MB page boundary.
7897 */
7898 ptepa = VM_PAGE_TO_PHYS(p);
7899 if (ptepa & (NBPDR - 1))
7900 return;
7901
7902 /*
7903 * Skip the first page. Abort the mapping if the rest of
7904 * the pages are not physically contiguous or have differing
7905 * memory attributes.
7906 */
7907 p = TAILQ_NEXT(p, listq);
7908 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
7909 pa += PAGE_SIZE) {
7910 KASSERT(vm_page_all_valid(p),
7911 ("pmap_object_init_pt: invalid page %p", p));
7912 if (pa != VM_PAGE_TO_PHYS(p) ||
7913 pat_mode != p->md.pat_mode)
7914 return;
7915 p = TAILQ_NEXT(p, listq);
7916 }
7917
7918 /*
7919 * Map using 2MB pages. Since "ptepa" is 2M aligned and
7920 * "size" is a multiple of 2M, adding the PAT setting to "pa"
7921 * will not affect the termination of this loop.
7922 */
7923 PMAP_LOCK(pmap);
7924 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
7925 pa < ptepa + size; pa += NBPDR) {
7926 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL);
7927 if (pde == NULL) {
7928 /*
7929 * The creation of mappings below is only an
7930 * optimization. If a page directory page
7931 * cannot be allocated without blocking,
7932 * continue on to the next mapping rather than
7933 * blocking.
7934 */
7935 addr += NBPDR;
7936 continue;
7937 }
7938 if ((*pde & PG_V) == 0) {
7939 pde_store(pde, pa | PG_PS | PG_M | PG_A |
7940 PG_U | PG_RW | PG_V);
7941 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE);
7942 counter_u64_add(pmap_pde_mappings, 1);
7943 } else {
7944 /* Continue on if the PDE is already valid. */
7945 pdpg->ref_count--;
7946 KASSERT(pdpg->ref_count > 0,
7947 ("pmap_object_init_pt: missing reference "
7948 "to page directory page, va: 0x%lx", addr));
7949 }
7950 addr += NBPDR;
7951 }
7952 PMAP_UNLOCK(pmap);
7953 }
7954 }
7955
7956 /*
7957 * Clear the wired attribute from the mappings for the specified range of
7958 * addresses in the given pmap. Every valid mapping within that range
7959 * must have the wired attribute set. In contrast, invalid mappings
7960 * cannot have the wired attribute set, so they are ignored.
7961 *
7962 * The wired attribute of the page table entry is not a hardware
7963 * feature, so there is no need to invalidate any TLB entries.
7964 * Since pmap_demote_pde() for the wired entry must never fail,
7965 * pmap_delayed_invl_start()/finish() calls around the
7966 * function are not needed.
7967 */
7968 void
7969 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
7970 {
7971 vm_offset_t va_next;
7972 pml4_entry_t *pml4e;
7973 pdp_entry_t *pdpe;
7974 pd_entry_t *pde;
7975 pt_entry_t *pte, PG_V, PG_G __diagused;
7976
7977 PG_V = pmap_valid_bit(pmap);
7978 PG_G = pmap_global_bit(pmap);
7979 PMAP_LOCK(pmap);
7980 for (; sva < eva; sva = va_next) {
7981 pml4e = pmap_pml4e(pmap, sva);
7982 if (pml4e == NULL || (*pml4e & PG_V) == 0) {
7983 va_next = (sva + NBPML4) & ~PML4MASK;
7984 if (va_next < sva)
7985 va_next = eva;
7986 continue;
7987 }
7988
7989 va_next = (sva + NBPDP) & ~PDPMASK;
7990 if (va_next < sva)
7991 va_next = eva;
7992 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
7993 if ((*pdpe & PG_V) == 0)
7994 continue;
7995 if ((*pdpe & PG_PS) != 0) {
7996 KASSERT(va_next <= eva,
7997 ("partial update of non-transparent 1G mapping "
7998 "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
7999 *pdpe, sva, eva, va_next));
8000 MPASS(pmap != kernel_pmap); /* XXXKIB */
8001 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
8002 atomic_clear_long(pdpe, PG_W);
8003 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
8004 continue;
8005 }
8006
8007 va_next = (sva + NBPDR) & ~PDRMASK;
8008 if (va_next < sva)
8009 va_next = eva;
8010 pde = pmap_pdpe_to_pde(pdpe, sva);
8011 if ((*pde & PG_V) == 0)
8012 continue;
8013 if ((*pde & PG_PS) != 0) {
8014 if ((*pde & PG_W) == 0)
8015 panic("pmap_unwire: pde %#jx is missing PG_W",
8016 (uintmax_t)*pde);
8017
8018 /*
8019 * Are we unwiring the entire large page? If not,
8020 * demote the mapping and fall through.
8021 */
8022 if (sva + NBPDR == va_next && eva >= va_next) {
8023 atomic_clear_long(pde, PG_W);
8024 pmap->pm_stats.wired_count -= NBPDR /
8025 PAGE_SIZE;
8026 continue;
8027 } else if (!pmap_demote_pde(pmap, pde, sva))
8028 panic("pmap_unwire: demotion failed");
8029 }
8030 if (va_next > eva)
8031 va_next = eva;
8032 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
8033 sva += PAGE_SIZE) {
8034 if ((*pte & PG_V) == 0)
8035 continue;
8036 if ((*pte & PG_W) == 0)
8037 panic("pmap_unwire: pte %#jx is missing PG_W",
8038 (uintmax_t)*pte);
8039
8040 /*
8041 * PG_W must be cleared atomically. Although the pmap
8042 * lock synchronizes access to PG_W, another processor
8043 * could be setting PG_M and/or PG_A concurrently.
8044 */
8045 atomic_clear_long(pte, PG_W);
8046 pmap->pm_stats.wired_count--;
8047 }
8048 }
8049 PMAP_UNLOCK(pmap);
8050 }
8051
8052 /*
8053 * Copy the range specified by src_addr/len
8054 * from the source map to the range dst_addr/len
8055 * in the destination map.
8056 *
8057 * This routine is only advisory and need not do anything.
8058 */
8059 void
8060 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
8061 vm_offset_t src_addr)
8062 {
8063 struct rwlock *lock;
8064 pml4_entry_t *pml4e;
8065 pdp_entry_t *pdpe;
8066 pd_entry_t *pde, srcptepaddr;
8067 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
8068 vm_offset_t addr, end_addr, va_next;
8069 vm_page_t dst_pdpg, dstmpte, srcmpte;
8070
8071 if (dst_addr != src_addr)
8072 return;
8073
8074 if (dst_pmap->pm_type != src_pmap->pm_type)
8075 return;
8076
8077 /*
8078 * EPT page table entries that require emulation of A/D bits are
8079 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
8080 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
8081 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
8082 * implementations flag an EPT misconfiguration for exec-only
8083 * mappings we skip this function entirely for emulated pmaps.
8084 */
8085 if (pmap_emulate_ad_bits(dst_pmap))
8086 return;
8087
8088 end_addr = src_addr + len;
8089 lock = NULL;
8090 if (dst_pmap < src_pmap) {
8091 PMAP_LOCK(dst_pmap);
8092 PMAP_LOCK(src_pmap);
8093 } else {
8094 PMAP_LOCK(src_pmap);
8095 PMAP_LOCK(dst_pmap);
8096 }
8097
8098 PG_A = pmap_accessed_bit(dst_pmap);
8099 PG_M = pmap_modified_bit(dst_pmap);
8100 PG_V = pmap_valid_bit(dst_pmap);
8101
8102 for (addr = src_addr; addr < end_addr; addr = va_next) {
8103 KASSERT(addr < UPT_MIN_ADDRESS,
8104 ("pmap_copy: invalid to pmap_copy page tables"));
8105
8106 pml4e = pmap_pml4e(src_pmap, addr);
8107 if (pml4e == NULL || (*pml4e & PG_V) == 0) {
8108 va_next = (addr + NBPML4) & ~PML4MASK;
8109 if (va_next < addr)
8110 va_next = end_addr;
8111 continue;
8112 }
8113
8114 va_next = (addr + NBPDP) & ~PDPMASK;
8115 if (va_next < addr)
8116 va_next = end_addr;
8117 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
8118 if ((*pdpe & PG_V) == 0)
8119 continue;
8120 if ((*pdpe & PG_PS) != 0) {
8121 KASSERT(va_next <= end_addr,
8122 ("partial update of non-transparent 1G mapping "
8123 "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
8124 *pdpe, addr, end_addr, va_next));
8125 MPASS((addr & PDPMASK) == 0);
8126 MPASS((*pdpe & PG_MANAGED) == 0);
8127 srcptepaddr = *pdpe;
8128 pdpe = pmap_pdpe(dst_pmap, addr);
8129 if (pdpe == NULL) {
8130 if (pmap_allocpte_alloc(dst_pmap,
8131 pmap_pml4e_pindex(addr), NULL, addr) ==
8132 NULL)
8133 break;
8134 pdpe = pmap_pdpe(dst_pmap, addr);
8135 } else {
8136 pml4e = pmap_pml4e(dst_pmap, addr);
8137 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
8138 dst_pdpg->ref_count++;
8139 }
8140 KASSERT(*pdpe == 0,
8141 ("1G mapping present in dst pmap "
8142 "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
8143 *pdpe, addr, end_addr, va_next));
8144 *pdpe = srcptepaddr & ~PG_W;
8145 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE);
8146 continue;
8147 }
8148
8149 va_next = (addr + NBPDR) & ~PDRMASK;
8150 if (va_next < addr)
8151 va_next = end_addr;
8152
8153 pde = pmap_pdpe_to_pde(pdpe, addr);
8154 srcptepaddr = *pde;
8155 if (srcptepaddr == 0)
8156 continue;
8157
8158 if (srcptepaddr & PG_PS) {
8159 /*
8160 * We can only virtual copy whole superpages.
8161 */
8162 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
8163 continue;
8164 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL);
8165 if (pde == NULL)
8166 break;
8167 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
8168 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
8169 PMAP_ENTER_NORECLAIM, &lock))) {
8170 /*
8171 * We leave the dirty bit unchanged because
8172 * managed read/write superpage mappings are
8173 * required to be dirty. However, managed
8174 * superpage mappings are not required to
8175 * have their accessed bit set, so we clear
8176 * it because we don't know if this mapping
8177 * will be used.
8178 */
8179 srcptepaddr &= ~PG_W;
8180 if ((srcptepaddr & PG_MANAGED) != 0)
8181 srcptepaddr &= ~PG_A;
8182 *pde = srcptepaddr;
8183 pmap_resident_count_adj(dst_pmap, NBPDR /
8184 PAGE_SIZE);
8185 counter_u64_add(pmap_pde_mappings, 1);
8186 } else
8187 pmap_abort_ptp(dst_pmap, addr, dst_pdpg);
8188 continue;
8189 }
8190
8191 srcptepaddr &= PG_FRAME;
8192 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
8193 KASSERT(srcmpte->ref_count > 0,
8194 ("pmap_copy: source page table page is unused"));
8195
8196 if (va_next > end_addr)
8197 va_next = end_addr;
8198
8199 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
8200 src_pte = &src_pte[pmap_pte_index(addr)];
8201 dstmpte = NULL;
8202 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
8203 ptetemp = *src_pte;
8204
8205 /*
8206 * We only virtual copy managed pages.
8207 */
8208 if ((ptetemp & PG_MANAGED) == 0)
8209 continue;
8210
8211 if (dstmpte != NULL) {
8212 KASSERT(dstmpte->pindex ==
8213 pmap_pde_pindex(addr),
8214 ("dstmpte pindex/addr mismatch"));
8215 dstmpte->ref_count++;
8216 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
8217 NULL)) == NULL)
8218 goto out;
8219 dst_pte = (pt_entry_t *)
8220 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
8221 dst_pte = &dst_pte[pmap_pte_index(addr)];
8222 if (*dst_pte == 0 &&
8223 pmap_try_insert_pv_entry(dst_pmap, addr,
8224 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
8225 /*
8226 * Clear the wired, modified, and accessed
8227 * (referenced) bits during the copy.
8228 */
8229 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
8230 pmap_resident_count_adj(dst_pmap, 1);
8231 } else {
8232 pmap_abort_ptp(dst_pmap, addr, dstmpte);
8233 goto out;
8234 }
8235 /* Have we copied all of the valid mappings? */
8236 if (dstmpte->ref_count >= srcmpte->ref_count)
8237 break;
8238 }
8239 }
8240 out:
8241 if (lock != NULL)
8242 rw_wunlock(lock);
8243 PMAP_UNLOCK(src_pmap);
8244 PMAP_UNLOCK(dst_pmap);
8245 }
8246
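/*
 * Copy the PKRU protection-key ranges from src_pmap to dst_pmap,
 * retrying after vm_wait() if the copy fails for lack of memory.
 * This is a no-op unless both pmaps are PT_X86 and the CPU supports
 * protection keys (PKU).
 */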
8247 int
8248 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
8249 {
8250 int error;
8251
8252 if (dst_pmap->pm_type != src_pmap->pm_type ||
8253 dst_pmap->pm_type != PT_X86 ||
8254 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
8255 return (0);
8256 for (;;) {
8257 if (dst_pmap < src_pmap) {
8258 PMAP_LOCK(dst_pmap);
8259 PMAP_LOCK(src_pmap);
8260 } else {
8261 PMAP_LOCK(src_pmap);
8262 PMAP_LOCK(dst_pmap);
8263 }
8264 error = pmap_pkru_copy(dst_pmap, src_pmap);
8265 /* Clean up partial copy on failure due to no memory. */
8266 if (error == ENOMEM)
8267 pmap_pkru_deassign_all(dst_pmap);
8268 PMAP_UNLOCK(src_pmap);
8269 PMAP_UNLOCK(dst_pmap);
8270 if (error != ENOMEM)
8271 break;
8272 vm_wait(NULL);
8273 }
8274 return (error);
8275 }
8276
8277 /*
8278 * Zero the specified hardware page.
8279 */
8280 void
8281 pmap_zero_page(vm_page_t m)
8282 {
8283 vm_offset_t va;
8284
8285 #ifdef TSLOG_PAGEZERO
8286 TSENTER();
8287 #endif
8288 va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
8289 pagezero((void *)va);
8290 #ifdef TSLOG_PAGEZERO
8291 TSEXIT();
8292 #endif
8293 }
8294
8295 /*
8296 * Zero an area within a single hardware page. off and size must not
8297 * cover an area beyond a single hardware page.
8298 */
8299 void
8300 pmap_zero_page_area(vm_page_t m, int off, int size)
8301 {
8302 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
8303
8304 if (off == 0 && size == PAGE_SIZE)
8305 pagezero((void *)va);
8306 else
8307 bzero((char *)va + off, size);
8308 }
8309
8310 /*
8311 * Copy 1 specified hardware page to another.
8312 */
8313 void
8314 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
8315 {
8316 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
8317 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
8318
8319 pagecopy((void *)src, (void *)dst);
8320 }
8321
8322 int unmapped_buf_allowed = 1;
8323
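/*
 * Copy "xfersize" bytes starting at offset "a_offset" within the page
 * array "ma" to offset "b_offset" within the page array "mb".  The
 * pages are typically accessed through the direct map; transient
 * mappings are created only when that is not possible.
 */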
8324 void
8325 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
8326 vm_offset_t b_offset, int xfersize)
8327 {
8328 void *a_cp, *b_cp;
8329 vm_page_t pages[2];
8330 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
8331 int cnt;
8332 boolean_t mapped;
8333
8334 while (xfersize > 0) {
8335 a_pg_offset = a_offset & PAGE_MASK;
8336 pages[0] = ma[a_offset >> PAGE_SHIFT];
8337 b_pg_offset = b_offset & PAGE_MASK;
8338 pages[1] = mb[b_offset >> PAGE_SHIFT];
8339 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
8340 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
8341 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
8342 a_cp = (char *)vaddr[0] + a_pg_offset;
8343 b_cp = (char *)vaddr[1] + b_pg_offset;
8344 bcopy(a_cp, b_cp, cnt);
8345 if (__predict_false(mapped))
8346 pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
8347 a_offset += cnt;
8348 b_offset += cnt;
8349 xfersize -= cnt;
8350 }
8351 }
8352
8353 /*
8354 * Returns true if the pmap's pv is one of the first
8355 * 16 pvs linked to from this page. This count may
8356 * be changed upwards or downwards in the future; it
8357 * is only necessary that true be returned for a small
8358 * subset of pmaps for proper page aging.
8359 */
8360 boolean_t
8361 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
8362 {
8363 struct md_page *pvh;
8364 struct rwlock *lock;
8365 pv_entry_t pv;
8366 int loops = 0;
8367 boolean_t rv;
8368
8369 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8370 ("pmap_page_exists_quick: page %p is not managed", m));
8371 rv = FALSE;
8372 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8373 rw_rlock(lock);
8374 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8375 if (PV_PMAP(pv) == pmap) {
8376 rv = TRUE;
8377 break;
8378 }
8379 loops++;
8380 if (loops >= 16)
8381 break;
8382 }
8383 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
8384 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8385 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
8386 if (PV_PMAP(pv) == pmap) {
8387 rv = TRUE;
8388 break;
8389 }
8390 loops++;
8391 if (loops >= 16)
8392 break;
8393 }
8394 }
8395 rw_runlock(lock);
8396 return (rv);
8397 }
8398
8399 /*
8400 * pmap_page_wired_mappings:
8401 *
8402 * Return the number of managed mappings to the given physical page
8403 * that are wired.
8404 */
8405 int
8406 pmap_page_wired_mappings(vm_page_t m)
8407 {
8408 struct rwlock *lock;
8409 struct md_page *pvh;
8410 pmap_t pmap;
8411 pt_entry_t *pte;
8412 pv_entry_t pv;
8413 int count, md_gen, pvh_gen;
8414
8415 if ((m->oflags & VPO_UNMANAGED) != 0)
8416 return (0);
8417 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8418 rw_rlock(lock);
8419 restart:
8420 count = 0;
8421 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8422 pmap = PV_PMAP(pv);
8423 if (!PMAP_TRYLOCK(pmap)) {
8424 md_gen = m->md.pv_gen;
8425 rw_runlock(lock);
8426 PMAP_LOCK(pmap);
8427 rw_rlock(lock);
8428 if (md_gen != m->md.pv_gen) {
8429 PMAP_UNLOCK(pmap);
8430 goto restart;
8431 }
8432 }
8433 pte = pmap_pte(pmap, pv->pv_va);
8434 if ((*pte & PG_W) != 0)
8435 count++;
8436 PMAP_UNLOCK(pmap);
8437 }
8438 if ((m->flags & PG_FICTITIOUS) == 0) {
8439 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8440 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
8441 pmap = PV_PMAP(pv);
8442 if (!PMAP_TRYLOCK(pmap)) {
8443 md_gen = m->md.pv_gen;
8444 pvh_gen = pvh->pv_gen;
8445 rw_runlock(lock);
8446 PMAP_LOCK(pmap);
8447 rw_rlock(lock);
8448 if (md_gen != m->md.pv_gen ||
8449 pvh_gen != pvh->pv_gen) {
8450 PMAP_UNLOCK(pmap);
8451 goto restart;
8452 }
8453 }
8454 pte = pmap_pde(pmap, pv->pv_va);
8455 if ((*pte & PG_W) != 0)
8456 count++;
8457 PMAP_UNLOCK(pmap);
8458 }
8459 }
8460 rw_runlock(lock);
8461 return (count);
8462 }
8463
8464 /*
8465 * Returns TRUE if the given page is mapped individually or as part of
8466 * a 2mpage. Otherwise, returns FALSE.
8467 */
8468 boolean_t
8469 pmap_page_is_mapped(vm_page_t m)
8470 {
8471 struct rwlock *lock;
8472 boolean_t rv;
8473
8474 if ((m->oflags & VPO_UNMANAGED) != 0)
8475 return (FALSE);
8476 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8477 rw_rlock(lock);
8478 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
8479 ((m->flags & PG_FICTITIOUS) == 0 &&
8480 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
8481 rw_runlock(lock);
8482 return (rv);
8483 }
8484
8485 /*
8486 * Destroy all managed, non-wired mappings in the given user-space
8487 * pmap. This pmap cannot be active on any processor besides the
8488 * caller.
8489 *
8490 * This function cannot be applied to the kernel pmap. Moreover, it
8491 * is not intended for general use. It is only to be used during
8492 * process termination. Consequently, it can be implemented in ways
8493 * that make it faster than pmap_remove(). First, it can more quickly
8494 * destroy mappings by iterating over the pmap's collection of PV
8495 * entries, rather than searching the page table. Second, it doesn't
8496 * have to test and clear the page table entries atomically, because
8497 * no processor is currently accessing the user address space. In
8498 * particular, a page table entry's dirty bit won't change state once
8499 * this function starts.
8500 *
8501 * Although this function destroys all of the pmap's managed,
8502 * non-wired mappings, it can delay and batch the invalidation of TLB
8503 * entries without calling pmap_delayed_invl_start() and
8504 * pmap_delayed_invl_finish(). Because the pmap is not active on
8505 * any other processor, none of these TLB entries will ever be used
8506 * before their eventual invalidation. Consequently, there is no need
8507 * for either pmap_remove_all() or pmap_remove_write() to wait for
8508 * that eventual TLB invalidation.
8509 */
8510 void
8511 pmap_remove_pages(pmap_t pmap)
8512 {
8513 pd_entry_t ptepde;
8514 pt_entry_t *pte, tpte;
8515 pt_entry_t PG_M, PG_RW, PG_V;
8516 struct spglist free;
8517 struct pv_chunklist free_chunks[PMAP_MEMDOM];
8518 vm_page_t m, mpte, mt;
8519 pv_entry_t pv;
8520 struct md_page *pvh;
8521 struct pv_chunk *pc, *npc;
8522 struct rwlock *lock;
8523 int64_t bit;
8524 uint64_t inuse, bitmask;
8525 int allfree, field, i, idx;
8526 #ifdef PV_STATS
8527 int freed;
8528 #endif
8529 boolean_t superpage;
8530 vm_paddr_t pa;
8531
8532 /*
8533 * Assert that the given pmap is only active on the current
8534 * CPU. Unfortunately, we cannot block another CPU from
8535 * activating the pmap while this function is executing.
8536 */
8537 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
8538 #ifdef INVARIANTS
8539 {
8540 cpuset_t other_cpus;
8541
8542 other_cpus = all_cpus;
8543 critical_enter();
8544 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
8545 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active);
8546 critical_exit();
8547 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
8548 }
8549 #endif
8550
8551 lock = NULL;
8552 PG_M = pmap_modified_bit(pmap);
8553 PG_V = pmap_valid_bit(pmap);
8554 PG_RW = pmap_rw_bit(pmap);
8555
8556 for (i = 0; i < PMAP_MEMDOM; i++)
8557 TAILQ_INIT(&free_chunks[i]);
8558 SLIST_INIT(&free);
8559 PMAP_LOCK(pmap);
8560 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
8561 allfree = 1;
8562 #ifdef PV_STATS
8563 freed = 0;
8564 #endif
8565 for (field = 0; field < _NPCM; field++) {
8566 inuse = ~pc->pc_map[field] & pc_freemask[field];
8567 while (inuse != 0) {
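/*
 * Find the index of the next in-use PV entry in this chunk:
 * "inuse" has a bit set for every allocated entry in the
 * current 64-entry field.
 */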
8568 bit = bsfq(inuse);
8569 bitmask = 1UL << bit;
8570 idx = field * 64 + bit;
8571 pv = &pc->pc_pventry[idx];
8572 inuse &= ~bitmask;
8573
8574 pte = pmap_pdpe(pmap, pv->pv_va);
8575 ptepde = *pte;
8576 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
8577 tpte = *pte;
8578 if ((tpte & (PG_PS | PG_V)) == PG_V) {
8579 superpage = FALSE;
8580 ptepde = tpte;
8581 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
8582 PG_FRAME);
8583 pte = &pte[pmap_pte_index(pv->pv_va)];
8584 tpte = *pte;
8585 } else {
8586 /*
8587 * Keep track whether 'tpte' is a
8588 * superpage explicitly instead of
8589 * relying on PG_PS being set.
8590 *
8591 * This is because PG_PS is numerically
8592 * identical to PG_PTE_PAT and thus a
8593 * regular page could be mistaken for
8594 * a superpage.
8595 */
8596 superpage = TRUE;
8597 }
8598
8599 if ((tpte & PG_V) == 0) {
8600 panic("bad pte va %lx pte %lx",
8601 pv->pv_va, tpte);
8602 }
8603
8604 /*
8605 * We cannot remove wired pages from a process' mapping at this time
8606 */
8607 if (tpte & PG_W) {
8608 allfree = 0;
8609 continue;
8610 }
8611
8612 /* Mark free */
8613 pc->pc_map[field] |= bitmask;
8614
8615 /*
8616 * Because this pmap is not active on other
8617 * processors, the dirty bit cannot have
8618 * changed state since we last loaded pte.
8619 */
8620 pte_clear(pte);
8621
8622 if (superpage)
8623 pa = tpte & PG_PS_FRAME;
8624 else
8625 pa = tpte & PG_FRAME;
8626
8627 m = PHYS_TO_VM_PAGE(pa);
8628 KASSERT(m->phys_addr == pa,
8629 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
8630 m, (uintmax_t)m->phys_addr,
8631 (uintmax_t)tpte));
8632
8633 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
8634 m < &vm_page_array[vm_page_array_size],
8635 ("pmap_remove_pages: bad tpte %#jx",
8636 (uintmax_t)tpte));
8637
8638 /*
8639 * Update the vm_page_t clean/reference bits.
8640 */
8641 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
8642 if (superpage) {
8643 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
8644 vm_page_dirty(mt);
8645 } else
8646 vm_page_dirty(m);
8647 }
8648
8649 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
8650
8651 if (superpage) {
8652 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE);
8653 pvh = pa_to_pvh(tpte & PG_PS_FRAME);
8654 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
8655 pvh->pv_gen++;
8656 if (TAILQ_EMPTY(&pvh->pv_list)) {
8657 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
8658 if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
8659 TAILQ_EMPTY(&mt->md.pv_list))
8660 vm_page_aflag_clear(mt, PGA_WRITEABLE);
8661 }
8662 mpte = pmap_remove_pt_page(pmap, pv->pv_va);
8663 if (mpte != NULL) {
8664 KASSERT(vm_page_any_valid(mpte),
8665 ("pmap_remove_pages: pte page not promoted"));
8666 pmap_pt_page_count_adj(pmap, -1);
8667 KASSERT(mpte->ref_count == NPTEPG,
8668 ("pmap_remove_pages: pte page reference count error"));
8669 mpte->ref_count = 0;
8670 pmap_add_delayed_free_list(mpte, &free, FALSE);
8671 }
8672 } else {
8673 pmap_resident_count_adj(pmap, -1);
8674 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
8675 m->md.pv_gen++;
8676 if ((m->a.flags & PGA_WRITEABLE) != 0 &&
8677 TAILQ_EMPTY(&m->md.pv_list) &&
8678 (m->flags & PG_FICTITIOUS) == 0) {
8679 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8680 if (TAILQ_EMPTY(&pvh->pv_list))
8681 vm_page_aflag_clear(m, PGA_WRITEABLE);
8682 }
8683 }
8684 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
8685 #ifdef PV_STATS
8686 freed++;
8687 #endif
8688 }
8689 }
8690 PV_STAT(counter_u64_add(pv_entry_frees, freed));
8691 PV_STAT(counter_u64_add(pv_entry_spare, freed));
8692 PV_STAT(counter_u64_add(pv_entry_count, -freed));
8693 if (allfree) {
8694 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
8695 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list);
8696 }
8697 }
8698 if (lock != NULL)
8699 rw_wunlock(lock);
8700 pmap_invalidate_all(pmap);
8701 pmap_pkru_deassign_all(pmap);
8702 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks);
8703 PMAP_UNLOCK(pmap);
8704 vm_page_free_pages_toq(&free, true);
8705 }
8706
8707 static boolean_t
8708 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
8709 {
8710 struct rwlock *lock;
8711 pv_entry_t pv;
8712 struct md_page *pvh;
8713 pt_entry_t *pte, mask;
8714 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
8715 pmap_t pmap;
8716 int md_gen, pvh_gen;
8717 boolean_t rv;
8718
8719 rv = FALSE;
8720 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8721 rw_rlock(lock);
8722 restart:
8723 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8724 pmap = PV_PMAP(pv);
8725 if (!PMAP_TRYLOCK(pmap)) {
8726 md_gen = m->md.pv_gen;
8727 rw_runlock(lock);
8728 PMAP_LOCK(pmap);
8729 rw_rlock(lock);
8730 if (md_gen != m->md.pv_gen) {
8731 PMAP_UNLOCK(pmap);
8732 goto restart;
8733 }
8734 }
8735 pte = pmap_pte(pmap, pv->pv_va);
8736 mask = 0;
8737 if (modified) {
8738 PG_M = pmap_modified_bit(pmap);
8739 PG_RW = pmap_rw_bit(pmap);
8740 mask |= PG_RW | PG_M;
8741 }
8742 if (accessed) {
8743 PG_A = pmap_accessed_bit(pmap);
8744 PG_V = pmap_valid_bit(pmap);
8745 mask |= PG_V | PG_A;
8746 }
8747 rv = (*pte & mask) == mask;
8748 PMAP_UNLOCK(pmap);
8749 if (rv)
8750 goto out;
8751 }
8752 if ((m->flags & PG_FICTITIOUS) == 0) {
8753 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
8754 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
8755 pmap = PV_PMAP(pv);
8756 if (!PMAP_TRYLOCK(pmap)) {
8757 md_gen = m->md.pv_gen;
8758 pvh_gen = pvh->pv_gen;
8759 rw_runlock(lock);
8760 PMAP_LOCK(pmap);
8761 rw_rlock(lock);
8762 if (md_gen != m->md.pv_gen ||
8763 pvh_gen != pvh->pv_gen) {
8764 PMAP_UNLOCK(pmap);
8765 goto restart;
8766 }
8767 }
8768 pte = pmap_pde(pmap, pv->pv_va);
8769 mask = 0;
8770 if (modified) {
8771 PG_M = pmap_modified_bit(pmap);
8772 PG_RW = pmap_rw_bit(pmap);
8773 mask |= PG_RW | PG_M;
8774 }
8775 if (accessed) {
8776 PG_A = pmap_accessed_bit(pmap);
8777 PG_V = pmap_valid_bit(pmap);
8778 mask |= PG_V | PG_A;
8779 }
8780 rv = (*pte & mask) == mask;
8781 PMAP_UNLOCK(pmap);
8782 if (rv)
8783 goto out;
8784 }
8785 }
8786 out:
8787 rw_runlock(lock);
8788 return (rv);
8789 }
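/*
 * Illustrative note (a sketch summarizing the routine above, not new
 * behavior): the "accessed" and "modified" arguments select which PTE bits
 * must all be set for a mapping to count.  For example, pmap_is_modified()
 * calls this with accessed = FALSE and modified = TRUE, so mask becomes
 * PG_RW | PG_M and a page is reported modified only where it is mapped
 * writable with the dirty bit set; pmap_is_referenced() uses PG_V | PG_A
 * instead.
 */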
8790
8791 /*
8792 * pmap_is_modified:
8793 *
8794 * Return whether or not the specified physical page was modified
8795 * in any physical maps.
8796 */
8797 boolean_t
8798 pmap_is_modified(vm_page_t m)
8799 {
8800
8801 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8802 ("pmap_is_modified: page %p is not managed", m));
8803
8804 /*
8805 * If the page is not busied then this check is racy.
8806 */
8807 if (!pmap_page_is_write_mapped(m))
8808 return (FALSE);
8809 return (pmap_page_test_mappings(m, FALSE, TRUE));
8810 }
8811
8812 /*
8813 * pmap_is_prefaultable:
8814 *
8815 * Return whether or not the specified virtual address is eligible
8816 * for prefault.
8817 */
8818 boolean_t
8819 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
8820 {
8821 pd_entry_t *pde;
8822 pt_entry_t *pte, PG_V;
8823 boolean_t rv;
8824
8825 PG_V = pmap_valid_bit(pmap);
8826
8827 /*
8828 * Return TRUE if and only if the PTE for the specified virtual
8829 * address is allocated but invalid.
8830 */
8831 rv = FALSE;
8832 PMAP_LOCK(pmap);
8833 pde = pmap_pde(pmap, addr);
8834 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
8835 pte = pmap_pde_to_pte(pde, addr);
8836 rv = (*pte & PG_V) == 0;
8837 }
8838 PMAP_UNLOCK(pmap);
8839 return (rv);
8840 }
8841
8842 /*
8843 * pmap_is_referenced:
8844 *
8845 * Return whether or not the specified physical page was referenced
8846 * in any physical maps.
8847 */
8848 boolean_t
8849 pmap_is_referenced(vm_page_t m)
8850 {
8851
8852 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8853 ("pmap_is_referenced: page %p is not managed", m));
8854 return (pmap_page_test_mappings(m, TRUE, FALSE));
8855 }
8856
8857 /*
8858 * Clear the write and modified bits in each of the given page's mappings.
8859 */
8860 void
8861 pmap_remove_write(vm_page_t m)
8862 {
8863 struct md_page *pvh;
8864 pmap_t pmap;
8865 struct rwlock *lock;
8866 pv_entry_t next_pv, pv;
8867 pd_entry_t *pde;
8868 pt_entry_t oldpte, *pte, PG_M, PG_RW;
8869 vm_offset_t va;
8870 int pvh_gen, md_gen;
8871
8872 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8873 ("pmap_remove_write: page %p is not managed", m));
8874
8875 vm_page_assert_busied(m);
8876 if (!pmap_page_is_write_mapped(m))
8877 return;
8878
8879 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
8880 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
8881 pa_to_pvh(VM_PAGE_TO_PHYS(m));
8882 rw_wlock(lock);
8883 retry:
8884 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
8885 pmap = PV_PMAP(pv);
8886 if (!PMAP_TRYLOCK(pmap)) {
8887 pvh_gen = pvh->pv_gen;
8888 rw_wunlock(lock);
8889 PMAP_LOCK(pmap);
8890 rw_wlock(lock);
8891 if (pvh_gen != pvh->pv_gen) {
8892 PMAP_UNLOCK(pmap);
8893 goto retry;
8894 }
8895 }
8896 PG_RW = pmap_rw_bit(pmap);
8897 va = pv->pv_va;
8898 pde = pmap_pde(pmap, va);
8899 if ((*pde & PG_RW) != 0)
8900 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
8901 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
8902 ("inconsistent pv lock %p %p for page %p",
8903 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
8904 PMAP_UNLOCK(pmap);
8905 }
8906 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
8907 pmap = PV_PMAP(pv);
8908 if (!PMAP_TRYLOCK(pmap)) {
8909 pvh_gen = pvh->pv_gen;
8910 md_gen = m->md.pv_gen;
8911 rw_wunlock(lock);
8912 PMAP_LOCK(pmap);
8913 rw_wlock(lock);
8914 if (pvh_gen != pvh->pv_gen ||
8915 md_gen != m->md.pv_gen) {
8916 PMAP_UNLOCK(pmap);
8917 goto retry;
8918 }
8919 }
8920 PG_M = pmap_modified_bit(pmap);
8921 PG_RW = pmap_rw_bit(pmap);
8922 pde = pmap_pde(pmap, pv->pv_va);
8923 KASSERT((*pde & PG_PS) == 0,
8924 ("pmap_remove_write: found a 2mpage in page %p's pv list",
8925 m));
8926 pte = pmap_pde_to_pte(pde, pv->pv_va);
8927 oldpte = *pte;
8928 if (oldpte & PG_RW) {
8929 while (!atomic_fcmpset_long(pte, &oldpte, oldpte &
8930 ~(PG_RW | PG_M)))
8931 cpu_spinwait();
8932 if ((oldpte & PG_M) != 0)
8933 vm_page_dirty(m);
8934 pmap_invalidate_page(pmap, pv->pv_va);
8935 }
8936 PMAP_UNLOCK(pmap);
8937 }
8938 rw_wunlock(lock);
8939 vm_page_aflag_clear(m, PGA_WRITEABLE);
8940 pmap_delayed_invl_wait(m);
8941 }
8942
8943 /*
8944 * pmap_ts_referenced:
8945 *
8946 * Return a count of reference bits for a page, clearing those bits.
8947 * It is not necessary for every reference bit to be cleared, but it
8948 * is necessary that 0 only be returned when there are truly no
8949 * reference bits set.
8950 *
8951 * As an optimization, update the page's dirty field if a modified bit is
8952 * found while counting reference bits. This opportunistic update can be
8953 * performed at low cost and can eliminate the need for some future calls
8954 * to pmap_is_modified(). However, since this function stops after
8955 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
8956 * dirty pages. Those dirty pages will only be detected by a future call
8957 * to pmap_is_modified().
8958 *
8959 * A DI block is not needed within this function, because
8960 * invalidations are performed before the PV list lock is
8961 * released.
8962 */
8963 int
8964 pmap_ts_referenced(vm_page_t m)
8965 {
8966 struct md_page *pvh;
8967 pv_entry_t pv, pvf;
8968 pmap_t pmap;
8969 struct rwlock *lock;
8970 pd_entry_t oldpde, *pde;
8971 pt_entry_t *pte, PG_A, PG_M, PG_RW;
8972 vm_offset_t va;
8973 vm_paddr_t pa;
8974 int cleared, md_gen, not_cleared, pvh_gen;
8975 struct spglist free;
8976 boolean_t demoted;
8977
8978 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
8979 ("pmap_ts_referenced: page %p is not managed", m));
8980 SLIST_INIT(&free);
8981 cleared = 0;
8982 pa = VM_PAGE_TO_PHYS(m);
8983 lock = PHYS_TO_PV_LIST_LOCK(pa);
8984 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
8985 rw_wlock(lock);
8986 retry:
8987 not_cleared = 0;
8988 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
8989 goto small_mappings;
8990 pv = pvf;
8991 do {
8992 if (pvf == NULL)
8993 pvf = pv;
8994 pmap = PV_PMAP(pv);
8995 if (!PMAP_TRYLOCK(pmap)) {
8996 pvh_gen = pvh->pv_gen;
8997 rw_wunlock(lock);
8998 PMAP_LOCK(pmap);
8999 rw_wlock(lock);
9000 if (pvh_gen != pvh->pv_gen) {
9001 PMAP_UNLOCK(pmap);
9002 goto retry;
9003 }
9004 }
9005 PG_A = pmap_accessed_bit(pmap);
9006 PG_M = pmap_modified_bit(pmap);
9007 PG_RW = pmap_rw_bit(pmap);
9008 va = pv->pv_va;
9009 pde = pmap_pde(pmap, pv->pv_va);
9010 oldpde = *pde;
9011 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
9012 /*
9013 * Although "oldpde" is mapping a 2MB page, because
9014 * this function is called at a 4KB page granularity,
9015 * we only update the 4KB page under test.
9016 */
9017 vm_page_dirty(m);
9018 }
9019 if ((oldpde & PG_A) != 0) {
9020 /*
9021 * Since this reference bit is shared by 512 4KB
9022 * pages, it should not be cleared every time it is
9023 * tested. Apply a simple "hash" function on the
9024 * physical page number, the virtual superpage number,
9025 * and the pmap address to select one 4KB page out of
9026 * the 512 on which testing the reference bit will
9027 * result in clearing that reference bit. This
9028 * function is designed to avoid the selection of the
9029 * same 4KB page for every 2MB page mapping.
9030 *
9031 * On demotion, a mapping that hasn't been referenced
9032 * is simply destroyed. To avoid the possibility of a
9033 * subsequent page fault on a demoted wired mapping,
9034 * always leave its reference bit set. Moreover,
9035 * since the superpage is wired, the current state of
9036 * its reference bit won't affect page replacement.
9037 */
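/*
 * Worked example (illustrative): with NPTEPG == 512, the test below
 * checks the low 9 bits of
 * (pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap.
 * For a fixed superpage mapping and pmap, exactly one of the 512
 * constituent 4KB physical pages makes that value zero, so the shared
 * reference bit is cleared only when that one page is scanned rather
 * than on every visit.
 */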
9038 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
9039 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
9040 (oldpde & PG_W) == 0) {
9041 if (safe_to_clear_referenced(pmap, oldpde)) {
9042 atomic_clear_long(pde, PG_A);
9043 pmap_invalidate_page(pmap, pv->pv_va);
9044 demoted = FALSE;
9045 } else if (pmap_demote_pde_locked(pmap, pde,
9046 pv->pv_va, &lock)) {
9047 /*
9048 * Remove the mapping to a single page
9049 * so that a subsequent access may
9050 * repromote. Since the underlying
9051 * page table page is fully populated,
9052 * this removal never frees a page
9053 * table page.
9054 */
9055 demoted = TRUE;
9056 va += VM_PAGE_TO_PHYS(m) - (oldpde &
9057 PG_PS_FRAME);
9058 pte = pmap_pde_to_pte(pde, va);
9059 pmap_remove_pte(pmap, pte, va, *pde,
9060 NULL, &lock);
9061 pmap_invalidate_page(pmap, va);
9062 } else
9063 demoted = TRUE;
9064
9065 if (demoted) {
9066 /*
9067 * The superpage mapping was removed
9068 * entirely and therefore 'pv' is no
9069 * longer valid.
9070 */
9071 if (pvf == pv)
9072 pvf = NULL;
9073 pv = NULL;
9074 }
9075 cleared++;
9076 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
9077 ("inconsistent pv lock %p %p for page %p",
9078 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
9079 } else
9080 not_cleared++;
9081 }
9082 PMAP_UNLOCK(pmap);
9083 /* Rotate the PV list if it has more than one entry. */
9084 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
9085 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
9086 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
9087 pvh->pv_gen++;
9088 }
9089 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
9090 goto out;
9091 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
9092 small_mappings:
9093 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
9094 goto out;
9095 pv = pvf;
9096 do {
9097 if (pvf == NULL)
9098 pvf = pv;
9099 pmap = PV_PMAP(pv);
9100 if (!PMAP_TRYLOCK(pmap)) {
9101 pvh_gen = pvh->pv_gen;
9102 md_gen = m->md.pv_gen;
9103 rw_wunlock(lock);
9104 PMAP_LOCK(pmap);
9105 rw_wlock(lock);
9106 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
9107 PMAP_UNLOCK(pmap);
9108 goto retry;
9109 }
9110 }
9111 PG_A = pmap_accessed_bit(pmap);
9112 PG_M = pmap_modified_bit(pmap);
9113 PG_RW = pmap_rw_bit(pmap);
9114 pde = pmap_pde(pmap, pv->pv_va);
9115 KASSERT((*pde & PG_PS) == 0,
9116 ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
9117 m));
9118 pte = pmap_pde_to_pte(pde, pv->pv_va);
9119 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
9120 vm_page_dirty(m);
9121 if ((*pte & PG_A) != 0) {
9122 if (safe_to_clear_referenced(pmap, *pte)) {
9123 atomic_clear_long(pte, PG_A);
9124 pmap_invalidate_page(pmap, pv->pv_va);
9125 cleared++;
9126 } else if ((*pte & PG_W) == 0) {
9127 /*
9128 * Wired pages cannot be paged out so
9129 * doing accessed bit emulation for
9130 * them is wasted effort. We do the
9131 * hard work for unwired pages only.
9132 */
9133 pmap_remove_pte(pmap, pte, pv->pv_va,
9134 *pde, &free, &lock);
9135 pmap_invalidate_page(pmap, pv->pv_va);
9136 cleared++;
9137 if (pvf == pv)
9138 pvf = NULL;
9139 pv = NULL;
9140 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
9141 ("inconsistent pv lock %p %p for page %p",
9142 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
9143 } else
9144 not_cleared++;
9145 }
9146 PMAP_UNLOCK(pmap);
9147 /* Rotate the PV list if it has more than one entry. */
9148 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
9149 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
9150 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
9151 m->md.pv_gen++;
9152 }
9153 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
9154 not_cleared < PMAP_TS_REFERENCED_MAX);
9155 out:
9156 rw_wunlock(lock);
9157 vm_page_free_pages_toq(&free, true);
9158 return (cleared + not_cleared);
9159 }
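/*
 * Usage sketch (illustrative and hedged; the policy shown is not part of
 * this file): a caller such as the page daemon might treat the return
 * value as an activity hint, e.g.
 *
 *	if (pmap_ts_referenced(m) == 0) {
 *		// No mapping has referenced the page recently.
 *	}
 *
 * The actual page aging policy lives in the VM page-out code, not in the
 * pmap layer.
 */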
9160
9161 /*
9162 * Apply the given advice to the specified range of addresses within the
9163 * given pmap. Depending on the advice, clear the referenced and/or
9164 * modified flags in each mapping and set the mapped page's dirty field.
9165 */
9166 void
9167 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
9168 {
9169 struct rwlock *lock;
9170 pml4_entry_t *pml4e;
9171 pdp_entry_t *pdpe;
9172 pd_entry_t oldpde, *pde;
9173 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
9174 vm_offset_t va, va_next;
9175 vm_page_t m;
9176 bool anychanged;
9177
9178 if (advice != MADV_DONTNEED && advice != MADV_FREE)
9179 return;
9180
9181 /*
9182 * A/D bit emulation requires an alternate code path when clearing
9183 * the modified and accessed bits below. Since this function is
9184 * advisory in nature we skip it entirely for pmaps that require
9185 * A/D bit emulation.
9186 */
9187 if (pmap_emulate_ad_bits(pmap))
9188 return;
9189
9190 PG_A = pmap_accessed_bit(pmap);
9191 PG_G = pmap_global_bit(pmap);
9192 PG_M = pmap_modified_bit(pmap);
9193 PG_V = pmap_valid_bit(pmap);
9194 PG_RW = pmap_rw_bit(pmap);
9195 anychanged = false;
9196 pmap_delayed_invl_start();
9197 PMAP_LOCK(pmap);
9198 for (; sva < eva; sva = va_next) {
9199 pml4e = pmap_pml4e(pmap, sva);
9200 if (pml4e == NULL || (*pml4e & PG_V) == 0) {
9201 va_next = (sva + NBPML4) & ~PML4MASK;
9202 if (va_next < sva)
9203 va_next = eva;
9204 continue;
9205 }
9206
9207 va_next = (sva + NBPDP) & ~PDPMASK;
9208 if (va_next < sva)
9209 va_next = eva;
9210 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
9211 if ((*pdpe & PG_V) == 0)
9212 continue;
9213 if ((*pdpe & PG_PS) != 0)
9214 continue;
9215
9216 va_next = (sva + NBPDR) & ~PDRMASK;
9217 if (va_next < sva)
9218 va_next = eva;
9219 pde = pmap_pdpe_to_pde(pdpe, sva);
9220 oldpde = *pde;
9221 if ((oldpde & PG_V) == 0)
9222 continue;
9223 else if ((oldpde & PG_PS) != 0) {
9224 if ((oldpde & PG_MANAGED) == 0)
9225 continue;
9226 lock = NULL;
9227 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
9228 if (lock != NULL)
9229 rw_wunlock(lock);
9230
9231 /*
9232 * The large page mapping was destroyed.
9233 */
9234 continue;
9235 }
9236
9237 /*
9238 * Unless the page mappings are wired, remove the
9239 * mapping to a single page so that a subsequent
9240 * access may repromote. Choosing the last page
9241 * within the address range [sva, min(va_next, eva))
9242 * generally results in more repromotions. Since the
9243 * underlying page table page is fully populated, this
9244 * removal never frees a page table page.
9245 */
9246 if ((oldpde & PG_W) == 0) {
9247 va = eva;
9248 if (va > va_next)
9249 va = va_next;
9250 va -= PAGE_SIZE;
9251 KASSERT(va >= sva,
9252 ("pmap_advise: no address gap"));
9253 pte = pmap_pde_to_pte(pde, va);
9254 KASSERT((*pte & PG_V) != 0,
9255 ("pmap_advise: invalid PTE"));
9256 pmap_remove_pte(pmap, pte, va, *pde, NULL,
9257 &lock);
9258 anychanged = true;
9259 }
9260 if (lock != NULL)
9261 rw_wunlock(lock);
9262 }
9263 if (va_next > eva)
9264 va_next = eva;
9265 va = va_next;
9266 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
9267 sva += PAGE_SIZE) {
9268 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
9269 goto maybe_invlrng;
9270 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
9271 if (advice == MADV_DONTNEED) {
9272 /*
9273 * Future calls to pmap_is_modified()
9274 * can be avoided by making the page
9275 * dirty now.
9276 */
9277 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
9278 vm_page_dirty(m);
9279 }
9280 atomic_clear_long(pte, PG_M | PG_A);
9281 } else if ((*pte & PG_A) != 0)
9282 atomic_clear_long(pte, PG_A);
9283 else
9284 goto maybe_invlrng;
9285
9286 if ((*pte & PG_G) != 0) {
9287 if (va == va_next)
9288 va = sva;
9289 } else
9290 anychanged = true;
9291 continue;
9292 maybe_invlrng:
9293 if (va != va_next) {
9294 pmap_invalidate_range(pmap, va, sva);
9295 va = va_next;
9296 }
9297 }
9298 if (va != va_next)
9299 pmap_invalidate_range(pmap, va, sva);
9300 }
9301 if (anychanged)
9302 pmap_invalidate_all(pmap);
9303 PMAP_UNLOCK(pmap);
9304 pmap_delayed_invl_finish();
9305 }
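/*
 * Usage sketch (illustrative, hedged): this function is the pmap-level
 * backend for madvise(2).  For instance, a user call along the lines of
 *
 *	madvise(addr, len, MADV_FREE);
 *
 * eventually reaches pmap_advise(pmap, sva, eva, MADV_FREE), which clears
 * PG_A and PG_M in the affected 4KB mappings as shown above.
 */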
9306
9307 /*
9308 * Clear the modify bits on the specified physical page.
9309 */
9310 void
9311 pmap_clear_modify(vm_page_t m)
9312 {
9313 struct md_page *pvh;
9314 pmap_t pmap;
9315 pv_entry_t next_pv, pv;
9316 pd_entry_t oldpde, *pde;
9317 pt_entry_t *pte, PG_M, PG_RW;
9318 struct rwlock *lock;
9319 vm_offset_t va;
9320 int md_gen, pvh_gen;
9321
9322 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
9323 ("pmap_clear_modify: page %p is not managed", m));
9324 vm_page_assert_busied(m);
9325
9326 if (!pmap_page_is_write_mapped(m))
9327 return;
9328 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
9329 pa_to_pvh(VM_PAGE_TO_PHYS(m));
9330 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
9331 rw_wlock(lock);
9332 restart:
9333 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
9334 pmap = PV_PMAP(pv);
9335 if (!PMAP_TRYLOCK(pmap)) {
9336 pvh_gen = pvh->pv_gen;
9337 rw_wunlock(lock);
9338 PMAP_LOCK(pmap);
9339 rw_wlock(lock);
9340 if (pvh_gen != pvh->pv_gen) {
9341 PMAP_UNLOCK(pmap);
9342 goto restart;
9343 }
9344 }
9345 PG_M = pmap_modified_bit(pmap);
9346 PG_RW = pmap_rw_bit(pmap);
9347 va = pv->pv_va;
9348 pde = pmap_pde(pmap, va);
9349 oldpde = *pde;
9350 /* If oldpde has PG_RW set, then it also has PG_M set. */
9351 if ((oldpde & PG_RW) != 0 &&
9352 pmap_demote_pde_locked(pmap, pde, va, &lock) &&
9353 (oldpde & PG_W) == 0) {
9354 /*
9355 * Write protect the mapping to a single page so that
9356 * a subsequent write access may repromote.
9357 */
9358 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
9359 pte = pmap_pde_to_pte(pde, va);
9360 atomic_clear_long(pte, PG_M | PG_RW);
9361 vm_page_dirty(m);
9362 pmap_invalidate_page(pmap, va);
9363 }
9364 PMAP_UNLOCK(pmap);
9365 }
9366 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
9367 pmap = PV_PMAP(pv);
9368 if (!PMAP_TRYLOCK(pmap)) {
9369 md_gen = m->md.pv_gen;
9370 pvh_gen = pvh->pv_gen;
9371 rw_wunlock(lock);
9372 PMAP_LOCK(pmap);
9373 rw_wlock(lock);
9374 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
9375 PMAP_UNLOCK(pmap);
9376 goto restart;
9377 }
9378 }
9379 PG_M = pmap_modified_bit(pmap);
9380 PG_RW = pmap_rw_bit(pmap);
9381 pde = pmap_pde(pmap, pv->pv_va);
9382 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
9383 " a 2mpage in page %p's pv list", m));
9384 pte = pmap_pde_to_pte(pde, pv->pv_va);
9385 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
9386 atomic_clear_long(pte, PG_M);
9387 pmap_invalidate_page(pmap, pv->pv_va);
9388 }
9389 PMAP_UNLOCK(pmap);
9390 }
9391 rw_wunlock(lock);
9392 }
9393
9394 /*
9395 * Miscellaneous support routines follow
9396 */
9397
9398 /* Adjust the properties for a leaf page table entry. */
9399 static __inline void
9400 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask)
9401 {
9402 u_long opte, npte;
9403
9404 opte = *(u_long *)pte;
9405 do {
9406 npte = opte & ~mask;
9407 npte |= bits;
9408 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte,
9409 npte));
9410 }
9411
9412 /*
9413 * Map a set of physical memory pages into the kernel virtual
9414 * address space. Return a pointer to where it is mapped. This
9415 * routine is intended to be used for mapping device memory,
9416 * NOT real memory.
9417 */
9418 static void *
9419 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
9420 {
9421 struct pmap_preinit_mapping *ppim;
9422 vm_offset_t va, offset;
9423 vm_size_t tmpsize;
9424 int i;
9425
9426 offset = pa & PAGE_MASK;
9427 size = round_page(offset + size);
9428 pa = trunc_page(pa);
9429
9430 if (!pmap_initialized) {
9431 va = 0;
9432 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
9433 ppim = pmap_preinit_mapping + i;
9434 if (ppim->va == 0) {
9435 ppim->pa = pa;
9436 ppim->sz = size;
9437 ppim->mode = mode;
9438 ppim->va = virtual_avail;
9439 virtual_avail += size;
9440 va = ppim->va;
9441 break;
9442 }
9443 }
9444 if (va == 0)
9445 panic("%s: too many preinit mappings", __func__);
9446 } else {
9447 /*
9448 * If we have a preinit mapping, re-use it.
9449 */
9450 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
9451 ppim = pmap_preinit_mapping + i;
9452 if (ppim->pa == pa && ppim->sz == size &&
9453 (ppim->mode == mode ||
9454 (flags & MAPDEV_SETATTR) == 0))
9455 return ((void *)(ppim->va + offset));
9456 }
9457 /*
9458 * If the specified range of physical addresses fits within
9459 * the direct map window, use the direct map.
9460 */
9461 if (pa < dmaplimit && pa + size <= dmaplimit) {
9462 va = PHYS_TO_DMAP(pa);
9463 if ((flags & MAPDEV_SETATTR) != 0) {
9464 PMAP_LOCK(kernel_pmap);
9465 i = pmap_change_props_locked(va, size,
9466 PROT_NONE, mode, flags);
9467 PMAP_UNLOCK(kernel_pmap);
9468 } else
9469 i = 0;
9470 if (!i)
9471 return ((void *)(va + offset));
9472 }
9473 va = kva_alloc(size);
9474 if (va == 0)
9475 panic("%s: Couldn't allocate KVA", __func__);
9476 }
9477 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
9478 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
9479 pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
9480 if ((flags & MAPDEV_FLUSHCACHE) != 0)
9481 pmap_invalidate_cache_range(va, va + tmpsize);
9482 return ((void *)(va + offset));
9483 }
9484
9485 void *
9486 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
9487 {
9488
9489 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE |
9490 MAPDEV_SETATTR));
9491 }
9492
9493 void *
9494 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
9495 {
9496
9497 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
9498 }
9499
9500 void *
9501 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
9502 {
9503
9504 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE,
9505 MAPDEV_SETATTR));
9506 }
9507
9508 void *
9509 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
9510 {
9511
9512 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK,
9513 MAPDEV_FLUSHCACHE));
9514 }
9515
9516 void
9517 pmap_unmapdev(void *p, vm_size_t size)
9518 {
9519 struct pmap_preinit_mapping *ppim;
9520 vm_offset_t offset, va;
9521 int i;
9522
9523 va = (vm_offset_t)p;
9524
9525 /* If we gave a direct map region in pmap_mapdev, do nothing */
9526 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
9527 return;
9528 offset = va & PAGE_MASK;
9529 size = round_page(offset + size);
9530 va = trunc_page(va);
9531 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
9532 ppim = pmap_preinit_mapping + i;
9533 if (ppim->va == va && ppim->sz == size) {
9534 if (pmap_initialized)
9535 return;
9536 ppim->pa = 0;
9537 ppim->va = 0;
9538 ppim->sz = 0;
9539 ppim->mode = 0;
9540 if (va + size == virtual_avail)
9541 virtual_avail = va;
9542 return;
9543 }
9544 }
9545 if (pmap_initialized) {
9546 pmap_qremove(va, atop(size));
9547 kva_free(va, size);
9548 }
9549 }
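/*
 * Usage sketch (illustrative; 'bar_pa' and 'bar_size' are hypothetical
 * names): a driver-style caller would typically pair pmap_mapdev() with
 * pmap_unmapdev():
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev(bar_pa, bar_size);	// uncacheable mapping
 *	// ... access device registers through 'regs' ...
 *	pmap_unmapdev(regs, bar_size);
 *
 * In practice most drivers go through the bus_space/resource machinery
 * rather than calling these routines directly.
 */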
9550
9551 /*
9552 * Tries to demote a 1GB page mapping.
9553 */
9554 static boolean_t
9555 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
9556 {
9557 pdp_entry_t newpdpe, oldpdpe;
9558 pd_entry_t *firstpde, newpde, *pde;
9559 pt_entry_t PG_A, PG_M, PG_RW, PG_V;
9560 vm_paddr_t pdpgpa;
9561 vm_page_t pdpg;
9562
9563 PG_A = pmap_accessed_bit(pmap);
9564 PG_M = pmap_modified_bit(pmap);
9565 PG_V = pmap_valid_bit(pmap);
9566 PG_RW = pmap_rw_bit(pmap);
9567
9568 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9569 oldpdpe = *pdpe;
9570 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
9571 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
9572 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT,
9573 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT);
9574 if (pdpg == NULL) {
9575 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
9576 " in pmap %p", va, pmap);
9577 return (FALSE);
9578 }
9579 pdpgpa = VM_PAGE_TO_PHYS(pdpg);
9580 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
9581 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
9582 KASSERT((oldpdpe & PG_A) != 0,
9583 ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
9584 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
9585 ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
9586 newpde = oldpdpe;
9587
9588 /*
9589 * Initialize the page directory page.
9590 */
9591 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
9592 *pde = newpde;
9593 newpde += NBPDR;
9594 }
9595
9596 /*
9597 * Demote the mapping.
9598 */
9599 *pdpe = newpdpe;
9600
9601 /*
9602 * Invalidate a stale recursive mapping of the page directory page.
9603 */
9604 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
9605
9606 counter_u64_add(pmap_pdpe_demotions, 1);
9607 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
9608 " in pmap %p", va, pmap);
9609 return (TRUE);
9610 }
9611
9612 /*
9613 * Sets the memory attribute for the specified page.
9614 */
9615 void
9616 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
9617 {
9618
9619 m->md.pat_mode = ma;
9620
9621 /*
9622 * If "m" is a normal page, update its direct mapping. This update
9623 * can be relied upon to perform any cache operations that are
9624 * required for data coherence.
9625 */
9626 if ((m->flags & PG_FICTITIOUS) == 0 &&
9627 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
9628 m->md.pat_mode))
9629 panic("memory attribute change on the direct map failed");
9630 }
9631
9632 void
9633 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma)
9634 {
9635 int error;
9636
9637 m->md.pat_mode = ma;
9638
9639 if ((m->flags & PG_FICTITIOUS) != 0)
9640 return;
9641 PMAP_LOCK(kernel_pmap);
9642 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
9643 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0);
9644 PMAP_UNLOCK(kernel_pmap);
9645 if (error != 0)
9646 panic("memory attribute change on the direct map failed");
9647 }
9648
9649 /*
9650 * Changes the specified virtual address range's memory type to that given by
9651 * the parameter "mode". The specified virtual address range must be
9652 * completely contained within either the direct map or the kernel map. If
9653 * the virtual address range is contained within the kernel map, then the
9654 * memory type for each of the corresponding ranges of the direct map is also
9655 * changed. (The corresponding ranges of the direct map are those ranges that
9656 * map the same physical pages as the specified virtual address range.) These
9657 * changes to the direct map are necessary because Intel describes the
9658 * behavior of their processors as "undefined" if two or more mappings to the
9659 * same physical page have different memory types.
9660 *
9661 * Returns zero if the change completed successfully, and either EINVAL or
9662 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
9663 * of the virtual address range was not mapped, and ENOMEM is returned if
9664 * there was insufficient memory available to complete the change. In the
9665 * latter case, the memory type may have been changed on some part of the
9666 * virtual address range or the direct map.
9667 */
9668 int
9669 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
9670 {
9671 int error;
9672
9673 PMAP_LOCK(kernel_pmap);
9674 error = pmap_change_props_locked(va, size, PROT_NONE, mode,
9675 MAPDEV_FLUSHCACHE);
9676 PMAP_UNLOCK(kernel_pmap);
9677 return (error);
9678 }
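/*
 * Usage sketch (illustrative; 'fb_va' and 'fb_size' are hypothetical
 * names): a caller that wants write-combining access to a framebuffer it
 * has already mapped might do
 *
 *	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
 *
 * and must be prepared for EINVAL (part of the range unmapped) or ENOMEM
 * (demotion failure), as described above.
 */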
9679
9680 /*
9681 * Changes the specified virtual address range's protections to those
9682 * specified by "prot". Like pmap_change_attr(), protections for aliases
9683 * in the direct map are updated as well. Protections on aliasing mappings may
9684 * be a subset of the requested protections; for example, mappings in the direct
9685 * map are never executable.
9686 */
9687 int
9688 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
9689 {
9690 int error;
9691
9692 /* Only supported within the kernel map. */
9693 if (va < VM_MIN_KERNEL_ADDRESS)
9694 return (EINVAL);
9695
9696 PMAP_LOCK(kernel_pmap);
9697 error = pmap_change_props_locked(va, size, prot, -1,
9698 MAPDEV_ASSERTVALID);
9699 PMAP_UNLOCK(kernel_pmap);
9700 return (error);
9701 }
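/*
 * Usage sketch (illustrative; 'text_va' and 'text_sz' are hypothetical
 * names): making a range of kernel text read-only and executable could
 * look like
 *
 *	error = pmap_change_prot(text_va, text_sz,
 *	    VM_PROT_READ | VM_PROT_EXECUTE);
 *
 * Note that the direct-map alias of the same pages never becomes
 * executable, as stated above.
 */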
9702
9703 static int
9704 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
9705 int mode, int flags)
9706 {
9707 vm_offset_t base, offset, tmpva;
9708 vm_paddr_t pa_start, pa_end, pa_end1;
9709 pdp_entry_t *pdpe;
9710 pd_entry_t *pde, pde_bits, pde_mask;
9711 pt_entry_t *pte, pte_bits, pte_mask;
9712 int error;
9713 bool changed;
9714
9715 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
9716 base = trunc_page(va);
9717 offset = va & PAGE_MASK;
9718 size = round_page(offset + size);
9719
9720 /*
9721 * Only supported on kernel virtual addresses, including the direct
9722 * map but excluding the recursive map.
9723 */
9724 if (base < DMAP_MIN_ADDRESS)
9725 return (EINVAL);
9726
9727 /*
9728 * Construct our flag sets and masks. "bits" is the subset of
9729 * "mask" that will be set in each modified PTE.
9730 *
9731 * Mappings in the direct map are never allowed to be executable.
9732 */
9733 pde_bits = pte_bits = 0;
9734 pde_mask = pte_mask = 0;
9735 if (mode != -1) {
9736 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true);
9737 pde_mask |= X86_PG_PDE_CACHE;
9738 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false);
9739 pte_mask |= X86_PG_PTE_CACHE;
9740 }
9741 if (prot != VM_PROT_NONE) {
9742 if ((prot & VM_PROT_WRITE) != 0) {
9743 pde_bits |= X86_PG_RW;
9744 pte_bits |= X86_PG_RW;
9745 }
9746 if ((prot & VM_PROT_EXECUTE) == 0 ||
9747 va < VM_MIN_KERNEL_ADDRESS) {
9748 pde_bits |= pg_nx;
9749 pte_bits |= pg_nx;
9750 }
9751 pde_mask |= X86_PG_RW | pg_nx;
9752 pte_mask |= X86_PG_RW | pg_nx;
9753 }
9754
9755 /*
9756 * Pages that aren't mapped aren't supported. Also break down 2MB pages
9757 * into 4KB pages if required.
9758 */
9759 for (tmpva = base; tmpva < base + size; ) {
9760 pdpe = pmap_pdpe(kernel_pmap, tmpva);
9761 if (pdpe == NULL || *pdpe == 0) {
9762 KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
9763 ("%s: addr %#lx is not mapped", __func__, tmpva));
9764 return (EINVAL);
9765 }
9766 if (*pdpe & PG_PS) {
9767 /*
9768 * If the current 1GB page already has the required
9769 * properties, then we need not demote this page. Just
9770 * increment tmpva to the next 1GB page frame.
9771 */
9772 if ((*pdpe & pde_mask) == pde_bits) {
9773 tmpva = trunc_1gpage(tmpva) + NBPDP;
9774 continue;
9775 }
9776
9777 /*
9778 * If the current offset aligns with a 1GB page frame
9779 * and there is at least 1GB left within the range, then
9780 * we need not break down this page into 2MB pages.
9781 */
9782 if ((tmpva & PDPMASK) == 0 &&
9783 tmpva + PDPMASK < base + size) {
9784 tmpva += NBPDP;
9785 continue;
9786 }
9787 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
9788 return (ENOMEM);
9789 }
9790 pde = pmap_pdpe_to_pde(pdpe, tmpva);
9791 if (*pde == 0) {
9792 KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
9793 ("%s: addr %#lx is not mapped", __func__, tmpva));
9794 return (EINVAL);
9795 }
9796 if (*pde & PG_PS) {
9797 /*
9798 * If the current 2MB page already has the required
9799 * properties, then we need not demote this page. Just
9800 * increment tmpva to the next 2MB page frame.
9801 */
9802 if ((*pde & pde_mask) == pde_bits) {
9803 tmpva = trunc_2mpage(tmpva) + NBPDR;
9804 continue;
9805 }
9806
9807 /*
9808 * If the current offset aligns with a 2MB page frame
9809 * and there is at least 2MB left within the range, then
9810 * we need not break down this page into 4KB pages.
9811 */
9812 if ((tmpva & PDRMASK) == 0 &&
9813 tmpva + PDRMASK < base + size) {
9814 tmpva += NBPDR;
9815 continue;
9816 }
9817 if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
9818 return (ENOMEM);
9819 }
9820 pte = pmap_pde_to_pte(pde, tmpva);
9821 if (*pte == 0) {
9822 KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
9823 ("%s: addr %#lx is not mapped", __func__, tmpva));
9824 return (EINVAL);
9825 }
9826 tmpva += PAGE_SIZE;
9827 }
9828 error = 0;
9829
9830 /*
9831 * Ok, all the pages exist, so run through them updating their
9832 * properties if required.
9833 */
9834 changed = false;
9835 pa_start = pa_end = 0;
9836 for (tmpva = base; tmpva < base + size; ) {
9837 pdpe = pmap_pdpe(kernel_pmap, tmpva);
9838 if (*pdpe & PG_PS) {
9839 if ((*pdpe & pde_mask) != pde_bits) {
9840 pmap_pte_props(pdpe, pde_bits, pde_mask);
9841 changed = true;
9842 }
9843 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
9844 (*pdpe & PG_PS_FRAME) < dmaplimit) {
9845 if (pa_start == pa_end) {
9846 /* Start physical address run. */
9847 pa_start = *pdpe & PG_PS_FRAME;
9848 pa_end = pa_start + NBPDP;
9849 } else if (pa_end == (*pdpe & PG_PS_FRAME))
9850 pa_end += NBPDP;
9851 else {
9852 /* Run ended, update direct map. */
9853 error = pmap_change_props_locked(
9854 PHYS_TO_DMAP(pa_start),
9855 pa_end - pa_start, prot, mode,
9856 flags);
9857 if (error != 0)
9858 break;
9859 /* Start physical address run. */
9860 pa_start = *pdpe & PG_PS_FRAME;
9861 pa_end = pa_start + NBPDP;
9862 }
9863 }
9864 tmpva = trunc_1gpage(tmpva) + NBPDP;
9865 continue;
9866 }
9867 pde = pmap_pdpe_to_pde(pdpe, tmpva);
9868 if (*pde & PG_PS) {
9869 if ((*pde & pde_mask) != pde_bits) {
9870 pmap_pte_props(pde, pde_bits, pde_mask);
9871 changed = true;
9872 }
9873 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
9874 (*pde & PG_PS_FRAME) < dmaplimit) {
9875 if (pa_start == pa_end) {
9876 /* Start physical address run. */
9877 pa_start = *pde & PG_PS_FRAME;
9878 pa_end = pa_start + NBPDR;
9879 } else if (pa_end == (*pde & PG_PS_FRAME))
9880 pa_end += NBPDR;
9881 else {
9882 /* Run ended, update direct map. */
9883 error = pmap_change_props_locked(
9884 PHYS_TO_DMAP(pa_start),
9885 pa_end - pa_start, prot, mode,
9886 flags);
9887 if (error != 0)
9888 break;
9889 /* Start physical address run. */
9890 pa_start = *pde & PG_PS_FRAME;
9891 pa_end = pa_start + NBPDR;
9892 }
9893 }
9894 tmpva = trunc_2mpage(tmpva) + NBPDR;
9895 } else {
9896 pte = pmap_pde_to_pte(pde, tmpva);
9897 if ((*pte & pte_mask) != pte_bits) {
9898 pmap_pte_props(pte, pte_bits, pte_mask);
9899 changed = true;
9900 }
9901 if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
9902 (*pte & PG_FRAME) < dmaplimit) {
9903 if (pa_start == pa_end) {
9904 /* Start physical address run. */
9905 pa_start = *pte & PG_FRAME;
9906 pa_end = pa_start + PAGE_SIZE;
9907 } else if (pa_end == (*pte & PG_FRAME))
9908 pa_end += PAGE_SIZE;
9909 else {
9910 /* Run ended, update direct map. */
9911 error = pmap_change_props_locked(
9912 PHYS_TO_DMAP(pa_start),
9913 pa_end - pa_start, prot, mode,
9914 flags);
9915 if (error != 0)
9916 break;
9917 /* Start physical address run. */
9918 pa_start = *pte & PG_FRAME;
9919 pa_end = pa_start + PAGE_SIZE;
9920 }
9921 }
9922 tmpva += PAGE_SIZE;
9923 }
9924 }
9925 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
9926 pa_end1 = MIN(pa_end, dmaplimit);
9927 if (pa_start != pa_end1)
9928 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start),
9929 pa_end1 - pa_start, prot, mode, flags);
9930 }
9931
9932 /*
9933 * Flush CPU caches if required, to ensure that no data that should
9934 * not be cached remains cached.
9935 */
9936 if (changed) {
9937 pmap_invalidate_range(kernel_pmap, base, tmpva);
9938 if ((flags & MAPDEV_FLUSHCACHE) != 0)
9939 pmap_invalidate_cache_range(base, tmpva);
9940 }
9941 return (error);
9942 }
9943
9944 /*
9945 * Demotes any mapping within the direct map region that covers more than the
9946 * specified range of physical addresses. This range's size must be a power
9947 * of two and its starting address must be a multiple of its size. Since the
9948 * demotion does not change any attributes of the mapping, a TLB invalidation
9949 * is not mandatory. The caller may, however, request a TLB invalidation.
9950 */
9951 void
9952 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
9953 {
9954 pdp_entry_t *pdpe;
9955 pd_entry_t *pde;
9956 vm_offset_t va;
9957 boolean_t changed;
9958
9959 if (len == 0)
9960 return;
9961 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
9962 KASSERT((base & (len - 1)) == 0,
9963 ("pmap_demote_DMAP: base is not a multiple of len"));
9964 if (len < NBPDP && base < dmaplimit) {
9965 va = PHYS_TO_DMAP(base);
9966 changed = FALSE;
9967 PMAP_LOCK(kernel_pmap);
9968 pdpe = pmap_pdpe(kernel_pmap, va);
9969 if ((*pdpe & X86_PG_V) == 0)
9970 panic("pmap_demote_DMAP: invalid PDPE");
9971 if ((*pdpe & PG_PS) != 0) {
9972 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
9973 panic("pmap_demote_DMAP: PDPE failed");
9974 changed = TRUE;
9975 }
9976 if (len < NBPDR) {
9977 pde = pmap_pdpe_to_pde(pdpe, va);
9978 if ((*pde & X86_PG_V) == 0)
9979 panic("pmap_demote_DMAP: invalid PDE");
9980 if ((*pde & PG_PS) != 0) {
9981 if (!pmap_demote_pde(kernel_pmap, pde, va))
9982 panic("pmap_demote_DMAP: PDE failed");
9983 changed = TRUE;
9984 }
9985 }
9986 if (changed && invalidate)
9987 pmap_invalidate_page(kernel_pmap, va);
9988 PMAP_UNLOCK(kernel_pmap);
9989 }
9990 }
9991
9992 /*
9993 * Perform the pmap work for mincore(2). If the page is not both referenced and
9994 * modified by this pmap, returns its physical address so that the caller can
9995 * find other mappings.
9996 */
9997 int
9998 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
9999 {
10000 pdp_entry_t *pdpe;
10001 pd_entry_t *pdep;
10002 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
10003 vm_paddr_t pa;
10004 int val;
10005
10006 PG_A = pmap_accessed_bit(pmap);
10007 PG_M = pmap_modified_bit(pmap);
10008 PG_V = pmap_valid_bit(pmap);
10009 PG_RW = pmap_rw_bit(pmap);
10010
10011 PMAP_LOCK(pmap);
10012 pte = 0;
10013 pa = 0;
10014 val = 0;
10015 pdpe = pmap_pdpe(pmap, addr);
10016 if (pdpe == NULL)
10017 goto out;
10018 if ((*pdpe & PG_V) != 0) {
10019 if ((*pdpe & PG_PS) != 0) {
10020 pte = *pdpe;
10021 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) &
10022 PG_FRAME;
10023 val = MINCORE_PSIND(2);
10024 } else {
10025 pdep = pmap_pde(pmap, addr);
10026 if (pdep != NULL && (*pdep & PG_V) != 0) {
10027 if ((*pdep & PG_PS) != 0) {
10028 pte = *pdep;
10029 /* Compute the physical address of the 4KB page. */
10030 pa = ((pte & PG_PS_FRAME) | (addr &
10031 PDRMASK)) & PG_FRAME;
10032 val = MINCORE_PSIND(1);
10033 } else {
10034 pte = *pmap_pde_to_pte(pdep, addr);
10035 pa = pte & PG_FRAME;
10036 val = 0;
10037 }
10038 }
10039 }
10040 }
10041 if ((pte & PG_V) != 0) {
10042 val |= MINCORE_INCORE;
10043 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
10044 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
10045 if ((pte & PG_A) != 0)
10046 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
10047 }
10048 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
10049 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
10050 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
10051 *pap = pa;
10052 }
10053 out:
10054 PMAP_UNLOCK(pmap);
10055 return (val);
10056 }
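/*
 * Interpretation sketch (illustrative, summarizing the code above): for a
 * valid 4KB mapping the return value has MINCORE_INCORE set, possibly
 * together with the MINCORE_MODIFIED and MINCORE_REFERENCED flag pairs;
 * for 2MB and 1GB mappings MINCORE_PSIND(1) or MINCORE_PSIND(2) is set as
 * well, so mincore(2) can report the superpage size backing the address.
 */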
10057
10058 static uint64_t
10059 pmap_pcid_alloc(pmap_t pmap, struct pmap_pcid *pcidp)
10060 {
10061 uint32_t gen, new_gen, pcid_next;
10062
10063 CRITICAL_ASSERT(curthread);
10064 gen = PCPU_GET(pcid_gen);
10065 if (pcidp->pm_pcid == PMAP_PCID_KERN)
10066 return (pti ? 0 : CR3_PCID_SAVE);
10067 if (pcidp->pm_gen == gen)
10068 return (CR3_PCID_SAVE);
10069 pcid_next = PCPU_GET(pcid_next);
10070 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
10071 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
10072 ("cpu %d pcid_next %#x", PCPU_GET(cpuid), pcid_next));
10073 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
10074 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
10075 new_gen = gen + 1;
10076 if (new_gen == 0)
10077 new_gen = 1;
10078 PCPU_SET(pcid_gen, new_gen);
10079 pcid_next = PMAP_PCID_KERN + 1;
10080 } else {
10081 new_gen = gen;
10082 }
10083 pcidp->pm_pcid = pcid_next;
10084 pcidp->pm_gen = new_gen;
10085 PCPU_SET(pcid_next, pcid_next + 1);
10086 return (0);
10087 }
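/*
 * Descriptive note (summarizing the logic above): pm_gen acts as a lazy
 * invalidation tag.  While pm_gen matches the per-CPU pcid_gen, the pmap's
 * PCID is still valid and CR3_PCID_SAVE is returned, so cached TLB entries
 * may be kept.  When the CPU runs out of PCIDs, pcid_gen is bumped, which
 * implicitly invalidates every pmap's cached PCID on this CPU; the next
 * activation then allocates a fresh PCID and returns 0, forcing a full TLB
 * flush for that PCID on the CR3 load.
 */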
10088
10089 static uint64_t
10090 pmap_pcid_alloc_checked(pmap_t pmap, struct pmap_pcid *pcidp)
10091 {
10092 uint64_t cached;
10093
10094 cached = pmap_pcid_alloc(pmap, pcidp);
10095 KASSERT(pcidp->pm_pcid < PMAP_PCID_OVERMAX,
10096 ("pmap %p cpu %d pcid %#x", pmap, PCPU_GET(cpuid), pcidp->pm_pcid));
10097 KASSERT(pcidp->pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap,
10098 ("non-kernel pmap pmap %p cpu %d pcid %#x",
10099 pmap, PCPU_GET(cpuid), pcidp->pm_pcid));
10100 return (cached);
10101 }
10102
10103 static void
10104 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
10105 {
10106
10107 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ?
10108 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
10109 }
10110
10111 static void
10112 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
10113 {
10114 pmap_t old_pmap;
10115 struct pmap_pcid *pcidp, *old_pcidp;
10116 uint64_t cached, cr3, kcr3, ucr3;
10117
10118 KASSERT((read_rflags() & PSL_I) == 0,
10119 ("PCID needs interrupts disabled in pmap_activate_sw()"));
10120
10121 /* See the comment in pmap_invalidate_page_pcid(). */
10122 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) {
10123 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
10124 old_pmap = PCPU_GET(curpmap);
10125 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3);
10126 old_pcidp = zpcpu_get_cpu(old_pmap->pm_pcidp, cpuid);
10127 old_pcidp->pm_gen = 0;
10128 }
10129
10130 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid);
10131 cached = pmap_pcid_alloc_checked(pmap, pcidp);
10132 cr3 = rcr3();
10133 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
10134 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid);
10135 PCPU_SET(curpmap, pmap);
10136 kcr3 = pmap->pm_cr3 | pcidp->pm_pcid;
10137 ucr3 = pmap->pm_ucr3 | pcidp->pm_pcid | PMAP_PCID_USER_PT;
10138
10139 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3)
10140 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
10141
10142 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
10143 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
10144 if (cached)
10145 counter_u64_add(pcid_save_cnt, 1);
10146
10147 pmap_activate_sw_pti_post(td, pmap);
10148 }
10149
10150 static void
10151 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
10152 u_int cpuid)
10153 {
10154 struct pmap_pcid *pcidp;
10155 uint64_t cached, cr3;
10156
10157 KASSERT((read_rflags() & PSL_I) == 0,
10158 ("PCID needs interrupts disabled in pmap_activate_sw()"));
10159
10160 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid);
10161 cached = pmap_pcid_alloc_checked(pmap, pcidp);
10162 cr3 = rcr3();
10163 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
10164 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid | cached);
10165 PCPU_SET(curpmap, pmap);
10166 if (cached)
10167 counter_u64_add(pcid_save_cnt, 1);
10168 }
10169
10170 static void
10171 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
10172 u_int cpuid __unused)
10173 {
10174
10175 load_cr3(pmap->pm_cr3);
10176 PCPU_SET(curpmap, pmap);
10177 }
10178
10179 static void
10180 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap,
10181 u_int cpuid __unused)
10182 {
10183
10184 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid);
10185 PCPU_SET(kcr3, pmap->pm_cr3);
10186 PCPU_SET(ucr3, pmap->pm_ucr3);
10187 pmap_activate_sw_pti_post(td, pmap);
10188 }
10189
10190 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t,
10191 u_int))
10192 {
10193
10194 if (pmap_pcid_enabled && pti)
10195 return (pmap_activate_sw_pcid_pti);
10196 else if (pmap_pcid_enabled && !pti)
10197 return (pmap_activate_sw_pcid_nopti);
10198 else if (!pmap_pcid_enabled && pti)
10199 return (pmap_activate_sw_nopcid_pti);
10200 else /* if (!pmap_pcid_enabled && !pti) */
10201 return (pmap_activate_sw_nopcid_nopti);
10202 }
10203
10204 void
10205 pmap_activate_sw(struct thread *td)
10206 {
10207 pmap_t oldpmap, pmap;
10208 u_int cpuid;
10209
10210 oldpmap = PCPU_GET(curpmap);
10211 pmap = vmspace_pmap(td->td_proc->p_vmspace);
10212 if (oldpmap == pmap) {
10213 if (cpu_vendor_id != CPU_VENDOR_INTEL)
10214 mfence();
10215 return;
10216 }
10217 cpuid = PCPU_GET(cpuid);
10218 #ifdef SMP
10219 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
10220 #else
10221 CPU_SET(cpuid, &pmap->pm_active);
10222 #endif
10223 pmap_activate_sw_mode(td, pmap, cpuid);
10224 #ifdef SMP
10225 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
10226 #else
10227 CPU_CLR(cpuid, &oldpmap->pm_active);
10228 #endif
10229 }
10230
10231 void
10232 pmap_activate(struct thread *td)
10233 {
10234 /*
10235 * invltlb_{invpcid,}_pcid_handler() is used to handle an
10236 * invalidate_all IPI, which checks for curpmap ==
10237 * smp_tlb_pmap. The below sequence of operations has a
10238 * window where %CR3 is loaded with the new pmap's PML4
10239 * address, but the curpmap value has not yet been updated.
10240 * This causes the invltlb IPI handler, which is called
10241 * between the updates, to execute as a NOP, which leaves
10242 * stale TLB entries.
10243 *
10244 * Note that the most common use of pmap_activate_sw(), from
10245 * a context switch, is immune to this race, because
10246 * interrupts are disabled (while the thread lock is owned),
10247 * so the IPI is delayed until after curpmap is updated. Protect
10248 * other callers in a similar way, by disabling interrupts
10249 * around the %cr3 register reload and curpmap assignment.
10250 */
10251 spinlock_enter();
10252 pmap_activate_sw(td);
10253 spinlock_exit();
10254 }
10255
10256 void
10257 pmap_activate_boot(pmap_t pmap)
10258 {
10259 uint64_t kcr3;
10260 u_int cpuid;
10261
10262 /*
10263 * The kernel_pmap must never be deactivated, and we ensure that
10264 * by never activating it at all.
10265 */
10266 MPASS(pmap != kernel_pmap);
10267
10268 cpuid = PCPU_GET(cpuid);
10269 #ifdef SMP
10270 CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
10271 #else
10272 CPU_SET(cpuid, &pmap->pm_active);
10273 #endif
10274 PCPU_SET(curpmap, pmap);
10275 if (pti) {
10276 kcr3 = pmap->pm_cr3;
10277 if (pmap_pcid_enabled)
10278 kcr3 |= pmap_get_pcid(pmap) | CR3_PCID_SAVE;
10279 } else {
10280 kcr3 = PMAP_NO_CR3;
10281 }
10282 PCPU_SET(kcr3, kcr3);
10283 PCPU_SET(ucr3, PMAP_NO_CR3);
10284 }
10285
10286 void
10287 pmap_active_cpus(pmap_t pmap, cpuset_t *res)
10288 {
10289 *res = pmap->pm_active;
10290 }
10291
10292 void
10293 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
10294 {
10295 }
10296
10297 /*
10298 * Increase the starting virtual address of the given mapping if a
10299 * different alignment might result in more superpage mappings.
10300 */
10301 void
10302 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
10303 vm_offset_t *addr, vm_size_t size)
10304 {
10305 vm_offset_t superpage_offset;
10306
10307 if (size < NBPDR)
10308 return;
10309 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
10310 offset += ptoa(object->pg_color);
10311 superpage_offset = offset & PDRMASK;
10312 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
10313 (*addr & PDRMASK) == superpage_offset)
10314 return;
10315 if ((*addr & PDRMASK) < superpage_offset)
10316 *addr = (*addr & ~PDRMASK) + superpage_offset;
10317 else
10318 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
10319 }
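/*
 * Worked example (illustrative numbers): with NBPDR == 2MB, suppose the
 * object offset yields superpage_offset == 0x1ff000 and the proposed *addr
 * is 2MB-aligned.  If the mapping is large enough to still contain a full
 * 2MB run after realignment, *addr is advanced so that
 * (*addr & PDRMASK) == 0x1ff000; the virtual and physical addresses then
 * agree modulo 2MB, which is what later superpage promotion requires.
 */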
10320
10321 #ifdef INVARIANTS
10322 static unsigned long num_dirty_emulations;
10323 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
10324 &num_dirty_emulations, 0, NULL);
10325
10326 static unsigned long num_accessed_emulations;
10327 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
10328 &num_accessed_emulations, 0, NULL);
10329
10330 static unsigned long num_superpage_accessed_emulations;
10331 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
10332 &num_superpage_accessed_emulations, 0, NULL);
10333
10334 static unsigned long ad_emulation_superpage_promotions;
10335 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
10336 &ad_emulation_superpage_promotions, 0, NULL);
10337 #endif /* INVARIANTS */
10338
10339 int
10340 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
10341 {
10342 int rv;
10343 struct rwlock *lock;
10344 #if VM_NRESERVLEVEL > 0
10345 vm_page_t m, mpte;
10346 #endif
10347 pd_entry_t *pde;
10348 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
10349
10350 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
10351 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
10352
10353 if (!pmap_emulate_ad_bits(pmap))
10354 return (-1);
10355
10356 PG_A = pmap_accessed_bit(pmap);
10357 PG_M = pmap_modified_bit(pmap);
10358 PG_V = pmap_valid_bit(pmap);
10359 PG_RW = pmap_rw_bit(pmap);
10360
10361 rv = -1;
10362 lock = NULL;
10363 PMAP_LOCK(pmap);
10364
10365 pde = pmap_pde(pmap, va);
10366 if (pde == NULL || (*pde & PG_V) == 0)
10367 goto done;
10368
10369 if ((*pde & PG_PS) != 0) {
10370 if (ftype == VM_PROT_READ) {
10371 #ifdef INVARIANTS
10372 atomic_add_long(&num_superpage_accessed_emulations, 1);
10373 #endif
10374 *pde |= PG_A;
10375 rv = 0;
10376 }
10377 goto done;
10378 }
10379
10380 pte = pmap_pde_to_pte(pde, va);
10381 if ((*pte & PG_V) == 0)
10382 goto done;
10383
10384 if (ftype == VM_PROT_WRITE) {
10385 if ((*pte & PG_RW) == 0)
10386 goto done;
10387 /*
10388 * Set the modified and accessed bits simultaneously.
10389 *
10390 * Intel EPT PTEs that do software emulation of A/D bits map
10391 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
10392 * An EPT misconfiguration is triggered if the PTE is writable
10393 * but not readable (WR=10). This is avoided by setting PG_A
10394 * and PG_M simultaneously.
10395 */
10396 *pte |= PG_M | PG_A;
10397 } else {
10398 *pte |= PG_A;
10399 }
10400
10401 #if VM_NRESERVLEVEL > 0
10402 /* try to promote the mapping */
10403 if (va < VM_MAXUSER_ADDRESS)
10404 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
10405 else
10406 mpte = NULL;
10407
10408 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
10409
10410 if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
10411 (m->flags & PG_FICTITIOUS) == 0 &&
10412 vm_reserv_level_iffullpop(m) == 0 &&
10413 pmap_promote_pde(pmap, pde, va, mpte, &lock)) {
10414 #ifdef INVARIANTS
10415 atomic_add_long(&ad_emulation_superpage_promotions, 1);
10416 #endif
10417 }
10418 #endif
10419
10420 #ifdef INVARIANTS
10421 if (ftype == VM_PROT_WRITE)
10422 atomic_add_long(&num_dirty_emulations, 1);
10423 else
10424 atomic_add_long(&num_accessed_emulations, 1);
10425 #endif
10426 rv = 0; /* success */
10427 done:
10428 if (lock != NULL)
10429 rw_wunlock(lock);
10430 PMAP_UNLOCK(pmap);
10431 return (rv);
10432 }
10433
10434 void
10435 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
10436 {
10437 pml4_entry_t *pml4;
10438 pdp_entry_t *pdp;
10439 pd_entry_t *pde;
10440 pt_entry_t *pte, PG_V;
10441 int idx;
10442
10443 idx = 0;
10444 PG_V = pmap_valid_bit(pmap);
10445 PMAP_LOCK(pmap);
10446
10447 pml4 = pmap_pml4e(pmap, va);
10448 if (pml4 == NULL)
10449 goto done;
10450 ptr[idx++] = *pml4;
10451 if ((*pml4 & PG_V) == 0)
10452 goto done;
10453
10454 pdp = pmap_pml4e_to_pdpe(pml4, va);
10455 ptr[idx++] = *pdp;
10456 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
10457 goto done;
10458
10459 pde = pmap_pdpe_to_pde(pdp, va);
10460 ptr[idx++] = *pde;
10461 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
10462 goto done;
10463
10464 pte = pmap_pde_to_pte(pde, va);
10465 ptr[idx++] = *pte;
10466
10467 done:
10468 PMAP_UNLOCK(pmap);
10469 *num = idx;
10470 }
10471
10472 /**
10473 * Get the kernel virtual address of a set of physical pages. If there are
10474 * physical addresses not covered by the DMAP perform a transient mapping
10475 * that will be removed when calling pmap_unmap_io_transient.
10476 *
10477 * \param page The pages the caller wishes to obtain the virtual
10478 * address on the kernel memory map.
10479 * \param vaddr On return contains the kernel virtual memory address
10480 * of the pages passed in the page parameter.
10481 * \param count Number of pages passed in.
10482 * \param can_fault true if the thread using the mapped pages can take
10483 * page faults, false otherwise.
10484 *
10485 * \returns true if the caller must call pmap_unmap_io_transient when
10486 * finished or false otherwise.
10487 *
10488 */
10489 bool
10490 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
10491 bool can_fault)
10492 {
10493 vm_paddr_t paddr;
10494 bool needs_mapping;
10495 pt_entry_t *pte;
10496 int cache_bits, error __unused, i;
10497
10498 /*
10499 * Allocate any KVA space that we need; this is done in a separate
10500 * loop to prevent calling vmem_alloc while pinned.
10501 */
10502 needs_mapping = false;
10503 for (i = 0; i < count; i++) {
10504 paddr = VM_PAGE_TO_PHYS(page[i]);
10505 if (__predict_false(paddr >= dmaplimit)) {
10506 error = vmem_alloc(kernel_arena, PAGE_SIZE,
10507 M_BESTFIT | M_WAITOK, &vaddr[i]);
10508 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
10509 needs_mapping = true;
10510 } else {
10511 vaddr[i] = PHYS_TO_DMAP(paddr);
10512 }
10513 }
10514
10515 /* Exit early if everything is covered by the DMAP */
10516 if (!needs_mapping)
10517 return (false);
10518
10519 /*
10520 * NB: The sequence of updating a page table followed by accesses
10521 * to the corresponding pages used in the !DMAP case is subject to
10522 * the situation described in the "AMD64 Architecture Programmer's
10523 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
10524 * Coherency Considerations". Therefore, issuing the INVLPG right
10525 * after modifying the PTE bits is crucial.
10526 */
10527 if (!can_fault)
10528 sched_pin();
10529 for (i = 0; i < count; i++) {
10530 paddr = VM_PAGE_TO_PHYS(page[i]);
10531 if (paddr >= dmaplimit) {
10532 if (can_fault) {
10533 /*
10534				 * Slow path: since we can take page faults
10535				 * while the mappings are active, don't pin
10536				 * the thread to the CPU; instead add a global
10537				 * mapping visible to all CPUs.
10538 */
10539 pmap_qenter(vaddr[i], &page[i], 1);
10540 } else {
10541 pte = vtopte(vaddr[i]);
10542 cache_bits = pmap_cache_bits(kernel_pmap,
10543 page[i]->md.pat_mode, false);
10544 pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
10545 cache_bits);
10546 pmap_invlpg(kernel_pmap, vaddr[i]);
10547 }
10548 }
10549 }
10550
10551 return (needs_mapping);
10552 }
10553
10554 void
10555 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
10556 bool can_fault)
10557 {
10558 vm_paddr_t paddr;
10559 int i;
10560
10561 if (!can_fault)
10562 sched_unpin();
10563 for (i = 0; i < count; i++) {
10564 paddr = VM_PAGE_TO_PHYS(page[i]);
10565 if (paddr >= dmaplimit) {
10566 if (can_fault)
10567 pmap_qremove(vaddr[i], 1);
10568 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
10569 }
10570 }
10571 }
10572
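/*
 * Map a single page for short-lived access.  Pages below dmaplimit are
 * served directly from the DMAP; otherwise the page is installed into the
 * single per-system "qframe" slot under a spin mutex, so only one such
 * mapping can exist at a time.  The mapping must be released with
 * pmap_quick_remove_page().
 */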
10573 vm_offset_t
10574 pmap_quick_enter_page(vm_page_t m)
10575 {
10576 vm_paddr_t paddr;
10577
10578 paddr = VM_PAGE_TO_PHYS(m);
10579 if (paddr < dmaplimit)
10580 return (PHYS_TO_DMAP(paddr));
10581 mtx_lock_spin(&qframe_mtx);
10582 KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
10583
10584 /*
10585 * Since qframe is exclusively mapped by us, and we do not set
10586 * PG_G, we can use INVLPG here.
10587 */
10588 invlpg(qframe);
10589
10590 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
10591 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
10592 return (qframe);
10593 }
10594
10595 void
10596 pmap_quick_remove_page(vm_offset_t addr)
10597 {
10598
10599 if (addr != qframe)
10600 return;
10601 pte_store(vtopte(qframe), 0);
10602 mtx_unlock_spin(&qframe_mtx);
10603 }
10604
10605 /*
10606 * Pdp pages from the large map are managed differently from either
10607 * kernel or user page table pages. They are permanently allocated at
10608 * initialization time, and their reference count is permanently set to
10609 * zero. The pml4 entries pointing to those pages are copied into
10610 * each allocated pmap.
10611 *
10612 * In contrast, pd and pt pages are managed like user page table
10613 * pages. They are dynamically allocated, and their reference count
10614 * represents the number of valid entries within the page.
10615 */
10616 static vm_page_t
10617 pmap_large_map_getptp_unlocked(void)
10618 {
10619 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO));
10620 }
10621
10622 static vm_page_t
10623 pmap_large_map_getptp(void)
10624 {
10625 vm_page_t m;
10626
10627 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
10628 m = pmap_large_map_getptp_unlocked();
10629 if (m == NULL) {
10630 PMAP_UNLOCK(kernel_pmap);
10631 vm_wait(NULL);
10632 PMAP_LOCK(kernel_pmap);
10633 /* Callers retry. */
10634 }
10635 return (m);
10636 }
10637
10638 static pdp_entry_t *
10639 pmap_large_map_pdpe(vm_offset_t va)
10640 {
10641 vm_pindex_t pml4_idx;
10642 vm_paddr_t mphys;
10643
10644 pml4_idx = pmap_pml4e_index(va);
10645 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
10646 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
10647 "%#jx lm_ents %d",
10648 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
10649 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
10650 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
10651 "LMSPML4I %#jx lm_ents %d",
10652 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
10653 mphys = kernel_pml4[pml4_idx] & PG_FRAME;
10654 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
10655 }
10656
10657 static pd_entry_t *
10658 pmap_large_map_pde(vm_offset_t va)
10659 {
10660 pdp_entry_t *pdpe;
10661 vm_page_t m;
10662 vm_paddr_t mphys;
10663
10664 retry:
10665 pdpe = pmap_large_map_pdpe(va);
10666 if (*pdpe == 0) {
10667 m = pmap_large_map_getptp();
10668 if (m == NULL)
10669 goto retry;
10670 mphys = VM_PAGE_TO_PHYS(m);
10671 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
10672 } else {
10673 MPASS((*pdpe & X86_PG_PS) == 0);
10674 mphys = *pdpe & PG_FRAME;
10675 }
10676 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va));
10677 }
10678
10679 static pt_entry_t *
10680 pmap_large_map_pte(vm_offset_t va)
10681 {
10682 pd_entry_t *pde;
10683 vm_page_t m;
10684 vm_paddr_t mphys;
10685
10686 retry:
10687 pde = pmap_large_map_pde(va);
10688 if (*pde == 0) {
10689 m = pmap_large_map_getptp();
10690 if (m == NULL)
10691 goto retry;
10692 mphys = VM_PAGE_TO_PHYS(m);
10693 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
10694 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++;
10695 } else {
10696 MPASS((*pde & X86_PG_PS) == 0);
10697 mphys = *pde & PG_FRAME;
10698 }
10699 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va));
10700 }
10701
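/*
 * Extract the physical address backing a large map virtual address, walking
 * down from the PDPE and handling 1GB, 2MB, and 4KB leaf mappings.  The
 * address must be mapped; the KASSERTs enforce this.
 */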
10702 static vm_paddr_t
10703 pmap_large_map_kextract(vm_offset_t va)
10704 {
10705 pdp_entry_t *pdpe, pdp;
10706 pd_entry_t *pde, pd;
10707 pt_entry_t *pte, pt;
10708
10709 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va),
10710 ("not largemap range %#lx", (u_long)va));
10711 pdpe = pmap_large_map_pdpe(va);
10712 pdp = *pdpe;
10713 KASSERT((pdp & X86_PG_V) != 0,
10714 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
10715 (u_long)pdpe, pdp));
10716 if ((pdp & X86_PG_PS) != 0) {
10717 KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
10718 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
10719 (u_long)pdpe, pdp));
10720 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK));
10721 }
10722 pde = pmap_pdpe_to_pde(pdpe, va);
10723 pd = *pde;
10724 KASSERT((pd & X86_PG_V) != 0,
10725 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd));
10726 if ((pd & X86_PG_PS) != 0)
10727 return ((pd & PG_PS_FRAME) | (va & PDRMASK));
10728 pte = pmap_pde_to_pte(pde, va);
10729 pt = *pte;
10730 KASSERT((pt & X86_PG_V) != 0,
10731 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt));
10732 return ((pt & PG_FRAME) | (va & PAGE_MASK));
10733 }
10734
10735 static int
10736 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
10737 vmem_addr_t *vmem_res)
10738 {
10739
10740 /*
10741 * Large mappings are all but static. Consequently, there
10742 * is no point in waiting for an earlier allocation to be
10743 * freed.
10744 */
10745 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
10746 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
10747 }
10748
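/*
 * Create a kernel mapping for the physical range [spa, spa + len) with cache
 * attribute "mattr".  If the range is covered by the direct map, the DMAP
 * address is returned after its attributes are adjusted.  Otherwise KVA is
 * allocated from large_vmem, preferring an alignment that allows 1GB or 2MB
 * mappings, and the range is mapped with the largest page sizes that fit.
 * Returns 0 on success or an errno value on failure.
 */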
10749 int
10750 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
10751 vm_memattr_t mattr)
10752 {
10753 pdp_entry_t *pdpe;
10754 pd_entry_t *pde;
10755 pt_entry_t *pte;
10756 vm_offset_t va, inc;
10757 vmem_addr_t vmem_res;
10758 vm_paddr_t pa;
10759 int error;
10760
10761 if (len == 0 || spa + len < spa)
10762 return (EINVAL);
10763
10764 /* See if DMAP can serve. */
10765 if (spa + len <= dmaplimit) {
10766 va = PHYS_TO_DMAP(spa);
10767 *addr = (void *)va;
10768 return (pmap_change_attr(va, len, mattr));
10769 }
10770
10771 /*
10772	 * No, allocate KVA.  Fit the address with the best possible
10773	 * alignment for superpages.  Fall back to a lesser alignment
10774	 * if that fails.
10775 */
10776 error = ENOMEM;
10777 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
10778 NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
10779 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
10780 &vmem_res);
10781 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
10782 NBPDR) + NBPDR)
10783 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
10784 &vmem_res);
10785 if (error != 0)
10786 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
10787 if (error != 0)
10788 return (error);
10789
10790 /*
10791	 * Fill the page table.  PG_M is not pre-set; we scan the modified
10792	 * bits in the page table to minimize flushing.  No need to
10793	 * invalidate the TLB, since we only update invalid entries.
10794 */
10795 PMAP_LOCK(kernel_pmap);
10796 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
10797 len -= inc) {
10798 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
10799 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
10800 pdpe = pmap_large_map_pdpe(va);
10801 MPASS(*pdpe == 0);
10802 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
10803 X86_PG_V | X86_PG_A | pg_nx |
10804 pmap_cache_bits(kernel_pmap, mattr, TRUE);
10805 inc = NBPDP;
10806 } else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
10807 (va & PDRMASK) == 0) {
10808 pde = pmap_large_map_pde(va);
10809 MPASS(*pde == 0);
10810 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
10811 X86_PG_V | X86_PG_A | pg_nx |
10812 pmap_cache_bits(kernel_pmap, mattr, TRUE);
10813 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
10814 ref_count++;
10815 inc = NBPDR;
10816 } else {
10817 pte = pmap_large_map_pte(va);
10818 MPASS(*pte == 0);
10819 *pte = pa | pg_g | X86_PG_RW | X86_PG_V |
10820 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
10821 mattr, FALSE);
10822 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
10823 ref_count++;
10824 inc = PAGE_SIZE;
10825 }
10826 }
10827 PMAP_UNLOCK(kernel_pmap);
10828 MPASS(len == 0);
10829
10830 *addr = (void *)vmem_res;
10831 return (0);
10832 }
10833
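/*
 * Undo a mapping created by pmap_large_map().  Ranges that were served from
 * the direct map are left alone.  Otherwise the page table entries are torn
 * down, page table pages whose reference counts drop to zero are freed, the
 * TLB is invalidated, and the KVA is returned to large_vmem.
 */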
10834 void
10835 pmap_large_unmap(void *svaa, vm_size_t len)
10836 {
10837 vm_offset_t sva, va;
10838 vm_size_t inc;
10839 pdp_entry_t *pdpe, pdp;
10840 pd_entry_t *pde, pd;
10841 pt_entry_t *pte;
10842 vm_page_t m;
10843 struct spglist spgf;
10844
10845 sva = (vm_offset_t)svaa;
10846 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
10847 sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
10848 return;
10849
10850 SLIST_INIT(&spgf);
10851 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) &&
10852 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1),
10853 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
10854 PMAP_LOCK(kernel_pmap);
10855 for (va = sva; va < sva + len; va += inc) {
10856 pdpe = pmap_large_map_pdpe(va);
10857 pdp = *pdpe;
10858 KASSERT((pdp & X86_PG_V) != 0,
10859 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
10860 (u_long)pdpe, pdp));
10861 if ((pdp & X86_PG_PS) != 0) {
10862 KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
10863 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
10864 (u_long)pdpe, pdp));
10865 KASSERT((va & PDPMASK) == 0,
10866 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
10867 (u_long)pdpe, pdp));
10868 KASSERT(va + NBPDP <= sva + len,
10869 ("unmap covers partial 1GB page, sva %#lx va %#lx "
10870 "pdpe %#lx pdp %#lx len %#lx", sva, va,
10871 (u_long)pdpe, pdp, len));
10872 *pdpe = 0;
10873 inc = NBPDP;
10874 continue;
10875 }
10876 pde = pmap_pdpe_to_pde(pdpe, va);
10877 pd = *pde;
10878 KASSERT((pd & X86_PG_V) != 0,
10879 ("invalid pd va %#lx pde %#lx pd %#lx", va,
10880 (u_long)pde, pd));
10881 if ((pd & X86_PG_PS) != 0) {
10882 KASSERT((va & PDRMASK) == 0,
10883 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
10884 (u_long)pde, pd));
10885 KASSERT(va + NBPDR <= sva + len,
10886 ("unmap covers partial 2MB page, sva %#lx va %#lx "
10887 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
10888 pd, len));
10889 pde_store(pde, 0);
10890 inc = NBPDR;
10891 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
10892 m->ref_count--;
10893 if (m->ref_count == 0) {
10894 *pdpe = 0;
10895 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
10896 }
10897 continue;
10898 }
10899 pte = pmap_pde_to_pte(pde, va);
10900 KASSERT((*pte & X86_PG_V) != 0,
10901 ("invalid pte va %#lx pte %#lx pt %#lx", va,
10902 (u_long)pte, *pte));
10903 pte_clear(pte);
10904 inc = PAGE_SIZE;
10905 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
10906 m->ref_count--;
10907 if (m->ref_count == 0) {
10908 *pde = 0;
10909 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
10910 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
10911 m->ref_count--;
10912 if (m->ref_count == 0) {
10913 *pdpe = 0;
10914 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
10915 }
10916 }
10917 }
10918 pmap_invalidate_range(kernel_pmap, sva, sva + len);
10919 PMAP_UNLOCK(kernel_pmap);
10920 vm_page_free_pages_toq(&spgf, false);
10921 vmem_free(large_vmem, sva, len);
10922 }
10923
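/*
 * Helpers for pmap_large_map_wb().  The store fence and the per-line flush
 * primitive are selected once at boot through ifuncs, based on the CPU
 * vendor and on CLWB/CLFLUSHOPT/CLFLUSH support.
 */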
10924 static void
10925 pmap_large_map_wb_fence_mfence(void)
10926 {
10927
10928 mfence();
10929 }
10930
10931 static void
10932 pmap_large_map_wb_fence_atomic(void)
10933 {
10934
10935 atomic_thread_fence_seq_cst();
10936 }
10937
10938 static void
10939 pmap_large_map_wb_fence_nop(void)
10940 {
10941 }
10942
10943 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void))
10944 {
10945
10946 if (cpu_vendor_id != CPU_VENDOR_INTEL)
10947 return (pmap_large_map_wb_fence_mfence);
10948 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
10949 CPUID_STDEXT_CLFLUSHOPT)) == 0)
10950 return (pmap_large_map_wb_fence_atomic);
10951 else
10952 /* clflush is strongly enough ordered */
10953 return (pmap_large_map_wb_fence_nop);
10954 }
10955
10956 static void
10957 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
10958 {
10959
10960 for (; len > 0; len -= cpu_clflush_line_size,
10961 va += cpu_clflush_line_size)
10962 clwb(va);
10963 }
10964
10965 static void
10966 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
10967 {
10968
10969 for (; len > 0; len -= cpu_clflush_line_size,
10970 va += cpu_clflush_line_size)
10971 clflushopt(va);
10972 }
10973
10974 static void
10975 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
10976 {
10977
10978 for (; len > 0; len -= cpu_clflush_line_size,
10979 va += cpu_clflush_line_size)
10980 clflush(va);
10981 }
10982
10983 static void
10984 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
10985 {
10986 }
10987
10988 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t))
10989 {
10990
10991 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
10992 return (pmap_large_map_flush_range_clwb);
10993 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
10994 return (pmap_large_map_flush_range_clflushopt);
10995 else if ((cpu_feature & CPUID_CLFSH) != 0)
10996 return (pmap_large_map_flush_range_clflush);
10997 else
10998 return (pmap_large_map_flush_range_nop);
10999 }
11000
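/*
 * Write back the cache lines covering [sva, eva) in the large map by walking
 * its page tables.  Only mappings with PG_M set are flushed.  The
 * software-available PG_AVAIL1 bit marks an entry whose range is currently
 * being written back, so a concurrent caller either waits for the other
 * flush to finish or redoes the flush itself when it cannot tell whether its
 * own writes were covered.
 */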
11001 static void
11002 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
11003 {
11004 volatile u_long *pe;
11005 u_long p;
11006 vm_offset_t va;
11007 vm_size_t inc;
11008 bool seen_other;
11009
11010 for (va = sva; va < eva; va += inc) {
11011 inc = 0;
11012 if ((amd_feature & AMDID_PAGE1GB) != 0) {
11013 pe = (volatile u_long *)pmap_large_map_pdpe(va);
11014 p = *pe;
11015 if ((p & X86_PG_PS) != 0)
11016 inc = NBPDP;
11017 }
11018 if (inc == 0) {
11019 pe = (volatile u_long *)pmap_large_map_pde(va);
11020 p = *pe;
11021 if ((p & X86_PG_PS) != 0)
11022 inc = NBPDR;
11023 }
11024 if (inc == 0) {
11025 pe = (volatile u_long *)pmap_large_map_pte(va);
11026 p = *pe;
11027 inc = PAGE_SIZE;
11028 }
11029 seen_other = false;
11030 for (;;) {
11031 if ((p & X86_PG_AVAIL1) != 0) {
11032 /*
11033 * Spin-wait for the end of a parallel
11034 * write-back.
11035 */
11036 cpu_spinwait();
11037 p = *pe;
11038
11039 /*
11040 * If we saw other write-back
11041				 * occurring, we cannot rely on PG_M to
11042				 * indicate the state of the cache.  The
11043 * PG_M bit is cleared before the
11044 * flush to avoid ignoring new writes,
11045 * and writes which are relevant for
11046 * us might happen after.
11047 */
11048 seen_other = true;
11049 continue;
11050 }
11051
11052 if ((p & X86_PG_M) != 0 || seen_other) {
11053 if (!atomic_fcmpset_long(pe, &p,
11054 (p & ~X86_PG_M) | X86_PG_AVAIL1))
11055 /*
11056 * If we saw PG_M without
11057 * PG_AVAIL1, and then on the
11058 * next attempt we do not
11059 * observe either PG_M or
11060 * PG_AVAIL1, the other
11061 * write-back started after us
11062 * and finished before us. We
11063 * can rely on it doing our
11064 * work.
11065 */
11066 continue;
11067 pmap_large_map_flush_range(va, inc);
11068 atomic_clear_long(pe, X86_PG_AVAIL1);
11069 }
11070 break;
11071 }
11072 maybe_yield();
11073 }
11074 }
11075
11076 /*
11077 * Write-back cache lines for the given address range.
11078 *
11079 * Must be called only on the range or sub-range returned from
11080 * pmap_large_map(). Must not be called on the coalesced ranges.
11081 *
11082 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
11083  * instruction support.
11084 */
11085 void
11086 pmap_large_map_wb(void *svap, vm_size_t len)
11087 {
11088 vm_offset_t eva, sva;
11089
11090 sva = (vm_offset_t)svap;
11091 eva = sva + len;
11092 pmap_large_map_wb_fence();
11093 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
11094 pmap_large_map_flush_range(sva, len);
11095 } else {
11096 KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
11097 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
11098 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
11099 pmap_large_map_wb_large(sva, eva);
11100 }
11101 pmap_large_map_wb_fence();
11102 }
11103
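/*
 * Page Table Isolation (PTI) support.  When PTI is enabled, user mode runs
 * on a separate page table rooted at pti_pml4 that contains only a small set
 * of kernel mappings: kernel text, the IDT, per-CPU data, the special IST
 * stacks, and whatever other ranges are registered through
 * pmap_pti_add_kva().  The page table pages backing it are allocated from
 * pti_obj and wired explicitly.
 */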
11104 static vm_page_t
11105 pmap_pti_alloc_page(void)
11106 {
11107 vm_page_t m;
11108
11109 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11110 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO);
11111 return (m);
11112 }
11113
11114 static bool
11115 pmap_pti_free_page(vm_page_t m)
11116 {
11117 if (!vm_page_unwire_noq(m))
11118 return (false);
11119 vm_page_xbusy_claim(m);
11120 vm_page_free_zero(m);
11121 return (true);
11122 }
11123
11124 static void
11125 pmap_pti_init(void)
11126 {
11127 vm_page_t pml4_pg;
11128 pdp_entry_t *pdpe;
11129 vm_offset_t va;
11130 int i;
11131
11132 if (!pti)
11133 return;
11134 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
11135 VM_OBJECT_WLOCK(pti_obj);
11136 pml4_pg = pmap_pti_alloc_page();
11137 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
11138 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
11139 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
11140 pdpe = pmap_pti_pdpe(va);
11141 pmap_pti_wire_pte(pdpe);
11142 }
11143 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
11144 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
11145 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
11146 sizeof(struct gate_descriptor) * NIDT, false);
11147 CPU_FOREACH(i) {
11148 /* Doublefault stack IST 1 */
11149 va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu);
11150 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false);
11151 /* NMI stack IST 2 */
11152 va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
11153 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, va, false);
11154 /* MC# stack IST 3 */
11155 va = __pcpu[i].pc_common_tss.tss_ist3 +
11156 sizeof(struct nmi_pcpu);
11157 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false);
11158 /* DB# stack IST 4 */
11159 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
11160 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
11161 }
11162 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
11163 true);
11164 pti_finalized = true;
11165 VM_OBJECT_WUNLOCK(pti_obj);
11166 }
11167
11168 static void
11169 pmap_cpu_init(void *arg __unused)
11170 {
11171 CPU_COPY(&all_cpus, &kernel_pmap->pm_active);
11172 pmap_pti_init();
11173 }
11174 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL);
11175
11176 static pdp_entry_t *
11177 pmap_pti_pdpe(vm_offset_t va)
11178 {
11179 pml4_entry_t *pml4e;
11180 pdp_entry_t *pdpe;
11181 vm_page_t m;
11182 vm_pindex_t pml4_idx;
11183 vm_paddr_t mphys;
11184
11185 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11186
11187 pml4_idx = pmap_pml4e_index(va);
11188 pml4e = &pti_pml4[pml4_idx];
11189 m = NULL;
11190 if (*pml4e == 0) {
11191 if (pti_finalized)
11192 panic("pml4 alloc after finalization\n");
11193 m = pmap_pti_alloc_page();
11194 if (*pml4e != 0) {
11195 pmap_pti_free_page(m);
11196 mphys = *pml4e & ~PAGE_MASK;
11197 } else {
11198 mphys = VM_PAGE_TO_PHYS(m);
11199 *pml4e = mphys | X86_PG_RW | X86_PG_V;
11200 }
11201 } else {
11202 mphys = *pml4e & ~PAGE_MASK;
11203 }
11204 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
11205 return (pdpe);
11206 }
11207
11208 static void
11209 pmap_pti_wire_pte(void *pte)
11210 {
11211 vm_page_t m;
11212
11213 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11214 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
11215 m->ref_count++;
11216 }
11217
11218 static void
11219 pmap_pti_unwire_pde(void *pde, bool only_ref)
11220 {
11221 vm_page_t m;
11222
11223 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11224 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
11225 MPASS(only_ref || m->ref_count > 1);
11226 pmap_pti_free_page(m);
11227 }
11228
11229 static void
11230 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
11231 {
11232 vm_page_t m;
11233 pd_entry_t *pde;
11234
11235 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11236 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
11237 if (pmap_pti_free_page(m)) {
11238 pde = pmap_pti_pde(va);
11239 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
11240 *pde = 0;
11241 pmap_pti_unwire_pde(pde, false);
11242 }
11243 }
11244
11245 static pd_entry_t *
11246 pmap_pti_pde(vm_offset_t va)
11247 {
11248 pdp_entry_t *pdpe;
11249 pd_entry_t *pde;
11250 vm_page_t m;
11251 vm_pindex_t pd_idx;
11252 vm_paddr_t mphys;
11253
11254 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11255
11256 pdpe = pmap_pti_pdpe(va);
11257 if (*pdpe == 0) {
11258 m = pmap_pti_alloc_page();
11259 if (*pdpe != 0) {
11260 pmap_pti_free_page(m);
11261 MPASS((*pdpe & X86_PG_PS) == 0);
11262 mphys = *pdpe & ~PAGE_MASK;
11263 } else {
11264 mphys = VM_PAGE_TO_PHYS(m);
11265 *pdpe = mphys | X86_PG_RW | X86_PG_V;
11266 }
11267 } else {
11268 MPASS((*pdpe & X86_PG_PS) == 0);
11269 mphys = *pdpe & ~PAGE_MASK;
11270 }
11271
11272 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
11273 pd_idx = pmap_pde_index(va);
11274 pde += pd_idx;
11275 return (pde);
11276 }
11277
11278 static pt_entry_t *
11279 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
11280 {
11281 pd_entry_t *pde;
11282 pt_entry_t *pte;
11283 vm_page_t m;
11284 vm_paddr_t mphys;
11285
11286 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11287
11288 pde = pmap_pti_pde(va);
11289 if (unwire_pde != NULL) {
11290 *unwire_pde = true;
11291 pmap_pti_wire_pte(pde);
11292 }
11293 if (*pde == 0) {
11294 m = pmap_pti_alloc_page();
11295 if (*pde != 0) {
11296 pmap_pti_free_page(m);
11297 MPASS((*pde & X86_PG_PS) == 0);
11298 mphys = *pde & ~(PAGE_MASK | pg_nx);
11299 } else {
11300 mphys = VM_PAGE_TO_PHYS(m);
11301 *pde = mphys | X86_PG_RW | X86_PG_V;
11302 if (unwire_pde != NULL)
11303 *unwire_pde = false;
11304 }
11305 } else {
11306 MPASS((*pde & X86_PG_PS) == 0);
11307 mphys = *pde & ~(PAGE_MASK | pg_nx);
11308 }
11309
11310 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
11311 pte += pmap_pte_index(va);
11312
11313 return (pte);
11314 }
11315
11316 static void
11317 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
11318 {
11319 vm_paddr_t pa;
11320 pd_entry_t *pde;
11321 pt_entry_t *pte, ptev;
11322 bool unwire_pde;
11323
11324 VM_OBJECT_ASSERT_WLOCKED(pti_obj);
11325
11326 sva = trunc_page(sva);
11327 MPASS(sva > VM_MAXUSER_ADDRESS);
11328 eva = round_page(eva);
11329 MPASS(sva < eva);
11330 for (; sva < eva; sva += PAGE_SIZE) {
11331 pte = pmap_pti_pte(sva, &unwire_pde);
11332 pa = pmap_kextract(sva);
11333 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
11334 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
11335 VM_MEMATTR_DEFAULT, FALSE);
11336 if (*pte == 0) {
11337 pte_store(pte, ptev);
11338 pmap_pti_wire_pte(pte);
11339 } else {
11340 KASSERT(!pti_finalized,
11341 ("pti overlap after fin %#lx %#lx %#lx",
11342 sva, *pte, ptev));
11343 KASSERT(*pte == ptev,
11344 ("pti non-identical pte after fin %#lx %#lx %#lx",
11345 sva, *pte, ptev));
11346 }
11347 if (unwire_pde) {
11348 pde = pmap_pti_pde(sva);
11349 pmap_pti_unwire_pde(pde, true);
11350 }
11351 }
11352 }
11353
11354 void
11355 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
11356 {
11357
11358 if (!pti)
11359 return;
11360 VM_OBJECT_WLOCK(pti_obj);
11361 pmap_pti_add_kva_locked(sva, eva, exec);
11362 VM_OBJECT_WUNLOCK(pti_obj);
11363 }
11364
11365 void
11366 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
11367 {
11368 pt_entry_t *pte;
11369 vm_offset_t va;
11370
11371 if (!pti)
11372 return;
11373 sva = rounddown2(sva, PAGE_SIZE);
11374 MPASS(sva > VM_MAXUSER_ADDRESS);
11375 eva = roundup2(eva, PAGE_SIZE);
11376 MPASS(sva < eva);
11377 VM_OBJECT_WLOCK(pti_obj);
11378 for (va = sva; va < eva; va += PAGE_SIZE) {
11379 pte = pmap_pti_pte(va, NULL);
11380 KASSERT((*pte & X86_PG_V) != 0,
11381 ("invalid pte va %#lx pte %#lx pt %#lx", va,
11382 (u_long)pte, *pte));
11383 pte_clear(pte);
11384 pmap_pti_unwire_pte(pte, va);
11385 }
11386 pmap_invalidate_range(kernel_pmap, sva, eva);
11387 VM_OBJECT_WUNLOCK(pti_obj);
11388 }
11389
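/*
 * Protection-key (PKU) management.  Each native (PT_X86) pmap records the
 * protection key assigned to ranges of user addresses in the pm_pkru
 * rangeset; the callbacks below duplicate and free the rangeset nodes, which
 * are backed by the pmap_pkru_ranges_zone UMA zone.
 */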
11390 static void *
11391 pkru_dup_range(void *ctx __unused, void *data)
11392 {
11393 struct pmap_pkru_range *node, *new_node;
11394
11395 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
11396 if (new_node == NULL)
11397 return (NULL);
11398 node = data;
11399 memcpy(new_node, node, sizeof(*node));
11400 return (new_node);
11401 }
11402
11403 static void
11404 pkru_free_range(void *ctx __unused, void *node)
11405 {
11406
11407 uma_zfree(pmap_pkru_ranges_zone, node);
11408 }
11409
11410 static int
11411 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
11412 int flags)
11413 {
11414 struct pmap_pkru_range *ppr;
11415 int error;
11416
11417 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11418 MPASS(pmap->pm_type == PT_X86);
11419 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
11420 if ((flags & AMD64_PKRU_EXCL) != 0 &&
11421 !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
11422 return (EBUSY);
11423 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
11424 if (ppr == NULL)
11425 return (ENOMEM);
11426 ppr->pkru_keyidx = keyidx;
11427 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
11428 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
11429 if (error != 0)
11430 uma_zfree(pmap_pkru_ranges_zone, ppr);
11431 return (error);
11432 }
11433
11434 static int
11435 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
11436 {
11437
11438 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11439 MPASS(pmap->pm_type == PT_X86);
11440 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
11441 return (rangeset_remove(&pmap->pm_pkru, sva, eva));
11442 }
11443
11444 static void
11445 pmap_pkru_deassign_all(pmap_t pmap)
11446 {
11447
11448 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11449 if (pmap->pm_type == PT_X86 &&
11450 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
11451 rangeset_remove_all(&pmap->pm_pkru);
11452 }
11453
11454 static bool
11455 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
11456 {
11457 struct pmap_pkru_range *ppr, *prev_ppr;
11458 vm_offset_t va;
11459
11460 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11461 if (pmap->pm_type != PT_X86 ||
11462 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
11463 sva >= VM_MAXUSER_ADDRESS)
11464 return (true);
11465 MPASS(eva <= VM_MAXUSER_ADDRESS);
11466 for (va = sva; va < eva; prev_ppr = ppr) {
11467 ppr = rangeset_lookup(&pmap->pm_pkru, va);
11468 if (va == sva)
11469 prev_ppr = ppr;
11470 else if ((ppr == NULL) ^ (prev_ppr == NULL))
11471 return (false);
11472 if (ppr == NULL) {
11473 va += PAGE_SIZE;
11474 continue;
11475 }
11476 if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
11477 return (false);
11478 va = ppr->pkru_rs_el.re_end;
11479 }
11480 return (true);
11481 }
11482
11483 static pt_entry_t
11484 pmap_pkru_get(pmap_t pmap, vm_offset_t va)
11485 {
11486 struct pmap_pkru_range *ppr;
11487
11488 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11489 if (pmap->pm_type != PT_X86 ||
11490 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
11491 va >= VM_MAXUSER_ADDRESS)
11492 return (0);
11493 ppr = rangeset_lookup(&pmap->pm_pkru, va);
11494 if (ppr != NULL)
11495 return (X86_PG_PKU(ppr->pkru_keyidx));
11496 return (0);
11497 }
11498
11499 static bool
11500 pred_pkru_on_remove(void *ctx __unused, void *r)
11501 {
11502 struct pmap_pkru_range *ppr;
11503
11504 ppr = r;
11505 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
11506 }
11507
11508 static void
11509 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
11510 {
11511
11512 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11513 if (pmap->pm_type == PT_X86 &&
11514 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
11515 rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
11516 pred_pkru_on_remove);
11517 }
11518 }
11519
11520 static int
11521 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
11522 {
11523
11524 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
11525 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
11526 MPASS(dst_pmap->pm_type == PT_X86);
11527 MPASS(src_pmap->pm_type == PT_X86);
11528 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
11529 if (src_pmap->pm_pkru.rs_data_ctx == NULL)
11530 return (0);
11531 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
11532 }
11533
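/*
 * Rewrite the PKU key index stored in the page table entries covering
 * [sva, eva).  2MB mappings that are fully covered are updated in place;
 * partially covered ones are demoted first.  If any entry changed, the TLB
 * is invalidated for the range.
 */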
11534 static void
11535 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
11536 u_int keyidx)
11537 {
11538 pml4_entry_t *pml4e;
11539 pdp_entry_t *pdpe;
11540 pd_entry_t newpde, ptpaddr, *pde;
11541 pt_entry_t newpte, *ptep, pte;
11542 vm_offset_t va, va_next;
11543 bool changed;
11544
11545 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
11546 MPASS(pmap->pm_type == PT_X86);
11547 MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
11548
11549 for (changed = false, va = sva; va < eva; va = va_next) {
11550 pml4e = pmap_pml4e(pmap, va);
11551 if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) {
11552 va_next = (va + NBPML4) & ~PML4MASK;
11553 if (va_next < va)
11554 va_next = eva;
11555 continue;
11556 }
11557
11558 pdpe = pmap_pml4e_to_pdpe(pml4e, va);
11559 if ((*pdpe & X86_PG_V) == 0) {
11560 va_next = (va + NBPDP) & ~PDPMASK;
11561 if (va_next < va)
11562 va_next = eva;
11563 continue;
11564 }
11565
11566 va_next = (va + NBPDR) & ~PDRMASK;
11567 if (va_next < va)
11568 va_next = eva;
11569
11570 pde = pmap_pdpe_to_pde(pdpe, va);
11571 ptpaddr = *pde;
11572 if (ptpaddr == 0)
11573 continue;
11574
11575 MPASS((ptpaddr & X86_PG_V) != 0);
11576 if ((ptpaddr & PG_PS) != 0) {
11577 if (va + NBPDR == va_next && eva >= va_next) {
11578 newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
11579 X86_PG_PKU(keyidx);
11580 if (newpde != ptpaddr) {
11581 *pde = newpde;
11582 changed = true;
11583 }
11584 continue;
11585 } else if (!pmap_demote_pde(pmap, pde, va)) {
11586 continue;
11587 }
11588 }
11589
11590 if (va_next > eva)
11591 va_next = eva;
11592
11593 for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
11594 ptep++, va += PAGE_SIZE) {
11595 pte = *ptep;
11596 if ((pte & X86_PG_V) == 0)
11597 continue;
11598 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
11599 if (newpte != pte) {
11600 *ptep = newpte;
11601 changed = true;
11602 }
11603 }
11604 }
11605 if (changed)
11606 pmap_invalidate_range(pmap, sva, eva);
11607 }
11608
11609 static int
11610 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
11611 u_int keyidx, int flags)
11612 {
11613
11614 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
11615 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
11616 return (EINVAL);
11617 if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
11618 return (EFAULT);
11619 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
11620 return (ENOTSUP);
11621 return (0);
11622 }
11623
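/*
 * Assign protection key "keyidx" to the user address range [sva, eva).  The
 * range is page-aligned, recorded in the pmap's rangeset, and the existing
 * page table entries are rewritten to carry the new key.  On ENOMEM the
 * operation is retried after waiting for free pages.  pmap_pkru_clear()
 * below performs the inverse operation.
 */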
11624 int
11625 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
11626 int flags)
11627 {
11628 int error;
11629
11630 sva = trunc_page(sva);
11631 eva = round_page(eva);
11632 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
11633 if (error != 0)
11634 return (error);
11635 for (;;) {
11636 PMAP_LOCK(pmap);
11637 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
11638 if (error == 0)
11639 pmap_pkru_update_range(pmap, sva, eva, keyidx);
11640 PMAP_UNLOCK(pmap);
11641 if (error != ENOMEM)
11642 break;
11643 vm_wait(NULL);
11644 }
11645 return (error);
11646 }
11647
11648 int
11649 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
11650 {
11651 int error;
11652
11653 sva = trunc_page(sva);
11654 eva = round_page(eva);
11655 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
11656 if (error != 0)
11657 return (error);
11658 for (;;) {
11659 PMAP_LOCK(pmap);
11660 error = pmap_pkru_deassign(pmap, sva, eva);
11661 if (error == 0)
11662 pmap_pkru_update_range(pmap, sva, eva, 0);
11663 PMAP_UNLOCK(pmap);
11664 if (error != ENOMEM)
11665 break;
11666 vm_wait(NULL);
11667 }
11668 return (error);
11669 }
11670
11671 #if defined(KASAN) || defined(KMSAN)
11672
11673 /*
11674 * Reserve enough memory to:
11675 * 1) allocate PDP pages for the shadow map(s),
11676 * 2) shadow the boot stack of KSTACK_PAGES pages,
11677 * so we need one PD page, one or two PT pages, and KSTACK_PAGES shadow pages
11678 * per shadow map.
11679 */
11680 #ifdef KASAN
11681 #define SAN_EARLY_PAGES \
11682 (NKASANPML4E + 1 + 2 + howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE))
11683 #else
11684 #define SAN_EARLY_PAGES \
11685 (NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * (1 + 2 + KSTACK_PAGES))
11686 #endif
11687
11688 static uint64_t __nosanitizeaddress __nosanitizememory
11689 pmap_san_enter_early_alloc_4k(uint64_t pabase)
11690 {
11691 static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
11692 static size_t offset = 0;
11693 uint64_t pa;
11694
11695 if (offset == sizeof(data)) {
11696 panic("%s: ran out of memory for the bootstrap shadow map",
11697 __func__);
11698 }
11699
11700 pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
11701 offset += PAGE_SIZE;
11702 return (pa);
11703 }
11704
11705 /*
11706 * Map a shadow page, before the kernel has bootstrapped its page tables. This
11707 * is currently only used to shadow the temporary boot stack set up by locore.
11708 */
11709 static void __nosanitizeaddress __nosanitizememory
11710 pmap_san_enter_early(vm_offset_t va)
11711 {
11712 static bool first = true;
11713 pml4_entry_t *pml4e;
11714 pdp_entry_t *pdpe;
11715 pd_entry_t *pde;
11716 pt_entry_t *pte;
11717 uint64_t cr3, pa, base;
11718 int i;
11719
11720 base = amd64_loadaddr();
11721 cr3 = rcr3();
11722
11723 if (first) {
11724 /*
11725		 * If this is the first call, we need to allocate new PML4Es for
11726 * the bootstrap shadow map(s). We don't know how the PML4 page
11727 * was initialized by the boot loader, so we can't simply test
11728 * whether the shadow map's PML4Es are zero.
11729 */
11730 first = false;
11731 #ifdef KASAN
11732 for (i = 0; i < NKASANPML4E; i++) {
11733 pa = pmap_san_enter_early_alloc_4k(base);
11734
11735 pml4e = (pml4_entry_t *)cr3 +
11736 pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4);
11737 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11738 }
11739 #else
11740 for (i = 0; i < NKMSANORIGPML4E; i++) {
11741 pa = pmap_san_enter_early_alloc_4k(base);
11742
11743 pml4e = (pml4_entry_t *)cr3 +
11744 pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS +
11745 i * NBPML4);
11746 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11747 }
11748 for (i = 0; i < NKMSANSHADPML4E; i++) {
11749 pa = pmap_san_enter_early_alloc_4k(base);
11750
11751 pml4e = (pml4_entry_t *)cr3 +
11752 pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS +
11753 i * NBPML4);
11754 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
11755 }
11756 #endif
11757 }
11758 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va);
11759 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va);
11760 if (*pdpe == 0) {
11761 pa = pmap_san_enter_early_alloc_4k(base);
11762 *pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V);
11763 }
11764 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va);
11765 if (*pde == 0) {
11766 pa = pmap_san_enter_early_alloc_4k(base);
11767 *pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V);
11768 }
11769 pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va);
11770 if (*pte != 0)
11771 panic("%s: PTE for %#lx is already initialized", __func__, va);
11772 pa = pmap_san_enter_early_alloc_4k(base);
11773 *pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V);
11774 }
11775
11776 static vm_page_t
11777 pmap_san_enter_alloc_4k(void)
11778 {
11779 vm_page_t m;
11780
11781 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
11782 VM_ALLOC_ZERO);
11783 if (m == NULL)
11784 panic("%s: no memory to grow shadow map", __func__);
11785 return (m);
11786 }
11787
11788 static vm_page_t
11789 pmap_san_enter_alloc_2m(void)
11790 {
11791 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
11792 NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT));
11793 }
11794
11795 /*
11796 * Grow a shadow map by at least one 4KB page at the specified address. Use 2MB
11797 * pages when possible.
11798 */
11799 void __nosanitizeaddress __nosanitizememory
11800 pmap_san_enter(vm_offset_t va)
11801 {
11802 pdp_entry_t *pdpe;
11803 pd_entry_t *pde;
11804 pt_entry_t *pte;
11805 vm_page_t m;
11806
11807 if (kernphys == 0) {
11808 /*
11809 * We're creating a temporary shadow map for the boot stack.
11810 */
11811 pmap_san_enter_early(va);
11812 return;
11813 }
11814
11815 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
11816
11817 pdpe = pmap_pdpe(kernel_pmap, va);
11818 if ((*pdpe & X86_PG_V) == 0) {
11819 m = pmap_san_enter_alloc_4k();
11820 *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
11821 X86_PG_V | pg_nx);
11822 }
11823 pde = pmap_pdpe_to_pde(pdpe, va);
11824 if ((*pde & X86_PG_V) == 0) {
11825 m = pmap_san_enter_alloc_2m();
11826 if (m != NULL) {
11827 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
11828 X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx);
11829 } else {
11830 m = pmap_san_enter_alloc_4k();
11831 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW |
11832 X86_PG_V | pg_nx);
11833 }
11834 }
11835 if ((*pde & X86_PG_PS) != 0)
11836 return;
11837 pte = pmap_pde_to_pte(pde, va);
11838 if ((*pte & X86_PG_V) != 0)
11839 return;
11840 m = pmap_san_enter_alloc_4k();
11841 *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V |
11842 X86_PG_M | X86_PG_A | pg_nx);
11843 }
11844 #endif
11845
11846 /*
11847 * Track a range of the kernel's virtual address space that is contiguous
11848 * in various mapping attributes.
11849 */
11850 struct pmap_kernel_map_range {
11851 vm_offset_t sva;
11852 pt_entry_t attrs;
11853 int ptes;
11854 int pdes;
11855 int pdpes;
11856 };
11857
11858 static void
11859 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
11860 vm_offset_t eva)
11861 {
11862 const char *mode;
11863 int i, pat_idx;
11864
11865 if (eva <= range->sva)
11866 return;
11867
11868 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
11869 for (i = 0; i < PAT_INDEX_SIZE; i++)
11870 if (pat_index[i] == pat_idx)
11871 break;
11872
11873 switch (i) {
11874 case PAT_WRITE_BACK:
11875 mode = "WB";
11876 break;
11877 case PAT_WRITE_THROUGH:
11878 mode = "WT";
11879 break;
11880 case PAT_UNCACHEABLE:
11881 mode = "UC";
11882 break;
11883 case PAT_UNCACHED:
11884 mode = "U-";
11885 break;
11886 case PAT_WRITE_PROTECTED:
11887 mode = "WP";
11888 break;
11889 case PAT_WRITE_COMBINING:
11890 mode = "WC";
11891 break;
11892 default:
11893 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n",
11894 __func__, pat_idx, range->sva, eva);
11895 mode = "??";
11896 break;
11897 }
11898
11899 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
11900 range->sva, eva,
11901 (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
11902 (range->attrs & pg_nx) != 0 ? '-' : 'x',
11903 (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
11904 (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
11905 mode, range->pdpes, range->pdes, range->ptes);
11906
11907 /* Reset to sentinel value. */
11908 range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
11909 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
11910 NPDEPG - 1, NPTEPG - 1);
11911 }
11912
11913 /*
11914 * Determine whether the attributes specified by a page table entry match those
11915 * being tracked by the current range. This is not quite as simple as a direct
11916 * flag comparison since some PAT modes have multiple representations.
11917 */
11918 static bool
11919 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
11920 {
11921 pt_entry_t diff, mask;
11922
11923 mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
11924 diff = (range->attrs ^ attrs) & mask;
11925 if (diff == 0)
11926 return (true);
11927 if ((diff & ~X86_PG_PDE_PAT) == 0 &&
11928 pmap_pat_index(kernel_pmap, range->attrs, true) ==
11929 pmap_pat_index(kernel_pmap, attrs, true))
11930 return (true);
11931 return (false);
11932 }
11933
11934 static void
11935 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
11936 pt_entry_t attrs)
11937 {
11938
11939 memset(range, 0, sizeof(*range));
11940 range->sva = va;
11941 range->attrs = attrs;
11942 }
11943
11944 /*
11945 * Given a leaf PTE, derive the mapping's attributes. If they do not match
11946 * those of the current run, dump the address range and its attributes, and
11947 * begin a new run.
11948 */
11949 static void
11950 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
11951 vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
11952 pt_entry_t pte)
11953 {
11954 pt_entry_t attrs;
11955
11956 attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
11957
11958 attrs |= pdpe & pg_nx;
11959 attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
11960 if ((pdpe & PG_PS) != 0) {
11961 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
11962 } else if (pde != 0) {
11963 attrs |= pde & pg_nx;
11964 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
11965 }
11966 if ((pde & PG_PS) != 0) {
11967 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
11968 } else if (pte != 0) {
11969 attrs |= pte & pg_nx;
11970 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
11971 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);
11972
11973 /* Canonicalize by always using the PDE PAT bit. */
11974 if ((attrs & X86_PG_PTE_PAT) != 0)
11975 attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
11976 }
11977
11978 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
11979 sysctl_kmaps_dump(sb, range, va);
11980 sysctl_kmaps_reinit(range, va, attrs);
11981 }
11982 }
11983
11984 static int
11985 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
11986 {
11987 struct pmap_kernel_map_range range;
11988 struct sbuf sbuf, *sb;
11989 pml4_entry_t pml4e;
11990 pdp_entry_t *pdp, pdpe;
11991 pd_entry_t *pd, pde;
11992 pt_entry_t *pt, pte;
11993 vm_offset_t sva;
11994 vm_paddr_t pa;
11995 int error, i, j, k, l;
11996
11997 error = sysctl_wire_old_buffer(req, 0);
11998 if (error != 0)
11999 return (error);
12000 sb = &sbuf;
12001 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
12002
12003 /* Sentinel value. */
12004 range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
12005 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
12006 NPDEPG - 1, NPTEPG - 1);
12007
12008 /*
12009 * Iterate over the kernel page tables without holding the kernel pmap
12010 * lock. Outside of the large map, kernel page table pages are never
12011 * freed, so at worst we will observe inconsistencies in the output.
12012 * Within the large map, ensure that PDP and PD page addresses are
12013 * valid before descending.
12014 */
12015 for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
12016 switch (i) {
12017 case PML4PML4I:
12018 sbuf_printf(sb, "\nRecursive map:\n");
12019 break;
12020 case DMPML4I:
12021 sbuf_printf(sb, "\nDirect map:\n");
12022 break;
12023 #ifdef KASAN
12024 case KASANPML4I:
12025 sbuf_printf(sb, "\nKASAN shadow map:\n");
12026 break;
12027 #endif
12028 #ifdef KMSAN
12029 case KMSANSHADPML4I:
12030 sbuf_printf(sb, "\nKMSAN shadow map:\n");
12031 break;
12032 case KMSANORIGPML4I:
12033 sbuf_printf(sb, "\nKMSAN origin map:\n");
12034 break;
12035 #endif
12036 case KPML4BASE:
12037 sbuf_printf(sb, "\nKernel map:\n");
12038 break;
12039 case LMSPML4I:
12040 sbuf_printf(sb, "\nLarge map:\n");
12041 break;
12042 }
12043
12044 /* Convert to canonical form. */
12045 if (sva == 1ul << 47)
12046 sva |= -1ul << 48;
12047
12048 restart:
12049 pml4e = kernel_pml4[i];
12050 if ((pml4e & X86_PG_V) == 0) {
12051 sva = rounddown2(sva, NBPML4);
12052 sysctl_kmaps_dump(sb, &range, sva);
12053 sva += NBPML4;
12054 continue;
12055 }
12056 pa = pml4e & PG_FRAME;
12057 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);
12058
12059 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
12060 pdpe = pdp[j];
12061 if ((pdpe & X86_PG_V) == 0) {
12062 sva = rounddown2(sva, NBPDP);
12063 sysctl_kmaps_dump(sb, &range, sva);
12064 sva += NBPDP;
12065 continue;
12066 }
12067 pa = pdpe & PG_FRAME;
12068 if ((pdpe & PG_PS) != 0) {
12069 sva = rounddown2(sva, NBPDP);
12070 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
12071 0, 0);
12072 range.pdpes++;
12073 sva += NBPDP;
12074 continue;
12075 }
12076 if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
12077 vm_phys_paddr_to_vm_page(pa) == NULL) {
12078 /*
12079 * Page table pages for the large map may be
12080 * freed. Validate the next-level address
12081 * before descending.
12082 */
12083 goto restart;
12084 }
12085 pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
12086
12087 for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
12088 pde = pd[k];
12089 if ((pde & X86_PG_V) == 0) {
12090 sva = rounddown2(sva, NBPDR);
12091 sysctl_kmaps_dump(sb, &range, sva);
12092 sva += NBPDR;
12093 continue;
12094 }
12095 pa = pde & PG_FRAME;
12096 if ((pde & PG_PS) != 0) {
12097 sva = rounddown2(sva, NBPDR);
12098 sysctl_kmaps_check(sb, &range, sva,
12099 pml4e, pdpe, pde, 0);
12100 range.pdes++;
12101 sva += NBPDR;
12102 continue;
12103 }
12104 if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
12105 vm_phys_paddr_to_vm_page(pa) == NULL) {
12106 /*
12107 * Page table pages for the large map
12108 * may be freed. Validate the
12109 * next-level address before descending.
12110 */
12111 goto restart;
12112 }
12113 pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
12114
12115 for (l = pmap_pte_index(sva); l < NPTEPG; l++,
12116 sva += PAGE_SIZE) {
12117 pte = pt[l];
12118 if ((pte & X86_PG_V) == 0) {
12119 sysctl_kmaps_dump(sb, &range,
12120 sva);
12121 continue;
12122 }
12123 sysctl_kmaps_check(sb, &range, sva,
12124 pml4e, pdpe, pde, pte);
12125 range.ptes++;
12126 }
12127 }
12128 }
12129 }
12130
12131 error = sbuf_finish(sb);
12132 sbuf_delete(sb);
12133 return (error);
12134 }
12135 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
12136 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
12137 NULL, 0, sysctl_kmaps, "A",
12138 "Dump kernel address layout");
12139
12140 #ifdef DDB
12141 DB_SHOW_COMMAND(pte, pmap_print_pte)
12142 {
12143 pmap_t pmap;
12144 pml5_entry_t *pml5;
12145 pml4_entry_t *pml4;
12146 pdp_entry_t *pdp;
12147 pd_entry_t *pde;
12148 pt_entry_t *pte, PG_V;
12149 vm_offset_t va;
12150
12151 if (!have_addr) {
12152 db_printf("show pte addr\n");
12153 return;
12154 }
12155 va = (vm_offset_t)addr;
12156
12157 if (kdb_thread != NULL)
12158 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
12159 else
12160 pmap = PCPU_GET(curpmap);
12161
12162 PG_V = pmap_valid_bit(pmap);
12163 db_printf("VA 0x%016lx", va);
12164
12165 if (pmap_is_la57(pmap)) {
12166 pml5 = pmap_pml5e(pmap, va);
12167 db_printf(" pml5e 0x%016lx", *pml5);
12168 if ((*pml5 & PG_V) == 0) {
12169 db_printf("\n");
12170 return;
12171 }
12172 pml4 = pmap_pml5e_to_pml4e(pml5, va);
12173 } else {
12174 pml4 = pmap_pml4e(pmap, va);
12175 }
12176 db_printf(" pml4e 0x%016lx", *pml4);
12177 if ((*pml4 & PG_V) == 0) {
12178 db_printf("\n");
12179 return;
12180 }
12181 pdp = pmap_pml4e_to_pdpe(pml4, va);
12182 db_printf(" pdpe 0x%016lx", *pdp);
12183 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
12184 db_printf("\n");
12185 return;
12186 }
12187 pde = pmap_pdpe_to_pde(pdp, va);
12188 db_printf(" pde 0x%016lx", *pde);
12189 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
12190 db_printf("\n");
12191 return;
12192 }
12193 pte = pmap_pde_to_pte(pde, va);
12194 db_printf(" pte 0x%016lx\n", *pte);
12195 }
12196
12197 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
12198 {
12199 vm_paddr_t a;
12200
12201 if (have_addr) {
12202 a = (vm_paddr_t)addr;
12203 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
12204 } else {
12205 db_printf("show phys2dmap addr\n");
12206 }
12207 }
12208
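/*
 * DDB helpers for "show ptpages": recursively dump the vm_page structures
 * backing each level of a pmap's page table, complaining about entries whose
 * backing page cannot be found.
 */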
12209 static void
12210 ptpages_show_page(int level, int idx, vm_page_t pg)
12211 {
12212 db_printf("l %d i %d pg %p phys %#lx ref %x\n",
12213 level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
12214 }
12215
12216 static void
12217 ptpages_show_complain(int level, int idx, uint64_t pte)
12218 {
12219 db_printf("l %d i %d pte %#lx\n", level, idx, pte);
12220 }
12221
12222 static void
12223 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
12224 {
12225 vm_page_t pg3, pg2, pg1;
12226 pml4_entry_t *pml4;
12227 pdp_entry_t *pdp;
12228 pd_entry_t *pd;
12229 int i4, i3, i2;
12230
12231 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
12232 for (i4 = 0; i4 < num_entries; i4++) {
12233 if ((pml4[i4] & PG_V) == 0)
12234 continue;
12235 pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
12236 if (pg3 == NULL) {
12237 ptpages_show_complain(3, i4, pml4[i4]);
12238 continue;
12239 }
12240 ptpages_show_page(3, i4, pg3);
12241 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
12242 for (i3 = 0; i3 < NPDPEPG; i3++) {
12243 if ((pdp[i3] & PG_V) == 0)
12244 continue;
12245 pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
12246			if (pg2 == NULL) {
12247 ptpages_show_complain(2, i3, pdp[i3]);
12248 continue;
12249 }
12250 ptpages_show_page(2, i3, pg2);
12251 pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
12252 for (i2 = 0; i2 < NPDEPG; i2++) {
12253 if ((pd[i2] & PG_V) == 0)
12254 continue;
12255 pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
12256 if (pg1 == NULL) {
12257 ptpages_show_complain(1, i2, pd[i2]);
12258 continue;
12259 }
12260 ptpages_show_page(1, i2, pg1);
12261 }
12262 }
12263 }
12264 }
12265
12266 DB_SHOW_COMMAND(ptpages, pmap_ptpages)
12267 {
12268 pmap_t pmap;
12269 vm_page_t pg;
12270 pml5_entry_t *pml5;
12271 uint64_t PG_V;
12272 int i5;
12273
12274 if (have_addr)
12275 pmap = (pmap_t)addr;
12276 else
12277 pmap = PCPU_GET(curpmap);
12278
12279 PG_V = pmap_valid_bit(pmap);
12280
12281 if (pmap_is_la57(pmap)) {
12282 pml5 = pmap->pm_pmltop;
12283 for (i5 = 0; i5 < NUPML5E; i5++) {
12284 if ((pml5[i5] & PG_V) == 0)
12285 continue;
12286 pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
12287 if (pg == NULL) {
12288 ptpages_show_complain(4, i5, pml5[i5]);
12289 continue;
12290 }
12291 ptpages_show_page(4, i5, pg);
12292 ptpages_show_pml4(pg, NPML4EPG, PG_V);
12293 }
12294 } else {
12295 ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
12296 (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
12297 }
12298 }
12299 #endif
12300