/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <[email protected]>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>

_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");

#ifdef NUMA
struct mem_affinity __read_mostly *mem_affinity;
int __read_mostly *mem_locality;
#endif

int __read_mostly vm_ndomains = 1;
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;

struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");

static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

static int __read_mostly vm_nfreelists;

/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
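
/*
 * For example (illustrative values only), a machine with two usable regions
 * of physical memory might describe them as:
 *
 *	phys_avail[0] = 0x1000		first region start
 *	phys_avail[1] = 0x9f000		first region end
 *	phys_avail[2] = 0x100000	second region start
 *	phys_avail[3] = 0xbfe00000	second region end
 *	phys_avail[4] = 0
 *	phys_avail[5] = 0		terminating pair of zeroes
 */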

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];

CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
    vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int tail);

/*
 * Red-black tree helpers for vm fictitious range management.
 */
static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
    struct vm_phys_fictitious_seg *range)
{

	KASSERT(range->start != 0 && range->end != 0,
	    ("Invalid range passed on search for vm_fictitious page"));
	if (p->start >= range->end)
		return (1);
	if (p->start < range->start)
		return (-1);

	return (0);
}

static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
    struct vm_phys_fictitious_seg *p2)
{

	/* Check if this is a search for a page */
	if (p1->end == 0)
		return (vm_phys_fictitious_in_range(p1, p2));

	KASSERT(p2->end != 0,
	    ("Invalid range passed as second parameter to vm fictitious comparison"));

	/* Searching to add a new range */
	if (p1->end <= p2->start)
		return (-1);
	if (p1->start >= p2->end)
		return (1);

	panic("Trying to add overlapping vm fictitious ranges:\n"
	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
}

int
vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high)
{
#ifdef NUMA
	domainset_t mask;
	int i;

	if (vm_ndomains == 1 || mem_affinity == NULL)
		return (0);

	DOMAINSET_ZERO(&mask);
	/*
	 * Check for any memory that overlaps low, high.
	 */
	for (i = 0; mem_affinity[i].end != 0; i++)
		if (mem_affinity[i].start <= high &&
		    mem_affinity[i].end >= low)
			DOMAINSET_SET(mem_affinity[i].domain, &mask);
	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
		return (prefer);
	if (DOMAINSET_EMPTY(&mask))
		panic("vm_phys_domain_match: Impossible constraint");
	return (DOMAINSET_FFS(&mask) - 1);
#else
	return (0);
#endif
}

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int dom, error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
	for (dom = 0; dom < vm_ndomains; dom++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
			    "\n  ORDER (SIZE)  |  NUMBER"
			    "\n              ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "  |  POOL %d", pind);
			sbuf_printf(&sbuf, "\n--            ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				sbuf_printf(&sbuf, "-- --      ");
			sbuf_printf(&sbuf, "--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					sbuf_printf(&sbuf, "  |  %6d",
					    fl[oind].lcnt);
				}
				sbuf_printf(&sbuf, "\n");
			}
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Return affinity, or -1 if there's no affinity information.
 */
int
vm_phys_mem_affinity(int f, int t)
{

#ifdef NUMA
	if (mem_locality == NULL)
		return (-1);
	if (f >= vm_ndomains || t >= vm_ndomains)
		return (-1);
	return (mem_locality[f * vm_ndomains + t]);
#else
	return (-1);
#endif
}

#ifdef NUMA
/*
 * Outputs the VM locality table.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int error, i, j;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);

	sbuf_printf(&sbuf, "\n");

	for (i = 0; i < vm_ndomains; i++) {
		sbuf_printf(&sbuf, "%d: ", i);
		for (j = 0; j < vm_ndomains; j++) {
			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
		}
		sbuf_printf(&sbuf, "\n");
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{

	m->order = order;
	if (tail)
		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
	else
		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
	fl[order].lcnt++;
}

static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{

	TAILQ_REMOVE(&fl[order].pl, m, listq);
	fl[order].lcnt--;
	m->order = VM_NFREEORDER;
}

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
	struct vm_phys_seg *seg;

	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_create_seg: invalid domain provided"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
		*seg = *(seg - 1);
		seg--;
	}
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
#ifdef NUMA
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
#else
	_vm_phys_create_seg(start, end, 0);
#endif
}

/*
 * Add a physical memory segment.
 */
void
vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t paddr;

	KASSERT((start & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: start is not page aligned"));
	KASSERT((end & PAGE_MASK) == 0,
	    ("vm_phys_define_seg: end is not page aligned"));

	/*
	 * Split the physical memory segment if it spans two or more free
	 * list boundaries.
	 */
	paddr = start;
#ifdef VM_FREELIST_LOWMEM
	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
		paddr = VM_LOWMEM_BOUNDARY;
	}
#endif
#ifdef VM_FREELIST_DMA32
	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
		paddr = VM_DMA32_BOUNDARY;
	}
#endif
	vm_phys_create_seg(paddr, end);
}

/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
	u_long npages;
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
	npages = 0;
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
			npages += atop(seg->end - seg->start);
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}

/*
 * Register info about the NUMA topology of the system.
 *
 * Invoked by platform-dependent code prior to vm_phys_init().
 */
void
vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
    int *locality)
{
#ifdef NUMA
	int d, i;

	/*
	 * For now the only override value that we support is 1, which
	 * effectively disables NUMA-awareness in the allocators.
	 */
	d = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &d);
	if (d)
		ndomains = 1;

	if (ndomains > 1) {
		vm_ndomains = ndomains;
		mem_affinity = affinity;
		mem_locality = locality;
	}

	for (i = 0; i < vm_ndomains; i++)
		DOMAINSET_SET(i, &all_domains);
#else
	(void)ndomains;
	(void)affinity;
	(void)locality;
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the order [order, oind) queues
 * are known to be empty.  The objective being to reduce the likelihood of
 * long-term fragmentation by promoting contemporaneous allocation and
 * (hopefully) deallocation.
 */
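/*
 * For example, splitting an order 3 block (eight pages) down to order 0
 * places the buddies m[4..7], m[2..3], and m[1] on the order 2, 1, and 0
 * queues, respectively, leaving the first page, m, for the caller.
 */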
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
    int tail)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		vm_freelist_add(fl, m_buddy, oind, tail);
	}
}

/*
 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
 * and sized set to the specified free list.
 *
 * When this function is called by a page allocation function, the caller
 * should request insertion at the head unless the lower-order queues are
 * known to be empty.  The objective being to reduce the likelihood of long-
 * term fragmentation by promoting contemporaneous allocation and (hopefully)
 * deallocation.
 *
 * The physical page m's buddy must not be free.
 */
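/*
 * For example, enqueueing npages = 5 starting at page m places m on the
 * order 0 queue and m[1..4] on the order 2 queue: each iteration frees the
 * largest block permitted by the current alignment, which is given by the
 * lowest set bit of the remaining page count.
 */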
static void
vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
{
	u_int n;
	int order;

	KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0"));
	KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
	    m, npages));
	do {
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_enq_range: page %p has unexpected order %d",
		    m, m->order));
		order = ffs(npages) - 1;
		KASSERT(order < VM_NFREEORDER,
		    ("vm_phys_enq_range: order %d is out of range", order));
		vm_freelist_add(fl, m, order, tail);
		n = 1 << order;
		m += n;
		npages -= n;
	} while (npages > 0);
}

/*
 * Tries to allocate the specified number of pages from the specified pool
 * within the specified domain.  Returns the actual number of allocated pages
 * and a pointer to each page through the array ma[].
 *
 * The returned pages may not be physically contiguous.  However, in contrast
 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
 * calling this function once to allocate the desired number of pages will
 * avoid wasted time in vm_phys_split_pages().
 *
 * The free page queues for the specified domain must be locked.
 */
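/*
 * For example, a request for three pages that is satisfied from a free
 * order 2 (four page) block returns the first three pages through ma[] and
 * hands the remaining page back to the free lists via vm_phys_enq_range().
 */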
int
vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int avail, end, flind, freelist, i, need, oind, pind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	i = 0;
	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		flind = vm_freelist_to_flind[freelist];
		if (flind < 0)
			continue;
		fl = vm_phys_free_queues[domain][flind][pool];
		for (oind = 0; oind < VM_NFREEORDER; oind++) {
			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
				vm_freelist_rem(fl, m, oind);
				avail = 1 << oind;
				need = imin(npages - i, avail);
				for (end = i + need; i < end;)
					ma[i++] = m++;
				if (need < avail) {
					/*
					 * Return excess pages to fl.  Its
					 * order [0, oind) queues are empty.
					 */
					vm_phys_enq_range(m, avail - need, fl,
					    1);
					return (npages);
				} else if (i == npages)
					return (npages);
			}
		}
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				alt = vm_phys_free_queues[domain][flind][pind];
				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
				    NULL) {
					vm_freelist_rem(alt, m, oind);
					vm_phys_set_pool(pool, m, oind);
					avail = 1 << oind;
					need = imin(npages - i, avail);
					for (end = i + need; i < end;)
						ma[i++] = m++;
					if (need < avail) {
						/*
						 * Return excess pages to fl.
						 * Its order [0, oind) queues
						 * are empty.
						 */
						vm_phys_enq_range(m, avail -
						    need, fl, 1);
						return (npages);
					} else if (i == npages)
						return (npages);
				}
			}
		}
	}
	return (i);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int domain, int pool, int order)
{
	vm_page_t m;
	int freelist;

	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages from the
 * specified free list.  The free list must be specified using one of the
 * manifest constants VM_FREELIST_*.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
{
	struct vm_freelist *alt, *fl;
	vm_page_t m;
	int oind, pind, flind;

	KASSERT(domain >= 0 && domain < vm_ndomains,
	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
	    domain));
	KASSERT(freelist < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
	    freelist));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

	flind = vm_freelist_to_flind[freelist];
	/* Check if freelist is present */
	if (flind < 0)
		return (NULL);

	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	fl = &vm_phys_free_queues[domain][flind][pool][0];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			vm_freelist_rem(fl, m, oind);
			/* The order [order, oind) queues are empty. */
			vm_phys_split_pages(m, oind, fl, order, 1);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = &vm_phys_free_queues[domain][flind][pind][0];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				vm_freelist_rem(alt, m, oind);
				vm_phys_set_pool(pool, m, oind);
				/* The order [order, oind) queues are empty. */
				vm_phys_split_pages(m, oind, fl, order, 1);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg tmp, *seg;
	vm_page_t m;

	m = NULL;
	tmp.start = pa;
	tmp.end = 0;

	rw_rlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	rw_runlock(&vm_phys_fictitious_reg_lock);
	if (seg == NULL)
		return (NULL);

	m = &seg->first_page[atop(pa - seg->start)];
	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));

	return (m);
}

static inline void
vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
    long page_count, vm_memattr_t memattr)
{
	long i;

	bzero(range, page_count * sizeof(*range));
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
		range[i].oflags &= ~VPO_UNMANAGED;
		range[i].busy_lock = VPB_UNBUSIED;
	}
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long page_count;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
	long dpage_count;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		if ((pe - first_page) > vm_page_array_size) {
			/*
			 * We have a segment that starts inside
			 * of vm_page_array, but ends outside of it.
			 *
			 * Use vm_page_array pages for those that are
			 * inside of the vm_page_array range, and
			 * allocate the remaining ones.
			 */
			dpage_count = vm_page_array_size - (pi - first_page);
			vm_phys_fictitious_init_range(fp, start, dpage_count,
			    memattr);
			page_count -= dpage_count;
			start += ptoa(dpage_count);
			goto alloc;
		}
		/*
		 * We can allocate the full range from vm_page_array,
		 * so there's no need to register the range in the tree.
		 */
		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
		return (0);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		fp = &vm_page_array[0];
		dpage_count = pe - first_page;
		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
		    memattr);
		end -= ptoa(dpage_count);
		page_count -= dpage_count;
		goto alloc;
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/*
		 * Trying to register a fictitious range that expands before
		 * and after vm_page_array.
		 */
		return (EINVAL);
	} else {
alloc:
#endif
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
	}
#endif
	vm_phys_fictitious_init_range(fp, start, page_count, memattr);

	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
	seg->start = start;
	seg->end = end;
	seg->first_page = fp;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);

	return (0);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
	long pi, pe;
#endif

	KASSERT(start < end,
	    ("Start of segment isn't less than end (start: %jx end: %jx)",
	    (uintmax_t)start, (uintmax_t)end));

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	pe = atop(end);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		if ((pe - first_page) <= vm_page_array_size) {
			/*
			 * This segment was allocated using vm_page_array
			 * only, there's nothing to do since those pages
			 * were never added to the tree.
			 */
			return;
		}
		/*
		 * We have a segment that starts inside
		 * of vm_page_array, but ends outside of it.
		 *
		 * Calculate how many pages were added to the
		 * tree and free them.
		 */
		start = ptoa(first_page + vm_page_array_size);
	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
		/*
		 * We have a segment that ends inside of vm_page_array,
		 * but starts outside of it.
		 */
		end = ptoa(first_page);
	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
		/* Since it's not possible to register such a range, panic. */
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
#endif
	tmp.start = start;
	tmp.end = 0;

	rw_wlock(&vm_phys_fictitious_reg_lock);
	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
	if (seg->start != start || seg->end != end) {
		rw_wunlock(&vm_phys_fictitious_reg_lock);
		panic(
		    "Unregistering not registered fictitious range [%#jx:%#jx]",
		    (uintmax_t)start, (uintmax_t)end);
	}
	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
	rw_wunlock(&vm_phys_fictitious_reg_lock);
	free(seg->first_page, M_FICT_PAGES);
	free(seg, M_FICT_PAGES);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
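/*
 * For example, freeing an order 0 page whose buddy (the page at the address
 * obtained by flipping bit PAGE_SHIFT of its physical address) is also free
 * at order 0 removes that buddy from its queue and retries at order 1,
 * repeating until the buddy is not free at the matching order, lies outside
 * the segment, or the maximum order is reached.
 */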
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			vm_freelist_rem(fl, m_buddy, order);
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	fl = (*seg->free_queues)[m->pool];
	vm_freelist_add(fl, m, order, 1);
}

/*
 * Return the largest possible order of a set of pages starting at m.
 */
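/*
 * For example, a page whose frame number (physical address >> PAGE_SHIFT)
 * is 24 (binary 11000) can begin a block of order at most 3, because 24 is
 * a multiple of 8 but not of 16.
 */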
static int
max_order(vm_page_t m)
{

	/*
	 * Unsigned "min" is used here so that "order" is assigned
	 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
	 * or the low-order bits of its physical address are zero
	 * because the size of a physical address exceeds the size of
	 * a long.
	 */
	return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
	    VM_NFREEORDER - 1));
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages, without
 * merging across set boundaries.
 *
 * The free page queues must be locked.
 */
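/*
 * For example, enqueueing npages = 11 starting at frame number 6 frees
 * blocks of 2, 8, and 1 pages: block sizes increase while the starting
 * alignment allows, then decrease once fewer pages remain than the current
 * alignment permits.
 */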
1147*22ce4affSfengbojiang void
vm_phys_enqueue_contig(vm_page_t m,u_long npages)1148*22ce4affSfengbojiang vm_phys_enqueue_contig(vm_page_t m, u_long npages)
1149*22ce4affSfengbojiang {
1150*22ce4affSfengbojiang struct vm_freelist *fl;
1151*22ce4affSfengbojiang struct vm_phys_seg *seg;
1152*22ce4affSfengbojiang vm_page_t m_end;
1153*22ce4affSfengbojiang int order;
1154*22ce4affSfengbojiang
1155*22ce4affSfengbojiang /*
1156*22ce4affSfengbojiang * Avoid unnecessary coalescing by freeing the pages in the largest
1157*22ce4affSfengbojiang * possible power-of-two-sized subsets.
1158*22ce4affSfengbojiang */
1159*22ce4affSfengbojiang vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1160*22ce4affSfengbojiang seg = &vm_phys_segs[m->segind];
1161*22ce4affSfengbojiang fl = (*seg->free_queues)[m->pool];
1162*22ce4affSfengbojiang m_end = m + npages;
	/* Free blocks of increasing size. */
	while ((order = max_order(m)) < VM_NFREEORDER - 1 &&
	    m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of maximum size. */
	while (m + (1 << order) <= m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
	/* Free blocks of diminishing size. */
	while (m < m_end) {
		KASSERT(seg == &vm_phys_segs[m->segind],
		    ("%s: page range [%p,%p) spans multiple segments",
		    __func__, m_end - npages, m));
		order = flsl(m_end - m) - 1;
		vm_freelist_add(fl, m, order, 1);
		m += 1 << order;
	}
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	int order_start, order_end;
	vm_page_t m_start, m_end;

	vm_domain_free_assert_locked(vm_pagequeue_domain(m));

	m_start = m;
	order_start = max_order(m_start);
	if (order_start < VM_NFREEORDER - 1)
		m_start += 1 << order_start;
	m_end = m + npages;
	order_end = max_order(m_end);
	if (order_end < VM_NFREEORDER - 1)
		m_end -= 1 << order_end;
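	/*
	 * [m_start, m_end) is now the interior of the range; when trimmed
	 * above, the block of 1 << order_start pages at the front and the
	 * block of 1 << order_end pages at the back are freed through
	 * vm_phys_free_pages() below so that they may coalesce with free
	 * blocks adjacent to the range.
	 */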
	/*
	 * Avoid unnecessary coalescing by freeing the pages at the start and
	 * end of the range last.
	 */
	if (m_start < m_end)
		vm_phys_enqueue_contig(m_start, m_end - m_start);
	if (order_start < VM_NFREEORDER - 1)
		vm_phys_free_pages(m, order_start);
	if (order_end < VM_NFREEORDER - 1)
		vm_phys_free_pages(m_end, order_end);
}

/*
 * Scan physical memory between the specified addresses "low" and "high" for a
 * run of contiguous physical pages that satisfy the specified conditions, and
 * return the lowest page in the run.  The specified "alignment" determines
 * the alignment of the lowest physical page in the run.  If the specified
 * "boundary" is non-zero, then the run of physical pages cannot span a
 * physical address that is a multiple of "boundary".
 *
 * "npages" must be greater than zero.  Both "alignment" and "boundary" must
 * be a power of two.
 */
vm_page_t
vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary, int options)
{
	vm_paddr_t pa_end;
	vm_page_t m_end, m_run, m_start;
	struct vm_phys_seg *seg;
	int segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	if (low >= high)
		return (NULL);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (seg->domain != domain)
			continue;
		if (seg->start >= high)
			break;
		if (low >= seg->end)
			continue;
		if (low <= seg->start)
			m_start = seg->first_page;
		else
			m_start = &seg->first_page[atop(low - seg->start)];
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
			continue;
		m_end = &seg->first_page[atop(pa_end - seg->start)];
		m_run = vm_page_scan_contig(npages, m_start, m_end,
		    alignment, boundary, options);
		if (m_run != NULL)
			return (m_run);
	}
	return (NULL);
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	vm_freelist_rem(fl, m_set, order);
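	/*
	 * For example (illustrative, 4 KB pages): extracting page frame 10
	 * from a free order-2 block at frame 8 first returns the order-1
	 * half at frames 8-9, then the order-0 buddy at frame 11, leaving
	 * only frame 10 removed from the free lists.
	 */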
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		vm_freelist_add(fl, m_tmp, order, 0);
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa_end, pa_start;
	vm_page_t m_run;
	struct vm_phys_seg *seg;
	int segind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(domain));
	if (low >= high)
		return (NULL);
	m_run = NULL;
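	/*
	 * Scan the physical segments belonging to this domain from the
	 * highest-indexed (highest address) segment downward, stopping at
	 * the first segment that yields a satisfactory run.
	 */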
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
		if (seg->start >= high || seg->domain != domain)
			continue;
		if (low >= seg->end)
			break;
		if (low <= seg->start)
			pa_start = seg->start;
		else
			pa_start = low;
		if (high < seg->end)
			pa_end = high;
		else
			pa_end = seg->end;
		if (pa_end - pa_start < ptoa(npages))
			continue;
		m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
		    alignment, boundary);
		if (m_run != NULL)
			break;
	}
	return (m_run);
}

/*
 * Allocate a run of contiguous physical pages from the free list for the
 * specified segment.
 */
static vm_page_t
vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	vm_paddr_t pa, pa_end, size;
	vm_page_t m, m_ret;
	u_long npages_end;
	int oind, order, pind;

	KASSERT(npages > 0, ("npages is 0"));
	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
	/* Compute the queue that is the best fit for npages. */
	order = flsl(npages - 1);
	/* Search for a run satisfying the specified conditions. */
	size = npages << PAGE_SHIFT;
	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
	    oind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = (*seg->free_queues)[pind];
			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
				/*
				 * Is the size of this allocation request
				 * larger than the largest block size?
				 */
				if (order >= VM_NFREEORDER) {
					/*
					 * Determine if a sufficient number of
					 * subsequent blocks to satisfy the
					 * allocation request are free.
					 */
					pa = VM_PAGE_TO_PHYS(m_ret);
					pa_end = pa + size;
					if (pa_end < pa)
						continue;
					for (;;) {
						pa += 1 << (PAGE_SHIFT +
						    VM_NFREEORDER - 1);
						if (pa >= pa_end ||
						    pa < seg->start ||
						    pa >= seg->end)
							break;
						m = &seg->first_page[atop(pa -
						    seg->start)];
						if (m->order != VM_NFREEORDER -
						    1)
							break;
					}
					/* If not, go to the next block. */
					if (pa < pa_end)
						continue;
				}

				/*
				 * Determine if the blocks are within the
				 * given range, satisfy the given alignment,
				 * and do not cross the given boundary.
				 */
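				/*
				 * The rounddown2() test below clears the low
				 * log2(boundary) bits of pa ^ (pa_end - 1);
				 * the result is zero exactly when the first
				 * and last byte of the run lie in the same
				 * boundary-aligned window, i.e. the run does
				 * not cross a multiple of "boundary".
				 */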
				pa = VM_PAGE_TO_PHYS(m_ret);
				pa_end = pa + size;
				if (pa >= low && pa_end <= high &&
				    (pa & (alignment - 1)) == 0 &&
				    rounddown2(pa ^ (pa_end - 1), boundary) == 0)
					goto done;
			}
		}
	}
	return (NULL);
done:
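	/*
	 * Remove the run's constituent blocks from their free lists and
	 * move any block that is not already in the default pool into it.
	 */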
	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
		fl = (*seg->free_queues)[m->pool];
		vm_freelist_rem(fl, m, oind);
		if (m->pool != VM_FREEPOOL_DEFAULT)
			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
	}
	/* Return excess pages to the free lists. */
	npages_end = roundup2(npages, 1 << oind);
	if (npages < npages_end) {
		fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT];
		vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0);
	}
	return (m_ret);
}

/*
 * Return the index of the first unused slot which may be the terminating
 * entry.
 */
static int
vm_phys_avail_count(void)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		continue;
	if (i > PHYS_AVAIL_ENTRIES)
		panic("Improperly terminated phys_avail %d entries", i);

	return (i);
}

/*
 * Assert that a phys_avail entry is valid.
 */
static void
vm_phys_avail_check(int i)
{
	if (phys_avail[i] & PAGE_MASK)
		panic("Unaligned phys_avail[%d]: %#jx", i,
		    (intmax_t)phys_avail[i]);
	if (phys_avail[i + 1] & PAGE_MASK)
		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
		    (intmax_t)phys_avail[i + 1]);
	if (phys_avail[i + 1] < phys_avail[i])
		panic("phys_avail[%d] end %#jx < start %#jx", i,
		    (intmax_t)phys_avail[i + 1], (intmax_t)phys_avail[i]);
}

/*
 * Return the index of an overlapping phys_avail entry or -1.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int i;

	for (i = 0; phys_avail[i + 1]; i += 2)
		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
			return (i);
	return (-1);
}
#endif

/*
 * Return the index of the largest entry.
 */
int
vm_phys_avail_largest(void)
{
	vm_paddr_t sz, largesz;
	int largest;
	int i;

	largest = 0;
	largesz = 0;
	for (i = 0; phys_avail[i + 1]; i += 2) {
		sz = vm_phys_avail_size(i);
		if (sz > largesz) {
			largesz = sz;
			largest = i;
		}
	}

	return (largest);
}

vm_paddr_t
vm_phys_avail_size(int i)
{

	return (phys_avail[i + 1] - phys_avail[i]);
}

/*
 * Split an entry at the address 'pa'.  Return zero on success or errno.
 */
static int
vm_phys_avail_split(vm_paddr_t pa, int i)
{
	int cnt;

	vm_phys_avail_check(i);
	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
		panic("vm_phys_avail_split: invalid address");
	cnt = vm_phys_avail_count();
	if (cnt >= PHYS_AVAIL_ENTRIES)
		return (ENOSPC);
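	/*
	 * Shift the entries at and above index "i" up by two slots and then
	 * record "pa" as the new end/start pair.  For example, splitting
	 * [0x1000, 0x9000) at 0x4000 yields [0x1000, 0x4000) and
	 * [0x4000, 0x9000).
	 */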
	memmove(&phys_avail[i + 2], &phys_avail[i],
	    (cnt - i) * sizeof(phys_avail[0]));
	phys_avail[i + 1] = pa;
	phys_avail[i + 2] = pa;
	vm_phys_avail_check(i);
	vm_phys_avail_check(i + 2);

	return (0);
}

void
vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_seg *seg;

	if (vm_phys_early_nsegs == -1)
		panic("%s: called after initialization", __func__);
	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
		panic("%s: ran out of early segments", __func__);

	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
	seg->start = start;
	seg->end = end;
}

/*
 * This routine allocates NUMA node specific memory before the page
 * allocator is bootstrapped.
 */
vm_paddr_t
vm_phys_early_alloc(int domain, size_t alloc_size)
{
	int i, mem_index, biggestone;
	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;

	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
	    ("%s: invalid domain index %d", __func__, domain));

	/*
	 * Search the mem_affinity array for the biggest address
	 * range in the desired domain.  This is used to constrain
	 * the phys_avail selection below.
	 */
	biggestsize = 0;
	mem_index = 0;
	mem_start = 0;
	mem_end = -1;
#ifdef NUMA
	if (mem_affinity != NULL) {
		for (i = 0;; i++) {
			size = mem_affinity[i].end - mem_affinity[i].start;
			if (size == 0)
				break;
			if (domain != -1 && mem_affinity[i].domain != domain)
				continue;
			if (size > biggestsize) {
				mem_index = i;
				biggestsize = size;
			}
		}
		mem_start = mem_affinity[mem_index].start;
		mem_end = mem_affinity[mem_index].end;
	}
#endif

	/*
	 * Now find the biggest available physical range within the
	 * desired NUMA domain.
	 */
	biggestsize = 0;
	biggestone = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* skip regions that are out of range */
		if (phys_avail[i + 1] - alloc_size < mem_start ||
		    phys_avail[i + 1] > mem_end)
			continue;
		size = vm_phys_avail_size(i);
		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
	}
	alloc_size = round_page(alloc_size);

	/*
	 * Grab single pages from the front to reduce fragmentation.
	 */
	if (alloc_size == PAGE_SIZE) {
		pa = phys_avail[biggestone];
		phys_avail[biggestone] += PAGE_SIZE;
		vm_phys_avail_check(biggestone);
		return (pa);
	}

	/*
	 * Naturally align large allocations.
	 */
	align = phys_avail[biggestone + 1] & (alloc_size - 1);
	if (alloc_size + align > biggestsize)
		panic("cannot find a large enough size\n");
	if (align != 0 &&
	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
	    biggestone) != 0)
		/* Wasting memory. */
		phys_avail[biggestone + 1] -= align;
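	/*
	 * The misaligned tail has now been removed from the chosen range,
	 * either split into its own entry or discarded, so (assuming
	 * alloc_size is a power of two, as the masking above requires)
	 * taking alloc_size bytes from the end yields a naturally aligned
	 * allocation.
	 */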

	phys_avail[biggestone + 1] -= alloc_size;
	vm_phys_avail_check(biggestone);
	pa = phys_avail[biggestone + 1];
	return (pa);
}

void
vm_phys_early_startup(void)
{
	struct vm_phys_seg *seg;
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	for (i = 0; i < vm_phys_early_nsegs; i++) {
		seg = &vm_phys_early_segs[i];
		vm_phys_add_seg(seg->start, seg->end);
	}
	vm_phys_early_nsegs = -1;

#ifdef NUMA
	/* Force phys_avail to be split by domain. */
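	/*
	 * Barring table overflow in vm_phys_avail_split(), no phys_avail
	 * range then straddles a mem_affinity boundary, so each range can
	 * be attributed to a single domain.
	 */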
	if (mem_affinity != NULL) {
		int idx;

		for (i = 0; mem_affinity[i].end != 0; i++) {
			idx = vm_phys_avail_find(mem_affinity[i].start);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].start)
				vm_phys_avail_split(mem_affinity[i].start, idx);
			idx = vm_phys_avail_find(mem_affinity[i].end);
			if (idx != -1 &&
			    phys_avail[idx] != mem_affinity[i].end)
				vm_phys_avail_split(mem_affinity[i].end, idx);
		}
	}
#endif
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *fl;
	int flind, oind, pind, dom;

	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf("DOMAIN: %d\n", dom);
		for (flind = 0; flind < vm_nfreelists; flind++) {
			db_printf("FREE LIST %d:\n"
			    "\n ORDER (SIZE) | NUMBER"
			    "\n ", flind);
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf(" | POOL %d", pind);
			db_printf("\n-- ");
			for (pind = 0; pind < VM_NFREEPOOL; pind++)
				db_printf("-- -- ");
			db_printf("--\n");
			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
				db_printf(" %2.2d (%6.6dK)", oind,
				    1 << (PAGE_SHIFT - 10 + oind));
				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
					fl = vm_phys_free_queues[dom][flind][pind];
					db_printf(" | %6.6d", fl[oind].lcnt);
				}
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif