/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2013 6WIND S.A.
 */

#include <inttypes.h>
#include <string.h>

#include <rte_log.h>
#include <rte_string_fns.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/** @file Functions common to EALs that support dynamic memory allocation. */

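/*
 * Create one or more memseg lists for every detected memory type, i.e. every
 * combination of NUMA node and hugepage size, within the configured memory
 * limits.
 */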
int
eal_dynmem_memseg_lists_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* no-huge does not need this at all */
	if (internal_conf->no_hugetlbfs)
		return 0;

	/*
	 * figuring out amount of memory we're going to have is a long and very
	 * involved process. the basic element we're operating with is a memory
	 * type, defined as a combination of NUMA node ID and page size (so that
	 * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
	 *
	 * deciding amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and number of detected NUMA nodes. the goal is to make sure
	 * each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, maximum number of segments per
	 * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
	 * smaller page sizes, it can take hundreds of thousands of segments to
	 * reach the above specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also taking number of detected NUMA nodes, to make sure
	 * that we don't run out of memseg lists before we populate all NUMA
	 * nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of types.
	 * then, we figure out memory constraints and populate the list of
	 * would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */

	/* create space for mem types */
	n_memtypes = internal_conf->num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
		return -1;
	}

	/* populate mem types */
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_conf->hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* we can still sort pages by socket in legacy mode */
			if (!internal_conf->legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			RTE_LOG(DEBUG, EAL, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64 "\n",
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);
	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
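	/*
	 * For illustration only (hypothetical value): with RTE_MAX_MEMSEG_LISTS
	 * of 128 and 4 detected memory types, each type may take up at most
	 * 128 / 4 = 32 memseg lists.
	 */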
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
			RTE_STR(RTE_MAX_MEMSEG_LISTS));
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */

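		/*
		 * For illustration only, with hypothetical limits: given 2 MB
		 * pages, a 64 GB per-type memory limit, 32768 segments per
		 * type, 8192 segments per list and 32 GB per list, the
		 * calculations below yield 8192-segment lists of 16 GB each,
		 * and 32768 / 8192 = 4 such lists for this memory type.
		 */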
		/* calculate how many segments we will need in total */
		max_segs_per_type = max_mem_per_type / pagesz;
		/* limit number of segments to maximum allowed per type */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
		/* limit number of segments to maximum allowed per list */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

		RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				RTE_LOG(ERR, EAL,
					"No more space in memseg lists, please increase %s\n",
					RTE_STR(RTE_MAX_MEMSEG_LISTS));
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];

			if (eal_memseg_list_init(msl, pagesz, n_segs,
					socket_id, cur_seglist, true))
				goto out;

			if (eal_memseg_list_alloc(msl, 0)) {
				RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}

static int __rte_unused
hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct hugepage_info *hpi = arg;

	if (msl->page_sz != hpi->hugepage_sz)
		return 0;

	hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
	return 0;
}

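/*
 * Validator callback registered below for --socket-limit. It is invoked when
 * an allocation would take a socket past its configured limit; always
 * returning -1 makes such allocations fail.
 */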
static int
limits_callback(int socket_id, size_t cur_limit, size_t new_len)
{
	RTE_SET_USED(socket_id);
	RTE_SET_USED(cur_limit);
	RTE_SET_USED(new_len);
	return -1;
}

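/*
 * Pre-allocate the hugepages needed to satisfy the -m/--socket-mem request
 * using the dynamic allocation back-end, and register per-socket limits if
 * any were configured.
 */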
int
eal_dynmem_hugepage_init(void)
{
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	int hp_sz_idx, socket_id;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	memset(used_hp, 0, sizeof(used_hp));

	for (hp_sz_idx = 0;
			hp_sz_idx < (int) internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
#ifndef RTE_ARCH_64
		struct hugepage_info dummy;
		unsigned int i;
#endif
		/* also initialize hugepage sizes in used_hp */
		struct hugepage_info *hpi;
		hpi = &internal_conf->hugepage_info[hp_sz_idx];
		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
		/* for 32-bit, limit number of pages on socket to whatever we've
		 * preallocated, as we cannot allocate more.
		 */
		memset(&dummy, 0, sizeof(dummy));
		dummy.hugepage_sz = hpi->hugepage_sz;
		if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
			return -1;

		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
					dummy.num_pages[i]);
		}
#endif
	}

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
		memory[hp_sz_idx] = internal_conf->socket_mem[hp_sz_idx];

	/* calculate final number of pages */
	if (eal_dynmem_calc_num_pages_per_socket(memory,
			internal_conf->hugepage_info, used_hp,
			internal_conf->num_hugepage_sizes) < 0)
		return -1;

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_conf->num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			unsigned int num_pages_alloc;

			if (num_pages == 0)
				continue;

			RTE_LOG(DEBUG, EAL,
				"Allocating %u pages of size %" PRIu64 "M "
				"on socket %i\n",
				num_pages, hpi->hugepage_sz >> 20, socket_id);

			/* we may not be able to allocate all pages in one go,
			 * because we break up our memory map into multiple
			 * memseg lists. therefore, try allocating multiple
			 * times and see if we can get the desired number of
			 * pages from multiple allocations.
			 */

			num_pages_alloc = 0;
			do {
				int i, cur_pages, needed;

				needed = num_pages - num_pages_alloc;

				pages = malloc(sizeof(*pages) * needed);
				if (pages == NULL) {
					RTE_LOG(ERR, EAL, "Failed to malloc pages\n");
					return -1;
				}

				/* do not request exact number of pages */
				cur_pages = eal_memalloc_alloc_seg_bulk(pages,
						needed, hpi->hugepage_sz,
						socket_id, false);
				if (cur_pages <= 0) {
					free(pages);
					return -1;
				}

				/* mark preallocated pages as unfreeable */
				for (i = 0; i < cur_pages; i++) {
					struct rte_memseg *ms = pages[i];
					ms->flags |=
						RTE_MEMSEG_FLAG_DO_NOT_FREE;
				}
				free(pages);

				num_pages_alloc += cur_pages;
			} while (num_pages_alloc != num_pages);
		}
	}

	/* if socket limits were specified, set them */
	if (internal_conf->force_socket_limits) {
		unsigned int i;
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
			uint64_t limit = internal_conf->socket_limit[i];
			if (limit == 0)
				continue;
			if (rte_mem_alloc_validator_register("socket-limit",
					limits_callback, i, limit))
				RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
		}
	}
	return 0;
}

__rte_unused /* function is unused on 32-bit builds */
static inline uint64_t
get_socket_mem_size(int socket)
{
	uint64_t size = 0;
	unsigned int i;
	struct internal_config *internal_conf =
		eal_get_internal_configuration();

	for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
		size += hpi->hugepage_sz * hpi->num_pages[socket];
	}

	return size;
}

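/*
 * Distribute the requested memory across sockets and hugepage sizes, filling
 * hp_used with the number of pages of each size to allocate on each socket.
 * Returns the total number of pages on success, or -1 if the request cannot
 * be satisfied.
 */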
int
eal_dynmem_calc_num_pages_per_socket(
	uint64_t *memory, struct hugepage_info *hp_info,
	struct hugepage_info *hp_used, unsigned int num_hp_info)
{
	unsigned int socket, j, i = 0;
	unsigned int requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();
	uint64_t total_mem = internal_conf->memory;

	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_conf->force_sockets == 0) {
		size_t total_size;
#ifdef RTE_ARCH_64
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size;
		unsigned int lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected
		 * sockets according to number of cores from CPU mask present
		 * on each socket.
		 */
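		/*
		 * For example (hypothetical numbers): with -m 2048 and 6 of
		 * 8 lcores on socket 0, socket 0 is initially assigned
		 * 2048 * 6 / 8 = 1536 MB and socket 1 the remaining 512 MB,
		 * each capped by the memory actually available on that socket.
		 */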
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {

			/* Set memory amount per socket */
			default_size = internal_conf->memory *
				cpu_per_socket[socket] / rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(
				default_size, get_socket_mem_size(socket));

			/* Update sizes */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting
		 * all available memory from sockets, one after the other.
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			/* take whatever is available */
			default_size = RTE_MIN(
				get_socket_mem_size(socket) - memory[socket],
				total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
#else
		/* in 32-bit mode, allocate all of the memory only on main
		 * lcore socket
		 */
		total_size = internal_conf->memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			struct rte_config *cfg = rte_eal_get_configuration();
			unsigned int main_lcore_socket;

			main_lcore_socket =
				rte_lcore_to_socket_id(cfg->main_lcore);

			if (main_lcore_socket != socket)
				continue;

			/* Update sizes */
			memory[socket] = total_size;
			break;
		}
#endif
	}

	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
			socket++) {
		/* skip if the memory on this specific socket wasn't requested */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
			rte_strscpy(hp_used[i].hugedir, hp_info[i].hugedir,
				sizeof(hp_used[i].hugedir));
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* Check if we have any more pages left at this size,
			 * if so, move on to next size.
			 */
			if (hp_used[i].num_pages[socket] ==
					hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are more pages
			 * available that are bigger than the memory we want,
			 * so let's see if we can get enough from other page
			 * sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
						hp_info[j].num_pages[socket];

			/* Is there enough other memory?
			 * If not, allocate another page and quit.
			 */
			if (remaining_mem < memory[socket]) {
				cur_mem = RTE_MIN(
					memory[socket], hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket */
			}
		}

		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0 &&
				internal_conf->socket_mem[socket] != 0) {
			/* to prevent icc errors */
			requested = (unsigned int)(
				internal_conf->socket_mem[socket] / 0x100000);
			available = requested -
				((unsigned int)(memory[socket] / 0x100000));
			RTE_LOG(ERR, EAL, "Not enough memory available on "
				"socket %u! Requested: %uMB, available: %uMB\n",
				socket, requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned int)(internal_conf->memory / 0x100000);
		available = requested - (unsigned int)(total_mem / 0x100000);
		RTE_LOG(ERR, EAL, "Not enough memory available! "
			"Requested: %uMB, available: %uMB\n",
			requested, available);
		return -1;
	}
	return total_num_pages;
}