// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <[email protected]>
 *            Dave Hansen <[email protected]>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
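
/*
 * Illustrative sketch of how the blocking notifier chain above is consumed;
 * my_mem_callback and my_mem_nb are hypothetical names, not part of this
 * file's interfaces:
 *
 *	static int my_mem_callback(struct notifier_block *nb,
 *				   unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		if (action == MEM_GOING_ONLINE)
 *			pr_info("onlining %lu pages at PFN %lu\n",
 *				mn->nr_pages, mn->start_pfn);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_mem_nb = {
 *		.notifier_call = my_mem_callback,
 *	};
 *
 * register_memory_notifier(&my_mem_nb) hooks the callback up, and
 * unregister_memory_notifier(&my_mem_nb) removes it again.
 */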

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g. vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	return ret;
}
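
/*
 * Worked example for the vmemmap split above (illustrative, sizes assumed):
 * with a 128 MiB block of 4 KiB pages (32768 pages) that self-hosts its
 * memmap and a 64-byte struct page, nr_vmemmap_pages is 512, so
 * memory_block_online() initializes pfns [start_pfn, start_pfn + 512) as
 * vmemmap and onlines the remaining 32256 pages; memory_block_offline()
 * below undoes the same split in reverse order.
 */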

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
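
/*
 * Userspace view of the state attribute above (illustrative; the block
 * number is system-dependent):
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# cat /sys/devices/system/memory/memory32/state
 *	online
 *
 * state_store() accepts any of the strings in online_type_to_str[]:
 * "offline", "online", "online_kernel" and "online_movable".
 */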

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone cannot be offlined.
		 * This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);
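
/*
 * Example output (illustrative): block_size_bytes is emitted in hex
 * without a 0x prefix, so a system with 128 MiB memory blocks shows:
 *
 *	# cat /sys/devices/system/memory/block_size_bytes
 *	8000000
 */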

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	unsigned long block_id = memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static int init_memory_block(unsigned long block_id, unsigned long state,
			     unsigned long nr_vmemmap_pages,
			     struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	ret = register_memory(mem);

	return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(memory_block_id(base_section_nr),
				 MEM_ONLINE, 0, NULL);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
					group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized.
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}
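
/*
 * Illustrative walker sketch; count_online_cb and the variables around it
 * are hypothetical, only the walk_memory_blocks() call reflects this file:
 *
 *	static int count_online_cb(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *count = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*count)++;
 *		return 0;
 *	}
 *
 * With device_hotplug_lock held, walk_memory_blocks(start, size, &count,
 * count_online_cb) would then count the online blocks overlapping
 * [start, start + size).
 */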

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this
 *		dynamic memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
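
/*
 * Illustrative pairing (hypothetical driver code): a driver managing a
 * DIMM-like range would register a static group once, keep the returned
 * mgid for its subsequent hot(un)plug operations, and drop the group
 * after all of its memory blocks are gone:
 *
 *	mgid = memory_group_register_static(nid, PFN_DOWN(dimm_size));
 *	if (mgid < 0)
 *		return mgid;
 *	...
 *	memory_group_unregister(mgid);
 */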

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}