// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <[email protected]>
 *            Dave Hansen <[email protected]>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
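
/*
 * Illustrative sketch only (not used by this file): how a subsystem might
 * hook into the memory_chain notifier exposed above. The callback name and
 * its internals are hypothetical; the notifier API, the MEM_* actions and
 * struct memory_notify are the existing kernel interfaces.
 *
 *	static int foo_memory_callback(struct notifier_block *nb,
 *				       unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_ONLINE:
 *			// pages [mn->start_pfn, mn->start_pfn + mn->nr_pages)
 *			// are about to be onlined; returning an error via
 *			// notifier_from_errno() aborts the onlining
 *			break;
 *		case MEM_OFFLINE:
 *			// the range has been offlined
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_memory_nb = {
 *		.notifier_call = foo_memory_callback,
 *	};
 *
 *	register_memory_notifier(&foo_memory_nb);
 */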
/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g., vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	return ret;
}
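
/*
 * Note on the block layout handled above and below: when a block was added
 * with its memmap residing on the hotplugged memory itself
 * (nr_vmemmap_pages != 0), the first nr_vmemmap_pages pages of the block
 * back the block's own memmap. Only the remaining range
 * [start_pfn + nr_vmemmap_pages, start_pfn + nr_pages) is handed to
 * online_pages()/offline_pages(); the vmemmap pages are initialized/torn
 * down and accounted separately via mhp_(de)init_memmap_on_memory() and
 * adjust_present_page_count().
 */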
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
				     unsigned long to_state,
				     unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
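
/*
 * Example of the resulting sysfs interface (illustrative; the block number
 * 32 is arbitrary). state_store() accepts the tokens listed in
 * online_type_to_str[]:
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# cat /sys/devices/system/memory/memory32/state
 *	online
 */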
/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone can not be
		 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);
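
/*
 * Illustrative relationship between the attributes above (the values shown
 * are examples, not fixed): block_size_bytes is shared by all blocks, and
 * the device name / phys_index encodes the block id, so memoryX covers the
 * physical range [X * block_size_bytes, (X + 1) * block_size_bytes):
 *
 *	# cat /sys/devices/system/memory/block_size_bytes
 *	8000000
 *	# cat /sys/devices/system/memory/memory32/phys_index
 *	00000020
 *
 * i.e. block 0x20 starts at physical address 0x20 * 0x8000000 = 0x100000000.
 */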
/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static int init_memory_block(unsigned long block_id, unsigned long state,
			     unsigned long nr_vmemmap_pages,
			     struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

	ret = register_memory(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(memory_block_id(base_section_nr),
				 MEM_ONLINE, 0, NULL);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
					group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* Return true if the memory block is offlined, otherwise return false. */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}
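
/*
 * Illustrative sketch only: how a caller (holding the device_hotplug_lock)
 * might use walk_memory_blocks() above. The callback name and the counting
 * logic are hypothetical; the callback signature is walk_memory_blocks_func_t.
 *
 *	static int count_online_cb(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *nr_online = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*nr_online)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr_online = 0;
 *
 *	walk_memory_blocks(start, size, &nr_online, count_online_cb);
 */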
/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this
 *		dynamic memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
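
/*
 * Illustrative sketch only: how a memory device driver might use the group
 * registration API above (memory_group_unregister() follows below). The
 * variable names are hypothetical; the functions are the ones defined in
 * this file.
 *
 *	// one DIMM-like unit: all memory added/removed in one go
 *	mgid = memory_group_register_static(nid, PFN_DOWN(dimm_size));
 *	if (mgid < 0)
 *		return mgid;
 *	...
 *	// once all blocks of the group have been removed again
 *	memory_group_unregister(mgid);
 */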
/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}