1 /* 2 * drivers/base/memory.c - basic Memory class support 3 * 4 * Written by Matt Tolentino <[email protected]> 5 * Dave Hansen <[email protected]> 6 * 7 * This file provides the necessary infrastructure to represent 8 * a SPARSEMEM-memory-model system's physical memory in /sysfs. 9 * All arch-independent code that assumes MEMORY_HOTPLUG requires 10 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 11 */ 12 13 #include <linux/sysdev.h> 14 #include <linux/module.h> 15 #include <linux/init.h> 16 #include <linux/topology.h> 17 #include <linux/capability.h> 18 #include <linux/device.h> 19 #include <linux/memory.h> 20 #include <linux/kobject.h> 21 #include <linux/memory_hotplug.h> 22 #include <linux/mm.h> 23 #include <linux/mutex.h> 24 #include <linux/stat.h> 25 #include <linux/slab.h> 26 27 #include <asm/atomic.h> 28 #include <asm/uaccess.h> 29 30 static DEFINE_MUTEX(mem_sysfs_mutex); 31 32 #define MEMORY_CLASS_NAME "memory" 33 #define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) 34 35 static int sections_per_block; 36 37 static inline int base_memory_block_id(int section_nr) 38 { 39 return section_nr / sections_per_block; 40 } 41 42 static struct sysdev_class memory_sysdev_class = { 43 .name = MEMORY_CLASS_NAME, 44 }; 45 46 static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) 47 { 48 return MEMORY_CLASS_NAME; 49 } 50 51 static int memory_uevent(struct kset *kset, struct kobject *obj, 52 struct kobj_uevent_env *env) 53 { 54 int retval = 0; 55 56 return retval; 57 } 58 59 static const struct kset_uevent_ops memory_uevent_ops = { 60 .name = memory_uevent_name, 61 .uevent = memory_uevent, 62 }; 63 64 static BLOCKING_NOTIFIER_HEAD(memory_chain); 65 66 int register_memory_notifier(struct notifier_block *nb) 67 { 68 return blocking_notifier_chain_register(&memory_chain, nb); 69 } 70 EXPORT_SYMBOL(register_memory_notifier); 71 72 void unregister_memory_notifier(struct notifier_block *nb) 73 { 74 blocking_notifier_chain_unregister(&memory_chain, nb); 75 } 76 EXPORT_SYMBOL(unregister_memory_notifier); 77 78 static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); 79 80 int register_memory_isolate_notifier(struct notifier_block *nb) 81 { 82 return atomic_notifier_chain_register(&memory_isolate_chain, nb); 83 } 84 EXPORT_SYMBOL(register_memory_isolate_notifier); 85 86 void unregister_memory_isolate_notifier(struct notifier_block *nb) 87 { 88 atomic_notifier_chain_unregister(&memory_isolate_chain, nb); 89 } 90 EXPORT_SYMBOL(unregister_memory_isolate_notifier); 91 92 /* 93 * register_memory - Setup a sysfs device for a memory block 94 */ 95 static 96 int register_memory(struct memory_block *memory) 97 { 98 int error; 99 100 memory->sysdev.cls = &memory_sysdev_class; 101 memory->sysdev.id = memory->start_section_nr / sections_per_block; 102 103 error = sysdev_register(&memory->sysdev); 104 return error; 105 } 106 107 static void 108 unregister_memory(struct memory_block *memory) 109 { 110 BUG_ON(memory->sysdev.cls != &memory_sysdev_class); 111 112 /* drop the ref. we got in remove_memory_block() */ 113 kobject_put(&memory->sysdev.kobj); 114 sysdev_unregister(&memory->sysdev); 115 } 116 117 unsigned long __weak memory_block_size_bytes(void) 118 { 119 return MIN_MEMORY_BLOCK_SIZE; 120 } 121 122 static unsigned long get_memory_block_size(void) 123 { 124 unsigned long block_sz; 125 126 block_sz = memory_block_size_bytes(); 127 128 /* Validate blk_sz is a power of 2 and not less than section size */ 129 if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { 130 WARN_ON(1); 131 block_sz = MIN_MEMORY_BLOCK_SIZE; 132 } 133 134 return block_sz; 135 } 136 137 /* 138 * use this as the physical section index that this memsection 139 * uses. 140 */ 141 142 static ssize_t show_mem_start_phys_index(struct sys_device *dev, 143 struct sysdev_attribute *attr, char *buf) 144 { 145 struct memory_block *mem = 146 container_of(dev, struct memory_block, sysdev); 147 unsigned long phys_index; 148 149 phys_index = mem->start_section_nr / sections_per_block; 150 return sprintf(buf, "%08lx\n", phys_index); 151 } 152 153 static ssize_t show_mem_end_phys_index(struct sys_device *dev, 154 struct sysdev_attribute *attr, char *buf) 155 { 156 struct memory_block *mem = 157 container_of(dev, struct memory_block, sysdev); 158 unsigned long phys_index; 159 160 phys_index = mem->end_section_nr / sections_per_block; 161 return sprintf(buf, "%08lx\n", phys_index); 162 } 163 164 /* 165 * Show whether the section of memory is likely to be hot-removable 166 */ 167 static ssize_t show_mem_removable(struct sys_device *dev, 168 struct sysdev_attribute *attr, char *buf) 169 { 170 unsigned long i, pfn; 171 int ret = 1; 172 struct memory_block *mem = 173 container_of(dev, struct memory_block, sysdev); 174 175 for (i = 0; i < sections_per_block; i++) { 176 pfn = section_nr_to_pfn(mem->start_section_nr + i); 177 ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); 178 } 179 180 return sprintf(buf, "%d\n", ret); 181 } 182 183 /* 184 * online, offline, going offline, etc. 185 */ 186 static ssize_t show_mem_state(struct sys_device *dev, 187 struct sysdev_attribute *attr, char *buf) 188 { 189 struct memory_block *mem = 190 container_of(dev, struct memory_block, sysdev); 191 ssize_t len = 0; 192 193 /* 194 * We can probably put these states in a nice little array 195 * so that they're not open-coded 196 */ 197 switch (mem->state) { 198 case MEM_ONLINE: 199 len = sprintf(buf, "online\n"); 200 break; 201 case MEM_OFFLINE: 202 len = sprintf(buf, "offline\n"); 203 break; 204 case MEM_GOING_OFFLINE: 205 len = sprintf(buf, "going-offline\n"); 206 break; 207 default: 208 len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", 209 mem->state); 210 WARN_ON(1); 211 break; 212 } 213 214 return len; 215 } 216 217 int memory_notify(unsigned long val, void *v) 218 { 219 return blocking_notifier_call_chain(&memory_chain, val, v); 220 } 221 222 int memory_isolate_notify(unsigned long val, void *v) 223 { 224 return atomic_notifier_call_chain(&memory_isolate_chain, val, v); 225 } 226 227 /* 228 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 229 * OK to have direct references to sparsemem variables in here. 230 */ 231 static int 232 memory_block_action(unsigned long phys_index, unsigned long action) 233 { 234 int i; 235 unsigned long start_pfn, start_paddr; 236 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 237 struct page *first_page; 238 int ret; 239 240 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); 241 242 /* 243 * The probe routines leave the pages reserved, just 244 * as the bootmem code does. Make sure they're still 245 * that way. 246 */ 247 if (action == MEM_ONLINE) { 248 for (i = 0; i < nr_pages; i++) { 249 if (PageReserved(first_page+i)) 250 continue; 251 252 printk(KERN_WARNING "section number %ld page number %d " 253 "not reserved, was it already online?\n", 254 phys_index, i); 255 return -EBUSY; 256 } 257 } 258 259 switch (action) { 260 case MEM_ONLINE: 261 start_pfn = page_to_pfn(first_page); 262 ret = online_pages(start_pfn, nr_pages); 263 break; 264 case MEM_OFFLINE: 265 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 266 ret = remove_memory(start_paddr, 267 nr_pages << PAGE_SHIFT); 268 break; 269 default: 270 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 271 "%ld\n", __func__, phys_index, action, action); 272 ret = -EINVAL; 273 } 274 275 return ret; 276 } 277 278 static int memory_block_change_state(struct memory_block *mem, 279 unsigned long to_state, unsigned long from_state_req) 280 { 281 int ret = 0; 282 283 mutex_lock(&mem->state_mutex); 284 285 if (mem->state != from_state_req) { 286 ret = -EINVAL; 287 goto out; 288 } 289 290 if (to_state == MEM_OFFLINE) 291 mem->state = MEM_GOING_OFFLINE; 292 293 ret = memory_block_action(mem->start_section_nr, to_state); 294 295 if (ret) 296 mem->state = from_state_req; 297 else 298 mem->state = to_state; 299 300 out: 301 mutex_unlock(&mem->state_mutex); 302 return ret; 303 } 304 305 static ssize_t 306 store_mem_state(struct sys_device *dev, 307 struct sysdev_attribute *attr, const char *buf, size_t count) 308 { 309 struct memory_block *mem; 310 int ret = -EINVAL; 311 312 mem = container_of(dev, struct memory_block, sysdev); 313 314 if (!strncmp(buf, "online", min((int)count, 6))) 315 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 316 else if(!strncmp(buf, "offline", min((int)count, 7))) 317 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 318 319 if (ret) 320 return ret; 321 return count; 322 } 323 324 /* 325 * phys_device is a bad name for this. What I really want 326 * is a way to differentiate between memory ranges that 327 * are part of physical devices that constitute 328 * a complete removable unit or fru. 329 * i.e. do these ranges belong to the same physical device, 330 * s.t. if I offline all of these sections I can then 331 * remove the physical device? 332 */ 333 static ssize_t show_phys_device(struct sys_device *dev, 334 struct sysdev_attribute *attr, char *buf) 335 { 336 struct memory_block *mem = 337 container_of(dev, struct memory_block, sysdev); 338 return sprintf(buf, "%d\n", mem->phys_device); 339 } 340 341 static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); 342 static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); 343 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); 344 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); 345 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL); 346 347 #define mem_create_simple_file(mem, attr_name) \ 348 sysdev_create_file(&mem->sysdev, &attr_##attr_name) 349 #define mem_remove_simple_file(mem, attr_name) \ 350 sysdev_remove_file(&mem->sysdev, &attr_##attr_name) 351 352 /* 353 * Block size attribute stuff 354 */ 355 static ssize_t 356 print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr, 357 char *buf) 358 { 359 return sprintf(buf, "%lx\n", get_memory_block_size()); 360 } 361 362 static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); 363 364 static int block_size_init(void) 365 { 366 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 367 &attr_block_size_bytes.attr); 368 } 369 370 /* 371 * Some architectures will have custom drivers to do this, and 372 * will not need to do it from userspace. The fake hot-add code 373 * as well as ppc64 will do all of their discovery in userspace 374 * and will require this interface. 375 */ 376 #ifdef CONFIG_ARCH_MEMORY_PROBE 377 static ssize_t 378 memory_probe_store(struct class *class, struct class_attribute *attr, 379 const char *buf, size_t count) 380 { 381 u64 phys_addr; 382 int nid; 383 int i, ret; 384 385 phys_addr = simple_strtoull(buf, NULL, 0); 386 387 for (i = 0; i < sections_per_block; i++) { 388 nid = memory_add_physaddr_to_nid(phys_addr); 389 ret = add_memory(nid, phys_addr, 390 PAGES_PER_SECTION << PAGE_SHIFT); 391 if (ret) 392 break; 393 394 phys_addr += MIN_MEMORY_BLOCK_SIZE; 395 } 396 397 if (ret) 398 count = ret; 399 400 return count; 401 } 402 static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store); 403 404 static int memory_probe_init(void) 405 { 406 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 407 &class_attr_probe.attr); 408 } 409 #else 410 static inline int memory_probe_init(void) 411 { 412 return 0; 413 } 414 #endif 415 416 #ifdef CONFIG_MEMORY_FAILURE 417 /* 418 * Support for offlining pages of memory 419 */ 420 421 /* Soft offline a page */ 422 static ssize_t 423 store_soft_offline_page(struct class *class, 424 struct class_attribute *attr, 425 const char *buf, size_t count) 426 { 427 int ret; 428 u64 pfn; 429 if (!capable(CAP_SYS_ADMIN)) 430 return -EPERM; 431 if (strict_strtoull(buf, 0, &pfn) < 0) 432 return -EINVAL; 433 pfn >>= PAGE_SHIFT; 434 if (!pfn_valid(pfn)) 435 return -ENXIO; 436 ret = soft_offline_page(pfn_to_page(pfn), 0); 437 return ret == 0 ? count : ret; 438 } 439 440 /* Forcibly offline a page, including killing processes. */ 441 static ssize_t 442 store_hard_offline_page(struct class *class, 443 struct class_attribute *attr, 444 const char *buf, size_t count) 445 { 446 int ret; 447 u64 pfn; 448 if (!capable(CAP_SYS_ADMIN)) 449 return -EPERM; 450 if (strict_strtoull(buf, 0, &pfn) < 0) 451 return -EINVAL; 452 pfn >>= PAGE_SHIFT; 453 ret = __memory_failure(pfn, 0, 0); 454 return ret ? ret : count; 455 } 456 457 static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); 458 static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); 459 460 static __init int memory_fail_init(void) 461 { 462 int err; 463 464 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 465 &class_attr_soft_offline_page.attr); 466 if (!err) 467 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 468 &class_attr_hard_offline_page.attr); 469 return err; 470 } 471 #else 472 static inline int memory_fail_init(void) 473 { 474 return 0; 475 } 476 #endif 477 478 /* 479 * Note that phys_device is optional. It is here to allow for 480 * differentiation between which *physical* devices each 481 * section belongs to... 482 */ 483 int __weak arch_get_memory_phys_device(unsigned long start_pfn) 484 { 485 return 0; 486 } 487 488 struct memory_block *find_memory_block_hinted(struct mem_section *section, 489 struct memory_block *hint) 490 { 491 struct kobject *kobj; 492 struct sys_device *sysdev; 493 struct memory_block *mem; 494 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; 495 int block_id = base_memory_block_id(__section_nr(section)); 496 497 kobj = hint ? &hint->sysdev.kobj : NULL; 498 499 /* 500 * This only works because we know that section == sysdev->id 501 * slightly redundant with sysdev_register() 502 */ 503 sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id); 504 505 kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj); 506 if (!kobj) 507 return NULL; 508 509 sysdev = container_of(kobj, struct sys_device, kobj); 510 mem = container_of(sysdev, struct memory_block, sysdev); 511 512 return mem; 513 } 514 515 /* 516 * For now, we have a linear search to go find the appropriate 517 * memory_block corresponding to a particular phys_index. If 518 * this gets to be a real problem, we can always use a radix 519 * tree or something here. 520 * 521 * This could be made generic for all sysdev classes. 522 */ 523 struct memory_block *find_memory_block(struct mem_section *section) 524 { 525 return find_memory_block_hinted(section, NULL); 526 } 527 528 static int init_memory_block(struct memory_block **memory, 529 struct mem_section *section, unsigned long state) 530 { 531 struct memory_block *mem; 532 unsigned long start_pfn; 533 int scn_nr; 534 int ret = 0; 535 536 mem = kzalloc(sizeof(*mem), GFP_KERNEL); 537 if (!mem) 538 return -ENOMEM; 539 540 scn_nr = __section_nr(section); 541 mem->start_section_nr = 542 base_memory_block_id(scn_nr) * sections_per_block; 543 mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; 544 mem->state = state; 545 mem->section_count++; 546 mutex_init(&mem->state_mutex); 547 start_pfn = section_nr_to_pfn(mem->start_section_nr); 548 mem->phys_device = arch_get_memory_phys_device(start_pfn); 549 550 ret = register_memory(mem); 551 if (!ret) 552 ret = mem_create_simple_file(mem, phys_index); 553 if (!ret) 554 ret = mem_create_simple_file(mem, end_phys_index); 555 if (!ret) 556 ret = mem_create_simple_file(mem, state); 557 if (!ret) 558 ret = mem_create_simple_file(mem, phys_device); 559 if (!ret) 560 ret = mem_create_simple_file(mem, removable); 561 562 *memory = mem; 563 return ret; 564 } 565 566 static int add_memory_section(int nid, struct mem_section *section, 567 unsigned long state, enum mem_add_context context) 568 { 569 struct memory_block *mem; 570 int ret = 0; 571 572 mutex_lock(&mem_sysfs_mutex); 573 574 mem = find_memory_block(section); 575 if (mem) { 576 mem->section_count++; 577 kobject_put(&mem->sysdev.kobj); 578 } else 579 ret = init_memory_block(&mem, section, state); 580 581 if (!ret) { 582 if (context == HOTPLUG && 583 mem->section_count == sections_per_block) 584 ret = register_mem_sect_under_node(mem, nid); 585 } 586 587 mutex_unlock(&mem_sysfs_mutex); 588 return ret; 589 } 590 591 int remove_memory_block(unsigned long node_id, struct mem_section *section, 592 int phys_device) 593 { 594 struct memory_block *mem; 595 596 mutex_lock(&mem_sysfs_mutex); 597 mem = find_memory_block(section); 598 unregister_mem_sect_under_nodes(mem, __section_nr(section)); 599 600 mem->section_count--; 601 if (mem->section_count == 0) { 602 mem_remove_simple_file(mem, phys_index); 603 mem_remove_simple_file(mem, end_phys_index); 604 mem_remove_simple_file(mem, state); 605 mem_remove_simple_file(mem, phys_device); 606 mem_remove_simple_file(mem, removable); 607 unregister_memory(mem); 608 kfree(mem); 609 } else 610 kobject_put(&mem->sysdev.kobj); 611 612 mutex_unlock(&mem_sysfs_mutex); 613 return 0; 614 } 615 616 /* 617 * need an interface for the VM to add new memory regions, 618 * but without onlining it. 619 */ 620 int register_new_memory(int nid, struct mem_section *section) 621 { 622 return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG); 623 } 624 625 int unregister_memory_section(struct mem_section *section) 626 { 627 if (!present_section(section)) 628 return -EINVAL; 629 630 return remove_memory_block(0, section, 0); 631 } 632 633 /* 634 * Initialize the sysfs support for memory devices... 635 */ 636 int __init memory_dev_init(void) 637 { 638 unsigned int i; 639 int ret; 640 int err; 641 unsigned long block_sz; 642 643 memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops; 644 ret = sysdev_class_register(&memory_sysdev_class); 645 if (ret) 646 goto out; 647 648 block_sz = get_memory_block_size(); 649 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 650 651 /* 652 * Create entries for memory sections that were found 653 * during boot and have been initialized 654 */ 655 for (i = 0; i < NR_MEM_SECTIONS; i++) { 656 if (!present_section_nr(i)) 657 continue; 658 err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE, 659 BOOT); 660 if (!ret) 661 ret = err; 662 } 663 664 err = memory_probe_init(); 665 if (!ret) 666 ret = err; 667 err = memory_fail_init(); 668 if (!ret) 669 ret = err; 670 err = block_size_init(); 671 if (!ret) 672 ret = err; 673 out: 674 if (ret) 675 printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 676 return ret; 677 } 678