1 /* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7 #include <linux/stddef.h> 8 #include <linux/mm.h> 9 #include <linux/swap.h> 10 #include <linux/interrupt.h> 11 #include <linux/pagemap.h> 12 #include <linux/compiler.h> 13 #include <linux/export.h> 14 #include <linux/pagevec.h> 15 #include <linux/writeback.h> 16 #include <linux/slab.h> 17 #include <linux/sysctl.h> 18 #include <linux/cpu.h> 19 #include <linux/memory.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/highmem.h> 22 #include <linux/vmalloc.h> 23 #include <linux/ioport.h> 24 #include <linux/delay.h> 25 #include <linux/migrate.h> 26 #include <linux/page-isolation.h> 27 #include <linux/pfn.h> 28 #include <linux/suspend.h> 29 #include <linux/mm_inline.h> 30 #include <linux/firmware-map.h> 31 #include <linux/stop_machine.h> 32 #include <linux/hugetlb.h> 33 #include <linux/memblock.h> 34 35 #include <asm/tlbflush.h> 36 37 #include "internal.h" 38 39 /* 40 * online_page_callback contains pointer to current page onlining function. 41 * Initially it is generic_online_page(). If it is required it could be 42 * changed by calling set_online_page_callback() for callback registration 43 * and restore_online_page_callback() for generic callback restore. 44 */ 45 46 static void generic_online_page(struct page *page); 47 48 static online_page_callback_t online_page_callback = generic_online_page; 49 50 DEFINE_MUTEX(mem_hotplug_mutex); 51 52 void lock_memory_hotplug(void) 53 { 54 mutex_lock(&mem_hotplug_mutex); 55 } 56 57 void unlock_memory_hotplug(void) 58 { 59 mutex_unlock(&mem_hotplug_mutex); 60 } 61 62 63 /* add this memory to iomem resource */ 64 static struct resource *register_memory_resource(u64 start, u64 size) 65 { 66 struct resource *res; 67 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 68 BUG_ON(!res); 69 70 res->name = "System RAM"; 71 res->start = start; 72 res->end = start + size - 1; 73 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 74 if (request_resource(&iomem_resource, res) < 0) { 75 pr_debug("System RAM resource %pR cannot be added\n", res); 76 kfree(res); 77 res = NULL; 78 } 79 return res; 80 } 81 82 static void release_memory_resource(struct resource *res) 83 { 84 if (!res) 85 return; 86 release_resource(res); 87 kfree(res); 88 return; 89 } 90 91 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 92 void get_page_bootmem(unsigned long info, struct page *page, 93 unsigned long type) 94 { 95 page->lru.next = (struct list_head *) type; 96 SetPagePrivate(page); 97 set_page_private(page, info); 98 atomic_inc(&page->_count); 99 } 100 101 void put_page_bootmem(struct page *page) 102 { 103 unsigned long type; 104 105 type = (unsigned long) page->lru.next; 106 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 107 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 108 109 if (atomic_dec_return(&page->_count) == 1) { 110 ClearPagePrivate(page); 111 set_page_private(page, 0); 112 INIT_LIST_HEAD(&page->lru); 113 free_reserved_page(page); 114 } 115 } 116 117 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 118 #ifndef CONFIG_SPARSEMEM_VMEMMAP 119 static void register_page_bootmem_info_section(unsigned long start_pfn) 120 { 121 unsigned long *usemap, mapsize, section_nr, i; 122 struct mem_section *ms; 123 struct page *page, *memmap; 124 125 section_nr = pfn_to_section_nr(start_pfn); 126 ms = __nr_to_section(section_nr); 127 128 /* Get section's memmap address */ 129 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 130 131 /* 132 * Get page for the memmap's phys address 133 * XXX: need more consideration for sparse_vmemmap... 134 */ 135 page = virt_to_page(memmap); 136 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 137 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 138 139 /* remember memmap's page */ 140 for (i = 0; i < mapsize; i++, page++) 141 get_page_bootmem(section_nr, page, SECTION_INFO); 142 143 usemap = __nr_to_section(section_nr)->pageblock_flags; 144 page = virt_to_page(usemap); 145 146 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 147 148 for (i = 0; i < mapsize; i++, page++) 149 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 150 151 } 152 #else /* CONFIG_SPARSEMEM_VMEMMAP */ 153 static void register_page_bootmem_info_section(unsigned long start_pfn) 154 { 155 unsigned long *usemap, mapsize, section_nr, i; 156 struct mem_section *ms; 157 struct page *page, *memmap; 158 159 if (!pfn_valid(start_pfn)) 160 return; 161 162 section_nr = pfn_to_section_nr(start_pfn); 163 ms = __nr_to_section(section_nr); 164 165 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 166 167 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 168 169 usemap = __nr_to_section(section_nr)->pageblock_flags; 170 page = virt_to_page(usemap); 171 172 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 173 174 for (i = 0; i < mapsize; i++, page++) 175 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 176 } 177 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 178 179 void register_page_bootmem_info_node(struct pglist_data *pgdat) 180 { 181 unsigned long i, pfn, end_pfn, nr_pages; 182 int node = pgdat->node_id; 183 struct page *page; 184 struct zone *zone; 185 186 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 187 page = virt_to_page(pgdat); 188 189 for (i = 0; i < nr_pages; i++, page++) 190 get_page_bootmem(node, page, NODE_INFO); 191 192 zone = &pgdat->node_zones[0]; 193 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 194 if (zone_is_initialized(zone)) { 195 nr_pages = zone->wait_table_hash_nr_entries 196 * sizeof(wait_queue_head_t); 197 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 198 page = virt_to_page(zone->wait_table); 199 200 for (i = 0; i < nr_pages; i++, page++) 201 get_page_bootmem(node, page, NODE_INFO); 202 } 203 } 204 205 pfn = pgdat->node_start_pfn; 206 end_pfn = pgdat_end_pfn(pgdat); 207 208 /* register section info */ 209 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 210 /* 211 * Some platforms can assign the same pfn to multiple nodes - on 212 * node0 as well as nodeN. To avoid registering a pfn against 213 * multiple nodes we check that this pfn does not already 214 * reside in some other nodes. 215 */ 216 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) 217 register_page_bootmem_info_section(pfn); 218 } 219 } 220 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 221 222 static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 223 unsigned long end_pfn) 224 { 225 unsigned long old_zone_end_pfn; 226 227 zone_span_writelock(zone); 228 229 old_zone_end_pfn = zone_end_pfn(zone); 230 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 231 zone->zone_start_pfn = start_pfn; 232 233 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 234 zone->zone_start_pfn; 235 236 zone_span_writeunlock(zone); 237 } 238 239 static void resize_zone(struct zone *zone, unsigned long start_pfn, 240 unsigned long end_pfn) 241 { 242 zone_span_writelock(zone); 243 244 if (end_pfn - start_pfn) { 245 zone->zone_start_pfn = start_pfn; 246 zone->spanned_pages = end_pfn - start_pfn; 247 } else { 248 /* 249 * make it consist as free_area_init_core(), 250 * if spanned_pages = 0, then keep start_pfn = 0 251 */ 252 zone->zone_start_pfn = 0; 253 zone->spanned_pages = 0; 254 } 255 256 zone_span_writeunlock(zone); 257 } 258 259 static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 260 unsigned long end_pfn) 261 { 262 enum zone_type zid = zone_idx(zone); 263 int nid = zone->zone_pgdat->node_id; 264 unsigned long pfn; 265 266 for (pfn = start_pfn; pfn < end_pfn; pfn++) 267 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 268 } 269 270 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 271 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 272 static int __ref ensure_zone_is_initialized(struct zone *zone, 273 unsigned long start_pfn, unsigned long num_pages) 274 { 275 if (!zone_is_initialized(zone)) 276 return init_currently_empty_zone(zone, start_pfn, num_pages, 277 MEMMAP_HOTPLUG); 278 return 0; 279 } 280 281 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 282 unsigned long start_pfn, unsigned long end_pfn) 283 { 284 int ret; 285 unsigned long flags; 286 unsigned long z1_start_pfn; 287 288 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 289 if (ret) 290 return ret; 291 292 pgdat_resize_lock(z1->zone_pgdat, &flags); 293 294 /* can't move pfns which are higher than @z2 */ 295 if (end_pfn > zone_end_pfn(z2)) 296 goto out_fail; 297 /* the move out part must be at the left most of @z2 */ 298 if (start_pfn > z2->zone_start_pfn) 299 goto out_fail; 300 /* must included/overlap */ 301 if (end_pfn <= z2->zone_start_pfn) 302 goto out_fail; 303 304 /* use start_pfn for z1's start_pfn if z1 is empty */ 305 if (!zone_is_empty(z1)) 306 z1_start_pfn = z1->zone_start_pfn; 307 else 308 z1_start_pfn = start_pfn; 309 310 resize_zone(z1, z1_start_pfn, end_pfn); 311 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 312 313 pgdat_resize_unlock(z1->zone_pgdat, &flags); 314 315 fix_zone_id(z1, start_pfn, end_pfn); 316 317 return 0; 318 out_fail: 319 pgdat_resize_unlock(z1->zone_pgdat, &flags); 320 return -1; 321 } 322 323 static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 324 unsigned long start_pfn, unsigned long end_pfn) 325 { 326 int ret; 327 unsigned long flags; 328 unsigned long z2_end_pfn; 329 330 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 331 if (ret) 332 return ret; 333 334 pgdat_resize_lock(z1->zone_pgdat, &flags); 335 336 /* can't move pfns which are lower than @z1 */ 337 if (z1->zone_start_pfn > start_pfn) 338 goto out_fail; 339 /* the move out part mast at the right most of @z1 */ 340 if (zone_end_pfn(z1) > end_pfn) 341 goto out_fail; 342 /* must included/overlap */ 343 if (start_pfn >= zone_end_pfn(z1)) 344 goto out_fail; 345 346 /* use end_pfn for z2's end_pfn if z2 is empty */ 347 if (!zone_is_empty(z2)) 348 z2_end_pfn = zone_end_pfn(z2); 349 else 350 z2_end_pfn = end_pfn; 351 352 resize_zone(z1, z1->zone_start_pfn, start_pfn); 353 resize_zone(z2, start_pfn, z2_end_pfn); 354 355 pgdat_resize_unlock(z1->zone_pgdat, &flags); 356 357 fix_zone_id(z2, start_pfn, end_pfn); 358 359 return 0; 360 out_fail: 361 pgdat_resize_unlock(z1->zone_pgdat, &flags); 362 return -1; 363 } 364 365 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 366 unsigned long end_pfn) 367 { 368 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 369 370 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 371 pgdat->node_start_pfn = start_pfn; 372 373 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 374 pgdat->node_start_pfn; 375 } 376 377 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 378 { 379 struct pglist_data *pgdat = zone->zone_pgdat; 380 int nr_pages = PAGES_PER_SECTION; 381 int nid = pgdat->node_id; 382 int zone_type; 383 unsigned long flags; 384 int ret; 385 386 zone_type = zone - pgdat->node_zones; 387 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 388 if (ret) 389 return ret; 390 391 pgdat_resize_lock(zone->zone_pgdat, &flags); 392 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 393 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 394 phys_start_pfn + nr_pages); 395 pgdat_resize_unlock(zone->zone_pgdat, &flags); 396 memmap_init_zone(nr_pages, nid, zone_type, 397 phys_start_pfn, MEMMAP_HOTPLUG); 398 return 0; 399 } 400 401 static int __meminit __add_section(int nid, struct zone *zone, 402 unsigned long phys_start_pfn) 403 { 404 int ret; 405 406 if (pfn_valid(phys_start_pfn)) 407 return -EEXIST; 408 409 ret = sparse_add_one_section(zone, phys_start_pfn); 410 411 if (ret < 0) 412 return ret; 413 414 ret = __add_zone(zone, phys_start_pfn); 415 416 if (ret < 0) 417 return ret; 418 419 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 420 } 421 422 /* 423 * Reasonably generic function for adding memory. It is 424 * expected that archs that support memory hotplug will 425 * call this function after deciding the zone to which to 426 * add the new pages. 427 */ 428 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 429 unsigned long nr_pages) 430 { 431 unsigned long i; 432 int err = 0; 433 int start_sec, end_sec; 434 /* during initialize mem_map, align hot-added range to section */ 435 start_sec = pfn_to_section_nr(phys_start_pfn); 436 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 437 438 for (i = start_sec; i <= end_sec; i++) { 439 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); 440 441 /* 442 * EEXIST is finally dealt with by ioresource collision 443 * check. see add_memory() => register_memory_resource() 444 * Warning will be printed if there is collision. 445 */ 446 if (err && (err != -EEXIST)) 447 break; 448 err = 0; 449 } 450 451 return err; 452 } 453 EXPORT_SYMBOL_GPL(__add_pages); 454 455 #ifdef CONFIG_MEMORY_HOTREMOVE 456 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 457 static int find_smallest_section_pfn(int nid, struct zone *zone, 458 unsigned long start_pfn, 459 unsigned long end_pfn) 460 { 461 struct mem_section *ms; 462 463 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 464 ms = __pfn_to_section(start_pfn); 465 466 if (unlikely(!valid_section(ms))) 467 continue; 468 469 if (unlikely(pfn_to_nid(start_pfn) != nid)) 470 continue; 471 472 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 473 continue; 474 475 return start_pfn; 476 } 477 478 return 0; 479 } 480 481 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 482 static int find_biggest_section_pfn(int nid, struct zone *zone, 483 unsigned long start_pfn, 484 unsigned long end_pfn) 485 { 486 struct mem_section *ms; 487 unsigned long pfn; 488 489 /* pfn is the end pfn of a memory section. */ 490 pfn = end_pfn - 1; 491 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 492 ms = __pfn_to_section(pfn); 493 494 if (unlikely(!valid_section(ms))) 495 continue; 496 497 if (unlikely(pfn_to_nid(pfn) != nid)) 498 continue; 499 500 if (zone && zone != page_zone(pfn_to_page(pfn))) 501 continue; 502 503 return pfn; 504 } 505 506 return 0; 507 } 508 509 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 510 unsigned long end_pfn) 511 { 512 unsigned long zone_start_pfn = zone->zone_start_pfn; 513 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 514 unsigned long zone_end_pfn = z; 515 unsigned long pfn; 516 struct mem_section *ms; 517 int nid = zone_to_nid(zone); 518 519 zone_span_writelock(zone); 520 if (zone_start_pfn == start_pfn) { 521 /* 522 * If the section is smallest section in the zone, it need 523 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 524 * In this case, we find second smallest valid mem_section 525 * for shrinking zone. 526 */ 527 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 528 zone_end_pfn); 529 if (pfn) { 530 zone->zone_start_pfn = pfn; 531 zone->spanned_pages = zone_end_pfn - pfn; 532 } 533 } else if (zone_end_pfn == end_pfn) { 534 /* 535 * If the section is biggest section in the zone, it need 536 * shrink zone->spanned_pages. 537 * In this case, we find second biggest valid mem_section for 538 * shrinking zone. 539 */ 540 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 541 start_pfn); 542 if (pfn) 543 zone->spanned_pages = pfn - zone_start_pfn + 1; 544 } 545 546 /* 547 * The section is not biggest or smallest mem_section in the zone, it 548 * only creates a hole in the zone. So in this case, we need not 549 * change the zone. But perhaps, the zone has only hole data. Thus 550 * it check the zone has only hole or not. 551 */ 552 pfn = zone_start_pfn; 553 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { 554 ms = __pfn_to_section(pfn); 555 556 if (unlikely(!valid_section(ms))) 557 continue; 558 559 if (page_zone(pfn_to_page(pfn)) != zone) 560 continue; 561 562 /* If the section is current section, it continues the loop */ 563 if (start_pfn == pfn) 564 continue; 565 566 /* If we find valid section, we have nothing to do */ 567 zone_span_writeunlock(zone); 568 return; 569 } 570 571 /* The zone has no valid section */ 572 zone->zone_start_pfn = 0; 573 zone->spanned_pages = 0; 574 zone_span_writeunlock(zone); 575 } 576 577 static void shrink_pgdat_span(struct pglist_data *pgdat, 578 unsigned long start_pfn, unsigned long end_pfn) 579 { 580 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 581 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ 582 unsigned long pgdat_end_pfn = p; 583 unsigned long pfn; 584 struct mem_section *ms; 585 int nid = pgdat->node_id; 586 587 if (pgdat_start_pfn == start_pfn) { 588 /* 589 * If the section is smallest section in the pgdat, it need 590 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. 591 * In this case, we find second smallest valid mem_section 592 * for shrinking zone. 593 */ 594 pfn = find_smallest_section_pfn(nid, NULL, end_pfn, 595 pgdat_end_pfn); 596 if (pfn) { 597 pgdat->node_start_pfn = pfn; 598 pgdat->node_spanned_pages = pgdat_end_pfn - pfn; 599 } 600 } else if (pgdat_end_pfn == end_pfn) { 601 /* 602 * If the section is biggest section in the pgdat, it need 603 * shrink pgdat->node_spanned_pages. 604 * In this case, we find second biggest valid mem_section for 605 * shrinking zone. 606 */ 607 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, 608 start_pfn); 609 if (pfn) 610 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; 611 } 612 613 /* 614 * If the section is not biggest or smallest mem_section in the pgdat, 615 * it only creates a hole in the pgdat. So in this case, we need not 616 * change the pgdat. 617 * But perhaps, the pgdat has only hole data. Thus it check the pgdat 618 * has only hole or not. 619 */ 620 pfn = pgdat_start_pfn; 621 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { 622 ms = __pfn_to_section(pfn); 623 624 if (unlikely(!valid_section(ms))) 625 continue; 626 627 if (pfn_to_nid(pfn) != nid) 628 continue; 629 630 /* If the section is current section, it continues the loop */ 631 if (start_pfn == pfn) 632 continue; 633 634 /* If we find valid section, we have nothing to do */ 635 return; 636 } 637 638 /* The pgdat has no valid section */ 639 pgdat->node_start_pfn = 0; 640 pgdat->node_spanned_pages = 0; 641 } 642 643 static void __remove_zone(struct zone *zone, unsigned long start_pfn) 644 { 645 struct pglist_data *pgdat = zone->zone_pgdat; 646 int nr_pages = PAGES_PER_SECTION; 647 int zone_type; 648 unsigned long flags; 649 650 zone_type = zone - pgdat->node_zones; 651 652 pgdat_resize_lock(zone->zone_pgdat, &flags); 653 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 654 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); 655 pgdat_resize_unlock(zone->zone_pgdat, &flags); 656 } 657 658 static int __remove_section(struct zone *zone, struct mem_section *ms) 659 { 660 unsigned long start_pfn; 661 int scn_nr; 662 int ret = -EINVAL; 663 664 if (!valid_section(ms)) 665 return ret; 666 667 ret = unregister_memory_section(ms); 668 if (ret) 669 return ret; 670 671 scn_nr = __section_nr(ms); 672 start_pfn = section_nr_to_pfn(scn_nr); 673 __remove_zone(zone, start_pfn); 674 675 sparse_remove_one_section(zone, ms); 676 return 0; 677 } 678 679 /** 680 * __remove_pages() - remove sections of pages from a zone 681 * @zone: zone from which pages need to be removed 682 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 683 * @nr_pages: number of pages to remove (must be multiple of section size) 684 * 685 * Generic helper function to remove section mappings and sysfs entries 686 * for the section of the memory we are removing. Caller needs to make 687 * sure that pages are marked reserved and zones are adjust properly by 688 * calling offline_pages(). 689 */ 690 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 691 unsigned long nr_pages) 692 { 693 unsigned long i; 694 int sections_to_remove; 695 resource_size_t start, size; 696 int ret = 0; 697 698 /* 699 * We can only remove entire sections 700 */ 701 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 702 BUG_ON(nr_pages % PAGES_PER_SECTION); 703 704 start = phys_start_pfn << PAGE_SHIFT; 705 size = nr_pages * PAGE_SIZE; 706 ret = release_mem_region_adjustable(&iomem_resource, start, size); 707 if (ret) { 708 resource_size_t endres = start + size - 1; 709 710 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 711 &start, &endres, ret); 712 } 713 714 sections_to_remove = nr_pages / PAGES_PER_SECTION; 715 for (i = 0; i < sections_to_remove; i++) { 716 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 717 ret = __remove_section(zone, __pfn_to_section(pfn)); 718 if (ret) 719 break; 720 } 721 return ret; 722 } 723 EXPORT_SYMBOL_GPL(__remove_pages); 724 #endif /* CONFIG_MEMORY_HOTREMOVE */ 725 726 int set_online_page_callback(online_page_callback_t callback) 727 { 728 int rc = -EINVAL; 729 730 lock_memory_hotplug(); 731 732 if (online_page_callback == generic_online_page) { 733 online_page_callback = callback; 734 rc = 0; 735 } 736 737 unlock_memory_hotplug(); 738 739 return rc; 740 } 741 EXPORT_SYMBOL_GPL(set_online_page_callback); 742 743 int restore_online_page_callback(online_page_callback_t callback) 744 { 745 int rc = -EINVAL; 746 747 lock_memory_hotplug(); 748 749 if (online_page_callback == callback) { 750 online_page_callback = generic_online_page; 751 rc = 0; 752 } 753 754 unlock_memory_hotplug(); 755 756 return rc; 757 } 758 EXPORT_SYMBOL_GPL(restore_online_page_callback); 759 760 void __online_page_set_limits(struct page *page) 761 { 762 } 763 EXPORT_SYMBOL_GPL(__online_page_set_limits); 764 765 void __online_page_increment_counters(struct page *page) 766 { 767 adjust_managed_page_count(page, 1); 768 } 769 EXPORT_SYMBOL_GPL(__online_page_increment_counters); 770 771 void __online_page_free(struct page *page) 772 { 773 __free_reserved_page(page); 774 } 775 EXPORT_SYMBOL_GPL(__online_page_free); 776 777 static void generic_online_page(struct page *page) 778 { 779 __online_page_set_limits(page); 780 __online_page_increment_counters(page); 781 __online_page_free(page); 782 } 783 784 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 785 void *arg) 786 { 787 unsigned long i; 788 unsigned long onlined_pages = *(unsigned long *)arg; 789 struct page *page; 790 if (PageReserved(pfn_to_page(start_pfn))) 791 for (i = 0; i < nr_pages; i++) { 792 page = pfn_to_page(start_pfn + i); 793 (*online_page_callback)(page); 794 onlined_pages++; 795 } 796 *(unsigned long *)arg = onlined_pages; 797 return 0; 798 } 799 800 #ifdef CONFIG_MOVABLE_NODE 801 /* 802 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 803 * normal memory. 804 */ 805 static bool can_online_high_movable(struct zone *zone) 806 { 807 return true; 808 } 809 #else /* CONFIG_MOVABLE_NODE */ 810 /* ensure every online node has NORMAL memory */ 811 static bool can_online_high_movable(struct zone *zone) 812 { 813 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 814 } 815 #endif /* CONFIG_MOVABLE_NODE */ 816 817 /* check which state of node_states will be changed when online memory */ 818 static void node_states_check_changes_online(unsigned long nr_pages, 819 struct zone *zone, struct memory_notify *arg) 820 { 821 int nid = zone_to_nid(zone); 822 enum zone_type zone_last = ZONE_NORMAL; 823 824 /* 825 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 826 * contains nodes which have zones of 0...ZONE_NORMAL, 827 * set zone_last to ZONE_NORMAL. 828 * 829 * If we don't have HIGHMEM nor movable node, 830 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 831 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 832 */ 833 if (N_MEMORY == N_NORMAL_MEMORY) 834 zone_last = ZONE_MOVABLE; 835 836 /* 837 * if the memory to be online is in a zone of 0...zone_last, and 838 * the zones of 0...zone_last don't have memory before online, we will 839 * need to set the node to node_states[N_NORMAL_MEMORY] after 840 * the memory is online. 841 */ 842 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 843 arg->status_change_nid_normal = nid; 844 else 845 arg->status_change_nid_normal = -1; 846 847 #ifdef CONFIG_HIGHMEM 848 /* 849 * If we have movable node, node_states[N_HIGH_MEMORY] 850 * contains nodes which have zones of 0...ZONE_HIGHMEM, 851 * set zone_last to ZONE_HIGHMEM. 852 * 853 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 854 * contains nodes which have zones of 0...ZONE_MOVABLE, 855 * set zone_last to ZONE_MOVABLE. 856 */ 857 zone_last = ZONE_HIGHMEM; 858 if (N_MEMORY == N_HIGH_MEMORY) 859 zone_last = ZONE_MOVABLE; 860 861 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) 862 arg->status_change_nid_high = nid; 863 else 864 arg->status_change_nid_high = -1; 865 #else 866 arg->status_change_nid_high = arg->status_change_nid_normal; 867 #endif 868 869 /* 870 * if the node don't have memory befor online, we will need to 871 * set the node to node_states[N_MEMORY] after the memory 872 * is online. 873 */ 874 if (!node_state(nid, N_MEMORY)) 875 arg->status_change_nid = nid; 876 else 877 arg->status_change_nid = -1; 878 } 879 880 static void node_states_set_node(int node, struct memory_notify *arg) 881 { 882 if (arg->status_change_nid_normal >= 0) 883 node_set_state(node, N_NORMAL_MEMORY); 884 885 if (arg->status_change_nid_high >= 0) 886 node_set_state(node, N_HIGH_MEMORY); 887 888 node_set_state(node, N_MEMORY); 889 } 890 891 892 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 893 { 894 unsigned long flags; 895 unsigned long onlined_pages = 0; 896 struct zone *zone; 897 int need_zonelists_rebuild = 0; 898 int nid; 899 int ret; 900 struct memory_notify arg; 901 902 lock_memory_hotplug(); 903 /* 904 * This doesn't need a lock to do pfn_to_page(). 905 * The section can't be removed here because of the 906 * memory_block->state_mutex. 907 */ 908 zone = page_zone(pfn_to_page(pfn)); 909 910 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 911 !can_online_high_movable(zone)) { 912 unlock_memory_hotplug(); 913 return -EINVAL; 914 } 915 916 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 917 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 918 unlock_memory_hotplug(); 919 return -EINVAL; 920 } 921 } 922 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 923 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 924 unlock_memory_hotplug(); 925 return -EINVAL; 926 } 927 } 928 929 /* Previous code may changed the zone of the pfn range */ 930 zone = page_zone(pfn_to_page(pfn)); 931 932 arg.start_pfn = pfn; 933 arg.nr_pages = nr_pages; 934 node_states_check_changes_online(nr_pages, zone, &arg); 935 936 nid = pfn_to_nid(pfn); 937 938 ret = memory_notify(MEM_GOING_ONLINE, &arg); 939 ret = notifier_to_errno(ret); 940 if (ret) { 941 memory_notify(MEM_CANCEL_ONLINE, &arg); 942 unlock_memory_hotplug(); 943 return ret; 944 } 945 /* 946 * If this zone is not populated, then it is not in zonelist. 947 * This means the page allocator ignores this zone. 948 * So, zonelist must be updated after online. 949 */ 950 mutex_lock(&zonelists_mutex); 951 if (!populated_zone(zone)) { 952 need_zonelists_rebuild = 1; 953 build_all_zonelists(NULL, zone); 954 } 955 956 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 957 online_pages_range); 958 if (ret) { 959 if (need_zonelists_rebuild) 960 zone_pcp_reset(zone); 961 mutex_unlock(&zonelists_mutex); 962 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 963 (unsigned long long) pfn << PAGE_SHIFT, 964 (((unsigned long long) pfn + nr_pages) 965 << PAGE_SHIFT) - 1); 966 memory_notify(MEM_CANCEL_ONLINE, &arg); 967 unlock_memory_hotplug(); 968 return ret; 969 } 970 971 zone->present_pages += onlined_pages; 972 973 pgdat_resize_lock(zone->zone_pgdat, &flags); 974 zone->zone_pgdat->node_present_pages += onlined_pages; 975 pgdat_resize_unlock(zone->zone_pgdat, &flags); 976 977 if (onlined_pages) { 978 node_states_set_node(zone_to_nid(zone), &arg); 979 if (need_zonelists_rebuild) 980 build_all_zonelists(NULL, NULL); 981 else 982 zone_pcp_update(zone); 983 } 984 985 mutex_unlock(&zonelists_mutex); 986 987 init_per_zone_wmark_min(); 988 989 if (onlined_pages) 990 kswapd_run(zone_to_nid(zone)); 991 992 vm_total_pages = nr_free_pagecache_pages(); 993 994 writeback_set_ratelimit(); 995 996 if (onlined_pages) 997 memory_notify(MEM_ONLINE, &arg); 998 unlock_memory_hotplug(); 999 1000 return 0; 1001 } 1002 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1003 1004 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1005 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1006 { 1007 struct pglist_data *pgdat; 1008 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1009 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1010 unsigned long start_pfn = start >> PAGE_SHIFT; 1011 1012 pgdat = NODE_DATA(nid); 1013 if (!pgdat) { 1014 pgdat = arch_alloc_nodedata(nid); 1015 if (!pgdat) 1016 return NULL; 1017 1018 arch_refresh_nodedata(nid, pgdat); 1019 } 1020 1021 /* we can use NODE_DATA(nid) from here */ 1022 1023 /* init node's zones as empty zones, we don't have any present pages.*/ 1024 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 1025 1026 /* 1027 * The node we allocated has no zone fallback lists. For avoiding 1028 * to access not-initialized zonelist, build here. 1029 */ 1030 mutex_lock(&zonelists_mutex); 1031 build_all_zonelists(pgdat, NULL); 1032 mutex_unlock(&zonelists_mutex); 1033 1034 return pgdat; 1035 } 1036 1037 static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 1038 { 1039 arch_refresh_nodedata(nid, NULL); 1040 arch_free_nodedata(pgdat); 1041 return; 1042 } 1043 1044 1045 /** 1046 * try_online_node - online a node if offlined 1047 * 1048 * called by cpu_up() to online a node without onlined memory. 1049 */ 1050 int try_online_node(int nid) 1051 { 1052 pg_data_t *pgdat; 1053 int ret; 1054 1055 if (node_online(nid)) 1056 return 0; 1057 1058 lock_memory_hotplug(); 1059 pgdat = hotadd_new_pgdat(nid, 0); 1060 if (!pgdat) { 1061 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1062 ret = -ENOMEM; 1063 goto out; 1064 } 1065 node_set_online(nid); 1066 ret = register_one_node(nid); 1067 BUG_ON(ret); 1068 1069 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1070 mutex_lock(&zonelists_mutex); 1071 build_all_zonelists(NULL, NULL); 1072 mutex_unlock(&zonelists_mutex); 1073 } 1074 1075 out: 1076 unlock_memory_hotplug(); 1077 return ret; 1078 } 1079 1080 static int check_hotplug_memory_range(u64 start, u64 size) 1081 { 1082 u64 start_pfn = start >> PAGE_SHIFT; 1083 u64 nr_pages = size >> PAGE_SHIFT; 1084 1085 /* Memory range must be aligned with section */ 1086 if ((start_pfn & ~PAGE_SECTION_MASK) || 1087 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1088 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1089 (unsigned long long)start, 1090 (unsigned long long)size); 1091 return -EINVAL; 1092 } 1093 1094 return 0; 1095 } 1096 1097 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1098 int __ref add_memory(int nid, u64 start, u64 size) 1099 { 1100 pg_data_t *pgdat = NULL; 1101 bool new_pgdat; 1102 bool new_node; 1103 struct resource *res; 1104 int ret; 1105 1106 ret = check_hotplug_memory_range(start, size); 1107 if (ret) 1108 return ret; 1109 1110 lock_memory_hotplug(); 1111 1112 res = register_memory_resource(start, size); 1113 ret = -EEXIST; 1114 if (!res) 1115 goto out; 1116 1117 { /* Stupid hack to suppress address-never-null warning */ 1118 void *p = NODE_DATA(nid); 1119 new_pgdat = !p; 1120 } 1121 new_node = !node_online(nid); 1122 if (new_node) { 1123 pgdat = hotadd_new_pgdat(nid, start); 1124 ret = -ENOMEM; 1125 if (!pgdat) 1126 goto error; 1127 } 1128 1129 /* call arch's memory hotadd */ 1130 ret = arch_add_memory(nid, start, size); 1131 1132 if (ret < 0) 1133 goto error; 1134 1135 /* we online node here. we can't roll back from here. */ 1136 node_set_online(nid); 1137 1138 if (new_node) { 1139 ret = register_one_node(nid); 1140 /* 1141 * If sysfs file of new node can't create, cpu on the node 1142 * can't be hot-added. There is no rollback way now. 1143 * So, check by BUG_ON() to catch it reluctantly.. 1144 */ 1145 BUG_ON(ret); 1146 } 1147 1148 /* create new memmap entry */ 1149 firmware_map_add_hotplug(start, start + size, "System RAM"); 1150 1151 goto out; 1152 1153 error: 1154 /* rollback pgdat allocation and others */ 1155 if (new_pgdat) 1156 rollback_node_hotadd(nid, pgdat); 1157 release_memory_resource(res); 1158 1159 out: 1160 unlock_memory_hotplug(); 1161 return ret; 1162 } 1163 EXPORT_SYMBOL_GPL(add_memory); 1164 1165 #ifdef CONFIG_MEMORY_HOTREMOVE 1166 /* 1167 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1168 * set and the size of the free page is given by page_order(). Using this, 1169 * the function determines if the pageblock contains only free pages. 1170 * Due to buddy contraints, a free page at least the size of a pageblock will 1171 * be located at the start of the pageblock 1172 */ 1173 static inline int pageblock_free(struct page *page) 1174 { 1175 return PageBuddy(page) && page_order(page) >= pageblock_order; 1176 } 1177 1178 /* Return the start of the next active pageblock after a given page */ 1179 static struct page *next_active_pageblock(struct page *page) 1180 { 1181 /* Ensure the starting page is pageblock-aligned */ 1182 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 1183 1184 /* If the entire pageblock is free, move to the end of free page */ 1185 if (pageblock_free(page)) { 1186 int order; 1187 /* be careful. we don't have locks, page_order can be changed.*/ 1188 order = page_order(page); 1189 if ((order < MAX_ORDER) && (order >= pageblock_order)) 1190 return page + (1 << order); 1191 } 1192 1193 return page + pageblock_nr_pages; 1194 } 1195 1196 /* Checks if this range of memory is likely to be hot-removable. */ 1197 int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 1198 { 1199 struct page *page = pfn_to_page(start_pfn); 1200 struct page *end_page = page + nr_pages; 1201 1202 /* Check the starting page of each pageblock within the range */ 1203 for (; page < end_page; page = next_active_pageblock(page)) { 1204 if (!is_pageblock_removable_nolock(page)) 1205 return 0; 1206 cond_resched(); 1207 } 1208 1209 /* All pageblocks in the memory block are likely to be hot-removable */ 1210 return 1; 1211 } 1212 1213 /* 1214 * Confirm all pages in a range [start, end) is belongs to the same zone. 1215 */ 1216 static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 1217 { 1218 unsigned long pfn; 1219 struct zone *zone = NULL; 1220 struct page *page; 1221 int i; 1222 for (pfn = start_pfn; 1223 pfn < end_pfn; 1224 pfn += MAX_ORDER_NR_PAGES) { 1225 i = 0; 1226 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 1227 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) 1228 i++; 1229 if (i == MAX_ORDER_NR_PAGES) 1230 continue; 1231 page = pfn_to_page(pfn + i); 1232 if (zone && page_zone(page) != zone) 1233 return 0; 1234 zone = page_zone(page); 1235 } 1236 return 1; 1237 } 1238 1239 /* 1240 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages 1241 * and hugepages). We scan pfn because it's much easier than scanning over 1242 * linked list. This function returns the pfn of the first found movable 1243 * page if it's found, otherwise 0. 1244 */ 1245 static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1246 { 1247 unsigned long pfn; 1248 struct page *page; 1249 for (pfn = start; pfn < end; pfn++) { 1250 if (pfn_valid(pfn)) { 1251 page = pfn_to_page(pfn); 1252 if (PageLRU(page)) 1253 return pfn; 1254 if (PageHuge(page)) { 1255 if (is_hugepage_active(page)) 1256 return pfn; 1257 else 1258 pfn = round_up(pfn + 1, 1259 1 << compound_order(page)) - 1; 1260 } 1261 } 1262 } 1263 return 0; 1264 } 1265 1266 #define NR_OFFLINE_AT_ONCE_PAGES (256) 1267 static int 1268 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1269 { 1270 unsigned long pfn; 1271 struct page *page; 1272 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1273 int not_managed = 0; 1274 int ret = 0; 1275 LIST_HEAD(source); 1276 1277 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1278 if (!pfn_valid(pfn)) 1279 continue; 1280 page = pfn_to_page(pfn); 1281 1282 if (PageHuge(page)) { 1283 struct page *head = compound_head(page); 1284 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1285 if (compound_order(head) > PFN_SECTION_SHIFT) { 1286 ret = -EBUSY; 1287 break; 1288 } 1289 if (isolate_huge_page(page, &source)) 1290 move_pages -= 1 << compound_order(head); 1291 continue; 1292 } 1293 1294 if (!get_page_unless_zero(page)) 1295 continue; 1296 /* 1297 * We can skip free pages. And we can only deal with pages on 1298 * LRU. 1299 */ 1300 ret = isolate_lru_page(page); 1301 if (!ret) { /* Success */ 1302 put_page(page); 1303 list_add_tail(&page->lru, &source); 1304 move_pages--; 1305 inc_zone_page_state(page, NR_ISOLATED_ANON + 1306 page_is_file_cache(page)); 1307 1308 } else { 1309 #ifdef CONFIG_DEBUG_VM 1310 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 1311 pfn); 1312 dump_page(page, "failed to remove from LRU"); 1313 #endif 1314 put_page(page); 1315 /* Because we don't have big zone->lock. we should 1316 check this again here. */ 1317 if (page_count(page)) { 1318 not_managed++; 1319 ret = -EBUSY; 1320 break; 1321 } 1322 } 1323 } 1324 if (!list_empty(&source)) { 1325 if (not_managed) { 1326 putback_movable_pages(&source); 1327 goto out; 1328 } 1329 1330 /* 1331 * alloc_migrate_target should be improooooved!! 1332 * migrate_pages returns # of failed pages. 1333 */ 1334 ret = migrate_pages(&source, alloc_migrate_target, 0, 1335 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1336 if (ret) 1337 putback_movable_pages(&source); 1338 } 1339 out: 1340 return ret; 1341 } 1342 1343 /* 1344 * remove from free_area[] and mark all as Reserved. 1345 */ 1346 static int 1347 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1348 void *data) 1349 { 1350 __offline_isolated_pages(start, start + nr_pages); 1351 return 0; 1352 } 1353 1354 static void 1355 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1356 { 1357 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1358 offline_isolated_pages_cb); 1359 } 1360 1361 /* 1362 * Check all pages in range, recoreded as memory resource, are isolated. 1363 */ 1364 static int 1365 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1366 void *data) 1367 { 1368 int ret; 1369 long offlined = *(long *)data; 1370 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1371 offlined = nr_pages; 1372 if (!ret) 1373 *(long *)data += offlined; 1374 return ret; 1375 } 1376 1377 static long 1378 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1379 { 1380 long offlined = 0; 1381 int ret; 1382 1383 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1384 check_pages_isolated_cb); 1385 if (ret < 0) 1386 offlined = (long)ret; 1387 return offlined; 1388 } 1389 1390 #ifdef CONFIG_MOVABLE_NODE 1391 /* 1392 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1393 * normal memory. 1394 */ 1395 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1396 { 1397 return true; 1398 } 1399 #else /* CONFIG_MOVABLE_NODE */ 1400 /* ensure the node has NORMAL memory if it is still online */ 1401 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1402 { 1403 struct pglist_data *pgdat = zone->zone_pgdat; 1404 unsigned long present_pages = 0; 1405 enum zone_type zt; 1406 1407 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1408 present_pages += pgdat->node_zones[zt].present_pages; 1409 1410 if (present_pages > nr_pages) 1411 return true; 1412 1413 present_pages = 0; 1414 for (; zt <= ZONE_MOVABLE; zt++) 1415 present_pages += pgdat->node_zones[zt].present_pages; 1416 1417 /* 1418 * we can't offline the last normal memory until all 1419 * higher memory is offlined. 1420 */ 1421 return present_pages == 0; 1422 } 1423 #endif /* CONFIG_MOVABLE_NODE */ 1424 1425 static int __init cmdline_parse_movable_node(char *p) 1426 { 1427 #ifdef CONFIG_MOVABLE_NODE 1428 /* 1429 * Memory used by the kernel cannot be hot-removed because Linux 1430 * cannot migrate the kernel pages. When memory hotplug is 1431 * enabled, we should prevent memblock from allocating memory 1432 * for the kernel. 1433 * 1434 * ACPI SRAT records all hotpluggable memory ranges. But before 1435 * SRAT is parsed, we don't know about it. 1436 * 1437 * The kernel image is loaded into memory at very early time. We 1438 * cannot prevent this anyway. So on NUMA system, we set any 1439 * node the kernel resides in as un-hotpluggable. 1440 * 1441 * Since on modern servers, one node could have double-digit 1442 * gigabytes memory, we can assume the memory around the kernel 1443 * image is also un-hotpluggable. So before SRAT is parsed, just 1444 * allocate memory near the kernel image to try the best to keep 1445 * the kernel away from hotpluggable memory. 1446 */ 1447 memblock_set_bottom_up(true); 1448 movable_node_enabled = true; 1449 #else 1450 pr_warn("movable_node option not supported\n"); 1451 #endif 1452 return 0; 1453 } 1454 early_param("movable_node", cmdline_parse_movable_node); 1455 1456 /* check which state of node_states will be changed when offline memory */ 1457 static void node_states_check_changes_offline(unsigned long nr_pages, 1458 struct zone *zone, struct memory_notify *arg) 1459 { 1460 struct pglist_data *pgdat = zone->zone_pgdat; 1461 unsigned long present_pages = 0; 1462 enum zone_type zt, zone_last = ZONE_NORMAL; 1463 1464 /* 1465 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1466 * contains nodes which have zones of 0...ZONE_NORMAL, 1467 * set zone_last to ZONE_NORMAL. 1468 * 1469 * If we don't have HIGHMEM nor movable node, 1470 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 1471 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1472 */ 1473 if (N_MEMORY == N_NORMAL_MEMORY) 1474 zone_last = ZONE_MOVABLE; 1475 1476 /* 1477 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1478 * If the memory to be offline is in a zone of 0...zone_last, 1479 * and it is the last present memory, 0...zone_last will 1480 * become empty after offline , thus we can determind we will 1481 * need to clear the node from node_states[N_NORMAL_MEMORY]. 1482 */ 1483 for (zt = 0; zt <= zone_last; zt++) 1484 present_pages += pgdat->node_zones[zt].present_pages; 1485 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1486 arg->status_change_nid_normal = zone_to_nid(zone); 1487 else 1488 arg->status_change_nid_normal = -1; 1489 1490 #ifdef CONFIG_HIGHMEM 1491 /* 1492 * If we have movable node, node_states[N_HIGH_MEMORY] 1493 * contains nodes which have zones of 0...ZONE_HIGHMEM, 1494 * set zone_last to ZONE_HIGHMEM. 1495 * 1496 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1497 * contains nodes which have zones of 0...ZONE_MOVABLE, 1498 * set zone_last to ZONE_MOVABLE. 1499 */ 1500 zone_last = ZONE_HIGHMEM; 1501 if (N_MEMORY == N_HIGH_MEMORY) 1502 zone_last = ZONE_MOVABLE; 1503 1504 for (; zt <= zone_last; zt++) 1505 present_pages += pgdat->node_zones[zt].present_pages; 1506 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1507 arg->status_change_nid_high = zone_to_nid(zone); 1508 else 1509 arg->status_change_nid_high = -1; 1510 #else 1511 arg->status_change_nid_high = arg->status_change_nid_normal; 1512 #endif 1513 1514 /* 1515 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1516 */ 1517 zone_last = ZONE_MOVABLE; 1518 1519 /* 1520 * check whether node_states[N_HIGH_MEMORY] will be changed 1521 * If we try to offline the last present @nr_pages from the node, 1522 * we can determind we will need to clear the node from 1523 * node_states[N_HIGH_MEMORY]. 1524 */ 1525 for (; zt <= zone_last; zt++) 1526 present_pages += pgdat->node_zones[zt].present_pages; 1527 if (nr_pages >= present_pages) 1528 arg->status_change_nid = zone_to_nid(zone); 1529 else 1530 arg->status_change_nid = -1; 1531 } 1532 1533 static void node_states_clear_node(int node, struct memory_notify *arg) 1534 { 1535 if (arg->status_change_nid_normal >= 0) 1536 node_clear_state(node, N_NORMAL_MEMORY); 1537 1538 if ((N_MEMORY != N_NORMAL_MEMORY) && 1539 (arg->status_change_nid_high >= 0)) 1540 node_clear_state(node, N_HIGH_MEMORY); 1541 1542 if ((N_MEMORY != N_HIGH_MEMORY) && 1543 (arg->status_change_nid >= 0)) 1544 node_clear_state(node, N_MEMORY); 1545 } 1546 1547 static int __ref __offline_pages(unsigned long start_pfn, 1548 unsigned long end_pfn, unsigned long timeout) 1549 { 1550 unsigned long pfn, nr_pages, expire; 1551 long offlined_pages; 1552 int ret, drain, retry_max, node; 1553 unsigned long flags; 1554 struct zone *zone; 1555 struct memory_notify arg; 1556 1557 /* at least, alignment against pageblock is necessary */ 1558 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1559 return -EINVAL; 1560 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) 1561 return -EINVAL; 1562 /* This makes hotplug much easier...and readable. 1563 we assume this for now. .*/ 1564 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1565 return -EINVAL; 1566 1567 lock_memory_hotplug(); 1568 1569 zone = page_zone(pfn_to_page(start_pfn)); 1570 node = zone_to_nid(zone); 1571 nr_pages = end_pfn - start_pfn; 1572 1573 ret = -EINVAL; 1574 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1575 goto out; 1576 1577 /* set above range as isolated */ 1578 ret = start_isolate_page_range(start_pfn, end_pfn, 1579 MIGRATE_MOVABLE, true); 1580 if (ret) 1581 goto out; 1582 1583 arg.start_pfn = start_pfn; 1584 arg.nr_pages = nr_pages; 1585 node_states_check_changes_offline(nr_pages, zone, &arg); 1586 1587 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1588 ret = notifier_to_errno(ret); 1589 if (ret) 1590 goto failed_removal; 1591 1592 pfn = start_pfn; 1593 expire = jiffies + timeout; 1594 drain = 0; 1595 retry_max = 5; 1596 repeat: 1597 /* start memory hot removal */ 1598 ret = -EAGAIN; 1599 if (time_after(jiffies, expire)) 1600 goto failed_removal; 1601 ret = -EINTR; 1602 if (signal_pending(current)) 1603 goto failed_removal; 1604 ret = 0; 1605 if (drain) { 1606 lru_add_drain_all(); 1607 cond_resched(); 1608 drain_all_pages(); 1609 } 1610 1611 pfn = scan_movable_pages(start_pfn, end_pfn); 1612 if (pfn) { /* We have movable pages */ 1613 ret = do_migrate_range(pfn, end_pfn); 1614 if (!ret) { 1615 drain = 1; 1616 goto repeat; 1617 } else { 1618 if (ret < 0) 1619 if (--retry_max == 0) 1620 goto failed_removal; 1621 yield(); 1622 drain = 1; 1623 goto repeat; 1624 } 1625 } 1626 /* drain all zone's lru pagevec, this is asynchronous... */ 1627 lru_add_drain_all(); 1628 yield(); 1629 /* drain pcp pages, this is synchronous. */ 1630 drain_all_pages(); 1631 /* 1632 * dissolve free hugepages in the memory block before doing offlining 1633 * actually in order to make hugetlbfs's object counting consistent. 1634 */ 1635 dissolve_free_huge_pages(start_pfn, end_pfn); 1636 /* check again */ 1637 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1638 if (offlined_pages < 0) { 1639 ret = -EBUSY; 1640 goto failed_removal; 1641 } 1642 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1643 /* Ok, all of our target is isolated. 1644 We cannot do rollback at this point. */ 1645 offline_isolated_pages(start_pfn, end_pfn); 1646 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1647 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1648 /* removal success */ 1649 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); 1650 zone->present_pages -= offlined_pages; 1651 1652 pgdat_resize_lock(zone->zone_pgdat, &flags); 1653 zone->zone_pgdat->node_present_pages -= offlined_pages; 1654 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1655 1656 init_per_zone_wmark_min(); 1657 1658 if (!populated_zone(zone)) { 1659 zone_pcp_reset(zone); 1660 mutex_lock(&zonelists_mutex); 1661 build_all_zonelists(NULL, NULL); 1662 mutex_unlock(&zonelists_mutex); 1663 } else 1664 zone_pcp_update(zone); 1665 1666 node_states_clear_node(node, &arg); 1667 if (arg.status_change_nid >= 0) 1668 kswapd_stop(node); 1669 1670 vm_total_pages = nr_free_pagecache_pages(); 1671 writeback_set_ratelimit(); 1672 1673 memory_notify(MEM_OFFLINE, &arg); 1674 unlock_memory_hotplug(); 1675 return 0; 1676 1677 failed_removal: 1678 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", 1679 (unsigned long long) start_pfn << PAGE_SHIFT, 1680 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1681 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1682 /* pushback to free area */ 1683 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1684 1685 out: 1686 unlock_memory_hotplug(); 1687 return ret; 1688 } 1689 1690 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1691 { 1692 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1693 } 1694 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1695 1696 /** 1697 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1698 * @start_pfn: start pfn of the memory range 1699 * @end_pfn: end pfn of the memory range 1700 * @arg: argument passed to func 1701 * @func: callback for each memory section walked 1702 * 1703 * This function walks through all present mem sections in range 1704 * [start_pfn, end_pfn) and call func on each mem section. 1705 * 1706 * Returns the return value of func. 1707 */ 1708 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 1709 void *arg, int (*func)(struct memory_block *, void *)) 1710 { 1711 struct memory_block *mem = NULL; 1712 struct mem_section *section; 1713 unsigned long pfn, section_nr; 1714 int ret; 1715 1716 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1717 section_nr = pfn_to_section_nr(pfn); 1718 if (!present_section_nr(section_nr)) 1719 continue; 1720 1721 section = __nr_to_section(section_nr); 1722 /* same memblock? */ 1723 if (mem) 1724 if ((section_nr >= mem->start_section_nr) && 1725 (section_nr <= mem->end_section_nr)) 1726 continue; 1727 1728 mem = find_memory_block_hinted(section, mem); 1729 if (!mem) 1730 continue; 1731 1732 ret = func(mem, arg); 1733 if (ret) { 1734 kobject_put(&mem->dev.kobj); 1735 return ret; 1736 } 1737 } 1738 1739 if (mem) 1740 kobject_put(&mem->dev.kobj); 1741 1742 return 0; 1743 } 1744 1745 #ifdef CONFIG_MEMORY_HOTREMOVE 1746 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 1747 { 1748 int ret = !is_memblock_offlined(mem); 1749 1750 if (unlikely(ret)) { 1751 phys_addr_t beginpa, endpa; 1752 1753 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 1754 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 1755 pr_warn("removing memory fails, because memory " 1756 "[%pa-%pa] is onlined\n", 1757 &beginpa, &endpa); 1758 } 1759 1760 return ret; 1761 } 1762 1763 static int check_cpu_on_node(pg_data_t *pgdat) 1764 { 1765 int cpu; 1766 1767 for_each_present_cpu(cpu) { 1768 if (cpu_to_node(cpu) == pgdat->node_id) 1769 /* 1770 * the cpu on this node isn't removed, and we can't 1771 * offline this node. 1772 */ 1773 return -EBUSY; 1774 } 1775 1776 return 0; 1777 } 1778 1779 static void unmap_cpu_on_node(pg_data_t *pgdat) 1780 { 1781 #ifdef CONFIG_ACPI_NUMA 1782 int cpu; 1783 1784 for_each_possible_cpu(cpu) 1785 if (cpu_to_node(cpu) == pgdat->node_id) 1786 numa_clear_node(cpu); 1787 #endif 1788 } 1789 1790 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) 1791 { 1792 int ret; 1793 1794 ret = check_cpu_on_node(pgdat); 1795 if (ret) 1796 return ret; 1797 1798 /* 1799 * the node will be offlined when we come here, so we can clear 1800 * the cpu_to_node() now. 1801 */ 1802 1803 unmap_cpu_on_node(pgdat); 1804 return 0; 1805 } 1806 1807 /** 1808 * try_offline_node 1809 * 1810 * Offline a node if all memory sections and cpus of the node are removed. 1811 * 1812 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1813 * and online/offline operations before this call. 1814 */ 1815 void try_offline_node(int nid) 1816 { 1817 pg_data_t *pgdat = NODE_DATA(nid); 1818 unsigned long start_pfn = pgdat->node_start_pfn; 1819 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 1820 unsigned long pfn; 1821 struct page *pgdat_page = virt_to_page(pgdat); 1822 int i; 1823 1824 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1825 unsigned long section_nr = pfn_to_section_nr(pfn); 1826 1827 if (!present_section_nr(section_nr)) 1828 continue; 1829 1830 if (pfn_to_nid(pfn) != nid) 1831 continue; 1832 1833 /* 1834 * some memory sections of this node are not removed, and we 1835 * can't offline node now. 1836 */ 1837 return; 1838 } 1839 1840 if (check_and_unmap_cpu_on_node(pgdat)) 1841 return; 1842 1843 /* 1844 * all memory/cpu of this node are removed, we can offline this 1845 * node now. 1846 */ 1847 node_set_offline(nid); 1848 unregister_one_node(nid); 1849 1850 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) 1851 /* node data is allocated from boot memory */ 1852 return; 1853 1854 /* free waittable in each zone */ 1855 for (i = 0; i < MAX_NR_ZONES; i++) { 1856 struct zone *zone = pgdat->node_zones + i; 1857 1858 /* 1859 * wait_table may be allocated from boot memory, 1860 * here only free if it's allocated by vmalloc. 1861 */ 1862 if (is_vmalloc_addr(zone->wait_table)) 1863 vfree(zone->wait_table); 1864 } 1865 1866 /* 1867 * Since there is no way to guarentee the address of pgdat/zone is not 1868 * on stack of any kernel threads or used by other kernel objects 1869 * without reference counting or other symchronizing method, do not 1870 * reset node_data and free pgdat here. Just reset it to 0 and reuse 1871 * the memory when the node is online again. 1872 */ 1873 memset(pgdat, 0, sizeof(*pgdat)); 1874 } 1875 EXPORT_SYMBOL(try_offline_node); 1876 1877 /** 1878 * remove_memory 1879 * 1880 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1881 * and online/offline operations before this call, as required by 1882 * try_offline_node(). 1883 */ 1884 void __ref remove_memory(int nid, u64 start, u64 size) 1885 { 1886 int ret; 1887 1888 BUG_ON(check_hotplug_memory_range(start, size)); 1889 1890 lock_memory_hotplug(); 1891 1892 /* 1893 * All memory blocks must be offlined before removing memory. Check 1894 * whether all memory blocks in question are offline and trigger a BUG() 1895 * if this is not the case. 1896 */ 1897 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 1898 check_memblock_offlined_cb); 1899 if (ret) { 1900 unlock_memory_hotplug(); 1901 BUG(); 1902 } 1903 1904 /* remove memmap entry */ 1905 firmware_map_remove(start, start + size, "System RAM"); 1906 1907 arch_remove_memory(start, size); 1908 1909 try_offline_node(nid); 1910 1911 unlock_memory_hotplug(); 1912 } 1913 EXPORT_SYMBOL_GPL(remove_memory); 1914 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1915