/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for cleaning up the
	 * anon_vma if it is the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas: the count of all anon_vmas that have
	 * ->parent pointing to this one, including itself.
	 *
	 * This counter is used to decide whether to reuse an existing
	 * anon_vma instead of forking a new one. See the comments in
	 * anon_vma_clone().
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
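 *
 * An rmap walk (see rmap_walk() below) therefore locks the anon_vma and
 * iterates this interval tree to visit every VMA that may map a given folio.
 * A minimal sketch of that pattern (not verbatim; the real code lives in
 * mm/rmap.c, and "avc", "pgoff_start"/"pgoff_end" and the folio's anon_vma
 * are assumed to be set up by the caller):
 *
 *	anon_vma_lock_read(anon_vma);
 *	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
 *				       pgoff_start, pgoff_end) {
 *		struct vm_area_struct *vma = avc->vma;
 *
 *		... inspect the folio's mapping in this vma ...
 *	}
 *	anon_vma_unlock_read(anon_vma);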
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};

enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
	return down_write_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}

static inline void anon_vma_merge(struct vm_area_struct *vma,
		struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(const struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
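 *
 * Callers do not use the enum directly: the _pte/_ptes wrappers below pass
 * RMAP_LEVEL_PTE and the _pmd/_pud wrappers pass RMAP_LEVEL_PMD/RMAP_LEVEL_PUD,
 * so the __always_inline helpers can constant-fold the switch on the level.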
 */
enum rmap_level {
	RMAP_LEVEL_PTE = 0,
	RMAP_LEVEL_PMD,
	RMAP_LEVEL_PUD,
};

static inline void __folio_rmap_sanity_checks(const struct folio *folio,
		const struct page *page, int nr_pages, enum rmap_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/* When (un)mapping zeropages, we should never touch ref+mapcount. */
	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case RMAP_LEVEL_PTE:
		break;
	case RMAP_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when RMAP_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	case RMAP_LEVEL_PUD:
		/*
		 * Assume that we are creating a single "entire" mapping of the
		 * folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
		break;
	default:
		VM_WARN_ON_ONCE(true);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);
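
/*
 * Example: the anonymous fault path pairs anon_vma_prepare() with
 * folio_add_new_anon_rmap() before installing the PTE. This is only an
 * illustrative sketch of that pattern (see do_anonymous_page() in
 * mm/memory.c for the real thing); "addr", "pte" and "entry" are assumed
 * to be set up by the caller:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *	folio_add_lru_vma(folio, vma);
 *	set_pte_at(vma->vm_mm, addr, pte, entry);
 */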

/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
	atomic_dec(&folio->_large_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		enum rmap_level level)
{
	const int orig_nr_pages = nr_pages;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case RMAP_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		atomic_add(orig_nr_pages, &folio->_large_mapcount);
		break;
	case RMAP_LEVEL_PMD:
	case RMAP_LEVEL_PUD:
		atomic_inc(&folio->_entire_mapcount);
		atomic_inc(&folio->_large_mapcount);
		break;
	}
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio: The folio to duplicate the mappings of
 * @page: The first page to duplicate the mappings of
 * @nr_pages: The number of pages of which the mapping will be duplicated
 * @dst_vma: The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
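 *
 * A typical user is the fork() path when it decides it can share file-backed
 * PTEs with the child instead of copying the pages. A minimal sketch of that
 * pattern (see copy_present_ptes() in mm/memory.c for the real code; "pte",
 * "addr" and "dst_pte" are assumed to be prepared by the caller):
 *
 *	folio_ref_add(folio, nr_pages);
 *	folio_dup_file_rmap_ptes(folio, page, nr_pages, dst_vma);
 *	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr_pages);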
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
}

static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio: The folio to duplicate the mapping of
 * @page: The first page to duplicate the mapping of
 * @dst_vma: The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}

static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, enum rmap_level level)
{
	const int orig_nr_pages = nr_pages;
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process,
	 * don't allow the mappings to be duplicated; instead, require that
	 * e.g., the subpage is copied immediately for the child so that we'll
	 * always guarantee the pinned folio won't be randomly replaced in the
	 * future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fall back to
	 * copying if the folio may be pinned.
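	 *
	 * For example, if the parent pinned a subpage via FOLL_PIN and that
	 * subpage is still PageAnonExclusive, returning -EBUSY here makes
	 * fork() copy the page for the child instead, so the pin keeps
	 * targeting the parent's copy.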
	 */
	switch (level) {
	case RMAP_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}

		if (!folio_test_large(folio)) {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		atomic_add(orig_nr_pages, &folio->_large_mapcount);
		break;
	case RMAP_LEVEL_PMD:
	case RMAP_LEVEL_PUD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		atomic_inc(&folio->_large_mapcount);
		break;
	}
	return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio: The folio to duplicate the mappings of
 * @page: The first page to duplicate the mappings of
 * @nr_pages: The number of pages of which the mapping will be duplicated
 * @dst_vma: The destination vm area
 * @src_vma: The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call has
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
					 src_vma, RMAP_LEVEL_PTE);
}

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
					 RMAP_LEVEL_PTE);
}
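
/*
 * Example: fork() uses this when copying anonymous PTEs; on failure the caller
 * falls back to copying the page itself. A minimal sketch of that pattern (the
 * real code lives in copy_present_ptes() in mm/memory.c; "pte", "addr",
 * "src_pte" and "dst_pte" are assumed to be set up by the caller):
 *
 *	if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma)))
 *		return -EAGAIN;		... fall back to copying the page ...
 *	if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
 *		ptep_set_wrprotect(src_vma->vm_mm, addr, src_pte);
 *		pte = pte_wrprotect(pte);
 *	}
 *	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
 */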

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio: The folio to duplicate the mapping of
 * @page: The first page to duplicate the mapping of
 * @dst_vma: The destination vm area
 * @src_vma: The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call has
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
					 src_vma, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, the
	 * page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio: The folio to share a mapping of
 * @page: The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(); however, it is not used
 * during fork() to duplicate mappings, but instead to prepare for KSM or for
 * temporarily unmapping parts of a folio (swap, migration) via
 * folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}
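
/*
 * Example: migration and swap-out clear the PTE first, then try to mark the
 * page possibly shared, and restore the PTE if that fails. A minimal sketch
 * of the pattern used in try_to_migrate_one() in mm/rmap.c ("pvmw", "subpage",
 * "pteval" and "anon_exclusive" are assumed to be set up by the surrounding
 * walk):
 *
 *	pteval = ptep_clear_flush(vma, address, pvmw.pte);
 *	if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) {
 *		set_pte_at(mm, address, pvmw.pte, pteval);
 *		page_vma_mapped_walk_done(&pvmw);
 *		return false;
 *	}
 *	... install the migration entry and folio_remove_rmap_pte() ...
 */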

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio: The folio to share the mapping of
 * @page: The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(); however, it is not used
 * during fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

	if (likely(pvmw->ptl))
		spin_unlock(pvmw->ptl);
	else
		WARN_ON_ONCE(1);

	pvmw->ptl = NULL;
	pvmw->pmd = NULL;
	pvmw->pte = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
		const struct page *, const struct vm_area_struct *);
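
/*
 * Example: a minimal sketch of how an rmap_one() handler typically visits
 * every PTE/PMD mapping a folio within one VMA (the real users live in
 * mm/rmap.c; "vma" and "address" come from the rmap walk):
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		... pvmw.pte or pvmw.pmd is mapped and pvmw.ptl is held ...
 *		... inspect or modify this mapping of the folio ...
 *	}
 */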

/*
 * Cleans the PTEs of shared mappings (and since clean PTEs should also be
 * read-only, write-protects them too).
 *
 * Returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
			    unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

enum rmp_flags {
	RMP_LOCKED		= 1 << 0,
	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
};

void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicates the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where the page is mapped
 * done: for checking the traversal termination condition
 * anon_lock: for getting the anon_vma lock in an optimized way rather than
 *            the default
 * invalid_vma: for skipping uninteresting vmas
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(const struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
					  struct rmap_walk_control *rwc);

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

#endif	/* _LINUX_RMAP_H */