/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for clearing up the
	 * anon_vma if they are the last user on release
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equal to the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used when deciding whether to reuse an anon_vma
	 * instead of forking a new one. See comments in function anon_vma_clone.
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};

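/*
 * Illustrative sketch (not part of this header's API): an anonymous folio
 * reaches its anon_vma through the tagged ->mapping pointer, conceptually:
 *
 *	if (folio_test_anon(folio)) {
 *		unsigned long mapping = (unsigned long)folio->mapping;
 *		struct anon_vma *av = (void *)(mapping - PAGE_MAPPING_ANON);
 *	}
 *
 * Real users should go through folio_get_anon_vma() or
 * folio_lock_anon_vma_read() declared below, which additionally handle the
 * refcount and the root rwsem.
 */
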
/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};

enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
	return down_write_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}

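/*
 * Illustrative usage sketch (hypothetical caller): rmap walkers typically
 * pin the anon_vma and then take the root rwsem for reading while they
 * traverse the interval tree:
 *
 *	struct anon_vma *anon_vma = folio_get_anon_vma(folio);
 *
 *	if (anon_vma) {
 *		anon_vma_lock_read(anon_vma);
 *		... walk anon_vma->rb_root ...
 *		anon_vma_unlock_read(anon_vma);
 *		put_anon_vma(anon_vma);
 *	}
 *
 * Note that all lock helpers above operate on anon_vma->root->rwsem, so
 * locking any anon_vma in a tree serializes against writers of the whole
 * tree.
 */
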
/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}

static inline void anon_vma_merge(struct vm_area_struct *vma,
		struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(const struct folio *folio);

#ifdef CONFIG_MM_ID
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
{
	bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
{
	__bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	return folio->_mm_id[idx] & MM_ID_MASK;
}

static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	folio->_mm_id[idx] &= ~MM_ID_MASK;
	folio->_mm_id[idx] |= id;
}

static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
		int diff, mm_id_t mm_id)
{
	VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
	VM_WARN_ON_ONCE(diff <= 0);
	VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);

	/*
	 * Make sure we can detect at least one complete PTE mapping of the
	 * folio in a single MM as "exclusively mapped". This is primarily
	 * a check on 32bit, where we currently reduce the size of the per-MM
	 * mapcount to a short.
	 */
	VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
	VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] < 0);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] < 0);
	VM_WARN_ON_ONCE(!folio_mapped(folio) &&
			folio_test_large_maybe_mapped_shared(folio));
}

static __always_inline void folio_set_large_mapcount(struct folio *folio,
		int mapcount, struct vm_area_struct *vma)
{
	__folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);

	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
	folio->_mm_id_mapcount[0] = mapcount - 1;
	folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
}

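/*
 * Illustrative sketch (hypothetical values): when a freshly allocated large
 * folio receives its very first mapping, the caller initializes the tracking
 * in one go, e.g. for a PMD mapping in a single MM:
 *
 *	folio_set_large_mapcount(folio, 1, vma);
 *
 * which leaves _large_mapcount at 0 ("mapcounts start at -1"), stores
 * vma->vm_mm->mm_id in slot 0 and sets that slot's per-MM mapcount to 0.
 * Mapping all pages of the folio by PTE instead would pass
 * folio_nr_pages(folio) as the mapcount.
 */
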
static __always_inline void folio_add_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * If a folio is mapped more than once into an MM on 32bit, we
	 * can in theory overflow the per-MM mapcount (although only for
	 * fairly large folios), turning it negative. In that case, just
	 * free up the slot and mark the folio "mapped shared", otherwise
	 * we might be in trouble when unmapping pages later.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
			folio->_mm_id_mapcount[0] = -1;
			folio_set_mm_id(folio, 0, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
			folio->_mm_id_mapcount[1] = -1;
			folio_set_mm_id(folio, 1, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 0, mm_id);
		folio->_mm_id_mapcount[0] = diff - 1;
		/* We might have other mappings already. */
		if (new_mapcount_val != diff - 1)
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	} else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 1, mm_id);
		folio->_mm_id_mapcount[1] = diff - 1;
		/* Slot 0 certainly has mappings as well. */
		folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	}
	folio_unlock_large_mapcount(folio);
}

static __always_inline void folio_sub_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * There are valid corner cases where we might underflow a per-MM
	 * mapcount (some mappings added when no slot was free, some mappings
	 * added once a slot was free), so we always set it to -1 once we go
	 * negative.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] -= diff;
		if (folio->_mm_id_mapcount[0] >= 0)
			goto out;
		folio->_mm_id_mapcount[0] = -1;
		folio_set_mm_id(folio, 0, MM_ID_DUMMY);
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] -= diff;
		if (folio->_mm_id_mapcount[1] >= 0)
			goto out;
		folio->_mm_id_mapcount[1] = -1;
		folio_set_mm_id(folio, 1, MM_ID_DUMMY);
	}

	/*
	 * If one MM slot owns all mappings, the folio is mapped exclusively.
	 * Note that if the folio is now unmapped (new_mapcount_val == -1), both
	 * slots must be free (mapcount == -1), and we'll also mark it as
	 * exclusive.
	 */
	if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
	    folio->_mm_id_mapcount[1] == new_mapcount_val)
		folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
out:
	folio_unlock_large_mapcount(folio);
}

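/*
 * Illustrative sketch (hypothetical sequence): assume a parent MM maps all
 * nr pages of a large folio by PTE. A fork-style copy duplicates the
 * mappings via
 *
 *	folio_add_large_mapcount(folio, nr, child_vma);
 *
 * which occupies the second MM slot and sets FOLIO_MM_IDS_SHARED_BIT. When
 * the parent later unmaps its nr pages,
 *
 *	folio_sub_large_mapcount(folio, nr, parent_vma);
 *
 * releases the parent's slot; the child's slot then covers all remaining
 * mappings, so the shared bit is cleared again and the folio is once more
 * detected as exclusively mapped (now by the child).
 */
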
#else /* !CONFIG_MM_ID */
/*
 * See __folio_rmap_sanity_checks(), we might map large folios even without
 * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
 */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
		struct vm_area_struct *vma)
{
	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
}

static inline void folio_add_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_add(diff, &folio->_large_mapcount);
}

static inline void folio_sub_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_sub(diff, &folio->_large_mapcount);
}
#endif /* CONFIG_MM_ID */

#define folio_inc_large_mapcount(folio, vma) \
	folio_add_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
	folio_sub_large_mapcount(folio, 1, vma)

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

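/*
 * Illustrative sketch (hypothetical fault paths): a freshly allocated
 * anonymous folio that is mapped writable into a single MM is marked
 * exclusive, whereas a page that might already be shared (e.g. after
 * swap-in) is added without special flags:
 *
 *	folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
 *	...
 *	folio_add_anon_rmap_pte(folio, page, vma, addr, RMAP_NONE);
 *
 * When in doubt, RMAP_NONE must be used and the page must not be mapped
 * writable.
 */
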
/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 */
enum rmap_level {
	RMAP_LEVEL_PTE = 0,
	RMAP_LEVEL_PMD,
	RMAP_LEVEL_PUD,
};

static inline void __folio_rmap_sanity_checks(const struct folio *folio,
		const struct page *page, int nr_pages, enum rmap_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/* When (un)mapping zeropages, we should never touch ref+mapcount. */
	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case RMAP_LEVEL_PTE:
		break;
	case RMAP_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when RMAP_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	case RMAP_LEVEL_PUD:
		/*
		 * Assume that we are creating a single "entire" mapping of the
		 * folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
		break;
	default:
		VM_WARN_ON_ONCE(true);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);

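/*
 * Illustrative sketch (hypothetical caller, page table lock held): the rmap
 * side of mapping and later unmapping one page of a file-backed folio:
 *
 *	folio_add_file_rmap_pte(folio, page, vma);
 *	set_pte_at(vma->vm_mm, addr, ptep, mk_pte(page, vma->vm_page_prot));
 *	...
 *	pte_clear(vma->vm_mm, addr, ptep);
 *	folio_remove_rmap_pte(folio, page, vma);
 *
 * These helpers only maintain mapcounts and statistics; the page table
 * updates and any TLB flushing remain the caller's responsibility.
 */
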
/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
	atomic_dec(&folio->_large_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		enum rmap_level level)
{
	const int orig_nr_pages = nr_pages;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case RMAP_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case RMAP_LEVEL_PMD:
	case RMAP_LEVEL_PUD:
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	}
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 * @dst_vma:	The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
}

static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
}

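/*
 * Illustrative sketch (fork-style copy path): when nr consecutive PTEs
 * mapping a file-backed folio are copied into a child MM, the rmap side is
 * duplicated in one go:
 *
 *	folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
 *
 * Unlike the anon variants below, duplicating file rmap cannot fail: there
 * is no anon-exclusive or pinning state to preserve.
 */
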
/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @dst_vma:	The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}

static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, enum rmap_level level)
{
	const int orig_nr_pages = nr_pages;
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process,
	 * don't allow duplicating the mappings but instead require the
	 * caller to e.g., copy the subpage immediately for the child so
	 * that we'll always guarantee the pinned folio won't be randomly
	 * replaced in the future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fall back to
	 * copying if the folio may be pinned.
	 */
	switch (level) {
	case RMAP_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}

		if (!folio_test_large(folio)) {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case RMAP_LEVEL_PMD:
	case RMAP_LEVEL_PUD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	}
	return 0;
}

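/*
 * Illustrative sketch (fork-style copy path, hypothetical fallback helper):
 * duplicating anon rmap can fail when the folio may be pinned, in which case
 * the caller must copy the page(s) for the child instead of sharing them:
 *
 *	if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr,
 *						  dst_vma, src_vma)))
 *		return copy_pages_for_child(...);
 *
 * where copy_pages_for_child() stands in for whatever fallback the caller
 * implements (allocate + copy, retry, ...). On success, the duplicated PTEs
 * must be (or be made) read-only in both MMs; see the kernel-doc below.
 */
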
/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 * @dst_vma:	The destination vm area
 * @src_vma:	The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
					 src_vma, RMAP_LEVEL_PTE);
}

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
					 RMAP_LEVEL_PTE);
}

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @dst_vma:	The destination vm area
 * @src_vma:	The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
					 src_vma, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

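/*
 * Illustrative sketch (hypothetical huge-page copy path): for a PMD-mapped
 * THP, a failed duplication typically means the copy falls back to splitting
 * the PMD in the source and retrying at PTE granularity:
 *
 *	if (unlikely(folio_try_dup_anon_rmap_pmd(folio, page,
 *						 dst_vma, src_vma))) {
 *		split_huge_pmd(src_vma, src_pmd, addr);
 *		return -EAGAIN;
 *	}
 *
 * The split-and-retry step is whatever fallback the caller implements; the
 * helper itself only refuses to share a possibly pinned folio.
 */
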
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, the
	 * page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio:	The folio to share a mapping of
 * @page:	The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio:	The folio to share the mapping of
 * @page:	The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

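/*
 * Illustrative sketch (hypothetical unmap/migration path): before replacing
 * the present PTE of an exclusive anon page with a swap or migration entry,
 * the caller clears the PTE and then tries to mark the page possibly shared;
 * if that fails, the PTE is restored and the page stays mapped:
 *
 *	pteval = ptep_get_and_clear(mm, addr, ptep);
 *	if (folio_try_share_anon_rmap_pte(folio, page)) {
 *		set_pte_at(mm, addr, ptep, pteval);
 *		return false;
 *	}
 *	... install the swap/migration entry, then folio_remove_rmap_pte() ...
 *
 * This loosely mirrors the (A1)-(A4) sequence described in
 * __folio_try_share_anon_rmap() above.
 */
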
/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

	if (likely(pvmw->ptl))
		spin_unlock(pvmw->ptl);
	else
		WARN_ON_ONCE(1);

	pvmw->ptl = NULL;
	pvmw->pmd = NULL;
	pvmw->pte = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
		const struct page *, const struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * Returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
			    unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

enum rmp_flags {
	RMP_LOCKED		= 1 << 0,
	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
};

void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking the traversal termination condition
 * anon_lock: for taking the anon_vma lock in an optimized way rather than
 *	      the default
 * invalid_vma: for skipping vmas that are not of interest
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(const struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
					  struct rmap_walk_control *rwc);

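/*
 * Illustrative sketch (hypothetical walker): a typical rmap walk combines
 * rmap_walk_control with page_vma_mapped_walk(), roughly:
 *
 *	static bool my_rmap_one(struct folio *folio, struct vm_area_struct *vma,
 *				unsigned long addr, void *arg)
 *	{
 *		DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
 *
 *		while (page_vma_mapped_walk(&pvmw)) {
 *			... inspect or modify the mapping at pvmw.pte
 *			    (or pvmw.pmd for PMD mappings) ...
 *		}
 *		return true;
 *	}
 *
 *	struct rmap_walk_control rwc = {
 *		.rmap_one	= my_rmap_one,
 *		.arg		= my_arg,
 *	};
 *	rmap_walk(folio, &rwc);
 *
 * my_rmap_one() and my_arg are hypothetical; see mm/rmap.c (e.g. the
 * folio_referenced() implementation) for real walkers.
 */
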
#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

#endif	/* _LINUX_RMAP_H */