/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/swapops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the folio is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
 * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
 * ram or swap backed folio.
 */
static inline int folio_is_file_lru(struct folio *folio)
{
	return !folio_test_swapbacked(folio);
}

static inline int page_is_file_lru(struct page *page)
{
	return folio_is_file_lru(page_folio(page));
}

static __always_inline void __update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				long nr_pages)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	lockdep_assert_held(&lruvec->lru_lock);
	WARN_ON_ONCE(nr_pages != (int)nr_pages);

	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
	__mod_zone_page_state(&pgdat->node_zones[zid],
				NR_ZONE_LRU_BASE + lru, nr_pages);
}

static __always_inline void update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				long nr_pages)
{
	__update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

/**
 * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
 * @folio: The folio that was on lru and now has a zero reference.
 */
static __always_inline void __folio_clear_lru_flags(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);

	__folio_clear_lru(folio);

	/* this shouldn't happen, so leave the flags to bad_page() */
	if (folio_test_active(folio) && folio_test_unevictable(folio))
		return;

	__folio_clear_active(folio);
	__folio_clear_unevictable(folio);
}

static __always_inline void __clear_page_lru_flags(struct page *page)
{
	__folio_clear_lru_flags(page_folio(page));
}

/**
 * folio_lru_list - Which LRU list should a folio be on?
 * @folio: The folio to test.
 *
 * Return: The LRU list a folio should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list folio_lru_list(struct folio *folio)
{
	enum lru_list lru;

	VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);

	if (folio_test_unevictable(folio))
		return LRU_UNEVICTABLE;

	lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
	if (folio_test_active(folio))
		lru += LRU_ACTIVE;

	return lru;
}
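
/*
 * Illustrative sketch only, assuming the usual enum lru_list layout
 * (inactive anon, active anon, inactive file, active file, unevictable);
 * the folio variables below are hypothetical. An active, swap backed folio
 * lands on the active anon list, an inactive page cache folio on the
 * inactive file list:
 *
 *	folio_lru_list(active_anon_folio);	returns LRU_ACTIVE_ANON
 *						(LRU_INACTIVE_ANON + LRU_ACTIVE)
 *	folio_lru_list(inactive_file_folio);	returns LRU_INACTIVE_FILE
 */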

#ifdef CONFIG_LRU_GEN

#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);

	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
}
#else
static inline bool lru_gen_enabled(void)
{
	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);

	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
#endif

static inline bool lru_gen_in_fault(void)
{
	return current->in_lru_fault;
}

static inline int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

static inline int lru_hist_from_seq(unsigned long seq)
{
	return seq % NR_HIST_GENS;
}

static inline int lru_tier_from_refs(int refs)
{
	VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));

	/* see the comment in folio_lru_refs() */
	return order_base_2(refs + 1);
}

static inline int folio_lru_refs(struct folio *folio)
{
	unsigned long flags = READ_ONCE(folio->flags);
	bool workingset = flags & BIT(PG_workingset);

	/*
	 * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
	 * total number of accesses is N>1, since N=0,1 both map to the first
	 * tier. lru_tier_from_refs() will account for this off-by-one. Also see
	 * the comment on MAX_NR_TIERS.
	 */
	return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}

static inline int folio_lru_gen(struct folio *folio)
{
	unsigned long flags = READ_ONCE(folio->flags);

	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}

static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
	unsigned long max_seq = lruvec->lrugen.max_seq;

	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);

	/* see the comment on MIN_NR_GENS */
	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
				       int old_gen, int new_gen)
{
	int type = folio_is_file_lru(folio);
	int zone = folio_zonenum(folio);
	int delta = folio_nr_pages(folio);
	enum lru_list lru = type * LRU_INACTIVE_FILE;
	struct lru_gen_struct *lrugen = &lruvec->lrugen;

	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
	VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);

	if (old_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
			   lrugen->nr_pages[old_gen][type][zone] - delta);
	if (new_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
			   lrugen->nr_pages[new_gen][type][zone] + delta);

	/* addition */
	if (old_gen < 0) {
		if (lru_gen_is_active(lruvec, new_gen))
			lru += LRU_ACTIVE;
		__update_lru_size(lruvec, lru, zone, delta);
		return;
	}

	/* deletion */
	if (new_gen < 0) {
		if (lru_gen_is_active(lruvec, old_gen))
			lru += LRU_ACTIVE;
		__update_lru_size(lruvec, lru, zone, -delta);
		return;
	}

	/* promotion */
	if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
		__update_lru_size(lruvec, lru, zone, -delta);
		__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
	}

	/* demotion requires isolation, e.g., lru_deactivate_fn() */
	VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	unsigned long seq;
	unsigned long flags;
	int gen = folio_lru_gen(folio);
	int type = folio_is_file_lru(folio);
	int zone = folio_zonenum(folio);
	struct lru_gen_struct *lrugen = &lruvec->lrugen;

	VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);

	if (folio_test_unevictable(folio) || !lrugen->enabled)
		return false;
	/*
	 * There are three common cases for this page:
	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
	 *    migrated, add it to the youngest generation.
	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
	 *    not in swapcache or a dirty page pending writeback, add it to the
	 *    second oldest generation.
	 * 3. Everything else (clean, cold) is added to the oldest generation.
	 */
	if (folio_test_active(folio))
		seq = lrugen->max_seq;
	else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
		 (folio_test_reclaim(folio) &&
		  (folio_test_dirty(folio) || folio_test_writeback(folio))))
		seq = lrugen->min_seq[type] + 1;
	else
		seq = lrugen->min_seq[type];

	gen = lru_gen_from_seq(seq);
	flags = (gen + 1UL) << LRU_GEN_PGOFF;
	/* see the comment on MIN_NR_GENS about PG_active */
	set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);

	lru_gen_update_size(lruvec, folio, -1, gen);
	/* for folio_rotate_reclaimable() */
	if (reclaiming)
		list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
	else
		list_add(&folio->lru, &lrugen->lists[gen][type][zone]);

	return true;
}

static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	unsigned long flags;
	int gen = folio_lru_gen(folio);

	if (gen < 0)
		return false;

	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);

	/* for folio_migrate_flags() */
	flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
	flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
	gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

	lru_gen_update_size(lruvec, folio, gen, -1);
	list_del(&folio->lru);

	return true;
}
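
/*
 * Worked example of the generation encoding used by lru_gen_add_folio() and
 * lru_gen_del_folio() above, assuming MAX_NR_GENS == 4 (an assumption made
 * for illustration only): the value stored in folio->flags is offset by one
 * so that 0 means "not on a multi-gen LRU list".
 *
 *	seq = 5;
 *	gen = lru_gen_from_seq(seq);		(5 % 4 == 1)
 *	flags = (gen + 1UL) << LRU_GEN_PGOFF;	(stores 2 in the LRU_GEN field)
 *	folio_lru_gen(folio);			(2 - 1 == 1 again)
 */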

#else /* !CONFIG_LRU_GEN */

static inline bool lru_gen_enabled(void)
{
	return false;
}

static inline bool lru_gen_in_fault(void)
{
	return false;
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	return false;
}

static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	return false;
}

#endif /* CONFIG_LRU_GEN */

static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
	enum lru_list lru = folio_lru_list(folio);

	if (lru_gen_add_folio(lruvec, folio, false))
		return;

	update_lru_size(lruvec, lru, folio_zonenum(folio),
			folio_nr_pages(folio));
	if (lru != LRU_UNEVICTABLE)
		list_add(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	lruvec_add_folio(lruvec, page_folio(page));
}

static __always_inline
void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
	enum lru_list lru = folio_lru_list(folio);

	if (lru_gen_add_folio(lruvec, folio, true))
		return;

	update_lru_size(lruvec, lru, folio_zonenum(folio),
			folio_nr_pages(folio));
	/* This is not expected to be used on LRU_UNEVICTABLE */
	list_add_tail(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list_tail(struct page *page,
				struct lruvec *lruvec)
{
	lruvec_add_folio_tail(lruvec, page_folio(page));
}

static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
	enum lru_list lru = folio_lru_list(folio);

	if (lru_gen_del_folio(lruvec, folio, false))
		return;

	if (lru != LRU_UNEVICTABLE)
		list_del(&folio->lru);
	update_lru_size(lruvec, lru, folio_zonenum(folio),
			-folio_nr_pages(folio));
}

static __always_inline void del_page_from_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	lruvec_del_folio(lruvec, page_folio(page));
}
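
/*
 * Illustrative sketch of a hypothetical caller, not lifted from actual
 * reclaim code: moving a folio between LRU lists is done by deleting it,
 * updating its flags and re-adding it, all while holding the lruvec lock
 * that __update_lru_size() asserts above.
 *
 *	spin_lock_irq(&lruvec->lru_lock);
 *	lruvec_del_folio(lruvec, folio);
 *	folio_clear_active(folio);
 *	lruvec_add_folio(lruvec, folio);
 *	spin_unlock_irq(&lruvec->lru_lock);
 */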

#ifdef CONFIG_ANON_VMA_NAME
/*
 * mmap_lock should be read-locked when calling anon_vma_name(). The caller
 * should either keep holding the lock while using the returned pointer, or
 * raise the anon_vma_name refcount before releasing the lock.
 */
extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
extern void anon_vma_name_free(struct kref *kref);

/* mmap_lock should be read-locked */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
	if (anon_name)
		kref_get(&anon_name->kref);
}

static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
{
	if (anon_name)
		kref_put(&anon_name->kref, anon_vma_name_free);
}

static inline
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
{
	/* Prevent anon_name refcount saturation early on */
	if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
		anon_vma_name_get(anon_name);
		return anon_name;
	}
	return anon_vma_name_alloc(anon_name->name);
}

static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
				     struct vm_area_struct *new_vma)
{
	struct anon_vma_name *anon_name = anon_vma_name(orig_vma);

	if (anon_name)
		new_vma->anon_name = anon_vma_name_reuse(anon_name);
}

static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
	/*
	 * Not using anon_vma_name() because it generates a warning if
	 * mmap_lock is not held, which might be the case here.
	 */
	if (!vma->vm_file)
		anon_vma_name_put(vma->anon_name);
}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
				    struct anon_vma_name *anon_name2)
{
	if (anon_name1 == anon_name2)
		return true;

	return anon_name1 && anon_name2 &&
	       !strcmp(anon_name1->name, anon_name2->name);
}

#else /* CONFIG_ANON_VMA_NAME */
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	return NULL;
}

static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	return NULL;
}

static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
				     struct vm_area_struct *new_vma) {}
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
				    struct anon_vma_name *anon_name2)
{
	return true;
}

#endif /* CONFIG_ANON_VMA_NAME */
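
/*
 * Illustrative sketch, not taken verbatim from any caller: keeping a VMA's
 * name around after dropping mmap_lock requires taking a reference first,
 * per the locking comment above. The get/put helpers tolerate a NULL name.
 *
 *	mmap_read_lock(mm);
 *	anon_name = anon_vma_name(vma);
 *	anon_vma_name_get(anon_name);
 *	mmap_read_unlock(mm);
 *
 *	... anon_name->name may be used here, if anon_name is non-NULL ...
 *
 *	anon_vma_name_put(anon_name);
 */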

static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
	atomic_set(&mm->tlb_flush_pending, 0);
}

static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
	atomic_inc(&mm->tlb_flush_pending);
	/*
	 * The only time this value is relevant is when there are indeed pages
	 * to flush. And we'll only flush pages after changing them, which
	 * requires the PTL.
	 *
	 * So the ordering here is:
	 *
	 *	atomic_inc(&mm->tlb_flush_pending);
	 *	spin_lock(&ptl);
	 *	...
	 *	set_pte_at();
	 *	spin_unlock(&ptl);
	 *
	 *				spin_lock(&ptl)
	 *				mm_tlb_flush_pending();
	 *				....
	 *				spin_unlock(&ptl);
	 *
	 *	flush_tlb_range();
	 *	atomic_dec(&mm->tlb_flush_pending);
	 *
	 * Because the increment is constrained by the PTL unlock, it ensures
	 * that the increment is visible if the PTE modification is visible.
	 * After all, if there is no PTE modification, nobody cares about TLB
	 * flushes either.
	 *
	 * This very much relies on users (mm_tlb_flush_pending() and
	 * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
	 * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
	 * locks (PPC) the unlock of one doesn't order against the lock of
	 * another PTL.
	 *
	 * The decrement is ordered by the flush_tlb_range(), such that
	 * mm_tlb_flush_pending() will not return false unless all flushes have
	 * completed.
	 */
}

static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
	/*
	 * See inc_tlb_flush_pending().
	 *
	 * This cannot be smp_mb__before_atomic() because smp_mb() simply does
	 * not order against TLB invalidate completion, which is what we need.
	 *
	 * Therefore we must rely on tlb_flush_*() to guarantee order.
	 */
	atomic_dec(&mm->tlb_flush_pending);
}

static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
	/*
	 * Must be called after having acquired the PTL; orders against that
	 * PTL's release and therefore ensures that if we observe the modified
	 * PTE we must also observe the increment from inc_tlb_flush_pending().
	 *
	 * That is, it only guarantees to return true if there is a flush
	 * pending for _this_ PTL.
	 */
	return atomic_read(&mm->tlb_flush_pending);
}

static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
	/*
	 * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
	 * for which there is a TLB flush pending in order to guarantee
	 * we've seen both that PTE modification and the increment.
	 *
	 * (no requirement on actually still holding the PTL, that is irrelevant)
	 */
	return atomic_read(&mm->tlb_flush_pending) > 1;
}

/*
 * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
 * replace a none pte. NOTE! This should only be called when *pte is already
 * cleared so we will never accidentally replace something valuable. Meanwhile
 * a none pte also means we are not demoting the pte, so no TLB flush is
 * needed; e.g., when the pte was cleared, the caller should have taken care
 * of the TLB flush.
 *
 * Must be called with the pgtable lock held so that no thread will see the
 * none pte; if a thread does see it, it will fault and serialize at the
 * pgtable lock.
 *
 * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled.
 */
static inline void
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
			      pte_t *pte, pte_t pteval)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	bool arm_uffd_pte = false;

	/* The current status of the pte should be "cleared" before calling */
	WARN_ON_ONCE(!pte_none(*pte));

	if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
		return;

	/* A uffd-wp wr-protected normal pte */
	if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval)))
		arm_uffd_pte = true;

	/*
	 * A uffd-wp wr-protected swap pte. Note: this should even cover an
	 * existing pte marker with uffd-wp bit set.
	 */
	if (unlikely(pte_swp_uffd_wp_any(pteval)))
		arm_uffd_pte = true;

	if (unlikely(arm_uffd_pte))
		set_pte_at(vma->vm_mm, addr, pte,
			   make_pte_marker(PTE_MARKER_UFFD_WP));
#endif
}
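
/*
 * Illustrative sketch only, not taken verbatim from any caller: the expected
 * pattern is to clear the pte first, then re-arm the uffd-wp marker while
 * still holding the page table lock, so that a concurrent fault can never
 * observe a plain none pte where a wr-protection marker is required.
 *
 *	pteval = ptep_get_and_clear(vma->vm_mm, addr, pte);
 *	... tear down the old mapping, no TLB flush needed here per above ...
 *	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
 */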

#endif