/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/srcu.h>

struct mmu_notifier;
struct mmu_notifier_ops;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either munmap() that unmaps the range or mremap() that
 * moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
 * madvise() or replacing a page by another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough; there is no need to inspect changes to the CPU page
 * table (mprotect() syscall).
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still the same page and the
 * same access flags). The user should soft dirty the page in the end callback
 * to make sure that anyone relying on soft dirtiness catches pages that might
 * be written through non-CPU mappings.
 */
enum mmu_notifier_event {
	MMU_NOTIFY_UNMAP = 0,
	MMU_NOTIFY_CLEAR,
	MMU_NOTIFY_PROTECTION_VMA,
	MMU_NOTIFY_PROTECTION_PAGE,
	MMU_NOTIFY_SOFT_DIRTY,
};

#ifdef CONFIG_MMU_NOTIFIER

#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif

/*
 * The mmu_notifier_mm structure is allocated and installed in
 * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
struct mmu_notifier_mm {
	/* all mmu notifiers registered in this mm are queued in this list */
	struct hlist_head list;
	/* to serialize the list modifications and hlist_unhashed */
	spinlock_t lock;
};

#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

struct mmu_notifier_range {
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;
	unsigned flags;
	enum mmu_notifier_event event;
};
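/*
 * Illustrative sketch, not part of this header: a subscriber's
 * invalidate_range_start() callback (documented below) typically looks at
 * the blockable flag, and possibly the event, before tearing down its
 * mappings. The driver context, its lock and the my_zap_sptes() helper are
 * hypothetical:
 *
 *	static int my_invalidate_range_start(struct mmu_notifier *mn,
 *			const struct mmu_notifier_range *range)
 *	{
 *		struct my_ctx *ctx = container_of(mn, struct my_ctx, mn);
 *
 *		if (mmu_notifier_range_blockable(range))
 *			mutex_lock(&ctx->lock);
 *		else if (!mutex_trylock(&ctx->lock))
 *			return -EAGAIN;	// retried later from a blockable context
 *
 *		// e.g. MMU_NOTIFY_PROTECTION_VMA could allow a cheaper
 *		// downgrade, but dropping [start, end) is always safe.
 *		my_zap_sptes(ctx, range->start, range->end);
 *		mutex_unlock(&ctx->lock);
 *		return 0;
 *	}
 */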
struct mmu_notifier_ops {
	/*
	 * Called either by mmu_notifier_unregister or when the mm is
	 * being destroyed by exit_mmap, always before all pages are
	 * freed. This can run concurrently with other mmu notifier
	 * methods (the ones invoked outside the mm context) and it
	 * should tear down all secondary mmu mappings and freeze the
	 * secondary mmu. If this method isn't implemented you have to
	 * be sure that nothing could possibly write to the pages
	 * through the secondary mmu by the time the last thread with
	 * tsk->mm == mm exits.
	 *
	 * As a side note: the pages freed after ->release returns could
	 * be immediately reallocated by the gart at an alias physical
	 * address with a different cache model, so if ->release isn't
	 * implemented because all _software_ driven memory accesses
	 * through the secondary mmu are terminated by the time the
	 * last thread of this mm quits, you also have to be sure that
	 * speculative _hardware_ operations can't allocate dirty
	 * cachelines in the cpu that could not be snooped and made
	 * coherent with the other read and write operations happening
	 * through the gart alias address, so leading to memory
	 * corruption.
	 */
	void (*release)(struct mmu_notifier *mn,
			struct mm_struct *mm);

	/*
	 * clear_flush_young is called after the VM test-and-clears the
	 * young/accessed bitflag in the pte. This way the VM will provide
	 * proper aging for accesses to the page through the secondary MMUs
	 * and not only for the ones through the Linux pte.
	 * Start-end is necessary in case the secondary MMU is mapping the
	 * page at a smaller granularity than the primary MMU.
	 */
	int (*clear_flush_young)(struct mmu_notifier *mn,
				 struct mm_struct *mm,
				 unsigned long start,
				 unsigned long end);

	/*
	 * clear_young is a lightweight version of clear_flush_young. Like the
	 * latter, it is supposed to test-and-clear the young/accessed bitflag
	 * in the secondary pte, but it may omit flushing the secondary tlb.
	 */
	int (*clear_young)(struct mmu_notifier *mn,
			   struct mm_struct *mm,
			   unsigned long start,
			   unsigned long end);

	/*
	 * test_young is called to check the young/accessed bitflag in
	 * the secondary pte. This is used to know if the page is
	 * frequently used without actually clearing the flag or tearing
	 * down the secondary mapping on the page.
	 */
	int (*test_young)(struct mmu_notifier *mn,
			  struct mm_struct *mm,
			  unsigned long address);

	/*
	 * change_pte is called in cases where the pte mapping a page is
	 * changed: for example, when ksm remaps the pte to point to a new
	 * shared page.
	 */
	void (*change_pte)(struct mmu_notifier *mn,
			   struct mm_struct *mm,
			   unsigned long address,
			   pte_t pte);

	/*
	 * invalidate_range_start() and invalidate_range_end() must be
	 * paired and are called only when the mmap_sem and/or the
	 * locks protecting the reverse maps are held. If the subsystem
	 * can't guarantee that no additional references are taken to
	 * the pages in the range, it has to implement the
	 * invalidate_range() notifier to remove any references taken
	 * after invalidate_range_start().
	 *
	 * Invalidation of multiple concurrent ranges may be
	 * optionally permitted by the driver. Either way the
	 * establishment of sptes is forbidden in the range passed to
	 * invalidate_range_begin/end for the whole duration of the
	 * invalidate_range_begin/end critical section.
	 *
	 * invalidate_range_start() is called when all pages in the
	 * range are still mapped and have at least a refcount of one.
	 *
	 * invalidate_range_end() is called when all pages in the
	 * range have been unmapped and the pages have been freed by
	 * the VM.
	 *
	 * The VM will remove the page table entries and potentially
	 * the page between invalidate_range_start() and
	 * invalidate_range_end(). If the page must not be freed
	 * because of pending I/O or other circumstances then the
	 * invalidate_range_start() callback (or the initial mapping
	 * by the driver) must make sure that the refcount is kept
	 * elevated.
	 *
	 * If the driver increases the refcount when the pages are
	 * initially mapped into an address space then either
	 * invalidate_range_start() or invalidate_range_end() may
	 * decrease the refcount. If the refcount is decreased on
	 * invalidate_range_start() then the VM can free pages as page
	 * table entries are removed. If the refcount is only
	 * dropped on invalidate_range_end() then the driver itself
	 * will drop the last refcount but it must take care to flush
	 * any secondary tlb before doing the final free on the
	 * page. Pages will no longer be referenced by the linux
	 * address space but may still be referenced by sptes until
	 * the last refcount is dropped.
	 *
	 * If the range is not blockable (see
	 * mmu_notifier_range_blockable()) then the callback cannot
	 * sleep and has to return -EAGAIN; 0 should be returned
	 * otherwise. Please note that if invalidate_range_start approves
	 * a non-blocking behavior then the same applies to
	 * invalidate_range_end.
	 */
	int (*invalidate_range_start)(struct mmu_notifier *mn,
				      const struct mmu_notifier_range *range);
	void (*invalidate_range_end)(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *range);

	/*
	 * invalidate_range() is either called between
	 * invalidate_range_start() and invalidate_range_end() when the
	 * VM has to free pages that were unmapped, but before the
	 * pages are actually freed, or outside of _start()/_end() when
	 * a (remote) TLB flush is necessary.
	 *
	 * If invalidate_range() is used to manage a non-CPU TLB with
	 * shared page-tables, it is not necessary to implement the
	 * invalidate_range_start()/end() notifiers, as
	 * invalidate_range() already catches the points in time when an
	 * external TLB range needs to be flushed. For a more in-depth
	 * discussion on this see Documentation/vm/mmu_notifier.rst
	 *
	 * Note that this function might be called with just a sub-range
	 * of what was passed to invalidate_range_start()/end(), if
	 * called between those functions.
	 */
	void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
				 unsigned long start, unsigned long end);

	/*
	 * These callbacks are used with the get/put interface to manage the
	 * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
	 * notifier for use with the mm.
	 *
	 * free_notifier() is only called after the mmu_notifier has been
	 * fully put, calls to any ops callback are prevented and no ops
	 * callbacks are currently running. It is called from a SRCU callback
	 * and cannot sleep.
	 */
	struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
	void (*free_notifier)(struct mmu_notifier *mn);
};
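/*
 * Illustrative sketch, hypothetical driver names: per the comments above, an
 * IOMMU that shares the CPU page tables can get away with implementing
 * ->invalidate_range() and ->release() alone, while a device that builds its
 * own sptes wires up the invalidate_range_start()/end() pair instead:
 *
 *	static const struct mmu_notifier_ops my_iommu_mn_ops = {
 *		.release		= my_iommu_release,
 *		.invalidate_range	= my_iommu_flush_dev_tlb,
 *	};
 *
 *	static const struct mmu_notifier_ops my_spte_mn_ops = {
 *		.release		= my_spte_release,
 *		.invalidate_range_start	= my_spte_invalidate_start,
 *		.invalidate_range_end	= my_spte_invalidate_end,
 *	};
 */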
/*
 * The notifier chains are protected by mmap_sem and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_sem locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_sem is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release).
 */
struct mmu_notifier {
	struct hlist_node hlist;
	const struct mmu_notifier_ops *ops;
	struct mm_struct *mm;
	struct rcu_head rcu;
	unsigned int users;
};

static inline int mm_has_notifiers(struct mm_struct *mm)
{
	return unlikely(mm->mmu_notifier_mm);
}

struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
					     struct mm_struct *mm);
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
	struct mmu_notifier *ret;

	down_write(&mm->mmap_sem);
	ret = mmu_notifier_get_locked(ops, mm);
	up_write(&mm->mmap_sem);
	return ret;
}
void mmu_notifier_put(struct mmu_notifier *mn);
void mmu_notifier_synchronize(void);
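/*
 * Illustrative sketch, hypothetical names: with the get/put interface the
 * core calls ->alloc_notifier() the first time mmu_notifier_get() is used on
 * an mm for a given ops table, and calls ->free_notifier() only once the
 * last mmu_notifier_put() has been processed. A driver embedding the
 * notifier in its own context might do:
 *
 *	struct my_ctx {
 *		struct mmu_notifier mn;
 *		struct mutex lock;
 *	};
 *
 *	static struct mmu_notifier *my_alloc_notifier(struct mm_struct *mm)
 *	{
 *		struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 *
 *		if (!ctx)
 *			return ERR_PTR(-ENOMEM);
 *		mutex_init(&ctx->lock);
 *		return &ctx->mn;
 *	}
 *
 *	static void my_free_notifier(struct mmu_notifier *mn)
 *	{
 *		kfree(container_of(mn, struct my_ctx, mn));
 *	}
 *
 *	mn = mmu_notifier_get(&my_mn_ops, current->mm);	// takes mmap_sem
 *	...
 *	mmu_notifier_put(mn);	// teardown deferred to an SRCU callback
 */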
extern int mmu_notifier_register(struct mmu_notifier *mn,
				 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *mn,
				   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
				    struct mm_struct *mm);
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
				     unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
				      unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
						bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
	return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
}

static inline void mmu_notifier_release(struct mm_struct *mm)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_release(mm);
}

static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_clear_flush_young(mm, start, end);
	return 0;
}

static inline int mmu_notifier_clear_young(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_clear_young(mm, start, end);
	return 0;
}

static inline int mmu_notifier_test_young(struct mm_struct *mm,
					  unsigned long address)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_test_young(mm, address);
	return 0;
}

static inline void mmu_notifier_change_pte(struct mm_struct *mm,
					   unsigned long address, pte_t pte)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_change_pte(mm, address, pte);
}

static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
	might_sleep();

	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	if (mm_has_notifiers(range->mm)) {
		range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
		__mmu_notifier_invalidate_range_start(range);
	}
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
	int ret = 0;

	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
	if (mm_has_notifiers(range->mm)) {
		range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
		ret = __mmu_notifier_invalidate_range_start(range);
	}
	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
	return ret;
}

static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
	if (mmu_notifier_range_blockable(range))
		might_sleep();

	if (mm_has_notifiers(range->mm))
		__mmu_notifier_invalidate_range_end(range, false);
}

static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
	if (mm_has_notifiers(range->mm))
		__mmu_notifier_invalidate_range_end(range, true);
}

static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_invalidate_range(mm, start, end);
}

static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
	mm->mmu_notifier_mm = NULL;
}

static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_mm_destroy(mm);
}

static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
					   enum mmu_notifier_event event,
					   unsigned flags,
					   struct vm_area_struct *vma,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	range->vma = vma;
	range->event = event;
	range->mm = mm;
	range->start = start;
	range->end = end;
	range->flags = flags;
}
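/*
 * Illustrative sketch of the caller side, simplified: code about to unmap or
 * write-protect a range initializes an on-stack range, brackets the page
 * table update with the start/end notifiers, and only then frees the pages.
 * zap_my_range() stands in for the actual page table work:
 *
 *	struct mmu_notifier_range range;
 *
 *	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 *				start, end);
 *	mmu_notifier_invalidate_range_start(&range);
 *	zap_my_range(vma, start, end);
 *	mmu_notifier_invalidate_range_end(&range);
 *
 * Callers that cannot sleep (e.g. the oom reaper) use the _nonblock variant
 * instead and must back off when it returns -EAGAIN:
 *
 *	if (mmu_notifier_invalidate_range_start_nonblock(&range))
 *		return false;	// a notifier refused the non-blocking start
 */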
#define ptep_clear_flush_young_notify(__vma, __address, __ptep)	\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = ptep_clear_flush_young(___vma, ___address, __ptep);	\
	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
						  ___address,		\
						  ___address +		\
							PAGE_SIZE);	\
	__young;							\
})

#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)	\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = pmdp_clear_flush_young(___vma, ___address, __pmdp);	\
	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
						  ___address,		\
						  ___address +		\
							PMD_SIZE);	\
	__young;							\
})

#define ptep_clear_young_notify(__vma, __address, __ptep)		\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
					    ___address + PAGE_SIZE);	\
	__young;							\
})

#define pmdp_clear_young_notify(__vma, __address, __pmdp)		\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
					    ___address + PMD_SIZE);	\
	__young;							\
})

#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
({									\
	unsigned long ___addr = __address & PAGE_MASK;			\
	struct mm_struct *___mm = (__vma)->vm_mm;			\
	pte_t ___pte;							\
									\
	___pte = ptep_clear_flush(__vma, __address, __ptep);		\
	mmu_notifier_invalidate_range(___mm, ___addr,			\
				      ___addr + PAGE_SIZE);		\
									\
	___pte;								\
})

#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)		\
({									\
	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
	struct mm_struct *___mm = (__vma)->vm_mm;			\
	pmd_t ___pmd;							\
									\
	___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd);		\
	mmu_notifier_invalidate_range(___mm, ___haddr,			\
				      ___haddr + HPAGE_PMD_SIZE);	\
									\
	___pmd;								\
})

#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud)		\
({									\
	unsigned long ___haddr = __haddr & HPAGE_PUD_MASK;		\
	struct mm_struct *___mm = (__vma)->vm_mm;			\
	pud_t ___pud;							\
									\
	___pud = pudp_huge_clear_flush(__vma, __haddr, __pud);		\
	mmu_notifier_invalidate_range(___mm, ___haddr,			\
				      ___haddr + HPAGE_PUD_SIZE);	\
									\
	___pud;								\
})

/*
 * set_pte_at_notify() sets the pte _after_ running the notifier.
 * This is safe to start by updating the secondary MMUs, because the primary
 * MMU pte invalidate must have already happened with a ptep_clear_flush()
 * before set_pte_at_notify() has been invoked. Updating the secondary MMUs
 * first is required when we change both the protection of the mapping from
 * read-only to read-write and the pfn (like during copy on write page
 * faults). Otherwise the old page would remain mapped readonly in the
 * secondary MMUs after the new page is already writable by some CPU through
 * the primary MMU.
 */
#define set_pte_at_notify(__mm, __address, __ptep, __pte)		\
({									\
	struct mm_struct *___mm = __mm;					\
	unsigned long ___address = __address;				\
	pte_t ___pte = __pte;						\
									\
	mmu_notifier_change_pte(___mm, ___address, ___pte);		\
	set_pte_at(___mm, ___address, __ptep, ___pte);			\
})
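/*
 * Illustrative sketch of the ordering described above, simplified from what
 * a copy-on-write fault handler does (allocation of the new page and
 * construction of new_pte are omitted):
 *
 *	ptep_clear_flush_notify(vma, addr, ptep);	// old pte gone everywhere
 *	set_pte_at_notify(mm, addr, ptep, new_pte);	// secondary MMUs first,
 *							// then the primary pte
 *
 * Using plain set_pte_at() here could leave the old read-only page mapped in
 * a secondary MMU while some CPU is already writing to the new page.
 */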
#else /* CONFIG_MMU_NOTIFIER */

struct mmu_notifier_range {
	unsigned long start;
	unsigned long end;
};

static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
					    unsigned long start,
					    unsigned long end)
{
	range->start = start;
	range->end = end;
}

#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) \
	_mmu_notifier_range_init(range, start, end)

static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
	return true;
}

static inline int mm_has_notifiers(struct mm_struct *mm)
{
	return 0;
}

static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	return 0;
}

static inline int mmu_notifier_test_young(struct mm_struct *mm,
					  unsigned long address)
{
	return 0;
}

static inline void mmu_notifier_change_pte(struct mm_struct *mm,
					   unsigned long address, pte_t pte)
{
}

static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
	return 0;
}

static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
}

static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
}

static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}

static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
}

#define mmu_notifier_range_update_to_read_only(r) false

#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at

static inline void mmu_notifier_synchronize(void)
{
}

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */