// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <[email protected]> SuSE
 *  Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

/* Should we allow writing to mounted block devices? */
static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

static inline struct inode *BD_INODE(struct block_device *bdev)
{
	return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode;
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

struct block_device *file_bdev(struct file *bdev_file)
{
	return I_BDEV(bdev_file->f_mapping->host);
}
EXPORT_SYMBOL(file_bdev);
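
/*
 * Illustrative sketch (not code from this file): a caller that holds a
 * struct file returned by bdev_file_open_by_dev() below can recover the
 * underlying block_device with file_bdev(), e.g. to query its capacity:
 *
 *	struct block_device *bdev = file_bdev(bdev_file);
 *	sector_t capacity = bdev_nr_sectors(bdev);
 *
 * bdev_nr_sectors() is the generic helper from blkdev.h; the local variable
 * names are assumptions of the sketch.
 */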

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = BD_INODE(bdev);
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret)
			pr_warn_ratelimited(
				"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
				bdev, ret);
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers & page cache for a given bdev range. This function bails
 * out with an error if the bdev has another exclusive owner (such as a
 * filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold an exclusive handle for the device, upgrade to it
	 * while we discard the buffer cache to avoid discarding buffers
	 * under a live filesystem.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_mapping, lstart, lend);
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else has the handle exclusively open. Try invalidating
	 * instead. The 'end' argument is inclusive so the rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}

static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(BD_INODE(bdev));

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
	mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping,
				    get_order(bsize));
}

int set_blocksize(struct file *file, int size)
{
	struct inode *inode = file->f_mapping->host;
	struct block_device *bdev = I_BDEV(inode);

	if (blk_validate_block_size(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	if (!file->private_data)
		return -EINVAL;

	/* Don't change the size if it is the same as the current one */
	if (inode->i_blkbits != blksize_bits(size)) {
		sync_blockdev(bdev);
		inode->i_blkbits = blksize_bits(size);
		mapping_set_folio_min_order(inode->i_mapping, get_order(size));
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev_file, size))
		return 0;
	/* If we get here, we know size is validated */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping. Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);
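
/*
 * Illustrative sketch (assumption, not taken from this file): a filesystem
 * typically picks its block size at mount time with the helpers above, for
 * example:
 *
 *	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
 *	if (!blocksize)
 *		return -EINVAL;
 *
 * sb_min_blocksize() clamps the request to the device's logical block size
 * and returns the block size that was actually set, or 0 on failure.
 */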

/**
 * bdev_freeze - lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
 * down in bdev_thaw(). When it becomes 0, bdev_thaw() actually unfreezes the
 * filesystem.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_freeze(struct block_device *bdev)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
		error = bdev->bd_holder_ops->freeze(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
		error = sync_blockdev(bdev);
	}

	if (error)
		atomic_dec(&bdev->bd_fsfreeze_count);

	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_freeze);

/**
 * bdev_thaw - unlock filesystem
 * @bdev:	blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_thaw(struct block_device *bdev)
{
	int error = -EINVAL, nr_freeze;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/*
	 * If this returns < 0 it means that @bd_fsfreeze_count was
	 * already 0 and no decrement was performed.
	 */
	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
	if (nr_freeze < 0)
		goto out;

	error = 0;
	if (nr_freeze > 0)
		goto out;

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
		error = bdev->bd_holder_ops->thaw(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
	}

	if (error)
		atomic_inc(&bdev->bd_fsfreeze_count);
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_thaw);
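
/*
 * Illustrative sketch (assumption, not code from this file): a snapshot
 * implementation would bracket its work with the freeze/thaw pair above so
 * that the filesystem on the device is consistent while the snapshot is
 * taken:
 *
 *	error = bdev_freeze(bdev);
 *	if (error)
 *		return error;
 *	error = create_snapshot(bdev);	(hypothetical helper)
 *	bdev_thaw(bdev);
 *	return error;
 *
 * Nested bdev_freeze() calls are allowed; only the final bdev_thaw() actually
 * unfreezes, as tracked by bd_fsfreeze_count.
 */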

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));

	if (security_bdev_alloc(&ei->bdev)) {
		kmem_cache_free(bdev_cachep, ei);
		return NULL;
	}
	return &ei->vfs_inode;
}

static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);
	security_bdev_free(bdev);

	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __ro_after_init;
static struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	blockdev_mnt = kern_mount(&bd_type);
	if (IS_ERR(blockdev_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = blockdev_mnt->mnt_sb;	/* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	atomic_set(&bdev->__bd_flags, partno);
	bdev->bd_mapping = &inode->i_data;
	bdev->bd_queue = disk->queue;
	if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
		bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	spin_lock(&bdev->bd_size_lock);
	i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}

void bdev_add(struct block_device *bdev, dev_t dev)
{
	struct inode *inode = BD_INODE(bdev);
	if (bdev_stable_writes(bdev))
		mapping_set_stable_writes(bdev->bd_mapping);
	bdev->bd_dev = dev;
	inode->i_rdev = dev;
	inode->i_ino = dev;
	insert_inode_hash(inode);
}

void bdev_unhash(struct block_device *bdev)
{
	remove_inode_hash(BD_INODE(bdev));
}

void bdev_drop(struct block_device *bdev)
{
	iput(BD_INODE(bdev));
}

long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	lockdep_assert_held(&bdev_lock);

	if (bdev->bd_holder) {
		/*
		 * The same holder can always re-claim.
		 */
		if (bdev->bd_holder == holder) {
			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
				return false;
			return true;
		}
		return false;
	}

	/*
	 * If the whole device's holder is set to bd_may_claim, a partition on
	 * the device is claimed, but not the whole device.
	 */
	if (whole != bdev &&
	    whole->bd_holder && whole->bd_holder != bd_may_claim)
		return false;
	return true;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev. This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On successful return,
 * the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	mutex_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, holder, hops)) {
		mutex_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	mutex_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_var(&whole->bd_claiming);
}
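
/*
 * Illustrative sketch (assumption, mirroring truncate_bdev_range() above): a
 * caller that only needs to keep other exclusive openers away for a short
 * critical section can pair the claim helpers directly:
 *
 *	err = bd_prepare_to_claim(bdev, my_holder, NULL);
 *	if (err)
 *		return err;
 *	do_work_on(bdev);
 *	bd_abort_claiming(bdev, my_holder);
 *
 * Here my_holder and do_work_on() are placeholders; a real exclusive open
 * would instead complete the claim via bd_finish_claiming() below.
 */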

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can
 * also be used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device. The holder fields are protected with
	 * bdev_lock. open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove the holder link and unblock
	 * event polling if it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev_clear_flag(bdev, BD_WRITE_HOLDER);
	}
}

static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	atomic_inc(&bdev->bd_openers);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
		/*
		 * Only return scanning errors if we are called from contexts
		 * that explicitly want them, e.g. the BLKRRPART ioctl.
		 */
		ret = bdev_disk_changed(disk, false);
		if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
			blkdev_put_whole(bdev);
			return ret;
		}
	}
	return 0;
}

static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires an exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device, so passing an error pointer as
	 * the holder makes no sense here.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
				"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device model one: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

static bool bdev_writes_blocked(struct block_device *bdev)
{
	return bdev->bd_writers < 0;
}

static void bdev_block_writes(struct block_device *bdev)
{
	bdev->bd_writers--;
}

static void bdev_unblock_writes(struct block_device *bdev)
{
	bdev->bd_writers++;
}

static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return true;
	/* Writes blocked? */
	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
		return false;
	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
		return false;
	return true;
}
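
/*
 * Illustrative example of the bd_writers convention used above (the counts
 * are only for illustration): bd_writers counts ordinary writers when
 * positive and records a write block when negative. With
 * bdev_allow_write_mounted disabled:
 *
 *	two BLK_OPEN_WRITE openers		-> bd_writers == 2
 *	one BLK_OPEN_RESTRICT_WRITES holder	-> bd_writers == -1
 *
 * so bdev_may_open() refuses a BLK_OPEN_RESTRICT_WRITES open while bd_writers
 * is positive and refuses a plain write open while writes are blocked.
 */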

static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_block_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}

static inline bool bdev_unclaimed(const struct file *bdev_file)
{
	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	if (bdev_unclaimed(bdev_file))
		return;

	bdev = file_bdev(bdev_file);

	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
		bdev_unblock_writes(bdev);
	else if (bdev_file->f_mode & FMODE_WRITE)
		bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access. Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
 */
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	bool unblock_events = true;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			return ret;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	ret = -EBUSY;
	if (!bdev_may_open(bdev, mode))
		goto put_module;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	bdev_claim_write_access(bdev, mode);
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested. Any write
		 * holder makes the write_holder state stick until all are
		 * released. This is good enough and tracking individual
		 * writeable references is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) &&
		    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev_set_flag(bdev, BD_WRITE_HOLDER);
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);

	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
	bdev_file->f_mapping = bdev->bd_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	bdev_file->private_data = holder;

	return 0;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
	return ret;
}

/*
 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk associated
 * with the floppy driver: it allowed ioctls if the file was opened for
 * writing, but does not allow reads or writes. Make sure that this quirk is
 * reflected in @f_flags.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
 */
static unsigned blk_to_file_flags(blk_mode_t mode)
{
	unsigned int flags = 0;

	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
		flags |= O_RDWR;
	else if (mode & BLK_OPEN_WRITE_IOCTL)
		flags |= O_RDWR | O_WRONLY;
	else if (mode & BLK_OPEN_WRITE)
		flags |= O_WRONLY;
	else if (mode & BLK_OPEN_READ)
		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
	else
		WARN_ON_ONCE(true);

	if (mode & BLK_OPEN_NDELAY)
		flags |= O_NDELAY;

	return flags;
}

struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct file *bdev_file;
	struct block_device *bdev;
	unsigned int flags;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	flags = blk_to_file_flags(mode);
	bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev),
			blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	ihold(BD_INODE(bdev));

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (ret) {
		/* We failed to open the block device. Let ->release() know. */
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);
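
/*
 * Illustrative sketch (assumption, not a caller that exists in this file): a
 * driver that wants exclusive access to a device it knows by dev_t would pair
 * bdev_file_open_by_dev() with bdev_fput(), using one of its own objects as
 * @holder:
 *
 *	struct file *bdev_file;
 *
 *	bdev_file = bdev_file_open_by_dev(dev,
 *			BLK_OPEN_READ | BLK_OPEN_WRITE, my_object, NULL);
 *	if (IS_ERR(bdev_file))
 *		return PTR_ERR(bdev_file);
 *	...
 *	bdev_fput(bdev_file);
 *
 * my_object stands in for whatever pointer identifies the holder; passing a
 * non-NULL @holder makes the open exclusive.
 */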

struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return file;
}
EXPORT_SYMBOL(bdev_file_open_by_path);

static inline void bd_yield_claim(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;

	lockdep_assert_held(&bdev->bd_disk->open_mutex);

	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
		return;

	if (!bdev_unclaimed(bdev_file))
		bd_end_claim(bdev, holder);
}

void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one. If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minutes)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev_file);

	if (holder)
		bd_yield_claim(bdev_file);

	/*
	 * Trigger event checking and tell drivers to flush the MEDIA_CHANGE
	 * event. This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
put_no_open:
	blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed, which is a
 * deferred operation.
 */
void bdev_fput(struct file *bdev_file)
{
	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
		return;

	if (bdev_file->private_data) {
		struct block_device *bdev = file_bdev(bdev_file);
		struct gendisk *disk = bdev->bd_disk;

		mutex_lock(&disk->open_mutex);
		bdev_yield_write_access(bdev_file);
		bd_yield_claim(bdev_file);
		/*
		 * Tell release that we already gave up our hold on the
		 * device and, if write restrictions are in effect, that we
		 * already gave up write access to the device.
		 */
		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
		mutex_unlock(&disk->open_mutex);
	}

	fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Look up the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead. If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system. In addition we also invalidate the block device
 * mapping.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly. There are some drivers, however,
 * that need this for historical reasons. For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove
 * the gendisk and otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);
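
/*
 * Illustrative sketch (assumption, not an ops table defined in this file): a
 * holder that wants to react to device loss registers a struct blk_holder_ops
 * when it opens the device exclusively; only callbacks actually used by this
 * file are shown:
 *
 *	static void my_mark_dead(struct block_device *bdev, bool surprise)
 *	{
 *		...shut down the holder's use of bdev...
 *	}
 *
 *	static const struct blk_holder_ops my_holder_ops = {
 *		.mark_dead	= my_mark_dead,
 *	};
 *
 *	bdev_file = bdev_file_open_by_dev(dev, mode, my_object, &my_holder_ops);
 *
 * bdev_mark_dead() above then calls ->mark_dead() rather than falling back to
 * sync_blockdev(), and bdev_freeze()/bdev_thaw() would call the optional
 * ->freeze()/->thaw() callbacks if provided.
 */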

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from the s_inodes list while we dropped the
		 * s_inode_list_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mappings so
			 * that applications can catch the writeback error
			 * using fsync(2). See filemap_fdatawait_keep_errors()
			 * for details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
 */
void bdev_statx(struct path *path, struct kstat *stat,
		u32 request_mask)
{
	struct inode *backing_inode;
	struct block_device *bdev;

	backing_inode = d_backing_inode(path->dentry);

	/*
	 * Note that backing_inode is the inode of a block device node file,
	 * not the block device's internal inode. Therefore it is *not* valid
	 * to use I_BDEV() here; the block device has to be looked up by i_rdev
	 * instead.
	 */
	bdev = blkdev_get_no_open(backing_inode->i_rdev);
	if (!bdev)
		return;

	if (request_mask & STATX_DIOALIGN) {
		stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
		stat->dio_offset_align = bdev_logical_block_size(bdev);
		stat->result_mask |= STATX_DIOALIGN;
	}

	if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
		struct request_queue *bd_queue = bdev->bd_queue;

		generic_fill_statx_atomic_writes(stat,
			queue_atomic_write_unit_min_bytes(bd_queue),
			queue_atomic_write_unit_max_bytes(bd_queue));
	}

	stat->blksize = bdev_io_min(bdev);

	blkdev_put_no_open(bdev);
}

bool disk_live(struct gendisk *disk)
{
	return !inode_unhashed(BD_INODE(disk->part0));
}
EXPORT_SYMBOL_GPL(disk_live);

unsigned int block_size(struct block_device *bdev)
{
	return 1 << BD_INODE(bdev)->i_blkbits;
}
EXPORT_SYMBOL_GPL(block_size);

static int __init setup_bdev_allow_write_mounted(char *str)
{
	if (kstrtobool(str, &bdev_allow_write_mounted))
		pr_warn("Invalid option string for bdev_allow_write_mounted:"
			" '%s'\n", str);
	return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
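
/*
 * Illustrative usage note: the default of bdev_allow_write_mounted follows
 * CONFIG_BLK_DEV_WRITE_MOUNTED and can be overridden on the kernel command
 * line, e.g.
 *
 *	bdev_allow_write_mounted=0
 *
 * which enables the bd_writers based blocking in bdev_may_open() and
 * bdev_claim_write_access() so that plain write opens are rejected while a
 * BLK_OPEN_RESTRICT_WRITES holder (such as a mounted filesystem) exists.
 */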