// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

/* Should we allow writing to mounted block devices? */
static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

struct block_device *file_bdev(struct file *bdev_file)
{
	return I_BDEV(bdev_file->f_mapping->host);
}
EXPORT_SYMBOL(file_bdev);

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret)
			pr_warn_ratelimited(
				"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
				bdev, ret);
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers & page cache for a given bdev range. This function bails
 * with an error if the bdev has another exclusive owner (such as a filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold an exclusive handle for the device, upgrade to it
	 * while we discard the buffer cache to avoid discarding buffers
	 * under a live filesystem.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else has the handle exclusively open. Try invalidating instead.
	 * The 'end' argument is inclusive so the rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}

static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(bdev->bd_inode);

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}

int set_blocksize(struct file *file, int size)
{
	struct inode *inode = file->f_mapping->host;
	struct block_device *bdev = I_BDEV(inode);

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	if (!file->private_data)
		return -EINVAL;

	/* Don't change the size if it is the same as the current one */
	if (inode->i_blkbits != blksize_bits(size)) {
		sync_blockdev(bdev);
		inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev_file, size))
		return 0;
	/* If we get here, we know size is a power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
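
/*
 * Illustrative sketch (not part of the original file): how a filesystem's
 * fill_super callback would typically use sb_min_blocksize() and
 * sb_set_blocksize() above. example_fill_super() and EXAMPLE_BLOCK_SIZE are
 * hypothetical names used only to show the calling convention; a real
 * filesystem derives the final block size from its on-disk superblock.
 */
#if 0
#define EXAMPLE_BLOCK_SIZE	1024

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	/* Start out with at least the device's logical block size. */
	if (!sb_min_blocksize(sb, EXAMPLE_BLOCK_SIZE))
		return -EINVAL;

	/* Later, once the on-disk superblock has been read, switch over. */
	if (!sb_set_blocksize(sb, 4096))
		return -EINVAL;

	return 0;
}
#endif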

int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping. Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);

/**
 * bdev_freeze - lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It is incremented in bdev_freeze()
 * and decremented in bdev_thaw(). When it drops to 0, bdev_thaw() actually
 * unfreezes the filesystem.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_freeze(struct block_device *bdev)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
		error = bdev->bd_holder_ops->freeze(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
		error = sync_blockdev(bdev);
	}

	if (error)
		atomic_dec(&bdev->bd_fsfreeze_count);

	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_freeze);

/**
 * bdev_thaw - unlock filesystem
 * @bdev:	blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_thaw(struct block_device *bdev)
{
	int error = -EINVAL, nr_freeze;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/*
	 * If this returns < 0 it means that @bd_fsfreeze_count was
	 * already 0 and no decrement was performed.
	 */
	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
	if (nr_freeze < 0)
		goto out;

	error = 0;
	if (nr_freeze > 0)
		goto out;

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
		error = bdev->bd_holder_ops->thaw(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
	}

	if (error)
		atomic_inc(&bdev->bd_fsfreeze_count);
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_thaw);
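
/*
 * Illustrative sketch (not part of the original file): a typical caller,
 * such as a snapshot or suspend path, pairs bdev_freeze() with bdev_thaw()
 * around the critical section. example_snapshot() is a hypothetical name;
 * only the bdev_freeze()/bdev_thaw() calls reflect the API defined above.
 */
#if 0
static int example_snapshot(struct block_device *bdev)
{
	int error;

	error = bdev_freeze(bdev);
	if (error)
		return error;

	/* ... capture the snapshot while the filesystem is consistent ... */

	return bdev_thaw(bdev);
}
#endif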

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));
	return &ei->vfs_inode;
}

static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);

	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __ro_after_init;
struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	blockdev_mnt = kern_mount(&bd_type);
	if (IS_ERR(blockdev_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = blockdev_mnt->mnt_sb;	/* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	bdev->bd_partno = partno;
	bdev->bd_inode = inode;
	bdev->bd_queue = disk->queue;
	if (partno)
		bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
	else
		bdev->bd_has_submit_bio = false;
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	spin_lock(&bdev->bd_size_lock);
	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}

void bdev_add(struct block_device *bdev, dev_t dev)
{
	if (bdev_stable_writes(bdev))
		mapping_set_stable_writes(bdev->bd_inode->i_mapping);
	bdev->bd_dev = dev;
	bdev->bd_inode->i_rdev = dev;
	bdev->bd_inode->i_ino = dev;
	insert_inode_hash(bdev->bd_inode);
}

long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	lockdep_assert_held(&bdev_lock);

	if (bdev->bd_holder) {
		/*
		 * The same holder can always re-claim.
		 */
		if (bdev->bd_holder == holder) {
			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
				return false;
			return true;
		}
		return false;
	}

	/*
	 * If the whole device's holder is set to bd_may_claim, a partition on
	 * the device is claimed, but not the whole device.
	 */
	if (whole != bdev &&
	    whole->bd_holder && whole->bd_holder != bd_may_claim)
		return false;
	return true;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev. This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On return, the caller
 * has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	mutex_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, holder, hops)) {
		mutex_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	mutex_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can
 * also be used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);
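
/*
 * Illustrative sketch (not part of the original file): the temporary-claim
 * pattern used by truncate_bdev_range() above. A caller that merely needs to
 * keep other exclusive openers away for a while claims the device, does its
 * work, and then aborts the claim instead of finishing it.
 * example_with_temporary_claim() and the work placeholder are hypothetical.
 */
#if 0
static int example_with_temporary_claim(struct block_device *bdev, void *holder)
{
	int error;

	error = bd_prepare_to_claim(bdev, holder, NULL);
	if (error)
		return error;

	/* ... work that must not race with other exclusive openers ... */

	bd_abort_claiming(bdev, holder);
	return 0;
}
#endif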

static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device. The holder fields are protected with
	 * bdev_lock. open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev->bd_write_holder)
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove the holder link and unblock
	 * event polling if it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev->bd_write_holder = false;
	}
}

static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
		bdev_disk_changed(disk, false);
	atomic_inc(&bdev->bd_openers);
	return 0;
}

static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires an exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device. Passing an error pointer as
	 * @holder would clash with that and doesn't make sense anyway.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
				"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device model reference: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

static bool bdev_writes_blocked(struct block_device *bdev)
{
	return bdev->bd_writers < 0;
}

static void bdev_block_writes(struct block_device *bdev)
{
	bdev->bd_writers--;
}

static void bdev_unblock_writes(struct block_device *bdev)
{
	bdev->bd_writers++;
}

static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return true;
	/* Writes blocked? */
	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
		return false;
	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
		return false;
	return true;
}

static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_block_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}

static inline bool bdev_unclaimed(const struct file *bdev_file)
{
	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	if (bdev_unclaimed(bdev_file))
		return;

	bdev = file_bdev(bdev_file);

	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
		bdev_unblock_writes(bdev);
	else if (bdev_file->f_mode & FMODE_WRITE)
		bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access. Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
 */
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	bool unblock_events = true;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			return ret;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	ret = -EBUSY;
	if (!bdev_may_open(bdev, mode))
		goto abort_claiming;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	bdev_claim_write_access(bdev, mode);
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested. Any write
		 * holder makes the write_holder state stick until all are
		 * released. This is good enough and tracking individual
		 * writeable references is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev->bd_write_holder = true;
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);

	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
	bdev_file->f_mapping = bdev->bd_inode->i_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	bdev_file->private_data = holder;

	return 0;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
	return ret;
}

/*
 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk associated
 * with the floppy driver, which allowed ioctls if the file was opened for
 * writing but does not allow reads or writes.
 * Make sure that this quirk is reflected in @f_flags.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
 */
static unsigned blk_to_file_flags(blk_mode_t mode)
{
	unsigned int flags = 0;

	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
		flags |= O_RDWR;
	else if (mode & BLK_OPEN_WRITE_IOCTL)
		flags |= O_RDWR | O_WRONLY;
	else if (mode & BLK_OPEN_WRITE)
		flags |= O_WRONLY;
	else if (mode & BLK_OPEN_READ)
		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
	else
		WARN_ON_ONCE(true);

	if (mode & BLK_OPEN_NDELAY)
		flags |= O_NDELAY;

	return flags;
}

struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct file *bdev_file;
	struct block_device *bdev;
	unsigned int flags;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	flags = blk_to_file_flags(mode);
	bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode,
			blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	ihold(bdev->bd_inode);

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (ret) {
		/* We failed to open the block device. Let ->release() know. */
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);

struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return file;
}
EXPORT_SYMBOL(bdev_file_open_by_path);
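
/*
 * Illustrative sketch (not part of the original file): how a consumer such as
 * a stacking driver might open a block device by path with an exclusive
 * holder and release it again. example_attach() and the holder argument are
 * hypothetical; the calls match bdev_file_open_by_path(), file_bdev() and
 * bdev_fput() as defined in this file.
 */
#if 0
static int example_attach(const char *path, void *holder)
{
	struct file *bdev_file;
	struct block_device *bdev;

	bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ | BLK_OPEN_WRITE,
					   holder, NULL);
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);

	bdev = file_bdev(bdev_file);
	pr_info("opened %pg with %llu sectors\n", bdev,
		(unsigned long long)bdev_nr_sectors(bdev));

	/* ... use the device ... */

	bdev_fput(bdev_file);
	return 0;
}
#endif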

static inline void bd_yield_claim(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;

	lockdep_assert_held(&bdev->bd_disk->open_mutex);

	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
		return;

	if (!bdev_unclaimed(bdev_file))
		bd_end_claim(bdev, holder);
}

void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one. If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minutes)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev_file);

	if (holder)
		bd_yield_claim(bdev_file);

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event. This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
put_no_open:
	blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed, which is a
 * deferred operation.
 */
void bdev_fput(struct file *bdev_file)
{
	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
		return;

	if (bdev_file->private_data) {
		struct block_device *bdev = file_bdev(bdev_file);
		struct gendisk *disk = bdev->bd_disk;

		mutex_lock(&disk->open_mutex);
		bdev_yield_write_access(bdev_file);
		bd_yield_claim(bdev_file);
		/*
		 * Tell release we already gave up our hold on the
		 * device and if write restrictions are available that
		 * we already gave up write access to the device.
		 */
		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
		mutex_unlock(&disk->open_mutex);
	}

	fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Lookup the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);
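
/*
 * Illustrative sketch (not part of the original file): resolving a path to a
 * dev_t with lookup_bdev() when only the device number is needed and no open
 * is desired. example_resolve() is a hypothetical name.
 */
#if 0
static int example_resolve(const char *path)
{
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return error;

	pr_info("%s is device %u:%u\n", path, MAJOR(dev), MINOR(dev));
	return 0;
}
#endif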

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead. If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system. In addition we also invalidate the block device
 * mapping.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly. There are some drivers however
 * that need this for historical reasons. For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove the
 * gendisk that otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from the s_inodes list while we dropped the
		 * s_inode_list_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mapping so
			 * that applications can catch the writeback error using
			 * fsync(2). See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_DIOALIGN for block devices.
 *
 * Note that the inode passed to this is the inode of a block device node file,
 * not the block device's internal inode. Therefore it is *not* valid to use
 * I_BDEV() here; the block device has to be looked up by i_rdev instead.
 */
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
	struct block_device *bdev;

	bdev = blkdev_get_no_open(inode->i_rdev);
	if (!bdev)
		return;

	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
	stat->dio_offset_align = bdev_logical_block_size(bdev);
	stat->result_mask |= STATX_DIOALIGN;

	blkdev_put_no_open(bdev);
}

bool disk_live(struct gendisk *disk)
{
	return !inode_unhashed(disk->part0->bd_inode);
}
EXPORT_SYMBOL_GPL(disk_live);

unsigned int block_size(struct block_device *bdev)
{
	return 1 << bdev->bd_inode->i_blkbits;
}
EXPORT_SYMBOL_GPL(block_size);

static int __init setup_bdev_allow_write_mounted(char *str)
{
	if (kstrtobool(str, &bdev_allow_write_mounted))
		pr_warn("Invalid option string for bdev_allow_write_mounted:"
			" '%s'\n", str);
	return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
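
/*
 * Illustrative note (not from the original source): bdev_allow_write_mounted
 * is a boolean kernel command line parameter, e.g.
 *
 *	bdev_allow_write_mounted=0
 *
 * which, assuming the semantics implemented above, keeps writers away from
 * block devices that are exclusively opened with write restrictions (such as
 * by a mounted filesystem), overriding the CONFIG_BLK_DEV_WRITE_MOUNTED
 * default.
 */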