1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright 1993 by Theodore Ts'o. 4 */ 5 #include <linux/module.h> 6 #include <linux/moduleparam.h> 7 #include <linux/sched.h> 8 #include <linux/fs.h> 9 #include <linux/pagemap.h> 10 #include <linux/file.h> 11 #include <linux/stat.h> 12 #include <linux/errno.h> 13 #include <linux/major.h> 14 #include <linux/wait.h> 15 #include <linux/blkpg.h> 16 #include <linux/init.h> 17 #include <linux/swap.h> 18 #include <linux/slab.h> 19 #include <linux/compat.h> 20 #include <linux/suspend.h> 21 #include <linux/freezer.h> 22 #include <linux/mutex.h> 23 #include <linux/writeback.h> 24 #include <linux/completion.h> 25 #include <linux/highmem.h> 26 #include <linux/splice.h> 27 #include <linux/sysfs.h> 28 #include <linux/miscdevice.h> 29 #include <linux/falloc.h> 30 #include <linux/uio.h> 31 #include <linux/ioprio.h> 32 #include <linux/blk-cgroup.h> 33 #include <linux/sched/mm.h> 34 #include <linux/statfs.h> 35 #include <linux/uaccess.h> 36 #include <linux/blk-mq.h> 37 #include <linux/spinlock.h> 38 #include <uapi/linux/loop.h> 39 40 /* Possible states of device */ 41 enum { 42 Lo_unbound, 43 Lo_bound, 44 Lo_rundown, 45 Lo_deleting, 46 }; 47 48 struct loop_func_table; 49 50 struct loop_device { 51 int lo_number; 52 loff_t lo_offset; 53 loff_t lo_sizelimit; 54 int lo_flags; 55 char lo_file_name[LO_NAME_SIZE]; 56 57 struct file * lo_backing_file; 58 struct block_device *lo_device; 59 60 gfp_t old_gfp_mask; 61 62 spinlock_t lo_lock; 63 int lo_state; 64 spinlock_t lo_work_lock; 65 struct workqueue_struct *workqueue; 66 struct work_struct rootcg_work; 67 struct list_head rootcg_cmd_list; 68 struct list_head idle_worker_list; 69 struct rb_root worker_tree; 70 struct timer_list timer; 71 bool use_dio; 72 bool sysfs_inited; 73 74 struct request_queue *lo_queue; 75 struct blk_mq_tag_set tag_set; 76 struct gendisk *lo_disk; 77 struct mutex lo_mutex; 78 bool idr_visible; 79 }; 80 81 struct loop_cmd { 82 struct list_head list_entry; 83 bool use_aio; /* use AIO interface to handle I/O */ 84 atomic_t ref; /* only for aio */ 85 long ret; 86 struct kiocb iocb; 87 struct bio_vec *bvec; 88 struct cgroup_subsys_state *blkcg_css; 89 struct cgroup_subsys_state *memcg_css; 90 }; 91 92 #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) 93 #define LOOP_DEFAULT_HW_Q_DEPTH 128 94 95 static DEFINE_IDR(loop_index_idr); 96 static DEFINE_MUTEX(loop_ctl_mutex); 97 static DEFINE_MUTEX(loop_validate_mutex); 98 99 /** 100 * loop_global_lock_killable() - take locks for safe loop_validate_file() test 101 * 102 * @lo: struct loop_device 103 * @global: true if @lo is about to bind another "struct loop_device", false otherwise 104 * 105 * Returns 0 on success, -EINTR otherwise. 106 * 107 * Since loop_validate_file() traverses on other "struct loop_device" if 108 * is_loop_device() is true, we need a global lock for serializing concurrent 109 * loop_configure()/loop_change_fd()/__loop_clr_fd() calls. 
110 */ 111 static int loop_global_lock_killable(struct loop_device *lo, bool global) 112 { 113 int err; 114 115 if (global) { 116 err = mutex_lock_killable(&loop_validate_mutex); 117 if (err) 118 return err; 119 } 120 err = mutex_lock_killable(&lo->lo_mutex); 121 if (err && global) 122 mutex_unlock(&loop_validate_mutex); 123 return err; 124 } 125 126 /** 127 * loop_global_unlock() - release locks taken by loop_global_lock_killable() 128 * 129 * @lo: struct loop_device 130 * @global: true if @lo was about to bind another "struct loop_device", false otherwise 131 */ 132 static void loop_global_unlock(struct loop_device *lo, bool global) 133 { 134 mutex_unlock(&lo->lo_mutex); 135 if (global) 136 mutex_unlock(&loop_validate_mutex); 137 } 138 139 static int max_part; 140 static int part_shift; 141 142 static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) 143 { 144 loff_t loopsize; 145 146 /* Compute loopsize in bytes */ 147 loopsize = i_size_read(file->f_mapping->host); 148 if (offset > 0) 149 loopsize -= offset; 150 /* offset is beyond i_size, weird but possible */ 151 if (loopsize < 0) 152 return 0; 153 154 if (sizelimit > 0 && sizelimit < loopsize) 155 loopsize = sizelimit; 156 /* 157 * Unfortunately, if we want to do I/O on the device, 158 * the number of 512-byte sectors has to fit into a sector_t. 159 */ 160 return loopsize >> 9; 161 } 162 163 static loff_t get_loop_size(struct loop_device *lo, struct file *file) 164 { 165 return get_size(lo->lo_offset, lo->lo_sizelimit, file); 166 } 167 168 /* 169 * We support direct I/O only if lo_offset is aligned with the logical I/O size 170 * of backing device, and the logical block size of loop is bigger than that of 171 * the backing device. 172 */ 173 static bool lo_bdev_can_use_dio(struct loop_device *lo, 174 struct block_device *backing_bdev) 175 { 176 unsigned int sb_bsize = bdev_logical_block_size(backing_bdev); 177 178 if (queue_logical_block_size(lo->lo_queue) < sb_bsize) 179 return false; 180 if (lo->lo_offset & (sb_bsize - 1)) 181 return false; 182 return true; 183 } 184 185 static bool lo_can_use_dio(struct loop_device *lo) 186 { 187 struct inode *inode = lo->lo_backing_file->f_mapping->host; 188 189 if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) 190 return false; 191 192 if (S_ISBLK(inode->i_mode)) 193 return lo_bdev_can_use_dio(lo, I_BDEV(inode)); 194 if (inode->i_sb->s_bdev) 195 return lo_bdev_can_use_dio(lo, inode->i_sb->s_bdev); 196 return true; 197 } 198 199 static inline void loop_update_dio(struct loop_device *lo) 200 { 201 bool dio = lo->use_dio || (lo->lo_backing_file->f_flags & O_DIRECT); 202 bool use_dio = dio && lo_can_use_dio(lo); 203 204 lockdep_assert_held(&lo->lo_mutex); 205 WARN_ON_ONCE(lo->lo_state == Lo_bound && 206 lo->lo_queue->mq_freeze_depth == 0); 207 208 if (lo->use_dio == use_dio) 209 return; 210 211 /* flush dirty pages before starting to use direct I/O */ 212 if (use_dio) 213 vfs_fsync(lo->lo_backing_file, 0); 214 215 /* 216 * The flag of LO_FLAGS_DIRECT_IO is handled similarly with 217 * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup 218 * will get updated by ioctl(LOOP_GET_STATUS) 219 */ 220 lo->use_dio = use_dio; 221 if (use_dio) 222 lo->lo_flags |= LO_FLAGS_DIRECT_IO; 223 else 224 lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; 225 } 226 227 /** 228 * loop_set_size() - sets device size and notifies userspace 229 * @lo: struct loop_device to set the size for 230 * @size: new size of the loop device 231 * 232 * Callers must validate that the size passed into this function fits 
into 233 * a sector_t, eg using loop_validate_size() 234 */ 235 static void loop_set_size(struct loop_device *lo, loff_t size) 236 { 237 if (!set_capacity_and_notify(lo->lo_disk, size)) 238 kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); 239 } 240 241 static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) 242 { 243 struct iov_iter i; 244 ssize_t bw; 245 246 iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len); 247 248 bw = vfs_iter_write(file, &i, ppos, 0); 249 250 if (likely(bw == bvec->bv_len)) 251 return 0; 252 253 printk_ratelimited(KERN_ERR 254 "loop: Write error at byte offset %llu, length %i.\n", 255 (unsigned long long)*ppos, bvec->bv_len); 256 if (bw >= 0) 257 bw = -EIO; 258 return bw; 259 } 260 261 static int lo_write_simple(struct loop_device *lo, struct request *rq, 262 loff_t pos) 263 { 264 struct bio_vec bvec; 265 struct req_iterator iter; 266 int ret = 0; 267 268 rq_for_each_segment(bvec, rq, iter) { 269 ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); 270 if (ret < 0) 271 break; 272 cond_resched(); 273 } 274 275 return ret; 276 } 277 278 static int lo_read_simple(struct loop_device *lo, struct request *rq, 279 loff_t pos) 280 { 281 struct bio_vec bvec; 282 struct req_iterator iter; 283 struct iov_iter i; 284 ssize_t len; 285 286 rq_for_each_segment(bvec, rq, iter) { 287 iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len); 288 len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); 289 if (len < 0) 290 return len; 291 292 flush_dcache_page(bvec.bv_page); 293 294 if (len != bvec.bv_len) { 295 struct bio *bio; 296 297 __rq_for_each_bio(bio, rq) 298 zero_fill_bio(bio); 299 break; 300 } 301 cond_resched(); 302 } 303 304 return 0; 305 } 306 307 static void loop_clear_limits(struct loop_device *lo, int mode) 308 { 309 struct queue_limits lim = queue_limits_start_update(lo->lo_queue); 310 311 if (mode & FALLOC_FL_ZERO_RANGE) 312 lim.max_write_zeroes_sectors = 0; 313 314 if (mode & FALLOC_FL_PUNCH_HOLE) { 315 lim.max_hw_discard_sectors = 0; 316 lim.discard_granularity = 0; 317 } 318 319 /* 320 * XXX: this updates the queue limits without freezing the queue, which 321 * is against the locking protocol and dangerous. But we can't just 322 * freeze the queue as we're inside the ->queue_rq method here. So this 323 * should move out into a workqueue unless we get the file operations to 324 * advertise if they support specific fallocate operations. 325 */ 326 queue_limits_commit_update(lo->lo_queue, &lim); 327 } 328 329 static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, 330 int mode) 331 { 332 /* 333 * We use fallocate to manipulate the space mappings used by the image 334 * a.k.a. discard/zerorange. 335 */ 336 struct file *file = lo->lo_backing_file; 337 int ret; 338 339 mode |= FALLOC_FL_KEEP_SIZE; 340 341 if (!bdev_max_discard_sectors(lo->lo_device)) 342 return -EOPNOTSUPP; 343 344 ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); 345 if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) 346 return -EIO; 347 348 /* 349 * We initially configure the limits in a hope that fallocate is 350 * supported and clear them here if that turns out not to be true. 
351 */ 352 if (unlikely(ret == -EOPNOTSUPP)) 353 loop_clear_limits(lo, mode); 354 355 return ret; 356 } 357 358 static int lo_req_flush(struct loop_device *lo, struct request *rq) 359 { 360 int ret = vfs_fsync(lo->lo_backing_file, 0); 361 if (unlikely(ret && ret != -EINVAL)) 362 ret = -EIO; 363 364 return ret; 365 } 366 367 static void lo_complete_rq(struct request *rq) 368 { 369 struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); 370 blk_status_t ret = BLK_STS_OK; 371 372 if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || 373 req_op(rq) != REQ_OP_READ) { 374 if (cmd->ret < 0) 375 ret = errno_to_blk_status(cmd->ret); 376 goto end_io; 377 } 378 379 /* 380 * Short READ - if we got some data, advance our request and 381 * retry it. If we got no data, end the rest with EIO. 382 */ 383 if (cmd->ret) { 384 blk_update_request(rq, BLK_STS_OK, cmd->ret); 385 cmd->ret = 0; 386 blk_mq_requeue_request(rq, true); 387 } else { 388 if (cmd->use_aio) { 389 struct bio *bio = rq->bio; 390 391 while (bio) { 392 zero_fill_bio(bio); 393 bio = bio->bi_next; 394 } 395 } 396 ret = BLK_STS_IOERR; 397 end_io: 398 blk_mq_end_request(rq, ret); 399 } 400 } 401 402 static void lo_rw_aio_do_completion(struct loop_cmd *cmd) 403 { 404 struct request *rq = blk_mq_rq_from_pdu(cmd); 405 406 if (!atomic_dec_and_test(&cmd->ref)) 407 return; 408 kfree(cmd->bvec); 409 cmd->bvec = NULL; 410 if (likely(!blk_should_fake_timeout(rq->q))) 411 blk_mq_complete_request(rq); 412 } 413 414 static void lo_rw_aio_complete(struct kiocb *iocb, long ret) 415 { 416 struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); 417 418 cmd->ret = ret; 419 lo_rw_aio_do_completion(cmd); 420 } 421 422 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, 423 loff_t pos, int rw) 424 { 425 struct iov_iter iter; 426 struct req_iterator rq_iter; 427 struct bio_vec *bvec; 428 struct request *rq = blk_mq_rq_from_pdu(cmd); 429 struct bio *bio = rq->bio; 430 struct file *file = lo->lo_backing_file; 431 struct bio_vec tmp; 432 unsigned int offset; 433 int nr_bvec = 0; 434 int ret; 435 436 rq_for_each_bvec(tmp, rq, rq_iter) 437 nr_bvec++; 438 439 if (rq->bio != rq->biotail) { 440 441 bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), 442 GFP_NOIO); 443 if (!bvec) 444 return -EIO; 445 cmd->bvec = bvec; 446 447 /* 448 * The bios of the request may be started from the middle of 449 * the 'bvec' because of bio splitting, so we can't directly 450 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec 451 * API will take care of all details for us. 
452 */ 453 rq_for_each_bvec(tmp, rq, rq_iter) { 454 *bvec = tmp; 455 bvec++; 456 } 457 bvec = cmd->bvec; 458 offset = 0; 459 } else { 460 /* 461 * Same here, this bio may be started from the middle of the 462 * 'bvec' because of bio splitting, so offset from the bvec 463 * must be passed to iov iterator 464 */ 465 offset = bio->bi_iter.bi_bvec_done; 466 bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); 467 } 468 atomic_set(&cmd->ref, 2); 469 470 iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); 471 iter.iov_offset = offset; 472 473 cmd->iocb.ki_pos = pos; 474 cmd->iocb.ki_filp = file; 475 cmd->iocb.ki_complete = lo_rw_aio_complete; 476 cmd->iocb.ki_flags = IOCB_DIRECT; 477 cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); 478 479 if (rw == ITER_SOURCE) 480 ret = file->f_op->write_iter(&cmd->iocb, &iter); 481 else 482 ret = file->f_op->read_iter(&cmd->iocb, &iter); 483 484 lo_rw_aio_do_completion(cmd); 485 486 if (ret != -EIOCBQUEUED) 487 lo_rw_aio_complete(&cmd->iocb, ret); 488 return 0; 489 } 490 491 static int do_req_filebacked(struct loop_device *lo, struct request *rq) 492 { 493 struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); 494 loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; 495 496 /* 497 * lo_write_simple and lo_read_simple should have been covered 498 * by io submit style function like lo_rw_aio(), one blocker 499 * is that lo_read_simple() need to call flush_dcache_page after 500 * the page is written from kernel, and it isn't easy to handle 501 * this in io submit style function which submits all segments 502 * of the req at one time. And direct read IO doesn't need to 503 * run flush_dcache_page(). 504 */ 505 switch (req_op(rq)) { 506 case REQ_OP_FLUSH: 507 return lo_req_flush(lo, rq); 508 case REQ_OP_WRITE_ZEROES: 509 /* 510 * If the caller doesn't want deallocation, call zeroout to 511 * write zeroes the range. Otherwise, punch them out. 512 */ 513 return lo_fallocate(lo, rq, pos, 514 (rq->cmd_flags & REQ_NOUNMAP) ? 
515 FALLOC_FL_ZERO_RANGE : 516 FALLOC_FL_PUNCH_HOLE); 517 case REQ_OP_DISCARD: 518 return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); 519 case REQ_OP_WRITE: 520 if (cmd->use_aio) 521 return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); 522 else 523 return lo_write_simple(lo, rq, pos); 524 case REQ_OP_READ: 525 if (cmd->use_aio) 526 return lo_rw_aio(lo, cmd, pos, ITER_DEST); 527 else 528 return lo_read_simple(lo, rq, pos); 529 default: 530 WARN_ON_ONCE(1); 531 return -EIO; 532 } 533 } 534 535 static void loop_reread_partitions(struct loop_device *lo) 536 { 537 int rc; 538 539 mutex_lock(&lo->lo_disk->open_mutex); 540 rc = bdev_disk_changed(lo->lo_disk, false); 541 mutex_unlock(&lo->lo_disk->open_mutex); 542 if (rc) 543 pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", 544 __func__, lo->lo_number, lo->lo_file_name, rc); 545 } 546 547 static inline int is_loop_device(struct file *file) 548 { 549 struct inode *i = file->f_mapping->host; 550 551 return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR; 552 } 553 554 static int loop_validate_file(struct file *file, struct block_device *bdev) 555 { 556 struct inode *inode = file->f_mapping->host; 557 struct file *f = file; 558 559 /* Avoid recursion */ 560 while (is_loop_device(f)) { 561 struct loop_device *l; 562 563 lockdep_assert_held(&loop_validate_mutex); 564 if (f->f_mapping->host->i_rdev == bdev->bd_dev) 565 return -EBADF; 566 567 l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; 568 if (l->lo_state != Lo_bound) 569 return -EINVAL; 570 /* Order wrt setting lo->lo_backing_file in loop_configure(). */ 571 rmb(); 572 f = l->lo_backing_file; 573 } 574 if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) 575 return -EINVAL; 576 return 0; 577 } 578 579 /* 580 * loop_change_fd switched the backing store of a loopback device to 581 * a new file. This is useful for operating system installers to free up 582 * the original file and in High Availability environments to switch to 583 * an alternative location for the content in case of server meltdown. 584 * This can only work if the loop device is used read-only, and if the 585 * new backing store is the same size and type as the old backing store. 586 */ 587 static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, 588 unsigned int arg) 589 { 590 struct file *file = fget(arg); 591 struct file *old_file; 592 int error; 593 bool partscan; 594 bool is_loop; 595 596 if (!file) 597 return -EBADF; 598 599 /* suppress uevents while reconfiguring the device */ 600 dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); 601 602 is_loop = is_loop_device(file); 603 error = loop_global_lock_killable(lo, is_loop); 604 if (error) 605 goto out_putf; 606 error = -ENXIO; 607 if (lo->lo_state != Lo_bound) 608 goto out_err; 609 610 /* the loop device has to be read-only */ 611 error = -EINVAL; 612 if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) 613 goto out_err; 614 615 error = loop_validate_file(file, bdev); 616 if (error) 617 goto out_err; 618 619 old_file = lo->lo_backing_file; 620 621 error = -EINVAL; 622 623 /* size of the new backing store needs to be the same */ 624 if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) 625 goto out_err; 626 627 /* and ... 
switch */ 628 disk_force_media_change(lo->lo_disk); 629 blk_mq_freeze_queue(lo->lo_queue); 630 mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); 631 lo->lo_backing_file = file; 632 lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); 633 mapping_set_gfp_mask(file->f_mapping, 634 lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); 635 loop_update_dio(lo); 636 blk_mq_unfreeze_queue(lo->lo_queue); 637 partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; 638 loop_global_unlock(lo, is_loop); 639 640 /* 641 * Flush loop_validate_file() before fput(), for l->lo_backing_file 642 * might be pointing at old_file which might be the last reference. 643 */ 644 if (!is_loop) { 645 mutex_lock(&loop_validate_mutex); 646 mutex_unlock(&loop_validate_mutex); 647 } 648 /* 649 * We must drop file reference outside of lo_mutex as dropping 650 * the file ref can take open_mutex which creates circular locking 651 * dependency. 652 */ 653 fput(old_file); 654 if (partscan) 655 loop_reread_partitions(lo); 656 657 error = 0; 658 done: 659 /* enable and uncork uevent now that we are done */ 660 dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); 661 return error; 662 663 out_err: 664 loop_global_unlock(lo, is_loop); 665 out_putf: 666 fput(file); 667 goto done; 668 } 669 670 /* loop sysfs attributes */ 671 672 static ssize_t loop_attr_show(struct device *dev, char *page, 673 ssize_t (*callback)(struct loop_device *, char *)) 674 { 675 struct gendisk *disk = dev_to_disk(dev); 676 struct loop_device *lo = disk->private_data; 677 678 return callback(lo, page); 679 } 680 681 #define LOOP_ATTR_RO(_name) \ 682 static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \ 683 static ssize_t loop_attr_do_show_##_name(struct device *d, \ 684 struct device_attribute *attr, char *b) \ 685 { \ 686 return loop_attr_show(d, b, loop_attr_##_name##_show); \ 687 } \ 688 static struct device_attribute loop_attr_##_name = \ 689 __ATTR(_name, 0444, loop_attr_do_show_##_name, NULL); 690 691 static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) 692 { 693 ssize_t ret; 694 char *p = NULL; 695 696 spin_lock_irq(&lo->lo_lock); 697 if (lo->lo_backing_file) 698 p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); 699 spin_unlock_irq(&lo->lo_lock); 700 701 if (IS_ERR_OR_NULL(p)) 702 ret = PTR_ERR(p); 703 else { 704 ret = strlen(p); 705 memmove(buf, p, ret); 706 buf[ret++] = '\n'; 707 buf[ret] = 0; 708 } 709 710 return ret; 711 } 712 713 static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) 714 { 715 return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); 716 } 717 718 static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) 719 { 720 return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); 721 } 722 723 static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) 724 { 725 int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); 726 727 return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); 728 } 729 730 static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) 731 { 732 int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); 733 734 return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); 735 } 736 737 static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) 738 { 739 int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); 740 741 return sysfs_emit(buf, "%s\n", dio ? 
"1" : "0"); 742 } 743 744 LOOP_ATTR_RO(backing_file); 745 LOOP_ATTR_RO(offset); 746 LOOP_ATTR_RO(sizelimit); 747 LOOP_ATTR_RO(autoclear); 748 LOOP_ATTR_RO(partscan); 749 LOOP_ATTR_RO(dio); 750 751 static struct attribute *loop_attrs[] = { 752 &loop_attr_backing_file.attr, 753 &loop_attr_offset.attr, 754 &loop_attr_sizelimit.attr, 755 &loop_attr_autoclear.attr, 756 &loop_attr_partscan.attr, 757 &loop_attr_dio.attr, 758 NULL, 759 }; 760 761 static struct attribute_group loop_attribute_group = { 762 .name = "loop", 763 .attrs= loop_attrs, 764 }; 765 766 static void loop_sysfs_init(struct loop_device *lo) 767 { 768 lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, 769 &loop_attribute_group); 770 } 771 772 static void loop_sysfs_exit(struct loop_device *lo) 773 { 774 if (lo->sysfs_inited) 775 sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, 776 &loop_attribute_group); 777 } 778 779 static void loop_get_discard_config(struct loop_device *lo, 780 u32 *granularity, u32 *max_discard_sectors) 781 { 782 struct file *file = lo->lo_backing_file; 783 struct inode *inode = file->f_mapping->host; 784 struct kstatfs sbuf; 785 786 /* 787 * If the backing device is a block device, mirror its zeroing 788 * capability. Set the discard sectors to the block device's zeroing 789 * capabilities because loop discards result in blkdev_issue_zeroout(), 790 * not blkdev_issue_discard(). This maintains consistent behavior with 791 * file-backed loop devices: discarded regions read back as zero. 792 */ 793 if (S_ISBLK(inode->i_mode)) { 794 struct block_device *bdev = I_BDEV(inode); 795 796 *max_discard_sectors = bdev_write_zeroes_sectors(bdev); 797 *granularity = bdev_discard_granularity(bdev); 798 799 /* 800 * We use punch hole to reclaim the free space used by the 801 * image a.k.a. discard. 
802 */ 803 } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { 804 *max_discard_sectors = UINT_MAX >> 9; 805 *granularity = sbuf.f_bsize; 806 } 807 } 808 809 struct loop_worker { 810 struct rb_node rb_node; 811 struct work_struct work; 812 struct list_head cmd_list; 813 struct list_head idle_list; 814 struct loop_device *lo; 815 struct cgroup_subsys_state *blkcg_css; 816 unsigned long last_ran_at; 817 }; 818 819 static void loop_workfn(struct work_struct *work); 820 821 #ifdef CONFIG_BLK_CGROUP 822 static inline int queue_on_root_worker(struct cgroup_subsys_state *css) 823 { 824 return !css || css == blkcg_root_css; 825 } 826 #else 827 static inline int queue_on_root_worker(struct cgroup_subsys_state *css) 828 { 829 return !css; 830 } 831 #endif 832 833 static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) 834 { 835 struct rb_node **node, *parent = NULL; 836 struct loop_worker *cur_worker, *worker = NULL; 837 struct work_struct *work; 838 struct list_head *cmd_list; 839 840 spin_lock_irq(&lo->lo_work_lock); 841 842 if (queue_on_root_worker(cmd->blkcg_css)) 843 goto queue_work; 844 845 node = &lo->worker_tree.rb_node; 846 847 while (*node) { 848 parent = *node; 849 cur_worker = container_of(*node, struct loop_worker, rb_node); 850 if (cur_worker->blkcg_css == cmd->blkcg_css) { 851 worker = cur_worker; 852 break; 853 } else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) { 854 node = &(*node)->rb_left; 855 } else { 856 node = &(*node)->rb_right; 857 } 858 } 859 if (worker) 860 goto queue_work; 861 862 worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN); 863 /* 864 * In the event we cannot allocate a worker, just queue on the 865 * rootcg worker and issue the I/O as the rootcg 866 */ 867 if (!worker) { 868 cmd->blkcg_css = NULL; 869 if (cmd->memcg_css) 870 css_put(cmd->memcg_css); 871 cmd->memcg_css = NULL; 872 goto queue_work; 873 } 874 875 worker->blkcg_css = cmd->blkcg_css; 876 css_get(worker->blkcg_css); 877 INIT_WORK(&worker->work, loop_workfn); 878 INIT_LIST_HEAD(&worker->cmd_list); 879 INIT_LIST_HEAD(&worker->idle_list); 880 worker->lo = lo; 881 rb_link_node(&worker->rb_node, parent, node); 882 rb_insert_color(&worker->rb_node, &lo->worker_tree); 883 queue_work: 884 if (worker) { 885 /* 886 * We need to remove from the idle list here while 887 * holding the lock so that the idle timer doesn't 888 * free the worker 889 */ 890 if (!list_empty(&worker->idle_list)) 891 list_del_init(&worker->idle_list); 892 work = &worker->work; 893 cmd_list = &worker->cmd_list; 894 } else { 895 work = &lo->rootcg_work; 896 cmd_list = &lo->rootcg_cmd_list; 897 } 898 list_add_tail(&cmd->list_entry, cmd_list); 899 queue_work(lo->workqueue, work); 900 spin_unlock_irq(&lo->lo_work_lock); 901 } 902 903 static void loop_set_timer(struct loop_device *lo) 904 { 905 timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); 906 } 907 908 static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) 909 { 910 struct loop_worker *pos, *worker; 911 912 spin_lock_irq(&lo->lo_work_lock); 913 list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, 914 idle_list) { 915 if (!delete_all && 916 time_is_after_jiffies(worker->last_ran_at + 917 LOOP_IDLE_WORKER_TIMEOUT)) 918 break; 919 list_del(&worker->idle_list); 920 rb_erase(&worker->rb_node, &lo->worker_tree); 921 css_put(worker->blkcg_css); 922 kfree(worker); 923 } 924 if (!list_empty(&lo->idle_worker_list)) 925 loop_set_timer(lo); 926 spin_unlock_irq(&lo->lo_work_lock); 927 } 928 
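/*
 * Roughly, the dispatch path for a queued command, using the helpers above
 * and the blk-mq hooks further down in this file:
 *
 *	loop_queue_rq()
 *	  loop_queue_work()		find or create the per-blkcg worker,
 *					or fall back to lo->rootcg_work
 *	    queue_work(lo->workqueue, work)
 *	      loop_workfn() / loop_rootcg_workfn()
 *	        loop_process_work()
 *	          loop_handle_cmd()
 *	            do_req_filebacked()
 *
 * Workers that drain their command list park themselves on
 * lo->idle_worker_list and are reaped by the timer below once they have
 * been idle for LOOP_IDLE_WORKER_TIMEOUT.
 */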
929 static void loop_free_idle_workers_timer(struct timer_list *timer) 930 { 931 struct loop_device *lo = container_of(timer, struct loop_device, timer); 932 933 return loop_free_idle_workers(lo, false); 934 } 935 936 /** 937 * loop_set_status_from_info - configure device from loop_info 938 * @lo: struct loop_device to configure 939 * @info: struct loop_info64 to configure the device with 940 * 941 * Configures the loop device parameters according to the passed 942 * in loop_info64 configuration. 943 */ 944 static int 945 loop_set_status_from_info(struct loop_device *lo, 946 const struct loop_info64 *info) 947 { 948 if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) 949 return -EINVAL; 950 951 switch (info->lo_encrypt_type) { 952 case LO_CRYPT_NONE: 953 break; 954 case LO_CRYPT_XOR: 955 pr_warn("support for the xor transformation has been removed.\n"); 956 return -EINVAL; 957 case LO_CRYPT_CRYPTOAPI: 958 pr_warn("support for cryptoloop has been removed. Use dm-crypt instead.\n"); 959 return -EINVAL; 960 default: 961 return -EINVAL; 962 } 963 964 /* Avoid assigning overflow values */ 965 if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX) 966 return -EOVERFLOW; 967 968 lo->lo_offset = info->lo_offset; 969 lo->lo_sizelimit = info->lo_sizelimit; 970 971 memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); 972 lo->lo_file_name[LO_NAME_SIZE-1] = 0; 973 return 0; 974 } 975 976 static unsigned int loop_default_blocksize(struct loop_device *lo, 977 struct block_device *backing_bdev) 978 { 979 /* In case of direct I/O, match underlying block size */ 980 if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev) 981 return bdev_logical_block_size(backing_bdev); 982 return SECTOR_SIZE; 983 } 984 985 static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, 986 unsigned int bsize) 987 { 988 struct file *file = lo->lo_backing_file; 989 struct inode *inode = file->f_mapping->host; 990 struct block_device *backing_bdev = NULL; 991 u32 granularity = 0, max_discard_sectors = 0; 992 993 if (S_ISBLK(inode->i_mode)) 994 backing_bdev = I_BDEV(inode); 995 else if (inode->i_sb->s_bdev) 996 backing_bdev = inode->i_sb->s_bdev; 997 998 if (!bsize) 999 bsize = loop_default_blocksize(lo, backing_bdev); 1000 1001 loop_get_discard_config(lo, &granularity, &max_discard_sectors); 1002 1003 lim->logical_block_size = bsize; 1004 lim->physical_block_size = bsize; 1005 lim->io_min = bsize; 1006 lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); 1007 if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) 1008 lim->features |= BLK_FEAT_WRITE_CACHE; 1009 if (backing_bdev && !bdev_nonrot(backing_bdev)) 1010 lim->features |= BLK_FEAT_ROTATIONAL; 1011 lim->max_hw_discard_sectors = max_discard_sectors; 1012 lim->max_write_zeroes_sectors = max_discard_sectors; 1013 if (max_discard_sectors) 1014 lim->discard_granularity = granularity; 1015 else 1016 lim->discard_granularity = 0; 1017 } 1018 1019 static int loop_configure(struct loop_device *lo, blk_mode_t mode, 1020 struct block_device *bdev, 1021 const struct loop_config *config) 1022 { 1023 struct file *file = fget(config->fd); 1024 struct address_space *mapping; 1025 struct queue_limits lim; 1026 int error; 1027 loff_t size; 1028 bool partscan; 1029 bool is_loop; 1030 1031 if (!file) 1032 return -EBADF; 1033 is_loop = is_loop_device(file); 1034 1035 /* This is safe, since we have a reference from open(). 
*/ 1036 __module_get(THIS_MODULE); 1037 1038 /* 1039 * If we don't hold exclusive handle for the device, upgrade to it 1040 * here to avoid changing device under exclusive owner. 1041 */ 1042 if (!(mode & BLK_OPEN_EXCL)) { 1043 error = bd_prepare_to_claim(bdev, loop_configure, NULL); 1044 if (error) 1045 goto out_putf; 1046 } 1047 1048 error = loop_global_lock_killable(lo, is_loop); 1049 if (error) 1050 goto out_bdev; 1051 1052 error = -EBUSY; 1053 if (lo->lo_state != Lo_unbound) 1054 goto out_unlock; 1055 1056 error = loop_validate_file(file, bdev); 1057 if (error) 1058 goto out_unlock; 1059 1060 mapping = file->f_mapping; 1061 1062 if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { 1063 error = -EINVAL; 1064 goto out_unlock; 1065 } 1066 1067 error = loop_set_status_from_info(lo, &config->info); 1068 if (error) 1069 goto out_unlock; 1070 lo->lo_flags = config->info.lo_flags; 1071 1072 if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || 1073 !file->f_op->write_iter) 1074 lo->lo_flags |= LO_FLAGS_READ_ONLY; 1075 1076 if (!lo->workqueue) { 1077 lo->workqueue = alloc_workqueue("loop%d", 1078 WQ_UNBOUND | WQ_FREEZABLE, 1079 0, lo->lo_number); 1080 if (!lo->workqueue) { 1081 error = -ENOMEM; 1082 goto out_unlock; 1083 } 1084 } 1085 1086 /* suppress uevents while reconfiguring the device */ 1087 dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); 1088 1089 disk_force_media_change(lo->lo_disk); 1090 set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); 1091 1092 lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; 1093 lo->lo_device = bdev; 1094 lo->lo_backing_file = file; 1095 lo->old_gfp_mask = mapping_gfp_mask(mapping); 1096 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); 1097 1098 lim = queue_limits_start_update(lo->lo_queue); 1099 loop_update_limits(lo, &lim, config->block_size); 1100 /* No need to freeze the queue as the device isn't bound yet. */ 1101 error = queue_limits_commit_update(lo->lo_queue, &lim); 1102 if (error) 1103 goto out_unlock; 1104 1105 loop_update_dio(lo); 1106 loop_sysfs_init(lo); 1107 1108 size = get_loop_size(lo, file); 1109 loop_set_size(lo, size); 1110 1111 /* Order wrt reading lo_state in loop_validate_file(). */ 1112 wmb(); 1113 1114 lo->lo_state = Lo_bound; 1115 if (part_shift) 1116 lo->lo_flags |= LO_FLAGS_PARTSCAN; 1117 partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; 1118 if (partscan) 1119 clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); 1120 1121 /* enable and uncork uevent now that we are done */ 1122 dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); 1123 1124 loop_global_unlock(lo, is_loop); 1125 if (partscan) 1126 loop_reread_partitions(lo); 1127 1128 if (!(mode & BLK_OPEN_EXCL)) 1129 bd_abort_claiming(bdev, loop_configure); 1130 1131 return 0; 1132 1133 out_unlock: 1134 loop_global_unlock(lo, is_loop); 1135 out_bdev: 1136 if (!(mode & BLK_OPEN_EXCL)) 1137 bd_abort_claiming(bdev, loop_configure); 1138 out_putf: 1139 fput(file); 1140 /* This is safe: open() is still holding a reference. 
 */
	module_put(THIS_MODULE);
	return error;
}

static void __loop_clr_fd(struct loop_device *lo)
{
	struct queue_limits lim;
	struct file *filp;
	gfp_t gfp = lo->old_gfp_mask;

	spin_lock_irq(&lo->lo_lock);
	filp = lo->lo_backing_file;
	lo->lo_backing_file = NULL;
	spin_unlock_irq(&lo->lo_lock);

	lo->lo_device = NULL;
	lo->lo_offset = 0;
	lo->lo_sizelimit = 0;
	memset(lo->lo_file_name, 0, LO_NAME_SIZE);

	/*
	 * Reset the block size to the default.
	 *
	 * No queue freezing needed because this is called from the final
	 * ->release call only, so there can't be any outstanding I/O.
	 */
	lim = queue_limits_start_update(lo->lo_queue);
	lim.logical_block_size = SECTOR_SIZE;
	lim.physical_block_size = SECTOR_SIZE;
	lim.io_min = SECTOR_SIZE;
	queue_limits_commit_update(lo->lo_queue, &lim);

	invalidate_disk(lo->lo_disk);
	loop_sysfs_exit(lo);
	/* let user-space know about this change */
	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
	mapping_set_gfp_mask(filp->f_mapping, gfp);
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);

	disk_force_media_change(lo->lo_disk);

	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
		int err;

		/*
		 * open_mutex has been held already in release path, so don't
		 * acquire it if this function is called in such case.
		 *
		 * If the reread partition isn't from release path, lo_refcnt
		 * must be at least one and it can only become zero when the
		 * current holder is released.
		 */
		err = bdev_disk_changed(lo->lo_disk, false);
		if (err)
			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
				__func__, lo->lo_number, err);
		/* Device is gone, no point in returning error */
	}

	/*
	 * lo->lo_state is set to Lo_unbound here after above partscan has
	 * finished. There cannot be anybody else entering __loop_clr_fd() as
	 * Lo_rundown state protects us from all the other places trying to
	 * change the 'lo' device.
	 */
	lo->lo_flags = 0;
	if (!part_shift)
		set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
	mutex_lock(&lo->lo_mutex);
	lo->lo_state = Lo_unbound;
	mutex_unlock(&lo->lo_mutex);

	/*
	 * Need not hold lo_mutex to fput backing file. Calling fput holding
	 * lo_mutex triggers a circular lock dependency possibility warning as
	 * fput can take open_mutex which is usually taken before lo_mutex.
	 */
	fput(filp);
}

static int loop_clr_fd(struct loop_device *lo)
{
	int err;

	/*
	 * Since lo_ioctl() is called without locks held, it is possible that
	 * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel.
	 *
	 * Therefore, use global lock when setting Lo_rundown state in order to
	 * make sure that loop_validate_file() will fail if the "struct file"
	 * which loop_configure()/loop_change_fd() found via fget() was this
	 * loop device.
	 */
	err = loop_global_lock_killable(lo, true);
	if (err)
		return err;
	if (lo->lo_state != Lo_bound) {
		loop_global_unlock(lo, true);
		return -ENXIO;
	}
	/*
	 * Mark the device for removing the backing device on last close.
	 * If we are the only opener, also switch the state to rundown here to
	 * prevent new openers from coming in.
1246 */ 1247 1248 lo->lo_flags |= LO_FLAGS_AUTOCLEAR; 1249 if (disk_openers(lo->lo_disk) == 1) 1250 lo->lo_state = Lo_rundown; 1251 loop_global_unlock(lo, true); 1252 1253 return 0; 1254 } 1255 1256 static int 1257 loop_set_status(struct loop_device *lo, const struct loop_info64 *info) 1258 { 1259 int err; 1260 bool partscan = false; 1261 bool size_changed = false; 1262 1263 err = mutex_lock_killable(&lo->lo_mutex); 1264 if (err) 1265 return err; 1266 if (lo->lo_state != Lo_bound) { 1267 err = -ENXIO; 1268 goto out_unlock; 1269 } 1270 1271 if (lo->lo_offset != info->lo_offset || 1272 lo->lo_sizelimit != info->lo_sizelimit) { 1273 size_changed = true; 1274 sync_blockdev(lo->lo_device); 1275 invalidate_bdev(lo->lo_device); 1276 } 1277 1278 /* I/O needs to be drained before changing lo_offset or lo_sizelimit */ 1279 blk_mq_freeze_queue(lo->lo_queue); 1280 1281 err = loop_set_status_from_info(lo, info); 1282 if (err) 1283 goto out_unfreeze; 1284 1285 partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) && 1286 (info->lo_flags & LO_FLAGS_PARTSCAN); 1287 1288 lo->lo_flags &= ~(LOOP_SET_STATUS_SETTABLE_FLAGS | 1289 LOOP_SET_STATUS_CLEARABLE_FLAGS); 1290 lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); 1291 1292 if (size_changed) { 1293 loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, 1294 lo->lo_backing_file); 1295 loop_set_size(lo, new_size); 1296 } 1297 1298 /* update the direct I/O flag if lo_offset changed */ 1299 loop_update_dio(lo); 1300 1301 out_unfreeze: 1302 blk_mq_unfreeze_queue(lo->lo_queue); 1303 if (partscan) 1304 clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); 1305 out_unlock: 1306 mutex_unlock(&lo->lo_mutex); 1307 if (partscan) 1308 loop_reread_partitions(lo); 1309 1310 return err; 1311 } 1312 1313 static int 1314 loop_get_status(struct loop_device *lo, struct loop_info64 *info) 1315 { 1316 struct path path; 1317 struct kstat stat; 1318 int ret; 1319 1320 ret = mutex_lock_killable(&lo->lo_mutex); 1321 if (ret) 1322 return ret; 1323 if (lo->lo_state != Lo_bound) { 1324 mutex_unlock(&lo->lo_mutex); 1325 return -ENXIO; 1326 } 1327 1328 memset(info, 0, sizeof(*info)); 1329 info->lo_number = lo->lo_number; 1330 info->lo_offset = lo->lo_offset; 1331 info->lo_sizelimit = lo->lo_sizelimit; 1332 info->lo_flags = lo->lo_flags; 1333 memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); 1334 1335 /* Drop lo_mutex while we call into the filesystem. 
*/ 1336 path = lo->lo_backing_file->f_path; 1337 path_get(&path); 1338 mutex_unlock(&lo->lo_mutex); 1339 ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); 1340 if (!ret) { 1341 info->lo_device = huge_encode_dev(stat.dev); 1342 info->lo_inode = stat.ino; 1343 info->lo_rdevice = huge_encode_dev(stat.rdev); 1344 } 1345 path_put(&path); 1346 return ret; 1347 } 1348 1349 static void 1350 loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) 1351 { 1352 memset(info64, 0, sizeof(*info64)); 1353 info64->lo_number = info->lo_number; 1354 info64->lo_device = info->lo_device; 1355 info64->lo_inode = info->lo_inode; 1356 info64->lo_rdevice = info->lo_rdevice; 1357 info64->lo_offset = info->lo_offset; 1358 info64->lo_sizelimit = 0; 1359 info64->lo_flags = info->lo_flags; 1360 memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); 1361 } 1362 1363 static int 1364 loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) 1365 { 1366 memset(info, 0, sizeof(*info)); 1367 info->lo_number = info64->lo_number; 1368 info->lo_device = info64->lo_device; 1369 info->lo_inode = info64->lo_inode; 1370 info->lo_rdevice = info64->lo_rdevice; 1371 info->lo_offset = info64->lo_offset; 1372 info->lo_flags = info64->lo_flags; 1373 memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); 1374 1375 /* error in case values were truncated */ 1376 if (info->lo_device != info64->lo_device || 1377 info->lo_rdevice != info64->lo_rdevice || 1378 info->lo_inode != info64->lo_inode || 1379 info->lo_offset != info64->lo_offset) 1380 return -EOVERFLOW; 1381 1382 return 0; 1383 } 1384 1385 static int 1386 loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) 1387 { 1388 struct loop_info info; 1389 struct loop_info64 info64; 1390 1391 if (copy_from_user(&info, arg, sizeof (struct loop_info))) 1392 return -EFAULT; 1393 loop_info64_from_old(&info, &info64); 1394 return loop_set_status(lo, &info64); 1395 } 1396 1397 static int 1398 loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) 1399 { 1400 struct loop_info64 info64; 1401 1402 if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) 1403 return -EFAULT; 1404 return loop_set_status(lo, &info64); 1405 } 1406 1407 static int 1408 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { 1409 struct loop_info info; 1410 struct loop_info64 info64; 1411 int err; 1412 1413 if (!arg) 1414 return -EINVAL; 1415 err = loop_get_status(lo, &info64); 1416 if (!err) 1417 err = loop_info64_to_old(&info64, &info); 1418 if (!err && copy_to_user(arg, &info, sizeof(info))) 1419 err = -EFAULT; 1420 1421 return err; 1422 } 1423 1424 static int 1425 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { 1426 struct loop_info64 info64; 1427 int err; 1428 1429 if (!arg) 1430 return -EINVAL; 1431 err = loop_get_status(lo, &info64); 1432 if (!err && copy_to_user(arg, &info64, sizeof(info64))) 1433 err = -EFAULT; 1434 1435 return err; 1436 } 1437 1438 static int loop_set_capacity(struct loop_device *lo) 1439 { 1440 loff_t size; 1441 1442 if (unlikely(lo->lo_state != Lo_bound)) 1443 return -ENXIO; 1444 1445 size = get_loop_size(lo, lo->lo_backing_file); 1446 loop_set_size(lo, size); 1447 1448 return 0; 1449 } 1450 1451 static int loop_set_dio(struct loop_device *lo, unsigned long arg) 1452 { 1453 bool use_dio = !!arg; 1454 1455 if (lo->lo_state != Lo_bound) 1456 return -ENXIO; 1457 if (use_dio == lo->use_dio) 1458 return 0; 1459 1460 if (use_dio) { 
		if (!lo_can_use_dio(lo))
			return -EINVAL;
		/* flush dirty pages before starting to use direct I/O */
		vfs_fsync(lo->lo_backing_file, 0);
	}

	blk_mq_freeze_queue(lo->lo_queue);
	lo->use_dio = use_dio;
	if (use_dio)
		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
	else
		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
	blk_mq_unfreeze_queue(lo->lo_queue);
	return 0;
}

static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
{
	struct queue_limits lim;
	int err = 0;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;

	if (lo->lo_queue->limits.logical_block_size == arg)
		return 0;

	sync_blockdev(lo->lo_device);
	invalidate_bdev(lo->lo_device);

	lim = queue_limits_start_update(lo->lo_queue);
	loop_update_limits(lo, &lim, arg);

	blk_mq_freeze_queue(lo->lo_queue);
	err = queue_limits_commit_update(lo->lo_queue, &lim);
	loop_update_dio(lo);
	blk_mq_unfreeze_queue(lo->lo_queue);

	return err;
}

static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
			   unsigned long arg)
{
	int err;

	err = mutex_lock_killable(&lo->lo_mutex);
	if (err)
		return err;
	switch (cmd) {
	case LOOP_SET_CAPACITY:
		err = loop_set_capacity(lo);
		break;
	case LOOP_SET_DIRECT_IO:
		err = loop_set_dio(lo, arg);
		break;
	case LOOP_SET_BLOCK_SIZE:
		err = loop_set_block_size(lo, arg);
		break;
	default:
		err = -EINVAL;
	}
	mutex_unlock(&lo->lo_mutex);
	return err;
}

static int lo_ioctl(struct block_device *bdev, blk_mode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	struct loop_device *lo = bdev->bd_disk->private_data;
	void __user *argp = (void __user *) arg;
	int err;

	switch (cmd) {
	case LOOP_SET_FD: {
		/*
		 * Legacy case - pass in a zeroed out struct loop_config with
		 * only the file descriptor set, which corresponds with the
		 * default parameters we'd have used otherwise.
1540 */ 1541 struct loop_config config; 1542 1543 memset(&config, 0, sizeof(config)); 1544 config.fd = arg; 1545 1546 return loop_configure(lo, mode, bdev, &config); 1547 } 1548 case LOOP_CONFIGURE: { 1549 struct loop_config config; 1550 1551 if (copy_from_user(&config, argp, sizeof(config))) 1552 return -EFAULT; 1553 1554 return loop_configure(lo, mode, bdev, &config); 1555 } 1556 case LOOP_CHANGE_FD: 1557 return loop_change_fd(lo, bdev, arg); 1558 case LOOP_CLR_FD: 1559 return loop_clr_fd(lo); 1560 case LOOP_SET_STATUS: 1561 err = -EPERM; 1562 if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) 1563 err = loop_set_status_old(lo, argp); 1564 break; 1565 case LOOP_GET_STATUS: 1566 return loop_get_status_old(lo, argp); 1567 case LOOP_SET_STATUS64: 1568 err = -EPERM; 1569 if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) 1570 err = loop_set_status64(lo, argp); 1571 break; 1572 case LOOP_GET_STATUS64: 1573 return loop_get_status64(lo, argp); 1574 case LOOP_SET_CAPACITY: 1575 case LOOP_SET_DIRECT_IO: 1576 case LOOP_SET_BLOCK_SIZE: 1577 if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) 1578 return -EPERM; 1579 fallthrough; 1580 default: 1581 err = lo_simple_ioctl(lo, cmd, arg); 1582 break; 1583 } 1584 1585 return err; 1586 } 1587 1588 #ifdef CONFIG_COMPAT 1589 struct compat_loop_info { 1590 compat_int_t lo_number; /* ioctl r/o */ 1591 compat_dev_t lo_device; /* ioctl r/o */ 1592 compat_ulong_t lo_inode; /* ioctl r/o */ 1593 compat_dev_t lo_rdevice; /* ioctl r/o */ 1594 compat_int_t lo_offset; 1595 compat_int_t lo_encrypt_type; /* obsolete, ignored */ 1596 compat_int_t lo_encrypt_key_size; /* ioctl w/o */ 1597 compat_int_t lo_flags; /* ioctl r/o */ 1598 char lo_name[LO_NAME_SIZE]; 1599 unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ 1600 compat_ulong_t lo_init[2]; 1601 char reserved[4]; 1602 }; 1603 1604 /* 1605 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info 1606 * - noinlined to reduce stack space usage in main part of driver 1607 */ 1608 static noinline int 1609 loop_info64_from_compat(const struct compat_loop_info __user *arg, 1610 struct loop_info64 *info64) 1611 { 1612 struct compat_loop_info info; 1613 1614 if (copy_from_user(&info, arg, sizeof(info))) 1615 return -EFAULT; 1616 1617 memset(info64, 0, sizeof(*info64)); 1618 info64->lo_number = info.lo_number; 1619 info64->lo_device = info.lo_device; 1620 info64->lo_inode = info.lo_inode; 1621 info64->lo_rdevice = info.lo_rdevice; 1622 info64->lo_offset = info.lo_offset; 1623 info64->lo_sizelimit = 0; 1624 info64->lo_flags = info.lo_flags; 1625 memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); 1626 return 0; 1627 } 1628 1629 /* 1630 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace 1631 * - noinlined to reduce stack space usage in main part of driver 1632 */ 1633 static noinline int 1634 loop_info64_to_compat(const struct loop_info64 *info64, 1635 struct compat_loop_info __user *arg) 1636 { 1637 struct compat_loop_info info; 1638 1639 memset(&info, 0, sizeof(info)); 1640 info.lo_number = info64->lo_number; 1641 info.lo_device = info64->lo_device; 1642 info.lo_inode = info64->lo_inode; 1643 info.lo_rdevice = info64->lo_rdevice; 1644 info.lo_offset = info64->lo_offset; 1645 info.lo_flags = info64->lo_flags; 1646 memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); 1647 1648 /* error in case values were truncated */ 1649 if (info.lo_device != info64->lo_device || 1650 info.lo_rdevice != info64->lo_rdevice || 1651 info.lo_inode != info64->lo_inode || 1652 
info.lo_offset != info64->lo_offset) 1653 return -EOVERFLOW; 1654 1655 if (copy_to_user(arg, &info, sizeof(info))) 1656 return -EFAULT; 1657 return 0; 1658 } 1659 1660 static int 1661 loop_set_status_compat(struct loop_device *lo, 1662 const struct compat_loop_info __user *arg) 1663 { 1664 struct loop_info64 info64; 1665 int ret; 1666 1667 ret = loop_info64_from_compat(arg, &info64); 1668 if (ret < 0) 1669 return ret; 1670 return loop_set_status(lo, &info64); 1671 } 1672 1673 static int 1674 loop_get_status_compat(struct loop_device *lo, 1675 struct compat_loop_info __user *arg) 1676 { 1677 struct loop_info64 info64; 1678 int err; 1679 1680 if (!arg) 1681 return -EINVAL; 1682 err = loop_get_status(lo, &info64); 1683 if (!err) 1684 err = loop_info64_to_compat(&info64, arg); 1685 return err; 1686 } 1687 1688 static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 1689 unsigned int cmd, unsigned long arg) 1690 { 1691 struct loop_device *lo = bdev->bd_disk->private_data; 1692 int err; 1693 1694 switch(cmd) { 1695 case LOOP_SET_STATUS: 1696 err = loop_set_status_compat(lo, 1697 (const struct compat_loop_info __user *)arg); 1698 break; 1699 case LOOP_GET_STATUS: 1700 err = loop_get_status_compat(lo, 1701 (struct compat_loop_info __user *)arg); 1702 break; 1703 case LOOP_SET_CAPACITY: 1704 case LOOP_CLR_FD: 1705 case LOOP_GET_STATUS64: 1706 case LOOP_SET_STATUS64: 1707 case LOOP_CONFIGURE: 1708 arg = (unsigned long) compat_ptr(arg); 1709 fallthrough; 1710 case LOOP_SET_FD: 1711 case LOOP_CHANGE_FD: 1712 case LOOP_SET_BLOCK_SIZE: 1713 case LOOP_SET_DIRECT_IO: 1714 err = lo_ioctl(bdev, mode, cmd, arg); 1715 break; 1716 default: 1717 err = -ENOIOCTLCMD; 1718 break; 1719 } 1720 return err; 1721 } 1722 #endif 1723 1724 static int lo_open(struct gendisk *disk, blk_mode_t mode) 1725 { 1726 struct loop_device *lo = disk->private_data; 1727 int err; 1728 1729 err = mutex_lock_killable(&lo->lo_mutex); 1730 if (err) 1731 return err; 1732 1733 if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) 1734 err = -ENXIO; 1735 mutex_unlock(&lo->lo_mutex); 1736 return err; 1737 } 1738 1739 static void lo_release(struct gendisk *disk) 1740 { 1741 struct loop_device *lo = disk->private_data; 1742 bool need_clear = false; 1743 1744 if (disk_openers(disk) > 0) 1745 return; 1746 /* 1747 * Clear the backing device information if this is the last close of 1748 * a device that's been marked for auto clear, or on which LOOP_CLR_FD 1749 * has been called. 1750 */ 1751 1752 mutex_lock(&lo->lo_mutex); 1753 if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) 1754 lo->lo_state = Lo_rundown; 1755 1756 need_clear = (lo->lo_state == Lo_rundown); 1757 mutex_unlock(&lo->lo_mutex); 1758 1759 if (need_clear) 1760 __loop_clr_fd(lo); 1761 } 1762 1763 static void lo_free_disk(struct gendisk *disk) 1764 { 1765 struct loop_device *lo = disk->private_data; 1766 1767 if (lo->workqueue) 1768 destroy_workqueue(lo->workqueue); 1769 loop_free_idle_workers(lo, true); 1770 timer_shutdown_sync(&lo->timer); 1771 mutex_destroy(&lo->lo_mutex); 1772 kfree(lo); 1773 } 1774 1775 static const struct block_device_operations lo_fops = { 1776 .owner = THIS_MODULE, 1777 .open = lo_open, 1778 .release = lo_release, 1779 .ioctl = lo_ioctl, 1780 #ifdef CONFIG_COMPAT 1781 .compat_ioctl = lo_compat_ioctl, 1782 #endif 1783 .free_disk = lo_free_disk, 1784 }; 1785 1786 /* 1787 * And now the modules code and kernel interface. 1788 */ 1789 1790 /* 1791 * If max_loop is specified, create that many devices upfront. 
1792 * This also becomes a hard limit. If max_loop is not specified, 1793 * the default isn't a hard limit (as before commit 85c50197716c 1794 * changed the default value from 0 for max_loop=0 reasons), just 1795 * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module 1796 * init time. Loop devices can be requested on-demand with the 1797 * /dev/loop-control interface, or be instantiated by accessing 1798 * a 'dead' device node. 1799 */ 1800 static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT; 1801 1802 #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD 1803 static bool max_loop_specified; 1804 1805 static int max_loop_param_set_int(const char *val, 1806 const struct kernel_param *kp) 1807 { 1808 int ret; 1809 1810 ret = param_set_int(val, kp); 1811 if (ret < 0) 1812 return ret; 1813 1814 max_loop_specified = true; 1815 return 0; 1816 } 1817 1818 static const struct kernel_param_ops max_loop_param_ops = { 1819 .set = max_loop_param_set_int, 1820 .get = param_get_int, 1821 }; 1822 1823 module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444); 1824 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); 1825 #else 1826 module_param(max_loop, int, 0444); 1827 MODULE_PARM_DESC(max_loop, "Initial number of loop devices"); 1828 #endif 1829 1830 module_param(max_part, int, 0444); 1831 MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); 1832 1833 static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; 1834 1835 static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) 1836 { 1837 int qd, ret; 1838 1839 ret = kstrtoint(s, 0, &qd); 1840 if (ret < 0) 1841 return ret; 1842 if (qd < 1) 1843 return -EINVAL; 1844 hw_queue_depth = qd; 1845 return 0; 1846 } 1847 1848 static const struct kernel_param_ops loop_hw_qdepth_param_ops = { 1849 .set = loop_set_hw_queue_depth, 1850 .get = param_get_int, 1851 }; 1852 1853 device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); 1854 MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. 
Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); 1855 1856 MODULE_DESCRIPTION("Loopback device support"); 1857 MODULE_LICENSE("GPL"); 1858 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); 1859 1860 static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, 1861 const struct blk_mq_queue_data *bd) 1862 { 1863 struct request *rq = bd->rq; 1864 struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); 1865 struct loop_device *lo = rq->q->queuedata; 1866 1867 blk_mq_start_request(rq); 1868 1869 if (lo->lo_state != Lo_bound) 1870 return BLK_STS_IOERR; 1871 1872 switch (req_op(rq)) { 1873 case REQ_OP_FLUSH: 1874 case REQ_OP_DISCARD: 1875 case REQ_OP_WRITE_ZEROES: 1876 cmd->use_aio = false; 1877 break; 1878 default: 1879 cmd->use_aio = lo->use_dio; 1880 break; 1881 } 1882 1883 /* always use the first bio's css */ 1884 cmd->blkcg_css = NULL; 1885 cmd->memcg_css = NULL; 1886 #ifdef CONFIG_BLK_CGROUP 1887 if (rq->bio) { 1888 cmd->blkcg_css = bio_blkcg_css(rq->bio); 1889 #ifdef CONFIG_MEMCG 1890 if (cmd->blkcg_css) { 1891 cmd->memcg_css = 1892 cgroup_get_e_css(cmd->blkcg_css->cgroup, 1893 &memory_cgrp_subsys); 1894 } 1895 #endif 1896 } 1897 #endif 1898 loop_queue_work(lo, cmd); 1899 1900 return BLK_STS_OK; 1901 } 1902 1903 static void loop_handle_cmd(struct loop_cmd *cmd) 1904 { 1905 struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css; 1906 struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css; 1907 struct request *rq = blk_mq_rq_from_pdu(cmd); 1908 const bool write = op_is_write(req_op(rq)); 1909 struct loop_device *lo = rq->q->queuedata; 1910 int ret = 0; 1911 struct mem_cgroup *old_memcg = NULL; 1912 const bool use_aio = cmd->use_aio; 1913 1914 if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { 1915 ret = -EIO; 1916 goto failed; 1917 } 1918 1919 if (cmd_blkcg_css) 1920 kthread_associate_blkcg(cmd_blkcg_css); 1921 if (cmd_memcg_css) 1922 old_memcg = set_active_memcg( 1923 mem_cgroup_from_css(cmd_memcg_css)); 1924 1925 /* 1926 * do_req_filebacked() may call blk_mq_complete_request() synchronously 1927 * or asynchronously if using aio. Hence, do not touch 'cmd' after 1928 * do_req_filebacked() has returned unless we are sure that 'cmd' has 1929 * not yet been completed. 1930 */ 1931 ret = do_req_filebacked(lo, rq); 1932 1933 if (cmd_blkcg_css) 1934 kthread_associate_blkcg(NULL); 1935 1936 if (cmd_memcg_css) { 1937 set_active_memcg(old_memcg); 1938 css_put(cmd_memcg_css); 1939 } 1940 failed: 1941 /* complete non-aio request */ 1942 if (!use_aio || ret) { 1943 if (ret == -EOPNOTSUPP) 1944 cmd->ret = ret; 1945 else 1946 cmd->ret = ret ? 
-EIO : 0; 1947 if (likely(!blk_should_fake_timeout(rq->q))) 1948 blk_mq_complete_request(rq); 1949 } 1950 } 1951 1952 static void loop_process_work(struct loop_worker *worker, 1953 struct list_head *cmd_list, struct loop_device *lo) 1954 { 1955 int orig_flags = current->flags; 1956 struct loop_cmd *cmd; 1957 1958 current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; 1959 spin_lock_irq(&lo->lo_work_lock); 1960 while (!list_empty(cmd_list)) { 1961 cmd = container_of( 1962 cmd_list->next, struct loop_cmd, list_entry); 1963 list_del(cmd_list->next); 1964 spin_unlock_irq(&lo->lo_work_lock); 1965 1966 loop_handle_cmd(cmd); 1967 cond_resched(); 1968 1969 spin_lock_irq(&lo->lo_work_lock); 1970 } 1971 1972 /* 1973 * We only add to the idle list if there are no pending cmds 1974 * *and* the worker will not run again which ensures that it 1975 * is safe to free any worker on the idle list 1976 */ 1977 if (worker && !work_pending(&worker->work)) { 1978 worker->last_ran_at = jiffies; 1979 list_add_tail(&worker->idle_list, &lo->idle_worker_list); 1980 loop_set_timer(lo); 1981 } 1982 spin_unlock_irq(&lo->lo_work_lock); 1983 current->flags = orig_flags; 1984 } 1985 1986 static void loop_workfn(struct work_struct *work) 1987 { 1988 struct loop_worker *worker = 1989 container_of(work, struct loop_worker, work); 1990 loop_process_work(worker, &worker->cmd_list, worker->lo); 1991 } 1992 1993 static void loop_rootcg_workfn(struct work_struct *work) 1994 { 1995 struct loop_device *lo = 1996 container_of(work, struct loop_device, rootcg_work); 1997 loop_process_work(NULL, &lo->rootcg_cmd_list, lo); 1998 } 1999 2000 static const struct blk_mq_ops loop_mq_ops = { 2001 .queue_rq = loop_queue_rq, 2002 .complete = lo_complete_rq, 2003 }; 2004 2005 static int loop_add(int i) 2006 { 2007 struct queue_limits lim = { 2008 /* 2009 * Random number picked from the historic block max_sectors cap. 2010 */ 2011 .max_hw_sectors = 2560u, 2012 }; 2013 struct loop_device *lo; 2014 struct gendisk *disk; 2015 int err; 2016 2017 err = -ENOMEM; 2018 lo = kzalloc(sizeof(*lo), GFP_KERNEL); 2019 if (!lo) 2020 goto out; 2021 lo->worker_tree = RB_ROOT; 2022 INIT_LIST_HEAD(&lo->idle_worker_list); 2023 timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); 2024 lo->lo_state = Lo_unbound; 2025 2026 err = mutex_lock_killable(&loop_ctl_mutex); 2027 if (err) 2028 goto out_free_dev; 2029 2030 /* allocate id, if @id >= 0, we're requesting that specific id */ 2031 if (i >= 0) { 2032 err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); 2033 if (err == -ENOSPC) 2034 err = -EEXIST; 2035 } else { 2036 err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); 2037 } 2038 mutex_unlock(&loop_ctl_mutex); 2039 if (err < 0) 2040 goto out_free_dev; 2041 i = err; 2042 2043 lo->tag_set.ops = &loop_mq_ops; 2044 lo->tag_set.nr_hw_queues = 1; 2045 lo->tag_set.queue_depth = hw_queue_depth; 2046 lo->tag_set.numa_node = NUMA_NO_NODE; 2047 lo->tag_set.cmd_size = sizeof(struct loop_cmd); 2048 lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; 2049 lo->tag_set.driver_data = lo; 2050 2051 err = blk_mq_alloc_tag_set(&lo->tag_set); 2052 if (err) 2053 goto out_free_idr; 2054 2055 disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); 2056 if (IS_ERR(disk)) { 2057 err = PTR_ERR(disk); 2058 goto out_cleanup_tags; 2059 } 2060 lo->lo_queue = lo->lo_disk->queue; 2061 2062 /* 2063 * Disable partition scanning by default. The in-kernel partition 2064 * scanning can be requested individually per-device during its 2065 * setup. 
static int loop_add(int i)
{
	struct queue_limits lim = {
		/*
		 * Random number picked from the historic block max_sectors cap.
		 */
		.max_hw_sectors		= 2560u,
	};
	struct loop_device *lo;
	struct gendisk *disk;
	int err;

	err = -ENOMEM;
	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
	if (!lo)
		goto out;
	lo->worker_tree = RB_ROOT;
	INIT_LIST_HEAD(&lo->idle_worker_list);
	timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
	lo->lo_state = Lo_unbound;

	err = mutex_lock_killable(&loop_ctl_mutex);
	if (err)
		goto out_free_dev;

	/* allocate id, if @i >= 0, we're requesting that specific id */
	if (i >= 0) {
		err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
	}
	mutex_unlock(&loop_ctl_mutex);
	if (err < 0)
		goto out_free_dev;
	i = err;

	lo->tag_set.ops = &loop_mq_ops;
	lo->tag_set.nr_hw_queues = 1;
	lo->tag_set.queue_depth = hw_queue_depth;
	lo->tag_set.numa_node = NUMA_NO_NODE;
	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
	lo->tag_set.driver_data = lo;

	err = blk_mq_alloc_tag_set(&lo->tag_set);
	if (err)
		goto out_free_idr;

	disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo);
	if (IS_ERR(disk)) {
		err = PTR_ERR(disk);
		goto out_cleanup_tags;
	}
	lo->lo_queue = lo->lo_disk->queue;

	/*
	 * Disable partition scanning by default. The in-kernel partition
	 * scanning can be requested individually per-device during its
	 * setup. Userspace can always add and remove partitions from all
	 * devices. The needed partition minors are allocated from the
	 * extended minor space, the main loop device numbers will continue
	 * to match the loop minors, regardless of the number of partitions
	 * used.
	 *
	 * If max_part is given, partition scanning is globally enabled for
	 * all loop devices. The minors for the main loop devices will be
	 * multiples of max_part.
	 *
	 * Note: Global-for-all-devices, set-only-at-init, read-only module
	 * parameters like 'max_loop' and 'max_part' make things needlessly
	 * complicated, are too static and inflexible, and may surprise
	 * userspace tools. Parameters like this should in general be avoided.
	 */
	if (!part_shift)
		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
	mutex_init(&lo->lo_mutex);
	lo->lo_number = i;
	spin_lock_init(&lo->lo_lock);
	spin_lock_init(&lo->lo_work_lock);
	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
	disk->major = LOOP_MAJOR;
	disk->first_minor = i << part_shift;
	disk->minors = 1 << part_shift;
	disk->fops = &lo_fops;
	disk->private_data = lo;
	disk->queue = lo->lo_queue;
	disk->events = DISK_EVENT_MEDIA_CHANGE;
	disk->event_flags = DISK_EVENT_FLAG_UEVENT;
	sprintf(disk->disk_name, "loop%d", i);
	/* Make this loop device reachable from pathname. */
	err = add_disk(disk);
	if (err)
		goto out_cleanup_disk;

	/* Show this loop device. */
	mutex_lock(&loop_ctl_mutex);
	lo->idr_visible = true;
	mutex_unlock(&loop_ctl_mutex);

	return i;

out_cleanup_disk:
	put_disk(disk);
out_cleanup_tags:
	blk_mq_free_tag_set(&lo->tag_set);
out_free_idr:
	mutex_lock(&loop_ctl_mutex);
	idr_remove(&loop_index_idr, i);
	mutex_unlock(&loop_ctl_mutex);
out_free_dev:
	kfree(lo);
out:
	return err;
}

static void loop_remove(struct loop_device *lo)
{
	/* Make this loop device unreachable from pathname. */
	del_gendisk(lo->lo_disk);
	blk_mq_free_tag_set(&lo->tag_set);

	mutex_lock(&loop_ctl_mutex);
	idr_remove(&loop_index_idr, lo->lo_number);
	mutex_unlock(&loop_ctl_mutex);

	put_disk(lo->lo_disk);
}

#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
static void loop_probe(dev_t dev)
{
	int idx = MINOR(dev) >> part_shift;

	if (max_loop_specified && max_loop && idx >= max_loop)
		return;
	loop_add(idx);
}
#else
#define loop_probe NULL
#endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */
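/*
 * Worked example of the minor numbering above (illustrative values, not a
 * requirement): loading with max_part=15 gives part_shift = fls(15) = 4,
 * so each disk owns 1 << 4 = 16 minors. /dev/loop3 then gets
 * first_minor = 3 << 4 = 48, its partitions use minors 49..63, and
 * loop_probe() maps any of those minors back to the device index via
 * MINOR(dev) >> part_shift (e.g. 53 >> 4 == 3).
 */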
static int loop_control_remove(int idx)
{
	struct loop_device *lo;
	int ret;

	if (idx < 0) {
		pr_warn_once("deleting an unspecified loop device is not supported.\n");
		return -EINVAL;
	}

	/* Hide this loop device for serialization. */
	ret = mutex_lock_killable(&loop_ctl_mutex);
	if (ret)
		return ret;
	lo = idr_find(&loop_index_idr, idx);
	if (!lo || !lo->idr_visible)
		ret = -ENODEV;
	else
		lo->idr_visible = false;
	mutex_unlock(&loop_ctl_mutex);
	if (ret)
		return ret;

	/* Check whether this loop device can be removed. */
	ret = mutex_lock_killable(&lo->lo_mutex);
	if (ret)
		goto mark_visible;
	if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) {
		mutex_unlock(&lo->lo_mutex);
		ret = -EBUSY;
		goto mark_visible;
	}
	/* Mark this loop device as no longer bound, but not quite unbound yet */
	lo->lo_state = Lo_deleting;
	mutex_unlock(&lo->lo_mutex);

	loop_remove(lo);
	return 0;

mark_visible:
	/* Show this loop device again. */
	mutex_lock(&loop_ctl_mutex);
	lo->idr_visible = true;
	mutex_unlock(&loop_ctl_mutex);
	return ret;
}

static int loop_control_get_free(int idx)
{
	struct loop_device *lo;
	int id, ret;

	ret = mutex_lock_killable(&loop_ctl_mutex);
	if (ret)
		return ret;
	idr_for_each_entry(&loop_index_idr, lo, id) {
		/* Hitting a race results in creating a new loop device which is harmless. */
		if (lo->idr_visible && data_race(lo->lo_state) == Lo_unbound)
			goto found;
	}
	mutex_unlock(&loop_ctl_mutex);
	return loop_add(-1);
found:
	mutex_unlock(&loop_ctl_mutex);
	return id;
}

static long loop_control_ioctl(struct file *file, unsigned int cmd,
			       unsigned long parm)
{
	switch (cmd) {
	case LOOP_CTL_ADD:
		return loop_add(parm);
	case LOOP_CTL_REMOVE:
		return loop_control_remove(parm);
	case LOOP_CTL_GET_FREE:
		return loop_control_get_free(parm);
	default:
		return -ENOSYS;
	}
}

static const struct file_operations loop_ctl_fops = {
	.open		= nonseekable_open,
	.unlocked_ioctl	= loop_control_ioctl,
	.compat_ioctl	= loop_control_ioctl,
	.owner		= THIS_MODULE,
	.llseek		= noop_llseek,
};

static struct miscdevice loop_misc = {
	.minor		= LOOP_CTRL_MINOR,
	.name		= "loop-control",
	.fops		= &loop_ctl_fops,
};

MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
MODULE_ALIAS("devname:loop-control");
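/*
 * Minimal userspace sketch of the loop-control interface handled above
 * (illustrative only; error handling trimmed, and losetup(8) normally does
 * this for you):
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/loop.h>
 *
 *	int ctl = open("/dev/loop-control", O_RDWR);
 *	int idx = ioctl(ctl, LOOP_CTL_GET_FREE);	// first unbound index
 *	// ... bind a backing file to /dev/loop<idx>, e.g. via LOOP_CONFIGURE ...
 *	ioctl(ctl, LOOP_CTL_REMOVE, (long)idx);		// fails with EBUSY while bound
 *
 * LOOP_CTL_ADD with an explicit index returns -EEXIST if that index is
 * already allocated, mirroring the idr_alloc() handling in loop_add().
 */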
static int __init loop_init(void)
{
	int i;
	int err;

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that users can decide the correct minor
		 * number if they want to create more devices.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS) {
		err = -EINVAL;
		goto err_out;
	}

	if (max_loop > 1UL << (MINORBITS - part_shift)) {
		err = -EINVAL;
		goto err_out;
	}

	err = misc_register(&loop_misc);
	if (err < 0)
		goto err_out;

	if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) {
		err = -EIO;
		goto misc_out;
	}

	/* pre-create number of devices given by config or max_loop */
	for (i = 0; i < max_loop; i++)
		loop_add(i);

	printk(KERN_INFO "loop: module loaded\n");
	return 0;

misc_out:
	misc_deregister(&loop_misc);
err_out:
	return err;
}

static void __exit loop_exit(void)
{
	struct loop_device *lo;
	int id;

	unregister_blkdev(LOOP_MAJOR, "loop");
	misc_deregister(&loop_misc);

	/*
	 * There is no need to use loop_ctl_mutex here, for nobody else can
	 * access loop_index_idr when this module is unloading (unless forced
	 * module unloading is requested). If this is not a clean unloading,
	 * we have no means to avoid a kernel crash.
	 */
	idr_for_each_entry(&loop_index_idr, lo, id)
		loop_remove(lo);

	idr_destroy(&loop_index_idr);
}

module_init(loop_init);
module_exit(loop_exit);

#ifndef MODULE
static int __init max_loop_setup(char *str)
{
	max_loop = simple_strtol(str, NULL, 0);
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
	max_loop_specified = true;
#endif
	return 1;
}

__setup("max_loop=", max_loop_setup);
#endif
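/*
 * Typical load-time configuration (assuming the module parameters declared
 * earlier in this file keep their usual names; the values are only examples):
 *
 *	modprobe loop max_loop=8 max_part=15 hw_queue_depth=256
 *
 * pre-creates /dev/loop0../dev/loop7, enables partition scanning with up to
 * 15 partitions per device, and sizes each device's blk-mq queue depth.
 * When the driver is built in, "max_loop=8" on the kernel command line is
 * handled by max_loop_setup() above instead.
 */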