/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <[email protected]>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <[email protected]> */
/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through symbolic links named:
 *
 *	/dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes persist across reboots; no user command needs to be run
 * before a device can be opened and used.
 *
 * On FreeBSD, ZVOLs are simply GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character devices
 * (volmode=dev).
 */
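
/*
 * For example (standard ZFS userland commands, shown here only for
 * illustration):
 *
 *	# zfs create -V 10G tank/vol	# create the volume dataset
 *	# ls -l /dev/zvol/tank/vol	# device node appears immediately
 *	# zfs set volmode=dev tank/vol	# character device only; applied
 *					# when the minor is recreated
 */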

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE	"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name = "zvol",
	.d_version = D_VERSION,
	.d_flags = D_DISK | D_TRACKCLOSE,
	.d_open = zvol_cdev_open,
	.d_close = zvol_cdev_close,
	.d_ioctl = zvol_cdev_ioctl,
	.d_read = zvol_cdev_read,
	.d_write = zvol_cdev_write,
	.d_strategy = zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, ZFS is probing GEOM
		 * providers while looking for a replacement for a missing
		 * vdev. In this case spa_namespace_lock will not be held,
		 * but it is still illegal to use a zvol as a vdev: deadlocks
		 * can result if another thread holds spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

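/*
 * Bring the provider online and spawn the worker thread that services
 * queued bios (see zvol_geom_worker() below).
 */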
static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

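/*
 * Detach the provider from the zvol and wither its geom; GEOM frees the
 * provider once the last consumer goes away.
 */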
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

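/*
 * Mark the zvol dying and give any in-flight opens up to ten seconds
 * (10*hz) to drain before the minor is torn down.
 */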
void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */
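
	/*
	 * Worked example (illustrative only): a consumer opening the
	 * provider r1w1e1 arrives here as acr=1, acw=1, ace=1, so
	 * count = 3 and flags = FREAD | FWRITE below; the matching
	 * r-1w-1e-1 close yields count = -3 and zvol_geom_close() is
	 * invoked with -count = 3.
	 */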

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

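/*
 * Worker thread: drains the bio queue populated by zvol_geom_bio_start()
 * for callers that cannot sleep, and exits once zvol_clear_private()
 * flips zsg_state to ZVOL_GEOM_STOPPED.
 */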
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

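/*
 * GEOM ->start method: answer BIO_GETATTR inline, hand everything else
 * to zvol_geom_bio_strategy(), deferring to the worker thread when the
 * calling context is not allowed to sleep.
 */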
static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

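/*
 * Answer GEOM attribute queries ("GEOM::candelete", "blocksavail",
 * "blocksused", "poolblocksavail", "poolblocksused"); returns 0 if the
 * attribute was handled and nonzero otherwise.
 */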
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

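/*
 * Main I/O path for GEOM and cdev bios: reads go through dmu_read(),
 * writes and deletes are wrapped in a DMU transaction and logged to the
 * ZIL, and BIO_FLUSH (or sync=always) finishes with zil_commit().
 */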
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

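/*
 * Read from the zvol through the character device: clamp to volsize,
 * take the range lock as reader and copy out via dmu_read_uio_dnode().
 */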
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error, as
	 * it's required for EOF processing.
	 */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

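/*
 * Write to the zvol through the character device: each chunk is staged
 * in its own DMU transaction, logged to the ZIL, and committed; the ZIL
 * is flushed at the end when IO_SYNC or sync=always demands it.
 */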
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_zv_locked;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering: zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

static boolean_t
zvol_is_zvol_impl(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

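/*
 * Rename the minor: rehash the zvol under its new name, then destroy
 * and recreate the GEOM provider or character device so the node under
 * /dev/zvol matches the dataset's new name.
 */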
static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		ASSERT3P(dev->si_drv2, ==, NULL);

		destroy_dev(dev);
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
static int
zvol_create_minor_impl(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error) {
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			mutex_destroy(&zv->zv_state_lock);
			kmem_free(zv, sizeof (*zv));
			dmu_objset_disown(os, B_TRUE, FTAG);
			goto out_doi;
		}
		dev->si_iosize_max = maxphys;
		zsd->zsd_cdev = dev;
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

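/*
 * Clear the back-pointers from the provider/cdev to the zvol and, in
 * GEOM mode, stop the worker thread and wait for it to acknowledge.
 */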
static void
zvol_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		dev->si_drv2 = NULL;
	}
}

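/*
 * Propagate a new volsize to the OS-visible device; in GEOM mode this
 * means g_resize_provider(), except on the very first open, where the
 * size is merely being initialized rather than resized.
 */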
static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke the resize event when the initial size was
		 * zero. The ZVOL initializes its size on first open; this
		 * is not a real resize.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_freebsd_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_create_minor_impl,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	zvol_register_ops(&zvol_freebsd_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}