1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25  * Copyright 2015 RackTop Systems.
26  * Copyright (c) 2016, Intel Corporation.
27  */
28 
29 /*
30  * Pool import support functions.
31  *
32  * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
33  * these commands are expected to run in the global zone, we can assume
34  * that the devices are all readable when called.
35  *
36  * To import a pool, we rely on reading the configuration information from the
37  * ZFS label of each device.  If we successfully read the label, then we
38  * organize the configuration information in the following hierarchy:
39  *
40  *	pool guid -> toplevel vdev guid -> label txg
41  *
42  * Duplicate entries matching this same tuple will be discarded.  Once we have
43  * examined every device, we pick the best label txg config for each toplevel
44  * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
45  * update any paths that have changed.  Finally, we attempt to import the pool
46  * using our derived config, and record the results.
47  */
48 
49 #include <ctype.h>
50 #include <dirent.h>
51 #include <errno.h>
52 #include <libintl.h>
53 #include <libgen.h>
54 #include <stddef.h>
55 #include <stdlib.h>
56 #include <stdio.h>
57 #include <string.h>
58 #include <sys/stat.h>
59 #include <unistd.h>
60 #include <fcntl.h>
61 #include <sys/dktp/fdisk.h>
62 #include <sys/vdev_impl.h>
63 #include <sys/fs/zfs.h>
64 #include <sys/vdev_impl.h>
65 
66 #include <thread_pool.h>
67 #include <libzutil.h>
68 #include <libnvpair.h>
69 
70 #include "zutil_import.h"
71 
72 #ifdef HAVE_LIBUDEV
73 #include <libudev.h>
74 #include <sched.h>
75 #endif
76 #include <blkid/blkid.h>
77 
/* Number of entries in the built-in import search path table. */
#define	DEFAULT_IMPORT_PATH_SIZE	9

/* Directory of the by-id device links; the trailing '/' is significant. */
#define	DEV_BYID_PATH			"/dev/disk/by-id/"
80 
81 static boolean_t
is_watchdog_dev(char * dev)82 is_watchdog_dev(char *dev)
83 {
84 	/* For 'watchdog' dev */
85 	if (strcmp(dev, "watchdog") == 0)
86 		return (B_TRUE);
87 
88 	/* For 'watchdog<digit><whatever> */
89 	if (strstr(dev, "watchdog") == dev && isdigit(dev[8]))
90 		return (B_TRUE);
91 
92 	return (B_FALSE);
93 }
94 
/*
 * Issue the BLKFLSBUF ioctl on the open descriptor 'fd'.
 *
 * The ioctl's return value is handed back to the caller unchanged.
 */
int
zfs_dev_flush(int fd)
{
	int rc;

	rc = ioctl(fd, BLKFLSBUF);

	return (rc);
}
100 
101 void
zpool_open_func(void * arg)102 zpool_open_func(void *arg)
103 {
104 	rdsk_node_t *rn = arg;
105 	libpc_handle_t *hdl = rn->rn_hdl;
106 	struct stat64 statbuf;
107 	nvlist_t *config;
108 	char *bname, *dupname;
109 	uint64_t vdev_guid = 0;
110 	int error;
111 	int num_labels = 0;
112 	int fd;
113 
114 	/*
115 	 * Skip devices with well known prefixes there can be side effects
116 	 * when opening devices which need to be avoided.
117 	 *
118 	 * hpet     - High Precision Event Timer
119 	 * watchdog - Watchdog must be closed in a special way.
120 	 */
121 	dupname = zutil_strdup(hdl, rn->rn_name);
122 	bname = basename(dupname);
123 	error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname));
124 	free(dupname);
125 	if (error)
126 		return;
127 
128 	/*
129 	 * Ignore failed stats.  We only want regular files and block devices.
130 	 */
131 	if (stat64(rn->rn_name, &statbuf) != 0 ||
132 	    (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
133 		return;
134 
135 	/*
136 	 * Preferentially open using O_DIRECT to bypass the block device
137 	 * cache which may be stale for multipath devices.  An EINVAL errno
138 	 * indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
139 	 */
140 	fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
141 	if ((fd < 0) && (errno == EINVAL))
142 		fd = open(rn->rn_name, O_RDONLY);
143 	if ((fd < 0) && (errno == EACCES))
144 		hdl->lpc_open_access_error = B_TRUE;
145 	if (fd < 0)
146 		return;
147 
148 	/*
149 	 * This file is too small to hold a zpool
150 	 */
151 	if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
152 		(void) close(fd);
153 		return;
154 	}
155 
156 	error = zpool_read_label(fd, &config, &num_labels);
157 	if (error != 0) {
158 		(void) close(fd);
159 		return;
160 	}
161 
162 	if (num_labels == 0) {
163 		(void) close(fd);
164 		nvlist_free(config);
165 		return;
166 	}
167 
168 	/*
169 	 * Check that the vdev is for the expected guid.  Additional entries
170 	 * are speculatively added based on the paths stored in the labels.
171 	 * Entries with valid paths but incorrect guids must be removed.
172 	 */
173 	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
174 	if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
175 		(void) close(fd);
176 		nvlist_free(config);
177 		return;
178 	}
179 
180 	(void) close(fd);
181 
182 	rn->rn_config = config;
183 	rn->rn_num_labels = num_labels;
184 
185 	/*
186 	 * Add additional entries for paths described by this label.
187 	 */
188 	if (rn->rn_labelpaths) {
189 		char *path = NULL;
190 		char *devid = NULL;
191 		char *env = NULL;
192 		rdsk_node_t *slice;
193 		avl_index_t where;
194 		int timeout;
195 		int error;
196 
197 		if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid))
198 			return;
199 
200 		env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS");
201 		if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 ||
202 		    timeout < 0) {
203 			timeout = DISK_LABEL_WAIT;
204 		}
205 
206 		/*
207 		 * Allow devlinks to stabilize so all paths are available.
208 		 */
209 		zpool_label_disk_wait(rn->rn_name, timeout);
210 
211 		if (path != NULL) {
212 			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
213 			slice->rn_name = zutil_strdup(hdl, path);
214 			slice->rn_vdev_guid = vdev_guid;
215 			slice->rn_avl = rn->rn_avl;
216 			slice->rn_hdl = hdl;
217 			slice->rn_order = IMPORT_ORDER_PREFERRED_1;
218 			slice->rn_labelpaths = B_FALSE;
219 			pthread_mutex_lock(rn->rn_lock);
220 			if (avl_find(rn->rn_avl, slice, &where)) {
221 			pthread_mutex_unlock(rn->rn_lock);
222 				free(slice->rn_name);
223 				free(slice);
224 			} else {
225 				avl_insert(rn->rn_avl, slice, where);
226 				pthread_mutex_unlock(rn->rn_lock);
227 				zpool_open_func(slice);
228 			}
229 		}
230 
231 		if (devid != NULL) {
232 			slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
233 			error = asprintf(&slice->rn_name, "%s%s",
234 			    DEV_BYID_PATH, devid);
235 			if (error == -1) {
236 				free(slice);
237 				return;
238 			}
239 
240 			slice->rn_vdev_guid = vdev_guid;
241 			slice->rn_avl = rn->rn_avl;
242 			slice->rn_hdl = hdl;
243 			slice->rn_order = IMPORT_ORDER_PREFERRED_2;
244 			slice->rn_labelpaths = B_FALSE;
245 			pthread_mutex_lock(rn->rn_lock);
246 			if (avl_find(rn->rn_avl, slice, &where)) {
247 				pthread_mutex_unlock(rn->rn_lock);
248 				free(slice->rn_name);
249 				free(slice);
250 			} else {
251 				avl_insert(rn->rn_avl, slice, where);
252 				pthread_mutex_unlock(rn->rn_lock);
253 				zpool_open_func(slice);
254 			}
255 		}
256 	}
257 }
258 
259 static char *
260 zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = {
261 	"/dev/disk/by-vdev",	/* Custom rules, use first if they exist */
262 	"/dev/mapper",		/* Use multipath devices before components */
263 	"/dev/disk/by-partlabel", /* Single unique entry set by user */
264 	"/dev/disk/by-partuuid", /* Generated partition uuid */
265 	"/dev/disk/by-label",	/* Custom persistent labels */
266 	"/dev/disk/by-uuid",	/* Single unique entry and persistent */
267 	"/dev/disk/by-id",	/* May be multiple entries and persistent */
268 	"/dev/disk/by-path",	/* Encodes physical location and persistent */
269 	"/dev"			/* UNSAFE device names will change */
270 };
271 
272 const char * const *
zpool_default_search_paths(size_t * count)273 zpool_default_search_paths(size_t *count)
274 {
275 	*count = DEFAULT_IMPORT_PATH_SIZE;
276 	return ((const char * const *)zpool_default_import_path);
277 }
278 
279 /*
280  * Given a full path to a device determine if that device appears in the
281  * import search path.  If it does return the first match and store the
282  * index in the passed 'order' variable, otherwise return an error.
283  */
284 static int
zfs_path_order(char * name,int * order)285 zfs_path_order(char *name, int *order)
286 {
287 	int i = 0, error = ENOENT;
288 	char *dir, *env, *envdup;
289 
290 	env = getenv("ZPOOL_IMPORT_PATH");
291 	if (env) {
292 		envdup = strdup(env);
293 		dir = strtok(envdup, ":");
294 		while (dir) {
295 			if (strncmp(name, dir, strlen(dir)) == 0) {
296 				*order = i;
297 				error = 0;
298 				break;
299 			}
300 			dir = strtok(NULL, ":");
301 			i++;
302 		}
303 		free(envdup);
304 	} else {
305 		for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
306 			if (strncmp(name, zpool_default_import_path[i],
307 			    strlen(zpool_default_import_path[i])) == 0) {
308 				*order = i;
309 				error = 0;
310 				break;
311 			}
312 		}
313 	}
314 
315 	return (error);
316 }
317 
318 /*
319  * Use libblkid to quickly enumerate all known zfs devices.
320  */
321 int
zpool_find_import_blkid(libpc_handle_t * hdl,pthread_mutex_t * lock,avl_tree_t ** slice_cache)322 zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock,
323     avl_tree_t **slice_cache)
324 {
325 	rdsk_node_t *slice;
326 	blkid_cache cache;
327 	blkid_dev_iterate iter;
328 	blkid_dev dev;
329 	avl_index_t where;
330 	int error;
331 
332 	*slice_cache = NULL;
333 
334 	error = blkid_get_cache(&cache, NULL);
335 	if (error != 0)
336 		return (error);
337 
338 	error = blkid_probe_all_new(cache);
339 	if (error != 0) {
340 		blkid_put_cache(cache);
341 		return (error);
342 	}
343 
344 	iter = blkid_dev_iterate_begin(cache);
345 	if (iter == NULL) {
346 		blkid_put_cache(cache);
347 		return (EINVAL);
348 	}
349 
350 	error = blkid_dev_set_search(iter, "TYPE", "zfs_member");
351 	if (error != 0) {
352 		blkid_dev_iterate_end(iter);
353 		blkid_put_cache(cache);
354 		return (error);
355 	}
356 
357 	*slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t));
358 	avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t),
359 	    offsetof(rdsk_node_t, rn_node));
360 
361 	while (blkid_dev_next(iter, &dev) == 0) {
362 		slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
363 		slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev));
364 		slice->rn_vdev_guid = 0;
365 		slice->rn_lock = lock;
366 		slice->rn_avl = *slice_cache;
367 		slice->rn_hdl = hdl;
368 		slice->rn_labelpaths = B_TRUE;
369 
370 		error = zfs_path_order(slice->rn_name, &slice->rn_order);
371 		if (error == 0)
372 			slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
373 		else
374 			slice->rn_order = IMPORT_ORDER_DEFAULT;
375 
376 		pthread_mutex_lock(lock);
377 		if (avl_find(*slice_cache, slice, &where)) {
378 			free(slice->rn_name);
379 			free(slice);
380 		} else {
381 			avl_insert(*slice_cache, slice, where);
382 		}
383 		pthread_mutex_unlock(lock);
384 	}
385 
386 	blkid_dev_iterate_end(iter);
387 	blkid_put_cache(cache);
388 
389 	return (0);
390 }
391 
392 /*
393  * Linux persistent device strings for vdev labels
394  *
395  * based on libudev for consistency with libudev disk add/remove events
396  */
397 
/*
 * Fixed-size buffers for the two persistent device strings that
 * encode_device_strings() fills in for a vdev.
 */
typedef struct vdev_dev_strs {
	char	vds_devid[128];		/* device id ("what") */
	char	vds_devphys[128];	/* physical path ("where") */
} vdev_dev_strs_t;
402 
403 #ifdef HAVE_LIBUDEV
404 
405 /*
406  * Obtain the persistent device id string (describes what)
407  *
408  * used by ZED vdev matching for auto-{online,expand,replace}
409  */
410 int
zfs_device_get_devid(struct udev_device * dev,char * bufptr,size_t buflen)411 zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
412 {
413 	struct udev_list_entry *entry;
414 	const char *bus;
415 	char devbyid[MAXPATHLEN];
416 
417 	/* The bus based by-id path is preferred */
418 	bus = udev_device_get_property_value(dev, "ID_BUS");
419 
420 	if (bus == NULL) {
421 		const char *dm_uuid;
422 
423 		/*
424 		 * For multipath nodes use the persistent uuid based identifier
425 		 *
426 		 * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f
427 		 */
428 		dm_uuid = udev_device_get_property_value(dev, "DM_UUID");
429 		if (dm_uuid != NULL) {
430 			(void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
431 			return (0);
432 		}
433 
434 		/*
435 		 * For volumes use the persistent /dev/zvol/dataset identifier
436 		 */
437 		entry = udev_device_get_devlinks_list_entry(dev);
438 		while (entry != NULL) {
439 			const char *name;
440 
441 			name = udev_list_entry_get_name(entry);
442 			if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
443 				(void) strlcpy(bufptr, name, buflen);
444 				return (0);
445 			}
446 			entry = udev_list_entry_get_next(entry);
447 		}
448 
449 		/*
450 		 * NVME 'by-id' symlinks are similar to bus case
451 		 */
452 		struct udev_device *parent;
453 
454 		parent = udev_device_get_parent_with_subsystem_devtype(dev,
455 		    "nvme", NULL);
456 		if (parent != NULL)
457 			bus = "nvme";	/* continue with bus symlink search */
458 		else
459 			return (ENODATA);
460 	}
461 
462 	/*
463 	 * locate the bus specific by-id link
464 	 */
465 	(void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus);
466 	entry = udev_device_get_devlinks_list_entry(dev);
467 	while (entry != NULL) {
468 		const char *name;
469 
470 		name = udev_list_entry_get_name(entry);
471 		if (strncmp(name, devbyid, strlen(devbyid)) == 0) {
472 			name += strlen(DEV_BYID_PATH);
473 			(void) strlcpy(bufptr, name, buflen);
474 			return (0);
475 		}
476 		entry = udev_list_entry_get_next(entry);
477 	}
478 
479 	return (ENODATA);
480 }
481 
482 /*
483  * Obtain the persistent physical location string (describes where)
484  *
485  * used by ZED vdev matching for auto-{online,expand,replace}
486  */
487 int
zfs_device_get_physical(struct udev_device * dev,char * bufptr,size_t buflen)488 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
489 {
490 	const char *physpath = NULL;
491 	struct udev_list_entry *entry;
492 
493 	/*
494 	 * Normal disks use ID_PATH for their physical path.
495 	 */
496 	physpath = udev_device_get_property_value(dev, "ID_PATH");
497 	if (physpath != NULL && strlen(physpath) > 0) {
498 		(void) strlcpy(bufptr, physpath, buflen);
499 		return (0);
500 	}
501 
502 	/*
503 	 * Device mapper devices are virtual and don't have a physical
504 	 * path. For them we use ID_VDEV instead, which is setup via the
505 	 * /etc/vdev_id.conf file.  ID_VDEV provides a persistent path
506 	 * to a virtual device.  If you don't have vdev_id.conf setup,
507 	 * you cannot use multipath autoreplace with device mapper.
508 	 */
509 	physpath = udev_device_get_property_value(dev, "ID_VDEV");
510 	if (physpath != NULL && strlen(physpath) > 0) {
511 		(void) strlcpy(bufptr, physpath, buflen);
512 		return (0);
513 	}
514 
515 	/*
516 	 * For ZFS volumes use the persistent /dev/zvol/dataset identifier
517 	 */
518 	entry = udev_device_get_devlinks_list_entry(dev);
519 	while (entry != NULL) {
520 		physpath = udev_list_entry_get_name(entry);
521 		if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
522 			(void) strlcpy(bufptr, physpath, buflen);
523 			return (0);
524 		}
525 		entry = udev_list_entry_get_next(entry);
526 	}
527 
528 	/*
529 	 * For all other devices fallback to using the by-uuid name.
530 	 */
531 	entry = udev_device_get_devlinks_list_entry(dev);
532 	while (entry != NULL) {
533 		physpath = udev_list_entry_get_name(entry);
534 		if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
535 			(void) strlcpy(bufptr, physpath, buflen);
536 			return (0);
537 		}
538 		entry = udev_list_entry_get_next(entry);
539 	}
540 
541 	return (ENODATA);
542 }
543 
544 /*
545  * A disk is considered a multipath whole disk when:
546  *	DEVNAME key value has "dm-"
547  *	DM_NAME key value has "mpath" prefix
548  *	DM_UUID key exists
549  *	ID_PART_TABLE_TYPE key does not exist or is not gpt
550  */
551 static boolean_t
udev_mpath_whole_disk(struct udev_device * dev)552 udev_mpath_whole_disk(struct udev_device *dev)
553 {
554 	const char *devname, *type, *uuid;
555 
556 	devname = udev_device_get_property_value(dev, "DEVNAME");
557 	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
558 	uuid = udev_device_get_property_value(dev, "DM_UUID");
559 
560 	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
561 	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
562 	    (uuid != NULL)) {
563 		return (B_TRUE);
564 	}
565 
566 	return (B_FALSE);
567 }
568 
/*
 * Tell whether udev has finished setting up 'dev'.
 *
 * Returns the result of udev_device_get_is_initialized() when libudev
 * provides it; otherwise returns non-zero once the DEVLINKS property is
 * present on the device.
 */
static int
udev_device_is_ready(struct udev_device *dev)
{
	int ready;

#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED
	ready = udev_device_get_is_initialized(dev);
#else
	/* wait for DEVLINKS property to be initialized */
	ready = (udev_device_get_property_value(dev, "DEVLINKS") != NULL);
#endif

	return (ready);
}
579 
580 #else
581 
/*
 * Stand-in used when built without libudev: no device id can be
 * determined, so every argument is ignored and ENODATA is returned.
 */
/* ARGSUSED */
int
zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev;
	(void) bufptr;
	(void) buflen;

	return (ENODATA);
}
588 
/*
 * Stand-in used when built without libudev: no physical location can be
 * determined, so every argument is ignored and ENODATA is returned.
 */
/* ARGSUSED */
int
zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
{
	(void) dev;
	(void) bufptr;
	(void) buflen;

	return (ENODATA);
}
595 
596 #endif /* HAVE_LIBUDEV */
597 
598 /*
599  * Wait up to timeout_ms for udev to set up the device node.  The device is
600  * considered ready when libudev determines it has been initialized, all of
601  * the device links have been verified to exist, and it has been allowed to
602  * settle.  At this point the device the device can be accessed reliably.
603  * Depending on the complexity of the udev rules this process could take
604  * several seconds.
605  */
606 int
zpool_label_disk_wait(const char * path,int timeout_ms)607 zpool_label_disk_wait(const char *path, int timeout_ms)
608 {
609 #ifdef HAVE_LIBUDEV
610 	struct udev *udev;
611 	struct udev_device *dev = NULL;
612 	char nodepath[MAXPATHLEN];
613 	char *sysname = NULL;
614 	int ret = ENODEV;
615 	int settle_ms = 50;
616 	long sleep_ms = 10;
617 	hrtime_t start, settle;
618 
619 	if ((udev = udev_new()) == NULL)
620 		return (ENXIO);
621 
622 	start = gethrtime();
623 	settle = 0;
624 
625 	do {
626 		if (sysname == NULL) {
627 			if (realpath(path, nodepath) != NULL) {
628 				sysname = strrchr(nodepath, '/') + 1;
629 			} else {
630 				(void) usleep(sleep_ms * MILLISEC);
631 				continue;
632 			}
633 		}
634 
635 		dev = udev_device_new_from_subsystem_sysname(udev,
636 		    "block", sysname);
637 		if ((dev != NULL) && udev_device_is_ready(dev)) {
638 			struct udev_list_entry *links, *link = NULL;
639 
640 			ret = 0;
641 			links = udev_device_get_devlinks_list_entry(dev);
642 
643 			udev_list_entry_foreach(link, links) {
644 				struct stat64 statbuf;
645 				const char *name;
646 
647 				name = udev_list_entry_get_name(link);
648 				errno = 0;
649 				if (stat64(name, &statbuf) == 0 && errno == 0)
650 					continue;
651 
652 				settle = 0;
653 				ret = ENODEV;
654 				break;
655 			}
656 
657 			if (ret == 0) {
658 				if (settle == 0) {
659 					settle = gethrtime();
660 				} else if (NSEC2MSEC(gethrtime() - settle) >=
661 				    settle_ms) {
662 					udev_device_unref(dev);
663 					break;
664 				}
665 			}
666 		}
667 
668 		udev_device_unref(dev);
669 		(void) usleep(sleep_ms * MILLISEC);
670 
671 	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
672 
673 	udev_unref(udev);
674 
675 	return (ret);
676 #else
677 	int settle_ms = 50;
678 	long sleep_ms = 10;
679 	hrtime_t start, settle;
680 	struct stat64 statbuf;
681 
682 	start = gethrtime();
683 	settle = 0;
684 
685 	do {
686 		errno = 0;
687 		if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
688 			if (settle == 0)
689 				settle = gethrtime();
690 			else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
691 				return (0);
692 		} else if (errno != ENOENT) {
693 			return (errno);
694 		}
695 
696 		usleep(sleep_ms * MILLISEC);
697 	} while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
698 
699 	return (ENODEV);
700 #endif /* HAVE_LIBUDEV */
701 }
702 
703 /*
704  * Encode the persistent devices strings
705  * used for the vdev disk label
706  */
707 static int
encode_device_strings(const char * path,vdev_dev_strs_t * ds,boolean_t wholedisk)708 encode_device_strings(const char *path, vdev_dev_strs_t *ds,
709     boolean_t wholedisk)
710 {
711 #ifdef HAVE_LIBUDEV
712 	struct udev *udev;
713 	struct udev_device *dev = NULL;
714 	char nodepath[MAXPATHLEN];
715 	char *sysname;
716 	int ret = ENODEV;
717 	hrtime_t start;
718 
719 	if ((udev = udev_new()) == NULL)
720 		return (ENXIO);
721 
722 	/* resolve path to a runtime device node instance */
723 	if (realpath(path, nodepath) == NULL)
724 		goto no_dev;
725 
726 	sysname = strrchr(nodepath, '/') + 1;
727 
728 	/*
729 	 * Wait up to 3 seconds for udev to set up the device node context
730 	 */
731 	start = gethrtime();
732 	do {
733 		dev = udev_device_new_from_subsystem_sysname(udev, "block",
734 		    sysname);
735 		if (dev == NULL)
736 			goto no_dev;
737 		if (udev_device_is_ready(dev))
738 			break;  /* udev ready */
739 
740 		udev_device_unref(dev);
741 		dev = NULL;
742 
743 		if (NSEC2MSEC(gethrtime() - start) < 10)
744 			(void) sched_yield();	/* yield/busy wait up to 10ms */
745 		else
746 			(void) usleep(10 * MILLISEC);
747 
748 	} while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC));
749 
750 	if (dev == NULL)
751 		goto no_dev;
752 
753 	/*
754 	 * Only whole disks require extra device strings
755 	 */
756 	if (!wholedisk && !udev_mpath_whole_disk(dev))
757 		goto no_dev;
758 
759 	ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
760 	if (ret != 0)
761 		goto no_dev_ref;
762 
763 	/* physical location string (optional) */
764 	if (zfs_device_get_physical(dev, ds->vds_devphys,
765 	    sizeof (ds->vds_devphys)) != 0) {
766 		ds->vds_devphys[0] = '\0'; /* empty string --> not available */
767 	}
768 
769 no_dev_ref:
770 	udev_device_unref(dev);
771 no_dev:
772 	udev_unref(udev);
773 
774 	return (ret);
775 #else
776 	return (ENOENT);
777 #endif
778 }
779 
780 /*
781  * Update a leaf vdev's persistent device strings
782  *
783  * - only applies for a dedicated leaf vdev (aka whole disk)
784  * - updated during pool create|add|attach|import
785  * - used for matching device matching during auto-{online,expand,replace}
786  * - stored in a leaf disk config label (i.e. alongside 'path' NVP)
787  * - these strings are currently not used in kernel (i.e. for vdev_disk_open)
788  *
789  * single device node example:
790  * 	devid:		'scsi-MG03SCA300_350000494a8cb3d67-part1'
791  * 	phys_path:	'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0'
792  *
793  * multipath device node example:
794  * 	devid:		'dm-uuid-mpath-35000c5006304de3f'
795  *
796  * We also store the enclosure sysfs path for turning on enclosure LEDs
797  * (if applicable):
798  *	vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
799  */
800 void
update_vdev_config_dev_strs(nvlist_t * nv)801 update_vdev_config_dev_strs(nvlist_t *nv)
802 {
803 	vdev_dev_strs_t vds;
804 	char *env, *type, *path;
805 	uint64_t wholedisk = 0;
806 	char *upath, *spath;
807 
808 	/*
809 	 * For the benefit of legacy ZFS implementations, allow
810 	 * for opting out of devid strings in the vdev label.
811 	 *
812 	 * example use:
813 	 *	env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer
814 	 *
815 	 * explanation:
816 	 * Older OpenZFS implementations had issues when attempting to
817 	 * display pool config VDEV names if a "devid" NVP value is
818 	 * present in the pool's config.
819 	 *
820 	 * For example, a pool that originated on illumos platform would
821 	 * have a devid value in the config and "zpool status" would fail
822 	 * when listing the config.
823 	 *
824 	 * A pool can be stripped of any "devid" values on import or
825 	 * prevented from adding them on zpool create|add by setting
826 	 * ZFS_VDEV_DEVID_OPT_OUT.
827 	 */
828 	env = getenv("ZFS_VDEV_DEVID_OPT_OUT");
829 	if (env && (strtoul(env, NULL, 0) > 0 ||
830 	    !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) {
831 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
832 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
833 		return;
834 	}
835 
836 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 ||
837 	    strcmp(type, VDEV_TYPE_DISK) != 0) {
838 		return;
839 	}
840 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
841 		return;
842 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
843 
844 	/*
845 	 * Update device string values in the config nvlist.
846 	 */
847 	if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) {
848 		(void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid);
849 		if (vds.vds_devphys[0] != '\0') {
850 			(void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
851 			    vds.vds_devphys);
852 		}
853 
854 		/* Add enclosure sysfs path (if disk is in an enclosure). */
855 		upath = zfs_get_underlying_path(path);
856 		spath = zfs_get_enclosure_sysfs_path(upath);
857 		if (spath)
858 			nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
859 			    spath);
860 		else
861 			nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
862 
863 		free(upath);
864 		free(spath);
865 	} else {
866 		/* Clear out any stale entries. */
867 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
868 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
869 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
870 	}
871 }
872