1 /*-
2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions, and the following disclaimer,
10 * without modification.
11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12 * substantially similar to the "NO WARRANTY" disclaimer below
13 * ("Disclaimer") and any redistribution must be conditioned upon
14 * including a substantially similar Disclaimer requirement for further
15 * binary redistribution.
16 *
17 * NO WARRANTY
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGES.
29 *
30 * Authors: Justin T. Gibbs (Spectra Logic Corporation)
31 */
32
33 /**
34 * \file case_file.cc
35 *
36 * We keep case files for any leaf vdev that is not in the optimal state.
37 * However, we only serialize to disk those events that need to be preserved
38 * across reboots. For now, this is just a log of soft errors which we
39 * accumulate in order to mark a device as degraded.
40 */
41 #include <sys/cdefs.h>
42 #include <sys/byteorder.h>
43 #include <sys/time.h>
44
45 #include <sys/fs/zfs.h>
46
47 #include <dirent.h>
48 #include <fcntl.h>
49 #include <iomanip>
50 #include <fstream>
51 #include <functional>
52 #include <sstream>
53 #include <syslog.h>
54 #include <unistd.h>
55
56 #include <libzfs.h>
57
58 #include <list>
59 #include <map>
60 #include <string>
61
62 #include <devdctl/guid.h>
63 #include <devdctl/event.h>
64 #include <devdctl/event_factory.h>
65 #include <devdctl/exception.h>
66 #include <devdctl/consumer.h>
67
68 #include "callout.h"
69 #include "vdev_iterator.h"
70 #include "zfsd_event.h"
71 #include "case_file.h"
72 #include "vdev.h"
73 #include "zfsd.h"
74 #include "zfsd_exception.h"
75 #include "zpool_list.h"
76
77 __FBSDID("$FreeBSD$");
78
79 /*============================ Namespace Control =============================*/
80 using std::hex;
81 using std::ifstream;
82 using std::stringstream;
83 using std::setfill;
84 using std::setw;
85
86 using DevdCtl::Event;
87 using DevdCtl::EventFactory;
88 using DevdCtl::EventList;
89 using DevdCtl::Guid;
90 using DevdCtl::ParseException;
91
92 /*--------------------------------- CaseFile ---------------------------------*/
93 //- CaseFile Static Data -------------------------------------------------------
94
95 CaseFileList CaseFile::s_activeCases;
96 const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
97 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
98
99 //- CaseFile Static Public Methods ---------------------------------------------
100 CaseFile *
Find(Guid poolGUID,Guid vdevGUID)101 CaseFile::Find(Guid poolGUID, Guid vdevGUID)
102 {
103 for (CaseFileList::iterator curCase = s_activeCases.begin();
104 curCase != s_activeCases.end(); curCase++) {
105
106 if (((*curCase)->PoolGUID() != poolGUID
107 && Guid::InvalidGuid() != poolGUID)
108 || (*curCase)->VdevGUID() != vdevGUID)
109 continue;
110
111 /*
112 * We only carry one active case per-vdev.
113 */
114 return (*curCase);
115 }
116 return (NULL);
117 }
118
119 CaseFile *
Find(const string & physPath)120 CaseFile::Find(const string &physPath)
121 {
122 CaseFile *result = NULL;
123
124 for (CaseFileList::iterator curCase = s_activeCases.begin();
125 curCase != s_activeCases.end(); curCase++) {
126
127 if ((*curCase)->PhysicalPath() != physPath)
128 continue;
129
130 if (result != NULL) {
131 syslog(LOG_WARNING, "Multiple casefiles found for "
132 "physical path %s. "
133 "This is most likely a bug in zfsd",
134 physPath.c_str());
135 }
136 result = *curCase;
137 }
138 return (result);
139 }
140
141
142 void
ReEvaluateByGuid(Guid poolGUID,const ZfsEvent & event)143 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
144 {
145 CaseFileList::iterator casefile;
146 for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
147 CaseFileList::iterator next = casefile;
148 next++;
149 if (poolGUID == (*casefile)->PoolGUID())
150 (*casefile)->ReEvaluate(event);
151 casefile = next;
152 }
153 }
154
155 CaseFile &
Create(Vdev & vdev)156 CaseFile::Create(Vdev &vdev)
157 {
158 CaseFile *activeCase;
159
160 activeCase = Find(vdev.PoolGUID(), vdev.GUID());
161 if (activeCase == NULL)
162 activeCase = new CaseFile(vdev);
163
164 return (*activeCase);
165 }
166
167 void
DeSerialize()168 CaseFile::DeSerialize()
169 {
170 struct dirent **caseFiles;
171
172 int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
173 DeSerializeSelector, /*compar*/NULL));
174
175 if (numCaseFiles == -1)
176 return;
177 if (numCaseFiles == 0) {
178 free(caseFiles);
179 return;
180 }
181
182 for (int i = 0; i < numCaseFiles; i++) {
183
184 DeSerializeFile(caseFiles[i]->d_name);
185 free(caseFiles[i]);
186 }
187 free(caseFiles);
188 }
189
190 bool
Empty()191 CaseFile::Empty()
192 {
193 return (s_activeCases.empty());
194 }
195
196 void
LogAll()197 CaseFile::LogAll()
198 {
199 for (CaseFileList::iterator curCase = s_activeCases.begin();
200 curCase != s_activeCases.end(); curCase++)
201 (*curCase)->Log();
202 }
203
204 void
PurgeAll()205 CaseFile::PurgeAll()
206 {
207 /*
208 * Serialize casefiles before deleting them so that they can be reread
209 * and revalidated during BuildCaseFiles.
210 * CaseFiles remove themselves from this list on destruction.
211 */
212 while (s_activeCases.size() != 0) {
213 CaseFile *casefile = s_activeCases.front();
214 casefile->Serialize();
215 delete casefile;
216 }
217
218 }
219
220 //- CaseFile Public Methods ----------------------------------------------------
221 bool
RefreshVdevState()222 CaseFile::RefreshVdevState()
223 {
224 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
225 zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
226 if (casePool == NULL)
227 return (false);
228
229 Vdev vd(casePool, CaseVdev(casePool));
230 if (vd.DoesNotExist())
231 return (false);
232
233 m_vdevState = vd.State();
234 m_vdevPhysPath = vd.PhysicalPath();
235 return (true);
236 }
237
238 bool
ReEvaluate(const string & devPath,const string & physPath,Vdev * vdev)239 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
240 {
241 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
242 zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
243
244 if (pool == NULL || !RefreshVdevState()) {
245 /*
246 * The pool or vdev for this case file is no longer
247 * part of the configuration. This can happen
248 * if we process a device arrival notification
249 * before seeing the ZFS configuration change
250 * event.
251 */
252 syslog(LOG_INFO,
253 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. "
254 "Closing\n",
255 PoolGUIDString().c_str(),
256 VdevGUIDString().c_str());
257 Close();
258
259 /*
260 * Since this event was not used to close this
261 * case, do not report it as consumed.
262 */
263 return (/*consumed*/false);
264 }
265
266 if (VdevState() > VDEV_STATE_CANT_OPEN) {
267 /*
268 * For now, newly discovered devices only help for
269 * devices that are missing. In the future, we might
270 * use a newly inserted spare to replace a degraded
271 * or faulted device.
272 */
273 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
274 PoolGUIDString().c_str(), VdevGUIDString().c_str());
275 return (/*consumed*/false);
276 }
277
278 if (vdev != NULL
279 && ( vdev->PoolGUID() == m_poolGUID
280 || vdev->PoolGUID() == Guid::InvalidGuid())
281 && vdev->GUID() == m_vdevGUID) {
282
283 if (zpool_vdev_online(pool, vdev->GUIDString().c_str(),
284 ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE,
285 &m_vdevState) != 0) {
286 syslog(LOG_ERR,
287 "Failed to online vdev(%s/%s:%s): %s: %s\n",
288 zpool_get_name(pool), vdev->GUIDString().c_str(),
289 devPath.c_str(), libzfs_error_action(g_zfsHandle),
290 libzfs_error_description(g_zfsHandle));
291 return (/*consumed*/false);
292 }
293
294 syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n",
295 zpool_get_name(pool), vdev->GUIDString().c_str(),
296 devPath.c_str(),
297 zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
298
299 /*
300 * Check the vdev state post the online action to see
301 * if we can retire this case.
302 */
303 CloseIfSolved();
304
305 return (/*consumed*/true);
306 }
307
308 /*
309 * If the auto-replace policy is enabled, and we have physical
310 * path information, try a physical path replacement.
311 */
312 if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
313 syslog(LOG_INFO,
314 "CaseFile(%s:%s:%s): AutoReplace not set. "
315 "Ignoring device insertion.\n",
316 PoolGUIDString().c_str(),
317 VdevGUIDString().c_str(),
318 zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
319 return (/*consumed*/false);
320 }
321
322 if (PhysicalPath().empty()) {
323 syslog(LOG_INFO,
324 "CaseFile(%s:%s:%s): No physical path information. "
325 "Ignoring device insertion.\n",
326 PoolGUIDString().c_str(),
327 VdevGUIDString().c_str(),
328 zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
329 return (/*consumed*/false);
330 }
331
332 if (physPath != PhysicalPath()) {
333 syslog(LOG_INFO,
334 "CaseFile(%s:%s:%s): Physical path mismatch. "
335 "Ignoring device insertion.\n",
336 PoolGUIDString().c_str(),
337 VdevGUIDString().c_str(),
338 zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
339 return (/*consumed*/false);
340 }
341
342 /* Write a label on the newly inserted disk. */
343 if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
344 syslog(LOG_ERR,
345 "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
346 zpool_get_name(pool), VdevGUIDString().c_str(),
347 libzfs_error_action(g_zfsHandle),
348 libzfs_error_description(g_zfsHandle));
349 return (/*consumed*/false);
350 }
351
352 syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
353 PoolGUIDString().c_str(), VdevGUIDString().c_str(),
354 devPath.c_str());
355 return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
356 }
357
358 bool
ReEvaluate(const ZfsEvent & event)359 CaseFile::ReEvaluate(const ZfsEvent &event)
360 {
361 bool consumed(false);
362
363 if (event.Value("type") == "misc.fs.zfs.vdev_remove") {
364 /*
365 * The Vdev we represent has been removed from the
366 * configuration. This case is no longer of value.
367 */
368 Close();
369
370 return (/*consumed*/true);
371 } else if (event.Value("type") == "misc.fs.zfs.pool_destroy") {
372 /* This Pool has been destroyed. Discard the case */
373 Close();
374
375 return (/*consumed*/true);
376 } else if (event.Value("type") == "misc.fs.zfs.config_sync") {
377 RefreshVdevState();
378 if (VdevState() < VDEV_STATE_HEALTHY)
379 consumed = ActivateSpare();
380 }
381
382
383 if (event.Value("class") == "resource.fs.zfs.removed") {
384 bool spare_activated;
385
386 if (!RefreshVdevState()) {
387 /*
388 * The pool or vdev for this case file is no longer
389 * part of the configuration. This can happen
390 * if we process a device arrival notification
391 * before seeing the ZFS configuration change
392 * event.
393 */
394 syslog(LOG_INFO,
395 "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
396 "unconfigured. Closing\n",
397 PoolGUIDString().c_str(),
398 VdevGUIDString().c_str());
399 /*
400 * Close the case now so we won't waste cycles in the
401 * system rescan
402 */
403 Close();
404
405 /*
406 * Since this event was not used to close this
407 * case, do not report it as consumed.
408 */
409 return (/*consumed*/false);
410 }
411
412 /*
413 * Discard any tentative I/O error events for
414 * this case. They were most likely caused by the
415 * hot-unplug of this device.
416 */
417 PurgeTentativeEvents();
418
419 /* Try to activate spares if they are available */
420 spare_activated = ActivateSpare();
421
422 /*
423 * Rescan the drives in the system to see if a recent
424 * drive arrival can be used to solve this case.
425 */
426 ZfsDaemon::RequestSystemRescan();
427
428 /*
429 * Consume the event if we successfully activated a spare.
430 * Otherwise, leave it in the unconsumed events list so that the
431 * future addition of a spare to this pool might be able to
432 * close the case
433 */
434 consumed = spare_activated;
435 } else if (event.Value("class") == "resource.fs.zfs.statechange") {
436 RefreshVdevState();
437 /*
438 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
439 * activate a hotspare. Otherwise, ignore the event
440 */
441 if (VdevState() == VDEV_STATE_FAULTED ||
442 VdevState() == VDEV_STATE_DEGRADED ||
443 VdevState() == VDEV_STATE_CANT_OPEN)
444 (void) ActivateSpare();
445 consumed = true;
446 }
447 else if (event.Value("class") == "ereport.fs.zfs.io" ||
448 event.Value("class") == "ereport.fs.zfs.checksum") {
449
450 m_tentativeEvents.push_front(event.DeepCopy());
451 RegisterCallout(event);
452 consumed = true;
453 }
454
455 bool closed(CloseIfSolved());
456
457 return (consumed || closed);
458 }
459
460 /* Find a Vdev containing the vdev with the given GUID */
461 static nvlist_t*
find_parent(nvlist_t * pool_config,nvlist_t * config,DevdCtl::Guid child_guid)462 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
463 {
464 nvlist_t **vdevChildren;
465 int error;
466 unsigned ch, numChildren;
467
468 error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
469 &vdevChildren, &numChildren);
470
471 if (error != 0 || numChildren == 0)
472 return (NULL);
473
474 for (ch = 0; ch < numChildren; ch++) {
475 nvlist *result;
476 Vdev vdev(pool_config, vdevChildren[ch]);
477
478 if (vdev.GUID() == child_guid)
479 return (config);
480
481 result = find_parent(pool_config, vdevChildren[ch], child_guid);
482 if (result != NULL)
483 return (result);
484 }
485
486 return (NULL);
487 }
488
489 bool
ActivateSpare()490 CaseFile::ActivateSpare() {
491 nvlist_t *config, *nvroot, *parent_config;
492 nvlist_t **spares;
493 char *devPath, *vdev_type;
494 const char *poolname;
495 u_int nspares, i;
496 int error;
497
498 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
499 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
500 if (zhp == NULL) {
501 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
502 "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
503 return (false);
504 }
505 poolname = zpool_get_name(zhp);
506 config = zpool_get_config(zhp, NULL);
507 if (config == NULL) {
508 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
509 "config for pool %s", poolname);
510 return (false);
511 }
512 error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
513 if (error != 0){
514 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
515 "tree for pool %s", poolname);
516 return (false);
517 }
518
519 parent_config = find_parent(config, nvroot, m_vdevGUID);
520 if (parent_config != NULL) {
521 char *parent_type;
522
523 /*
524 * Don't activate spares for members of a "replacing" vdev.
525 * They're already dealt with. Sparing them will just drag out
526 * the resilver process.
527 */
528 error = nvlist_lookup_string(parent_config,
529 ZPOOL_CONFIG_TYPE, &parent_type);
530 if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
531 return (false);
532 }
533
534 nspares = 0;
535 nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
536 &nspares);
537 if (nspares == 0) {
538 /* The pool has no spares configured */
539 syslog(LOG_INFO, "CaseFile::ActivateSpare: "
540 "No spares available for pool %s", poolname);
541 return (false);
542 }
543 for (i = 0; i < nspares; i++) {
544 uint64_t *nvlist_array;
545 vdev_stat_t *vs;
546 uint_t nstats;
547
548 if (nvlist_lookup_uint64_array(spares[i],
549 ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
550 syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
551 "find vdev stats for pool %s, spare %d",
552 poolname, i);
553 return (false);
554 }
555 vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
556
557 if ((vs->vs_aux != VDEV_AUX_SPARED)
558 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
559 /* We found a usable spare */
560 break;
561 }
562 }
563
564 if (i == nspares) {
565 /* No available spares were found */
566 return (false);
567 }
568
569 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
570 if (error != 0) {
571 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
572 "the path of pool %s, spare %d. Error %d",
573 poolname, i, error);
574 return (false);
575 }
576
577 error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
578 if (error != 0) {
579 syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
580 "the vdev type of pool %s, spare %d. Error %d",
581 poolname, i, error);
582 return (false);
583 }
584
585 return (Replace(vdev_type, devPath, /*isspare*/true));
586 }
587
588 void
RegisterCallout(const Event & event)589 CaseFile::RegisterCallout(const Event &event)
590 {
591 timeval now, countdown, elapsed, timestamp, zero, remaining;
592
593 gettimeofday(&now, 0);
594 timestamp = event.GetTimestamp();
595 timersub(&now, ×tamp, &elapsed);
596 timersub(&s_removeGracePeriod, &elapsed, &countdown);
597 /*
598 * If countdown is <= zero, Reset the timer to the
599 * smallest positive time value instead
600 */
601 timerclear(&zero);
602 if (timercmp(&countdown, &zero, <=)) {
603 timerclear(&countdown);
604 countdown.tv_usec = 1;
605 }
606
607 remaining = m_tentativeTimer.TimeRemaining();
608
609 if (!m_tentativeTimer.IsPending()
610 || timercmp(&countdown, &remaining, <))
611 m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
612 }
613
614
615 bool
CloseIfSolved()616 CaseFile::CloseIfSolved()
617 {
618 if (m_events.empty()
619 && m_tentativeEvents.empty()) {
620
621 /*
622 * We currently do not track or take actions on
623 * devices in the degraded or faulted state.
624 * Once we have support for spare pools, we'll
625 * retain these cases so that any spares added in
626 * the future can be applied to them.
627 */
628 switch (VdevState()) {
629 case VDEV_STATE_HEALTHY:
630 /* No need to keep cases for healthy vdevs */
631 Close();
632 return (true);
633 case VDEV_STATE_REMOVED:
634 case VDEV_STATE_CANT_OPEN:
635 /*
636 * Keep open. We may solve it with a newly inserted
637 * device.
638 */
639 case VDEV_STATE_FAULTED:
640 case VDEV_STATE_DEGRADED:
641 /*
642 * Keep open. We may solve it with the future
643 * addition of a spare to the pool
644 */
645 case VDEV_STATE_UNKNOWN:
646 case VDEV_STATE_CLOSED:
647 case VDEV_STATE_OFFLINE:
648 /*
649 * Keep open? This may not be the correct behavior,
650 * but it's what we've always done
651 */
652 ;
653 }
654
655 /*
656 * Re-serialize the case in order to remove any
657 * previous event data.
658 */
659 Serialize();
660 }
661
662 return (false);
663 }
664
665 void
Log()666 CaseFile::Log()
667 {
668 syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
669 VdevGUIDString().c_str(), PhysicalPath().c_str());
670 syslog(LOG_INFO, "\tVdev State = %s\n",
671 zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
672 if (m_tentativeEvents.size() != 0) {
673 syslog(LOG_INFO, "\t=== Tentative Events ===\n");
674 for (EventList::iterator event(m_tentativeEvents.begin());
675 event != m_tentativeEvents.end(); event++)
676 (*event)->Log(LOG_INFO);
677 }
678 if (m_events.size() != 0) {
679 syslog(LOG_INFO, "\t=== Events ===\n");
680 for (EventList::iterator event(m_events.begin());
681 event != m_events.end(); event++)
682 (*event)->Log(LOG_INFO);
683 }
684 }
685
686 //- CaseFile Static Protected Methods ------------------------------------------
687 void
OnGracePeriodEnded(void * arg)688 CaseFile::OnGracePeriodEnded(void *arg)
689 {
690 CaseFile &casefile(*static_cast<CaseFile *>(arg));
691
692 casefile.OnGracePeriodEnded();
693 }
694
695 int
DeSerializeSelector(const struct dirent * dirEntry)696 CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
697 {
698 uint64_t poolGUID;
699 uint64_t vdevGUID;
700
701 if (dirEntry->d_type == DT_REG
702 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
703 &poolGUID, &vdevGUID) == 2)
704 return (1);
705 return (0);
706 }
707
708 void
DeSerializeFile(const char * fileName)709 CaseFile::DeSerializeFile(const char *fileName)
710 {
711 string fullName(s_caseFilePath + '/' + fileName);
712 CaseFile *existingCaseFile(NULL);
713 CaseFile *caseFile(NULL);
714
715 try {
716 uint64_t poolGUID;
717 uint64_t vdevGUID;
718 nvlist_t *vdevConf;
719
720 if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
721 &poolGUID, &vdevGUID) != 2) {
722 throw ZfsdException("CaseFile::DeSerialize: "
723 "Unintelligible CaseFile filename %s.\n", fileName);
724 }
725 existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
726 if (existingCaseFile != NULL) {
727 /*
728 * If the vdev is already degraded or faulted,
729 * there's no point in keeping the state around
730 * that we use to put a drive into the degraded
731 * state. However, if the vdev is simply missing,
732 * preserve the case data in the hopes that it will
733 * return.
734 */
735 caseFile = existingCaseFile;
736 vdev_state curState(caseFile->VdevState());
737 if (curState > VDEV_STATE_CANT_OPEN
738 && curState < VDEV_STATE_HEALTHY) {
739 unlink(fileName);
740 return;
741 }
742 } else {
743 ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
744 if (zpl.empty()
745 || (vdevConf = VdevIterator(zpl.front())
746 .Find(vdevGUID)) == NULL) {
747 /*
748 * Either the pool no longer exists
749 * or this vdev is no longer a member of
750 * the pool.
751 */
752 unlink(fullName.c_str());
753 return;
754 }
755
756 /*
757 * Any vdev we find that does not have a case file
758 * must be in the healthy state and thus worthy of
759 * continued SERD data tracking.
760 */
761 caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
762 }
763
764 ifstream caseStream(fullName.c_str());
765 if (!caseStream)
766 throw ZfsdException("CaseFile::DeSerialize: Unable to "
767 "read %s.\n", fileName);
768
769 caseFile->DeSerialize(caseStream);
770 } catch (const ParseException &exp) {
771
772 exp.Log();
773 if (caseFile != existingCaseFile)
774 delete caseFile;
775
776 /*
777 * Since we can't parse the file, unlink it so we don't
778 * trip over it again.
779 */
780 unlink(fileName);
781 } catch (const ZfsdException &zfsException) {
782
783 zfsException.Log();
784 if (caseFile != existingCaseFile)
785 delete caseFile;
786 }
787 }
788
789 //- CaseFile Protected Methods -------------------------------------------------
CaseFile(const Vdev & vdev)790 CaseFile::CaseFile(const Vdev &vdev)
791 : m_poolGUID(vdev.PoolGUID()),
792 m_vdevGUID(vdev.GUID()),
793 m_vdevState(vdev.State()),
794 m_vdevPhysPath(vdev.PhysicalPath())
795 {
796 stringstream guidString;
797
798 guidString << m_vdevGUID;
799 m_vdevGUIDString = guidString.str();
800 guidString.str("");
801 guidString << m_poolGUID;
802 m_poolGUIDString = guidString.str();
803
804 s_activeCases.push_back(this);
805
806 syslog(LOG_INFO, "Creating new CaseFile:\n");
807 Log();
808 }
809
~CaseFile()810 CaseFile::~CaseFile()
811 {
812 PurgeEvents();
813 PurgeTentativeEvents();
814 m_tentativeTimer.Stop();
815 s_activeCases.remove(this);
816 }
817
818 void
PurgeEvents()819 CaseFile::PurgeEvents()
820 {
821 for (EventList::iterator event(m_events.begin());
822 event != m_events.end(); event++)
823 delete *event;
824
825 m_events.clear();
826 }
827
828 void
PurgeTentativeEvents()829 CaseFile::PurgeTentativeEvents()
830 {
831 for (EventList::iterator event(m_tentativeEvents.begin());
832 event != m_tentativeEvents.end(); event++)
833 delete *event;
834
835 m_tentativeEvents.clear();
836 }
837
838 void
SerializeEvList(const EventList events,int fd,const char * prefix) const839 CaseFile::SerializeEvList(const EventList events, int fd,
840 const char* prefix) const
841 {
842 if (events.empty())
843 return;
844 for (EventList::const_iterator curEvent = events.begin();
845 curEvent != events.end(); curEvent++) {
846 const string &eventString((*curEvent)->GetEventString());
847
848 // TODO: replace many write(2) calls with a single writev(2)
849 if (prefix)
850 write(fd, prefix, strlen(prefix));
851 write(fd, eventString.c_str(), eventString.length());
852 }
853 }
854
855 void
Serialize()856 CaseFile::Serialize()
857 {
858 stringstream saveFile;
859
860 saveFile << setfill('0')
861 << s_caseFilePath << "/"
862 << "pool_" << PoolGUIDString()
863 << "_vdev_" << VdevGUIDString()
864 << ".case";
865
866 if (m_events.empty() && m_tentativeEvents.empty()) {
867 unlink(saveFile.str().c_str());
868 return;
869 }
870
871 int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
872 if (fd == -1) {
873 syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
874 saveFile.str().c_str());
875 return;
876 }
877 SerializeEvList(m_events, fd);
878 SerializeEvList(m_tentativeEvents, fd, "tentative ");
879 close(fd);
880 }
881
882 /*
883 * XXX: This method assumes that events may not contain embedded newlines. If
884 * ever events can contain embedded newlines, then CaseFile must switch
885 * serialization formats
886 */
887 void
DeSerialize(ifstream & caseStream)888 CaseFile::DeSerialize(ifstream &caseStream)
889 {
890 string evString;
891 const EventFactory &factory(ZfsDaemon::Get().GetFactory());
892
893 caseStream >> std::noskipws >> std::ws;
894 while (caseStream.good()) {
895 /*
896 * Outline:
897 * read the beginning of a line and check it for
898 * "tentative". If found, discard "tentative".
899 * Create a new event
900 * continue
901 */
902 EventList* destEvents;
903 const string tentFlag("tentative ");
904 string line;
905 std::stringbuf lineBuf;
906
907 caseStream.get(lineBuf);
908 caseStream.ignore(); /*discard the newline character*/
909 line = lineBuf.str();
910 if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
911 /* Discard "tentative" */
912 line.erase(0, tentFlag.size());
913 destEvents = &m_tentativeEvents;
914 } else {
915 destEvents = &m_events;
916 }
917 Event *event(Event::CreateEvent(factory, line));
918 if (event != NULL) {
919 destEvents->push_back(event);
920 RegisterCallout(*event);
921 }
922 }
923 }
924
925 void
Close()926 CaseFile::Close()
927 {
928 /*
929 * This case is no longer relevant. Clean up our
930 * serialization file, and delete the case.
931 */
932 syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
933 PoolGUIDString().c_str(), VdevGUIDString().c_str(),
934 zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
935
936 /*
937 * Serialization of a Case with no event data, clears the
938 * Serialization data for that event.
939 */
940 PurgeEvents();
941 Serialize();
942
943 delete this;
944 }
945
946 void
OnGracePeriodEnded()947 CaseFile::OnGracePeriodEnded()
948 {
949 bool should_fault, should_degrade;
950 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
951 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
952
953 m_events.splice(m_events.begin(), m_tentativeEvents);
954 should_fault = ShouldFault();
955 should_degrade = ShouldDegrade();
956
957 if (should_fault || should_degrade) {
958 if (zhp == NULL
959 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
960 /*
961 * Either the pool no longer exists
962 * or this vdev is no longer a member of
963 * the pool.
964 */
965 Close();
966 return;
967 }
968
969 }
970
971 /* A fault condition has priority over a degrade condition */
972 if (ShouldFault()) {
973 /* Fault the vdev and close the case. */
974 if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
975 VDEV_AUX_ERR_EXCEEDED) == 0) {
976 syslog(LOG_INFO, "Faulting vdev(%s/%s)",
977 PoolGUIDString().c_str(),
978 VdevGUIDString().c_str());
979 Close();
980 return;
981 }
982 else {
983 syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
984 PoolGUIDString().c_str(),
985 VdevGUIDString().c_str(),
986 libzfs_error_action(g_zfsHandle),
987 libzfs_error_description(g_zfsHandle));
988 }
989 }
990 else if (ShouldDegrade()) {
991 /* Degrade the vdev and close the case. */
992 if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
993 VDEV_AUX_ERR_EXCEEDED) == 0) {
994 syslog(LOG_INFO, "Degrading vdev(%s/%s)",
995 PoolGUIDString().c_str(),
996 VdevGUIDString().c_str());
997 Close();
998 return;
999 }
1000 else {
1001 syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
1002 PoolGUIDString().c_str(),
1003 VdevGUIDString().c_str(),
1004 libzfs_error_action(g_zfsHandle),
1005 libzfs_error_description(g_zfsHandle));
1006 }
1007 }
1008 Serialize();
1009 }
1010
1011 Vdev
BeingReplacedBy(zpool_handle_t * zhp)1012 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1013 Vdev vd(zhp, CaseVdev(zhp));
1014 std::list<Vdev> children;
1015 std::list<Vdev>::iterator children_it;
1016
1017 Vdev parent(vd.Parent());
1018 Vdev replacing(NonexistentVdev);
1019
1020 /*
1021 * To determine whether we are being replaced by another spare that
1022 * is still working, then make sure that it is currently spared and
1023 * that the spare is either resilvering or healthy. If any of these
1024 * conditions fail, then we are not being replaced by a spare.
1025 *
1026 * If the spare is healthy, then the case file should be closed very
1027 * soon after this check.
1028 */
1029 if (parent.DoesNotExist()
1030 || parent.Name(zhp, /*verbose*/false) != "spare")
1031 return (NonexistentVdev);
1032
1033 children = parent.Children();
1034 children_it = children.begin();
1035 for (;children_it != children.end(); children_it++) {
1036 Vdev child = *children_it;
1037
1038 /* Skip our vdev. */
1039 if (child.GUID() == VdevGUID())
1040 continue;
1041 /*
1042 * Accept the first child that doesn't match our GUID, or
1043 * any resilvering/healthy device if one exists.
1044 */
1045 if (replacing.DoesNotExist() || child.IsResilvering()
1046 || child.State() == VDEV_STATE_HEALTHY)
1047 replacing = child;
1048 }
1049
1050 return (replacing);
1051 }
1052
1053 bool
Replace(const char * vdev_type,const char * path,bool isspare)1054 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1055 nvlist_t *nvroot, *newvd;
1056 const char *poolname;
1057 string oldstr(VdevGUIDString());
1058 bool retval = true;
1059
1060 /* Figure out what pool we're working on */
1061 ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1062 zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1063 if (zhp == NULL) {
1064 syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1065 "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1066 return (false);
1067 }
1068 poolname = zpool_get_name(zhp);
1069 Vdev vd(zhp, CaseVdev(zhp));
1070 Vdev replaced(BeingReplacedBy(zhp));
1071
1072 if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1073 /* If we are already being replaced by a working spare, pass. */
1074 if (replaced.IsResilvering()
1075 || replaced.State() == VDEV_STATE_HEALTHY) {
1076 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1077 "replaced", VdevGUIDString().c_str(), path);
1078 return (/*consumed*/false);
1079 }
1080 /*
1081 * If we have already been replaced by a spare, but that spare
1082 * is broken, we must spare the spare, not the original device.
1083 */
1084 oldstr = replaced.GUIDString();
1085 syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1086 "broken spare %s instead", VdevGUIDString().c_str(),
1087 path, oldstr.c_str());
1088 }
1089
1090 /*
1091 * Build a root vdev/leaf vdev configuration suitable for
1092 * zpool_vdev_attach. Only enough data for the kernel to find
1093 * the device (i.e. type and disk device node path) are needed.
1094 */
1095 nvroot = NULL;
1096 newvd = NULL;
1097
1098 if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1099 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1100 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1101 "configuration data.", poolname, oldstr.c_str());
1102 if (nvroot != NULL)
1103 nvlist_free(nvroot);
1104 return (false);
1105 }
1106 if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1107 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1108 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1109 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1110 &newvd, 1) != 0) {
1111 syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1112 "configuration data.", poolname, oldstr.c_str());
1113 nvlist_free(newvd);
1114 nvlist_free(nvroot);
1115 return (true);
1116 }
1117
1118 /* Data was copied when added to the root vdev. */
1119 nvlist_free(newvd);
1120
1121 retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1122 /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
1123 if (retval)
1124 syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1125 poolname, oldstr.c_str(), path);
1126 else
1127 syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1128 poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1129 libzfs_error_description(g_zfsHandle));
1130 nvlist_free(nvroot);
1131
1132 return (retval);
1133 }
1134
1135 /* Does the argument event refer to a checksum error? */
1136 static bool
IsChecksumEvent(const Event * const event)1137 IsChecksumEvent(const Event* const event)
1138 {
1139 return ("ereport.fs.zfs.checksum" == event->Value("type"));
1140 }
1141
1142 /* Does the argument event refer to an IO error? */
1143 static bool
IsIOEvent(const Event * const event)1144 IsIOEvent(const Event* const event)
1145 {
1146 return ("ereport.fs.zfs.io" == event->Value("type"));
1147 }
1148
1149 bool
ShouldDegrade() const1150 CaseFile::ShouldDegrade() const
1151 {
1152 return (std::count_if(m_events.begin(), m_events.end(),
1153 IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1154 }
1155
1156 bool
ShouldFault() const1157 CaseFile::ShouldFault() const
1158 {
1159 return (std::count_if(m_events.begin(), m_events.end(),
1160 IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
1161 }
1162
1163 nvlist_t *
CaseVdev(zpool_handle_t * zhp) const1164 CaseFile::CaseVdev(zpool_handle_t *zhp) const
1165 {
1166 return (VdevIterator(zhp).Find(VdevGUID()));
1167 }
1168