/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/zio.h>
#include <umem.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <assert.h>
#include <stdio.h>
#include "raidz_test.h"

static int *rand_data;
raidz_test_opts_t rto_opts;

static char gdb[256];
static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d";

static void sig_handler(int signo)
{
	struct sigaction action;
	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);

	if (rto_opts.rto_gdb)
		if (system(gdb)) { }

	raise(signo);
}

static void print_opts(raidz_test_opts_t *opts, boolean_t force)
{
	char *verbose;
	switch (opts->rto_v) {
	case 0:
		verbose = "no";
		break;
	case 1:
		verbose = "info";
		break;
	default:
		verbose = "debug";
		break;
	}

	if (force || opts->rto_v >= D_INFO) {
		(void) fprintf(stdout, DBLSEP "Running with options:\n"
		    "  (-a) zio ashift                   : %zu\n"
		    "  (-o) zio offset                   : 1 << %zu\n"
		    "  (-e) expanded map                 : %s\n"
		    "  (-r) reflow offset                : %llx\n"
		    "  (-d) number of raidz data columns : %zu\n"
		    "  (-s) size of DATA                 : 1 << %zu\n"
		    "  (-S) sweep parameters             : %s \n"
		    "  (-v) verbose                      : %s \n\n",
		    opts->rto_ashift,				/* -a */
		    ilog2(opts->rto_offset),			/* -o */
		    opts->rto_expand ? "yes" : "no",		/* -e */
		    (u_longlong_t)opts->rto_expand_offset,	/* -r */
		    opts->rto_dcols,				/* -d */
		    ilog2(opts->rto_dsize),			/* -s */
		    opts->rto_sweep ? "yes" : "no",		/* -S */
		    verbose);					/* -v */
	}
}

static void usage(boolean_t requested)
{
	const raidz_test_opts_t *o = &rto_opts_defaults;

	FILE *fp = requested ? stdout : stderr;

	(void) fprintf(fp, "Usage:\n"
	    "\t[-a zio ashift (default: %zu)]\n"
	    "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
	    "\t[-d number of raidz data columns (default: %zu)]\n"
	    "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
	    "\t[-S parameter sweep (default: %s)]\n"
	    "\t[-t timeout for parameter sweep test]\n"
	    "\t[-B benchmark all raidz implementations]\n"
	    "\t[-e use expanded raidz map (default: %s)]\n"
	    "\t[-r expanded raidz map reflow offset (default: %llx)]\n"
	    "\t[-v increase verbosity (default: %zu)]\n"
	    "\t[-h (print help)]\n"
	    "\t[-T test the test, see if failure would be detected]\n"
	    "\t[-D debug (attach gdb on SIGSEGV)]\n"
	    "",
	    o->rto_ashift,				/* -a */
	    ilog2(o->rto_offset),			/* -o */
	    o->rto_dcols,				/* -d */
	    ilog2(o->rto_dsize),			/* -s */
	    o->rto_sweep ? "yes" : "no",		/* -S */
	    o->rto_expand ? "yes" : "no",		/* -e */
	    (u_longlong_t)o->rto_expand_offset,		/* -r */
	    o->rto_v);					/* -v */

	exit(requested ? 0 : 1);
}

static void process_options(int argc, char **argv)
{
	size_t value;
	int opt;

	raidz_test_opts_t *o = &rto_opts;

	bcopy(&rto_opts_defaults, o, sizeof (*o));

	while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
		value = 0;

		switch (opt) {
		case 'a':
			value = strtoull(optarg, NULL, 0);
			o->rto_ashift = MIN(13, MAX(9, value));
			break;
		case 'e':
			o->rto_expand = 1;
			break;
		case 'r':
			o->rto_expand_offset = strtoull(optarg, NULL, 0);
			break;
		case 'o':
			value = strtoull(optarg, NULL, 0);
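			/*
			 * The offset below is 2^min(value, 12), rounded
			 * down to a 512-byte multiple (values below 9
			 * yield 0).
			 */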
			o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
			break;
		case 'd':
			value = strtoull(optarg, NULL, 0);
			o->rto_dcols = MIN(255, MAX(1, value));
			break;
		case 's':
			value = strtoull(optarg, NULL, 0);
			o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT,
			    MAX(SPA_MINBLOCKSHIFT, value));
			break;
		case 't':
			value = strtoull(optarg, NULL, 0);
			o->rto_sweep_timeout = value;
			break;
		case 'v':
			o->rto_v++;
			break;
		case 'S':
			o->rto_sweep = 1;
			break;
		case 'B':
			o->rto_benchmark = 1;
			break;
		case 'D':
			o->rto_gdb = 1;
			break;
		case 'T':
			o->rto_sanity = 1;
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}
}

#define	DATA_COL(rr, i) ((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_abd)
#define	DATA_COL_SIZE(rr, i) ((rr)->rr_col[(rr)->rr_firstdatacol + (i)].rc_size)

#define	CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
#define	CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)

static int
cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
{
	int r, i, ret = 0;

	VERIFY(parity >= 1 && parity <= 3);

	for (r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t * const rr = rm->rm_row[r];
		raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
		for (i = 0; i < parity; i++) {
			if (CODE_COL_SIZE(rrg, i) == 0) {
				VERIFY0(CODE_COL_SIZE(rr, i));
				continue;
			}

			if (abd_cmp(CODE_COL(rr, i),
			    CODE_COL(rrg, i)) != 0) {
				ret++;
				LOG_OPT(D_DEBUG, opts,
				    "\nParity block [%d] different!\n", i);
			}
		}
	}
	return (ret);
}

static int
cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
{
	int r, i, dcols, ret = 0;

	for (r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		raidz_row_t *rrg = opts->rm_golden->rm_row[r];
		dcols = opts->rm_golden->rm_row[0]->rr_cols -
		    raidz_parity(opts->rm_golden);
		for (i = 0; i < dcols; i++) {
			if (DATA_COL_SIZE(rrg, i) == 0) {
				VERIFY0(DATA_COL_SIZE(rr, i));
				continue;
			}

			if (abd_cmp(DATA_COL(rrg, i),
			    DATA_COL(rr, i)) != 0) {
				ret++;

				LOG_OPT(D_DEBUG, opts,
				    "\nData block [%d] different!\n", i);
			}
		}
	}
	return (ret);
}

static int
init_rand(void *data, size_t size, void *private)
{
	int i;
	int *dst = (int *)data;

	for (i = 0; i < size / sizeof (int); i++)
		dst[i] = rand_data[i];

	return (0);
}

static void
corrupt_columns(raidz_map_t *rm, const int *tgts, const int cnt)
{
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		for (int i = 0; i < cnt; i++) {
			raidz_col_t *col = &rr->rr_col[tgts[i]];
			abd_iterate_func(col->rc_abd, 0, col->rc_size,
			    init_rand, NULL);
		}
	}
}

void
init_zio_abd(zio_t *zio)
{
	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
}

static void
fini_raidz_map(zio_t **zio, raidz_map_t **rm)
{
	vdev_raidz_map_free(*rm);
	raidz_free((*zio)->io_abd, (*zio)->io_size);
	umem_free(*zio, sizeof (zio_t));

	*zio = NULL;
	*rm = NULL;
}

static int
init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
{
	int err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;
	const size_t total_ncols = opts->rto_dcols + parity;

	if (opts->rm_golden) {
		fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
	}

	opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
	zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
	opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;

	opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
	zio_test->io_abd = raidz_alloc(opts->rto_dsize);

	init_zio_abd(opts->zio_golden);
	init_zio_abd(zio_test);

	VERIFY0(vdev_raidz_impl_set("original"));

	if (opts->rto_expand) {
		opts->rm_golden =
		    vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
		    opts->zio_golden->io_size, opts->zio_golden->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
		rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
		    zio_test->io_size, zio_test->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
	} else {
		opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
		    opts->rto_ashift, total_ncols, parity);
		rm_test = vdev_raidz_map_alloc(zio_test,
		    opts->rto_ashift, total_ncols, parity);
	}

	VERIFY(opts->zio_golden);
	VERIFY(opts->rm_golden);

	vdev_raidz_generate_parity(opts->rm_golden);
	vdev_raidz_generate_parity(rm_test);

	/* sanity check */
	err |= cmp_data(opts, rm_test);
	err |= cmp_code(opts, rm_test, parity);

	if (err)
		ERR("initializing the golden copy ... [FAIL]!\n");

	/* tear down raidz_map of test zio */
	fini_raidz_map(&zio_test, &rm_test);

	return (err);
}

/*
 * If reflow is not in progress, reflow_offset should be UINT64_MAX.
 * For each row, if the row is entirely before reflow_offset, it will
 * come from the new location. Otherwise this row will come from the
 * old location. Therefore, rows that straddle the reflow_offset will
 * come from the old location.
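 *
 * For example (illustrative numbers, not from a real pool): with 4 data
 * sectors per row and the reflow point at sector 100, a row starting at
 * sector 90 lies entirely before the copy point and is read from the
 * new location, while a row starting at sector 98 extends past it and
 * is still read from the old location.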
 *
 * NOTE: Until raidz expansion is implemented this function is only
 * needed by raidz_test.c to test the multi-row raidz_map_t functionality.
 */
raidz_map_t *
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset)
{
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;
	uint64_t q, r, bc, devidx, asize = 0, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);
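	/*
	 * Worked example (numbers chosen only for illustration): with
	 * logical_cols = 5, nparity = 1 and s = 11 data sectors,
	 * q = 11 / 4 = 2, r = 11 - 2 * 4 = 3, bc = 3 + 1 = 4 and
	 * tot = 11 + 1 * (2 + 1) = 14, giving rows = howmany(14, 5) = 3
	 * and cols = MIN(14, 5) = 5.
	 */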

	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;

	for (uint64_t row = 0; row < rows; row++) {
		raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
		    rr_col[cols]), KM_SLEEP);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and any part of this
		 * row has not been copied, then use the old location of
		 * this row.
		 */
		int row_phys_cols = physical_cols;
		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
			row_phys_cols--;
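
		/*
		 * For instance (illustrative values): with
		 * logical_cols - nparity = 4 and reflow_offset >> ashift
		 * = 100, a row at b = 90 satisfies 90 + 4 <= 100, so it
		 * has been fully copied and uses all physical_cols
		 * children, while a row at b = 98 gives 98 + 4 > 100 and
		 * falls back to the old, one-narrower layout.
		 */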

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * We set cols to the entire width of the block, even
		 * if this row is shorter. This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width". Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_cols = cols;
		rr->rr_bigcols = bc;
		rr->rr_missingdata = 0;
		rr->rr_missingparity = 0;
		rr->rr_firstdatacol = nparity;
		rr->rr_abd_copy = NULL;
		rr->rr_abd_empty = NULL;
		rr->rr_nempty = 0;

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			rr->rr_col[c].rc_devidx = child_id;
			rr->rr_col[c].rc_offset = child_offset;
			rr->rr_col[c].rc_gdata = NULL;
			rr->rr_col[c].rc_orig_data = NULL;
			rr->rr_col[c].rc_error = 0;
			rr->rr_col[c].rc_tried = 0;
			rr->rr_col[c].rc_skipped = 0;
			rr->rr_col[c].rc_need_orig_restore = B_FALSE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd =
				    abd_alloc_linear(rr->rr_col[c].rc_size,
				    B_TRUE);
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block; this column
				 * exists only for parity generation.
				 */
				rr->rr_col[c].rc_size = 0;
				rr->rr_col[c].rc_abd = NULL;
			} else {
				/*
				 * "data column" (col excluding parity)
				 * Add an ASCII art diagram here
				 */
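				/*
				 * A concrete illustration (values made up
				 * for the example): with rows = 3 and
				 * r = 2, data columns dc < 2 hold a sector
				 * in every row at off = dc * 3 + row,
				 * while a later column such as dc = 3
				 * holds rows - 1 = 2 sectors at
				 * off = 2 * 3 + (3 - 2) * 2 + row.
				 */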
				uint64_t off;

				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rr->rr_col[c].rc_size = 1ULL << ashift;
				rr->rr_col[c].rc_abd =
				    abd_get_offset(abd, off << ashift);
			}

			asize += rr->rr_col[c].rc_size;
		}
		/*
		 * If all data stored spans all columns, there's a danger that
		 * parity will always be on the same device and, since parity
		 * isn't read during normal operation, that that device's I/O
		 * bandwidth won't be used effectively. We therefore switch
		 * the parity every 1MB.
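		 * (Concretely, the swap below fires whenever bit 20 of the
		 * zio's byte offset is set, i.e. the (offset & (1ULL << 20))
		 * test.)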
		 *
		 * ...at least that was, ostensibly, the theory. As a practical
		 * matter unless we juggle the parity between all devices
		 * evenly, we won't see any benefit. Further, occasional writes
		 * that aren't a multiple of the LCM of the number of children
		 * and the minimum stripe width are sufficient to avoid pessimal
		 * behavior. Unfortunately, this decision created an implicit
		 * on-disk format requirement that we need to support for all
		 * eternity, but only for single-parity RAID-Z.
		 *
		 * If we intend to skip a sector in the zeroth column for
		 * padding we must make sure to note this swap. We will never
		 * intend to skip the first column since at least one data and
		 * one parity column must appear in each row.
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
			devidx = rr->rr_col[0].rc_devidx;
			uint64_t o = rr->rr_col[0].rc_offset;
			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[1].rc_devidx = devidx;
			rr->rr_col[1].rc_offset = o;
		}

	}
	ASSERT3U(asize, ==, tot << ashift);

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
	raidz_map_t *rm = NULL;
	const size_t alloc_dsize = opts->rto_dsize;
	const size_t total_ncols = opts->rto_dcols + parity;
	const int ccols[] = { 0, 1, 2 };

	VERIFY(zio);
	VERIFY(parity <= 3 && parity >= 1);

	*zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	(*zio)->io_offset = 0;
	(*zio)->io_size = alloc_dsize;
	(*zio)->io_abd = raidz_alloc(alloc_dsize);
	init_zio_abd(*zio);

	if (opts->rto_expand) {
		rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
		    (*zio)->io_size, (*zio)->io_offset,
		    opts->rto_ashift, total_ncols+1, total_ncols,
		    parity, opts->rto_expand_offset);
	} else {
		rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
		    total_ncols, parity);
	}
	VERIFY(rm);

	/* Make sure code columns are destroyed */
	corrupt_columns(rm, ccols, parity);

	return (rm);
}

static int
run_gen_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	int fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing parity generation...\n");

	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (0 != vdev_raidz_impl_set(*impl_name)) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else {
			LOG(D_INFO, "[SUPPORTED]\n");
		}

		for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

			/* Check if should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			/* create suitable raidz_map */
			rm_test = init_raidz_map(opts, &zio_test, fn+1);
			VERIFY(rm_test);

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_gen_name[fn]);

			if (!opts->rto_sanity)
				vdev_raidz_generate_parity(rm_test);

			if (cmp_code(opts, rm_test, fn+1) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;
			} else
				LOG(D_INFO, "[PASS]\n");

			fini_raidz_map(&zio_test, &rm_test);
		}
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}

static int
run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
{
	int x0, x1, x2;
	int tgtidx[3];
	int err = 0;
	static const int rec_tgts[7][3] = {
		{1, 2, 3},	/* rec_p:   bad QR & D[0] */
		{0, 2, 3},	/* rec_q:   bad PR & D[0] */
		{0, 1, 3},	/* rec_r:   bad PQ & D[0] */
		{2, 3, 4},	/* rec_pq:  bad R & D[0][1] */
		{1, 3, 4},	/* rec_pr:  bad Q & D[0][1] */
		{0, 3, 4},	/* rec_qr:  bad P & D[0][1] */
		{3, 4, 5}	/* rec_pqr: bad D[0][1][2] */
	};

	memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));

	if (fn < RAIDZ_REC_PQ) {
		/* can reconstruct 1 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;

			/* Check if should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			LOG(D_DEBUG, "[%d] ", x0);

			tgtidx[2] = x0 + raidz_parity(rm);

			corrupt_columns(rm, tgtidx+2, 1);

			if (!opts->rto_sanity)
				vdev_raidz_reconstruct(rm, tgtidx, 3);

			if (cmp_data(opts, rm) != 0) {
				err++;
				LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
			}
		}

	} else if (fn < RAIDZ_REC_PQR) {
		/* can reconstruct 2 failed data disks */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;

				/* Check if should stop */
				if (rto_opts.rto_should_stop)
					return (err);

				LOG(D_DEBUG, "[%d %d] ", x0, x1);

				tgtidx[1] = x0 + raidz_parity(rm);
				tgtidx[2] = x1 + raidz_parity(rm);

				corrupt_columns(rm, tgtidx+1, 2);

				if (!opts->rto_sanity)
					vdev_raidz_reconstruct(rm, tgtidx, 3);

				if (cmp_data(opts, rm) != 0) {
					err++;
					LOG(D_DEBUG, "\nREC D[%d %d]... "
					    "[FAIL]\n", x0, x1);
				}
			}
		}
	} else {
		/* can reconstruct 3 failed data disks */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_row[0]->rr_cols -
				    raidz_parity(rm))
					continue;
				for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
					if (x2 >= rm->rm_row[0]->rr_cols -
					    raidz_parity(rm))
						continue;

					/* Check if should stop */
					if (rto_opts.rto_should_stop)
						return (err);

					LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);

					tgtidx[0] = x0 + raidz_parity(rm);
					tgtidx[1] = x1 + raidz_parity(rm);
					tgtidx[2] = x2 + raidz_parity(rm);

					corrupt_columns(rm, tgtidx, 3);

					if (!opts->rto_sanity)
						vdev_raidz_reconstruct(rm,
						    tgtidx, 3);

					if (cmp_data(opts, rm) != 0) {
						err++;
						LOG(D_DEBUG,
						    "\nREC D[%d %d %d]... "
						    "[FAIL]\n", x0, x1, x2);
					}
				}
			}
		}
	}
	return (err);
}

static int
run_rec_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	unsigned fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing data reconstruction...\n");

	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (vdev_raidz_impl_set(*impl_name) != 0) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else
			LOG(D_INFO, "[SUPPORTED]\n");

		/* create suitable raidz_map */
		rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
		/* generate parity */
		vdev_raidz_generate_parity(rm_test);

		for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_rec_name[fn]);

			if (run_rec_check_impl(opts, rm_test, fn) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;
			} else
				LOG(D_INFO, "[PASS]\n");
		}
		/* tear down test raidz_map */
		fini_raidz_map(&zio_test, &rm_test);
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}

static int
run_test(raidz_test_opts_t *opts)
{
	int err = 0;

	if (opts == NULL)
		opts = &rto_opts;

	print_opts(opts, B_FALSE);

	err |= run_gen_check(opts);
	err |= run_rec_check(opts);

	return (err);
}

#define	SWEEP_RUNNING	0
#define	SWEEP_FINISHED	1
#define	SWEEP_ERROR	2
#define	SWEEP_TIMEOUT	3

static int sweep_state = 0;
static raidz_test_opts_t failed_opts;

static kmutex_t sem_mtx;
static kcondvar_t sem_cv;
static int max_free_slots;
static int free_slots;

static void
sweep_thread(void *arg)
{
	int err = 0;
	raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
	VERIFY(opts != NULL);

	err = run_test(opts);

	if (rto_opts.rto_sanity) {
		/* 25% chance that a sweep test fails */
		if (rand() < (RAND_MAX/4))
			err = 1;
	}

	if (0 != err) {
		mutex_enter(&sem_mtx);
		memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
		sweep_state = SWEEP_ERROR;
		mutex_exit(&sem_mtx);
	}

	umem_free(opts, sizeof (raidz_test_opts_t));

	/* signal the next thread */
	mutex_enter(&sem_mtx);
	free_slots++;
	cv_signal(&sem_cv);
	mutex_exit(&sem_mtx);

	thread_exit();
}

static int
run_sweep(void)
{
	static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
	static const size_t ashift_v[] = { 9, 12, 14 };
	static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
	    1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };

	(void) setvbuf(stdout, NULL, _IONBF, 0);

	ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
	    ARRAY_SIZE(dcols_v);
	ulong_t tried_comb = 0;
	hrtime_t time_diff, start_time = gethrtime();
	raidz_test_opts_t *opts;
	int a, d, s;

	max_free_slots = free_slots = MAX(2, boot_ncpus);

	mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);

	for (s = 0; s < ARRAY_SIZE(size_v); s++)
	for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
	for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {

		if (size_v[s] < (1 << ashift_v[a])) {
			total_comb--;
			continue;
		}

		if (++tried_comb % 20 == 0)
			LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);

		/* wait for signal to start new thread */
		mutex_enter(&sem_mtx);
		while (cv_timedwait_sig(&sem_cv, &sem_mtx,
		    ddi_get_lbolt() + hz)) {

			/* check if should stop the test (timeout) */
			time_diff = (gethrtime() - start_time) / NANOSEC;
			if (rto_opts.rto_sweep_timeout > 0 &&
			    time_diff >= rto_opts.rto_sweep_timeout) {
				sweep_state = SWEEP_TIMEOUT;
				rto_opts.rto_should_stop = B_TRUE;
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* check if should stop the test (error) */
			if (sweep_state != SWEEP_RUNNING) {
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* exit loop if a slot is available */
			if (free_slots > 0) {
				break;
			}
		}

		free_slots--;
		mutex_exit(&sem_mtx);

		opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
		opts->rto_ashift = ashift_v[a];
		opts->rto_dcols = dcols_v[d];
		opts->rto_offset = (1 << ashift_v[a]) * rand();
		opts->rto_dsize = size_v[s];
		opts->rto_expand = rto_opts.rto_expand;
		opts->rto_expand_offset = rto_opts.rto_expand_offset;
		opts->rto_v = 0; /* be quiet */

		VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
		    0, NULL, TS_RUN, defclsyspri), !=, NULL);
	}

exit:
	LOG(D_ALL, "\nWaiting for test threads to finish...\n");
	mutex_enter(&sem_mtx);
	VERIFY(free_slots <= max_free_slots);
	while (free_slots < max_free_slots) {
		(void) cv_wait(&sem_cv, &sem_mtx);
	}
	mutex_exit(&sem_mtx);

	if (sweep_state == SWEEP_ERROR) {
		ERR("Sweep test failed! Failed option: \n");
		print_opts(&failed_opts, B_TRUE);
	} else {
		if (sweep_state == SWEEP_TIMEOUT)
			LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
			    (ulong_t)rto_opts.rto_sweep_timeout);

		LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
		    (ulong_t)tried_comb);
	}

	mutex_destroy(&sem_mtx);

	return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
}


int
main(int argc, char **argv)
{
	size_t i;
	struct sigaction action;
	int err = 0;

	/* init gdb string early */
	(void) sprintf(gdb, gdb_tmpl, getpid());

	action.sa_handler = sig_handler;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;

	if (sigaction(SIGSEGV, &action, NULL) < 0) {
		ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno));
		exit(EXIT_FAILURE);
	}

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);

	process_options(argc, argv);

	kernel_init(SPA_MODE_READ);

	/* setup random data because rand() is not reentrant */
	rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
	srand((unsigned)time(NULL) * getpid());
	for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
		rand_data[i] = rand();

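	/* Make the shared buffer read-only so stray writes fault at once. */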
	mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);

	if (rto_opts.rto_benchmark) {
		run_raidz_benchmark();
	} else if (rto_opts.rto_sweep) {
		err = run_sweep();
	} else {
		err = run_test(NULL);
	}

	umem_free(rand_data, SPA_MAXBLOCKSIZE);
	kernel_fini();

	return (err);
}