1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.
27 * Copyright (c) 2014 Integros [integros.com]
28 * Copyright 2016 Toomas Soome <tsoome@me.com>
29 * Copyright 2017 Joyent, Inc.
30 * Copyright (c) 2017, Intel Corporation.
31 * Copyright (c) 2019, Datto Inc. All rights reserved.
32 * Copyright (c) 2021, Klara Inc.
33 * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
34 */
35
36 #include <sys/zfs_context.h>
37 #include <sys/fm/fs/zfs.h>
38 #include <sys/spa.h>
39 #include <sys/spa_impl.h>
40 #include <sys/bpobj.h>
41 #include <sys/dmu.h>
42 #include <sys/dmu_tx.h>
43 #include <sys/dsl_dir.h>
44 #include <sys/vdev_impl.h>
45 #include <sys/vdev_rebuild.h>
46 #include <sys/vdev_draid.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/metaslab.h>
49 #include <sys/metaslab_impl.h>
50 #include <sys/space_map.h>
51 #include <sys/space_reftree.h>
52 #include <sys/zio.h>
53 #include <sys/zap.h>
54 #include <sys/fs/zfs.h>
55 #include <sys/arc.h>
56 #include <sys/zil.h>
57 #include <sys/dsl_scan.h>
58 #include <sys/vdev_raidz.h>
59 #include <sys/abd.h>
60 #include <sys/vdev_initialize.h>
61 #include <sys/vdev_trim.h>
62 #include <sys/vdev_raidz.h>
63 #include <sys/zvol.h>
64 #include <sys/zfs_ratelimit.h>
65 #include "zfs_prop.h"
66
/*
 * One metaslab from each (normal-class) vdev is used by the ZIL. These are
 * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
 * part of the spa_embedded_log_class. The metaslab with the most free space
 * in each vdev is selected for this purpose when the pool is opened (or a
 * vdev is added). See vdev_metaslab_init().
 *
 * Log blocks can be allocated from the following locations. Each one is tried
 * in order until the allocation succeeds:
 * 1. dedicated log vdevs, aka "slog" (spa_log_class)
 * 2. embedded slog metaslabs (spa_embedded_log_class)
 * 3. other metaslabs in normal vdevs (spa_normal_class)
 *
 * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
 * than this number of metaslabs in the vdev. This ensures that we don't set
 * aside an unreasonable amount of space for the ZIL. If set to less than
 * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
 * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
 */
static uint_t zfs_embedded_slog_min_ms = 64;

/* default target for number of metaslabs per top-level vdev */
static uint_t zfs_vdev_default_ms_count = 200;

/* minimum number of metaslabs per top-level vdev */
static uint_t zfs_vdev_min_ms_count = 16;

/* practical upper limit of total metaslabs per top-level vdev */
static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;

/* lower limit for metaslab size (512M) */
static uint_t zfs_vdev_default_ms_shift = 29;

/* upper limit for metaslab size (16G) */
static uint_t zfs_vdev_max_ms_shift = 34;

/*
 * NOTE(review): presumably a debugging switch that lets vdev validation be
 * skipped when set; it is not consumed in this part of the file -- confirm
 * against its users before relying on this description.
 */
int vdev_validate_skip = B_FALSE;

/*
 * Since the DTL space map of a vdev is not expected to have a lot of
 * entries, we default its block size to 4K.
 */
int zfs_vdev_dtl_sm_blksz = (1 << 12);

/*
 * Rate limit slow IO (delay) events to this many per second.
 */
static unsigned int zfs_slow_io_events_per_second = 20;

/*
 * Rate limit deadman "hung IO" events to this many per second.
 */
static unsigned int zfs_deadman_events_per_second = 1;

/*
 * Rate limit direct write IO verify failures to this many per second.
 */
static unsigned int zfs_dio_write_verify_events_per_second = 20;

/*
 * Rate limit checksum events after this many checksum errors per second.
 */
static unsigned int zfs_checksum_events_per_second = 20;

/*
 * Ignore errors during scrub/resilver. This makes it possible to work
 * around a resilver upon import when there are pool errors.
 */
static int zfs_scan_ignore_errors = 0;

/*
 * vdev-wide space maps that have lots of entries written to them at
 * the end of each transaction can benefit from a higher I/O bandwidth
 * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
 */
int zfs_vdev_standard_sm_blksz = (1 << 17);

/*
 * Tunable parameter for debugging or performance analysis. Setting this
 * will cause pool corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 */
int zfs_nocacheflush = 0;

/*
 * Maximum and minimum ashift values that can be automatically set based on
 * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX
 * is higher than the maximum value, it is intentionally limited here to not
 * excessively impact pool space efficiency. Higher ashift values may still
 * be forced by vdev logical ashift or by user via ashift property, but won't
 * be set automatically as a performance optimization.
 */
uint_t zfs_vdev_max_auto_ashift = 14;
uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;

/*
 * VDEV checksum verification for Direct I/O writes. This is necessary for
 * Linux, because anonymous pages can not be placed under write protection
 * during Direct I/O writes.
 *
 * The default is on (1) everywhere except FreeBSD, where it is off (0).
 */
#if !defined(__FreeBSD__)
uint_t zfs_vdev_direct_write_verify = 1;
#else
uint_t zfs_vdev_direct_write_verify = 0;
#endif
172
173 void
vdev_dbgmsg(vdev_t * vd,const char * fmt,...)174 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
175 {
176 va_list adx;
177 char buf[256];
178
179 va_start(adx, fmt);
180 (void) vsnprintf(buf, sizeof (buf), fmt, adx);
181 va_end(adx);
182
183 if (vd->vdev_path != NULL) {
184 zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
185 vd->vdev_path, buf);
186 } else {
187 zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
188 vd->vdev_ops->vdev_op_type,
189 (u_longlong_t)vd->vdev_id,
190 (u_longlong_t)vd->vdev_guid, buf);
191 }
192 }
193
194 void
vdev_dbgmsg_print_tree(vdev_t * vd,int indent)195 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
196 {
197 char state[20];
198
199 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
200 zfs_dbgmsg("%*svdev %llu: %s", indent, "",
201 (u_longlong_t)vd->vdev_id,
202 vd->vdev_ops->vdev_op_type);
203 return;
204 }
205
206 switch (vd->vdev_state) {
207 case VDEV_STATE_UNKNOWN:
208 (void) snprintf(state, sizeof (state), "unknown");
209 break;
210 case VDEV_STATE_CLOSED:
211 (void) snprintf(state, sizeof (state), "closed");
212 break;
213 case VDEV_STATE_OFFLINE:
214 (void) snprintf(state, sizeof (state), "offline");
215 break;
216 case VDEV_STATE_REMOVED:
217 (void) snprintf(state, sizeof (state), "removed");
218 break;
219 case VDEV_STATE_CANT_OPEN:
220 (void) snprintf(state, sizeof (state), "can't open");
221 break;
222 case VDEV_STATE_FAULTED:
223 (void) snprintf(state, sizeof (state), "faulted");
224 break;
225 case VDEV_STATE_DEGRADED:
226 (void) snprintf(state, sizeof (state), "degraded");
227 break;
228 case VDEV_STATE_HEALTHY:
229 (void) snprintf(state, sizeof (state), "healthy");
230 break;
231 default:
232 (void) snprintf(state, sizeof (state), "<state %u>",
233 (uint_t)vd->vdev_state);
234 }
235
236 zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
237 "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
238 vd->vdev_islog ? " (log)" : "",
239 (u_longlong_t)vd->vdev_guid,
240 vd->vdev_path ? vd->vdev_path : "N/A", state);
241
242 for (uint64_t i = 0; i < vd->vdev_children; i++)
243 vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
244 }
245
246 /*
247 * Virtual device management.
248 */
249
/*
 * Table of every vdev ops vector known to this file. The array is
 * NULL-terminated; vdev_getops() walks it until it reaches the terminator,
 * comparing each entry's vdev_op_type against the requested type name.
 */
static vdev_ops_t *const vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_draid_ops,
	&vdev_draid_spare_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	&vdev_indirect_ops,
	NULL	/* terminator */
};
265
266 /*
267 * Given a vdev type, return the appropriate ops vector.
268 */
269 static vdev_ops_t *
vdev_getops(const char * type)270 vdev_getops(const char *type)
271 {
272 vdev_ops_t *ops, *const *opspp;
273
274 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
275 if (strcmp(ops->vdev_op_type, type) == 0)
276 break;
277
278 return (ops);
279 }
280
281 /*
282 * Given a vdev and a metaslab class, find which metaslab group we're
283 * interested in. All vdevs may belong to two different metaslab classes.
284 * Dedicated slog devices use only the primary metaslab group, rather than a
285 * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
286 */
287 metaslab_group_t *
vdev_get_mg(vdev_t * vd,metaslab_class_t * mc)288 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
289 {
290 if (mc == spa_embedded_log_class(vd->vdev_spa) &&
291 vd->vdev_log_mg != NULL)
292 return (vd->vdev_log_mg);
293 else
294 return (vd->vdev_mg);
295 }
296
297 void
vdev_default_xlate(vdev_t * vd,const zfs_range_seg64_t * logical_rs,zfs_range_seg64_t * physical_rs,zfs_range_seg64_t * remain_rs)298 vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
299 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
300 {
301 (void) vd, (void) remain_rs;
302
303 physical_rs->rs_start = logical_rs->rs_start;
304 physical_rs->rs_end = logical_rs->rs_end;
305 }
306
307 /*
308 * Derive the enumerated allocation bias from string input.
309 * String origin is either the per-vdev zap or zpool(8).
310 */
311 static vdev_alloc_bias_t
vdev_derive_alloc_bias(const char * bias)312 vdev_derive_alloc_bias(const char *bias)
313 {
314 vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
315
316 if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
317 alloc_bias = VDEV_BIAS_LOG;
318 else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
319 alloc_bias = VDEV_BIAS_SPECIAL;
320 else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
321 alloc_bias = VDEV_BIAS_DEDUP;
322
323 return (alloc_bias);
324 }
325
326 uint64_t
vdev_default_psize(vdev_t * vd,uint64_t asize,uint64_t txg)327 vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
328 {
329 ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift));
330 uint64_t csize, psize = asize;
331 for (int c = 0; c < vd->vdev_children; c++) {
332 csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg);
333 psize = MIN(psize, csize);
334 }
335
336 return (psize);
337 }
338
339 /*
340 * Default asize function: return the MAX of psize with the asize of
341 * all children. This is what's used by anything other than RAID-Z.
342 */
343 uint64_t
vdev_default_asize(vdev_t * vd,uint64_t psize,uint64_t txg)344 vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
345 {
346 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
347 uint64_t csize;
348
349 for (int c = 0; c < vd->vdev_children; c++) {
350 csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
351 asize = MAX(asize, csize);
352 }
353
354 return (asize);
355 }
356
357 uint64_t
vdev_default_min_asize(vdev_t * vd)358 vdev_default_min_asize(vdev_t *vd)
359 {
360 return (vd->vdev_min_asize);
361 }
362
363 /*
364 * Get the minimum allocatable size. We define the allocatable size as
365 * the vdev's asize rounded to the nearest metaslab. This allows us to
366 * replace or attach devices which don't have the same physical size but
367 * can still satisfy the same number of allocations.
368 */
369 uint64_t
vdev_get_min_asize(vdev_t * vd)370 vdev_get_min_asize(vdev_t *vd)
371 {
372 vdev_t *pvd = vd->vdev_parent;
373
374 /*
375 * If our parent is NULL (inactive spare or cache) or is the root,
376 * just return our own asize.
377 */
378 if (pvd == NULL)
379 return (vd->vdev_asize);
380
381 /*
382 * The top-level vdev just returns the allocatable size rounded
383 * to the nearest metaslab.
384 */
385 if (vd == vd->vdev_top)
386 return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
387 uint64_t));
388
389 return (pvd->vdev_ops->vdev_op_min_asize(pvd));
390 }
391
392 void
vdev_set_min_asize(vdev_t * vd)393 vdev_set_min_asize(vdev_t *vd)
394 {
395 vd->vdev_min_asize = vdev_get_min_asize(vd);
396
397 for (int c = 0; c < vd->vdev_children; c++)
398 vdev_set_min_asize(vd->vdev_child[c]);
399 }
400
401 /*
402 * Get the minimal allocation size for the top-level vdev.
403 */
404 uint64_t
vdev_get_min_alloc(vdev_t * vd)405 vdev_get_min_alloc(vdev_t *vd)
406 {
407 uint64_t min_alloc = 1ULL << vd->vdev_ashift;
408
409 if (vd->vdev_ops->vdev_op_min_alloc != NULL)
410 min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
411
412 return (min_alloc);
413 }
414
415 /*
416 * Get the parity level for a top-level vdev.
417 */
418 uint64_t
vdev_get_nparity(vdev_t * vd)419 vdev_get_nparity(vdev_t *vd)
420 {
421 uint64_t nparity = 0;
422
423 if (vd->vdev_ops->vdev_op_nparity != NULL)
424 nparity = vd->vdev_ops->vdev_op_nparity(vd);
425
426 return (nparity);
427 }
428
429 static int
vdev_prop_get_int(vdev_t * vd,vdev_prop_t prop,uint64_t * value)430 vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
431 {
432 spa_t *spa = vd->vdev_spa;
433 objset_t *mos = spa->spa_meta_objset;
434 uint64_t objid;
435 int err;
436
437 if (vd->vdev_root_zap != 0) {
438 objid = vd->vdev_root_zap;
439 } else if (vd->vdev_top_zap != 0) {
440 objid = vd->vdev_top_zap;
441 } else if (vd->vdev_leaf_zap != 0) {
442 objid = vd->vdev_leaf_zap;
443 } else {
444 return (EINVAL);
445 }
446
447 err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
448 sizeof (uint64_t), 1, value);
449
450 if (err == ENOENT)
451 *value = vdev_prop_default_numeric(prop);
452
453 return (err);
454 }
455
456 /*
457 * Get the number of data disks for a top-level vdev.
458 */
459 uint64_t
vdev_get_ndisks(vdev_t * vd)460 vdev_get_ndisks(vdev_t *vd)
461 {
462 uint64_t ndisks = 1;
463
464 if (vd->vdev_ops->vdev_op_ndisks != NULL)
465 ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
466
467 return (ndisks);
468 }
469
470 vdev_t *
vdev_lookup_top(spa_t * spa,uint64_t vdev)471 vdev_lookup_top(spa_t *spa, uint64_t vdev)
472 {
473 vdev_t *rvd = spa->spa_root_vdev;
474
475 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
476
477 if (vdev < rvd->vdev_children) {
478 ASSERT(rvd->vdev_child[vdev] != NULL);
479 return (rvd->vdev_child[vdev]);
480 }
481
482 return (NULL);
483 }
484
485 vdev_t *
vdev_lookup_by_guid(vdev_t * vd,uint64_t guid)486 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
487 {
488 vdev_t *mvd;
489
490 if (vd->vdev_guid == guid)
491 return (vd);
492
493 for (int c = 0; c < vd->vdev_children; c++)
494 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
495 NULL)
496 return (mvd);
497
498 return (NULL);
499 }
500
501 static int
vdev_count_leaves_impl(vdev_t * vd)502 vdev_count_leaves_impl(vdev_t *vd)
503 {
504 int n = 0;
505
506 if (vd->vdev_ops->vdev_op_leaf)
507 return (1);
508
509 for (int c = 0; c < vd->vdev_children; c++)
510 n += vdev_count_leaves_impl(vd->vdev_child[c]);
511
512 return (n);
513 }
514
515 int
vdev_count_leaves(spa_t * spa)516 vdev_count_leaves(spa_t *spa)
517 {
518 int rc;
519
520 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
521 rc = vdev_count_leaves_impl(spa->spa_root_vdev);
522 spa_config_exit(spa, SCL_VDEV, FTAG);
523
524 return (rc);
525 }
526
527 void
vdev_add_child(vdev_t * pvd,vdev_t * cvd)528 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
529 {
530 size_t oldsize, newsize;
531 uint64_t id = cvd->vdev_id;
532 vdev_t **newchild;
533
534 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
535 ASSERT(cvd->vdev_parent == NULL);
536
537 cvd->vdev_parent = pvd;
538
539 if (pvd == NULL)
540 return;
541
542 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
543
544 oldsize = pvd->vdev_children * sizeof (vdev_t *);
545 pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
546 newsize = pvd->vdev_children * sizeof (vdev_t *);
547
548 newchild = kmem_alloc(newsize, KM_SLEEP);
549 if (pvd->vdev_child != NULL) {
550 memcpy(newchild, pvd->vdev_child, oldsize);
551 kmem_free(pvd->vdev_child, oldsize);
552 }
553
554 pvd->vdev_child = newchild;
555 pvd->vdev_child[id] = cvd;
556
557 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
558 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
559
560 /*
561 * Walk up all ancestors to update guid sum.
562 */
563 for (; pvd != NULL; pvd = pvd->vdev_parent)
564 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
565
566 if (cvd->vdev_ops->vdev_op_leaf) {
567 list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
568 cvd->vdev_spa->spa_leaf_list_gen++;
569 }
570 }
571
572 void
vdev_remove_child(vdev_t * pvd,vdev_t * cvd)573 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
574 {
575 int c;
576 uint_t id = cvd->vdev_id;
577
578 ASSERT(cvd->vdev_parent == pvd);
579
580 if (pvd == NULL)
581 return;
582
583 ASSERT(id < pvd->vdev_children);
584 ASSERT(pvd->vdev_child[id] == cvd);
585
586 pvd->vdev_child[id] = NULL;
587 cvd->vdev_parent = NULL;
588
589 for (c = 0; c < pvd->vdev_children; c++)
590 if (pvd->vdev_child[c])
591 break;
592
593 if (c == pvd->vdev_children) {
594 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
595 pvd->vdev_child = NULL;
596 pvd->vdev_children = 0;
597 }
598
599 if (cvd->vdev_ops->vdev_op_leaf) {
600 spa_t *spa = cvd->vdev_spa;
601 list_remove(&spa->spa_leaf_list, cvd);
602 spa->spa_leaf_list_gen++;
603 }
604
605 /*
606 * Walk up all ancestors to update guid sum.
607 */
608 for (; pvd != NULL; pvd = pvd->vdev_parent)
609 pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
610 }
611
612 /*
613 * Remove any holes in the child array.
614 */
615 void
vdev_compact_children(vdev_t * pvd)616 vdev_compact_children(vdev_t *pvd)
617 {
618 vdev_t **newchild, *cvd;
619 int oldc = pvd->vdev_children;
620 int newc;
621
622 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
623
624 if (oldc == 0)
625 return;
626
627 for (int c = newc = 0; c < oldc; c++)
628 if (pvd->vdev_child[c])
629 newc++;
630
631 if (newc > 0) {
632 newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);
633
634 for (int c = newc = 0; c < oldc; c++) {
635 if ((cvd = pvd->vdev_child[c]) != NULL) {
636 newchild[newc] = cvd;
637 cvd->vdev_id = newc++;
638 }
639 }
640 } else {
641 newchild = NULL;
642 }
643
644 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
645 pvd->vdev_child = newchild;
646 pvd->vdev_children = newc;
647 }
648
/*
 * Allocate and minimally initialize a vdev_t.
 *
 *	spa	pool the vdev belongs to. If the pool has no root vdev yet,
 *		the new vdev is installed as spa_root_vdev ('ops' is asserted
 *		to be vdev_root_ops) and a load guid is generated.
 *	id	value stored in vdev_id.
 *	guid	guid for the vdev. Passing 0 requests a freshly generated
 *		guid, except for hole vdevs, which keep guid 0.
 *	ops	ops vector for the vdev's type.
 *
 * Returns the new vdev, zero-filled apart from the fields set here and in
 * state VDEV_STATE_CLOSED. The vdev is not attached to a parent by this
 * function; see vdev_add_child().
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
	vic = &vd->vdev_indirect_config;

	/* The first vdev allocated for a pool becomes its root. */
	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_load_guid();
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	/* A vdev with no children yet: its guid sum is just its own guid. */
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);
	/* UINT64_MAX is the initial value; vdev_alloc() asserts it. */
	vic->vic_prev_indirect_vdev = UINT64_MAX;

	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
	vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
	    ZFS_RANGE_SEG64, NULL, 0, 0);

	/*
	 * Initialize rate limit structs for events. We rate limit ZIO delay
	 * and checksum events so that we don't overwhelm ZED with thousands
	 * of events when a disk is acting up.
	 */
	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
	    1);
	zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
	    1);
	zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
	    &zfs_dio_write_verify_events_per_second, 1);
	zfs_ratelimit_init(&vd->vdev_checksum_rl,
	    &zfs_checksum_events_per_second, 1);

	/*
	 * Default Thresholds for tuning ZED
	 */
	vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);

	/* List linkage for the various lists this vdev may be placed on. */
	list_link_init(&vd->vdev_config_dirty_node);
	list_link_init(&vd->vdev_state_dirty_node);
	list_link_init(&vd->vdev_initialize_node);
	list_link_init(&vd->vdev_leaf_node);
	list_link_init(&vd->vdev_trim_node);

	/* Note: the DTL lock alone is created with MUTEX_NOLOCKDEP. */
	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Locks and condition variables for initialize. */
	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);

	/* Locks and condition variables for trim and autotrim. */
	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

	/* Lock and condition variable for rebuild. */
	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);

	/* One (initially empty) range tree per DTL type. */
	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
		    NULL, 0, 0);
	}

	txg_list_create(&vd->vdev_ms_list, spa,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list, spa,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);

	return (vd);
}
762
/*
 * Allocate a new vdev. The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 *
 *	spa		pool the vdev belongs to.
 *	vdp		out: receives the new vdev on success; not written on
 *			failure.
 *	nv		config nvlist describing the vdev. It must contain a
 *			ZPOOL_CONFIG_TYPE string naming a known vdev type.
 *	parent		vdev to attach the new vdev to, or NULL. The new vdev
 *			is considered top-level when 'parent' is non-NULL and
 *			has no parent of its own.
 *	id		id for the new vdev; for VDEV_ALLOC_LOAD it must match
 *			ZPOOL_CONFIG_ID in 'nv'.
 *	alloctype	one of the VDEV_ALLOC_* values.
 *
 * Returns 0 on success. Failures seen in this function are EINVAL (missing
 * or inconsistent config, unknown type, non-root vdev allocated before the
 * root), ENOTSUP (pool version or feature does not support the request), or
 * whatever the type's vdev_op_init callback returns. All failure returns
 * happen before the vdev_t is allocated.
 *
 * The caller must hold all config locks as writer (asserted).
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	const char *type;
	uint64_t guid = 0, islog;
	vdev_t *vd;
	vdev_indirect_config_t *vic;
	const char *tmp = NULL;
	int rc;
	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
	boolean_t top_level = (parent && !parent->vdev_parent);

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 *
	 * Spare, l2cache and rootpool allocations also require the guid to
	 * be present in the nvlist; only the load case additionally checks
	 * the id.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	if (top_level && alloctype == VDEV_ALLOC_ADD) {
		const char *bias;

		/*
		 * If creating a top-level vdev, check for allocation
		 * classes input.
		 */
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
		    &bias) == 0) {
			alloc_bias = vdev_derive_alloc_bias(bias);

			/* spa_vdev_add() expects feature to be enabled */
			if (spa->spa_load_state != SPA_LOAD_CREATE &&
			    !spa_feature_is_enabled(spa,
			    SPA_FEATURE_ALLOCATION_CLASSES)) {
				return (SET_ERROR(ENOTSUP));
			}
		}

		/* spa_vdev_add() expects feature to be enabled */
		if (ops == &vdev_draid_ops &&
		    spa->spa_load_state != SPA_LOAD_CREATE &&
		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
			return (SET_ERROR(ENOTSUP));
		}
	}

	/*
	 * Initialize the vdev specific data. This is done before calling
	 * vdev_alloc_common() since it may fail and this simplifies the
	 * error reporting and cleanup code paths.
	 */
	void *tsd = NULL;
	if (ops->vdev_op_init != NULL) {
		rc = ops->vdev_op_init(spa, nv, &tsd);
		if (rc != 0) {
			return (rc);
		}
	}

	/* Nothing below this point returns an error. */
	vd = vdev_alloc_common(spa, id, guid, ops);
	vd->vdev_tsd = tsd;
	vd->vdev_islog = islog;

	if (top_level && alloc_bias != VDEV_BIAS_NONE)
		vd->vdev_alloc_bias = alloc_bias;

	/* String properties are duplicated; 'tmp' points into the nvlist. */
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
		vd->vdev_path = spa_strdup(tmp);

	/*
	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
	 * fault on a vdev and want it to persist across imports (like with
	 * zpool offline -f).
	 */
	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
		vd->vdev_faulted = 1;
		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
	}

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
		vd->vdev_devid = spa_strdup(tmp);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
		vd->vdev_physpath = spa_strdup(tmp);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
	    &tmp) == 0)
		vd->vdev_enc_sysfs_path = spa_strdup(tmp);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
		vd->vdev_fru = spa_strdup(tmp);

	/*
	 * Set the whole_disk property. If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	vic = &vd->vdev_indirect_config;

	/*
	 * Indirect vdev configuration. The assertions check the values left
	 * by vdev_alloc_common(); each lookup overwrites its field only when
	 * the key is present.
	 */
	ASSERT0(vic->vic_mapping_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
	    &vic->vic_mapping_object);
	ASSERT0(vic->vic_births_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
	    &vic->vic_births_object);
	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
	    &vic->vic_prev_indirect_vdev);

	/*
	 * Look for the 'not present' flag. This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement. Ignore pool ashift for vdev
	 * attach case.
	 */
	if (alloctype != VDEV_ALLOC_ATTACH) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
		    &vd->vdev_ashift);
	} else {
		vd->vdev_attaching = B_TRUE;
	}

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/* The root zap is only read for a root vdev being loaded or split. */
	if (vd->vdev_ops == &vdev_root_ops &&
	    (alloctype == VDEV_ALLOC_LOAD ||
	    alloctype == VDEV_ALLOC_SPLIT ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
		    &vd->vdev_root_zap);
	}

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (top_level &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
		    &vd->vdev_noalloc);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    &vd->vdev_top_zap);
		vd->vdev_rz_expanding = nvlist_exists(nv,
		    ZPOOL_CONFIG_RAIDZ_EXPANDING);
	} else {
		ASSERT0(vd->vdev_top_zap);
	}

	/* Sanity check only: top-level vdevs use a known alloctype. */
	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		/* Note: metaslab_group_create() is now deferred */
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv,
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
	} else {
		ASSERT0(vd->vdev_leaf_zap);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
		    &vd->vdev_resilver_txg);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
		    &vd->vdev_rebuild_txg);

		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
			vdev_defer_resilver(vd);

		/*
		 * In general, when importing a pool we want to ignore the
		 * persistent fault state, as the diagnosis made on another
		 * system may not be valid in the current context. The only
		 * exception is if we forced a vdev to a persistently faulted
		 * state with 'zpool offline -f'. The persistent fault will
		 * remain across imports until cleared.
		 *
		 * Local vdevs will remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				const char *aux;

				/*
				 * Unless the aux state says "external", the
				 * faulted flag read above is dropped again;
				 * the degraded flag is kept either way.
				 */
				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
				else
					vd->vdev_faulted = 0ULL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}
1073
1074 void
vdev_free(vdev_t * vd)1075 vdev_free(vdev_t *vd)
1076 {
1077 spa_t *spa = vd->vdev_spa;
1078
1079 ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
1080 ASSERT3P(vd->vdev_trim_thread, ==, NULL);
1081 ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
1082 ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
1083
1084 /*
1085 * Scan queues are normally destroyed at the end of a scan. If the
1086 * queue exists here, that implies the vdev is being removed while
1087 * the scan is still running.
1088 */
1089 if (vd->vdev_scan_io_queue != NULL) {
1090 mutex_enter(&vd->vdev_scan_io_queue_lock);
1091 dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
1092 vd->vdev_scan_io_queue = NULL;
1093 mutex_exit(&vd->vdev_scan_io_queue_lock);
1094 }
1095
1096 /*
1097 * vdev_free() implies closing the vdev first. This is simpler than
1098 * trying to ensure complicated semantics for all callers.
1099 */
1100 vdev_close(vd);
1101
1102 ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
1103 ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
1104
1105 /*
1106 * Free all children.
1107 */
1108 for (int c = 0; c < vd->vdev_children; c++)
1109 vdev_free(vd->vdev_child[c]);
1110
1111 ASSERT(vd->vdev_child == NULL);
1112 ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
1113
1114 if (vd->vdev_ops->vdev_op_fini != NULL)
1115 vd->vdev_ops->vdev_op_fini(vd);
1116
1117 /*
1118 * Discard allocation state.
1119 */
1120 if (vd->vdev_mg != NULL) {
1121 vdev_metaslab_fini(vd);
1122 metaslab_group_destroy(vd->vdev_mg);
1123 vd->vdev_mg = NULL;
1124 }
1125 if (vd->vdev_log_mg != NULL) {
1126 ASSERT0(vd->vdev_ms_count);
1127 metaslab_group_destroy(vd->vdev_log_mg);
1128 vd->vdev_log_mg = NULL;
1129 }
1130
1131 ASSERT0(vd->vdev_stat.vs_space);
1132 ASSERT0(vd->vdev_stat.vs_dspace);
1133 ASSERT0(vd->vdev_stat.vs_alloc);
1134
1135 /*
1136 * Remove this vdev from its parent's child list.
1137 */
1138 vdev_remove_child(vd->vdev_parent, vd);
1139
1140 ASSERT(vd->vdev_parent == NULL);
1141 ASSERT(!list_link_active(&vd->vdev_leaf_node));
1142
1143 /*
1144 * Clean up vdev structure.
1145 */
1146 vdev_queue_fini(vd);
1147
1148 if (vd->vdev_path)
1149 spa_strfree(vd->vdev_path);
1150 if (vd->vdev_devid)
1151 spa_strfree(vd->vdev_devid);
1152 if (vd->vdev_physpath)
1153 spa_strfree(vd->vdev_physpath);
1154
1155 if (vd->vdev_enc_sysfs_path)
1156 spa_strfree(vd->vdev_enc_sysfs_path);
1157
1158 if (vd->vdev_fru)
1159 spa_strfree(vd->vdev_fru);
1160
1161 if (vd->vdev_isspare)
1162 spa_spare_remove(vd);
1163 if (vd->vdev_isl2cache)
1164 spa_l2cache_remove(vd);
1165
1166 txg_list_destroy(&vd->vdev_ms_list);
1167 txg_list_destroy(&vd->vdev_dtl_list);
1168
1169 mutex_enter(&vd->vdev_dtl_lock);
1170 space_map_close(vd->vdev_dtl_sm);
1171 for (int t = 0; t < DTL_TYPES; t++) {
1172 zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
1173 zfs_range_tree_destroy(vd->vdev_dtl[t]);
1174 }
1175 mutex_exit(&vd->vdev_dtl_lock);
1176
1177 EQUIV(vd->vdev_indirect_births != NULL,
1178 vd->vdev_indirect_mapping != NULL);
1179 if (vd->vdev_indirect_births != NULL) {
1180 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
1181 vdev_indirect_births_close(vd->vdev_indirect_births);
1182 }
1183
1184 if (vd->vdev_obsolete_sm != NULL) {
1185 ASSERT(vd->vdev_removing ||
1186 vd->vdev_ops == &vdev_indirect_ops);
1187 space_map_close(vd->vdev_obsolete_sm);
1188 vd->vdev_obsolete_sm = NULL;
1189 }
1190 zfs_range_tree_destroy(vd->vdev_obsolete_segments);
1191 rw_destroy(&vd->vdev_indirect_rwlock);
1192 mutex_destroy(&vd->vdev_obsolete_lock);
1193
1194 mutex_destroy(&vd->vdev_dtl_lock);
1195 mutex_destroy(&vd->vdev_stat_lock);
1196 mutex_destroy(&vd->vdev_probe_lock);
1197 mutex_destroy(&vd->vdev_scan_io_queue_lock);
1198
1199 mutex_destroy(&vd->vdev_initialize_lock);
1200 mutex_destroy(&vd->vdev_initialize_io_lock);
1201 cv_destroy(&vd->vdev_initialize_io_cv);
1202 cv_destroy(&vd->vdev_initialize_cv);
1203
1204 mutex_destroy(&vd->vdev_trim_lock);
1205 mutex_destroy(&vd->vdev_autotrim_lock);
1206 mutex_destroy(&vd->vdev_trim_io_lock);
1207 cv_destroy(&vd->vdev_trim_cv);
1208 cv_destroy(&vd->vdev_autotrim_cv);
1209 cv_destroy(&vd->vdev_autotrim_kick_cv);
1210 cv_destroy(&vd->vdev_trim_io_cv);
1211
1212 mutex_destroy(&vd->vdev_rebuild_lock);
1213 cv_destroy(&vd->vdev_rebuild_cv);
1214
1215 zfs_ratelimit_fini(&vd->vdev_delay_rl);
1216 zfs_ratelimit_fini(&vd->vdev_deadman_rl);
1217 zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
1218 zfs_ratelimit_fini(&vd->vdev_checksum_rl);
1219
1220 if (vd == spa->spa_root_vdev)
1221 spa->spa_root_vdev = NULL;
1222
1223 kmem_free(vd, sizeof (vdev_t));
1224 }
1225
/*
 * Transfer top-level vdev state from svd to tvd.
 *
 * Every field that is moved is reset on svd afterwards (to 0, NULL,
 * VDEV_BIAS_NONE or -1ULL as appropriate) so that only tvd refers to the
 * transferred state.  tvd must already be its own vdev_top.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	/* Metaslab array object, shift and count, plus the top-level ZAP. */
	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;
	tvd->vdev_top_zap = svd->vdev_top_zap;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;
	svd->vdev_top_zap = 0;

	/*
	 * If tvd already has a metaslab group it must be the very same one
	 * svd has; otherwise take over svd's groups and metaslab array.
	 */
	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	if (tvd->vdev_log_mg)
		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_log_mg = svd->vdev_log_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_log_mg = NULL;
	svd->vdev_ms = NULL;

	/* Re-point the groups' back-pointers at their new owner. */
	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;
	if (tvd->vdev_log_mg != NULL)
		tvd->vdev_log_mg->mg_vd = tvd;

	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
	svd->vdev_checkpoint_sm = NULL;

	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
	svd->vdev_alloc_bias = VDEV_BIAS_NONE;

	/* Space accounting. */
	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	/*
	 * State which may be set on a top-level vdev that's in the
	 * process of being removed.  The target must not carry any such
	 * state of its own before it is overwritten.
	 */
	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
	ASSERT0(tvd->vdev_noalloc);
	ASSERT0(tvd->vdev_removing);
	ASSERT0(tvd->vdev_rebuilding);
	tvd->vdev_noalloc = svd->vdev_noalloc;
	tvd->vdev_removing = svd->vdev_removing;
	tvd->vdev_rebuilding = svd->vdev_rebuilding;
	tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
	tvd->vdev_indirect_config = svd->vdev_indirect_config;
	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
	tvd->vdev_indirect_births = svd->vdev_indirect_births;
	/* The obsolete-segment trees are exchanged, not copied. */
	zfs_range_tree_swap(&svd->vdev_obsolete_segments,
	    &tvd->vdev_obsolete_segments);
	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
	svd->vdev_indirect_config.vic_mapping_object = 0;
	svd->vdev_indirect_config.vic_births_object = 0;
	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
	svd->vdev_indirect_mapping = NULL;
	svd->vdev_indirect_births = NULL;
	svd->vdev_obsolete_sm = NULL;
	svd->vdev_noalloc = 0;
	svd->vdev_removing = 0;
	svd->vdev_rebuilding = 0;

	/*
	 * For every txg slot, move the queued metaslabs and DTL vdevs from
	 * svd's lists to tvd's, and if svd itself is on the spa's vdev txg
	 * list replace it there with tvd.
	 */
	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	/* Carry over config-dirty status. */
	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	/* Carry over state-dirty status. */
	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;

	/* Finally hand over the scan I/O queue. */
	dsl_scan_io_queue_vdev_xfer(svd, tvd);
}
1340
1341 static void
vdev_top_update(vdev_t * tvd,vdev_t * vd)1342 vdev_top_update(vdev_t *tvd, vdev_t *vd)
1343 {
1344 if (vd == NULL)
1345 return;
1346
1347 vd->vdev_top = tvd;
1348
1349 for (int c = 0; c < vd->vdev_children; c++)
1350 vdev_top_update(tvd, vd->vdev_child[c]);
1351 }
1352
1353 /*
1354 * Add a mirror/replacing vdev above an existing vdev. There is no need to
1355 * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
1356 */
1357 vdev_t *
vdev_add_parent(vdev_t * cvd,vdev_ops_t * ops)1358 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
1359 {
1360 spa_t *spa = cvd->vdev_spa;
1361 vdev_t *pvd = cvd->vdev_parent;
1362 vdev_t *mvd;
1363
1364 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1365
1366 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
1367
1368 mvd->vdev_asize = cvd->vdev_asize;
1369 mvd->vdev_min_asize = cvd->vdev_min_asize;
1370 mvd->vdev_max_asize = cvd->vdev_max_asize;
1371 mvd->vdev_psize = cvd->vdev_psize;
1372 mvd->vdev_ashift = cvd->vdev_ashift;
1373 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
1374 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
1375 mvd->vdev_state = cvd->vdev_state;
1376 mvd->vdev_crtxg = cvd->vdev_crtxg;
1377
1378 vdev_remove_child(pvd, cvd);
1379 vdev_add_child(pvd, mvd);
1380 cvd->vdev_id = mvd->vdev_children;
1381 vdev_add_child(mvd, cvd);
1382 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
1383
1384 if (mvd == mvd->vdev_top)
1385 vdev_top_transfer(cvd, mvd);
1386
1387 return (mvd);
1388 }
1389
1390 /*
1391 * Remove a 1-way mirror/replacing vdev from the tree.
1392 */
1393 void
vdev_remove_parent(vdev_t * cvd)1394 vdev_remove_parent(vdev_t *cvd)
1395 {
1396 vdev_t *mvd = cvd->vdev_parent;
1397 vdev_t *pvd = mvd->vdev_parent;
1398
1399 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1400
1401 ASSERT(mvd->vdev_children == 1);
1402 ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
1403 mvd->vdev_ops == &vdev_replacing_ops ||
1404 mvd->vdev_ops == &vdev_spare_ops);
1405 cvd->vdev_ashift = mvd->vdev_ashift;
1406 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
1407 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
1408 vdev_remove_child(mvd, cvd);
1409 vdev_remove_child(pvd, mvd);
1410
1411 /*
1412 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
1413 * Otherwise, we could have detached an offline device, and when we
1414 * go to import the pool we'll think we have two top-level vdevs,
1415 * instead of a different version of the same top-level vdev.
1416 */
1417 if (mvd->vdev_top == mvd) {
1418 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
1419 cvd->vdev_orig_guid = cvd->vdev_guid;
1420 cvd->vdev_guid += guid_delta;
1421 cvd->vdev_guid_sum += guid_delta;
1422
1423 /*
1424 * If pool not set for autoexpand, we need to also preserve
1425 * mvd's asize to prevent automatic expansion of cvd.
1426 * Otherwise if we are adjusting the mirror by attaching and
1427 * detaching children of non-uniform sizes, the mirror could
1428 * autoexpand, unexpectedly requiring larger devices to
1429 * re-establish the mirror.
1430 */
1431 if (!cvd->vdev_spa->spa_autoexpand)
1432 cvd->vdev_asize = mvd->vdev_asize;
1433 }
1434 cvd->vdev_id = mvd->vdev_id;
1435 vdev_add_child(pvd, cvd);
1436 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
1437
1438 if (cvd == cvd->vdev_top)
1439 vdev_top_transfer(mvd, cvd);
1440
1441 ASSERT(mvd->vdev_children == 0);
1442 vdev_free(mvd);
1443 }
1444
/*
 * Greatest common divisor of a and b (Euclid's algorithm), used to
 * maintain spa_gcd_alloc.  Returns the other operand when one of them is
 * zero, and 0 when both are zero.
 */
static uint64_t
vdev_gcd(uint64_t a, uint64_t b)
{
	for (;;) {
		if (b == 0)
			return (a);

		/* (a, b) -> (b, a mod b) */
		uint64_t rem = a % b;
		a = b;
		b = rem;
	}
}
1458
1459 /*
1460 * Set spa_min_alloc and spa_gcd_alloc.
1461 */
1462 static void
vdev_spa_set_alloc(spa_t * spa,uint64_t min_alloc)1463 vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
1464 {
1465 if (min_alloc < spa->spa_min_alloc)
1466 spa->spa_min_alloc = min_alloc;
1467 if (spa->spa_gcd_alloc == INT_MAX) {
1468 spa->spa_gcd_alloc = min_alloc;
1469 } else {
1470 spa->spa_gcd_alloc = vdev_gcd(min_alloc,
1471 spa->spa_gcd_alloc);
1472 }
1473 }
1474
1475 void
vdev_metaslab_group_create(vdev_t * vd)1476 vdev_metaslab_group_create(vdev_t *vd)
1477 {
1478 spa_t *spa = vd->vdev_spa;
1479
1480 /*
1481 * metaslab_group_create was delayed until allocation bias was available
1482 */
1483 if (vd->vdev_mg == NULL) {
1484 metaslab_class_t *mc;
1485
1486 if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
1487 vd->vdev_alloc_bias = VDEV_BIAS_LOG;
1488
1489 ASSERT3U(vd->vdev_islog, ==,
1490 (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
1491
1492 switch (vd->vdev_alloc_bias) {
1493 case VDEV_BIAS_LOG:
1494 mc = spa_log_class(spa);
1495 break;
1496 case VDEV_BIAS_SPECIAL:
1497 mc = spa_special_class(spa);
1498 break;
1499 case VDEV_BIAS_DEDUP:
1500 mc = spa_dedup_class(spa);
1501 break;
1502 default:
1503 mc = spa_normal_class(spa);
1504 }
1505
1506 vd->vdev_mg = metaslab_group_create(mc, vd);
1507
1508 if (!vd->vdev_islog) {
1509 vd->vdev_log_mg = metaslab_group_create(
1510 spa_embedded_log_class(spa), vd);
1511 }
1512
1513 /*
1514 * The spa ashift min/max only apply for the normal metaslab
1515 * class. Class destination is late binding so ashift boundary
1516 * setting had to wait until now.
1517 */
1518 if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
1519 mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
1520 if (vd->vdev_ashift > spa->spa_max_ashift)
1521 spa->spa_max_ashift = vd->vdev_ashift;
1522 if (vd->vdev_ashift < spa->spa_min_ashift)
1523 spa->spa_min_ashift = vd->vdev_ashift;
1524
1525 uint64_t min_alloc = vdev_get_min_alloc(vd);
1526 vdev_spa_set_alloc(spa, min_alloc);
1527 }
1528 }
1529 }
1530
1531 void
vdev_update_nonallocating_space(vdev_t * vd,boolean_t add)1532 vdev_update_nonallocating_space(vdev_t *vd, boolean_t add)
1533 {
1534 spa_t *spa = vd->vdev_spa;
1535
1536 if (vd->vdev_mg->mg_class != spa_normal_class(spa))
1537 return;
1538
1539 uint64_t raw_space = metaslab_group_get_space(vd->vdev_mg);
1540 uint64_t dspace = spa_deflate(spa) ?
1541 vdev_deflated_space(vd, raw_space) : raw_space;
1542 if (add) {
1543 spa->spa_nonallocating_dspace += dspace;
1544 } else {
1545 ASSERT3U(spa->spa_nonallocating_dspace, >=, dspace);
1546 spa->spa_nonallocating_dspace -= dspace;
1547 }
1548 }
1549
1550 int
vdev_metaslab_init(vdev_t * vd,uint64_t txg)1551 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
1552 {
1553 spa_t *spa = vd->vdev_spa;
1554 uint64_t oldc = vd->vdev_ms_count;
1555 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
1556 metaslab_t **mspp;
1557 int error;
1558 boolean_t expanding = (oldc != 0);
1559
1560 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1561
1562 /*
1563 * This vdev is not being allocated from yet or is a hole.
1564 */
1565 if (vd->vdev_ms_shift == 0)
1566 return (0);
1567
1568 ASSERT(!vd->vdev_ishole);
1569
1570 ASSERT(oldc <= newc);
1571
1572 mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
1573
1574 if (expanding) {
1575 memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
1576 vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
1577 }
1578
1579 vd->vdev_ms = mspp;
1580 vd->vdev_ms_count = newc;
1581
1582 for (uint64_t m = oldc; m < newc; m++) {
1583 uint64_t object = 0;
1584 /*
1585 * vdev_ms_array may be 0 if we are creating the "fake"
1586 * metaslabs for an indirect vdev for zdb's leak detection.
1587 * See zdb_leak_init().
1588 */
1589 if (txg == 0 && vd->vdev_ms_array != 0) {
1590 error = dmu_read(spa->spa_meta_objset,
1591 vd->vdev_ms_array,
1592 m * sizeof (uint64_t), sizeof (uint64_t), &object,
1593 DMU_READ_PREFETCH);
1594 if (error != 0) {
1595 vdev_dbgmsg(vd, "unable to read the metaslab "
1596 "array [error=%d]", error);
1597 return (error);
1598 }
1599 }
1600
1601 error = metaslab_init(vd->vdev_mg, m, object, txg,
1602 &(vd->vdev_ms[m]));
1603 if (error != 0) {
1604 vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
1605 error);
1606 return (error);
1607 }
1608 }
1609
1610 /*
1611 * Find the emptiest metaslab on the vdev and mark it for use for
1612 * embedded slog by moving it from the regular to the log metaslab
1613 * group.
1614 */
1615 if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
1616 vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
1617 avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
1618 uint64_t slog_msid = 0;
1619 uint64_t smallest = UINT64_MAX;
1620
1621 /*
1622 * Note, we only search the new metaslabs, because the old
1623 * (pre-existing) ones may be active (e.g. have non-empty
1624 * range_tree's), and we don't move them to the new
1625 * metaslab_t.
1626 */
1627 for (uint64_t m = oldc; m < newc; m++) {
1628 uint64_t alloc =
1629 space_map_allocated(vd->vdev_ms[m]->ms_sm);
1630 if (alloc < smallest) {
1631 slog_msid = m;
1632 smallest = alloc;
1633 }
1634 }
1635 metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
1636 /*
1637 * The metaslab was marked as dirty at the end of
1638 * metaslab_init(). Remove it from the dirty list so that we
1639 * can uninitialize and reinitialize it to the new class.
1640 */
1641 if (txg != 0) {
1642 (void) txg_list_remove_this(&vd->vdev_ms_list,
1643 slog_ms, txg);
1644 }
1645 uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
1646 metaslab_fini(slog_ms);
1647 VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
1648 &vd->vdev_ms[slog_msid]));
1649 }
1650
1651 if (txg == 0)
1652 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
1653
1654 /*
1655 * If the vdev is marked as non-allocating then don't
1656 * activate the metaslabs since we want to ensure that
1657 * no allocations are performed on this device.
1658 */
1659 if (vd->vdev_noalloc) {
1660 /* track non-allocating vdev space */
1661 vdev_update_nonallocating_space(vd, B_TRUE);
1662 } else if (!expanding) {
1663 metaslab_group_activate(vd->vdev_mg);
1664 if (vd->vdev_log_mg != NULL)
1665 metaslab_group_activate(vd->vdev_log_mg);
1666 }
1667
1668 if (txg == 0)
1669 spa_config_exit(spa, SCL_ALLOC, FTAG);
1670
1671 return (0);
1672 }
1673
1674 void
vdev_metaslab_fini(vdev_t * vd)1675 vdev_metaslab_fini(vdev_t *vd)
1676 {
1677 if (vd->vdev_checkpoint_sm != NULL) {
1678 ASSERT(spa_feature_is_active(vd->vdev_spa,
1679 SPA_FEATURE_POOL_CHECKPOINT));
1680 space_map_close(vd->vdev_checkpoint_sm);
1681 /*
1682 * Even though we close the space map, we need to set its
1683 * pointer to NULL. The reason is that vdev_metaslab_fini()
1684 * may be called multiple times for certain operations
1685 * (i.e. when destroying a pool) so we need to ensure that
1686 * this clause never executes twice. This logic is similar
1687 * to the one used for the vdev_ms clause below.
1688 */
1689 vd->vdev_checkpoint_sm = NULL;
1690 }
1691
1692 if (vd->vdev_ms != NULL) {
1693 metaslab_group_t *mg = vd->vdev_mg;
1694
1695 metaslab_group_passivate(mg);
1696 if (vd->vdev_log_mg != NULL) {
1697 ASSERT(!vd->vdev_islog);
1698 metaslab_group_passivate(vd->vdev_log_mg);
1699 }
1700
1701 uint64_t count = vd->vdev_ms_count;
1702 for (uint64_t m = 0; m < count; m++) {
1703 metaslab_t *msp = vd->vdev_ms[m];
1704 if (msp != NULL)
1705 metaslab_fini(msp);
1706 }
1707 vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1708 vd->vdev_ms = NULL;
1709 vd->vdev_ms_count = 0;
1710
1711 for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
1712 ASSERT0(mg->mg_histogram[i]);
1713 if (vd->vdev_log_mg != NULL)
1714 ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
1715 }
1716 }
1717 ASSERT0(vd->vdev_ms_count);
1718 }
1719
/*
 * Bookkeeping shared by all zios belonging to one vdev probe.  It is
 * allocated in vdev_probe(), handed to every probe zio as its private
 * argument, and freed by vdev_probe_done() when the parent null zio
 * completes.
 */
typedef struct vdev_probe_stats {
	boolean_t	vps_readable;	/* a probe read finished without error */
	boolean_t	vps_writeable;	/* a probe write finished without error */
	boolean_t	vps_zio_done_probe; /* vdev_probe() was given a zio */
	int		vps_flags;	/* zio flags used for the probe I/Os */
} vdev_probe_stats_t;
1726
1727 static void
vdev_probe_done(zio_t * zio)1728 vdev_probe_done(zio_t *zio)
1729 {
1730 spa_t *spa = zio->io_spa;
1731 vdev_t *vd = zio->io_vd;
1732 vdev_probe_stats_t *vps = zio->io_private;
1733
1734 ASSERT(vd->vdev_probe_zio != NULL);
1735
1736 if (zio->io_type == ZIO_TYPE_READ) {
1737 if (zio->io_error == 0)
1738 vps->vps_readable = 1;
1739 if (zio->io_error == 0 && spa_writeable(spa)) {
1740 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
1741 zio->io_offset, zio->io_size, zio->io_abd,
1742 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1743 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
1744 } else {
1745 abd_free(zio->io_abd);
1746 }
1747 } else if (zio->io_type == ZIO_TYPE_WRITE) {
1748 if (zio->io_error == 0)
1749 vps->vps_writeable = 1;
1750 abd_free(zio->io_abd);
1751 } else if (zio->io_type == ZIO_TYPE_NULL) {
1752 zio_t *pio;
1753 zio_link_t *zl;
1754
1755 vd->vdev_cant_read |= !vps->vps_readable;
1756 vd->vdev_cant_write |= !vps->vps_writeable;
1757 vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
1758 vd->vdev_cant_read, vd->vdev_cant_write);
1759
1760 if (vdev_readable(vd) &&
1761 (vdev_writeable(vd) || !spa_writeable(spa))) {
1762 zio->io_error = 0;
1763 } else {
1764 ASSERT(zio->io_error != 0);
1765 vdev_dbgmsg(vd, "failed probe");
1766 (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
1767 spa, vd, NULL, NULL, 0);
1768 zio->io_error = SET_ERROR(ENXIO);
1769
1770 /*
1771 * If this probe was initiated from zio pipeline, then
1772 * change the state in a spa_async_request. Probes that
1773 * were initiated from a vdev_open can change the state
1774 * as part of the open call.
1775 */
1776 if (vps->vps_zio_done_probe) {
1777 vd->vdev_fault_wanted = B_TRUE;
1778 spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
1779 }
1780 }
1781
1782 mutex_enter(&vd->vdev_probe_lock);
1783 ASSERT(vd->vdev_probe_zio == zio);
1784 vd->vdev_probe_zio = NULL;
1785 mutex_exit(&vd->vdev_probe_lock);
1786
1787 zl = NULL;
1788 while ((pio = zio_walk_parents(zio, &zl)) != NULL)
1789 if (!vdev_accessible(vd, pio))
1790 pio->io_error = SET_ERROR(ENXIO);
1791
1792 kmem_free(vps, sizeof (*vps));
1793 }
1794 }
1795
1796 /*
1797 * Determine whether this device is accessible.
1798 *
1799 * Read and write to several known locations: the pad regions of each
1800 * vdev label but the first, which we leave alone in case it contains
1801 * a VTOC.
1802 */
1803 zio_t *
vdev_probe(vdev_t * vd,zio_t * zio)1804 vdev_probe(vdev_t *vd, zio_t *zio)
1805 {
1806 spa_t *spa = vd->vdev_spa;
1807 vdev_probe_stats_t *vps = NULL;
1808 zio_t *pio;
1809
1810 ASSERT(vd->vdev_ops->vdev_op_leaf);
1811
1812 /*
1813 * Don't probe the probe.
1814 */
1815 if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
1816 return (NULL);
1817
1818 /*
1819 * To prevent 'probe storms' when a device fails, we create
1820 * just one probe i/o at a time. All zios that want to probe
1821 * this vdev will become parents of the probe io.
1822 */
1823 mutex_enter(&vd->vdev_probe_lock);
1824
1825 if ((pio = vd->vdev_probe_zio) == NULL) {
1826 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
1827
1828 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
1829 ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
1830 vps->vps_zio_done_probe = (zio != NULL);
1831
1832 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
1833 /*
1834 * vdev_cant_read and vdev_cant_write can only
1835 * transition from TRUE to FALSE when we have the
1836 * SCL_ZIO lock as writer; otherwise they can only
1837 * transition from FALSE to TRUE. This ensures that
1838 * any zio looking at these values can assume that
1839 * failures persist for the life of the I/O. That's
1840 * important because when a device has intermittent
1841 * connectivity problems, we want to ensure that
1842 * they're ascribed to the device (ENXIO) and not
1843 * the zio (EIO).
1844 *
1845 * Since we hold SCL_ZIO as writer here, clear both
1846 * values so the probe can reevaluate from first
1847 * principles.
1848 */
1849 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
1850 vd->vdev_cant_read = B_FALSE;
1851 vd->vdev_cant_write = B_FALSE;
1852 }
1853
1854 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
1855 vdev_probe_done, vps,
1856 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
1857 }
1858
1859 if (zio != NULL)
1860 zio_add_child(zio, pio);
1861
1862 mutex_exit(&vd->vdev_probe_lock);
1863
1864 if (vps == NULL) {
1865 ASSERT(zio != NULL);
1866 return (NULL);
1867 }
1868
1869 for (int l = 1; l < VDEV_LABELS; l++) {
1870 zio_nowait(zio_read_phys(pio, vd,
1871 vdev_label_offset(vd->vdev_psize, l,
1872 offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
1873 abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
1874 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1875 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
1876 }
1877
1878 if (zio == NULL)
1879 return (pio);
1880
1881 zio_nowait(pio);
1882 return (NULL);
1883 }
1884
1885 static void
vdev_load_child(void * arg)1886 vdev_load_child(void *arg)
1887 {
1888 vdev_t *vd = arg;
1889
1890 vd->vdev_load_error = vdev_load(vd);
1891 }
1892
1893 static void
vdev_open_child(void * arg)1894 vdev_open_child(void *arg)
1895 {
1896 vdev_t *vd = arg;
1897
1898 vd->vdev_open_thread = curthread;
1899 vd->vdev_open_error = vdev_open(vd);
1900 vd->vdev_open_thread = NULL;
1901 }
1902
1903 static boolean_t
vdev_uses_zvols(vdev_t * vd)1904 vdev_uses_zvols(vdev_t *vd)
1905 {
1906 #ifdef _KERNEL
1907 if (zvol_is_zvol(vd->vdev_path))
1908 return (B_TRUE);
1909 #endif
1910
1911 for (int c = 0; c < vd->vdev_children; c++)
1912 if (vdev_uses_zvols(vd->vdev_child[c]))
1913 return (B_TRUE);
1914
1915 return (B_FALSE);
1916 }
1917
/*
 * Default child filter for vdev_open_children_impl(): returns B_TRUE for
 * every child, i.e. every passed child should be opened.  The argument is
 * intentionally unused.
 */
static boolean_t
vdev_default_open_children_func(vdev_t *vd)
{
	(void) vd;
	return (B_TRUE);
}
1927
1928 /*
1929 * Open the requested child vdevs. If any of the leaf vdevs are using
1930 * a ZFS volume then do the opens in a single thread. This avoids a
1931 * deadlock when the current thread is holding the spa_namespace_lock.
1932 */
1933 static void
vdev_open_children_impl(vdev_t * vd,vdev_open_children_func_t * open_func)1934 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
1935 {
1936 int children = vd->vdev_children;
1937
1938 taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
1939 children, children, TASKQ_PREPOPULATE);
1940 vd->vdev_nonrot = B_TRUE;
1941
1942 for (int c = 0; c < children; c++) {
1943 vdev_t *cvd = vd->vdev_child[c];
1944
1945 if (open_func(cvd) == B_FALSE)
1946 continue;
1947
1948 if (tq == NULL || vdev_uses_zvols(vd)) {
1949 cvd->vdev_open_error = vdev_open(cvd);
1950 } else {
1951 VERIFY(taskq_dispatch(tq, vdev_open_child,
1952 cvd, TQ_SLEEP) != TASKQID_INVALID);
1953 }
1954 }
1955
1956 if (tq != NULL)
1957 taskq_wait(tq);
1958 for (int c = 0; c < children; c++) {
1959 vdev_t *cvd = vd->vdev_child[c];
1960 vd->vdev_nonrot &= cvd->vdev_nonrot;
1961 }
1962
1963 if (tq != NULL)
1964 taskq_destroy(tq);
1965 }
1966
/*
 * Open all child vdevs.
 *
 * Thin wrapper around vdev_open_children_impl() using the default filter,
 * which accepts every child.
 */
void
vdev_open_children(vdev_t *vd)
{
	vdev_open_children_impl(vd, vdev_default_open_children_func);
}
1975
/*
 * Conditionally open a subset of child vdevs.
 *
 * Only the children for which open_func returns B_TRUE are opened; the
 * work itself is done by vdev_open_children_impl().
 */
void
vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
{
	vdev_open_children_impl(vd, open_func);
}
1984
1985 /*
1986 * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17)
1987 * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE
1988 * changed, this algorithm can not change, otherwise it would inconsistently
1989 * account for existing bp's. We also hard-code txg 0 for the same reason
1990 * since expanded RAIDZ vdevs can use a different asize for different birth
1991 * txg's.
1992 */
1993 static void
vdev_set_deflate_ratio(vdev_t * vd)1994 vdev_set_deflate_ratio(vdev_t *vd)
1995 {
1996 if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
1997 vd->vdev_deflate_ratio = (1 << 17) /
1998 (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
1999 SPA_MINBLOCKSHIFT);
2000 }
2001 }
2002
2003 /*
2004 * Choose the best of two ashifts, preferring one between logical ashift
2005 * (absolute minimum) and administrator defined maximum, otherwise take
2006 * the biggest of the two.
2007 */
2008 uint64_t
vdev_best_ashift(uint64_t logical,uint64_t a,uint64_t b)2009 vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
2010 {
2011 if (a > logical && a <= zfs_vdev_max_auto_ashift) {
2012 if (b <= logical || b > zfs_vdev_max_auto_ashift)
2013 return (a);
2014 else
2015 return (MAX(a, b));
2016 } else if (b <= logical || b > zfs_vdev_max_auto_ashift)
2017 return (MAX(a, b));
2018 return (b);
2019 }
2020
2021 /*
2022 * Maximize performance by inflating the configured ashift for top level
2023 * vdevs to be as close to the physical ashift as possible while maintaining
2024 * administrator defined limits and ensuring it doesn't go below the
2025 * logical ashift.
2026 */
2027 static void
vdev_ashift_optimize(vdev_t * vd)2028 vdev_ashift_optimize(vdev_t *vd)
2029 {
2030 ASSERT(vd == vd->vdev_top);
2031
2032 if (vd->vdev_ashift < vd->vdev_physical_ashift &&
2033 vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
2034 vd->vdev_ashift = MIN(
2035 MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
2036 MAX(zfs_vdev_min_auto_ashift,
2037 vd->vdev_physical_ashift));
2038 } else {
2039 /*
2040 * If the logical and physical ashifts are the same, then
2041 * we ensure that the top-level vdev's ashift is not smaller
2042 * than our minimum ashift value. For the unusual case
2043 * where logical ashift > physical ashift, we can't cap
2044 * the calculated ashift based on max ashift as that
2045 * would cause failures.
2046 * We still check if we need to increase it to match
2047 * the min ashift.
2048 */
2049 vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
2050 vd->vdev_ashift);
2051 }
2052 }
2053
2054 /*
2055 * Prepare a virtual device for access.
2056 */
2057 int
vdev_open(vdev_t * vd)2058 vdev_open(vdev_t *vd)
2059 {
2060 spa_t *spa = vd->vdev_spa;
2061 int error;
2062 uint64_t osize = 0;
2063 uint64_t max_osize = 0;
2064 uint64_t asize, max_asize, psize;
2065 uint64_t logical_ashift = 0;
2066 uint64_t physical_ashift = 0;
2067
2068 ASSERT(vd->vdev_open_thread == curthread ||
2069 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2070 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
2071 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
2072 vd->vdev_state == VDEV_STATE_OFFLINE);
2073
2074 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2075 vd->vdev_cant_read = B_FALSE;
2076 vd->vdev_cant_write = B_FALSE;
2077 vd->vdev_fault_wanted = B_FALSE;
2078 vd->vdev_remove_wanted = B_FALSE;
2079 vd->vdev_min_asize = vdev_get_min_asize(vd);
2080
2081 /*
2082 * If this vdev is not removed, check its fault status. If it's
2083 * faulted, bail out of the open.
2084 */
2085 if (!vd->vdev_removed && vd->vdev_faulted) {
2086 ASSERT(vd->vdev_children == 0);
2087 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
2088 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
2089 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2090 vd->vdev_label_aux);
2091 return (SET_ERROR(ENXIO));
2092 } else if (vd->vdev_offline) {
2093 ASSERT(vd->vdev_children == 0);
2094 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
2095 return (SET_ERROR(ENXIO));
2096 }
2097
2098 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
2099 &logical_ashift, &physical_ashift);
2100
2101 /* Keep the device in removed state if unplugged */
2102 if (error == ENOENT && vd->vdev_removed) {
2103 vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
2104 VDEV_AUX_NONE);
2105 return (error);
2106 }
2107
2108 /*
2109 * Physical volume size should never be larger than its max size, unless
2110 * the disk has shrunk while we were reading it or the device is buggy
2111 * or damaged: either way it's not safe for use, bail out of the open.
2112 */
2113 if (osize > max_osize) {
2114 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2115 VDEV_AUX_OPEN_FAILED);
2116 return (SET_ERROR(ENXIO));
2117 }
2118
2119 /*
2120 * Reset the vdev_reopening flag so that we actually close
2121 * the vdev on error.
2122 */
2123 vd->vdev_reopening = B_FALSE;
2124 if (zio_injection_enabled && error == 0)
2125 error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
2126
2127 if (error) {
2128 if (vd->vdev_removed &&
2129 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
2130 vd->vdev_removed = B_FALSE;
2131
2132 if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
2133 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
2134 vd->vdev_stat.vs_aux);
2135 } else {
2136 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2137 vd->vdev_stat.vs_aux);
2138 }
2139 return (error);
2140 }
2141
2142 vd->vdev_removed = B_FALSE;
2143
2144 /*
2145 * Recheck the faulted flag now that we have confirmed that
2146 * the vdev is accessible. If we're faulted, bail.
2147 */
2148 if (vd->vdev_faulted) {
2149 ASSERT(vd->vdev_children == 0);
2150 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
2151 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
2152 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2153 vd->vdev_label_aux);
2154 return (SET_ERROR(ENXIO));
2155 }
2156
2157 if (vd->vdev_degraded) {
2158 ASSERT(vd->vdev_children == 0);
2159 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
2160 VDEV_AUX_ERR_EXCEEDED);
2161 } else {
2162 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
2163 }
2164
2165 /*
2166 * For hole or missing vdevs we just return success.
2167 */
2168 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
2169 return (0);
2170
2171 for (int c = 0; c < vd->vdev_children; c++) {
2172 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
2173 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
2174 VDEV_AUX_NONE);
2175 break;
2176 }
2177 }
2178
2179 osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
2180 max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);
2181
2182 if (vd->vdev_children == 0) {
2183 if (osize < SPA_MINDEVSIZE) {
2184 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2185 VDEV_AUX_TOO_SMALL);
2186 return (SET_ERROR(EOVERFLOW));
2187 }
2188 psize = osize;
2189 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
2190 max_asize = max_osize - (VDEV_LABEL_START_SIZE +
2191 VDEV_LABEL_END_SIZE);
2192 } else {
2193 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
2194 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
2195 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2196 VDEV_AUX_TOO_SMALL);
2197 return (SET_ERROR(EOVERFLOW));
2198 }
2199 psize = 0;
2200 asize = osize;
2201 max_asize = max_osize;
2202 }
2203
2204 /*
2205 * If the vdev was expanded, record this so that we can re-create the
2206 * uberblock rings in labels {2,3}, during the next sync.
2207 */
2208 if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
2209 vd->vdev_copy_uberblocks = B_TRUE;
2210
2211 vd->vdev_psize = psize;
2212
2213 /*
2214 * Make sure the allocatable size hasn't shrunk too much.
2215 */
2216 if (asize < vd->vdev_min_asize) {
2217 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2218 VDEV_AUX_BAD_LABEL);
2219 return (SET_ERROR(EINVAL));
2220 }
2221
2222 /*
2223 * We can always set the logical/physical ashift members since
2224 * their values are only used to calculate the vdev_ashift when
2225 * the device is first added to the config. These values should
2226 * not be used for anything else since they may change whenever
2227 * the device is reopened and we don't store them in the label.
2228 */
2229 vd->vdev_physical_ashift =
2230 MAX(physical_ashift, vd->vdev_physical_ashift);
2231 vd->vdev_logical_ashift = MAX(logical_ashift,
2232 vd->vdev_logical_ashift);
2233
2234 if (vd->vdev_asize == 0) {
2235 /*
2236 * This is the first-ever open, so use the computed values.
2237 * For compatibility, a different ashift can be requested.
2238 */
2239 vd->vdev_asize = asize;
2240 vd->vdev_max_asize = max_asize;
2241
2242 /*
2243 * If the vdev_ashift was not overridden at creation time
2244 * (0) or the override value is impossible for the device,
2245 * then set it the logical ashift and optimize the ashift.
2246 */
2247 if (vd->vdev_ashift < vd->vdev_logical_ashift) {
2248 vd->vdev_ashift = vd->vdev_logical_ashift;
2249
2250 if (vd->vdev_logical_ashift > ASHIFT_MAX) {
2251 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2252 VDEV_AUX_ASHIFT_TOO_BIG);
2253 return (SET_ERROR(EDOM));
2254 }
2255
2256 if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
2257 vdev_ashift_optimize(vd);
2258 vd->vdev_attaching = B_FALSE;
2259 }
2260 if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
2261 vd->vdev_ashift > ASHIFT_MAX)) {
2262 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2263 VDEV_AUX_BAD_ASHIFT);
2264 return (SET_ERROR(EDOM));
2265 }
2266 } else {
2267 /*
2268 * Make sure the alignment required hasn't increased.
2269 */
2270 if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
2271 vd->vdev_ops->vdev_op_leaf) {
2272 (void) zfs_ereport_post(
2273 FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
2274 spa, vd, NULL, NULL, 0);
2275 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2276 VDEV_AUX_BAD_LABEL);
2277 return (SET_ERROR(EDOM));
2278 }
2279 vd->vdev_max_asize = max_asize;
2280 }
2281
2282 /*
2283 * If all children are healthy we update asize if either:
2284 * The asize has increased, due to a device expansion caused by dynamic
2285 * LUN growth or vdev replacement, and automatic expansion is enabled;
2286 * making the additional space available.
2287 *
2288 * The asize has decreased, due to a device shrink usually caused by a
2289 * vdev replace with a smaller device. This ensures that calculations
2290 * based of max_asize and asize e.g. esize are always valid. It's safe
2291 * to do this as we've already validated that asize is greater than
2292 * vdev_min_asize.
2293 */
2294 if (vd->vdev_state == VDEV_STATE_HEALTHY &&
2295 ((asize > vd->vdev_asize &&
2296 (vd->vdev_expanding || spa->spa_autoexpand)) ||
2297 (asize < vd->vdev_asize)))
2298 vd->vdev_asize = asize;
2299
2300 vdev_set_min_asize(vd);
2301
2302 /*
2303 * Ensure we can issue some IO before declaring the
2304 * vdev open for business.
2305 */
2306 if (vd->vdev_ops->vdev_op_leaf &&
2307 (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
2308 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2309 VDEV_AUX_ERR_EXCEEDED);
2310 return (error);
2311 }
2312
2313 /*
2314 * Track the minimum allocation size.
2315 */
2316 if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
2317 vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
2318 uint64_t min_alloc = vdev_get_min_alloc(vd);
2319 vdev_spa_set_alloc(spa, min_alloc);
2320 }
2321
2322 /*
2323 * If this is a leaf vdev, assess whether a resilver is needed.
2324 * But don't do this if we are doing a reopen for a scrub, since
2325 * this would just restart the scrub we are already doing.
2326 */
2327 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
2328 dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
2329
2330 return (0);
2331 }
2332
2333 static void
vdev_validate_child(void * arg)2334 vdev_validate_child(void *arg)
2335 {
2336 vdev_t *vd = arg;
2337
2338 vd->vdev_validate_thread = curthread;
2339 vd->vdev_validate_error = vdev_validate(vd);
2340 vd->vdev_validate_thread = NULL;
2341 }
2342
2343 /*
2344 * Called once the vdevs are all opened, this routine validates the label
2345 * contents. This needs to be done before vdev_load() so that we don't
2346 * inadvertently do repair I/Os to the wrong device.
2347 *
2348 * This function will only return failure if one of the vdevs indicates that it
2349 * has since been destroyed or exported. This is only possible if
2350 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
2351 * will be updated but the function will return 0.
2352 */
2353 int
vdev_validate(vdev_t * vd)2354 vdev_validate(vdev_t *vd)
2355 {
2356 spa_t *spa = vd->vdev_spa;
2357 taskq_t *tq = NULL;
2358 nvlist_t *label;
2359 uint64_t guid = 0, aux_guid = 0, top_guid;
2360 uint64_t state;
2361 nvlist_t *nvl;
2362 uint64_t txg;
2363 int children = vd->vdev_children;
2364
2365 if (vdev_validate_skip)
2366 return (0);
2367
2368 if (children > 0) {
2369 tq = taskq_create("vdev_validate", children, minclsyspri,
2370 children, children, TASKQ_PREPOPULATE);
2371 }
2372
2373 for (uint64_t c = 0; c < children; c++) {
2374 vdev_t *cvd = vd->vdev_child[c];
2375
2376 if (tq == NULL || vdev_uses_zvols(cvd)) {
2377 vdev_validate_child(cvd);
2378 } else {
2379 VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
2380 TQ_SLEEP) != TASKQID_INVALID);
2381 }
2382 }
2383 if (tq != NULL) {
2384 taskq_wait(tq);
2385 taskq_destroy(tq);
2386 }
2387 for (int c = 0; c < children; c++) {
2388 int error = vd->vdev_child[c]->vdev_validate_error;
2389
2390 if (error != 0)
2391 return (SET_ERROR(EBADF));
2392 }
2393
2394
2395 /*
2396 * If the device has already failed, or was marked offline, don't do
2397 * any further validation. Otherwise, label I/O will fail and we will
2398 * overwrite the previous state.
2399 */
2400 if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
2401 return (0);
2402
2403 /*
2404 * If we are performing an extreme rewind, we allow for a label that
2405 * was modified at a point after the current txg.
2406 * If config lock is not held do not check for the txg. spa_sync could
2407 * be updating the vdev's label before updating spa_last_synced_txg.
2408 */
2409 if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
2410 spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
2411 txg = UINT64_MAX;
2412 else
2413 txg = spa_last_synced_txg(spa);
2414
2415 if ((label = vdev_label_read_config(vd, txg)) == NULL) {
2416 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2417 VDEV_AUX_BAD_LABEL);
2418 vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
2419 "txg %llu", (u_longlong_t)txg);
2420 return (0);
2421 }
2422
2423 /*
2424 * Determine if this vdev has been split off into another
2425 * pool. If so, then refuse to open it.
2426 */
2427 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
2428 &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
2429 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2430 VDEV_AUX_SPLIT_POOL);
2431 nvlist_free(label);
2432 vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
2433 return (0);
2434 }
2435
2436 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
2437 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2438 VDEV_AUX_CORRUPT_DATA);
2439 nvlist_free(label);
2440 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2441 ZPOOL_CONFIG_POOL_GUID);
2442 return (0);
2443 }
2444
2445 /*
2446 * If config is not trusted then ignore the spa guid check. This is
2447 * necessary because if the machine crashed during a re-guid the new
2448 * guid might have been written to all of the vdev labels, but not the
2449 * cached config. The check will be performed again once we have the
2450 * trusted config from the MOS.
2451 */
2452 if (spa->spa_trust_config && guid != spa_guid(spa)) {
2453 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2454 VDEV_AUX_CORRUPT_DATA);
2455 nvlist_free(label);
2456 vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
2457 "match config (%llu != %llu)", (u_longlong_t)guid,
2458 (u_longlong_t)spa_guid(spa));
2459 return (0);
2460 }
2461
2462 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
2463 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
2464 &aux_guid) != 0)
2465 aux_guid = 0;
2466
2467 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
2468 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2469 VDEV_AUX_CORRUPT_DATA);
2470 nvlist_free(label);
2471 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2472 ZPOOL_CONFIG_GUID);
2473 return (0);
2474 }
2475
2476 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
2477 != 0) {
2478 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2479 VDEV_AUX_CORRUPT_DATA);
2480 nvlist_free(label);
2481 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2482 ZPOOL_CONFIG_TOP_GUID);
2483 return (0);
2484 }
2485
2486 /*
2487 * If this vdev just became a top-level vdev because its sibling was
2488 * detached, it will have adopted the parent's vdev guid -- but the
2489 * label may or may not be on disk yet. Fortunately, either version
2490 * of the label will have the same top guid, so if we're a top-level
2491 * vdev, we can safely compare to that instead.
2492 * However, if the config comes from a cachefile that failed to update
2493 * after the detach, a top-level vdev will appear as a non top-level
2494 * vdev in the config. Also relax the constraints if we perform an
2495 * extreme rewind.
2496 *
2497 * If we split this vdev off instead, then we also check the
2498 * original pool's guid. We don't want to consider the vdev
2499 * corrupt if it is partway through a split operation.
2500 */
2501 if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
2502 boolean_t mismatch = B_FALSE;
2503 if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
2504 if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
2505 mismatch = B_TRUE;
2506 } else {
2507 if (vd->vdev_guid != top_guid &&
2508 vd->vdev_top->vdev_guid != guid)
2509 mismatch = B_TRUE;
2510 }
2511
2512 if (mismatch) {
2513 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2514 VDEV_AUX_CORRUPT_DATA);
2515 nvlist_free(label);
2516 vdev_dbgmsg(vd, "vdev_validate: config guid "
2517 "doesn't match label guid");
2518 vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
2519 (u_longlong_t)vd->vdev_guid,
2520 (u_longlong_t)vd->vdev_top->vdev_guid);
2521 vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
2522 "aux_guid %llu", (u_longlong_t)guid,
2523 (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
2524 return (0);
2525 }
2526 }
2527
2528 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
2529 &state) != 0) {
2530 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2531 VDEV_AUX_CORRUPT_DATA);
2532 nvlist_free(label);
2533 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2534 ZPOOL_CONFIG_POOL_STATE);
2535 return (0);
2536 }
2537
2538 nvlist_free(label);
2539
2540 /*
2541 * If this is a verbatim import, no need to check the
2542 * state of the pool.
2543 */
2544 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
2545 spa_load_state(spa) == SPA_LOAD_OPEN &&
2546 state != POOL_STATE_ACTIVE) {
2547 vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
2548 "for spa %s", (u_longlong_t)state, spa->spa_name);
2549 return (SET_ERROR(EBADF));
2550 }
2551
2552 /*
2553 * If we were able to open and validate a vdev that was
2554 * previously marked permanently unavailable, clear that state
2555 * now.
2556 */
2557 if (vd->vdev_not_present)
2558 vd->vdev_not_present = 0;
2559
2560 return (0);
2561 }
2562
2563 static void
vdev_update_path(const char * prefix,char * svd,char ** dvd,uint64_t guid)2564 vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
2565 {
2566 if (svd != NULL && *dvd != NULL) {
2567 if (strcmp(svd, *dvd) != 0) {
2568 zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
2569 "from '%s' to '%s'", (u_longlong_t)guid, prefix,
2570 *dvd, svd);
2571 spa_strfree(*dvd);
2572 *dvd = spa_strdup(svd);
2573 }
2574 } else if (svd != NULL) {
2575 *dvd = spa_strdup(svd);
2576 zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
2577 (u_longlong_t)guid, *dvd);
2578 }
2579 }
2580
2581 static void
vdev_copy_path_impl(vdev_t * svd,vdev_t * dvd)2582 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
2583 {
2584 char *old, *new;
2585
2586 vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
2587 dvd->vdev_guid);
2588
2589 vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
2590 dvd->vdev_guid);
2591
2592 vdev_update_path("vdev_physpath", svd->vdev_physpath,
2593 &dvd->vdev_physpath, dvd->vdev_guid);
2594
2595 /*
2596 * Our enclosure sysfs path may have changed between imports
2597 */
2598 old = dvd->vdev_enc_sysfs_path;
2599 new = svd->vdev_enc_sysfs_path;
2600 if ((old != NULL && new == NULL) ||
2601 (old == NULL && new != NULL) ||
2602 ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
2603 zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
2604 "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
2605 old, new);
2606
2607 if (dvd->vdev_enc_sysfs_path)
2608 spa_strfree(dvd->vdev_enc_sysfs_path);
2609
2610 if (svd->vdev_enc_sysfs_path) {
2611 dvd->vdev_enc_sysfs_path = spa_strdup(
2612 svd->vdev_enc_sysfs_path);
2613 } else {
2614 dvd->vdev_enc_sysfs_path = NULL;
2615 }
2616 }
2617 }
2618
2619 /*
2620 * Recursively copy vdev paths from one vdev to another. Source and destination
2621 * vdev trees must have same geometry otherwise return error. Intended to copy
2622 * paths from userland config into MOS config.
2623 */
2624 int
vdev_copy_path_strict(vdev_t * svd,vdev_t * dvd)2625 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
2626 {
2627 if ((svd->vdev_ops == &vdev_missing_ops) ||
2628 (svd->vdev_ishole && dvd->vdev_ishole) ||
2629 (dvd->vdev_ops == &vdev_indirect_ops))
2630 return (0);
2631
2632 if (svd->vdev_ops != dvd->vdev_ops) {
2633 vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
2634 svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
2635 return (SET_ERROR(EINVAL));
2636 }
2637
2638 if (svd->vdev_guid != dvd->vdev_guid) {
2639 vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
2640 "%llu)", (u_longlong_t)svd->vdev_guid,
2641 (u_longlong_t)dvd->vdev_guid);
2642 return (SET_ERROR(EINVAL));
2643 }
2644
2645 if (svd->vdev_children != dvd->vdev_children) {
2646 vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
2647 "%llu != %llu", (u_longlong_t)svd->vdev_children,
2648 (u_longlong_t)dvd->vdev_children);
2649 return (SET_ERROR(EINVAL));
2650 }
2651
2652 for (uint64_t i = 0; i < svd->vdev_children; i++) {
2653 int error = vdev_copy_path_strict(svd->vdev_child[i],
2654 dvd->vdev_child[i]);
2655 if (error != 0)
2656 return (error);
2657 }
2658
2659 if (svd->vdev_ops->vdev_op_leaf)
2660 vdev_copy_path_impl(svd, dvd);
2661
2662 return (0);
2663 }
2664
2665 static void
vdev_copy_path_search(vdev_t * stvd,vdev_t * dvd)2666 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
2667 {
2668 ASSERT(stvd->vdev_top == stvd);
2669 ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
2670
2671 for (uint64_t i = 0; i < dvd->vdev_children; i++) {
2672 vdev_copy_path_search(stvd, dvd->vdev_child[i]);
2673 }
2674
2675 if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
2676 return;
2677
2678 /*
2679 * The idea here is that while a vdev can shift positions within
2680 * a top vdev (when replacing, attaching mirror, etc.) it cannot
2681 * step outside of it.
2682 */
2683 vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
2684
2685 if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
2686 return;
2687
2688 ASSERT(vd->vdev_ops->vdev_op_leaf);
2689
2690 vdev_copy_path_impl(vd, dvd);
2691 }
2692
2693 /*
2694 * Recursively copy vdev paths from one root vdev to another. Source and
2695 * destination vdev trees may differ in geometry. For each destination leaf
2696 * vdev, search a vdev with the same guid and top vdev id in the source.
2697 * Intended to copy paths from userland config into MOS config.
2698 */
2699 void
vdev_copy_path_relaxed(vdev_t * srvd,vdev_t * drvd)2700 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
2701 {
2702 uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
2703 ASSERT(srvd->vdev_ops == &vdev_root_ops);
2704 ASSERT(drvd->vdev_ops == &vdev_root_ops);
2705
2706 for (uint64_t i = 0; i < children; i++) {
2707 vdev_copy_path_search(srvd->vdev_child[i],
2708 drvd->vdev_child[i]);
2709 }
2710 }
2711
2712 /*
2713 * Close a virtual device.
2714 */
2715 void
vdev_close(vdev_t * vd)2716 vdev_close(vdev_t *vd)
2717 {
2718 vdev_t *pvd = vd->vdev_parent;
2719 spa_t *spa __maybe_unused = vd->vdev_spa;
2720
2721 ASSERT(vd != NULL);
2722 ASSERT(vd->vdev_open_thread == curthread ||
2723 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2724
2725 /*
2726 * If our parent is reopening, then we are as well, unless we are
2727 * going offline.
2728 */
2729 if (pvd != NULL && pvd->vdev_reopening)
2730 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
2731
2732 vd->vdev_ops->vdev_op_close(vd);
2733
2734 /*
2735 * We record the previous state before we close it, so that if we are
2736 * doing a reopen(), we don't generate FMA ereports if we notice that
2737 * it's still faulted.
2738 */
2739 vd->vdev_prevstate = vd->vdev_state;
2740
2741 if (vd->vdev_offline)
2742 vd->vdev_state = VDEV_STATE_OFFLINE;
2743 else
2744 vd->vdev_state = VDEV_STATE_CLOSED;
2745 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2746 }
2747
2748 void
vdev_hold(vdev_t * vd)2749 vdev_hold(vdev_t *vd)
2750 {
2751 spa_t *spa = vd->vdev_spa;
2752
2753 ASSERT(spa_is_root(spa));
2754 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
2755 return;
2756
2757 for (int c = 0; c < vd->vdev_children; c++)
2758 vdev_hold(vd->vdev_child[c]);
2759
2760 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL)
2761 vd->vdev_ops->vdev_op_hold(vd);
2762 }
2763
2764 void
vdev_rele(vdev_t * vd)2765 vdev_rele(vdev_t *vd)
2766 {
2767 ASSERT(spa_is_root(vd->vdev_spa));
2768 for (int c = 0; c < vd->vdev_children; c++)
2769 vdev_rele(vd->vdev_child[c]);
2770
2771 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL)
2772 vd->vdev_ops->vdev_op_rele(vd);
2773 }
2774
2775 /*
2776 * Reopen all interior vdevs and any unopened leaves. We don't actually
2777 * reopen leaf vdevs which had previously been opened as they might deadlock
2778 * on the spa_config_lock. Instead we only obtain the leaf's physical size.
2779 * If the leaf has never been opened then open it, as usual.
2780 */
2781 void
vdev_reopen(vdev_t * vd)2782 vdev_reopen(vdev_t *vd)
2783 {
2784 spa_t *spa = vd->vdev_spa;
2785
2786 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2787
2788 /* set the reopening flag unless we're taking the vdev offline */
2789 vd->vdev_reopening = !vd->vdev_offline;
2790 vdev_close(vd);
2791 (void) vdev_open(vd);
2792
2793 /*
2794 * Call vdev_validate() here to make sure we have the same device.
2795 * Otherwise, a device with an invalid label could be successfully
2796 * opened in response to vdev_reopen().
2797 */
2798 if (vd->vdev_aux) {
2799 (void) vdev_validate_aux(vd);
2800 if (vdev_readable(vd) && vdev_writeable(vd) &&
2801 vd->vdev_aux == &spa->spa_l2cache) {
2802 /*
2803 * In case the vdev is present we should evict all ARC
2804 * buffers and pointers to log blocks and reclaim their
2805 * space before restoring its contents to L2ARC.
2806 */
2807 if (l2arc_vdev_present(vd)) {
2808 l2arc_rebuild_vdev(vd, B_TRUE);
2809 } else {
2810 l2arc_add_vdev(spa, vd);
2811 }
2812 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
2813 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
2814 }
2815 } else {
2816 (void) vdev_validate(vd);
2817 }
2818
2819 /*
2820 * Recheck if resilver is still needed and cancel any
2821 * scheduled resilver if resilver is unneeded.
2822 */
2823 if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
2824 spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
2825 mutex_enter(&spa->spa_async_lock);
2826 spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
2827 mutex_exit(&spa->spa_async_lock);
2828 }
2829
2830 /*
2831 * Reassess parent vdev's health.
2832 */
2833 vdev_propagate_state(vd);
2834 }
2835
2836 int
vdev_create(vdev_t * vd,uint64_t txg,boolean_t isreplacing)2837 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
2838 {
2839 int error;
2840
2841 /*
2842 * Normally, partial opens (e.g. of a mirror) are allowed.
2843 * For a create, however, we want to fail the request if
2844 * there are any components we can't open.
2845 */
2846 error = vdev_open(vd);
2847
2848 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
2849 vdev_close(vd);
2850 return (error ? error : SET_ERROR(ENXIO));
2851 }
2852
2853 /*
2854 * Recursively load DTLs and initialize all labels.
2855 */
2856 if ((error = vdev_dtl_load(vd)) != 0 ||
2857 (error = vdev_label_init(vd, txg, isreplacing ?
2858 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
2859 vdev_close(vd);
2860 return (error);
2861 }
2862
2863 return (0);
2864 }
2865
2866 void
vdev_metaslab_set_size(vdev_t * vd)2867 vdev_metaslab_set_size(vdev_t *vd)
2868 {
2869 uint64_t asize = vd->vdev_asize;
2870 uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
2871 uint64_t ms_shift;
2872
2873 /*
2874 * There are two dimensions to the metaslab sizing calculation:
2875 * the size of the metaslab and the count of metaslabs per vdev.
2876 *
2877 * The default values used below are a good balance between memory
2878 * usage (larger metaslab size means more memory needed for loaded
2879 * metaslabs; more metaslabs means more memory needed for the
2880 * metaslab_t structs), metaslab load time (larger metaslabs take
2881 * longer to load), and metaslab sync time (more metaslabs means
2882 * more time spent syncing all of them).
2883 *
2884 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
2885 * The range of the dimensions are as follows:
2886 *
2887 * 2^29 <= ms_size <= 2^34
2888 * 16 <= ms_count <= 131,072
2889 *
2890 * On the lower end of vdev sizes, we aim for metaslabs sizes of
2891 * at least 512MB (2^29) to minimize fragmentation effects when
2892 * testing with smaller devices. However, the count constraint
2893 * of at least 16 metaslabs will override this minimum size goal.
2894 *
2895 * On the upper end of vdev sizes, we aim for a maximum metaslab
2896 * size of 16GB. However, we will cap the total count to 2^17
2897 * metaslabs to keep our memory footprint in check and let the
2898 * metaslab size grow from there if that limit is hit.
2899 *
2900 * The net effect of applying above constrains is summarized below.
2901 *
2902 * vdev size metaslab count
2903 * --------------|-----------------
2904 * < 8GB ~16
2905 * 8GB - 100GB one per 512MB
2906 * 100GB - 3TB ~200
2907 * 3TB - 2PB one per 16GB
2908 * > 2PB ~131,072
2909 * --------------------------------
2910 *
2911 * Finally, note that all of the above calculate the initial
2912 * number of metaslabs. Expanding a top-level vdev will result
2913 * in additional metaslabs being allocated making it possible
2914 * to exceed the zfs_vdev_ms_count_limit.
2915 */
2916
2917 if (ms_count < zfs_vdev_min_ms_count)
2918 ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
2919 else if (ms_count > zfs_vdev_default_ms_count)
2920 ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
2921 else
2922 ms_shift = zfs_vdev_default_ms_shift;
2923
2924 if (ms_shift < SPA_MAXBLOCKSHIFT) {
2925 ms_shift = SPA_MAXBLOCKSHIFT;
2926 } else if (ms_shift > zfs_vdev_max_ms_shift) {
2927 ms_shift = zfs_vdev_max_ms_shift;
2928 /* cap the total count to constrain memory footprint */
2929 if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
2930 ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
2931 }
2932
2933 vd->vdev_ms_shift = ms_shift;
2934 ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
2935 }
2936
2937 void
vdev_dirty(vdev_t * vd,int flags,void * arg,uint64_t txg)2938 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
2939 {
2940 ASSERT(vd == vd->vdev_top);
2941 /* indirect vdevs don't have metaslabs or dtls */
2942 ASSERT(vdev_is_concrete(vd) || flags == 0);
2943 ASSERT(ISP2(flags));
2944 ASSERT(spa_writeable(vd->vdev_spa));
2945
2946 if (flags & VDD_METASLAB)
2947 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
2948
2949 if (flags & VDD_DTL)
2950 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
2951
2952 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
2953 }
2954
2955 void
vdev_dirty_leaves(vdev_t * vd,int flags,uint64_t txg)2956 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
2957 {
2958 for (int c = 0; c < vd->vdev_children; c++)
2959 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
2960
2961 if (vd->vdev_ops->vdev_op_leaf)
2962 vdev_dirty(vd->vdev_top, flags, vd, txg);
2963 }
2964
2965 /*
2966 * DTLs.
2967 *
2968 * A vdev's DTL (dirty time log) is the set of transaction groups for which
2969 * the vdev has less than perfect replication. There are four kinds of DTL:
2970 *
2971 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
2972 *
2973 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
2974 *
2975 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
2976 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
2977 * txgs that was scrubbed.
2978 *
2979 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
2980 * persistent errors or just some device being offline.
2981 * Unlike the other three, the DTL_OUTAGE map is not generally
2982 * maintained; it's only computed when needed, typically to
2983 * determine whether a device can be detached.
2984 *
2985 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
2986 * either has the data or it doesn't.
2987 *
2988 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
2989 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
2990 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in more than 'maxfaults' children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
2997 *
2998 * It should be clear from the above that to compute the DTLs and outage maps
2999 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
3000 * Therefore, that is all we keep on disk. When loading the pool, or after
3001 * a configuration change, we generate all other DTLs from first principles.
3002 */
3003 void
vdev_dtl_dirty(vdev_t * vd,vdev_dtl_type_t t,uint64_t txg,uint64_t size)3004 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
3005 {
3006 zfs_range_tree_t *rt = vd->vdev_dtl[t];
3007
3008 ASSERT(t < DTL_TYPES);
3009 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
3010 ASSERT(spa_writeable(vd->vdev_spa));
3011
3012 mutex_enter(&vd->vdev_dtl_lock);
3013 if (!zfs_range_tree_contains(rt, txg, size))
3014 zfs_range_tree_add(rt, txg, size);
3015 mutex_exit(&vd->vdev_dtl_lock);
3016 }
3017
3018 boolean_t
vdev_dtl_contains(vdev_t * vd,vdev_dtl_type_t t,uint64_t txg,uint64_t size)3019 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
3020 {
3021 zfs_range_tree_t *rt = vd->vdev_dtl[t];
3022 boolean_t dirty = B_FALSE;
3023
3024 ASSERT(t < DTL_TYPES);
3025 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
3026
3027 /*
3028 * While we are loading the pool, the DTLs have not been loaded yet.
3029 * This isn't a problem but it can result in devices being tried
3030 * which are known to not have the data. In which case, the import
3031 * is relying on the checksum to ensure that we get the right data.
3032 * Note that while importing we are only reading the MOS, which is
3033 * always checksummed.
3034 */
3035 mutex_enter(&vd->vdev_dtl_lock);
3036 if (!zfs_range_tree_is_empty(rt))
3037 dirty = zfs_range_tree_contains(rt, txg, size);
3038 mutex_exit(&vd->vdev_dtl_lock);
3039
3040 return (dirty);
3041 }
3042
3043 boolean_t
vdev_dtl_empty(vdev_t * vd,vdev_dtl_type_t t)3044 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
3045 {
3046 zfs_range_tree_t *rt = vd->vdev_dtl[t];
3047 boolean_t empty;
3048
3049 mutex_enter(&vd->vdev_dtl_lock);
3050 empty = zfs_range_tree_is_empty(rt);
3051 mutex_exit(&vd->vdev_dtl_lock);
3052
3053 return (empty);
3054 }
3055
3056 /*
3057 * Check if the txg falls within the range which must be
3058 * resilvered. DVAs outside this range can always be skipped.
3059 */
3060 boolean_t
vdev_default_need_resilver(vdev_t * vd,const dva_t * dva,size_t psize,uint64_t phys_birth)3061 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3062 uint64_t phys_birth)
3063 {
3064 (void) dva, (void) psize;
3065
3066 /* Set by sequential resilver. */
3067 if (phys_birth == TXG_UNKNOWN)
3068 return (B_TRUE);
3069
3070 return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
3071 }
3072
3073 /*
3074 * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
3075 */
3076 boolean_t
vdev_dtl_need_resilver(vdev_t * vd,const dva_t * dva,size_t psize,uint64_t phys_birth)3077 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3078 uint64_t phys_birth)
3079 {
3080 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
3081
3082 if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
3083 vd->vdev_ops->vdev_op_leaf)
3084 return (B_TRUE);
3085
3086 return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
3087 phys_birth));
3088 }
3089
3090 /*
3091 * Returns the lowest txg in the DTL range.
3092 */
3093 static uint64_t
vdev_dtl_min(vdev_t * vd)3094 vdev_dtl_min(vdev_t *vd)
3095 {
3096 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
3097 ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
3098 ASSERT0(vd->vdev_children);
3099
3100 return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
3101 }
3102
3103 /*
3104 * Returns the highest txg in the DTL.
3105 */
3106 static uint64_t
vdev_dtl_max(vdev_t * vd)3107 vdev_dtl_max(vdev_t *vd)
3108 {
3109 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
3110 ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
3111 ASSERT0(vd->vdev_children);
3112
3113 return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING]));
3114 }
3115
3116 /*
3117 * Determine if a resilvering vdev should remove any DTL entries from
3118 * its range. If the vdev was resilvering for the entire duration of the
3119 * scan then it should excise that range from its DTLs. Otherwise, this
3120 * vdev is considered partially resilvered and should leave its DTL
3121 * entries intact. The comment in vdev_dtl_reassess() describes how we
3122 * excise the DTLs.
3123 */
3124 static boolean_t
vdev_dtl_should_excise(vdev_t * vd,boolean_t rebuild_done)3125 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
3126 {
3127 ASSERT0(vd->vdev_children);
3128
3129 if (vd->vdev_state < VDEV_STATE_DEGRADED)
3130 return (B_FALSE);
3131
3132 if (vd->vdev_resilver_deferred)
3133 return (B_FALSE);
3134
3135 if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
3136 return (B_TRUE);
3137
3138 if (rebuild_done) {
3139 vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
3140 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
3141
3142 /* Rebuild not initiated by attach */
3143 if (vd->vdev_rebuild_txg == 0)
3144 return (B_TRUE);
3145
3146 /*
3147 * When a rebuild completes without error then all missing data
3148 * up to the rebuild max txg has been reconstructed and the DTL
3149 * is eligible for excision.
3150 */
3151 if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
3152 vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
3153 ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
3154 ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
3155 ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
3156 return (B_TRUE);
3157 }
3158 } else {
3159 dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
3160 dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
3161
3162 /* Resilver not initiated by attach */
3163 if (vd->vdev_resilver_txg == 0)
3164 return (B_TRUE);
3165
3166 /*
3167 * When a resilver is initiated the scan will assign the
3168 * scn_max_txg value to the highest txg value that exists
3169 * in all DTLs. If this device's max DTL is not part of this
3170 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
3171 * then it is not eligible for excision.
3172 */
3173 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
3174 ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
3175 ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
3176 ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
3177 return (B_TRUE);
3178 }
3179 }
3180
3181 return (B_FALSE);
3182 }
3183
3184 /*
3185 * Reassess DTLs after a config change or scrub completion. If txg == 0 no
3186 * write operations will be issued to the pool.
3187 */
3188 static void
vdev_dtl_reassess_impl(vdev_t * vd,uint64_t txg,uint64_t scrub_txg,boolean_t scrub_done,boolean_t rebuild_done,boolean_t faulting)3189 vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
3190 boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting)
3191 {
3192 spa_t *spa = vd->vdev_spa;
3193 avl_tree_t reftree;
3194 int minref;
3195
3196 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
3197
3198 for (int c = 0; c < vd->vdev_children; c++)
3199 vdev_dtl_reassess_impl(vd->vdev_child[c], txg,
3200 scrub_txg, scrub_done, rebuild_done, faulting);
3201
3202 if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
3203 return;
3204
3205 if (vd->vdev_ops->vdev_op_leaf) {
3206 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
3207 vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
3208 boolean_t check_excise = B_FALSE;
3209 boolean_t wasempty = B_TRUE;
3210
3211 mutex_enter(&vd->vdev_dtl_lock);
3212
3213 /*
3214 * If requested, pretend the scan or rebuild completed cleanly.
3215 */
3216 if (zfs_scan_ignore_errors) {
3217 if (scn != NULL)
3218 scn->scn_phys.scn_errors = 0;
3219 if (vr != NULL)
3220 vr->vr_rebuild_phys.vrp_errors = 0;
3221 }
3222
3223 if (scrub_txg != 0 &&
3224 !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
3225 wasempty = B_FALSE;
3226 zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
3227 "dtl:%llu/%llu errors:%llu",
3228 (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
3229 (u_longlong_t)scrub_txg, spa->spa_scrub_started,
3230 (u_longlong_t)vdev_dtl_min(vd),
3231 (u_longlong_t)vdev_dtl_max(vd),
3232 (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
3233 }
3234
3235 /*
3236 * If we've completed a scrub/resilver or a rebuild cleanly
3237 * then determine if this vdev should remove any DTLs. We
3238 * only want to excise regions on vdevs that were available
3239 * during the entire duration of this scan.
3240 */
3241 if (rebuild_done &&
3242 vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
3243 check_excise = B_TRUE;
3244 } else {
3245 if (spa->spa_scrub_started ||
3246 (scn != NULL && scn->scn_phys.scn_errors == 0)) {
3247 check_excise = B_TRUE;
3248 }
3249 }
3250
3251 if (scrub_txg && check_excise &&
3252 vdev_dtl_should_excise(vd, rebuild_done)) {
3253 /*
3254 * We completed a scrub, resilver or rebuild up to
3255 * scrub_txg. If we did it without rebooting, then
3256 * the scrub dtl will be valid, so excise the old
3257 * region and fold in the scrub dtl. Otherwise,
3258 * leave the dtl as-is if there was an error.
3259 *
3260 * There's little trick here: to excise the beginning
3261 * of the DTL_MISSING map, we put it into a reference
3262 * tree and then add a segment with refcnt -1 that
3263 * covers the range [0, scrub_txg). This means
3264 * that each txg in that range has refcnt -1 or 0.
3265 * We then add DTL_SCRUB with a refcnt of 2, so that
3266 * entries in the range [0, scrub_txg) will have a
3267 * positive refcnt -- either 1 or 2. We then convert
3268 * the reference tree into the new DTL_MISSING map.
3269 */
3270 space_reftree_create(&reftree);
3271 space_reftree_add_map(&reftree,
3272 vd->vdev_dtl[DTL_MISSING], 1);
3273 space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
3274 space_reftree_add_map(&reftree,
3275 vd->vdev_dtl[DTL_SCRUB], 2);
3276 space_reftree_generate_map(&reftree,
3277 vd->vdev_dtl[DTL_MISSING], 1);
3278 space_reftree_destroy(&reftree);
3279
3280 if (!zfs_range_tree_is_empty(
3281 vd->vdev_dtl[DTL_MISSING])) {
3282 zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
3283 (u_longlong_t)vdev_dtl_min(vd),
3284 (u_longlong_t)vdev_dtl_max(vd));
3285 } else if (!wasempty) {
3286 zfs_dbgmsg("DTL_MISSING is now empty");
3287 }
3288 }
3289 zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
3290 zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
3291 zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
3292 if (scrub_done)
3293 zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL,
3294 NULL);
3295 zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
3296
3297 /*
3298 * For the faulting case, treat members of a replacing vdev
3299 * as if they are not available. It's more likely than not that
3300 * a vdev in a replacing vdev could encounter read errors so
3301 * treat it as not being able to contribute.
3302 */
3303 if (!vdev_readable(vd) ||
3304 (faulting && vd->vdev_parent != NULL &&
3305 vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) {
3306 zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
3307 } else {
3308 zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
3309 zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
3310 }
3311
3312 /*
3313 * If the vdev was resilvering or rebuilding and no longer
3314 * has any DTLs then reset the appropriate flag and dirty
3315 * the top level so that we persist the change.
3316 */
3317 if (txg != 0 &&
3318 zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
3319 zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
3320 if (vd->vdev_rebuild_txg != 0) {
3321 vd->vdev_rebuild_txg = 0;
3322 vdev_config_dirty(vd->vdev_top);
3323 } else if (vd->vdev_resilver_txg != 0) {
3324 vd->vdev_resilver_txg = 0;
3325 vdev_config_dirty(vd->vdev_top);
3326 }
3327 }
3328
3329 mutex_exit(&vd->vdev_dtl_lock);
3330
3331 if (txg != 0)
3332 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
3333 } else {
3334 mutex_enter(&vd->vdev_dtl_lock);
3335 for (int t = 0; t < DTL_TYPES; t++) {
3336 /* account for child's outage in parent's missing map */
3337 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
3338 if (t == DTL_SCRUB) {
3339 /* leaf vdevs only */
3340 continue;
3341 }
3342 if (t == DTL_PARTIAL) {
3343 /* i.e. non-zero */
3344 minref = 1;
3345 } else if (vdev_get_nparity(vd) != 0) {
3346 /* RAIDZ, DRAID */
3347 minref = vdev_get_nparity(vd) + 1;
3348 } else {
3349 /* any kind of mirror */
3350 minref = vd->vdev_children;
3351 }
3352 space_reftree_create(&reftree);
3353 for (int c = 0; c < vd->vdev_children; c++) {
3354 vdev_t *cvd = vd->vdev_child[c];
3355 mutex_enter(&cvd->vdev_dtl_lock);
3356 space_reftree_add_map(&reftree,
3357 cvd->vdev_dtl[s], 1);
3358 mutex_exit(&cvd->vdev_dtl_lock);
3359 }
3360 space_reftree_generate_map(&reftree,
3361 vd->vdev_dtl[t], minref);
3362 space_reftree_destroy(&reftree);
3363 }
3364 mutex_exit(&vd->vdev_dtl_lock);
3365 }
3366
3367 if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
3368 raidz_dtl_reassessed(vd);
3369 }
3370 }
3371
3372 void
vdev_dtl_reassess(vdev_t * vd,uint64_t txg,uint64_t scrub_txg,boolean_t scrub_done,boolean_t rebuild_done)3373 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
3374 boolean_t scrub_done, boolean_t rebuild_done)
3375 {
3376 return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done,
3377 rebuild_done, B_FALSE));
3378 }
3379
3380 /*
3381 * Iterate over all the vdevs except spare, and post kobj events
3382 */
3383 void
vdev_post_kobj_evt(vdev_t * vd)3384 vdev_post_kobj_evt(vdev_t *vd)
3385 {
3386 if (vd->vdev_ops->vdev_op_kobj_evt_post &&
3387 vd->vdev_kobj_flag == B_FALSE) {
3388 vd->vdev_kobj_flag = B_TRUE;
3389 vd->vdev_ops->vdev_op_kobj_evt_post(vd);
3390 }
3391
3392 for (int c = 0; c < vd->vdev_children; c++)
3393 vdev_post_kobj_evt(vd->vdev_child[c]);
3394 }
3395
3396 /*
3397 * Iterate over all the vdevs except spare, and clear kobj events
3398 */
3399 void
vdev_clear_kobj_evt(vdev_t * vd)3400 vdev_clear_kobj_evt(vdev_t *vd)
3401 {
3402 vd->vdev_kobj_flag = B_FALSE;
3403
3404 for (int c = 0; c < vd->vdev_children; c++)
3405 vdev_clear_kobj_evt(vd->vdev_child[c]);
3406 }
3407
3408 int
vdev_dtl_load(vdev_t * vd)3409 vdev_dtl_load(vdev_t *vd)
3410 {
3411 spa_t *spa = vd->vdev_spa;
3412 objset_t *mos = spa->spa_meta_objset;
3413 zfs_range_tree_t *rt;
3414 int error = 0;
3415
3416 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
3417 ASSERT(vdev_is_concrete(vd));
3418
3419 /*
3420 * If the dtl cannot be sync'd there is no need to open it.
3421 */
3422 if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
3423 return (0);
3424
3425 error = space_map_open(&vd->vdev_dtl_sm, mos,
3426 vd->vdev_dtl_object, 0, -1ULL, 0);
3427 if (error)
3428 return (error);
3429 ASSERT(vd->vdev_dtl_sm != NULL);
3430
3431 rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
3432 error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
3433 if (error == 0) {
3434 mutex_enter(&vd->vdev_dtl_lock);
3435 zfs_range_tree_walk(rt, zfs_range_tree_add,
3436 vd->vdev_dtl[DTL_MISSING]);
3437 mutex_exit(&vd->vdev_dtl_lock);
3438 }
3439
3440 zfs_range_tree_vacate(rt, NULL, NULL);
3441 zfs_range_tree_destroy(rt);
3442
3443 return (error);
3444 }
3445
3446 for (int c = 0; c < vd->vdev_children; c++) {
3447 error = vdev_dtl_load(vd->vdev_child[c]);
3448 if (error != 0)
3449 break;
3450 }
3451
3452 return (error);
3453 }
3454
3455 static void
vdev_zap_allocation_data(vdev_t * vd,dmu_tx_t * tx)3456 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
3457 {
3458 spa_t *spa = vd->vdev_spa;
3459 objset_t *mos = spa->spa_meta_objset;
3460 vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
3461 const char *string;
3462
3463 ASSERT(alloc_bias != VDEV_BIAS_NONE);
3464
3465 string =
3466 (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
3467 (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
3468 (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
3469
3470 ASSERT(string != NULL);
3471 VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
3472 1, strlen(string) + 1, string, tx));
3473
3474 if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
3475 spa_activate_allocation_classes(spa, tx);
3476 }
3477 }
3478
3479 void
vdev_destroy_unlink_zap(vdev_t * vd,uint64_t zapobj,dmu_tx_t * tx)3480 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
3481 {
3482 spa_t *spa = vd->vdev_spa;
3483
3484 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
3485 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3486 zapobj, tx));
3487 }
3488
3489 uint64_t
vdev_create_link_zap(vdev_t * vd,dmu_tx_t * tx)3490 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
3491 {
3492 spa_t *spa = vd->vdev_spa;
3493 uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
3494 DMU_OT_NONE, 0, tx);
3495
3496 ASSERT(zap != 0);
3497 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3498 zap, tx));
3499
3500 return (zap);
3501 }
3502
3503 void
vdev_construct_zaps(vdev_t * vd,dmu_tx_t * tx)3504 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
3505 {
3506 if (vd->vdev_ops != &vdev_hole_ops &&
3507 vd->vdev_ops != &vdev_missing_ops &&
3508 vd->vdev_ops != &vdev_root_ops &&
3509 !vd->vdev_top->vdev_removing) {
3510 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
3511 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
3512 }
3513 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
3514 vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
3515 if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
3516 vdev_zap_allocation_data(vd, tx);
3517 }
3518 }
3519 if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 &&
3520 spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
3521 if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
3522 spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
3523 vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
3524 }
3525
3526 for (uint64_t i = 0; i < vd->vdev_children; i++) {
3527 vdev_construct_zaps(vd->vdev_child[i], tx);
3528 }
3529 }
3530
3531 static void
vdev_dtl_sync(vdev_t * vd,uint64_t txg)3532 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
3533 {
3534 spa_t *spa = vd->vdev_spa;
3535 zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
3536 objset_t *mos = spa->spa_meta_objset;
3537 zfs_range_tree_t *rtsync;
3538 dmu_tx_t *tx;
3539 uint64_t object = space_map_object(vd->vdev_dtl_sm);
3540
3541 ASSERT(vdev_is_concrete(vd));
3542 ASSERT(vd->vdev_ops->vdev_op_leaf);
3543
3544 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3545
3546 if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
3547 mutex_enter(&vd->vdev_dtl_lock);
3548 space_map_free(vd->vdev_dtl_sm, tx);
3549 space_map_close(vd->vdev_dtl_sm);
3550 vd->vdev_dtl_sm = NULL;
3551 mutex_exit(&vd->vdev_dtl_lock);
3552
3553 /*
3554 * We only destroy the leaf ZAP for detached leaves or for
3555 * removed log devices. Removed data devices handle leaf ZAP
3556 * cleanup later, once cancellation is no longer possible.
3557 */
3558 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
3559 vd->vdev_top->vdev_islog)) {
3560 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
3561 vd->vdev_leaf_zap = 0;
3562 }
3563
3564 dmu_tx_commit(tx);
3565 return;
3566 }
3567
3568 if (vd->vdev_dtl_sm == NULL) {
3569 uint64_t new_object;
3570
3571 new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
3572 VERIFY3U(new_object, !=, 0);
3573
3574 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
3575 0, -1ULL, 0));
3576 ASSERT(vd->vdev_dtl_sm != NULL);
3577 }
3578
3579 rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
3580
3581 mutex_enter(&vd->vdev_dtl_lock);
3582 zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
3583 mutex_exit(&vd->vdev_dtl_lock);
3584
3585 space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
3586 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
3587 zfs_range_tree_vacate(rtsync, NULL, NULL);
3588
3589 zfs_range_tree_destroy(rtsync);
3590
3591 /*
3592 * If the object for the space map has changed then dirty
3593 * the top level so that we update the config.
3594 */
3595 if (object != space_map_object(vd->vdev_dtl_sm)) {
3596 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
3597 "new object %llu", (u_longlong_t)txg, spa_name(spa),
3598 (u_longlong_t)object,
3599 (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
3600 vdev_config_dirty(vd->vdev_top);
3601 }
3602
3603 dmu_tx_commit(tx);
3604 }
3605
3606 /*
3607 * Determine whether the specified vdev can be
3608 * - offlined
3609 * - detached
3610 * - removed
3611 * - faulted
3612 * without losing data.
3613 */
3614 boolean_t
vdev_dtl_required(vdev_t * vd)3615 vdev_dtl_required(vdev_t *vd)
3616 {
3617 spa_t *spa = vd->vdev_spa;
3618 vdev_t *tvd = vd->vdev_top;
3619 uint8_t cant_read = vd->vdev_cant_read;
3620 boolean_t required;
3621 boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED;
3622
3623 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3624
3625 if (vd == spa->spa_root_vdev || vd == tvd)
3626 return (B_TRUE);
3627
3628 /*
3629 * Temporarily mark the device as unreadable, and then determine
3630 * whether this results in any DTL outages in the top-level vdev.
3631 * If not, we can safely offline/detach/remove the device.
3632 */
3633 vd->vdev_cant_read = B_TRUE;
3634 vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting);
3635 required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
3636 vd->vdev_cant_read = cant_read;
3637 vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting);
3638
3639 if (!required && zio_injection_enabled) {
3640 required = !!zio_handle_device_injection(vd, NULL,
3641 SET_ERROR(ECHILD));
3642 }
3643
3644 return (required);
3645 }
3646
3647 /*
3648 * Determine if resilver is needed, and if so the txg range.
3649 */
3650 boolean_t
vdev_resilver_needed(vdev_t * vd,uint64_t * minp,uint64_t * maxp)3651 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
3652 {
3653 boolean_t needed = B_FALSE;
3654 uint64_t thismin = UINT64_MAX;
3655 uint64_t thismax = 0;
3656
3657 if (vd->vdev_children == 0) {
3658 mutex_enter(&vd->vdev_dtl_lock);
3659 if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
3660 vdev_writeable(vd)) {
3661
3662 thismin = vdev_dtl_min(vd);
3663 thismax = vdev_dtl_max(vd);
3664 needed = B_TRUE;
3665 }
3666 mutex_exit(&vd->vdev_dtl_lock);
3667 } else {
3668 for (int c = 0; c < vd->vdev_children; c++) {
3669 vdev_t *cvd = vd->vdev_child[c];
3670 uint64_t cmin, cmax;
3671
3672 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
3673 thismin = MIN(thismin, cmin);
3674 thismax = MAX(thismax, cmax);
3675 needed = B_TRUE;
3676 }
3677 }
3678 }
3679
3680 if (needed && minp) {
3681 *minp = thismin;
3682 *maxp = thismax;
3683 }
3684 return (needed);
3685 }
3686
3687 /*
3688 * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj
3689 * will contain either the checkpoint spacemap object or zero if none exists.
3690 * All other errors are returned to the caller.
3691 */
3692 int
vdev_checkpoint_sm_object(vdev_t * vd,uint64_t * sm_obj)3693 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
3694 {
3695 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
3696
3697 if (vd->vdev_top_zap == 0) {
3698 *sm_obj = 0;
3699 return (0);
3700 }
3701
3702 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
3703 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
3704 if (error == ENOENT) {
3705 *sm_obj = 0;
3706 error = 0;
3707 }
3708
3709 return (error);
3710 }
3711
3712 int
vdev_load(vdev_t * vd)3713 vdev_load(vdev_t *vd)
3714 {
3715 int children = vd->vdev_children;
3716 int error = 0;
3717 taskq_t *tq = NULL;
3718
3719 /*
3720 * It's only worthwhile to use the taskq for the root vdev, because the
3721 * slow part is metaslab_init, and that only happens for top-level
3722 * vdevs.
3723 */
3724 if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
3725 tq = taskq_create("vdev_load", children, minclsyspri,
3726 children, children, TASKQ_PREPOPULATE);
3727 }
3728
3729 /*
3730 * Recursively load all children.
3731 */
3732 for (int c = 0; c < vd->vdev_children; c++) {
3733 vdev_t *cvd = vd->vdev_child[c];
3734
3735 if (tq == NULL || vdev_uses_zvols(cvd)) {
3736 cvd->vdev_load_error = vdev_load(cvd);
3737 } else {
3738 VERIFY(taskq_dispatch(tq, vdev_load_child,
3739 cvd, TQ_SLEEP) != TASKQID_INVALID);
3740 }
3741 }
3742
3743 if (tq != NULL) {
3744 taskq_wait(tq);
3745 taskq_destroy(tq);
3746 }
3747
3748 for (int c = 0; c < vd->vdev_children; c++) {
3749 int error = vd->vdev_child[c]->vdev_load_error;
3750
3751 if (error != 0)
3752 return (error);
3753 }
3754
3755 vdev_set_deflate_ratio(vd);
3756
3757 if (vd->vdev_ops == &vdev_raidz_ops) {
3758 error = vdev_raidz_load(vd);
3759 if (error != 0)
3760 return (error);
3761 }
3762
3763 /*
3764 * On spa_load path, grab the allocation bias from our zap
3765 */
3766 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3767 spa_t *spa = vd->vdev_spa;
3768 char bias_str[64];
3769
3770 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3771 VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
3772 bias_str);
3773 if (error == 0) {
3774 ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
3775 vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
3776 } else if (error != ENOENT) {
3777 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3778 VDEV_AUX_CORRUPT_DATA);
3779 vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
3780 "failed [error=%d]",
3781 (u_longlong_t)vd->vdev_top_zap, error);
3782 return (error);
3783 }
3784 }
3785
3786 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3787 spa_t *spa = vd->vdev_spa;
3788 uint64_t failfast;
3789
3790 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3791 vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
3792 1, &failfast);
3793 if (error == 0) {
3794 vd->vdev_failfast = failfast & 1;
3795 } else if (error == ENOENT) {
3796 vd->vdev_failfast = vdev_prop_default_numeric(
3797 VDEV_PROP_FAILFAST);
3798 } else {
3799 vdev_dbgmsg(vd,
3800 "vdev_load: zap_lookup(top_zap=%llu) "
3801 "failed [error=%d]",
3802 (u_longlong_t)vd->vdev_top_zap, error);
3803 }
3804 }
3805
3806 /*
3807 * Load any rebuild state from the top-level vdev zap.
3808 */
3809 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3810 error = vdev_rebuild_load(vd);
3811 if (error && error != ENOTSUP) {
3812 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3813 VDEV_AUX_CORRUPT_DATA);
3814 vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
3815 "failed [error=%d]", error);
3816 return (error);
3817 }
3818 }
3819
3820 if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
3821 uint64_t zapobj;
3822
3823 if (vd->vdev_top_zap != 0)
3824 zapobj = vd->vdev_top_zap;
3825 else
3826 zapobj = vd->vdev_leaf_zap;
3827
3828 error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
3829 &vd->vdev_checksum_n);
3830 if (error && error != ENOENT)
3831 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3832 "failed [error=%d]", (u_longlong_t)zapobj, error);
3833
3834 error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
3835 &vd->vdev_checksum_t);
3836 if (error && error != ENOENT)
3837 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3838 "failed [error=%d]", (u_longlong_t)zapobj, error);
3839
3840 error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
3841 &vd->vdev_io_n);
3842 if (error && error != ENOENT)
3843 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3844 "failed [error=%d]", (u_longlong_t)zapobj, error);
3845
3846 error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
3847 &vd->vdev_io_t);
3848 if (error && error != ENOENT)
3849 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3850 "failed [error=%d]", (u_longlong_t)zapobj, error);
3851
3852 error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
3853 &vd->vdev_slow_io_n);
3854 if (error && error != ENOENT)
3855 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3856 "failed [error=%d]", (u_longlong_t)zapobj, error);
3857
3858 error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
3859 &vd->vdev_slow_io_t);
3860 if (error && error != ENOENT)
3861 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3862 "failed [error=%d]", (u_longlong_t)zapobj, error);
3863 }
3864
3865 /*
3866 * If this is a top-level vdev, initialize its metaslabs.
3867 */
3868 if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
3869 vdev_metaslab_group_create(vd);
3870
3871 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
3872 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3873 VDEV_AUX_CORRUPT_DATA);
3874 vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
3875 "asize=%llu", (u_longlong_t)vd->vdev_ashift,
3876 (u_longlong_t)vd->vdev_asize);
3877 return (SET_ERROR(ENXIO));
3878 }
3879
3880 error = vdev_metaslab_init(vd, 0);
3881 if (error != 0) {
3882 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
3883 "[error=%d]", error);
3884 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3885 VDEV_AUX_CORRUPT_DATA);
3886 return (error);
3887 }
3888
3889 uint64_t checkpoint_sm_obj;
3890 error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
3891 if (error == 0 && checkpoint_sm_obj != 0) {
3892 objset_t *mos = spa_meta_objset(vd->vdev_spa);
3893 ASSERT(vd->vdev_asize != 0);
3894 ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
3895
3896 error = space_map_open(&vd->vdev_checkpoint_sm,
3897 mos, checkpoint_sm_obj, 0, vd->vdev_asize,
3898 vd->vdev_ashift);
3899 if (error != 0) {
3900 vdev_dbgmsg(vd, "vdev_load: space_map_open "
3901 "failed for checkpoint spacemap (obj %llu) "
3902 "[error=%d]",
3903 (u_longlong_t)checkpoint_sm_obj, error);
3904 return (error);
3905 }
3906 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3907
3908 /*
3909 * Since the checkpoint_sm contains free entries
3910 * exclusively we can use space_map_allocated() to
3911 * indicate the cumulative checkpointed space that
3912 * has been freed.
3913 */
3914 vd->vdev_stat.vs_checkpoint_space =
3915 -space_map_allocated(vd->vdev_checkpoint_sm);
3916 vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
3917 vd->vdev_stat.vs_checkpoint_space;
3918 } else if (error != 0) {
3919 vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
3920 "checkpoint space map object from vdev ZAP "
3921 "[error=%d]", error);
3922 return (error);
3923 }
3924 }
3925
3926 /*
3927 * If this is a leaf vdev, load its DTL.
3928 */
3929 if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
3930 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3931 VDEV_AUX_CORRUPT_DATA);
3932 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
3933 "[error=%d]", error);
3934 return (error);
3935 }
3936
3937 uint64_t obsolete_sm_object;
3938 error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
3939 if (error == 0 && obsolete_sm_object != 0) {
3940 objset_t *mos = vd->vdev_spa->spa_meta_objset;
3941 ASSERT(vd->vdev_asize != 0);
3942 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
3943
3944 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
3945 obsolete_sm_object, 0, vd->vdev_asize, 0))) {
3946 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3947 VDEV_AUX_CORRUPT_DATA);
3948 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
3949 "obsolete spacemap (obj %llu) [error=%d]",
3950 (u_longlong_t)obsolete_sm_object, error);
3951 return (error);
3952 }
3953 } else if (error != 0) {
3954 vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
3955 "space map object from vdev ZAP [error=%d]", error);
3956 return (error);
3957 }
3958
3959 return (0);
3960 }
3961
3962 /*
3963 * The special vdev case is used for hot spares and l2cache devices. Its
3964 * sole purpose it to set the vdev state for the associated vdev. To do this,
3965 * we make sure that we can open the underlying device, then try to read the
3966 * label, and make sure that the label is sane and that it hasn't been
3967 * repurposed to another pool.
3968 */
3969 int
vdev_validate_aux(vdev_t * vd)3970 vdev_validate_aux(vdev_t *vd)
3971 {
3972 nvlist_t *label;
3973 uint64_t guid, version;
3974 uint64_t state;
3975
3976 if (!vdev_readable(vd))
3977 return (0);
3978
3979 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
3980 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
3981 VDEV_AUX_CORRUPT_DATA);
3982 return (-1);
3983 }
3984
3985 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
3986 !SPA_VERSION_IS_SUPPORTED(version) ||
3987 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
3988 guid != vd->vdev_guid ||
3989 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
3990 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
3991 VDEV_AUX_CORRUPT_DATA);
3992 nvlist_free(label);
3993 return (-1);
3994 }
3995
3996 /*
3997 * We don't actually check the pool state here. If it's in fact in
3998 * use by another pool, we update this fact on the fly when requested.
3999 */
4000 nvlist_free(label);
4001 return (0);
4002 }
4003
4004 static void
vdev_destroy_ms_flush_data(vdev_t * vd,dmu_tx_t * tx)4005 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
4006 {
4007 objset_t *mos = spa_meta_objset(vd->vdev_spa);
4008
4009 if (vd->vdev_top_zap == 0)
4010 return;
4011
4012 uint64_t object = 0;
4013 int err = zap_lookup(mos, vd->vdev_top_zap,
4014 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
4015 if (err == ENOENT)
4016 return;
4017 VERIFY0(err);
4018
4019 VERIFY0(dmu_object_free(mos, object, tx));
4020 VERIFY0(zap_remove(mos, vd->vdev_top_zap,
4021 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
4022 }
4023
4024 /*
4025 * Free the objects used to store this vdev's spacemaps, and the array
4026 * that points to them.
4027 */
4028 void
vdev_destroy_spacemaps(vdev_t * vd,dmu_tx_t * tx)4029 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
4030 {
4031 if (vd->vdev_ms_array == 0)
4032 return;
4033
4034 objset_t *mos = vd->vdev_spa->spa_meta_objset;
4035 uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
4036 size_t array_bytes = array_count * sizeof (uint64_t);
4037 uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
4038 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
4039 array_bytes, smobj_array, 0));
4040
4041 for (uint64_t i = 0; i < array_count; i++) {
4042 uint64_t smobj = smobj_array[i];
4043 if (smobj == 0)
4044 continue;
4045
4046 space_map_free_obj(mos, smobj, tx);
4047 }
4048
4049 kmem_free(smobj_array, array_bytes);
4050 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
4051 vdev_destroy_ms_flush_data(vd, tx);
4052 vd->vdev_ms_array = 0;
4053 }
4054
4055 static void
vdev_remove_empty_log(vdev_t * vd,uint64_t txg)4056 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
4057 {
4058 spa_t *spa = vd->vdev_spa;
4059
4060 ASSERT(vd->vdev_islog);
4061 ASSERT(vd == vd->vdev_top);
4062 ASSERT3U(txg, ==, spa_syncing_txg(spa));
4063
4064 dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
4065
4066 vdev_destroy_spacemaps(vd, tx);
4067 if (vd->vdev_top_zap != 0) {
4068 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
4069 vd->vdev_top_zap = 0;
4070 }
4071
4072 dmu_tx_commit(tx);
4073 }
4074
4075 void
vdev_sync_done(vdev_t * vd,uint64_t txg)4076 vdev_sync_done(vdev_t *vd, uint64_t txg)
4077 {
4078 metaslab_t *msp;
4079 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
4080
4081 ASSERT(vdev_is_concrete(vd));
4082
4083 while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
4084 != NULL)
4085 metaslab_sync_done(msp, txg);
4086
4087 if (reassess) {
4088 metaslab_sync_reassess(vd->vdev_mg);
4089 if (vd->vdev_log_mg != NULL)
4090 metaslab_sync_reassess(vd->vdev_log_mg);
4091 }
4092 }
4093
4094 void
vdev_sync(vdev_t * vd,uint64_t txg)4095 vdev_sync(vdev_t *vd, uint64_t txg)
4096 {
4097 spa_t *spa = vd->vdev_spa;
4098 vdev_t *lvd;
4099 metaslab_t *msp;
4100
4101 ASSERT3U(txg, ==, spa->spa_syncing_txg);
4102 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
4103 if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) {
4104 ASSERT(vd->vdev_removing ||
4105 vd->vdev_ops == &vdev_indirect_ops);
4106
4107 vdev_indirect_sync_obsolete(vd, tx);
4108
4109 /*
4110 * If the vdev is indirect, it can't have dirty
4111 * metaslabs or DTLs.
4112 */
4113 if (vd->vdev_ops == &vdev_indirect_ops) {
4114 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
4115 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
4116 dmu_tx_commit(tx);
4117 return;
4118 }
4119 }
4120
4121 ASSERT(vdev_is_concrete(vd));
4122
4123 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
4124 !vd->vdev_removing) {
4125 ASSERT(vd == vd->vdev_top);
4126 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
4127 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
4128 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
4129 ASSERT(vd->vdev_ms_array != 0);
4130 vdev_config_dirty(vd);
4131 }
4132
4133 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
4134 metaslab_sync(msp, txg);
4135 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
4136 }
4137
4138 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
4139 vdev_dtl_sync(lvd, txg);
4140
4141 /*
4142 * If this is an empty log device being removed, destroy the
4143 * metadata associated with it.
4144 */
4145 if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
4146 vdev_remove_empty_log(vd, txg);
4147
4148 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
4149 dmu_tx_commit(tx);
4150 }
4151 uint64_t
vdev_asize_to_psize_txg(vdev_t * vd,uint64_t asize,uint64_t txg)4152 vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg)
4153 {
4154 return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg));
4155 }
4156
4157 /*
4158 * Return the amount of space that should be (or was) allocated for the given
4159 * psize (compressed block size) in the given TXG. Note that for expanded
4160 * RAIDZ vdevs, the size allocated for older BP's may be larger. See
4161 * vdev_raidz_psize_to_asize().
4162 */
4163 uint64_t
vdev_psize_to_asize_txg(vdev_t * vd,uint64_t psize,uint64_t txg)4164 vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
4165 {
4166 return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg));
4167 }
4168
4169 uint64_t
vdev_psize_to_asize(vdev_t * vd,uint64_t psize)4170 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
4171 {
4172 return (vdev_psize_to_asize_txg(vd, psize, 0));
4173 }
4174
4175 /*
4176 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
4177 * not be opened, and no I/O is attempted.
4178 */
4179 int
vdev_fault(spa_t * spa,uint64_t guid,vdev_aux_t aux)4180 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
4181 {
4182 vdev_t *vd, *tvd;
4183
4184 spa_vdev_state_enter(spa, SCL_NONE);
4185
4186 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4187 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4188
4189 if (!vd->vdev_ops->vdev_op_leaf)
4190 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4191
4192 tvd = vd->vdev_top;
4193
4194 /*
4195 * If user did a 'zpool offline -f' then make the fault persist across
4196 * reboots.
4197 */
4198 if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
4199 /*
4200 * There are two kinds of forced faults: temporary and
4201 * persistent. Temporary faults go away at pool import, while
4202 * persistent faults stay set. Both types of faults can be
4203 * cleared with a zpool clear.
4204 *
4205 * We tell if a vdev is persistently faulted by looking at the
4206 * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at
4207 * import then it's a persistent fault. Otherwise, it's
4208 * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external"
4209 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This
4210 * tells vdev_config_generate() (which gets run later) to set
4211 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
4212 */
4213 vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
4214 vd->vdev_tmpoffline = B_FALSE;
4215 aux = VDEV_AUX_EXTERNAL;
4216 } else {
4217 vd->vdev_tmpoffline = B_TRUE;
4218 }
4219
4220 /*
4221 * We don't directly use the aux state here, but if we do a
4222 * vdev_reopen(), we need this value to be present to remember why we
4223 * were faulted.
4224 */
4225 vd->vdev_label_aux = aux;
4226
4227 /*
4228 * Faulted state takes precedence over degraded.
4229 */
4230 vd->vdev_delayed_close = B_FALSE;
4231 vd->vdev_faulted = 1ULL;
4232 vd->vdev_degraded = 0ULL;
4233 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
4234
4235 /*
4236 * If this device has the only valid copy of the data, then
4237 * back off and simply mark the vdev as degraded instead.
4238 */
4239 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
4240 vd->vdev_degraded = 1ULL;
4241 vd->vdev_faulted = 0ULL;
4242
4243 /*
4244 * If we reopen the device and it's not dead, only then do we
4245 * mark it degraded.
4246 */
4247 vdev_reopen(tvd);
4248
4249 if (vdev_readable(vd))
4250 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
4251 }
4252
4253 return (spa_vdev_state_exit(spa, vd, 0));
4254 }
4255
4256 /*
4257 * Mark the given vdev degraded. A degraded vdev is purely an indication to the
4258 * user that something is wrong. The vdev continues to operate as normal as far
4259 * as I/O is concerned.
4260 */
4261 int
vdev_degrade(spa_t * spa,uint64_t guid,vdev_aux_t aux)4262 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
4263 {
4264 vdev_t *vd;
4265
4266 spa_vdev_state_enter(spa, SCL_NONE);
4267
4268 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4269 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4270
4271 if (!vd->vdev_ops->vdev_op_leaf)
4272 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4273
4274 /*
4275 * If the vdev is already faulted, then don't do anything.
4276 */
4277 if (vd->vdev_faulted || vd->vdev_degraded)
4278 return (spa_vdev_state_exit(spa, NULL, 0));
4279
4280 vd->vdev_degraded = 1ULL;
4281 if (!vdev_is_dead(vd))
4282 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
4283 aux);
4284
4285 return (spa_vdev_state_exit(spa, vd, 0));
4286 }
4287
4288 int
vdev_remove_wanted(spa_t * spa,uint64_t guid)4289 vdev_remove_wanted(spa_t *spa, uint64_t guid)
4290 {
4291 vdev_t *vd;
4292
4293 spa_vdev_state_enter(spa, SCL_NONE);
4294
4295 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4296 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4297
4298 /*
4299 * If the vdev is already removed, or expanding which can trigger
4300 * repartition add/remove events, then don't do anything.
4301 */
4302 if (vd->vdev_removed || vd->vdev_expanding)
4303 return (spa_vdev_state_exit(spa, NULL, 0));
4304
4305 /*
4306 * Confirm the vdev has been removed, otherwise don't do anything.
4307 */
4308 if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
4309 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
4310
4311 vd->vdev_remove_wanted = B_TRUE;
4312 spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER);
4313
4314 return (spa_vdev_state_exit(spa, vd, 0));
4315 }
4316
4317
4318 /*
4319 * Online the given vdev.
4320 *
4321 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
4322 * spare device should be detached when the device finishes resilvering.
4323 * Second, the online should be treated like a 'test' online case, so no FMA
4324 * events are generated if the device fails to open.
4325 */
4326 int
vdev_online(spa_t * spa,uint64_t guid,uint64_t flags,vdev_state_t * newstate)4327 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
4328 {
4329 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
4330 boolean_t wasoffline;
4331 vdev_state_t oldstate;
4332
4333 spa_vdev_state_enter(spa, SCL_NONE);
4334
4335 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4336 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4337
4338 wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
4339 oldstate = vd->vdev_state;
4340
4341 tvd = vd->vdev_top;
4342 vd->vdev_offline = B_FALSE;
4343 vd->vdev_tmpoffline = B_FALSE;
4344 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
4345 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
4346
4347 /* XXX - L2ARC 1.0 does not support expansion */
4348 if (!vd->vdev_aux) {
4349 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
4350 pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
4351 spa->spa_autoexpand);
4352 vd->vdev_expansion_time = gethrestime_sec();
4353 }
4354
4355 vdev_reopen(tvd);
4356 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
4357
4358 if (!vd->vdev_aux) {
4359 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
4360 pvd->vdev_expanding = B_FALSE;
4361 }
4362
4363 if (newstate)
4364 *newstate = vd->vdev_state;
4365 if ((flags & ZFS_ONLINE_UNSPARE) &&
4366 !vdev_is_dead(vd) && vd->vdev_parent &&
4367 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4368 vd->vdev_parent->vdev_child[0] == vd)
4369 vd->vdev_unspare = B_TRUE;
4370
4371 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
4372
4373 /* XXX - L2ARC 1.0 does not support expansion */
4374 if (vd->vdev_aux)
4375 return (spa_vdev_state_exit(spa, vd, ENOTSUP));
4376 spa->spa_ccw_fail_time = 0;
4377 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
4378 }
4379
4380 /* Restart initializing if necessary */
4381 mutex_enter(&vd->vdev_initialize_lock);
4382 if (vdev_writeable(vd) &&
4383 vd->vdev_initialize_thread == NULL &&
4384 vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
4385 (void) vdev_initialize(vd);
4386 }
4387 mutex_exit(&vd->vdev_initialize_lock);
4388
4389 /*
4390 * Restart trimming if necessary. We do not restart trimming for cache
4391 * devices here. This is triggered by l2arc_rebuild_vdev()
4392 * asynchronously for the whole device or in l2arc_evict() as it evicts
4393 * space for upcoming writes.
4394 */
4395 mutex_enter(&vd->vdev_trim_lock);
4396 if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
4397 vd->vdev_trim_thread == NULL &&
4398 vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
4399 (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
4400 vd->vdev_trim_secure);
4401 }
4402 mutex_exit(&vd->vdev_trim_lock);
4403
4404 if (wasoffline ||
4405 (oldstate < VDEV_STATE_DEGRADED &&
4406 vd->vdev_state >= VDEV_STATE_DEGRADED)) {
4407 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
4408
4409 /*
4410 * Asynchronously detach spare vdev if resilver or
4411 * rebuild is not required
4412 */
4413 if (vd->vdev_unspare &&
4414 !dsl_scan_resilvering(spa->spa_dsl_pool) &&
4415 !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
4416 !vdev_rebuild_active(tvd))
4417 spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
4418 }
4419 return (spa_vdev_state_exit(spa, vd, 0));
4420 }
4421
4422 static int
vdev_offline_locked(spa_t * spa,uint64_t guid,uint64_t flags)4423 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
4424 {
4425 vdev_t *vd, *tvd;
4426 int error = 0;
4427 uint64_t generation;
4428 metaslab_group_t *mg;
4429
4430 top:
4431 spa_vdev_state_enter(spa, SCL_ALLOC);
4432
4433 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4434 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4435
4436 if (!vd->vdev_ops->vdev_op_leaf)
4437 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4438
4439 if (vd->vdev_ops == &vdev_draid_spare_ops)
4440 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4441
4442 tvd = vd->vdev_top;
4443 mg = tvd->vdev_mg;
4444 generation = spa->spa_config_generation + 1;
4445
4446 /*
4447 * If the device isn't already offline, try to offline it.
4448 */
4449 if (!vd->vdev_offline) {
4450 /*
4451 * If this device has the only valid copy of some data,
4452 * don't allow it to be offlined. Log devices are always
4453 * expendable.
4454 */
4455 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4456 vdev_dtl_required(vd))
4457 return (spa_vdev_state_exit(spa, NULL,
4458 SET_ERROR(EBUSY)));
4459
4460 /*
4461 * If the top-level is a slog and it has had allocations
4462 * then proceed. We check that the vdev's metaslab group
4463 * is not NULL since it's possible that we may have just
4464 * added this vdev but not yet initialized its metaslabs.
4465 */
4466 if (tvd->vdev_islog && mg != NULL) {
4467 /*
4468 * Prevent any future allocations.
4469 */
4470 ASSERT3P(tvd->vdev_log_mg, ==, NULL);
4471 metaslab_group_passivate(mg);
4472 (void) spa_vdev_state_exit(spa, vd, 0);
4473
4474 error = spa_reset_logs(spa);
4475
4476 /*
4477 * If the log device was successfully reset but has
4478 * checkpointed data, do not offline it.
4479 */
4480 if (error == 0 &&
4481 tvd->vdev_checkpoint_sm != NULL) {
4482 ASSERT3U(space_map_allocated(
4483 tvd->vdev_checkpoint_sm), !=, 0);
4484 error = ZFS_ERR_CHECKPOINT_EXISTS;
4485 }
4486
4487 spa_vdev_state_enter(spa, SCL_ALLOC);
4488
4489 /*
4490 * Check to see if the config has changed.
4491 */
4492 if (error || generation != spa->spa_config_generation) {
4493 metaslab_group_activate(mg);
4494 if (error)
4495 return (spa_vdev_state_exit(spa,
4496 vd, error));
4497 (void) spa_vdev_state_exit(spa, vd, 0);
4498 goto top;
4499 }
4500 ASSERT0(tvd->vdev_stat.vs_alloc);
4501 }
4502
4503 /*
4504 * Offline this device and reopen its top-level vdev.
4505 * If the top-level vdev is a log device then just offline
4506 * it. Otherwise, if this action results in the top-level
4507 * vdev becoming unusable, undo it and fail the request.
4508 */
4509 vd->vdev_offline = B_TRUE;
4510 vdev_reopen(tvd);
4511
4512 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4513 vdev_is_dead(tvd)) {
4514 vd->vdev_offline = B_FALSE;
4515 vdev_reopen(tvd);
4516 return (spa_vdev_state_exit(spa, NULL,
4517 SET_ERROR(EBUSY)));
4518 }
4519
4520 /*
4521 * Add the device back into the metaslab rotor so that
4522 * once we online the device it's open for business.
4523 */
4524 if (tvd->vdev_islog && mg != NULL)
4525 metaslab_group_activate(mg);
4526 }
4527
4528 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
4529
4530 return (spa_vdev_state_exit(spa, vd, 0));
4531 }
4532
4533 int
vdev_offline(spa_t * spa,uint64_t guid,uint64_t flags)4534 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
4535 {
4536 int error;
4537
4538 mutex_enter(&spa->spa_vdev_top_lock);
4539 error = vdev_offline_locked(spa, guid, flags);
4540 mutex_exit(&spa->spa_vdev_top_lock);
4541
4542 return (error);
4543 }
4544
4545 /*
4546 * Clear the error counts associated with this vdev. Unlike vdev_online() and
4547 * vdev_offline(), we assume the spa config is locked. We also clear all
4548 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
4549 */
4550 void
vdev_clear(spa_t * spa,vdev_t * vd)4551 vdev_clear(spa_t *spa, vdev_t *vd)
4552 {
4553 vdev_t *rvd = spa->spa_root_vdev;
4554
4555 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
4556
4557 if (vd == NULL)
4558 vd = rvd;
4559
4560 vd->vdev_stat.vs_read_errors = 0;
4561 vd->vdev_stat.vs_write_errors = 0;
4562 vd->vdev_stat.vs_checksum_errors = 0;
4563 vd->vdev_stat.vs_dio_verify_errors = 0;
4564 vd->vdev_stat.vs_slow_ios = 0;
4565
4566 for (int c = 0; c < vd->vdev_children; c++)
4567 vdev_clear(spa, vd->vdev_child[c]);
4568
4569 /*
4570 * It makes no sense to "clear" an indirect or removed vdev.
4571 */
4572 if (!vdev_is_concrete(vd) || vd->vdev_removed)
4573 return;
4574
4575 /*
4576 * If we're in the FAULTED state or have experienced failed I/O, then
4577 * clear the persistent state and attempt to reopen the device. We
4578 * also mark the vdev config dirty, so that the new faulted state is
4579 * written out to disk.
4580 */
4581 if (vd->vdev_faulted || vd->vdev_degraded ||
4582 !vdev_readable(vd) || !vdev_writeable(vd)) {
4583 /*
4584 * When reopening in response to a clear event, it may be due to
4585 * a fmadm repair request. In this case, if the device is
4586 * still broken, we want to still post the ereport again.
4587 */
4588 vd->vdev_forcefault = B_TRUE;
4589
4590 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
4591 vd->vdev_cant_read = B_FALSE;
4592 vd->vdev_cant_write = B_FALSE;
4593 vd->vdev_stat.vs_aux = 0;
4594
4595 vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
4596
4597 vd->vdev_forcefault = B_FALSE;
4598
4599 if (vd != rvd && vdev_writeable(vd->vdev_top))
4600 vdev_state_dirty(vd->vdev_top);
4601
4602 /* If a resilver isn't required, check if vdevs can be culled */
4603 if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
4604 !dsl_scan_resilvering(spa->spa_dsl_pool) &&
4605 !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
4606 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
4607
4608 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
4609 }
4610
4611 /*
4612 * When clearing a FMA-diagnosed fault, we always want to
4613 * unspare the device, as we assume that the original spare was
4614 * done in response to the FMA fault.
4615 */
4616 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
4617 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4618 vd->vdev_parent->vdev_child[0] == vd)
4619 vd->vdev_unspare = B_TRUE;
4620
4621 /* Clear recent error events cache (i.e. duplicate events tracking) */
4622 zfs_ereport_clear(spa, vd);
4623 }
4624
4625 boolean_t
vdev_is_dead(vdev_t * vd)4626 vdev_is_dead(vdev_t *vd)
4627 {
4628 /*
4629 * Holes and missing devices are always considered "dead".
4630 * This simplifies the code since we don't have to check for
4631 * these types of devices in the various code paths.
4632 * Instead we rely on the fact that we skip over dead devices
4633 * before issuing I/O to them.
4634 */
4635 return (vd->vdev_state < VDEV_STATE_DEGRADED ||
4636 vd->vdev_ops == &vdev_hole_ops ||
4637 vd->vdev_ops == &vdev_missing_ops);
4638 }
4639
4640 boolean_t
vdev_readable(vdev_t * vd)4641 vdev_readable(vdev_t *vd)
4642 {
4643 return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
4644 }
4645
4646 boolean_t
vdev_writeable(vdev_t * vd)4647 vdev_writeable(vdev_t *vd)
4648 {
4649 return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
4650 vdev_is_concrete(vd));
4651 }
4652
4653 boolean_t
vdev_allocatable(vdev_t * vd)4654 vdev_allocatable(vdev_t *vd)
4655 {
4656 uint64_t state = vd->vdev_state;
4657
4658 /*
4659 * We currently allow allocations from vdevs which may be in the
4660 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
4661 * fails to reopen then we'll catch it later when we're holding
4662 * the proper locks. Note that we have to get the vdev state
4663 * in a local variable because although it changes atomically,
4664 * we're asking two separate questions about it.
4665 */
4666 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
4667 !vd->vdev_cant_write && vdev_is_concrete(vd) &&
4668 vd->vdev_mg->mg_initialized);
4669 }
4670
4671 boolean_t
vdev_accessible(vdev_t * vd,zio_t * zio)4672 vdev_accessible(vdev_t *vd, zio_t *zio)
4673 {
4674 ASSERT(zio->io_vd == vd);
4675
4676 if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
4677 return (B_FALSE);
4678
4679 if (zio->io_type == ZIO_TYPE_READ)
4680 return (!vd->vdev_cant_read);
4681
4682 if (zio->io_type == ZIO_TYPE_WRITE)
4683 return (!vd->vdev_cant_write);
4684
4685 return (B_TRUE);
4686 }
4687
4688 static void
vdev_get_child_stat(vdev_t * cvd,vdev_stat_t * vs,vdev_stat_t * cvs)4689 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
4690 {
4691 /*
4692 * Exclude the dRAID spare when aggregating to avoid double counting
4693 * the ops and bytes. These IOs are counted by the physical leaves.
4694 */
4695 if (cvd->vdev_ops == &vdev_draid_spare_ops)
4696 return;
4697
4698 for (int t = 0; t < VS_ZIO_TYPES; t++) {
4699 vs->vs_ops[t] += cvs->vs_ops[t];
4700 vs->vs_bytes[t] += cvs->vs_bytes[t];
4701 }
4702
4703 cvs->vs_scan_removing = cvd->vdev_removing;
4704 }
4705
4706 /*
4707 * Get extended stats
4708 */
4709 static void
vdev_get_child_stat_ex(vdev_t * cvd,vdev_stat_ex_t * vsx,vdev_stat_ex_t * cvsx)4710 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
4711 {
4712 (void) cvd;
4713
4714 int t, b;
4715 for (t = 0; t < ZIO_TYPES; t++) {
4716 for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
4717 vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
4718
4719 for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
4720 vsx->vsx_total_histo[t][b] +=
4721 cvsx->vsx_total_histo[t][b];
4722 }
4723 }
4724
4725 for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4726 for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
4727 vsx->vsx_queue_histo[t][b] +=
4728 cvsx->vsx_queue_histo[t][b];
4729 }
4730 vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
4731 vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
4732
4733 for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
4734 vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];
4735
4736 for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
4737 vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
4738 }
4739
4740 }
4741
4742 boolean_t
vdev_is_spacemap_addressable(vdev_t * vd)4743 vdev_is_spacemap_addressable(vdev_t *vd)
4744 {
4745 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
4746 return (B_TRUE);
4747
4748 /*
4749 * If double-word space map entries are not enabled we assume
4750 * 47 bits of the space map entry are dedicated to the entry's
4751 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
4752 * to calculate the maximum address that can be described by a
4753 * space map entry for the given device.
4754 */
4755 uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
4756
4757 if (shift >= 63) /* detect potential overflow */
4758 return (B_TRUE);
4759
4760 return (vd->vdev_asize < (1ULL << shift));
4761 }
4762
4763 /*
4764 * Get statistics for the given vdev.
4765 */
4766 static void
vdev_get_stats_ex_impl(vdev_t * vd,vdev_stat_t * vs,vdev_stat_ex_t * vsx)4767 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4768 {
4769 int t;
4770 /*
4771 * If we're getting stats on the root vdev, aggregate the I/O counts
4772 * over all top-level vdevs (i.e. the direct children of the root).
4773 */
4774 if (!vd->vdev_ops->vdev_op_leaf) {
4775 if (vs) {
4776 memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
4777 memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
4778 }
4779 if (vsx)
4780 memset(vsx, 0, sizeof (*vsx));
4781
4782 for (int c = 0; c < vd->vdev_children; c++) {
4783 vdev_t *cvd = vd->vdev_child[c];
4784 vdev_stat_t *cvs = &cvd->vdev_stat;
4785 vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
4786
4787 vdev_get_stats_ex_impl(cvd, cvs, cvsx);
4788 if (vs)
4789 vdev_get_child_stat(cvd, vs, cvs);
4790 if (vsx)
4791 vdev_get_child_stat_ex(cvd, vsx, cvsx);
4792 }
4793 } else {
4794 /*
4795 * We're a leaf. Just copy our ZIO active queue stats in. The
4796 * other leaf stats are updated in vdev_stat_update().
4797 */
4798 if (!vsx)
4799 return;
4800
4801 memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
4802
4803 for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4804 vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
4805 vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
4806 }
4807 }
4808 }
4809
4810 void
vdev_get_stats_ex(vdev_t * vd,vdev_stat_t * vs,vdev_stat_ex_t * vsx)4811 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4812 {
4813 vdev_t *tvd = vd->vdev_top;
4814 mutex_enter(&vd->vdev_stat_lock);
4815 if (vs) {
4816 memcpy(vs, &vd->vdev_stat, sizeof (*vs));
4817 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
4818 vs->vs_state = vd->vdev_state;
4819 vs->vs_rsize = vdev_get_min_asize(vd);
4820
4821 if (vd->vdev_ops->vdev_op_leaf) {
4822 vs->vs_pspace = vd->vdev_psize;
4823 vs->vs_rsize += VDEV_LABEL_START_SIZE +
4824 VDEV_LABEL_END_SIZE;
4825 /*
4826 * Report initializing progress. Since we don't
4827 * have the initializing locks held, this is only
4828 * an estimate (although a fairly accurate one).
4829 */
4830 vs->vs_initialize_bytes_done =
4831 vd->vdev_initialize_bytes_done;
4832 vs->vs_initialize_bytes_est =
4833 vd->vdev_initialize_bytes_est;
4834 vs->vs_initialize_state = vd->vdev_initialize_state;
4835 vs->vs_initialize_action_time =
4836 vd->vdev_initialize_action_time;
4837
4838 /*
4839 * Report manual TRIM progress. Since we don't have
4840 * the manual TRIM locks held, this is only an
4841 * estimate (although fairly accurate one).
4842 */
4843 vs->vs_trim_notsup = !vd->vdev_has_trim;
4844 vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
4845 vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
4846 vs->vs_trim_state = vd->vdev_trim_state;
4847 vs->vs_trim_action_time = vd->vdev_trim_action_time;
4848
4849 /* Set when there is a deferred resilver. */
4850 vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
4851 }
4852
4853 /*
4854 * Report expandable space on top-level, non-auxiliary devices
4855 * only. The expandable space is reported in terms of metaslab
4856 * sized units since that determines how much space the pool
4857 * can expand.
4858 */
4859 if (vd->vdev_aux == NULL && tvd != NULL) {
4860 vs->vs_esize = P2ALIGN_TYPED(
4861 vd->vdev_max_asize - vd->vdev_asize,
4862 1ULL << tvd->vdev_ms_shift, uint64_t);
4863 }
4864
4865 vs->vs_configured_ashift = vd->vdev_top != NULL
4866 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
4867 vs->vs_logical_ashift = vd->vdev_logical_ashift;
4868 if (vd->vdev_physical_ashift <= ASHIFT_MAX)
4869 vs->vs_physical_ashift = vd->vdev_physical_ashift;
4870 else
4871 vs->vs_physical_ashift = 0;
4872
4873 /*
4874 * Report fragmentation and rebuild progress for top-level,
4875 * non-auxiliary, concrete devices.
4876 */
4877 if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
4878 vdev_is_concrete(vd)) {
4879 /*
4880 * The vdev fragmentation rating doesn't take into
4881 * account the embedded slog metaslab (vdev_log_mg).
4882 * Since it's only one metaslab, it would have a tiny
4883 * impact on the overall fragmentation.
4884 */
4885 vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
4886 vd->vdev_mg->mg_fragmentation : 0;
4887 }
4888 vs->vs_noalloc = MAX(vd->vdev_noalloc,
4889 tvd ? tvd->vdev_noalloc : 0);
4890 }
4891
4892 vdev_get_stats_ex_impl(vd, vs, vsx);
4893 mutex_exit(&vd->vdev_stat_lock);
4894 }
4895
4896 void
vdev_get_stats(vdev_t * vd,vdev_stat_t * vs)4897 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
4898 {
4899 return (vdev_get_stats_ex(vd, vs, NULL));
4900 }
4901
4902 void
vdev_clear_stats(vdev_t * vd)4903 vdev_clear_stats(vdev_t *vd)
4904 {
4905 mutex_enter(&vd->vdev_stat_lock);
4906 vd->vdev_stat.vs_space = 0;
4907 vd->vdev_stat.vs_dspace = 0;
4908 vd->vdev_stat.vs_alloc = 0;
4909 mutex_exit(&vd->vdev_stat_lock);
4910 }
4911
4912 void
vdev_scan_stat_init(vdev_t * vd)4913 vdev_scan_stat_init(vdev_t *vd)
4914 {
4915 vdev_stat_t *vs = &vd->vdev_stat;
4916
4917 for (int c = 0; c < vd->vdev_children; c++)
4918 vdev_scan_stat_init(vd->vdev_child[c]);
4919
4920 mutex_enter(&vd->vdev_stat_lock);
4921 vs->vs_scan_processed = 0;
4922 mutex_exit(&vd->vdev_stat_lock);
4923 }
4924
/*
 * Fold the outcome of a completed zio into the statistics of the vdev it
 * was issued to (the root vdev is used when zio->io_vd is NULL).
 *
 * zio   - the completed i/o
 * psize - byte count added to the byte/processed counters
 *
 * Successful i/o (io_error == 0): while holding the vdev's vdev_stat_lock,
 * update the repair-related counters (scan, rebuild, self-heal) and, for
 * leaf vdevs whose priority is below ZIO_PRIORITY_NUM_QUEUEABLE, the
 * per-type ops/bytes counters and the request-size and latency histograms.
 * Gang leaders, root i/os and ZIO_FLAG_IO_BYPASS i/os are not counted.
 *
 * Failed i/o: speculative i/os, EIO failures without ZIO_FLAG_IO_RETRY and
 * vdev-less i/os flagged ZIO_FLAG_DONT_PROPAGATE are ignored.  For the
 * remaining failed writes with a non-zero txg, the txg is recorded in the
 * vdev's DTLs (see the comment in the body for which writes qualify).
 */
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	/* An i/o without a vdev is accounted against the root vdev. */
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
/* Suppress ASAN false positive */
#ifdef __SANITIZE_ADDRESS__
	vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
	vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
#else
	vdev_stat_t *vs = &vd->vdev_stat;
	vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
#endif
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		/* Everything below, up to the matching exit, is locked. */
		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			/*
			 * Repair is the result of a resilver issued by the
			 * scan thread (spa_sync).
			 */
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
				dsl_scan_phys_t *scn_phys = &scn->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				/*
				 * The pool-wide total is bumped for leaves
				 * only; the per-vdev counter is bumped for
				 * every vdev that sees the repair.
				 */
				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			/*
			 * Repair is the result of a rebuild issued by the
			 * rebuild thread (vdev_rebuild_thread).  To avoid
			 * double counting repaired bytes the virtual dRAID
			 * spare vdev is excluded from the processed bytes.
			 */
			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
				vdev_t *tvd = vd->vdev_top;
				vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;

				if (vd->vdev_ops->vdev_op_leaf &&
				    vd->vdev_ops != &vdev_draid_spare_ops) {
					atomic_add_64(rebuilt, psize);
				}
				vs->vs_rebuild_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		/*
		 * The bytes/ops/histograms are recorded at the leaf level and
		 * aggregated into the higher level vdevs in vdev_get_stats().
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
			zio_type_t vs_type = type;
			zio_priority_t priority = zio->io_priority;

			/*
			 * TRIM ops and bytes are reported to user space as
			 * ZIO_TYPE_FLUSH.  This is done to preserve the
			 * vdev_stat_t structure layout for user space.
			 */
			if (type == ZIO_TYPE_TRIM)
				vs_type = ZIO_TYPE_FLUSH;

			/*
			 * Solely for the purposes of 'zpool iostat -lqrw'
			 * reporting use the priority to categorize the IO.
			 * Only the following are reported to user space:
			 *
			 *   ZIO_PRIORITY_SYNC_READ,
			 *   ZIO_PRIORITY_SYNC_WRITE,
			 *   ZIO_PRIORITY_ASYNC_READ,
			 *   ZIO_PRIORITY_ASYNC_WRITE,
			 *   ZIO_PRIORITY_SCRUB,
			 *   ZIO_PRIORITY_TRIM,
			 *   ZIO_PRIORITY_REBUILD.
			 */
			if (priority == ZIO_PRIORITY_INITIALIZING) {
				ASSERT3U(type, ==, ZIO_TYPE_WRITE);
				priority = ZIO_PRIORITY_ASYNC_WRITE;
			} else if (priority == ZIO_PRIORITY_REMOVAL) {
				priority = ((type == ZIO_TYPE_WRITE) ?
				    ZIO_PRIORITY_ASYNC_WRITE :
				    ZIO_PRIORITY_ASYNC_READ);
			}

			/* ops/bytes use the (possibly remapped) vs_type. */
			vs->vs_ops[vs_type]++;
			vs->vs_bytes[vs_type] += psize;

			/*
			 * Request-size histograms: delegated i/os go to the
			 * "agg" histogram, all others to the "ind" one.
			 */
			if (flags & ZIO_FLAG_DELEGATED) {
				vsx->vsx_agg_histo[priority]
				    [RQ_HISTO(zio->io_size)]++;
			} else {
				vsx->vsx_ind_histo[priority]
				    [RQ_HISTO(zio->io_size)]++;
			}

			/*
			 * Latency histograms are only updated when both
			 * io_delta and io_delay are non-zero.  Note that the
			 * disk and total histograms are indexed by the
			 * original zio type, not by vs_type.
			 */
			if (zio->io_delta && zio->io_delay) {
				vsx->vsx_queue_histo[priority]
				    [L_HISTO(zio->io_delta - zio->io_delay)]++;
				vsx->vsx_disk_histo[type]
				    [L_HISTO(zio->io_delay)]++;
				vsx->vsx_total_histo[type]
				    [L_HISTO(zio->io_delta)]++;
			}
		}

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	/* From here on the i/o failed (io_error != 0). */
	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 *
	 * NOTE(review): the test is "EIO and ZIO_FLAG_IO_RETRY not set", i.e.
	 * presumably the first attempt, which is expected to be reissued --
	 * confirm against the zio retry logic.
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent log writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	if (type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			/* Nothing more to do if txg is already in DTL_MISSING. */
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			/* Mark this leaf and each ancestor below the root. */
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}
5139
5140 int64_t
vdev_deflated_space(vdev_t * vd,int64_t space)5141 vdev_deflated_space(vdev_t *vd, int64_t space)
5142 {
5143 ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
5144 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
5145
5146 return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
5147 }
5148
5149 /*
5150 * Update the in-core space usage stats for this vdev, its metaslab class,
5151 * and the root vdev.
5152 */
5153 void
vdev_space_update(vdev_t * vd,int64_t alloc_delta,int64_t defer_delta,int64_t space_delta)5154 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
5155 int64_t space_delta)
5156 {
5157 (void) defer_delta;
5158 int64_t dspace_delta;
5159 spa_t *spa = vd->vdev_spa;
5160 vdev_t *rvd = spa->spa_root_vdev;
5161
5162 ASSERT(vd == vd->vdev_top);
5163
5164 /*
5165 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
5166 * factor. We must calculate this here and not at the root vdev
5167 * because the root vdev's psize-to-asize is simply the max of its
5168 * children's, thus not accurate enough for us.
5169 */
5170 dspace_delta = vdev_deflated_space(vd, space_delta);
5171
5172 mutex_enter(&vd->vdev_stat_lock);
5173 /* ensure we won't underflow */
5174 if (alloc_delta < 0) {
5175 ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
5176 }
5177
5178 vd->vdev_stat.vs_alloc += alloc_delta;
5179 vd->vdev_stat.vs_space += space_delta;
5180 vd->vdev_stat.vs_dspace += dspace_delta;
5181 mutex_exit(&vd->vdev_stat_lock);
5182
5183 /* every class but log contributes to root space stats */
5184 if (vd->vdev_mg != NULL && !vd->vdev_islog) {
5185 ASSERT(!vd->vdev_isl2cache);
5186 mutex_enter(&rvd->vdev_stat_lock);
5187 rvd->vdev_stat.vs_alloc += alloc_delta;
5188 rvd->vdev_stat.vs_space += space_delta;
5189 rvd->vdev_stat.vs_dspace += dspace_delta;
5190 mutex_exit(&rvd->vdev_stat_lock);
5191 }
5192 /* Note: metaslab_class_space_update moved to metaslab_space_update */
5193 }
5194
5195 /*
5196 * Mark a top-level vdev's config as dirty, placing it on the dirty list
5197 * so that it will be written out next time the vdev configuration is synced.
5198 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
5199 */
5200 void
vdev_config_dirty(vdev_t * vd)5201 vdev_config_dirty(vdev_t *vd)
5202 {
5203 spa_t *spa = vd->vdev_spa;
5204 vdev_t *rvd = spa->spa_root_vdev;
5205 int c;
5206
5207 ASSERT(spa_writeable(spa));
5208
5209 /*
5210 * If this is an aux vdev (as with l2cache and spare devices), then we
5211 * update the vdev config manually and set the sync flag.
5212 */
5213 if (vd->vdev_aux != NULL) {
5214 spa_aux_vdev_t *sav = vd->vdev_aux;
5215 nvlist_t **aux;
5216 uint_t naux;
5217
5218 for (c = 0; c < sav->sav_count; c++) {
5219 if (sav->sav_vdevs[c] == vd)
5220 break;
5221 }
5222
5223 if (c == sav->sav_count) {
5224 /*
5225 * We're being removed. There's nothing more to do.
5226 */
5227 ASSERT(sav->sav_sync == B_TRUE);
5228 return;
5229 }
5230
5231 sav->sav_sync = B_TRUE;
5232
5233 if (nvlist_lookup_nvlist_array(sav->sav_config,
5234 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
5235 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
5236 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
5237 }
5238
5239 ASSERT(c < naux);
5240
5241 /*
5242 * Setting the nvlist in the middle if the array is a little
5243 * sketchy, but it will work.
5244 */
5245 nvlist_free(aux[c]);
5246 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
5247
5248 return;
5249 }
5250
5251 /*
5252 * The dirty list is protected by the SCL_CONFIG lock. The caller
5253 * must either hold SCL_CONFIG as writer, or must be the sync thread
5254 * (which holds SCL_CONFIG as reader). There's only one sync thread,
5255 * so this is sufficient to ensure mutual exclusion.
5256 */
5257 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
5258 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5259 spa_config_held(spa, SCL_CONFIG, RW_READER)));
5260
5261 if (vd == rvd) {
5262 for (c = 0; c < rvd->vdev_children; c++)
5263 vdev_config_dirty(rvd->vdev_child[c]);
5264 } else {
5265 ASSERT(vd == vd->vdev_top);
5266
5267 if (!list_link_active(&vd->vdev_config_dirty_node) &&
5268 vdev_is_concrete(vd)) {
5269 list_insert_head(&spa->spa_config_dirty_list, vd);
5270 }
5271 }
5272 }
5273
5274 void
vdev_config_clean(vdev_t * vd)5275 vdev_config_clean(vdev_t *vd)
5276 {
5277 spa_t *spa = vd->vdev_spa;
5278
5279 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
5280 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5281 spa_config_held(spa, SCL_CONFIG, RW_READER)));
5282
5283 ASSERT(list_link_active(&vd->vdev_config_dirty_node));
5284 list_remove(&spa->spa_config_dirty_list, vd);
5285 }
5286
5287 /*
5288 * Mark a top-level vdev's state as dirty, so that the next pass of
5289 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
5290 * the state changes from larger config changes because they require
5291 * much less locking, and are often needed for administrative actions.
5292 */
5293 void
vdev_state_dirty(vdev_t * vd)5294 vdev_state_dirty(vdev_t *vd)
5295 {
5296 spa_t *spa = vd->vdev_spa;
5297
5298 ASSERT(spa_writeable(spa));
5299 ASSERT(vd == vd->vdev_top);
5300
5301 /*
5302 * The state list is protected by the SCL_STATE lock. The caller
5303 * must either hold SCL_STATE as writer, or must be the sync thread
5304 * (which holds SCL_STATE as reader). There's only one sync thread,
5305 * so this is sufficient to ensure mutual exclusion.
5306 */
5307 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
5308 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5309 spa_config_held(spa, SCL_STATE, RW_READER)));
5310
5311 if (!list_link_active(&vd->vdev_state_dirty_node) &&
5312 vdev_is_concrete(vd))
5313 list_insert_head(&spa->spa_state_dirty_list, vd);
5314 }
5315
5316 void
vdev_state_clean(vdev_t * vd)5317 vdev_state_clean(vdev_t *vd)
5318 {
5319 spa_t *spa = vd->vdev_spa;
5320
5321 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
5322 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5323 spa_config_held(spa, SCL_STATE, RW_READER)));
5324
5325 ASSERT(list_link_active(&vd->vdev_state_dirty_node));
5326 list_remove(&spa->spa_state_dirty_list, vd);
5327 }
5328
5329 /*
5330 * Propagate vdev state up from children to parent.
5331 */
5332 void
vdev_propagate_state(vdev_t * vd)5333 vdev_propagate_state(vdev_t *vd)
5334 {
5335 spa_t *spa = vd->vdev_spa;
5336 vdev_t *rvd = spa->spa_root_vdev;
5337 int degraded = 0, faulted = 0;
5338 int corrupted = 0;
5339 vdev_t *child;
5340
5341 if (vd->vdev_children > 0) {
5342 for (int c = 0; c < vd->vdev_children; c++) {
5343 child = vd->vdev_child[c];
5344
5345 /*
5346 * Don't factor holes or indirect vdevs into the
5347 * decision.
5348 */
5349 if (!vdev_is_concrete(child))
5350 continue;
5351
5352 if (!vdev_readable(child) ||
5353 (!vdev_writeable(child) && spa_writeable(spa))) {
5354 /*
5355 * Root special: if there is a top-level log
5356 * device, treat the root vdev as if it were
5357 * degraded.
5358 */
5359 if (child->vdev_islog && vd == rvd)
5360 degraded++;
5361 else
5362 faulted++;
5363 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
5364 degraded++;
5365 }
5366
5367 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
5368 corrupted++;
5369 }
5370
5371 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
5372
5373 /*
5374 * Root special: if there is a top-level vdev that cannot be
5375 * opened due to corrupted metadata, then propagate the root
5376 * vdev's aux state as 'corrupt' rather than 'insufficient
5377 * replicas'.
5378 */
5379 if (corrupted && vd == rvd &&
5380 rvd->vdev_state == VDEV_STATE_CANT_OPEN)
5381 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
5382 VDEV_AUX_CORRUPT_DATA);
5383 }
5384
5385 if (vd->vdev_parent)
5386 vdev_propagate_state(vd->vdev_parent);
5387 }
5388
/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 *
 *	vd	vdev whose state is being set
 *	isopen	when set, the parent's state is not re-propagated
 *	state	the new vdev state
 *	aux	auxiliary reason code, stored in vdev_stat.vs_aux
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	/* State the vdev was in on entry; used for event reporting below. */
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	/* No state transition: only the aux code is refreshed. */
	if (state == vd->vdev_state) {
		/*
		 * Since vdev_offline() code path is already in an offline
		 * state we can miss a statechange event to OFFLINE. Check
		 * the previous state to catch this condition.
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (state == VDEV_STATE_OFFLINE) &&
		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
			/* post an offline state change */
			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
		}
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			/* Map the aux reason onto an ereport class. */
			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			case VDEV_AUX_BAD_ASHIFT:
				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
			    save_state);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/*
	 * Notify ZED of any significant state-change on a leaf vdev.
	 *
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		/* preserve original state from a vdev_reopen() */
		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
		    (vd->vdev_prevstate != vd->vdev_state) &&
		    (save_state <= VDEV_STATE_CLOSED))
			save_state = vd->vdev_prevstate;

		/* filter out state change due to initial vdev_open */
		if (save_state > VDEV_STATE_CLOSED)
			zfs_post_state_change(spa, vd, save_state);
	}

	/* Outside of an open, let the ancestors recompute their state. */
	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}
5541
5542 boolean_t
vdev_children_are_offline(vdev_t * vd)5543 vdev_children_are_offline(vdev_t *vd)
5544 {
5545 ASSERT(!vd->vdev_ops->vdev_op_leaf);
5546
5547 for (uint64_t i = 0; i < vd->vdev_children; i++) {
5548 if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
5549 return (B_FALSE);
5550 }
5551
5552 return (B_TRUE);
5553 }
5554
5555 /*
5556 * Check the vdev configuration to ensure that it's capable of supporting
5557 * a root pool. We do not support partial configuration.
5558 */
5559 boolean_t
vdev_is_bootable(vdev_t * vd)5560 vdev_is_bootable(vdev_t *vd)
5561 {
5562 if (!vd->vdev_ops->vdev_op_leaf) {
5563 const char *vdev_type = vd->vdev_ops->vdev_op_type;
5564
5565 if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0)
5566 return (B_FALSE);
5567 }
5568
5569 for (int c = 0; c < vd->vdev_children; c++) {
5570 if (!vdev_is_bootable(vd->vdev_child[c]))
5571 return (B_FALSE);
5572 }
5573 return (B_TRUE);
5574 }
5575
5576 boolean_t
vdev_is_concrete(vdev_t * vd)5577 vdev_is_concrete(vdev_t *vd)
5578 {
5579 vdev_ops_t *ops = vd->vdev_ops;
5580 if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
5581 ops == &vdev_missing_ops || ops == &vdev_root_ops) {
5582 return (B_FALSE);
5583 } else {
5584 return (B_TRUE);
5585 }
5586 }
5587
5588 /*
5589 * Determine if a log device has valid content. If the vdev was
5590 * removed or faulted in the MOS config then we know that
5591 * the content on the log device has already been written to the pool.
5592 */
5593 boolean_t
vdev_log_state_valid(vdev_t * vd)5594 vdev_log_state_valid(vdev_t *vd)
5595 {
5596 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
5597 !vd->vdev_removed)
5598 return (B_TRUE);
5599
5600 for (int c = 0; c < vd->vdev_children; c++)
5601 if (vdev_log_state_valid(vd->vdev_child[c]))
5602 return (B_TRUE);
5603
5604 return (B_FALSE);
5605 }
5606
5607 /*
5608 * Expand a vdev if possible.
5609 */
5610 void
vdev_expand(vdev_t * vd,uint64_t txg)5611 vdev_expand(vdev_t *vd, uint64_t txg)
5612 {
5613 ASSERT(vd->vdev_top == vd);
5614 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5615 ASSERT(vdev_is_concrete(vd));
5616
5617 vdev_set_deflate_ratio(vd);
5618
5619 if ((vd->vdev_spa->spa_raidz_expand == NULL ||
5620 vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
5621 (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
5622 vdev_is_concrete(vd)) {
5623 vdev_metaslab_group_create(vd);
5624 VERIFY(vdev_metaslab_init(vd, txg) == 0);
5625 vdev_config_dirty(vd);
5626 }
5627 }
5628
5629 /*
5630 * Split a vdev.
5631 */
5632 void
vdev_split(vdev_t * vd)5633 vdev_split(vdev_t *vd)
5634 {
5635 vdev_t *cvd, *pvd = vd->vdev_parent;
5636
5637 VERIFY3U(pvd->vdev_children, >, 1);
5638
5639 vdev_remove_child(pvd, vd);
5640 vdev_compact_children(pvd);
5641
5642 ASSERT3P(pvd->vdev_child, !=, NULL);
5643
5644 cvd = pvd->vdev_child[0];
5645 if (pvd->vdev_children == 1) {
5646 vdev_remove_parent(cvd);
5647 cvd->vdev_splitting = B_TRUE;
5648 }
5649 vdev_propagate_state(cvd);
5650 }
5651
5652 void
vdev_deadman(vdev_t * vd,const char * tag)5653 vdev_deadman(vdev_t *vd, const char *tag)
5654 {
5655 for (int c = 0; c < vd->vdev_children; c++) {
5656 vdev_t *cvd = vd->vdev_child[c];
5657
5658 vdev_deadman(cvd, tag);
5659 }
5660
5661 if (vd->vdev_ops->vdev_op_leaf) {
5662 vdev_queue_t *vq = &vd->vdev_queue;
5663
5664 mutex_enter(&vq->vq_lock);
5665 if (vq->vq_active > 0) {
5666 spa_t *spa = vd->vdev_spa;
5667 zio_t *fio;
5668 uint64_t delta;
5669
5670 zfs_dbgmsg("slow vdev: %s has %u active IOs",
5671 vd->vdev_path, vq->vq_active);
5672
5673 /*
5674 * Look at the head of all the pending queues,
5675 * if any I/O has been outstanding for longer than
5676 * the spa_deadman_synctime invoke the deadman logic.
5677 */
5678 fio = list_head(&vq->vq_active_list);
5679 delta = gethrtime() - fio->io_timestamp;
5680 if (delta > spa_deadman_synctime(spa))
5681 zio_deadman(fio, tag);
5682 }
5683 mutex_exit(&vq->vq_lock);
5684 }
5685 }
5686
/*
 * Mark a leaf vdev as having a deferred resilver, and record on its pool
 * (spa) that at least one resilver has been deferred.
 */
void
vdev_defer_resilver(vdev_t *vd)
{
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/* Per-vdev flag first, then the pool-wide flag. */
	vd->vdev_resilver_deferred = B_TRUE;
	vd->vdev_spa->spa_resilver_deferred = B_TRUE;
}
5695
5696 /*
5697 * Clears the resilver deferred flag on all leaf devs under vd. Returns
5698 * B_TRUE if we have devices that need to be resilvered and are available to
5699 * accept resilver I/Os.
5700 */
5701 boolean_t
vdev_clear_resilver_deferred(vdev_t * vd,dmu_tx_t * tx)5702 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
5703 {
5704 boolean_t resilver_needed = B_FALSE;
5705 spa_t *spa = vd->vdev_spa;
5706
5707 for (int c = 0; c < vd->vdev_children; c++) {
5708 vdev_t *cvd = vd->vdev_child[c];
5709 resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
5710 }
5711
5712 if (vd == spa->spa_root_vdev &&
5713 spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
5714 spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
5715 vdev_config_dirty(vd);
5716 spa->spa_resilver_deferred = B_FALSE;
5717 return (resilver_needed);
5718 }
5719
5720 if (!vdev_is_concrete(vd) || vd->vdev_aux ||
5721 !vd->vdev_ops->vdev_op_leaf)
5722 return (resilver_needed);
5723
5724 vd->vdev_resilver_deferred = B_FALSE;
5725
5726 return (!vdev_is_dead(vd) && !vd->vdev_offline &&
5727 vdev_resilver_needed(vd, NULL, NULL));
5728 }
5729
5730 boolean_t
vdev_xlate_is_empty(zfs_range_seg64_t * rs)5731 vdev_xlate_is_empty(zfs_range_seg64_t *rs)
5732 {
5733 return (rs->rs_start == rs->rs_end);
5734 }
5735
5736 /*
5737 * Translate a logical range to the first contiguous physical range for the
5738 * specified vdev_t. This function is initially called with a leaf vdev and
5739 * will walk each parent vdev until it reaches a top-level vdev. Once the
5740 * top-level is reached the physical range is initialized and the recursive
5741 * function begins to unwind. As it unwinds it calls the parent's vdev
5742 * specific translation function to do the real conversion.
5743 */
5744 void
vdev_xlate(vdev_t * vd,const zfs_range_seg64_t * logical_rs,zfs_range_seg64_t * physical_rs,zfs_range_seg64_t * remain_rs)5745 vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
5746 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
5747 {
5748 /*
5749 * Walk up the vdev tree
5750 */
5751 if (vd != vd->vdev_top) {
5752 vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
5753 remain_rs);
5754 } else {
5755 /*
5756 * We've reached the top-level vdev, initialize the physical
5757 * range to the logical range and set an empty remaining
5758 * range then start to unwind.
5759 */
5760 physical_rs->rs_start = logical_rs->rs_start;
5761 physical_rs->rs_end = logical_rs->rs_end;
5762
5763 remain_rs->rs_start = logical_rs->rs_start;
5764 remain_rs->rs_end = logical_rs->rs_start;
5765
5766 return;
5767 }
5768
5769 vdev_t *pvd = vd->vdev_parent;
5770 ASSERT3P(pvd, !=, NULL);
5771 ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
5772
5773 /*
5774 * As this recursive function unwinds, translate the logical
5775 * range into its physical and any remaining components by calling
5776 * the vdev specific translate function.
5777 */
5778 zfs_range_seg64_t intermediate = { 0 };
5779 pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
5780
5781 physical_rs->rs_start = intermediate.rs_start;
5782 physical_rs->rs_end = intermediate.rs_end;
5783 }
5784
5785 void
vdev_xlate_walk(vdev_t * vd,const zfs_range_seg64_t * logical_rs,vdev_xlate_func_t * func,void * arg)5786 vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
5787 vdev_xlate_func_t *func, void *arg)
5788 {
5789 zfs_range_seg64_t iter_rs = *logical_rs;
5790 zfs_range_seg64_t physical_rs;
5791 zfs_range_seg64_t remain_rs;
5792
5793 while (!vdev_xlate_is_empty(&iter_rs)) {
5794
5795 vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
5796
5797 /*
5798 * With raidz and dRAID, it's possible that the logical range
5799 * does not live on this leaf vdev. Only when there is a non-
5800 * zero physical size call the provided function.
5801 */
5802 if (!vdev_xlate_is_empty(&physical_rs))
5803 func(arg, &physical_rs);
5804
5805 iter_rs = remain_rs;
5806 }
5807 }
5808
5809 static char *
vdev_name(vdev_t * vd,char * buf,int buflen)5810 vdev_name(vdev_t *vd, char *buf, int buflen)
5811 {
5812 if (vd->vdev_path == NULL) {
5813 if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
5814 strlcpy(buf, vd->vdev_spa->spa_name, buflen);
5815 } else if (!vd->vdev_ops->vdev_op_leaf) {
5816 snprintf(buf, buflen, "%s-%llu",
5817 vd->vdev_ops->vdev_op_type,
5818 (u_longlong_t)vd->vdev_id);
5819 }
5820 } else {
5821 strlcpy(buf, vd->vdev_path, buflen);
5822 }
5823 return (buf);
5824 }
5825
5826 /*
5827 * Look at the vdev tree and determine whether any devices are currently being
5828 * replaced.
5829 */
5830 boolean_t
vdev_replace_in_progress(vdev_t * vdev)5831 vdev_replace_in_progress(vdev_t *vdev)
5832 {
5833 ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
5834
5835 if (vdev->vdev_ops == &vdev_replacing_ops)
5836 return (B_TRUE);
5837
5838 /*
5839 * A 'spare' vdev indicates that we have a replace in progress, unless
5840 * it has exactly two children, and the second, the hot spare, has
5841 * finished being resilvered.
5842 */
5843 if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 ||
5844 !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING)))
5845 return (B_TRUE);
5846
5847 for (int i = 0; i < vdev->vdev_children; i++) {
5848 if (vdev_replace_in_progress(vdev->vdev_child[i]))
5849 return (B_TRUE);
5850 }
5851
5852 return (B_FALSE);
5853 }
5854
5855 /*
5856 * Add a (source=src, propname=propval) list to an nvlist.
5857 */
5858 static void
vdev_prop_add_list(nvlist_t * nvl,const char * propname,const char * strval,uint64_t intval,zprop_source_t src)5859 vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
5860 uint64_t intval, zprop_source_t src)
5861 {
5862 nvlist_t *propval;
5863
5864 propval = fnvlist_alloc();
5865 fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
5866
5867 if (strval != NULL)
5868 fnvlist_add_string(propval, ZPROP_VALUE, strval);
5869 else
5870 fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
5871
5872 fnvlist_add_nvlist(nvl, propname, propval);
5873 nvlist_free(propval);
5874 }
5875
/*
 * Sync task that stores vdev property settings in the MOS.
 *
 * arg is an nvlist carrying the guid of the target vdev
 * (ZPOOL_VDEV_PROPS_SET_VDEV) and the nvlist of properties to apply
 * (ZPOOL_VDEV_PROPS_SET_PROPS).  Properties are written, in transaction
 * tx, to the first non-zero of the vdev's root, top and leaf ZAP objects.
 * The work is done while holding spa_props_lock, and each applied change
 * is recorded in the pool history.
 *
 * If the vdev can no longer be found by guid the function does nothing.
 */
static void
vdev_props_set_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *vd;
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;
	uint64_t vdev_guid;
	uint64_t objid;
	nvlist_t *nvprops;

	vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
	nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);

	/* this vdev could get removed while waiting for this sync task */
	if (vd == NULL)
		return;

	/*
	 * Set vdev property values in the vdev props mos object.
	 * The root ZAP is preferred, then the top ZAP, then the leaf ZAP.
	 */
	if (vd->vdev_root_zap != 0) {
		objid = vd->vdev_root_zap;
	} else if (vd->vdev_top_zap != 0) {
		objid = vd->vdev_top_zap;
	} else if (vd->vdev_leaf_zap != 0) {
		objid = vd->vdev_leaf_zap;
	} else {
		panic("unexpected vdev type");
	}

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
		uint64_t intval;
		const char *strval;
		vdev_prop_t prop;
		const char *propname = nvpair_name(elem);
		zprop_type_t proptype;

		switch (prop = vdev_name_to_prop(propname)) {
		case VDEV_PROP_USERPROP:
			/*
			 * Names that do not pass vdev_prop_user() are
			 * silently skipped here.
			 */
			if (vdev_prop_user(propname)) {
				strval = fnvpair_value_string(elem);
				if (strlen(strval) == 0) {
					/* remove the property if value == "" */
					(void) zap_remove(mos, objid, propname,
					    tx);
				} else {
					/* stored as a NUL-terminated string */
					VERIFY0(zap_update(mos, objid, propname,
					    1, strlen(strval) + 1, strval, tx));
				}
				spa_history_log_internal(spa, "vdev set", tx,
				    "vdev_guid=%llu: %s=%s",
				    (u_longlong_t)vdev_guid, nvpair_name(elem),
				    strval);
			}
			break;
		default:
			/* normalize the property name */
			propname = vdev_prop_to_name(prop);
			proptype = vdev_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				strval = fnvpair_value_string(elem);
				VERIFY0(zap_update(mos, objid, propname,
				    1, strlen(strval) + 1, strval, tx));
				spa_history_log_internal(spa, "vdev set", tx,
				    "vdev_guid=%llu: %s=%s",
				    (u_longlong_t)vdev_guid, nvpair_name(elem),
				    strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				intval = fnvpair_value_uint64(elem);

				/*
				 * For index properties, verify that the
				 * value maps to a known index string; the
				 * string itself is not needed.
				 */
				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY0(vdev_prop_index_to_string(
					    prop, intval, &unused));
				}
				VERIFY0(zap_update(mos, objid, propname,
				    sizeof (uint64_t), 1, &intval, tx));
				spa_history_log_internal(spa, "vdev set", tx,
				    "vdev_guid=%llu: %s=%lld",
				    (u_longlong_t)vdev_guid,
				    nvpair_name(elem), (longlong_t)intval);
			} else {
				/* only string and uint64 pairs are expected */
				panic("invalid vdev property type %u",
				    nvpair_type(elem));
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}
5974
5975 int
vdev_prop_set(vdev_t * vd,nvlist_t * innvl,nvlist_t * outnvl)5976 vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
5977 {
5978 spa_t *spa = vd->vdev_spa;
5979 nvpair_t *elem = NULL;
5980 uint64_t vdev_guid;
5981 nvlist_t *nvprops;
5982 int error = 0;
5983
5984 ASSERT(vd != NULL);
5985
5986 /* Check that vdev has a zap we can use */
5987 if (vd->vdev_root_zap == 0 &&
5988 vd->vdev_top_zap == 0 &&
5989 vd->vdev_leaf_zap == 0)
5990 return (SET_ERROR(EINVAL));
5991
5992 if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
5993 &vdev_guid) != 0)
5994 return (SET_ERROR(EINVAL));
5995
5996 if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
5997 &nvprops) != 0)
5998 return (SET_ERROR(EINVAL));
5999
6000 if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
6001 return (SET_ERROR(EINVAL));
6002
6003 while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
6004 const char *propname = nvpair_name(elem);
6005 vdev_prop_t prop = vdev_name_to_prop(propname);
6006 uint64_t intval = 0;
6007 const char *strval = NULL;
6008
6009 if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
6010 error = EINVAL;
6011 goto end;
6012 }
6013
6014 if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) {
6015 error = EROFS;
6016 goto end;
6017 }
6018
6019 /* Special Processing */
6020 switch (prop) {
6021 case VDEV_PROP_PATH:
6022 if (vd->vdev_path == NULL) {
6023 error = EROFS;
6024 break;
6025 }
6026 if (nvpair_value_string(elem, &strval) != 0) {
6027 error = EINVAL;
6028 break;
6029 }
6030 /* New path must start with /dev/ */
6031 if (strncmp(strval, "/dev/", 5)) {
6032 error = EINVAL;
6033 break;
6034 }
6035 error = spa_vdev_setpath(spa, vdev_guid, strval);
6036 break;
6037 case VDEV_PROP_ALLOCATING:
6038 if (nvpair_value_uint64(elem, &intval) != 0) {
6039 error = EINVAL;
6040 break;
6041 }
6042 if (intval != vd->vdev_noalloc)
6043 break;
6044 if (intval == 0)
6045 error = spa_vdev_noalloc(spa, vdev_guid);
6046 else
6047 error = spa_vdev_alloc(spa, vdev_guid);
6048 break;
6049 case VDEV_PROP_FAILFAST:
6050 if (nvpair_value_uint64(elem, &intval) != 0) {
6051 error = EINVAL;
6052 break;
6053 }
6054 vd->vdev_failfast = intval & 1;
6055 break;
6056 case VDEV_PROP_CHECKSUM_N:
6057 if (nvpair_value_uint64(elem, &intval) != 0) {
6058 error = EINVAL;
6059 break;
6060 }
6061 vd->vdev_checksum_n = intval;
6062 break;
6063 case VDEV_PROP_CHECKSUM_T:
6064 if (nvpair_value_uint64(elem, &intval) != 0) {
6065 error = EINVAL;
6066 break;
6067 }
6068 vd->vdev_checksum_t = intval;
6069 break;
6070 case VDEV_PROP_IO_N:
6071 if (nvpair_value_uint64(elem, &intval) != 0) {
6072 error = EINVAL;
6073 break;
6074 }
6075 vd->vdev_io_n = intval;
6076 break;
6077 case VDEV_PROP_IO_T:
6078 if (nvpair_value_uint64(elem, &intval) != 0) {
6079 error = EINVAL;
6080 break;
6081 }
6082 vd->vdev_io_t = intval;
6083 break;
6084 case VDEV_PROP_SLOW_IO_N:
6085 if (nvpair_value_uint64(elem, &intval) != 0) {
6086 error = EINVAL;
6087 break;
6088 }
6089 vd->vdev_slow_io_n = intval;
6090 break;
6091 case VDEV_PROP_SLOW_IO_T:
6092 if (nvpair_value_uint64(elem, &intval) != 0) {
6093 error = EINVAL;
6094 break;
6095 }
6096 vd->vdev_slow_io_t = intval;
6097 break;
6098 default:
6099 /* Most processing is done in vdev_props_set_sync */
6100 break;
6101 }
6102 end:
6103 if (error != 0) {
6104 intval = error;
6105 vdev_prop_add_list(outnvl, propname, strval, intval, 0);
6106 return (error);
6107 }
6108 }
6109
6110 return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
6111 innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
6112 }
6113
6114 int
vdev_prop_get(vdev_t * vd,nvlist_t * innvl,nvlist_t * outnvl)6115 vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
6116 {
6117 spa_t *spa = vd->vdev_spa;
6118 objset_t *mos = spa->spa_meta_objset;
6119 int err = 0;
6120 uint64_t objid;
6121 uint64_t vdev_guid;
6122 nvpair_t *elem = NULL;
6123 nvlist_t *nvprops = NULL;
6124 uint64_t intval = 0;
6125 char *strval = NULL;
6126 const char *propname = NULL;
6127 vdev_prop_t prop;
6128
6129 ASSERT(vd != NULL);
6130 ASSERT(mos != NULL);
6131
6132 if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
6133 &vdev_guid) != 0)
6134 return (SET_ERROR(EINVAL));
6135
6136 nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
6137
6138 if (vd->vdev_root_zap != 0) {
6139 objid = vd->vdev_root_zap;
6140 } else if (vd->vdev_top_zap != 0) {
6141 objid = vd->vdev_top_zap;
6142 } else if (vd->vdev_leaf_zap != 0) {
6143 objid = vd->vdev_leaf_zap;
6144 } else {
6145 return (SET_ERROR(EINVAL));
6146 }
6147 ASSERT(objid != 0);
6148
6149 mutex_enter(&spa->spa_props_lock);
6150
6151 if (nvprops != NULL) {
6152 char namebuf[64] = { 0 };
6153
6154 while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
6155 intval = 0;
6156 strval = NULL;
6157 propname = nvpair_name(elem);
6158 prop = vdev_name_to_prop(propname);
6159 zprop_source_t src = ZPROP_SRC_DEFAULT;
6160 uint64_t integer_size, num_integers;
6161
6162 switch (prop) {
6163 /* Special Read-only Properties */
6164 case VDEV_PROP_NAME:
6165 strval = vdev_name(vd, namebuf,
6166 sizeof (namebuf));
6167 if (strval == NULL)
6168 continue;
6169 vdev_prop_add_list(outnvl, propname, strval, 0,
6170 ZPROP_SRC_NONE);
6171 continue;
6172 case VDEV_PROP_CAPACITY:
6173 /* percent used */
6174 intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
6175 (vd->vdev_stat.vs_alloc * 100 /
6176 vd->vdev_stat.vs_dspace);
6177 vdev_prop_add_list(outnvl, propname, NULL,
6178 intval, ZPROP_SRC_NONE);
6179 continue;
6180 case VDEV_PROP_STATE:
6181 vdev_prop_add_list(outnvl, propname, NULL,
6182 vd->vdev_state, ZPROP_SRC_NONE);
6183 continue;
6184 case VDEV_PROP_GUID:
6185 vdev_prop_add_list(outnvl, propname, NULL,
6186 vd->vdev_guid, ZPROP_SRC_NONE);
6187 continue;
6188 case VDEV_PROP_ASIZE:
6189 vdev_prop_add_list(outnvl, propname, NULL,
6190 vd->vdev_asize, ZPROP_SRC_NONE);
6191 continue;
6192 case VDEV_PROP_PSIZE:
6193 vdev_prop_add_list(outnvl, propname, NULL,
6194 vd->vdev_psize, ZPROP_SRC_NONE);
6195 continue;
6196 case VDEV_PROP_ASHIFT:
6197 vdev_prop_add_list(outnvl, propname, NULL,
6198 vd->vdev_ashift, ZPROP_SRC_NONE);
6199 continue;
6200 case VDEV_PROP_SIZE:
6201 vdev_prop_add_list(outnvl, propname, NULL,
6202 vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
6203 continue;
6204 case VDEV_PROP_FREE:
6205 vdev_prop_add_list(outnvl, propname, NULL,
6206 vd->vdev_stat.vs_dspace -
6207 vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
6208 continue;
6209 case VDEV_PROP_ALLOCATED:
6210 vdev_prop_add_list(outnvl, propname, NULL,
6211 vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
6212 continue;
6213 case VDEV_PROP_EXPANDSZ:
6214 vdev_prop_add_list(outnvl, propname, NULL,
6215 vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
6216 continue;
6217 case VDEV_PROP_FRAGMENTATION:
6218 vdev_prop_add_list(outnvl, propname, NULL,
6219 vd->vdev_stat.vs_fragmentation,
6220 ZPROP_SRC_NONE);
6221 continue;
6222 case VDEV_PROP_PARITY:
6223 vdev_prop_add_list(outnvl, propname, NULL,
6224 vdev_get_nparity(vd), ZPROP_SRC_NONE);
6225 continue;
6226 case VDEV_PROP_PATH:
6227 if (vd->vdev_path == NULL)
6228 continue;
6229 vdev_prop_add_list(outnvl, propname,
6230 vd->vdev_path, 0, ZPROP_SRC_NONE);
6231 continue;
6232 case VDEV_PROP_DEVID:
6233 if (vd->vdev_devid == NULL)
6234 continue;
6235 vdev_prop_add_list(outnvl, propname,
6236 vd->vdev_devid, 0, ZPROP_SRC_NONE);
6237 continue;
6238 case VDEV_PROP_PHYS_PATH:
6239 if (vd->vdev_physpath == NULL)
6240 continue;
6241 vdev_prop_add_list(outnvl, propname,
6242 vd->vdev_physpath, 0, ZPROP_SRC_NONE);
6243 continue;
6244 case VDEV_PROP_ENC_PATH:
6245 if (vd->vdev_enc_sysfs_path == NULL)
6246 continue;
6247 vdev_prop_add_list(outnvl, propname,
6248 vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
6249 continue;
6250 case VDEV_PROP_FRU:
6251 if (vd->vdev_fru == NULL)
6252 continue;
6253 vdev_prop_add_list(outnvl, propname,
6254 vd->vdev_fru, 0, ZPROP_SRC_NONE);
6255 continue;
6256 case VDEV_PROP_PARENT:
6257 if (vd->vdev_parent != NULL) {
6258 strval = vdev_name(vd->vdev_parent,
6259 namebuf, sizeof (namebuf));
6260 vdev_prop_add_list(outnvl, propname,
6261 strval, 0, ZPROP_SRC_NONE);
6262 }
6263 continue;
6264 case VDEV_PROP_CHILDREN:
6265 if (vd->vdev_children > 0)
6266 strval = kmem_zalloc(ZAP_MAXVALUELEN,
6267 KM_SLEEP);
6268 for (uint64_t i = 0; i < vd->vdev_children;
6269 i++) {
6270 const char *vname;
6271
6272 vname = vdev_name(vd->vdev_child[i],
6273 namebuf, sizeof (namebuf));
6274 if (vname == NULL)
6275 vname = "(unknown)";
6276 if (strlen(strval) > 0)
6277 strlcat(strval, ",",
6278 ZAP_MAXVALUELEN);
6279 strlcat(strval, vname, ZAP_MAXVALUELEN);
6280 }
6281 if (strval != NULL) {
6282 vdev_prop_add_list(outnvl, propname,
6283 strval, 0, ZPROP_SRC_NONE);
6284 kmem_free(strval, ZAP_MAXVALUELEN);
6285 }
6286 continue;
6287 case VDEV_PROP_NUMCHILDREN:
6288 vdev_prop_add_list(outnvl, propname, NULL,
6289 vd->vdev_children, ZPROP_SRC_NONE);
6290 continue;
6291 case VDEV_PROP_READ_ERRORS:
6292 vdev_prop_add_list(outnvl, propname, NULL,
6293 vd->vdev_stat.vs_read_errors,
6294 ZPROP_SRC_NONE);
6295 continue;
6296 case VDEV_PROP_WRITE_ERRORS:
6297 vdev_prop_add_list(outnvl, propname, NULL,
6298 vd->vdev_stat.vs_write_errors,
6299 ZPROP_SRC_NONE);
6300 continue;
6301 case VDEV_PROP_CHECKSUM_ERRORS:
6302 vdev_prop_add_list(outnvl, propname, NULL,
6303 vd->vdev_stat.vs_checksum_errors,
6304 ZPROP_SRC_NONE);
6305 continue;
6306 case VDEV_PROP_INITIALIZE_ERRORS:
6307 vdev_prop_add_list(outnvl, propname, NULL,
6308 vd->vdev_stat.vs_initialize_errors,
6309 ZPROP_SRC_NONE);
6310 continue;
6311 case VDEV_PROP_TRIM_ERRORS:
6312 vdev_prop_add_list(outnvl, propname, NULL,
6313 vd->vdev_stat.vs_trim_errors,
6314 ZPROP_SRC_NONE);
6315 continue;
6316 case VDEV_PROP_SLOW_IOS:
6317 vdev_prop_add_list(outnvl, propname, NULL,
6318 vd->vdev_stat.vs_slow_ios,
6319 ZPROP_SRC_NONE);
6320 continue;
6321 case VDEV_PROP_OPS_NULL:
6322 vdev_prop_add_list(outnvl, propname, NULL,
6323 vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
6324 ZPROP_SRC_NONE);
6325 continue;
6326 case VDEV_PROP_OPS_READ:
6327 vdev_prop_add_list(outnvl, propname, NULL,
6328 vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
6329 ZPROP_SRC_NONE);
6330 continue;
6331 case VDEV_PROP_OPS_WRITE:
6332 vdev_prop_add_list(outnvl, propname, NULL,
6333 vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
6334 ZPROP_SRC_NONE);
6335 continue;
6336 case VDEV_PROP_OPS_FREE:
6337 vdev_prop_add_list(outnvl, propname, NULL,
6338 vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
6339 ZPROP_SRC_NONE);
6340 continue;
6341 case VDEV_PROP_OPS_CLAIM:
6342 vdev_prop_add_list(outnvl, propname, NULL,
6343 vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
6344 ZPROP_SRC_NONE);
6345 continue;
6346 case VDEV_PROP_OPS_TRIM:
6347 /*
6348 * TRIM ops and bytes are reported to user
6349 * space as ZIO_TYPE_FLUSH. This is done to
6350 * preserve the vdev_stat_t structure layout
6351 * for user space.
6352 */
6353 vdev_prop_add_list(outnvl, propname, NULL,
6354 vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
6355 ZPROP_SRC_NONE);
6356 continue;
6357 case VDEV_PROP_BYTES_NULL:
6358 vdev_prop_add_list(outnvl, propname, NULL,
6359 vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
6360 ZPROP_SRC_NONE);
6361 continue;
6362 case VDEV_PROP_BYTES_READ:
6363 vdev_prop_add_list(outnvl, propname, NULL,
6364 vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
6365 ZPROP_SRC_NONE);
6366 continue;
6367 case VDEV_PROP_BYTES_WRITE:
6368 vdev_prop_add_list(outnvl, propname, NULL,
6369 vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
6370 ZPROP_SRC_NONE);
6371 continue;
6372 case VDEV_PROP_BYTES_FREE:
6373 vdev_prop_add_list(outnvl, propname, NULL,
6374 vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
6375 ZPROP_SRC_NONE);
6376 continue;
6377 case VDEV_PROP_BYTES_CLAIM:
6378 vdev_prop_add_list(outnvl, propname, NULL,
6379 vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
6380 ZPROP_SRC_NONE);
6381 continue;
6382 case VDEV_PROP_BYTES_TRIM:
6383 /*
6384 * TRIM ops and bytes are reported to user
6385 * space as ZIO_TYPE_FLUSH. This is done to
6386 * preserve the vdev_stat_t structure layout
6387 * for user space.
6388 */
6389 vdev_prop_add_list(outnvl, propname, NULL,
6390 vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
6391 ZPROP_SRC_NONE);
6392 continue;
6393 case VDEV_PROP_REMOVING:
6394 vdev_prop_add_list(outnvl, propname, NULL,
6395 vd->vdev_removing, ZPROP_SRC_NONE);
6396 continue;
6397 case VDEV_PROP_RAIDZ_EXPANDING:
6398 /* Only expose this for raidz */
6399 if (vd->vdev_ops == &vdev_raidz_ops) {
6400 vdev_prop_add_list(outnvl, propname,
6401 NULL, vd->vdev_rz_expanding,
6402 ZPROP_SRC_NONE);
6403 }
6404 continue;
6405 case VDEV_PROP_TRIM_SUPPORT:
6406 /* only valid for leaf vdevs */
6407 if (vd->vdev_ops->vdev_op_leaf) {
6408 vdev_prop_add_list(outnvl, propname,
6409 NULL, vd->vdev_has_trim,
6410 ZPROP_SRC_NONE);
6411 }
6412 continue;
6413 /* Numeric Properites */
6414 case VDEV_PROP_ALLOCATING:
6415 /* Leaf vdevs cannot have this property */
6416 if (vd->vdev_mg == NULL &&
6417 vd->vdev_top != NULL) {
6418 src = ZPROP_SRC_NONE;
6419 intval = ZPROP_BOOLEAN_NA;
6420 } else {
6421 err = vdev_prop_get_int(vd, prop,
6422 &intval);
6423 if (err && err != ENOENT)
6424 break;
6425
6426 if (intval ==
6427 vdev_prop_default_numeric(prop))
6428 src = ZPROP_SRC_DEFAULT;
6429 else
6430 src = ZPROP_SRC_LOCAL;
6431 }
6432
6433 vdev_prop_add_list(outnvl, propname, NULL,
6434 intval, src);
6435 break;
6436 case VDEV_PROP_FAILFAST:
6437 src = ZPROP_SRC_LOCAL;
6438 strval = NULL;
6439
6440 err = zap_lookup(mos, objid, nvpair_name(elem),
6441 sizeof (uint64_t), 1, &intval);
6442 if (err == ENOENT) {
6443 intval = vdev_prop_default_numeric(
6444 prop);
6445 err = 0;
6446 } else if (err) {
6447 break;
6448 }
6449 if (intval == vdev_prop_default_numeric(prop))
6450 src = ZPROP_SRC_DEFAULT;
6451
6452 vdev_prop_add_list(outnvl, propname, strval,
6453 intval, src);
6454 break;
6455 case VDEV_PROP_CHECKSUM_N:
6456 case VDEV_PROP_CHECKSUM_T:
6457 case VDEV_PROP_IO_N:
6458 case VDEV_PROP_IO_T:
6459 case VDEV_PROP_SLOW_IO_N:
6460 case VDEV_PROP_SLOW_IO_T:
6461 err = vdev_prop_get_int(vd, prop, &intval);
6462 if (err && err != ENOENT)
6463 break;
6464
6465 if (intval == vdev_prop_default_numeric(prop))
6466 src = ZPROP_SRC_DEFAULT;
6467 else
6468 src = ZPROP_SRC_LOCAL;
6469
6470 vdev_prop_add_list(outnvl, propname, NULL,
6471 intval, src);
6472 break;
6473 /* Text Properties */
6474 case VDEV_PROP_COMMENT:
6475 /* Exists in the ZAP below */
6476 /* FALLTHRU */
6477 case VDEV_PROP_USERPROP:
6478 /* User Properites */
6479 src = ZPROP_SRC_LOCAL;
6480
6481 err = zap_length(mos, objid, nvpair_name(elem),
6482 &integer_size, &num_integers);
6483 if (err)
6484 break;
6485
6486 switch (integer_size) {
6487 case 8:
6488 /* User properties cannot be integers */
6489 err = EINVAL;
6490 break;
6491 case 1:
6492 /* string property */
6493 strval = kmem_alloc(num_integers,
6494 KM_SLEEP);
6495 err = zap_lookup(mos, objid,
6496 nvpair_name(elem), 1,
6497 num_integers, strval);
6498 if (err) {
6499 kmem_free(strval,
6500 num_integers);
6501 break;
6502 }
6503 vdev_prop_add_list(outnvl, propname,
6504 strval, 0, src);
6505 kmem_free(strval, num_integers);
6506 break;
6507 }
6508 break;
6509 default:
6510 err = ENOENT;
6511 break;
6512 }
6513 if (err)
6514 break;
6515 }
6516 } else {
6517 /*
6518 * Get all properties from the MOS vdev property object.
6519 */
6520 zap_cursor_t zc;
6521 zap_attribute_t *za = zap_attribute_alloc();
6522 for (zap_cursor_init(&zc, mos, objid);
6523 (err = zap_cursor_retrieve(&zc, za)) == 0;
6524 zap_cursor_advance(&zc)) {
6525 intval = 0;
6526 strval = NULL;
6527 zprop_source_t src = ZPROP_SRC_DEFAULT;
6528 propname = za->za_name;
6529
6530 switch (za->za_integer_length) {
6531 case 8:
6532 /* We do not allow integer user properties */
6533 /* This is likely an internal value */
6534 break;
6535 case 1:
6536 /* string property */
6537 strval = kmem_alloc(za->za_num_integers,
6538 KM_SLEEP);
6539 err = zap_lookup(mos, objid, za->za_name, 1,
6540 za->za_num_integers, strval);
6541 if (err) {
6542 kmem_free(strval, za->za_num_integers);
6543 break;
6544 }
6545 vdev_prop_add_list(outnvl, propname, strval, 0,
6546 src);
6547 kmem_free(strval, za->za_num_integers);
6548 break;
6549
6550 default:
6551 break;
6552 }
6553 }
6554 zap_cursor_fini(&zc);
6555 zap_attribute_free(za);
6556 }
6557
6558 mutex_exit(&spa->spa_props_lock);
6559 if (err && err != ENOENT) {
6560 return (err);
6561 }
6562
6563 return (0);
6564 }
6565
6566 EXPORT_SYMBOL(vdev_fault);
6567 EXPORT_SYMBOL(vdev_degrade);
6568 EXPORT_SYMBOL(vdev_online);
6569 EXPORT_SYMBOL(vdev_offline);
6570 EXPORT_SYMBOL(vdev_clear);
6571
6572 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
6573 "Target number of metaslabs per top-level vdev");
6574
6575 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
6576 "Default lower limit for metaslab size");
6577
6578 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
6579 "Default upper limit for metaslab size");
6580
6581 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
6582 "Minimum number of metaslabs per top-level vdev");
6583
6584 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
6585 "Practical upper limit of total metaslabs per top-level vdev");
6586
6587 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
6588 "Rate limit slow IO (delay) events to this many per second");
6589
6590 ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
6591 "Rate limit hung IO (deadman) events to this many per second");
6592
6593 ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
6594 "Rate Direct I/O write verify events to this many per second");
6595
6596 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
6597 "Direct I/O writes will perform for checksum verification before "
6598 "commiting write");
6599
6600 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
6601 "Rate limit checksum events to this many checksum errors per second "
6602 "(do not set below ZED threshold).");
6603
6604 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
6605 "Ignore errors during resilver/scrub");
6606
6607 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
6608 "Bypass vdev_validate()");
6609
6610 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
6611 "Disable cache flushes");
6612
6613 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
6614 "Minimum number of metaslabs required to dedicate one for log blocks");
6615
6616 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
6617 param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
6618 "Minimum ashift used when creating new top-level vdevs");
6619
6620 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
6621 param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
6622 "Maximum ashift used when optimizing for logical -> physical sector "
6623 "size on new top-level vdevs");
6624
6625 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl,
6626 param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW,
6627 "RAIDZ implementation");
6628