1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24  */
25 
26 #ifndef _SYS_VDEV_IMPL_H
27 #define   _SYS_VDEV_IMPL_H
28 
29 #include <sys/avl.h>
30 #include <sys/dmu.h>
31 #include <sys/metaslab.h>
32 #include <sys/nvpair.h>
33 #include <sys/space_map.h>
34 #include <sys/vdev.h>
35 #include <sys/dkio.h>
36 #include <sys/uberblock_impl.h>
37 
38 #ifdef    __cplusplus
39 extern "C" {
40 #endif
41 
42 /*
43  * Virtual device descriptors.
44  *
45  * All storage pool operations go through the virtual device framework,
46  * which provides data replication and I/O scheduling.
47  */
48 
/*
 * Forward declarations of types that are referenced before (or without)
 * their full definitions being visible.
 */
typedef struct vdev_queue	vdev_queue_t;
typedef struct vdev_cache	vdev_cache_t;
typedef struct vdev_cache_entry	vdev_cache_entry_t;
55 
/* Tunables defined elsewhere; only declared here. */
extern int	zfs_vdev_queue_depth_pct;
extern uint32_t	zfs_vdev_async_write_max_active;
58 
59 /*
60  * Virtual device operations
61  */
62 typedef int         vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
63     uint64_t *logical_ashift, uint64_t *physical_ashift);
64 typedef void        vdev_close_func_t(vdev_t *vd);
65 typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
66 typedef void        vdev_io_start_func_t(zio_t *zio);
67 typedef void        vdev_io_done_func_t(zio_t *zio);
68 typedef void        vdev_state_change_func_t(vdev_t *vd, int, int);
69 typedef void        vdev_hold_func_t(vdev_t *vd);
70 typedef void        vdev_rele_func_t(vdev_t *vd);
71 
72 typedef struct vdev_ops {
73           vdev_open_func_t              *vdev_op_open;
74           vdev_close_func_t             *vdev_op_close;
75           vdev_asize_func_t             *vdev_op_asize;
76           vdev_io_start_func_t                    *vdev_op_io_start;
77           vdev_io_done_func_t           *vdev_op_io_done;
78           vdev_state_change_func_t      *vdev_op_state_change;
79           vdev_hold_func_t              *vdev_op_hold;
80           vdev_rele_func_t              *vdev_op_rele;
81           char                                    vdev_op_type[16];
82           boolean_t                     vdev_op_leaf;
83 } vdev_ops_t;
84 
85 /*
86  * Virtual device properties
87  */
88 struct vdev_cache_entry {
89           char                *ve_data;
90           uint64_t  ve_offset;
91           uint64_t  ve_lastused;
92           avl_node_t          ve_offset_node;
93           avl_node_t          ve_lastused_node;
94           uint32_t  ve_hits;
95           uint16_t  ve_missed_update;
96           zio_t               *ve_fill_io;
97 };
98 
99 struct vdev_cache {
100           avl_tree_t          vc_offset_tree;
101           avl_tree_t          vc_lastused_tree;
102           kmutex_t  vc_lock;
103 };
104 
105 typedef struct vdev_queue_class {
106           uint32_t  vqc_active;
107 
108           /*
109            * Sorted by offset or timestamp, depending on if the queue is
110            * LBA-ordered vs FIFO.
111            */
112           avl_tree_t          vqc_queued_tree;
113 } vdev_queue_class_t;
114 
115 struct vdev_queue {
116           vdev_t              *vq_vdev;
117           vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
118           avl_tree_t          vq_active_tree;
119           avl_tree_t          vq_read_offset_tree;
120           avl_tree_t          vq_write_offset_tree;
121           uint64_t  vq_last_offset;
122           hrtime_t  vq_io_complete_ts; /* time last i/o completed */
123           kmutex_t  vq_lock;
124           uint64_t  vq_lastoffset;
125 };
126 
127 /*
128  * Virtual device descriptor
129  */
130 struct vdev {
131           /*
132            * Common to all vdev types.
133            */
134           uint64_t  vdev_id;  /* child number in vdev parent          */
135           uint64_t  vdev_guid;          /* unique ID for this vdev    */
136           uint64_t  vdev_guid_sum;      /* self guid + all child guids          */
137           uint64_t  vdev_orig_guid;     /* orig. guid prior to remove */
138           uint64_t  vdev_asize;         /* allocatable device capacity          */
139           uint64_t  vdev_min_asize;     /* min acceptable asize                 */
140           uint64_t  vdev_max_asize;     /* max acceptable asize                 */
141           uint64_t  vdev_ashift;        /* block alignment shift      */
142           /*
143            * Logical block alignment shift
144            *
145            * The smallest sized/aligned I/O supported by the device.
146            */
147           uint64_t        vdev_logical_ashift;
148           /*
149            * Physical block alignment shift
150            *
151            * The device supports logical I/Os with vdev_logical_ashift
152            * size/alignment, but optimum performance will be achieved by
153            * aligning/sizing requests to vdev_physical_ashift.  Smaller
154            * requests may be inflated or incur device level read-modify-write
155            * operations.
156            *
157            * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
158          */
159           uint64_t        vdev_physical_ashift;
160           uint64_t  vdev_state;         /* see VDEV_STATE_* #defines  */
161           uint64_t  vdev_prevstate;     /* used when reopening a vdev */
162           vdev_ops_t          *vdev_ops;          /* vdev operations            */
163           spa_t               *vdev_spa;          /* spa for this vdev                    */
164           void                *vdev_tsd;          /* type-specific data                   */
165           vnode_t             *vdev_name_vp;      /* vnode for pathname                   */
166           vnode_t             *vdev_devid_vp;     /* vnode for devid            */
167           vdev_t              *vdev_top;          /* top-level vdev             */
168           vdev_t              *vdev_parent;       /* parent vdev                          */
169           vdev_t              **vdev_child;       /* array of children                    */
170           uint64_t  vdev_children;      /* number of children                   */
171           vdev_stat_t         vdev_stat;          /* virtual device statistics  */
172           boolean_t vdev_expanding;     /* expand the vdev?           */
173           boolean_t vdev_reopening;     /* reopen in progress?                  */
174           int                 vdev_open_error; /* error on last open            */
175           kthread_t *vdev_open_thread; /* thread opening children     */
176           uint64_t  vdev_crtxg;         /* txg when top-level was added */
177 
178           /*
179            * Top-level vdev state.
180            */
181           uint64_t  vdev_ms_array;      /* metaslab array object      */
182           uint64_t  vdev_ms_shift;      /* metaslab size shift                  */
183           uint64_t  vdev_ms_count;      /* number of metaslabs                  */
184           metaslab_group_t *vdev_mg;    /* metaslab group             */
185           metaslab_t          **vdev_ms;          /* metaslab array             */
186           txg_list_t          vdev_ms_list;       /* per-txg dirty metaslab lists         */
187           txg_list_t          vdev_dtl_list;      /* per-txg dirty DTL lists    */
188           txg_node_t          vdev_txg_node;      /* per-txg dirty vdev linkage */
189           boolean_t vdev_remove_wanted; /* async remove wanted?       */
190           boolean_t vdev_probe_wanted; /* async probe wanted?         */
191           list_node_t         vdev_config_dirty_node; /* config dirty list      */
192           list_node_t         vdev_state_dirty_node; /* state dirty list        */
193           uint64_t  vdev_deflate_ratio; /* deflation ratio (x512)     */
194           uint64_t  vdev_islog;         /* is an intent log device    */
195           uint64_t  vdev_removing;      /* device is being removed?   */
196           boolean_t vdev_ishole;        /* is a hole in the namespace */
197           kmutex_t  vdev_queue_lock; /* protects vdev_queue_depth     */
198           uint64_t  vdev_top_zap;
199 
200           /*
201            * The queue depth parameters determine how many async writes are
202            * still pending (i.e. allocated by net yet issued to disk) per
203            * top-level (vdev_async_write_queue_depth) and the maximum allowed
204            * (vdev_max_async_write_queue_depth). These values only apply to
205            * top-level vdevs.
206            */
207           uint64_t  vdev_async_write_queue_depth;
208           uint64_t  vdev_max_async_write_queue_depth;
209 
210           /*
211            * Leaf vdev state.
212            */
213           range_tree_t        *vdev_dtl[DTL_TYPES]; /* dirty time logs          */
214           space_map_t         *vdev_dtl_sm;       /* dirty time log space map   */
215           txg_node_t          vdev_dtl_node;      /* per-txg dirty DTL linkage  */
216           uint64_t  vdev_dtl_object; /* DTL object                              */
217           uint64_t  vdev_psize;         /* physical device capacity   */
218           uint64_t  vdev_wholedisk;     /* true if this is a whole disk */
219           uint64_t  vdev_offline;       /* persistent offline state   */
220           uint64_t  vdev_faulted;       /* persistent faulted state   */
221           uint64_t  vdev_degraded;      /* persistent degraded state  */
222           uint64_t  vdev_removed;       /* persistent removed state   */
223           uint64_t  vdev_resilver_txg; /* persistent resilvering state */
224           uint64_t  vdev_nparity;       /* number of parity devices for raidz */
225           char                *vdev_path;         /* vdev path (if any)                   */
226           char                *vdev_devid;        /* vdev devid (if any)                  */
227           char                *vdev_physpath;     /* vdev device path (if any)  */
228           char                *vdev_fru;          /* physical FRU location      */
229           uint64_t  vdev_not_present; /* not present during import    */
230           uint64_t  vdev_unspare;       /* unspare when resilvering done */
231           boolean_t vdev_nowritecache; /* true if flushwritecache failed */
232           boolean_t vdev_notrim;        /* true if trim failed */
233           boolean_t vdev_checkremove; /* temporary online test        */
234           boolean_t vdev_forcefault; /* force online fault            */
235           boolean_t vdev_splitting;     /* split or repair in progress  */
236           boolean_t vdev_delayed_close; /* delayed device close?      */
237           boolean_t vdev_tmpoffline; /* device taken offline temporarily? */
238           boolean_t vdev_detached;      /* device detached?           */
239           boolean_t vdev_cant_read;     /* vdev is failing all reads  */
240           boolean_t vdev_cant_write; /* vdev is failing all writes    */
241           boolean_t vdev_isspare;       /* was a hot spare            */
242           boolean_t vdev_isl2cache;     /* was a l2cache device                 */
243           vdev_queue_t        vdev_queue;         /* I/O deadline schedule queue          */
244           vdev_cache_t        vdev_cache;         /* physical block cache                 */
245           spa_aux_vdev_t      *vdev_aux;          /* for l2cache and spares vdevs         */
246           zio_t               *vdev_probe_zio; /* root of current probe         */
247           vdev_aux_t          vdev_label_aux;     /* on-disk aux state                    */
248           struct trim_map     *vdev_trimmap;      /* map on outstanding trims   */
249           uint16_t  vdev_rotation_rate; /* rotational rate of the media */
250 #define   VDEV_RATE_UNKNOWN   0
251 #define   VDEV_RATE_NON_ROTATING        1
252           uint64_t  vdev_leaf_zap;
253 
254           /*
255            * For DTrace to work in userland (libzpool) context, these fields must
256            * remain at the end of the structure.  DTrace will use the kernel's
257            * CTF definition for 'struct vdev', and since the size of a kmutex_t is
258            * larger in userland, the offsets for the rest of the fields would be
259            * incorrect.
260            */
261           kmutex_t  vdev_dtl_lock;      /* vdev_dtl_{map,resilver}    */
262           kmutex_t  vdev_stat_lock;     /* vdev_stat                            */
263           kmutex_t  vdev_probe_lock; /* protects vdev_probe_zio       */
264 };
265 
#define	VDEV_RAIDZ_MAXPARITY	3

#define	VDEV_PAD_SIZE		(8 << 10)
/*
 * 2 padding areas (vl_pad1 and vl_pad2) to skip.
 *
 * The replacement list is parenthesized so the macro expands as a single
 * operand; without the parentheses an expression such as
 * "x / VDEV_SKIP_SIZE" would be evaluated as "(x / VDEV_PAD_SIZE) * 2".
 */
#define	VDEV_SKIP_SIZE		(VDEV_PAD_SIZE * 2)
#define	VDEV_PHYS_SIZE		(112 << 10)
#define	VDEV_UBERBLOCK_RING	(128 << 10)
273 
/* The largest uberblock we support is 8k. */
#define	MAX_UBERBLOCK_SHIFT	(13)

/*
 * Uberblock geometry for the label of vdev 'vd':
 *
 *   SHIFT  - the top-level vdev's ashift, raised to at least
 *            UBERBLOCK_SHIFT and capped at MAX_UBERBLOCK_SHIFT.
 *   COUNT  - number of uberblock slots of that size in the ring.
 *   OFFSET - byte offset of slot 'n' within vdev_label_t.
 *   SIZE   - size in bytes of one uberblock slot.
 */
#define	VDEV_UBERBLOCK_SHIFT(vd)					\
	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT),		\
	    MAX_UBERBLOCK_SHIFT)
#define	VDEV_UBERBLOCK_COUNT(vd)					\
	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define	VDEV_UBERBLOCK_OFFSET(vd, n)					\
	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
#define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
284 
285 typedef struct vdev_phys {
286           char                vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
287           zio_eck_t vp_zbt;
288 } vdev_phys_t;
289 
290 typedef struct vdev_label {
291           char                vl_pad1[VDEV_PAD_SIZE];                           /*  8K */
292           char                vl_pad2[VDEV_PAD_SIZE];                           /*  8K */
293           vdev_phys_t         vl_vdev_phys;                                     /* 112K   */
294           char                vl_uberblock[VDEV_UBERBLOCK_RING];      /* 128K   */
295 } vdev_label_t;                                                                 /* 256K total */
296 
/*
 * Flag values accepted by vdev_dirty().
 */
#define	VDD_METASLAB	0x01
#define	VDD_DTL		0x02
302 
/* Offset of the embedded boot loader region on each label. */
#define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
/*
 * Size of the embedded boot loader region on each label.
 * The first two labels plus the boot area add up to 4MB in total.
 */
#define	VDEV_BOOT_SIZE		(7ULL << 19)	/* 3.5M */

/*
 * Sizes of the label regions at the start and at the end of each leaf
 * device.
 */
#define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
#define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
#define	VDEV_LABELS		4
#define	VDEV_BEST_LABEL		VDEV_LABELS
318 
/* Values for the 'alloctype' argument of vdev_alloc(). */
#define	VDEV_ALLOC_LOAD		0
#define	VDEV_ALLOC_ADD		1
#define	VDEV_ALLOC_SPARE	2
#define	VDEV_ALLOC_L2CACHE	3
#define	VDEV_ALLOC_ROOTPOOL	4
#define	VDEV_ALLOC_SPLIT	5
#define	VDEV_ALLOC_ATTACH	6
326 
327 /*
328  * Allocate or free a vdev
329  */
330 extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
331     vdev_ops_t *ops);
332 extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
333     vdev_t *parent, uint_t id, int alloctype);
334 extern void vdev_free(vdev_t *vd);
335 
336 /*
337  * Add or remove children and parents
338  */
339 extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
340 extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
341 extern void vdev_compact_children(vdev_t *pvd);
342 extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
343 extern void vdev_remove_parent(vdev_t *cvd);
344 
345 /*
346  * vdev sync load and sync
347  */
348 extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
349 extern boolean_t vdev_log_state_valid(vdev_t *vd);
350 extern void vdev_load(vdev_t *vd);
351 extern int vdev_dtl_load(vdev_t *vd);
352 extern void vdev_sync(vdev_t *vd, uint64_t txg);
353 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
354 extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
355 extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
356 
357 /*
358  * Available vdev types.
359  */
360 extern vdev_ops_t vdev_root_ops;
361 extern vdev_ops_t vdev_mirror_ops;
362 extern vdev_ops_t vdev_replacing_ops;
363 extern vdev_ops_t vdev_raidz_ops;
364 #if defined(__FreeBSD__) && defined(_KERNEL)
365 extern vdev_ops_t vdev_geom_ops;
366 #else
367 extern vdev_ops_t vdev_disk_ops;
368 #endif
369 extern vdev_ops_t vdev_file_ops;
370 extern vdev_ops_t vdev_missing_ops;
371 extern vdev_ops_t vdev_hole_ops;
372 extern vdev_ops_t vdev_spare_ops;
373 
374 /*
375  * Common size functions
376  */
377 extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
378 extern uint64_t vdev_get_min_asize(vdev_t *vd);
379 extern void vdev_set_min_asize(vdev_t *vd);
380 
381 /*
382  * Global variables
383  */
384 /* zdb uses this tunable, so it must be declared here to make lint happy. */
385 extern int zfs_vdev_cache_size;
386 extern uint_t zfs_geom_probe_vdev_key;
387 
388 #ifdef illumos
389 /*
390  * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
391  */
392 typedef struct vdev_buf {
393           buf_t     vb_buf;             /* buffer that describes the io */
394           zio_t     *vb_io;             /* pointer back to the original zio_t */
395 } vdev_buf_t;
396 #endif
397 
398 #ifdef    __cplusplus
399 }
400 #endif
401 
402 #endif    /* _SYS_VDEV_IMPL_H */
403