xref: /freebsd-11-stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (revision f305fabfe8c184888b47a26d5c32eddf91074754)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #ifdef illumos
33 #include <sys/vdev_disk.h>
34 #endif
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
37 #include <sys/zio.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/abd.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
42 #include <sys/bio.h>
43 
44 #ifdef ZFS_DEBUG
45 #include <sys/vdev_initialize.h>	/* vdev_xlate testing */
46 #endif
47 
48 /*
49  * Virtual device vector for RAID-Z.
50  *
51  * This vdev supports single, double, and triple parity. For single parity,
52  * we use a simple XOR of all the data columns. For double or triple parity,
53  * we use a special case of Reed-Solomon coding. This extends the
54  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
55  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
56  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
57  * former is also based. The latter is designed to provide higher performance
58  * for writes.
59  *
60  * Note that the Plank paper claimed to support arbitrary N+M, but was then
61  * amended six years later identifying a critical flaw that invalidates its
62  * claims. Nevertheless, the technique can be adapted to work for up to
63  * triple parity. For additional parity, the amendment "Note: Correction to
64  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
65  * is viable, but the additional complexity means that write performance will
66  * suffer.
67  *
68  * All of the methods above operate on a Galois field, defined over the
69  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
70  * can be expressed with a single byte. Briefly, the operations on the
71  * field are defined as follows:
72  *
73  *   o addition (+) is represented by a bitwise XOR
74  *   o subtraction (-) is therefore identical to addition: A + B = A - B
75  *   o multiplication of A by 2 is defined by the following bitwise expression:
76  *
77  *	(A * 2)_7 = A_6
78  *	(A * 2)_6 = A_5
79  *	(A * 2)_5 = A_4
80  *	(A * 2)_4 = A_3 + A_7
81  *	(A * 2)_3 = A_2 + A_7
82  *	(A * 2)_2 = A_1 + A_7
83  *	(A * 2)_1 = A_0
84  *	(A * 2)_0 = A_7
85  *
86  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
87  * As an aside, this multiplication is derived from the error correcting
88  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
89  *
90  * Observe that any number in the field (except for 0) can be expressed as a
91  * power of 2 -- a generator for the field. We store a table of the powers of
92  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
93  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
94  * than field addition). The inverse of a field element A (A^-1) is therefore
95  * A ^ (255 - 1) = A^254.
96  *
97  * The up-to-three parity columns, P, Q, R over several data columns,
98  * D_0, ... D_n-1, can be expressed by field operations:
99  *
100  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
101  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
102  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
103  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
104  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
105  *
106  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
107  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
108  * independent coefficients. (There are no additional coefficients that have
109  * this property which is why the uncorrected Plank method breaks down.)
110  *
111  * See the reconstruction code below for how P, Q and R can used individually
112  * or in concert to recover missing data columns.
113  */
114 
115 typedef struct raidz_col {
116 	uint64_t rc_devidx;		/* child device index for I/O */
117 	uint64_t rc_offset;		/* device offset */
118 	uint64_t rc_size;		/* I/O size */
119 	abd_t *rc_abd;			/* I/O data */
120 	void *rc_gdata;			/* used to store the "good" version */
121 	int rc_error;			/* I/O error for this device */
122 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
123 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
124 } raidz_col_t;
125 
126 typedef struct raidz_map {
127 	uint64_t rm_cols;		/* Regular column count */
128 	uint64_t rm_scols;		/* Count including skipped columns */
129 	uint64_t rm_bigcols;		/* Number of oversized columns */
130 	uint64_t rm_asize;		/* Actual total I/O size */
131 	uint64_t rm_missingdata;	/* Count of missing data devices */
132 	uint64_t rm_missingparity;	/* Count of missing parity devices */
133 	uint64_t rm_firstdatacol;	/* First data column/parity count */
134 	uint64_t rm_nskip;		/* Skipped sectors for padding */
135 	uint64_t rm_skipstart;		/* Column index of padding start */
136 	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
137 	uintptr_t rm_reports;		/* # of referencing checksum reports */
138 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
139 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
140 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
141 } raidz_map_t;
142 
143 #define	VDEV_RAIDZ_P		0
144 #define	VDEV_RAIDZ_Q		1
145 #define	VDEV_RAIDZ_R		2
146 
147 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
148 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
149 
150 /*
151  * We provide a mechanism to perform the field multiplication operation on a
152  * 64-bit value all at once rather than a byte at a time. This works by
153  * creating a mask from the top bit in each byte and using that to
154  * conditionally apply the XOR of 0x1d.
155  */
156 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
157 { \
158 	(mask) = (x) & 0x8080808080808080ULL; \
159 	(mask) = ((mask) << 1) - ((mask) >> 7); \
160 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
161 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
162 }
163 
164 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
165 { \
166 	VDEV_RAIDZ_64MUL_2((x), mask); \
167 	VDEV_RAIDZ_64MUL_2((x), mask); \
168 }
169 
170 #define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
171 
172 /*
173  * Force reconstruction to use the general purpose method.
174  */
175 int vdev_raidz_default_to_general;
176 
177 /* Powers of 2 in the Galois field defined above. */
178 static const uint8_t vdev_raidz_pow2[256] = {
179 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
180 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
181 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
182 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
183 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
184 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
185 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
186 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
187 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
188 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
189 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
190 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
191 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
192 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
193 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
194 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
195 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
196 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
197 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
198 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
199 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
200 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
201 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
202 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
203 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
204 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
205 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
206 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
207 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
208 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
209 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
210 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
211 };
212 /* Logs of 2 in the Galois field defined above. */
213 static const uint8_t vdev_raidz_log2[256] = {
214 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
215 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
216 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
217 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
218 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
219 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
220 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
221 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
222 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
223 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
224 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
225 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
226 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
227 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
228 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
229 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
230 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
231 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
232 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
233 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
234 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
235 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
236 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
237 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
238 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
239 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
240 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
241 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
242 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
243 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
244 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
245 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
246 };
247 
248 static void vdev_raidz_generate_parity(raidz_map_t *rm);
249 
250 /*
251  * Multiply a given number by 2 raised to the given power.
252  */
253 static uint8_t
vdev_raidz_exp2(uint_t a,int exp)254 vdev_raidz_exp2(uint_t a, int exp)
255 {
256 	if (a == 0)
257 		return (0);
258 
259 	ASSERT(exp >= 0);
260 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
261 
262 	exp += vdev_raidz_log2[a];
263 	if (exp > 255)
264 		exp -= 255;
265 
266 	return (vdev_raidz_pow2[exp]);
267 }
268 
269 static void
vdev_raidz_map_free(raidz_map_t * rm)270 vdev_raidz_map_free(raidz_map_t *rm)
271 {
272 	int c;
273 	size_t size;
274 
275 	for (c = 0; c < rm->rm_firstdatacol; c++) {
276 		if (rm->rm_col[c].rc_abd != NULL)
277 			abd_free(rm->rm_col[c].rc_abd);
278 
279 		if (rm->rm_col[c].rc_gdata != NULL)
280 			zio_buf_free(rm->rm_col[c].rc_gdata,
281 			    rm->rm_col[c].rc_size);
282 	}
283 
284 	size = 0;
285 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
286 		if (rm->rm_col[c].rc_abd != NULL)
287 			abd_put(rm->rm_col[c].rc_abd);
288 		size += rm->rm_col[c].rc_size;
289 	}
290 
291 	if (rm->rm_abd_copy != NULL)
292 		abd_free(rm->rm_abd_copy);
293 
294 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
295 }
296 
297 static void
vdev_raidz_map_free_vsd(zio_t * zio)298 vdev_raidz_map_free_vsd(zio_t *zio)
299 {
300 	raidz_map_t *rm = zio->io_vsd;
301 
302 	ASSERT0(rm->rm_freed);
303 	rm->rm_freed = 1;
304 
305 	if (rm->rm_reports == 0)
306 		vdev_raidz_map_free(rm);
307 }
308 
309 /*ARGSUSED*/
310 static void
vdev_raidz_cksum_free(void * arg,size_t ignored)311 vdev_raidz_cksum_free(void *arg, size_t ignored)
312 {
313 	raidz_map_t *rm = arg;
314 
315 	ASSERT3U(rm->rm_reports, >, 0);
316 
317 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
318 		vdev_raidz_map_free(rm);
319 }
320 
321 static void
vdev_raidz_cksum_finish(zio_cksum_report_t * zcr,const void * good_data)322 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
323 {
324 	raidz_map_t *rm = zcr->zcr_cbdata;
325 	size_t c = zcr->zcr_cbinfo;
326 	size_t x;
327 
328 	const char *good = NULL;
329 	char *bad;
330 
331 	if (good_data == NULL) {
332 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
333 		return;
334 	}
335 
336 	if (c < rm->rm_firstdatacol) {
337 		/*
338 		 * The first time through, calculate the parity blocks for
339 		 * the good data (this relies on the fact that the good
340 		 * data never changes for a given logical ZIO)
341 		 */
342 		if (rm->rm_col[0].rc_gdata == NULL) {
343 			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
344 			char *buf;
345 			int offset;
346 
347 			/*
348 			 * Set up the rm_col[]s to generate the parity for
349 			 * good_data, first saving the parity bufs and
350 			 * replacing them with buffers to hold the result.
351 			 */
352 			for (x = 0; x < rm->rm_firstdatacol; x++) {
353 				bad_parity[x] = rm->rm_col[x].rc_abd;
354 				rm->rm_col[x].rc_gdata =
355 				    zio_buf_alloc(rm->rm_col[x].rc_size);
356 				rm->rm_col[x].rc_abd =
357 				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
358 				    rm->rm_col[x].rc_size);
359 			}
360 
361 			/* fill in the data columns from good_data */
362 			buf = (char *)good_data;
363 			for (; x < rm->rm_cols; x++) {
364 				abd_put(rm->rm_col[x].rc_abd);
365 				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
366 				    rm->rm_col[x].rc_size);
367 				buf += rm->rm_col[x].rc_size;
368 			}
369 
370 			/*
371 			 * Construct the parity from the good data.
372 			 */
373 			vdev_raidz_generate_parity(rm);
374 
375 			/* restore everything back to its original state */
376 			for (x = 0; x < rm->rm_firstdatacol; x++) {
377 				abd_put(rm->rm_col[x].rc_abd);
378 				rm->rm_col[x].rc_abd = bad_parity[x];
379 			}
380 
381 			offset = 0;
382 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
383 				abd_put(rm->rm_col[x].rc_abd);
384 				rm->rm_col[x].rc_abd = abd_get_offset(
385 				    rm->rm_abd_copy, offset);
386 				offset += rm->rm_col[x].rc_size;
387 			}
388 		}
389 
390 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
391 		good = rm->rm_col[c].rc_gdata;
392 	} else {
393 		/* adjust good_data to point at the start of our column */
394 		good = good_data;
395 
396 		for (x = rm->rm_firstdatacol; x < c; x++)
397 			good += rm->rm_col[x].rc_size;
398 	}
399 
400 	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
401 	/* we drop the ereport if it ends up that the data was good */
402 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
403 	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
404 }
405 
406 /*
407  * Invoked indirectly by zfs_ereport_start_checksum(), called
408  * below when our read operation fails completely.  The main point
409  * is to keep a copy of everything we read from disk, so that at
410  * vdev_raidz_cksum_finish() time we can compare it with the good data.
411  */
412 static void
vdev_raidz_cksum_report(zio_t * zio,zio_cksum_report_t * zcr,void * arg)413 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
414 {
415 	size_t c = (size_t)(uintptr_t)arg;
416 	size_t offset;
417 
418 	raidz_map_t *rm = zio->io_vsd;
419 	size_t size;
420 
421 	/* set up the report and bump the refcount  */
422 	zcr->zcr_cbdata = rm;
423 	zcr->zcr_cbinfo = c;
424 	zcr->zcr_finish = vdev_raidz_cksum_finish;
425 	zcr->zcr_free = vdev_raidz_cksum_free;
426 
427 	rm->rm_reports++;
428 	ASSERT3U(rm->rm_reports, >, 0);
429 
430 	if (rm->rm_abd_copy != NULL)
431 		return;
432 
433 	/*
434 	 * It's the first time we're called for this raidz_map_t, so we need
435 	 * to copy the data aside; there's no guarantee that our zio's buffer
436 	 * won't be re-used for something else.
437 	 *
438 	 * Our parity data is already in separate buffers, so there's no need
439 	 * to copy them.
440 	 */
441 
442 	size = 0;
443 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
444 		size += rm->rm_col[c].rc_size;
445 
446 	rm->rm_abd_copy =
447 	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
448 
449 	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
450 		raidz_col_t *col = &rm->rm_col[c];
451 		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
452 
453 		abd_copy(tmp, col->rc_abd, col->rc_size);
454 		abd_put(col->rc_abd);
455 		col->rc_abd = tmp;
456 
457 		offset += col->rc_size;
458 	}
459 	ASSERT3U(offset, ==, size);
460 }
461 
462 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
463 	vdev_raidz_map_free_vsd,
464 	vdev_raidz_cksum_report
465 };
466 
467 /*
468  * Divides the IO evenly across all child vdevs; usually, dcols is
469  * the number of children in the target vdev.
470  */
471 static raidz_map_t *
vdev_raidz_map_alloc(abd_t * abd,uint64_t size,uint64_t offset,boolean_t dofree,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)472 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
473     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
474 {
475 	raidz_map_t *rm;
476 	/* The starting RAIDZ (parent) vdev sector of the block. */
477 	uint64_t b = offset >> unit_shift;
478 	/* The zio's size in units of the vdev's minimum sector size. */
479 	uint64_t s = size >> unit_shift;
480 	/* The first column for this stripe. */
481 	uint64_t f = b % dcols;
482 	/* The starting byte offset on each child vdev. */
483 	uint64_t o = (b / dcols) << unit_shift;
484 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
485 	uint64_t off = 0;
486 
487 	/*
488 	 * "Quotient": The number of data sectors for this stripe on all but
489 	 * the "big column" child vdevs that also contain "remainder" data.
490 	 */
491 	q = s / (dcols - nparity);
492 
493 	/*
494 	 * "Remainder": The number of partial stripe data sectors in this I/O.
495 	 * This will add a sector to some, but not all, child vdevs.
496 	 */
497 	r = s - q * (dcols - nparity);
498 
499 	/* The number of "big columns" - those which contain remainder data. */
500 	bc = (r == 0 ? 0 : r + nparity);
501 
502 	/*
503 	 * The total number of data and parity sectors associated with
504 	 * this I/O.
505 	 */
506 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
507 
508 	/* acols: The columns that will be accessed. */
509 	/* scols: The columns that will be accessed or skipped. */
510 	if (q == 0) {
511 		/* Our I/O request doesn't span all child vdevs. */
512 		acols = bc;
513 		scols = MIN(dcols, roundup(bc, nparity + 1));
514 	} else {
515 		acols = dcols;
516 		scols = dcols;
517 	}
518 
519 	ASSERT3U(acols, <=, scols);
520 
521 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
522 
523 	rm->rm_cols = acols;
524 	rm->rm_scols = scols;
525 	rm->rm_bigcols = bc;
526 	rm->rm_skipstart = bc;
527 	rm->rm_missingdata = 0;
528 	rm->rm_missingparity = 0;
529 	rm->rm_firstdatacol = nparity;
530 	rm->rm_abd_copy = NULL;
531 	rm->rm_reports = 0;
532 	rm->rm_freed = 0;
533 	rm->rm_ecksuminjected = 0;
534 
535 	asize = 0;
536 
537 	for (c = 0; c < scols; c++) {
538 		col = f + c;
539 		coff = o;
540 		if (col >= dcols) {
541 			col -= dcols;
542 			coff += 1ULL << unit_shift;
543 		}
544 		rm->rm_col[c].rc_devidx = col;
545 		rm->rm_col[c].rc_offset = coff;
546 		rm->rm_col[c].rc_abd = NULL;
547 		rm->rm_col[c].rc_gdata = NULL;
548 		rm->rm_col[c].rc_error = 0;
549 		rm->rm_col[c].rc_tried = 0;
550 		rm->rm_col[c].rc_skipped = 0;
551 
552 		if (c >= acols)
553 			rm->rm_col[c].rc_size = 0;
554 		else if (c < bc)
555 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
556 		else
557 			rm->rm_col[c].rc_size = q << unit_shift;
558 
559 		asize += rm->rm_col[c].rc_size;
560 	}
561 
562 	ASSERT3U(asize, ==, tot << unit_shift);
563 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
564 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
565 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
566 	ASSERT3U(rm->rm_nskip, <=, nparity);
567 
568 	if (!dofree) {
569 		for (c = 0; c < rm->rm_firstdatacol; c++) {
570 			rm->rm_col[c].rc_abd =
571 			    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
572 		}
573 
574 		rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
575 		off = rm->rm_col[c].rc_size;
576 
577 		for (c = c + 1; c < acols; c++) {
578 			rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
579 			off += rm->rm_col[c].rc_size;
580 		}
581 	}
582 
583 	/*
584 	 * If all data stored spans all columns, there's a danger that parity
585 	 * will always be on the same device and, since parity isn't read
586 	 * during normal operation, that that device's I/O bandwidth won't be
587 	 * used effectively. We therefore switch the parity every 1MB.
588 	 *
589 	 * ... at least that was, ostensibly, the theory. As a practical
590 	 * matter unless we juggle the parity between all devices evenly, we
591 	 * won't see any benefit. Further, occasional writes that aren't a
592 	 * multiple of the LCM of the number of children and the minimum
593 	 * stripe width are sufficient to avoid pessimal behavior.
594 	 * Unfortunately, this decision created an implicit on-disk format
595 	 * requirement that we need to support for all eternity, but only
596 	 * for single-parity RAID-Z.
597 	 *
598 	 * If we intend to skip a sector in the zeroth column for padding
599 	 * we must make sure to note this swap. We will never intend to
600 	 * skip the first column since at least one data and one parity
601 	 * column must appear in each row.
602 	 */
603 	ASSERT(rm->rm_cols >= 2);
604 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
605 
606 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
607 		devidx = rm->rm_col[0].rc_devidx;
608 		o = rm->rm_col[0].rc_offset;
609 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
610 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
611 		rm->rm_col[1].rc_devidx = devidx;
612 		rm->rm_col[1].rc_offset = o;
613 
614 		if (rm->rm_skipstart == 0)
615 			rm->rm_skipstart = 1;
616 	}
617 
618 	return (rm);
619 }
620 
621 struct pqr_struct {
622 	uint64_t *p;
623 	uint64_t *q;
624 	uint64_t *r;
625 };
626 
627 static int
vdev_raidz_p_func(void * buf,size_t size,void * private)628 vdev_raidz_p_func(void *buf, size_t size, void *private)
629 {
630 	struct pqr_struct *pqr = private;
631 	const uint64_t *src = buf;
632 	int i, cnt = size / sizeof (src[0]);
633 
634 	ASSERT(pqr->p && !pqr->q && !pqr->r);
635 
636 	for (i = 0; i < cnt; i++, src++, pqr->p++)
637 		*pqr->p ^= *src;
638 
639 	return (0);
640 }
641 
642 static int
vdev_raidz_pq_func(void * buf,size_t size,void * private)643 vdev_raidz_pq_func(void *buf, size_t size, void *private)
644 {
645 	struct pqr_struct *pqr = private;
646 	const uint64_t *src = buf;
647 	uint64_t mask;
648 	int i, cnt = size / sizeof (src[0]);
649 
650 	ASSERT(pqr->p && pqr->q && !pqr->r);
651 
652 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
653 		*pqr->p ^= *src;
654 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
655 		*pqr->q ^= *src;
656 	}
657 
658 	return (0);
659 }
660 
661 static int
vdev_raidz_pqr_func(void * buf,size_t size,void * private)662 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
663 {
664 	struct pqr_struct *pqr = private;
665 	const uint64_t *src = buf;
666 	uint64_t mask;
667 	int i, cnt = size / sizeof (src[0]);
668 
669 	ASSERT(pqr->p && pqr->q && pqr->r);
670 
671 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
672 		*pqr->p ^= *src;
673 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
674 		*pqr->q ^= *src;
675 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
676 		*pqr->r ^= *src;
677 	}
678 
679 	return (0);
680 }
681 
682 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)683 vdev_raidz_generate_parity_p(raidz_map_t *rm)
684 {
685 	uint64_t *p;
686 	int c;
687 	abd_t *src;
688 
689 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
690 		src = rm->rm_col[c].rc_abd;
691 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
692 
693 		if (c == rm->rm_firstdatacol) {
694 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
695 		} else {
696 			struct pqr_struct pqr = { p, NULL, NULL };
697 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
698 			    vdev_raidz_p_func, &pqr);
699 		}
700 	}
701 }
702 
703 static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)704 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
705 {
706 	uint64_t *p, *q, pcnt, ccnt, mask, i;
707 	int c;
708 	abd_t *src;
709 
710 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
711 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
712 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
713 
714 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
715 		src = rm->rm_col[c].rc_abd;
716 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
717 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
718 
719 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
720 
721 		if (c == rm->rm_firstdatacol) {
722 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
723 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
724 		} else {
725 			struct pqr_struct pqr = { p, q, NULL };
726 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
727 			    vdev_raidz_pq_func, &pqr);
728 		}
729 
730 		if (c == rm->rm_firstdatacol) {
731 			for (i = ccnt; i < pcnt; i++) {
732 				p[i] = 0;
733 				q[i] = 0;
734 			}
735 		} else {
736 			/*
737 			 * Treat short columns as though they are full of 0s.
738 			 * Note that there's therefore nothing needed for P.
739 			 */
740 			for (i = ccnt; i < pcnt; i++) {
741 				VDEV_RAIDZ_64MUL_2(q[i], mask);
742 			}
743 		}
744 	}
745 }
746 
747 static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)748 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
749 {
750 	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
751 	int c;
752 	abd_t *src;
753 
754 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
755 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
756 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
757 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
758 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
759 
760 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
761 		src = rm->rm_col[c].rc_abd;
762 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
763 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
764 		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
765 
766 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
767 
768 		if (c == rm->rm_firstdatacol) {
769 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
770 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
771 			(void) memcpy(r, p, rm->rm_col[c].rc_size);
772 		} else {
773 			struct pqr_struct pqr = { p, q, r };
774 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
775 			    vdev_raidz_pqr_func, &pqr);
776 		}
777 
778 		if (c == rm->rm_firstdatacol) {
779 			for (i = ccnt; i < pcnt; i++) {
780 				p[i] = 0;
781 				q[i] = 0;
782 				r[i] = 0;
783 			}
784 		} else {
785 			/*
786 			 * Treat short columns as though they are full of 0s.
787 			 * Note that there's therefore nothing needed for P.
788 			 */
789 			for (i = ccnt; i < pcnt; i++) {
790 				VDEV_RAIDZ_64MUL_2(q[i], mask);
791 				VDEV_RAIDZ_64MUL_4(r[i], mask);
792 			}
793 		}
794 	}
795 }
796 
797 /*
798  * Generate RAID parity in the first virtual columns according to the number of
799  * parity columns available.
800  */
801 static void
vdev_raidz_generate_parity(raidz_map_t * rm)802 vdev_raidz_generate_parity(raidz_map_t *rm)
803 {
804 	switch (rm->rm_firstdatacol) {
805 	case 1:
806 		vdev_raidz_generate_parity_p(rm);
807 		break;
808 	case 2:
809 		vdev_raidz_generate_parity_pq(rm);
810 		break;
811 	case 3:
812 		vdev_raidz_generate_parity_pqr(rm);
813 		break;
814 	default:
815 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
816 	}
817 }
818 
819 /* ARGSUSED */
820 static int
vdev_raidz_reconst_p_func(void * dbuf,void * sbuf,size_t size,void * private)821 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
822 {
823 	uint64_t *dst = dbuf;
824 	uint64_t *src = sbuf;
825 	int cnt = size / sizeof (src[0]);
826 
827 	for (int i = 0; i < cnt; i++) {
828 		dst[i] ^= src[i];
829 	}
830 
831 	return (0);
832 }
833 
834 /* ARGSUSED */
835 static int
vdev_raidz_reconst_q_pre_func(void * dbuf,void * sbuf,size_t size,void * private)836 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
837     void *private)
838 {
839 	uint64_t *dst = dbuf;
840 	uint64_t *src = sbuf;
841 	uint64_t mask;
842 	int cnt = size / sizeof (dst[0]);
843 
844 	for (int i = 0; i < cnt; i++, dst++, src++) {
845 		VDEV_RAIDZ_64MUL_2(*dst, mask);
846 		*dst ^= *src;
847 	}
848 
849 	return (0);
850 }
851 
852 /* ARGSUSED */
853 static int
vdev_raidz_reconst_q_pre_tail_func(void * buf,size_t size,void * private)854 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
855 {
856 	uint64_t *dst = buf;
857 	uint64_t mask;
858 	int cnt = size / sizeof (dst[0]);
859 
860 	for (int i = 0; i < cnt; i++, dst++) {
861 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
862 		VDEV_RAIDZ_64MUL_2(*dst, mask);
863 	}
864 
865 	return (0);
866 }
867 
868 struct reconst_q_struct {
869 	uint64_t *q;
870 	int exp;
871 };
872 
873 static int
vdev_raidz_reconst_q_post_func(void * buf,size_t size,void * private)874 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
875 {
876 	struct reconst_q_struct *rq = private;
877 	uint64_t *dst = buf;
878 	int cnt = size / sizeof (dst[0]);
879 
880 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
881 		*dst ^= *rq->q;
882 
883 		int j;
884 		uint8_t *b;
885 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
886 			*b = vdev_raidz_exp2(*b, rq->exp);
887 		}
888 	}
889 
890 	return (0);
891 }
892 
893 struct reconst_pq_struct {
894 	uint8_t *p;
895 	uint8_t *q;
896 	uint8_t *pxy;
897 	uint8_t *qxy;
898 	int aexp;
899 	int bexp;
900 };
901 
902 static int
vdev_raidz_reconst_pq_func(void * xbuf,void * ybuf,size_t size,void * private)903 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
904 {
905 	struct reconst_pq_struct *rpq = private;
906 	uint8_t *xd = xbuf;
907 	uint8_t *yd = ybuf;
908 
909 	for (int i = 0; i < size;
910 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
911 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
912 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
913 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
914 	}
915 
916 	return (0);
917 }
918 
919 static int
vdev_raidz_reconst_pq_tail_func(void * xbuf,size_t size,void * private)920 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
921 {
922 	struct reconst_pq_struct *rpq = private;
923 	uint8_t *xd = xbuf;
924 
925 	for (int i = 0; i < size;
926 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
927 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
928 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
929 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
930 	}
931 
932 	return (0);
933 }
934 
935 static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)936 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
937 {
938 	int x = tgts[0];
939 	int c;
940 	abd_t *dst, *src;
941 
942 	ASSERT(ntgts == 1);
943 	ASSERT(x >= rm->rm_firstdatacol);
944 	ASSERT(x < rm->rm_cols);
945 
946 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
947 	ASSERT(rm->rm_col[x].rc_size > 0);
948 
949 	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
950 	dst = rm->rm_col[x].rc_abd;
951 
952 	abd_copy(dst, src, rm->rm_col[x].rc_size);
953 
954 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
955 		uint64_t size = MIN(rm->rm_col[x].rc_size,
956 		    rm->rm_col[c].rc_size);
957 
958 		src = rm->rm_col[c].rc_abd;
959 		dst = rm->rm_col[x].rc_abd;
960 
961 		if (c == x)
962 			continue;
963 
964 		(void) abd_iterate_func2(dst, src, 0, 0, size,
965 		    vdev_raidz_reconst_p_func, NULL);
966 	}
967 
968 	return (1 << VDEV_RAIDZ_P);
969 }
970 
971 static int
vdev_raidz_reconstruct_q(raidz_map_t * rm,int * tgts,int ntgts)972 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
973 {
974 	int x = tgts[0];
975 	int c, exp;
976 	abd_t *dst, *src;
977 
978 	ASSERT(ntgts == 1);
979 
980 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
981 
982 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
983 		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
984 		    rm->rm_col[c].rc_size);
985 
986 		src = rm->rm_col[c].rc_abd;
987 		dst = rm->rm_col[x].rc_abd;
988 
989 		if (c == rm->rm_firstdatacol) {
990 			abd_copy(dst, src, size);
991 			if (rm->rm_col[x].rc_size > size)
992 				abd_zero_off(dst, size,
993 				    rm->rm_col[x].rc_size - size);
994 		} else {
995 			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
996 			(void) abd_iterate_func2(dst, src, 0, 0, size,
997 			    vdev_raidz_reconst_q_pre_func, NULL);
998 			(void) abd_iterate_func(dst,
999 			    size, rm->rm_col[x].rc_size - size,
1000 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1001 		}
1002 	}
1003 
1004 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1005 	dst = rm->rm_col[x].rc_abd;
1006 	exp = 255 - (rm->rm_cols - 1 - x);
1007 
1008 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1009 	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1010 	    vdev_raidz_reconst_q_post_func, &rq);
1011 
1012 	return (1 << VDEV_RAIDZ_Q);
1013 }
1014 
1015 static int
vdev_raidz_reconstruct_pq(raidz_map_t * rm,int * tgts,int ntgts)1016 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1017 {
1018 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1019 	abd_t *pdata, *qdata;
1020 	uint64_t xsize, ysize;
1021 	int x = tgts[0];
1022 	int y = tgts[1];
1023 	abd_t *xd, *yd;
1024 
1025 	ASSERT(ntgts == 2);
1026 	ASSERT(x < y);
1027 	ASSERT(x >= rm->rm_firstdatacol);
1028 	ASSERT(y < rm->rm_cols);
1029 
1030 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1031 
1032 	/*
1033 	 * Move the parity data aside -- we're going to compute parity as
1034 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1035 	 * reuse the parity generation mechanism without trashing the actual
1036 	 * parity so we make those columns appear to be full of zeros by
1037 	 * setting their lengths to zero.
1038 	 */
1039 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1040 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1041 	xsize = rm->rm_col[x].rc_size;
1042 	ysize = rm->rm_col[y].rc_size;
1043 
1044 	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1045 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1046 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1047 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1048 	rm->rm_col[x].rc_size = 0;
1049 	rm->rm_col[y].rc_size = 0;
1050 
1051 	vdev_raidz_generate_parity_pq(rm);
1052 
1053 	rm->rm_col[x].rc_size = xsize;
1054 	rm->rm_col[y].rc_size = ysize;
1055 
1056 	p = abd_to_buf(pdata);
1057 	q = abd_to_buf(qdata);
1058 	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1059 	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1060 	xd = rm->rm_col[x].rc_abd;
1061 	yd = rm->rm_col[y].rc_abd;
1062 
1063 	/*
1064 	 * We now have:
1065 	 *	Pxy = P + D_x + D_y
1066 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1067 	 *
1068 	 * We can then solve for D_x:
1069 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1070 	 * where
1071 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1072 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1073 	 *
1074 	 * With D_x in hand, we can easily solve for D_y:
1075 	 *	D_y = P + Pxy + D_x
1076 	 */
1077 
1078 	a = vdev_raidz_pow2[255 + x - y];
1079 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1080 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1081 
1082 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1083 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1084 
1085 	ASSERT3U(xsize, >=, ysize);
1086 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1087 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1088 	    vdev_raidz_reconst_pq_func, &rpq);
1089 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1090 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1091 
1092 	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1093 	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1094 
1095 	/*
1096 	 * Restore the saved parity data.
1097 	 */
1098 	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1099 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1100 
1101 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1102 }
1103 
1104 /* BEGIN CSTYLED */
1105 /*
1106  * In the general case of reconstruction, we must solve the system of linear
1107  * equations defined by the coeffecients used to generate parity as well as
1108  * the contents of the data and parity disks. This can be expressed with
1109  * vectors for the original data (D) and the actual data (d) and parity (p)
1110  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1111  *
1112  *            __   __                     __     __
1113  *            |     |         __     __   |  p_0  |
1114  *            |  V  |         |  D_0  |   | p_m-1 |
1115  *            |     |    x    |   :   | = |  d_0  |
1116  *            |  I  |         | D_n-1 |   |   :   |
1117  *            |     |         ~~     ~~   | d_n-1 |
1118  *            ~~   ~~                     ~~     ~~
1119  *
1120  * I is simply a square identity matrix of size n, and V is a vandermonde
1121  * matrix defined by the coeffecients we chose for the various parity columns
1122  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1123  * computation as well as linear separability.
1124  *
1125  *      __               __               __     __
1126  *      |   1   ..  1 1 1 |               |  p_0  |
1127  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1128  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1129  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1130  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1131  *      |   :       : : : |   |   :   |   |  d_2  |
1132  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1133  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1134  *      |   0   ..  0 0 1 |               | d_n-1 |
1135  *      ~~               ~~               ~~     ~~
1136  *
1137  * Note that I, V, d, and p are known. To compute D, we must invert the
1138  * matrix and use the known data and parity values to reconstruct the unknown
1139  * data values. We begin by removing the rows in V|I and d|p that correspond
1140  * to failed or missing columns; we then make V|I square (n x n) and d|p
1141  * sized n by removing rows corresponding to unused parity from the bottom up
1142  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1143  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1144  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1145  *           __                               __
1146  *           |  1   1   1   1   1   1   1   1  |
1147  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1148  *           |  19 205 116  29  64  16  4   1  |      / /
1149  *           |  1   0   0   0   0   0   0   0  |     / /
1150  *           |  0   1   0   0   0   0   0   0  | <--' /
1151  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1152  *           |  0   0   0   1   0   0   0   0  |
1153  *           |  0   0   0   0   1   0   0   0  |
1154  *           |  0   0   0   0   0   1   0   0  |
1155  *           |  0   0   0   0   0   0   1   0  |
1156  *           |  0   0   0   0   0   0   0   1  |
1157  *           ~~                               ~~
1158  *           __                               __
1159  *           |  1   1   1   1   1   1   1   1  |
1160  *           |  19 205 116  29  64  16  4   1  |
1161  *           |  1   0   0   0   0   0   0   0  |
1162  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1163  *           |  0   0   0   0   1   0   0   0  |
1164  *           |  0   0   0   0   0   1   0   0  |
1165  *           |  0   0   0   0   0   0   1   0  |
1166  *           |  0   0   0   0   0   0   0   1  |
1167  *           ~~                               ~~
1168  *
1169  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1170  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1171  * matrix is not singular.
1172  * __                                                                 __
1173  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1174  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1175  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1176  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1177  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1178  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1179  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1180  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1181  * ~~                                                                 ~~
1182  * __                                                                 __
1183  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1184  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1185  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1186  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1187  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1188  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1189  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1190  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1191  * ~~                                                                 ~~
1192  * __                                                                 __
1193  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1194  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1195  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1196  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1197  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1198  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1199  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1200  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1201  * ~~                                                                 ~~
1202  * __                                                                 __
1203  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1204  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1205  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1206  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1207  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1208  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1209  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1210  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1211  * ~~                                                                 ~~
1212  * __                                                                 __
1213  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1214  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1215  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1216  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1217  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1218  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1219  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1220  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1221  * ~~                                                                 ~~
1222  * __                                                                 __
1223  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1224  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1225  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1226  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1227  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1228  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1229  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1230  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1231  * ~~                                                                 ~~
1232  *                   __                               __
1233  *                   |  0   0   1   0   0   0   0   0  |
1234  *                   | 167 100  5   41 159 169 217 208 |
1235  *                   | 166 100  4   40 158 168 216 209 |
1236  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1237  *                   |  0   0   0   0   1   0   0   0  |
1238  *                   |  0   0   0   0   0   1   0   0  |
1239  *                   |  0   0   0   0   0   0   1   0  |
1240  *                   |  0   0   0   0   0   0   0   1  |
1241  *                   ~~                               ~~
1242  *
1243  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1244  * of the missing data.
1245  *
1246  * As is apparent from the example above, the only non-trivial rows in the
1247  * inverse matrix correspond to the data disks that we're trying to
1248  * reconstruct. Indeed, those are the only rows we need as the others would
1249  * only be useful for reconstructing data known or assumed to be valid. For
1250  * that reason, we only build the coefficients in the rows that correspond to
1251  * targeted columns.
1252  */
1253 /* END CSTYLED */
1254 
1255 static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)1256 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1257     uint8_t **rows)
1258 {
1259 	int i, j;
1260 	int pow;
1261 
1262 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1263 
1264 	/*
1265 	 * Fill in the missing rows of interest.
1266 	 */
1267 	for (i = 0; i < nmap; i++) {
1268 		ASSERT3S(0, <=, map[i]);
1269 		ASSERT3S(map[i], <=, 2);
1270 
1271 		pow = map[i] * n;
1272 		if (pow > 255)
1273 			pow -= 255;
1274 		ASSERT(pow <= 255);
1275 
1276 		for (j = 0; j < n; j++) {
1277 			pow -= map[i];
1278 			if (pow < 0)
1279 				pow += 255;
1280 			rows[i][j] = vdev_raidz_pow2[pow];
1281 		}
1282 	}
1283 }
1284 
1285 static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)1286 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1287     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1288 {
1289 	int i, j, ii, jj;
1290 	uint8_t log;
1291 
1292 	/*
1293 	 * Assert that the first nmissing entries from the array of used
1294 	 * columns correspond to parity columns and that subsequent entries
1295 	 * correspond to data columns.
1296 	 */
1297 	for (i = 0; i < nmissing; i++) {
1298 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1299 	}
1300 	for (; i < n; i++) {
1301 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1302 	}
1303 
1304 	/*
1305 	 * First initialize the storage where we'll compute the inverse rows.
1306 	 */
1307 	for (i = 0; i < nmissing; i++) {
1308 		for (j = 0; j < n; j++) {
1309 			invrows[i][j] = (i == j) ? 1 : 0;
1310 		}
1311 	}
1312 
1313 	/*
1314 	 * Subtract all trivial rows from the rows of consequence.
1315 	 */
1316 	for (i = 0; i < nmissing; i++) {
1317 		for (j = nmissing; j < n; j++) {
1318 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1319 			jj = used[j] - rm->rm_firstdatacol;
1320 			ASSERT3S(jj, <, n);
1321 			invrows[i][j] = rows[i][jj];
1322 			rows[i][jj] = 0;
1323 		}
1324 	}
1325 
1326 	/*
1327 	 * For each of the rows of interest, we must normalize it and subtract
1328 	 * a multiple of it from the other rows.
1329 	 */
1330 	for (i = 0; i < nmissing; i++) {
1331 		for (j = 0; j < missing[i]; j++) {
1332 			ASSERT0(rows[i][j]);
1333 		}
1334 		ASSERT3U(rows[i][missing[i]], !=, 0);
1335 
1336 		/*
1337 		 * Compute the inverse of the first element and multiply each
1338 		 * element in the row by that value.
1339 		 */
1340 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1341 
1342 		for (j = 0; j < n; j++) {
1343 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1344 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1345 		}
1346 
1347 		for (ii = 0; ii < nmissing; ii++) {
1348 			if (i == ii)
1349 				continue;
1350 
1351 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1352 
1353 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1354 
1355 			for (j = 0; j < n; j++) {
1356 				rows[ii][j] ^=
1357 				    vdev_raidz_exp2(rows[i][j], log);
1358 				invrows[ii][j] ^=
1359 				    vdev_raidz_exp2(invrows[i][j], log);
1360 			}
1361 		}
1362 	}
1363 
1364 	/*
1365 	 * Verify that the data that is left in the rows are properly part of
1366 	 * an identity matrix.
1367 	 */
1368 	for (i = 0; i < nmissing; i++) {
1369 		for (j = 0; j < n; j++) {
1370 			if (j == missing[i]) {
1371 				ASSERT3U(rows[i][j], ==, 1);
1372 			} else {
1373 				ASSERT0(rows[i][j]);
1374 			}
1375 		}
1376 	}
1377 }
1378 
1379 static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1380 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1381     int *missing, uint8_t **invrows, const uint8_t *used)
1382 {
1383 	int i, j, x, cc, c;
1384 	uint8_t *src;
1385 	uint64_t ccount;
1386 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1387 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1388 	uint8_t log = 0;
1389 	uint8_t val;
1390 	int ll;
1391 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1392 	uint8_t *p, *pp;
1393 	size_t psize;
1394 
1395 	psize = sizeof (invlog[0][0]) * n * nmissing;
1396 	p = kmem_alloc(psize, KM_SLEEP);
1397 
1398 	for (pp = p, i = 0; i < nmissing; i++) {
1399 		invlog[i] = pp;
1400 		pp += n;
1401 	}
1402 
1403 	for (i = 0; i < nmissing; i++) {
1404 		for (j = 0; j < n; j++) {
1405 			ASSERT3U(invrows[i][j], !=, 0);
1406 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1407 		}
1408 	}
1409 
1410 	for (i = 0; i < n; i++) {
1411 		c = used[i];
1412 		ASSERT3U(c, <, rm->rm_cols);
1413 
1414 		src = abd_to_buf(rm->rm_col[c].rc_abd);
1415 		ccount = rm->rm_col[c].rc_size;
1416 		for (j = 0; j < nmissing; j++) {
1417 			cc = missing[j] + rm->rm_firstdatacol;
1418 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1419 			ASSERT3U(cc, <, rm->rm_cols);
1420 			ASSERT3U(cc, !=, c);
1421 
1422 			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1423 			dcount[j] = rm->rm_col[cc].rc_size;
1424 		}
1425 
1426 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1427 
1428 		for (x = 0; x < ccount; x++, src++) {
1429 			if (*src != 0)
1430 				log = vdev_raidz_log2[*src];
1431 
1432 			for (cc = 0; cc < nmissing; cc++) {
1433 				if (x >= dcount[cc])
1434 					continue;
1435 
1436 				if (*src == 0) {
1437 					val = 0;
1438 				} else {
1439 					if ((ll = log + invlog[cc][i]) >= 255)
1440 						ll -= 255;
1441 					val = vdev_raidz_pow2[ll];
1442 				}
1443 
1444 				if (i == 0)
1445 					dst[cc][x] = val;
1446 				else
1447 					dst[cc][x] ^= val;
1448 			}
1449 		}
1450 	}
1451 
1452 	kmem_free(p, psize);
1453 }
1454 
1455 static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1456 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1457 {
1458 	int n, i, c, t, tt;
1459 	int nmissing_rows;
1460 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1461 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1462 
1463 	uint8_t *p, *pp;
1464 	size_t psize;
1465 
1466 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1467 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1468 	uint8_t *used;
1469 
1470 	abd_t **bufs = NULL;
1471 
1472 	int code = 0;
1473 
1474 	/*
1475 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1476 	 * temporary linear ABDs.
1477 	 */
1478 	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1479 		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1480 
1481 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1482 			raidz_col_t *col = &rm->rm_col[c];
1483 
1484 			bufs[c] = col->rc_abd;
1485 			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1486 			abd_copy(col->rc_abd, bufs[c], col->rc_size);
1487 		}
1488 	}
1489 
1490 	n = rm->rm_cols - rm->rm_firstdatacol;
1491 
1492 	/*
1493 	 * Figure out which data columns are missing.
1494 	 */
1495 	nmissing_rows = 0;
1496 	for (t = 0; t < ntgts; t++) {
1497 		if (tgts[t] >= rm->rm_firstdatacol) {
1498 			missing_rows[nmissing_rows++] =
1499 			    tgts[t] - rm->rm_firstdatacol;
1500 		}
1501 	}
1502 
1503 	/*
1504 	 * Figure out which parity columns to use to help generate the missing
1505 	 * data columns.
1506 	 */
1507 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1508 		ASSERT(tt < ntgts);
1509 		ASSERT(c < rm->rm_firstdatacol);
1510 
1511 		/*
1512 		 * Skip any targeted parity columns.
1513 		 */
1514 		if (c == tgts[tt]) {
1515 			tt++;
1516 			continue;
1517 		}
1518 
1519 		code |= 1 << c;
1520 
1521 		parity_map[i] = c;
1522 		i++;
1523 	}
1524 
1525 	ASSERT(code != 0);
1526 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1527 
1528 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1529 	    nmissing_rows * n + sizeof (used[0]) * n;
1530 	p = kmem_alloc(psize, KM_SLEEP);
1531 
1532 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1533 		rows[i] = pp;
1534 		pp += n;
1535 		invrows[i] = pp;
1536 		pp += n;
1537 	}
1538 	used = pp;
1539 
1540 	for (i = 0; i < nmissing_rows; i++) {
1541 		used[i] = parity_map[i];
1542 	}
1543 
1544 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1545 		if (tt < nmissing_rows &&
1546 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1547 			tt++;
1548 			continue;
1549 		}
1550 
1551 		ASSERT3S(i, <, n);
1552 		used[i] = c;
1553 		i++;
1554 	}
1555 
1556 	/*
1557 	 * Initialize the interesting rows of the matrix.
1558 	 */
1559 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1560 
1561 	/*
1562 	 * Invert the matrix.
1563 	 */
1564 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1565 	    invrows, used);
1566 
1567 	/*
1568 	 * Reconstruct the missing data using the generated matrix.
1569 	 */
1570 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1571 	    invrows, used);
1572 
1573 	kmem_free(p, psize);
1574 
1575 	/*
1576 	 * copy back from temporary linear abds and free them
1577 	 */
1578 	if (bufs) {
1579 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1580 			raidz_col_t *col = &rm->rm_col[c];
1581 
1582 			abd_copy(bufs[c], col->rc_abd, col->rc_size);
1583 			abd_free(col->rc_abd);
1584 			col->rc_abd = bufs[c];
1585 		}
1586 		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1587 	}
1588 
1589 	return (code);
1590 }
1591 
1592 static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1593 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1594 {
1595 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1596 	int ntgts;
1597 	int i, c;
1598 	int code;
1599 	int nbadparity, nbaddata;
1600 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1601 
1602 	/*
1603 	 * The tgts list must already be sorted.
1604 	 */
1605 	for (i = 1; i < nt; i++) {
1606 		ASSERT(t[i] > t[i - 1]);
1607 	}
1608 
1609 	nbadparity = rm->rm_firstdatacol;
1610 	nbaddata = rm->rm_cols - nbadparity;
1611 	ntgts = 0;
1612 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1613 		if (c < rm->rm_firstdatacol)
1614 			parity_valid[c] = B_FALSE;
1615 
1616 		if (i < nt && c == t[i]) {
1617 			tgts[ntgts++] = c;
1618 			i++;
1619 		} else if (rm->rm_col[c].rc_error != 0) {
1620 			tgts[ntgts++] = c;
1621 		} else if (c >= rm->rm_firstdatacol) {
1622 			nbaddata--;
1623 		} else {
1624 			parity_valid[c] = B_TRUE;
1625 			nbadparity--;
1626 		}
1627 	}
1628 
1629 	ASSERT(ntgts >= nt);
1630 	ASSERT(nbaddata >= 0);
1631 	ASSERT(nbaddata + nbadparity == ntgts);
1632 
1633 	dt = &tgts[nbadparity];
1634 
1635 	/*
1636 	 * See if we can use any of our optimized reconstruction routines.
1637 	 */
1638 	if (!vdev_raidz_default_to_general) {
1639 		switch (nbaddata) {
1640 		case 1:
1641 			if (parity_valid[VDEV_RAIDZ_P])
1642 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1643 
1644 			ASSERT(rm->rm_firstdatacol > 1);
1645 
1646 			if (parity_valid[VDEV_RAIDZ_Q])
1647 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1648 
1649 			ASSERT(rm->rm_firstdatacol > 2);
1650 			break;
1651 
1652 		case 2:
1653 			ASSERT(rm->rm_firstdatacol > 1);
1654 
1655 			if (parity_valid[VDEV_RAIDZ_P] &&
1656 			    parity_valid[VDEV_RAIDZ_Q])
1657 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1658 
1659 			ASSERT(rm->rm_firstdatacol > 2);
1660 
1661 			break;
1662 		}
1663 	}
1664 
1665 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1666 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1667 	ASSERT(code > 0);
1668 	return (code);
1669 }
1670 
1671 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)1672 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1673     uint64_t *logical_ashift, uint64_t *physical_ashift)
1674 {
1675 	vdev_t *cvd;
1676 	uint64_t nparity = vd->vdev_nparity;
1677 	int c;
1678 	int lasterror = 0;
1679 	int numerrors = 0;
1680 
1681 	ASSERT(nparity > 0);
1682 
1683 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1684 	    vd->vdev_children < nparity + 1) {
1685 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1686 		return (SET_ERROR(EINVAL));
1687 	}
1688 
1689 	vdev_open_children(vd);
1690 
1691 	for (c = 0; c < vd->vdev_children; c++) {
1692 		cvd = vd->vdev_child[c];
1693 
1694 		if (cvd->vdev_open_error != 0) {
1695 			lasterror = cvd->vdev_open_error;
1696 			numerrors++;
1697 			continue;
1698 		}
1699 
1700 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1701 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1702 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1703 		*physical_ashift = MAX(*physical_ashift,
1704 		    cvd->vdev_physical_ashift);
1705 	}
1706 
1707 	*asize *= vd->vdev_children;
1708 	*max_asize *= vd->vdev_children;
1709 
1710 	if (numerrors > nparity) {
1711 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1712 		return (lasterror);
1713 	}
1714 
1715 	return (0);
1716 }
1717 
1718 static void
vdev_raidz_close(vdev_t * vd)1719 vdev_raidz_close(vdev_t *vd)
1720 {
1721 	int c;
1722 
1723 	for (c = 0; c < vd->vdev_children; c++)
1724 		vdev_close(vd->vdev_child[c]);
1725 }
1726 
1727 #ifdef illumos
1728 /*
1729  * Handle a read or write I/O to a RAID-Z dump device.
1730  *
1731  * The dump device is in a unique situation compared to other ZFS datasets:
1732  * writing to this device should be as simple and fast as possible.  In
1733  * addition, durability matters much less since the dump will be extracted
1734  * once the machine reboots.  For that reason, this function eschews parity for
1735  * performance and simplicity.  The dump device uses the checksum setting
1736  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1737  * dataset.
1738  *
1739  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1740  * 128 KB will not fill an entire block; in addition, they may not be properly
1741  * aligned.  In that case, this function uses the preallocated 128 KB block and
1742  * omits reading or writing any "empty" portions of that block, as opposed to
1743  * allocating a fresh appropriately-sized block.
1744  *
1745  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1746  *
1747  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1748  *
1749  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1750  * allocated which spans all five child vdevs.  8 KB of data would be written to
1751  * each of four vdevs, with the fifth containing the parity bits.
1752  *
1753  *       parity    data     data     data     data
1754  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1755  *         ^        ^        ^        ^        ^
1756  *         |        |        |        |        |
1757  *   8 KB parity    ------8 KB data blocks------
1758  *
1759  * However, when writing to the dump device, the behavior is different:
1760  *
1761  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1762  *
1763  * Unlike the normal RAID-Z case in which the block is allocated based on the
1764  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1765  * I/O size is less than 128 KB, only the actual portions of data are written.
1766  * In this example the data is written to the third data vdev since that vdev
1767  * contains the offset [64 KB, 96 KB).
1768  *
1769  *       parity    data     data     data     data
1770  *     |        |        |        |   XX   |        |
1771  *                                    ^
1772  *                                    |
1773  *                             32 KB data block
1774  *
1775  * As a result, an individual I/O may not span all child vdevs; moreover, a
1776  * small I/O may only operate on a single child vdev.
1777  *
1778  * Note that since there are no parity bits calculated or written, this format
1779  * remains the same no matter how many parity bits are used in a normal RAID-Z
1780  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1781  * would look like:
1782  *
1783  *       parity   parity   parity    data     data     data     data
1784  *     |        |        |        |        |        |   XX   |        |
1785  *                                                      ^
1786  *                                                      |
1787  *                                               32 KB data block
1788  */
1789 int
vdev_raidz_physio(vdev_t * vd,caddr_t data,size_t size,uint64_t offset,uint64_t origoffset,boolean_t doread,boolean_t isdump)1790 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1791     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1792 {
1793 	vdev_t *tvd = vd->vdev_top;
1794 	vdev_t *cvd;
1795 	raidz_map_t *rm;
1796 	raidz_col_t *rc;
1797 	int c, err = 0;
1798 
1799 	uint64_t start, end, colstart, colend;
1800 	uint64_t coloffset, colsize, colskip;
1801 
1802 	int flags = doread ? BIO_READ : BIO_WRITE;
1803 
1804 #ifdef	_KERNEL
1805 
1806 	/*
1807 	 * Don't write past the end of the block
1808 	 */
1809 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1810 
1811 	start = offset;
1812 	end = start + size;
1813 
1814 	/*
1815 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1816 	 * from the "original" offset, this is, the offset of the extent which
1817 	 * contains the requisite offset of the data being read or written.
1818 	 *
1819 	 * Even if this I/O operation doesn't span the full block size, let's
1820 	 * treat the on-disk format as if the only blocks are the complete 128
1821 	 * KB size.
1822 	 */
1823 	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1824 	    SPA_OLD_MAXBLOCKSIZE);
1825 	rm = vdev_raidz_map_alloc(abd,
1826 	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1827 	    vd->vdev_children, vd->vdev_nparity);
1828 
1829 	coloffset = origoffset;
1830 
1831 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1832 	    c++, coloffset += rc->rc_size) {
1833 		rc = &rm->rm_col[c];
1834 		cvd = vd->vdev_child[rc->rc_devidx];
1835 
1836 		/*
1837 		 * Find the start and end of this column in the RAID-Z map,
1838 		 * keeping in mind that the stated size and offset of the
1839 		 * operation may not fill the entire column for this vdev.
1840 		 *
1841 		 * If any portion of the data spans this column, issue the
1842 		 * appropriate operation to the vdev.
1843 		 */
1844 		if (coloffset + rc->rc_size <= start)
1845 			continue;
1846 		if (coloffset >= end)
1847 			continue;
1848 
1849 		colstart = MAX(coloffset, start);
1850 		colend = MIN(end, coloffset + rc->rc_size);
1851 		colsize = colend - colstart;
1852 		colskip = colstart - coloffset;
1853 
1854 		VERIFY3U(colsize, <=, rc->rc_size);
1855 		VERIFY3U(colskip, <=, rc->rc_size);
1856 
1857 		/*
1858 		 * Note that the child vdev will have a vdev label at the start
1859 		 * of its range of offsets, hence the need for
1860 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1861 		 * example of why this calculation is needed.
1862 		 */
1863 		if ((err = vdev_disk_physio(cvd,
1864 		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1865 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1866 		    flags, isdump)) != 0)
1867 			break;
1868 	}
1869 
1870 	vdev_raidz_map_free(rm);
1871 	abd_put(abd);
1872 #endif	/* KERNEL */
1873 
1874 	return (err);
1875 }
1876 #endif
1877 
1878 static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1879 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1880 {
1881 	uint64_t asize;
1882 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1883 	uint64_t cols = vd->vdev_children;
1884 	uint64_t nparity = vd->vdev_nparity;
1885 
1886 	asize = ((psize - 1) >> ashift) + 1;
1887 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1888 	asize = roundup(asize, nparity + 1) << ashift;
1889 
1890 	return (asize);
1891 }
1892 
1893 static void
vdev_raidz_child_done(zio_t * zio)1894 vdev_raidz_child_done(zio_t *zio)
1895 {
1896 	raidz_col_t *rc = zio->io_private;
1897 
1898 	rc->rc_error = zio->io_error;
1899 	rc->rc_tried = 1;
1900 	rc->rc_skipped = 0;
1901 }
1902 
1903 static void
vdev_raidz_io_verify(zio_t * zio,raidz_map_t * rm,int col)1904 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1905 {
1906 #ifdef ZFS_DEBUG
1907 	vdev_t *vd = zio->io_vd;
1908 	vdev_t *tvd = vd->vdev_top;
1909 
1910 	range_seg_t logical_rs, physical_rs;
1911 	logical_rs.rs_start = zio->io_offset;
1912 	logical_rs.rs_end = logical_rs.rs_start +
1913 	    vdev_raidz_asize(zio->io_vd, zio->io_size);
1914 
1915 	raidz_col_t *rc = &rm->rm_col[col];
1916 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1917 
1918 	vdev_xlate(cvd, &logical_rs, &physical_rs);
1919 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1920 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1921 	/*
1922 	 * It would be nice to assert that rs_end is equal
1923 	 * to rc_offset + rc_size but there might be an
1924 	 * optional I/O at the end that is not accounted in
1925 	 * rc_size.
1926 	 */
1927 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1928 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1929 		    rc->rc_size + (1 << tvd->vdev_ashift));
1930 	} else {
1931 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1932 	}
1933 #endif
1934 }
1935 
1936 /*
1937  * Start an IO operation on a RAIDZ VDev
1938  *
1939  * Outline:
1940  * - For write operations:
1941  *   1. Generate the parity data
1942  *   2. Create child zio write operations to each column's vdev, for both
1943  *      data and parity.
1944  *   3. If the column skips any sectors for padding, create optional dummy
1945  *      write zio children for those areas to improve aggregation continuity.
1946  * - For read operations:
1947  *   1. Create child zio read operations to each data column's vdev to read
1948  *      the range of data required for zio.
1949  *   2. If this is a scrub or resilver operation, or if any of the data
1950  *      vdevs have had errors, then create zio read operations to the parity
1951  *      columns' VDevs as well.
1952  */
1953 static void
vdev_raidz_io_start(zio_t * zio)1954 vdev_raidz_io_start(zio_t *zio)
1955 {
1956 	vdev_t *vd = zio->io_vd;
1957 	vdev_t *tvd = vd->vdev_top;
1958 	vdev_t *cvd;
1959 	raidz_map_t *rm;
1960 	raidz_col_t *rc;
1961 	int c, i;
1962 
1963 	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1964 	    zio->io_type == ZIO_TYPE_FREE,
1965 	    tvd->vdev_ashift, vd->vdev_children,
1966 	    vd->vdev_nparity);
1967 
1968 	zio->io_vsd = rm;
1969 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1970 
1971 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1972 
1973 	if (zio->io_type == ZIO_TYPE_FREE) {
1974 		for (c = 0; c < rm->rm_cols; c++) {
1975 			rc = &rm->rm_col[c];
1976 			cvd = vd->vdev_child[rc->rc_devidx];
1977 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1978 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1979 			    zio->io_type, zio->io_priority, 0,
1980 			    vdev_raidz_child_done, rc));
1981 		}
1982 
1983 		zio_execute(zio);
1984 		return;
1985 	}
1986 
1987 	if (zio->io_type == ZIO_TYPE_WRITE) {
1988 		vdev_raidz_generate_parity(rm);
1989 
1990 		for (c = 0; c < rm->rm_cols; c++) {
1991 			rc = &rm->rm_col[c];
1992 			cvd = vd->vdev_child[rc->rc_devidx];
1993 
1994 			/*
1995 			 * Verify physical to logical translation.
1996 			 */
1997 			vdev_raidz_io_verify(zio, rm, c);
1998 
1999 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2000 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2001 			    zio->io_type, zio->io_priority, 0,
2002 			    vdev_raidz_child_done, rc));
2003 		}
2004 
2005 		/*
2006 		 * Generate optional I/Os for any skipped sectors to improve
2007 		 * aggregation contiguity.
2008 		 */
2009 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
2010 			ASSERT(c <= rm->rm_scols);
2011 			if (c == rm->rm_scols)
2012 				c = 0;
2013 			rc = &rm->rm_col[c];
2014 			cvd = vd->vdev_child[rc->rc_devidx];
2015 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2016 			    rc->rc_offset + rc->rc_size, NULL,
2017 			    1 << tvd->vdev_ashift,
2018 			    zio->io_type, zio->io_priority,
2019 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2020 		}
2021 
2022 		zio_execute(zio);
2023 		return;
2024 	}
2025 
2026 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2027 
2028 	/*
2029 	 * Iterate over the columns in reverse order so that we hit the parity
2030 	 * last -- any errors along the way will force us to read the parity.
2031 	 */
2032 	for (c = rm->rm_cols - 1; c >= 0; c--) {
2033 		rc = &rm->rm_col[c];
2034 		cvd = vd->vdev_child[rc->rc_devidx];
2035 		if (!vdev_readable(cvd)) {
2036 			if (c >= rm->rm_firstdatacol)
2037 				rm->rm_missingdata++;
2038 			else
2039 				rm->rm_missingparity++;
2040 			rc->rc_error = SET_ERROR(ENXIO);
2041 			rc->rc_tried = 1;	/* don't even try */
2042 			rc->rc_skipped = 1;
2043 			continue;
2044 		}
2045 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2046 			if (c >= rm->rm_firstdatacol)
2047 				rm->rm_missingdata++;
2048 			else
2049 				rm->rm_missingparity++;
2050 			rc->rc_error = SET_ERROR(ESTALE);
2051 			rc->rc_skipped = 1;
2052 			continue;
2053 		}
2054 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2055 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2056 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2057 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2058 			    zio->io_type, zio->io_priority, 0,
2059 			    vdev_raidz_child_done, rc));
2060 		}
2061 	}
2062 
2063 	zio_execute(zio);
2064 }
2065 
2066 
2067 /*
2068  * Report a checksum error for a child of a RAID-Z device.
2069  */
2070 static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)2071 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2072 {
2073 	void *buf;
2074 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2075 
2076 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2077 		zio_bad_cksum_t zbc;
2078 		raidz_map_t *rm = zio->io_vsd;
2079 
2080 		mutex_enter(&vd->vdev_stat_lock);
2081 		vd->vdev_stat.vs_checksum_errors++;
2082 		mutex_exit(&vd->vdev_stat_lock);
2083 
2084 		zbc.zbc_has_cksum = 0;
2085 		zbc.zbc_injected = rm->rm_ecksuminjected;
2086 
2087 		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2088 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2089 		    rc->rc_offset, rc->rc_size, buf, bad_data,
2090 		    &zbc);
2091 		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2092 	}
2093 }
2094 
2095 /*
2096  * We keep track of whether or not there were any injected errors, so that
2097  * any ereports we generate can note it.
2098  */
2099 static int
raidz_checksum_verify(zio_t * zio)2100 raidz_checksum_verify(zio_t *zio)
2101 {
2102 	zio_bad_cksum_t zbc;
2103 	raidz_map_t *rm = zio->io_vsd;
2104 
2105 	int ret = zio_checksum_error(zio, &zbc);
2106 	if (ret != 0 && zbc.zbc_injected != 0)
2107 		rm->rm_ecksuminjected = 1;
2108 
2109 	return (ret);
2110 }
2111 
2112 /*
2113  * Generate the parity from the data columns. If we tried and were able to
2114  * read the parity without error, verify that the generated parity matches the
2115  * data we read. If it doesn't, we fire off a checksum error. Return the
2116  * number such failures.
2117  */
2118 static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)2119 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2120 {
2121 	void *orig[VDEV_RAIDZ_MAXPARITY];
2122 	int c, ret = 0;
2123 	raidz_col_t *rc;
2124 
2125 	blkptr_t *bp = zio->io_bp;
2126 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2127 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2128 
2129 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2130 		return (ret);
2131 
2132 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2133 		rc = &rm->rm_col[c];
2134 		if (!rc->rc_tried || rc->rc_error != 0)
2135 			continue;
2136 		orig[c] = zio_buf_alloc(rc->rc_size);
2137 		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2138 	}
2139 
2140 	vdev_raidz_generate_parity(rm);
2141 
2142 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2143 		rc = &rm->rm_col[c];
2144 		if (!rc->rc_tried || rc->rc_error != 0)
2145 			continue;
2146 		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2147 			raidz_checksum_error(zio, rc, orig[c]);
2148 			rc->rc_error = SET_ERROR(ECKSUM);
2149 			ret++;
2150 		}
2151 		zio_buf_free(orig[c], rc->rc_size);
2152 	}
2153 
2154 	return (ret);
2155 }
2156 
2157 /*
2158  * Keep statistics on all the ways that we used parity to correct data.
2159  */
2160 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2161 
2162 static int
vdev_raidz_worst_error(raidz_map_t * rm)2163 vdev_raidz_worst_error(raidz_map_t *rm)
2164 {
2165 	int error = 0;
2166 
2167 	for (int c = 0; c < rm->rm_cols; c++)
2168 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
2169 
2170 	return (error);
2171 }
2172 
2173 /*
2174  * Iterate over all combinations of bad data and attempt a reconstruction.
2175  * Note that the algorithm below is non-optimal because it doesn't take into
2176  * account how reconstruction is actually performed. For example, with
2177  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2178  * is targeted as invalid as if columns 1 and 4 are targeted since in both
2179  * cases we'd only use parity information in column 0.
2180  */
2181 static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)2182 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2183 {
2184 	raidz_map_t *rm = zio->io_vsd;
2185 	raidz_col_t *rc;
2186 	void *orig[VDEV_RAIDZ_MAXPARITY];
2187 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2188 	int *tgts = &tstore[1];
2189 	int current, next, i, c, n;
2190 	int code, ret = 0;
2191 
2192 	ASSERT(total_errors < rm->rm_firstdatacol);
2193 
2194 	/*
2195 	 * This simplifies one edge condition.
2196 	 */
2197 	tgts[-1] = -1;
2198 
2199 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2200 		/*
2201 		 * Initialize the targets array by finding the first n columns
2202 		 * that contain no error.
2203 		 *
2204 		 * If there were no data errors, we need to ensure that we're
2205 		 * always explicitly attempting to reconstruct at least one
2206 		 * data column. To do this, we simply push the highest target
2207 		 * up into the data columns.
2208 		 */
2209 		for (c = 0, i = 0; i < n; i++) {
2210 			if (i == n - 1 && data_errors == 0 &&
2211 			    c < rm->rm_firstdatacol) {
2212 				c = rm->rm_firstdatacol;
2213 			}
2214 
2215 			while (rm->rm_col[c].rc_error != 0) {
2216 				c++;
2217 				ASSERT3S(c, <, rm->rm_cols);
2218 			}
2219 
2220 			tgts[i] = c++;
2221 		}
2222 
2223 		/*
2224 		 * Setting tgts[n] simplifies the other edge condition.
2225 		 */
2226 		tgts[n] = rm->rm_cols;
2227 
2228 		/*
2229 		 * These buffers were allocated in previous iterations.
2230 		 */
2231 		for (i = 0; i < n - 1; i++) {
2232 			ASSERT(orig[i] != NULL);
2233 		}
2234 
2235 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2236 
2237 		current = 0;
2238 		next = tgts[current];
2239 
2240 		while (current != n) {
2241 			tgts[current] = next;
2242 			current = 0;
2243 
2244 			/*
2245 			 * Save off the original data that we're going to
2246 			 * attempt to reconstruct.
2247 			 */
2248 			for (i = 0; i < n; i++) {
2249 				ASSERT(orig[i] != NULL);
2250 				c = tgts[i];
2251 				ASSERT3S(c, >=, 0);
2252 				ASSERT3S(c, <, rm->rm_cols);
2253 				rc = &rm->rm_col[c];
2254 				abd_copy_to_buf(orig[i], rc->rc_abd,
2255 				    rc->rc_size);
2256 			}
2257 
2258 			/*
2259 			 * Attempt a reconstruction and exit the outer loop on
2260 			 * success.
2261 			 */
2262 			code = vdev_raidz_reconstruct(rm, tgts, n);
2263 			if (raidz_checksum_verify(zio) == 0) {
2264 				atomic_inc_64(&raidz_corrected[code]);
2265 
2266 				for (i = 0; i < n; i++) {
2267 					c = tgts[i];
2268 					rc = &rm->rm_col[c];
2269 					ASSERT(rc->rc_error == 0);
2270 					if (rc->rc_tried)
2271 						raidz_checksum_error(zio, rc,
2272 						    orig[i]);
2273 					rc->rc_error = SET_ERROR(ECKSUM);
2274 				}
2275 
2276 				ret = code;
2277 				goto done;
2278 			}
2279 
2280 			/*
2281 			 * Restore the original data.
2282 			 */
2283 			for (i = 0; i < n; i++) {
2284 				c = tgts[i];
2285 				rc = &rm->rm_col[c];
2286 				abd_copy_from_buf(rc->rc_abd, orig[i],
2287 				    rc->rc_size);
2288 			}
2289 
2290 			do {
2291 				/*
2292 				 * Find the next valid column after the current
2293 				 * position..
2294 				 */
2295 				for (next = tgts[current] + 1;
2296 				    next < rm->rm_cols &&
2297 				    rm->rm_col[next].rc_error != 0; next++)
2298 					continue;
2299 
2300 				ASSERT(next <= tgts[current + 1]);
2301 
2302 				/*
2303 				 * If that spot is available, we're done here.
2304 				 */
2305 				if (next != tgts[current + 1])
2306 					break;
2307 
2308 				/*
2309 				 * Otherwise, find the next valid column after
2310 				 * the previous position.
2311 				 */
2312 				for (c = tgts[current - 1] + 1;
2313 				    rm->rm_col[c].rc_error != 0; c++)
2314 					continue;
2315 
2316 				tgts[current] = c;
2317 				current++;
2318 
2319 			} while (current != n);
2320 		}
2321 	}
2322 	n--;
2323 done:
2324 	for (i = 0; i < n; i++) {
2325 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2326 	}
2327 
2328 	return (ret);
2329 }
2330 
2331 /*
2332  * Complete an IO operation on a RAIDZ VDev
2333  *
2334  * Outline:
2335  * - For write operations:
2336  *   1. Check for errors on the child IOs.
2337  *   2. Return, setting an error code if too few child VDevs were written
2338  *      to reconstruct the data later.  Note that partial writes are
2339  *      considered successful if they can be reconstructed at all.
2340  * - For read operations:
2341  *   1. Check for errors on the child IOs.
2342  *   2. If data errors occurred:
2343  *      a. Try to reassemble the data from the parity available.
2344  *      b. If we haven't yet read the parity drives, read them now.
2345  *      c. If all parity drives have been read but the data still doesn't
2346  *         reassemble with a correct checksum, then try combinatorial
2347  *         reconstruction.
2348  *      d. If that doesn't work, return an error.
2349  *   3. If there were unexpected errors or this is a resilver operation,
2350  *      rewrite the vdevs that had errors.
2351  */
2352 static void
vdev_raidz_io_done(zio_t * zio)2353 vdev_raidz_io_done(zio_t *zio)
2354 {
2355 	vdev_t *vd = zio->io_vd;
2356 	vdev_t *cvd;
2357 	raidz_map_t *rm = zio->io_vsd;
2358 	raidz_col_t *rc;
2359 	int unexpected_errors = 0;
2360 	int parity_errors = 0;
2361 	int parity_untried = 0;
2362 	int data_errors = 0;
2363 	int total_errors = 0;
2364 	int n, c;
2365 	int tgts[VDEV_RAIDZ_MAXPARITY];
2366 	int code;
2367 
2368 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2369 
2370 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2371 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2372 
2373 	for (c = 0; c < rm->rm_cols; c++) {
2374 		rc = &rm->rm_col[c];
2375 
2376 		if (rc->rc_error) {
2377 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2378 
2379 			if (c < rm->rm_firstdatacol)
2380 				parity_errors++;
2381 			else
2382 				data_errors++;
2383 
2384 			if (!rc->rc_skipped)
2385 				unexpected_errors++;
2386 
2387 			total_errors++;
2388 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2389 			parity_untried++;
2390 		}
2391 	}
2392 
2393 	if (zio->io_type == ZIO_TYPE_WRITE) {
2394 		/*
2395 		 * XXX -- for now, treat partial writes as a success.
2396 		 * (If we couldn't write enough columns to reconstruct
2397 		 * the data, the I/O failed.  Otherwise, good enough.)
2398 		 *
2399 		 * Now that we support write reallocation, it would be better
2400 		 * to treat partial failure as real failure unless there are
2401 		 * no non-degraded top-level vdevs left, and not update DTLs
2402 		 * if we intend to reallocate.
2403 		 */
2404 		/* XXPOLICY */
2405 		if (total_errors > rm->rm_firstdatacol)
2406 			zio->io_error = vdev_raidz_worst_error(rm);
2407 
2408 		return;
2409 	} else if (zio->io_type == ZIO_TYPE_FREE) {
2410 		return;
2411 	}
2412 
2413 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2414 	/*
2415 	 * There are three potential phases for a read:
2416 	 *	1. produce valid data from the columns read
2417 	 *	2. read all disks and try again
2418 	 *	3. perform combinatorial reconstruction
2419 	 *
2420 	 * Each phase is progressively both more expensive and less likely to
2421 	 * occur. If we encounter more errors than we can repair or all phases
2422 	 * fail, we have no choice but to return an error.
2423 	 */
2424 
2425 	/*
2426 	 * If the number of errors we saw was correctable -- less than or equal
2427 	 * to the number of parity disks read -- attempt to produce data that
2428 	 * has a valid checksum. Naturally, this case applies in the absence of
2429 	 * any errors.
2430 	 */
2431 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2432 		if (data_errors == 0) {
2433 			if (raidz_checksum_verify(zio) == 0) {
2434 				/*
2435 				 * If we read parity information (unnecessarily
2436 				 * as it happens since no reconstruction was
2437 				 * needed) regenerate and verify the parity.
2438 				 * We also regenerate parity when resilvering
2439 				 * so we can write it out to the failed device
2440 				 * later.
2441 				 */
2442 				if (parity_errors + parity_untried <
2443 				    rm->rm_firstdatacol ||
2444 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2445 					n = raidz_parity_verify(zio, rm);
2446 					unexpected_errors += n;
2447 					ASSERT(parity_errors + n <=
2448 					    rm->rm_firstdatacol);
2449 				}
2450 				goto done;
2451 			}
2452 		} else {
2453 			/*
2454 			 * We either attempt to read all the parity columns or
2455 			 * none of them. If we didn't try to read parity, we
2456 			 * wouldn't be here in the correctable case. There must
2457 			 * also have been fewer parity errors than parity
2458 			 * columns or, again, we wouldn't be in this code path.
2459 			 */
2460 			ASSERT(parity_untried == 0);
2461 			ASSERT(parity_errors < rm->rm_firstdatacol);
2462 
2463 			/*
2464 			 * Identify the data columns that reported an error.
2465 			 */
2466 			n = 0;
2467 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2468 				rc = &rm->rm_col[c];
2469 				if (rc->rc_error != 0) {
2470 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2471 					tgts[n++] = c;
2472 				}
2473 			}
2474 
2475 			ASSERT(rm->rm_firstdatacol >= n);
2476 
2477 			code = vdev_raidz_reconstruct(rm, tgts, n);
2478 
2479 			if (raidz_checksum_verify(zio) == 0) {
2480 				atomic_inc_64(&raidz_corrected[code]);
2481 
2482 				/*
2483 				 * If we read more parity disks than were used
2484 				 * for reconstruction, confirm that the other
2485 				 * parity disks produced correct data. This
2486 				 * routine is suboptimal in that it regenerates
2487 				 * the parity that we already used in addition
2488 				 * to the parity that we're attempting to
2489 				 * verify, but this should be a relatively
2490 				 * uncommon case, and can be optimized if it
2491 				 * becomes a problem. Note that we regenerate
2492 				 * parity when resilvering so we can write it
2493 				 * out to failed devices later.
2494 				 */
2495 				if (parity_errors < rm->rm_firstdatacol - n ||
2496 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2497 					n = raidz_parity_verify(zio, rm);
2498 					unexpected_errors += n;
2499 					ASSERT(parity_errors + n <=
2500 					    rm->rm_firstdatacol);
2501 				}
2502 
2503 				goto done;
2504 			}
2505 		}
2506 	}
2507 
2508 	/*
2509 	 * This isn't a typical situation -- either we got a read error or
2510 	 * a child silently returned bad data. Read every block so we can
2511 	 * try again with as much data and parity as we can track down. If
2512 	 * we've already been through once before, all children will be marked
2513 	 * as tried so we'll proceed to combinatorial reconstruction.
2514 	 */
2515 	unexpected_errors = 1;
2516 	rm->rm_missingdata = 0;
2517 	rm->rm_missingparity = 0;
2518 
2519 	for (c = 0; c < rm->rm_cols; c++) {
2520 		if (rm->rm_col[c].rc_tried)
2521 			continue;
2522 
2523 		zio_vdev_io_redone(zio);
2524 		do {
2525 			rc = &rm->rm_col[c];
2526 			if (rc->rc_tried)
2527 				continue;
2528 			zio_nowait(zio_vdev_child_io(zio, NULL,
2529 			    vd->vdev_child[rc->rc_devidx],
2530 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2531 			    zio->io_type, zio->io_priority, 0,
2532 			    vdev_raidz_child_done, rc));
2533 		} while (++c < rm->rm_cols);
2534 
2535 		return;
2536 	}
2537 
2538 	/*
2539 	 * At this point we've attempted to reconstruct the data given the
2540 	 * errors we detected, and we've attempted to read all columns. There
2541 	 * must, therefore, be one or more additional problems -- silent errors
2542 	 * resulting in invalid data rather than explicit I/O errors resulting
2543 	 * in absent data. We check if there is enough additional data to
2544 	 * possibly reconstruct the data and then perform combinatorial
2545 	 * reconstruction over all possible combinations. If that fails,
2546 	 * we're cooked.
2547 	 */
2548 	if (total_errors > rm->rm_firstdatacol) {
2549 		zio->io_error = vdev_raidz_worst_error(rm);
2550 
2551 	} else if (total_errors < rm->rm_firstdatacol &&
2552 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2553 		/*
2554 		 * If we didn't use all the available parity for the
2555 		 * combinatorial reconstruction, verify that the remaining
2556 		 * parity is correct.
2557 		 */
2558 		if (code != (1 << rm->rm_firstdatacol) - 1)
2559 			(void) raidz_parity_verify(zio, rm);
2560 	} else {
2561 		/*
2562 		 * We're here because either:
2563 		 *
2564 		 *	total_errors == rm_first_datacol, or
2565 		 *	vdev_raidz_combrec() failed
2566 		 *
2567 		 * In either case, there is enough bad data to prevent
2568 		 * reconstruction.
2569 		 *
2570 		 * Start checksum ereports for all children which haven't
2571 		 * failed, and the IO wasn't speculative.
2572 		 */
2573 		zio->io_error = SET_ERROR(ECKSUM);
2574 
2575 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2576 			for (c = 0; c < rm->rm_cols; c++) {
2577 				rc = &rm->rm_col[c];
2578 				if (rc->rc_error == 0) {
2579 					zio_bad_cksum_t zbc;
2580 					zbc.zbc_has_cksum = 0;
2581 					zbc.zbc_injected =
2582 					    rm->rm_ecksuminjected;
2583 
2584 					zfs_ereport_start_checksum(
2585 					    zio->io_spa,
2586 					    vd->vdev_child[rc->rc_devidx],
2587 					    zio, rc->rc_offset, rc->rc_size,
2588 					    (void *)(uintptr_t)c, &zbc);
2589 				}
2590 			}
2591 		}
2592 	}
2593 
2594 done:
2595 	zio_checksum_verified(zio);
2596 
2597 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2598 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2599 		/*
2600 		 * Use the good data we have in hand to repair damaged children.
2601 		 */
2602 		for (c = 0; c < rm->rm_cols; c++) {
2603 			rc = &rm->rm_col[c];
2604 			cvd = vd->vdev_child[rc->rc_devidx];
2605 
2606 			if (rc->rc_error == 0)
2607 				continue;
2608 
2609 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2610 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2611 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2612 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2613 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2614 		}
2615 	}
2616 }
2617 
2618 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2619 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2620 {
2621 	if (faulted > vd->vdev_nparity)
2622 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2623 		    VDEV_AUX_NO_REPLICAS);
2624 	else if (degraded + faulted != 0)
2625 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2626 	else
2627 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2628 }
2629 
2630 /*
2631  * Determine if any portion of the provided block resides on a child vdev
2632  * with a dirty DTL and therefore needs to be resilvered.  The function
2633  * assumes that at least one DTL is dirty which imples that full stripe
2634  * width blocks must be resilvered.
2635  */
2636 static boolean_t
vdev_raidz_need_resilver(vdev_t * vd,uint64_t offset,size_t psize)2637 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2638 {
2639 	uint64_t dcols = vd->vdev_children;
2640 	uint64_t nparity = vd->vdev_nparity;
2641 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2642 	/* The starting RAIDZ (parent) vdev sector of the block. */
2643 	uint64_t b = offset >> ashift;
2644 	/* The zio's size in units of the vdev's minimum sector size. */
2645 	uint64_t s = ((psize - 1) >> ashift) + 1;
2646 	/* The first column for this stripe. */
2647 	uint64_t f = b % dcols;
2648 
2649 	if (s + nparity >= dcols)
2650 		return (B_TRUE);
2651 
2652 	for (uint64_t c = 0; c < s + nparity; c++) {
2653 		uint64_t devidx = (f + c) % dcols;
2654 		vdev_t *cvd = vd->vdev_child[devidx];
2655 
2656 		/*
2657 		 * dsl_scan_need_resilver() already checked vd with
2658 		 * vdev_dtl_contains(). So here just check cvd with
2659 		 * vdev_dtl_empty(), cheaper and a good approximation.
2660 		 */
2661 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2662 			return (B_TRUE);
2663 	}
2664 
2665 	return (B_FALSE);
2666 }
2667 
2668 static void
vdev_raidz_xlate(vdev_t * cvd,const range_seg_t * in,range_seg_t * res)2669 vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
2670 {
2671 	vdev_t *raidvd = cvd->vdev_parent;
2672 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2673 
2674 	uint64_t width = raidvd->vdev_children;
2675 	uint64_t tgt_col = cvd->vdev_id;
2676 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2677 
2678 	/* make sure the offsets are block-aligned */
2679 	ASSERT0(in->rs_start % (1 << ashift));
2680 	ASSERT0(in->rs_end % (1 << ashift));
2681 	uint64_t b_start = in->rs_start >> ashift;
2682 	uint64_t b_end = in->rs_end >> ashift;
2683 
2684 	uint64_t start_row = 0;
2685 	if (b_start > tgt_col) /* avoid underflow */
2686 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2687 
2688 	uint64_t end_row = 0;
2689 	if (b_end > tgt_col)
2690 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2691 
2692 	res->rs_start = start_row << ashift;
2693 	res->rs_end = end_row << ashift;
2694 
2695 	ASSERT3U(res->rs_start, <=, in->rs_start);
2696 	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2697 }
2698 
2699 vdev_ops_t vdev_raidz_ops = {
2700 	vdev_raidz_open,
2701 	vdev_raidz_close,
2702 	vdev_raidz_asize,
2703 	vdev_raidz_io_start,
2704 	vdev_raidz_io_done,
2705 	vdev_raidz_state_change,
2706 	vdev_raidz_need_resilver,
2707 	NULL,
2708 	NULL,
2709 	NULL,
2710 	vdev_raidz_xlate,
2711 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2712 	B_FALSE			/* not a leaf vdev */
2713 };
2714