1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #ifdef illumos
33 #include <sys/vdev_disk.h>
34 #endif
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
37 #include <sys/zio.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/abd.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/fm/fs/zfs.h>
42 #include <sys/bio.h>
43
44 #ifdef ZFS_DEBUG
45 #include <sys/vdev_initialize.h> /* vdev_xlate testing */
46 #endif
47
48 /*
49 * Virtual device vector for RAID-Z.
50 *
51 * This vdev supports single, double, and triple parity. For single parity,
52 * we use a simple XOR of all the data columns. For double or triple parity,
53 * we use a special case of Reed-Solomon coding. This extends the
54 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
55 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
56 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
57 * former is also based. The latter is designed to provide higher performance
58 * for writes.
59 *
60 * Note that the Plank paper claimed to support arbitrary N+M, but was then
61 * amended six years later identifying a critical flaw that invalidates its
62 * claims. Nevertheless, the technique can be adapted to work for up to
63 * triple parity. For additional parity, the amendment "Note: Correction to
64 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
65 * is viable, but the additional complexity means that write performance will
66 * suffer.
67 *
68 * All of the methods above operate on a Galois field, defined over the
69 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
70 * can be expressed with a single byte. Briefly, the operations on the
71 * field are defined as follows:
72 *
73 * o addition (+) is represented by a bitwise XOR
74 * o subtraction (-) is therefore identical to addition: A + B = A - B
75 * o multiplication of A by 2 is defined by the following bitwise expression:
76 *
77 * (A * 2)_7 = A_6
78 * (A * 2)_6 = A_5
79 * (A * 2)_5 = A_4
80 * (A * 2)_4 = A_3 + A_7
81 * (A * 2)_3 = A_2 + A_7
82 * (A * 2)_2 = A_1 + A_7
83 * (A * 2)_1 = A_0
84 * (A * 2)_0 = A_7
85 *
86 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
87 * As an aside, this multiplication is derived from the error correcting
88 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
89 *
90 * Observe that any number in the field (except for 0) can be expressed as a
91 * power of 2 -- a generator for the field. We store a table of the powers of
92 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
93 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
94 * than field addition). The inverse of a field element A (A^-1) is therefore
95 * A ^ (255 - 1) = A^254.
96 *
97 * The up-to-three parity columns, P, Q, R over several data columns,
98 * D_0, ... D_n-1, can be expressed by field operations:
99 *
100 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
101 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
102 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
103 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
104 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
105 *
106 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
107 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
108 * independent coefficients. (There are no additional coefficients that have
109 * this property which is why the uncorrected Plank method breaks down.)
110 *
111 * See the reconstruction code below for how P, Q and R can used individually
112 * or in concert to recover missing data columns.
113 */
114
115 typedef struct raidz_col {
116 uint64_t rc_devidx; /* child device index for I/O */
117 uint64_t rc_offset; /* device offset */
118 uint64_t rc_size; /* I/O size */
119 abd_t *rc_abd; /* I/O data */
120 void *rc_gdata; /* used to store the "good" version */
121 int rc_error; /* I/O error for this device */
122 uint8_t rc_tried; /* Did we attempt this I/O column? */
123 uint8_t rc_skipped; /* Did we skip this I/O column? */
124 } raidz_col_t;
125
126 typedef struct raidz_map {
127 uint64_t rm_cols; /* Regular column count */
128 uint64_t rm_scols; /* Count including skipped columns */
129 uint64_t rm_bigcols; /* Number of oversized columns */
130 uint64_t rm_asize; /* Actual total I/O size */
131 uint64_t rm_missingdata; /* Count of missing data devices */
132 uint64_t rm_missingparity; /* Count of missing parity devices */
133 uint64_t rm_firstdatacol; /* First data column/parity count */
134 uint64_t rm_nskip; /* Skipped sectors for padding */
135 uint64_t rm_skipstart; /* Column index of padding start */
136 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
137 uintptr_t rm_reports; /* # of referencing checksum reports */
138 uint8_t rm_freed; /* map no longer has referencing ZIO */
139 uint8_t rm_ecksuminjected; /* checksum error was injected */
140 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
141 } raidz_map_t;
142
143 #define VDEV_RAIDZ_P 0
144 #define VDEV_RAIDZ_Q 1
145 #define VDEV_RAIDZ_R 2
146
147 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
148 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
149
150 /*
151 * We provide a mechanism to perform the field multiplication operation on a
152 * 64-bit value all at once rather than a byte at a time. This works by
153 * creating a mask from the top bit in each byte and using that to
154 * conditionally apply the XOR of 0x1d.
155 */
156 #define VDEV_RAIDZ_64MUL_2(x, mask) \
157 { \
158 (mask) = (x) & 0x8080808080808080ULL; \
159 (mask) = ((mask) << 1) - ((mask) >> 7); \
160 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
161 ((mask) & 0x1d1d1d1d1d1d1d1d); \
162 }
163
164 #define VDEV_RAIDZ_64MUL_4(x, mask) \
165 { \
166 VDEV_RAIDZ_64MUL_2((x), mask); \
167 VDEV_RAIDZ_64MUL_2((x), mask); \
168 }
169
170 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
171
172 /*
173 * Force reconstruction to use the general purpose method.
174 */
175 int vdev_raidz_default_to_general;
176
177 /* Powers of 2 in the Galois field defined above. */
178 static const uint8_t vdev_raidz_pow2[256] = {
179 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
180 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
181 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
182 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
183 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
184 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
185 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
186 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
187 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
188 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
189 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
190 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
191 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
192 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
193 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
194 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
195 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
196 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
197 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
198 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
199 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
200 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
201 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
202 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
203 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
204 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
205 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
206 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
207 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
208 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
209 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
210 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
211 };
212 /* Logs of 2 in the Galois field defined above. */
213 static const uint8_t vdev_raidz_log2[256] = {
214 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
215 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
216 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
217 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
218 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
219 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
220 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
221 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
222 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
223 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
224 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
225 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
226 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
227 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
228 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
229 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
230 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
231 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
232 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
233 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
234 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
235 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
236 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
237 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
238 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
239 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
240 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
241 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
242 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
243 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
244 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
245 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
246 };
247
248 static void vdev_raidz_generate_parity(raidz_map_t *rm);
249
250 /*
251 * Multiply a given number by 2 raised to the given power.
252 */
253 static uint8_t
vdev_raidz_exp2(uint_t a,int exp)254 vdev_raidz_exp2(uint_t a, int exp)
255 {
256 if (a == 0)
257 return (0);
258
259 ASSERT(exp >= 0);
260 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
261
262 exp += vdev_raidz_log2[a];
263 if (exp > 255)
264 exp -= 255;
265
266 return (vdev_raidz_pow2[exp]);
267 }
268
269 static void
vdev_raidz_map_free(raidz_map_t * rm)270 vdev_raidz_map_free(raidz_map_t *rm)
271 {
272 int c;
273 size_t size;
274
275 for (c = 0; c < rm->rm_firstdatacol; c++) {
276 if (rm->rm_col[c].rc_abd != NULL)
277 abd_free(rm->rm_col[c].rc_abd);
278
279 if (rm->rm_col[c].rc_gdata != NULL)
280 zio_buf_free(rm->rm_col[c].rc_gdata,
281 rm->rm_col[c].rc_size);
282 }
283
284 size = 0;
285 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
286 if (rm->rm_col[c].rc_abd != NULL)
287 abd_put(rm->rm_col[c].rc_abd);
288 size += rm->rm_col[c].rc_size;
289 }
290
291 if (rm->rm_abd_copy != NULL)
292 abd_free(rm->rm_abd_copy);
293
294 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
295 }
296
297 static void
vdev_raidz_map_free_vsd(zio_t * zio)298 vdev_raidz_map_free_vsd(zio_t *zio)
299 {
300 raidz_map_t *rm = zio->io_vsd;
301
302 ASSERT0(rm->rm_freed);
303 rm->rm_freed = 1;
304
305 if (rm->rm_reports == 0)
306 vdev_raidz_map_free(rm);
307 }
308
309 /*ARGSUSED*/
310 static void
vdev_raidz_cksum_free(void * arg,size_t ignored)311 vdev_raidz_cksum_free(void *arg, size_t ignored)
312 {
313 raidz_map_t *rm = arg;
314
315 ASSERT3U(rm->rm_reports, >, 0);
316
317 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
318 vdev_raidz_map_free(rm);
319 }
320
321 static void
vdev_raidz_cksum_finish(zio_cksum_report_t * zcr,const void * good_data)322 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
323 {
324 raidz_map_t *rm = zcr->zcr_cbdata;
325 size_t c = zcr->zcr_cbinfo;
326 size_t x;
327
328 const char *good = NULL;
329 char *bad;
330
331 if (good_data == NULL) {
332 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
333 return;
334 }
335
336 if (c < rm->rm_firstdatacol) {
337 /*
338 * The first time through, calculate the parity blocks for
339 * the good data (this relies on the fact that the good
340 * data never changes for a given logical ZIO)
341 */
342 if (rm->rm_col[0].rc_gdata == NULL) {
343 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
344 char *buf;
345 int offset;
346
347 /*
348 * Set up the rm_col[]s to generate the parity for
349 * good_data, first saving the parity bufs and
350 * replacing them with buffers to hold the result.
351 */
352 for (x = 0; x < rm->rm_firstdatacol; x++) {
353 bad_parity[x] = rm->rm_col[x].rc_abd;
354 rm->rm_col[x].rc_gdata =
355 zio_buf_alloc(rm->rm_col[x].rc_size);
356 rm->rm_col[x].rc_abd =
357 abd_get_from_buf(rm->rm_col[x].rc_gdata,
358 rm->rm_col[x].rc_size);
359 }
360
361 /* fill in the data columns from good_data */
362 buf = (char *)good_data;
363 for (; x < rm->rm_cols; x++) {
364 abd_put(rm->rm_col[x].rc_abd);
365 rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
366 rm->rm_col[x].rc_size);
367 buf += rm->rm_col[x].rc_size;
368 }
369
370 /*
371 * Construct the parity from the good data.
372 */
373 vdev_raidz_generate_parity(rm);
374
375 /* restore everything back to its original state */
376 for (x = 0; x < rm->rm_firstdatacol; x++) {
377 abd_put(rm->rm_col[x].rc_abd);
378 rm->rm_col[x].rc_abd = bad_parity[x];
379 }
380
381 offset = 0;
382 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
383 abd_put(rm->rm_col[x].rc_abd);
384 rm->rm_col[x].rc_abd = abd_get_offset(
385 rm->rm_abd_copy, offset);
386 offset += rm->rm_col[x].rc_size;
387 }
388 }
389
390 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
391 good = rm->rm_col[c].rc_gdata;
392 } else {
393 /* adjust good_data to point at the start of our column */
394 good = good_data;
395
396 for (x = rm->rm_firstdatacol; x < c; x++)
397 good += rm->rm_col[x].rc_size;
398 }
399
400 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
401 /* we drop the ereport if it ends up that the data was good */
402 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
403 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
404 }
405
406 /*
407 * Invoked indirectly by zfs_ereport_start_checksum(), called
408 * below when our read operation fails completely. The main point
409 * is to keep a copy of everything we read from disk, so that at
410 * vdev_raidz_cksum_finish() time we can compare it with the good data.
411 */
412 static void
vdev_raidz_cksum_report(zio_t * zio,zio_cksum_report_t * zcr,void * arg)413 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
414 {
415 size_t c = (size_t)(uintptr_t)arg;
416 size_t offset;
417
418 raidz_map_t *rm = zio->io_vsd;
419 size_t size;
420
421 /* set up the report and bump the refcount */
422 zcr->zcr_cbdata = rm;
423 zcr->zcr_cbinfo = c;
424 zcr->zcr_finish = vdev_raidz_cksum_finish;
425 zcr->zcr_free = vdev_raidz_cksum_free;
426
427 rm->rm_reports++;
428 ASSERT3U(rm->rm_reports, >, 0);
429
430 if (rm->rm_abd_copy != NULL)
431 return;
432
433 /*
434 * It's the first time we're called for this raidz_map_t, so we need
435 * to copy the data aside; there's no guarantee that our zio's buffer
436 * won't be re-used for something else.
437 *
438 * Our parity data is already in separate buffers, so there's no need
439 * to copy them.
440 */
441
442 size = 0;
443 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
444 size += rm->rm_col[c].rc_size;
445
446 rm->rm_abd_copy =
447 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
448
449 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
450 raidz_col_t *col = &rm->rm_col[c];
451 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
452
453 abd_copy(tmp, col->rc_abd, col->rc_size);
454 abd_put(col->rc_abd);
455 col->rc_abd = tmp;
456
457 offset += col->rc_size;
458 }
459 ASSERT3U(offset, ==, size);
460 }
461
462 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
463 vdev_raidz_map_free_vsd,
464 vdev_raidz_cksum_report
465 };
466
467 /*
468 * Divides the IO evenly across all child vdevs; usually, dcols is
469 * the number of children in the target vdev.
470 */
471 static raidz_map_t *
vdev_raidz_map_alloc(abd_t * abd,uint64_t size,uint64_t offset,boolean_t dofree,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)472 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
473 uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
474 {
475 raidz_map_t *rm;
476 /* The starting RAIDZ (parent) vdev sector of the block. */
477 uint64_t b = offset >> unit_shift;
478 /* The zio's size in units of the vdev's minimum sector size. */
479 uint64_t s = size >> unit_shift;
480 /* The first column for this stripe. */
481 uint64_t f = b % dcols;
482 /* The starting byte offset on each child vdev. */
483 uint64_t o = (b / dcols) << unit_shift;
484 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
485 uint64_t off = 0;
486
487 /*
488 * "Quotient": The number of data sectors for this stripe on all but
489 * the "big column" child vdevs that also contain "remainder" data.
490 */
491 q = s / (dcols - nparity);
492
493 /*
494 * "Remainder": The number of partial stripe data sectors in this I/O.
495 * This will add a sector to some, but not all, child vdevs.
496 */
497 r = s - q * (dcols - nparity);
498
499 /* The number of "big columns" - those which contain remainder data. */
500 bc = (r == 0 ? 0 : r + nparity);
501
502 /*
503 * The total number of data and parity sectors associated with
504 * this I/O.
505 */
506 tot = s + nparity * (q + (r == 0 ? 0 : 1));
507
508 /* acols: The columns that will be accessed. */
509 /* scols: The columns that will be accessed or skipped. */
510 if (q == 0) {
511 /* Our I/O request doesn't span all child vdevs. */
512 acols = bc;
513 scols = MIN(dcols, roundup(bc, nparity + 1));
514 } else {
515 acols = dcols;
516 scols = dcols;
517 }
518
519 ASSERT3U(acols, <=, scols);
520
521 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
522
523 rm->rm_cols = acols;
524 rm->rm_scols = scols;
525 rm->rm_bigcols = bc;
526 rm->rm_skipstart = bc;
527 rm->rm_missingdata = 0;
528 rm->rm_missingparity = 0;
529 rm->rm_firstdatacol = nparity;
530 rm->rm_abd_copy = NULL;
531 rm->rm_reports = 0;
532 rm->rm_freed = 0;
533 rm->rm_ecksuminjected = 0;
534
535 asize = 0;
536
537 for (c = 0; c < scols; c++) {
538 col = f + c;
539 coff = o;
540 if (col >= dcols) {
541 col -= dcols;
542 coff += 1ULL << unit_shift;
543 }
544 rm->rm_col[c].rc_devidx = col;
545 rm->rm_col[c].rc_offset = coff;
546 rm->rm_col[c].rc_abd = NULL;
547 rm->rm_col[c].rc_gdata = NULL;
548 rm->rm_col[c].rc_error = 0;
549 rm->rm_col[c].rc_tried = 0;
550 rm->rm_col[c].rc_skipped = 0;
551
552 if (c >= acols)
553 rm->rm_col[c].rc_size = 0;
554 else if (c < bc)
555 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
556 else
557 rm->rm_col[c].rc_size = q << unit_shift;
558
559 asize += rm->rm_col[c].rc_size;
560 }
561
562 ASSERT3U(asize, ==, tot << unit_shift);
563 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
564 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
565 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
566 ASSERT3U(rm->rm_nskip, <=, nparity);
567
568 if (!dofree) {
569 for (c = 0; c < rm->rm_firstdatacol; c++) {
570 rm->rm_col[c].rc_abd =
571 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
572 }
573
574 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
575 off = rm->rm_col[c].rc_size;
576
577 for (c = c + 1; c < acols; c++) {
578 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
579 off += rm->rm_col[c].rc_size;
580 }
581 }
582
583 /*
584 * If all data stored spans all columns, there's a danger that parity
585 * will always be on the same device and, since parity isn't read
586 * during normal operation, that that device's I/O bandwidth won't be
587 * used effectively. We therefore switch the parity every 1MB.
588 *
589 * ... at least that was, ostensibly, the theory. As a practical
590 * matter unless we juggle the parity between all devices evenly, we
591 * won't see any benefit. Further, occasional writes that aren't a
592 * multiple of the LCM of the number of children and the minimum
593 * stripe width are sufficient to avoid pessimal behavior.
594 * Unfortunately, this decision created an implicit on-disk format
595 * requirement that we need to support for all eternity, but only
596 * for single-parity RAID-Z.
597 *
598 * If we intend to skip a sector in the zeroth column for padding
599 * we must make sure to note this swap. We will never intend to
600 * skip the first column since at least one data and one parity
601 * column must appear in each row.
602 */
603 ASSERT(rm->rm_cols >= 2);
604 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
605
606 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
607 devidx = rm->rm_col[0].rc_devidx;
608 o = rm->rm_col[0].rc_offset;
609 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
610 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
611 rm->rm_col[1].rc_devidx = devidx;
612 rm->rm_col[1].rc_offset = o;
613
614 if (rm->rm_skipstart == 0)
615 rm->rm_skipstart = 1;
616 }
617
618 return (rm);
619 }
620
621 struct pqr_struct {
622 uint64_t *p;
623 uint64_t *q;
624 uint64_t *r;
625 };
626
627 static int
vdev_raidz_p_func(void * buf,size_t size,void * private)628 vdev_raidz_p_func(void *buf, size_t size, void *private)
629 {
630 struct pqr_struct *pqr = private;
631 const uint64_t *src = buf;
632 int i, cnt = size / sizeof (src[0]);
633
634 ASSERT(pqr->p && !pqr->q && !pqr->r);
635
636 for (i = 0; i < cnt; i++, src++, pqr->p++)
637 *pqr->p ^= *src;
638
639 return (0);
640 }
641
642 static int
vdev_raidz_pq_func(void * buf,size_t size,void * private)643 vdev_raidz_pq_func(void *buf, size_t size, void *private)
644 {
645 struct pqr_struct *pqr = private;
646 const uint64_t *src = buf;
647 uint64_t mask;
648 int i, cnt = size / sizeof (src[0]);
649
650 ASSERT(pqr->p && pqr->q && !pqr->r);
651
652 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
653 *pqr->p ^= *src;
654 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
655 *pqr->q ^= *src;
656 }
657
658 return (0);
659 }
660
661 static int
vdev_raidz_pqr_func(void * buf,size_t size,void * private)662 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
663 {
664 struct pqr_struct *pqr = private;
665 const uint64_t *src = buf;
666 uint64_t mask;
667 int i, cnt = size / sizeof (src[0]);
668
669 ASSERT(pqr->p && pqr->q && pqr->r);
670
671 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
672 *pqr->p ^= *src;
673 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
674 *pqr->q ^= *src;
675 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
676 *pqr->r ^= *src;
677 }
678
679 return (0);
680 }
681
682 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)683 vdev_raidz_generate_parity_p(raidz_map_t *rm)
684 {
685 uint64_t *p;
686 int c;
687 abd_t *src;
688
689 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
690 src = rm->rm_col[c].rc_abd;
691 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
692
693 if (c == rm->rm_firstdatacol) {
694 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
695 } else {
696 struct pqr_struct pqr = { p, NULL, NULL };
697 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
698 vdev_raidz_p_func, &pqr);
699 }
700 }
701 }
702
703 static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)704 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
705 {
706 uint64_t *p, *q, pcnt, ccnt, mask, i;
707 int c;
708 abd_t *src;
709
710 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
711 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
712 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
713
714 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
715 src = rm->rm_col[c].rc_abd;
716 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
717 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
718
719 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
720
721 if (c == rm->rm_firstdatacol) {
722 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
723 (void) memcpy(q, p, rm->rm_col[c].rc_size);
724 } else {
725 struct pqr_struct pqr = { p, q, NULL };
726 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
727 vdev_raidz_pq_func, &pqr);
728 }
729
730 if (c == rm->rm_firstdatacol) {
731 for (i = ccnt; i < pcnt; i++) {
732 p[i] = 0;
733 q[i] = 0;
734 }
735 } else {
736 /*
737 * Treat short columns as though they are full of 0s.
738 * Note that there's therefore nothing needed for P.
739 */
740 for (i = ccnt; i < pcnt; i++) {
741 VDEV_RAIDZ_64MUL_2(q[i], mask);
742 }
743 }
744 }
745 }
746
747 static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)748 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
749 {
750 uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
751 int c;
752 abd_t *src;
753
754 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
755 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
756 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
757 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
758 rm->rm_col[VDEV_RAIDZ_R].rc_size);
759
760 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
761 src = rm->rm_col[c].rc_abd;
762 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
763 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
764 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
765
766 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
767
768 if (c == rm->rm_firstdatacol) {
769 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
770 (void) memcpy(q, p, rm->rm_col[c].rc_size);
771 (void) memcpy(r, p, rm->rm_col[c].rc_size);
772 } else {
773 struct pqr_struct pqr = { p, q, r };
774 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
775 vdev_raidz_pqr_func, &pqr);
776 }
777
778 if (c == rm->rm_firstdatacol) {
779 for (i = ccnt; i < pcnt; i++) {
780 p[i] = 0;
781 q[i] = 0;
782 r[i] = 0;
783 }
784 } else {
785 /*
786 * Treat short columns as though they are full of 0s.
787 * Note that there's therefore nothing needed for P.
788 */
789 for (i = ccnt; i < pcnt; i++) {
790 VDEV_RAIDZ_64MUL_2(q[i], mask);
791 VDEV_RAIDZ_64MUL_4(r[i], mask);
792 }
793 }
794 }
795 }
796
797 /*
798 * Generate RAID parity in the first virtual columns according to the number of
799 * parity columns available.
800 */
801 static void
vdev_raidz_generate_parity(raidz_map_t * rm)802 vdev_raidz_generate_parity(raidz_map_t *rm)
803 {
804 switch (rm->rm_firstdatacol) {
805 case 1:
806 vdev_raidz_generate_parity_p(rm);
807 break;
808 case 2:
809 vdev_raidz_generate_parity_pq(rm);
810 break;
811 case 3:
812 vdev_raidz_generate_parity_pqr(rm);
813 break;
814 default:
815 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
816 }
817 }
818
819 /* ARGSUSED */
820 static int
vdev_raidz_reconst_p_func(void * dbuf,void * sbuf,size_t size,void * private)821 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
822 {
823 uint64_t *dst = dbuf;
824 uint64_t *src = sbuf;
825 int cnt = size / sizeof (src[0]);
826
827 for (int i = 0; i < cnt; i++) {
828 dst[i] ^= src[i];
829 }
830
831 return (0);
832 }
833
834 /* ARGSUSED */
835 static int
vdev_raidz_reconst_q_pre_func(void * dbuf,void * sbuf,size_t size,void * private)836 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
837 void *private)
838 {
839 uint64_t *dst = dbuf;
840 uint64_t *src = sbuf;
841 uint64_t mask;
842 int cnt = size / sizeof (dst[0]);
843
844 for (int i = 0; i < cnt; i++, dst++, src++) {
845 VDEV_RAIDZ_64MUL_2(*dst, mask);
846 *dst ^= *src;
847 }
848
849 return (0);
850 }
851
852 /* ARGSUSED */
853 static int
vdev_raidz_reconst_q_pre_tail_func(void * buf,size_t size,void * private)854 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
855 {
856 uint64_t *dst = buf;
857 uint64_t mask;
858 int cnt = size / sizeof (dst[0]);
859
860 for (int i = 0; i < cnt; i++, dst++) {
861 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
862 VDEV_RAIDZ_64MUL_2(*dst, mask);
863 }
864
865 return (0);
866 }
867
868 struct reconst_q_struct {
869 uint64_t *q;
870 int exp;
871 };
872
873 static int
vdev_raidz_reconst_q_post_func(void * buf,size_t size,void * private)874 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
875 {
876 struct reconst_q_struct *rq = private;
877 uint64_t *dst = buf;
878 int cnt = size / sizeof (dst[0]);
879
880 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
881 *dst ^= *rq->q;
882
883 int j;
884 uint8_t *b;
885 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
886 *b = vdev_raidz_exp2(*b, rq->exp);
887 }
888 }
889
890 return (0);
891 }
892
893 struct reconst_pq_struct {
894 uint8_t *p;
895 uint8_t *q;
896 uint8_t *pxy;
897 uint8_t *qxy;
898 int aexp;
899 int bexp;
900 };
901
902 static int
vdev_raidz_reconst_pq_func(void * xbuf,void * ybuf,size_t size,void * private)903 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
904 {
905 struct reconst_pq_struct *rpq = private;
906 uint8_t *xd = xbuf;
907 uint8_t *yd = ybuf;
908
909 for (int i = 0; i < size;
910 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
911 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
912 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
913 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
914 }
915
916 return (0);
917 }
918
919 static int
vdev_raidz_reconst_pq_tail_func(void * xbuf,size_t size,void * private)920 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
921 {
922 struct reconst_pq_struct *rpq = private;
923 uint8_t *xd = xbuf;
924
925 for (int i = 0; i < size;
926 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
927 /* same operation as vdev_raidz_reconst_pq_func() on xd */
928 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
929 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
930 }
931
932 return (0);
933 }
934
935 static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)936 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
937 {
938 int x = tgts[0];
939 int c;
940 abd_t *dst, *src;
941
942 ASSERT(ntgts == 1);
943 ASSERT(x >= rm->rm_firstdatacol);
944 ASSERT(x < rm->rm_cols);
945
946 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
947 ASSERT(rm->rm_col[x].rc_size > 0);
948
949 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
950 dst = rm->rm_col[x].rc_abd;
951
952 abd_copy(dst, src, rm->rm_col[x].rc_size);
953
954 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
955 uint64_t size = MIN(rm->rm_col[x].rc_size,
956 rm->rm_col[c].rc_size);
957
958 src = rm->rm_col[c].rc_abd;
959 dst = rm->rm_col[x].rc_abd;
960
961 if (c == x)
962 continue;
963
964 (void) abd_iterate_func2(dst, src, 0, 0, size,
965 vdev_raidz_reconst_p_func, NULL);
966 }
967
968 return (1 << VDEV_RAIDZ_P);
969 }
970
971 static int
vdev_raidz_reconstruct_q(raidz_map_t * rm,int * tgts,int ntgts)972 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
973 {
974 int x = tgts[0];
975 int c, exp;
976 abd_t *dst, *src;
977
978 ASSERT(ntgts == 1);
979
980 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
981
982 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
983 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
984 rm->rm_col[c].rc_size);
985
986 src = rm->rm_col[c].rc_abd;
987 dst = rm->rm_col[x].rc_abd;
988
989 if (c == rm->rm_firstdatacol) {
990 abd_copy(dst, src, size);
991 if (rm->rm_col[x].rc_size > size)
992 abd_zero_off(dst, size,
993 rm->rm_col[x].rc_size - size);
994 } else {
995 ASSERT3U(size, <=, rm->rm_col[x].rc_size);
996 (void) abd_iterate_func2(dst, src, 0, 0, size,
997 vdev_raidz_reconst_q_pre_func, NULL);
998 (void) abd_iterate_func(dst,
999 size, rm->rm_col[x].rc_size - size,
1000 vdev_raidz_reconst_q_pre_tail_func, NULL);
1001 }
1002 }
1003
1004 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1005 dst = rm->rm_col[x].rc_abd;
1006 exp = 255 - (rm->rm_cols - 1 - x);
1007
1008 struct reconst_q_struct rq = { abd_to_buf(src), exp };
1009 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1010 vdev_raidz_reconst_q_post_func, &rq);
1011
1012 return (1 << VDEV_RAIDZ_Q);
1013 }
1014
1015 static int
vdev_raidz_reconstruct_pq(raidz_map_t * rm,int * tgts,int ntgts)1016 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1017 {
1018 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1019 abd_t *pdata, *qdata;
1020 uint64_t xsize, ysize;
1021 int x = tgts[0];
1022 int y = tgts[1];
1023 abd_t *xd, *yd;
1024
1025 ASSERT(ntgts == 2);
1026 ASSERT(x < y);
1027 ASSERT(x >= rm->rm_firstdatacol);
1028 ASSERT(y < rm->rm_cols);
1029
1030 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1031
1032 /*
1033 * Move the parity data aside -- we're going to compute parity as
1034 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1035 * reuse the parity generation mechanism without trashing the actual
1036 * parity so we make those columns appear to be full of zeros by
1037 * setting their lengths to zero.
1038 */
1039 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1040 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1041 xsize = rm->rm_col[x].rc_size;
1042 ysize = rm->rm_col[y].rc_size;
1043
1044 rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1045 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1046 rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1047 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1048 rm->rm_col[x].rc_size = 0;
1049 rm->rm_col[y].rc_size = 0;
1050
1051 vdev_raidz_generate_parity_pq(rm);
1052
1053 rm->rm_col[x].rc_size = xsize;
1054 rm->rm_col[y].rc_size = ysize;
1055
1056 p = abd_to_buf(pdata);
1057 q = abd_to_buf(qdata);
1058 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1059 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1060 xd = rm->rm_col[x].rc_abd;
1061 yd = rm->rm_col[y].rc_abd;
1062
1063 /*
1064 * We now have:
1065 * Pxy = P + D_x + D_y
1066 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1067 *
1068 * We can then solve for D_x:
1069 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1070 * where
1071 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1072 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1073 *
1074 * With D_x in hand, we can easily solve for D_y:
1075 * D_y = P + Pxy + D_x
1076 */
1077
1078 a = vdev_raidz_pow2[255 + x - y];
1079 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1080 tmp = 255 - vdev_raidz_log2[a ^ 1];
1081
1082 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1083 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1084
1085 ASSERT3U(xsize, >=, ysize);
1086 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1087 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1088 vdev_raidz_reconst_pq_func, &rpq);
1089 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1090 vdev_raidz_reconst_pq_tail_func, &rpq);
1091
1092 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1093 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1094
1095 /*
1096 * Restore the saved parity data.
1097 */
1098 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1099 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1100
1101 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1102 }
1103
1104 /* BEGIN CSTYLED */
1105 /*
1106 * In the general case of reconstruction, we must solve the system of linear
1107 * equations defined by the coeffecients used to generate parity as well as
1108 * the contents of the data and parity disks. This can be expressed with
1109 * vectors for the original data (D) and the actual data (d) and parity (p)
1110 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1111 *
1112 * __ __ __ __
1113 * | | __ __ | p_0 |
1114 * | V | | D_0 | | p_m-1 |
1115 * | | x | : | = | d_0 |
1116 * | I | | D_n-1 | | : |
1117 * | | ~~ ~~ | d_n-1 |
1118 * ~~ ~~ ~~ ~~
1119 *
1120 * I is simply a square identity matrix of size n, and V is a vandermonde
1121 * matrix defined by the coeffecients we chose for the various parity columns
1122 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1123 * computation as well as linear separability.
1124 *
1125 * __ __ __ __
1126 * | 1 .. 1 1 1 | | p_0 |
1127 * | 2^n-1 .. 4 2 1 | __ __ | : |
1128 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1129 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1130 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1131 * | : : : : | | : | | d_2 |
1132 * | 0 .. 1 0 0 | | D_n-1 | | : |
1133 * | 0 .. 0 1 0 | ~~ ~~ | : |
1134 * | 0 .. 0 0 1 | | d_n-1 |
1135 * ~~ ~~ ~~ ~~
1136 *
1137 * Note that I, V, d, and p are known. To compute D, we must invert the
1138 * matrix and use the known data and parity values to reconstruct the unknown
1139 * data values. We begin by removing the rows in V|I and d|p that correspond
1140 * to failed or missing columns; we then make V|I square (n x n) and d|p
1141 * sized n by removing rows corresponding to unused parity from the bottom up
1142 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1143 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1144 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1145 * __ __
1146 * | 1 1 1 1 1 1 1 1 |
1147 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1148 * | 19 205 116 29 64 16 4 1 | / /
1149 * | 1 0 0 0 0 0 0 0 | / /
1150 * | 0 1 0 0 0 0 0 0 | <--' /
1151 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1152 * | 0 0 0 1 0 0 0 0 |
1153 * | 0 0 0 0 1 0 0 0 |
1154 * | 0 0 0 0 0 1 0 0 |
1155 * | 0 0 0 0 0 0 1 0 |
1156 * | 0 0 0 0 0 0 0 1 |
1157 * ~~ ~~
1158 * __ __
1159 * | 1 1 1 1 1 1 1 1 |
1160 * | 19 205 116 29 64 16 4 1 |
1161 * | 1 0 0 0 0 0 0 0 |
1162 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1163 * | 0 0 0 0 1 0 0 0 |
1164 * | 0 0 0 0 0 1 0 0 |
1165 * | 0 0 0 0 0 0 1 0 |
1166 * | 0 0 0 0 0 0 0 1 |
1167 * ~~ ~~
1168 *
1169 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1170 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1171 * matrix is not singular.
1172 * __ __
1173 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1174 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1175 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1176 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1177 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1178 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1179 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1180 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1181 * ~~ ~~
1182 * __ __
1183 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1184 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1185 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1186 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1187 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1188 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1189 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1190 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1191 * ~~ ~~
1192 * __ __
1193 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1194 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1195 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1196 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1197 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1198 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1199 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1200 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1201 * ~~ ~~
1202 * __ __
1203 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1204 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1205 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1206 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1207 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1208 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1209 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1210 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1211 * ~~ ~~
1212 * __ __
1213 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1214 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1215 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1216 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1217 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1218 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1219 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1220 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1221 * ~~ ~~
1222 * __ __
1223 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1224 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1225 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1226 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1227 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1228 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1229 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1230 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1231 * ~~ ~~
1232 * __ __
1233 * | 0 0 1 0 0 0 0 0 |
1234 * | 167 100 5 41 159 169 217 208 |
1235 * | 166 100 4 40 158 168 216 209 |
1236 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1237 * | 0 0 0 0 1 0 0 0 |
1238 * | 0 0 0 0 0 1 0 0 |
1239 * | 0 0 0 0 0 0 1 0 |
1240 * | 0 0 0 0 0 0 0 1 |
1241 * ~~ ~~
1242 *
1243 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1244 * of the missing data.
1245 *
1246 * As is apparent from the example above, the only non-trivial rows in the
1247 * inverse matrix correspond to the data disks that we're trying to
1248 * reconstruct. Indeed, those are the only rows we need as the others would
1249 * only be useful for reconstructing data known or assumed to be valid. For
1250 * that reason, we only build the coefficients in the rows that correspond to
1251 * targeted columns.
1252 */
1253 /* END CSTYLED */
1254
1255 static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)1256 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1257 uint8_t **rows)
1258 {
1259 int i, j;
1260 int pow;
1261
1262 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1263
1264 /*
1265 * Fill in the missing rows of interest.
1266 */
1267 for (i = 0; i < nmap; i++) {
1268 ASSERT3S(0, <=, map[i]);
1269 ASSERT3S(map[i], <=, 2);
1270
1271 pow = map[i] * n;
1272 if (pow > 255)
1273 pow -= 255;
1274 ASSERT(pow <= 255);
1275
1276 for (j = 0; j < n; j++) {
1277 pow -= map[i];
1278 if (pow < 0)
1279 pow += 255;
1280 rows[i][j] = vdev_raidz_pow2[pow];
1281 }
1282 }
1283 }
1284
1285 static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)1286 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1287 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1288 {
1289 int i, j, ii, jj;
1290 uint8_t log;
1291
1292 /*
1293 * Assert that the first nmissing entries from the array of used
1294 * columns correspond to parity columns and that subsequent entries
1295 * correspond to data columns.
1296 */
1297 for (i = 0; i < nmissing; i++) {
1298 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1299 }
1300 for (; i < n; i++) {
1301 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1302 }
1303
1304 /*
1305 * First initialize the storage where we'll compute the inverse rows.
1306 */
1307 for (i = 0; i < nmissing; i++) {
1308 for (j = 0; j < n; j++) {
1309 invrows[i][j] = (i == j) ? 1 : 0;
1310 }
1311 }
1312
1313 /*
1314 * Subtract all trivial rows from the rows of consequence.
1315 */
1316 for (i = 0; i < nmissing; i++) {
1317 for (j = nmissing; j < n; j++) {
1318 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1319 jj = used[j] - rm->rm_firstdatacol;
1320 ASSERT3S(jj, <, n);
1321 invrows[i][j] = rows[i][jj];
1322 rows[i][jj] = 0;
1323 }
1324 }
1325
1326 /*
1327 * For each of the rows of interest, we must normalize it and subtract
1328 * a multiple of it from the other rows.
1329 */
1330 for (i = 0; i < nmissing; i++) {
1331 for (j = 0; j < missing[i]; j++) {
1332 ASSERT0(rows[i][j]);
1333 }
1334 ASSERT3U(rows[i][missing[i]], !=, 0);
1335
1336 /*
1337 * Compute the inverse of the first element and multiply each
1338 * element in the row by that value.
1339 */
1340 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1341
1342 for (j = 0; j < n; j++) {
1343 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1344 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1345 }
1346
1347 for (ii = 0; ii < nmissing; ii++) {
1348 if (i == ii)
1349 continue;
1350
1351 ASSERT3U(rows[ii][missing[i]], !=, 0);
1352
1353 log = vdev_raidz_log2[rows[ii][missing[i]]];
1354
1355 for (j = 0; j < n; j++) {
1356 rows[ii][j] ^=
1357 vdev_raidz_exp2(rows[i][j], log);
1358 invrows[ii][j] ^=
1359 vdev_raidz_exp2(invrows[i][j], log);
1360 }
1361 }
1362 }
1363
1364 /*
1365 * Verify that the data that is left in the rows are properly part of
1366 * an identity matrix.
1367 */
1368 for (i = 0; i < nmissing; i++) {
1369 for (j = 0; j < n; j++) {
1370 if (j == missing[i]) {
1371 ASSERT3U(rows[i][j], ==, 1);
1372 } else {
1373 ASSERT0(rows[i][j]);
1374 }
1375 }
1376 }
1377 }
1378
1379 static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1380 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1381 int *missing, uint8_t **invrows, const uint8_t *used)
1382 {
1383 int i, j, x, cc, c;
1384 uint8_t *src;
1385 uint64_t ccount;
1386 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1387 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1388 uint8_t log = 0;
1389 uint8_t val;
1390 int ll;
1391 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1392 uint8_t *p, *pp;
1393 size_t psize;
1394
1395 psize = sizeof (invlog[0][0]) * n * nmissing;
1396 p = kmem_alloc(psize, KM_SLEEP);
1397
1398 for (pp = p, i = 0; i < nmissing; i++) {
1399 invlog[i] = pp;
1400 pp += n;
1401 }
1402
1403 for (i = 0; i < nmissing; i++) {
1404 for (j = 0; j < n; j++) {
1405 ASSERT3U(invrows[i][j], !=, 0);
1406 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1407 }
1408 }
1409
1410 for (i = 0; i < n; i++) {
1411 c = used[i];
1412 ASSERT3U(c, <, rm->rm_cols);
1413
1414 src = abd_to_buf(rm->rm_col[c].rc_abd);
1415 ccount = rm->rm_col[c].rc_size;
1416 for (j = 0; j < nmissing; j++) {
1417 cc = missing[j] + rm->rm_firstdatacol;
1418 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1419 ASSERT3U(cc, <, rm->rm_cols);
1420 ASSERT3U(cc, !=, c);
1421
1422 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1423 dcount[j] = rm->rm_col[cc].rc_size;
1424 }
1425
1426 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1427
1428 for (x = 0; x < ccount; x++, src++) {
1429 if (*src != 0)
1430 log = vdev_raidz_log2[*src];
1431
1432 for (cc = 0; cc < nmissing; cc++) {
1433 if (x >= dcount[cc])
1434 continue;
1435
1436 if (*src == 0) {
1437 val = 0;
1438 } else {
1439 if ((ll = log + invlog[cc][i]) >= 255)
1440 ll -= 255;
1441 val = vdev_raidz_pow2[ll];
1442 }
1443
1444 if (i == 0)
1445 dst[cc][x] = val;
1446 else
1447 dst[cc][x] ^= val;
1448 }
1449 }
1450 }
1451
1452 kmem_free(p, psize);
1453 }
1454
1455 static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1456 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1457 {
1458 int n, i, c, t, tt;
1459 int nmissing_rows;
1460 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1461 int parity_map[VDEV_RAIDZ_MAXPARITY];
1462
1463 uint8_t *p, *pp;
1464 size_t psize;
1465
1466 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1467 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1468 uint8_t *used;
1469
1470 abd_t **bufs = NULL;
1471
1472 int code = 0;
1473
1474 /*
1475 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1476 * temporary linear ABDs.
1477 */
1478 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1479 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1480
1481 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1482 raidz_col_t *col = &rm->rm_col[c];
1483
1484 bufs[c] = col->rc_abd;
1485 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1486 abd_copy(col->rc_abd, bufs[c], col->rc_size);
1487 }
1488 }
1489
1490 n = rm->rm_cols - rm->rm_firstdatacol;
1491
1492 /*
1493 * Figure out which data columns are missing.
1494 */
1495 nmissing_rows = 0;
1496 for (t = 0; t < ntgts; t++) {
1497 if (tgts[t] >= rm->rm_firstdatacol) {
1498 missing_rows[nmissing_rows++] =
1499 tgts[t] - rm->rm_firstdatacol;
1500 }
1501 }
1502
1503 /*
1504 * Figure out which parity columns to use to help generate the missing
1505 * data columns.
1506 */
1507 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1508 ASSERT(tt < ntgts);
1509 ASSERT(c < rm->rm_firstdatacol);
1510
1511 /*
1512 * Skip any targeted parity columns.
1513 */
1514 if (c == tgts[tt]) {
1515 tt++;
1516 continue;
1517 }
1518
1519 code |= 1 << c;
1520
1521 parity_map[i] = c;
1522 i++;
1523 }
1524
1525 ASSERT(code != 0);
1526 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1527
1528 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1529 nmissing_rows * n + sizeof (used[0]) * n;
1530 p = kmem_alloc(psize, KM_SLEEP);
1531
1532 for (pp = p, i = 0; i < nmissing_rows; i++) {
1533 rows[i] = pp;
1534 pp += n;
1535 invrows[i] = pp;
1536 pp += n;
1537 }
1538 used = pp;
1539
1540 for (i = 0; i < nmissing_rows; i++) {
1541 used[i] = parity_map[i];
1542 }
1543
1544 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1545 if (tt < nmissing_rows &&
1546 c == missing_rows[tt] + rm->rm_firstdatacol) {
1547 tt++;
1548 continue;
1549 }
1550
1551 ASSERT3S(i, <, n);
1552 used[i] = c;
1553 i++;
1554 }
1555
1556 /*
1557 * Initialize the interesting rows of the matrix.
1558 */
1559 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1560
1561 /*
1562 * Invert the matrix.
1563 */
1564 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1565 invrows, used);
1566
1567 /*
1568 * Reconstruct the missing data using the generated matrix.
1569 */
1570 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1571 invrows, used);
1572
1573 kmem_free(p, psize);
1574
1575 /*
1576 * copy back from temporary linear abds and free them
1577 */
1578 if (bufs) {
1579 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1580 raidz_col_t *col = &rm->rm_col[c];
1581
1582 abd_copy(bufs[c], col->rc_abd, col->rc_size);
1583 abd_free(col->rc_abd);
1584 col->rc_abd = bufs[c];
1585 }
1586 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1587 }
1588
1589 return (code);
1590 }
1591
1592 static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1593 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1594 {
1595 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1596 int ntgts;
1597 int i, c;
1598 int code;
1599 int nbadparity, nbaddata;
1600 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1601
1602 /*
1603 * The tgts list must already be sorted.
1604 */
1605 for (i = 1; i < nt; i++) {
1606 ASSERT(t[i] > t[i - 1]);
1607 }
1608
1609 nbadparity = rm->rm_firstdatacol;
1610 nbaddata = rm->rm_cols - nbadparity;
1611 ntgts = 0;
1612 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1613 if (c < rm->rm_firstdatacol)
1614 parity_valid[c] = B_FALSE;
1615
1616 if (i < nt && c == t[i]) {
1617 tgts[ntgts++] = c;
1618 i++;
1619 } else if (rm->rm_col[c].rc_error != 0) {
1620 tgts[ntgts++] = c;
1621 } else if (c >= rm->rm_firstdatacol) {
1622 nbaddata--;
1623 } else {
1624 parity_valid[c] = B_TRUE;
1625 nbadparity--;
1626 }
1627 }
1628
1629 ASSERT(ntgts >= nt);
1630 ASSERT(nbaddata >= 0);
1631 ASSERT(nbaddata + nbadparity == ntgts);
1632
1633 dt = &tgts[nbadparity];
1634
1635 /*
1636 * See if we can use any of our optimized reconstruction routines.
1637 */
1638 if (!vdev_raidz_default_to_general) {
1639 switch (nbaddata) {
1640 case 1:
1641 if (parity_valid[VDEV_RAIDZ_P])
1642 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1643
1644 ASSERT(rm->rm_firstdatacol > 1);
1645
1646 if (parity_valid[VDEV_RAIDZ_Q])
1647 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1648
1649 ASSERT(rm->rm_firstdatacol > 2);
1650 break;
1651
1652 case 2:
1653 ASSERT(rm->rm_firstdatacol > 1);
1654
1655 if (parity_valid[VDEV_RAIDZ_P] &&
1656 parity_valid[VDEV_RAIDZ_Q])
1657 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1658
1659 ASSERT(rm->rm_firstdatacol > 2);
1660
1661 break;
1662 }
1663 }
1664
1665 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1666 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1667 ASSERT(code > 0);
1668 return (code);
1669 }
1670
1671 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)1672 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1673 uint64_t *logical_ashift, uint64_t *physical_ashift)
1674 {
1675 vdev_t *cvd;
1676 uint64_t nparity = vd->vdev_nparity;
1677 int c;
1678 int lasterror = 0;
1679 int numerrors = 0;
1680
1681 ASSERT(nparity > 0);
1682
1683 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1684 vd->vdev_children < nparity + 1) {
1685 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1686 return (SET_ERROR(EINVAL));
1687 }
1688
1689 vdev_open_children(vd);
1690
1691 for (c = 0; c < vd->vdev_children; c++) {
1692 cvd = vd->vdev_child[c];
1693
1694 if (cvd->vdev_open_error != 0) {
1695 lasterror = cvd->vdev_open_error;
1696 numerrors++;
1697 continue;
1698 }
1699
1700 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1701 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1702 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1703 *physical_ashift = MAX(*physical_ashift,
1704 cvd->vdev_physical_ashift);
1705 }
1706
1707 *asize *= vd->vdev_children;
1708 *max_asize *= vd->vdev_children;
1709
1710 if (numerrors > nparity) {
1711 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1712 return (lasterror);
1713 }
1714
1715 return (0);
1716 }
1717
1718 static void
vdev_raidz_close(vdev_t * vd)1719 vdev_raidz_close(vdev_t *vd)
1720 {
1721 int c;
1722
1723 for (c = 0; c < vd->vdev_children; c++)
1724 vdev_close(vd->vdev_child[c]);
1725 }
1726
1727 #ifdef illumos
1728 /*
1729 * Handle a read or write I/O to a RAID-Z dump device.
1730 *
1731 * The dump device is in a unique situation compared to other ZFS datasets:
1732 * writing to this device should be as simple and fast as possible. In
1733 * addition, durability matters much less since the dump will be extracted
1734 * once the machine reboots. For that reason, this function eschews parity for
1735 * performance and simplicity. The dump device uses the checksum setting
1736 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1737 * dataset.
1738 *
1739 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1740 * 128 KB will not fill an entire block; in addition, they may not be properly
1741 * aligned. In that case, this function uses the preallocated 128 KB block and
1742 * omits reading or writing any "empty" portions of that block, as opposed to
1743 * allocating a fresh appropriately-sized block.
1744 *
1745 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1746 *
1747 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1748 *
1749 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1750 * allocated which spans all five child vdevs. 8 KB of data would be written to
1751 * each of four vdevs, with the fifth containing the parity bits.
1752 *
1753 * parity data data data data
1754 * | PP | XX | XX | XX | XX |
1755 * ^ ^ ^ ^ ^
1756 * | | | | |
1757 * 8 KB parity ------8 KB data blocks------
1758 *
1759 * However, when writing to the dump device, the behavior is different:
1760 *
1761 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1762 *
1763 * Unlike the normal RAID-Z case in which the block is allocated based on the
1764 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1765 * I/O size is less than 128 KB, only the actual portions of data are written.
1766 * In this example the data is written to the third data vdev since that vdev
1767 * contains the offset [64 KB, 96 KB).
1768 *
1769 * parity data data data data
1770 * | | | | XX | |
1771 * ^
1772 * |
1773 * 32 KB data block
1774 *
1775 * As a result, an individual I/O may not span all child vdevs; moreover, a
1776 * small I/O may only operate on a single child vdev.
1777 *
1778 * Note that since there are no parity bits calculated or written, this format
1779 * remains the same no matter how many parity bits are used in a normal RAID-Z
1780 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1781 * would look like:
1782 *
1783 * parity parity parity data data data data
1784 * | | | | | | XX | |
1785 * ^
1786 * |
1787 * 32 KB data block
1788 */
1789 int
vdev_raidz_physio(vdev_t * vd,caddr_t data,size_t size,uint64_t offset,uint64_t origoffset,boolean_t doread,boolean_t isdump)1790 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1791 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1792 {
1793 vdev_t *tvd = vd->vdev_top;
1794 vdev_t *cvd;
1795 raidz_map_t *rm;
1796 raidz_col_t *rc;
1797 int c, err = 0;
1798
1799 uint64_t start, end, colstart, colend;
1800 uint64_t coloffset, colsize, colskip;
1801
1802 int flags = doread ? BIO_READ : BIO_WRITE;
1803
1804 #ifdef _KERNEL
1805
1806 /*
1807 * Don't write past the end of the block
1808 */
1809 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1810
1811 start = offset;
1812 end = start + size;
1813
1814 /*
1815 * Allocate a RAID-Z map for this block. Note that this block starts
1816 * from the "original" offset, this is, the offset of the extent which
1817 * contains the requisite offset of the data being read or written.
1818 *
1819 * Even if this I/O operation doesn't span the full block size, let's
1820 * treat the on-disk format as if the only blocks are the complete 128
1821 * KB size.
1822 */
1823 abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1824 SPA_OLD_MAXBLOCKSIZE);
1825 rm = vdev_raidz_map_alloc(abd,
1826 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1827 vd->vdev_children, vd->vdev_nparity);
1828
1829 coloffset = origoffset;
1830
1831 for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1832 c++, coloffset += rc->rc_size) {
1833 rc = &rm->rm_col[c];
1834 cvd = vd->vdev_child[rc->rc_devidx];
1835
1836 /*
1837 * Find the start and end of this column in the RAID-Z map,
1838 * keeping in mind that the stated size and offset of the
1839 * operation may not fill the entire column for this vdev.
1840 *
1841 * If any portion of the data spans this column, issue the
1842 * appropriate operation to the vdev.
1843 */
1844 if (coloffset + rc->rc_size <= start)
1845 continue;
1846 if (coloffset >= end)
1847 continue;
1848
1849 colstart = MAX(coloffset, start);
1850 colend = MIN(end, coloffset + rc->rc_size);
1851 colsize = colend - colstart;
1852 colskip = colstart - coloffset;
1853
1854 VERIFY3U(colsize, <=, rc->rc_size);
1855 VERIFY3U(colskip, <=, rc->rc_size);
1856
1857 /*
1858 * Note that the child vdev will have a vdev label at the start
1859 * of its range of offsets, hence the need for
1860 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1861 * example of why this calculation is needed.
1862 */
1863 if ((err = vdev_disk_physio(cvd,
1864 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1865 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1866 flags, isdump)) != 0)
1867 break;
1868 }
1869
1870 vdev_raidz_map_free(rm);
1871 abd_put(abd);
1872 #endif /* KERNEL */
1873
1874 return (err);
1875 }
1876 #endif
1877
1878 static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1879 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1880 {
1881 uint64_t asize;
1882 uint64_t ashift = vd->vdev_top->vdev_ashift;
1883 uint64_t cols = vd->vdev_children;
1884 uint64_t nparity = vd->vdev_nparity;
1885
1886 asize = ((psize - 1) >> ashift) + 1;
1887 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1888 asize = roundup(asize, nparity + 1) << ashift;
1889
1890 return (asize);
1891 }
1892
1893 static void
vdev_raidz_child_done(zio_t * zio)1894 vdev_raidz_child_done(zio_t *zio)
1895 {
1896 raidz_col_t *rc = zio->io_private;
1897
1898 rc->rc_error = zio->io_error;
1899 rc->rc_tried = 1;
1900 rc->rc_skipped = 0;
1901 }
1902
1903 static void
vdev_raidz_io_verify(zio_t * zio,raidz_map_t * rm,int col)1904 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1905 {
1906 #ifdef ZFS_DEBUG
1907 vdev_t *vd = zio->io_vd;
1908 vdev_t *tvd = vd->vdev_top;
1909
1910 range_seg_t logical_rs, physical_rs;
1911 logical_rs.rs_start = zio->io_offset;
1912 logical_rs.rs_end = logical_rs.rs_start +
1913 vdev_raidz_asize(zio->io_vd, zio->io_size);
1914
1915 raidz_col_t *rc = &rm->rm_col[col];
1916 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1917
1918 vdev_xlate(cvd, &logical_rs, &physical_rs);
1919 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1920 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1921 /*
1922 * It would be nice to assert that rs_end is equal
1923 * to rc_offset + rc_size but there might be an
1924 * optional I/O at the end that is not accounted in
1925 * rc_size.
1926 */
1927 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1928 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1929 rc->rc_size + (1 << tvd->vdev_ashift));
1930 } else {
1931 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1932 }
1933 #endif
1934 }
1935
1936 /*
1937 * Start an IO operation on a RAIDZ VDev
1938 *
1939 * Outline:
1940 * - For write operations:
1941 * 1. Generate the parity data
1942 * 2. Create child zio write operations to each column's vdev, for both
1943 * data and parity.
1944 * 3. If the column skips any sectors for padding, create optional dummy
1945 * write zio children for those areas to improve aggregation continuity.
1946 * - For read operations:
1947 * 1. Create child zio read operations to each data column's vdev to read
1948 * the range of data required for zio.
1949 * 2. If this is a scrub or resilver operation, or if any of the data
1950 * vdevs have had errors, then create zio read operations to the parity
1951 * columns' VDevs as well.
1952 */
1953 static void
vdev_raidz_io_start(zio_t * zio)1954 vdev_raidz_io_start(zio_t *zio)
1955 {
1956 vdev_t *vd = zio->io_vd;
1957 vdev_t *tvd = vd->vdev_top;
1958 vdev_t *cvd;
1959 raidz_map_t *rm;
1960 raidz_col_t *rc;
1961 int c, i;
1962
1963 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1964 zio->io_type == ZIO_TYPE_FREE,
1965 tvd->vdev_ashift, vd->vdev_children,
1966 vd->vdev_nparity);
1967
1968 zio->io_vsd = rm;
1969 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1970
1971 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1972
1973 if (zio->io_type == ZIO_TYPE_FREE) {
1974 for (c = 0; c < rm->rm_cols; c++) {
1975 rc = &rm->rm_col[c];
1976 cvd = vd->vdev_child[rc->rc_devidx];
1977 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1978 rc->rc_offset, rc->rc_abd, rc->rc_size,
1979 zio->io_type, zio->io_priority, 0,
1980 vdev_raidz_child_done, rc));
1981 }
1982
1983 zio_execute(zio);
1984 return;
1985 }
1986
1987 if (zio->io_type == ZIO_TYPE_WRITE) {
1988 vdev_raidz_generate_parity(rm);
1989
1990 for (c = 0; c < rm->rm_cols; c++) {
1991 rc = &rm->rm_col[c];
1992 cvd = vd->vdev_child[rc->rc_devidx];
1993
1994 /*
1995 * Verify physical to logical translation.
1996 */
1997 vdev_raidz_io_verify(zio, rm, c);
1998
1999 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2000 rc->rc_offset, rc->rc_abd, rc->rc_size,
2001 zio->io_type, zio->io_priority, 0,
2002 vdev_raidz_child_done, rc));
2003 }
2004
2005 /*
2006 * Generate optional I/Os for any skipped sectors to improve
2007 * aggregation contiguity.
2008 */
2009 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
2010 ASSERT(c <= rm->rm_scols);
2011 if (c == rm->rm_scols)
2012 c = 0;
2013 rc = &rm->rm_col[c];
2014 cvd = vd->vdev_child[rc->rc_devidx];
2015 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2016 rc->rc_offset + rc->rc_size, NULL,
2017 1 << tvd->vdev_ashift,
2018 zio->io_type, zio->io_priority,
2019 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2020 }
2021
2022 zio_execute(zio);
2023 return;
2024 }
2025
2026 ASSERT(zio->io_type == ZIO_TYPE_READ);
2027
2028 /*
2029 * Iterate over the columns in reverse order so that we hit the parity
2030 * last -- any errors along the way will force us to read the parity.
2031 */
2032 for (c = rm->rm_cols - 1; c >= 0; c--) {
2033 rc = &rm->rm_col[c];
2034 cvd = vd->vdev_child[rc->rc_devidx];
2035 if (!vdev_readable(cvd)) {
2036 if (c >= rm->rm_firstdatacol)
2037 rm->rm_missingdata++;
2038 else
2039 rm->rm_missingparity++;
2040 rc->rc_error = SET_ERROR(ENXIO);
2041 rc->rc_tried = 1; /* don't even try */
2042 rc->rc_skipped = 1;
2043 continue;
2044 }
2045 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2046 if (c >= rm->rm_firstdatacol)
2047 rm->rm_missingdata++;
2048 else
2049 rm->rm_missingparity++;
2050 rc->rc_error = SET_ERROR(ESTALE);
2051 rc->rc_skipped = 1;
2052 continue;
2053 }
2054 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2055 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2056 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2057 rc->rc_offset, rc->rc_abd, rc->rc_size,
2058 zio->io_type, zio->io_priority, 0,
2059 vdev_raidz_child_done, rc));
2060 }
2061 }
2062
2063 zio_execute(zio);
2064 }
2065
2066
2067 /*
2068 * Report a checksum error for a child of a RAID-Z device.
2069 */
2070 static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)2071 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2072 {
2073 void *buf;
2074 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2075
2076 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2077 zio_bad_cksum_t zbc;
2078 raidz_map_t *rm = zio->io_vsd;
2079
2080 mutex_enter(&vd->vdev_stat_lock);
2081 vd->vdev_stat.vs_checksum_errors++;
2082 mutex_exit(&vd->vdev_stat_lock);
2083
2084 zbc.zbc_has_cksum = 0;
2085 zbc.zbc_injected = rm->rm_ecksuminjected;
2086
2087 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2088 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2089 rc->rc_offset, rc->rc_size, buf, bad_data,
2090 &zbc);
2091 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2092 }
2093 }
2094
2095 /*
2096 * We keep track of whether or not there were any injected errors, so that
2097 * any ereports we generate can note it.
2098 */
2099 static int
raidz_checksum_verify(zio_t * zio)2100 raidz_checksum_verify(zio_t *zio)
2101 {
2102 zio_bad_cksum_t zbc;
2103 raidz_map_t *rm = zio->io_vsd;
2104
2105 int ret = zio_checksum_error(zio, &zbc);
2106 if (ret != 0 && zbc.zbc_injected != 0)
2107 rm->rm_ecksuminjected = 1;
2108
2109 return (ret);
2110 }
2111
2112 /*
2113 * Generate the parity from the data columns. If we tried and were able to
2114 * read the parity without error, verify that the generated parity matches the
2115 * data we read. If it doesn't, we fire off a checksum error. Return the
2116 * number such failures.
2117 */
2118 static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)2119 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2120 {
2121 void *orig[VDEV_RAIDZ_MAXPARITY];
2122 int c, ret = 0;
2123 raidz_col_t *rc;
2124
2125 blkptr_t *bp = zio->io_bp;
2126 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2127 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2128
2129 if (checksum == ZIO_CHECKSUM_NOPARITY)
2130 return (ret);
2131
2132 for (c = 0; c < rm->rm_firstdatacol; c++) {
2133 rc = &rm->rm_col[c];
2134 if (!rc->rc_tried || rc->rc_error != 0)
2135 continue;
2136 orig[c] = zio_buf_alloc(rc->rc_size);
2137 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2138 }
2139
2140 vdev_raidz_generate_parity(rm);
2141
2142 for (c = 0; c < rm->rm_firstdatacol; c++) {
2143 rc = &rm->rm_col[c];
2144 if (!rc->rc_tried || rc->rc_error != 0)
2145 continue;
2146 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2147 raidz_checksum_error(zio, rc, orig[c]);
2148 rc->rc_error = SET_ERROR(ECKSUM);
2149 ret++;
2150 }
2151 zio_buf_free(orig[c], rc->rc_size);
2152 }
2153
2154 return (ret);
2155 }
2156
2157 /*
2158 * Keep statistics on all the ways that we used parity to correct data.
2159 */
2160 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2161
2162 static int
vdev_raidz_worst_error(raidz_map_t * rm)2163 vdev_raidz_worst_error(raidz_map_t *rm)
2164 {
2165 int error = 0;
2166
2167 for (int c = 0; c < rm->rm_cols; c++)
2168 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2169
2170 return (error);
2171 }
2172
2173 /*
2174 * Iterate over all combinations of bad data and attempt a reconstruction.
2175 * Note that the algorithm below is non-optimal because it doesn't take into
2176 * account how reconstruction is actually performed. For example, with
2177 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2178 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2179 * cases we'd only use parity information in column 0.
2180 */
2181 static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)2182 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2183 {
2184 raidz_map_t *rm = zio->io_vsd;
2185 raidz_col_t *rc;
2186 void *orig[VDEV_RAIDZ_MAXPARITY];
2187 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2188 int *tgts = &tstore[1];
2189 int current, next, i, c, n;
2190 int code, ret = 0;
2191
2192 ASSERT(total_errors < rm->rm_firstdatacol);
2193
2194 /*
2195 * This simplifies one edge condition.
2196 */
2197 tgts[-1] = -1;
2198
2199 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2200 /*
2201 * Initialize the targets array by finding the first n columns
2202 * that contain no error.
2203 *
2204 * If there were no data errors, we need to ensure that we're
2205 * always explicitly attempting to reconstruct at least one
2206 * data column. To do this, we simply push the highest target
2207 * up into the data columns.
2208 */
2209 for (c = 0, i = 0; i < n; i++) {
2210 if (i == n - 1 && data_errors == 0 &&
2211 c < rm->rm_firstdatacol) {
2212 c = rm->rm_firstdatacol;
2213 }
2214
2215 while (rm->rm_col[c].rc_error != 0) {
2216 c++;
2217 ASSERT3S(c, <, rm->rm_cols);
2218 }
2219
2220 tgts[i] = c++;
2221 }
2222
2223 /*
2224 * Setting tgts[n] simplifies the other edge condition.
2225 */
2226 tgts[n] = rm->rm_cols;
2227
2228 /*
2229 * These buffers were allocated in previous iterations.
2230 */
2231 for (i = 0; i < n - 1; i++) {
2232 ASSERT(orig[i] != NULL);
2233 }
2234
2235 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2236
2237 current = 0;
2238 next = tgts[current];
2239
2240 while (current != n) {
2241 tgts[current] = next;
2242 current = 0;
2243
2244 /*
2245 * Save off the original data that we're going to
2246 * attempt to reconstruct.
2247 */
2248 for (i = 0; i < n; i++) {
2249 ASSERT(orig[i] != NULL);
2250 c = tgts[i];
2251 ASSERT3S(c, >=, 0);
2252 ASSERT3S(c, <, rm->rm_cols);
2253 rc = &rm->rm_col[c];
2254 abd_copy_to_buf(orig[i], rc->rc_abd,
2255 rc->rc_size);
2256 }
2257
2258 /*
2259 * Attempt a reconstruction and exit the outer loop on
2260 * success.
2261 */
2262 code = vdev_raidz_reconstruct(rm, tgts, n);
2263 if (raidz_checksum_verify(zio) == 0) {
2264 atomic_inc_64(&raidz_corrected[code]);
2265
2266 for (i = 0; i < n; i++) {
2267 c = tgts[i];
2268 rc = &rm->rm_col[c];
2269 ASSERT(rc->rc_error == 0);
2270 if (rc->rc_tried)
2271 raidz_checksum_error(zio, rc,
2272 orig[i]);
2273 rc->rc_error = SET_ERROR(ECKSUM);
2274 }
2275
2276 ret = code;
2277 goto done;
2278 }
2279
2280 /*
2281 * Restore the original data.
2282 */
2283 for (i = 0; i < n; i++) {
2284 c = tgts[i];
2285 rc = &rm->rm_col[c];
2286 abd_copy_from_buf(rc->rc_abd, orig[i],
2287 rc->rc_size);
2288 }
2289
2290 do {
2291 /*
2292 * Find the next valid column after the current
2293 * position..
2294 */
2295 for (next = tgts[current] + 1;
2296 next < rm->rm_cols &&
2297 rm->rm_col[next].rc_error != 0; next++)
2298 continue;
2299
2300 ASSERT(next <= tgts[current + 1]);
2301
2302 /*
2303 * If that spot is available, we're done here.
2304 */
2305 if (next != tgts[current + 1])
2306 break;
2307
2308 /*
2309 * Otherwise, find the next valid column after
2310 * the previous position.
2311 */
2312 for (c = tgts[current - 1] + 1;
2313 rm->rm_col[c].rc_error != 0; c++)
2314 continue;
2315
2316 tgts[current] = c;
2317 current++;
2318
2319 } while (current != n);
2320 }
2321 }
2322 n--;
2323 done:
2324 for (i = 0; i < n; i++) {
2325 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2326 }
2327
2328 return (ret);
2329 }
2330
2331 /*
2332 * Complete an IO operation on a RAIDZ VDev
2333 *
2334 * Outline:
2335 * - For write operations:
2336 * 1. Check for errors on the child IOs.
2337 * 2. Return, setting an error code if too few child VDevs were written
2338 * to reconstruct the data later. Note that partial writes are
2339 * considered successful if they can be reconstructed at all.
2340 * - For read operations:
2341 * 1. Check for errors on the child IOs.
2342 * 2. If data errors occurred:
2343 * a. Try to reassemble the data from the parity available.
2344 * b. If we haven't yet read the parity drives, read them now.
2345 * c. If all parity drives have been read but the data still doesn't
2346 * reassemble with a correct checksum, then try combinatorial
2347 * reconstruction.
2348 * d. If that doesn't work, return an error.
2349 * 3. If there were unexpected errors or this is a resilver operation,
2350 * rewrite the vdevs that had errors.
2351 */
2352 static void
vdev_raidz_io_done(zio_t * zio)2353 vdev_raidz_io_done(zio_t *zio)
2354 {
2355 vdev_t *vd = zio->io_vd;
2356 vdev_t *cvd;
2357 raidz_map_t *rm = zio->io_vsd;
2358 raidz_col_t *rc;
2359 int unexpected_errors = 0;
2360 int parity_errors = 0;
2361 int parity_untried = 0;
2362 int data_errors = 0;
2363 int total_errors = 0;
2364 int n, c;
2365 int tgts[VDEV_RAIDZ_MAXPARITY];
2366 int code;
2367
2368 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2369
2370 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2371 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2372
2373 for (c = 0; c < rm->rm_cols; c++) {
2374 rc = &rm->rm_col[c];
2375
2376 if (rc->rc_error) {
2377 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2378
2379 if (c < rm->rm_firstdatacol)
2380 parity_errors++;
2381 else
2382 data_errors++;
2383
2384 if (!rc->rc_skipped)
2385 unexpected_errors++;
2386
2387 total_errors++;
2388 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2389 parity_untried++;
2390 }
2391 }
2392
2393 if (zio->io_type == ZIO_TYPE_WRITE) {
2394 /*
2395 * XXX -- for now, treat partial writes as a success.
2396 * (If we couldn't write enough columns to reconstruct
2397 * the data, the I/O failed. Otherwise, good enough.)
2398 *
2399 * Now that we support write reallocation, it would be better
2400 * to treat partial failure as real failure unless there are
2401 * no non-degraded top-level vdevs left, and not update DTLs
2402 * if we intend to reallocate.
2403 */
2404 /* XXPOLICY */
2405 if (total_errors > rm->rm_firstdatacol)
2406 zio->io_error = vdev_raidz_worst_error(rm);
2407
2408 return;
2409 } else if (zio->io_type == ZIO_TYPE_FREE) {
2410 return;
2411 }
2412
2413 ASSERT(zio->io_type == ZIO_TYPE_READ);
2414 /*
2415 * There are three potential phases for a read:
2416 * 1. produce valid data from the columns read
2417 * 2. read all disks and try again
2418 * 3. perform combinatorial reconstruction
2419 *
2420 * Each phase is progressively both more expensive and less likely to
2421 * occur. If we encounter more errors than we can repair or all phases
2422 * fail, we have no choice but to return an error.
2423 */
2424
2425 /*
2426 * If the number of errors we saw was correctable -- less than or equal
2427 * to the number of parity disks read -- attempt to produce data that
2428 * has a valid checksum. Naturally, this case applies in the absence of
2429 * any errors.
2430 */
2431 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2432 if (data_errors == 0) {
2433 if (raidz_checksum_verify(zio) == 0) {
2434 /*
2435 * If we read parity information (unnecessarily
2436 * as it happens since no reconstruction was
2437 * needed) regenerate and verify the parity.
2438 * We also regenerate parity when resilvering
2439 * so we can write it out to the failed device
2440 * later.
2441 */
2442 if (parity_errors + parity_untried <
2443 rm->rm_firstdatacol ||
2444 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2445 n = raidz_parity_verify(zio, rm);
2446 unexpected_errors += n;
2447 ASSERT(parity_errors + n <=
2448 rm->rm_firstdatacol);
2449 }
2450 goto done;
2451 }
2452 } else {
2453 /*
2454 * We either attempt to read all the parity columns or
2455 * none of them. If we didn't try to read parity, we
2456 * wouldn't be here in the correctable case. There must
2457 * also have been fewer parity errors than parity
2458 * columns or, again, we wouldn't be in this code path.
2459 */
2460 ASSERT(parity_untried == 0);
2461 ASSERT(parity_errors < rm->rm_firstdatacol);
2462
2463 /*
2464 * Identify the data columns that reported an error.
2465 */
2466 n = 0;
2467 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2468 rc = &rm->rm_col[c];
2469 if (rc->rc_error != 0) {
2470 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2471 tgts[n++] = c;
2472 }
2473 }
2474
2475 ASSERT(rm->rm_firstdatacol >= n);
2476
2477 code = vdev_raidz_reconstruct(rm, tgts, n);
2478
2479 if (raidz_checksum_verify(zio) == 0) {
2480 atomic_inc_64(&raidz_corrected[code]);
2481
2482 /*
2483 * If we read more parity disks than were used
2484 * for reconstruction, confirm that the other
2485 * parity disks produced correct data. This
2486 * routine is suboptimal in that it regenerates
2487 * the parity that we already used in addition
2488 * to the parity that we're attempting to
2489 * verify, but this should be a relatively
2490 * uncommon case, and can be optimized if it
2491 * becomes a problem. Note that we regenerate
2492 * parity when resilvering so we can write it
2493 * out to failed devices later.
2494 */
2495 if (parity_errors < rm->rm_firstdatacol - n ||
2496 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2497 n = raidz_parity_verify(zio, rm);
2498 unexpected_errors += n;
2499 ASSERT(parity_errors + n <=
2500 rm->rm_firstdatacol);
2501 }
2502
2503 goto done;
2504 }
2505 }
2506 }
2507
2508 /*
2509 * This isn't a typical situation -- either we got a read error or
2510 * a child silently returned bad data. Read every block so we can
2511 * try again with as much data and parity as we can track down. If
2512 * we've already been through once before, all children will be marked
2513 * as tried so we'll proceed to combinatorial reconstruction.
2514 */
2515 unexpected_errors = 1;
2516 rm->rm_missingdata = 0;
2517 rm->rm_missingparity = 0;
2518
2519 for (c = 0; c < rm->rm_cols; c++) {
2520 if (rm->rm_col[c].rc_tried)
2521 continue;
2522
2523 zio_vdev_io_redone(zio);
2524 do {
2525 rc = &rm->rm_col[c];
2526 if (rc->rc_tried)
2527 continue;
2528 zio_nowait(zio_vdev_child_io(zio, NULL,
2529 vd->vdev_child[rc->rc_devidx],
2530 rc->rc_offset, rc->rc_abd, rc->rc_size,
2531 zio->io_type, zio->io_priority, 0,
2532 vdev_raidz_child_done, rc));
2533 } while (++c < rm->rm_cols);
2534
2535 return;
2536 }
2537
2538 /*
2539 * At this point we've attempted to reconstruct the data given the
2540 * errors we detected, and we've attempted to read all columns. There
2541 * must, therefore, be one or more additional problems -- silent errors
2542 * resulting in invalid data rather than explicit I/O errors resulting
2543 * in absent data. We check if there is enough additional data to
2544 * possibly reconstruct the data and then perform combinatorial
2545 * reconstruction over all possible combinations. If that fails,
2546 * we're cooked.
2547 */
2548 if (total_errors > rm->rm_firstdatacol) {
2549 zio->io_error = vdev_raidz_worst_error(rm);
2550
2551 } else if (total_errors < rm->rm_firstdatacol &&
2552 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2553 /*
2554 * If we didn't use all the available parity for the
2555 * combinatorial reconstruction, verify that the remaining
2556 * parity is correct.
2557 */
2558 if (code != (1 << rm->rm_firstdatacol) - 1)
2559 (void) raidz_parity_verify(zio, rm);
2560 } else {
2561 /*
2562 * We're here because either:
2563 *
2564 * total_errors == rm_first_datacol, or
2565 * vdev_raidz_combrec() failed
2566 *
2567 * In either case, there is enough bad data to prevent
2568 * reconstruction.
2569 *
2570 * Start checksum ereports for all children which haven't
2571 * failed, and the IO wasn't speculative.
2572 */
2573 zio->io_error = SET_ERROR(ECKSUM);
2574
2575 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2576 for (c = 0; c < rm->rm_cols; c++) {
2577 rc = &rm->rm_col[c];
2578 if (rc->rc_error == 0) {
2579 zio_bad_cksum_t zbc;
2580 zbc.zbc_has_cksum = 0;
2581 zbc.zbc_injected =
2582 rm->rm_ecksuminjected;
2583
2584 zfs_ereport_start_checksum(
2585 zio->io_spa,
2586 vd->vdev_child[rc->rc_devidx],
2587 zio, rc->rc_offset, rc->rc_size,
2588 (void *)(uintptr_t)c, &zbc);
2589 }
2590 }
2591 }
2592 }
2593
2594 done:
2595 zio_checksum_verified(zio);
2596
2597 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2598 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2599 /*
2600 * Use the good data we have in hand to repair damaged children.
2601 */
2602 for (c = 0; c < rm->rm_cols; c++) {
2603 rc = &rm->rm_col[c];
2604 cvd = vd->vdev_child[rc->rc_devidx];
2605
2606 if (rc->rc_error == 0)
2607 continue;
2608
2609 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2610 rc->rc_offset, rc->rc_abd, rc->rc_size,
2611 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2612 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2613 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2614 }
2615 }
2616 }
2617
2618 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2619 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2620 {
2621 if (faulted > vd->vdev_nparity)
2622 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2623 VDEV_AUX_NO_REPLICAS);
2624 else if (degraded + faulted != 0)
2625 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2626 else
2627 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2628 }
2629
2630 /*
2631 * Determine if any portion of the provided block resides on a child vdev
2632 * with a dirty DTL and therefore needs to be resilvered. The function
2633 * assumes that at least one DTL is dirty which imples that full stripe
2634 * width blocks must be resilvered.
2635 */
2636 static boolean_t
vdev_raidz_need_resilver(vdev_t * vd,uint64_t offset,size_t psize)2637 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2638 {
2639 uint64_t dcols = vd->vdev_children;
2640 uint64_t nparity = vd->vdev_nparity;
2641 uint64_t ashift = vd->vdev_top->vdev_ashift;
2642 /* The starting RAIDZ (parent) vdev sector of the block. */
2643 uint64_t b = offset >> ashift;
2644 /* The zio's size in units of the vdev's minimum sector size. */
2645 uint64_t s = ((psize - 1) >> ashift) + 1;
2646 /* The first column for this stripe. */
2647 uint64_t f = b % dcols;
2648
2649 if (s + nparity >= dcols)
2650 return (B_TRUE);
2651
2652 for (uint64_t c = 0; c < s + nparity; c++) {
2653 uint64_t devidx = (f + c) % dcols;
2654 vdev_t *cvd = vd->vdev_child[devidx];
2655
2656 /*
2657 * dsl_scan_need_resilver() already checked vd with
2658 * vdev_dtl_contains(). So here just check cvd with
2659 * vdev_dtl_empty(), cheaper and a good approximation.
2660 */
2661 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2662 return (B_TRUE);
2663 }
2664
2665 return (B_FALSE);
2666 }
2667
2668 static void
vdev_raidz_xlate(vdev_t * cvd,const range_seg_t * in,range_seg_t * res)2669 vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
2670 {
2671 vdev_t *raidvd = cvd->vdev_parent;
2672 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2673
2674 uint64_t width = raidvd->vdev_children;
2675 uint64_t tgt_col = cvd->vdev_id;
2676 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2677
2678 /* make sure the offsets are block-aligned */
2679 ASSERT0(in->rs_start % (1 << ashift));
2680 ASSERT0(in->rs_end % (1 << ashift));
2681 uint64_t b_start = in->rs_start >> ashift;
2682 uint64_t b_end = in->rs_end >> ashift;
2683
2684 uint64_t start_row = 0;
2685 if (b_start > tgt_col) /* avoid underflow */
2686 start_row = ((b_start - tgt_col - 1) / width) + 1;
2687
2688 uint64_t end_row = 0;
2689 if (b_end > tgt_col)
2690 end_row = ((b_end - tgt_col - 1) / width) + 1;
2691
2692 res->rs_start = start_row << ashift;
2693 res->rs_end = end_row << ashift;
2694
2695 ASSERT3U(res->rs_start, <=, in->rs_start);
2696 ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2697 }
2698
2699 vdev_ops_t vdev_raidz_ops = {
2700 vdev_raidz_open,
2701 vdev_raidz_close,
2702 vdev_raidz_asize,
2703 vdev_raidz_io_start,
2704 vdev_raidz_io_done,
2705 vdev_raidz_state_change,
2706 vdev_raidz_need_resilver,
2707 NULL,
2708 NULL,
2709 NULL,
2710 vdev_raidz_xlate,
2711 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2712 B_FALSE /* not a leaf vdev */
2713 };
2714