1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #ifndef __FreeBSD__
33 #include <sys/vdev_disk.h>
34 #endif
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
37 #include <sys/zio.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
41 #ifdef __FreeBSD__
42 #include <sys/bio.h>
43 #endif
44 
45 /*
46  * Virtual device vector for RAID-Z.
47  *
48  * This vdev supports single, double, and triple parity. For single parity,
49  * we use a simple XOR of all the data columns. For double or triple parity,
50  * we use a special case of Reed-Solomon coding. This extends the
51  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
52  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
53  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
54  * former is also based. The latter is designed to provide higher performance
55  * for writes.
56  *
57  * Note that the Plank paper claimed to support arbitrary N+M, but was then
58  * amended six years later identifying a critical flaw that invalidates its
59  * claims. Nevertheless, the technique can be adapted to work for up to
60  * triple parity. For additional parity, the amendment "Note: Correction to
61  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
62  * is viable, but the additional complexity means that write performance will
63  * suffer.
64  *
65  * All of the methods above operate on a Galois field, defined over the
66  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
67  * can be expressed with a single byte. Briefly, the operations on the
68  * field are defined as follows:
69  *
70  *   o addition (+) is represented by a bitwise XOR
71  *   o subtraction (-) is therefore identical to addition: A + B = A - B
72  *   o multiplication of A by 2 is defined by the following bitwise expression:
73  *
74  *        (A * 2)_7 = A_6
75  *        (A * 2)_6 = A_5
76  *        (A * 2)_5 = A_4
77  *        (A * 2)_4 = A_3 + A_7
78  *        (A * 2)_3 = A_2 + A_7
79  *        (A * 2)_2 = A_1 + A_7
80  *        (A * 2)_1 = A_0
81  *        (A * 2)_0 = A_7
82  *
83  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
84  * As an aside, this multiplication is derived from the error correcting
85  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86  *
87  * Observe that any number in the field (except for 0) can be expressed as a
88  * power of 2 -- a generator for the field. We store a table of the powers of
89  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
90  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
91  * than field addition). The inverse of a field element A (A^-1) is therefore
92  * A ^ (255 - 1) = A^254.
93  *
94  * The up-to-three parity columns, P, Q, R over several data columns,
95  * D_0, ... D_n-1, can be expressed by field operations:
96  *
97  *        P = D_0 + D_1 + ... + D_n-2 + D_n-1
98  *        Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
99  *          = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
100  *        R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
101  *          = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102  *
103  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
104  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
105  * independent coefficients. (There are no additional coefficients that have
106  * this property which is why the uncorrected Plank method breaks down.)
107  *
108  * See the reconstruction code below for how P, Q and R can used individually
109  * or in concert to recover missing data columns.
110  */
111 
112 typedef struct raidz_col {
113           uint64_t rc_devidx;           /* child device index for I/O */
114           uint64_t rc_offset;           /* device offset */
115           uint64_t rc_size;             /* I/O size */
116           void *rc_data;                          /* I/O data */
117           void *rc_gdata;                         /* used to store the "good" version */
118           int rc_error;                           /* I/O error for this device */
119           uint8_t rc_tried;             /* Did we attempt this I/O column? */
120           uint8_t rc_skipped;           /* Did we skip this I/O column? */
121 } raidz_col_t;
122 
123 typedef struct raidz_map {
124           uint64_t rm_cols;             /* Regular column count */
125           uint64_t rm_scols;            /* Count including skipped columns */
126           uint64_t rm_bigcols;                    /* Number of oversized columns */
127           uint64_t rm_asize;            /* Actual total I/O size */
128           uint64_t rm_missingdata;      /* Count of missing data devices */
129           uint64_t rm_missingparity;    /* Count of missing parity devices */
130           uint64_t rm_firstdatacol;     /* First data column/parity count */
131           uint64_t rm_nskip;            /* Skipped sectors for padding */
132           uint64_t rm_skipstart;                  /* Column index of padding start */
133           void *rm_datacopy;            /* rm_asize-buffer of copied data */
134           uintptr_t rm_reports;                   /* # of referencing checksum reports */
135           uint8_t   rm_freed;           /* map no longer has referencing ZIO */
136           uint8_t   rm_ecksuminjected;  /* checksum error was injected */
137           raidz_col_t rm_col[1];                  /* Flexible array of I/O columns */
138 } raidz_map_t;
139 
140 #define   VDEV_RAIDZ_P                  0
141 #define   VDEV_RAIDZ_Q                  1
142 #define   VDEV_RAIDZ_R                  2
143 
144 #define   VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
145 #define   VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
146 
147 /*
148  * We provide a mechanism to perform the field multiplication operation on a
149  * 64-bit value all at once rather than a byte at a time. This works by
150  * creating a mask from the top bit in each byte and using that to
151  * conditionally apply the XOR of 0x1d.
152  */
153 #define   VDEV_RAIDZ_64MUL_2(x, mask) \
154 { \
155           (mask) = (x) & 0x8080808080808080ULL; \
156           (mask) = ((mask) << 1) - ((mask) >> 7); \
157           (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
158               ((mask) & 0x1d1d1d1d1d1d1d1d); \
159 }
160 
161 #define   VDEV_RAIDZ_64MUL_4(x, mask) \
162 { \
163           VDEV_RAIDZ_64MUL_2((x), mask); \
164           VDEV_RAIDZ_64MUL_2((x), mask); \
165 }
166 
167 #define   VDEV_LABEL_OFFSET(x)          (x + VDEV_LABEL_START_SIZE)
168 
169 /*
170  * Force reconstruction to use the general purpose method.
171  */
172 int vdev_raidz_default_to_general;
173 
174 /* Powers of 2 in the Galois field defined above. */
175 static const uint8_t vdev_raidz_pow2[256] = {
176           0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
177           0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
178           0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
179           0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
180           0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
181           0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
182           0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
183           0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
184           0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
185           0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
186           0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
187           0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
188           0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
189           0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
190           0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
191           0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
192           0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
193           0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
194           0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
195           0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
196           0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
197           0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
198           0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
199           0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
200           0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
201           0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
202           0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
203           0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
204           0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
205           0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
206           0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
207           0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
208 };
209 /* Logs of 2 in the Galois field defined above. */
210 static const uint8_t vdev_raidz_log2[256] = {
211           0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
212           0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
213           0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
214           0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
215           0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
216           0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
217           0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
218           0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
219           0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
220           0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
221           0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
222           0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
223           0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
224           0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
225           0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
226           0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
227           0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
228           0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
229           0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
230           0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
231           0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
232           0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
233           0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
234           0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
235           0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
236           0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
237           0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
238           0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
239           0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
240           0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
241           0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
242           0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
243 };
244 
245 static void vdev_raidz_generate_parity(raidz_map_t *rm);
246 
247 /*
248  * Multiply a given number by 2 raised to the given power.
249  */
250 static uint8_t
vdev_raidz_exp2(uint_t a,int exp)251 vdev_raidz_exp2(uint_t a, int exp)
252 {
253           if (a == 0)
254                     return (0);
255 
256           ASSERT(exp >= 0);
257           ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258 
259           exp += vdev_raidz_log2[a];
260           if (exp > 255)
261                     exp -= 255;
262 
263           return (vdev_raidz_pow2[exp]);
264 }
265 
266 static void
vdev_raidz_map_free(raidz_map_t * rm)267 vdev_raidz_map_free(raidz_map_t *rm)
268 {
269           int c;
270           size_t size;
271 
272           for (c = 0; c < rm->rm_firstdatacol; c++) {
273                     if (rm->rm_col[c].rc_data != NULL)
274                               zio_buf_free(rm->rm_col[c].rc_data,
275                                   rm->rm_col[c].rc_size);
276 
277                     if (rm->rm_col[c].rc_gdata != NULL)
278                               zio_buf_free(rm->rm_col[c].rc_gdata,
279                                   rm->rm_col[c].rc_size);
280           }
281 
282           size = 0;
283           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
284                     size += rm->rm_col[c].rc_size;
285 
286           if (rm->rm_datacopy != NULL)
287                     zio_buf_free(rm->rm_datacopy, size);
288 
289           kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
290 }
291 
292 static void
vdev_raidz_map_free_vsd(zio_t * zio)293 vdev_raidz_map_free_vsd(zio_t *zio)
294 {
295           raidz_map_t *rm = zio->io_vsd;
296 
297           ASSERT0(rm->rm_freed);
298           rm->rm_freed = 1;
299 
300           if (rm->rm_reports == 0)
301                     vdev_raidz_map_free(rm);
302 }
303 
304 /*ARGSUSED*/
305 static void
vdev_raidz_cksum_free(void * arg,size_t ignored)306 vdev_raidz_cksum_free(void *arg, size_t ignored)
307 {
308           raidz_map_t *rm = arg;
309 
310           ASSERT3U(rm->rm_reports, >, 0);
311 
312           if (--rm->rm_reports == 0 && rm->rm_freed != 0)
313                     vdev_raidz_map_free(rm);
314 }
315 
316 static void
vdev_raidz_cksum_finish(zio_cksum_report_t * zcr,const void * good_data)317 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
318 {
319           raidz_map_t *rm = zcr->zcr_cbdata;
320           size_t c = zcr->zcr_cbinfo;
321           size_t x;
322 
323           const char *good = NULL;
324           const char *bad = rm->rm_col[c].rc_data;
325 
326           if (good_data == NULL) {
327                     zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
328                     return;
329           }
330 
331           if (c < rm->rm_firstdatacol) {
332                     /*
333                      * The first time through, calculate the parity blocks for
334                      * the good data (this relies on the fact that the good
335                      * data never changes for a given logical ZIO)
336                      */
337                     if (rm->rm_col[0].rc_gdata == NULL) {
338                               char *bad_parity[VDEV_RAIDZ_MAXPARITY];
339                               char *buf;
340 
341                               /*
342                                * Set up the rm_col[]s to generate the parity for
343                                * good_data, first saving the parity bufs and
344                                * replacing them with buffers to hold the result.
345                                */
346                               for (x = 0; x < rm->rm_firstdatacol; x++) {
347                                         bad_parity[x] = rm->rm_col[x].rc_data;
348                                         rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
349                                             zio_buf_alloc(rm->rm_col[x].rc_size);
350                               }
351 
352                               /* fill in the data columns from good_data */
353                               buf = (char *)good_data;
354                               for (; x < rm->rm_cols; x++) {
355                                         rm->rm_col[x].rc_data = buf;
356                                         buf += rm->rm_col[x].rc_size;
357                               }
358 
359                               /*
360                                * Construct the parity from the good data.
361                                */
362                               vdev_raidz_generate_parity(rm);
363 
364                               /* restore everything back to its original state */
365                               for (x = 0; x < rm->rm_firstdatacol; x++)
366                                         rm->rm_col[x].rc_data = bad_parity[x];
367 
368                               buf = rm->rm_datacopy;
369                               for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
370                                         rm->rm_col[x].rc_data = buf;
371                                         buf += rm->rm_col[x].rc_size;
372                               }
373                     }
374 
375                     ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
376                     good = rm->rm_col[c].rc_gdata;
377           } else {
378                     /* adjust good_data to point at the start of our column */
379                     good = good_data;
380 
381                     for (x = rm->rm_firstdatacol; x < c; x++)
382                               good += rm->rm_col[x].rc_size;
383           }
384 
385           /* we drop the ereport if it ends up that the data was good */
386           zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
387 }
388 
389 /*
390  * Invoked indirectly by zfs_ereport_start_checksum(), called
391  * below when our read operation fails completely.  The main point
392  * is to keep a copy of everything we read from disk, so that at
393  * vdev_raidz_cksum_finish() time we can compare it with the good data.
394  */
395 static void
vdev_raidz_cksum_report(zio_t * zio,zio_cksum_report_t * zcr,void * arg)396 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
397 {
398           size_t c = (size_t)(uintptr_t)arg;
399           caddr_t buf;
400 
401           raidz_map_t *rm = zio->io_vsd;
402           size_t size;
403 
404           /* set up the report and bump the refcount  */
405           zcr->zcr_cbdata = rm;
406           zcr->zcr_cbinfo = c;
407           zcr->zcr_finish = vdev_raidz_cksum_finish;
408           zcr->zcr_free = vdev_raidz_cksum_free;
409 
410           rm->rm_reports++;
411           ASSERT3U(rm->rm_reports, >, 0);
412 
413           if (rm->rm_datacopy != NULL)
414                     return;
415 
416           /*
417            * It's the first time we're called for this raidz_map_t, so we need
418            * to copy the data aside; there's no guarantee that our zio's buffer
419            * won't be re-used for something else.
420            *
421            * Our parity data is already in separate buffers, so there's no need
422            * to copy them.
423            */
424 
425           size = 0;
426           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
427                     size += rm->rm_col[c].rc_size;
428 
429           buf = rm->rm_datacopy = zio_buf_alloc(size);
430 
431           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
432                     raidz_col_t *col = &rm->rm_col[c];
433 
434                     bcopy(col->rc_data, buf, col->rc_size);
435                     col->rc_data = buf;
436 
437                     buf += col->rc_size;
438           }
439           ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
440 }
441 
442 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
443           vdev_raidz_map_free_vsd,
444           vdev_raidz_cksum_report
445 };
446 
447 /*
448  * Divides the IO evenly across all child vdevs; usually, dcols is
449  * the number of children in the target vdev.
450  */
451 static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data,uint64_t size,uint64_t offset,boolean_t dofree,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)452 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
453     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
454 {
455           raidz_map_t *rm;
456           /* The starting RAIDZ (parent) vdev sector of the block. */
457           uint64_t b = offset >> unit_shift;
458           /* The zio's size in units of the vdev's minimum sector size. */
459           uint64_t s = size >> unit_shift;
460           /* The first column for this stripe. */
461           uint64_t f = b % dcols;
462           /* The starting byte offset on each child vdev. */
463           uint64_t o = (b / dcols) << unit_shift;
464           uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
465 
466           /*
467            * "Quotient": The number of data sectors for this stripe on all but
468            * the "big column" child vdevs that also contain "remainder" data.
469            */
470           q = s / (dcols - nparity);
471 
472           /*
473            * "Remainder": The number of partial stripe data sectors in this I/O.
474            * This will add a sector to some, but not all, child vdevs.
475            */
476           r = s - q * (dcols - nparity);
477 
478           /* The number of "big columns" - those which contain remainder data. */
479           bc = (r == 0 ? 0 : r + nparity);
480 
481           /*
482            * The total number of data and parity sectors associated with
483            * this I/O.
484            */
485           tot = s + nparity * (q + (r == 0 ? 0 : 1));
486 
487           /* acols: The columns that will be accessed. */
488           /* scols: The columns that will be accessed or skipped. */
489           if (q == 0) {
490                     /* Our I/O request doesn't span all child vdevs. */
491                     acols = bc;
492                     scols = MIN(dcols, roundup(bc, nparity + 1));
493           } else {
494                     acols = dcols;
495                     scols = dcols;
496           }
497 
498           ASSERT3U(acols, <=, scols);
499 
500           rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
501 
502           rm->rm_cols = acols;
503           rm->rm_scols = scols;
504           rm->rm_bigcols = bc;
505           rm->rm_skipstart = bc;
506           rm->rm_missingdata = 0;
507           rm->rm_missingparity = 0;
508           rm->rm_firstdatacol = nparity;
509           rm->rm_datacopy = NULL;
510           rm->rm_reports = 0;
511           rm->rm_freed = 0;
512           rm->rm_ecksuminjected = 0;
513 
514           asize = 0;
515 
516           for (c = 0; c < scols; c++) {
517                     col = f + c;
518                     coff = o;
519                     if (col >= dcols) {
520                               col -= dcols;
521                               coff += 1ULL << unit_shift;
522                     }
523                     rm->rm_col[c].rc_devidx = col;
524                     rm->rm_col[c].rc_offset = coff;
525                     rm->rm_col[c].rc_data = NULL;
526                     rm->rm_col[c].rc_gdata = NULL;
527                     rm->rm_col[c].rc_error = 0;
528                     rm->rm_col[c].rc_tried = 0;
529                     rm->rm_col[c].rc_skipped = 0;
530 
531                     if (c >= acols)
532                               rm->rm_col[c].rc_size = 0;
533                     else if (c < bc)
534                               rm->rm_col[c].rc_size = (q + 1) << unit_shift;
535                     else
536                               rm->rm_col[c].rc_size = q << unit_shift;
537 
538                     asize += rm->rm_col[c].rc_size;
539           }
540 
541           ASSERT3U(asize, ==, tot << unit_shift);
542           rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
543           rm->rm_nskip = roundup(tot, nparity + 1) - tot;
544           ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
545           ASSERT3U(rm->rm_nskip, <=, nparity);
546 
547           if (!dofree) {
548                     for (c = 0; c < rm->rm_firstdatacol; c++) {
549                               rm->rm_col[c].rc_data =
550                                   zio_buf_alloc(rm->rm_col[c].rc_size);
551                     }
552 
553                     rm->rm_col[c].rc_data = data;
554 
555                     for (c = c + 1; c < acols; c++) {
556                               rm->rm_col[c].rc_data =
557                                   (char *)rm->rm_col[c - 1].rc_data +
558                                   rm->rm_col[c - 1].rc_size;
559                     }
560           }
561 
562           /*
563            * If all data stored spans all columns, there's a danger that parity
564            * will always be on the same device and, since parity isn't read
565            * during normal operation, that that device's I/O bandwidth won't be
566            * used effectively. We therefore switch the parity every 1MB.
567            *
568            * ... at least that was, ostensibly, the theory. As a practical
569            * matter unless we juggle the parity between all devices evenly, we
570            * won't see any benefit. Further, occasional writes that aren't a
571            * multiple of the LCM of the number of children and the minimum
572            * stripe width are sufficient to avoid pessimal behavior.
573            * Unfortunately, this decision created an implicit on-disk format
574            * requirement that we need to support for all eternity, but only
575            * for single-parity RAID-Z.
576            *
577            * If we intend to skip a sector in the zeroth column for padding
578            * we must make sure to note this swap. We will never intend to
579            * skip the first column since at least one data and one parity
580            * column must appear in each row.
581            */
582           ASSERT(rm->rm_cols >= 2);
583           ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
584 
585           if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
586                     devidx = rm->rm_col[0].rc_devidx;
587                     o = rm->rm_col[0].rc_offset;
588                     rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
589                     rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
590                     rm->rm_col[1].rc_devidx = devidx;
591                     rm->rm_col[1].rc_offset = o;
592 
593                     if (rm->rm_skipstart == 0)
594                               rm->rm_skipstart = 1;
595           }
596 
597           return (rm);
598 }
599 
600 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)601 vdev_raidz_generate_parity_p(raidz_map_t *rm)
602 {
603           uint64_t *p, *src, pcount, ccount, i;
604           int c;
605 
606           pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
607 
608           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
609                     src = rm->rm_col[c].rc_data;
610                     p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
611                     ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
612 
613                     if (c == rm->rm_firstdatacol) {
614                               ASSERT(ccount == pcount);
615                               for (i = 0; i < ccount; i++, src++, p++) {
616                                         *p = *src;
617                               }
618                     } else {
619                               ASSERT(ccount <= pcount);
620                               for (i = 0; i < ccount; i++, src++, p++) {
621                                         *p ^= *src;
622                               }
623                     }
624           }
625 }
626 
627 static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)628 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
629 {
630           uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
631           int c;
632 
633           pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
634           ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
635               rm->rm_col[VDEV_RAIDZ_Q].rc_size);
636 
637           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
638                     src = rm->rm_col[c].rc_data;
639                     p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
640                     q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
641 
642                     ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
643 
644                     if (c == rm->rm_firstdatacol) {
645                               ASSERT(ccnt == pcnt || ccnt == 0);
646                               for (i = 0; i < ccnt; i++, src++, p++, q++) {
647                                         *p = *src;
648                                         *q = *src;
649                               }
650                               for (; i < pcnt; i++, src++, p++, q++) {
651                                         *p = 0;
652                                         *q = 0;
653                               }
654                     } else {
655                               ASSERT(ccnt <= pcnt);
656 
657                               /*
658                                * Apply the algorithm described above by multiplying
659                                * the previous result and adding in the new value.
660                                */
661                               for (i = 0; i < ccnt; i++, src++, p++, q++) {
662                                         *p ^= *src;
663 
664                                         VDEV_RAIDZ_64MUL_2(*q, mask);
665                                         *q ^= *src;
666                               }
667 
668                               /*
669                                * Treat short columns as though they are full of 0s.
670                                * Note that there's therefore nothing needed for P.
671                                */
672                               for (; i < pcnt; i++, q++) {
673                                         VDEV_RAIDZ_64MUL_2(*q, mask);
674                               }
675                     }
676           }
677 }
678 
679 static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)680 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
681 {
682           uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
683           int c;
684 
685           pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
686           ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
687               rm->rm_col[VDEV_RAIDZ_Q].rc_size);
688           ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
689               rm->rm_col[VDEV_RAIDZ_R].rc_size);
690 
691           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
692                     src = rm->rm_col[c].rc_data;
693                     p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
694                     q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
695                     r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
696 
697                     ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
698 
699                     if (c == rm->rm_firstdatacol) {
700                               ASSERT(ccnt == pcnt || ccnt == 0);
701                               for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
702                                         *p = *src;
703                                         *q = *src;
704                                         *r = *src;
705                               }
706                               for (; i < pcnt; i++, src++, p++, q++, r++) {
707                                         *p = 0;
708                                         *q = 0;
709                                         *r = 0;
710                               }
711                     } else {
712                               ASSERT(ccnt <= pcnt);
713 
714                               /*
715                                * Apply the algorithm described above by multiplying
716                                * the previous result and adding in the new value.
717                                */
718                               for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
719                                         *p ^= *src;
720 
721                                         VDEV_RAIDZ_64MUL_2(*q, mask);
722                                         *q ^= *src;
723 
724                                         VDEV_RAIDZ_64MUL_4(*r, mask);
725                                         *r ^= *src;
726                               }
727 
728                               /*
729                                * Treat short columns as though they are full of 0s.
730                                * Note that there's therefore nothing needed for P.
731                                */
732                               for (; i < pcnt; i++, q++, r++) {
733                                         VDEV_RAIDZ_64MUL_2(*q, mask);
734                                         VDEV_RAIDZ_64MUL_4(*r, mask);
735                               }
736                     }
737           }
738 }
739 
740 /*
741  * Generate RAID parity in the first virtual columns according to the number of
742  * parity columns available.
743  */
744 static void
vdev_raidz_generate_parity(raidz_map_t * rm)745 vdev_raidz_generate_parity(raidz_map_t *rm)
746 {
747           switch (rm->rm_firstdatacol) {
748           case 1:
749                     vdev_raidz_generate_parity_p(rm);
750                     break;
751           case 2:
752                     vdev_raidz_generate_parity_pq(rm);
753                     break;
754           case 3:
755                     vdev_raidz_generate_parity_pqr(rm);
756                     break;
757           default:
758                     cmn_err(CE_PANIC, "invalid RAID-Z configuration");
759           }
760 }
761 
762 static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)763 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
764 {
765           uint64_t *dst, *src, xcount, ccount, count, i;
766           int x = tgts[0];
767           int c;
768 
769           ASSERT(ntgts == 1);
770           ASSERT(x >= rm->rm_firstdatacol);
771           ASSERT(x < rm->rm_cols);
772 
773           xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
774           ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
775           ASSERT(xcount > 0);
776 
777           src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
778           dst = rm->rm_col[x].rc_data;
779           for (i = 0; i < xcount; i++, dst++, src++) {
780                     *dst = *src;
781           }
782 
783           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
784                     src = rm->rm_col[c].rc_data;
785                     dst = rm->rm_col[x].rc_data;
786 
787                     if (c == x)
788                               continue;
789 
790                     ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
791                     count = MIN(ccount, xcount);
792 
793                     for (i = 0; i < count; i++, dst++, src++) {
794                               *dst ^= *src;
795                     }
796           }
797 
798           return (1 << VDEV_RAIDZ_P);
799 }
800 
801 static int
vdev_raidz_reconstruct_q(raidz_map_t * rm,int * tgts,int ntgts)802 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
803 {
804           uint64_t *dst, *src, xcount, ccount, count, mask, i;
805           uint8_t *b;
806           int x = tgts[0];
807           int c, j, exp;
808 
809           ASSERT(ntgts == 1);
810 
811           xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
812           ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
813 
814           for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
815                     src = rm->rm_col[c].rc_data;
816                     dst = rm->rm_col[x].rc_data;
817 
818                     if (c == x)
819                               ccount = 0;
820                     else
821                               ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
822 
823                     count = MIN(ccount, xcount);
824 
825                     if (c == rm->rm_firstdatacol) {
826                               for (i = 0; i < count; i++, dst++, src++) {
827                                         *dst = *src;
828                               }
829                               for (; i < xcount; i++, dst++) {
830                                         *dst = 0;
831                               }
832 
833                     } else {
834                               for (i = 0; i < count; i++, dst++, src++) {
835                                         VDEV_RAIDZ_64MUL_2(*dst, mask);
836                                         *dst ^= *src;
837                               }
838 
839                               for (; i < xcount; i++, dst++) {
840                                         VDEV_RAIDZ_64MUL_2(*dst, mask);
841                               }
842                     }
843           }
844 
845           src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
846           dst = rm->rm_col[x].rc_data;
847           exp = 255 - (rm->rm_cols - 1 - x);
848 
849           for (i = 0; i < xcount; i++, dst++, src++) {
850                     *dst ^= *src;
851                     for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
852                               *b = vdev_raidz_exp2(*b, exp);
853                     }
854           }
855 
856           return (1 << VDEV_RAIDZ_Q);
857 }
858 
859 static int
vdev_raidz_reconstruct_pq(raidz_map_t * rm,int * tgts,int ntgts)860 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
861 {
862           uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
863           void *pdata, *qdata;
864           uint64_t xsize, ysize, i;
865           int x = tgts[0];
866           int y = tgts[1];
867 
868           ASSERT(ntgts == 2);
869           ASSERT(x < y);
870           ASSERT(x >= rm->rm_firstdatacol);
871           ASSERT(y < rm->rm_cols);
872 
873           ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
874 
875           /*
876            * Move the parity data aside -- we're going to compute parity as
877            * though columns x and y were full of zeros -- Pxy and Qxy. We want to
878            * reuse the parity generation mechanism without trashing the actual
879            * parity so we make those columns appear to be full of zeros by
880            * setting their lengths to zero.
881            */
882           pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
883           qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
884           xsize = rm->rm_col[x].rc_size;
885           ysize = rm->rm_col[y].rc_size;
886 
887           rm->rm_col[VDEV_RAIDZ_P].rc_data =
888               zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
889           rm->rm_col[VDEV_RAIDZ_Q].rc_data =
890               zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
891           rm->rm_col[x].rc_size = 0;
892           rm->rm_col[y].rc_size = 0;
893 
894           vdev_raidz_generate_parity_pq(rm);
895 
896           rm->rm_col[x].rc_size = xsize;
897           rm->rm_col[y].rc_size = ysize;
898 
899           p = pdata;
900           q = qdata;
901           pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
902           qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
903           xd = rm->rm_col[x].rc_data;
904           yd = rm->rm_col[y].rc_data;
905 
906           /*
907            * We now have:
908            *        Pxy = P + D_x + D_y
909            *        Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
910            *
911            * We can then solve for D_x:
912            *        D_x = A * (P + Pxy) + B * (Q + Qxy)
913            * where
914            *        A = 2^(x - y) * (2^(x - y) + 1)^-1
915            *        B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
916            *
917            * With D_x in hand, we can easily solve for D_y:
918            *        D_y = P + Pxy + D_x
919            */
920 
921           a = vdev_raidz_pow2[255 + x - y];
922           b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
923           tmp = 255 - vdev_raidz_log2[a ^ 1];
924 
925           aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
926           bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
927 
928           for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
929                     *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
930                         vdev_raidz_exp2(*q ^ *qxy, bexp);
931 
932                     if (i < ysize)
933                               *yd = *p ^ *pxy ^ *xd;
934           }
935 
936           zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
937               rm->rm_col[VDEV_RAIDZ_P].rc_size);
938           zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
939               rm->rm_col[VDEV_RAIDZ_Q].rc_size);
940 
941           /*
942            * Restore the saved parity data.
943            */
944           rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
945           rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
946 
947           return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
948 }
949 
950 /* BEGIN CSTYLED */
951 /*
952  * In the general case of reconstruction, we must solve the system of linear
953  * equations defined by the coeffecients used to generate parity as well as
954  * the contents of the data and parity disks. This can be expressed with
955  * vectors for the original data (D) and the actual data (d) and parity (p)
956  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
957  *
958  *            __   __                     __     __
959  *            |     |         __     __   |  p_0  |
960  *            |  V  |         |  D_0  |   | p_m-1 |
961  *            |     |    x    |   :   | = |  d_0  |
962  *            |  I  |         | D_n-1 |   |   :   |
963  *            |     |         ~~     ~~   | d_n-1 |
964  *            ~~   ~~                     ~~     ~~
965  *
966  * I is simply a square identity matrix of size n, and V is a vandermonde
967  * matrix defined by the coeffecients we chose for the various parity columns
968  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
969  * computation as well as linear separability.
970  *
971  *      __               __               __     __
972  *      |   1   ..  1 1 1 |               |  p_0  |
973  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
974  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
975  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
976  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
977  *      |   :       : : : |   |   :   |   |  d_2  |
978  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
979  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
980  *      |   0   ..  0 0 1 |               | d_n-1 |
981  *      ~~               ~~               ~~     ~~
982  *
983  * Note that I, V, d, and p are known. To compute D, we must invert the
984  * matrix and use the known data and parity values to reconstruct the unknown
985  * data values. We begin by removing the rows in V|I and d|p that correspond
986  * to failed or missing columns; we then make V|I square (n x n) and d|p
987  * sized n by removing rows corresponding to unused parity from the bottom up
988  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
989  * using Gauss-Jordan elimination. In the example below we use m=3 parity
990  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
991  *           __                               __
992  *           |  1   1   1   1   1   1   1   1  |
993  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
994  *           |  19 205 116  29  64  16  4   1  |      / /
995  *           |  1   0   0   0   0   0   0   0  |     / /
996  *           |  0   1   0   0   0   0   0   0  | <--' /
997  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
998  *           |  0   0   0   1   0   0   0   0  |
999  *           |  0   0   0   0   1   0   0   0  |
1000  *           |  0   0   0   0   0   1   0   0  |
1001  *           |  0   0   0   0   0   0   1   0  |
1002  *           |  0   0   0   0   0   0   0   1  |
1003  *           ~~                               ~~
1004  *           __                               __
1005  *           |  1   1   1   1   1   1   1   1  |
1006  *           |  19 205 116  29  64  16  4   1  |
1007  *           |  1   0   0   0   0   0   0   0  |
1008  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1009  *           |  0   0   0   0   1   0   0   0  |
1010  *           |  0   0   0   0   0   1   0   0  |
1011  *           |  0   0   0   0   0   0   1   0  |
1012  *           |  0   0   0   0   0   0   0   1  |
1013  *           ~~                               ~~
1014  *
1015  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1016  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1017  * matrix is not singular.
1018  * __                                                                 __
1019  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1020  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1021  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1022  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1023  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1024  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1025  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1026  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1027  * ~~                                                                 ~~
1028  * __                                                                 __
1029  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1030  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1031  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1032  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1033  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1034  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1035  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1036  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1037  * ~~                                                                 ~~
1038  * __                                                                 __
1039  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1040  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1041  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1042  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1043  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1044  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1045  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1046  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1047  * ~~                                                                 ~~
1048  * __                                                                 __
1049  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1050  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1051  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1052  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1053  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1054  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1055  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1056  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1057  * ~~                                                                 ~~
1058  * __                                                                 __
1059  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1060  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1061  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1062  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1063  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1064  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1065  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1066  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1067  * ~~                                                                 ~~
1068  * __                                                                 __
1069  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1070  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1071  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1072  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1073  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1074  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1075  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1076  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1077  * ~~                                                                 ~~
1078  *                   __                               __
1079  *                   |  0   0   1   0   0   0   0   0  |
1080  *                   | 167 100  5   41 159 169 217 208 |
1081  *                   | 166 100  4   40 158 168 216 209 |
1082  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1083  *                   |  0   0   0   0   1   0   0   0  |
1084  *                   |  0   0   0   0   0   1   0   0  |
1085  *                   |  0   0   0   0   0   0   1   0  |
1086  *                   |  0   0   0   0   0   0   0   1  |
1087  *                   ~~                               ~~
1088  *
1089  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1090  * of the missing data.
1091  *
1092  * As is apparent from the example above, the only non-trivial rows in the
1093  * inverse matrix correspond to the data disks that we're trying to
1094  * reconstruct. Indeed, those are the only rows we need as the others would
1095  * only be useful for reconstructing data known or assumed to be valid. For
1096  * that reason, we only build the coefficients in the rows that correspond to
1097  * targeted columns.
1098  */
1099 /* END CSTYLED */
1100 
1101 static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)1102 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1103     uint8_t **rows)
1104 {
1105           int i, j;
1106           int pow;
1107 
1108           ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1109 
1110           /*
1111            * Fill in the missing rows of interest.
1112            */
1113           for (i = 0; i < nmap; i++) {
1114                     ASSERT3S(0, <=, map[i]);
1115                     ASSERT3S(map[i], <=, 2);
1116 
1117                     pow = map[i] * n;
1118                     if (pow > 255)
1119                               pow -= 255;
1120                     ASSERT(pow <= 255);
1121 
1122                     for (j = 0; j < n; j++) {
1123                               pow -= map[i];
1124                               if (pow < 0)
1125                                         pow += 255;
1126                               rows[i][j] = vdev_raidz_pow2[pow];
1127                     }
1128           }
1129 }
1130 
1131 static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)1132 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1133     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1134 {
1135           int i, j, ii, jj;
1136           uint8_t log;
1137 
1138           /*
1139            * Assert that the first nmissing entries from the array of used
1140            * columns correspond to parity columns and that subsequent entries
1141            * correspond to data columns.
1142            */
1143           for (i = 0; i < nmissing; i++) {
1144                     ASSERT3S(used[i], <, rm->rm_firstdatacol);
1145           }
1146           for (; i < n; i++) {
1147                     ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1148           }
1149 
1150           /*
1151            * First initialize the storage where we'll compute the inverse rows.
1152            */
1153           for (i = 0; i < nmissing; i++) {
1154                     for (j = 0; j < n; j++) {
1155                               invrows[i][j] = (i == j) ? 1 : 0;
1156                     }
1157           }
1158 
1159           /*
1160            * Subtract all trivial rows from the rows of consequence.
1161            */
1162           for (i = 0; i < nmissing; i++) {
1163                     for (j = nmissing; j < n; j++) {
1164                               ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1165                               jj = used[j] - rm->rm_firstdatacol;
1166                               ASSERT3S(jj, <, n);
1167                               invrows[i][j] = rows[i][jj];
1168                               rows[i][jj] = 0;
1169                     }
1170           }
1171 
1172           /*
1173            * For each of the rows of interest, we must normalize it and subtract
1174            * a multiple of it from the other rows.
1175            */
1176           for (i = 0; i < nmissing; i++) {
1177                     for (j = 0; j < missing[i]; j++) {
1178                               ASSERT0(rows[i][j]);
1179                     }
1180                     ASSERT3U(rows[i][missing[i]], !=, 0);
1181 
1182                     /*
1183                      * Compute the inverse of the first element and multiply each
1184                      * element in the row by that value.
1185                      */
1186                     log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1187 
1188                     for (j = 0; j < n; j++) {
1189                               rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1190                               invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1191                     }
1192 
1193                     for (ii = 0; ii < nmissing; ii++) {
1194                               if (i == ii)
1195                                         continue;
1196 
1197                               ASSERT3U(rows[ii][missing[i]], !=, 0);
1198 
1199                               log = vdev_raidz_log2[rows[ii][missing[i]]];
1200 
1201                               for (j = 0; j < n; j++) {
1202                                         rows[ii][j] ^=
1203                                             vdev_raidz_exp2(rows[i][j], log);
1204                                         invrows[ii][j] ^=
1205                                             vdev_raidz_exp2(invrows[i][j], log);
1206                               }
1207                     }
1208           }
1209 
1210           /*
1211            * Verify that the data that is left in the rows are properly part of
1212            * an identity matrix.
1213            */
1214           for (i = 0; i < nmissing; i++) {
1215                     for (j = 0; j < n; j++) {
1216                               if (j == missing[i]) {
1217                                         ASSERT3U(rows[i][j], ==, 1);
1218                               } else {
1219                                         ASSERT0(rows[i][j]);
1220                               }
1221                     }
1222           }
1223 }
1224 
1225 static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)1226 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1227     int *missing, uint8_t **invrows, const uint8_t *used)
1228 {
1229           int i, j, x, cc, c;
1230           uint8_t *src;
1231           uint64_t ccount;
1232           uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1233           uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1234           uint8_t log = 0;
1235           uint8_t val;
1236           int ll;
1237           uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1238           uint8_t *p, *pp;
1239           size_t psize;
1240 
1241           psize = sizeof (invlog[0][0]) * n * nmissing;
1242           p = kmem_alloc(psize, KM_SLEEP);
1243 
1244           for (pp = p, i = 0; i < nmissing; i++) {
1245                     invlog[i] = pp;
1246                     pp += n;
1247           }
1248 
1249           for (i = 0; i < nmissing; i++) {
1250                     for (j = 0; j < n; j++) {
1251                               ASSERT3U(invrows[i][j], !=, 0);
1252                               invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1253                     }
1254           }
1255 
1256           for (i = 0; i < n; i++) {
1257                     c = used[i];
1258                     ASSERT3U(c, <, rm->rm_cols);
1259 
1260                     src = rm->rm_col[c].rc_data;
1261                     ccount = rm->rm_col[c].rc_size;
1262                     for (j = 0; j < nmissing; j++) {
1263                               cc = missing[j] + rm->rm_firstdatacol;
1264                               ASSERT3U(cc, >=, rm->rm_firstdatacol);
1265                               ASSERT3U(cc, <, rm->rm_cols);
1266                               ASSERT3U(cc, !=, c);
1267 
1268                               dst[j] = rm->rm_col[cc].rc_data;
1269                               dcount[j] = rm->rm_col[cc].rc_size;
1270                     }
1271 
1272                     ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1273 
1274                     for (x = 0; x < ccount; x++, src++) {
1275                               if (*src != 0)
1276                                         log = vdev_raidz_log2[*src];
1277 
1278                               for (cc = 0; cc < nmissing; cc++) {
1279                                         if (x >= dcount[cc])
1280                                                   continue;
1281 
1282                                         if (*src == 0) {
1283                                                   val = 0;
1284                                         } else {
1285                                                   if ((ll = log + invlog[cc][i]) >= 255)
1286                                                             ll -= 255;
1287                                                   val = vdev_raidz_pow2[ll];
1288                                         }
1289 
1290                                         if (i == 0)
1291                                                   dst[cc][x] = val;
1292                                         else
1293                                                   dst[cc][x] ^= val;
1294                               }
1295                     }
1296           }
1297 
1298           kmem_free(p, psize);
1299 }
1300 
1301 static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1302 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1303 {
1304           int n, i, c, t, tt;
1305           int nmissing_rows;
1306           int missing_rows[VDEV_RAIDZ_MAXPARITY];
1307           int parity_map[VDEV_RAIDZ_MAXPARITY];
1308 
1309           uint8_t *p, *pp;
1310           size_t psize;
1311 
1312           uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1313           uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1314           uint8_t *used;
1315 
1316           int code = 0;
1317 
1318 
1319           n = rm->rm_cols - rm->rm_firstdatacol;
1320 
1321           /*
1322            * Figure out which data columns are missing.
1323            */
1324           nmissing_rows = 0;
1325           for (t = 0; t < ntgts; t++) {
1326                     if (tgts[t] >= rm->rm_firstdatacol) {
1327                               missing_rows[nmissing_rows++] =
1328                                   tgts[t] - rm->rm_firstdatacol;
1329                     }
1330           }
1331 
1332           /*
1333            * Figure out which parity columns to use to help generate the missing
1334            * data columns.
1335            */
1336           for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1337                     ASSERT(tt < ntgts);
1338                     ASSERT(c < rm->rm_firstdatacol);
1339 
1340                     /*
1341                      * Skip any targeted parity columns.
1342                      */
1343                     if (c == tgts[tt]) {
1344                               tt++;
1345                               continue;
1346                     }
1347 
1348                     code |= 1 << c;
1349 
1350                     parity_map[i] = c;
1351                     i++;
1352           }
1353 
1354           ASSERT(code != 0);
1355           ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1356 
1357           psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1358               nmissing_rows * n + sizeof (used[0]) * n;
1359           p = kmem_alloc(psize, KM_SLEEP);
1360 
1361           for (pp = p, i = 0; i < nmissing_rows; i++) {
1362                     rows[i] = pp;
1363                     pp += n;
1364                     invrows[i] = pp;
1365                     pp += n;
1366           }
1367           used = pp;
1368 
1369           for (i = 0; i < nmissing_rows; i++) {
1370                     used[i] = parity_map[i];
1371           }
1372 
1373           for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1374                     if (tt < nmissing_rows &&
1375                         c == missing_rows[tt] + rm->rm_firstdatacol) {
1376                               tt++;
1377                               continue;
1378                     }
1379 
1380                     ASSERT3S(i, <, n);
1381                     used[i] = c;
1382                     i++;
1383           }
1384 
1385           /*
1386            * Initialize the interesting rows of the matrix.
1387            */
1388           vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1389 
1390           /*
1391            * Invert the matrix.
1392            */
1393           vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1394               invrows, used);
1395 
1396           /*
1397            * Reconstruct the missing data using the generated matrix.
1398            */
1399           vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1400               invrows, used);
1401 
1402           kmem_free(p, psize);
1403 
1404           return (code);
1405 }
1406 
1407 static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1408 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1409 {
1410           int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1411           int ntgts;
1412           int i, c;
1413           int code;
1414           int nbadparity, nbaddata;
1415           int parity_valid[VDEV_RAIDZ_MAXPARITY] = {0};
1416 
1417           /*
1418            * The tgts list must already be sorted.
1419            */
1420           for (i = 1; i < nt; i++) {
1421                     ASSERT(t[i] > t[i - 1]);
1422           }
1423 
1424           nbadparity = rm->rm_firstdatacol;
1425           nbaddata = rm->rm_cols - nbadparity;
1426           ntgts = 0;
1427           for (i = 0, c = 0; c < rm->rm_cols; c++) {
1428                     if (c < rm->rm_firstdatacol)
1429                               parity_valid[c] = B_FALSE;
1430 
1431                     if (i < nt && c == t[i]) {
1432                               tgts[ntgts++] = c;
1433                               i++;
1434                     } else if (rm->rm_col[c].rc_error != 0) {
1435                               tgts[ntgts++] = c;
1436                     } else if (c >= rm->rm_firstdatacol) {
1437                               nbaddata--;
1438                     } else {
1439                               parity_valid[c] = B_TRUE;
1440                               nbadparity--;
1441                     }
1442           }
1443 
1444           ASSERT(ntgts >= nt);
1445           ASSERT(nbaddata >= 0);
1446           ASSERT(nbaddata + nbadparity == ntgts);
1447 
1448           dt = &tgts[nbadparity];
1449 
1450           /*
1451            * See if we can use any of our optimized reconstruction routines.
1452            */
1453           if (!vdev_raidz_default_to_general) {
1454                     switch (nbaddata) {
1455                     case 1:
1456                               if (parity_valid[VDEV_RAIDZ_P])
1457                                         return (vdev_raidz_reconstruct_p(rm, dt, 1));
1458 
1459                               ASSERT(rm->rm_firstdatacol > 1);
1460 
1461                               if (parity_valid[VDEV_RAIDZ_Q])
1462                                         return (vdev_raidz_reconstruct_q(rm, dt, 1));
1463 
1464                               ASSERT(rm->rm_firstdatacol > 2);
1465                               break;
1466 
1467                     case 2:
1468                               ASSERT(rm->rm_firstdatacol > 1);
1469 
1470                               if (parity_valid[VDEV_RAIDZ_P] &&
1471                                   parity_valid[VDEV_RAIDZ_Q])
1472                                         return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1473 
1474                               ASSERT(rm->rm_firstdatacol > 2);
1475 
1476                               break;
1477                     }
1478           }
1479 
1480           code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1481           ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1482           ASSERT(code > 0);
1483           return (code);
1484 }
1485 
1486 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)1487 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1488     uint64_t *logical_ashift, uint64_t *physical_ashift)
1489 {
1490           vdev_t *cvd;
1491           uint64_t nparity = vd->vdev_nparity;
1492           int c;
1493           int lasterror = 0;
1494           int numerrors = 0;
1495 
1496           ASSERT(nparity > 0);
1497 
1498           if (nparity > VDEV_RAIDZ_MAXPARITY ||
1499               vd->vdev_children < nparity + 1) {
1500                     vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1501                     return (SET_ERROR(EINVAL));
1502           }
1503 
1504           vdev_open_children(vd);
1505 
1506           for (c = 0; c < vd->vdev_children; c++) {
1507                     cvd = vd->vdev_child[c];
1508 
1509                     if (cvd->vdev_open_error != 0) {
1510                               lasterror = cvd->vdev_open_error;
1511                               numerrors++;
1512                               continue;
1513                     }
1514 
1515                     *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1516                     *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1517                     *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1518                     *physical_ashift = MAX(*physical_ashift,
1519                         cvd->vdev_physical_ashift);
1520           }
1521 
1522           *asize *= vd->vdev_children;
1523           *max_asize *= vd->vdev_children;
1524 
1525           if (numerrors > nparity) {
1526                     vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1527                     return (lasterror);
1528           }
1529 
1530           return (0);
1531 }
1532 
1533 static void
vdev_raidz_close(vdev_t * vd)1534 vdev_raidz_close(vdev_t *vd)
1535 {
1536           int c;
1537 
1538           for (c = 0; c < vd->vdev_children; c++)
1539                     vdev_close(vd->vdev_child[c]);
1540 }
1541 
1542 #ifdef illumos
1543 /*
1544  * Handle a read or write I/O to a RAID-Z dump device.
1545  *
1546  * The dump device is in a unique situation compared to other ZFS datasets:
1547  * writing to this device should be as simple and fast as possible.  In
1548  * addition, durability matters much less since the dump will be extracted
1549  * once the machine reboots.  For that reason, this function eschews parity for
1550  * performance and simplicity.  The dump device uses the checksum setting
1551  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1552  * dataset.
1553  *
1554  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1555  * 128 KB will not fill an entire block; in addition, they may not be properly
1556  * aligned.  In that case, this function uses the preallocated 128 KB block and
1557  * omits reading or writing any "empty" portions of that block, as opposed to
1558  * allocating a fresh appropriately-sized block.
1559  *
1560  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1561  *
1562  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1563  *
1564  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1565  * allocated which spans all five child vdevs.  8 KB of data would be written to
1566  * each of four vdevs, with the fifth containing the parity bits.
1567  *
1568  *       parity    data     data     data     data
1569  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1570  *         ^        ^        ^        ^        ^
1571  *         |        |        |        |        |
1572  *   8 KB parity    ------8 KB data blocks------
1573  *
1574  * However, when writing to the dump device, the behavior is different:
1575  *
1576  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1577  *
1578  * Unlike the normal RAID-Z case in which the block is allocated based on the
1579  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1580  * I/O size is less than 128 KB, only the actual portions of data are written.
1581  * In this example the data is written to the third data vdev since that vdev
1582  * contains the offset [64 KB, 96 KB).
1583  *
1584  *       parity    data     data     data     data
1585  *     |        |        |        |   XX   |        |
1586  *                                    ^
1587  *                                    |
1588  *                             32 KB data block
1589  *
1590  * As a result, an individual I/O may not span all child vdevs; moreover, a
1591  * small I/O may only operate on a single child vdev.
1592  *
1593  * Note that since there are no parity bits calculated or written, this format
1594  * remains the same no matter how many parity bits are used in a normal RAID-Z
1595  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1596  * would look like:
1597  *
1598  *       parity   parity   parity    data     data     data     data
1599  *     |        |        |        |        |        |   XX   |        |
1600  *                                                      ^
1601  *                                                      |
1602  *                                               32 KB data block
1603  */
1604 int
vdev_raidz_physio(vdev_t * vd,caddr_t data,size_t size,uint64_t offset,uint64_t origoffset,boolean_t doread,boolean_t isdump)1605 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1606     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1607 {
1608           vdev_t *tvd = vd->vdev_top;
1609           vdev_t *cvd;
1610           raidz_map_t *rm;
1611           raidz_col_t *rc;
1612           int c, err = 0;
1613 
1614           uint64_t start, end, colstart, colend;
1615           uint64_t coloffset, colsize, colskip;
1616 
1617           int flags = doread ? BIO_READ : BIO_WRITE;
1618 
1619 #ifdef    _KERNEL
1620 
1621           /*
1622            * Don't write past the end of the block
1623            */
1624           VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1625 
1626           start = offset;
1627           end = start + size;
1628 
1629           /*
1630            * Allocate a RAID-Z map for this block.  Note that this block starts
1631            * from the "original" offset, this is, the offset of the extent which
1632            * contains the requisite offset of the data being read or written.
1633            *
1634            * Even if this I/O operation doesn't span the full block size, let's
1635            * treat the on-disk format as if the only blocks are the complete 128
1636            * KB size.
1637            */
1638           rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1639               SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1640               vd->vdev_children, vd->vdev_nparity);
1641 
1642           coloffset = origoffset;
1643 
1644           for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1645               c++, coloffset += rc->rc_size) {
1646                     rc = &rm->rm_col[c];
1647                     cvd = vd->vdev_child[rc->rc_devidx];
1648 
1649                     /*
1650                      * Find the start and end of this column in the RAID-Z map,
1651                      * keeping in mind that the stated size and offset of the
1652                      * operation may not fill the entire column for this vdev.
1653                      *
1654                      * If any portion of the data spans this column, issue the
1655                      * appropriate operation to the vdev.
1656                      */
1657                     if (coloffset + rc->rc_size <= start)
1658                               continue;
1659                     if (coloffset >= end)
1660                               continue;
1661 
1662                     colstart = MAX(coloffset, start);
1663                     colend = MIN(end, coloffset + rc->rc_size);
1664                     colsize = colend - colstart;
1665                     colskip = colstart - coloffset;
1666 
1667                     VERIFY3U(colsize, <=, rc->rc_size);
1668                     VERIFY3U(colskip, <=, rc->rc_size);
1669 
1670                     /*
1671                      * Note that the child vdev will have a vdev label at the start
1672                      * of its range of offsets, hence the need for
1673                      * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1674                      * example of why this calculation is needed.
1675                      */
1676                     if ((err = vdev_disk_physio(cvd,
1677                         ((char *)rc->rc_data) + colskip, colsize,
1678                         VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1679                         flags, isdump)) != 0)
1680                               break;
1681           }
1682 
1683           vdev_raidz_map_free(rm);
1684 #endif    /* KERNEL */
1685 
1686           return (err);
1687 }
1688 #endif
1689 
1690 static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1691 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1692 {
1693           uint64_t asize;
1694           uint64_t ashift = vd->vdev_top->vdev_ashift;
1695           uint64_t cols = vd->vdev_children;
1696           uint64_t nparity = vd->vdev_nparity;
1697 
1698           asize = ((psize - 1) >> ashift) + 1;
1699           asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1700           asize = roundup(asize, nparity + 1) << ashift;
1701 
1702           return (asize);
1703 }
1704 
1705 static void
vdev_raidz_child_done(zio_t * zio)1706 vdev_raidz_child_done(zio_t *zio)
1707 {
1708           raidz_col_t *rc = zio->io_private;
1709 
1710           rc->rc_error = zio->io_error;
1711           rc->rc_tried = 1;
1712           rc->rc_skipped = 0;
1713 }
1714 
1715 /*
1716  * Start an IO operation on a RAIDZ VDev
1717  *
1718  * Outline:
1719  * - For write operations:
1720  *   1. Generate the parity data
1721  *   2. Create child zio write operations to each column's vdev, for both
1722  *      data and parity.
1723  *   3. If the column skips any sectors for padding, create optional dummy
1724  *      write zio children for those areas to improve aggregation continuity.
1725  * - For read operations:
1726  *   1. Create child zio read operations to each data column's vdev to read
1727  *      the range of data required for zio.
1728  *   2. If this is a scrub or resilver operation, or if any of the data
1729  *      vdevs have had errors, then create zio read operations to the parity
1730  *      columns' VDevs as well.
1731  */
1732 static void
vdev_raidz_io_start(zio_t * zio)1733 vdev_raidz_io_start(zio_t *zio)
1734 {
1735           vdev_t *vd = zio->io_vd;
1736           vdev_t *tvd = vd->vdev_top;
1737           vdev_t *cvd;
1738           raidz_map_t *rm;
1739           raidz_col_t *rc;
1740           int c, i;
1741 
1742           rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1743               zio->io_type == ZIO_TYPE_FREE,
1744               tvd->vdev_ashift, vd->vdev_children,
1745               vd->vdev_nparity);
1746 
1747           zio->io_vsd = rm;
1748           zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1749 
1750           ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1751 
1752           if (zio->io_type == ZIO_TYPE_FREE) {
1753                     for (c = 0; c < rm->rm_cols; c++) {
1754                               rc = &rm->rm_col[c];
1755                               cvd = vd->vdev_child[rc->rc_devidx];
1756                               zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1757                                   rc->rc_offset, rc->rc_data, rc->rc_size,
1758                                   zio->io_type, zio->io_priority, 0,
1759                                   vdev_raidz_child_done, rc));
1760                     }
1761 
1762                     zio_execute(zio);
1763                     return;
1764           }
1765 
1766           if (zio->io_type == ZIO_TYPE_WRITE) {
1767                     vdev_raidz_generate_parity(rm);
1768 
1769                     for (c = 0; c < rm->rm_cols; c++) {
1770                               rc = &rm->rm_col[c];
1771                               cvd = vd->vdev_child[rc->rc_devidx];
1772                               zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1773                                   rc->rc_offset, rc->rc_data, rc->rc_size,
1774                                   zio->io_type, zio->io_priority, 0,
1775                                   vdev_raidz_child_done, rc));
1776                     }
1777 
1778                     /*
1779                      * Generate optional I/Os for any skipped sectors to improve
1780                      * aggregation contiguity.
1781                      */
1782                     for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1783                               ASSERT(c <= rm->rm_scols);
1784                               if (c == rm->rm_scols)
1785                                         c = 0;
1786                               rc = &rm->rm_col[c];
1787                               cvd = vd->vdev_child[rc->rc_devidx];
1788                               zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1789                                   rc->rc_offset + rc->rc_size, NULL,
1790                                   1 << tvd->vdev_ashift,
1791                                   zio->io_type, zio->io_priority,
1792                                   ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1793                     }
1794 
1795                     zio_execute(zio);
1796                     return;
1797           }
1798 
1799           ASSERT(zio->io_type == ZIO_TYPE_READ);
1800 
1801           /*
1802            * Iterate over the columns in reverse order so that we hit the parity
1803            * last -- any errors along the way will force us to read the parity.
1804            */
1805           for (c = rm->rm_cols - 1; c >= 0; c--) {
1806                     rc = &rm->rm_col[c];
1807                     cvd = vd->vdev_child[rc->rc_devidx];
1808                     if (!vdev_readable(cvd)) {
1809                               if (c >= rm->rm_firstdatacol)
1810                                         rm->rm_missingdata++;
1811                               else
1812                                         rm->rm_missingparity++;
1813                               rc->rc_error = SET_ERROR(ENXIO);
1814                               rc->rc_tried = 1;   /* don't even try */
1815                               rc->rc_skipped = 1;
1816                               continue;
1817                     }
1818                     if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1819                               if (c >= rm->rm_firstdatacol)
1820                                         rm->rm_missingdata++;
1821                               else
1822                                         rm->rm_missingparity++;
1823                               rc->rc_error = SET_ERROR(ESTALE);
1824                               rc->rc_skipped = 1;
1825                               continue;
1826                     }
1827                     if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1828                         (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1829                               zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1830                                   rc->rc_offset, rc->rc_data, rc->rc_size,
1831                                   zio->io_type, zio->io_priority, 0,
1832                                   vdev_raidz_child_done, rc));
1833                     }
1834           }
1835 
1836           zio_execute(zio);
1837 }
1838 
1839 
1840 /*
1841  * Report a checksum error for a child of a RAID-Z device.
1842  */
1843 static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)1844 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1845 {
1846           vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1847 
1848           if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1849                     zio_bad_cksum_t zbc;
1850                     raidz_map_t *rm = zio->io_vsd;
1851 
1852                     mutex_enter(&vd->vdev_stat_lock);
1853                     vd->vdev_stat.vs_checksum_errors++;
1854                     mutex_exit(&vd->vdev_stat_lock);
1855 
1856                     zbc.zbc_has_cksum = 0;
1857                     zbc.zbc_injected = rm->rm_ecksuminjected;
1858 
1859                     zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1860                         rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1861                         &zbc);
1862           }
1863 }
1864 
1865 /*
1866  * We keep track of whether or not there were any injected errors, so that
1867  * any ereports we generate can note it.
1868  */
1869 static int
raidz_checksum_verify(zio_t * zio)1870 raidz_checksum_verify(zio_t *zio)
1871 {
1872           zio_bad_cksum_t zbc;
1873           raidz_map_t *rm = zio->io_vsd;
1874 
1875           int ret = zio_checksum_error(zio, &zbc);
1876           if (ret != 0 && zbc.zbc_injected != 0)
1877                     rm->rm_ecksuminjected = 1;
1878 
1879           return (ret);
1880 }
1881 
1882 /*
1883  * Generate the parity from the data columns. If we tried and were able to
1884  * read the parity without error, verify that the generated parity matches the
1885  * data we read. If it doesn't, we fire off a checksum error. Return the
1886  * number such failures.
1887  */
1888 static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)1889 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1890 {
1891           void *orig[VDEV_RAIDZ_MAXPARITY];
1892           int c, ret = 0;
1893           raidz_col_t *rc;
1894 
1895           blkptr_t *bp = zio->io_bp;
1896           enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1897               (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1898 
1899           if (checksum == ZIO_CHECKSUM_NOPARITY)
1900                     return (ret);
1901 
1902           for (c = 0; c < rm->rm_firstdatacol; c++) {
1903                     rc = &rm->rm_col[c];
1904                     if (!rc->rc_tried || rc->rc_error != 0)
1905                               continue;
1906                     orig[c] = zio_buf_alloc(rc->rc_size);
1907                     bcopy(rc->rc_data, orig[c], rc->rc_size);
1908           }
1909 
1910           vdev_raidz_generate_parity(rm);
1911 
1912           for (c = 0; c < rm->rm_firstdatacol; c++) {
1913                     rc = &rm->rm_col[c];
1914                     if (!rc->rc_tried || rc->rc_error != 0)
1915                               continue;
1916                     if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1917                               raidz_checksum_error(zio, rc, orig[c]);
1918                               rc->rc_error = SET_ERROR(ECKSUM);
1919                               ret++;
1920                     }
1921                     zio_buf_free(orig[c], rc->rc_size);
1922           }
1923 
1924           return (ret);
1925 }
1926 
1927 /*
1928  * Keep statistics on all the ways that we used parity to correct data.
1929  */
1930 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1931 
1932 static int
vdev_raidz_worst_error(raidz_map_t * rm)1933 vdev_raidz_worst_error(raidz_map_t *rm)
1934 {
1935           int error = 0;
1936 
1937           for (int c = 0; c < rm->rm_cols; c++)
1938                     error = zio_worst_error(error, rm->rm_col[c].rc_error);
1939 
1940           return (error);
1941 }
1942 
1943 /*
1944  * Iterate over all combinations of bad data and attempt a reconstruction.
1945  * Note that the algorithm below is non-optimal because it doesn't take into
1946  * account how reconstruction is actually performed. For example, with
1947  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1948  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1949  * cases we'd only use parity information in column 0.
1950  */
1951 static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)1952 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1953 {
1954           raidz_map_t *rm = zio->io_vsd;
1955           raidz_col_t *rc;
1956           void *orig[VDEV_RAIDZ_MAXPARITY];
1957           int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1958           int *tgts = &tstore[1];
1959           int current, next, i, c, n;
1960           int code, ret = 0;
1961 
1962           ASSERT(total_errors < rm->rm_firstdatacol);
1963 
1964           /*
1965            * This simplifies one edge condition.
1966            */
1967           tgts[-1] = -1;
1968 
1969           for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1970                     /*
1971                      * Initialize the targets array by finding the first n columns
1972                      * that contain no error.
1973                      *
1974                      * If there were no data errors, we need to ensure that we're
1975                      * always explicitly attempting to reconstruct at least one
1976                      * data column. To do this, we simply push the highest target
1977                      * up into the data columns.
1978                      */
1979                     for (c = 0, i = 0; i < n; i++) {
1980                               if (i == n - 1 && data_errors == 0 &&
1981                                   c < rm->rm_firstdatacol) {
1982                                         c = rm->rm_firstdatacol;
1983                               }
1984 
1985                               while (rm->rm_col[c].rc_error != 0) {
1986                                         c++;
1987                                         ASSERT3S(c, <, rm->rm_cols);
1988                               }
1989 
1990                               tgts[i] = c++;
1991                     }
1992 
1993                     /*
1994                      * Setting tgts[n] simplifies the other edge condition.
1995                      */
1996                     tgts[n] = rm->rm_cols;
1997 
1998                     /*
1999                      * These buffers were allocated in previous iterations.
2000                      */
2001                     for (i = 0; i < n - 1; i++) {
2002                               ASSERT(orig[i] != NULL);
2003                     }
2004 
2005                     orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2006 
2007                     current = 0;
2008                     next = tgts[current];
2009 
2010                     while (current != n) {
2011                               tgts[current] = next;
2012                               current = 0;
2013 
2014                               /*
2015                                * Save off the original data that we're going to
2016                                * attempt to reconstruct.
2017                                */
2018                               for (i = 0; i < n; i++) {
2019                                         ASSERT(orig[i] != NULL);
2020                                         c = tgts[i];
2021                                         ASSERT3S(c, >=, 0);
2022                                         ASSERT3S(c, <, rm->rm_cols);
2023                                         rc = &rm->rm_col[c];
2024                                         bcopy(rc->rc_data, orig[i], rc->rc_size);
2025                               }
2026 
2027                               /*
2028                                * Attempt a reconstruction and exit the outer loop on
2029                                * success.
2030                                */
2031                               code = vdev_raidz_reconstruct(rm, tgts, n);
2032                               if (raidz_checksum_verify(zio) == 0) {
2033                                         atomic_inc_64(&raidz_corrected[code]);
2034 
2035                                         for (i = 0; i < n; i++) {
2036                                                   c = tgts[i];
2037                                                   rc = &rm->rm_col[c];
2038                                                   ASSERT(rc->rc_error == 0);
2039                                                   if (rc->rc_tried)
2040                                                             raidz_checksum_error(zio, rc,
2041                                                                 orig[i]);
2042                                                   rc->rc_error = SET_ERROR(ECKSUM);
2043                                         }
2044 
2045                                         ret = code;
2046                                         goto done;
2047                               }
2048 
2049                               /*
2050                                * Restore the original data.
2051                                */
2052                               for (i = 0; i < n; i++) {
2053                                         c = tgts[i];
2054                                         rc = &rm->rm_col[c];
2055                                         bcopy(orig[i], rc->rc_data, rc->rc_size);
2056                               }
2057 
2058                               do {
2059                                         /*
2060                                          * Find the next valid column after the current
2061                                          * position..
2062                                          */
2063                                         for (next = tgts[current] + 1;
2064                                             next < rm->rm_cols &&
2065                                             rm->rm_col[next].rc_error != 0; next++)
2066                                                   continue;
2067 
2068                                         ASSERT(next <= tgts[current + 1]);
2069 
2070                                         /*
2071                                          * If that spot is available, we're done here.
2072                                          */
2073                                         if (next != tgts[current + 1])
2074                                                   break;
2075 
2076                                         /*
2077                                          * Otherwise, find the next valid column after
2078                                          * the previous position.
2079                                          */
2080                                         for (c = tgts[current - 1] + 1;
2081                                             rm->rm_col[c].rc_error != 0; c++)
2082                                                   continue;
2083 
2084                                         tgts[current] = c;
2085                                         current++;
2086 
2087                               } while (current != n);
2088                     }
2089           }
2090           n--;
2091 done:
2092           for (i = 0; i < n; i++) {
2093                     zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2094           }
2095 
2096           return (ret);
2097 }
2098 
2099 /*
2100  * Complete an IO operation on a RAIDZ VDev
2101  *
2102  * Outline:
2103  * - For write operations:
2104  *   1. Check for errors on the child IOs.
2105  *   2. Return, setting an error code if too few child VDevs were written
2106  *      to reconstruct the data later.  Note that partial writes are
2107  *      considered successful if they can be reconstructed at all.
2108  * - For read operations:
2109  *   1. Check for errors on the child IOs.
2110  *   2. If data errors occurred:
2111  *      a. Try to reassemble the data from the parity available.
2112  *      b. If we haven't yet read the parity drives, read them now.
2113  *      c. If all parity drives have been read but the data still doesn't
2114  *         reassemble with a correct checksum, then try combinatorial
2115  *         reconstruction.
2116  *      d. If that doesn't work, return an error.
2117  *   3. If there were unexpected errors or this is a resilver operation,
2118  *      rewrite the vdevs that had errors.
2119  */
2120 static void
vdev_raidz_io_done(zio_t * zio)2121 vdev_raidz_io_done(zio_t *zio)
2122 {
2123           vdev_t *vd = zio->io_vd;
2124           vdev_t *cvd;
2125           raidz_map_t *rm = zio->io_vsd;
2126           raidz_col_t *rc;
2127           int unexpected_errors = 0;
2128           int parity_errors = 0;
2129           int parity_untried = 0;
2130           int data_errors = 0;
2131           int total_errors = 0;
2132           int n, c;
2133           int tgts[VDEV_RAIDZ_MAXPARITY];
2134           int code;
2135 
2136           ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2137 
2138           ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2139           ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2140 
2141           for (c = 0; c < rm->rm_cols; c++) {
2142                     rc = &rm->rm_col[c];
2143 
2144                     if (rc->rc_error) {
2145                               ASSERT(rc->rc_error != ECKSUM);         /* child has no bp */
2146 
2147                               if (c < rm->rm_firstdatacol)
2148                                         parity_errors++;
2149                               else
2150                                         data_errors++;
2151 
2152                               if (!rc->rc_skipped)
2153                                         unexpected_errors++;
2154 
2155                               total_errors++;
2156                     } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2157                               parity_untried++;
2158                     }
2159           }
2160 
2161           if (zio->io_type == ZIO_TYPE_WRITE) {
2162                     /*
2163                      * XXX -- for now, treat partial writes as a success.
2164                      * (If we couldn't write enough columns to reconstruct
2165                      * the data, the I/O failed.  Otherwise, good enough.)
2166                      *
2167                      * Now that we support write reallocation, it would be better
2168                      * to treat partial failure as real failure unless there are
2169                      * no non-degraded top-level vdevs left, and not update DTLs
2170                      * if we intend to reallocate.
2171                      */
2172                     /* XXPOLICY */
2173                     if (total_errors > rm->rm_firstdatacol)
2174                               zio->io_error = vdev_raidz_worst_error(rm);
2175 
2176                     return;
2177           } else if (zio->io_type == ZIO_TYPE_FREE) {
2178                     return;
2179           }
2180 
2181           ASSERT(zio->io_type == ZIO_TYPE_READ);
2182           /*
2183            * There are three potential phases for a read:
2184            *        1. produce valid data from the columns read
2185            *        2. read all disks and try again
2186            *        3. perform combinatorial reconstruction
2187            *
2188            * Each phase is progressively both more expensive and less likely to
2189            * occur. If we encounter more errors than we can repair or all phases
2190            * fail, we have no choice but to return an error.
2191            */
2192 
2193           /*
2194            * If the number of errors we saw was correctable -- less than or equal
2195            * to the number of parity disks read -- attempt to produce data that
2196            * has a valid checksum. Naturally, this case applies in the absence of
2197            * any errors.
2198            */
2199           if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2200                     if (data_errors == 0) {
2201                               if (raidz_checksum_verify(zio) == 0) {
2202                                         /*
2203                                          * If we read parity information (unnecessarily
2204                                          * as it happens since no reconstruction was
2205                                          * needed) regenerate and verify the parity.
2206                                          * We also regenerate parity when resilvering
2207                                          * so we can write it out to the failed device
2208                                          * later.
2209                                          */
2210                                         if (parity_errors + parity_untried <
2211                                             rm->rm_firstdatacol ||
2212                                             (zio->io_flags & ZIO_FLAG_RESILVER)) {
2213                                                   n = raidz_parity_verify(zio, rm);
2214                                                   unexpected_errors += n;
2215                                                   ASSERT(parity_errors + n <=
2216                                                       rm->rm_firstdatacol);
2217                                         }
2218                                         goto done;
2219                               }
2220                     } else {
2221                               /*
2222                                * We either attempt to read all the parity columns or
2223                                * none of them. If we didn't try to read parity, we
2224                                * wouldn't be here in the correctable case. There must
2225                                * also have been fewer parity errors than parity
2226                                * columns or, again, we wouldn't be in this code path.
2227                                */
2228                               ASSERT(parity_untried == 0);
2229                               ASSERT(parity_errors < rm->rm_firstdatacol);
2230 
2231                               /*
2232                                * Identify the data columns that reported an error.
2233                                */
2234                               n = 0;
2235                               for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2236                                         rc = &rm->rm_col[c];
2237                                         if (rc->rc_error != 0) {
2238                                                   ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2239                                                   tgts[n++] = c;
2240                                         }
2241                               }
2242 
2243                               ASSERT(rm->rm_firstdatacol >= n);
2244 
2245                               code = vdev_raidz_reconstruct(rm, tgts, n);
2246 
2247                               if (raidz_checksum_verify(zio) == 0) {
2248                                         atomic_inc_64(&raidz_corrected[code]);
2249 
2250                                         /*
2251                                          * If we read more parity disks than were used
2252                                          * for reconstruction, confirm that the other
2253                                          * parity disks produced correct data. This
2254                                          * routine is suboptimal in that it regenerates
2255                                          * the parity that we already used in addition
2256                                          * to the parity that we're attempting to
2257                                          * verify, but this should be a relatively
2258                                          * uncommon case, and can be optimized if it
2259                                          * becomes a problem. Note that we regenerate
2260                                          * parity when resilvering so we can write it
2261                                          * out to failed devices later.
2262                                          */
2263                                         if (parity_errors < rm->rm_firstdatacol - n ||
2264                                             (zio->io_flags & ZIO_FLAG_RESILVER)) {
2265                                                   n = raidz_parity_verify(zio, rm);
2266                                                   unexpected_errors += n;
2267                                                   ASSERT(parity_errors + n <=
2268                                                       rm->rm_firstdatacol);
2269                                         }
2270 
2271                                         goto done;
2272                               }
2273                     }
2274           }
2275 
2276           /*
2277            * This isn't a typical situation -- either we got a read error or
2278            * a child silently returned bad data. Read every block so we can
2279            * try again with as much data and parity as we can track down. If
2280            * we've already been through once before, all children will be marked
2281            * as tried so we'll proceed to combinatorial reconstruction.
2282            */
2283           unexpected_errors = 1;
2284           rm->rm_missingdata = 0;
2285           rm->rm_missingparity = 0;
2286 
2287           for (c = 0; c < rm->rm_cols; c++) {
2288                     if (rm->rm_col[c].rc_tried)
2289                               continue;
2290 
2291                     zio_vdev_io_redone(zio);
2292                     do {
2293                               rc = &rm->rm_col[c];
2294                               if (rc->rc_tried)
2295                                         continue;
2296                               zio_nowait(zio_vdev_child_io(zio, NULL,
2297                                   vd->vdev_child[rc->rc_devidx],
2298                                   rc->rc_offset, rc->rc_data, rc->rc_size,
2299                                   zio->io_type, zio->io_priority, 0,
2300                                   vdev_raidz_child_done, rc));
2301                     } while (++c < rm->rm_cols);
2302 
2303                     return;
2304           }
2305 
2306           /*
2307            * At this point we've attempted to reconstruct the data given the
2308            * errors we detected, and we've attempted to read all columns. There
2309            * must, therefore, be one or more additional problems -- silent errors
2310            * resulting in invalid data rather than explicit I/O errors resulting
2311            * in absent data. We check if there is enough additional data to
2312            * possibly reconstruct the data and then perform combinatorial
2313            * reconstruction over all possible combinations. If that fails,
2314            * we're cooked.
2315            */
2316           if (total_errors > rm->rm_firstdatacol) {
2317                     zio->io_error = vdev_raidz_worst_error(rm);
2318 
2319           } else if (total_errors < rm->rm_firstdatacol &&
2320               (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2321                     /*
2322                      * If we didn't use all the available parity for the
2323                      * combinatorial reconstruction, verify that the remaining
2324                      * parity is correct.
2325                      */
2326                     if (code != (1 << rm->rm_firstdatacol) - 1)
2327                               (void) raidz_parity_verify(zio, rm);
2328           } else {
2329                     /*
2330                      * We're here because either:
2331                      *
2332                      *        total_errors == rm_first_datacol, or
2333                      *        vdev_raidz_combrec() failed
2334                      *
2335                      * In either case, there is enough bad data to prevent
2336                      * reconstruction.
2337                      *
2338                      * Start checksum ereports for all children which haven't
2339                      * failed, and the IO wasn't speculative.
2340                      */
2341                     zio->io_error = SET_ERROR(ECKSUM);
2342 
2343                     if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2344                               for (c = 0; c < rm->rm_cols; c++) {
2345                                         rc = &rm->rm_col[c];
2346                                         if (rc->rc_error == 0) {
2347                                                   zio_bad_cksum_t zbc;
2348                                                   zbc.zbc_has_cksum = 0;
2349                                                   zbc.zbc_injected =
2350                                                       rm->rm_ecksuminjected;
2351 
2352                                                   zfs_ereport_start_checksum(
2353                                                       zio->io_spa,
2354                                                       vd->vdev_child[rc->rc_devidx],
2355                                                       zio, rc->rc_offset, rc->rc_size,
2356                                                       (void *)(uintptr_t)c, &zbc);
2357                                         }
2358                               }
2359                     }
2360           }
2361 
2362 done:
2363           zio_checksum_verified(zio);
2364 
2365           if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2366               (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2367                     /*
2368                      * Use the good data we have in hand to repair damaged children.
2369                      */
2370                     for (c = 0; c < rm->rm_cols; c++) {
2371                               rc = &rm->rm_col[c];
2372                               cvd = vd->vdev_child[rc->rc_devidx];
2373 
2374                               if (rc->rc_error == 0)
2375                                         continue;
2376 
2377                               zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2378                                   rc->rc_offset, rc->rc_data, rc->rc_size,
2379                                   ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2380                                   ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2381                                   ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2382                     }
2383           }
2384 }
2385 
2386 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2387 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2388 {
2389           if (faulted > vd->vdev_nparity)
2390                     vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2391                         VDEV_AUX_NO_REPLICAS);
2392           else if (degraded + faulted != 0)
2393                     vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2394           else
2395                     vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2396 }
2397 
2398 vdev_ops_t vdev_raidz_ops = {
2399           vdev_raidz_open,
2400           vdev_raidz_close,
2401           vdev_raidz_asize,
2402           vdev_raidz_io_start,
2403           vdev_raidz_io_done,
2404           vdev_raidz_state_change,
2405           NULL,
2406           NULL,
2407           VDEV_TYPE_RAIDZ,    /* name of this vdev type */
2408           B_FALSE                       /* not a leaf vdev */
2409 };
2410