1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/endian.h>
33 #include <sys/kernel.h>
34 #include <sys/kobj.h>
35 #include <sys/limits.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include <geom/geom_dbg.h>
43 #include "geom/raid/g_raid.h"
44 #include "g_raid_tr_if.h"
45
/*
 * Number of copies kept of every virtual strip; P2V() reports the copy index
 * as (physical strip number % N).
 */
#define N 2

SYSCTL_DECL(_kern_geom_raid_raid1e);

/*
 * Rebuild tunables.  Each one consists of a compile-time default, a variable
 * holding the current value and a sysctl node (CTLFLAG_RWTUN) exporting it.
 * NOTE(review): the variables are declared int but exported with SYSCTL_UINT;
 * confirm that the signedness mismatch is intended.
 */
#define RAID1E_REBUILD_SLAB (1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

/* Malloc type used for the rebuild buffer. */
static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

/* Values stored in trso_type. */
#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

/* Bits stored in trso_flags. */
#define TR_RAID1E_F_DOING_SOME 0x1
#define TR_RAID1E_F_LOCKED 0x2
#define TR_RAID1E_F_ABORT 0x4
83
/*
 * Per-volume state of the RAID1E transformation.  trso_base has to remain the
 * first member: the code in this file casts a struct g_raid_tr_object pointer
 * directly to this type.
 */
struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object trso_base;
	int trso_starting;		/* Set by taste, cleared by start/stop. */
	int trso_stopping;		/* Set by stop. */
	int trso_type;			/* One of TR_RAID1E_{NONE,REBUILD,RESYNC}. */
	int trso_recover_slabs; /* slabs before rest */
	int trso_fair_io;		/* Countdown of regular I/Os until a rebuild round. */
	int trso_meta_update;		/* Countdown of rebuild rounds until metadata write. */
	int trso_flags;			/* TR_RAID1E_F_* bits. */
	struct g_raid_subdisk *trso_failed_sd; /* like per volume */
	void *trso_buffer; /* Buffer space */
	off_t trso_lock_pos; /* Locked range start. */
	off_t trso_lock_len; /* Locked range length. */
	struct bio trso_bio;		/* Bio reused for every rebuild read/write. */
};
99
/* Prototypes of the transformation methods implemented by this file. */
static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

/* kobj method table binding the g_raid_tr interface to the functions above. */
static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e),
	{ 0, 0 }	/* Table terminator. */
};
124
/*
 * Class descriptor for the RAID1E transformation: class name, method table
 * and the size of the per-volume object.  The trc_* fields are presumably
 * interpreted by the g_raid core -- see g_raid.h for their meaning.
 */
static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};
133
/* Forward declarations for helpers that are used before they are defined. */
static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);
139
140 static inline void
V2P(struct g_raid_volume * vol,off_t virt,int * disk,off_t * offset,off_t * start)141 V2P(struct g_raid_volume *vol, off_t virt,
142 int *disk, off_t *offset, off_t *start)
143 {
144 off_t nstrip;
145 u_int strip_size;
146
147 strip_size = vol->v_strip_size;
148 /* Strip number. */
149 nstrip = virt / strip_size;
150 /* Start position in strip. */
151 *start = virt % strip_size;
152 /* Disk number. */
153 *disk = (nstrip * N) % vol->v_disks_count;
154 /* Strip start position in disk. */
155 *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
156 }
157
158 static inline void
P2V(struct g_raid_volume * vol,int disk,off_t offset,off_t * virt,int * copy)159 P2V(struct g_raid_volume *vol, int disk, off_t offset,
160 off_t *virt, int *copy)
161 {
162 off_t nstrip, start;
163 u_int strip_size;
164
165 strip_size = vol->v_strip_size;
166 /* Start position in strip. */
167 start = offset % strip_size;
168 /* Physical strip number. */
169 nstrip = (offset / strip_size) * vol->v_disks_count + disk;
170 /* Number of physical strip (copy) inside virtual strip. */
171 *copy = nstrip % N;
172 /* Offset in virtual space. */
173 *virt = (nstrip / N) * strip_size + start;
174 }
175
176 static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object * tr,struct g_raid_volume * vol)177 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
178 {
179 struct g_raid_tr_raid1e_object *trs;
180
181 trs = (struct g_raid_tr_raid1e_object *)tr;
182 if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
183 tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
184 return (G_RAID_TR_TASTE_FAIL);
185 trs->trso_starting = 1;
186 return (G_RAID_TR_TASTE_SUCCEED);
187 }
188
189 static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume * vol)190 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
191 {
192 struct g_raid_softc *sc;
193 struct g_raid_subdisk *sd, *bestsd, *worstsd;
194 int i, j, state, sstate;
195
196 sc = vol->v_softc;
197 state = G_RAID_VOLUME_S_OPTIMAL;
198 for (i = 0; i < vol->v_disks_count / N; i++) {
199 bestsd = &vol->v_subdisks[i * N];
200 for (j = 1; j < N; j++) {
201 sd = &vol->v_subdisks[i * N + j];
202 if (sd->sd_state > bestsd->sd_state)
203 bestsd = sd;
204 else if (sd->sd_state == bestsd->sd_state &&
205 (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
206 sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
207 sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
208 bestsd = sd;
209 }
210 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
211 bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
212 /* We found reasonable candidate. */
213 G_RAID_DEBUG1(1, sc,
214 "Promote subdisk %s:%d from %s to ACTIVE.",
215 vol->v_name, bestsd->sd_pos,
216 g_raid_subdisk_state2str(bestsd->sd_state));
217 g_raid_change_subdisk_state(bestsd,
218 G_RAID_SUBDISK_S_ACTIVE);
219 g_raid_write_metadata(sc,
220 vol, bestsd, bestsd->sd_disk);
221 }
222 worstsd = &vol->v_subdisks[i * N];
223 for (j = 1; j < N; j++) {
224 sd = &vol->v_subdisks[i * N + j];
225 if (sd->sd_state < worstsd->sd_state)
226 worstsd = sd;
227 }
228 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
229 sstate = G_RAID_VOLUME_S_OPTIMAL;
230 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
231 sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
232 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
233 sstate = G_RAID_VOLUME_S_DEGRADED;
234 else
235 sstate = G_RAID_VOLUME_S_BROKEN;
236 if (sstate < state)
237 state = sstate;
238 }
239 return (state);
240 }
241
242 static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume * vol)243 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
244 {
245 struct g_raid_softc *sc;
246 struct g_raid_subdisk *sd, *bestsd, *worstsd;
247 int i, j, state, sstate;
248
249 sc = vol->v_softc;
250 if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
251 vol->v_disks_count)
252 return (G_RAID_VOLUME_S_OPTIMAL);
253 for (i = 0; i < vol->v_disks_count; i++) {
254 sd = &vol->v_subdisks[i];
255 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
256 /* We found reasonable candidate. */
257 G_RAID_DEBUG1(1, sc,
258 "Promote subdisk %s:%d from %s to STALE.",
259 vol->v_name, sd->sd_pos,
260 g_raid_subdisk_state2str(sd->sd_state));
261 g_raid_change_subdisk_state(sd,
262 G_RAID_SUBDISK_S_STALE);
263 g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
264 }
265 }
266 state = G_RAID_VOLUME_S_OPTIMAL;
267 for (i = 0; i < vol->v_disks_count; i++) {
268 bestsd = &vol->v_subdisks[i];
269 worstsd = &vol->v_subdisks[i];
270 for (j = 1; j < N; j++) {
271 sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
272 if (sd->sd_state > bestsd->sd_state)
273 bestsd = sd;
274 else if (sd->sd_state == bestsd->sd_state &&
275 (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
276 sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
277 sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
278 bestsd = sd;
279 if (sd->sd_state < worstsd->sd_state)
280 worstsd = sd;
281 }
282 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
283 sstate = G_RAID_VOLUME_S_OPTIMAL;
284 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
285 sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
286 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
287 sstate = G_RAID_VOLUME_S_DEGRADED;
288 else
289 sstate = G_RAID_VOLUME_S_BROKEN;
290 if (sstate < state)
291 state = sstate;
292 }
293 return (state);
294 }
295
296 static int
g_raid_tr_update_state_raid1e(struct g_raid_volume * vol,struct g_raid_subdisk * sd)297 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
298 struct g_raid_subdisk *sd)
299 {
300 struct g_raid_tr_raid1e_object *trs;
301 struct g_raid_softc *sc;
302 u_int s;
303
304 sc = vol->v_softc;
305 trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
306 if (trs->trso_stopping &&
307 (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
308 s = G_RAID_VOLUME_S_STOPPED;
309 else if (trs->trso_starting)
310 s = G_RAID_VOLUME_S_STARTING;
311 else {
312 if ((vol->v_disks_count % N) == 0)
313 s = g_raid_tr_update_state_raid1e_even(vol);
314 else
315 s = g_raid_tr_update_state_raid1e_odd(vol);
316 }
317 if (s != vol->v_state) {
318 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
319 G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
320 G_RAID_EVENT_VOLUME);
321 g_raid_change_volume_state(vol, s);
322 if (!trs->trso_starting && !trs->trso_stopping)
323 g_raid_write_metadata(sc, vol, NULL, NULL);
324 }
325 if (!trs->trso_starting && !trs->trso_stopping)
326 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
327 return (0);
328 }
329
330 static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc * sc,struct g_raid_subdisk * sd,struct g_raid_disk * disk)331 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
332 struct g_raid_disk *disk)
333 {
334 struct g_raid_volume *vol;
335
336 vol = sd->sd_volume;
337 /*
338 * We don't fail the last disk in the pack, since it still has decent
339 * data on it and that's better than failing the disk if it is the root
340 * file system.
341 *
342 * XXX should this be controlled via a tunable? It makes sense for
343 * the volume that has / on it. I can't think of a case where we'd
344 * want the volume to go away on this kind of event.
345 */
346 if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
347 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
348 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
349 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
350 vol->v_disks_count) &&
351 (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
352 return;
353 g_raid_fail_disk(sc, sd, disk);
354 }
355
356 static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object * trs)357 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
358 {
359 struct g_raid_volume *vol;
360 struct g_raid_subdisk *sd;
361
362 vol = trs->trso_base.tro_volume;
363 sd = trs->trso_failed_sd;
364 g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
365 free(trs->trso_buffer, M_TR_RAID1E);
366 trs->trso_buffer = NULL;
367 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
368 trs->trso_type = TR_RAID1E_NONE;
369 trs->trso_recover_slabs = 0;
370 trs->trso_failed_sd = NULL;
371 g_raid_tr_update_state_raid1e(vol, NULL);
372 }
373
374 static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object * tr)375 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
376 {
377 struct g_raid_tr_raid1e_object *trs;
378 struct g_raid_subdisk *sd;
379
380 trs = (struct g_raid_tr_raid1e_object *)tr;
381 sd = trs->trso_failed_sd;
382 G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
383 "Subdisk %s:%d-%s rebuild completed.",
384 sd->sd_volume->v_name, sd->sd_pos,
385 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
386 g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
387 sd->sd_rebuild_pos = 0;
388 g_raid_tr_raid1e_rebuild_done(trs);
389 }
390
391 static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object * tr)392 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
393 {
394 struct g_raid_tr_raid1e_object *trs;
395 struct g_raid_subdisk *sd;
396 struct g_raid_volume *vol;
397
398 vol = tr->tro_volume;
399 trs = (struct g_raid_tr_raid1e_object *)tr;
400 sd = trs->trso_failed_sd;
401 if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
402 G_RAID_DEBUG1(1, vol->v_softc,
403 "Subdisk %s:%d-%s rebuild is aborting.",
404 sd->sd_volume->v_name, sd->sd_pos,
405 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
406 trs->trso_flags |= TR_RAID1E_F_ABORT;
407 } else {
408 G_RAID_DEBUG1(0, vol->v_softc,
409 "Subdisk %s:%d-%s rebuild aborted.",
410 sd->sd_volume->v_name, sd->sd_pos,
411 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
412 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
413 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
414 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
415 g_raid_unlock_range(tr->tro_volume,
416 trs->trso_lock_pos, trs->trso_lock_len);
417 }
418 g_raid_tr_raid1e_rebuild_done(trs);
419 }
420 }
421
422 static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object * tr)423 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
424 {
425 struct g_raid_tr_raid1e_object *trs;
426 struct g_raid_softc *sc;
427 struct g_raid_volume *vol;
428 struct g_raid_subdisk *sd;
429 struct bio *bp;
430 off_t len, virtual, vend, offset, start;
431 int disk, copy, best;
432
433 trs = (struct g_raid_tr_raid1e_object *)tr;
434 if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
435 return;
436 vol = tr->tro_volume;
437 sc = vol->v_softc;
438 sd = trs->trso_failed_sd;
439
440 while (1) {
441 if (sd->sd_rebuild_pos >= sd->sd_size) {
442 g_raid_tr_raid1e_rebuild_finish(tr);
443 return;
444 }
445 /* Get virtual offset from physical rebuild position. */
446 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©);
447 /* Get physical offset back to get first stripe position. */
448 V2P(vol, virtual, &disk, &offset, &start);
449 /* Calculate contignous data length. */
450 len = MIN(g_raid1e_rebuild_slab,
451 sd->sd_size - sd->sd_rebuild_pos);
452 if ((vol->v_disks_count % N) != 0)
453 len = MIN(len, vol->v_strip_size - start);
454 /* Find disk with most accurate data. */
455 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
456 offset + start, len, 0);
457 if (best < 0) {
458 /* There is no any valid disk. */
459 g_raid_tr_raid1e_rebuild_abort(tr);
460 return;
461 } else if (best != copy) {
462 /* Some other disk has better data. */
463 break;
464 }
465 /* We have the most accurate data. Skip the range. */
466 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
467 sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
468 sd->sd_rebuild_pos += len;
469 }
470
471 bp = &trs->trso_bio;
472 memset(bp, 0, sizeof(*bp));
473 bp->bio_offset = offset + start +
474 ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
475 bp->bio_length = len;
476 bp->bio_data = trs->trso_buffer;
477 bp->bio_cmd = BIO_READ;
478 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
479 bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
480 G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
481 /*
482 * If we are crossing stripe boundary, correct affected virtual
483 * range we should lock.
484 */
485 if (start + len > vol->v_strip_size) {
486 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©);
487 len = vend - virtual;
488 }
489 trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
490 trs->trso_flags |= TR_RAID1E_F_LOCKED;
491 trs->trso_lock_pos = virtual;
492 trs->trso_lock_len = len;
493 /* Lock callback starts I/O */
494 g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
495 }
496
497 static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object * tr)498 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
499 {
500 struct g_raid_volume *vol;
501 struct g_raid_tr_raid1e_object *trs;
502 struct g_raid_subdisk *sd;
503
504 vol = tr->tro_volume;
505 trs = (struct g_raid_tr_raid1e_object *)tr;
506 if (trs->trso_failed_sd) {
507 G_RAID_DEBUG1(1, vol->v_softc,
508 "Already rebuild in start rebuild. pos %jd\n",
509 (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
510 return;
511 }
512 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
513 if (sd == NULL)
514 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
515 if (sd == NULL) {
516 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
517 if (sd != NULL) {
518 sd->sd_rebuild_pos = 0;
519 g_raid_change_subdisk_state(sd,
520 G_RAID_SUBDISK_S_RESYNC);
521 g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
522 } else {
523 sd = g_raid_get_subdisk(vol,
524 G_RAID_SUBDISK_S_UNINITIALIZED);
525 if (sd == NULL)
526 sd = g_raid_get_subdisk(vol,
527 G_RAID_SUBDISK_S_NEW);
528 if (sd != NULL) {
529 sd->sd_rebuild_pos = 0;
530 g_raid_change_subdisk_state(sd,
531 G_RAID_SUBDISK_S_REBUILD);
532 g_raid_write_metadata(vol->v_softc,
533 vol, sd, NULL);
534 }
535 }
536 }
537 if (sd == NULL) {
538 G_RAID_DEBUG1(1, vol->v_softc,
539 "No failed disk to rebuild. night night.");
540 return;
541 }
542 trs->trso_failed_sd = sd;
543 G_RAID_DEBUG1(0, vol->v_softc,
544 "Subdisk %s:%d-%s rebuild start at %jd.",
545 sd->sd_volume->v_name, sd->sd_pos,
546 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
547 trs->trso_failed_sd->sd_rebuild_pos);
548 trs->trso_type = TR_RAID1E_REBUILD;
549 trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
550 trs->trso_meta_update = g_raid1e_rebuild_meta_update;
551 g_raid_tr_raid1e_rebuild_some(tr);
552 }
553
554 static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd)555 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
556 struct g_raid_subdisk *sd)
557 {
558 struct g_raid_volume *vol;
559 struct g_raid_tr_raid1e_object *trs;
560 int nr;
561
562 vol = tr->tro_volume;
563 trs = (struct g_raid_tr_raid1e_object *)tr;
564 if (trs->trso_stopping)
565 return;
566 nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
567 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
568 switch(trs->trso_type) {
569 case TR_RAID1E_NONE:
570 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
571 return;
572 if (nr == 0) {
573 nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
574 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
575 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
576 if (nr == 0)
577 return;
578 }
579 g_raid_tr_raid1e_rebuild_start(tr);
580 break;
581 case TR_RAID1E_REBUILD:
582 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
583 trs->trso_failed_sd == sd)
584 g_raid_tr_raid1e_rebuild_abort(tr);
585 break;
586 case TR_RAID1E_RESYNC:
587 break;
588 }
589 }
590
591 static int
g_raid_tr_event_raid1e(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,u_int event)592 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
593 struct g_raid_subdisk *sd, u_int event)
594 {
595
596 g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
597 return (0);
598 }
599
600 static int
g_raid_tr_start_raid1e(struct g_raid_tr_object * tr)601 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
602 {
603 struct g_raid_tr_raid1e_object *trs;
604 struct g_raid_volume *vol;
605
606 trs = (struct g_raid_tr_raid1e_object *)tr;
607 vol = tr->tro_volume;
608 trs->trso_starting = 0;
609 g_raid_tr_update_state_raid1e(vol, NULL);
610 return (0);
611 }
612
613 static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object * tr)614 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
615 {
616 struct g_raid_tr_raid1e_object *trs;
617 struct g_raid_volume *vol;
618
619 trs = (struct g_raid_tr_raid1e_object *)tr;
620 vol = tr->tro_volume;
621 trs->trso_starting = 0;
622 trs->trso_stopping = 1;
623 g_raid_tr_update_state_raid1e(vol, NULL);
624 return (0);
625 }
626
627 /*
628 * Select the disk to read from. Take into account: subdisk state, running
629 * error recovery, average disk load, head position and possible cache hits.
630 */
631 #define ABS(x) (((x) >= 0) ? (x) : (-(x)))
632 static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume * vol,int no,off_t off,off_t len,u_int mask)633 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
634 int no, off_t off, off_t len, u_int mask)
635 {
636 struct g_raid_subdisk *sd;
637 off_t offset;
638 int i, best, prio, bestprio;
639
640 best = -1;
641 bestprio = INT_MAX;
642 for (i = 0; i < N; i++) {
643 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
644 offset = off;
645 if (no + i >= vol->v_disks_count)
646 offset += vol->v_strip_size;
647
648 prio = G_RAID_SUBDISK_LOAD(sd);
649 if ((mask & (1 << sd->sd_pos)) != 0)
650 continue;
651 switch (sd->sd_state) {
652 case G_RAID_SUBDISK_S_ACTIVE:
653 break;
654 case G_RAID_SUBDISK_S_RESYNC:
655 if (offset + off < sd->sd_rebuild_pos)
656 break;
657 /* FALLTHROUGH */
658 case G_RAID_SUBDISK_S_STALE:
659 prio += i << 24;
660 break;
661 case G_RAID_SUBDISK_S_REBUILD:
662 if (offset + off < sd->sd_rebuild_pos)
663 break;
664 /* FALLTHROUGH */
665 default:
666 continue;
667 }
668 prio += min(sd->sd_recovery, 255) << 16;
669 /* If disk head is precisely in position - highly prefer it. */
670 if (G_RAID_SUBDISK_POS(sd) == offset)
671 prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
672 else
673 /* If disk head is close to position - prefer it. */
674 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
675 G_RAID_SUBDISK_TRACK_SIZE)
676 prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
677 if (prio < bestprio) {
678 bestprio = prio;
679 best = i;
680 }
681 }
682 return (best);
683 }
684
685 static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object * tr,struct bio * bp)686 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
687 {
688 struct g_raid_volume *vol;
689 struct g_raid_subdisk *sd;
690 struct bio_queue_head queue;
691 struct bio *cbp;
692 char *addr;
693 off_t offset, start, length, remain;
694 u_int no, strip_size;
695 int best;
696
697 vol = tr->tro_volume;
698 if ((bp->bio_flags & BIO_UNMAPPED) != 0)
699 addr = NULL;
700 else
701 addr = bp->bio_data;
702 strip_size = vol->v_strip_size;
703 V2P(vol, bp->bio_offset, &no, &offset, &start);
704 remain = bp->bio_length;
705 bioq_init(&queue);
706 while (remain > 0) {
707 length = MIN(strip_size - start, remain);
708 best = g_raid_tr_raid1e_select_read_disk(vol,
709 no, offset, length, 0);
710 KASSERT(best >= 0, ("No readable disk in volume %s!",
711 vol->v_name));
712 no += best;
713 if (no >= vol->v_disks_count) {
714 no -= vol->v_disks_count;
715 offset += strip_size;
716 }
717 cbp = g_clone_bio(bp);
718 if (cbp == NULL)
719 goto failure;
720 cbp->bio_offset = offset + start;
721 cbp->bio_length = length;
722 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
723 cbp->bio_ma_offset += (uintptr_t)addr;
724 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
725 cbp->bio_ma_offset %= PAGE_SIZE;
726 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
727 cbp->bio_length) / PAGE_SIZE;
728 } else
729 cbp->bio_data = addr;
730 cbp->bio_caller1 = &vol->v_subdisks[no];
731 bioq_insert_tail(&queue, cbp);
732 no += N - best;
733 if (no >= vol->v_disks_count) {
734 no -= vol->v_disks_count;
735 offset += strip_size;
736 }
737 remain -= length;
738 addr += length;
739 start = 0;
740 }
741 while ((cbp = bioq_takefirst(&queue)) != NULL) {
742 sd = cbp->bio_caller1;
743 cbp->bio_caller1 = NULL;
744 g_raid_subdisk_iostart(sd, cbp);
745 }
746 return;
747 failure:
748 while ((cbp = bioq_takefirst(&queue)) != NULL)
749 g_destroy_bio(cbp);
750 if (bp->bio_error == 0)
751 bp->bio_error = ENOMEM;
752 g_raid_iodone(bp, bp->bio_error);
753 }
754
755 static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object * tr,struct bio * bp)756 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
757 {
758 struct g_raid_volume *vol;
759 struct g_raid_subdisk *sd;
760 struct bio_queue_head queue;
761 struct bio *cbp;
762 char *addr;
763 off_t offset, start, length, remain;
764 u_int no, strip_size;
765 int i;
766
767 vol = tr->tro_volume;
768 if ((bp->bio_flags & BIO_UNMAPPED) != 0)
769 addr = NULL;
770 else
771 addr = bp->bio_data;
772 strip_size = vol->v_strip_size;
773 V2P(vol, bp->bio_offset, &no, &offset, &start);
774 remain = bp->bio_length;
775 bioq_init(&queue);
776 while (remain > 0) {
777 length = MIN(strip_size - start, remain);
778 for (i = 0; i < N; i++) {
779 sd = &vol->v_subdisks[no];
780 switch (sd->sd_state) {
781 case G_RAID_SUBDISK_S_ACTIVE:
782 case G_RAID_SUBDISK_S_STALE:
783 case G_RAID_SUBDISK_S_RESYNC:
784 break;
785 case G_RAID_SUBDISK_S_REBUILD:
786 if (offset + start >= sd->sd_rebuild_pos)
787 goto nextdisk;
788 break;
789 default:
790 goto nextdisk;
791 }
792 cbp = g_clone_bio(bp);
793 if (cbp == NULL)
794 goto failure;
795 cbp->bio_offset = offset + start;
796 cbp->bio_length = length;
797 if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
798 bp->bio_cmd != BIO_DELETE) {
799 cbp->bio_ma_offset += (uintptr_t)addr;
800 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
801 cbp->bio_ma_offset %= PAGE_SIZE;
802 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
803 cbp->bio_length) / PAGE_SIZE;
804 } else
805 cbp->bio_data = addr;
806 cbp->bio_caller1 = sd;
807 bioq_insert_tail(&queue, cbp);
808 nextdisk:
809 if (++no >= vol->v_disks_count) {
810 no = 0;
811 offset += strip_size;
812 }
813 }
814 remain -= length;
815 if (bp->bio_cmd != BIO_DELETE)
816 addr += length;
817 start = 0;
818 }
819 while ((cbp = bioq_takefirst(&queue)) != NULL) {
820 sd = cbp->bio_caller1;
821 cbp->bio_caller1 = NULL;
822 g_raid_subdisk_iostart(sd, cbp);
823 }
824 return;
825 failure:
826 while ((cbp = bioq_takefirst(&queue)) != NULL)
827 g_destroy_bio(cbp);
828 if (bp->bio_error == 0)
829 bp->bio_error = ENOMEM;
830 g_raid_iodone(bp, bp->bio_error);
831 }
832
833 static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object * tr,struct bio * bp)834 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
835 {
836 struct g_raid_volume *vol;
837 struct g_raid_tr_raid1e_object *trs;
838
839 vol = tr->tro_volume;
840 trs = (struct g_raid_tr_raid1e_object *)tr;
841 if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
842 vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
843 vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
844 g_raid_iodone(bp, EIO);
845 return;
846 }
847 /*
848 * If we're rebuilding, squeeze in rebuild activity every so often,
849 * even when the disk is busy. Be sure to only count real I/O
850 * to the disk. All 'SPECIAL' I/O is traffic generated to the disk
851 * by this module.
852 */
853 if (trs->trso_failed_sd != NULL &&
854 !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
855 /* Make this new or running now round short. */
856 trs->trso_recover_slabs = 0;
857 if (--trs->trso_fair_io <= 0) {
858 trs->trso_fair_io = g_raid1e_rebuild_fair_io;
859 g_raid_tr_raid1e_rebuild_some(tr);
860 }
861 }
862 switch (bp->bio_cmd) {
863 case BIO_READ:
864 g_raid_tr_iostart_raid1e_read(tr, bp);
865 break;
866 case BIO_WRITE:
867 case BIO_DELETE:
868 g_raid_tr_iostart_raid1e_write(tr, bp);
869 break;
870 case BIO_SPEEDUP:
871 case BIO_FLUSH:
872 g_raid_tr_flush_common(tr, bp);
873 break;
874 default:
875 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
876 bp->bio_cmd, vol->v_name));
877 break;
878 }
879 }
880
881 static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,struct bio * bp)882 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
883 struct g_raid_subdisk *sd, struct bio *bp)
884 {
885 struct bio *cbp;
886 struct g_raid_subdisk *nsd;
887 struct g_raid_volume *vol;
888 struct bio *pbp;
889 struct g_raid_tr_raid1e_object *trs;
890 off_t virtual, offset, start;
891 uintptr_t mask;
892 int error, do_write, copy, disk, best;
893
894 trs = (struct g_raid_tr_raid1e_object *)tr;
895 vol = tr->tro_volume;
896 if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
897 if (trs->trso_type == TR_RAID1E_REBUILD) {
898 nsd = trs->trso_failed_sd;
899 if (bp->bio_cmd == BIO_READ) {
900 /* Immediately abort rebuild, if requested. */
901 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
902 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
903 g_raid_tr_raid1e_rebuild_abort(tr);
904 return;
905 }
906
907 /* On read error, skip and cross fingers. */
908 if (bp->bio_error != 0) {
909 G_RAID_LOGREQ(0, bp,
910 "Read error during rebuild (%d), "
911 "possible data loss!",
912 bp->bio_error);
913 goto rebuild_round_done;
914 }
915
916 /*
917 * The read operation finished, queue the
918 * write and get out.
919 */
920 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
921 bp->bio_error);
922 bp->bio_cmd = BIO_WRITE;
923 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
924 bp->bio_offset = nsd->sd_rebuild_pos;
925 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
926 g_raid_subdisk_iostart(nsd, bp);
927 } else {
928 /*
929 * The write operation just finished. Do
930 * another. We keep cloning the master bio
931 * since it has the right buffers allocated to
932 * it.
933 */
934 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
935 bp->bio_error);
936 if (bp->bio_error != 0 ||
937 trs->trso_flags & TR_RAID1E_F_ABORT) {
938 if ((trs->trso_flags &
939 TR_RAID1E_F_ABORT) == 0) {
940 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
941 nsd, nsd->sd_disk);
942 }
943 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
944 g_raid_tr_raid1e_rebuild_abort(tr);
945 return;
946 }
947 rebuild_round_done:
948 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
949 g_raid_unlock_range(tr->tro_volume,
950 trs->trso_lock_pos, trs->trso_lock_len);
951 nsd->sd_rebuild_pos += bp->bio_length;
952 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
953 g_raid_tr_raid1e_rebuild_finish(tr);
954 return;
955 }
956
957 /* Abort rebuild if we are stopping */
958 if (trs->trso_stopping) {
959 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
960 g_raid_tr_raid1e_rebuild_abort(tr);
961 return;
962 }
963
964 if (--trs->trso_meta_update <= 0) {
965 g_raid_write_metadata(vol->v_softc,
966 vol, nsd, nsd->sd_disk);
967 trs->trso_meta_update =
968 g_raid1e_rebuild_meta_update;
969 /* Compensate short rebuild I/Os. */
970 if ((vol->v_disks_count % N) != 0 &&
971 vol->v_strip_size <
972 g_raid1e_rebuild_slab) {
973 trs->trso_meta_update *=
974 g_raid1e_rebuild_slab;
975 trs->trso_meta_update /=
976 vol->v_strip_size;
977 }
978 }
979 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
980 if (--trs->trso_recover_slabs <= 0)
981 return;
982 /* Run next rebuild iteration. */
983 g_raid_tr_raid1e_rebuild_some(tr);
984 }
985 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
986 /*
987 * read good sd, read bad sd in parallel. when both
988 * done, compare the buffers. write good to the bad
989 * if different. do the next bit of work.
990 */
991 panic("Somehow, we think we're doing a resync");
992 }
993 return;
994 }
995 pbp = bp->bio_parent;
996 pbp->bio_inbed++;
997 mask = (intptr_t)bp->bio_caller2;
998 if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
999 /*
1000 * Read failed on first drive. Retry the read error on
1001 * another disk drive, if available, before erroring out the
1002 * read.
1003 */
1004 sd->sd_disk->d_read_errs++;
1005 G_RAID_LOGREQ(0, bp,
1006 "Read error (%d), %d read errors total",
1007 bp->bio_error, sd->sd_disk->d_read_errs);
1008
1009 /*
1010 * If there are too many read errors, we move to degraded.
1011 * XXX Do we want to FAIL the drive (eg, make the user redo
1012 * everything to get it back in sync), or just degrade the
1013 * drive, which kicks off a resync?
1014 */
1015 do_write = 0;
1016 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1017 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1018 else if (mask == 0)
1019 do_write = 1;
1020
1021 /* Restore what we were doing. */
1022 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1023 V2P(vol, virtual, &disk, &offset, &start);
1024
1025 /* Find the other disk, and try to do the I/O to it. */
1026 mask |= 1 << copy;
1027 best = g_raid_tr_raid1e_select_read_disk(vol,
1028 disk, offset, start, mask);
1029 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1030 disk += best;
1031 if (disk >= vol->v_disks_count) {
1032 disk -= vol->v_disks_count;
1033 offset += vol->v_strip_size;
1034 }
1035 cbp->bio_offset = offset + start;
1036 cbp->bio_length = bp->bio_length;
1037 cbp->bio_data = bp->bio_data;
1038 cbp->bio_ma = bp->bio_ma;
1039 cbp->bio_ma_offset = bp->bio_ma_offset;
1040 cbp->bio_ma_n = bp->bio_ma_n;
1041 g_destroy_bio(bp);
1042 nsd = &vol->v_subdisks[disk];
1043 G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1044 nsd->sd_pos);
1045 if (do_write)
1046 mask |= 1 << 31;
1047 if ((mask & (1U << 31)) != 0)
1048 sd->sd_recovery++;
1049 cbp->bio_caller2 = (void *)mask;
1050 if (do_write) {
1051 cbp->bio_caller1 = nsd;
1052 /* Lock callback starts I/O */
1053 g_raid_lock_range(sd->sd_volume,
1054 virtual, cbp->bio_length, pbp, cbp);
1055 } else {
1056 g_raid_subdisk_iostart(nsd, cbp);
1057 }
1058 return;
1059 }
1060 /*
1061 * We can't retry. Return the original error by falling
1062 * through. This will happen when there's only one good disk.
1063 * We don't need to fail the raid, since its actual state is
1064 * based on the state of the subdisks.
1065 */
1066 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1067 }
1068 if (bp->bio_cmd == BIO_READ &&
1069 bp->bio_error == 0 &&
1070 (mask & (1U << 31)) != 0) {
1071 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1072
1073 /* Restore what we were doing. */
1074 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1075 V2P(vol, virtual, &disk, &offset, &start);
1076
1077 /* Find best disk to write. */
1078 best = g_raid_tr_raid1e_select_read_disk(vol,
1079 disk, offset, start, ~mask);
1080 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1081 disk += best;
1082 if (disk >= vol->v_disks_count) {
1083 disk -= vol->v_disks_count;
1084 offset += vol->v_strip_size;
1085 }
1086 cbp->bio_offset = offset + start;
1087 cbp->bio_cmd = BIO_WRITE;
1088 cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1089 cbp->bio_caller2 = (void *)mask;
1090 g_destroy_bio(bp);
1091 G_RAID_LOGREQ(2, cbp,
1092 "Attempting bad sector remap on failing drive.");
1093 g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1094 return;
1095 }
1096 }
1097 if ((mask & (1U << 31)) != 0) {
1098 /*
1099 * We're done with a recovery, mark the range as unlocked.
1100 * For any write errors, we aggressively fail the disk since
1101 * there was both a READ and a WRITE error at this location.
1102 * Both types of errors generally indicates the drive is on
1103 * the verge of total failure anyway. Better to stop trusting
1104 * it now. However, we need to reset error to 0 in that case
1105 * because we're not failing the original I/O which succeeded.
1106 */
1107
1108 /* Restore what we were doing. */
1109 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
1110 V2P(vol, virtual, &disk, &offset, &start);
1111
1112 for (copy = 0; copy < N; copy++) {
1113 if ((mask & (1 << copy) ) != 0)
1114 vol->v_subdisks[(disk + copy) %
1115 vol->v_disks_count].sd_recovery--;
1116 }
1117
1118 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1119 G_RAID_LOGREQ(0, bp, "Remap write failed: "
1120 "failing subdisk.");
1121 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1122 bp->bio_error = 0;
1123 }
1124 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1125 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1126 }
1127 if (pbp->bio_cmd != BIO_READ) {
1128 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1129 pbp->bio_error = bp->bio_error;
1130 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1131 G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1132 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1133 }
1134 error = pbp->bio_error;
1135 } else
1136 error = bp->bio_error;
1137 g_destroy_bio(bp);
1138 if (pbp->bio_children == pbp->bio_inbed) {
1139 pbp->bio_completed = pbp->bio_length;
1140 g_raid_iodone(pbp, error);
1141 }
1142 }
1143
1144 static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object * tr,void * virtual,off_t boffset,size_t blength)1145 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
1146 off_t boffset, size_t blength)
1147 {
1148 struct g_raid_volume *vol;
1149 struct g_raid_subdisk *sd;
1150 struct bio_queue_head queue;
1151 char *addr;
1152 off_t offset, start, length, remain;
1153 u_int no, strip_size;
1154 int i, error;
1155
1156 vol = tr->tro_volume;
1157 addr = virtual;
1158 strip_size = vol->v_strip_size;
1159 V2P(vol, boffset, &no, &offset, &start);
1160 remain = blength;
1161 bioq_init(&queue);
1162 while (remain > 0) {
1163 length = MIN(strip_size - start, remain);
1164 for (i = 0; i < N; i++) {
1165 sd = &vol->v_subdisks[no];
1166 switch (sd->sd_state) {
1167 case G_RAID_SUBDISK_S_ACTIVE:
1168 case G_RAID_SUBDISK_S_STALE:
1169 case G_RAID_SUBDISK_S_RESYNC:
1170 break;
1171 case G_RAID_SUBDISK_S_REBUILD:
1172 if (offset + start >= sd->sd_rebuild_pos)
1173 goto nextdisk;
1174 break;
1175 default:
1176 goto nextdisk;
1177 }
1178 error = g_raid_subdisk_kerneldump(sd, addr,
1179 offset + start, length);
1180 if (error != 0)
1181 return (error);
1182 nextdisk:
1183 if (++no >= vol->v_disks_count) {
1184 no = 0;
1185 offset += strip_size;
1186 }
1187 }
1188 remain -= length;
1189 addr += length;
1190 start = 0;
1191 }
1192 return (0);
1193 }
1194
1195 static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object * tr,void * argp)1196 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1197 {
1198 struct bio *bp;
1199 struct g_raid_subdisk *sd;
1200
1201 bp = (struct bio *)argp;
1202 sd = (struct g_raid_subdisk *)bp->bio_caller1;
1203 g_raid_subdisk_iostart(sd, bp);
1204
1205 return (0);
1206 }
1207
1208 static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object * tr)1209 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1210 {
1211 struct g_raid_tr_raid1e_object *trs;
1212 struct g_raid_volume *vol;
1213
1214 vol = tr->tro_volume;
1215 trs = (struct g_raid_tr_raid1e_object *)tr;
1216 trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1217 trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1218 /* Compensate short rebuild I/Os. */
1219 if ((vol->v_disks_count % N) != 0 &&
1220 vol->v_strip_size < g_raid1e_rebuild_slab) {
1221 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1222 trs->trso_recover_slabs /= vol->v_strip_size;
1223 }
1224 if (trs->trso_type == TR_RAID1E_REBUILD)
1225 g_raid_tr_raid1e_rebuild_some(tr);
1226 return (0);
1227 }
1228
1229 static int
g_raid_tr_free_raid1e(struct g_raid_tr_object * tr)1230 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1231 {
1232 struct g_raid_tr_raid1e_object *trs;
1233
1234 trs = (struct g_raid_tr_raid1e_object *)tr;
1235
1236 if (trs->trso_buffer != NULL) {
1237 free(trs->trso_buffer, M_TR_RAID1E);
1238 trs->trso_buffer = NULL;
1239 }
1240 return (0);
1241 }
1242
/*
 * Declare this transformation under the name "RAID1E".
 * NOTE(review): G_RAID_TR_DECLARE() is defined outside this chunk;
 * presumably it registers the raid1e method table with the RAID class --
 * confirm against g_raid.h.
 */
G_RAID_TR_DECLARE(raid1e, "RAID1E");
1244