1 /*        $NetBSD: rf_netbsdkintf.c,v 1.418 2025/01/08 08:25:36 andvar Exp $    */
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.418 2025/01/08 08:25:36 andvar Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131 
132 #include <prop/proplib.h>
133 
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137 
138 #include "rf_raid.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150 
151 #include "ioconf.h"
152 
/*
 * db1_printf((fmt, ...)) -- level-1 debug printf.
 *
 * The argument is a complete, parenthesized printf argument list.  With
 * DEBUG defined the message is printed when rf_kdebug_level > 0; without
 * DEBUG the macro expands to nothing and its argument is not evaluated.
 *
 * Both variants are wrapped in do { } while (0) so that the macro behaves
 * as a single statement: it can be used as the body of an un-braced
 * if/else without capturing a following "else" or producing a stray
 * empty statement.
 */
#ifdef DEBUG
int	rf_kdebug_level = 0;
#define db1_printf(a) do {						\
	if (rf_kdebug_level > 0)					\
		printf a;						\
} while (/*CONSTCOND*/0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (/*CONSTCOND*/0)
#endif				/* DEBUG */
159 
/*
 * DEVICE_XNAME(dev) -- printable name for a possibly-NULL device.
 * Yields device_xname(dev), or the literal "null" when dev is NULL.
 * The argument and the whole expansion are parenthesized so the macro can
 * be used safely inside larger expressions.  Note: dev is evaluated twice.
 */
#define DEVICE_XNAME(dev) ((dev) ? device_xname(dev) : "null")
161 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * Spare-table installation state; only built when parity declustering
 * with distributed sparing is configured in.
 * NOTE(review): the mutex and condvars presumably guard the two queues
 * below -- confirm against their users.
 */
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

/* requests to install a spare table */
static RF_SparetWait_t *rf_sparet_wait_queue;

/* responses from installation process */
static RF_SparetWait_t *rf_sparet_resp_queue;
#endif
172 
173 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
174 
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176 
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181     void *, int);
182 static void raidinit(struct raid_softc *);
183 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
184 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
185 
186 static int raid_match(device_t, cfdata_t, void *);
187 static void raid_attach(device_t, device_t, void *);
188 static int raid_detach(device_t, int);
189 
190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
191     daddr_t, daddr_t);
192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
193     daddr_t, daddr_t);
194 
195 static int raidwrite_component_label(unsigned,
196     dev_t, struct vnode *, RF_ComponentLabel_t *);
197 static int raidread_component_label(unsigned,
198     dev_t, struct vnode *, RF_ComponentLabel_t *);
199 
200 static int raid_diskstart(device_t, struct buf *bp);
201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
202 static int raid_lastclose(device_t);
203 
204 static dev_type_open(raidopen);
205 static dev_type_close(raidclose);
206 static dev_type_read(raidread);
207 static dev_type_write(raidwrite);
208 static dev_type_ioctl(raidioctl);
209 static dev_type_strategy(raidstrategy);
210 static dev_type_dump(raiddump);
211 static dev_type_size(raidsize);
212 
213 const struct bdevsw raid_bdevsw = {
214           .d_open = raidopen,
215           .d_close = raidclose,
216           .d_strategy = raidstrategy,
217           .d_ioctl = raidioctl,
218           .d_dump = raiddump,
219           .d_psize = raidsize,
220           .d_discard = nodiscard,
221           .d_flag = D_DISK
222 };
223 
224 const struct cdevsw raid_cdevsw = {
225           .d_open = raidopen,
226           .d_close = raidclose,
227           .d_read = raidread,
228           .d_write = raidwrite,
229           .d_ioctl = raidioctl,
230           .d_stop = nostop,
231           .d_tty = notty,
232           .d_poll = nopoll,
233           .d_mmap = nommap,
234           .d_kqfilter = nokqfilter,
235           .d_discard = nodiscard,
236           .d_flag = D_DISK
237 };
238 
239 static struct dkdriver rf_dkdriver = {
240           .d_open = raidopen,
241           .d_close = raidclose,
242           .d_strategy = raidstrategy,
243           .d_diskstart = raid_diskstart,
244           .d_dumpblocks = raid_dumpblocks,
245           .d_lastclose = raid_lastclose,
246           .d_minphys = minphys
247 };
248 
/* raidunit(x): unit number encoded in dev_t x (via DISKUNIT). */
#define	raidunit(x)	DISKUNIT(x)

/* raidsoftc(dev): the sc_r.softc member of dev's private raid_softc. */
#define	raidsoftc(dev)							\
	(((struct raid_softc *)device_private(dev))->sc_r.softc)
251 
252 extern struct cfdriver raid_cd;
253 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
254     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
255     DVF_DETACH_SHUTDOWN);
256 
257 /* Internal representation of a rf_recon_req */
258 struct rf_recon_req_internal {
259           RF_RowCol_t col;
260           RF_ReconReqFlags_t flags;
261           void   *raidPtr;
262 };
263 
264 /*
265  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
266  * Be aware that large numbers can allow the driver to consume a lot of
267  * kernel memory, especially on writes, and in degraded mode reads.
268  *
269  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
270  * a single 64K write will typically require 64K for the old data,
271  * 64K for the old parity, and 64K for the new parity, for a total
272  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
274  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
275  *
276  * Now in degraded mode, for example, a 64K read on the above setup may
277  * require data reconstruction, which will require *all* of the 4 remaining
278  * disks to participate -- 4 * 32K/disk == 128K again.
279  */
280 
/* Default is 6 unless the build already supplies RAIDOUTSTANDING. */
#if !defined(RAIDOUTSTANDING)
#define	RAIDOUTSTANDING	6
#endif
284 
/*
 * RAIDLABELDEV(dev): dev_t with the same major number and raid unit as
 * dev, but referring to the raw partition (RAW_PART).
 */
#define	RAIDLABELDEV(dev)						\
	(MAKEDISKDEV(							\
	    major((dev)),						\
	    raidunit((dev)),						\
	    RAW_PART))
287 
288 /* declared here, and made public, for the benefit of KVM stuff.. */
289 
290 static int raidlock(struct raid_softc *);
291 static void raidunlock(struct raid_softc *);
292 
293 static int raid_detach_unlocked(struct raid_softc *);
294 
295 static void rf_markalldirty(RF_Raid_t *);
296 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
297 
298 static void rf_ReconThread(struct rf_recon_req_internal *);
299 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
301 static int rf_autoconfig(device_t);
302 static int rf_rescan(void);
303 static void rf_buildroothack(RF_ConfigSet_t *);
304 
305 static RF_AutoConfig_t *rf_find_raid_components(void);
306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
309 static int rf_set_autoconfig(RF_Raid_t *, int);
310 static int rf_set_rootpartition(RF_Raid_t *, int);
311 static void rf_release_all_vps(RF_ConfigSet_t *);
312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
313 static int rf_have_enough_components(RF_ConfigSet_t *);
314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
316 
317 /*
318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
320  * in the kernel config file.
321  */
/* 1 when the kernel was built with RAID_AUTOCONFIG, otherwise 0. */
int raidautoconfig =
#ifdef RAID_AUTOCONFIG
    1;
#else
    0;
#endif

/* Set once rf_autoconfig() has run; it refuses to run a second time. */
static bool raidautoconfigdone = false;
328 
329 struct pool rf_alloclist_pool;   /* AllocList */
330 
331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
332 static kmutex_t raid_lock;
333 
334 static struct raid_softc *
raidcreate(int unit)335 raidcreate(int unit) {
336           struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
337           sc->sc_unit = unit;
338           cv_init(&sc->sc_cv, "raidunit");
339           mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
340           return sc;
341 }
342 
343 static void
raiddestroy(struct raid_softc * sc)344 raiddestroy(struct raid_softc *sc) {
345           cv_destroy(&sc->sc_cv);
346           mutex_destroy(&sc->sc_mutex);
347           kmem_free(sc, sizeof(*sc));
348 }
349 
350 static struct raid_softc *
raidget(int unit,bool create)351 raidget(int unit, bool create) {
352           struct raid_softc *sc;
353           if (unit < 0) {
354 #ifdef DIAGNOSTIC
355                     panic("%s: unit %d!", __func__, unit);
356 #endif
357                     return NULL;
358           }
359           mutex_enter(&raid_lock);
360           LIST_FOREACH(sc, &raids, sc_link) {
361                     if (sc->sc_unit == unit) {
362                               mutex_exit(&raid_lock);
363                               return sc;
364                     }
365           }
366           mutex_exit(&raid_lock);
367           if (!create)
368                     return NULL;
369           sc = raidcreate(unit);
370           mutex_enter(&raid_lock);
371           LIST_INSERT_HEAD(&raids, sc, sc_link);
372           mutex_exit(&raid_lock);
373           return sc;
374 }
375 
376 static void
raidput(struct raid_softc * sc)377 raidput(struct raid_softc *sc) {
378           mutex_enter(&raid_lock);
379           LIST_REMOVE(sc, sc_link);
380           mutex_exit(&raid_lock);
381           raiddestroy(sc);
382 }
383 
/*
 * raidattach --
 *	Pseudo-device attach entry point.  Intentionally empty: device
 *	attachment and the associated initialization now happen as part of
 *	module initialization.  `num' is ignored.
 */
void
raidattach(int num)
{
	(void)num;
}
393 
394 static int
rf_autoconfig(device_t self)395 rf_autoconfig(device_t self)
396 {
397           RF_AutoConfig_t *ac_list;
398           RF_ConfigSet_t *config_sets;
399 
400           if (!raidautoconfig || raidautoconfigdone == true)
401                     return 0;
402 
403           /* XXX This code can only be run once. */
404           raidautoconfigdone = true;
405 
406 #ifdef __HAVE_CPU_BOOTCONF
407           /*
408            * 0. find the boot device if needed first so we can use it later
409            * this needs to be done before we autoconfigure any raid sets,
410            * because if we use wedges we are not going to be able to open
411            * the boot device later
412            */
413           if (booted_device == NULL)
414                     cpu_bootconf();
415 #endif
416           /* 1. locate all RAID components on the system */
417           aprint_debug("Searching for RAID components...\n");
418           ac_list = rf_find_raid_components();
419 
420           /* 2. Sort them into their respective sets. */
421           config_sets = rf_create_auto_sets(ac_list);
422 
423           /*
424            * 3. Evaluate each set and configure the valid ones.
425            * This gets done in rf_buildroothack().
426            */
427           rf_buildroothack(config_sets);
428 
429           return 1;
430 }
431 
432 int
rf_inited(const struct raid_softc * rs)433 rf_inited(const struct raid_softc *rs) {
434           return (rs->sc_flags & RAIDF_INITED) != 0;
435 }
436 
437 RF_Raid_t *
rf_get_raid(struct raid_softc * rs)438 rf_get_raid(struct raid_softc *rs) {
439           return &rs->sc_r;
440 }
441 
442 int
rf_get_unit(const struct raid_softc * rs)443 rf_get_unit(const struct raid_softc *rs) {
444           return rs->sc_unit;
445 }
446 
447 static int
rf_containsboot(RF_Raid_t * r,device_t bdv)448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
449           const char *bootname;
450           size_t len;
451 
452           /* if bdv is NULL, the set can't contain it. exit early. */
453           if (bdv == NULL)
454                     return 0;
455 
456           bootname = device_xname(bdv);
457           len = strlen(bootname);
458 
459           for (int col = 0; col < r->numCol; col++) {
460                     const char *devname = r->Disks[col].devname;
461                     devname += sizeof("/dev/") - 1;
462                     if (strncmp(devname, "dk", 2) == 0) {
463                               const char *parent =
464                                   dkwedge_get_parent_name(r->Disks[col].dev);
465                               if (parent != NULL)
466                                         devname = parent;
467                     }
468                     if (strncmp(devname, bootname, len) == 0) {
469                               struct raid_softc *sc = r->softc;
470                               aprint_debug("raid%d includes boot device %s\n",
471                                   sc->sc_unit, devname);
472                               return 1;
473                     }
474           }
475           return 0;
476 }
477 
478 static int
rf_rescan(void)479 rf_rescan(void)
480 {
481           RF_AutoConfig_t *ac_list;
482           RF_ConfigSet_t *config_sets, *cset, *next_cset;
483           struct raid_softc *sc;
484           int raid_added;
485 
486           ac_list = rf_find_raid_components();
487           config_sets = rf_create_auto_sets(ac_list);
488 
489           raid_added = 1;
490           while (raid_added > 0) {
491                     raid_added = 0;
492                     cset = config_sets;
493                     while (cset != NULL) {
494                               next_cset = cset->next;
495                               if (rf_have_enough_components(cset) &&
496                                   cset->ac->clabel->autoconfigure == 1) {
497                                         sc = rf_auto_config_set(cset);
498                                         if (sc != NULL) {
499                                                   aprint_debug("raid%d: configured ok, rootable %d\n",
500                                                                  sc->sc_unit, cset->rootable);
501                                                   /* We added one RAID set */
502                                                   raid_added++;
503                                         } else {
504                                                   /* The autoconfig didn't work :( */
505                                                   aprint_debug("Autoconfig failed\n");
506                                                   rf_release_all_vps(cset);
507                                         }
508                               } else {
509                                         /* we're not autoconfiguring this set...
510                                            release the associated resources */
511                                         rf_release_all_vps(cset);
512                               }
513                               /* cleanup */
514                               rf_cleanup_config_set(cset);
515                               cset = next_cset;
516                     }
517                     if (raid_added > 0) {
518                               /* We added at least one RAID set, so re-scan for recursive RAID */
519                               ac_list = rf_find_raid_components();
520                               config_sets = rf_create_auto_sets(ac_list);
521                     }
522           }
523 
524           return 0;
525 }
526 
527 /*
528  * Example setup:
529  * dk1 at wd0: "raid@wd0", 171965 blocks at 32802, type: raidframe
 * dk3 at wd1: "raid@wd1", 171965 blocks at 32802, type: raidframe
531  * raid1: Components: /dev/dk1 /dev/dk3
532  * dk4 at raid1: "empty@raid1", 8192 blocks at 34, type: msdos
533  * dk5 at raid1: "root@raid1", 163517 blocks at 8226, type: ffs
534  *
535  * If booted from wd0, booted_device will be
536  * disk wd0, startblk = 41092, nblks = 163517
537  *
538  * That is, dk5 with startblk computed from the beginning of wd0
539  * instead of beginning of raid1:
540  * 32802 + 64 (RF_PROTECTED_SECTORS) + 8226 = 41092
541  *
542  * In order to find the boot wedge, we must iterate on each component,
543  * find its offset from disk beginning, and look for the boot wedge with
 * startblk adjusted.
545  */
546 static device_t
rf_find_bootwedge(struct raid_softc * rsc)547 rf_find_bootwedge(struct raid_softc *rsc)
548 {
549           RF_Raid_t *r = &rsc->sc_r;
550           const char *bootname;
551           size_t len;
552           device_t rdev = NULL;
553 
554           if (booted_device == NULL)
555                     goto out;
556 
557           bootname = device_xname(booted_device);
558           len = strlen(bootname);
559 
560           aprint_debug("%s: booted_device %s, startblk = %"PRId64", "
561                          "nblks = %"PRId64"\n", __func__,
562                          bootname, booted_startblk, booted_nblks);
563 
564           for (int col = 0; col < r->numCol; col++) {
565                     const char *devname = r->Disks[col].devname;
566                     const char *parent;
567                     struct disk *dk;
568                     u_int nwedges;
569                     struct dkwedge_info *dkwi;
570                     struct dkwedge_list dkwl;
571                     size_t dkwi_len;
572                     int i;
573 
574                     devname += sizeof("/dev/") - 1;
575                     if (strncmp(devname, "dk", 2) != 0)
576                               continue;
577 
578                     parent = dkwedge_get_parent_name(r->Disks[col].dev);
579                     if (parent == NULL) {
580                               aprint_debug("%s: cannot find parent for "
581                                              "component /dev/%s", __func__, devname);
582                               continue;
583                     }
584 
585                     if (strncmp(parent, bootname, len) != 0)
586                               continue;
587 
588                     aprint_debug("%s: looking up wedge %s in device %s\n",
589                                    __func__, devname, parent);
590 
591                     dk = disk_find(parent);
592                     nwedges = dk->dk_nwedges;
593                     dkwi_len = sizeof(*dkwi) * nwedges;
594                     dkwi = RF_Malloc(dkwi_len);
595 
596                     dkwl.dkwl_buf = dkwi;
597                     dkwl.dkwl_bufsize = dkwi_len;
598                     dkwl.dkwl_nwedges = 0;
599                     dkwl.dkwl_ncopied = 0;
600 
601                     if (dkwedge_list(dk, &dkwl, curlwp) == 0) {
602                               daddr_t startblk;
603 
604                               for (i = 0; i < dkwl.dkwl_ncopied; i++) {
605                                         if (strcmp(dkwi[i].dkw_devname, devname) == 0)
606                                                   break;
607                               }
608 
609                               KASSERT(i < dkwl.dkwl_ncopied);
610 
611                               aprint_debug("%s: wedge %s, "
612                                              "startblk = %"PRId64", "
613                                              "nblks = %"PRId64"\n",
614                                              __func__,
615                                              dkwi[i].dkw_devname,
616                                              dkwi[i].dkw_offset,
617                                              dkwi[i].dkw_size);
618 
619                               startblk = booted_startblk
620                                          - dkwi[i].dkw_offset
621                                          - RF_PROTECTED_SECTORS;
622 
623                               aprint_debug("%s: looking for wedge in %s, "
624                                              "startblk = %"PRId64", "
625                                              "nblks = %"PRId64"\n",
626                                              __func__,
627                                              DEVICE_XNAME(rsc->sc_dksc.sc_dev),
628                                              startblk, booted_nblks);
629 
630                               rdev = dkwedge_find_partition(rsc->sc_dksc.sc_dev,
631                                                                   startblk,
632                                                                   booted_nblks);
633                               if (rdev) {
634                                         aprint_debug("%s: root candidate wedge %s "
635                                                        "shifted from %s\n", __func__,
636                                                        device_xname(rdev),
637                                                        dkwi[i].dkw_devname);
638                                         goto done;
639                               } else {
640                                         aprint_debug("%s: not found\n", __func__);
641                               }
642                     }
643 
644                     aprint_debug("%s: nothing found for col %d\n", __func__, col);
645 done:
646                     RF_Free(dkwi, dkwi_len);
647           }
648 
649 out:
650           if (!rdev)
651                     aprint_debug("%s: nothing found\n", __func__);
652 
653           return rdev;
654 }
655 
656 static void
rf_buildroothack(RF_ConfigSet_t * config_sets)657 rf_buildroothack(RF_ConfigSet_t *config_sets)
658 {
659           RF_AutoConfig_t *ac_list;
660           RF_ConfigSet_t *cset;
661           RF_ConfigSet_t *next_cset;
662           int num_root;
663           int raid_added;
664           struct raid_softc *sc, *rsc;
665           struct dk_softc *dksc = NULL; /* XXX gcc -Os: may be used uninit. */
666 
667           sc = rsc = NULL;
668           num_root = 0;
669 
670           raid_added = 1;
671           while (raid_added > 0) {
672                     raid_added = 0;
673                     cset = config_sets;
674                     while (cset != NULL) {
675                               next_cset = cset->next;
676                               if (rf_have_enough_components(cset) &&
677                                   cset->ac->clabel->autoconfigure == 1) {
678                                         sc = rf_auto_config_set(cset);
679                                         if (sc != NULL) {
680                                                   aprint_debug("raid%d: configured ok, rootable %d\n",
681                                                                  sc->sc_unit, cset->rootable);
682                                                   /* We added one RAID set */
683                                                   raid_added++;
684                                                   if (cset->rootable) {
685                                                             rsc = sc;
686                                                             num_root++;
687                                                   }
688                                         } else {
689                                                   /* The autoconfig didn't work :( */
690                                                   aprint_debug("Autoconfig failed\n");
691                                                   rf_release_all_vps(cset);
692                                         }
693                               } else {
694                                         /* we're not autoconfiguring this set...
695                                            release the associated resources */
696                                         rf_release_all_vps(cset);
697                               }
698                               /* cleanup */
699                               rf_cleanup_config_set(cset);
700                               cset = next_cset;
701                     }
702                     if (raid_added > 0) {
703                               /* We added at least one RAID set, so re-scan for recursive RAID */
704                               ac_list = rf_find_raid_components();
705                               config_sets = rf_create_auto_sets(ac_list);
706                     }
707           }
708 
709           /* if the user has specified what the root device should be
710              then we don't touch booted_device or boothowto... */
711 
712           if (rootspec != NULL) {
713                     aprint_debug("%s: rootspec %s\n", __func__, rootspec);
714                     return;
715           }
716 
717           /* we found something bootable... */
718           if (num_root == 1) {
719                     device_t candidate_root = NULL;
720                     dksc = &rsc->sc_dksc;
721 
722                     if (dksc->sc_dkdev.dk_nwedges != 0) {
723 
724                               /* Find the wedge we booted from */
725                               candidate_root = rf_find_bootwedge(rsc);
726 
727                               /* Try first partition */
728                               if (candidate_root == NULL) {
729                                         size_t i = 0;
730                                         candidate_root = dkwedge_find_by_parent(
731                                             device_xname(dksc->sc_dev), &i);
732                               }
733                               aprint_debug("%s: candidate wedge root %s\n",
734                                   __func__, DEVICE_XNAME(candidate_root));
735                     } else {
736                               candidate_root = dksc->sc_dev;
737                     }
738 
739                     aprint_debug("%s: candidate root = %s, booted_device = %s, "
740                                    "root_partition = %d, contains_boot=%d\n",
741                         __func__, DEVICE_XNAME(candidate_root),
742                         DEVICE_XNAME(booted_device), rsc->sc_r.root_partition,
743                         rf_containsboot(&rsc->sc_r, booted_device));
744 
745                     /* XXX the check for booted_device == NULL can probably be
746                      * dropped, now that rf_containsboot handles that case.
747                      */
748                     if (booted_device == NULL ||
749                         rsc->sc_r.root_partition == 1 ||
750                         rf_containsboot(&rsc->sc_r, booted_device)) {
751                               booted_device = candidate_root;
752                               booted_method = "raidframe/single";
753                               booted_partition = 0;         /* XXX assume 'a' */
754                               aprint_debug("%s: set booted_device = %s\n", __func__,
755                                   DEVICE_XNAME(booted_device));
756                     }
757           } else if (num_root > 1) {
758                     aprint_debug("%s: many roots=%d, %s\n", __func__, num_root,
759                         DEVICE_XNAME(booted_device));
760 
761                     /*
762                      * Maybe the MD code can help. If it cannot, then
763                      * setroot() will discover that we have no
764                      * booted_device and will ask the user if nothing was
765                      * hardwired in the kernel config file
766                      */
767                     if (booted_device == NULL)
768                               return;
769 
770                     num_root = 0;
771                     mutex_enter(&raid_lock);
772                     LIST_FOREACH(sc, &raids, sc_link) {
773                               RF_Raid_t *r = &sc->sc_r;
774                               if (r->valid == 0)
775                                         continue;
776 
777                               if (r->root_partition == 0)
778                                         continue;
779 
780                               if (rf_containsboot(r, booted_device)) {
781                                         num_root++;
782                                         rsc = sc;
783                                         dksc = &rsc->sc_dksc;
784                               }
785                     }
786                     mutex_exit(&raid_lock);
787 
788                     if (num_root == 1) {
789                               booted_device = dksc->sc_dev;
790                               booted_method = "raidframe/multi";
791                               booted_partition = 0;         /* XXX assume 'a' */
792                     } else {
793                               /* we can't guess.. require the user to answer... */
794                               boothowto |= RB_ASKNAME;
795                     }
796           }
797 }
798 
799 static int
raidsize(dev_t dev)800 raidsize(dev_t dev)
801 {
802           struct raid_softc *rs;
803           struct dk_softc *dksc;
804           unsigned int unit;
805 
806           unit = raidunit(dev);
807           if ((rs = raidget(unit, false)) == NULL)
808                     return -1;
809           dksc = &rs->sc_dksc;
810 
811           if ((rs->sc_flags & RAIDF_INITED) == 0)
812                     return -1;
813 
814           return dk_size(dksc, dev);
815 }
816 
817 static int
raiddump(dev_t dev,daddr_t blkno,void * va,size_t size)818 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
819 {
820           unsigned int unit;
821           struct raid_softc *rs;
822           struct dk_softc *dksc;
823 
824           unit = raidunit(dev);
825           if ((rs = raidget(unit, false)) == NULL)
826                     return ENXIO;
827           dksc = &rs->sc_dksc;
828 
829           if ((rs->sc_flags & RAIDF_INITED) == 0)
830                     return ENODEV;
831 
832         /*
833            Note that blkno is relative to this particular partition.
834            By adding adding RF_PROTECTED_SECTORS, we get a value that
835              is relative to the partition used for the underlying component.
836         */
837           blkno += RF_PROTECTED_SECTORS;
838 
839           return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
840 }
841 
842 static int
raid_dumpblocks(device_t dev,void * va,daddr_t blkno,int nblk)843 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
844 {
845           struct raid_softc *rs = raidsoftc(dev);
846           const struct bdevsw *bdev;
847           RF_Raid_t *raidPtr;
848           int     c, sparecol, j, scol, dumpto;
849           int     error = 0;
850 
851           raidPtr = &rs->sc_r;
852 
853           /* we only support dumping to RAID 1 sets */
854           if (raidPtr->Layout.numDataCol != 1 ||
855               raidPtr->Layout.numParityCol != 1)
856                     return EINVAL;
857 
858           if ((error = raidlock(rs)) != 0)
859                     return error;
860 
861           /* figure out what device is alive.. */
862 
863           /*
864              Look for a component to dump to.  The preference for the
865              component to dump to is as follows:
866              1) the first component
867              2) a used_spare of the first component
868              3) the second component
869              4) a used_spare of the second component
870           */
871 
872           dumpto = -1;
873           for (c = 0; c < raidPtr->numCol; c++) {
874                     if (raidPtr->Disks[c].status == rf_ds_optimal) {
875                               /* this might be the one */
876                               dumpto = c;
877                               break;
878                     }
879           }
880 
881           /*
882              At this point we have possibly selected a live component.
883              If we didn't find a live component, we now check to see
884              if there is a relevant spared component.
885           */
886 
887           for (c = 0; c < raidPtr->numSpare; c++) {
888                     sparecol = raidPtr->numCol + c;
889 
890                     if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
891                               /* How about this one? */
892                               scol = -1;
893                               for(j=0;j<raidPtr->numCol;j++) {
894                                         if (raidPtr->Disks[j].spareCol == sparecol) {
895                                                   scol = j;
896                                                   break;
897                                         }
898                               }
899                               if (scol == 0) {
900                                         /*
901                                            We must have found a spared first
902                                            component!  We'll take that over
903                                            anything else found so far.  (We
904                                            couldn't have found a real first
905                                            component before, since this is a
906                                            used spare, and it's saying that
907                                            it's replacing the first
908                                            component.)  On reboot (with
909                                            autoconfiguration turned on)
910                                            sparecol will become the first
911                                            component (component0) of this set.
912                                         */
913                                         dumpto = sparecol;
914                                         break;
915                               } else if (scol != -1) {
916                                         /*
917                                            Must be a spared second component.
918                                            We'll dump to that if we havn't found
919                                            anything else so far.
920                                         */
921                                         if (dumpto == -1)
922                                                   dumpto = sparecol;
923                               }
924                     }
925           }
926 
927           if (dumpto == -1) {
928                     /* we couldn't find any live components to dump to!?!?
929                      */
930                     error = EINVAL;
931                     goto out;
932           }
933 
934           bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
935           if (bdev == NULL) {
936                     error = ENXIO;
937                     goto out;
938           }
939 
940           error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
941                                         blkno, va, nblk * raidPtr->bytesPerSector);
942 
943 out:
944           raidunlock(rs);
945 
946           return error;
947 }
948 
949 /* ARGSUSED */
950 static int
raidopen(dev_t dev,int flags,int fmt,struct lwp * l)951 raidopen(dev_t dev, int flags, int fmt,
952     struct lwp *l)
953 {
954           int     unit = raidunit(dev);
955           struct raid_softc *rs;
956           struct dk_softc *dksc;
957           int     error = 0;
958           int     part, pmask;
959 
960           if ((rs = raidget(unit, true)) == NULL)
961                     return ENXIO;
962           if ((error = raidlock(rs)) != 0)
963                     return error;
964 
965           if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
966                     error = EBUSY;
967                     goto bad;
968           }
969 
970           dksc = &rs->sc_dksc;
971 
972           part = DISKPART(dev);
973           pmask = (1 << part);
974 
975           if (!DK_BUSY(dksc, pmask) &&
976               ((rs->sc_flags & RAIDF_INITED) != 0)) {
977                     /* First one... mark things as dirty... Note that we *MUST*
978                      have done a configure before this.  I DO NOT WANT TO BE
979                      SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
980                      THAT THEY BELONG TOGETHER!!!!! */
981                     /* XXX should check to see if we're only open for reading
982                        here... If so, we needn't do this, but then need some
983                        other way of keeping track of what's happened.. */
984 
985                     rf_markalldirty(&rs->sc_r);
986           }
987 
988           if ((rs->sc_flags & RAIDF_INITED) != 0)
989                     error = dk_open(dksc, dev, flags, fmt, l);
990 
991 bad:
992           raidunlock(rs);
993 
994           return error;
995 
996 
997 }
998 
999 static int
raid_lastclose(device_t self)1000 raid_lastclose(device_t self)
1001 {
1002           struct raid_softc *rs = raidsoftc(self);
1003 
1004           /* Last one... device is not unconfigured yet.
1005              Device shutdown has taken care of setting the
1006              clean bits if RAIDF_INITED is not set
1007              mark things as clean... */
1008 
1009           rf_update_component_labels(&rs->sc_r,
1010               RF_FINAL_COMPONENT_UPDATE);
1011 
1012           /* pass to unlocked code */
1013           if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
1014                     rs->sc_flags |= RAIDF_DETACH;
1015 
1016           return 0;
1017 }
1018 
1019 /* ARGSUSED */
1020 static int
raidclose(dev_t dev,int flags,int fmt,struct lwp * l)1021 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
1022 {
1023           int     unit = raidunit(dev);
1024           struct raid_softc *rs;
1025           struct dk_softc *dksc;
1026           cfdata_t cf;
1027           int     error = 0, do_detach = 0, do_put = 0;
1028 
1029           if ((rs = raidget(unit, false)) == NULL)
1030                     return ENXIO;
1031           dksc = &rs->sc_dksc;
1032 
1033           if ((error = raidlock(rs)) != 0)
1034                     return error;
1035 
1036           if ((rs->sc_flags & RAIDF_INITED) != 0) {
1037                     error = dk_close(dksc, dev, flags, fmt, l);
1038                     if ((rs->sc_flags & RAIDF_DETACH) != 0)
1039                               do_detach = 1;
1040           } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
1041                     do_put = 1;
1042 
1043           raidunlock(rs);
1044 
1045           if (do_detach) {
1046                     /* free the pseudo device attach bits */
1047                     cf = device_cfdata(dksc->sc_dev);
1048                     error = config_detach(dksc->sc_dev, 0);
1049                     if (error == 0)
1050                               free(cf, M_RAIDFRAME);
1051           } else if (do_put) {
1052                     raidput(rs);
1053           }
1054 
1055           return error;
1056 
1057 }
1058 
/*
 * raid_wakeup: signal the set's iodone condition variable.
 *
 * The signal is issued with iodone_lock held.  Callers in this file use
 * it to schedule I/O processing (see raidstrategy() and raiddone()).
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
1066 
1067 static void
raidstrategy(struct buf * bp)1068 raidstrategy(struct buf *bp)
1069 {
1070           unsigned int unit;
1071           struct raid_softc *rs;
1072           struct dk_softc *dksc;
1073           RF_Raid_t *raidPtr;
1074 
1075           unit = raidunit(bp->b_dev);
1076           if ((rs = raidget(unit, false)) == NULL) {
1077                     bp->b_error = ENXIO;
1078                     goto fail;
1079           }
1080           if ((rs->sc_flags & RAIDF_INITED) == 0) {
1081                     bp->b_error = ENXIO;
1082                     goto fail;
1083           }
1084           dksc = &rs->sc_dksc;
1085           raidPtr = &rs->sc_r;
1086 
1087           /* Queue IO only */
1088           if (dk_strategy_defer(dksc, bp))
1089                     goto done;
1090 
1091           /* schedule the IO to happen at the next convenient time */
1092           raid_wakeup(raidPtr);
1093 
1094 done:
1095           return;
1096 
1097 fail:
1098           bp->b_resid = bp->b_bcount;
1099           biodone(bp);
1100 }
1101 
1102 static int
raid_diskstart(device_t dev,struct buf * bp)1103 raid_diskstart(device_t dev, struct buf *bp)
1104 {
1105           struct raid_softc *rs = raidsoftc(dev);
1106           RF_Raid_t *raidPtr;
1107 
1108           raidPtr = &rs->sc_r;
1109           if (!raidPtr->valid) {
1110                     db1_printf(("raid is not valid..\n"));
1111                     return ENODEV;
1112           }
1113 
1114           /* XXX */
1115           bp->b_resid = 0;
1116 
1117           return raiddoaccess(raidPtr, bp);
1118 }
1119 
1120 void
raiddone(RF_Raid_t * raidPtr,struct buf * bp)1121 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
1122 {
1123           struct raid_softc *rs;
1124           struct dk_softc *dksc;
1125 
1126           rs = raidPtr->softc;
1127           dksc = &rs->sc_dksc;
1128 
1129           dk_done(dksc, bp);
1130 
1131           rf_lock_mutex2(raidPtr->mutex);
1132           raidPtr->openings++;
1133           rf_unlock_mutex2(raidPtr->mutex);
1134 
1135           /* schedule more IO */
1136           raid_wakeup(raidPtr);
1137 }
1138 
1139 /* ARGSUSED */
1140 static int
raidread(dev_t dev,struct uio * uio,int flags)1141 raidread(dev_t dev, struct uio *uio, int flags)
1142 {
1143           int     unit = raidunit(dev);
1144           struct raid_softc *rs;
1145 
1146           if ((rs = raidget(unit, false)) == NULL)
1147                     return ENXIO;
1148 
1149           if ((rs->sc_flags & RAIDF_INITED) == 0)
1150                     return ENXIO;
1151 
1152           return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1153 
1154 }
1155 
1156 /* ARGSUSED */
1157 static int
raidwrite(dev_t dev,struct uio * uio,int flags)1158 raidwrite(dev_t dev, struct uio *uio, int flags)
1159 {
1160           int     unit = raidunit(dev);
1161           struct raid_softc *rs;
1162 
1163           if ((rs = raidget(unit, false)) == NULL)
1164                     return ENXIO;
1165 
1166           if ((rs->sc_flags & RAIDF_INITED) == 0)
1167                     return ENXIO;
1168 
1169           return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1170 
1171 }
1172 
1173 static int
raid_detach_unlocked(struct raid_softc * rs)1174 raid_detach_unlocked(struct raid_softc *rs)
1175 {
1176           struct dk_softc *dksc = &rs->sc_dksc;
1177           RF_Raid_t *raidPtr;
1178           int error;
1179 
1180           raidPtr = &rs->sc_r;
1181 
1182           if (DK_BUSY(dksc, 0) ||
1183               raidPtr->recon_in_progress != 0 ||
1184               raidPtr->parity_rewrite_in_progress != 0)
1185                     return EBUSY;
1186 
1187           if ((rs->sc_flags & RAIDF_INITED) == 0)
1188                     return 0;
1189 
1190           rs->sc_flags &= ~RAIDF_SHUTDOWN;
1191 
1192           if ((error = rf_Shutdown(raidPtr)) != 0)
1193                     return error;
1194 
1195           rs->sc_flags &= ~RAIDF_INITED;
1196 
1197           /* Kill off any queued buffers */
1198           dk_drain(dksc);
1199           bufq_free(dksc->sc_bufq);
1200 
1201           /* Detach the disk. */
1202           dkwedge_delall(&dksc->sc_dkdev);
1203           disk_detach(&dksc->sc_dkdev);
1204           disk_destroy(&dksc->sc_dkdev);
1205           dk_detach(dksc);
1206 
1207           return 0;
1208 }
1209 
1210 int
rf_fail_disk(RF_Raid_t * raidPtr,struct rf_recon_req * rr)1211 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
1212 {
1213           struct rf_recon_req_internal *rrint;
1214 
1215           if (raidPtr->Layout.map->faultsTolerated == 0) {
1216                     /* Can't do this on a RAID 0!! */
1217                     return EINVAL;
1218           }
1219 
1220           if (rr->col < 0 || rr->col >= raidPtr->numCol) {
1221                     /* bad column */
1222                     return EINVAL;
1223           }
1224 
1225           rf_lock_mutex2(raidPtr->mutex);
1226           if (raidPtr->status == rf_rs_reconstructing) {
1227                     raidPtr->abortRecon[rr->col] = 1;
1228           }
1229           if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
1230               (raidPtr->numFailures > 0)) {
1231                     /* some other component has failed.  Let's not make
1232                        things worse. XXX wrong for RAID6 */
1233                     goto out;
1234           }
1235           if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1236                     int spareCol = raidPtr->Disks[rr->col].spareCol;
1237 
1238                     if (spareCol < raidPtr->numCol ||
1239                         spareCol >= raidPtr->numCol + raidPtr->numSpare)
1240                               goto out;
1241 
1242                     /*
1243                      * Fail the spare disk so that we can
1244                      * reconstruct on another one.
1245                      */
1246                     raidPtr->Disks[spareCol].status = rf_ds_failed;
1247 
1248           }
1249           rf_unlock_mutex2(raidPtr->mutex);
1250 
1251           /* make a copy of the recon request so that we don't rely on
1252            * the user's buffer */
1253           rrint = RF_Malloc(sizeof(*rrint));
1254           if (rrint == NULL)
1255                     return(ENOMEM);
1256           rrint->col = rr->col;
1257           rrint->flags = rr->flags;
1258           rrint->raidPtr = raidPtr;
1259 
1260           return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
1261               rrint, "raid_recon");
1262 out:
1263           rf_unlock_mutex2(raidPtr->mutex);
1264           return EINVAL;
1265 }
1266 
1267 static int
rf_copyinspecificbuf(RF_Config_t * k_cfg)1268 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1269 {
1270           /* allocate a buffer for the layout-specific data, and copy it in */
1271           if (k_cfg->layoutSpecificSize == 0)
1272                     return 0;
1273 
1274           if (k_cfg->layoutSpecificSize > 10000) {
1275               /* sanity check */
1276               return EINVAL;
1277           }
1278 
1279           u_char *specific_buf;
1280           specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
1281           if (specific_buf == NULL)
1282                     return ENOMEM;
1283 
1284           int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1285               k_cfg->layoutSpecificSize);
1286           if (retcode) {
1287                     RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1288                     db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1289                     return retcode;
1290           }
1291 
1292           k_cfg->layoutSpecific = specific_buf;
1293           return 0;
1294 }
1295 
1296 static int
rf_getConfiguration(struct raid_softc * rs,void * data,RF_Config_t ** k_cfg)1297 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1298 {
1299           RF_Config_t *u_cfg = *((RF_Config_t **) data);
1300 
1301           if (rs->sc_r.valid) {
1302                     /* There is a valid RAID set running on this unit! */
1303                     printf("raid%d: Device already configured!\n", rs->sc_unit);
1304                     return EINVAL;
1305           }
1306 
1307           /* copy-in the configuration information */
1308           /* data points to a pointer to the configuration structure */
1309           *k_cfg = RF_Malloc(sizeof(**k_cfg));
1310           if (*k_cfg == NULL) {
1311                     return ENOMEM;
1312           }
1313           int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1314           if (retcode == 0)
1315                     return 0;
1316           RF_Free(*k_cfg, sizeof(RF_Config_t));
1317           db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1318           rs->sc_flags |= RAIDF_SHUTDOWN;
1319           return retcode;
1320 }
1321 
1322 int
rf_construct(struct raid_softc * rs,RF_Config_t * k_cfg)1323 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
1324 {
1325           int retcode, i;
1326           RF_Raid_t *raidPtr = &rs->sc_r;
1327 
1328           rs->sc_flags &= ~RAIDF_SHUTDOWN;
1329 
1330           if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
1331                     goto out;
1332 
1333           /* should do some kind of sanity check on the configuration.
1334            * Store the sum of all the bytes in the last byte? */
1335 
1336           /* Force nul-termination on all strings. */
1337 #define ZERO_FINAL(s)         do { s[sizeof(s) - 1] = '\0'; } while (0)
1338           for (i = 0; i < RF_MAXCOL; i++) {
1339                     ZERO_FINAL(k_cfg->devnames[0][i]);
1340           }
1341           for (i = 0; i < RF_MAXSPARE; i++) {
1342                     ZERO_FINAL(k_cfg->spare_names[i]);
1343           }
1344           for (i = 0; i < RF_MAXDBGV; i++) {
1345                     ZERO_FINAL(k_cfg->debugVars[i]);
1346           }
1347 #undef ZERO_FINAL
1348 
1349           /* Check some basic limits. */
1350           if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
1351                     retcode = EINVAL;
1352                     goto out;
1353           }
1354           if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
1355                     retcode = EINVAL;
1356                     goto out;
1357           }
1358 
1359           /* configure the system */
1360 
1361           /*
1362            * Clear the entire RAID descriptor, just to make sure
1363            *  there is no stale data left in the case of a
1364            *  reconfiguration
1365            */
1366           memset(raidPtr, 0, sizeof(*raidPtr));
1367           raidPtr->softc = rs;
1368           raidPtr->raidid = rs->sc_unit;
1369 
1370           retcode = rf_Configure(raidPtr, k_cfg, NULL);
1371 
1372           if (retcode == 0) {
1373                     /* allow this many simultaneous IO's to
1374                        this RAID device */
1375                     raidPtr->openings = RAIDOUTSTANDING;
1376 
1377                     raidinit(rs);
1378                     raid_wakeup(raidPtr);
1379                     rf_markalldirty(raidPtr);
1380           }
1381 
1382           /* free the buffers.  No return code here. */
1383           if (k_cfg->layoutSpecificSize) {
1384                     RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
1385           }
1386 out:
1387           RF_Free(k_cfg, sizeof(RF_Config_t));
1388           if (retcode) {
1389                     /*
1390                      * If configuration failed, set sc_flags so that we
1391                      * will detach the device when we close it.
1392                      */
1393                     rs->sc_flags |= RAIDF_SHUTDOWN;
1394           }
1395           return retcode;
1396 }
1397 
#if RF_DISABLED
/*
 * rf_set_component_label: overwrite the in-core label of the column named
 * by clabel->column with *clabel and flush it.
 *
 * clabel->row is forced to 0 before anything else is checked.  Returns
 * EINVAL if the column is outside [0, numCol), 0 otherwise.
 *
 * Compiled only when RF_DISABLED is non-zero.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	int col;

	/* XXX check the label for valid stuff... */
	/*
	 * Note that some things *should not* get modified --
	 * the user should be re-initing the labels instead of
	 * trying to patch things.
	 */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	col = clabel->column;

	if (col < 0 || col >= raidPtr->numCol)
		return EINVAL;

	/* XXX this isn't allowed to do anything for now :-) */

	/*
	 * XXX and before it is, we need to fill in the rest
	 * of the fields!?!?!?!
	 */
	memcpy(raidget_component_label(raidPtr, col), clabel,
	    sizeof(*clabel));
	raidflush_component_label(raidPtr, col);

	return 0;
}
#endif
1436 
1437 static int
rf_init_component_label(RF_Raid_t * raidPtr,RF_ComponentLabel_t * clabel)1438 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1439 {
1440           /*
1441              we only want the serial number from
1442              the above.  We get all the rest of the information
1443              from the config that was used to create this RAID
1444              set.
1445              */
1446 
1447           raidPtr->serial_number = clabel->serial_number;
1448 
1449           for (int column = 0; column < raidPtr->numCol; column++) {
1450                     RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1451                     if (RF_DEAD_DISK(diskPtr->status))
1452                               continue;
1453                     RF_ComponentLabel_t *ci_label = raidget_component_label(
1454                         raidPtr, column);
1455                     /* Zeroing this is important. */
1456                     memset(ci_label, 0, sizeof(*ci_label));
1457                     raid_init_component_label(raidPtr, ci_label);
1458                     ci_label->serial_number = raidPtr->serial_number;
1459                     ci_label->row = 0; /* we dont' pretend to support more */
1460                     rf_component_label_set_partitionsize(ci_label,
1461                         diskPtr->partitionSize);
1462                     ci_label->column = column;
1463                     raidflush_component_label(raidPtr, column);
1464                     /* XXXjld what about the spares? */
1465           }
1466 
1467           return 0;
1468 }
1469 
1470 static int
rf_rebuild_in_place(RF_Raid_t * raidPtr,RF_SingleComponent_t * componentPtr)1471 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
1472 {
1473 
1474           if (raidPtr->Layout.map->faultsTolerated == 0) {
1475                     /* Can't do this on a RAID 0!! */
1476                     return EINVAL;
1477           }
1478 
1479           if (raidPtr->recon_in_progress == 1) {
1480                     /* a reconstruct is already in progress! */
1481                     return EINVAL;
1482           }
1483 
1484           RF_SingleComponent_t component;
1485           memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1486           component.row = 0; /* we don't support any more */
1487           int column = component.column;
1488 
1489           if ((column < 0) || (column >= raidPtr->numCol)) {
1490                     return EINVAL;
1491           }
1492 
1493           rf_lock_mutex2(raidPtr->mutex);
1494           if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1495               (raidPtr->numFailures > 0)) {
1496                     /* XXX 0 above shouldn't be constant!!! */
1497                     /* some component other than this has failed.
1498                        Let's not make things worse than they already
1499                        are... */
1500                     printf("raid%d: Unable to reconstruct to disk at:\n",
1501                            raidPtr->raidid);
1502                     printf("raid%d:     Col: %d   Too many failures.\n",
1503                            raidPtr->raidid, column);
1504                     rf_unlock_mutex2(raidPtr->mutex);
1505                     return EINVAL;
1506           }
1507 
1508           if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
1509                     printf("raid%d: Unable to reconstruct to disk at:\n",
1510                            raidPtr->raidid);
1511                     printf("raid%d:    Col: %d   "
1512                         "Reconstruction already occurring!\n",
1513                         raidPtr->raidid, column);
1514 
1515                     rf_unlock_mutex2(raidPtr->mutex);
1516                     return EINVAL;
1517           }
1518 
1519           if (raidPtr->Disks[column].status == rf_ds_spared) {
1520                     rf_unlock_mutex2(raidPtr->mutex);
1521                     return EINVAL;
1522           }
1523 
1524           rf_unlock_mutex2(raidPtr->mutex);
1525 
1526           struct rf_recon_req_internal *rrint;
1527           rrint = RF_Malloc(sizeof(*rrint));
1528           if (rrint == NULL)
1529                     return ENOMEM;
1530 
1531           rrint->col = column;
1532           rrint->raidPtr = raidPtr;
1533 
1534           return RF_CREATE_THREAD(raidPtr->recon_thread,
1535               rf_ReconstructInPlaceThread, rrint, "raid_reconip");
1536 }
1537 
1538 static int
rf_check_recon_status(RF_Raid_t * raidPtr,int * data)1539 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1540 {
1541           /*
1542            * This makes no sense on a RAID 0, or if we are not reconstructing
1543            * so tell the user it's done.
1544            */
1545           if (raidPtr->Layout.map->faultsTolerated == 0 ||
1546               raidPtr->status != rf_rs_reconstructing) {
1547                     *data = 100;
1548                     return 0;
1549           }
1550           if (raidPtr->reconControl->numRUsTotal == 0) {
1551                     *data = 0;
1552                     return 0;
1553           }
1554           *data = (raidPtr->reconControl->numRUsComplete * 100
1555               / raidPtr->reconControl->numRUsTotal);
1556           return 0;
1557 }
1558 
1559 /*
1560  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
1561  * on the component_name[] array.
1562  */
1563 static void
rf_copy_single_component(RF_SingleComponent_t * component,void * data)1564 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
1565 {
1566 
1567           memcpy(component, data, sizeof *component);
1568           component->component_name[sizeof(component->component_name) - 1] = '\0';
1569 }
1570 
1571 static int
raidioctl(dev_t dev,u_long cmd,void * data,int flag,struct lwp * l)1572 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1573 {
1574           int     unit = raidunit(dev);
1575           int     part, pmask;
1576           struct raid_softc *rs;
1577           struct dk_softc *dksc;
1578           RF_Config_t *k_cfg;
1579           RF_Raid_t *raidPtr;
1580           RF_AccTotals_t *totals;
1581           RF_SingleComponent_t component;
1582           RF_DeviceConfig_t *d_cfg, *ucfgp;
1583           int retcode = 0;
1584           int column;
1585           RF_ComponentLabel_t *clabel;
1586           int d;
1587 
1588           if ((rs = raidget(unit, false)) == NULL)
1589                     return ENXIO;
1590 
1591           dksc = &rs->sc_dksc;
1592           raidPtr = &rs->sc_r;
1593 
1594           db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1595               (int) DISKPART(dev), (int) unit, cmd));
1596 
1597           /* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
1598           switch (cmd) {
1599           case RAIDFRAME_CONFIGURE:
1600           case RAIDFRAME_RESCAN:
1601                     break;
1602           default:
1603                     if (!rf_inited(rs))
1604                               return ENXIO;
1605           }
1606 
1607           switch (cmd) {
1608                     /* configure the system */
1609           case RAIDFRAME_CONFIGURE:
1610                     if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1611                               return retcode;
1612                     return rf_construct(rs, k_cfg);
1613 
1614                     /* shutdown the system */
1615           case RAIDFRAME_SHUTDOWN:
1616 
1617                     part = DISKPART(dev);
1618                     pmask = (1 << part);
1619 
1620                     if ((retcode = raidlock(rs)) != 0)
1621                               return retcode;
1622 
1623                     if (DK_BUSY(dksc, pmask) ||
1624                         raidPtr->recon_in_progress != 0 ||
1625                         raidPtr->parity_rewrite_in_progress != 0)
1626                               retcode = EBUSY;
1627                     else {
1628                               /* detach and free on close */
1629                               rs->sc_flags |= RAIDF_SHUTDOWN;
1630                               retcode = 0;
1631                     }
1632 
1633                     raidunlock(rs);
1634 
1635                     return retcode;
1636           case RAIDFRAME_GET_COMPONENT_LABEL:
1637                     return rf_get_component_label(raidPtr, data);
1638 
1639 #if RF_DISABLED
1640           case RAIDFRAME_SET_COMPONENT_LABEL:
1641                     return rf_set_component_label(raidPtr, data);
1642 #endif
1643 
1644           case RAIDFRAME_INIT_LABELS:
1645                     return rf_init_component_label(raidPtr, data);
1646 
1647           case RAIDFRAME_SET_AUTOCONFIG:
1648                     d = rf_set_autoconfig(raidPtr, *(int *) data);
1649                     printf("raid%d: New autoconfig value is: %d\n",
1650                            raidPtr->raidid, d);
1651                     *(int *) data = d;
1652                     return retcode;
1653 
1654           case RAIDFRAME_SET_ROOT:
1655                     d = rf_set_rootpartition(raidPtr, *(int *) data);
1656                     printf("raid%d: New rootpartition value is: %d\n",
1657                            raidPtr->raidid, d);
1658                     *(int *) data = d;
1659                     return retcode;
1660 
1661                     /* initialize all parity */
1662           case RAIDFRAME_REWRITEPARITY:
1663 
1664                     if (raidPtr->Layout.map->faultsTolerated == 0) {
1665                               /* Parity for RAID 0 is trivially correct */
1666                               raidPtr->parity_good = RF_RAID_CLEAN;
1667                               return 0;
1668                     }
1669 
1670                     if (raidPtr->parity_rewrite_in_progress == 1) {
1671                               /* Re-write is already in progress! */
1672                               return EINVAL;
1673                     }
1674 
1675                     return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1676                         rf_RewriteParityThread, raidPtr,"raid_parity");
1677 
1678           case RAIDFRAME_ADD_HOT_SPARE:
1679                     rf_copy_single_component(&component, data);
1680                     return rf_add_hot_spare(raidPtr, &component);
1681 
1682           /* Remove a non hot-spare component, never implemented in userland */
1683           case RAIDFRAME_DELETE_COMPONENT:
1684                     rf_copy_single_component(&component, data);
1685                     return rf_delete_component(raidPtr, &component);
1686 
1687           case RAIDFRAME_REMOVE_COMPONENT:
1688                     rf_copy_single_component(&component, data);
1689                     return rf_remove_component(raidPtr, &component);
1690 
1691           case RAIDFRAME_INCORPORATE_HOT_SPARE:
1692                     rf_copy_single_component(&component, data);
1693                     return rf_incorporate_hot_spare(raidPtr, &component);
1694 
1695           case RAIDFRAME_REBUILD_IN_PLACE:
1696                     return rf_rebuild_in_place(raidPtr, data);
1697 
1698           case RAIDFRAME_GET_INFO:
1699                     ucfgp = *(RF_DeviceConfig_t **)data;
1700                     d_cfg = RF_Malloc(sizeof(*d_cfg));
1701                     if (d_cfg == NULL)
1702                               return ENOMEM;
1703                     retcode = rf_get_info(raidPtr, d_cfg);
1704                     if (retcode == 0) {
1705                               retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1706                     }
1707                     RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1708                     return retcode;
1709 
1710           case RAIDFRAME_CHECK_PARITY:
1711                     *(int *) data = raidPtr->parity_good;
1712                     return 0;
1713 
1714           case RAIDFRAME_PARITYMAP_STATUS:
1715                     if (rf_paritymap_ineligible(raidPtr))
1716                               return EINVAL;
1717                     rf_paritymap_status(raidPtr->parity_map, data);
1718                     return 0;
1719 
1720           case RAIDFRAME_PARITYMAP_SET_PARAMS:
1721                     if (rf_paritymap_ineligible(raidPtr))
1722                               return EINVAL;
1723                     if (raidPtr->parity_map == NULL)
1724                               return ENOENT; /* ??? */
1725                     if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1726                               return EINVAL;
1727                     return 0;
1728 
1729           case RAIDFRAME_PARITYMAP_GET_DISABLE:
1730                     if (rf_paritymap_ineligible(raidPtr))
1731                               return EINVAL;
1732                     *(int *) data = rf_paritymap_get_disable(raidPtr);
1733                     return 0;
1734 
1735           case RAIDFRAME_PARITYMAP_SET_DISABLE:
1736                     if (rf_paritymap_ineligible(raidPtr))
1737                               return EINVAL;
1738                     rf_paritymap_set_disable(raidPtr, *(int *)data);
1739                     /* XXX should errors be passed up? */
1740                     return 0;
1741 
1742           case RAIDFRAME_RESCAN:
1743                     return rf_rescan();
1744 
1745           case RAIDFRAME_RESET_ACCTOTALS:
1746                     memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1747                     return 0;
1748 
1749           case RAIDFRAME_GET_ACCTOTALS:
1750                     totals = (RF_AccTotals_t *) data;
1751                     *totals = raidPtr->acc_totals;
1752                     return 0;
1753 
1754           case RAIDFRAME_KEEP_ACCTOTALS:
1755                     raidPtr->keep_acc_totals = *(int *)data;
1756                     return 0;
1757 
1758           case RAIDFRAME_GET_SIZE:
1759                     *(int *) data = raidPtr->totalSectors;
1760                     return 0;
1761 
1762           case RAIDFRAME_FAIL_DISK:
1763                     return rf_fail_disk(raidPtr, data);
1764 
1765                     /* copyback is no longer supported */
1766           case RAIDFRAME_COPYBACK:
1767                     return EINVAL;
1768 
1769                     /* return the percentage completion of reconstruction */
1770           case RAIDFRAME_CHECK_RECON_STATUS:
1771                     return rf_check_recon_status(raidPtr, data);
1772 
1773           case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1774                     rf_check_recon_status_ext(raidPtr, data);
1775                     return 0;
1776 
1777           case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1778                     if (raidPtr->Layout.map->faultsTolerated == 0) {
1779                               /* This makes no sense on a RAID 0, so tell the
1780                                  user it's done. */
1781                               *(int *) data = 100;
1782                               return 0;
1783                     }
1784                     if (raidPtr->parity_rewrite_in_progress == 1) {
1785                               *(int *) data = 100 *
1786                                         raidPtr->parity_rewrite_stripes_done /
1787                                         raidPtr->Layout.numStripe;
1788                     } else {
1789                               *(int *) data = 100;
1790                     }
1791                     return 0;
1792 
1793           case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1794                     rf_check_parityrewrite_status_ext(raidPtr, data);
1795                     return 0;
1796 
1797           case RAIDFRAME_CHECK_COPYBACK_STATUS:
1798                     *(int *) data = 100;
1799                     return 0;
1800 
1801           case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1802                     rf_check_copyback_status_ext(raidPtr, data);
1803                     return 0;
1804 
1805           case RAIDFRAME_SET_LAST_UNIT:
1806                     for (column = 0; column < raidPtr->numCol; column++)
1807                               if (raidPtr->Disks[column].status != rf_ds_optimal)
1808                                         return EBUSY;
1809 
1810                     for (column = 0; column < raidPtr->numCol; column++) {
1811                               clabel = raidget_component_label(raidPtr, column);
1812                               clabel->last_unit = *(int *)data;
1813                               raidflush_component_label(raidPtr, column);
1814                     }
1815                     rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1816                     return 0;
1817 
1818                     /* the sparetable daemon calls this to wait for the kernel to
1819                      * need a spare table. this ioctl does not return until a
1820                      * spare table is needed. XXX -- calling mpsleep here in the
1821                      * ioctl code is almost certainly wrong and evil. -- XXX XXX
1822                      * -- I should either compute the spare table in the kernel,
1823                      * or have a different -- XXX XXX -- interface (a different
1824                      * character device) for delivering the table     -- XXX */
1825 #if RF_DISABLED
1826           case RAIDFRAME_SPARET_WAIT:
1827                     rf_lock_mutex2(rf_sparet_wait_mutex);
1828                     while (!rf_sparet_wait_queue)
1829                               rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1830                     RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1831                     rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1832                     rf_unlock_mutex2(rf_sparet_wait_mutex);
1833 
1834                     /* structure assignment */
1835                     *((RF_SparetWait_t *) data) = *waitreq;
1836 
1837                     RF_Free(waitreq, sizeof(*waitreq));
1838                     return 0;
1839 
1840                     /* wakes up a process waiting on SPARET_WAIT and puts an error
1841                      * code in it that will cause the dameon to exit */
1842           case RAIDFRAME_ABORT_SPARET_WAIT:
1843                     waitreq = RF_Malloc(sizeof(*waitreq));
1844                     waitreq->fcol = -1;
1845                     rf_lock_mutex2(rf_sparet_wait_mutex);
1846                     waitreq->next = rf_sparet_wait_queue;
1847                     rf_sparet_wait_queue = waitreq;
1848                     rf_broadcast_cond2(rf_sparet_wait_cv);
1849                     rf_unlock_mutex2(rf_sparet_wait_mutex);
1850                     return 0;
1851 
1852                     /* used by the spare table daemon to deliver a spare table
1853                      * into the kernel */
1854           case RAIDFRAME_SEND_SPARET:
1855 
1856                     /* install the spare table */
1857                     retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1858 
1859                     /* respond to the requestor.  the return status of the spare
1860                      * table installation is passed in the "fcol" field */
1861                     waitred = RF_Malloc(sizeof(*waitreq));
1862                     waitreq->fcol = retcode;
1863                     rf_lock_mutex2(rf_sparet_wait_mutex);
1864                     waitreq->next = rf_sparet_resp_queue;
1865                     rf_sparet_resp_queue = waitreq;
1866                     rf_broadcast_cond2(rf_sparet_resp_cv);
1867                     rf_unlock_mutex2(rf_sparet_wait_mutex);
1868 
1869                     return retcode;
1870 #endif
1871           default:
1872                     /*
1873                      * Don't bother trying to load compat modules
1874                      * if it is not our ioctl. This is more efficient
1875                      * and makes rump tests not depend on compat code
1876                      */
1877                     if (IOCGROUP(cmd) != 'r')
1878                               break;
1879 #ifdef _LP64
1880                     if ((l->l_proc->p_flag & PK_32) != 0) {
1881                               module_autoload("compat_netbsd32_raid",
1882                                   MODULE_CLASS_EXEC);
1883                               MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1884                                   (rs, cmd, data), enosys(), retcode);
1885                               if (retcode != EPASSTHROUGH)
1886                                         return retcode;
1887                     }
1888 #endif
1889                     module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1890                     MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1891                         (rs, cmd, data), enosys(), retcode);
1892                     if (retcode != EPASSTHROUGH)
1893                               return retcode;
1894 
1895                     module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1896                     MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1897                         (rs, cmd, data), enosys(), retcode);
1898                     if (retcode != EPASSTHROUGH)
1899                               return retcode;
1900                     break; /* fall through to the os-specific code below */
1901 
1902           }
1903 
1904           if (!raidPtr->valid)
1905                     return EINVAL;
1906 
1907           /*
1908            * Add support for "regular" device ioctls here.
1909            */
1910 
1911           switch (cmd) {
1912           case DIOCGCACHE:
1913                     retcode = rf_get_component_caches(raidPtr, (int *)data);
1914                     break;
1915 
1916           case DIOCCACHESYNC:
1917                     retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1918                     break;
1919 
1920           default:
1921                     retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1922                     break;
1923           }
1924 
1925           return retcode;
1926 
1927 }
1928 
1929 
1930 /* raidinit -- complete the rest of the initialization for the
1931    RAIDframe device.  */
1932 
1933 
1934 static void
raidinit(struct raid_softc * rs)1935 raidinit(struct raid_softc *rs)
1936 {
1937           cfdata_t cf;
1938           unsigned int unit;
1939           struct dk_softc *dksc = &rs->sc_dksc;
1940           RF_Raid_t *raidPtr = &rs->sc_r;
1941           device_t dev;
1942 
1943           unit = raidPtr->raidid;
1944 
1945           /* XXX doesn't check bounds. */
1946           snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1947 
1948           /* attach the pseudo device */
1949           cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1950           cf->cf_name = raid_cd.cd_name;
1951           cf->cf_atname = raid_cd.cd_name;
1952           cf->cf_unit = unit;
1953           cf->cf_fstate = FSTATE_STAR;
1954 
1955           dev = config_attach_pseudo(cf);
1956           if (dev == NULL) {
1957                     printf("raid%d: config_attach_pseudo failed\n",
1958                         raidPtr->raidid);
1959                     free(cf, M_RAIDFRAME);
1960                     return;
1961           }
1962 
1963           /* provide a backpointer to the real softc */
1964           raidsoftc(dev) = rs;
1965 
1966           /* disk_attach actually creates space for the CPU disklabel, among
1967            * other things, so it's critical to call this *BEFORE* we try putzing
1968            * with disklabels. */
1969           dk_init(dksc, dev, DKTYPE_RAID);
1970           disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1971 
1972           /* XXX There may be a weird interaction here between this, and
1973            * protectedSectors, as used in RAIDframe.  */
1974 
1975           rs->sc_size = raidPtr->totalSectors;
1976 
1977           /* Attach dk and disk subsystems */
1978           dk_attach(dksc);
1979           disk_attach(&dksc->sc_dkdev);
1980           rf_set_geometry(rs, raidPtr);
1981 
1982           bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1983 
1984           /* mark unit as usuable */
1985           rs->sc_flags |= RAIDF_INITED;
1986 
1987           dkwedge_discover(&dksc->sc_dkdev);
1988 }
1989 
1990 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1991 /* wake up the daemon & tell it to get us a spare table
1992  * XXX
1993  * the entries in the queues should be tagged with the raidPtr
1994  * so that in the extremely rare case that two recons happen at once,
1995  * we know for which device were requesting a spare table
1996  * XXX
1997  *
1998  * XXX This code is not currently used. GO
1999  */
2000 int
rf_GetSpareTableFromDaemon(RF_SparetWait_t * req)2001 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
2002 {
2003           int     retcode;
2004 
2005           rf_lock_mutex2(rf_sparet_wait_mutex);
2006           req->next = rf_sparet_wait_queue;
2007           rf_sparet_wait_queue = req;
2008           rf_broadcast_cond2(rf_sparet_wait_cv);
2009 
2010           /* mpsleep unlocks the mutex */
2011           while (!rf_sparet_resp_queue) {
2012                     rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
2013           }
2014           req = rf_sparet_resp_queue;
2015           rf_sparet_resp_queue = req->next;
2016           rf_unlock_mutex2(rf_sparet_wait_mutex);
2017 
2018           retcode = req->fcol;
2019           RF_Free(req, sizeof(*req));   /* this is not the same req as we
2020                                                    * alloc'd */
2021           return retcode;
2022 }
2023 #endif
2024 
2025 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2026  * bp & passes it down.
2027  * any calls originating in the kernel must use non-blocking I/O
2028  * do some extra sanity checking to return "appropriate" error values for
2029  * certain conditions (to make some standard utilities work)
2030  *
2031  * Formerly known as: rf_DoAccessKernel
2032  */
2033 void
raidstart(RF_Raid_t * raidPtr)2034 raidstart(RF_Raid_t *raidPtr)
2035 {
2036           struct raid_softc *rs;
2037           struct dk_softc *dksc;
2038 
2039           rs = raidPtr->softc;
2040           dksc = &rs->sc_dksc;
2041           /* quick check to see if anything has died recently */
2042           rf_lock_mutex2(raidPtr->mutex);
2043           if (raidPtr->numNewFailures > 0) {
2044                     rf_unlock_mutex2(raidPtr->mutex);
2045                     rf_update_component_labels(raidPtr,
2046                                                      RF_NORMAL_COMPONENT_UPDATE);
2047                     rf_lock_mutex2(raidPtr->mutex);
2048                     raidPtr->numNewFailures--;
2049           }
2050           rf_unlock_mutex2(raidPtr->mutex);
2051 
2052           if ((rs->sc_flags & RAIDF_INITED) == 0) {
2053                     printf("raid%d: raidstart not ready\n", raidPtr->raidid);
2054                     return;
2055           }
2056 
2057           dk_start(dksc, NULL);
2058 }
2059 
2060 static int
raiddoaccess(RF_Raid_t * raidPtr,struct buf * bp)2061 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
2062 {
2063           RF_SectorCount_t num_blocks, pb, sum;
2064           RF_RaidAddr_t raid_addr;
2065           daddr_t blocknum;
2066           int rc;
2067 
2068           rf_lock_mutex2(raidPtr->mutex);
2069           if (raidPtr->openings == 0) {
2070                     rf_unlock_mutex2(raidPtr->mutex);
2071                     return EAGAIN;
2072           }
2073           rf_unlock_mutex2(raidPtr->mutex);
2074 
2075           blocknum = bp->b_rawblkno;
2076 
2077           db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
2078                         (int) blocknum));
2079 
2080           db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
2081           db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
2082 
2083           /* *THIS* is where we adjust what block we're going to...
2084            * but DO NOT TOUCH bp->b_blkno!!! */
2085           raid_addr = blocknum;
2086 
2087           num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
2088           pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
2089           sum = raid_addr + num_blocks + pb;
2090           if (1 || rf_debugKernelAccess) {
2091                     db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
2092                                   (int) raid_addr, (int) sum, (int) num_blocks,
2093                                   (int) pb, (int) bp->b_resid));
2094           }
2095           if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
2096               || (sum < num_blocks) || (sum < pb)) {
2097                     rc = ENOSPC;
2098                     goto done;
2099           }
2100           /*
2101            * XXX rf_DoAccess() should do this, not just DoAccessKernel()
2102            */
2103 
2104           if (bp->b_bcount & raidPtr->sectorMask) {
2105                     rc = ENOSPC;
2106                     goto done;
2107           }
2108           db1_printf(("Calling DoAccess..\n"));
2109 
2110 
2111           rf_lock_mutex2(raidPtr->mutex);
2112           raidPtr->openings--;
2113           rf_unlock_mutex2(raidPtr->mutex);
2114 
2115           /* don't ever condition on bp->b_flags & B_WRITE.
2116            * always condition on B_READ instead */
2117 
2118           rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2119                                RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2120                                raid_addr, num_blocks,
2121                                bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2122 
2123 done:
2124           return rc;
2125 }
2126 
2127 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2128 
2129 int
rf_DispatchKernelIO(RF_DiskQueue_t * queue,RF_DiskQueueData_t * req)2130 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2131 {
2132           int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2133           struct buf *bp;
2134 
2135           req->queue = queue;
2136           bp = req->bp;
2137 
2138           switch (req->type) {
2139           case RF_IO_TYPE_NOP:          /* used primarily to unlock a locked queue */
2140                     /* XXX need to do something extra here.. */
2141                     /* I'm leaving this in, as I've never actually seen it used,
2142                      * and I'd like folks to report it... GO */
2143                     printf("%s: WAKEUP CALLED\n", __func__);
2144                     queue->numOutstanding++;
2145 
2146                     bp->b_flags = 0;
2147                     bp->b_private = req;
2148 
2149                     KernelWakeupFunc(bp);
2150                     break;
2151 
2152           case RF_IO_TYPE_READ:
2153           case RF_IO_TYPE_WRITE:
2154 #if RF_ACC_TRACE > 0
2155                     if (req->tracerec) {
2156                               RF_ETIMER_START(req->tracerec->timer);
2157                     }
2158 #endif
2159                     InitBP(bp, queue->rf_cinfo->ci_vp,
2160                         op, queue->rf_cinfo->ci_dev,
2161                         req->sectorOffset, req->numSector,
2162                         req->buf, KernelWakeupFunc, (void *) req,
2163                         queue->raidPtr->logBytesPerSector);
2164 
2165                     if (rf_debugKernelAccess) {
2166                               db1_printf(("dispatch: bp->b_blkno = %ld\n",
2167                                         (long) bp->b_blkno));
2168                     }
2169                     queue->numOutstanding++;
2170                     queue->last_deq_sector = req->sectorOffset;
2171                     /* acc wouldn't have been let in if there were any pending
2172                      * reqs at any other priority */
2173                     queue->curPriority = req->priority;
2174 
2175                     db1_printf(("Going for %c to unit %d col %d\n",
2176                                   req->type, queue->raidPtr->raidid,
2177                                   queue->col));
2178                     db1_printf(("sector %d count %d (%d bytes) %d\n",
2179                               (int) req->sectorOffset, (int) req->numSector,
2180                               (int) (req->numSector <<
2181                                   queue->raidPtr->logBytesPerSector),
2182                               (int) queue->raidPtr->logBytesPerSector));
2183 
2184                     /*
2185                      * XXX: drop lock here since this can block at
2186                      * least with backing SCSI devices.  Retake it
2187                      * to minimize fuss with calling interfaces.
2188                      */
2189 
2190                     RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2191                     bdev_strategy(bp);
2192                     RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2193                     break;
2194 
2195           default:
2196                     panic("bad req->type in rf_DispatchKernelIO");
2197           }
2198           db1_printf(("Exiting from DispatchKernelIO\n"));
2199 
2200           return 0;
2201 }
2202 /* this is the callback function associated with a I/O invoked from
2203    kernel code.
2204  */
2205 static void
KernelWakeupFunc(struct buf * bp)2206 KernelWakeupFunc(struct buf *bp)
2207 {
2208           RF_DiskQueueData_t *req = NULL;
2209           RF_DiskQueue_t *queue;
2210 
2211           db1_printf(("recovering the request queue:\n"));
2212 
2213           req = bp->b_private;
2214 
2215           queue = (RF_DiskQueue_t *) req->queue;
2216 
2217           rf_lock_mutex2(queue->raidPtr->iodone_lock);
2218 
2219 #if RF_ACC_TRACE > 0
2220           if (req->tracerec) {
2221                     RF_ETIMER_STOP(req->tracerec->timer);
2222                     RF_ETIMER_EVAL(req->tracerec->timer);
2223                     rf_lock_mutex2(rf_tracing_mutex);
2224                     req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2225                     req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2226                     req->tracerec->num_phys_ios++;
2227                     rf_unlock_mutex2(rf_tracing_mutex);
2228           }
2229 #endif
2230 
2231           /* XXX Ok, let's get aggressive... If b_error is set, let's go
2232            * ballistic, and mark the component as hosed... */
2233 
2234           if (bp->b_error != 0) {
2235                     /* Mark the disk as dead */
2236                     /* but only mark it once... */
2237                     /* and only if it wouldn't leave this RAID set
2238                        completely broken */
2239                     if (((queue->raidPtr->Disks[queue->col].status ==
2240                           rf_ds_optimal) ||
2241                          (queue->raidPtr->Disks[queue->col].status ==
2242                           rf_ds_used_spare)) &&
2243                          (queue->raidPtr->numFailures <
2244                           queue->raidPtr->Layout.map->faultsTolerated)) {
2245                               printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2246                                      queue->raidPtr->raidid,
2247                                      bp->b_error,
2248                                      queue->raidPtr->Disks[queue->col].devname);
2249                               queue->raidPtr->Disks[queue->col].status =
2250                                   rf_ds_failed;
2251                               queue->raidPtr->status = rf_rs_degraded;
2252                               queue->raidPtr->numFailures++;
2253                               queue->raidPtr->numNewFailures++;
2254                     } else {  /* Disk is already dead... */
2255                               /* printf("Disk already marked as dead!\n"); */
2256                     }
2257 
2258           }
2259 
2260           /* Fill in the error value */
2261           req->error = bp->b_error;
2262 
2263           /* Drop this one on the "finished" queue... */
2264           TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2265 
2266           /* Let the raidio thread know there is work to be done. */
2267           rf_signal_cond2(queue->raidPtr->iodone_cv);
2268 
2269           rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2270 }
2271 
2272 
2273 /*
2274  * initialize a buf structure for doing an I/O in the kernel.
2275  */
2276 static void
InitBP(struct buf * bp,struct vnode * b_vp,unsigned rw_flag,dev_t dev,RF_SectorNum_t startSect,RF_SectorCount_t numSect,void * bf,void (* cbFunc)(struct buf *),void * cbArg,int logBytesPerSector)2277 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2278        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2279        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2280 {
2281           bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2282           bp->b_oflags = 0;
2283           bp->b_cflags = 0;
2284           bp->b_bcount = numSect << logBytesPerSector;
2285           bp->b_bufsize = bp->b_bcount;
2286           bp->b_error = 0;
2287           bp->b_dev = dev;
2288           bp->b_data = bf;
2289           bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2290           bp->b_resid = bp->b_bcount;   /* XXX is this right!??!?!! */
2291           if (bp->b_bcount == 0) {
2292                     panic("bp->b_bcount is zero in InitBP!!");
2293           }
2294           bp->b_iodone = cbFunc;
2295           bp->b_private = cbArg;
2296 }
2297 
2298 /*
2299  * Wait interruptibly for an exclusive lock.
2300  *
2301  * XXX
2302  * Several drivers do this; it should be abstracted and made MP-safe.
2303  * (Hmm... where have we seen this warning before :->  GO )
2304  */
2305 static int
raidlock(struct raid_softc * rs)2306 raidlock(struct raid_softc *rs)
2307 {
2308           int     error;
2309 
2310           error = 0;
2311           mutex_enter(&rs->sc_mutex);
2312           while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2313                     rs->sc_flags |= RAIDF_WANTED;
2314                     error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2315                     if (error != 0)
2316                               goto done;
2317           }
2318           rs->sc_flags |= RAIDF_LOCKED;
2319 done:
2320           mutex_exit(&rs->sc_mutex);
2321           return error;
2322 }
2323 /*
2324  * Unlock and wake up any waiters.
2325  */
2326 static void
raidunlock(struct raid_softc * rs)2327 raidunlock(struct raid_softc *rs)
2328 {
2329 
2330           mutex_enter(&rs->sc_mutex);
2331           rs->sc_flags &= ~RAIDF_LOCKED;
2332           if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2333                     rs->sc_flags &= ~RAIDF_WANTED;
2334                     cv_broadcast(&rs->sc_cv);
2335           }
2336           mutex_exit(&rs->sc_mutex);
2337 }
2338 
2339 
2340 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2341 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2342 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2343 
2344 static daddr_t
rf_component_info_offset(void)2345 rf_component_info_offset(void)
2346 {
2347 
2348           return RF_COMPONENT_INFO_OFFSET;
2349 }
2350 
2351 static daddr_t
rf_component_info_size(unsigned secsize)2352 rf_component_info_size(unsigned secsize)
2353 {
2354           daddr_t info_size;
2355 
2356           KASSERT(secsize);
2357           if (secsize > RF_COMPONENT_INFO_SIZE)
2358                     info_size = secsize;
2359           else
2360                     info_size = RF_COMPONENT_INFO_SIZE;
2361 
2362           return info_size;
2363 }
2364 
2365 static daddr_t
rf_parity_map_offset(RF_Raid_t * raidPtr)2366 rf_parity_map_offset(RF_Raid_t *raidPtr)
2367 {
2368           daddr_t map_offset;
2369 
2370           KASSERT(raidPtr->bytesPerSector);
2371           if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2372                     map_offset = raidPtr->bytesPerSector;
2373           else
2374                     map_offset = RF_COMPONENT_INFO_SIZE;
2375           map_offset += rf_component_info_offset();
2376 
2377           return map_offset;
2378 }
2379 
2380 static daddr_t
rf_parity_map_size(RF_Raid_t * raidPtr)2381 rf_parity_map_size(RF_Raid_t *raidPtr)
2382 {
2383           daddr_t map_size;
2384 
2385           if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2386                     map_size = raidPtr->bytesPerSector;
2387           else
2388                     map_size = RF_PARITY_MAP_SIZE;
2389 
2390           return map_size;
2391 }
2392 
2393 int
raidmarkclean(RF_Raid_t * raidPtr,RF_RowCol_t col)2394 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2395 {
2396           RF_ComponentLabel_t *clabel;
2397 
2398           clabel = raidget_component_label(raidPtr, col);
2399           clabel->clean = RF_RAID_CLEAN;
2400           raidflush_component_label(raidPtr, col);
2401           return(0);
2402 }
2403 
2404 
2405 int
raidmarkdirty(RF_Raid_t * raidPtr,RF_RowCol_t col)2406 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2407 {
2408           RF_ComponentLabel_t *clabel;
2409 
2410           clabel = raidget_component_label(raidPtr, col);
2411           clabel->clean = RF_RAID_DIRTY;
2412           raidflush_component_label(raidPtr, col);
2413           return(0);
2414 }
2415 
2416 int
raidfetch_component_label(RF_Raid_t * raidPtr,RF_RowCol_t col)2417 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2418 {
2419           KASSERT(raidPtr->bytesPerSector);
2420 
2421           return raidread_component_label(raidPtr->bytesPerSector,
2422               raidPtr->Disks[col].dev,
2423               raidPtr->raid_cinfo[col].ci_vp,
2424               &raidPtr->raid_cinfo[col].ci_label);
2425 }
2426 
2427 RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t * raidPtr,RF_RowCol_t col)2428 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2429 {
2430           return &raidPtr->raid_cinfo[col].ci_label;
2431 }
2432 
2433 int
raidflush_component_label(RF_Raid_t * raidPtr,RF_RowCol_t col)2434 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2435 {
2436           RF_ComponentLabel_t *label;
2437 
2438           label = &raidPtr->raid_cinfo[col].ci_label;
2439           label->mod_counter = raidPtr->mod_counter;
2440 #ifndef RF_NO_PARITY_MAP
2441           label->parity_map_modcount = label->mod_counter;
2442 #endif
2443           return raidwrite_component_label(raidPtr->bytesPerSector,
2444               raidPtr->Disks[col].dev,
2445               raidPtr->raid_cinfo[col].ci_vp, label);
2446 }
2447 
2448 /*
2449  * Swap the label endianness.
2450  *
2451  * Everything in the component label is 4-byte-swapped except the version,
2452  * which is kept in the byte-swapped version at all times, and indicates
2453  * for the writer that a swap is necessary.
2454  *
2455  * For reads it is expected that out_label == clabel, but writes expect
2456  * separate labels so only the re-swapped label is written out to disk,
2457  * leaving the swapped-except-version internally.
2458  *
2459  * Only support swapping label version 2.
2460  */
2461 static void
rf_swap_label(RF_ComponentLabel_t * clabel,RF_ComponentLabel_t * out_label)2462 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2463 {
2464           int       *in, *out, *in_last;
2465 
2466           KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2467 
2468           /* Don't swap the label, but do copy it. */
2469           out_label->version = clabel->version;
2470 
2471           in = &clabel->serial_number;
2472           in_last = &clabel->future_use2[42];
2473           out = &out_label->serial_number;
2474 
2475           for (; in < in_last; in++, out++)
2476                     *out = bswap32(*in);
2477 }
2478 
2479 static int
raidread_component_label(unsigned secsize,dev_t dev,struct vnode * b_vp,RF_ComponentLabel_t * clabel)2480 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2481     RF_ComponentLabel_t *clabel)
2482 {
2483           int error;
2484 
2485           error = raidread_component_area(dev, b_vp, clabel,
2486               sizeof(RF_ComponentLabel_t),
2487               rf_component_info_offset(),
2488               rf_component_info_size(secsize));
2489 
2490           if (error == 0 &&
2491               clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2492                     rf_swap_label(clabel, clabel);
2493           }
2494 
2495           return error;
2496 }
2497 
2498 /* ARGSUSED */
2499 static int
raidread_component_area(dev_t dev,struct vnode * b_vp,void * data,size_t msize,daddr_t offset,daddr_t dsize)2500 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2501     size_t msize, daddr_t offset, daddr_t dsize)
2502 {
2503           struct buf *bp;
2504           int error;
2505 
2506           /* XXX should probably ensure that we don't try to do this if
2507              someone has changed rf_protected_sectors. */
2508 
2509           if (b_vp == NULL) {
2510                     /* For whatever reason, this component is not valid.
2511                        Don't try to read a component label from it. */
2512                     return(EINVAL);
2513           }
2514 
2515           /* get a block of the appropriate size... */
2516           bp = geteblk((int)dsize);
2517           bp->b_dev = dev;
2518 
2519           /* get our ducks in a row for the read */
2520           bp->b_blkno = offset / DEV_BSIZE;
2521           bp->b_bcount = dsize;
2522           bp->b_flags |= B_READ;
2523           bp->b_resid = dsize;
2524 
2525           bdev_strategy(bp);
2526           error = biowait(bp);
2527 
2528           if (!error) {
2529                     memcpy(data, bp->b_data, msize);
2530           }
2531 
2532           brelse(bp, 0);
2533           return(error);
2534 }
2535 
2536 static int
raidwrite_component_label(unsigned secsize,dev_t dev,struct vnode * b_vp,RF_ComponentLabel_t * clabel)2537 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2538     RF_ComponentLabel_t *clabel)
2539 {
2540           RF_ComponentLabel_t *clabel_write = clabel;
2541           RF_ComponentLabel_t lclabel;
2542           int error;
2543 
2544           if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2545                     clabel_write = &lclabel;
2546                     rf_swap_label(clabel, clabel_write);
2547           }
2548           error = raidwrite_component_area(dev, b_vp, clabel_write,
2549               sizeof(RF_ComponentLabel_t),
2550               rf_component_info_offset(),
2551               rf_component_info_size(secsize));
2552 
2553           return error;
2554 }
2555 
2556 /* ARGSUSED */
2557 static int
raidwrite_component_area(dev_t dev,struct vnode * b_vp,void * data,size_t msize,daddr_t offset,daddr_t dsize)2558 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2559     size_t msize, daddr_t offset, daddr_t dsize)
2560 {
2561           struct buf *bp;
2562           int error;
2563 
2564           /* get a block of the appropriate size... */
2565           bp = geteblk((int)dsize);
2566           bp->b_dev = dev;
2567 
2568           /* get our ducks in a row for the write */
2569           bp->b_blkno = offset / DEV_BSIZE;
2570           bp->b_bcount = dsize;
2571           bp->b_flags |= B_WRITE;
2572           bp->b_resid = dsize;
2573 
2574           memset(bp->b_data, 0, dsize);
2575           memcpy(bp->b_data, data, msize);
2576 
2577           bdev_strategy(bp);
2578           error = biowait(bp);
2579           brelse(bp, 0);
2580           if (error) {
2581 #if 1
2582                     printf("Failed to write RAID component info!\n");
2583 #endif
2584           }
2585 
2586           return(error);
2587 }
2588 
2589 void
rf_paritymap_kern_write(RF_Raid_t * raidPtr,struct rf_paritymap_ondisk * map)2590 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2591 {
2592           int c;
2593 
2594           for (c = 0; c < raidPtr->numCol; c++) {
2595                     /* Skip dead disks. */
2596                     if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2597                               continue;
2598                     /* XXXjld: what if an error occurs here? */
2599                     raidwrite_component_area(raidPtr->Disks[c].dev,
2600                         raidPtr->raid_cinfo[c].ci_vp, map,
2601                         RF_PARITYMAP_NBYTE,
2602                         rf_parity_map_offset(raidPtr),
2603                         rf_parity_map_size(raidPtr));
2604           }
2605 }
2606 
2607 void
rf_paritymap_kern_read(RF_Raid_t * raidPtr,struct rf_paritymap_ondisk * map)2608 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2609 {
2610           struct rf_paritymap_ondisk tmp;
2611           int c,first;
2612 
2613           first=1;
2614           for (c = 0; c < raidPtr->numCol; c++) {
2615                     /* Skip dead disks. */
2616                     if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2617                               continue;
2618                     raidread_component_area(raidPtr->Disks[c].dev,
2619                         raidPtr->raid_cinfo[c].ci_vp, &tmp,
2620                         RF_PARITYMAP_NBYTE,
2621                         rf_parity_map_offset(raidPtr),
2622                         rf_parity_map_size(raidPtr));
2623                     if (first) {
2624                               memcpy(map, &tmp, sizeof(*map));
2625                               first = 0;
2626                     } else {
2627                               rf_paritymap_merge(map, &tmp);
2628                     }
2629           }
2630 }
2631 
2632 void
rf_markalldirty(RF_Raid_t * raidPtr)2633 rf_markalldirty(RF_Raid_t *raidPtr)
2634 {
2635           RF_ComponentLabel_t *clabel;
2636           int sparecol;
2637           int c;
2638           int j;
2639           int scol = -1;
2640 
2641           raidPtr->mod_counter++;
2642           for (c = 0; c < raidPtr->numCol; c++) {
2643                     /* we don't want to touch (at all) a disk that has
2644                        failed */
2645                     if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2646                               clabel = raidget_component_label(raidPtr, c);
2647                               if (clabel->status == rf_ds_spared) {
2648                                         /* XXX do something special...
2649                                            but whatever you do, don't
2650                                            try to access it!! */
2651                               } else {
2652                                         raidmarkdirty(raidPtr, c);
2653                               }
2654                     }
2655           }
2656 
2657           for (c = 0; c < raidPtr->numSpare ; c++) {
2658                     sparecol = raidPtr->numCol + c;
2659 
2660                     if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2661                               /*
2662 
2663                                  we claim this disk is "optimal" if it's
2664                                  rf_ds_used_spare, as that means it should be
2665                                  directly substitutable for the disk it replaced.
2666                                  We note that too...
2667 
2668                                */
2669 
2670                               for(j=0;j<raidPtr->numCol;j++) {
2671                                         if (raidPtr->Disks[j].spareCol == sparecol) {
2672                                                   scol = j;
2673                                                   break;
2674                                         }
2675                               }
2676 
2677                               clabel = raidget_component_label(raidPtr, sparecol);
2678                               /* make sure status is noted */
2679 
2680                               raid_init_component_label(raidPtr, clabel);
2681 
2682                               clabel->row = 0;
2683                               clabel->column = scol;
2684                               /* Note: we *don't* change status from rf_ds_used_spare
2685                                  to rf_ds_optimal */
2686                               /* clabel.status = rf_ds_optimal; */
2687 
2688                               raidmarkdirty(raidPtr, sparecol);
2689                     }
2690           }
2691 }
2692 
2693 
2694 void
rf_update_component_labels(RF_Raid_t * raidPtr,int final)2695 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2696 {
2697           RF_ComponentLabel_t *clabel;
2698           int sparecol;
2699           int c;
2700           int j;
2701           int scol;
2702           struct raid_softc *rs = raidPtr->softc;
2703 
2704           scol = -1;
2705 
2706           /* XXX should do extra checks to make sure things really are clean,
2707              rather than blindly setting the clean bit... */
2708 
2709           raidPtr->mod_counter++;
2710 
2711           for (c = 0; c < raidPtr->numCol; c++) {
2712                     if (raidPtr->Disks[c].status == rf_ds_optimal) {
2713                               clabel = raidget_component_label(raidPtr, c);
2714                               /* make sure status is noted */
2715                               clabel->status = rf_ds_optimal;
2716 
2717                               /* note what unit we are configured as */
2718                               if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2719                                         clabel->last_unit = raidPtr->raidid;
2720 
2721                               raidflush_component_label(raidPtr, c);
2722                               if (final == RF_FINAL_COMPONENT_UPDATE) {
2723                                         if (raidPtr->parity_good == RF_RAID_CLEAN) {
2724                                                   raidmarkclean(raidPtr, c);
2725                                         }
2726                               }
2727                     }
2728                     /* else we don't touch it.. */
2729           }
2730 
2731           for (c = 0; c < raidPtr->numSpare ; c++) {
2732                     sparecol = raidPtr->numCol + c;
2733 
2734                     /* Need to ensure that the reconstruct actually completed! */
2735                     if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2736                               /*
2737 
2738                                  we claim this disk is "optimal" if it's
2739                                  rf_ds_used_spare, as that means it should be
2740                                  directly substitutable for the disk it replaced.
2741                                  We note that too...
2742 
2743                                */
2744 
2745                               for(j=0;j<raidPtr->numCol;j++) {
2746                                         if (raidPtr->Disks[j].spareCol == sparecol) {
2747                                                   scol = j;
2748                                                   break;
2749                                         }
2750                               }
2751 
2752                               /* XXX shouldn't *really* need this... */
2753                               clabel = raidget_component_label(raidPtr, sparecol);
2754                               /* make sure status is noted */
2755 
2756                               raid_init_component_label(raidPtr, clabel);
2757 
2758                               clabel->column = scol;
2759                               clabel->status = rf_ds_optimal;
2760                               if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2761                                         clabel->last_unit = raidPtr->raidid;
2762 
2763                               raidflush_component_label(raidPtr, sparecol);
2764                               if (final == RF_FINAL_COMPONENT_UPDATE) {
2765                                         if (raidPtr->parity_good == RF_RAID_CLEAN) {
2766                                                   raidmarkclean(raidPtr, sparecol);
2767                                         }
2768                               }
2769                     }
2770           }
2771 }
2772 
2773 void
rf_close_component(RF_Raid_t * raidPtr,struct vnode * vp,int auto_configured)2774 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2775 {
2776 
2777           if (vp != NULL) {
2778                     if (auto_configured == 1) {
2779                               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2780                               VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2781                               vput(vp);
2782 
2783                     } else {
2784                               (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2785                     }
2786           }
2787 }
2788 
2789 
2790 void
rf_UnconfigureVnodes(RF_Raid_t * raidPtr)2791 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2792 {
2793           int r,c;
2794           struct vnode *vp;
2795           int acd;
2796 
2797 
2798           /* We take this opportunity to close the vnodes like we should.. */
2799 
2800           for (c = 0; c < raidPtr->numCol; c++) {
2801                     vp = raidPtr->raid_cinfo[c].ci_vp;
2802                     acd = raidPtr->Disks[c].auto_configured;
2803                     rf_close_component(raidPtr, vp, acd);
2804                     raidPtr->raid_cinfo[c].ci_vp = NULL;
2805                     raidPtr->Disks[c].auto_configured = 0;
2806           }
2807 
2808           for (r = 0; r < raidPtr->numSpare; r++) {
2809                     vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2810                     acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2811                     rf_close_component(raidPtr, vp, acd);
2812                     raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2813                     raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2814           }
2815 }
2816 
2817 
2818 static void
rf_ReconThread(struct rf_recon_req_internal * req)2819 rf_ReconThread(struct rf_recon_req_internal *req)
2820 {
2821           int     s;
2822           RF_Raid_t *raidPtr;
2823 
2824           s = splbio();
2825           raidPtr = (RF_Raid_t *) req->raidPtr;
2826           raidPtr->recon_in_progress = 1;
2827 
2828           if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2829                     raidPtr->forceRecon = 1;
2830           }
2831 
2832           rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2833                         ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2834 
2835           if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2836                     raidPtr->forceRecon = 0;
2837           }
2838 
2839           RF_Free(req, sizeof(*req));
2840 
2841           raidPtr->recon_in_progress = 0;
2842           splx(s);
2843 
2844           /* That's all... */
2845           kthread_exit(0);    /* does not return */
2846 }
2847 
2848 static void
rf_RewriteParityThread(RF_Raid_t * raidPtr)2849 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2850 {
2851           int retcode;
2852           int s;
2853 
2854           raidPtr->parity_rewrite_stripes_done = 0;
2855           raidPtr->parity_rewrite_in_progress = 1;
2856           s = splbio();
2857           retcode = rf_RewriteParity(raidPtr);
2858           splx(s);
2859           if (retcode) {
2860                     printf("raid%d: Error re-writing parity (%d)!\n",
2861                         raidPtr->raidid, retcode);
2862           } else {
2863                     /* set the clean bit!  If we shutdown correctly,
2864                        the clean bit on each component label will get
2865                        set */
2866                     raidPtr->parity_good = RF_RAID_CLEAN;
2867           }
2868           raidPtr->parity_rewrite_in_progress = 0;
2869 
2870           /* Anyone waiting for us to stop?  If so, inform them... */
2871           if (raidPtr->waitShutdown) {
2872                     rf_lock_mutex2(raidPtr->rad_lock);
2873                     cv_broadcast(&raidPtr->parity_rewrite_cv);
2874                     rf_unlock_mutex2(raidPtr->rad_lock);
2875           }
2876 
2877           /* That's all... */
2878           kthread_exit(0);    /* does not return */
2879 }
2880 
2881 static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal * req)2882 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2883 {
2884           int s;
2885           RF_Raid_t *raidPtr;
2886 
2887           s = splbio();
2888           raidPtr = req->raidPtr;
2889           raidPtr->recon_in_progress = 1;
2890 
2891           if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2892                     raidPtr->forceRecon = 1;
2893           }
2894 
2895           rf_ReconstructInPlace(raidPtr, req->col);
2896 
2897           if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2898                     raidPtr->forceRecon = 0;
2899           }
2900 
2901           RF_Free(req, sizeof(*req));
2902           raidPtr->recon_in_progress = 0;
2903           splx(s);
2904 
2905           /* That's all... */
2906           kthread_exit(0);    /* does not return */
2907 }
2908 
2909 static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t * ac_list,dev_t dev,struct vnode * vp,const char * cname,RF_SectorCount_t size,uint64_t numsecs,unsigned secsize)2910 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2911     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2912     unsigned secsize)
2913 {
2914           int good_one = 0;
2915           RF_ComponentLabel_t *clabel;
2916           RF_AutoConfig_t *ac;
2917 
2918           clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
2919 
2920           if (!raidread_component_label(secsize, dev, vp, clabel)) {
2921                     /* Got the label.  Does it look reasonable? */
2922                     if (rf_reasonable_label(clabel, numsecs) &&
2923                         (rf_component_label_partitionsize(clabel) <= size)) {
2924 #ifdef DEBUG
2925                               printf("Component on: %s: %llu\n",
2926                                         cname, (unsigned long long)size);
2927                               rf_print_component_label(clabel);
2928 #endif
2929                               /* if it's reasonable, add it, else ignore it. */
2930                               ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2931                                         M_WAITOK);
2932                               strlcpy(ac->devname, cname, sizeof(ac->devname));
2933                               ac->dev = dev;
2934                               ac->vp = vp;
2935                               ac->clabel = clabel;
2936                               ac->next = ac_list;
2937                               ac_list = ac;
2938                               good_one = 1;
2939                     }
2940           }
2941           if (!good_one) {
2942                     /* cleanup */
2943                     free(clabel, M_RAIDFRAME);
2944                     vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2945                     VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2946                     vput(vp);
2947           }
2948           return ac_list;
2949 }
2950 
2951 static RF_AutoConfig_t *
rf_find_raid_components(void)2952 rf_find_raid_components(void)
2953 {
2954           struct vnode *vp;
2955           struct disklabel label;
2956           device_t dv;
2957           deviter_t di;
2958           dev_t dev;
2959           int bmajor, bminor, wedge, rf_part_found;
2960           int error;
2961           int i;
2962           RF_AutoConfig_t *ac_list;
2963           uint64_t numsecs;
2964           unsigned secsize;
2965           int dowedges;
2966 
2967           /* initialize the AutoConfig list */
2968           ac_list = NULL;
2969 
2970           /*
2971            * we begin by trolling through *all* the devices on the system *twice*
2972            * first we scan for wedges, second for other devices. This avoids
2973            * using a raw partition instead of a wedge that covers the whole disk
2974            */
2975 
2976           for (dowedges=1; dowedges>=0; --dowedges) {
2977                     for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2978                          dv = deviter_next(&di)) {
2979 
2980                               /* we are only interested in disks */
2981                               if (device_class(dv) != DV_DISK)
2982                                         continue;
2983 
2984                               /* we don't care about floppies */
2985                               if (device_is_a(dv, "fd")) {
2986                                         continue;
2987                               }
2988 
2989                               /* we don't care about CDs. */
2990                               if (device_is_a(dv, "cd")) {
2991                                         continue;
2992                               }
2993 
2994                               /* we don't care about md. */
2995                               if (device_is_a(dv, "md")) {
2996                                         continue;
2997                               }
2998 
2999                               /* hdfd is the Atari/Hades floppy driver */
3000                               if (device_is_a(dv, "hdfd")) {
3001                                         continue;
3002                               }
3003 
3004                               /* fdisa is the Atari/Milan floppy driver */
3005                               if (device_is_a(dv, "fdisa")) {
3006                                         continue;
3007                               }
3008 
3009                               /* we don't care about spiflash */
3010                               if (device_is_a(dv, "spiflash")) {
3011                                         continue;
3012                               }
3013 
3014                               /* are we in the wedges pass ? */
3015                               wedge = device_is_a(dv, "dk");
3016                               if (wedge != dowedges) {
3017                                         continue;
3018                               }
3019 
3020                               /* need to find the device_name_to_block_device_major stuff */
3021                               bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
3022 
3023                               rf_part_found = 0; /*No raid partition as yet*/
3024 
3025                               /* get a vnode for the raw partition of this disk */
3026                               bminor = minor(device_unit(dv));
3027                               dev = wedge ? makedev(bmajor, bminor) :
3028                                   MAKEDISKDEV(bmajor, bminor, RAW_PART);
3029                               if (bdevvp(dev, &vp))
3030                                         panic("RAID can't alloc vnode");
3031 
3032                               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3033                               error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
3034 
3035                               if (error) {
3036                                         /* "Who cares."  Continue looking
3037                                            for something that exists*/
3038                                         vput(vp);
3039                                         continue;
3040                               }
3041 
3042                               error = getdisksize(vp, &numsecs, &secsize);
3043                               if (error) {
3044                                         /*
3045                                          * Pseudo devices like vnd and cgd can be
3046                                          * opened but may still need some configuration.
3047                                          * Ignore these quietly.
3048                                          */
3049                                         if (error != ENXIO)
3050                                                   printf("RAIDframe: can't get disk size"
3051                                                       " for dev %s (%d)\n",
3052                                                       device_xname(dv), error);
3053                                         VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3054                                         vput(vp);
3055                                         continue;
3056                               }
3057                               if (wedge) {
3058                                         struct dkwedge_info dkw;
3059                                         error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
3060                                             NOCRED);
3061                                         if (error) {
3062                                                   printf("RAIDframe: can't get wedge info for "
3063                                                       "dev %s (%d)\n", device_xname(dv), error);
3064                                                   VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3065                                                   vput(vp);
3066                                                   continue;
3067                                         }
3068 
3069                                         if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
3070                                                   VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3071                                                   vput(vp);
3072                                                   continue;
3073                                         }
3074 
3075                                         VOP_UNLOCK(vp);
3076                                         ac_list = rf_get_component(ac_list, dev, vp,
3077                                             device_xname(dv), dkw.dkw_size, numsecs, secsize);
3078                                         rf_part_found = 1; /*There is a raid component on this disk*/
3079                                         continue;
3080                               }
3081 
3082                               /* Ok, the disk exists.  Go get the disklabel. */
3083                               error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
3084                               if (error) {
3085                                         /*
3086                                          * XXX can't happen - open() would
3087                                          * have errored out (or faked up one)
3088                                          */
3089                                         if (error != ENOTTY)
3090                                                   printf("RAIDframe: can't get label for dev "
3091                                                       "%s (%d)\n", device_xname(dv), error);
3092                               }
3093 
3094                               /* don't need this any more.  We'll allocate it again
3095                                  a little later if we really do... */
3096                               VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3097                               vput(vp);
3098 
3099                               if (error)
3100                                         continue;
3101 
3102                               rf_part_found = 0; /*No raid partitions yet*/
3103                               for (i = 0; i < label.d_npartitions; i++) {
3104                                         char cname[sizeof(ac_list->devname)];
3105 
3106                                         /* We only support partitions marked as RAID */
3107                                         if (label.d_partitions[i].p_fstype != FS_RAID)
3108                                                   continue;
3109 
3110                                         dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
3111                                         if (bdevvp(dev, &vp))
3112                                                   panic("RAID can't alloc vnode");
3113 
3114                                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3115                                         error = VOP_OPEN(vp, FREAD, NOCRED);
3116                                         if (error) {
3117                                                   /* Not quite a 'whatever'.  In
3118                                                    * this situation we know
3119                                                    * there is a FS_RAID
3120                                                    * partition, but we can't
3121                                                    * open it.  The most likely
3122                                                    * reason is that the
3123                                                    * partition is already in
3124                                                    * use by another RAID set.
3125                                                    * So note that we've already
3126                                                    * found a partition on this
3127                                                    * disk so we don't attempt
3128                                                    * to use the raw disk later. */
3129                                                   rf_part_found = 1;
3130                                                   vput(vp);
3131                                                   continue;
3132                                         }
3133                                         VOP_UNLOCK(vp);
3134                                         snprintf(cname, sizeof(cname), "%s%c",
3135                                             device_xname(dv), 'a' + i);
3136                                         ac_list = rf_get_component(ac_list, dev, vp, cname,
3137                                                   label.d_partitions[i].p_size, numsecs, secsize);
3138                                         rf_part_found = 1; /*There is at least one raid partition on this disk*/
3139                               }
3140 
3141                               /*
3142                                *If there is no raid component on this disk, either in a
3143                                *disklabel or inside a wedge, check the raw partition as well,
3144                                *as it is possible to configure raid components on raw disk
3145                                *devices.
3146                                */
3147 
3148                               if (!rf_part_found) {
3149                                         char cname[sizeof(ac_list->devname)];
3150 
3151                                         dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
3152                                         if (bdevvp(dev, &vp))
3153                                                   panic("RAID can't alloc vnode");
3154 
3155                                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3156 
3157                                         error = VOP_OPEN(vp, FREAD, NOCRED);
3158                                         if (error) {
3159                                                   /* Whatever... */
3160                                                   vput(vp);
3161                                                   continue;
3162                                         }
3163                                         VOP_UNLOCK(vp);
3164                                         snprintf(cname, sizeof(cname), "%s%c",
3165                                             device_xname(dv), 'a' + RAW_PART);
3166                                         ac_list = rf_get_component(ac_list, dev, vp, cname,
3167                                                   label.d_partitions[RAW_PART].p_size, numsecs, secsize);
3168                               }
3169                     }
3170                     deviter_release(&di);
3171           }
3172           return ac_list;
3173 }
3174 
3175 int
rf_reasonable_label(RF_ComponentLabel_t * clabel,uint64_t numsecs)3176 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3177 {
3178 
3179           if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3180                clabel->version==RF_COMPONENT_LABEL_VERSION ||
3181                clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3182               (clabel->clean == RF_RAID_CLEAN ||
3183                clabel->clean == RF_RAID_DIRTY) &&
3184               clabel->row >=0 &&
3185               clabel->column >= 0 &&
3186               clabel->num_rows > 0 &&
3187               clabel->num_columns > 0 &&
3188               clabel->row < clabel->num_rows &&
3189               clabel->column < clabel->num_columns &&
3190               clabel->blockSize > 0 &&
3191               /*
3192                * numBlocksHi may contain garbage, but it is ok since
3193                * the type is unsigned.  If it is really garbage,
3194                * rf_fix_old_label_size() will fix it.
3195                */
3196               rf_component_label_numblocks(clabel) > 0) {
3197                     /*
3198                      * label looks reasonable enough...
3199                      * let's make sure it has no old garbage.
3200                      */
3201                     if (numsecs)
3202                               rf_fix_old_label_size(clabel, numsecs);
3203                     return(1);
3204           }
3205           return(0);
3206 }
3207 
3208 
3209 /*
3210  * For reasons yet unknown, some old component labels have garbage in
3211  * the newer numBlocksHi region, and this causes lossage.  Since those
3212  * disks will also have numsecs set to less than 32 bits of sectors,
3213  * we can determine when this corruption has occurred, and fix it.
3214  *
3215  * The exact same problem, with the same unknown reason, happens to
3216  * the partitionSizeHi member as well.
3217  */
3218 static void
rf_fix_old_label_size(RF_ComponentLabel_t * clabel,uint64_t numsecs)3219 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3220 {
3221 
3222           if (numsecs < ((uint64_t)1 << 32)) {
3223                     if (clabel->numBlocksHi) {
3224                               printf("WARNING: total sectors < 32 bits, yet "
3225                                      "numBlocksHi set\n"
3226                                      "WARNING: resetting numBlocksHi to zero.\n");
3227                               clabel->numBlocksHi = 0;
3228                     }
3229 
3230                     if (clabel->partitionSizeHi) {
3231                               printf("WARNING: total sectors < 32 bits, yet "
3232                                      "partitionSizeHi set\n"
3233                                      "WARNING: resetting partitionSizeHi to zero.\n");
3234                               clabel->partitionSizeHi = 0;
3235                     }
3236           }
3237 }
3238 
3239 
#ifdef DEBUG
/*
 * Dump the fields of a component label to the console (DEBUG kernels
 * only).  Purely informational; the label is not modified.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* Indexed by the low two bits of root_partition. */
	static const char *root_mode[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	const char *clean_str;
	const char *autoconf_str;
	uint64_t nblocks;

	nblocks = rf_component_label_numblocks(clabel);
	clean_str = clabel->clean ? "Yes" : "No";
	autoconf_str = clabel->autoconfigure ? "Yes" : "No";

	/* Geometry and identity. */
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number,
	    clabel->mod_counter);
	printf("   Clean: %s Status: %d\n", clean_str, clabel->status);

	/* Layout. */
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, nblocks);

	/* Autoconfiguration state. */
	printf("   Autoconfig: %s\n", autoconf_str);
	printf("   Root partition: %s\n",
	    root_mode[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
3273 
3274 static RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t * ac_list)3275 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3276 {
3277           RF_AutoConfig_t *ac;
3278           RF_ConfigSet_t *config_sets;
3279           RF_ConfigSet_t *cset;
3280           RF_AutoConfig_t *ac_next;
3281 
3282 
3283           config_sets = NULL;
3284 
3285           /* Go through the AutoConfig list, and figure out which components
3286              belong to what sets.  */
3287           ac = ac_list;
3288           while(ac!=NULL) {
3289                     /* we're going to putz with ac->next, so save it here
3290                        for use at the end of the loop */
3291                     ac_next = ac->next;
3292 
3293                     if (config_sets == NULL) {
3294                               /* will need at least this one... */
3295                               config_sets = malloc(sizeof(RF_ConfigSet_t),
3296                                                M_RAIDFRAME, M_WAITOK);
3297                               /* this one is easy :) */
3298                               config_sets->ac = ac;
3299                               config_sets->next = NULL;
3300                               config_sets->rootable = 0;
3301                               ac->next = NULL;
3302                     } else {
3303                               /* which set does this component fit into? */
3304                               cset = config_sets;
3305                               while(cset!=NULL) {
3306                                         if (rf_does_it_fit(cset, ac)) {
3307                                                   /* looks like it matches... */
3308                                                   ac->next = cset->ac;
3309                                                   cset->ac = ac;
3310                                                   break;
3311                                         }
3312                                         cset = cset->next;
3313                               }
3314                               if (cset==NULL) {
3315                                         /* didn't find a match above... new set..*/
3316                                         cset = malloc(sizeof(RF_ConfigSet_t),
3317                                                          M_RAIDFRAME, M_WAITOK);
3318                                         cset->ac = ac;
3319                                         ac->next = NULL;
3320                                         cset->next = config_sets;
3321                                         cset->rootable = 0;
3322                                         config_sets = cset;
3323                               }
3324                     }
3325                     ac = ac_next;
3326           }
3327 
3328 
3329           return(config_sets);
3330 }
3331 
3332 static int
rf_does_it_fit(RF_ConfigSet_t * cset,RF_AutoConfig_t * ac)3333 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3334 {
3335           RF_ComponentLabel_t *clabel1, *clabel2;
3336 
3337           /* If this one matches the *first* one in the set, that's good
3338              enough, since the other members of the set would have been
3339              through here too... */
3340           /* note that we are not checking partitionSize here..
3341 
3342              Note that we are also not checking the mod_counters here.
3343              If everything else matches except the mod_counter, that's
3344              good enough for this test.  We will deal with the mod_counters
3345              a little later in the autoconfiguration process.
3346 
3347               (clabel1->mod_counter == clabel2->mod_counter) &&
3348 
3349              The reason we don't check for this is that failed disks
3350              will have lower modification counts.  If those disks are
3351              not added to the set they used to belong to, then they will
3352              form their own set, which may result in 2 different sets,
3353              for example, competing to be configured at raid0, and
3354              perhaps competing to be the root filesystem set.  If the
3355              wrong ones get configured, or both attempt to become /,
3356              weird behaviour and or serious lossage will occur.  Thus we
3357              need to bring them into the fold here, and kick them out at
3358              a later point.
3359 
3360           */
3361 
3362           clabel1 = cset->ac->clabel;
3363           clabel2 = ac->clabel;
3364           if ((clabel1->version == clabel2->version) &&
3365               (clabel1->serial_number == clabel2->serial_number) &&
3366               (clabel1->num_rows == clabel2->num_rows) &&
3367               (clabel1->num_columns == clabel2->num_columns) &&
3368               (clabel1->sectPerSU == clabel2->sectPerSU) &&
3369               (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3370               (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3371               (clabel1->parityConfig == clabel2->parityConfig) &&
3372               (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3373               (clabel1->blockSize == clabel2->blockSize) &&
3374               rf_component_label_numblocks(clabel1) ==
3375               rf_component_label_numblocks(clabel2) &&
3376               (clabel1->autoconfigure == clabel2->autoconfigure) &&
3377               (clabel1->root_partition == clabel2->root_partition) &&
3378               (clabel1->last_unit == clabel2->last_unit) &&
3379               (clabel1->config_order == clabel2->config_order)) {
3380                     /* if it get's here, it almost *has* to be a match */
3381           } else {
3382                     /* it's not consistent with somebody in the set..
3383                        punt */
3384                     return(0);
3385           }
3386           /* all was fine.. it must fit... */
3387           return(1);
3388 }
3389 
3390 static int
rf_have_enough_components(RF_ConfigSet_t * cset)3391 rf_have_enough_components(RF_ConfigSet_t *cset)
3392 {
3393           RF_AutoConfig_t *ac;
3394           RF_AutoConfig_t *auto_config;
3395           RF_ComponentLabel_t *clabel;
3396           int c;
3397           int num_cols;
3398           int num_missing;
3399           int mod_counter;
3400           int mod_counter_found;
3401           int even_pair_failed;
3402           char parity_type;
3403 
3404 
3405           /* check to see that we have enough 'live' components
3406              of this set.  If so, we can configure it if necessary */
3407 
3408           num_cols = cset->ac->clabel->num_columns;
3409           parity_type = cset->ac->clabel->parityConfig;
3410 
3411           /* XXX Check for duplicate components!?!?!? */
3412 
3413           /* Determine what the mod_counter is supposed to be for this set. */
3414 
3415           mod_counter_found = 0;
3416           mod_counter = 0;
3417           ac = cset->ac;
3418           while(ac!=NULL) {
3419                     if (mod_counter_found==0) {
3420                               mod_counter = ac->clabel->mod_counter;
3421                               mod_counter_found = 1;
3422                     } else {
3423                               if (ac->clabel->mod_counter > mod_counter) {
3424                                         mod_counter = ac->clabel->mod_counter;
3425                               }
3426                     }
3427                     ac = ac->next;
3428           }
3429 
3430           num_missing = 0;
3431           auto_config = cset->ac;
3432 
3433           even_pair_failed = 0;
3434           for(c=0; c<num_cols; c++) {
3435                     ac = auto_config;
3436                     while(ac!=NULL) {
3437                               if ((ac->clabel->column == c) &&
3438                                   (ac->clabel->mod_counter == mod_counter)) {
3439                                         /* it's this one... */
3440 #ifdef DEBUG
3441                                         printf("Found: %s at %d\n",
3442                                                ac->devname,c);
3443 #endif
3444                                         break;
3445                               }
3446                               ac=ac->next;
3447                     }
3448                     if (ac==NULL) {
3449                                         /* Didn't find one here! */
3450                                         /* special case for RAID 1, especially
3451                                            where there are more than 2
3452                                            components (where RAIDframe treats
3453                                            things a little differently :( ) */
3454                               if (parity_type == '1') {
3455                                         if (c%2 == 0) { /* even component */
3456                                                   even_pair_failed = 1;
3457                                         } else { /* odd component.  If
3458                                                       we're failed, and
3459                                                       so is the even
3460                                                       component, it's
3461                                                       "Good Night, Charlie" */
3462                                                   if (even_pair_failed == 1) {
3463                                                             return(0);
3464                                                   }
3465                                         }
3466                               } else {
3467                                         /* normal accounting */
3468                                         num_missing++;
3469                               }
3470                     }
3471                     if ((parity_type == '1') && (c%2 == 1)) {
3472                                         /* Just did an even component, and we didn't
3473                                            bail.. reset the even_pair_failed flag,
3474                                            and go on to the next component.... */
3475                               even_pair_failed = 0;
3476                     }
3477           }
3478 
3479           clabel = cset->ac->clabel;
3480 
3481           if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3482               ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3483               ((clabel->parityConfig == '5') && (num_missing > 1))) {
3484                     /* XXX this needs to be made *much* more general */
3485                     /* Too many failures */
3486                     return(0);
3487           }
3488           /* otherwise, all is well, and we've got enough to take a kick
3489              at autoconfiguring this set */
3490           return(1);
3491 }
3492 
3493 static void
rf_create_configuration(RF_AutoConfig_t * ac,RF_Config_t * config,RF_Raid_t * raidPtr)3494 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3495                               RF_Raid_t *raidPtr)
3496 {
3497           RF_ComponentLabel_t *clabel;
3498           int i;
3499 
3500           clabel = ac->clabel;
3501 
3502           /* 1. Fill in the common stuff */
3503           config->numCol = clabel->num_columns;
3504           config->numSpare = 0; /* XXX should this be set here? */
3505           config->sectPerSU = clabel->sectPerSU;
3506           config->SUsPerPU = clabel->SUsPerPU;
3507           config->SUsPerRU = clabel->SUsPerRU;
3508           config->parityConfig = clabel->parityConfig;
3509           /* XXX... */
3510           strcpy(config->diskQueueType,"fifo");
3511           config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3512           config->layoutSpecificSize = 0; /* XXX ?? */
3513 
3514           while(ac!=NULL) {
3515                     /* row/col values will be in range due to the checks
3516                        in reasonable_label() */
3517                     strcpy(config->devnames[0][ac->clabel->column],
3518                            ac->devname);
3519                     ac = ac->next;
3520           }
3521 
3522           for(i=0;i<RF_MAXDBGV;i++) {
3523                     config->debugVars[i][0] = 0;
3524           }
3525 }
3526 
3527 static int
rf_set_autoconfig(RF_Raid_t * raidPtr,int new_value)3528 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3529 {
3530           RF_ComponentLabel_t *clabel;
3531           int column;
3532           int sparecol;
3533 
3534           raidPtr->autoconfigure = new_value;
3535 
3536           for(column=0; column<raidPtr->numCol; column++) {
3537                     if (raidPtr->Disks[column].status == rf_ds_optimal) {
3538                               clabel = raidget_component_label(raidPtr, column);
3539                               clabel->autoconfigure = new_value;
3540                               raidflush_component_label(raidPtr, column);
3541                     }
3542           }
3543           for(column = 0; column < raidPtr->numSpare ; column++) {
3544                     sparecol = raidPtr->numCol + column;
3545 
3546                     if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3547                               clabel = raidget_component_label(raidPtr, sparecol);
3548                               clabel->autoconfigure = new_value;
3549                               raidflush_component_label(raidPtr, sparecol);
3550                     }
3551           }
3552           return(new_value);
3553 }
3554 
3555 static int
rf_set_rootpartition(RF_Raid_t * raidPtr,int new_value)3556 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3557 {
3558           RF_ComponentLabel_t *clabel;
3559           int column;
3560           int sparecol;
3561 
3562           raidPtr->root_partition = new_value;
3563           for(column=0; column<raidPtr->numCol; column++) {
3564                     if (raidPtr->Disks[column].status == rf_ds_optimal) {
3565                               clabel = raidget_component_label(raidPtr, column);
3566                               clabel->root_partition = new_value;
3567                               raidflush_component_label(raidPtr, column);
3568                     }
3569           }
3570           for (column = 0; column < raidPtr->numSpare ; column++) {
3571                     sparecol = raidPtr->numCol + column;
3572 
3573                     if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3574                               clabel = raidget_component_label(raidPtr, sparecol);
3575                               clabel->root_partition = new_value;
3576                               raidflush_component_label(raidPtr, sparecol);
3577                     }
3578           }
3579           return(new_value);
3580 }
3581 
3582 static void
rf_release_all_vps(RF_ConfigSet_t * cset)3583 rf_release_all_vps(RF_ConfigSet_t *cset)
3584 {
3585           RF_AutoConfig_t *ac;
3586 
3587           ac = cset->ac;
3588           while(ac!=NULL) {
3589                     /* Close the vp, and give it back */
3590                     if (ac->vp) {
3591                               vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3592                               VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3593                               vput(ac->vp);
3594                               ac->vp = NULL;
3595                     }
3596                     ac = ac->next;
3597           }
3598 }
3599 
3600 
3601 static void
rf_cleanup_config_set(RF_ConfigSet_t * cset)3602 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3603 {
3604           RF_AutoConfig_t *ac;
3605           RF_AutoConfig_t *next_ac;
3606 
3607           ac = cset->ac;
3608           while(ac!=NULL) {
3609                     next_ac = ac->next;
3610                     /* nuke the label */
3611                     free(ac->clabel, M_RAIDFRAME);
3612                     /* cleanup the config structure */
3613                     free(ac, M_RAIDFRAME);
3614                     /* "next.." */
3615                     ac = next_ac;
3616           }
3617           /* and, finally, nuke the config set */
3618           free(cset, M_RAIDFRAME);
3619 }
3620 
3621 
3622 void
raid_init_component_label(RF_Raid_t * raidPtr,RF_ComponentLabel_t * clabel)3623 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3624 {
3625           /* avoid over-writing byteswapped version. */
3626           if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
3627                     clabel->version = RF_COMPONENT_LABEL_VERSION;
3628           clabel->serial_number = raidPtr->serial_number;
3629           clabel->mod_counter = raidPtr->mod_counter;
3630 
3631           clabel->num_rows = 1;
3632           clabel->num_columns = raidPtr->numCol;
3633           clabel->clean = RF_RAID_DIRTY; /* not clean */
3634           clabel->status = rf_ds_optimal; /* "It's good!" */
3635 
3636           clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3637           clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3638           clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3639 
3640           clabel->blockSize = raidPtr->bytesPerSector;
3641           rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3642 
3643           /* XXX not portable */
3644           clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3645           clabel->maxOutstanding = raidPtr->maxOutstanding;
3646           clabel->autoconfigure = raidPtr->autoconfigure;
3647           clabel->root_partition = raidPtr->root_partition;
3648           clabel->last_unit = raidPtr->raidid;
3649           clabel->config_order = raidPtr->config_order;
3650 
3651 #ifndef RF_NO_PARITY_MAP
3652           rf_paritymap_init_label(raidPtr->parity_map, clabel);
3653 #endif
3654 }
3655 
3656 static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t * cset)3657 rf_auto_config_set(RF_ConfigSet_t *cset)
3658 {
3659           RF_Raid_t *raidPtr;
3660           RF_Config_t *config;
3661           int raidID;
3662           struct raid_softc *sc;
3663 
3664 #ifdef DEBUG
3665           printf("RAID autoconfigure\n");
3666 #endif
3667 
3668           /* 1. Create a config structure */
3669           config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);
3670 
3671           /*
3672              2. Figure out what RAID ID this one is supposed to live at
3673              See if we can get the same RAID dev that it was configured
3674              on last time..
3675           */
3676 
3677           raidID = cset->ac->clabel->last_unit;
3678           for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3679                sc = raidget(++raidID, false))
3680                     continue;
3681 #ifdef DEBUG
3682           printf("Configuring raid%d:\n",raidID);
3683 #endif
3684 
3685           if (sc == NULL)
3686                     sc = raidget(raidID, true);
3687           raidPtr = &sc->sc_r;
3688 
3689           /* XXX all this stuff should be done SOMEWHERE ELSE! */
3690           raidPtr->softc = sc;
3691           raidPtr->raidid = raidID;
3692           raidPtr->openings = RAIDOUTSTANDING;
3693 
3694           /* 3. Build the configuration structure */
3695           rf_create_configuration(cset->ac, config, raidPtr);
3696 
3697           /* 4. Do the configuration */
3698           if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3699                     raidinit(sc);
3700 
3701                     rf_markalldirty(raidPtr);
3702                     raidPtr->autoconfigure = 1; /* XXX do this here? */
3703                     switch (cset->ac->clabel->root_partition) {
3704                     case 1:   /* Force Root */
3705                     case 2:   /* Soft Root: root when boot partition part of raid */
3706                               /*
3707                                * everything configured just fine.  Make a note
3708                                * that this set is eligible to be root,
3709                                * or forced to be root
3710                                */
3711                               cset->rootable = cset->ac->clabel->root_partition;
3712                               /* XXX do this here? */
3713                               raidPtr->root_partition = cset->rootable;
3714                               break;
3715                     default:
3716                               break;
3717                     }
3718           } else {
3719                     raidput(sc);
3720                     sc = NULL;
3721           }
3722 
3723           /* 5. Cleanup */
3724           free(config, M_RAIDFRAME);
3725           return sc;
3726 }
3727 
3728 void
rf_pool_init(RF_Raid_t * raidPtr,char * w_chan,struct pool * p,size_t size,const char * pool_name,size_t xmin,size_t xmax)3729 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
3730                size_t xmin, size_t xmax)
3731 {
3732 
3733           /* Format: raid%d_foo */
3734           snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
3735 
3736           pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3737           pool_sethiwat(p, xmax);
3738           pool_prime(p, xmin);
3739 }
3740 
3741 
3742 /*
3743  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3744  * to see if there is IO pending and if that IO could possibly be done
3745  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3746  * otherwise.
3747  *
3748  */
3749 int
rf_buf_queue_check(RF_Raid_t * raidPtr)3750 rf_buf_queue_check(RF_Raid_t *raidPtr)
3751 {
3752           struct raid_softc *rs;
3753           struct dk_softc *dksc;
3754 
3755           rs = raidPtr->softc;
3756           dksc = &rs->sc_dksc;
3757 
3758           if ((rs->sc_flags & RAIDF_INITED) == 0)
3759                     return 1;
3760 
3761           if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3762                     /* there is work to do */
3763                     return 0;
3764           }
3765           /* default is nothing to do */
3766           return 1;
3767 }
3768 
3769 int
rf_getdisksize(struct vnode * vp,RF_RaidDisk_t * diskPtr)3770 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3771 {
3772           uint64_t numsecs;
3773           unsigned secsize;
3774           int error;
3775 
3776           error = getdisksize(vp, &numsecs, &secsize);
3777           if (error == 0) {
3778                     diskPtr->blockSize = secsize;
3779                     diskPtr->numBlocks = numsecs - rf_protectedSectors;
3780                     diskPtr->partitionSize = numsecs;
3781                     return 0;
3782           }
3783           return error;
3784 }
3785 
3786 static int
raid_match(device_t self,cfdata_t cfdata,void * aux)3787 raid_match(device_t self, cfdata_t cfdata, void *aux)
3788 {
3789           return 1;
3790 }
3791 
3792 static void
raid_attach(device_t parent,device_t self,void * aux)3793 raid_attach(device_t parent, device_t self, void *aux)
3794 {
3795 }
3796 
3797 
3798 static int
raid_detach(device_t self,int flags)3799 raid_detach(device_t self, int flags)
3800 {
3801           int error;
3802           struct raid_softc *rs = raidsoftc(self);
3803 
3804           if (rs == NULL)
3805                     return ENXIO;
3806 
3807           if ((error = raidlock(rs)) != 0)
3808                     return error;
3809 
3810           error = raid_detach_unlocked(rs);
3811 
3812           raidunlock(rs);
3813 
3814           /* XXX raid can be referenced here */
3815 
3816           if (error)
3817                     return error;
3818 
3819           /* Free the softc */
3820           raidput(rs);
3821 
3822           return 0;
3823 }
3824 
3825 static void
rf_set_geometry(struct raid_softc * rs,RF_Raid_t * raidPtr)3826 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3827 {
3828           struct dk_softc *dksc = &rs->sc_dksc;
3829           struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3830 
3831           memset(dg, 0, sizeof(*dg));
3832 
3833           dg->dg_secperunit = raidPtr->totalSectors;
3834           dg->dg_secsize = raidPtr->bytesPerSector;
3835           dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3836           dg->dg_ntracks = 4 * raidPtr->numCol;
3837 
3838           disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3839 }
3840 
3841 /*
3842  * Get cache info for all the components (including spares).
3843  * Returns intersection of all the cache flags of all disks, or first
3844  * error if any encountered.
3845  * XXXfua feature flags can change as spares are added - lock down somehow
3846  */
3847 static int
rf_get_component_caches(RF_Raid_t * raidPtr,int * data)3848 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3849 {
3850           int c;
3851           int error;
3852           int dkwhole = 0, dkpart;
3853 
3854           for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3855                     /*
3856                      * Check any non-dead disk, even when currently being
3857                      * reconstructed.
3858                      */
3859                     if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
3860                               error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3861                                   DIOCGCACHE, &dkpart, FREAD, NOCRED);
3862                               if (error) {
3863                                         if (error != ENODEV) {
3864                                                   printf("raid%d: get cache for component %s failed\n",
3865                                                       raidPtr->raidid,
3866                                                       raidPtr->Disks[c].devname);
3867                                         }
3868 
3869                                         return error;
3870                               }
3871 
3872                               if (c == 0)
3873                                         dkwhole = dkpart;
3874                               else
3875                                         dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3876                     }
3877           }
3878 
3879           *data = dkwhole;
3880 
3881           return 0;
3882 }
3883 
3884 /*
3885  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3886  * We end up returning whatever error was returned by the first cache flush
3887  * that fails.
3888  */
3889 
3890 static int
rf_sync_component_cache(RF_Raid_t * raidPtr,int c,int force)3891 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3892 {
3893           int e = 0;
3894           for (int i = 0; i < 5; i++) {
3895                     e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3896                         &force, FWRITE, NOCRED);
3897                     if (!e || e == ENODEV)
3898                               return e;
3899                     printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3900                         raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3901           }
3902           return e;
3903 }
3904 
3905 int
rf_sync_component_caches(RF_Raid_t * raidPtr,int force)3906 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3907 {
3908           int c, error;
3909 
3910           error = 0;
3911           for (c = 0; c < raidPtr->numCol; c++) {
3912                     if (raidPtr->Disks[c].status == rf_ds_optimal) {
3913                               int e = rf_sync_component_cache(raidPtr, c, force);
3914                               if (e && !error)
3915                                         error = e;
3916                     }
3917           }
3918 
3919           for (c = 0; c < raidPtr->numSpare ; c++) {
3920                     int sparecol = raidPtr->numCol + c;
3921 
3922                     /* Need to ensure that the reconstruct actually completed! */
3923                     if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3924                               int e = rf_sync_component_cache(raidPtr, sparecol,
3925                                   force);
3926                               if (e && !error)
3927                                         error = e;
3928                     }
3929           }
3930           return error;
3931 }
3932 
3933 /* Fill in info with the current status */
3934 void
rf_check_recon_status_ext(RF_Raid_t * raidPtr,RF_ProgressInfo_t * info)3935 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3936 {
3937 
3938           memset(info, 0, sizeof(*info));
3939 
3940           if (raidPtr->status != rf_rs_reconstructing) {
3941                     info->total = 100;
3942                     info->completed = 100;
3943           } else {
3944                     info->total = raidPtr->reconControl->numRUsTotal;
3945                     info->completed = raidPtr->reconControl->numRUsComplete;
3946           }
3947           info->remaining = info->total - info->completed;
3948 }
3949 
3950 /* Fill in info with the current status */
3951 void
rf_check_parityrewrite_status_ext(RF_Raid_t * raidPtr,RF_ProgressInfo_t * info)3952 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3953 {
3954 
3955           memset(info, 0, sizeof(*info));
3956 
3957           if (raidPtr->parity_rewrite_in_progress == 1) {
3958                     info->total = raidPtr->Layout.numStripe;
3959                     info->completed = raidPtr->parity_rewrite_stripes_done;
3960           } else {
3961                     info->completed = 100;
3962                     info->total = 100;
3963           }
3964           info->remaining = info->total - info->completed;
3965 }
3966 
3967 /* Fill in info with the current status */
3968 void
rf_check_copyback_status_ext(RF_Raid_t * raidPtr,RF_ProgressInfo_t * info)3969 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3970 {
3971 
3972           memset(info, 0, sizeof(*info));
3973           info->remaining = 0;
3974           info->completed = 100;
3975           info->total = 100;
3976 }
3977 
3978 /* Fill in config with the current info */
3979 int
rf_get_info(RF_Raid_t * raidPtr,RF_DeviceConfig_t * config)3980 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3981 {
3982           int       d, i, j;
3983 
3984           if (!raidPtr->valid)
3985                     return ENODEV;
3986           config->cols = raidPtr->numCol;
3987           config->ndevs = raidPtr->numCol;
3988           if (config->ndevs >= RF_MAX_DISKS)
3989                     return ENOMEM;
3990           config->nspares = raidPtr->numSpare;
3991           if (config->nspares >= RF_MAX_DISKS)
3992                     return ENOMEM;
3993           config->maxqdepth = raidPtr->maxQueueDepth;
3994           d = 0;
3995           for (j = 0; j < config->cols; j++) {
3996                     config->devs[d] = raidPtr->Disks[j];
3997                     d++;
3998           }
3999           for (i = 0; i < config->nspares; i++) {
4000                     config->spares[i] = raidPtr->Disks[raidPtr->numCol + i];
4001                     if (config->spares[i].status == rf_ds_rebuilding_spare) {
4002                               /* raidctl(8) expects to see this as a used spare */
4003                               config->spares[i].status = rf_ds_used_spare;
4004                     }
4005           }
4006           return 0;
4007 }
4008 
4009 int
rf_get_component_label(RF_Raid_t * raidPtr,void * data)4010 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
4011 {
4012           RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
4013           RF_ComponentLabel_t *raid_clabel;
4014           int column = clabel->column;
4015 
4016           if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
4017                     return EINVAL;
4018           raid_clabel = raidget_component_label(raidPtr, column);
4019           memcpy(clabel, raid_clabel, sizeof *clabel);
4020           /* Fix-up for userland. */
4021           if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
4022                     clabel->version = RF_COMPONENT_LABEL_VERSION;
4023 
4024           return 0;
4025 }
4026 
4027 /*
4028  * Module interface
4029  */
4030 
4031 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
4032 
4033 #ifdef _MODULE
4034 CFDRIVER_DECL(raid, DV_DISK, NULL);
4035 #endif
4036 
4037 static int raid_modcmd(modcmd_t, void *);
4038 static int raid_modcmd_init(void);
4039 static int raid_modcmd_fini(void);
4040 
4041 static int
raid_modcmd(modcmd_t cmd,void * data)4042 raid_modcmd(modcmd_t cmd, void *data)
4043 {
4044           int error;
4045 
4046           error = 0;
4047           switch (cmd) {
4048           case MODULE_CMD_INIT:
4049                     error = raid_modcmd_init();
4050                     break;
4051           case MODULE_CMD_FINI:
4052                     error = raid_modcmd_fini();
4053                     break;
4054           default:
4055                     error = ENOTTY;
4056                     break;
4057           }
4058           return error;
4059 }
4060 
4061 static int
raid_modcmd_init(void)4062 raid_modcmd_init(void)
4063 {
4064           int error;
4065           int bmajor, cmajor;
4066 
4067           mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
4068           mutex_enter(&raid_lock);
4069 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
4070           rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
4071           rf_init_cond2(rf_sparet_wait_cv, "sparetw");
4072           rf_init_cond2(rf_sparet_resp_cv, "rfgst");
4073 
4074           rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
4075 #endif
4076 
4077           bmajor = cmajor = -1;
4078           error = devsw_attach("raid", &raid_bdevsw, &bmajor,
4079               &raid_cdevsw, &cmajor);
4080           if (error != 0 && error != EEXIST) {
4081                     aprint_error("%s: devsw_attach failed %d\n", __func__, error);
4082                     mutex_exit(&raid_lock);
4083                     return error;
4084           }
4085 #ifdef _MODULE
4086           error = config_cfdriver_attach(&raid_cd);
4087           if (error != 0) {
4088                     aprint_error("%s: config_cfdriver_attach failed %d\n",
4089                         __func__, error);
4090                     devsw_detach(&raid_bdevsw, &raid_cdevsw);
4091                     mutex_exit(&raid_lock);
4092                     return error;
4093           }
4094 #endif
4095           error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
4096           if (error != 0) {
4097                     aprint_error("%s: config_cfattach_attach failed %d\n",
4098                         __func__, error);
4099 #ifdef _MODULE
4100                     config_cfdriver_detach(&raid_cd);
4101 #endif
4102                     devsw_detach(&raid_bdevsw, &raid_cdevsw);
4103                     mutex_exit(&raid_lock);
4104                     return error;
4105           }
4106 
4107           raidautoconfigdone = false;
4108 
4109           mutex_exit(&raid_lock);
4110 
4111           if (error == 0) {
4112                     if (rf_BootRaidframe(true) == 0)
4113                               aprint_verbose("Kernelized RAIDframe activated\n");
4114                     else
4115                               panic("Serious error activating RAID!!");
4116           }
4117 
4118           /*
4119            * Register a finalizer which will be used to auto-config RAID
4120            * sets once all real hardware devices have been found.
4121            */
4122           error = config_finalize_register(NULL, rf_autoconfig);
4123           if (error != 0) {
4124                     aprint_error("WARNING: unable to register RAIDframe "
4125                         "finalizer\n");
4126                     error = 0;
4127           }
4128 
4129           return error;
4130 }
4131 
4132 static int
raid_modcmd_fini(void)4133 raid_modcmd_fini(void)
4134 {
4135           int error;
4136 
4137           mutex_enter(&raid_lock);
4138 
4139           /* Don't allow unload if raid device(s) exist.  */
4140           if (!LIST_EMPTY(&raids)) {
4141                     mutex_exit(&raid_lock);
4142                     return EBUSY;
4143           }
4144 
4145           error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
4146           if (error != 0) {
4147                     aprint_error("%s: cannot detach cfattach\n",__func__);
4148                     mutex_exit(&raid_lock);
4149                     return error;
4150           }
4151 #ifdef _MODULE
4152           error = config_cfdriver_detach(&raid_cd);
4153           if (error != 0) {
4154                     aprint_error("%s: cannot detach cfdriver\n",__func__);
4155                     config_cfattach_attach(raid_cd.cd_name, &raid_ca);
4156                     mutex_exit(&raid_lock);
4157                     return error;
4158           }
4159 #endif
4160           devsw_detach(&raid_bdevsw, &raid_cdevsw);
4161           rf_BootRaidframe(false);
4162 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
4163           rf_destroy_mutex2(rf_sparet_wait_mutex);
4164           rf_destroy_cond2(rf_sparet_wait_cv);
4165           rf_destroy_cond2(rf_sparet_resp_cv);
4166 #endif
4167           mutex_exit(&raid_lock);
4168           mutex_destroy(&raid_lock);
4169 
4170           return error;
4171 }
4172