1 /*	$OpenBSD: rf_raid5.c,v 1.4 2002/12/16 07:01:04 tdeval Exp $	*/
2 /*	$NetBSD: rf_raid5.c,v 1.4 2000/01/08 22:57:30 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Author: Mark Holland
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*****************************************************************************
32  *
33  * rf_raid5.c -- Implements RAID Level 5.
34  *
35  *****************************************************************************/
36 
37 #include "rf_types.h"
38 #include "rf_raid.h"
39 #include "rf_raid5.h"
40 #include "rf_dag.h"
41 #include "rf_dagffrd.h"
42 #include "rf_dagffwr.h"
43 #include "rf_dagdegrd.h"
44 #include "rf_dagdegwr.h"
45 #include "rf_dagutils.h"
46 #include "rf_general.h"
47 #include "rf_map.h"
48 #include "rf_utils.h"
49 
50 typedef struct RF_Raid5ConfigInfo_s {
51 	RF_RowCol_t **stripeIdentifier;	/*
52 					 * Filled in at config time and used
53 					 * by IdentifyStripe.
54 					 */
55 } RF_Raid5ConfigInfo_t;
56 
57 
58 int
rf_ConfigureRAID5(RF_ShutdownList_t ** listp,RF_Raid_t * raidPtr,RF_Config_t * cfgPtr)59 rf_ConfigureRAID5(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
60     RF_Config_t *cfgPtr)
61 {
62 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
63 	RF_Raid5ConfigInfo_t *info;
64 	RF_RowCol_t i, j, startdisk;
65 
66 	/* Create a RAID level 5 configuration structure. */
67 	RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t),
68 	    (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList);
69 	if (info == NULL)
70 		return (ENOMEM);
71 	layoutPtr->layoutSpecificInfo = (void *) info;
72 
73 	RF_ASSERT(raidPtr->numRow == 1);
74 
75 	/*
76 	 * The stripe identifier must identify the disks in each stripe, IN
77 	 * THE ORDER THAT THEY APPEAR IN THE STRIPE.
78 	 */
79 	info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol,
80 	    raidPtr->numCol, raidPtr->cleanupList);
81 	if (info->stripeIdentifier == NULL)
82 		return (ENOMEM);
83 	startdisk = 0;
84 	for (i = 0; i < raidPtr->numCol; i++) {
85 		for (j = 0; j < raidPtr->numCol; j++) {
86 			info->stripeIdentifier[i][j] = (startdisk + j) %
87 			    raidPtr->numCol;
88 		}
89 		if ((--startdisk) < 0)
90 			startdisk = raidPtr->numCol - 1;
91 	}
92 
93 	/* Fill in the remaining layout parameters. */
94 	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
95 	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
96 	    raidPtr->logBytesPerSector;
97 	layoutPtr->numDataCol = raidPtr->numCol - 1;
98 	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
99 	    layoutPtr->sectorsPerStripeUnit;
100 	layoutPtr->numParityCol = 1;
101 	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
102 
103 	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
104 	    layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
105 
106 	return (0);
107 }
108 
109 int
rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t * raidPtr)110 rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr)
111 {
112 	return (20);
113 }
114 
115 RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t * raidPtr)116 rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr)
117 {
118 	return (10);
119 }
120 
121 #if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL)
122 /* Not currently used. */
123 int
rf_ShutdownRAID5(RF_Raid_t * raidPtr)124 rf_ShutdownRAID5(RF_Raid_t *raidPtr)
125 {
126 	return (0);
127 }
128 #endif
129 
130 void
rf_MapSectorRAID5(RF_Raid_t * raidPtr,RF_RaidAddr_t raidSector,RF_RowCol_t * row,RF_RowCol_t * col,RF_SectorNum_t * diskSector,int remap)131 rf_MapSectorRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
132     RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
133 {
134 	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
135 	*row = 0;
136 	*col = (SUID % raidPtr->numCol);
137 	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
138 	    raidPtr->Layout.sectorsPerStripeUnit +
139 	    (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
140 }
141 
142 void
rf_MapParityRAID5(RF_Raid_t * raidPtr,RF_RaidAddr_t raidSector,RF_RowCol_t * row,RF_RowCol_t * col,RF_SectorNum_t * diskSector,int remap)143 rf_MapParityRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
144     RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
145 {
146 	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
147 
148 	*row = 0;
149 	*col = raidPtr->Layout.numDataCol -
150 	    (SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol;
151 	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
152 	    raidPtr->Layout.sectorsPerStripeUnit +
153 	    (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
154 }
155 
156 void
rf_IdentifyStripeRAID5(RF_Raid_t * raidPtr,RF_RaidAddr_t addr,RF_RowCol_t ** diskids,RF_RowCol_t * outRow)157 rf_IdentifyStripeRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
158     RF_RowCol_t **diskids, RF_RowCol_t *outRow)
159 {
160 	RF_StripeNum_t stripeID =
161 	    rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
162 	RF_Raid5ConfigInfo_t *info =
163 	    (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
164 
165 	*outRow = 0;
166 	*diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
167 }
168 
169 void
rf_MapSIDToPSIDRAID5(RF_RaidLayout_t * layoutPtr,RF_StripeNum_t stripeID,RF_StripeNum_t * psID,RF_ReconUnitNum_t * which_ru)170 rf_MapSIDToPSIDRAID5(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID,
171     RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru)
172 {
173 	*which_ru = 0;
174 	*psID = stripeID;
175 }
176 
177 
178 /*
179  * Select an algorithm for performing an access.  Returns two pointers,
180  * one to a function that will return information about the DAG, and
181  * another to a function that will create the dag.
182  */
183 void
rf_RaidFiveDagSelect(RF_Raid_t * raidPtr,RF_IoType_t type,RF_AccessStripeMap_t * asmap,RF_VoidFuncPtr * createFunc)184 rf_RaidFiveDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
185     RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc)
186 {
187 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
188 	RF_PhysDiskAddr_t *failedPDA = NULL;
189 	RF_RowCol_t frow, fcol;
190 	RF_RowStatus_t rstat;
191 	int prior_recon;
192 
193 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
194 
195 	if (asmap->numDataFailed + asmap->numParityFailed > 1) {
196 		RF_ERRORMSG("Multiple disks failed in a single group !"
197 		            "  Aborting I/O operation.\n");
198 		 /* *infoFunc = */ *createFunc = NULL;
199 		return;
200 	} else
201 		if (asmap->numDataFailed + asmap->numParityFailed == 1) {
202 
203 			/*
204 			 * If under recon & already reconstructed, redirect
205 			 * the access to the spare drive and eliminate the
206 			 * failure indication.
207 			 */
208 			failedPDA = asmap->failedPDAs[0];
209 			frow = failedPDA->row;
210 			fcol = failedPDA->col;
211 			rstat = raidPtr->status[failedPDA->row];
212 			prior_recon = (rstat == rf_rs_reconfigured) || (
213 			    (rstat == rf_rs_reconstructing) ?
214 			    rf_CheckRUReconstructed(raidPtr
215 			     ->reconControl[frow]->reconMap,
216 			     failedPDA->startSector) : 0);
217 			if (prior_recon) {
218 				RF_RowCol_t or = failedPDA->row;
219 				RF_RowCol_t oc = failedPDA->col;
220 				RF_SectorNum_t oo = failedPDA->startSector;
221 
222 				if (layoutPtr->map->flags &
223 				    RF_DISTRIBUTE_SPARE) {
224 					/* Redirect to dist spare space. */
225 
226 					if (failedPDA == asmap->parityInfo) {
227 
228 						/* Parity has failed. */
229 						(layoutPtr->map->MapParity)
230 						    (raidPtr,
231 						     failedPDA->raidAddress,
232 						     &failedPDA->row,
233 						     &failedPDA->col,
234 						     &failedPDA->startSector,
235 						     RF_REMAP);
236 
237 						if (asmap->parityInfo->next) {
238 							/*
239 							 * Redir 2nd component,
240 							 * if any.
241 							 */
242 							RF_PhysDiskAddr_t *p =
243 							    asmap
244 							     ->parityInfo->next;
245 							RF_SectorNum_t SUoffs =
246 							    p->startSector %
247 						layoutPtr->sectorsPerStripeUnit;
248 							p->row = failedPDA->row;
249 							p->col = failedPDA->col;
250 							/*
251 							 * Cheating:
252 							 * startSector is not
253 							 * really a RAID
254 							 * address.
255 							 */
256 							p->startSector =
257 					rf_RaidAddressOfPrevStripeUnitBoundary(
258 					    layoutPtr, failedPDA->startSector) +
259 							    SUoffs;
260 						}
261 					} else
262 						if (asmap->parityInfo->next &&
263 						    failedPDA ==
264 						    asmap->parityInfo->next) {
265 							/*
266 							 * Should never happen.
267 							 */
268 							RF_ASSERT(0);
269 						} else {
270 							/* Data has failed. */
271 							(layoutPtr->map
272 							 ->MapSector) (raidPtr,
273 							 failedPDA->raidAddress,
274 							    &failedPDA->row,
275 							    &failedPDA->col,
276 							&failedPDA->startSector,
277 							    RF_REMAP);
278 						}
279 
280 				} else {
281 					/* Redirect to dedicated spare space. */
282 
283 					failedPDA->row =
284 					    raidPtr->Disks[frow][fcol].spareRow;
285 					failedPDA->col =
286 					    raidPtr->Disks[frow][fcol].spareCol;
287 
288 					/*
289 					 * The parity may have two distinct
290 					 * components, both of which may need
291 					 * to be redirected.
292 					 */
293 					if (asmap->parityInfo->next) {
294 						if (failedPDA ==
295 						    asmap->parityInfo) {
296 							failedPDA->next->row =
297 							    failedPDA->row;
298 							failedPDA->next->col =
299 							    failedPDA->col;
300 						} else {
301 							if (failedPDA ==
302 							    asmap->parityInfo
303 							     ->next) {
304 								/*
305 								 * Paranoid:
306 								 * Should never
307 								 * occur.
308 								 */
309 								asmap
310 								 ->parityInfo
311 								 ->row =
312 								 failedPDA->row;
313 								asmap
314 								 ->parityInfo
315 								 ->col =
316 								 failedPDA->col;
317 							}
318 						}
319 					}
320 				}
321 
322 				RF_ASSERT(failedPDA->col != -1);
323 
324 				if (rf_dagDebug || rf_mapDebug) {
325 					printf("raid%d: Redirected type '%c'"
326 					       " r %d c %d o %ld -> r %d c %d"
327 					       " o %ld\n", raidPtr->raidid,
328 					       type, or, oc, (long) oo,
329 					       failedPDA->row, failedPDA->col,
330 					       (long) failedPDA->startSector);
331 				}
332 				asmap->numDataFailed = asmap->numParityFailed
333 				                     = 0;
334 			}
335 		}
336 	/*
337 	 * All DAGs begin/end with block/unblock node. Therefore, hdrSucc &
338 	 * termAnt counts should always be 1. Also, these counts should not be
339 	 * visible outside DAG creation routines - manipulating the counts
340 	 * here should be removed.
341 	 */
342 	if (type == RF_IO_TYPE_READ) {
343 		if (asmap->numDataFailed == 0)
344 			*createFunc = (RF_VoidFuncPtr)
345 			    rf_CreateFaultFreeReadDAG;
346 		else
347 			*createFunc = (RF_VoidFuncPtr)
348 			    rf_CreateRaidFiveDegradedReadDAG;
349 	} else {
350 		/*
351 		 * If mirroring, always use large writes. If the access
352 		 * requires two distinct parity updates, always do a small
353 		 * write. If the stripe contains a failure but the access
354 		 * does not, do a small write. The first conditional
355 		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
356 		 * less-than-or-equal rather than just a less-than because
357 		 * when G is 3 or 4, numDataCol/2 is 1, and I want
358 		 * single-stripe-unit updates to use just one disk.
359 		 */
360 		if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
361 			if (rf_suppressLocksAndLargeWrites ||
362 			    (((asmap->numStripeUnitsAccessed <=
363 			       (layoutPtr->numDataCol / 2)) &&
364 			      (layoutPtr->numDataCol != 1)) ||
365 			     (asmap->parityInfo->next != NULL) ||
366 			     rf_CheckStripeForFailures(raidPtr, asmap))) {
367 				*createFunc = (RF_VoidFuncPtr)
368 				    rf_CreateSmallWriteDAG;
369 			} else
370 				*createFunc = (RF_VoidFuncPtr)
371 				    rf_CreateLargeWriteDAG;
372 		} else {
373 			if (asmap->numParityFailed == 1)
374 				*createFunc = (RF_VoidFuncPtr)
375 				    rf_CreateNonRedundantWriteDAG;
376 			else
377 				if (asmap->numStripeUnitsAccessed != 1 &&
378 				    failedPDA->numSector !=
379 				    layoutPtr->sectorsPerStripeUnit)
380 					*createFunc = NULL;
381 				else
382 					*createFunc = (RF_VoidFuncPtr)
383 					    rf_CreateDegradedWriteDAG;
384 		}
385 	}
386 }
387