1 /*	$OpenBSD: rf_map.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $	*/
2 /*	$NetBSD: rf_map.c,v 1.5 2000/06/29 00:22:27 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Author: Mark Holland
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*****************************************************************************
32  *
33  * map.c -- Main code for mapping RAID addresses to physical disk addresses.
34  *
35  *****************************************************************************/
36 
37 #include "rf_types.h"
38 #include "rf_threadstuff.h"
39 #include "rf_raid.h"
40 #include "rf_general.h"
41 #include "rf_map.h"
42 #include "rf_freelist.h"
43 #include "rf_shutdown.h"
44 
45 void rf_FreePDAList(RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *, int);
46 void rf_FreeASMList(RF_AccessStripeMap_t *, RF_AccessStripeMap_t *, int);
47 
48 /*****************************************************************************
49  *
50  * MapAccess -- Main 1st order mapping routine.
51  *
52  * Maps an access in the RAID address space to the corresponding set of
53  * physical disk addresses. The result is returned as a list of
54  * AccessStripeMap structures, one per stripe accessed. Each ASM structure
55  * contains a pointer to a list of PhysDiskAddr structures, which describe
56  * the physical locations touched by the user access. Note that this routine
57  * returns only static mapping information, i.e. the list of physical
58  * addresses returned does not necessarily identify the set of physical
59  * locations that will actually be read or written.
60  *
61  * The routine also maps the parity. The physical disk location returned
62  * always indicates the entire parity unit, even when only a subset of it
63  * is being accessed. This is because an access that is not stripe unit
64  * aligned but that spans a stripe unit boundary may require access two
65  * distinct portions of the parity unit, and we can't yet tell which
66  * portion(s) we'll actually need. We leave it up to the algorithm
67  * selection code to decide what subset of the parity unit to access.
68  *
69  * Note that addresses in the RAID address space must always be maintained
70  * as longs, instead of ints.
71  *
72  * This routine returns NULL if numBlocks is 0.
73  *
74  *****************************************************************************/
75 
76 RF_AccessStripeMapHeader_t *
rf_MapAccess(RF_Raid_t * raidPtr,RF_RaidAddr_t raidAddress,RF_SectorCount_t numBlocks,caddr_t buffer,int remap)77 rf_MapAccess(
78 	RF_Raid_t	*raidPtr,
79 	RF_RaidAddr_t	 raidAddress,	/*
80 					 * Starting address in RAID address
81 					 * space.
82 					 */
83 	RF_SectorCount_t numBlocks,	/*
84 					 * Number of blocks in RAID address
85 					 * space to access.
86 					 */
87 	caddr_t		 buffer,	/* Buffer to supply/receive data. */
88 	int		 remap		/*
89 					 * 1 => remap addresses to spare space.
90 					 */
91 )
92 {
93 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
94 	RF_AccessStripeMapHeader_t *asm_hdr = NULL;
95 	RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL;
96 	int faultsTolerated = layoutPtr->map->faultsTolerated;
97 	/* We'll change raidAddress along the way. */
98 	RF_RaidAddr_t startAddress = raidAddress;
99 	RF_RaidAddr_t endAddress = raidAddress + numBlocks;
100 	RF_RaidDisk_t **disks = raidPtr->Disks;
101 
102 	RF_PhysDiskAddr_t *pda_p, *pda_q;
103 	RF_StripeCount_t numStripes = 0;
104 	RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress;
105 	RF_RaidAddr_t nextStripeUnitAddress;
106 	RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr;
107 	RF_StripeCount_t totStripes;
108 	RF_StripeNum_t stripeID, lastSID, SUID, lastSUID;
109 	RF_AccessStripeMap_t *asmList, *t_asm;
110 	RF_PhysDiskAddr_t *pdaList, *t_pda;
111 
112 	/* Allocate all the ASMs and PDAs up front. */
113 	lastRaidAddr = raidAddress + numBlocks - 1;
114 	stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress);
115 	lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr);
116 	totStripes = lastSID - stripeID + 1;
117 	SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress);
118 	lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr);
119 
120 	asmList = rf_AllocASMList(totStripes);
121 	pdaList = rf_AllocPDAList(lastSUID - SUID + 1 +
122 	    faultsTolerated * totStripes);	/*
123 						 * May also need pda(s)
124 						 * per stripe for parity.
125 						 */
126 
127 	if (raidAddress + numBlocks > raidPtr->totalSectors) {
128 		RF_ERRORMSG1("Unable to map access because offset (%d)"
129 		    " was invalid\n", (int) raidAddress);
130 		return (NULL);
131 	}
132 	if (rf_mapDebug)
133 		rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks);
134 	for (; raidAddress < endAddress;) {
135 		/* Make the next stripe structure. */
136 		RF_ASSERT(asmList);
137 		t_asm = asmList;
138 		asmList = asmList->next;
139 		bzero((char *) t_asm, sizeof(RF_AccessStripeMap_t));
140 		if (!asm_p)
141 			asm_list = asm_p = t_asm;
142 		else {
143 			asm_p->next = t_asm;
144 			asm_p = asm_p->next;
145 		}
146 		numStripes++;
147 
148 		/* Map SUs from current location to the end of the stripe. */
149 		asm_p->stripeID =
150 		/* rf_RaidAddressToStripeID(layoutPtr, raidAddress) */
151 		    stripeID++;
152 		stripeRealEndAddress =
153 		    rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress);
154 		stripeEndAddress = RF_MIN(endAddress, stripeRealEndAddress);
155 		asm_p->raidAddress = raidAddress;
156 		asm_p->endRaidAddress = stripeEndAddress;
157 
158 		/* Map each stripe unit in the stripe. */
159 		pda_p = NULL;
160 		/*
161 		 * Raid addr of start of portion of access that is within this
162 		 * stripe.
163 		 */
164 		startAddrWithinStripe = raidAddress;
165 
166 		for (; raidAddress < stripeEndAddress;) {
167 			RF_ASSERT(pdaList);
168 			t_pda = pdaList;
169 			pdaList = pdaList->next;
170 			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
171 			if (!pda_p)
172 				asm_p->physInfo = pda_p = t_pda;
173 			else {
174 				pda_p->next = t_pda;
175 				pda_p = pda_p->next;
176 			}
177 
178 			pda_p->type = RF_PDA_TYPE_DATA;
179 			(layoutPtr->map->MapSector) (raidPtr, raidAddress,
180 			    &(pda_p->row), &(pda_p->col),
181 			    &(pda_p->startSector), remap);
182 
183 			/*
184 			 * Mark any failures we find.
185 			 * failedPDA is don't-care if there is more than
186 			 * one failure.
187 			 */
188 			/*
189 			 * The RAID address corresponding to this physical
190 			 * disk address.
191 			 */
192 			pda_p->raidAddress = raidAddress;
193 			nextStripeUnitAddress =
194 			    rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
195 			     raidAddress);
196 			pda_p->numSector = RF_MIN(endAddress,
197 			    nextStripeUnitAddress) - raidAddress;
198 			RF_ASSERT(pda_p->numSector != 0);
199 			rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 0);
200 			pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr,
201 			    (raidAddress - startAddress));
202 			asm_p->totalSectorsAccessed += pda_p->numSector;
203 			asm_p->numStripeUnitsAccessed++;
204 			asm_p->origRow = pda_p->row;	/*
205 							 * Redundant but
206 							 * harmless to do this
207 							 * in every loop
208 							 * iteration.
209 							 */
210 
211 			raidAddress = RF_MIN(endAddress, nextStripeUnitAddress);
212 		}
213 
214 		/*
215 		 * Map the parity. At this stage, the startSector and
216 		 * numSector fields for the parity unit are always set to
217 		 * indicate the entire parity unit. We may modify this after
218 		 * mapping the data portion.
219 		 */
220 		switch (faultsTolerated) {
221 		case 0:
222 			break;
223 		case 1:	/* Single fault tolerant. */
224 			RF_ASSERT(pdaList);
225 			t_pda = pdaList;
226 			pdaList = pdaList->next;
227 			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
228 			pda_p = asm_p->parityInfo = t_pda;
229 			pda_p->type = RF_PDA_TYPE_PARITY;
230 			(layoutPtr->map->MapParity) (raidPtr,
231 			    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
232 			     startAddrWithinStripe), &(pda_p->row),
233 			    &(pda_p->col), &(pda_p->startSector), remap);
234 			pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
235 			/*
236 			 * raidAddr may be needed to find unit to redirect to.
237 			 */
238 			pda_p->raidAddress =
239 			    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
240 			     startAddrWithinStripe);
241 			rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
242 			rf_ASMParityAdjust(asm_p->parityInfo,
243 			    startAddrWithinStripe, endAddress,
244 			    layoutPtr, asm_p);
245 
246 			break;
247 		case 2:	/* Two fault tolerant. */
248 			RF_ASSERT(pdaList && pdaList->next);
249 			t_pda = pdaList;
250 			pdaList = pdaList->next;
251 			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
252 			pda_p = asm_p->parityInfo = t_pda;
253 			pda_p->type = RF_PDA_TYPE_PARITY;
254 			t_pda = pdaList;
255 			pdaList = pdaList->next;
256 			bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
257 			pda_q = asm_p->qInfo = t_pda;
258 			pda_q->type = RF_PDA_TYPE_Q;
259 			(layoutPtr->map->MapParity) (raidPtr,
260 			    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
261 			     startAddrWithinStripe), &(pda_p->row),
262 			    &(pda_p->col), &(pda_p->startSector), remap);
263 			(layoutPtr->map->MapQ) (raidPtr,
264 			    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
265 			     startAddrWithinStripe), &(pda_q->row),
266 			    &(pda_q->col), &(pda_q->startSector), remap);
267 			pda_q->numSector = pda_p->numSector =
268 			    layoutPtr->sectorsPerStripeUnit;
269 			/*
270 			 * raidAddr may be needed to find unit to redirect to.
271 			 */
272 			pda_p->raidAddress =
273 			    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
274 			     startAddrWithinStripe);
275 			pda_q->raidAddress =
276 			    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
277 			     startAddrWithinStripe);
278 			/* Failure mode stuff. */
279 			rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
280 			rf_ASMCheckStatus(raidPtr, pda_q, asm_p, disks, 1);
281 			rf_ASMParityAdjust(asm_p->parityInfo,
282 			    startAddrWithinStripe, endAddress,
283 			    layoutPtr, asm_p);
284 			rf_ASMParityAdjust(asm_p->qInfo, startAddrWithinStripe,
285 			    endAddress, layoutPtr, asm_p);
286 			break;
287 		}
288 	}
289 	RF_ASSERT(asmList == NULL && pdaList == NULL);
290 	/* Make the header structure. */
291 	asm_hdr = rf_AllocAccessStripeMapHeader();
292 	RF_ASSERT(numStripes == totStripes);
293 	asm_hdr->numStripes = numStripes;
294 	asm_hdr->stripeMap = asm_list;
295 
296 	if (rf_mapDebug)
297 		rf_PrintAccessStripeMap(asm_hdr);
298 	return (asm_hdr);
299 }
300 
301 /*****************************************************************************
302  * This routine walks through an ASM list and marks the PDAs that have failed.
303  * It's called only when a disk failure causes an in-flight DAG to fail.
304  * The parity may consist of two components, but we want to use only one
305  * failedPDA pointer. Thus we set failedPDA to point to the first parity
306  * component, and rely on the rest of the code to do the right thing with this.
307  *****************************************************************************/
308 void
rf_MarkFailuresInASMList(RF_Raid_t * raidPtr,RF_AccessStripeMapHeader_t * asm_h)309 rf_MarkFailuresInASMList(RF_Raid_t *raidPtr, RF_AccessStripeMapHeader_t *asm_h)
310 {
311 	RF_RaidDisk_t **disks = raidPtr->Disks;
312 	RF_AccessStripeMap_t *asmap;
313 	RF_PhysDiskAddr_t *pda;
314 
315 	for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) {
316 		asmap->numDataFailed = asmap->numParityFailed =
317 		    asmap->numQFailed = 0;
318 		asmap->numFailedPDAs = 0;
319 		bzero((char *) asmap->failedPDAs,
320 		    RF_MAX_FAILED_PDA * sizeof(RF_PhysDiskAddr_t *));
321 		for (pda = asmap->physInfo; pda; pda = pda->next) {
322 			if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
323 				asmap->numDataFailed++;
324 				asmap->failedPDAs[asmap->numFailedPDAs] = pda;
325 				asmap->numFailedPDAs++;
326 			}
327 		}
328 		pda = asmap->parityInfo;
329 		if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
330 			asmap->numParityFailed++;
331 			asmap->failedPDAs[asmap->numFailedPDAs] = pda;
332 			asmap->numFailedPDAs++;
333 		}
334 		pda = asmap->qInfo;
335 		if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
336 			asmap->numQFailed++;
337 			asmap->failedPDAs[asmap->numFailedPDAs] = pda;
338 			asmap->numFailedPDAs++;
339 		}
340 	}
341 }
342 
343 /*****************************************************************************
344  *
345  * DuplicateASM -- Duplicates an ASM and returns the new one.
346  *
347  *****************************************************************************/
348 RF_AccessStripeMap_t *
rf_DuplicateASM(RF_AccessStripeMap_t * asmap)349 rf_DuplicateASM(RF_AccessStripeMap_t *asmap)
350 {
351 	RF_AccessStripeMap_t *new_asm;
352 	RF_PhysDiskAddr_t *pda, *new_pda, *t_pda;
353 
354 	new_pda = NULL;
355 	new_asm = rf_AllocAccessStripeMapComponent();
356 	bcopy((char *) asmap, (char *) new_asm, sizeof(RF_AccessStripeMap_t));
357 	new_asm->numFailedPDAs = 0;	/* ??? */
358 	new_asm->failedPDAs[0] = NULL;
359 	new_asm->physInfo = NULL;
360 	new_asm->parityInfo = NULL;
361 	new_asm->next = NULL;
362 
363 	for (pda = asmap->physInfo; pda; pda = pda->next) {
364 		/* Copy the physInfo list. */
365 		t_pda = rf_AllocPhysDiskAddr();
366 		bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
367 		t_pda->next = NULL;
368 		if (!new_asm->physInfo) {
369 			new_asm->physInfo = t_pda;
370 			new_pda = t_pda;
371 		} else {
372 			new_pda->next = t_pda;
373 			new_pda = new_pda->next;
374 		}
375 		if (pda == asmap->failedPDAs[0])
376 			new_asm->failedPDAs[0] = t_pda;
377 	}
378 	for (pda = asmap->parityInfo; pda; pda = pda->next) {
379 		/* Copy the parityInfo list. */
380 		t_pda = rf_AllocPhysDiskAddr();
381 		bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
382 		t_pda->next = NULL;
383 		if (!new_asm->parityInfo) {
384 			new_asm->parityInfo = t_pda;
385 			new_pda = t_pda;
386 		} else {
387 			new_pda->next = t_pda;
388 			new_pda = new_pda->next;
389 		}
390 		if (pda == asmap->failedPDAs[0])
391 			new_asm->failedPDAs[0] = t_pda;
392 	}
393 	return (new_asm);
394 }
395 
396 /*****************************************************************************
397  *
398  * DuplicatePDA -- Duplicates a PDA and returns the new one.
399  *
400  *****************************************************************************/
401 RF_PhysDiskAddr_t *
rf_DuplicatePDA(RF_PhysDiskAddr_t * pda)402 rf_DuplicatePDA(RF_PhysDiskAddr_t *pda)
403 {
404 	RF_PhysDiskAddr_t *new;
405 
406 	new = rf_AllocPhysDiskAddr();
407 	bcopy((char *) pda, (char *) new, sizeof(RF_PhysDiskAddr_t));
408 	return (new);
409 }
410 
411 /*****************************************************************************
412  *
413  * Routines to allocate and free list elements. All allocation routines zero
414  * the structure before returning it.
415  *
416  * FreePhysDiskAddr is static. It should never be called directly, because
417  * FreeAccessStripeMap takes care of freeing the PhysDiskAddr list.
418  *
419  *****************************************************************************/
420 
421 static RF_FreeList_t *rf_asmhdr_freelist;
422 #define	RF_MAX_FREE_ASMHDR		128
423 #define	RF_ASMHDR_INC			 16
424 #define	RF_ASMHDR_INITIAL		 32
425 
426 static RF_FreeList_t *rf_asm_freelist;
427 #define	RF_MAX_FREE_ASM			192
428 #define	RF_ASM_INC			 24
429 #define	RF_ASM_INITIAL			 64
430 
431 static RF_FreeList_t *rf_pda_freelist;
432 #define	RF_MAX_FREE_PDA			192
433 #define	RF_PDA_INC			 24
434 #define	RF_PDA_INITIAL			 64
435 
436 /*
437  * Called at shutdown time. So far, all that is necessary is to release
438  * all the free lists.
439  */
440 void rf_ShutdownMapModule(void *);
441 void
rf_ShutdownMapModule(void * ignored)442 rf_ShutdownMapModule(void *ignored)
443 {
444 	RF_FREELIST_DESTROY(rf_asmhdr_freelist, next,
445 	    (RF_AccessStripeMapHeader_t *));
446 	RF_FREELIST_DESTROY(rf_pda_freelist, next, (RF_PhysDiskAddr_t *));
447 	RF_FREELIST_DESTROY(rf_asm_freelist, next, (RF_AccessStripeMap_t *));
448 }
449 
450 int
rf_ConfigureMapModule(RF_ShutdownList_t ** listp)451 rf_ConfigureMapModule(RF_ShutdownList_t **listp)
452 {
453 	int rc;
454 
455 	RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR,
456 	    RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t));
457 	if (rf_asmhdr_freelist == NULL) {
458 		return (ENOMEM);
459 	}
460 	RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM,
461 	    RF_ASM_INC, sizeof(RF_AccessStripeMap_t));
462 	if (rf_asm_freelist == NULL) {
463 		RF_FREELIST_DESTROY(rf_asmhdr_freelist, next,
464 		    (RF_AccessStripeMapHeader_t *));
465 		return (ENOMEM);
466 	}
467 	RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA, RF_PDA_INC,
468 	    sizeof(RF_PhysDiskAddr_t));
469 	if (rf_pda_freelist == NULL) {
470 		RF_FREELIST_DESTROY(rf_asmhdr_freelist, next,
471 		    (RF_AccessStripeMapHeader_t *));
472 		RF_FREELIST_DESTROY(rf_pda_freelist, next,
473 		    (RF_PhysDiskAddr_t *));
474 		return (ENOMEM);
475 	}
476 	rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);
477 	if (rc) {
478 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
479 		    " rc=%d\n", __FILE__, __LINE__, rc);
480 		rf_ShutdownMapModule(NULL);
481 		return (rc);
482 	}
483 	RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL, next,
484 	    (RF_AccessStripeMapHeader_t *));
485 	RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL, next,
486 	    (RF_AccessStripeMap_t *));
487 	RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL, next,
488 	    (RF_PhysDiskAddr_t *));
489 
490 	return (0);
491 }
492 
493 RF_AccessStripeMapHeader_t *
rf_AllocAccessStripeMapHeader(void)494 rf_AllocAccessStripeMapHeader(void)
495 {
496 	RF_AccessStripeMapHeader_t *p;
497 
498 	RF_FREELIST_GET(rf_asmhdr_freelist, p, next,
499 	    (RF_AccessStripeMapHeader_t *));
500 	bzero((char *) p, sizeof(RF_AccessStripeMapHeader_t));
501 
502 	return (p);
503 }
504 
505 void
rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t * p)506 rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t *p)
507 {
508 	RF_FREELIST_FREE(rf_asmhdr_freelist, p, next);
509 }
510 
511 RF_PhysDiskAddr_t *
rf_AllocPhysDiskAddr(void)512 rf_AllocPhysDiskAddr(void)
513 {
514 	RF_PhysDiskAddr_t *p;
515 
516 	RF_FREELIST_GET(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *));
517 	bzero((char *) p, sizeof(RF_PhysDiskAddr_t));
518 
519 	return (p);
520 }
521 
522 /*
523  * Allocates a list of PDAs, locking the free list only once.
524  * When we have to call calloc, we do it one component at a time to simplify
525  * the process of freeing the list at program shutdown. This should not be
526  * much of a performance hit, because it should be very infrequently executed.
527  */
528 RF_PhysDiskAddr_t *
rf_AllocPDAList(int count)529 rf_AllocPDAList(int count)
530 {
531 	RF_PhysDiskAddr_t *p = NULL;
532 
533 	RF_FREELIST_GET_N(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *),
534 	    count);
535 	return (p);
536 }
537 
538 void
rf_FreePhysDiskAddr(RF_PhysDiskAddr_t * p)539 rf_FreePhysDiskAddr(RF_PhysDiskAddr_t *p)
540 {
541 	RF_FREELIST_FREE(rf_pda_freelist, p, next);
542 }
543 
544 void
rf_FreePDAList(RF_PhysDiskAddr_t * l_start,RF_PhysDiskAddr_t * l_end,int count)545 rf_FreePDAList(
546 	/* Pointers to start and end of list. */
547 	RF_PhysDiskAddr_t	*l_start,
548 	RF_PhysDiskAddr_t	*l_end,
549 	int			 count	/* Number of elements in list. */
550 )
551 {
552 	RF_FREELIST_FREE_N(rf_pda_freelist, l_start, next,
553 	    (RF_PhysDiskAddr_t *), count);
554 }
555 
556 RF_AccessStripeMap_t *
rf_AllocAccessStripeMapComponent(void)557 rf_AllocAccessStripeMapComponent(void)
558 {
559 	RF_AccessStripeMap_t *p;
560 
561 	RF_FREELIST_GET(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *));
562 	bzero((char *) p, sizeof(RF_AccessStripeMap_t));
563 
564 	return (p);
565 }
566 
567 /*
568  * This is essentially identical to AllocPDAList. I should combine the two.
569  * When we have to call calloc, we do it one component at a time to simplify
570  * the process of freeing the list at program shutdown. This should not be
571  * much of a performance hit, because it should be very infrequently executed.
572  */
573 RF_AccessStripeMap_t *
rf_AllocASMList(int count)574 rf_AllocASMList(int count)
575 {
576 	RF_AccessStripeMap_t *p = NULL;
577 
578 	RF_FREELIST_GET_N(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *),
579 	    count);
580 	return (p);
581 }
582 
583 void
rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t * p)584 rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t *p)
585 {
586 	RF_FREELIST_FREE(rf_asm_freelist, p, next);
587 }
588 
589 void
rf_FreeASMList(RF_AccessStripeMap_t * l_start,RF_AccessStripeMap_t * l_end,int count)590 rf_FreeASMList(RF_AccessStripeMap_t *l_start, RF_AccessStripeMap_t *l_end,
591     int count)
592 {
593 	RF_FREELIST_FREE_N(rf_asm_freelist, l_start, next,
594 	    (RF_AccessStripeMap_t *), count);
595 }
596 
597 void
rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t * hdr)598 rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t *hdr)
599 {
600 	RF_AccessStripeMap_t *p, *pt = NULL;
601 	RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL;
602 	int count = 0, t, asm_count = 0;
603 
604 	for (p = hdr->stripeMap; p; p = p->next) {
605 
606 		/* Link the 3 pda lists into the accumulating pda list. */
607 
608 		if (!pdaList)
609 			pdaList = p->qInfo;
610 		else
611 			pdaEnd->next = p->qInfo;
612 		for (trailer = NULL, pdp = p->qInfo; pdp;) {
613 			trailer = pdp;
614 			pdp = pdp->next;
615 			count++;
616 		}
617 		if (trailer)
618 			pdaEnd = trailer;
619 
620 		if (!pdaList)
621 			pdaList = p->parityInfo;
622 		else
623 			pdaEnd->next = p->parityInfo;
624 		for (trailer = NULL, pdp = p->parityInfo; pdp;) {
625 			trailer = pdp;
626 			pdp = pdp->next;
627 			count++;
628 		}
629 		if (trailer)
630 			pdaEnd = trailer;
631 
632 		if (!pdaList)
633 			pdaList = p->physInfo;
634 		else
635 			pdaEnd->next = p->physInfo;
636 		for (trailer = NULL, pdp = p->physInfo; pdp;) {
637 			trailer = pdp;
638 			pdp = pdp->next;
639 			count++;
640 		}
641 		if (trailer)
642 			pdaEnd = trailer;
643 
644 		pt = p;
645 		asm_count++;
646 	}
647 
648 	/* Debug only. */
649 	for (t = 0, pdp = pdaList; pdp; pdp = pdp->next)
650 		t++;
651 	RF_ASSERT(t == count);
652 
653 	if (pdaList)
654 		rf_FreePDAList(pdaList, pdaEnd, count);
655 	rf_FreeASMList(hdr->stripeMap, pt, asm_count);
656 	rf_FreeAccessStripeMapHeader(hdr);
657 }
658 
659 /*
660  * We can't use the large write optimization if there are any failures in the
661  * stripe.
662  * In the declustered layout, there is no way to immediately determine what
663  * disks constitute a stripe, so we actually have to hunt through the stripe
664  * looking for failures.
665  * The reason we map the parity instead of just using asm->parityInfo->col is
666  * because the latter may have been already redirected to a spare drive, which
667  * would mess up the computation of the stripe offset.
668  *
669  * ASSUMES AT MOST ONE FAILURE IN THE STRIPE.
670  */
671 int
rf_CheckStripeForFailures(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap)672 rf_CheckStripeForFailures(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
673 {
674 	RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i;
675 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
676 	RF_StripeCount_t stripeOffset;
677 	int numFailures;
678 	RF_RaidAddr_t sosAddr;
679 	RF_SectorNum_t diskOffset, poffset;
680 	RF_RowCol_t testrow;
681 
682 	/* Quick out in the fault-free case. */
683 	RF_LOCK_MUTEX(raidPtr->mutex);
684 	numFailures = raidPtr->numFailures;
685 	RF_UNLOCK_MUTEX(raidPtr->mutex);
686 	if (numFailures == 0)
687 		return (0);
688 
689 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
690 	    asmap->raidAddress);
691 	row = asmap->physInfo->row;
692 	(layoutPtr->map->IdentifyStripe) (raidPtr, asmap->raidAddress,
693 	    &diskids, &testrow);
694 	(layoutPtr->map->MapParity) (raidPtr, asmap->raidAddress,
695 	    &prow, &pcol, &poffset, 0);	/* get pcol */
696 
697 	/*
698 	 * This needs not be true if we've redirected the access to a spare in
699 	 * another row.
700 	 * RF_ASSERT(row == testrow);
701 	 */
702 	stripeOffset = 0;
703 	for (i = 0; i < layoutPtr->numDataCol + layoutPtr->numParityCol; i++) {
704 		if (diskids[i] != pcol) {
705 			if (RF_DEAD_DISK(raidPtr
706 			    ->Disks[testrow][diskids[i]].status)) {
707 				if (raidPtr->status[testrow] !=
708 				    rf_rs_reconstructing)
709 					return (1);
710 				RF_ASSERT(
711 				    raidPtr->reconControl[testrow]->fcol ==
712 				    diskids[i]);
713 				layoutPtr->map->MapSector(raidPtr,
714 				    sosAddr + stripeOffset *
715 				    layoutPtr->sectorsPerStripeUnit,
716 				    &trow, &tcol, &diskOffset, 0);
717 				RF_ASSERT((trow == testrow) &&
718 				    (tcol == diskids[i]));
719 				if (!rf_CheckRUReconstructed(raidPtr
720 				     ->reconControl[testrow]->reconMap,
721 				     diskOffset))
722 					return (1);
723 				asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
724 				return (0);
725 			}
726 			stripeOffset++;
727 		}
728 	}
729 	return (0);
730 }
731 
732 /*
733  * Return the number of failed data units in the stripe.
734  */
735 int
rf_NumFailedDataUnitsInStripe(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap)736 rf_NumFailedDataUnitsInStripe(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
737 {
738 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
739 	RF_RowCol_t trow, tcol, row, i;
740 	RF_SectorNum_t diskOffset;
741 	RF_RaidAddr_t sosAddr;
742 	int numFailures;
743 
744 	/* Quick out in the fault-free case. */
745 	RF_LOCK_MUTEX(raidPtr->mutex);
746 	numFailures = raidPtr->numFailures;
747 	RF_UNLOCK_MUTEX(raidPtr->mutex);
748 	if (numFailures == 0)
749 		return (0);
750 	numFailures = 0;
751 
752 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
753 	    asmap->raidAddress);
754 	row = asmap->physInfo->row;
755 	for (i = 0; i < layoutPtr->numDataCol; i++) {
756 		(layoutPtr->map->MapSector) (raidPtr, sosAddr + i *
757 		    layoutPtr->sectorsPerStripeUnit,
758 		    &trow, &tcol, &diskOffset, 0);
759 		if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status))
760 			numFailures++;
761 	}
762 
763 	return numFailures;
764 }
765 
766 
767 /*****************************************************************************
768  *
769  * Debug routines.
770  *
771  *****************************************************************************/
772 
773 void
rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h)774 rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h)
775 {
776 	rf_PrintFullAccessStripeMap(asm_h, 0);
777 }
778 
779 void
rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h,int prbuf)780 rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h,
781     int prbuf	/* Flag to print buffer pointers. */)
782 {
783 	int i;
784 	RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
785 	RF_PhysDiskAddr_t *p;
786 	printf("%d stripes total\n", (int) asm_h->numStripes);
787 	for (; asmap; asmap = asmap->next) {
788 		/* printf("Num failures: %d\n", asmap->numDataFailed); */
789 		/* printf("Num sectors: %d\n",
790 		 * (int)asmap->totalSectorsAccessed); */
791 		printf("Stripe %d (%d sectors), failures: %d data, %d parity: ",
792 		    (int) asmap->stripeID,
793 		    (int) asmap->totalSectorsAccessed,
794 		    (int) asmap->numDataFailed,
795 		    (int) asmap->numParityFailed);
796 		if (asmap->parityInfo) {
797 			printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row,
798 			    asmap->parityInfo->col,
799 			    (int) asmap->parityInfo->startSector,
800 			    (int) (asmap->parityInfo->startSector +
801 			    asmap->parityInfo->numSector - 1));
802 			if (prbuf)
803 				printf(" b0x%lx",
804 				    (unsigned long) asmap->parityInfo->bufPtr);
805 			if (asmap->parityInfo->next) {
806 				printf(", r%d c%d s%d-%d",
807 				    asmap->parityInfo->next->row,
808 				    asmap->parityInfo->next->col,
809 				    (int) asmap->parityInfo->next->startSector,
810 				    (int) (asmap->parityInfo->next->startSector
811 				    + asmap->parityInfo->next->numSector - 1));
812 				if (prbuf)
813 					printf(" b0x%lx", (unsigned long)
814 					    asmap->parityInfo->next->bufPtr);
815 				RF_ASSERT(asmap->parityInfo->next->next
816 				    == NULL);
817 			}
818 			printf("]\n\t");
819 		}
820 		for (i = 0, p = asmap->physInfo; p; p = p->next, i++) {
821 			printf("SU r%d c%d s%d-%d ", p->row, p->col,
822 			    (int) p->startSector,
823 			    (int) (p->startSector + p->numSector - 1));
824 			if (prbuf)
825 				printf("b0x%lx ", (unsigned long) p->bufPtr);
826 			if (i && !(i & 1))
827 				printf("\n\t");
828 		}
829 		printf("\n");
830 		p = asm_h->stripeMap->failedPDAs[0];
831 		if (asm_h->stripeMap->numDataFailed +
832 		    asm_h->stripeMap->numParityFailed > 1)
833 			printf("[multiple failures]\n");
834 		else
835 			if (asm_h->stripeMap->numDataFailed +
836 			    asm_h->stripeMap->numParityFailed > 0)
837 				printf("\t[Failed PDA: r%d c%d s%d-%d]\n",
838 				    p->row, p->col, (int) p->startSector,
839 				    (int) (p->startSector + p->numSector - 1));
840 	}
841 }
842 
843 void
rf_PrintRaidAddressInfo(RF_Raid_t * raidPtr,RF_RaidAddr_t raidAddr,RF_SectorCount_t numBlocks)844 rf_PrintRaidAddressInfo(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
845     RF_SectorCount_t numBlocks)
846 {
847 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
848 	RF_RaidAddr_t ra, sosAddr =
849 	    rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
850 
851 	printf("Raid addrs of SU boundaries from start of stripe to end"
852 	    " of access:\n\t");
853 	for (ra = sosAddr; ra <= raidAddr + numBlocks;
854 	     ra += layoutPtr->sectorsPerStripeUnit) {
855 		printf("%d (0x%x), ", (int) ra, (int) ra);
856 	}
857 	printf("\n");
858 	printf("Offset into stripe unit: %d (0x%x)\n",
859 	    (int) (raidAddr % layoutPtr->sectorsPerStripeUnit),
860 	    (int) (raidAddr % layoutPtr->sectorsPerStripeUnit));
861 }
862 
863 /*
864  * Given a parity descriptor and the starting address within a stripe,
865  * range restrict the parity descriptor to touch only the correct stuff.
866  */
867 void
rf_ASMParityAdjust(RF_PhysDiskAddr_t * toAdjust,RF_StripeNum_t startAddrWithinStripe,RF_SectorNum_t endAddress,RF_RaidLayout_t * layoutPtr,RF_AccessStripeMap_t * asm_p)868 rf_ASMParityAdjust(
869     RF_PhysDiskAddr_t	*toAdjust,
870     RF_StripeNum_t	 startAddrWithinStripe,
871     RF_SectorNum_t	 endAddress,
872     RF_RaidLayout_t	*layoutPtr,
873     RF_AccessStripeMap_t *asm_p
874 )
875 {
876 	RF_PhysDiskAddr_t *new_pda;
877 
878 	/*
879 	 * When we're accessing only a portion of one stripe unit, we want the
880 	 * parity descriptor to identify only the chunk of parity associated
881 	 * with the data. When the access spans exactly one stripe unit
882 	 * boundary and is less than a stripe unit in size, it uses two
883 	 * disjoint regions of the parity unit. When an access spans more
884 	 * than one stripe unit boundary, it uses all of the parity unit.
885 	 *
886 	 * To better handle the case where stripe units are small, we may
887 	 * eventually want to change the 2nd case so that if the SU size is
888 	 * below some threshold, we just read/write the whole thing instead of
889 	 * breaking it up into two accesses.
890 	 */
891 	if (asm_p->numStripeUnitsAccessed == 1) {
892 		int x = (startAddrWithinStripe %
893 		    layoutPtr->sectorsPerStripeUnit);
894 		toAdjust->startSector += x;
895 		toAdjust->raidAddress += x;
896 		toAdjust->numSector = asm_p->physInfo->numSector;
897 		RF_ASSERT(toAdjust->numSector != 0);
898 	} else
899 		if (asm_p->numStripeUnitsAccessed == 2 &&
900 		    asm_p->totalSectorsAccessed <
901 		    layoutPtr->sectorsPerStripeUnit) {
902 			int x = (startAddrWithinStripe %
903 			    layoutPtr->sectorsPerStripeUnit);
904 
905 			/*
906 			 * Create a second pda and copy the parity map info
907 			 * into it.
908 			 */
909 			RF_ASSERT(toAdjust->next == NULL);
910 			new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
911 			*new_pda = *toAdjust;	/* Structure assignment. */
912 			new_pda->next = NULL;
913 
914 			/*
915 			 * Adjust the start sector & number of blocks for the
916 			 * first parity pda.
917 			 */
918 			toAdjust->startSector += x;
919 			toAdjust->raidAddress += x;
920 			toAdjust->numSector =
921 			    rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
922 			     startAddrWithinStripe) - startAddrWithinStripe;
923 			RF_ASSERT(toAdjust->numSector != 0);
924 
925 			/* Adjust the second pda. */
926 			new_pda->numSector = endAddress -
927 			    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
928 			     endAddress);
929 			/* new_pda->raidAddress =
930 			 *     rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
931 			 *      toAdjust->raidAddress); */
932 			RF_ASSERT(new_pda->numSector != 0);
933 		}
934 }
935 
936 /*
937  * Check if a disk has been spared or failed. If spared, redirect the I/O.
938  * If it has been failed, record it in the asm pointer.
939  * Fourth arg is whether data or parity.
940  */
941 void
rf_ASMCheckStatus(RF_Raid_t * raidPtr,RF_PhysDiskAddr_t * pda_p,RF_AccessStripeMap_t * asm_p,RF_RaidDisk_t ** disks,int parity)942 rf_ASMCheckStatus(
943     RF_Raid_t		 *raidPtr,
944     RF_PhysDiskAddr_t	 *pda_p,
945     RF_AccessStripeMap_t *asm_p,
946     RF_RaidDisk_t	**disks,
947     int			  parity
948 )
949 {
950 	RF_DiskStatus_t dstatus;
951 	RF_RowCol_t frow, fcol;
952 
953 	dstatus = disks[pda_p->row][pda_p->col].status;
954 
955 	if (dstatus == rf_ds_spared) {
956 		/* If the disk has been spared, redirect access to the spare. */
957 		frow = pda_p->row;
958 		fcol = pda_p->col;
959 		pda_p->row = disks[frow][fcol].spareRow;
960 		pda_p->col = disks[frow][fcol].spareCol;
961 	} else
962 		if (dstatus == rf_ds_dist_spared) {
963 			/* Ditto if disk has been spared to dist spare space. */
964 			RF_RowCol_t or = pda_p->row, oc = pda_p->col;
965 			RF_SectorNum_t oo = pda_p->startSector;
966 
967 			if (pda_p->type == RF_PDA_TYPE_DATA)
968 				raidPtr->Layout.map->MapSector(raidPtr,
969 				    pda_p->raidAddress, &pda_p->row,
970 				    &pda_p->col, &pda_p->startSector, RF_REMAP);
971 			else
972 				raidPtr->Layout.map->MapParity(raidPtr,
973 				    pda_p->raidAddress, &pda_p->row,
974 				    &pda_p->col, &pda_p->startSector, RF_REMAP);
975 
976 			if (rf_mapDebug) {
977 				printf("Redirected r %d c %d o %d -> r%d c %d"
978 				    " o %d\n", or, oc, (int) oo, pda_p->row,
979 				    pda_p->col, (int) pda_p->startSector);
980 			}
981 		} else
982 			if (RF_DEAD_DISK(dstatus)) {
983 				/*
984 				 * If the disk is inaccessible, mark the
985 				 * failure.
986 				 */
987 				if (parity)
988 					asm_p->numParityFailed++;
989 				else {
990 					asm_p->numDataFailed++;
991 #if 0
992 					/*
993 					 * XXX Do we really want this spewing
994 					 * out on the console ? GO
995 					 */
996 					printf("DATA_FAILED !\n");
997 #endif
998 				}
999 				asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
1000 				asm_p->numFailedPDAs++;
1001 #if 0
1002 				switch (asm_p->numParityFailed +
1003 				    asm_p->numDataFailed) {
1004 				case 1:
1005 					asm_p->failedPDAs[0] = pda_p;
1006 					break;
1007 				case 2:
1008 					asm_p->failedPDAs[1] = pda_p;
1009 				default:
1010 					break;
1011 				}
1012 #endif
1013 			}
1014 	/* The redirected access should never span a stripe unit boundary. */
1015 	RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout,
1016 	     pda_p->raidAddress) ==
1017 	    rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress +
1018 	     pda_p->numSector - 1));
1019 	RF_ASSERT(pda_p->col != -1);
1020 }
1021