1 /* $OpenBSD: rf_map.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $ */
2 /* $NetBSD: rf_map.c,v 1.5 2000/06/29 00:22:27 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Author: Mark Holland
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /*****************************************************************************
32 *
33 * map.c -- Main code for mapping RAID addresses to physical disk addresses.
34 *
35 *****************************************************************************/
36
37 #include "rf_types.h"
38 #include "rf_threadstuff.h"
39 #include "rf_raid.h"
40 #include "rf_general.h"
41 #include "rf_map.h"
42 #include "rf_freelist.h"
43 #include "rf_shutdown.h"
44
45 void rf_FreePDAList(RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *, int);
46 void rf_FreeASMList(RF_AccessStripeMap_t *, RF_AccessStripeMap_t *, int);
47
48 /*****************************************************************************
49 *
50 * MapAccess -- Main 1st order mapping routine.
51 *
52 * Maps an access in the RAID address space to the corresponding set of
53 * physical disk addresses. The result is returned as a list of
54 * AccessStripeMap structures, one per stripe accessed. Each ASM structure
55 * contains a pointer to a list of PhysDiskAddr structures, which describe
56 * the physical locations touched by the user access. Note that this routine
57 * returns only static mapping information, i.e. the list of physical
58 * addresses returned does not necessarily identify the set of physical
59 * locations that will actually be read or written.
60 *
61 * The routine also maps the parity. The physical disk location returned
62 * always indicates the entire parity unit, even when only a subset of it
63 * is being accessed. This is because an access that is not stripe unit
 * aligned but that spans a stripe unit boundary may require access to two
65 * distinct portions of the parity unit, and we can't yet tell which
66 * portion(s) we'll actually need. We leave it up to the algorithm
67 * selection code to decide what subset of the parity unit to access.
68 *
69 * Note that addresses in the RAID address space must always be maintained
70 * as longs, instead of ints.
71 *
72 * This routine returns NULL if numBlocks is 0.
73 *
74 *****************************************************************************/
75
76 RF_AccessStripeMapHeader_t *
rf_MapAccess(RF_Raid_t * raidPtr,RF_RaidAddr_t raidAddress,RF_SectorCount_t numBlocks,caddr_t buffer,int remap)77 rf_MapAccess(
78 RF_Raid_t *raidPtr,
79 RF_RaidAddr_t raidAddress, /*
80 * Starting address in RAID address
81 * space.
82 */
83 RF_SectorCount_t numBlocks, /*
84 * Number of blocks in RAID address
85 * space to access.
86 */
87 caddr_t buffer, /* Buffer to supply/receive data. */
88 int remap /*
89 * 1 => remap addresses to spare space.
90 */
91 )
92 {
93 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
94 RF_AccessStripeMapHeader_t *asm_hdr = NULL;
95 RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL;
96 int faultsTolerated = layoutPtr->map->faultsTolerated;
97 /* We'll change raidAddress along the way. */
98 RF_RaidAddr_t startAddress = raidAddress;
99 RF_RaidAddr_t endAddress = raidAddress + numBlocks;
100 RF_RaidDisk_t **disks = raidPtr->Disks;
101
102 RF_PhysDiskAddr_t *pda_p, *pda_q;
103 RF_StripeCount_t numStripes = 0;
104 RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress;
105 RF_RaidAddr_t nextStripeUnitAddress;
106 RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr;
107 RF_StripeCount_t totStripes;
108 RF_StripeNum_t stripeID, lastSID, SUID, lastSUID;
109 RF_AccessStripeMap_t *asmList, *t_asm;
110 RF_PhysDiskAddr_t *pdaList, *t_pda;
111
112 /* Allocate all the ASMs and PDAs up front. */
113 lastRaidAddr = raidAddress + numBlocks - 1;
114 stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress);
115 lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr);
116 totStripes = lastSID - stripeID + 1;
117 SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress);
118 lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr);
119
120 asmList = rf_AllocASMList(totStripes);
121 pdaList = rf_AllocPDAList(lastSUID - SUID + 1 +
122 faultsTolerated * totStripes); /*
123 * May also need pda(s)
124 * per stripe for parity.
125 */
126
127 if (raidAddress + numBlocks > raidPtr->totalSectors) {
128 RF_ERRORMSG1("Unable to map access because offset (%d)"
129 " was invalid\n", (int) raidAddress);
130 return (NULL);
131 }
132 if (rf_mapDebug)
133 rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks);
134 for (; raidAddress < endAddress;) {
135 /* Make the next stripe structure. */
136 RF_ASSERT(asmList);
137 t_asm = asmList;
138 asmList = asmList->next;
139 bzero((char *) t_asm, sizeof(RF_AccessStripeMap_t));
140 if (!asm_p)
141 asm_list = asm_p = t_asm;
142 else {
143 asm_p->next = t_asm;
144 asm_p = asm_p->next;
145 }
146 numStripes++;
147
148 /* Map SUs from current location to the end of the stripe. */
149 asm_p->stripeID =
150 /* rf_RaidAddressToStripeID(layoutPtr, raidAddress) */
151 stripeID++;
152 stripeRealEndAddress =
153 rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress);
154 stripeEndAddress = RF_MIN(endAddress, stripeRealEndAddress);
155 asm_p->raidAddress = raidAddress;
156 asm_p->endRaidAddress = stripeEndAddress;
157
158 /* Map each stripe unit in the stripe. */
159 pda_p = NULL;
160 /*
161 * Raid addr of start of portion of access that is within this
162 * stripe.
163 */
164 startAddrWithinStripe = raidAddress;
165
166 for (; raidAddress < stripeEndAddress;) {
167 RF_ASSERT(pdaList);
168 t_pda = pdaList;
169 pdaList = pdaList->next;
170 bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
171 if (!pda_p)
172 asm_p->physInfo = pda_p = t_pda;
173 else {
174 pda_p->next = t_pda;
175 pda_p = pda_p->next;
176 }
177
178 pda_p->type = RF_PDA_TYPE_DATA;
179 (layoutPtr->map->MapSector) (raidPtr, raidAddress,
180 &(pda_p->row), &(pda_p->col),
181 &(pda_p->startSector), remap);
182
183 /*
184 * Mark any failures we find.
185 * failedPDA is don't-care if there is more than
186 * one failure.
187 */
188 /*
189 * The RAID address corresponding to this physical
190 * disk address.
191 */
192 pda_p->raidAddress = raidAddress;
193 nextStripeUnitAddress =
194 rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
195 raidAddress);
196 pda_p->numSector = RF_MIN(endAddress,
197 nextStripeUnitAddress) - raidAddress;
198 RF_ASSERT(pda_p->numSector != 0);
199 rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 0);
200 pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr,
201 (raidAddress - startAddress));
202 asm_p->totalSectorsAccessed += pda_p->numSector;
203 asm_p->numStripeUnitsAccessed++;
204 asm_p->origRow = pda_p->row; /*
205 * Redundant but
206 * harmless to do this
207 * in every loop
208 * iteration.
209 */
210
211 raidAddress = RF_MIN(endAddress, nextStripeUnitAddress);
212 }
213
214 /*
215 * Map the parity. At this stage, the startSector and
216 * numSector fields for the parity unit are always set to
217 * indicate the entire parity unit. We may modify this after
218 * mapping the data portion.
219 */
220 switch (faultsTolerated) {
221 case 0:
222 break;
223 case 1: /* Single fault tolerant. */
224 RF_ASSERT(pdaList);
225 t_pda = pdaList;
226 pdaList = pdaList->next;
227 bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
228 pda_p = asm_p->parityInfo = t_pda;
229 pda_p->type = RF_PDA_TYPE_PARITY;
230 (layoutPtr->map->MapParity) (raidPtr,
231 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
232 startAddrWithinStripe), &(pda_p->row),
233 &(pda_p->col), &(pda_p->startSector), remap);
234 pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
235 /*
236 * raidAddr may be needed to find unit to redirect to.
237 */
238 pda_p->raidAddress =
239 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
240 startAddrWithinStripe);
241 rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
242 rf_ASMParityAdjust(asm_p->parityInfo,
243 startAddrWithinStripe, endAddress,
244 layoutPtr, asm_p);
245
246 break;
247 case 2: /* Two fault tolerant. */
248 RF_ASSERT(pdaList && pdaList->next);
249 t_pda = pdaList;
250 pdaList = pdaList->next;
251 bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
252 pda_p = asm_p->parityInfo = t_pda;
253 pda_p->type = RF_PDA_TYPE_PARITY;
254 t_pda = pdaList;
255 pdaList = pdaList->next;
256 bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
257 pda_q = asm_p->qInfo = t_pda;
258 pda_q->type = RF_PDA_TYPE_Q;
259 (layoutPtr->map->MapParity) (raidPtr,
260 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
261 startAddrWithinStripe), &(pda_p->row),
262 &(pda_p->col), &(pda_p->startSector), remap);
263 (layoutPtr->map->MapQ) (raidPtr,
264 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
265 startAddrWithinStripe), &(pda_q->row),
266 &(pda_q->col), &(pda_q->startSector), remap);
267 pda_q->numSector = pda_p->numSector =
268 layoutPtr->sectorsPerStripeUnit;
269 /*
270 * raidAddr may be needed to find unit to redirect to.
271 */
272 pda_p->raidAddress =
273 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
274 startAddrWithinStripe);
275 pda_q->raidAddress =
276 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
277 startAddrWithinStripe);
278 /* Failure mode stuff. */
279 rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
280 rf_ASMCheckStatus(raidPtr, pda_q, asm_p, disks, 1);
281 rf_ASMParityAdjust(asm_p->parityInfo,
282 startAddrWithinStripe, endAddress,
283 layoutPtr, asm_p);
284 rf_ASMParityAdjust(asm_p->qInfo, startAddrWithinStripe,
285 endAddress, layoutPtr, asm_p);
286 break;
287 }
288 }
289 RF_ASSERT(asmList == NULL && pdaList == NULL);
290 /* Make the header structure. */
291 asm_hdr = rf_AllocAccessStripeMapHeader();
292 RF_ASSERT(numStripes == totStripes);
293 asm_hdr->numStripes = numStripes;
294 asm_hdr->stripeMap = asm_list;
295
296 if (rf_mapDebug)
297 rf_PrintAccessStripeMap(asm_hdr);
298 return (asm_hdr);
299 }
300
301 /*****************************************************************************
302 * This routine walks through an ASM list and marks the PDAs that have failed.
303 * It's called only when a disk failure causes an in-flight DAG to fail.
304 * The parity may consist of two components, but we want to use only one
305 * failedPDA pointer. Thus we set failedPDA to point to the first parity
306 * component, and rely on the rest of the code to do the right thing with this.
307 *****************************************************************************/
308 void
rf_MarkFailuresInASMList(RF_Raid_t * raidPtr,RF_AccessStripeMapHeader_t * asm_h)309 rf_MarkFailuresInASMList(RF_Raid_t *raidPtr, RF_AccessStripeMapHeader_t *asm_h)
310 {
311 RF_RaidDisk_t **disks = raidPtr->Disks;
312 RF_AccessStripeMap_t *asmap;
313 RF_PhysDiskAddr_t *pda;
314
315 for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) {
316 asmap->numDataFailed = asmap->numParityFailed =
317 asmap->numQFailed = 0;
318 asmap->numFailedPDAs = 0;
319 bzero((char *) asmap->failedPDAs,
320 RF_MAX_FAILED_PDA * sizeof(RF_PhysDiskAddr_t *));
321 for (pda = asmap->physInfo; pda; pda = pda->next) {
322 if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
323 asmap->numDataFailed++;
324 asmap->failedPDAs[asmap->numFailedPDAs] = pda;
325 asmap->numFailedPDAs++;
326 }
327 }
328 pda = asmap->parityInfo;
329 if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
330 asmap->numParityFailed++;
331 asmap->failedPDAs[asmap->numFailedPDAs] = pda;
332 asmap->numFailedPDAs++;
333 }
334 pda = asmap->qInfo;
335 if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
336 asmap->numQFailed++;
337 asmap->failedPDAs[asmap->numFailedPDAs] = pda;
338 asmap->numFailedPDAs++;
339 }
340 }
341 }
342
343 /*****************************************************************************
344 *
345 * DuplicateASM -- Duplicates an ASM and returns the new one.
346 *
347 *****************************************************************************/
348 RF_AccessStripeMap_t *
rf_DuplicateASM(RF_AccessStripeMap_t * asmap)349 rf_DuplicateASM(RF_AccessStripeMap_t *asmap)
350 {
351 RF_AccessStripeMap_t *new_asm;
352 RF_PhysDiskAddr_t *pda, *new_pda, *t_pda;
353
354 new_pda = NULL;
355 new_asm = rf_AllocAccessStripeMapComponent();
356 bcopy((char *) asmap, (char *) new_asm, sizeof(RF_AccessStripeMap_t));
357 new_asm->numFailedPDAs = 0; /* ??? */
358 new_asm->failedPDAs[0] = NULL;
359 new_asm->physInfo = NULL;
360 new_asm->parityInfo = NULL;
361 new_asm->next = NULL;
362
363 for (pda = asmap->physInfo; pda; pda = pda->next) {
364 /* Copy the physInfo list. */
365 t_pda = rf_AllocPhysDiskAddr();
366 bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
367 t_pda->next = NULL;
368 if (!new_asm->physInfo) {
369 new_asm->physInfo = t_pda;
370 new_pda = t_pda;
371 } else {
372 new_pda->next = t_pda;
373 new_pda = new_pda->next;
374 }
375 if (pda == asmap->failedPDAs[0])
376 new_asm->failedPDAs[0] = t_pda;
377 }
378 for (pda = asmap->parityInfo; pda; pda = pda->next) {
379 /* Copy the parityInfo list. */
380 t_pda = rf_AllocPhysDiskAddr();
381 bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
382 t_pda->next = NULL;
383 if (!new_asm->parityInfo) {
384 new_asm->parityInfo = t_pda;
385 new_pda = t_pda;
386 } else {
387 new_pda->next = t_pda;
388 new_pda = new_pda->next;
389 }
390 if (pda == asmap->failedPDAs[0])
391 new_asm->failedPDAs[0] = t_pda;
392 }
393 return (new_asm);
394 }
395
396 /*****************************************************************************
397 *
398 * DuplicatePDA -- Duplicates a PDA and returns the new one.
399 *
400 *****************************************************************************/
401 RF_PhysDiskAddr_t *
rf_DuplicatePDA(RF_PhysDiskAddr_t * pda)402 rf_DuplicatePDA(RF_PhysDiskAddr_t *pda)
403 {
404 RF_PhysDiskAddr_t *new;
405
406 new = rf_AllocPhysDiskAddr();
407 bcopy((char *) pda, (char *) new, sizeof(RF_PhysDiskAddr_t));
408 return (new);
409 }
410
411 /*****************************************************************************
412 *
413 * Routines to allocate and free list elements. All allocation routines zero
414 * the structure before returning it.
415 *
 * FreePhysDiskAddr should not normally be called directly, because
 * FreeAccessStripeMap takes care of freeing the PhysDiskAddr list. (Despite
 * what earlier revisions of this comment said, it is not declared static.)
418 *
419 *****************************************************************************/
420
421 static RF_FreeList_t *rf_asmhdr_freelist;
422 #define RF_MAX_FREE_ASMHDR 128
423 #define RF_ASMHDR_INC 16
424 #define RF_ASMHDR_INITIAL 32
425
426 static RF_FreeList_t *rf_asm_freelist;
427 #define RF_MAX_FREE_ASM 192
428 #define RF_ASM_INC 24
429 #define RF_ASM_INITIAL 64
430
431 static RF_FreeList_t *rf_pda_freelist;
432 #define RF_MAX_FREE_PDA 192
433 #define RF_PDA_INC 24
434 #define RF_PDA_INITIAL 64
435
436 /*
437 * Called at shutdown time. So far, all that is necessary is to release
438 * all the free lists.
439 */
440 void rf_ShutdownMapModule(void *);
441 void
rf_ShutdownMapModule(void * ignored)442 rf_ShutdownMapModule(void *ignored)
443 {
444 RF_FREELIST_DESTROY(rf_asmhdr_freelist, next,
445 (RF_AccessStripeMapHeader_t *));
446 RF_FREELIST_DESTROY(rf_pda_freelist, next, (RF_PhysDiskAddr_t *));
447 RF_FREELIST_DESTROY(rf_asm_freelist, next, (RF_AccessStripeMap_t *));
448 }
449
450 int
rf_ConfigureMapModule(RF_ShutdownList_t ** listp)451 rf_ConfigureMapModule(RF_ShutdownList_t **listp)
452 {
453 int rc;
454
455 RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR,
456 RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t));
457 if (rf_asmhdr_freelist == NULL) {
458 return (ENOMEM);
459 }
460 RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM,
461 RF_ASM_INC, sizeof(RF_AccessStripeMap_t));
462 if (rf_asm_freelist == NULL) {
463 RF_FREELIST_DESTROY(rf_asmhdr_freelist, next,
464 (RF_AccessStripeMapHeader_t *));
465 return (ENOMEM);
466 }
467 RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA, RF_PDA_INC,
468 sizeof(RF_PhysDiskAddr_t));
469 if (rf_pda_freelist == NULL) {
470 RF_FREELIST_DESTROY(rf_asmhdr_freelist, next,
471 (RF_AccessStripeMapHeader_t *));
472 RF_FREELIST_DESTROY(rf_pda_freelist, next,
473 (RF_PhysDiskAddr_t *));
474 return (ENOMEM);
475 }
476 rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);
477 if (rc) {
478 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
479 " rc=%d\n", __FILE__, __LINE__, rc);
480 rf_ShutdownMapModule(NULL);
481 return (rc);
482 }
483 RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL, next,
484 (RF_AccessStripeMapHeader_t *));
485 RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL, next,
486 (RF_AccessStripeMap_t *));
487 RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL, next,
488 (RF_PhysDiskAddr_t *));
489
490 return (0);
491 }
492
493 RF_AccessStripeMapHeader_t *
rf_AllocAccessStripeMapHeader(void)494 rf_AllocAccessStripeMapHeader(void)
495 {
496 RF_AccessStripeMapHeader_t *p;
497
498 RF_FREELIST_GET(rf_asmhdr_freelist, p, next,
499 (RF_AccessStripeMapHeader_t *));
500 bzero((char *) p, sizeof(RF_AccessStripeMapHeader_t));
501
502 return (p);
503 }
504
505 void
rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t * p)506 rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t *p)
507 {
508 RF_FREELIST_FREE(rf_asmhdr_freelist, p, next);
509 }
510
511 RF_PhysDiskAddr_t *
rf_AllocPhysDiskAddr(void)512 rf_AllocPhysDiskAddr(void)
513 {
514 RF_PhysDiskAddr_t *p;
515
516 RF_FREELIST_GET(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *));
517 bzero((char *) p, sizeof(RF_PhysDiskAddr_t));
518
519 return (p);
520 }
521
522 /*
523 * Allocates a list of PDAs, locking the free list only once.
524 * When we have to call calloc, we do it one component at a time to simplify
525 * the process of freeing the list at program shutdown. This should not be
526 * much of a performance hit, because it should be very infrequently executed.
527 */
528 RF_PhysDiskAddr_t *
rf_AllocPDAList(int count)529 rf_AllocPDAList(int count)
530 {
531 RF_PhysDiskAddr_t *p = NULL;
532
533 RF_FREELIST_GET_N(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *),
534 count);
535 return (p);
536 }
537
538 void
rf_FreePhysDiskAddr(RF_PhysDiskAddr_t * p)539 rf_FreePhysDiskAddr(RF_PhysDiskAddr_t *p)
540 {
541 RF_FREELIST_FREE(rf_pda_freelist, p, next);
542 }
543
544 void
rf_FreePDAList(RF_PhysDiskAddr_t * l_start,RF_PhysDiskAddr_t * l_end,int count)545 rf_FreePDAList(
546 /* Pointers to start and end of list. */
547 RF_PhysDiskAddr_t *l_start,
548 RF_PhysDiskAddr_t *l_end,
549 int count /* Number of elements in list. */
550 )
551 {
552 RF_FREELIST_FREE_N(rf_pda_freelist, l_start, next,
553 (RF_PhysDiskAddr_t *), count);
554 }
555
556 RF_AccessStripeMap_t *
rf_AllocAccessStripeMapComponent(void)557 rf_AllocAccessStripeMapComponent(void)
558 {
559 RF_AccessStripeMap_t *p;
560
561 RF_FREELIST_GET(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *));
562 bzero((char *) p, sizeof(RF_AccessStripeMap_t));
563
564 return (p);
565 }
566
567 /*
568 * This is essentially identical to AllocPDAList. I should combine the two.
569 * When we have to call calloc, we do it one component at a time to simplify
570 * the process of freeing the list at program shutdown. This should not be
571 * much of a performance hit, because it should be very infrequently executed.
572 */
573 RF_AccessStripeMap_t *
rf_AllocASMList(int count)574 rf_AllocASMList(int count)
575 {
576 RF_AccessStripeMap_t *p = NULL;
577
578 RF_FREELIST_GET_N(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *),
579 count);
580 return (p);
581 }
582
583 void
rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t * p)584 rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t *p)
585 {
586 RF_FREELIST_FREE(rf_asm_freelist, p, next);
587 }
588
589 void
rf_FreeASMList(RF_AccessStripeMap_t * l_start,RF_AccessStripeMap_t * l_end,int count)590 rf_FreeASMList(RF_AccessStripeMap_t *l_start, RF_AccessStripeMap_t *l_end,
591 int count)
592 {
593 RF_FREELIST_FREE_N(rf_asm_freelist, l_start, next,
594 (RF_AccessStripeMap_t *), count);
595 }
596
597 void
rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t * hdr)598 rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t *hdr)
599 {
600 RF_AccessStripeMap_t *p, *pt = NULL;
601 RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL;
602 int count = 0, t, asm_count = 0;
603
604 for (p = hdr->stripeMap; p; p = p->next) {
605
606 /* Link the 3 pda lists into the accumulating pda list. */
607
608 if (!pdaList)
609 pdaList = p->qInfo;
610 else
611 pdaEnd->next = p->qInfo;
612 for (trailer = NULL, pdp = p->qInfo; pdp;) {
613 trailer = pdp;
614 pdp = pdp->next;
615 count++;
616 }
617 if (trailer)
618 pdaEnd = trailer;
619
620 if (!pdaList)
621 pdaList = p->parityInfo;
622 else
623 pdaEnd->next = p->parityInfo;
624 for (trailer = NULL, pdp = p->parityInfo; pdp;) {
625 trailer = pdp;
626 pdp = pdp->next;
627 count++;
628 }
629 if (trailer)
630 pdaEnd = trailer;
631
632 if (!pdaList)
633 pdaList = p->physInfo;
634 else
635 pdaEnd->next = p->physInfo;
636 for (trailer = NULL, pdp = p->physInfo; pdp;) {
637 trailer = pdp;
638 pdp = pdp->next;
639 count++;
640 }
641 if (trailer)
642 pdaEnd = trailer;
643
644 pt = p;
645 asm_count++;
646 }
647
648 /* Debug only. */
649 for (t = 0, pdp = pdaList; pdp; pdp = pdp->next)
650 t++;
651 RF_ASSERT(t == count);
652
653 if (pdaList)
654 rf_FreePDAList(pdaList, pdaEnd, count);
655 rf_FreeASMList(hdr->stripeMap, pt, asm_count);
656 rf_FreeAccessStripeMapHeader(hdr);
657 }
658
659 /*
660 * We can't use the large write optimization if there are any failures in the
661 * stripe.
662 * In the declustered layout, there is no way to immediately determine what
663 * disks constitute a stripe, so we actually have to hunt through the stripe
664 * looking for failures.
665 * The reason we map the parity instead of just using asm->parityInfo->col is
666 * because the latter may have been already redirected to a spare drive, which
667 * would mess up the computation of the stripe offset.
668 *
669 * ASSUMES AT MOST ONE FAILURE IN THE STRIPE.
670 */
671 int
rf_CheckStripeForFailures(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap)672 rf_CheckStripeForFailures(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
673 {
674 RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i;
675 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
676 RF_StripeCount_t stripeOffset;
677 int numFailures;
678 RF_RaidAddr_t sosAddr;
679 RF_SectorNum_t diskOffset, poffset;
680 RF_RowCol_t testrow;
681
682 /* Quick out in the fault-free case. */
683 RF_LOCK_MUTEX(raidPtr->mutex);
684 numFailures = raidPtr->numFailures;
685 RF_UNLOCK_MUTEX(raidPtr->mutex);
686 if (numFailures == 0)
687 return (0);
688
689 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
690 asmap->raidAddress);
691 row = asmap->physInfo->row;
692 (layoutPtr->map->IdentifyStripe) (raidPtr, asmap->raidAddress,
693 &diskids, &testrow);
694 (layoutPtr->map->MapParity) (raidPtr, asmap->raidAddress,
695 &prow, &pcol, &poffset, 0); /* get pcol */
696
697 /*
698 * This needs not be true if we've redirected the access to a spare in
699 * another row.
700 * RF_ASSERT(row == testrow);
701 */
702 stripeOffset = 0;
703 for (i = 0; i < layoutPtr->numDataCol + layoutPtr->numParityCol; i++) {
704 if (diskids[i] != pcol) {
705 if (RF_DEAD_DISK(raidPtr
706 ->Disks[testrow][diskids[i]].status)) {
707 if (raidPtr->status[testrow] !=
708 rf_rs_reconstructing)
709 return (1);
710 RF_ASSERT(
711 raidPtr->reconControl[testrow]->fcol ==
712 diskids[i]);
713 layoutPtr->map->MapSector(raidPtr,
714 sosAddr + stripeOffset *
715 layoutPtr->sectorsPerStripeUnit,
716 &trow, &tcol, &diskOffset, 0);
717 RF_ASSERT((trow == testrow) &&
718 (tcol == diskids[i]));
719 if (!rf_CheckRUReconstructed(raidPtr
720 ->reconControl[testrow]->reconMap,
721 diskOffset))
722 return (1);
723 asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
724 return (0);
725 }
726 stripeOffset++;
727 }
728 }
729 return (0);
730 }
731
732 /*
733 * Return the number of failed data units in the stripe.
734 */
735 int
rf_NumFailedDataUnitsInStripe(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap)736 rf_NumFailedDataUnitsInStripe(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
737 {
738 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
739 RF_RowCol_t trow, tcol, row, i;
740 RF_SectorNum_t diskOffset;
741 RF_RaidAddr_t sosAddr;
742 int numFailures;
743
744 /* Quick out in the fault-free case. */
745 RF_LOCK_MUTEX(raidPtr->mutex);
746 numFailures = raidPtr->numFailures;
747 RF_UNLOCK_MUTEX(raidPtr->mutex);
748 if (numFailures == 0)
749 return (0);
750 numFailures = 0;
751
752 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
753 asmap->raidAddress);
754 row = asmap->physInfo->row;
755 for (i = 0; i < layoutPtr->numDataCol; i++) {
756 (layoutPtr->map->MapSector) (raidPtr, sosAddr + i *
757 layoutPtr->sectorsPerStripeUnit,
758 &trow, &tcol, &diskOffset, 0);
759 if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status))
760 numFailures++;
761 }
762
763 return numFailures;
764 }
765
766
767 /*****************************************************************************
768 *
769 * Debug routines.
770 *
771 *****************************************************************************/
772
773 void
rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h)774 rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h)
775 {
776 rf_PrintFullAccessStripeMap(asm_h, 0);
777 }
778
779 void
rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h,int prbuf)780 rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h,
781 int prbuf /* Flag to print buffer pointers. */)
782 {
783 int i;
784 RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
785 RF_PhysDiskAddr_t *p;
786 printf("%d stripes total\n", (int) asm_h->numStripes);
787 for (; asmap; asmap = asmap->next) {
788 /* printf("Num failures: %d\n", asmap->numDataFailed); */
789 /* printf("Num sectors: %d\n",
790 * (int)asmap->totalSectorsAccessed); */
791 printf("Stripe %d (%d sectors), failures: %d data, %d parity: ",
792 (int) asmap->stripeID,
793 (int) asmap->totalSectorsAccessed,
794 (int) asmap->numDataFailed,
795 (int) asmap->numParityFailed);
796 if (asmap->parityInfo) {
797 printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row,
798 asmap->parityInfo->col,
799 (int) asmap->parityInfo->startSector,
800 (int) (asmap->parityInfo->startSector +
801 asmap->parityInfo->numSector - 1));
802 if (prbuf)
803 printf(" b0x%lx",
804 (unsigned long) asmap->parityInfo->bufPtr);
805 if (asmap->parityInfo->next) {
806 printf(", r%d c%d s%d-%d",
807 asmap->parityInfo->next->row,
808 asmap->parityInfo->next->col,
809 (int) asmap->parityInfo->next->startSector,
810 (int) (asmap->parityInfo->next->startSector
811 + asmap->parityInfo->next->numSector - 1));
812 if (prbuf)
813 printf(" b0x%lx", (unsigned long)
814 asmap->parityInfo->next->bufPtr);
815 RF_ASSERT(asmap->parityInfo->next->next
816 == NULL);
817 }
818 printf("]\n\t");
819 }
820 for (i = 0, p = asmap->physInfo; p; p = p->next, i++) {
821 printf("SU r%d c%d s%d-%d ", p->row, p->col,
822 (int) p->startSector,
823 (int) (p->startSector + p->numSector - 1));
824 if (prbuf)
825 printf("b0x%lx ", (unsigned long) p->bufPtr);
826 if (i && !(i & 1))
827 printf("\n\t");
828 }
829 printf("\n");
830 p = asm_h->stripeMap->failedPDAs[0];
831 if (asm_h->stripeMap->numDataFailed +
832 asm_h->stripeMap->numParityFailed > 1)
833 printf("[multiple failures]\n");
834 else
835 if (asm_h->stripeMap->numDataFailed +
836 asm_h->stripeMap->numParityFailed > 0)
837 printf("\t[Failed PDA: r%d c%d s%d-%d]\n",
838 p->row, p->col, (int) p->startSector,
839 (int) (p->startSector + p->numSector - 1));
840 }
841 }
842
843 void
rf_PrintRaidAddressInfo(RF_Raid_t * raidPtr,RF_RaidAddr_t raidAddr,RF_SectorCount_t numBlocks)844 rf_PrintRaidAddressInfo(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
845 RF_SectorCount_t numBlocks)
846 {
847 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
848 RF_RaidAddr_t ra, sosAddr =
849 rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
850
851 printf("Raid addrs of SU boundaries from start of stripe to end"
852 " of access:\n\t");
853 for (ra = sosAddr; ra <= raidAddr + numBlocks;
854 ra += layoutPtr->sectorsPerStripeUnit) {
855 printf("%d (0x%x), ", (int) ra, (int) ra);
856 }
857 printf("\n");
858 printf("Offset into stripe unit: %d (0x%x)\n",
859 (int) (raidAddr % layoutPtr->sectorsPerStripeUnit),
860 (int) (raidAddr % layoutPtr->sectorsPerStripeUnit));
861 }
862
863 /*
864 * Given a parity descriptor and the starting address within a stripe,
865 * range restrict the parity descriptor to touch only the correct stuff.
866 */
867 void
rf_ASMParityAdjust(RF_PhysDiskAddr_t * toAdjust,RF_StripeNum_t startAddrWithinStripe,RF_SectorNum_t endAddress,RF_RaidLayout_t * layoutPtr,RF_AccessStripeMap_t * asm_p)868 rf_ASMParityAdjust(
869 RF_PhysDiskAddr_t *toAdjust,
870 RF_StripeNum_t startAddrWithinStripe,
871 RF_SectorNum_t endAddress,
872 RF_RaidLayout_t *layoutPtr,
873 RF_AccessStripeMap_t *asm_p
874 )
875 {
876 RF_PhysDiskAddr_t *new_pda;
877
878 /*
879 * When we're accessing only a portion of one stripe unit, we want the
880 * parity descriptor to identify only the chunk of parity associated
881 * with the data. When the access spans exactly one stripe unit
882 * boundary and is less than a stripe unit in size, it uses two
883 * disjoint regions of the parity unit. When an access spans more
884 * than one stripe unit boundary, it uses all of the parity unit.
885 *
886 * To better handle the case where stripe units are small, we may
887 * eventually want to change the 2nd case so that if the SU size is
888 * below some threshold, we just read/write the whole thing instead of
889 * breaking it up into two accesses.
890 */
891 if (asm_p->numStripeUnitsAccessed == 1) {
892 int x = (startAddrWithinStripe %
893 layoutPtr->sectorsPerStripeUnit);
894 toAdjust->startSector += x;
895 toAdjust->raidAddress += x;
896 toAdjust->numSector = asm_p->physInfo->numSector;
897 RF_ASSERT(toAdjust->numSector != 0);
898 } else
899 if (asm_p->numStripeUnitsAccessed == 2 &&
900 asm_p->totalSectorsAccessed <
901 layoutPtr->sectorsPerStripeUnit) {
902 int x = (startAddrWithinStripe %
903 layoutPtr->sectorsPerStripeUnit);
904
905 /*
906 * Create a second pda and copy the parity map info
907 * into it.
908 */
909 RF_ASSERT(toAdjust->next == NULL);
910 new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
911 *new_pda = *toAdjust; /* Structure assignment. */
912 new_pda->next = NULL;
913
914 /*
915 * Adjust the start sector & number of blocks for the
916 * first parity pda.
917 */
918 toAdjust->startSector += x;
919 toAdjust->raidAddress += x;
920 toAdjust->numSector =
921 rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
922 startAddrWithinStripe) - startAddrWithinStripe;
923 RF_ASSERT(toAdjust->numSector != 0);
924
925 /* Adjust the second pda. */
926 new_pda->numSector = endAddress -
927 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
928 endAddress);
929 /* new_pda->raidAddress =
930 * rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
931 * toAdjust->raidAddress); */
932 RF_ASSERT(new_pda->numSector != 0);
933 }
934 }
935
936 /*
937 * Check if a disk has been spared or failed. If spared, redirect the I/O.
938 * If it has been failed, record it in the asm pointer.
 * The last (fifth) arg is nonzero for a parity unit, zero for a data unit.
940 */
941 void
rf_ASMCheckStatus(RF_Raid_t * raidPtr,RF_PhysDiskAddr_t * pda_p,RF_AccessStripeMap_t * asm_p,RF_RaidDisk_t ** disks,int parity)942 rf_ASMCheckStatus(
943 RF_Raid_t *raidPtr,
944 RF_PhysDiskAddr_t *pda_p,
945 RF_AccessStripeMap_t *asm_p,
946 RF_RaidDisk_t **disks,
947 int parity
948 )
949 {
950 RF_DiskStatus_t dstatus;
951 RF_RowCol_t frow, fcol;
952
953 dstatus = disks[pda_p->row][pda_p->col].status;
954
955 if (dstatus == rf_ds_spared) {
956 /* If the disk has been spared, redirect access to the spare. */
957 frow = pda_p->row;
958 fcol = pda_p->col;
959 pda_p->row = disks[frow][fcol].spareRow;
960 pda_p->col = disks[frow][fcol].spareCol;
961 } else
962 if (dstatus == rf_ds_dist_spared) {
963 /* Ditto if disk has been spared to dist spare space. */
964 RF_RowCol_t or = pda_p->row, oc = pda_p->col;
965 RF_SectorNum_t oo = pda_p->startSector;
966
967 if (pda_p->type == RF_PDA_TYPE_DATA)
968 raidPtr->Layout.map->MapSector(raidPtr,
969 pda_p->raidAddress, &pda_p->row,
970 &pda_p->col, &pda_p->startSector, RF_REMAP);
971 else
972 raidPtr->Layout.map->MapParity(raidPtr,
973 pda_p->raidAddress, &pda_p->row,
974 &pda_p->col, &pda_p->startSector, RF_REMAP);
975
976 if (rf_mapDebug) {
977 printf("Redirected r %d c %d o %d -> r%d c %d"
978 " o %d\n", or, oc, (int) oo, pda_p->row,
979 pda_p->col, (int) pda_p->startSector);
980 }
981 } else
982 if (RF_DEAD_DISK(dstatus)) {
983 /*
984 * If the disk is inaccessible, mark the
985 * failure.
986 */
987 if (parity)
988 asm_p->numParityFailed++;
989 else {
990 asm_p->numDataFailed++;
991 #if 0
992 /*
993 * XXX Do we really want this spewing
994 * out on the console ? GO
995 */
996 printf("DATA_FAILED !\n");
997 #endif
998 }
999 asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
1000 asm_p->numFailedPDAs++;
1001 #if 0
1002 switch (asm_p->numParityFailed +
1003 asm_p->numDataFailed) {
1004 case 1:
1005 asm_p->failedPDAs[0] = pda_p;
1006 break;
1007 case 2:
1008 asm_p->failedPDAs[1] = pda_p;
1009 default:
1010 break;
1011 }
1012 #endif
1013 }
1014 /* The redirected access should never span a stripe unit boundary. */
1015 RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout,
1016 pda_p->raidAddress) ==
1017 rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress +
1018 pda_p->numSector - 1));
1019 RF_ASSERT(pda_p->col != -1);
1020 }
1021