1 /* $OpenBSD: rf_dagutils.c,v 1.4 2002/12/16 07:01:03 tdeval Exp $ */
2 /* $NetBSD: rf_dagutils.c,v 1.6 1999/12/09 02:26:09 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /*****************************************************************************
32 *
33 * rf_dagutils.c -- Utility routines for manipulating dags.
34 *
35 *****************************************************************************/
36
37 #include "rf_archs.h"
38 #include "rf_types.h"
39 #include "rf_threadstuff.h"
40 #include "rf_raid.h"
41 #include "rf_dag.h"
42 #include "rf_dagutils.h"
43 #include "rf_dagfuncs.h"
44 #include "rf_general.h"
45 #include "rf_freelist.h"
46 #include "rf_map.h"
47 #include "rf_shutdown.h"
48
49 #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
50
51 RF_RedFuncs_t rf_xorFuncs = {
52 rf_RegularXorFunc, "Reg Xr", rf_SimpleXorFunc, "Simple Xr"
53 };
54
55 RF_RedFuncs_t rf_xorRecoveryFuncs = {
56 rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"
57 };
58
59 void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
60 void rf_PrintDAG(RF_DagHeader_t *);
61 int rf_ValidateBranch(RF_DagNode_t *, int *, int *, RF_DagNode_t **, int);
62 void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
63 void rf_ValidateVisitedBits(RF_DagHeader_t *);
64
65 /*****************************************************************************
66 *
67 * InitNode - Initialize a dag node.
68 *
69 * The size of the propList array is always the same as that of the
70 * successors array.
71 *
72 *****************************************************************************/
73 void
rf_InitNode(RF_DagNode_t * node,RF_NodeStatus_t initstatus,int commit,int (* doFunc)(RF_DagNode_t *),int (* undoFunc)(RF_DagNode_t * node),int (* wakeFunc)(RF_DagNode_t * node,int),int nSucc,int nAnte,int nParam,int nResult,RF_DagHeader_t * hdr,char * name,RF_AllocListElem_t * alist)74 rf_InitNode(
75 RF_DagNode_t *node,
76 RF_NodeStatus_t initstatus,
77 int commit,
78 int (*doFunc) (RF_DagNode_t *),
79 int (*undoFunc) (RF_DagNode_t *node),
80 int (*wakeFunc) (RF_DagNode_t *node, int),
81 int nSucc,
82 int nAnte,
83 int nParam,
84 int nResult,
85 RF_DagHeader_t *hdr,
86 char *name,
87 RF_AllocListElem_t *alist
88 )
89 {
90 void **ptrs;
91 int nptrs;
92
93 if (nAnte > RF_MAX_ANTECEDENTS)
94 RF_PANIC();
95 node->status = initstatus;
96 node->commitNode = commit;
97 node->doFunc = doFunc;
98 node->undoFunc = undoFunc;
99 node->wakeFunc = wakeFunc;
100 node->numParams = nParam;
101 node->numResults = nResult;
102 node->numAntecedents = nAnte;
103 node->numAntDone = 0;
104 node->next = NULL;
105 node->numSuccedents = nSucc;
106 node->name = name;
107 node->dagHdr = hdr;
108 node->visited = 0;
109
110 /* Allocate all the pointers with one call to malloc. */
111 nptrs = nSucc + nAnte + nResult + nSucc;
112
113 if (nptrs <= RF_DAG_PTRCACHESIZE) {
114 /*
115 * The dag_ptrs field of the node is basically some scribble
116 * space to be used here. We could get rid of it, and always
117 * allocate the range of pointers, but that's expensive. So,
118 * we pick a "common case" size for the pointer cache.
119 * Hopefully, we'll find that:
120 * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
121 * only a little bit (least efficient case).
122 * (2) Generally, ntprs isn't a lot less than
123 * RF_DAG_PTRCACHESIZE (wasted memory).
124 */
125 ptrs = (void **) node->dag_ptrs;
126 } else {
127 RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
128 }
129 node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
130 node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs + nSucc) : NULL;
131 node->results = (nResult) ? (void **) (ptrs + nSucc + nAnte) : NULL;
132 node->propList = (nSucc) ? (RF_PropHeader_t **)
133 (ptrs + nSucc + nAnte + nResult) : NULL;
134
135 if (nParam) {
136 if (nParam <= RF_DAG_PARAMCACHESIZE) {
137 node->params = (RF_DagParam_t *) node->dag_params;
138 } else {
139 RF_CallocAndAdd(node->params, nParam,
140 sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
141 }
142 } else {
143 node->params = NULL;
144 }
145 }
146
147
148
149 /*****************************************************************************
150 *
151 * Allocation and deallocation routines.
152 *
153 *****************************************************************************/
154
155 void
rf_FreeDAG(RF_DagHeader_t * dag_h)156 rf_FreeDAG(RF_DagHeader_t *dag_h)
157 {
158 RF_AccessStripeMapHeader_t *asmap, *t_asmap;
159 RF_DagHeader_t *nextDag;
160 int i;
161
162 while (dag_h) {
163 nextDag = dag_h->next;
164 for (i = 0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) {
165 /* Release mem chunks. */
166 rf_ReleaseMemChunk(dag_h->memChunk[i]);
167 dag_h->memChunk[i] = NULL;
168 }
169
170 RF_ASSERT(i == dag_h->chunkIndex);
171 if (dag_h->xtraChunkCnt > 0) {
172 /* Free xtraMemChunks. */
173 for (i = 0; dag_h->xtraMemChunk[i] &&
174 i < dag_h->xtraChunkIndex; i++) {
175 rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
176 dag_h->xtraMemChunk[i] = NULL;
177 }
178 RF_ASSERT(i == dag_h->xtraChunkIndex);
179 /* Free ptrs to xtraMemChunks. */
180 RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt *
181 sizeof(RF_ChunkDesc_t *));
182 }
183 rf_FreeAllocList(dag_h->allocList);
184 for (asmap = dag_h->asmList; asmap;) {
185 t_asmap = asmap;
186 asmap = asmap->next;
187 rf_FreeAccessStripeMap(t_asmap);
188 }
189 rf_FreeDAGHeader(dag_h);
190 dag_h = nextDag;
191 }
192 }
193
194 RF_PropHeader_t *
rf_MakePropListEntry(RF_DagHeader_t * dag_h,int resultNum,int paramNum,RF_PropHeader_t * next,RF_AllocListElem_t * allocList)195 rf_MakePropListEntry(RF_DagHeader_t *dag_h, int resultNum, int paramNum,
196 RF_PropHeader_t *next, RF_AllocListElem_t *allocList)
197 {
198 RF_PropHeader_t *p;
199
200 RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t), (RF_PropHeader_t *),
201 allocList);
202 p->resultNum = resultNum;
203 p->paramNum = paramNum;
204 p->next = next;
205 return (p);
206 }
207
208 static RF_FreeList_t *rf_dagh_freelist;
209
210 #define RF_MAX_FREE_DAGH 128
211 #define RF_DAGH_INC 16
212 #define RF_DAGH_INITIAL 32
213
214 void rf_ShutdownDAGs(void *);
215 void
rf_ShutdownDAGs(void * ignored)216 rf_ShutdownDAGs(void *ignored)
217 {
218 RF_FREELIST_DESTROY(rf_dagh_freelist, next, (RF_DagHeader_t *));
219 }
220
221 int
rf_ConfigureDAGs(RF_ShutdownList_t ** listp)222 rf_ConfigureDAGs(RF_ShutdownList_t **listp)
223 {
224 int rc;
225
226 RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH, RF_DAGH_INC,
227 sizeof(RF_DagHeader_t));
228 if (rf_dagh_freelist == NULL)
229 return (ENOMEM);
230 rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
231 if (rc) {
232 RF_ERRORMSG3("Unable to add to shutdown list file %s line"
233 " %d rc=%d\n", __FILE__, __LINE__, rc);
234 rf_ShutdownDAGs(NULL);
235 return (rc);
236 }
237 RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL, next,
238 (RF_DagHeader_t *));
239 return (0);
240 }
241
242 RF_DagHeader_t *
rf_AllocDAGHeader(void)243 rf_AllocDAGHeader(void)
244 {
245 RF_DagHeader_t *dh;
246
247 RF_FREELIST_GET(rf_dagh_freelist, dh, next, (RF_DagHeader_t *));
248 if (dh) {
249 bzero((char *) dh, sizeof(RF_DagHeader_t));
250 }
251 return (dh);
252 }
253
254 void
rf_FreeDAGHeader(RF_DagHeader_t * dh)255 rf_FreeDAGHeader(RF_DagHeader_t *dh)
256 {
257 RF_FREELIST_FREE(rf_dagh_freelist, dh, next);
258 }
259
260 /* Allocate a buffer big enough to hold the data described by pda. */
261 void *
rf_AllocBuffer(RF_Raid_t * raidPtr,RF_DagHeader_t * dag_h,RF_PhysDiskAddr_t * pda,RF_AllocListElem_t * allocList)262 rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h,
263 RF_PhysDiskAddr_t *pda, RF_AllocListElem_t *allocList)
264 {
265 char *p;
266
267 RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
268 (char *), allocList);
269 return ((void *) p);
270 }
271
272
273 /*****************************************************************************
274 *
275 * Debug routines.
276 *
277 *****************************************************************************/
278
279 char *
rf_NodeStatusString(RF_DagNode_t * node)280 rf_NodeStatusString(RF_DagNode_t *node)
281 {
282 switch (node->status) {
283 case rf_wait:
284 return ("wait");
285 case rf_fired:
286 return ("fired");
287 case rf_good:
288 return ("good");
289 case rf_bad:
290 return ("bad");
291 default:
292 return ("?");
293 }
294 }
295
296 void
rf_PrintNodeInfoString(RF_DagNode_t * node)297 rf_PrintNodeInfoString(RF_DagNode_t *node)
298 {
299 RF_PhysDiskAddr_t *pda;
300 int (*df) (RF_DagNode_t *) = node->doFunc;
301 int i, lk, unlk;
302 void *bufPtr;
303
304 if ((df == rf_DiskReadFunc) || (df == rf_DiskWriteFunc) ||
305 (df == rf_DiskReadMirrorIdleFunc) ||
306 (df == rf_DiskReadMirrorPartitionFunc)) {
307 pda = (RF_PhysDiskAddr_t *) node->params[0].p;
308 bufPtr = (void *) node->params[1].p;
309 lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
310 unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
311 RF_ASSERT(!(lk && unlk));
312 printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row,
313 pda->col, (long) pda->startSector, (int) pda->numSector,
314 (long) bufPtr, (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
315 return;
316 }
317 if (df == rf_DiskUnlockFunc) {
318 pda = (RF_PhysDiskAddr_t *) node->params[0].p;
319 lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
320 unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
321 RF_ASSERT(!(lk && unlk));
322 printf("r %d c %d %s\n", pda->row, pda->col,
323 (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
324 return;
325 }
326 if ((df == rf_SimpleXorFunc) || (df == rf_RegularXorFunc)
327 || (df == rf_RecoveryXorFunc)) {
328 printf("result buf 0x%lx\n", (long) node->results[0]);
329 for (i = 0; i < node->numParams - 1; i += 2) {
330 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
331 bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
332 printf(" buf 0x%lx r%d c%d offs %ld nsect %d\n",
333 (long) bufPtr, pda->row, pda->col,
334 (long) pda->startSector, (int) pda->numSector);
335 }
336 return;
337 }
338 #if RF_INCLUDE_PARITYLOGGING > 0
339 if (df == rf_ParityLogOverwriteFunc || df == rf_ParityLogUpdateFunc) {
340 for (i = 0; i < node->numParams - 1; i += 2) {
341 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
342 bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
343 printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
344 pda->row, pda->col, (long) pda->startSector,
345 (int) pda->numSector, (long) bufPtr);
346 }
347 return;
348 }
349 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
350
351 if ((df == rf_TerminateFunc) || (df == rf_NullNodeFunc)) {
352 printf("\n");
353 return;
354 }
355 printf("?\n");
356 }
357
358 void
rf_RecurPrintDAG(RF_DagNode_t * node,int depth,int unvisited)359 rf_RecurPrintDAG(RF_DagNode_t *node, int depth, int unvisited)
360 {
361 char *anttype;
362 int i;
363
364 node->visited = (unvisited) ? 0 : 1;
365 printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
366 node->nodeNum, node->commitNode, node->name,
367 rf_NodeStatusString(node), node->numSuccedents,
368 node->numSuccFired, node->numSuccDone,
369 node->numAntecedents, node->numAntDone,
370 node->numParams, node->numResults);
371 for (i = 0; i < node->numSuccedents; i++) {
372 printf("%d%s", node->succedents[i]->nodeNum,
373 ((i == node->numSuccedents - 1) ? "\0" : " "));
374 }
375 printf("} A{");
376 for (i = 0; i < node->numAntecedents; i++) {
377 switch (node->antType[i]) {
378 case rf_trueData:
379 anttype = "T";
380 break;
381 case rf_antiData:
382 anttype = "A";
383 break;
384 case rf_outputData:
385 anttype = "O";
386 break;
387 case rf_control:
388 anttype = "C";
389 break;
390 default:
391 anttype = "?";
392 break;
393 }
394 printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype,
395 (i == node->numAntecedents - 1) ? "\0" : " ");
396 }
397 printf("}; ");
398 rf_PrintNodeInfoString(node);
399 for (i = 0; i < node->numSuccedents; i++) {
400 if (node->succedents[i]->visited == unvisited)
401 rf_RecurPrintDAG(node->succedents[i], depth + 1,
402 unvisited);
403 }
404 }
405
406 void
rf_PrintDAG(RF_DagHeader_t * dag_h)407 rf_PrintDAG(RF_DagHeader_t *dag_h)
408 {
409 int unvisited, i;
410 char *status;
411
412 /* Set dag status. */
413 switch (dag_h->status) {
414 case rf_enable:
415 status = "enable";
416 break;
417 case rf_rollForward:
418 status = "rollForward";
419 break;
420 case rf_rollBackward:
421 status = "rollBackward";
422 break;
423 default:
424 status = "illegal !";
425 break;
426 }
427 /* Find out if visited bits are currently set or cleared. */
428 unvisited = dag_h->succedents[0]->visited;
429
430 printf("DAG type: %s\n", dag_h->creator);
431 printf("format is (depth) num commit type: status,nSucc nSuccFired/n"
432 "SuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
433 printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
434 status, dag_h->numSuccedents, dag_h->numCommitNodes,
435 dag_h->numCommits);
436 for (i = 0; i < dag_h->numSuccedents; i++) {
437 printf("%d%s", dag_h->succedents[i]->nodeNum,
438 ((i == dag_h->numSuccedents - 1) ? "\0" : " "));
439 }
440 printf("};\n");
441 for (i = 0; i < dag_h->numSuccedents; i++) {
442 if (dag_h->succedents[i]->visited == unvisited)
443 rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
444 }
445 }
446
447 /* Assign node numbers. */
448 int
rf_AssignNodeNums(RF_DagHeader_t * dag_h)449 rf_AssignNodeNums(RF_DagHeader_t *dag_h)
450 {
451 int unvisited, i, nnum;
452 RF_DagNode_t *node;
453
454 nnum = 0;
455 unvisited = dag_h->succedents[0]->visited;
456
457 dag_h->nodeNum = nnum++;
458 for (i = 0; i < dag_h->numSuccedents; i++) {
459 node = dag_h->succedents[i];
460 if (node->visited == unvisited) {
461 nnum = rf_RecurAssignNodeNums(dag_h->succedents[i],
462 nnum, unvisited);
463 }
464 }
465 return (nnum);
466 }
467
468 int
rf_RecurAssignNodeNums(RF_DagNode_t * node,int num,int unvisited)469 rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited)
470 {
471 int i;
472
473 node->visited = (unvisited) ? 0 : 1;
474
475 node->nodeNum = num++;
476 for (i = 0; i < node->numSuccedents; i++) {
477 if (node->succedents[i]->visited == unvisited) {
478 num = rf_RecurAssignNodeNums(node->succedents[i],
479 num, unvisited);
480 }
481 }
482 return (num);
483 }
484
485 /* Set the header pointers in each node to "newptr". */
486 void
rf_ResetDAGHeaderPointers(RF_DagHeader_t * dag_h,RF_DagHeader_t * newptr)487 rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr)
488 {
489 int i;
490
491 for (i = 0; i < dag_h->numSuccedents; i++)
492 if (dag_h->succedents[i]->dagHdr != newptr)
493 rf_RecurResetDAGHeaderPointers(dag_h->succedents[i],
494 newptr);
495 }
496
497 void
rf_RecurResetDAGHeaderPointers(RF_DagNode_t * node,RF_DagHeader_t * newptr)498 rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr)
499 {
500 int i;
501
502 node->dagHdr = newptr;
503 for (i = 0; i < node->numSuccedents; i++)
504 if (node->succedents[i]->dagHdr != newptr)
505 rf_RecurResetDAGHeaderPointers(node->succedents[i],
506 newptr);
507 }
508
509 void
rf_PrintDAGList(RF_DagHeader_t * dag_h)510 rf_PrintDAGList(RF_DagHeader_t *dag_h)
511 {
512 int i = 0;
513
514 for (; dag_h; dag_h = dag_h->next) {
515 rf_AssignNodeNums(dag_h);
516 printf("\n\nDAG %d IN LIST:\n", i++);
517 rf_PrintDAG(dag_h);
518 }
519 }
520
521 int
rf_ValidateBranch(RF_DagNode_t * node,int * scount,int * acount,RF_DagNode_t ** nodes,int unvisited)522 rf_ValidateBranch(RF_DagNode_t *node, int *scount, int *acount,
523 RF_DagNode_t **nodes, int unvisited)
524 {
525 int i, retcode = 0;
526
527 /* Construct an array of node pointers indexed by node num. */
528 node->visited = (unvisited) ? 0 : 1;
529 nodes[node->nodeNum] = node;
530
531 if (node->next != NULL) {
532 printf("INVALID DAG: next pointer in node is not NULL.\n");
533 retcode = 1;
534 }
535 if (node->status != rf_wait) {
536 printf("INVALID DAG: Node status is not wait.\n");
537 retcode = 1;
538 }
539 if (node->numAntDone != 0) {
540 printf("INVALID DAG: numAntDone is not zero.\n");
541 retcode = 1;
542 }
543 if (node->doFunc == rf_TerminateFunc) {
544 if (node->numSuccedents != 0) {
545 printf("INVALID DAG: Terminator node has"
546 " succedents.\n");
547 retcode = 1;
548 }
549 } else {
550 if (node->numSuccedents == 0) {
551 printf("INVALID DAG: Non-terminator node has no"
552 " succedents\n");
553 retcode = 1;
554 }
555 }
556 for (i = 0; i < node->numSuccedents; i++) {
557 if (!node->succedents[i]) {
558 printf("INVALID DAG: succedent %d of node %s"
559 " is NULL.\n", i, node->name);
560 retcode = 1;
561 }
562 scount[node->succedents[i]->nodeNum]++;
563 }
564 for (i = 0; i < node->numAntecedents; i++) {
565 if (!node->antecedents[i]) {
566 printf("INVALID DAG: antecedent %d of node %s is"
567 " NULL.\n", i, node->name);
568 retcode = 1;
569 }
570 acount[node->antecedents[i]->nodeNum]++;
571 }
572 for (i = 0; i < node->numSuccedents; i++) {
573 if (node->succedents[i]->visited == unvisited) {
574 if (rf_ValidateBranch(node->succedents[i], scount,
575 acount, nodes, unvisited)) {
576 retcode = 1;
577 }
578 }
579 }
580 return (retcode);
581 }
582
583 void
rf_ValidateBranchVisitedBits(RF_DagNode_t * node,int unvisited,int rl)584 rf_ValidateBranchVisitedBits(RF_DagNode_t *node, int unvisited, int rl)
585 {
586 int i;
587
588 RF_ASSERT(node->visited == unvisited);
589 for (i = 0; i < node->numSuccedents; i++) {
590 if (node->succedents[i] == NULL) {
591 printf("node=%lx node->succedents[%d] is NULL.\n",
592 (long) node, i);
593 RF_ASSERT(0);
594 }
595 rf_ValidateBranchVisitedBits(node->succedents[i],
596 unvisited, rl + 1);
597 }
598 }
599
600 /*
601 * NOTE: Never call this on a big dag, because it is exponential
602 * in execution time.
603 */
604 void
rf_ValidateVisitedBits(RF_DagHeader_t * dag)605 rf_ValidateVisitedBits(RF_DagHeader_t *dag)
606 {
607 int i, unvisited;
608
609 unvisited = dag->succedents[0]->visited;
610
611 for (i = 0; i < dag->numSuccedents; i++) {
612 if (dag->succedents[i] == NULL) {
613 printf("dag=%lx dag->succedents[%d] is NULL.\n",
614 (long) dag, i);
615 RF_ASSERT(0);
616 }
617 rf_ValidateBranchVisitedBits(dag->succedents[i], unvisited, 0);
618 }
619 }
620
621 /*
622 * Validate a DAG. _at entry_ verify that:
623 * -- numNodesCompleted is zero
624 * -- node queue is null
625 * -- dag status is rf_enable
626 * -- next pointer is null on every node
627 * -- all nodes have status wait
628 * -- numAntDone is zero in all nodes
629 * -- terminator node has zero successors
630 * -- no other node besides terminator has zero successors
631 * -- no successor or antecedent pointer in a node is NULL
632 * -- number of times that each node appears as a successor of another node
633 * is equal to the antecedent count on that node
634 * -- number of times that each node appears as an antecedent of another node
635 * is equal to the succedent count on that node
636 * -- what else ?
637 */
638 int
rf_ValidateDAG(RF_DagHeader_t * dag_h)639 rf_ValidateDAG(RF_DagHeader_t *dag_h)
640 {
641 int i, nodecount;
642 int *scount, *acount; /* Per-node successor and antecedent counts. */
643 RF_DagNode_t **nodes; /* Array of ptrs to nodes in dag. */
644 int retcode = 0;
645 int unvisited;
646 int commitNodeCount = 0;
647
648 if (rf_validateVisitedDebug)
649 rf_ValidateVisitedBits(dag_h);
650
651 if (dag_h->numNodesCompleted != 0) {
652 printf("INVALID DAG: num nodes completed is %d, should be 0.\n",
653 dag_h->numNodesCompleted);
654 retcode = 1;
655 goto validate_dag_bad;
656 }
657 if (dag_h->status != rf_enable) {
658 printf("INVALID DAG: not enabled.\n");
659 retcode = 1;
660 goto validate_dag_bad;
661 }
662 if (dag_h->numCommits != 0) {
663 printf("INVALID DAG: numCommits != 0 (%d)\n",
664 dag_h->numCommits);
665 retcode = 1;
666 goto validate_dag_bad;
667 }
668 if (dag_h->numSuccedents != 1) {
669 /* Currently, all dags must have only one succedent. */
670 printf("INVALID DAG: numSuccedents != 1 (%d).\n",
671 dag_h->numSuccedents);
672 retcode = 1;
673 goto validate_dag_bad;
674 }
675 nodecount = rf_AssignNodeNums(dag_h);
676
677 unvisited = dag_h->succedents[0]->visited;
678
679 RF_Calloc(scount, nodecount, sizeof(int), (int *));
680 RF_Calloc(acount, nodecount, sizeof(int), (int *));
681 RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
682 for (i = 0; i < dag_h->numSuccedents; i++) {
683 if ((dag_h->succedents[i]->visited == unvisited)
684 && rf_ValidateBranch(dag_h->succedents[i], scount,
685 acount, nodes, unvisited)) {
686 retcode = 1;
687 }
688 }
689 /* Start at 1 to skip the header node. */
690 for (i = 1; i < nodecount; i++) {
691 if (nodes[i]->commitNode)
692 commitNodeCount++;
693 if (nodes[i]->doFunc == NULL) {
694 printf("INVALID DAG: node %s has an undefined"
695 " doFunc.\n", nodes[i]->name);
696 retcode = 1;
697 goto validate_dag_out;
698 }
699 if (nodes[i]->undoFunc == NULL) {
700 printf("INVALID DAG: node %s has an undefined"
701 " doFunc.\n", nodes[i]->name);
702 retcode = 1;
703 goto validate_dag_out;
704 }
705 if (nodes[i]->numAntecedents != scount[nodes[i]->nodeNum]) {
706 printf("INVALID DAG: node %s has %d antecedents but"
707 " appears as a succedent %d times.\n",
708 nodes[i]->name, nodes[i]->numAntecedents,
709 scount[nodes[i]->nodeNum]);
710 retcode = 1;
711 goto validate_dag_out;
712 }
713 if (nodes[i]->numSuccedents != acount[nodes[i]->nodeNum]) {
714 printf("INVALID DAG: node %s has %d succedents but"
715 " appears as an antecedent %d times.\n",
716 nodes[i]->name, nodes[i]->numSuccedents,
717 acount[nodes[i]->nodeNum]);
718 retcode = 1;
719 goto validate_dag_out;
720 }
721 }
722
723 if (dag_h->numCommitNodes != commitNodeCount) {
724 printf("INVALID DAG: incorrect commit node count. "
725 "hdr->numCommitNodes (%d) found (%d) commit nodes"
726 " in graph.\n",
727 dag_h->numCommitNodes, commitNodeCount);
728 retcode = 1;
729 goto validate_dag_out;
730 }
731
732 validate_dag_out:
733 RF_Free(scount, nodecount * sizeof(int));
734 RF_Free(acount, nodecount * sizeof(int));
735 RF_Free(nodes, nodecount * sizeof(RF_DagNode_t *));
736 if (retcode)
737 rf_PrintDAGList(dag_h);
738
739 if (rf_validateVisitedDebug)
740 rf_ValidateVisitedBits(dag_h);
741
742 return (retcode);
743
744 validate_dag_bad:
745 rf_PrintDAGList(dag_h);
746 return (retcode);
747 }
748
749
750 /*****************************************************************************
751 *
752 * Misc construction routines.
753 *
754 *****************************************************************************/
755
756 void
rf_redirect_asm(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap)757 rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
758 {
759 int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
760 int row = asmap->physInfo->row;
761 int fcol = raidPtr->reconControl[row]->fcol;
762 int srow = raidPtr->reconControl[row]->spareRow;
763 int scol = raidPtr->reconControl[row]->spareCol;
764 RF_PhysDiskAddr_t *pda;
765
766 RF_ASSERT(raidPtr->status[row] == rf_rs_reconstructing);
767 for (pda = asmap->physInfo; pda; pda = pda->next) {
768 if (pda->col == fcol) {
769 if (rf_dagDebug) {
770 if (!rf_CheckRUReconstructed(
771 raidPtr->reconControl[row]->reconMap,
772 pda->startSector)) {
773 RF_PANIC();
774 }
775 }
776 /*printf("Remapped data for large write\n");*/
777 if (ds) {
778 raidPtr->Layout.map->MapSector(raidPtr,
779 pda->raidAddress, &pda->row, &pda->col,
780 &pda->startSector, RF_REMAP);
781 } else {
782 pda->row = srow;
783 pda->col = scol;
784 }
785 }
786 }
787 for (pda = asmap->parityInfo; pda; pda = pda->next) {
788 if (pda->col == fcol) {
789 if (rf_dagDebug) {
790 if (!rf_CheckRUReconstructed(
791 raidPtr->reconControl[row]->reconMap,
792 pda->startSector)) {
793 RF_PANIC();
794 }
795 }
796 }
797 if (ds) {
798 (raidPtr->Layout.map->MapParity) (raidPtr,
799 pda->raidAddress, &pda->row, &pda->col,
800 &pda->startSector, RF_REMAP);
801 } else {
802 pda->row = srow;
803 pda->col = scol;
804 }
805 }
806 }
807
808
809 /*
810 * This routine allocates read buffers and generates stripe maps for the
811 * regions of the array from the start of the stripe to the start of the
812 * access, and from the end of the access to the end of the stripe. It also
813 * computes and returns the number of DAG nodes needed to read all this data.
814 * Note that this routine does the wrong thing if the access is fully
815 * contained within one stripe unit, so we RF_ASSERT against this case at the
816 * start.
817 */
818 void
rf_MapUnaccessedPortionOfStripe(RF_Raid_t * raidPtr,RF_RaidLayout_t * layoutPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,RF_AccessStripeMapHeader_t ** new_asm_h,int * nRodNodes,char ** sosBuffer,char ** eosBuffer,RF_AllocListElem_t * allocList)819 rf_MapUnaccessedPortionOfStripe(
820 RF_Raid_t *raidPtr,
821 RF_RaidLayout_t *layoutPtr, /* in: layout information */
822 RF_AccessStripeMap_t *asmap, /* in: access stripe map */
823 RF_DagHeader_t *dag_h, /* in: header of the dag */
824 /* to create */
825 RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 */
826 /* headers, to be */
827 /* filled in */
828 int *nRodNodes, /* out: num nodes to be */
829 /* generated to read */
830 /* unaccessed data */
831 char **sosBuffer, /* out: pointers to newly */
832 /* allocated buffer */
833 char **eosBuffer,
834 RF_AllocListElem_t *allocList
835 )
836 {
837 RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
838 RF_SectorNum_t sosNumSector, eosNumSector;
839
840 RF_ASSERT(asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol / 2));
841 /*
842 * Generate an access map for the region of the array from start of
843 * stripe to start of access.
844 */
845 new_asm_h[0] = new_asm_h[1] = NULL;
846 *nRodNodes = 0;
847 if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
848 sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
849 asmap->raidAddress);
850 sosNumSector = asmap->raidAddress - sosRaidAddress;
851 RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr,
852 sosNumSector), (char *), allocList);
853 new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress,
854 sosNumSector, *sosBuffer, RF_DONT_REMAP);
855 new_asm_h[0]->next = dag_h->asmList;
856 dag_h->asmList = new_asm_h[0];
857 *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
858
859 RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
860 /* We're totally within one stripe here. */
861 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
862 rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
863 }
864 /*
865 * Generate an access map for the region of the array from end of
866 * access to end of stripe.
867 */
868 if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
869 eosRaidAddress = asmap->endRaidAddress;
870 eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
871 eosRaidAddress) - eosRaidAddress;
872 RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr,
873 eosNumSector), (char *), allocList);
874 new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress,
875 eosNumSector, *eosBuffer, RF_DONT_REMAP);
876 new_asm_h[1]->next = dag_h->asmList;
877 dag_h->asmList = new_asm_h[1];
878 *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
879
880 RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
881 /* We're totally within one stripe here. */
882 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
883 rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
884 }
885 }
886
887
888 /* Returns non-zero if the indicated ranges of stripe unit offsets overlap. */
889 int
rf_PDAOverlap(RF_RaidLayout_t * layoutPtr,RF_PhysDiskAddr_t * src,RF_PhysDiskAddr_t * dest)890 rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, RF_PhysDiskAddr_t *src,
891 RF_PhysDiskAddr_t *dest)
892 {
893 RF_SectorNum_t soffs =
894 rf_StripeUnitOffset(layoutPtr, src->startSector);
895 RF_SectorNum_t doffs =
896 rf_StripeUnitOffset(layoutPtr, dest->startSector);
897 /* Use -1 to be sure we stay within SU. */
898 RF_SectorNum_t send =
899 rf_StripeUnitOffset(layoutPtr, src->startSector +
900 src->numSector - 1);
901 RF_SectorNum_t dend =
902 rf_StripeUnitOffset(layoutPtr, dest->startSector +
903 dest->numSector - 1);
904
905 return ((RF_MAX(soffs, doffs) <= RF_MIN(send, dend)) ? 1 : 0);
906 }
907
908
909 /*
910 * GenerateFailedAccessASMs
911 *
912 * This routine figures out what portion of the stripe needs to be read
913 * to effect the degraded read or write operation. It's primary function
914 * is to identify everything required to recover the data, and then
915 * eliminate anything that is already being accessed by the user.
916 *
917 * The main result is two new ASMs, one for the region from the start of the
918 * stripe to the start of the access, and one for the region from the end of
919 * the access to the end of the stripe. These ASMs describe everything that
920 * needs to be read to effect the degraded access. Other results are:
921 * nXorBufs -- The total number of buffers that need to be XORed together
922 * to recover the lost data,
923 * rpBufPtr -- Ptr to a newly-allocated buffer to hold the parity. If NULL
924 * at entry, not allocated.
925 * overlappingPDAs --
926 * Describes which of the non-failed PDAs, in the user access,
927 * overlap data that needs to be read to effect recovery.
928 * overlappingPDAs[i]==1 if and only if, neglecting the failed
929 * PDA, the i'th pda in the input asm overlaps data that needs
930 * to be read for recovery.
931 */
932 /* in: asmap - ASM for the actual access, one stripe only. */
933 /* in: faildPDA - Which component of the access has failed. */
934 /* in: dag_h - Header of the DAG we're going to create. */
935 /* out: new_asm_h - The two new ASMs. */
936 /* out: nXorBufs - The total number of xor bufs required. */
937 /* out: rpBufPtr - A buffer for the parity read. */
938 void
rf_GenerateFailedAccessASMs(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_PhysDiskAddr_t * failedPDA,RF_DagHeader_t * dag_h,RF_AccessStripeMapHeader_t ** new_asm_h,int * nXorBufs,char ** rpBufPtr,char * overlappingPDAs,RF_AllocListElem_t * allocList)939 rf_GenerateFailedAccessASMs(
940 RF_Raid_t *raidPtr,
941 RF_AccessStripeMap_t *asmap,
942 RF_PhysDiskAddr_t *failedPDA,
943 RF_DagHeader_t *dag_h,
944 RF_AccessStripeMapHeader_t **new_asm_h,
945 int *nXorBufs,
946 char **rpBufPtr,
947 char *overlappingPDAs,
948 RF_AllocListElem_t *allocList
949 )
950 {
951 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
952
953 /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
954 RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
955
956 RF_SectorCount_t numSect[2], numParitySect;
957 RF_PhysDiskAddr_t *pda;
958 char *rdBuf, *bufP;
959 int foundit, i;
960
961 bufP = NULL;
962 foundit = 0;
963 /*
964 * First compute the following raid addresses:
965 * - Start of stripe
966 * - (sosAddr) MIN(start of access, start of failed SU)
967 * - (sosEndAddr) MAX(end of access, end of failed SU)
968 * - (eosStartAddr) end of stripe (i.e. start of next stripe)
969 * (eosAddr)
970 */
971 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
972 asmap->raidAddress);
973 sosEndAddr = RF_MIN(asmap->raidAddress,
974 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
975 failedPDA->raidAddress));
976 eosStartAddr = RF_MAX(asmap->endRaidAddress,
977 rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
978 failedPDA->raidAddress));
979 eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
980 asmap->raidAddress);
981
982 /*
983 * Now generate access stripe maps for each of the above regions of
984 * the stripe. Use a dummy (NULL) buf ptr for now.
985 */
986
987 new_asm_h[0] = (sosAddr != sosEndAddr) ?
988 rf_MapAccess(raidPtr, sosAddr, sosEndAddr - sosAddr, NULL,
989 RF_DONT_REMAP) : NULL;
990 new_asm_h[1] = (eosStartAddr != eosAddr) ?
991 rf_MapAccess(raidPtr, eosStartAddr, eosAddr - eosStartAddr, NULL,
992 RF_DONT_REMAP) : NULL;
993
994 /*
995 * Walk through the PDAs and range-restrict each SU to the region of
996 * the SU touched on the failed PDA. Also compute total data buffer
997 * space requirements in this step. Ignore the parity for now.
998 */
999
1000 numSect[0] = numSect[1] = 0;
1001 if (new_asm_h[0]) {
1002 new_asm_h[0]->next = dag_h->asmList;
1003 dag_h->asmList = new_asm_h[0];
1004 for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
1005 pda = pda->next) {
1006 rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
1007 RF_RESTRICT_NOBUFFER, 0);
1008 numSect[0] += pda->numSector;
1009 }
1010 }
1011 if (new_asm_h[1]) {
1012 new_asm_h[1]->next = dag_h->asmList;
1013 dag_h->asmList = new_asm_h[1];
1014 for (pda = new_asm_h[1]->stripeMap->physInfo;
1015 pda; pda = pda->next) {
1016 rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
1017 RF_RESTRICT_NOBUFFER, 0);
1018 numSect[1] += pda->numSector;
1019 }
1020 }
1021 numParitySect = failedPDA->numSector;
1022
1023 /*
1024 * Allocate buffer space for the data & parity we have to read to
1025 * recover from the failure.
1026 */
1027
1028 if (numSect[0] + numSect[1] + ((rpBufPtr) ? numParitySect : 0)) {
1029 /* Don't allocate parity buf if not needed. */
1030 RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,
1031 numSect[0] + numSect[1] + numParitySect), (char *),
1032 allocList);
1033 bufP = rdBuf;
1034 if (rf_degDagDebug)
1035 printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
1036 (int) rf_RaidAddressToByte(raidPtr,
1037 numSect[0] + numSect[1] + numParitySect),
1038 (unsigned long) bufP);
1039 }
1040 /*
1041 * Now walk through the pdas one last time and assign buffer pointers
1042 * (ugh!). Again, ignore the parity. Also, count nodes to find out
1043 * how many bufs need to be xored together.
1044 */
1045 (*nXorBufs) = 1; /* In read case, 1 is for parity. */
1046 /* In write case, 1 is for failed data. */
1047 if (new_asm_h[0]) {
1048 for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
1049 pda = pda->next) {
1050 pda->bufPtr = bufP;
1051 bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
1052 }
1053 *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
1054 }
1055 if (new_asm_h[1]) {
1056 for (pda = new_asm_h[1]->stripeMap->physInfo; pda;
1057 pda = pda->next) {
1058 pda->bufPtr = bufP;
1059 bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
1060 }
1061 (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
1062 }
1063 if (rpBufPtr)
1064 /* The rest of the buffer is for parity. */
1065 *rpBufPtr = bufP;
1066
1067 /*
1068 * The last step is to figure out how many more distinct buffers need
1069 * to get xor'd to produce the missing unit. there's one for each
1070 * user-data read node that overlaps the portion of the failed unit
1071 * being accessed.
1072 */
1073
1074 for (foundit = i = 0, pda = asmap->physInfo;
1075 pda; i++, pda = pda->next) {
1076 if (pda == failedPDA) {
1077 i--;
1078 foundit = 1;
1079 continue;
1080 }
1081 if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
1082 overlappingPDAs[i] = 1;
1083 (*nXorBufs)++;
1084 }
1085 }
1086 if (!foundit) {
1087 RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA"
1088 " in asm list.\n");
1089 RF_ASSERT(0);
1090 }
1091 if (rf_degDagDebug) {
1092 if (new_asm_h[0]) {
1093 printf("First asm:\n");
1094 rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
1095 }
1096 if (new_asm_h[1]) {
1097 printf("Second asm:\n");
1098 rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
1099 }
1100 }
1101 }
1102
1103
1104 /*
1105 * Adjust the offset and number of sectors in the destination pda so that
1106 * it covers at most the region of the SU covered by the source PDA. This
1107 * is exclusively a restriction: the number of sectors indicated by the
1108 * target PDA can only shrink.
1109 *
1110 * For example: s = sectors within SU indicated by source PDA
1111 * d = sectors within SU indicated by dest PDA
1112 * r = results, stored in dest PDA
1113 *
1114 * |--------------- one stripe unit ---------------------|
1115 * | sssssssssssssssssssssssssssssssss |
1116 * | ddddddddddddddddddddddddddddddddddddddddddddd |
1117 * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
1118 *
1119 * Another example:
1120 *
1121 * |--------------- one stripe unit ---------------------|
1122 * | sssssssssssssssssssssssssssssssss |
1123 * | ddddddddddddddddddddddd |
1124 * | rrrrrrrrrrrrrrrr |
1125 *
1126 */
1127 void
rf_RangeRestrictPDA(RF_Raid_t * raidPtr,RF_PhysDiskAddr_t * src,RF_PhysDiskAddr_t * dest,int dobuffer,int doraidaddr)1128 rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src,
1129 RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr)
1130 {
1131 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1132 RF_SectorNum_t soffs =
1133 rf_StripeUnitOffset(layoutPtr, src->startSector);
1134 RF_SectorNum_t doffs =
1135 rf_StripeUnitOffset(layoutPtr, dest->startSector);
1136 RF_SectorNum_t send =
1137 rf_StripeUnitOffset(layoutPtr, src->startSector +
1138 src->numSector - 1); /* Use -1 to be sure we stay within SU. */
1139 RF_SectorNum_t dend =
1140 rf_StripeUnitOffset(layoutPtr, dest->startSector +
1141 dest->numSector - 1);
1142 RF_SectorNum_t subAddr =
1143 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
1144 dest->startSector); /* Stripe unit boundary. */
1145
1146 dest->startSector = subAddr + RF_MAX(soffs, doffs);
1147 dest->numSector = subAddr + RF_MIN(send, dend) + 1 - dest->startSector;
1148
1149 if (dobuffer)
1150 dest->bufPtr += (soffs > doffs) ?
1151 rf_RaidAddressToByte(raidPtr, soffs - doffs) : 0;
1152 if (doraidaddr) {
1153 dest->raidAddress =
1154 rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
1155 dest->raidAddress) +
1156 rf_StripeUnitOffset(layoutPtr, dest->startSector);
1157 }
1158 }
1159
1160 /*
1161 * Want the highest of these primes to be the largest one
1162 * less than the max expected number of columns (won't hurt
1163 * to be too small or too large, but won't be optimal, either)
1164 * --jimz
1165 */
1166 #define NLOWPRIMES 8
1167 static int lowprimes[NLOWPRIMES] = {2, 3, 5, 7, 11, 13, 17, 19};
1168
1169 /*****************************************************************************
1170 * Compute the workload shift factor. (chained declustering)
1171 *
1172 * Return nonzero if access should shift to secondary, otherwise,
1173 * access is to primary.
1174 *****************************************************************************/
1175 int
rf_compute_workload_shift(RF_Raid_t * raidPtr,RF_PhysDiskAddr_t * pda)1176 rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda)
1177 {
1178 /*
1179 * Variables:
1180 * d = Column of disk containing primary.
1181 * f = Column of failed disk.
1182 * n = Number of disks in array.
1183 * sd = "shift distance"
1184 * (number of columns that d is to the right of f).
1185 * row = Row of array the access is in.
1186 * v = Numerator of redirection ratio.
1187 * k = Denominator of redirection ratio.
1188 */
1189 RF_RowCol_t d, f, sd, row, n;
1190 int k, v, ret, i;
1191
1192 row = pda->row;
1193 n = raidPtr->numCol;
1194
1195 /* Assign column of primary copy to d. */
1196 d = pda->col;
1197
1198 /* Assign column of dead disk to f. */
1199 for (f = 0; ((!RF_DEAD_DISK(raidPtr->Disks[row][f].status)) &&
1200 (f < n)); f++);
1201
1202 RF_ASSERT(f < n);
1203 RF_ASSERT(f != d);
1204
1205 sd = (f > d) ? (n + d - f) : (d - f);
1206 RF_ASSERT(sd < n);
1207
1208 /*
1209 * v of every k accesses should be redirected.
1210 *
1211 * v/k := (n-1-sd)/(n-1)
1212 */
1213 v = (n - 1 - sd);
1214 k = (n - 1);
1215
1216 #if 1
1217 /*
1218 * XXX
1219 * Is this worth it ?
1220 *
1221 * Now reduce the fraction, by repeatedly factoring
1222 * out primes (just like they teach in elementary school !).
1223 */
1224 for (i = 0; i < NLOWPRIMES; i++) {
1225 if (lowprimes[i] > v)
1226 break;
1227 while (((v % lowprimes[i]) == 0) && ((k % lowprimes[i]) == 0)) {
1228 v /= lowprimes[i];
1229 k /= lowprimes[i];
1230 }
1231 }
1232 #endif
1233
1234 raidPtr->hist_diskreq[row][d]++;
1235 if (raidPtr->hist_diskreq[row][d] > v) {
1236 ret = 0; /* Do not redirect. */
1237 } else {
1238 ret = 1; /* Redirect. */
1239 }
1240
1241 #if 0
1242 printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
1243 raidPtr->hist_diskreq[row][d]);
1244 #endif
1245
1246 if (raidPtr->hist_diskreq[row][d] >= k) {
1247 /* Reset counter. */
1248 raidPtr->hist_diskreq[row][d] = 0;
1249 }
1250 return (ret);
1251 }
1252
1253 /*
1254 * Disk selection routines.
1255 */
1256
1257 /*
1258 * Select the disk with the shortest queue from a mirror pair.
1259 * Both the disk I/Os queued in RAIDframe as well as those at the physical
1260 * disk are counted as members of the "queue".
1261 */
1262 void
rf_SelectMirrorDiskIdle(RF_DagNode_t * node)1263 rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
1264 {
1265 RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1266 RF_RowCol_t rowData, colData, rowMirror, colMirror;
1267 int dataQueueLength, mirrorQueueLength, usemirror;
1268 RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
1269 RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
1270 RF_PhysDiskAddr_t *tmp_pda;
1271 RF_RaidDisk_t **disks = raidPtr->Disks;
1272 RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1273
1274 /* Return the [row col] of the disk with the shortest queue. */
1275 rowData = data_pda->row;
1276 colData = data_pda->col;
1277 rowMirror = mirror_pda->row;
1278 colMirror = mirror_pda->col;
1279 dataQueue = &(dqs[rowData][colData]);
1280 mirrorQueue = &(dqs[rowMirror][colMirror]);
1281
1282 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1283 RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1284 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1285 dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
1286 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1287 RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1288 RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1289 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1290 mirrorQueueLength = mirrorQueue->queueLength +
1291 mirrorQueue->numOutstanding;
1292 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1293 RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1294 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1295
1296 usemirror = 0;
1297 if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1298 usemirror = 0;
1299 } else
1300 if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1301 usemirror = 1;
1302 } else
1303 if (raidPtr->parity_good == RF_RAID_DIRTY) {
1304 /* Trust only the main disk. */
1305 usemirror = 0;
1306 } else
1307 if (dataQueueLength < mirrorQueueLength) {
1308 usemirror = 0;
1309 } else
1310 if (mirrorQueueLength < dataQueueLength) {
1311 usemirror = 1;
1312 } else {
1313 /* Queues are equal length. */
1314 /* Attempt cleverness. */
1315 if (SNUM_DIFF(dataQueue
1316 ->last_deq_sector, data_pda
1317 ->startSector) <=
1318 SNUM_DIFF(mirrorQueue
1319 ->last_deq_sector, mirror_pda
1320 ->startSector)) {
1321 usemirror = 0;
1322 } else {
1323 usemirror = 1;
1324 }
1325 }
1326
1327 if (usemirror) {
1328 /* Use mirror (parity) disk, swap params 0 & 4. */
1329 tmp_pda = data_pda;
1330 node->params[0].p = mirror_pda;
1331 node->params[4].p = tmp_pda;
1332 } else {
1333 /* Use data disk, leave param 0 unchanged. */
1334 }
1335 /*printf("dataQueueLength %d, mirrorQueueLength %d\n", dataQueueLength,
1336 mirrorQueueLength);*/
1337 }
1338
1339 /*
1340 * Do simple partitioning. This assumes that
1341 * the data and parity disks are laid out identically.
1342 */
1343 void
rf_SelectMirrorDiskPartition(RF_DagNode_t * node)1344 rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
1345 {
1346 RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1347 RF_RowCol_t rowData, colData, rowMirror, colMirror;
1348 RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
1349 RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
1350 RF_PhysDiskAddr_t *tmp_pda;
1351 RF_RaidDisk_t **disks = raidPtr->Disks;
1352 RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1353 int usemirror;
1354
1355 /* Return the [row col] of the disk with the shortest queue. */
1356 rowData = data_pda->row;
1357 colData = data_pda->col;
1358 rowMirror = mirror_pda->row;
1359 colMirror = mirror_pda->col;
1360 dataQueue = &(dqs[rowData][colData]);
1361 mirrorQueue = &(dqs[rowMirror][colMirror]);
1362
1363 usemirror = 0;
1364 if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1365 usemirror = 0;
1366 } else
1367 if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1368 usemirror = 1;
1369 } else
1370 if (raidPtr->parity_good == RF_RAID_DIRTY) {
1371 /* Trust only the main disk. */
1372 usemirror = 0;
1373 } else
1374 if (data_pda->startSector <
1375 (disks[rowData][colData].numBlocks / 2)) {
1376 usemirror = 0;
1377 } else {
1378 usemirror = 1;
1379 }
1380
1381 if (usemirror) {
1382 /* Use mirror (parity) disk, swap params 0 & 4. */
1383 tmp_pda = data_pda;
1384 node->params[0].p = mirror_pda;
1385 node->params[4].p = tmp_pda;
1386 } else {
1387 /* Use data disk, leave param 0 unchanged. */
1388 }
1389 }
1390