1 /*	$OpenBSD: rf_dagutils.c,v 1.4 2002/12/16 07:01:03 tdeval Exp $	*/
2 /*	$NetBSD: rf_dagutils.c,v 1.6 1999/12/09 02:26:09 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*****************************************************************************
32  *
33  * rf_dagutils.c -- Utility routines for manipulating dags.
34  *
35  *****************************************************************************/
36 
37 #include "rf_archs.h"
38 #include "rf_types.h"
39 #include "rf_threadstuff.h"
40 #include "rf_raid.h"
41 #include "rf_dag.h"
42 #include "rf_dagutils.h"
43 #include "rf_dagfuncs.h"
44 #include "rf_general.h"
45 #include "rf_freelist.h"
46 #include "rf_map.h"
47 #include "rf_shutdown.h"
48 
49 #define	SNUM_DIFF(_a_,_b_)	(((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
50 
51 RF_RedFuncs_t rf_xorFuncs = {
52 	rf_RegularXorFunc, "Reg Xr", rf_SimpleXorFunc, "Simple Xr"
53 };
54 
55 RF_RedFuncs_t rf_xorRecoveryFuncs = {
56 	rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"
57 };
58 
59 void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
60 void rf_PrintDAG(RF_DagHeader_t *);
61 int  rf_ValidateBranch(RF_DagNode_t *, int *, int *, RF_DagNode_t **, int);
62 void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
63 void rf_ValidateVisitedBits(RF_DagHeader_t *);
64 
65 /*****************************************************************************
66  *
67  * InitNode - Initialize a dag node.
68  *
69  * The size of the propList array is always the same as that of the
70  * successors array.
71  *
72  *****************************************************************************/
73 void
rf_InitNode(RF_DagNode_t * node,RF_NodeStatus_t initstatus,int commit,int (* doFunc)(RF_DagNode_t *),int (* undoFunc)(RF_DagNode_t * node),int (* wakeFunc)(RF_DagNode_t * node,int),int nSucc,int nAnte,int nParam,int nResult,RF_DagHeader_t * hdr,char * name,RF_AllocListElem_t * alist)74 rf_InitNode(
75 	RF_DagNode_t	 *node,
76 	RF_NodeStatus_t	  initstatus,
77 	int		  commit,
78 	int		(*doFunc) (RF_DagNode_t *),
79 	int		(*undoFunc) (RF_DagNode_t *node),
80 	int		(*wakeFunc) (RF_DagNode_t *node, int),
81 	int		  nSucc,
82 	int		  nAnte,
83 	int		  nParam,
84 	int		  nResult,
85 	RF_DagHeader_t	 *hdr,
86 	char		 *name,
87 	RF_AllocListElem_t *alist
88 )
89 {
90 	void **ptrs;
91 	int nptrs;
92 
93 	if (nAnte > RF_MAX_ANTECEDENTS)
94 		RF_PANIC();
95 	node->status = initstatus;
96 	node->commitNode = commit;
97 	node->doFunc = doFunc;
98 	node->undoFunc = undoFunc;
99 	node->wakeFunc = wakeFunc;
100 	node->numParams = nParam;
101 	node->numResults = nResult;
102 	node->numAntecedents = nAnte;
103 	node->numAntDone = 0;
104 	node->next = NULL;
105 	node->numSuccedents = nSucc;
106 	node->name = name;
107 	node->dagHdr = hdr;
108 	node->visited = 0;
109 
110 	/* Allocate all the pointers with one call to malloc. */
111 	nptrs = nSucc + nAnte + nResult + nSucc;
112 
113 	if (nptrs <= RF_DAG_PTRCACHESIZE) {
114 		/*
115 	         * The dag_ptrs field of the node is basically some scribble
116 	         * space to be used here. We could get rid of it, and always
117 	         * allocate the range of pointers, but that's expensive. So,
118 	         * we pick a "common case" size for the pointer cache.
119 		 * Hopefully, we'll find that:
120 	         * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
121 	         *     only a little bit (least efficient case).
122 	         * (2) Generally, ntprs isn't a lot less than
123 		 *     RF_DAG_PTRCACHESIZE (wasted memory).
124 	         */
125 		ptrs = (void **) node->dag_ptrs;
126 	} else {
127 		RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
128 	}
129 	node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
130 	node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs + nSucc) : NULL;
131 	node->results = (nResult) ? (void **) (ptrs + nSucc + nAnte) : NULL;
132 	node->propList = (nSucc) ? (RF_PropHeader_t **)
133 	    (ptrs + nSucc + nAnte + nResult) : NULL;
134 
135 	if (nParam) {
136 		if (nParam <= RF_DAG_PARAMCACHESIZE) {
137 			node->params = (RF_DagParam_t *) node->dag_params;
138 		} else {
139 			RF_CallocAndAdd(node->params, nParam,
140 			    sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
141 		}
142 	} else {
143 		node->params = NULL;
144 	}
145 }
146 
147 
148 
149 /*****************************************************************************
150  *
151  * Allocation and deallocation routines.
152  *
153  *****************************************************************************/
154 
155 void
rf_FreeDAG(RF_DagHeader_t * dag_h)156 rf_FreeDAG(RF_DagHeader_t *dag_h)
157 {
158 	RF_AccessStripeMapHeader_t *asmap, *t_asmap;
159 	RF_DagHeader_t *nextDag;
160 	int i;
161 
162 	while (dag_h) {
163 		nextDag = dag_h->next;
164 		for (i = 0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) {
165 			/* Release mem chunks. */
166 			rf_ReleaseMemChunk(dag_h->memChunk[i]);
167 			dag_h->memChunk[i] = NULL;
168 		}
169 
170 		RF_ASSERT(i == dag_h->chunkIndex);
171 		if (dag_h->xtraChunkCnt > 0) {
172 			/* Free xtraMemChunks. */
173 			for (i = 0; dag_h->xtraMemChunk[i] &&
174 			     i < dag_h->xtraChunkIndex; i++) {
175 				rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
176 				dag_h->xtraMemChunk[i] = NULL;
177 			}
178 			RF_ASSERT(i == dag_h->xtraChunkIndex);
179 			/* Free ptrs to xtraMemChunks. */
180 			RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt *
181 			    sizeof(RF_ChunkDesc_t *));
182 		}
183 		rf_FreeAllocList(dag_h->allocList);
184 		for (asmap = dag_h->asmList; asmap;) {
185 			t_asmap = asmap;
186 			asmap = asmap->next;
187 			rf_FreeAccessStripeMap(t_asmap);
188 		}
189 		rf_FreeDAGHeader(dag_h);
190 		dag_h = nextDag;
191 	}
192 }
193 
194 RF_PropHeader_t *
rf_MakePropListEntry(RF_DagHeader_t * dag_h,int resultNum,int paramNum,RF_PropHeader_t * next,RF_AllocListElem_t * allocList)195 rf_MakePropListEntry(RF_DagHeader_t *dag_h, int resultNum, int paramNum,
196     RF_PropHeader_t *next, RF_AllocListElem_t *allocList)
197 {
198 	RF_PropHeader_t *p;
199 
200 	RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t), (RF_PropHeader_t *),
201 	    allocList);
202 	p->resultNum = resultNum;
203 	p->paramNum = paramNum;
204 	p->next = next;
205 	return (p);
206 }
207 
208 static RF_FreeList_t *rf_dagh_freelist;
209 
210 #define	RF_MAX_FREE_DAGH	128
211 #define	RF_DAGH_INC		 16
212 #define	RF_DAGH_INITIAL		 32
213 
214 void rf_ShutdownDAGs(void *);
215 void
rf_ShutdownDAGs(void * ignored)216 rf_ShutdownDAGs(void *ignored)
217 {
218 	RF_FREELIST_DESTROY(rf_dagh_freelist, next, (RF_DagHeader_t *));
219 }
220 
221 int
rf_ConfigureDAGs(RF_ShutdownList_t ** listp)222 rf_ConfigureDAGs(RF_ShutdownList_t **listp)
223 {
224 	int rc;
225 
226 	RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH, RF_DAGH_INC,
227 	    sizeof(RF_DagHeader_t));
228 	if (rf_dagh_freelist == NULL)
229 		return (ENOMEM);
230 	rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
231 	if (rc) {
232 		RF_ERRORMSG3("Unable to add to shutdown list file %s line"
233 		    " %d rc=%d\n", __FILE__, __LINE__, rc);
234 		rf_ShutdownDAGs(NULL);
235 		return (rc);
236 	}
237 	RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL, next,
238 	    (RF_DagHeader_t *));
239 	return (0);
240 }
241 
242 RF_DagHeader_t *
rf_AllocDAGHeader(void)243 rf_AllocDAGHeader(void)
244 {
245 	RF_DagHeader_t *dh;
246 
247 	RF_FREELIST_GET(rf_dagh_freelist, dh, next, (RF_DagHeader_t *));
248 	if (dh) {
249 		bzero((char *) dh, sizeof(RF_DagHeader_t));
250 	}
251 	return (dh);
252 }
253 
254 void
rf_FreeDAGHeader(RF_DagHeader_t * dh)255 rf_FreeDAGHeader(RF_DagHeader_t *dh)
256 {
257 	RF_FREELIST_FREE(rf_dagh_freelist, dh, next);
258 }
259 
260 /* Allocate a buffer big enough to hold the data described by pda. */
261 void *
rf_AllocBuffer(RF_Raid_t * raidPtr,RF_DagHeader_t * dag_h,RF_PhysDiskAddr_t * pda,RF_AllocListElem_t * allocList)262 rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h,
263     RF_PhysDiskAddr_t *pda, RF_AllocListElem_t *allocList)
264 {
265 	char *p;
266 
267 	RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
268 	    (char *), allocList);
269 	return ((void *) p);
270 }
271 
272 
273 /*****************************************************************************
274  *
275  * Debug routines.
276  *
277  *****************************************************************************/
278 
279 char *
rf_NodeStatusString(RF_DagNode_t * node)280 rf_NodeStatusString(RF_DagNode_t *node)
281 {
282 	switch (node->status) {
283 	case rf_wait:
284 		return ("wait");
285 	case rf_fired:
286 		return ("fired");
287 	case rf_good:
288 		return ("good");
289 	case rf_bad:
290 		return ("bad");
291 	default:
292 		return ("?");
293 	}
294 }
295 
296 void
rf_PrintNodeInfoString(RF_DagNode_t * node)297 rf_PrintNodeInfoString(RF_DagNode_t *node)
298 {
299 	RF_PhysDiskAddr_t *pda;
300 	int (*df) (RF_DagNode_t *) = node->doFunc;
301 	int i, lk, unlk;
302 	void *bufPtr;
303 
304 	if ((df == rf_DiskReadFunc) || (df == rf_DiskWriteFunc) ||
305 	    (df == rf_DiskReadMirrorIdleFunc) ||
306 	    (df == rf_DiskReadMirrorPartitionFunc)) {
307 		pda = (RF_PhysDiskAddr_t *) node->params[0].p;
308 		bufPtr = (void *) node->params[1].p;
309 		lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
310 		unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
311 		RF_ASSERT(!(lk && unlk));
312 		printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row,
313 		    pda->col, (long) pda->startSector, (int) pda->numSector,
314 		    (long) bufPtr, (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
315 		return;
316 	}
317 	if (df == rf_DiskUnlockFunc) {
318 		pda = (RF_PhysDiskAddr_t *) node->params[0].p;
319 		lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
320 		unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
321 		RF_ASSERT(!(lk && unlk));
322 		printf("r %d c %d %s\n", pda->row, pda->col,
323 		    (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
324 		return;
325 	}
326 	if ((df == rf_SimpleXorFunc) || (df == rf_RegularXorFunc)
327 	    || (df == rf_RecoveryXorFunc)) {
328 		printf("result buf 0x%lx\n", (long) node->results[0]);
329 		for (i = 0; i < node->numParams - 1; i += 2) {
330 			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
331 			bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
332 			printf("    buf 0x%lx r%d c%d offs %ld nsect %d\n",
333 			    (long) bufPtr, pda->row, pda->col,
334 			    (long) pda->startSector, (int) pda->numSector);
335 		}
336 		return;
337 	}
338 #if	RF_INCLUDE_PARITYLOGGING > 0
339 	if (df == rf_ParityLogOverwriteFunc || df == rf_ParityLogUpdateFunc) {
340 		for (i = 0; i < node->numParams - 1; i += 2) {
341 			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
342 			bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
343 			printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
344 			    pda->row, pda->col, (long) pda->startSector,
345 			    (int) pda->numSector, (long) bufPtr);
346 		}
347 		return;
348 	}
349 #endif	/* RF_INCLUDE_PARITYLOGGING > 0 */
350 
351 	if ((df == rf_TerminateFunc) || (df == rf_NullNodeFunc)) {
352 		printf("\n");
353 		return;
354 	}
355 	printf("?\n");
356 }
357 
358 void
rf_RecurPrintDAG(RF_DagNode_t * node,int depth,int unvisited)359 rf_RecurPrintDAG(RF_DagNode_t *node, int depth, int unvisited)
360 {
361 	char *anttype;
362 	int i;
363 
364 	node->visited = (unvisited) ? 0 : 1;
365 	printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
366 	    node->nodeNum, node->commitNode, node->name,
367 	    rf_NodeStatusString(node), node->numSuccedents,
368 	    node->numSuccFired, node->numSuccDone,
369 	    node->numAntecedents, node->numAntDone,
370 	    node->numParams, node->numResults);
371 	for (i = 0; i < node->numSuccedents; i++) {
372 		printf("%d%s", node->succedents[i]->nodeNum,
373 		    ((i == node->numSuccedents - 1) ? "\0" : " "));
374 	}
375 	printf("} A{");
376 	for (i = 0; i < node->numAntecedents; i++) {
377 		switch (node->antType[i]) {
378 		case rf_trueData:
379 			anttype = "T";
380 			break;
381 		case rf_antiData:
382 			anttype = "A";
383 			break;
384 		case rf_outputData:
385 			anttype = "O";
386 			break;
387 		case rf_control:
388 			anttype = "C";
389 			break;
390 		default:
391 			anttype = "?";
392 			break;
393 		}
394 		printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype,
395 		    (i == node->numAntecedents - 1) ? "\0" : " ");
396 	}
397 	printf("}; ");
398 	rf_PrintNodeInfoString(node);
399 	for (i = 0; i < node->numSuccedents; i++) {
400 		if (node->succedents[i]->visited == unvisited)
401 			rf_RecurPrintDAG(node->succedents[i], depth + 1,
402 			    unvisited);
403 	}
404 }
405 
406 void
rf_PrintDAG(RF_DagHeader_t * dag_h)407 rf_PrintDAG(RF_DagHeader_t *dag_h)
408 {
409 	int unvisited, i;
410 	char *status;
411 
412 	/* Set dag status. */
413 	switch (dag_h->status) {
414 	case rf_enable:
415 		status = "enable";
416 		break;
417 	case rf_rollForward:
418 		status = "rollForward";
419 		break;
420 	case rf_rollBackward:
421 		status = "rollBackward";
422 		break;
423 	default:
424 		status = "illegal !";
425 		break;
426 	}
427 	/* Find out if visited bits are currently set or cleared. */
428 	unvisited = dag_h->succedents[0]->visited;
429 
430 	printf("DAG type:  %s\n", dag_h->creator);
431 	printf("format is (depth) num commit type: status,nSucc nSuccFired/n"
432 	    "SuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)};  info\n");
433 	printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
434 	    status, dag_h->numSuccedents, dag_h->numCommitNodes,
435 	    dag_h->numCommits);
436 	for (i = 0; i < dag_h->numSuccedents; i++) {
437 		printf("%d%s", dag_h->succedents[i]->nodeNum,
438 		    ((i == dag_h->numSuccedents - 1) ? "\0" : " "));
439 	}
440 	printf("};\n");
441 	for (i = 0; i < dag_h->numSuccedents; i++) {
442 		if (dag_h->succedents[i]->visited == unvisited)
443 			rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
444 	}
445 }
446 
447 /* Assign node numbers. */
448 int
rf_AssignNodeNums(RF_DagHeader_t * dag_h)449 rf_AssignNodeNums(RF_DagHeader_t *dag_h)
450 {
451 	int unvisited, i, nnum;
452 	RF_DagNode_t *node;
453 
454 	nnum = 0;
455 	unvisited = dag_h->succedents[0]->visited;
456 
457 	dag_h->nodeNum = nnum++;
458 	for (i = 0; i < dag_h->numSuccedents; i++) {
459 		node = dag_h->succedents[i];
460 		if (node->visited == unvisited) {
461 			nnum = rf_RecurAssignNodeNums(dag_h->succedents[i],
462 			    nnum, unvisited);
463 		}
464 	}
465 	return (nnum);
466 }
467 
468 int
rf_RecurAssignNodeNums(RF_DagNode_t * node,int num,int unvisited)469 rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited)
470 {
471 	int i;
472 
473 	node->visited = (unvisited) ? 0 : 1;
474 
475 	node->nodeNum = num++;
476 	for (i = 0; i < node->numSuccedents; i++) {
477 		if (node->succedents[i]->visited == unvisited) {
478 			num = rf_RecurAssignNodeNums(node->succedents[i],
479 			    num, unvisited);
480 		}
481 	}
482 	return (num);
483 }
484 
485 /* Set the header pointers in each node to "newptr". */
486 void
rf_ResetDAGHeaderPointers(RF_DagHeader_t * dag_h,RF_DagHeader_t * newptr)487 rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr)
488 {
489 	int i;
490 
491 	for (i = 0; i < dag_h->numSuccedents; i++)
492 		if (dag_h->succedents[i]->dagHdr != newptr)
493 			rf_RecurResetDAGHeaderPointers(dag_h->succedents[i],
494 			    newptr);
495 }
496 
497 void
rf_RecurResetDAGHeaderPointers(RF_DagNode_t * node,RF_DagHeader_t * newptr)498 rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr)
499 {
500 	int i;
501 
502 	node->dagHdr = newptr;
503 	for (i = 0; i < node->numSuccedents; i++)
504 		if (node->succedents[i]->dagHdr != newptr)
505 			rf_RecurResetDAGHeaderPointers(node->succedents[i],
506 			    newptr);
507 }
508 
509 void
rf_PrintDAGList(RF_DagHeader_t * dag_h)510 rf_PrintDAGList(RF_DagHeader_t *dag_h)
511 {
512 	int i = 0;
513 
514 	for (; dag_h; dag_h = dag_h->next) {
515 		rf_AssignNodeNums(dag_h);
516 		printf("\n\nDAG %d IN LIST:\n", i++);
517 		rf_PrintDAG(dag_h);
518 	}
519 }
520 
521 int
rf_ValidateBranch(RF_DagNode_t * node,int * scount,int * acount,RF_DagNode_t ** nodes,int unvisited)522 rf_ValidateBranch(RF_DagNode_t *node, int *scount, int *acount,
523     RF_DagNode_t **nodes, int unvisited)
524 {
525 	int i, retcode = 0;
526 
527 	/* Construct an array of node pointers indexed by node num. */
528 	node->visited = (unvisited) ? 0 : 1;
529 	nodes[node->nodeNum] = node;
530 
531 	if (node->next != NULL) {
532 		printf("INVALID DAG: next pointer in node is not NULL.\n");
533 		retcode = 1;
534 	}
535 	if (node->status != rf_wait) {
536 		printf("INVALID DAG: Node status is not wait.\n");
537 		retcode = 1;
538 	}
539 	if (node->numAntDone != 0) {
540 		printf("INVALID DAG: numAntDone is not zero.\n");
541 		retcode = 1;
542 	}
543 	if (node->doFunc == rf_TerminateFunc) {
544 		if (node->numSuccedents != 0) {
545 			printf("INVALID DAG: Terminator node has"
546 			    " succedents.\n");
547 			retcode = 1;
548 		}
549 	} else {
550 		if (node->numSuccedents == 0) {
551 			printf("INVALID DAG: Non-terminator node has no"
552 			    " succedents\n");
553 			retcode = 1;
554 		}
555 	}
556 	for (i = 0; i < node->numSuccedents; i++) {
557 		if (!node->succedents[i]) {
558 			printf("INVALID DAG: succedent %d of node %s"
559 			    " is NULL.\n", i, node->name);
560 			retcode = 1;
561 		}
562 		scount[node->succedents[i]->nodeNum]++;
563 	}
564 	for (i = 0; i < node->numAntecedents; i++) {
565 		if (!node->antecedents[i]) {
566 			printf("INVALID DAG: antecedent %d of node %s is"
567 			    " NULL.\n", i, node->name);
568 			retcode = 1;
569 		}
570 		acount[node->antecedents[i]->nodeNum]++;
571 	}
572 	for (i = 0; i < node->numSuccedents; i++) {
573 		if (node->succedents[i]->visited == unvisited) {
574 			if (rf_ValidateBranch(node->succedents[i], scount,
575 				acount, nodes, unvisited)) {
576 				retcode = 1;
577 			}
578 		}
579 	}
580 	return (retcode);
581 }
582 
583 void
rf_ValidateBranchVisitedBits(RF_DagNode_t * node,int unvisited,int rl)584 rf_ValidateBranchVisitedBits(RF_DagNode_t *node, int unvisited, int rl)
585 {
586 	int i;
587 
588 	RF_ASSERT(node->visited == unvisited);
589 	for (i = 0; i < node->numSuccedents; i++) {
590 		if (node->succedents[i] == NULL) {
591 			printf("node=%lx node->succedents[%d] is NULL.\n",
592 			    (long) node, i);
593 			RF_ASSERT(0);
594 		}
595 		rf_ValidateBranchVisitedBits(node->succedents[i],
596 		    unvisited, rl + 1);
597 	}
598 }
599 
600 /*
601  * NOTE:  Never call this on a big dag, because it is exponential
602  * in execution time.
603  */
604 void
rf_ValidateVisitedBits(RF_DagHeader_t * dag)605 rf_ValidateVisitedBits(RF_DagHeader_t *dag)
606 {
607 	int i, unvisited;
608 
609 	unvisited = dag->succedents[0]->visited;
610 
611 	for (i = 0; i < dag->numSuccedents; i++) {
612 		if (dag->succedents[i] == NULL) {
613 			printf("dag=%lx dag->succedents[%d] is NULL.\n",
614 			    (long) dag, i);
615 			RF_ASSERT(0);
616 		}
617 		rf_ValidateBranchVisitedBits(dag->succedents[i], unvisited, 0);
618 	}
619 }
620 
621 /*
622  * Validate a DAG. _at entry_ verify that:
623  *   -- numNodesCompleted is zero
624  *   -- node queue is null
625  *   -- dag status is rf_enable
626  *   -- next pointer is null on every node
627  *   -- all nodes have status wait
628  *   -- numAntDone is zero in all nodes
629  *   -- terminator node has zero successors
630  *   -- no other node besides terminator has zero successors
631  *   -- no successor or antecedent pointer in a node is NULL
632  *   -- number of times that each node appears as a successor of another node
633  *      is equal to the antecedent count on that node
634  *   -- number of times that each node appears as an antecedent of another node
635  *      is equal to the succedent count on that node
636  *   -- what else ?
637  */
638 int
rf_ValidateDAG(RF_DagHeader_t * dag_h)639 rf_ValidateDAG(RF_DagHeader_t *dag_h)
640 {
641 	int i, nodecount;
642 	int *scount, *acount;	/* Per-node successor and antecedent counts. */
643 	RF_DagNode_t **nodes;	/* Array of ptrs to nodes in dag. */
644 	int retcode = 0;
645 	int unvisited;
646 	int commitNodeCount = 0;
647 
648 	if (rf_validateVisitedDebug)
649 		rf_ValidateVisitedBits(dag_h);
650 
651 	if (dag_h->numNodesCompleted != 0) {
652 		printf("INVALID DAG: num nodes completed is %d, should be 0.\n",
653 		    dag_h->numNodesCompleted);
654 		retcode = 1;
655 		goto validate_dag_bad;
656 	}
657 	if (dag_h->status != rf_enable) {
658 		printf("INVALID DAG: not enabled.\n");
659 		retcode = 1;
660 		goto validate_dag_bad;
661 	}
662 	if (dag_h->numCommits != 0) {
663 		printf("INVALID DAG: numCommits != 0 (%d)\n",
664 		    dag_h->numCommits);
665 		retcode = 1;
666 		goto validate_dag_bad;
667 	}
668 	if (dag_h->numSuccedents != 1) {
669 		/* Currently, all dags must have only one succedent. */
670 		printf("INVALID DAG: numSuccedents != 1 (%d).\n",
671 		    dag_h->numSuccedents);
672 		retcode = 1;
673 		goto validate_dag_bad;
674 	}
675 	nodecount = rf_AssignNodeNums(dag_h);
676 
677 	unvisited = dag_h->succedents[0]->visited;
678 
679 	RF_Calloc(scount, nodecount, sizeof(int), (int *));
680 	RF_Calloc(acount, nodecount, sizeof(int), (int *));
681 	RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
682 	for (i = 0; i < dag_h->numSuccedents; i++) {
683 		if ((dag_h->succedents[i]->visited == unvisited)
684 		    && rf_ValidateBranch(dag_h->succedents[i], scount,
685 			acount, nodes, unvisited)) {
686 			retcode = 1;
687 		}
688 	}
689 	/* Start at 1 to skip the header node. */
690 	for (i = 1; i < nodecount; i++) {
691 		if (nodes[i]->commitNode)
692 			commitNodeCount++;
693 		if (nodes[i]->doFunc == NULL) {
694 			printf("INVALID DAG: node %s has an undefined"
695 			    " doFunc.\n", nodes[i]->name);
696 			retcode = 1;
697 			goto validate_dag_out;
698 		}
699 		if (nodes[i]->undoFunc == NULL) {
700 			printf("INVALID DAG: node %s has an undefined"
701 			    " doFunc.\n", nodes[i]->name);
702 			retcode = 1;
703 			goto validate_dag_out;
704 		}
705 		if (nodes[i]->numAntecedents != scount[nodes[i]->nodeNum]) {
706 			printf("INVALID DAG: node %s has %d antecedents but"
707 			    " appears as a succedent %d times.\n",
708 			    nodes[i]->name, nodes[i]->numAntecedents,
709 			    scount[nodes[i]->nodeNum]);
710 			retcode = 1;
711 			goto validate_dag_out;
712 		}
713 		if (nodes[i]->numSuccedents != acount[nodes[i]->nodeNum]) {
714 			printf("INVALID DAG: node %s has %d succedents but"
715 			    " appears as an antecedent %d times.\n",
716 			    nodes[i]->name, nodes[i]->numSuccedents,
717 			    acount[nodes[i]->nodeNum]);
718 			retcode = 1;
719 			goto validate_dag_out;
720 		}
721 	}
722 
723 	if (dag_h->numCommitNodes != commitNodeCount) {
724 		printf("INVALID DAG: incorrect commit node count. "
725 		    "hdr->numCommitNodes (%d) found (%d) commit nodes"
726 		    " in graph.\n",
727 		    dag_h->numCommitNodes, commitNodeCount);
728 		retcode = 1;
729 		goto validate_dag_out;
730 	}
731 
732 validate_dag_out:
733 	RF_Free(scount, nodecount * sizeof(int));
734 	RF_Free(acount, nodecount * sizeof(int));
735 	RF_Free(nodes, nodecount * sizeof(RF_DagNode_t *));
736 	if (retcode)
737 		rf_PrintDAGList(dag_h);
738 
739 	if (rf_validateVisitedDebug)
740 		rf_ValidateVisitedBits(dag_h);
741 
742 	return (retcode);
743 
744 validate_dag_bad:
745 	rf_PrintDAGList(dag_h);
746 	return (retcode);
747 }
748 
749 
750 /*****************************************************************************
751  *
752  * Misc construction routines.
753  *
754  *****************************************************************************/
755 
756 void
rf_redirect_asm(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap)757 rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
758 {
759 	int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
760 	int row = asmap->physInfo->row;
761 	int fcol = raidPtr->reconControl[row]->fcol;
762 	int srow = raidPtr->reconControl[row]->spareRow;
763 	int scol = raidPtr->reconControl[row]->spareCol;
764 	RF_PhysDiskAddr_t *pda;
765 
766 	RF_ASSERT(raidPtr->status[row] == rf_rs_reconstructing);
767 	for (pda = asmap->physInfo; pda; pda = pda->next) {
768 		if (pda->col == fcol) {
769 			if (rf_dagDebug) {
770 				if (!rf_CheckRUReconstructed(
771 				    raidPtr->reconControl[row]->reconMap,
772 				    pda->startSector)) {
773 					RF_PANIC();
774 				}
775 			}
776 			/*printf("Remapped data for large write\n");*/
777 			if (ds) {
778 				raidPtr->Layout.map->MapSector(raidPtr,
779 				    pda->raidAddress, &pda->row, &pda->col,
780 				    &pda->startSector, RF_REMAP);
781 			} else {
782 				pda->row = srow;
783 				pda->col = scol;
784 			}
785 		}
786 	}
787 	for (pda = asmap->parityInfo; pda; pda = pda->next) {
788 		if (pda->col == fcol) {
789 			if (rf_dagDebug) {
790 				if (!rf_CheckRUReconstructed(
791 				    raidPtr->reconControl[row]->reconMap,
792 				    pda->startSector)) {
793 					RF_PANIC();
794 				}
795 			}
796 		}
797 		if (ds) {
798 			(raidPtr->Layout.map->MapParity) (raidPtr,
799 			    pda->raidAddress, &pda->row, &pda->col,
800 			    &pda->startSector, RF_REMAP);
801 		} else {
802 			pda->row = srow;
803 			pda->col = scol;
804 		}
805 	}
806 }
807 
808 
809 /*
810  * This routine allocates read buffers and generates stripe maps for the
811  * regions of the array from the start of the stripe to the start of the
812  * access, and from the end of the access to the end of the stripe. It also
813  * computes and returns the number of DAG nodes needed to read all this data.
814  * Note that this routine does the wrong thing if the access is fully
815  * contained within one stripe unit, so we RF_ASSERT against this case at the
816  * start.
817  */
818 void
rf_MapUnaccessedPortionOfStripe(RF_Raid_t * raidPtr,RF_RaidLayout_t * layoutPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,RF_AccessStripeMapHeader_t ** new_asm_h,int * nRodNodes,char ** sosBuffer,char ** eosBuffer,RF_AllocListElem_t * allocList)819 rf_MapUnaccessedPortionOfStripe(
820 	RF_Raid_t		 *raidPtr,
821 	RF_RaidLayout_t		 *layoutPtr,	/* in: layout information */
822 	RF_AccessStripeMap_t	 *asmap,	/* in: access stripe map */
823 	RF_DagHeader_t		 *dag_h,	/* in: header of the dag */
824 						/*     to create */
825 	RF_AccessStripeMapHeader_t **new_asm_h,	/* in: ptr to array of 2 */
826 						/*     headers, to be */
827 						/*     filled in */
828 	int			 *nRodNodes,	/* out: num nodes to be */
829 						/*      generated to read */
830 						/*      unaccessed data */
831 	char			**sosBuffer,	/* out: pointers to newly */
832 						/*      allocated buffer */
833 	char			**eosBuffer,
834 	RF_AllocListElem_t	 *allocList
835 )
836 {
837 	RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
838 	RF_SectorNum_t sosNumSector, eosNumSector;
839 
840 	RF_ASSERT(asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol / 2));
841 	/*
842 	 * Generate an access map for the region of the array from start of
843 	 * stripe to start of access.
844 	 */
845 	new_asm_h[0] = new_asm_h[1] = NULL;
846 	*nRodNodes = 0;
847 	if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
848 		sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
849 		    asmap->raidAddress);
850 		sosNumSector = asmap->raidAddress - sosRaidAddress;
851 		RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr,
852 		    sosNumSector), (char *), allocList);
853 		new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress,
854 		    sosNumSector, *sosBuffer, RF_DONT_REMAP);
855 		new_asm_h[0]->next = dag_h->asmList;
856 		dag_h->asmList = new_asm_h[0];
857 		*nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
858 
859 		RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
860 		/* We're totally within one stripe here. */
861 		if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
862 			rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
863 	}
864 	/*
865 	 * Generate an access map for the region of the array from end of
866 	 * access to end of stripe.
867 	 */
868 	if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
869 		eosRaidAddress = asmap->endRaidAddress;
870 		eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
871 		    eosRaidAddress) - eosRaidAddress;
872 		RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr,
873 		    eosNumSector), (char *), allocList);
874 		new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress,
875 		    eosNumSector, *eosBuffer, RF_DONT_REMAP);
876 		new_asm_h[1]->next = dag_h->asmList;
877 		dag_h->asmList = new_asm_h[1];
878 		*nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
879 
880 		RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
881 		/* We're totally within one stripe here. */
882 		if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
883 			rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
884 	}
885 }
886 
887 
888 /* Returns non-zero if the indicated ranges of stripe unit offsets overlap. */
889 int
rf_PDAOverlap(RF_RaidLayout_t * layoutPtr,RF_PhysDiskAddr_t * src,RF_PhysDiskAddr_t * dest)890 rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, RF_PhysDiskAddr_t *src,
891     RF_PhysDiskAddr_t *dest)
892 {
893 	RF_SectorNum_t soffs =
894 	    rf_StripeUnitOffset(layoutPtr, src->startSector);
895 	RF_SectorNum_t doffs =
896 	    rf_StripeUnitOffset(layoutPtr, dest->startSector);
897 	/* Use -1 to be sure we stay within SU. */
898 	RF_SectorNum_t send =
899 	    rf_StripeUnitOffset(layoutPtr, src->startSector +
900 	    src->numSector - 1);
901 	RF_SectorNum_t dend =
902 	    rf_StripeUnitOffset(layoutPtr, dest->startSector +
903 	    dest->numSector - 1);
904 
905 	return ((RF_MAX(soffs, doffs) <= RF_MIN(send, dend)) ? 1 : 0);
906 }
907 
908 
909 /*
910  * GenerateFailedAccessASMs
911  *
912  * This routine figures out what portion of the stripe needs to be read
913  * to effect the degraded read or write operation. It's primary function
914  * is to identify everything required to recover the data, and then
915  * eliminate anything that is already being accessed by the user.
916  *
917  * The main result is two new ASMs, one for the region from the start of the
918  * stripe to the start of the access, and one for the region from the end of
919  * the access to the end of the stripe. These ASMs describe everything that
920  * needs to be read to effect the degraded access. Other results are:
921  *    nXorBufs -- The total number of buffers that need to be XORed together
922  *		  to recover the lost data,
923  *    rpBufPtr -- Ptr to a newly-allocated buffer to hold the parity. If NULL
924  *                at entry, not allocated.
925  *    overlappingPDAs --
926  *                Describes which of the non-failed PDAs, in the user access,
927  *                overlap data that needs to be read to effect recovery.
928  *                overlappingPDAs[i]==1 if and only if, neglecting the failed
929  *                PDA, the i'th pda in the input asm overlaps data that needs
930  *                to be read for recovery.
931  */
932  /* in: asmap - ASM for the actual access, one stripe only. */
933  /* in: faildPDA - Which component of the access has failed. */
934  /* in: dag_h - Header of the DAG we're going to create. */
935  /* out: new_asm_h - The two new ASMs. */
936  /* out: nXorBufs - The total number of xor bufs required. */
937  /* out: rpBufPtr - A buffer for the parity read. */
938 void
rf_GenerateFailedAccessASMs(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_PhysDiskAddr_t * failedPDA,RF_DagHeader_t * dag_h,RF_AccessStripeMapHeader_t ** new_asm_h,int * nXorBufs,char ** rpBufPtr,char * overlappingPDAs,RF_AllocListElem_t * allocList)939 rf_GenerateFailedAccessASMs(
940 	RF_Raid_t		 *raidPtr,
941 	RF_AccessStripeMap_t	 *asmap,
942 	RF_PhysDiskAddr_t	 *failedPDA,
943 	RF_DagHeader_t		 *dag_h,
944 	RF_AccessStripeMapHeader_t **new_asm_h,
945 	int			 *nXorBufs,
946 	char			**rpBufPtr,
947 	char			 *overlappingPDAs,
948 	RF_AllocListElem_t	 *allocList
949 )
950 {
951 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
952 
953 	/* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
954 	RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
955 
956 	RF_SectorCount_t numSect[2], numParitySect;
957 	RF_PhysDiskAddr_t *pda;
958 	char *rdBuf, *bufP;
959 	int foundit, i;
960 
961 	bufP = NULL;
962 	foundit = 0;
963 	/*
964 	 * First compute the following raid addresses:
965 	 * - Start of stripe
966 	 * - (sosAddr) MIN(start of access, start of failed SU)
967 	 * - (sosEndAddr) MAX(end of access, end of failed SU)
968 	 * - (eosStartAddr) end of stripe (i.e. start of next stripe)
969 	 *   (eosAddr)
970 	 */
971 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
972 	    asmap->raidAddress);
973 	sosEndAddr = RF_MIN(asmap->raidAddress,
974 	    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
975 	    failedPDA->raidAddress));
976 	eosStartAddr = RF_MAX(asmap->endRaidAddress,
977 	    rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
978 	    failedPDA->raidAddress));
979 	eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
980 	    asmap->raidAddress);
981 
982 	/*
983 	 * Now generate access stripe maps for each of the above regions of
984 	 * the stripe. Use a dummy (NULL) buf ptr for now.
985 	 */
986 
987 	new_asm_h[0] = (sosAddr != sosEndAddr) ?
988 	    rf_MapAccess(raidPtr, sosAddr, sosEndAddr - sosAddr, NULL,
989 	    RF_DONT_REMAP) : NULL;
990 	new_asm_h[1] = (eosStartAddr != eosAddr) ?
991 	    rf_MapAccess(raidPtr, eosStartAddr, eosAddr - eosStartAddr, NULL,
992 	    RF_DONT_REMAP) : NULL;
993 
994 	/*
995 	 * Walk through the PDAs and range-restrict each SU to the region of
996 	 * the SU touched on the failed PDA. Also compute total data buffer
997 	 * space requirements in this step. Ignore the parity for now.
998 	 */
999 
1000 	numSect[0] = numSect[1] = 0;
1001 	if (new_asm_h[0]) {
1002 		new_asm_h[0]->next = dag_h->asmList;
1003 		dag_h->asmList = new_asm_h[0];
1004 		for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
1005 		     pda = pda->next) {
1006 			rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
1007 			    RF_RESTRICT_NOBUFFER, 0);
1008 			numSect[0] += pda->numSector;
1009 		}
1010 	}
1011 	if (new_asm_h[1]) {
1012 		new_asm_h[1]->next = dag_h->asmList;
1013 		dag_h->asmList = new_asm_h[1];
1014 		for (pda = new_asm_h[1]->stripeMap->physInfo;
1015 		     pda; pda = pda->next) {
1016 			rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
1017 			    RF_RESTRICT_NOBUFFER, 0);
1018 			numSect[1] += pda->numSector;
1019 		}
1020 	}
1021 	numParitySect = failedPDA->numSector;
1022 
1023 	/*
1024 	 * Allocate buffer space for the data & parity we have to read to
1025 	 * recover from the failure.
1026 	 */
1027 
1028 	if (numSect[0] + numSect[1] + ((rpBufPtr) ? numParitySect : 0)) {
1029 		/* Don't allocate parity buf if not needed. */
1030 		RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,
1031 		    numSect[0] + numSect[1] + numParitySect), (char *),
1032 		    allocList);
1033 		bufP = rdBuf;
1034 		if (rf_degDagDebug)
1035 			printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
1036 			    (int) rf_RaidAddressToByte(raidPtr,
1037 			    numSect[0] + numSect[1] + numParitySect),
1038 			    (unsigned long) bufP);
1039 	}
1040 	/*
1041 	 * Now walk through the pdas one last time and assign buffer pointers
1042 	 * (ugh!). Again, ignore the parity. Also, count nodes to find out
1043 	 * how many bufs need to be xored together.
1044 	 */
1045 	(*nXorBufs) = 1;	/* In read case, 1 is for parity. */
1046 				/* In write case, 1 is for failed data. */
1047 	if (new_asm_h[0]) {
1048 		for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
1049 		     pda = pda->next) {
1050 			pda->bufPtr = bufP;
1051 			bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
1052 		}
1053 		*nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
1054 	}
1055 	if (new_asm_h[1]) {
1056 		for (pda = new_asm_h[1]->stripeMap->physInfo; pda;
1057 		     pda = pda->next) {
1058 			pda->bufPtr = bufP;
1059 			bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
1060 		}
1061 		(*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
1062 	}
1063 	if (rpBufPtr)
1064 		/* The rest of the buffer is for parity. */
1065 		*rpBufPtr = bufP;
1066 
1067 	/*
1068 	 * The last step is to figure out how many more distinct buffers need
1069 	 * to get xor'd to produce the missing unit. there's one for each
1070 	 * user-data read node that overlaps the portion of the failed unit
1071 	 * being accessed.
1072 	 */
1073 
1074 	for (foundit = i = 0, pda = asmap->physInfo;
1075 	     pda; i++, pda = pda->next) {
1076 		if (pda == failedPDA) {
1077 			i--;
1078 			foundit = 1;
1079 			continue;
1080 		}
1081 		if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
1082 			overlappingPDAs[i] = 1;
1083 			(*nXorBufs)++;
1084 		}
1085 	}
1086 	if (!foundit) {
1087 		RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA"
1088 		    " in asm list.\n");
1089 		RF_ASSERT(0);
1090 	}
1091 	if (rf_degDagDebug) {
1092 		if (new_asm_h[0]) {
1093 			printf("First asm:\n");
1094 			rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
1095 		}
1096 		if (new_asm_h[1]) {
1097 			printf("Second asm:\n");
1098 			rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
1099 		}
1100 	}
1101 }
1102 
1103 
1104 /*
1105  * Adjust the offset and number of sectors in the destination pda so that
1106  * it covers at most the region of the SU covered by the source PDA. This
1107  * is exclusively a restriction:  the number of sectors indicated by the
1108  * target PDA can only shrink.
1109  *
1110  * For example:  s = sectors within SU indicated by source PDA
1111  *               d = sectors within SU indicated by dest PDA
1112  *               r = results, stored in dest PDA
1113  *
1114  * |--------------- one stripe unit ---------------------|
1115  * |           sssssssssssssssssssssssssssssssss         |
1116  * |    ddddddddddddddddddddddddddddddddddddddddddddd    |
1117  * |           rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr         |
1118  *
1119  * Another example:
1120  *
1121  * |--------------- one stripe unit ---------------------|
1122  * |           sssssssssssssssssssssssssssssssss         |
1123  * |    ddddddddddddddddddddddd                          |
1124  * |           rrrrrrrrrrrrrrrr                          |
1125  *
1126  */
1127 void
rf_RangeRestrictPDA(RF_Raid_t * raidPtr,RF_PhysDiskAddr_t * src,RF_PhysDiskAddr_t * dest,int dobuffer,int doraidaddr)1128 rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src,
1129     RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr)
1130 {
1131 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1132 	RF_SectorNum_t soffs =
1133 	    rf_StripeUnitOffset(layoutPtr, src->startSector);
1134 	RF_SectorNum_t doffs =
1135 	    rf_StripeUnitOffset(layoutPtr, dest->startSector);
1136 	RF_SectorNum_t send =
1137 	    rf_StripeUnitOffset(layoutPtr, src->startSector +
1138 	    src->numSector - 1); /* Use -1 to be sure we stay within SU. */
1139 	RF_SectorNum_t dend =
1140 	    rf_StripeUnitOffset(layoutPtr, dest->startSector +
1141 	    dest->numSector - 1);
1142 	RF_SectorNum_t subAddr =
1143 	    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
1144 	    dest->startSector); /* Stripe unit boundary. */
1145 
1146 	dest->startSector = subAddr + RF_MAX(soffs, doffs);
1147 	dest->numSector = subAddr + RF_MIN(send, dend) + 1 - dest->startSector;
1148 
1149 	if (dobuffer)
1150 		dest->bufPtr += (soffs > doffs) ?
1151 		    rf_RaidAddressToByte(raidPtr, soffs - doffs) : 0;
1152 	if (doraidaddr) {
1153 		dest->raidAddress =
1154 		    rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
1155 		    dest->raidAddress) +
1156 		    rf_StripeUnitOffset(layoutPtr, dest->startSector);
1157 	}
1158 }
1159 
1160 /*
1161  * Want the highest of these primes to be the largest one
1162  * less than the max expected number of columns (won't hurt
1163  * to be too small or too large, but won't be optimal, either)
1164  * --jimz
1165  */
1166 #define	NLOWPRIMES	8
1167 static int lowprimes[NLOWPRIMES] = {2, 3, 5, 7, 11, 13, 17, 19};
1168 
1169 /*****************************************************************************
1170  * Compute the workload shift factor. (chained declustering)
1171  *
1172  * Return nonzero if access should shift to secondary, otherwise,
1173  * access is to primary.
1174  *****************************************************************************/
1175 int
rf_compute_workload_shift(RF_Raid_t * raidPtr,RF_PhysDiskAddr_t * pda)1176 rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda)
1177 {
1178 	/*
1179          * Variables:
1180          *  d   = Column of disk containing primary.
1181          *  f   = Column of failed disk.
1182          *  n   = Number of disks in array.
1183          *  sd  = "shift distance"
1184 	 *	  (number of columns that d is to the right of f).
1185          *  row = Row of array the access is in.
1186          *  v   = Numerator of redirection ratio.
1187          *  k   = Denominator of redirection ratio.
1188          */
1189 	RF_RowCol_t d, f, sd, row, n;
1190 	int k, v, ret, i;
1191 
1192 	row = pda->row;
1193 	n = raidPtr->numCol;
1194 
1195 	/* Assign column of primary copy to d. */
1196 	d = pda->col;
1197 
1198 	/* Assign column of dead disk to f. */
1199 	for (f = 0; ((!RF_DEAD_DISK(raidPtr->Disks[row][f].status)) &&
1200 	     (f < n)); f++);
1201 
1202 	RF_ASSERT(f < n);
1203 	RF_ASSERT(f != d);
1204 
1205 	sd = (f > d) ? (n + d - f) : (d - f);
1206 	RF_ASSERT(sd < n);
1207 
1208 	/*
1209          * v of every k accesses should be redirected.
1210          *
1211          * v/k := (n-1-sd)/(n-1)
1212          */
1213 	v = (n - 1 - sd);
1214 	k = (n - 1);
1215 
1216 #if 1
1217 	/*
1218          * XXX
1219          * Is this worth it ?
1220          *
1221          * Now reduce the fraction, by repeatedly factoring
1222          * out primes (just like they teach in elementary school !).
1223          */
1224 	for (i = 0; i < NLOWPRIMES; i++) {
1225 		if (lowprimes[i] > v)
1226 			break;
1227 		while (((v % lowprimes[i]) == 0) && ((k % lowprimes[i]) == 0)) {
1228 			v /= lowprimes[i];
1229 			k /= lowprimes[i];
1230 		}
1231 	}
1232 #endif
1233 
1234 	raidPtr->hist_diskreq[row][d]++;
1235 	if (raidPtr->hist_diskreq[row][d] > v) {
1236 		ret = 0;	/* Do not redirect. */
1237 	} else {
1238 		ret = 1;	/* Redirect. */
1239 	}
1240 
1241 #if 0
1242 	printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
1243 	    raidPtr->hist_diskreq[row][d]);
1244 #endif
1245 
1246 	if (raidPtr->hist_diskreq[row][d] >= k) {
1247 		/* Reset counter. */
1248 		raidPtr->hist_diskreq[row][d] = 0;
1249 	}
1250 	return (ret);
1251 }
1252 
1253 /*
1254  * Disk selection routines.
1255  */
1256 
1257 /*
1258  * Select the disk with the shortest queue from a mirror pair.
1259  * Both the disk I/Os queued in RAIDframe as well as those at the physical
1260  * disk are counted as members of the "queue".
1261  */
1262 void
rf_SelectMirrorDiskIdle(RF_DagNode_t * node)1263 rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
1264 {
1265 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1266 	RF_RowCol_t rowData, colData, rowMirror, colMirror;
1267 	int dataQueueLength, mirrorQueueLength, usemirror;
1268 	RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
1269 	RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
1270 	RF_PhysDiskAddr_t *tmp_pda;
1271 	RF_RaidDisk_t **disks = raidPtr->Disks;
1272 	RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1273 
1274 	/* Return the [row col] of the disk with the shortest queue. */
1275 	rowData = data_pda->row;
1276 	colData = data_pda->col;
1277 	rowMirror = mirror_pda->row;
1278 	colMirror = mirror_pda->col;
1279 	dataQueue = &(dqs[rowData][colData]);
1280 	mirrorQueue = &(dqs[rowMirror][colMirror]);
1281 
1282 #ifdef	RF_LOCK_QUEUES_TO_READ_LEN
1283 	RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1284 #endif	/* RF_LOCK_QUEUES_TO_READ_LEN */
1285 	dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
1286 #ifdef	RF_LOCK_QUEUES_TO_READ_LEN
1287 	RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1288 	RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1289 #endif	/* RF_LOCK_QUEUES_TO_READ_LEN */
1290 	mirrorQueueLength = mirrorQueue->queueLength +
1291 	    mirrorQueue->numOutstanding;
1292 #ifdef	RF_LOCK_QUEUES_TO_READ_LEN
1293 	RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1294 #endif	/* RF_LOCK_QUEUES_TO_READ_LEN */
1295 
1296 	usemirror = 0;
1297 	if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1298 		usemirror = 0;
1299 	} else
1300 		if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1301 			usemirror = 1;
1302 		} else
1303 			if (raidPtr->parity_good == RF_RAID_DIRTY) {
1304 				/* Trust only the main disk. */
1305 				usemirror = 0;
1306 			} else
1307 			if (dataQueueLength < mirrorQueueLength) {
1308 				usemirror = 0;
1309 			} else
1310 				if (mirrorQueueLength < dataQueueLength) {
1311 					usemirror = 1;
1312 				} else {
1313 					/* Queues are equal length. */
1314 					/* Attempt cleverness. */
1315 					if (SNUM_DIFF(dataQueue
1316 					    ->last_deq_sector, data_pda
1317 					    ->startSector) <=
1318 					    SNUM_DIFF(mirrorQueue
1319 					    ->last_deq_sector, mirror_pda
1320 					    ->startSector)) {
1321 						usemirror = 0;
1322 					} else {
1323 						usemirror = 1;
1324 					}
1325 				}
1326 
1327 	if (usemirror) {
1328 		/* Use mirror (parity) disk, swap params 0 & 4. */
1329 		tmp_pda = data_pda;
1330 		node->params[0].p = mirror_pda;
1331 		node->params[4].p = tmp_pda;
1332 	} else {
1333 		/* Use data disk, leave param 0 unchanged. */
1334 	}
1335 	/*printf("dataQueueLength %d, mirrorQueueLength %d\n", dataQueueLength,
1336 	    mirrorQueueLength);*/
1337 }
1338 
1339 /*
1340  * Do simple partitioning. This assumes that
1341  * the data and parity disks are laid out identically.
1342  */
1343 void
rf_SelectMirrorDiskPartition(RF_DagNode_t * node)1344 rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
1345 {
1346 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1347 	RF_RowCol_t rowData, colData, rowMirror, colMirror;
1348 	RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
1349 	RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
1350 	RF_PhysDiskAddr_t *tmp_pda;
1351 	RF_RaidDisk_t **disks = raidPtr->Disks;
1352 	RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1353 	int usemirror;
1354 
1355 	/* Return the [row col] of the disk with the shortest queue. */
1356 	rowData = data_pda->row;
1357 	colData = data_pda->col;
1358 	rowMirror = mirror_pda->row;
1359 	colMirror = mirror_pda->col;
1360 	dataQueue = &(dqs[rowData][colData]);
1361 	mirrorQueue = &(dqs[rowMirror][colMirror]);
1362 
1363 	usemirror = 0;
1364 	if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1365 		usemirror = 0;
1366 	} else
1367 		if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1368 			usemirror = 1;
1369 		} else
1370 			if (raidPtr->parity_good == RF_RAID_DIRTY) {
1371 				/* Trust only the main disk. */
1372 				usemirror = 0;
1373 		} else
1374 				if (data_pda->startSector <
1375 				    (disks[rowData][colData].numBlocks / 2)) {
1376 				usemirror = 0;
1377 			} else {
1378 				usemirror = 1;
1379 			}
1380 
1381 	if (usemirror) {
1382 		/* Use mirror (parity) disk, swap params 0 & 4. */
1383 		tmp_pda = data_pda;
1384 		node->params[0].p = mirror_pda;
1385 		node->params[4].p = tmp_pda;
1386 	} else {
1387 		/* Use data disk, leave param 0 unchanged. */
1388 	}
1389 }
1390