1 /*	$OpenBSD: rf_dagdegrd.c,v 1.5 2002/12/16 07:01:03 tdeval Exp $	*/
2 /*	$NetBSD: rf_dagdegrd.c,v 1.5 2000/01/07 03:40:57 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*
32  * rf_dagdegrd.c
33  *
34  * Code for creating degraded read DAGs.
35  */
36 
37 #include "rf_types.h"
38 #include "rf_raid.h"
39 #include "rf_dag.h"
40 #include "rf_dagutils.h"
41 #include "rf_dagfuncs.h"
42 #include "rf_debugMem.h"
43 #include "rf_memchunk.h"
44 #include "rf_general.h"
45 #include "rf_dagdegrd.h"
46 
47 
48 /*****************************************************************************
49  *
50  * General comments on DAG creation:
51  *
52  * All DAGs in this file use roll-away error recovery. Each DAG has a single
53  * commit node, usually called "Cmt". If an error occurs before the Cmt node
54  * is reached, the execution engine will halt forward execution and work
55  * backward through the graph, executing the undo functions. Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * does not make changes to permanent state, the graph will fail atomically.
58  * If an error occurs after the Cmt node executes, the engine will roll-forward
59  * through the graph, blindly executing nodes until it reaches the end.
60  * If a graph reaches the end, it is assumed to have completed successfully.
61  *
62  * A graph has only 1 Cmt node.
63  *
64  *****************************************************************************/
65 
66 
67 /*****************************************************************************
68  *
69  * The following wrappers map the standard DAG creation interface to the
70  * DAG creation routines. Additionally, these wrappers enable experimentation
71  * with new DAG structures by providing an extra level of indirection, allowing
72  * the DAG creation routines to be replaced at this single point.
73  *
74  *****************************************************************************/
75 
76 void
rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList)77 rf_CreateRaidFiveDegradedReadDAG(
78     RF_Raid_t			*raidPtr,
79     RF_AccessStripeMap_t	*asmap,
80     RF_DagHeader_t		*dag_h,
81     void			*bp,
82     RF_RaidAccessFlags_t	 flags,
83     RF_AllocListElem_t		*allocList)
84 {
85 	rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
86 	    &rf_xorRecoveryFuncs);
87 }
88 
89 
90 /*****************************************************************************
91  *
92  * DAG creation code begins here.
93  *
94  *****************************************************************************/
95 
96 
97 /*****************************************************************************
98  * Create a degraded read DAG for RAID level 1.
99  *
100  * Hdr -> Nil -> R(p/s)d -> Commit -> Trm
101  *
102  * The "Rd" node reads data from the surviving disk in the mirror pair.
103  *   Rpd - read of primary copy
104  *   Rsd - read of secondary copy
105  *
106  * Parameters:	raidPtr	  - description of the physical array
107  *		asmap	  - logical & physical addresses for this access
108  *		bp	  - buffer ptr (for holding write data)
109  *		flags	  - general flags (e.g. disk locking)
110  *		allocList - list of memory allocated in DAG creation
111  *****************************************************************************/
112 
113 void
rf_CreateRaidOneDegradedReadDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList)114 rf_CreateRaidOneDegradedReadDAG(
115     RF_Raid_t			*raidPtr,
116     RF_AccessStripeMap_t	*asmap,
117     RF_DagHeader_t		*dag_h,
118     void			*bp,
119     RF_RaidAccessFlags_t	 flags,
120     RF_AllocListElem_t		*allocList)
121 {
122 	RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
123 	RF_StripeNum_t parityStripeID;
124 	RF_ReconUnitNum_t which_ru;
125 	RF_PhysDiskAddr_t *pda;
126 	int useMirror, i;
127 
128 	useMirror = 0;
129 	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
130 	    asmap->raidAddress, &which_ru);
131 	if (rf_dagDebug) {
132 		printf("[Creating RAID level 1 degraded read DAG]\n");
133 	}
134 	dag_h->creator = "RaidOneDegradedReadDAG";
135 	/* Alloc the Wnd nodes and the Wmir node. */
136 	if (asmap->numDataFailed == 0)
137 		useMirror = RF_FALSE;
138 	else
139 		useMirror = RF_TRUE;
140 
141 	/* Total number of nodes = 1 + (block + commit + terminator). */
142 	RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *),
143 	    allocList);
144 	i = 0;
145 	rdNode = &nodes[i];
146 	i++;
147 	blockNode = &nodes[i];
148 	i++;
149 	commitNode = &nodes[i];
150 	i++;
151 	termNode = &nodes[i];
152 	i++;
153 
154 	/*
155 	 * This dag can not commit until the commit node is reached. Errors
156 	 * prior to the commit point imply the dag has failed and must be
157 	 * retried.
158 	 */
159 	dag_h->numCommitNodes = 1;
160 	dag_h->numCommits = 0;
161 	dag_h->numSuccedents = 1;
162 
163 	/* Initialize the block, commit, and terminator nodes. */
164 	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
165 	    rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
166 	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
167 	    rf_NullNodeUndoFunc, NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
168 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
169 	    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
170 
171 	pda = asmap->physInfo;
172 	RF_ASSERT(pda != NULL);
173 	/* parityInfo must describe entire parity unit. */
174 	RF_ASSERT(asmap->parityInfo->next == NULL);
175 
176 	/* Initialize the data node. */
177 	if (!useMirror) {
178 		/* Read primary copy of data. */
179 		rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
180 		    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
181 		    dag_h, "Rpd", allocList);
182 		rdNode->params[0].p = pda;
183 		rdNode->params[1].p = pda->bufPtr;
184 		rdNode->params[2].v = parityStripeID;
185 		rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
186 		    0, 0, which_ru);
187 	} else {
188 		/* Read secondary copy of data. */
189 		rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
190 		    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
191 		    dag_h, "Rsd", allocList);
192 		rdNode->params[0].p = asmap->parityInfo;
193 		rdNode->params[1].p = pda->bufPtr;
194 		rdNode->params[2].v = parityStripeID;
195 		rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
196 		    0, 0, which_ru);
197 	}
198 
199 	/* Connect header to block node. */
200 	RF_ASSERT(dag_h->numSuccedents == 1);
201 	RF_ASSERT(blockNode->numAntecedents == 0);
202 	dag_h->succedents[0] = blockNode;
203 
204 	/* Connect block node to rdnode. */
205 	RF_ASSERT(blockNode->numSuccedents == 1);
206 	RF_ASSERT(rdNode->numAntecedents == 1);
207 	blockNode->succedents[0] = rdNode;
208 	rdNode->antecedents[0] = blockNode;
209 	rdNode->antType[0] = rf_control;
210 
211 	/* Connect rdnode to commit node. */
212 	RF_ASSERT(rdNode->numSuccedents == 1);
213 	RF_ASSERT(commitNode->numAntecedents == 1);
214 	rdNode->succedents[0] = commitNode;
215 	commitNode->antecedents[0] = rdNode;
216 	commitNode->antType[0] = rf_control;
217 
218 	/* Connect commit node to terminator. */
219 	RF_ASSERT(commitNode->numSuccedents == 1);
220 	RF_ASSERT(termNode->numAntecedents == 1);
221 	RF_ASSERT(termNode->numSuccedents == 0);
222 	commitNode->succedents[0] = termNode;
223 	termNode->antecedents[0] = commitNode;
224 	termNode->antType[0] = rf_control;
225 }
226 
227 
228 /*****************************************************************************
229  *
230  * Create a DAG to perform a degraded-mode read of data within one stripe.
231  * This DAG is as follows:
232  *
233  * Hdr -> Block -> Rud -> Xor -> Cmt -> T
234  *		-> Rrd ->
235  *		-> Rp -->
236  *
 * Each R node is a successor of the Block node.
 * Every R node has a single successor arc, which goes to X.
239  * There is one Rud for each chunk of surviving user data requested by the
240  * user, and one Rrd for each chunk of surviving user data _not_ being read by
241  * the user.
242  * R = read, ud = user data, rd = recovery (surviving) data, p = parity
243  * X = XOR, C = Commit, T = terminate
244  *
245  * The block node guarantees a single source node.
246  *
247  * Note:  The target buffer for the XOR node is set to the actual user buffer
248  * where the failed data is supposed to end up. This buffer is zero'd by the
249  * code here. Thus, if you create a degraded read dag, use it, and then
250  * re-use, you have to be sure to zero the target buffer prior to the re-use.
251  *
252  * The recfunc argument at the end specifies the name and function used for
253  * the redundancy recovery function.
254  *
255  *****************************************************************************/
256 
257 void
rf_CreateDegradedReadDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,RF_RedFuncs_t * recFunc)258 rf_CreateDegradedReadDAG(
259     RF_Raid_t			*raidPtr,
260     RF_AccessStripeMap_t	*asmap,
261     RF_DagHeader_t		*dag_h,
262     void			*bp,
263     RF_RaidAccessFlags_t	 flags,
264     RF_AllocListElem_t		*allocList,
265     RF_RedFuncs_t		*recFunc)
266 {
267 	RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *xorNode, *blockNode;
268 	RF_DagNode_t *commitNode, *rpNode, *termNode;
269 	int nNodes, nRrdNodes, nRudNodes, nXorBufs, i;
270 	int j, paramNum;
271 	RF_SectorCount_t sectorsPerSU;
272 	RF_ReconUnitNum_t which_ru;
273 	char *overlappingPDAs;		/* A temporary array of flags. */
274 	RF_AccessStripeMapHeader_t *new_asm_h[2];
275 	RF_PhysDiskAddr_t *pda, *parityPDA;
276 	RF_StripeNum_t parityStripeID;
277 	RF_PhysDiskAddr_t *failedPDA;
278 	RF_RaidLayout_t *layoutPtr;
279 	char *rpBuf;
280 
281 	layoutPtr = &(raidPtr->Layout);
282 	/*
283 	 * failedPDA points to the pda within the asm that targets
284 	 * the failed disk.
285 	 */
286 	failedPDA = asmap->failedPDAs[0];
287 	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
288 	    asmap->raidAddress, &which_ru);
289 	sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
290 
291 	if (rf_dagDebug) {
292 		printf("[Creating degraded read DAG]\n");
293 	}
294 	RF_ASSERT(asmap->numDataFailed == 1);
295 	dag_h->creator = "DegradedReadDAG";
296 
297 	/*
298 	 * Generate two ASMs identifying the surviving data we need
299 	 * in order to recover the lost data.
300 	 */
301 
302 	/* overlappingPDAs array must be zero'd. */
303 	RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed,
304 	    sizeof(char), (char *));
305 	rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h,
306 	    new_asm_h, &nXorBufs, &rpBuf, overlappingPDAs, allocList);
307 
308 	/*
309 	 * Create all the nodes at once.
310 	 *
311 	 * -1 because no access is generated for the failed pda.
312 	 */
313 	nRudNodes = asmap->numStripeUnitsAccessed - 1;
314 	nRrdNodes = ((new_asm_h[0]) ?
315 	    new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
316 	    ((new_asm_h[1]) ?
317 	    new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
318 	nNodes = 5 + nRudNodes + nRrdNodes;	/*
319 						 * lock, unlock, xor, Rp,
320 						 * Rud, Rrd
321 						 */
322 	RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
323 	    allocList);
324 	i = 0;
325 	blockNode = &nodes[i];
326 	i++;
327 	commitNode = &nodes[i];
328 	i++;
329 	xorNode = &nodes[i];
330 	i++;
331 	rpNode = &nodes[i];
332 	i++;
333 	termNode = &nodes[i];
334 	i++;
335 	rudNodes = &nodes[i];
336 	i += nRudNodes;
337 	rrdNodes = &nodes[i];
338 	i += nRrdNodes;
339 	RF_ASSERT(i == nNodes);
340 
341 	/* Initialize nodes. */
342 	dag_h->numCommitNodes = 1;
343 	dag_h->numCommits = 0;
344 	/*
345 	 * This dag can not commit until the commit node is reached.
346 	 * Errors prior to the commit point imply the dag has failed.
347 	 */
348 	dag_h->numSuccedents = 1;
349 
350 	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
351 	    rf_NullNodeUndoFunc, NULL, nRudNodes + nRrdNodes + 1, 0, 0, 0,
352 	    dag_h, "Nil", allocList);
353 	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
354 	    rf_NullNodeUndoFunc, NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
355 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
356 	    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
357 	rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple,
358 	    rf_NullNodeUndoFunc, NULL, 1, nRudNodes + nRrdNodes + 1,
359 	    2 * nXorBufs + 2, 1, dag_h, recFunc->SimpleName, allocList);
360 
361 	/* Fill in the Rud nodes. */
362 	for (pda = asmap->physInfo, i = 0; i < nRudNodes;
363 	     i++, pda = pda->next) {
364 		if (pda == failedPDA) {
365 			i--;
366 			continue;
367 		}
368 		rf_InitNode(&rudNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
369 		    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
370 		    dag_h, "Rud", allocList);
371 		RF_ASSERT(pda);
372 		rudNodes[i].params[0].p = pda;
373 		rudNodes[i].params[1].p = pda->bufPtr;
374 		rudNodes[i].params[2].v = parityStripeID;
375 		rudNodes[i].params[3].v =
376 		    RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
377 	}
378 
379 	/* Fill in the Rrd nodes. */
380 	i = 0;
381 	if (new_asm_h[0]) {
382 		for (pda = new_asm_h[0]->stripeMap->physInfo;
383 		     i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
384 		     i++, pda = pda->next) {
385 			rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE,
386 			    rf_DiskReadFunc, rf_DiskReadUndoFunc,
387 			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
388 			    "Rrd", allocList);
389 			RF_ASSERT(pda);
390 			rrdNodes[i].params[0].p = pda;
391 			rrdNodes[i].params[1].p = pda->bufPtr;
392 			rrdNodes[i].params[2].v = parityStripeID;
393 			rrdNodes[i].params[3].v =
394 			    RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0,
395 			    which_ru);
396 		}
397 	}
398 	if (new_asm_h[1]) {
399 		for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
400 		    j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
401 		    j++, pda = pda->next) {
402 			rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE,
403 			    rf_DiskReadFunc, rf_DiskReadUndoFunc,
404 			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
405 			    "Rrd", allocList);
406 			RF_ASSERT(pda);
407 			rrdNodes[i + j].params[0].p = pda;
408 			rrdNodes[i + j].params[1].p = pda->bufPtr;
409 			rrdNodes[i + j].params[2].v = parityStripeID;
410 			rrdNodes[i + j].params[3].v =
411 			    RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0,
412 			    which_ru);
413 		}
414 	}
415 	/* Make a PDA for the parity unit. */
416 	RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
417 	    (RF_PhysDiskAddr_t *), allocList);
418 	parityPDA->row = asmap->parityInfo->row;
419 	parityPDA->col = asmap->parityInfo->col;
420 	parityPDA->startSector = ((asmap->parityInfo->startSector /
421 	    sectorsPerSU) * sectorsPerSU) +
422 	    (failedPDA->startSector % sectorsPerSU);
423 	parityPDA->numSector = failedPDA->numSector;
424 
425 	/* Initialize the Rp node. */
426 	rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
427 	    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
428 	    "Rp ", allocList);
429 	rpNode->params[0].p = parityPDA;
430 	rpNode->params[1].p = rpBuf;
431 	rpNode->params[2].v = parityStripeID;
432 	rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0,
433 	    which_ru);
434 
435 	/*
436 	 * The last and nastiest step is to assign all
437 	 * the parameters of the Xor node.
438 	 */
439 	paramNum = 0;
440 	for (i = 0; i < nRrdNodes; i++) {
441 		/* All the Rrd nodes need to be xored together. */
442 		xorNode->params[paramNum++] = rrdNodes[i].params[0];
443 		xorNode->params[paramNum++] = rrdNodes[i].params[1];
444 	}
445 	for (i = 0; i < nRudNodes; i++) {
446 		/* Any Rud nodes that overlap the failed access need to be
447 		 * xored in. */
448 		if (overlappingPDAs[i]) {
449 			RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t),
450 			    (RF_PhysDiskAddr_t *), allocList);
451 			bcopy((char *) rudNodes[i].params[0].p, (char *) pda,
452 			    sizeof(RF_PhysDiskAddr_t));
453 			rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
454 			    RF_RESTRICT_DOBUFFER, 0);
455 			xorNode->params[paramNum++].p = pda;
456 			xorNode->params[paramNum++].p = pda->bufPtr;
457 		}
458 	}
459 	RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
460 
461 	/* Install parity pda as last set of params to be xor'd. */
462 	xorNode->params[paramNum++].p = parityPDA;
463 	xorNode->params[paramNum++].p = rpBuf;
464 
465 	/*
466 	 * The last 2 params to the recovery xor node are
467 	 * the failed PDA and the raidPtr.
468 	 */
469 	xorNode->params[paramNum++].p = failedPDA;
470 	xorNode->params[paramNum++].p = raidPtr;
471 	RF_ASSERT(paramNum == 2 * nXorBufs + 2);
472 
473 	/*
474 	 * The xor node uses results[0] as the target buffer.
475 	 * Set pointer and zero the buffer. In the kernel, this
476 	 * may be a user buffer in which case we have to remap it.
477 	 */
478 	xorNode->results[0] = failedPDA->bufPtr;
479 	RF_BZERO(bp, failedPDA->bufPtr, rf_RaidAddressToByte(raidPtr,
480 	    failedPDA->numSector));
481 
482 	/* Connect nodes to form graph. */
483 	/* Connect the header to the block node. */
484 	RF_ASSERT(dag_h->numSuccedents == 1);
485 	RF_ASSERT(blockNode->numAntecedents == 0);
486 	dag_h->succedents[0] = blockNode;
487 
488 	/* Connect the block node to the read nodes. */
489 	RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes));
490 	RF_ASSERT(rpNode->numAntecedents == 1);
491 	blockNode->succedents[0] = rpNode;
492 	rpNode->antecedents[0] = blockNode;
493 	rpNode->antType[0] = rf_control;
494 	for (i = 0; i < nRrdNodes; i++) {
495 		RF_ASSERT(rrdNodes[i].numSuccedents == 1);
496 		blockNode->succedents[1 + i] = &rrdNodes[i];
497 		rrdNodes[i].antecedents[0] = blockNode;
498 		rrdNodes[i].antType[0] = rf_control;
499 	}
500 	for (i = 0; i < nRudNodes; i++) {
501 		RF_ASSERT(rudNodes[i].numSuccedents == 1);
502 		blockNode->succedents[1 + nRrdNodes + i] = &rudNodes[i];
503 		rudNodes[i].antecedents[0] = blockNode;
504 		rudNodes[i].antType[0] = rf_control;
505 	}
506 
507 	/* Connect the read nodes to the xor node. */
508 	RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes));
509 	RF_ASSERT(rpNode->numSuccedents == 1);
510 	rpNode->succedents[0] = xorNode;
511 	xorNode->antecedents[0] = rpNode;
512 	xorNode->antType[0] = rf_trueData;
513 	for (i = 0; i < nRrdNodes; i++) {
514 		RF_ASSERT(rrdNodes[i].numSuccedents == 1);
515 		rrdNodes[i].succedents[0] = xorNode;
516 		xorNode->antecedents[1 + i] = &rrdNodes[i];
517 		xorNode->antType[1 + i] = rf_trueData;
518 	}
519 	for (i = 0; i < nRudNodes; i++) {
520 		RF_ASSERT(rudNodes[i].numSuccedents == 1);
521 		rudNodes[i].succedents[0] = xorNode;
522 		xorNode->antecedents[1 + nRrdNodes + i] = &rudNodes[i];
523 		xorNode->antType[1 + nRrdNodes + i] = rf_trueData;
524 	}
525 
526 	/* Connect the xor node to the commit node. */
527 	RF_ASSERT(xorNode->numSuccedents == 1);
528 	RF_ASSERT(commitNode->numAntecedents == 1);
529 	xorNode->succedents[0] = commitNode;
530 	commitNode->antecedents[0] = xorNode;
531 	commitNode->antType[0] = rf_control;
532 
533 	/* Connect the termNode to the commit node. */
534 	RF_ASSERT(commitNode->numSuccedents == 1);
535 	RF_ASSERT(termNode->numAntecedents == 1);
536 	RF_ASSERT(termNode->numSuccedents == 0);
537 	commitNode->succedents[0] = termNode;
538 	termNode->antType[0] = rf_control;
539 	termNode->antecedents[0] = commitNode;
540 }
541 
542 
543 /*****************************************************************************
544  * Create a degraded read DAG for Chained Declustering.
545  *
546  * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm
547  *
548  * The "Rd" node reads data from the surviving disk in the mirror pair
549  *   Rpd - read of primary copy
550  *   Rsd - read of secondary copy
551  *
552  * Parameters:  raidPtr	  - description of the physical array
553  *		asmap	  - logical & physical addresses for this access
554  *		bp	  - buffer ptr (for holding write data)
555  *		flags	  - general flags (e.g. disk locking)
556  *		allocList - list of memory allocated in DAG creation
557  *****************************************************************************/
558 
559 void
rf_CreateRaidCDegradedReadDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList)560 rf_CreateRaidCDegradedReadDAG(
561     RF_Raid_t			*raidPtr,
562     RF_AccessStripeMap_t	*asmap,
563     RF_DagHeader_t		*dag_h,
564     void			*bp,
565     RF_RaidAccessFlags_t	 flags,
566     RF_AllocListElem_t		*allocList
567 )
568 {
569 	RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
570 	RF_StripeNum_t parityStripeID;
571 	int useMirror, i, shiftable;
572 	RF_ReconUnitNum_t which_ru;
573 	RF_PhysDiskAddr_t *pda;
574 
575 	if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
576 		shiftable = RF_TRUE;
577 	} else {
578 		shiftable = RF_FALSE;
579 	}
580 	useMirror = 0;
581 	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
582 	    asmap->raidAddress, &which_ru);
583 
584 	if (rf_dagDebug) {
585 		printf("[Creating RAID C degraded read DAG]\n");
586 	}
587 	dag_h->creator = "RaidCDegradedReadDAG";
588 	/* Alloc the Wnd nodes and the Wmir node. */
589 	if (asmap->numDataFailed == 0)
590 		useMirror = RF_FALSE;
591 	else
592 		useMirror = RF_TRUE;
593 
594 	/* total number of nodes = 1 + (block + commit + terminator) */
595 	RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *),
596 	    allocList);
597 	i = 0;
598 	rdNode = &nodes[i];
599 	i++;
600 	blockNode = &nodes[i];
601 	i++;
602 	commitNode = &nodes[i];
603 	i++;
604 	termNode = &nodes[i];
605 	i++;
606 
607 	/*
608 	 * This dag can not commit until the commit node is reached.
609 	 * Errors prior to the commit point imply the dag has failed
610 	 * and must be retried.
611 	 */
612 	dag_h->numCommitNodes = 1;
613 	dag_h->numCommits = 0;
614 	dag_h->numSuccedents = 1;
615 
616 	/* initialize the block, commit, and terminator nodes */
617 	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
618 	    rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
619 	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
620 	    rf_NullNodeUndoFunc, NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
621 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
622 	    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
623 
624 	pda = asmap->physInfo;
625 	RF_ASSERT(pda != NULL);
626 	/* ParityInfo must describe entire parity unit. */
627 	RF_ASSERT(asmap->parityInfo->next == NULL);
628 
629 	/* Initialize the data node. */
630 	if (!useMirror) {
631 		rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
632 		    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
633 		    dag_h, "Rpd", allocList);
634 		if (shiftable && rf_compute_workload_shift(raidPtr, pda)) {
635 			/* Shift this read to the next disk in line. */
636 			rdNode->params[0].p = asmap->parityInfo;
637 			rdNode->params[1].p = pda->bufPtr;
638 			rdNode->params[2].v = parityStripeID;
639 			rdNode->params[3].v = RF_CREATE_PARAM3(
640 			    RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
641 		} else {
642 			/* Read primary copy. */
643 			rdNode->params[0].p = pda;
644 			rdNode->params[1].p = pda->bufPtr;
645 			rdNode->params[2].v = parityStripeID;
646 			rdNode->params[3].v = RF_CREATE_PARAM3(
647 			    RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
648 		}
649 	} else {
650 		/* Read secondary copy of data. */
651 		rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
652 		    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
653 		    dag_h, "Rsd", allocList);
654 		rdNode->params[0].p = asmap->parityInfo;
655 		rdNode->params[1].p = pda->bufPtr;
656 		rdNode->params[2].v = parityStripeID;
657 		rdNode->params[3].v =
658 		    RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
659 	}
660 
661 	/* Connect header to block node. */
662 	RF_ASSERT(dag_h->numSuccedents == 1);
663 	RF_ASSERT(blockNode->numAntecedents == 0);
664 	dag_h->succedents[0] = blockNode;
665 
666 	/* Connect block node to rdnode. */
667 	RF_ASSERT(blockNode->numSuccedents == 1);
668 	RF_ASSERT(rdNode->numAntecedents == 1);
669 	blockNode->succedents[0] = rdNode;
670 	rdNode->antecedents[0] = blockNode;
671 	rdNode->antType[0] = rf_control;
672 
673 	/* Connect rdnode to commit node. */
674 	RF_ASSERT(rdNode->numSuccedents == 1);
675 	RF_ASSERT(commitNode->numAntecedents == 1);
676 	rdNode->succedents[0] = commitNode;
677 	commitNode->antecedents[0] = rdNode;
678 	commitNode->antType[0] = rf_control;
679 
680 	/* Connect commit node to terminator. */
681 	RF_ASSERT(commitNode->numSuccedents == 1);
682 	RF_ASSERT(termNode->numAntecedents == 1);
683 	RF_ASSERT(termNode->numSuccedents == 0);
684 	commitNode->succedents[0] = termNode;
685 	termNode->antecedents[0] = commitNode;
686 	termNode->antType[0] = rf_control;
687 }
688 
689 /*
690  * XXX move this elsewhere ?
691  */
692 void
rf_DD_GenerateFailedAccessASMs(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_PhysDiskAddr_t ** pdap,int * nNodep,RF_PhysDiskAddr_t ** pqpdap,int * nPQNodep,RF_AllocListElem_t * allocList)693 rf_DD_GenerateFailedAccessASMs(
694     RF_Raid_t			 *raidPtr,
695     RF_AccessStripeMap_t	 *asmap,
696     RF_PhysDiskAddr_t		**pdap,
697     int				 *nNodep,
698     RF_PhysDiskAddr_t		**pqpdap,
699     int				 *nPQNodep,
700     RF_AllocListElem_t		 *allocList
701 )
702 {
703 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
704 	int PDAPerDisk, i;
705 	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
706 	int numDataCol = layoutPtr->numDataCol;
707 	int state;
708 	RF_SectorNum_t suoff, suend;
709 	unsigned firstDataCol, napdas, count;
710 	RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0;
711 	RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0];
712 	RF_PhysDiskAddr_t *ftwo = asmap->failedPDAs[1];
713 	RF_PhysDiskAddr_t *pda_p;
714 	RF_PhysDiskAddr_t *phys_p;
715 	RF_RaidAddr_t sosAddr;
716 
717 	/*
718 	 * Determine how many pda's we will have to generate per unaccessed
719 	 * stripe. If there is only one failed data unit, it is one; if two,
	 * possibly two, depending on whether they overlap.
721 	 */
722 
723 	fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
724 	fone_end = fone_start + fone->numSector;
725 
726 #define	CONS_PDA(if,start,num)		do {				\
727 	pda_p->row = asmap->if->row;					\
728 	pda_p->col = asmap->if->col;					\
729 	pda_p->startSector = ((asmap->if->startSector / secPerSU) *	\
730 	    secPerSU) + start;						\
731 	pda_p->numSector = num;						\
732 	pda_p->next = NULL;						\
733 	RF_MallocAndAdd(pda_p->bufPtr,					\
734 	    rf_RaidAddressToByte(raidPtr,num),(char *), allocList);	\
735 } while (0)
736 
737 	if (asmap->numDataFailed == 1) {
738 		PDAPerDisk = 1;
739 		state = 1;
740 		RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
741 		    (RF_PhysDiskAddr_t *), allocList);
742 		pda_p = *pqpdap;
743 		/* Build p. */
744 		CONS_PDA(parityInfo, fone_start, fone->numSector);
745 		pda_p->type = RF_PDA_TYPE_PARITY;
746 		pda_p++;
747 		/* Build q. */
748 		CONS_PDA(qInfo, fone_start, fone->numSector);
749 		pda_p->type = RF_PDA_TYPE_Q;
750 	} else {
751 		ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
752 		ftwo_end = ftwo_start + ftwo->numSector;
753 		if (fone->numSector + ftwo->numSector > secPerSU) {
754 			PDAPerDisk = 1;
755 			state = 2;
756 			RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
757 			    (RF_PhysDiskAddr_t *), allocList);
758 			pda_p = *pqpdap;
759 			CONS_PDA(parityInfo, 0, secPerSU);
760 			pda_p->type = RF_PDA_TYPE_PARITY;
761 			pda_p++;
762 			CONS_PDA(qInfo, 0, secPerSU);
763 			pda_p->type = RF_PDA_TYPE_Q;
764 		} else {
765 			PDAPerDisk = 2;
766 			state = 3;
767 			/* Four of them, fone, then ftwo. */
768 			RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t),
769 			    (RF_PhysDiskAddr_t *), allocList);
770 			pda_p = *pqpdap;
771 			CONS_PDA(parityInfo, fone_start, fone->numSector);
772 			pda_p->type = RF_PDA_TYPE_PARITY;
773 			pda_p++;
774 			CONS_PDA(qInfo, fone_start, fone->numSector);
775 			pda_p->type = RF_PDA_TYPE_Q;
776 			pda_p++;
777 			CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
778 			pda_p->type = RF_PDA_TYPE_PARITY;
779 			pda_p++;
780 			CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
781 			pda_p->type = RF_PDA_TYPE_Q;
782 		}
783 	}
784 	/* Figure out number of nonaccessed pda. */
785 	napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed -
786 	    (ftwo == NULL ? 1 : 0));
787 	*nPQNodep = PDAPerDisk;
788 
789 	/*
790 	 * Sweep over the over accessed pda's, figuring out the number of
791 	 * additional pda's to generate. Of course, skip the failed ones.
792 	 */
793 
794 	count = 0;
795 	for (pda_p = asmap->physInfo; pda_p; pda_p = pda_p->next) {
796 		if ((pda_p == fone) || (pda_p == ftwo))
797 			continue;
798 		suoff = rf_StripeUnitOffset(layoutPtr, pda_p->startSector);
799 		suend = suoff + pda_p->numSector;
800 		switch (state) {
801 		case 1:	/* One failed PDA to overlap. */
802 			/*
803 			 * If a PDA doesn't contain the failed unit, it can
804 			 * only miss the start or end, not both.
805 			 */
806 			if ((suoff > fone_start) || (suend < fone_end))
807 				count++;
808 			break;
809 		case 2:	/* Whole stripe. */
810 			if (suoff)			/* Leak at begining. */
811 				count++;
812 			if (suend < numDataCol)		/* Leak at end. */
813 				count++;
814 			break;
815 		case 3:	/* Two disjoint units. */
816 			if ((suoff > fone_start) || (suend < fone_end))
817 				count++;
818 			if ((suoff > ftwo_start) || (suend < ftwo_end))
819 				count++;
820 			break;
821 		default:
822 			RF_PANIC();
823 		}
824 	}
825 
826 	napdas += count;
827 	*nNodep = napdas;
828 	if (napdas == 0)
829 		return;		/* short circuit */
830 
831 	/* Allocate up our list of pda's. */
832 
833 	RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t),
834 	    (RF_PhysDiskAddr_t *), allocList);
835 	*pdap = pda_p;
836 
837 	/* Link them together. */
838 	for (i = 0; i < (napdas - 1); i++)
839 		pda_p[i].next = pda_p + (i + 1);
840 
841 	/* March through the one's up to the first accessed disk. */
842 	firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
843 	    asmap->physInfo->raidAddress) % numDataCol;
844 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
845 	    asmap->raidAddress);
846 	for (i = 0; i < firstDataCol; i++) {
847 		if ((pda_p - (*pdap)) == napdas)
848 			continue;
849 		pda_p->type = RF_PDA_TYPE_DATA;
850 		pda_p->raidAddress = sosAddr + (i * secPerSU);
851 		(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
852 		    &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
853 		/* Skip over dead disks. */
854 		if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
855 			continue;
856 		switch (state) {
857 		case 1:	/* Fone. */
858 			pda_p->numSector = fone->numSector;
859 			pda_p->raidAddress += fone_start;
860 			pda_p->startSector += fone_start;
861 			RF_MallocAndAdd(pda_p->bufPtr,
862 			    rf_RaidAddressToByte(raidPtr, pda_p->numSector),
863 			    (char *), allocList);
864 			break;
865 		case 2:	/* Full stripe. */
866 			pda_p->numSector = secPerSU;
867 			RF_MallocAndAdd(pda_p->bufPtr,
868 			    rf_RaidAddressToByte(raidPtr, secPerSU),
869 			    (char *), allocList);
870 			break;
871 		case 3:	/* Two slabs. */
872 			pda_p->numSector = fone->numSector;
873 			pda_p->raidAddress += fone_start;
874 			pda_p->startSector += fone_start;
875 			RF_MallocAndAdd(pda_p->bufPtr,
876 			    rf_RaidAddressToByte(raidPtr, pda_p->numSector),
877 			    (char *), allocList);
878 			pda_p++;
879 			pda_p->type = RF_PDA_TYPE_DATA;
880 			pda_p->raidAddress = sosAddr + (i * secPerSU);
881 			(raidPtr->Layout.map->MapSector) (raidPtr,
882 			    pda_p->raidAddress, &(pda_p->row), &(pda_p->col),
883 			    &(pda_p->startSector), 0);
884 			pda_p->numSector = ftwo->numSector;
885 			pda_p->raidAddress += ftwo_start;
886 			pda_p->startSector += ftwo_start;
887 			RF_MallocAndAdd(pda_p->bufPtr,
888 			    rf_RaidAddressToByte(raidPtr, pda_p->numSector),
889 			    (char *), allocList);
890 			break;
891 		default:
892 			RF_PANIC();
893 		}
894 		pda_p++;
895 	}
896 
897 	/* March through the touched stripe units. */
898 	for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++) {
899 		if ((phys_p == asmap->failedPDAs[0]) ||
900 		    (phys_p == asmap->failedPDAs[1]))
901 			continue;
902 		suoff = rf_StripeUnitOffset(layoutPtr, phys_p->startSector);
903 		suend = suoff + phys_p->numSector;
904 		switch (state) {
905 		case 1:	/* Single buffer. */
906 			if (suoff > fone_start) {
907 				RF_ASSERT(suend >= fone_end);
908 				/*
909 				 * The data read starts after the mapped
910 				 * access, snip off the begining.
911 				 */
912 				pda_p->numSector = suoff - fone_start;
913 				pda_p->raidAddress = sosAddr + (i * secPerSU)
914 				    + fone_start;
915 				(raidPtr->Layout.map->MapSector) (raidPtr,
916 				    pda_p->raidAddress, &(pda_p->row),
917 				    &(pda_p->col), &(pda_p->startSector), 0);
918 				RF_MallocAndAdd(pda_p->bufPtr,
919 				    rf_RaidAddressToByte(raidPtr,
920 				    pda_p->numSector), (char *), allocList);
921 				pda_p++;
922 			}
923 			if (suend < fone_end) {
924 				RF_ASSERT(suoff <= fone_start);
925 				/*
926 				 * The data read stops before the end of the
927 				 * failed access, extend.
928 				 */
929 				pda_p->numSector = fone_end - suend;
930 				pda_p->raidAddress = sosAddr + (i * secPerSU)
931 				    + suend;	/* off by one? */
932 				(raidPtr->Layout.map->MapSector) (raidPtr,
933 				    pda_p->raidAddress, &(pda_p->row),
934 				    &(pda_p->col), &(pda_p->startSector), 0);
935 				RF_MallocAndAdd(pda_p->bufPtr,
936 				    rf_RaidAddressToByte(raidPtr,
937 				    pda_p->numSector), (char *), allocList);
938 				pda_p++;
939 			}
940 			break;
941 		case 2:	/* Whole stripe unit. */
942 			RF_ASSERT((suoff == 0) || (suend == secPerSU));
943 			if (suend < secPerSU) {
944 				/* Short read, snip from end on. */
945 				pda_p->numSector = secPerSU - suend;
946 				pda_p->raidAddress = sosAddr + (i * secPerSU)
947 				    + suend;	/* off by one? */
948 				(raidPtr->Layout.map->MapSector) (raidPtr,
949 				    pda_p->raidAddress, &(pda_p->row),
950 				    &(pda_p->col), &(pda_p->startSector), 0);
951 				RF_MallocAndAdd(pda_p->bufPtr,
952 				    rf_RaidAddressToByte(raidPtr,
953 				    pda_p->numSector), (char *), allocList);
954 				pda_p++;
955 			} else
956 				if (suoff > 0) {
957 					/* Short at front. */
958 					pda_p->numSector = suoff;
959 					pda_p->raidAddress = sosAddr +
960 					    (i * secPerSU);
961 					(raidPtr->Layout.map->MapSector)
962 					    (raidPtr, pda_p->raidAddress,
963 					    &(pda_p->row), &(pda_p->col),
964 					    &(pda_p->startSector), 0);
965 					RF_MallocAndAdd(pda_p->bufPtr,
966 					    rf_RaidAddressToByte(raidPtr,
967 					    pda_p->numSector), (char *),
968 					    allocList);
969 					pda_p++;
970 				}
971 			break;
972 		case 3:	/* Two nonoverlapping failures. */
973 			if ((suoff > fone_start) || (suend < fone_end)) {
974 				if (suoff > fone_start) {
975 					RF_ASSERT(suend >= fone_end);
976 					/*
977 					 * The data read starts after the
978 					 * mapped access, snip off the
979 					 * begining.
980 					 */
981 					pda_p->numSector = suoff - fone_start;
982 					pda_p->raidAddress = sosAddr +
983 					    (i * secPerSU) + fone_start;
984 					(raidPtr->Layout.map->MapSector)
985 					    (raidPtr, pda_p->raidAddress,
986 					    &(pda_p->row), &(pda_p->col),
987 					    &(pda_p->startSector), 0);
988 					RF_MallocAndAdd(pda_p->bufPtr,
989 					    rf_RaidAddressToByte(raidPtr,
990 					    pda_p->numSector), (char *),
991 					    allocList);
992 					pda_p++;
993 				}
994 				if (suend < fone_end) {
995 					RF_ASSERT(suoff <= fone_start);
996 					/*
997 					 * The data read stops before the end
998 					 * of the failed access, extend.
999 					 */
1000 					pda_p->numSector = fone_end - suend;
1001 					pda_p->raidAddress = sosAddr +
1002 					    (i * secPerSU) +
1003 					    suend;	/* Off by one ? */
1004 					(raidPtr->Layout.map->MapSector)
1005 					    (raidPtr, pda_p->raidAddress,
1006 					    &(pda_p->row), &(pda_p->col),
1007 					    &(pda_p->startSector), 0);
1008 					RF_MallocAndAdd(pda_p->bufPtr,
1009 					    rf_RaidAddressToByte(raidPtr,
1010 					    pda_p->numSector), (char *),
1011 					    allocList);
1012 					pda_p++;
1013 				}
1014 			}
1015 			if ((suoff > ftwo_start) || (suend < ftwo_end)) {
1016 				if (suoff > ftwo_start) {
1017 					RF_ASSERT(suend >= ftwo_end);
1018 					/*
1019 					 * The data read starts after the
1020 					 * mapped access, snip off the
1021 					 * begining.
1022 					 */
1023 					pda_p->numSector = suoff - ftwo_start;
1024 					pda_p->raidAddress = sosAddr +
1025 					    (i * secPerSU) + ftwo_start;
1026 					(raidPtr->Layout.map->MapSector)
1027 					    (raidPtr, pda_p->raidAddress,
1028 					    &(pda_p->row), &(pda_p->col),
1029 					    &(pda_p->startSector), 0);
1030 					RF_MallocAndAdd(pda_p->bufPtr,
1031 					    rf_RaidAddressToByte(raidPtr,
1032 					    pda_p->numSector), (char *),
1033 					    allocList);
1034 					pda_p++;
1035 				}
1036 				if (suend < ftwo_end) {
1037 					RF_ASSERT(suoff <= ftwo_start);
1038 					/*
1039 					 * The data read stops before the end
1040 					 * of the failed access, extend.
1041 					 */
1042 					pda_p->numSector = ftwo_end - suend;
1043 					pda_p->raidAddress = sosAddr +
1044 					    (i * secPerSU) +
1045 					    suend;	/* Off by one ? */
1046 					(raidPtr->Layout.map->MapSector)
1047 					    (raidPtr, pda_p->raidAddress,
1048 					    &(pda_p->row), &(pda_p->col),
1049 					    &(pda_p->startSector), 0);
1050 					RF_MallocAndAdd(pda_p->bufPtr,
1051 					    rf_RaidAddressToByte(raidPtr,
1052 					    pda_p->numSector), (char *),
1053 					    allocList);
1054 					pda_p++;
1055 				}
1056 			}
1057 			break;
1058 		default:
1059 			RF_PANIC();
1060 		}
1061 	}
1062 
1063 	/* After the last accessed disk. */
1064 	for (; i < numDataCol; i++) {
1065 		if ((pda_p - (*pdap)) == napdas)
1066 			continue;
1067 		pda_p->type = RF_PDA_TYPE_DATA;
1068 		pda_p->raidAddress = sosAddr + (i * secPerSU);
1069 		(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
1070 		    &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
1071 		/* Skip over dead disks. */
1072 		if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
1073 			continue;
1074 		switch (state) {
1075 		case 1:	/* Fone. */
1076 			pda_p->numSector = fone->numSector;
1077 			pda_p->raidAddress += fone_start;
1078 			pda_p->startSector += fone_start;
1079 			RF_MallocAndAdd(pda_p->bufPtr,
1080 			    rf_RaidAddressToByte(raidPtr, pda_p->numSector),
1081 			    (char *), allocList);
1082 			break;
1083 		case 2:	/* Full stripe. */
1084 			pda_p->numSector = secPerSU;
1085 			RF_MallocAndAdd(pda_p->bufPtr,
1086 			    rf_RaidAddressToByte(raidPtr, secPerSU),
1087 			    (char *), allocList);
1088 			break;
1089 		case 3:	/* Two slabs. */
1090 			pda_p->numSector = fone->numSector;
1091 			pda_p->raidAddress += fone_start;
1092 			pda_p->startSector += fone_start;
1093 			RF_MallocAndAdd(pda_p->bufPtr,
1094 			    rf_RaidAddressToByte(raidPtr, pda_p->numSector),
1095 			    (char *), allocList);
1096 			pda_p++;
1097 			pda_p->type = RF_PDA_TYPE_DATA;
1098 			pda_p->raidAddress = sosAddr + (i * secPerSU);
1099 			(raidPtr->Layout.map->MapSector) (raidPtr,
1100 			    pda_p->raidAddress, &(pda_p->row), &(pda_p->col),
1101 			    &(pda_p->startSector), 0);
1102 			pda_p->numSector = ftwo->numSector;
1103 			pda_p->raidAddress += ftwo_start;
1104 			pda_p->startSector += ftwo_start;
1105 			RF_MallocAndAdd(pda_p->bufPtr,
1106 			    rf_RaidAddressToByte(raidPtr, pda_p->numSector),
1107 			    (char *), allocList);
1108 			break;
1109 		default:
1110 			RF_PANIC();
1111 		}
1112 		pda_p++;
1113 	}
1114 
1115 	RF_ASSERT(pda_p - *pdap == napdas);
1116 	return;
1117 }
1118 
/*
 * INIT_DISK_NODE(dnode, label)
 *
 * Initialize the DAG node pointed to by `dnode' as a disk read called
 * `label' and hook it into the surrounding graph.  The four counts handed
 * to rf_InitNode() are, in order: 2 successors, 1 antecedent, 4 params and
 * 0 results.
 *
 * The expansion picks up dag_h, allocList, blockNode, unblockNode and
 * recoveryNode from the scope in which the macro is used.  `dnode' is
 * evaluated more than once, so it must be free of side effects.
 */
#define	INIT_DISK_NODE(dnode, label)	do {				\
	rf_InitNode(dnode, rf_wait, RF_FALSE, rf_DiskReadFunc,		\
	    rf_DiskReadUndoFunc, rf_GenericWakeupFunc,			\
	    2, 1, 4, 0,							\
	    dag_h, label, allocList);					\
	/* Released by the block node (control dependency)... */	\
	(dnode)->antecedents[0] = blockNode;				\
	(dnode)->antType[0] = rf_control;				\
	/* ...and feeds both the unblock and the recovery node. */	\
	(dnode)->succedents[0] = unblockNode;				\
	(dnode)->succedents[1] = recoveryNode;				\
} while (0)
1128 
/*
 * DISK_NODE_PARAMS(dnode, pdap)
 *
 * Fill in the four parameters of the disk read node `dnode' (a node
 * lvalue, not a pointer):
 *	params[0]  the physical disk address `pdap' itself,
 *	params[1]  the buffer attached to that address (pdap->bufPtr),
 *	params[2]  parityStripeID,
 *	params[3]  RF_CREATE_PARAM3() word built from normal I/O priority,
 *		   two zero arguments and which_ru.
 *
 * parityStripeID and which_ru are taken from the scope in which the macro
 * is used.  Both arguments are evaluated more than once.
 */
#define	DISK_NODE_PARAMS(dnode, pdap)	do {				\
	(dnode).params[0].p = (pdap);					\
	(dnode).params[1].p = (pdap)->bufPtr;				\
	(dnode).params[2].v = parityStripeID;				\
	(dnode).params[3].v =						\
	    RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);	\
} while (0)
1136 
1137 void
rf_DoubleDegRead(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,char * redundantReadNodeName,char * recoveryNodeName,int (* recovFunc)(RF_DagNode_t *))1138 rf_DoubleDegRead(
1139     RF_Raid_t			 *raidPtr,
1140     RF_AccessStripeMap_t	 *asmap,
1141     RF_DagHeader_t		 *dag_h,
1142     void			 *bp,
1143     RF_RaidAccessFlags_t	  flags,
1144     RF_AllocListElem_t		 *allocList,
1145     char			 *redundantReadNodeName,
1146     char			 *recoveryNodeName,
1147     int				(*recovFunc) (RF_DagNode_t *)
1148 )
1149 {
1150 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
1151 	RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode,
1152 	    *unblockNode, *rpNodes, *rqNodes, *termNode;
1153 	RF_PhysDiskAddr_t *pda, *pqPDAs;
1154 	RF_PhysDiskAddr_t *npdas;
1155 	int nNodes, nRrdNodes, nRudNodes, i;
1156 	RF_ReconUnitNum_t which_ru;
1157 	int nReadNodes, nPQNodes;
1158 	RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
1159 	RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1];
1160 	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(
1161 	    layoutPtr, asmap->raidAddress, &which_ru);
1162 
1163 	if (rf_dagDebug)
1164 		printf("[Creating Double Degraded Read DAG]\n");
1165 	rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes,
1166 	    &pqPDAs, &nPQNodes, allocList);
1167 
1168 	nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
1169 	nReadNodes = nRrdNodes + nRudNodes + 2 * nPQNodes;
1170 	nNodes = 4 /* Block, unblock, recovery, term. */ + nReadNodes;
1171 
1172 	RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
1173 	    allocList);
1174 	i = 0;
1175 	blockNode = &nodes[i];
1176 	i += 1;
1177 	unblockNode = &nodes[i];
1178 	i += 1;
1179 	recoveryNode = &nodes[i];
1180 	i += 1;
1181 	termNode = &nodes[i];
1182 	i += 1;
1183 	rudNodes = &nodes[i];
1184 	i += nRudNodes;
1185 	rrdNodes = &nodes[i];
1186 	i += nRrdNodes;
1187 	rpNodes = &nodes[i];
1188 	i += nPQNodes;
1189 	rqNodes = &nodes[i];
1190 	i += nPQNodes;
1191 	RF_ASSERT(i == nNodes);
1192 
1193 	dag_h->numSuccedents = 1;
1194 	dag_h->succedents[0] = blockNode;
1195 	dag_h->creator = "DoubleDegRead";
1196 	dag_h->numCommits = 0;
1197 	dag_h->numCommitNodes = 1;	/* Unblock. */
1198 
1199 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
1200 	    rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList);
1201 	termNode->antecedents[0] = unblockNode;
1202 	termNode->antType[0] = rf_control;
1203 	termNode->antecedents[1] = recoveryNode;
1204 	termNode->antType[1] = rf_control;
1205 
1206 	/*
1207 	 * Init the block and unblock nodes.
1208 	 * The block node has all nodes except itself, unblock and
1209 	 * recovery as successors.
1210 	 * Similarly for predecessors of the unblock.
1211 	 */
1212 	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
1213 	    rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h,
1214 	    "Nil", allocList);
1215 	rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
1216 	    rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h,
1217 	    "Nil", allocList);
1218 
1219 	for (i = 0; i < nReadNodes; i++) {
1220 		blockNode->succedents[i] = rudNodes + i;
1221 		unblockNode->antecedents[i] = rudNodes + i;
1222 		unblockNode->antType[i] = rf_control;
1223 	}
1224 	unblockNode->succedents[0] = termNode;
1225 
1226 	/*
1227 	 * The recovery node has all the reads as predecessors, and the term
1228 	 * node as successors. It gets a pda as a param from each of the read
1229 	 * nodes plus the raidPtr. For each failed unit is has a result pda.
1230 	 */
1231 	rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc,
1232 	    rf_NullNodeUndoFunc, NULL,
1233 	    1,				/* succesors */
1234 	    nReadNodes,			/* preds */
1235 	    nReadNodes + 2,		/* params */
1236 	    asmap->numDataFailed,	/* results */
1237 	    dag_h, recoveryNodeName, allocList);
1238 
1239 	recoveryNode->succedents[0] = termNode;
1240 	for (i = 0; i < nReadNodes; i++) {
1241 		recoveryNode->antecedents[i] = rudNodes + i;
1242 		recoveryNode->antType[i] = rf_trueData;
1243 	}
1244 
1245 	/*
1246 	 * Build the read nodes, then come back and fill in recovery params
1247 	 * and results.
1248 	 */
1249 	pda = asmap->physInfo;
1250 	for (i = 0; i < nRudNodes; pda = pda->next) {
1251 		if ((pda == failedPDA) || (pda == failedPDAtwo))
1252 			continue;
1253 		INIT_DISK_NODE(rudNodes + i, "Rud");
1254 		RF_ASSERT(pda);
1255 		DISK_NODE_PARAMS(rudNodes[i], pda);
1256 		i++;
1257 	}
1258 
1259 	pda = npdas;
1260 	for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
1261 		INIT_DISK_NODE(rrdNodes + i, "Rrd");
1262 		RF_ASSERT(pda);
1263 		DISK_NODE_PARAMS(rrdNodes[i], pda);
1264 	}
1265 
1266 	/* Redundancy pdas. */
1267 	pda = pqPDAs;
1268 	INIT_DISK_NODE(rpNodes, "Rp");
1269 	RF_ASSERT(pda);
1270 	DISK_NODE_PARAMS(rpNodes[0], pda);
1271 	pda++;
1272 	INIT_DISK_NODE(rqNodes, redundantReadNodeName);
1273 	RF_ASSERT(pda);
1274 	DISK_NODE_PARAMS(rqNodes[0], pda);
1275 	if (nPQNodes == 2) {
1276 		pda++;
1277 		INIT_DISK_NODE(rpNodes + 1, "Rp");
1278 		RF_ASSERT(pda);
1279 		DISK_NODE_PARAMS(rpNodes[1], pda);
1280 		pda++;
1281 		INIT_DISK_NODE(rqNodes + 1, redundantReadNodeName);
1282 		RF_ASSERT(pda);
1283 		DISK_NODE_PARAMS(rqNodes[1], pda);
1284 	}
1285 	/* Fill in recovery node params. */
1286 	for (i = 0; i < nReadNodes; i++)
1287 		recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */
1288 	recoveryNode->params[i++].p = (void *) raidPtr;
1289 	recoveryNode->params[i++].p = (void *) asmap;
1290 	recoveryNode->results[0] = failedPDA;
1291 	if (asmap->numDataFailed == 2)
1292 		recoveryNode->results[1] = failedPDAtwo;
1293 
1294 	/* Zero fill the target data buffers ? */
1295 }
1296