1 /* $OpenBSD: rf_dagdegwr.c,v 1.5 2002/12/16 07:01:03 tdeval Exp $ */
2 /* $NetBSD: rf_dagdegwr.c,v 1.5 2000/01/07 03:40:57 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /*
32 * rf_dagdegwr.c
33 *
34 * Code for creating degraded write DAGs.
35 *
36 */
37
38 #include "rf_types.h"
39 #include "rf_raid.h"
40 #include "rf_dag.h"
41 #include "rf_dagutils.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_debugMem.h"
44 #include "rf_memchunk.h"
45 #include "rf_general.h"
46 #include "rf_dagdegwr.h"
47
48
49 /*****************************************************************************
50 *
51 * General comments on DAG creation:
52 *
53 * All DAGs in this file use roll-away error recovery. Each DAG has a single
54 * commit node, usually called "Cmt". If an error occurs before the Cmt node
55 * is reached, the execution engine will halt forward execution and work
56 * backward through the graph, executing the undo functions. Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * does not make changes to permanent state, the graph will fail atomically.
59 * If an error occurs after the Cmt node executes, the engine will roll-forward
60 * through the graph, blindly executing nodes until it reaches the end.
61 * If a graph reaches the end, it is assumed to have completed successfully.
62 *
63 * A graph has only 1 Cmt node.
64 *
65 *****************************************************************************/
66
67
68 /*****************************************************************************
69 *
70 * The following wrappers map the standard DAG creation interface to the
71 * DAG creation routines. Additionally, these wrappers enable experimentation
72 * with new DAG structures by providing an extra level of indirection, allowing
73 * the DAG creation routines to be replaced at this single point.
74 *
75 *****************************************************************************/
76
RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)77 RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
78 {
79 rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
80 flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
81 }
82
83 void
rf_CreateDegradedWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList)84 rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
85 RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
86 RF_AllocListElem_t *allocList)
87 {
88 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
89 RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
90
91 RF_ASSERT(asmap->numDataFailed == 1);
92 dag_h->creator = "DegradedWriteDAG";
93
94 /*
95 * If the access writes only a portion of the failed unit, and also
96 * writes some portion of at least one surviving unit, we create two
97 * DAGs, one for the failed component and one for the non-failed
98 * component, and do them sequentially. Note that the fact that we're
99 * accessing only a portion of the failed unit indicates that the
100 * access either starts or ends in the failed unit, and hence we need
101 * create only two dags. This is inefficient in that the same data or
102 * parity can get read and written twice using this structure. I need
103 * to fix this to do the access all at once.
104 */
105 RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
106 failedPDA->numSector != layoutPtr->sectorsPerStripeUnit));
107 rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
108 flags, allocList);
109 }
110
111
112
113 /*****************************************************************************
114 *
115 * DAG creation code begins here.
116 *
117 *****************************************************************************/
118
119
120 /*****************************************************************************
121 *
122 * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
123 * write, which is as follows
124 *
125 * / {Wnq} --\
126 * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
127 * \ {Rod} / | Wnd ---/
128 * \ {Wnd} -/
129 *
 * Commit node: Cmt
131 *
132 * IMPORTANT:
133 * This DAG generator does not work for double-degraded archs since it does not
134 * generate Q.
135 *
136 * This dag is essentially identical to the large-write dag, except that the
137 * write to the failed data unit is suppressed.
138 *
139 * IMPORTANT: this dag does not work in the case where the access writes only
140 * a portion of the failed unit, and also writes some portion of at least one
141 * surviving SU. this case is handled in CreateDegradedWriteDAG above.
142 *
143 * The block & unblock nodes are leftovers from a previous version. They
144 * do nothing, but I haven't deleted them because it would be a tremendous
145 * effort to put them back in.
146 *
147 * This dag is used whenever one of the data units in a write has failed.
148 * If it is the parity unit that failed, the nonredundant write dag (below)
149 * is used.
150 *
151 *****************************************************************************/
152
153 void
rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,int nfaults,int (* redFunc)(RF_DagNode_t *),int allowBufferRecycle)154 rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
155 RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
156 RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, int nfaults,
157 int (*redFunc) (RF_DagNode_t *), int allowBufferRecycle)
158 {
159 int nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
160 rdnodesFaked;
161 RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
162 RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode;
163 RF_SectorCount_t sectorsPerSU;
164 RF_ReconUnitNum_t which_ru;
165 char *xorTargetBuf = NULL; /*
166 * The target buffer for the XOR
167 * operation.
168 */
169 char *overlappingPDAs; /* A temporary array of flags. */
170 RF_AccessStripeMapHeader_t *new_asm_h[2];
171 RF_PhysDiskAddr_t *pda, *parityPDA;
172 RF_StripeNum_t parityStripeID;
173 RF_PhysDiskAddr_t *failedPDA;
174 RF_RaidLayout_t *layoutPtr;
175
176 layoutPtr = &(raidPtr->Layout);
177 parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
178 asmap->raidAddress, &which_ru);
179 sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
180 /*
181 * failedPDA points to the pda within the asm that targets
182 * the failed disk.
183 */
184 failedPDA = asmap->failedPDAs[0];
185
186 if (rf_dagDebug)
187 printf("[Creating degraded-write DAG]\n");
188
189 RF_ASSERT(asmap->numDataFailed == 1);
190 dag_h->creator = "SimpleDegradedWriteDAG";
191
192 /*
193 * Generate two ASMs identifying the surviving data
194 * we need in order to recover the lost data.
195 */
196 /* overlappingPDAs array must be zero'd */
197 RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed,
198 sizeof(char), (char *));
199 rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h,
200 new_asm_h, &nXorBufs, NULL, overlappingPDAs, allocList);
201
202 /* Create all the nodes at once. */
203 nWndNodes = asmap->numStripeUnitsAccessed - 1; /*
204 * No access is
205 * generated for the
206 * failed pda.
207 */
208
209 nRrdNodes = ((new_asm_h[0]) ?
210 new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
211 ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed
212 : 0);
213 /*
214 * XXX
215 *
216 * There's a bug with a complete stripe overwrite- that means 0 reads
217 * of old data, and the rest of the DAG generation code doesn't like
218 * that. A release is coming, and I don't wanna risk breaking a
219 * critical DAG generator, so here's what I'm gonna do- if there's
220 * no read nodes, I'm gonna fake there being a read node, and I'm
221 * gonna swap in a no-op node in its place (to make all the link-up
222 * code happy).
223 * This should be fixed at some point. --jimz
224 */
225 if (nRrdNodes == 0) {
226 nRrdNodes = 1;
227 rdnodesFaked = 1;
228 } else {
229 rdnodesFaked = 0;
230 }
231 /* Lock, unlock, xor, Wnd, Rrd, W(nfaults). */
232 nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
233 RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t),
234 (RF_DagNode_t *), allocList);
235 i = 0;
236 blockNode = &nodes[i];
237 i += 1;
238 commitNode = &nodes[i];
239 i += 1;
240 unblockNode = &nodes[i];
241 i += 1;
242 termNode = &nodes[i];
243 i += 1;
244 xorNode = &nodes[i];
245 i += 1;
246 wnpNode = &nodes[i];
247 i += 1;
248 wndNodes = &nodes[i];
249 i += nWndNodes;
250 rrdNodes = &nodes[i];
251 i += nRrdNodes;
252 if (nfaults == 2) {
253 wnqNode = &nodes[i];
254 i += 1;
255 } else {
256 wnqNode = NULL;
257 }
258 RF_ASSERT(i == nNodes);
259
260 /*
261 * This dag can not commit until all rrd and xor Nodes have
262 * completed.
263 */
264 dag_h->numCommitNodes = 1;
265 dag_h->numCommits = 0;
266 dag_h->numSuccedents = 1;
267
268 RF_ASSERT(nRrdNodes > 0);
269 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
270 rf_NullNodeUndoFunc, NULL, nRrdNodes, 0, 0, 0, dag_h,
271 "Nil", allocList);
272 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
273 rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0,
274 dag_h, "Cmt", allocList);
275 rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
276 rf_NullNodeUndoFunc, NULL, 1, nWndNodes + nfaults, 0, 0,
277 dag_h, "Nil", allocList);
278 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
279 rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
280 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc,
281 NULL, 1, nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc",
282 allocList);
283
284 /*
285 * Fill in the Rrd nodes. If any of the rrd buffers are the same size
286 * as the failed buffer, save a pointer to it so we can use it as the
287 * target of the XOR. The pdas in the rrd nodes have been range-
288 * restricted, so if a buffer is the same size as the failed buffer,
289 * it must also be at the same alignment within the SU.
290 */
291 i = 0;
292 if (new_asm_h[0]) {
293 for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
294 i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
295 i++, pda = pda->next) {
296 rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE,
297 rf_DiskReadFunc, rf_DiskReadUndoFunc,
298 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
299 "Rrd", allocList);
300 RF_ASSERT(pda);
301 rrdNodes[i].params[0].p = pda;
302 rrdNodes[i].params[1].p = pda->bufPtr;
303 rrdNodes[i].params[2].v = parityStripeID;
304 rrdNodes[i].params[3].v = RF_CREATE_PARAM3(
305 RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
306 }
307 }
308 /* i now equals the number of stripe units accessed in new_asm_h[0]. */
309 if (new_asm_h[1]) {
310 for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
311 j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
312 j++, pda = pda->next) {
313 rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE,
314 rf_DiskReadFunc, rf_DiskReadUndoFunc,
315 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
316 "Rrd", allocList);
317 RF_ASSERT(pda);
318 rrdNodes[i + j].params[0].p = pda;
319 rrdNodes[i + j].params[1].p = pda->bufPtr;
320 rrdNodes[i + j].params[2].v = parityStripeID;
321 rrdNodes[i + j].params[3].v = RF_CREATE_PARAM3(
322 RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
323 if (allowBufferRecycle &&
324 (pda->numSector == failedPDA->numSector))
325 xorTargetBuf = pda->bufPtr;
326 }
327 }
328 if (rdnodesFaked) {
329 /*
330 * This is where we'll init that fake noop read node.
331 * (XXX should the wakeup func be different ?)
332 */
333 rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc,
334 rf_NullNodeUndoFunc, NULL, 1, 1, 0, 0, dag_h, "RrN",
335 allocList);
336 }
337 /*
338 * Make a PDA for the parity unit. The parity PDA should start at
339 * the same offset into the SU as the failed PDA.
340 */
341 /*
342 * Danner comment: I don't think this copy is really necessary. We are
343 * in one of two cases here.
344 * (1) The entire failed unit is written. Then asmap->parityInfo will
345 * describe the entire parity.
346 * (2) We are only writing a subset of the failed unit and nothing else.
347 * Then the asmap->parityInfo describes the failed unit and the copy
348 * can also be avoided.
349 */
350
351 RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
352 (RF_PhysDiskAddr_t *), allocList);
353 parityPDA->row = asmap->parityInfo->row;
354 parityPDA->col = asmap->parityInfo->col;
355 parityPDA->startSector = ((asmap->parityInfo->startSector /
356 sectorsPerSU) * sectorsPerSU) + (failedPDA->startSector %
357 sectorsPerSU);
358 parityPDA->numSector = failedPDA->numSector;
359
360 if (!xorTargetBuf) {
361 RF_CallocAndAdd(xorTargetBuf, 1, rf_RaidAddressToByte(raidPtr,
362 failedPDA->numSector), (char *), allocList);
363 }
364 /* Init the Wnp node. */
365 rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
366 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
367 dag_h, "Wnp", allocList);
368 wnpNode->params[0].p = parityPDA;
369 wnpNode->params[1].p = xorTargetBuf;
370 wnpNode->params[2].v = parityStripeID;
371 wnpNode->params[3].v = RF_CREATE_PARAM3(
372 RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
373
374 /* Fill in the Wnq Node. */
375 if (nfaults == 2) {
376 {
377 RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
378 (RF_PhysDiskAddr_t *), allocList);
379 parityPDA->row = asmap->qInfo->row;
380 parityPDA->col = asmap->qInfo->col;
381 parityPDA->startSector = ((asmap->qInfo->startSector /
382 sectorsPerSU) * sectorsPerSU) +
383 (failedPDA->startSector % sectorsPerSU);
384 parityPDA->numSector = failedPDA->numSector;
385
386 rf_InitNode(wnqNode, rf_wait, RF_FALSE,
387 rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
388 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
389 "Wnq", allocList);
390 wnqNode->params[0].p = parityPDA;
391 RF_CallocAndAdd(xorNode->results[1], 1,
392 rf_RaidAddressToByte(raidPtr, failedPDA->numSector),
393 (char *), allocList);
394 wnqNode->params[1].p = xorNode->results[1];
395 wnqNode->params[2].v = parityStripeID;
396 wnqNode->params[3].v = RF_CREATE_PARAM3(
397 RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
398 }
399 }
400 /* Fill in the Wnd nodes. */
401 for (pda = asmap->physInfo, i = 0; i < nWndNodes;
402 i++, pda = pda->next) {
403 if (pda == failedPDA) {
404 i--;
405 continue;
406 }
407 rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
408 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
409 dag_h, "Wnd", allocList);
410 RF_ASSERT(pda);
411 wndNodes[i].params[0].p = pda;
412 wndNodes[i].params[1].p = pda->bufPtr;
413 wndNodes[i].params[2].v = parityStripeID;
414 wndNodes[i].params[3].v = RF_CREATE_PARAM3(
415 RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
416 }
417
418 /* Fill in the results of the xor node. */
419 xorNode->results[0] = xorTargetBuf;
420
421 /* Fill in the params of the xor node. */
422
423 paramNum = 0;
424 if (rdnodesFaked == 0) {
425 for (i = 0; i < nRrdNodes; i++) {
426 /* All the Rrd nodes need to be xored together. */
427 xorNode->params[paramNum++] = rrdNodes[i].params[0];
428 xorNode->params[paramNum++] = rrdNodes[i].params[1];
429 }
430 }
431 for (i = 0; i < nWndNodes; i++) {
432 /*
433 * Any Wnd nodes that overlap the failed access need to be
434 * xored in.
435 */
436 if (overlappingPDAs[i]) {
437 RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t),
438 (RF_PhysDiskAddr_t *), allocList);
439 bcopy((char *) wndNodes[i].params[0].p, (char *) pda,
440 sizeof(RF_PhysDiskAddr_t));
441 rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
442 RF_RESTRICT_DOBUFFER, 0);
443 xorNode->params[paramNum++].p = pda;
444 xorNode->params[paramNum++].p = pda->bufPtr;
445 }
446 }
447 RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
448
449 /*
450 * Install the failed PDA into the xor param list so that the
451 * new data gets xor'd in.
452 */
453 xorNode->params[paramNum++].p = failedPDA;
454 xorNode->params[paramNum++].p = failedPDA->bufPtr;
455
456 /*
457 * The last 2 params to the recovery xor node are always the failed
458 * PDA and the raidPtr. Install the failedPDA even though we have just
459 * done so above. This allows us to use the same XOR function for both
460 * degraded reads and degraded writes.
461 */
462 xorNode->params[paramNum++].p = failedPDA;
463 xorNode->params[paramNum++].p = raidPtr;
464 RF_ASSERT(paramNum == 2 * nXorBufs + 2);
465
466 /*
467 * Code to link nodes begins here.
468 */
469
470 /* Link header to block node. */
471 RF_ASSERT(blockNode->numAntecedents == 0);
472 dag_h->succedents[0] = blockNode;
473
474 /* Link block node to rd nodes. */
475 RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
476 for (i = 0; i < nRrdNodes; i++) {
477 RF_ASSERT(rrdNodes[i].numAntecedents == 1);
478 blockNode->succedents[i] = &rrdNodes[i];
479 rrdNodes[i].antecedents[0] = blockNode;
480 rrdNodes[i].antType[0] = rf_control;
481 }
482
483 /* Link read nodes to xor node. */
484 RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
485 for (i = 0; i < nRrdNodes; i++) {
486 RF_ASSERT(rrdNodes[i].numSuccedents == 1);
487 rrdNodes[i].succedents[0] = xorNode;
488 xorNode->antecedents[i] = &rrdNodes[i];
489 xorNode->antType[i] = rf_trueData;
490 }
491
492 /* Link xor node to commit node. */
493 RF_ASSERT(xorNode->numSuccedents == 1);
494 RF_ASSERT(commitNode->numAntecedents == 1);
495 xorNode->succedents[0] = commitNode;
496 commitNode->antecedents[0] = xorNode;
497 commitNode->antType[0] = rf_control;
498
499 /* Link commit node to wnd nodes. */
500 RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
501 for (i = 0; i < nWndNodes; i++) {
502 RF_ASSERT(wndNodes[i].numAntecedents == 1);
503 commitNode->succedents[i] = &wndNodes[i];
504 wndNodes[i].antecedents[0] = commitNode;
505 wndNodes[i].antType[0] = rf_control;
506 }
507
508 /* Link the commit node to wnp, wnq nodes. */
509 RF_ASSERT(wnpNode->numAntecedents == 1);
510 commitNode->succedents[nWndNodes] = wnpNode;
511 wnpNode->antecedents[0] = commitNode;
512 wnpNode->antType[0] = rf_control;
513 if (nfaults == 2) {
514 RF_ASSERT(wnqNode->numAntecedents == 1);
515 commitNode->succedents[nWndNodes + 1] = wnqNode;
516 wnqNode->antecedents[0] = commitNode;
517 wnqNode->antType[0] = rf_control;
518 }
519 /* Link write new data nodes to unblock node. */
520 RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
521 for (i = 0; i < nWndNodes; i++) {
522 RF_ASSERT(wndNodes[i].numSuccedents == 1);
523 wndNodes[i].succedents[0] = unblockNode;
524 unblockNode->antecedents[i] = &wndNodes[i];
525 unblockNode->antType[i] = rf_control;
526 }
527
528 /* Link write new parity node to unblock node. */
529 RF_ASSERT(wnpNode->numSuccedents == 1);
530 wnpNode->succedents[0] = unblockNode;
531 unblockNode->antecedents[nWndNodes] = wnpNode;
532 unblockNode->antType[nWndNodes] = rf_control;
533
534 /* Link write new q node to unblock node. */
535 if (nfaults == 2) {
536 RF_ASSERT(wnqNode->numSuccedents == 1);
537 wnqNode->succedents[0] = unblockNode;
538 unblockNode->antecedents[nWndNodes + 1] = wnqNode;
539 unblockNode->antType[nWndNodes + 1] = rf_control;
540 }
541 /* Link unblock node to term node. */
542 RF_ASSERT(unblockNode->numSuccedents == 1);
543 RF_ASSERT(termNode->numAntecedents == 1);
544 RF_ASSERT(termNode->numSuccedents == 0);
545 unblockNode->succedents[0] = termNode;
546 termNode->antecedents[0] = unblockNode;
547 termNode->antType[0] = rf_control;
548 }
549
/*
 * CONS_PDA(field, start, num) -- fill in *pda_p as a redundancy PDA.
 *
 * field	name of the asmap member to copy row/col from and whose
 *		startSector selects the stripe unit (e.g. parityInfo, qInfo).
 * start	sector offset within that stripe unit.
 * num		number of sectors; also sizes the buffer allocated for
 *		pda_p->bufPtr.  NOTE: expanded twice, so it must be free of
 *		side effects.
 *
 * The macro relies on the following names being in scope at the point of
 * use: pda_p, asmap, secPerSU, raidPtr and allocList.
 *
 * `start' and `num' are parenthesized in the expansion so that
 * expression arguments (e.g. a conditional expression) keep their
 * intended grouping.
 */
#define	CONS_PDA(field, start, num) do {				\
	pda_p->row = asmap->field->row;					\
	pda_p->col = asmap->field->col;					\
	pda_p->startSector = ((asmap->field->startSector / secPerSU) *	\
	    secPerSU) + (start);					\
	pda_p->numSector = (num);					\
	pda_p->next = NULL;						\
	RF_MallocAndAdd(pda_p->bufPtr,					\
	    rf_RaidAddressToByte(raidPtr, (num)), (char *), allocList);	\
} while (0)
560
561 void
rf_WriteGenerateFailedAccessASMs(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_PhysDiskAddr_t ** pdap,int * nNodep,RF_PhysDiskAddr_t ** pqpdap,int * nPQNodep,RF_AllocListElem_t * allocList)562 rf_WriteGenerateFailedAccessASMs(RF_Raid_t *raidPtr,
563 RF_AccessStripeMap_t *asmap, RF_PhysDiskAddr_t **pdap, int *nNodep,
564 RF_PhysDiskAddr_t **pqpdap, int *nPQNodep, RF_AllocListElem_t *allocList)
565 {
566 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
567 int PDAPerDisk, i;
568 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
569 int numDataCol = layoutPtr->numDataCol;
570 int state;
571 unsigned napdas;
572 RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
573 RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0];
574 RF_PhysDiskAddr_t *ftwo = asmap->failedPDAs[1];
575 RF_PhysDiskAddr_t *pda_p;
576 RF_RaidAddr_t sosAddr;
577
578 /*
579 * Determine how many pda's we will have to generate per unaccessed
580 * stripe. If there is only one failed data unit, it is one; if two,
581 * possibly two, depending wether they overlap.
582 */
583
584 fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
585 fone_end = fone_start + fone->numSector;
586
587 if (asmap->numDataFailed == 1) {
588 PDAPerDisk = 1;
589 state = 1;
590 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
591 (RF_PhysDiskAddr_t *), allocList);
592 pda_p = *pqpdap;
593 /* Build p. */
594 CONS_PDA(parityInfo, fone_start, fone->numSector);
595 pda_p->type = RF_PDA_TYPE_PARITY;
596 pda_p++;
597 /* Build q. */
598 CONS_PDA(qInfo, fone_start, fone->numSector);
599 pda_p->type = RF_PDA_TYPE_Q;
600 } else {
601 ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
602 ftwo_end = ftwo_start + ftwo->numSector;
603 if (fone->numSector + ftwo->numSector > secPerSU) {
604 PDAPerDisk = 1;
605 state = 2;
606 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
607 (RF_PhysDiskAddr_t *), allocList);
608 pda_p = *pqpdap;
609 CONS_PDA(parityInfo, 0, secPerSU);
610 pda_p->type = RF_PDA_TYPE_PARITY;
611 pda_p++;
612 CONS_PDA(qInfo, 0, secPerSU);
613 pda_p->type = RF_PDA_TYPE_Q;
614 } else {
615 PDAPerDisk = 2;
616 state = 3;
617 /* Four of them, fone, then ftwo. */
618 RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t),
619 (RF_PhysDiskAddr_t *), allocList);
620 pda_p = *pqpdap;
621 CONS_PDA(parityInfo, fone_start, fone->numSector);
622 pda_p->type = RF_PDA_TYPE_PARITY;
623 pda_p++;
624 CONS_PDA(qInfo, fone_start, fone->numSector);
625 pda_p->type = RF_PDA_TYPE_Q;
626 pda_p++;
627 CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
628 pda_p->type = RF_PDA_TYPE_PARITY;
629 pda_p++;
630 CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
631 pda_p->type = RF_PDA_TYPE_Q;
632 }
633 }
634 /* Figure out number of nonaccessed pda. */
635 napdas = PDAPerDisk * (numDataCol - 2);
636 *nPQNodep = PDAPerDisk;
637
638 *nNodep = napdas;
639 if (napdas == 0)
640 return; /* Short circuit. */
641
642 /* Allocate up our list of pda's. */
643
644 RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t),
645 (RF_PhysDiskAddr_t *), allocList);
646 *pdap = pda_p;
647
648 /* Link them together. */
649 for (i = 0; i < (napdas - 1); i++)
650 pda_p[i].next = pda_p + (i + 1);
651
652 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
653 asmap->raidAddress);
654 for (i = 0; i < numDataCol; i++) {
655 if ((pda_p - (*pdap)) == napdas)
656 continue;
657 pda_p->type = RF_PDA_TYPE_DATA;
658 pda_p->raidAddress = sosAddr + (i * secPerSU);
659 (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
660 &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
661 /* Skip over dead disks. */
662 if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
663 continue;
664 switch (state) {
665 case 1: /* Fone. */
666 pda_p->numSector = fone->numSector;
667 pda_p->raidAddress += fone_start;
668 pda_p->startSector += fone_start;
669 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(
670 raidPtr, pda_p->numSector), (char *), allocList);
671 break;
672 case 2: /* Full stripe. */
673 pda_p->numSector = secPerSU;
674 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(
675 raidPtr, secPerSU), (char *), allocList);
676 break;
677 case 3: /* Two slabs. */
678 pda_p->numSector = fone->numSector;
679 pda_p->raidAddress += fone_start;
680 pda_p->startSector += fone_start;
681 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(
682 raidPtr, pda_p->numSector), (char *), allocList);
683 pda_p++;
684 pda_p->type = RF_PDA_TYPE_DATA;
685 pda_p->raidAddress = sosAddr + (i * secPerSU);
686 (raidPtr->Layout.map->MapSector) (raidPtr,
687 pda_p->raidAddress, &(pda_p->row), &(pda_p->col),
688 &(pda_p->startSector), 0);
689 pda_p->numSector = ftwo->numSector;
690 pda_p->raidAddress += ftwo_start;
691 pda_p->startSector += ftwo_start;
692 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(
693 raidPtr, pda_p->numSector), (char *), allocList);
694 break;
695 default:
696 RF_PANIC();
697 }
698 pda_p++;
699 }
700
701 RF_ASSERT(pda_p - *pdap == napdas);
702 return;
703 }
704
/* The PDA a disk read/write node operates on: its first parameter. */
#define	DISK_NODE_PDA(dagNode)	((dagNode)->params[0].p)

/*
 * Load the four parameters of a disk read/write node: the PDA, the PDA's
 * buffer, the parity stripe ID and the packed priority/recon-unit word.
 *
 * dagNode is a node object (not a pointer); diskAddr is a PDA pointer and
 * is expanded twice.  `parityStripeID' and `which_ru' must be in scope at
 * the point of use.
 */
#define	DISK_NODE_PARAMS(dagNode, diskAddr) do {			\
	(dagNode).params[0].p = (diskAddr);				\
	(dagNode).params[1].p = (diskAddr)->bufPtr;			\
	(dagNode).params[2].v = parityStripeID;				\
	(dagNode).params[3].v =						\
	    RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);	\
} while (0)
714
715 void
rf_DoubleDegSmallWrite(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,char * redundantReadNodeName,char * redundantWriteNodeName,char * recoveryNodeName,int (* recovFunc)(RF_DagNode_t *))716 rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
717 RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
718 RF_AllocListElem_t *allocList, char *redundantReadNodeName,
719 char *redundantWriteNodeName, char *recoveryNodeName,
720 int (*recovFunc) (RF_DagNode_t *))
721 {
722 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
723 RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
724 *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
725 RF_PhysDiskAddr_t *pda, *pqPDAs;
726 RF_PhysDiskAddr_t *npdas;
727 int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
728 RF_ReconUnitNum_t which_ru;
729 int nPQNodes;
730 RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(
731 layoutPtr, asmap->raidAddress, &which_ru);
732
733 /*
734 * Simple small write case - First part looks like a reconstruct-read
735 * of the failed data units. Then a write of all data units not
736 * failed.
737 */
738
739
740 /*
741 * Hdr | ------Block- / / \ Rrd Rrd ... Rrd Rp Rq \ \
742 * / -------PQ----- / \ \ Wud Wp WQ \ | /
743 * --Unblock- | T
744 *
745 * Rrd = read recovery data (potentially none)
746 * Wud = write user data (not incl. failed disks)
747 * Wp = Write P (could be two)
748 * Wq = Write Q (could be two)
749 *
750 */
751
752 rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes,
753 &pqPDAs, &nPQNodes, allocList);
754
755 RF_ASSERT(asmap->numDataFailed == 1);
756
757 nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
758 nReadNodes = nRrdNodes + 2 * nPQNodes;
759 nWriteNodes = nWudNodes + 2 * nPQNodes;
760 nNodes = 4 + nReadNodes + nWriteNodes;
761
762 RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
763 allocList);
764 blockNode = nodes;
765 unblockNode = blockNode + 1;
766 termNode = unblockNode + 1;
767 recoveryNode = termNode + 1;
768 rrdNodes = recoveryNode + 1;
769 rpNodes = rrdNodes + nRrdNodes;
770 rqNodes = rpNodes + nPQNodes;
771 wudNodes = rqNodes + nPQNodes;
772 wpNodes = wudNodes + nWudNodes;
773 wqNodes = wpNodes + nPQNodes;
774
775 dag_h->creator = "PQ_DDSimpleSmallWrite";
776 dag_h->numSuccedents = 1;
777 dag_h->succedents[0] = blockNode;
778 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
779 rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
780 termNode->antecedents[0] = unblockNode;
781 termNode->antType[0] = rf_control;
782
783 /* Init the block and unblock nodes. */
784 /* The block node has all the read nodes as successors. */
785 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
786 rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h,
787 "Nil", allocList);
788 for (i = 0; i < nReadNodes; i++)
789 blockNode->succedents[i] = rrdNodes + i;
790
791 /* The unblock node has all the writes as successors. */
792 rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
793 rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h,
794 "Nil", allocList);
795 for (i = 0; i < nWriteNodes; i++) {
796 unblockNode->antecedents[i] = wudNodes + i;
797 unblockNode->antType[i] = rf_control;
798 }
799 unblockNode->succedents[0] = termNode;
800
801 #define INIT_READ_NODE(node,name) do { \
802 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, \
803 rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, \
804 dag_h, name, allocList); \
805 (node)->succedents[0] = recoveryNode; \
806 (node)->antecedents[0] = blockNode; \
807 (node)->antType[0] = rf_control; \
808 } while (0)
809
810 /* Build the read nodes. */
811 pda = npdas;
812 for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
813 INIT_READ_NODE(rrdNodes + i, "rrd");
814 DISK_NODE_PARAMS(rrdNodes[i], pda);
815 }
816
817 /* Read redundancy pdas. */
818 pda = pqPDAs;
819 INIT_READ_NODE(rpNodes, "Rp");
820 RF_ASSERT(pda);
821 DISK_NODE_PARAMS(rpNodes[0], pda);
822 pda++;
823 INIT_READ_NODE(rqNodes, redundantReadNodeName);
824 RF_ASSERT(pda);
825 DISK_NODE_PARAMS(rqNodes[0], pda);
826 if (nPQNodes == 2) {
827 pda++;
828 INIT_READ_NODE(rpNodes + 1, "Rp");
829 RF_ASSERT(pda);
830 DISK_NODE_PARAMS(rpNodes[1], pda);
831 pda++;
832 INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
833 RF_ASSERT(pda);
834 DISK_NODE_PARAMS(rqNodes[1], pda);
835 }
836 /*
837 * The recovery node has all reads as precedessors and all writes as
838 * successors. It generates a result for every write P or write Q
839 * node. As parameters, it takes a pda per read and a pda per stripe
840 * of user data written. It also takes as the last params the raidPtr
841 * and asm. For results, it takes PDA for P & Q.
842 */
843
844 rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc,
845 rf_NullNodeUndoFunc, NULL,
846 nWriteNodes, /* succesors */
847 nReadNodes, /* preds */
848 nReadNodes + nWudNodes + 3, /* params */
849 2 * nPQNodes, /* results */
850 dag_h, recoveryNodeName, allocList);
851
852
853
854 for (i = 0; i < nReadNodes; i++) {
855 recoveryNode->antecedents[i] = rrdNodes + i;
856 recoveryNode->antType[i] = rf_control;
857 recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
858 }
859 for (i = 0; i < nWudNodes; i++) {
860 recoveryNode->succedents[i] = wudNodes + i;
861 }
862 recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
863 recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
864 recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
865
866 for (; i < nWriteNodes; i++)
867 recoveryNode->succedents[i] = wudNodes + i;
868
869 pda = pqPDAs;
870 recoveryNode->results[0] = pda;
871 pda++;
872 recoveryNode->results[1] = pda;
873 if (nPQNodes == 2) {
874 pda++;
875 recoveryNode->results[2] = pda;
876 pda++;
877 recoveryNode->results[3] = pda;
878 }
879 /* Fill writes. */
880 #define INIT_WRITE_NODE(node,name) do { \
881 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, \
882 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, \
883 dag_h, name, allocList); \
884 (node)->succedents[0] = unblockNode; \
885 (node)->antecedents[0] = recoveryNode; \
886 (node)->antType[0] = rf_control; \
887 } while (0)
888
889 pda = asmap->physInfo;
890 for (i = 0; i < nWudNodes; i++) {
891 INIT_WRITE_NODE(wudNodes + i, "Wd");
892 DISK_NODE_PARAMS(wudNodes[i], pda);
893 recoveryNode->params[nReadNodes + i].p =
894 DISK_NODE_PDA(wudNodes + i);
895 pda = pda->next;
896 }
897 /* Write redundancy pdas. */
898 pda = pqPDAs;
899 INIT_WRITE_NODE(wpNodes, "Wp");
900 RF_ASSERT(pda);
901 DISK_NODE_PARAMS(wpNodes[0], pda);
902 pda++;
903 INIT_WRITE_NODE(wqNodes, "Wq");
904 RF_ASSERT(pda);
905 DISK_NODE_PARAMS(wqNodes[0], pda);
906 if (nPQNodes == 2) {
907 pda++;
908 INIT_WRITE_NODE(wpNodes + 1, "Wp");
909 RF_ASSERT(pda);
910 DISK_NODE_PARAMS(wpNodes[1], pda);
911 pda++;
912 INIT_WRITE_NODE(wqNodes + 1, "Wq");
913 RF_ASSERT(pda);
914 DISK_NODE_PARAMS(wqNodes[1], pda);
915 }
916 }
917