1 /*	$OpenBSD: rf_pqdegdags.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $	*/
2 /*	$NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Author: Daniel Stodolsky
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*
32  * rf_pqdegdags.c
33  * Degraded mode dags for double fault cases.
34  */
35 
36 
37 #include "rf_archs.h"
38 
39 #if	(RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
40 
41 #include "rf_types.h"
42 #include "rf_raid.h"
43 #include "rf_dag.h"
44 #include "rf_dagdegrd.h"
45 #include "rf_dagdegwr.h"
46 #include "rf_dagfuncs.h"
47 #include "rf_dagutils.h"
48 #include "rf_etimer.h"
49 #include "rf_acctrace.h"
50 #include "rf_general.h"
51 #include "rf_pqdegdags.h"
52 #include "rf_pq.h"
53 
54 void rf_applyPDA(RF_Raid_t *, RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *,
55 	RF_PhysDiskAddr_t *, void *);
56 
57 /*
58  * Two data drives have failed, and we are doing a read that covers one of them.
59  * We may also be reading some of the surviving drives.
60  */
61 
62 
63 /*****************************************************************************
64  *
65  * Creates a DAG to perform a degraded-mode read of data within one stripe.
66  * This DAG is as follows:
67  *
68  *			                Hdr
69  *			                 |
70  *			               Block
71  *			 /         /           \         \     \   \
72  *			Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
73  *			| \       | \         | \       | \    | \ | \
74  *
75  *			           |                 |
76  *			        Unblock              X
77  *			            \               /
78  *			             ------ T ------
79  *
80  * Each R node is a successor of the L node.
81  * One successor arc from each R node goes to U, and the other to X.
82  * There is one Rud for each chunk of surviving user data requested by the
83  * user, and one Rrd for each chunk of surviving user data _not_ being read
84  * by the user.
85  * R = read, ud = user data, rd = recovery (surviving) data, p = P data,
86  * q = Qdata, X = pq recovery node, T = terminate
87  *
88  * The block & unblock nodes are leftovers from a previous version. They
89  * do nothing, but I haven't deleted them because it would be a tremendous
90  * effort to put them back in.
91  *
92  * Note:  The target buffer for the XOR node is set to the actual user buffer
93  * where the failed data is supposed to end up. This buffer is zero'd by the
 * code here. Thus, if you create a degraded read dag, use it, and then
 * re-use it, you must zero the target buffer prior to the re-use.
96  *
97  * Every buffer read is passed to the pq recovery node, whose job it is to
98  * sort out what's needed and what's not.
99  *****************************************************************************/
100 
/*
 * Init a disk node with 2 successors and one predecessor.
 *
 * The node is set up through rf_InitNode() as a disk read
 * (rf_DiskReadFunc / rf_DiskReadUndoFunc / rf_GenericWakeupFunc), then
 * wired so that succedents[0] is the unblock node, succedents[1] is the
 * recovery node, and its single antecedent is the block node, linked with
 * an rf_control arc.
 *
 * The expansion is not self-contained: it expects dag_h, allocList,
 * unblockNode, recoveryNode and blockNode to be visible at the point of
 * use.  `node` must be a pointer to the node (it is dereferenced with ->).
 */
#define	INIT_DISK_NODE(node,name)					\
do {									\
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc,		\
	    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0,	\
	    dag_h, name, allocList);					\
	(node)->succedents[0] = unblockNode;				\
	(node)->succedents[1] = recoveryNode;				\
	(node)->antecedents[0] = blockNode;				\
	(node)->antType[0] = rf_control;				\
} while (0)
112 
/*
 * Fill the four parameters of a disk node from a physical disk address:
 *   params[0] = the pda itself
 *   params[1] = the pda's data buffer
 *   params[2] = parityStripeID
 *   params[3] = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
 *
 * Unlike INIT_DISK_NODE, `_node_` is accessed with `.`, so it must be the
 * node object itself rather than a pointer to it.  parityStripeID and
 * which_ru must be visible at the point of use.
 */
#define	DISK_NODE_PARAMS(_node_,_p_)					\
do {									\
	(_node_).params[0].p = _p_ ;					\
	(_node_).params[1].p = (_p_)->bufPtr;				\
	(_node_).params[2].v = parityStripeID;				\
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,	\
	    0, 0, which_ru);						\
} while (0)
121 
/* Yield params[0].p of a node (given by pointer); DISK_NODE_PARAMS stores the pda there. */
#define	DISK_NODE_PDA(node)	((node)->params[0].p)
123 
/*
 * DAG creation routine for the P+Q double-degraded read.
 *
 * Pure delegation: hands every argument to rf_DoubleDegRead() together
 * with the node names "Rq" and "PQ Recovery" and the recovery callback
 * rf_PQDoubleRecoveryFunc.  The arguments used here (raidPtr, asmap,
 * dag_h, bp, flags, allocList) are not declared in this block; they
 * presumably come from the RF_CREATE_DAG_FUNC_DECL expansion.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}
129 
130 void
rf_applyPDA(RF_Raid_t * raidPtr,RF_PhysDiskAddr_t * pda,RF_PhysDiskAddr_t * ppda,RF_PhysDiskAddr_t * qpda,void * bp)131 rf_applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
132     RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp)
133 {
134 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
135 	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
136 	RF_SectorCount_t s0len = ppda->numSector, len;
137 	RF_SectorNum_t suoffset;
138 	unsigned coeff;
139 	char *pbuf = ppda->bufPtr;
140 	char *qbuf = qpda->bufPtr;
141 	char *buf;
142 	int delta;
143 
144 	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
145 	len = pda->numSector;
146 	/* See if pda intersects a recovery pda. */
147 	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
148 		buf = pda->bufPtr;
149 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
150 		    pda->raidAddress);
151 		coeff = (coeff % raidPtr->Layout.numDataCol);
152 
153 		if (suoffset < s0off) {
154 			delta = s0off - suoffset;
155 			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
156 			    delta);
157 			suoffset = s0off;
158 			len -= delta;
159 		}
160 		if (suoffset > s0off) {
161 			delta = suoffset - s0off;
162 			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
163 			    delta);
164 			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
165 			    delta);
166 		}
167 		if ((suoffset + len) > (s0len + s0off))
168 			len = s0len + s0off - suoffset;
169 
170 		/* Src, dest, len. */
171 		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
172 
173 		/* Dest, src, len, coeff. */
174 		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf,
175 		    rf_RaidAddressToByte(raidPtr, len), coeff);
176 	}
177 }
178 
179 
180 /*
181  * Recover data in the case of a double failure. There can be two
182  * result buffers, one for each chunk of data trying to be recovered.
183  * The params are pda's that have not been range restricted or otherwise
184  * politely massaged - this should be done here. The last params are the
185  * pdas of P and Q, followed by the raidPtr. The list can look like
186  *
187  *   pda, pda, ..., p pda, q pda, raidptr, asm
188  *
189  * or
190  *
191  *   pda, pda, ..., p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
192  *
193  * depending on whether two chunks of recovery data were required.
194  *
195  * The second condition only arises if there are two failed buffers
 * whose lengths do not add up to a stripe unit.
197  */
198 
199 int
rf_PQDoubleRecoveryFunc(RF_DagNode_t * node)200 rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
201 {
202 	int np = node->numParams;
203 	RF_AccessStripeMap_t *asmap =
204 	    (RF_AccessStripeMap_t *) node->params[np - 1].p;
205 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
206 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
207 	int d, i;
208 	unsigned coeff;
209 	RF_RaidAddr_t sosAddr, suoffset;
210 	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
211 	int two = 0;
212 	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
213 	char *buf;
214 	int numDataCol = layoutPtr->numDataCol;
215 	RF_Etimer_t timer;
216 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
217 
218 	RF_ETIMER_START(timer);
219 
220 	if (asmap->failedPDAs[1] &&
221 	    (asmap->failedPDAs[1]->numSector +
222 	     asmap->failedPDAs[0]->numSector < secPerSU)) {
223 		RF_ASSERT(0);
224 		ppda = node->params[np - 6].p;
225 		ppda2 = node->params[np - 5].p;
226 		qpda = node->params[np - 4].p;
227 		qpda2 = node->params[np - 3].p;
228 		d = (np - 6);
229 		two = 1;
230 	} else {
231 		ppda = node->params[np - 4].p;
232 		qpda = node->params[np - 3].p;
233 		d = (np - 4);
234 	}
235 
236 	for (i = 0; i < d; i++) {
237 		pda = node->params[i].p;
238 		buf = pda->bufPtr;
239 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
240 		len = pda->numSector;
241 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
242 		    pda->raidAddress);
243 		/* Compute the data unit offset within the column. */
244 		coeff = (coeff % raidPtr->Layout.numDataCol);
245 		/* See if pda intersects a recovery pda. */
246 		rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
247 		if (two)
248 			rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
249 	}
250 
251 	/*
252 	 * Ok, we got the parity back to the point where we can recover. We
253 	 * now need to determine the coeff of the columns that need to be
254 	 * recovered. We can also only need to recover a single stripe unit.
255 	 */
256 
257 	if (asmap->failedPDAs[1] == NULL) {	/*
258 						 * Only a single stripe unit
259 						 * to recover.
260 						 */
261 		pda = asmap->failedPDAs[0];
262 		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
263 		    asmap->raidAddress);
264 		/* Need to determine the column of the other failed disk. */
265 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
266 		    pda->raidAddress);
267 		/* Compute the data unit offset within the column. */
268 		coeff = (coeff % raidPtr->Layout.numDataCol);
269 		for (i = 0; i < numDataCol; i++) {
270 			npda.raidAddress = sosAddr + (i * secPerSU);
271 			(raidPtr->Layout.map->MapSector) (raidPtr,
272 			    npda.raidAddress, &(npda.row), &(npda.col),
273 			    &(npda.startSector), 0);
274 			/* Skip over dead disks. */
275 			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col]
276 			    .status))
277 				if (i != coeff)
278 					break;
279 		}
280 		RF_ASSERT(i < numDataCol);
281 		RF_ASSERT(two == 0);
282 		/*
283 		 * Recover the data. Since we need only to recover one
284 		 * column, we overwrite the parity with the other one.
285 		 */
286 		if (coeff < i)	/* Recovering 'a'. */
287 			rf_PQ_recover((unsigned long *) ppda->bufPtr,
288 			    (unsigned long *) qpda->bufPtr,
289 			    (unsigned long *) pda->bufPtr,
290 			    (unsigned long *) ppda->bufPtr,
291 			    rf_RaidAddressToByte(raidPtr, pda->numSector),
292 			    coeff, i);
293 		else		/* Recovering 'b'. */
294 			rf_PQ_recover((unsigned long *) ppda->bufPtr,
295 			    (unsigned long *) qpda->bufPtr,
296 			    (unsigned long *) ppda->bufPtr,
297 			    (unsigned long *) pda->bufPtr,
298 			    rf_RaidAddressToByte(raidPtr, pda->numSector),
299 			    i, coeff);
300 	} else
301 		RF_PANIC();
302 
303 	RF_ETIMER_STOP(timer);
304 	RF_ETIMER_EVAL(timer);
305 	if (tracerec)
306 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
307 	rf_GenericWakeupFunc(node, 0);
308 	return (0);
309 }
310 
311 int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t * node)312 rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
313 {
314 	/*
315 	 * The situation:
316 	 *
317 	 * We are doing a write that hits only one failed data unit. The other
318 	 * failed data unit is not being overwritten, so we need to generate
319 	 * it.
320 	 *
321 	 * For the moment, we assume all the nonfailed data being written is in
322 	 * the shadow of the failed data unit. (i.e., either a single data
323 	 * unit write or the entire failed stripe unit is being overwritten.)
324 	 *
325 	 * Recovery strategy: apply the recovery data to the parity and Q.
326 	 * Use P & Q to recover the second failed data unit in P. Zero fill
327 	 * Q, then apply the recovered data to P. Then apply the data being
328 	 * written to the failed drive. Then walk through the surviving drives,
329 	 * applying new data when it exists, othewise the recovery data.
330 	 * Quite a mess.
331 	 *
332 	 *
333 	 * The params:
334 	 *
335 	 *   read pda0, read pda1, ..., read pda (numDataCol-3),
336 	 *   write pda0, ..., write pda (numStripeUnitAccess - numDataFailed),
337 	 *   failed pda, raidPtr, asmap
338 	 */
339 
340 	int np = node->numParams;
341 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
342 	    node->params[np - 1].p;
343 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
344 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
345 	int i;
346 	RF_RaidAddr_t sosAddr;
347 	unsigned coeff;
348 	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
349 	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
350 	int numDataCol = layoutPtr->numDataCol;
351 	RF_Etimer_t timer;
352 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
353 
354 	RF_ASSERT(node->numResults == 2);
355 	RF_ASSERT(asmap->failedPDAs[1] == NULL);
356 	RF_ETIMER_START(timer);
357 	ppda = node->results[0];
358 	qpda = node->results[1];
359 	/* apply the recovery data */
360 	for (i = 0; i < numDataCol - 2; i++)
361 		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
362 		    node->dagHdr->bp);
363 
364 	/* Determine the other failed data unit. */
365 	pda = asmap->failedPDAs[0];
366 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
367 	    asmap->raidAddress);
368 	/* Need to determine the column of the other failed disk. */
369 	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
370 	/* Compute the data unit offset within the column. */
371 	coeff = (coeff % raidPtr->Layout.numDataCol);
372 	for (i = 0; i < numDataCol; i++) {
373 		npda.raidAddress = sosAddr + (i * secPerSU);
374 		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
375 		    &(npda.row), &(npda.col), &(npda.startSector), 0);
376 		/* Skip over dead disks. */
377 		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
378 			if (i != coeff)
379 				break;
380 	}
381 	RF_ASSERT(i < numDataCol);
382 	/*
383 	 * Recover the data. The column we want to recover, we write over the
384 	 * parity. The column we don't care about, we dump in q.
385 	 */
386 	if (coeff < i)		/* Recovering 'a'. */
387 		rf_PQ_recover((unsigned long *) ppda->bufPtr,
388 		    (unsigned long *) qpda->bufPtr,
389 		    (unsigned long *) ppda->bufPtr,
390 		    (unsigned long *) qpda->bufPtr,
391 		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
392 	else			/* Recovering 'b'. */
393 		rf_PQ_recover((unsigned long *) ppda->bufPtr,
394 		    (unsigned long *) qpda->bufPtr,
395 		    (unsigned long *) qpda->bufPtr,
396 		    (unsigned long *) ppda->bufPtr,
397 		    rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
398 
399 	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
400 	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
401 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
402 	    rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
403 
404 	/* Now apply all the write data to the buffer. */
405 	/*
406 	 * Single stripe unit write case: The failed data is the only thing
407 	 * we are writing.
408 	 */
409 	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
410 	/* Dest, src, len, coeff. */
411 	rf_IncQ((unsigned long *) qpda->bufPtr,
412 	    (unsigned long *) asmap->failedPDAs[0]->bufPtr,
413 	    rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
414 	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
415 	    rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
416 
417 	/* Now apply all the recovery data. */
418 	for (i = 0; i < numDataCol - 2; i++)
419 		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
420 		    node->dagHdr->bp);
421 
422 	RF_ETIMER_STOP(timer);
423 	RF_ETIMER_EVAL(timer);
424 	if (tracerec)
425 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
426 
427 	rf_GenericWakeupFunc(node, 0);
428 	return (0);
429 }
430 
/*
 * DAG creation routine for the double-degraded large write.
 * Not implemented: the body does nothing but invoke RF_PANIC().
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
435 
436 
437 /*
438  * Two lost data unit write case.
439  *
440  * There are really two cases here:
441  *
442  * (1) The write completely covers the two lost data units.
443  *     In that case, a reconstruct write that doesn't write the
444  *     failed data units will do the correct thing. So in this case,
445  *     the dag looks like
446  *
447  *	   Full stripe read of surviving data units (not being overwritten)
448  *	   Write new data (ignoring failed units)
449  *	   Compute P&Q
450  *	   Write P&Q
451  *
452  *
453  * (2) The write does not completely cover both failed data units
454  *     (but touches at least one of them). Then we need to do the
455  *     equivalent of a reconstruct read to recover the missing data
456  *     unit from the other stripe.
457  *
458  *     For any data we are writing that is not in the "shadow"
459  *     of the failed units, we need to do a four cycle update.
460  *     PANIC on this case. For now.
461  *
462  */
463 
RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)464 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
465 {
466 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
467 	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
468 	int sum;
469 	int nf = asmap->numDataFailed;
470 
471 	sum = asmap->failedPDAs[0]->numSector;
472 	if (nf == 2)
473 		sum += asmap->failedPDAs[1]->numSector;
474 
475 	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
476 		/* Large write case. */
477 		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
478 		return;
479 	}
480 	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
481 		/* Small write case, no user data not in shadow. */
482 		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags,
483 		    allocList);
484 		return;
485 	}
486 	RF_PANIC();
487 }
488 
/*
 * DAG creation routine for the double-degraded small write.
 *
 * Pure delegation: hands every argument to rf_DoubleDegSmallWrite()
 * together with the node names "Rq", "Wq" and "PQ Recovery" and the
 * recovery callback rf_PQWriteDoubleRecoveryFunc.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
494 
495 #endif	/* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
496