1 /*	$OpenBSD: rf_evenodd_dagfuncs.c,v 1.7 2002/12/16 07:01:04 tdeval Exp $	*/
2 /*	$NetBSD: rf_evenodd_dagfuncs.c,v 1.6 2000/03/30 12:45:40 augustss Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Author: ChangMing Wu
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*
32  * Code for RAID-EVENODD architecture.
33  */
34 
35 #include "rf_types.h"
36 #include "rf_raid.h"
37 #include "rf_dag.h"
38 #include "rf_dagffrd.h"
39 #include "rf_dagffwr.h"
40 #include "rf_dagdegrd.h"
41 #include "rf_dagdegwr.h"
42 #include "rf_dagutils.h"
43 #include "rf_dagfuncs.h"
44 #include "rf_etimer.h"
45 #include "rf_general.h"
46 #include "rf_configure.h"
47 #include "rf_parityscan.h"
48 #include "rf_evenodd.h"
49 #include "rf_evenodd_dagfuncs.h"
50 
51 /* These redundant functions are for small write. */
52 RF_RedFuncs_t rf_EOSmallWritePFuncs = {
53 	rf_RegularXorFunc, "Regular Old-New P",
54 	rf_SimpleXorFunc, "Simple Old-New P"
55 };
56 RF_RedFuncs_t rf_EOSmallWriteEFuncs = {
57 	rf_RegularONEFunc, "Regular Old-New E",
58 	rf_SimpleONEFunc, "Regular Old-New E"
59 };
60 /* These redundant functions are for degraded read. */
61 RF_RedFuncs_t rf_eoPRecoveryFuncs = {
62 	rf_RecoveryXorFunc, "Recovery Xr",
63 	rf_RecoveryXorFunc, "Recovery Xr"
64 };
65 RF_RedFuncs_t rf_eoERecoveryFuncs = {
66 	rf_RecoveryEFunc, "Recovery E Func",
67 	rf_RecoveryEFunc, "Recovery E Func"
68 };
69 
70 
/*****************************************************************************
 *   The following encoding node function is used in
 *   EO_000_CreateLargeWriteDAG.
 *****************************************************************************/
75 int
rf_RegularPEFunc(RF_DagNode_t * node)76 rf_RegularPEFunc(RF_DagNode_t *node)
77 {
78 	rf_RegularESubroutine(node, node->results[1]);
79 	rf_RegularXorFunc(node);	/* Do the wakeup here ! */
80 #if 1
81 	return (0);		/* XXX This was missing... GO */
82 #endif
83 }
84 
85 
/*****************************************************************************
 *  For EO_001_CreateSmallWriteDAG, there are (i) RegularONEFunc() and
 *  (ii) SimpleONEFunc() to be used. The former is used when the write
 *  accesses at least as many sectors as a full stripe unit.
 *  The latter is used when the write accesses two stripe units but
 *  with total sectors less than sectors per SU. In this case, the accessed
 *  areas of parity and 'E' are disconnected within their stripe units, and
 *  the parity write and the 'E' write are each divided into two distinct
 *  writes (four in total). The simple old-new write and the regular old-new
 *  write happen as in RAID-5.
 *****************************************************************************/
97 
/*
 * Algorithm:
 *   1. Store the difference of old data and new data in the Rod buffer.
 *   2. Then encode this buffer into the buffer that already has the old 'E'
 *	information inside it; the result can be shown to be the new 'E'
 *	information.
 *   3. Xor the Wnd buffer into the difference buffer to recover the original
 *	old data.
 * There is an alternative: allocate a temporary buffer for storing the
 * difference of old data and new data, then encode the temporary buffer
 * into the old 'E' buffer to form the new 'E'. That approach is no faster
 * than the one above, and needs more memory.
 */
111 int
rf_RegularONEFunc(RF_DagNode_t * node)112 rf_RegularONEFunc(RF_DagNode_t *node)
113 {
114 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
115 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
116 	int EpdaIndex = (node->numParams - 1) / 2 - 1;	/*
117 							 * The parameter of node
118 							 * where you can find
119 							 * e-pda.
120 							 */
121 	int i, k, retcode = 0;
122 	int suoffset, length;
123 	RF_RowCol_t scol;
124 	char *srcbuf, *destbuf;
125 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
126 	RF_Etimer_t timer;
127 	RF_PhysDiskAddr_t *pda, *EPDA = (RF_PhysDiskAddr_t *)
128 	    node->params[EpdaIndex].p;
129 	/* Generally zero. */
130 	int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);
131 
132 	RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
133 	RF_ASSERT(ESUOffset == 0);
134 
135 	RF_ETIMER_START(timer);
136 
137 	/*
138 	 * Xor the Wnd buffer into Rod buffer. The difference of old data and
139 	 * new data is stored in Rod buffer.
140 	 */
141 	for (k = 0; k < EpdaIndex; k += 2) {
142 		length = rf_RaidAddressToByte(raidPtr,
143 		    ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
144 		retcode = rf_bxor(node->params[k + EpdaIndex + 3].p,
145 		    node->params[k + 1].p, length, node->dagHdr->bp);
146 	}
147 	/*
148 	 * Start to encode the buffer, storing the difference of old data and
149 	 * new data into 'E' buffer.
150 	 */
151 	for (i = 0; i < EpdaIndex; i += 2)
152 		if (node->params[i + 1].p != node->results[0]) {
153 			/* results[0] is buf ptr of E. */
154 			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
155 			srcbuf = (char *) node->params[i + 1].p;
156 			scol = rf_EUCol(layoutPtr, pda->raidAddress);
157 			suoffset = rf_StripeUnitOffset(layoutPtr,
158 			    pda->startSector);
159 			destbuf = ((char *) node->results[0]) +
160 			    rf_RaidAddressToByte(raidPtr, suoffset);
161 			rf_e_encToBuf(raidPtr, scol, srcbuf,
162 			    RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
163 		}
164 	/*
165 	 * Recover the original old data to be used by parity encoding
166 	 * function in XorNode.
167 	 */
168 	for (k = 0; k < EpdaIndex; k += 2) {
169 		length = rf_RaidAddressToByte(raidPtr,
170 		    ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
171 		retcode = rf_bxor(node->params[k + EpdaIndex + 3].p,
172 		    node->params[k + 1].p, length, node->dagHdr->bp);
173 	}
174 	RF_ETIMER_STOP(timer);
175 	RF_ETIMER_EVAL(timer);
176 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
177 	rf_GenericWakeupFunc(node, 0);
178 #if 1
179 	return (0);		/* XXX This was missing... GO */
180 #endif
181 }
182 
183 int
rf_SimpleONEFunc(RF_DagNode_t * node)184 rf_SimpleONEFunc(RF_DagNode_t *node)
185 {
186 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
187 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
188 	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
189 	int retcode = 0;
190 	char *srcbuf, *destbuf;
191 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
192 	int length;
193 	RF_RowCol_t scol;
194 	RF_Etimer_t timer;
195 
196 	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type ==
197 	    RF_PDA_TYPE_Q);
198 	if (node->dagHdr->status == rf_enable) {
199 		RF_ETIMER_START(timer);
200 		/* This is a pda of writeDataNodes. */
201 		length = rf_RaidAddressToByte(raidPtr,
202 		    ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);
203 		/* bxor to buffer of readDataNodes. */
204 		retcode = rf_bxor(node->params[5].p, node->params[1].p,
205 		    length, node->dagHdr->bp);
206 		/*
207 		 * Find out the corresponding column in encoding matrix for
208 		 * write column to be encoded into redundant disk 'E'.
209 		 */
210 		scol = rf_EUCol(layoutPtr, pda->raidAddress);
211 		srcbuf = node->params[1].p;
212 		destbuf = node->params[3].p;
213 		/* Start encoding process. */
214 		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2,
215 		    destbuf, pda->numSector);
216 		rf_bxor(node->params[5].p, node->params[1].p, length,
217 		    node->dagHdr->bp);
218 		RF_ETIMER_STOP(timer);
219 		RF_ETIMER_EVAL(timer);
220 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
221 
222 	}
223 	return (rf_GenericWakeupFunc(node, retcode));	/*
224 							 * Call wake func
225 							 * explicitly since no
226 							 * I/O in this node.
227 							 */
228 }
229 
230 
231 /*
232  * Called by rf_RegularPEFunc(node) and rf_RegularEFunc(node)
233  * in f.f. large write.
234  */
235 void
rf_RegularESubroutine(RF_DagNode_t * node,char * ebuf)236 rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf)
237 {
238 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
239 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
240 	RF_PhysDiskAddr_t *pda;
241 	int i, suoffset;
242 	RF_RowCol_t scol;
243 	char *srcbuf, *destbuf;
244 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
245 	RF_Etimer_t timer;
246 
247 	RF_ETIMER_START(timer);
248 	for (i = 0; i < node->numParams - 2; i += 2) {
249 		RF_ASSERT(node->params[i + 1].p != ebuf);
250 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
251 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
252 		scol = rf_EUCol(layoutPtr, pda->raidAddress);
253 		srcbuf = (char *) node->params[i + 1].p;
254 		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset);
255 		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2,
256 		    destbuf, pda->numSector);
257 	}
258 	RF_ETIMER_STOP(timer);
259 	RF_ETIMER_EVAL(timer);
260 	tracerec->xor_us += RF_ETIMER_VAL_US(timer);
261 }
262 
263 
264 /*****************************************************************************
265  *			 Used in  EO_001_CreateLargeWriteDAG.
266  *****************************************************************************/
267 int
rf_RegularEFunc(RF_DagNode_t * node)268 rf_RegularEFunc(RF_DagNode_t *node)
269 {
270 	rf_RegularESubroutine(node, node->results[0]);
271 	rf_GenericWakeupFunc(node, 0);
272 #if 1
273 	return (0);		/* XXX This was missing... GO */
274 #endif
275 }
276 
277 
/*****************************************************************************
 * This degraded function allows only two cases:
 *   1. When write accesses the full failed stripe unit, then the access can
 *	be more than one stripe unit.
 *   2. When write accesses only part of the failed SU, we assume accesses of
 *	more than one stripe unit are not allowed so that the write can be
 *	dealt with like a large write.
 * The following function is based on these assumptions. So except in the
 * second case, it looks the same as a large write encoding function. But
 * this is not exactly the normal way of doing a degraded write, since
 * RAIDframe has to break cases of accesses other than the above two into
 * smaller accesses. We may have to change rf_DegrESubroutine in the future.
 *****************************************************************************/
291 void
rf_DegrESubroutine(RF_DagNode_t * node,char * ebuf)292 rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf)
293 {
294 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
295 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
296 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
297 	RF_PhysDiskAddr_t *pda;
298 	int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
299 	RF_RowCol_t scol;
300 	char *srcbuf, *destbuf;
301 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
302 	RF_Etimer_t timer;
303 
304 	RF_ETIMER_START(timer);
305 	for (i = 0; i < node->numParams - 2; i += 2) {
306 		RF_ASSERT(node->params[i + 1].p != ebuf);
307 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
308 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
309 		scol = rf_EUCol(layoutPtr, pda->raidAddress);
310 		srcbuf = (char *) node->params[i + 1].p;
311 		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
312 		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
313 	}
314 
315 	RF_ETIMER_STOP(timer);
316 	RF_ETIMER_EVAL(timer);
317 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
318 }
319 
320 
321 /*****************************************************************************
322  * This function is used in case where one data disk failed and both redundant
323  * disks are alive. It is used in the EO_100_CreateWriteDAG. Note: if there is
324  * another disk failed in the stripe but not accessed at this time, then we
325  * should, instead, use the rf_EOWriteDoubleRecoveryFunc().
326  *****************************************************************************/
327 int
rf_Degraded_100_EOFunc(RF_DagNode_t * node)328 rf_Degraded_100_EOFunc(RF_DagNode_t *node)
329 {
330 	rf_DegrESubroutine(node, node->results[1]);
331 	rf_RecoveryXorFunc(node);	/* Does the wakeup here ! */
332 #if 1
333 	return (0);		/* XXX This was missing... Should these be
334 				 * void functions ??? GO */
335 #endif
336 }
337 
338 
339 /*****************************************************************************
340  * This function is to encode one sector in one of the data disks to the E
341  * disk. However, in evenodd this function can also be used as decoding
342  * function to recover data from dead disk in the case of parity failure and
343  * a single data failure.
344  *****************************************************************************/
345 void
rf_e_EncOneSect(RF_RowCol_t srcLogicCol,char * srcSecbuf,RF_RowCol_t destLogicCol,char * destSecbuf,int bytesPerSector)346 rf_e_EncOneSect(RF_RowCol_t srcLogicCol, char *srcSecbuf,
347     RF_RowCol_t destLogicCol, char *destSecbuf, int bytesPerSector)
348 {
349 	int S_index;		/*
350 				 * Index of the EU in the src col which need
351 				 * be Xored into all EUs in a dest sector.
352 				 */
353 	int numRowInEncMatrix = (RF_EO_MATRIX_DIM) - 1;
354 	RF_RowCol_t j, indexInDest;	/*
355 					 * Row index of an encoding unit in
356 					 * the destination column of encoding
357 					 * matrix.
358 					 */
359 	RF_RowCol_t indexInSrc;	/*
360 				 * Row index of an encoding unit in the source
361 				 * column used for recovery.
362 				 */
363 	int bytesPerEU = bytesPerSector / numRowInEncMatrix;
364 
365 #if	RF_EO_MATRIX_DIM > 17
366 	int shortsPerEU = bytesPerEU / sizeof(short);
367 	short  *destShortBuf, *srcShortBuf1, *srcShortBuf2;
368 	short temp1;
369 #elif	RF_EO_MATRIX_DIM == 17
370 	int longsPerEU = bytesPerEU / sizeof(long);
371 	long *destLongBuf, *srcLongBuf1, *srcLongBuf2;
372 	long temp1;
373 #endif
374 
375 #if	RF_EO_MATRIX_DIM > 17
376 	RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
377 	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
378 #elif	RF_EO_MATRIX_DIM == 17
379 	RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
380 	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
381 #endif
382 
383 	S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
384 #if	RF_EO_MATRIX_DIM > 17
385 	srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU);
386 #elif	RF_EO_MATRIX_DIM == 17
387 	srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU);
388 #endif
389 
390 	for (indexInDest = 0; indexInDest < numRowInEncMatrix; indexInDest++) {
391 		indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
392 
393 #if	RF_EO_MATRIX_DIM > 17
394 		destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU);
395 		srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU);
396 		for (j = 0; j < shortsPerEU; j++) {
397 			temp1 = destShortBuf[j] ^ srcShortBuf1[j];
398 			/* Note: S_index won't be at the end row for any src
399 			 * col ! */
400 			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
401 				destShortBuf[j] = (srcShortBuf2[j]) ^ temp1;
402 			/* if indexInSrc is at the end row, ie.
403 			 * RF_EO_MATRIX_DIM -1, then all elements are zero ! */
404 			else
405 				destShortBuf[j] = temp1;
406 		}
407 
408 #elif	RF_EO_MATRIX_DIM == 17
409 		destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU);
410 		srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU);
411 		for (j = 0; j < longsPerEU; j++) {
412 			temp1 = destLongBuf[j] ^ srcLongBuf1[j];
413 			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
414 				destLongBuf[j] = (srcLongBuf2[j]) ^ temp1;
415 			else
416 				destLongBuf[j] = temp1;
417 		}
418 #endif
419 	}
420 }
421 
422 void
rf_e_encToBuf(RF_Raid_t * raidPtr,RF_RowCol_t srcLogicCol,char * srcbuf,RF_RowCol_t destLogicCol,char * destbuf,int numSector)423 rf_e_encToBuf(RF_Raid_t *raidPtr, RF_RowCol_t srcLogicCol, char *srcbuf,
424     RF_RowCol_t destLogicCol, char *destbuf, int numSector)
425 {
426 	int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
427 
428 	for (i = 0; i < numSector; i++) {
429 		rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector);
430 		srcbuf += bytesPerSector;
431 		destbuf += bytesPerSector;
432 	}
433 }
434 
435 
/*****************************************************************************
 * When parity and one data disk have failed, we use the second redundant
 * information, 'E', to recover the data on the dead disk. This function is
 * used in the recovery node for EO_110_CreateReadDAG.
 *****************************************************************************/
441 int
rf_RecoveryEFunc(RF_DagNode_t * node)442 rf_RecoveryEFunc(RF_DagNode_t *node)
443 {
444 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
445 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
446 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
447 	RF_RowCol_t scol;	/* source logical column */
448 	RF_RowCol_t fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);	/* logical column of
449 									 * failed SU */
450 	int i;
451 	RF_PhysDiskAddr_t *pda;
452 	int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
453 	char *srcbuf, *destbuf;
454 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
455 	RF_Etimer_t timer;
456 
457 	bzero((char *) node->results[0], rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
458 	if (node->dagHdr->status == rf_enable) {
459 		RF_ETIMER_START(timer);
460 		for (i = 0; i < node->numParams - 2; i += 2)
461 			if (node->params[i + 1].p != node->results[0]) {
462 				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
463 				if (i == node->numParams - 4)
464 					scol = RF_EO_MATRIX_DIM - 2;	/* the colume of
465 									 * redundant E */
466 				else
467 					scol = rf_EUCol(layoutPtr, pda->raidAddress);
468 				srcbuf = (char *) node->params[i + 1].p;
469 				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
470 				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
471 				rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
472 			}
473 		RF_ETIMER_STOP(timer);
474 		RF_ETIMER_EVAL(timer);
475 		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
476 	}
477 	return (rf_GenericWakeupFunc(node, 0));	/* node execute successfully */
478 }
479 
480 
/*****************************************************************************
 * This function is used in the case where one data disk and the parity have
 * failed (in EO_110_CreateWriteDAG).
 *****************************************************************************/
485 int
rf_EO_DegradedWriteEFunc(RF_DagNode_t * node)486 rf_EO_DegradedWriteEFunc(RF_DagNode_t *node)
487 {
488 	rf_DegrESubroutine(node, node->results[0]);
489 	rf_GenericWakeupFunc(node, 0);
490 #if 1
491 	return (0);		/* XXX Yet another one !!! GO */
492 #endif
493 }
494 
495 
496 
497 /*****************************************************************************
498  *	THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES.
499  *****************************************************************************/
500 
501 void
rf_doubleEOdecode(RF_Raid_t * raidPtr,char ** rrdbuf,char ** dest,RF_RowCol_t * fcol,char * pbuf,char * ebuf)502 rf_doubleEOdecode(RF_Raid_t *raidPtr, char **rrdbuf, char **dest,
503     RF_RowCol_t *fcol, char *pbuf, char *ebuf)
504 {
505 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
506 	int i, j, k, f1, f2, row;
507 	int rrdrow, erow, count = 0;
508 	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
509 	int numRowInEncMatrix = (RF_EO_MATRIX_DIM) - 1;
510 #if 0
511 	int pcol = (RF_EO_MATRIX_DIM) - 1;
512 #endif
513 	int ecol = (RF_EO_MATRIX_DIM) - 2;
514 	int bytesPerEU = bytesPerSector / numRowInEncMatrix;
515 	int numDataCol = layoutPtr->numDataCol;
516 #if	RF_EO_MATRIX_DIM > 17
517 	int shortsPerEU = bytesPerEU / sizeof(short);
518 	short *rrdbuf_current, *pbuf_current, *ebuf_current;
519 	short *dest_smaller, *dest_smaller_current;
520 	short *dest_larger, *dest_larger_current;
521 	short *temp;
522 	short *P;
523 
524 	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
525 	RF_Malloc(P, bytesPerEU, (short *));
526 	RF_Malloc(temp, bytesPerEU, (short *));
527 #elif	RF_EO_MATRIX_DIM == 17
528 	int longsPerEU = bytesPerEU / sizeof(long);
529 	long *rrdbuf_current, *pbuf_current, *ebuf_current;
530 	long *dest_smaller, *dest_smaller_current;
531 	long *dest_larger, *dest_larger_current;
532 	long *temp;
533 	long *P;
534 
535 	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
536 	RF_Malloc(P, bytesPerEU, (long *));
537 	RF_Malloc(temp, bytesPerEU, (long *));
538 #endif
539 	RF_ASSERT(*((long *) dest[0]) == 0);
540 	RF_ASSERT(*((long *) dest[1]) == 0);
541 	bzero((char *) P, bytesPerEU);
542 	bzero((char *) temp, bytesPerEU);
543 	RF_ASSERT(*P == 0);
544 	/*
545 	 * Calculate the 'P' parameter, which, not parity, is the Xor of all
546 	 * elements in the last two column, ie. 'E' and 'parity' columns, see
547 	 * the Ref. paper by Blaum, et al 1993.
548 	 */
549 	for (i = 0; i < numRowInEncMatrix; i++)
550 		for (k = 0; k < longsPerEU; k++) {
551 #if	RF_EO_MATRIX_DIM > 17
552 			ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
553 			pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
554 #elif	RF_EO_MATRIX_DIM == 17
555 			ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
556 			pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
557 #endif
558 			P[k] ^= *ebuf_current;
559 			P[k] ^= *pbuf_current;
560 		}
561 	RF_ASSERT(fcol[0] != fcol[1]);
562 	if (fcol[0] < fcol[1]) {
563 #if	RF_EO_MATRIX_DIM > 17
564 		dest_smaller = (short *) (dest[0]);
565 		dest_larger = (short *) (dest[1]);
566 #elif	RF_EO_MATRIX_DIM == 17
567 		dest_smaller = (long *) (dest[0]);
568 		dest_larger = (long *) (dest[1]);
569 #endif
570 		f1 = fcol[0];
571 		f2 = fcol[1];
572 	} else {
573 #if	RF_EO_MATRIX_DIM > 17
574 		dest_smaller = (short *) (dest[1]);
575 		dest_larger = (short *) (dest[0]);
576 #elif	RF_EO_MATRIX_DIM == 17
577 		dest_smaller = (long *) (dest[1]);
578 		dest_larger = (long *) (dest[0]);
579 #endif
580 		f1 = fcol[1];
581 		f2 = fcol[0];
582 	}
583 	row = (RF_EO_MATRIX_DIM) - 1;
584 	while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) !=
585 	    ((RF_EO_MATRIX_DIM) - 1)) {
586 #if	RF_EO_MATRIX_DIM > 17
587 		dest_larger_current = dest_larger + row * shortsPerEU;
588 		dest_smaller_current = dest_smaller + row * shortsPerEU;
589 #elif	RF_EO_MATRIX_DIM == 17
590 		dest_larger_current = dest_larger + row * longsPerEU;
591 		dest_smaller_current = dest_smaller + row * longsPerEU;
592 #endif
593 		/*
594 		 * Do the diagonal recovery. Initially, temp[k] = (failed 1),
595 		 * which is the failed data in the column that has smaller
596 		 * col index.
597 		 */
598 		/* Step 1:  ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
599 		for (j = 0; j < numDataCol; j++) {
600 			if (j == f1 || j == f2)
601 				continue;
602 			rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
603 			if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
604 #if	RF_EO_MATRIX_DIM > 17
605 				rrdbuf_current = (short *) (rrdbuf[j]) +
606 				    rrdrow * shortsPerEU;
607 				for (k = 0; k < shortsPerEU; k++)
608 					temp[k] ^= *(rrdbuf_current + k);
609 #elif	RF_EO_MATRIX_DIM == 17
610 				rrdbuf_current = (long *) (rrdbuf[j]) +
611 				    rrdrow * longsPerEU;
612 				for (k = 0; k < longsPerEU; k++)
613 					temp[k] ^= *(rrdbuf_current + k);
614 #endif
615 			}
616 		}
617 		/*
618 		 * Step 2:  ^E(erow,m-2), If erow is at the bottom row, don't
619 		 * Xor into it.  E(erow,m-2) = (principle diagonal) ^ (failed
620 		 * 1) ^ (failed 2) ^ (SUM of nonfailed in-diagonal
621 		 * A(rrdrow,0..m-3))
622 		 * After this step, temp[k] = (principle diagonal) ^ (failed 2).
623 		 */
624 
625 		erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
626 		if (erow != (RF_EO_MATRIX_DIM) - 1) {
627 #if	RF_EO_MATRIX_DIM > 17
628 			ebuf_current = (short *) ebuf + shortsPerEU * erow;
629 			for (k = 0; k < shortsPerEU; k++)
630 				temp[k] ^= *(ebuf_current + k);
631 #elif	RF_EO_MATRIX_DIM == 17
632 			ebuf_current = (long *) ebuf + longsPerEU * erow;
633 			for (k = 0; k < longsPerEU; k++)
634 				temp[k] ^= *(ebuf_current + k);
635 #endif
636 		}
637 		/*
638 		 * Step 3: ^P to obtain the failed data (failed 2). P can be
639 		 * proved to be actually (principal diagonal). After this
640 		 * step, temp[k] = (failed 2), the failed data to be recovered.
641 		 */
642 #if	RF_EO_MATRIX_DIM > 17
643 		for (k = 0; k < shortsPerEU; k++)
644 			temp[k] ^= P[k];
645 		/* Put the data into the destination buffer. */
646 		for (k = 0; k < shortsPerEU; k++)
647 			dest_larger_current[k] = temp[k];
648 #elif	RF_EO_MATRIX_DIM == 17
649 		for (k = 0; k < longsPerEU; k++)
650 			temp[k] ^= P[k];
651 		/* Put the data into the destination buffer. */
652 		for (k = 0; k < longsPerEU; k++)
653 			dest_larger_current[k] = temp[k];
654 #endif
655 
656 		/* THE FOLLOWING DO THE HORIZONTAL XOR. */
657 		/*
658 		 * Step 1:  ^(SUM of A(row,0..m-3)), ie. all nonfailed data
659 		 * columns.
660 		 */
661 		for (j = 0; j < numDataCol; j++) {
662 			if (j == f1 || j == f2)
663 				continue;
664 #if	RF_EO_MATRIX_DIM > 17
665 			rrdbuf_current = (short *) (rrdbuf[j]) +
666 			    row * shortsPerEU;
667 			for (k = 0; k < shortsPerEU; k++)
668 				temp[k] ^= *(rrdbuf_current + k);
669 #elif	RF_EO_MATRIX_DIM == 17
670 			rrdbuf_current = (long *) (rrdbuf[j]) +
671 			    row * longsPerEU;
672 			for (k = 0; k < longsPerEU; k++)
673 				temp[k] ^= *(rrdbuf_current + k);
674 #endif
675 		}
676 		/* Step 2: ^A(row,m-1) */
677 		/* Step 3: Put the data into the destination buffer. */
678 #if	RF_EO_MATRIX_DIM > 17
679 		pbuf_current = (short *) pbuf + shortsPerEU * row;
680 		for (k = 0; k < shortsPerEU; k++)
681 			temp[k] ^= *(pbuf_current + k);
682 		for (k = 0; k < shortsPerEU; k++)
683 			dest_smaller_current[k] = temp[k];
684 #elif	RF_EO_MATRIX_DIM == 17
685 		pbuf_current = (long *) pbuf + longsPerEU * row;
686 		for (k = 0; k < longsPerEU; k++)
687 			temp[k] ^= *(pbuf_current + k);
688 		for (k = 0; k < longsPerEU; k++)
689 			dest_smaller_current[k] = temp[k];
690 #endif
691 		count++;
692 	}
693 	/*
694 	 * Check if all Encoding Unit in the data buffer have been decoded ?
695 	 * According to EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime
696 	 * number, this algorithm will covered all buffer.
697 	 */
698 	RF_ASSERT(count == numRowInEncMatrix);
699 	RF_Free((char *) P, bytesPerEU);
700 	RF_Free((char *) temp, bytesPerEU);
701 }
702 
703 
704 /*****************************************************************************
705  *	This function is called by double degraded read EO_200_CreateReadDAG.
706  *****************************************************************************/
707 int
rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t * node)708 rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node)
709 {
710 	int ndataParam = 0;
711 	int np = node->numParams;
712 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
713 	    node->params[np - 1].p;
714 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
715 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
716 	int i, prm, sector, nresults = node->numResults;
717 	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
718 	unsigned sosAddr;
719 	int two = 0, mallc_one = 0, mallc_two = 0;	/*
720 							 * Flags to indicate if
721 							 * memory is allocated.
722 							 */
723 	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
724 	RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
725 	    npda;
726 	RF_RowCol_t fcol[2], fsuoff[2], fsuend[2],
727 	    numDataCol = layoutPtr->numDataCol;
728 	char **buf, *ebuf, *pbuf, *dest[2];
729 	long *suoff = NULL, *suend = NULL, *prmToCol = NULL, psuoff, esuoff;
730 	RF_SectorNum_t startSector, endSector;
731 	RF_Etimer_t timer;
732 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
733 
734 	RF_ETIMER_START(timer);
735 
736 	/*
737 	 * Find out the number of parameters that are pdas for data
738 	 * information.
739 	 */
740 	for (i = 0; i <= np; i++)
741 		if (((RF_PhysDiskAddr_t *) node->params[i].p)->type !=
742 		    RF_PDA_TYPE_DATA) {
743 			ndataParam = i;
744 			break;
745 		}
746 	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
747 	if (ndataParam != 0) {
748 		RF_Malloc(suoff, ndataParam * sizeof(long), (long *));
749 		RF_Malloc(suend, ndataParam * sizeof(long), (long *));
750 		RF_Malloc(prmToCol, ndataParam * sizeof(long), (long *));
751 	}
752 	if (asmap->failedPDAs[1] &&
753 	    (asmap->failedPDAs[1]->numSector +
754 	     asmap->failedPDAs[0]->numSector) < secPerSU) {
755 		RF_ASSERT(0);	/* Currently, no support for this situation. */
756 		ppda = node->params[np - 6].p;
757 		ppda2 = node->params[np - 5].p;
758 		RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
759 		epda = node->params[np - 4].p;
760 		epda2 = node->params[np - 3].p;
761 		RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
762 		two = 1;
763 	} else {
764 		ppda = node->params[np - 4].p;
765 		epda = node->params[np - 3].p;
766 		psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
767 		esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
768 		RF_ASSERT(psuoff == esuoff);
769 	}
770 	/*
771 	 * The followings have three goals:
772 	 *   1. Determine the startSector to begin decoding and endSector
773 	 *	to end decoding.
774 	 *   2. Determine the column numbers of the two failed disks.
775 	 *   3. Determine the offset and end offset of the access within
776 	 *	each failed stripe unit.
777 	 */
778 	if (nresults == 1) {
779 		/* Find the startSector to begin decoding. */
780 		pda = node->results[0];
781 		bzero(pda->bufPtr, bytesPerSector * pda->numSector);
782 		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
783 		fsuend[0] = fsuoff[0] + pda->numSector;
784 		startSector = fsuoff[0];
785 		endSector = fsuend[0];
786 
787 		/* Find out the column of failed disk being accessed. */
788 		fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);
789 
790 		/* Find out the other failed column not accessed. */
791 		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
792 		    asmap->raidAddress);
793 		for (i = 0; i < numDataCol; i++) {
794 			npda.raidAddress = sosAddr + (i * secPerSU);
795 			(raidPtr->Layout.map->MapSector) (raidPtr,
796 			    npda.raidAddress, &(npda.row), &(npda.col),
797 			    &(npda.startSector), 0);
798 			/* Skip over dead disks. */
799 			if (RF_DEAD_DISK(raidPtr
800 			    ->Disks[npda.row][npda.col].status))
801 				if (i != fcol[0])
802 					break;
803 		}
804 		RF_ASSERT(i < numDataCol);
805 		fcol[1] = i;
806 	} else {
807 		RF_ASSERT(nresults == 2);
808 		pda0 = node->results[0];
809 		bzero(pda0->bufPtr, bytesPerSector * pda0->numSector);
810 		pda1 = node->results[1];
811 		bzero(pda1->bufPtr, bytesPerSector * pda1->numSector);
812 		/*
813 		 * Determine the failed column numbers of the two failed
814 		 * disks.
815 		 */
816 		fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
817 		fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
818 		/*
819 		 * Determine the offset and end offset of the access within
820 		 * each failed stripe unit.
821 		 */
822 		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
823 		fsuend[0] = fsuoff[0] + pda0->numSector;
824 		fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
825 		fsuend[1] = fsuoff[1] + pda1->numSector;
826 		/* Determine the startSector to begin decoding. */
827 		startSector = RF_MIN(pda0->startSector, pda1->startSector);
828 		/* Determine the endSector to end decoding. */
829 		endSector = RF_MAX(fsuend[0], fsuend[1]);
830 	}
831 	/*
832 	 * Assign the beginning sector and the end sector for each parameter.
833 	 * Find out the corresponding column # for each parameter.
834 	 */
835 	for (prm = 0; prm < ndataParam; prm++) {
836 		pda = node->params[prm].p;
837 		suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
838 		suend[prm] = suoff[prm] + pda->numSector;
839 		prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
840 	}
841 	/*
842 	 * 'sector' is the sector for the current decoding algorithm. For each
843 	 * sector in the failed SU
844 	 * 1. Find out the corresponding parameters that cover the current
845 	 *    sector and that are needed for the decoding of this sector in
846 	 *    failed SU.
847 	 * 2. Find out if sector is in the shadow of any accessed failed SU.
848 	 *    If not, malloc a temporary space of a sector in size.
849 	 */
850 	for (sector = startSector; sector < endSector; sector++) {
851 		if (nresults == 2)
852 			if (!(fsuoff[0] <= sector && sector < fsuend[0]) &&
853 			    !(fsuoff[1] <= sector && sector < fsuend[1]))
854 				continue;
855 		for (prm = 0; prm < ndataParam; prm++)
856 			if (suoff[prm] <= sector && sector < suend[prm])
857 				buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *)
858 				    node->params[prm].p)->bufPtr +
859 				    rf_RaidAddressToByte(raidPtr,
860 				     sector - suoff[prm]);
861 		/*
862 		 * Find out if sector is in the shadow of any accessed failed
863 		 * SU. If yes, assign dest[0], dest[1] to point at suitable
864 		 * position of the buffer corresponding to failed SUs. If no,
865 		 * malloc a temporary space of a sector in size for
866 		 * destination of decoding.
867 		 */
868 		RF_ASSERT(nresults == 1 || nresults == 2);
869 		if (nresults == 1) {
870 			dest[0] = ((RF_PhysDiskAddr_t *)
871 			    node->results[0])->bufPtr +
872 			    rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
873 			/* Always malloc temp buffer to dest[1]. */
874 			RF_Malloc(dest[1], bytesPerSector, (char *));
875 			bzero(dest[1], bytesPerSector);
876 			mallc_two = 1;
877 		} else {
878 			if (fsuoff[0] <= sector && sector < fsuend[0])
879 				dest[0] = ((RF_PhysDiskAddr_t *)
880 				    node->results[0])->bufPtr +
881 				    rf_RaidAddressToByte(raidPtr,
882 				     sector - fsuoff[0]);
883 			else {
884 				RF_Malloc(dest[0], bytesPerSector, (char *));
885 				bzero(dest[0], bytesPerSector);
886 				mallc_one = 1;
887 			}
888 			if (fsuoff[1] <= sector && sector < fsuend[1])
889 				dest[1] = ((RF_PhysDiskAddr_t *)
890 				    node->results[1])->bufPtr +
891 				    rf_RaidAddressToByte(raidPtr,
892 				     sector - fsuoff[1]);
893 			else {
894 				RF_Malloc(dest[1], bytesPerSector, (char *));
895 				bzero(dest[1], bytesPerSector);
896 				mallc_two = 1;
897 			}
898 			RF_ASSERT(mallc_one == 0 || mallc_two == 0);
899 		}
900 		pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr,
901 		    sector - psuoff);
902 		ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr,
903 		    sector - esuoff);
904 		/*
905 		 * After finish finding all needed sectors, call doubleEOdecode
906 		 * function for decoding one sector to destination.
907 		 */
908 		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
909 		/*
910 		 * Free all allocated memory, and mark flag to indicate no
911 		 * memory is being allocated.
912 		 */
913 		if (mallc_one == 1)
914 			RF_Free(dest[0], bytesPerSector);
915 		if (mallc_two == 1)
916 			RF_Free(dest[1], bytesPerSector);
917 		mallc_one = mallc_two = 0;
918 	}
919 	RF_Free(buf, numDataCol * sizeof(char *));
920 	if (ndataParam != 0) {
921 		RF_Free(suoff, ndataParam * sizeof(long));
922 		RF_Free(suend, ndataParam * sizeof(long));
923 		RF_Free(prmToCol, ndataParam * sizeof(long));
924 	}
925 	RF_ETIMER_STOP(timer);
926 	RF_ETIMER_EVAL(timer);
927 	if (tracerec) {
928 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
929 	}
930 	rf_GenericWakeupFunc(node, 0);
931 #if 1
932 	return (0);		/* XXX Is this even close !!?!?!!? GO */
933 #endif
934 }
935 
936 
937 /*
938  * Currently, only access of one of the two failed SU is allowed in this
939  * function. Also, asmap->numStripeUnitsAccessed is limited to be one,
940  * the RAIDframe will break large access into many accesses of single
941  * stripe unit.
942  */
943 
944 int
rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t * node)945 rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node)
946 {
947 	int np = node->numParams;
948 	RF_AccessStripeMap_t *asmap =
949 	    (RF_AccessStripeMap_t *) node->params[np - 1].p;
950 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
951 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
952 	RF_SectorNum_t sector;
953 	RF_RowCol_t col, scol;
954 	int prm, i, j;
955 	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
956 	unsigned sosAddr;
957 	unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
958 	RF_int64 numbytes;
959 	RF_SectorNum_t startSector, endSector;
960 	RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
961 	RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
962 	char **buf;		/*
963 				 * buf[0], buf[1], buf[2], ... etc, point to
964 				 * buffer storing data read from col0, col1,
965 				 * col2.
966 				 */
967 	char *ebuf, *pbuf, *dest[2], *olddata[2];
968 	RF_Etimer_t timer;
969 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
970 
971 	RF_ASSERT(asmap->numDataFailed == 1);	/*
972 						 * Currently only support this
973 						 * case, the other failed SU
974 						 * is not being accessed.
975 						 */
976 	RF_ETIMER_START(timer);
977 	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
978 
979 	ppda = node->results[0];	/*
980 					 * Instead of being buffers,
981 					 * node->results[0] and [1]
982 					 * are Ppda and Epda.
983 					 */
984 	epda = node->results[1];
985 	fpda = asmap->failedPDAs[0];
986 
987 	/* First, recovery the failed old SU using EvenOdd double decoding. */
988 	/* Determine the startSector and endSector for decoding. */
989 	startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
990 	endSector = startSector + fpda->numSector;
991 	/*
992 	 * Assign buf[col] pointers to point to each non-failed column and
993 	 * initialize the pbuf and ebuf to point at the beginning of each
994 	 * source buffers and destination buffers. */
995 	for (prm = 0; prm < numDataCol - 2; prm++) {
996 		pda = (RF_PhysDiskAddr_t *) node->params[prm].p;
997 		col = rf_EUCol(layoutPtr, pda->raidAddress);
998 		buf[col] = pda->bufPtr;
999 	}
1000 	/*
1001 	 * pbuf and ebuf: They will change values as double recovery decoding
1002 	 * goes on.
1003 	 */
1004 	pbuf = ppda->bufPtr;
1005 	ebuf = epda->bufPtr;
1006 	/*
1007 	 * Find out the logical column numbers in the encoding matrix of the
1008 	 * two failed columns.
1009 	 */
1010 	fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);
1011 
1012 	/* Find out the other failed column not accessed this time. */
1013 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
1014 	    asmap->raidAddress);
1015 	for (i = 0; i < numDataCol; i++) {
1016 		npda.raidAddress = sosAddr + (i * secPerSU);
1017 		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
1018 		    &(npda.row), &(npda.col), &(npda.startSector), 0);
1019 		/* Skip over dead disks. */
1020 		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
1021 			if (i != fcol[0])
1022 				break;
1023 	}
1024 	RF_ASSERT(i < numDataCol);
1025 	fcol[1] = i;
1026 	/* Assign temporary space to put recovered failed SU. */
1027 	numbytes = fpda->numSector * bytesPerSector;
1028 	RF_Malloc(olddata[0], numbytes, (char *));
1029 	RF_Malloc(olddata[1], numbytes, (char *));
1030 	dest[0] = olddata[0];
1031 	dest[1] = olddata[1];
1032 	bzero(olddata[0], numbytes);
1033 	bzero(olddata[1], numbytes);
1034 	/*
1035 	 * Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j]
1036 	 * have already pointed at the beginning of each source buffers and
1037 	 * destination buffers.
1038 	 */
1039 	for (sector = startSector, i = 0; sector < endSector; sector++, i++) {
1040 		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
1041 		for (j = 0; j < numDataCol; j++)
1042 			if ((j != fcol[0]) && (j != fcol[1]))
1043 				buf[j] += bytesPerSector;
1044 		dest[0] += bytesPerSector;
1045 		dest[1] += bytesPerSector;
1046 		ebuf += bytesPerSector;
1047 		pbuf += bytesPerSector;
1048 	}
1049 	/*
1050 	 * After recovery, the buffer pointed by olddata[0] is the old failed
1051 	 * data. With new writing data and this old data, use small write to
1052 	 * calculate the new redundant informations.
1053 	 */
1054 	/*
1055 	 * node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
1056 	 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
1057 	 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
1058 	 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
1059 	 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
1060 	 * wudNodes; For current implementation, we assume the simplest case:
1061 	 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
1062 	 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
1063 	 * data to be written to the failed disk. We first bxor the new data
1064 	 * into the old recovered data, then do the same things as small
1065 	 * write.
1066 	 */
1067 
1068 	rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr,
1069 	    olddata[0], numbytes, node->dagHdr->bp);
1070 	/* Do new 'E' calculation. */
1071 	/*
1072 	 * Find out the corresponding column in encoding matrix for write
1073 	 * column to be encoded into redundant disk 'E'.
1074 	 */
1075 	scol = rf_EUCol(layoutPtr, fpda->raidAddress);
1076 	/*
1077 	 * olddata[0] now is source buffer pointer; epda->bufPtr is the dest
1078 	 * buffer pointer.
1079 	 */
1080 	rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2,
1081 	    epda->bufPtr, fpda->numSector);
1082 
1083 	/* Do new 'P' calculation. */
1084 	rf_bxor(olddata[0], ppda->bufPtr, numbytes, node->dagHdr->bp);
1085 	/* Free the allocated buffer. */
1086 	RF_Free(olddata[0], numbytes);
1087 	RF_Free(olddata[1], numbytes);
1088 	RF_Free(buf, numDataCol * sizeof(char *));
1089 
1090 	RF_ETIMER_STOP(timer);
1091 	RF_ETIMER_EVAL(timer);
1092 	if (tracerec) {
1093 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
1094 	}
1095 	rf_GenericWakeupFunc(node, 0);
1096 	return (0);
1097 }
1098