1 /*	$OpenBSD: rf_pq.c,v 1.6 2002/12/16 07:01:04 tdeval Exp $	*/
2 /*	$NetBSD: rf_pq.c,v 1.7 2000/01/07 03:41:02 oster Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Carnegie-Mellon University.
6  * All rights reserved.
7  *
8  * Author: Daniel Stodolsky
9  *
10  * Permission to use, copy, modify and distribute this software and
11  * its documentation is hereby granted, provided that both the copyright
12  * notice and this permission notice appear in all copies of the
13  * software, derivative works or modified versions, and any portions
14  * thereof, and that both notices appear in supporting documentation.
15  *
16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19  *
20  * Carnegie Mellon requests users of this software to return to
21  *
22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
23  *  School of Computer Science
24  *  Carnegie Mellon University
25  *  Pittsburgh PA 15213-3890
26  *
27  * any improvements or extensions that they make and grant Carnegie the
28  * rights to redistribute these changes.
29  */
30 
31 /*
32  * Code for RAID level 6 (P + Q) disk array architecture.
33  */
34 
35 #include "rf_archs.h"
36 #include "rf_types.h"
37 #include "rf_raid.h"
38 #include "rf_dag.h"
39 #include "rf_dagffrd.h"
40 #include "rf_dagffwr.h"
41 #include "rf_dagdegrd.h"
42 #include "rf_dagdegwr.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_etimer.h"
46 #include "rf_pqdeg.h"
47 #include "rf_general.h"
48 #include "rf_map.h"
49 #include "rf_pq.h"
50 
/*
 * P (parity) redundancy-function table: the "regular" and the "simple"
 * old-new P routines, each followed by its descriptive name string.
 */
RF_RedFuncs_t rf_pFuncs = {
	rf_RegularONPFunc, "Regular Old-New P",
	rf_SimpleONPFunc, "Simple Old-New P"
};
/*
 * P recovery table: both slots use rf_RecoveryPFunc, so the same routine is
 * selected whichever of the two entries a caller picks.
 */
RF_RedFuncs_t rf_pRecoveryFuncs = {
	rf_RecoveryPFunc, "Recovery P Func",
	rf_RecoveryPFunc, "Recovery P Func"
};
59 
/*
 * "Regular" old-new P update. P is plain XOR parity, so this simply forwards
 * the node to rf_RegularXorFunc() and returns its result.
 */
int
rf_RegularONPFunc(RF_DagNode_t *node)
{
	return (rf_RegularXorFunc(node));
}
65 
66 
67 /*
68  * Same as simpleONQ func, but the coefficient is always 1.
69  */
70 
/* Forwards the node to rf_SimpleXorFunc() and returns its result. */
int
rf_SimpleONPFunc(RF_DagNode_t *node)
{
	return (rf_SimpleXorFunc(node));
}
76 
/*
 * P recovery: forwards the node to rf_RecoveryXorFunc() and returns its
 * result.
 */
int
rf_RecoveryPFunc(RF_DagNode_t *node)
{
	return (rf_RecoveryXorFunc(node));
}
82 
/*
 * Regular P computation: forwards the node to rf_RegularXorFunc() and
 * returns its result.
 */
int
rf_RegularPFunc(RF_DagNode_t *node)
{
	return (rf_RegularXorFunc(node));
}
88 
89 
90 #if	(RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
91 
92 void rf_QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
93 	unsigned char coeff);
94 void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, unsigned length,
95 	unsigned coeff);
96 
/*
 * Q redundancy-function table: the "regular" and the "simple" old-new Q
 * routines, each followed by its descriptive name string.
 */
RF_RedFuncs_t rf_qFuncs = {
	rf_RegularONQFunc, "Regular Old-New Q",
	rf_SimpleONQFunc, "Simple Old-New Q"
};
/* Q recovery table: both slots use rf_RecoveryQFunc. */
RF_RedFuncs_t rf_qRecoveryFuncs = {
	rf_RecoveryQFunc, "Recovery Q Func",
	rf_RecoveryQFunc, "Recovery Q Func"
};
/*
 * Combined P+Q recovery table: both slots use rf_RecoveryPQFunc, which in
 * this file only reports that PQ recovery is not implemented.
 */
RF_RedFuncs_t rf_pqRecoveryFuncs = {
	rf_RecoveryPQFunc, "Recovery PQ Func",
	rf_RecoveryPQFunc, "Recovery PQ Func"
};
109 
/*
 * Choose the DAG creation routine for one access to a P+Q array.
 *
 *   raidPtr    - array descriptor; its Layout supplies numDataCol.
 *   type       - access type; asserted to be a read or a write.
 *   asmap      - access stripe map; supplies the failed data / parity counts,
 *                the failed PDAs and the parity/Q PDA lists.
 *   createFunc - output: set to the selected DAG creation function, or to
 *                NULL when more than two units have failed.
 *
 * In the rf_PQ_xyz_* names selected below, the digits track the failure
 * state handled: x = failed data units, y = P failed, z = Q failed
 * (e.g. 011 is chosen when both P and Q are dead, 200 for double data loss).
 */
void
rf_PQDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
    RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	unsigned ndfail = asmap->numDataFailed;
	unsigned npfail = asmap->numParityFailed;
	unsigned ntfail = npfail + ndfail;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));
	/* P+Q tolerates at most two failures among the accessed units. */
	if (ntfail > 2) {
		RF_ERRORMSG("more than two disks failed in a single group !"
		            "  Aborting I/O operation.\n");
		 /* *infoFunc = */ *createFunc = NULL;
		return;
	}
	/* Ok, we can do this I/O. */
	if (type == RF_IO_TYPE_READ) {
		/* Reads only care about how many *data* units were lost. */
		switch (ndfail) {
		case 0:
			/* Fault free read. */
			*createFunc = (RF_VoidFuncPtr)
			    rf_CreateFaultFreeReadDAG;	/* Same as raid 5. */
			break;
		case 1:
			/* Lost a single data unit. */
			/*
			 * Two cases:
			 * (1) Parity is not lost. Do a normal raid 5
			 *     reconstruct read.
			 * (2) Parity is lost. Do a reconstruct read using "q".
			 */
			if (ntfail == 2) {	/* Also lost redundancy. */
				/*
				 * failedPDAs[1] is the redundancy unit that
				 * failed alongside the data unit.
				 */
				if (asmap->failedPDAs[1]->type ==
				    RF_PDA_TYPE_PARITY)
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_110_CreateReadDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_101_CreateReadDAG;
			} else {
				/*
				 * P and Q are ok. But is there a failure in
				 * some unaccessed data unit ?
				 */
				if (rf_NumFailedDataUnitsInStripe(raidPtr,
				    asmap) == 2)
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_200_CreateReadDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_100_CreateReadDAG;
			}
			break;
		case 2:
			/* Lost two data units. */
			/* *infoFunc = rf_PQOneTwo; */
			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
			break;
		}
		return;
	}
	/* A write. */
	switch (ntfail) {
	case 0:		/* Fault free. */
		/*
		 * Use a small write when large writes are suppressed, when
		 * at most half of the data columns are touched (and there is
		 * more than one data column), when the parity or Q
		 * information spans more than one PDA, or when
		 * rf_CheckStripeForFailures() reports a failure in the
		 * stripe. Otherwise do a large write.
		 */
		if (rf_suppressLocksAndLargeWrites ||
		    (((asmap->numStripeUnitsAccessed <=
		       (layoutPtr->numDataCol / 2)) &&
		      (layoutPtr->numDataCol != 1)) ||
		     (asmap->parityInfo->next != NULL) ||
		     (asmap->qInfo->next != NULL) ||
		     rf_CheckStripeForFailures(raidPtr, asmap))) {

			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
		} else {
			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
		}
		break;

	case 1:		/* Single disk fault. */
		if (npfail == 1) {
			RF_ASSERT((asmap->failedPDAs[0]->type ==
			    RF_PDA_TYPE_PARITY) ||
			    (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {
				/*
				 * Q died, treat like normal mode raid5 write.
				 */
				if (((asmap->numStripeUnitsAccessed <=
				      (layoutPtr->numDataCol / 2)) ||
				     (asmap->numStripeUnitsAccessed == 1)) ||
				    rf_NumFailedDataUnitsInStripe(raidPtr,
				     asmap))
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_001_CreateSmallWriteDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_001_CreateLargeWriteDAG;
			} else {/* Parity died, small write only updating Q. */
				if (((asmap->numStripeUnitsAccessed <=
				      (layoutPtr->numDataCol / 2)) ||
				     (asmap->numStripeUnitsAccessed == 1)) ||
				    rf_NumFailedDataUnitsInStripe(raidPtr,
				     asmap))
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_010_CreateSmallWriteDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_010_CreateLargeWriteDAG;
			}
		} else {	/*
				 * Data missing. Do a P reconstruct write if
				 * only a single data unit is lost in the
				 * stripe, otherwise a PQ reconstruct write.
				 */
			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_200_CreateWriteDAG;
			else
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_100_CreateWriteDAG;
		}
		break;

	case 2:		/* Two disk faults. */
		switch (npfail) {
		case 2:	/* Both p and q dead. */
			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
			break;
		case 1:	/* Either p or q and dead data. */
			/* The data PDA is expected first, redundancy second. */
			RF_ASSERT(asmap->failedPDAs[0]->type ==
			          RF_PDA_TYPE_DATA);
			RF_ASSERT((asmap->failedPDAs[1]->type ==
			           RF_PDA_TYPE_PARITY) ||
			          (asmap->failedPDAs[1]->type ==
			           RF_PDA_TYPE_Q));
			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_101_CreateWriteDAG;
			else
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_110_CreateWriteDAG;
			break;
		case 0:	/* Double data loss. */
			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
			break;
		}
		break;

	default:	/* More than 2 disk faults. */
		/* Not reachable after the ntfail > 2 check above. */
		*createFunc = NULL;
		RF_PANIC();
	}
	return;
}
265 
266 
267 /*
268  * Used as a stop gap info function.
269  */
#if 0
/*
 * Compiled out. Reports one for both counts (presumably the number of
 * successors and antecedents of the node -- confirm before re-enabling).
 * The raidPtr and asmap arguments are not used.
 */
void
rf_PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte,
    RF_AccessStripeMap_t *asmap)
{
	*nSucc = *nAnte = 1;
}

/*
 * Compiled out. Same as rf_PQOne but reports 1 for *nSucc and 2 for *nAnte.
 */
void
rf_PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte,
    RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
286 
/*
 * Fault-free large write for P+Q. Hands the request to
 * rf_CommonCreateLargeWriteDAG() with rf_RegularPQFunc as the redundancy
 * function. The identifiers raidPtr, asmap, dag_h, bp, flags and allocList
 * are supplied by the RF_CREATE_DAG_FUNC_DECL() expansion. The literal 2 is
 * presumably the number of redundancy units (P and Q) -- confirm against
 * rf_CommonCreateLargeWriteDAG().
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
{
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, 2, rf_RegularPQFunc, RF_FALSE);
}
292 
293 int
rf_RegularONQFunc(RF_DagNode_t * node)294 rf_RegularONQFunc(RF_DagNode_t *node)
295 {
296 	int np = node->numParams;
297 	int d;
298 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
299 	int i;
300 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
301 	RF_Etimer_t timer;
302 	char *qbuf, *qpbuf;
303 	char *obuf, *nbuf;
304 	RF_PhysDiskAddr_t *old, *new;
305 	unsigned long coeff;
306 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
307 
308 	RF_ETIMER_START(timer);
309 
310 	d = (np - 3) / 4;
311 	RF_ASSERT(4 * d + 3 == np);
312 	qbuf = (char *) node->params[2 * d + 1].p;	/* Q buffer. */
313 	for (i = 0; i < d; i++) {
314 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
315 		obuf = (char *) node->params[2 * i + 1].p;
316 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
317 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
318 		RF_ASSERT(new->numSector == old->numSector);
319 		RF_ASSERT(new->raidAddress == old->raidAddress);
320 		/*
321 		 * The stripe unit within the stripe tells us the coefficient
322 		 * to use for the multiply.
323 		 */
324 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
325 		    new->raidAddress);
326 		/*
327 		 * Compute the data unit offset within the column, then add
328 		 * one.
329 		 */
330 		coeff = (coeff % raidPtr->Layout.numDataCol);
331 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,
332 		    old->startSector % secPerSU);
333 		rf_QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr,
334 		    old->numSector), coeff);
335 	}
336 
337 	RF_ETIMER_STOP(timer);
338 	RF_ETIMER_EVAL(timer);
339 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
340 	rf_GenericWakeupFunc(node, 0);	/*
341 					 * Call wake func explicitly since no
342 					 * I/O in this node.
343 					 */
344 	return (0);
345 }
346 
347 
348 /*
349  * See the SimpleXORFunc for the difference between a simple and regular func.
350  * These Q functions should be used for
351  *	new q = Q(data, old data, old q)
352  * style updates and not for
353  *	q = (new data, new data, ...)
354  * computations.
355  *
 * The simple q takes 2(2d+1)+1 params, where d is the number
 * of stripes written. The order of params is
 *   old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ...
 *   old data pda_(d-1), old data buffer_(d-1)
 *   [2d] old q pda_0, old q buffer
 *   [2d+2] new data pda_0, new data buffer_0, ...
 *   new data pda_(d-1), new data buffer_(d-1)
 *   raidPtr
364  */
365 
366 int
rf_SimpleONQFunc(RF_DagNode_t * node)367 rf_SimpleONQFunc(RF_DagNode_t *node)
368 {
369 	int np = node->numParams;
370 	int d;
371 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
372 	int i;
373 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
374 	RF_Etimer_t timer;
375 	char *qbuf;
376 	char *obuf, *nbuf;
377 	RF_PhysDiskAddr_t *old, *new;
378 	unsigned long coeff;
379 
380 	RF_ETIMER_START(timer);
381 
382 	d = (np - 3) / 4;
383 	RF_ASSERT(4 * d + 3 == np);
384 	qbuf = (char *) node->params[2 * d + 1].p;	/* Q buffer. */
385 	for (i = 0; i < d; i++) {
386 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
387 		obuf = (char *) node->params[2 * i + 1].p;
388 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
389 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
390 		RF_ASSERT(new->numSector == old->numSector);
391 		RF_ASSERT(new->raidAddress == old->raidAddress);
392 		/*
393 		 * The stripe unit within the stripe tells us the coefficient
394 		 * to use for the multiply.
395 		 */
396 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
397 		    new->raidAddress);
398 		/*
399 		 * Compute the data unit offset within the column, then add
400 		 * one.
401 		 */
402 		coeff = (coeff % raidPtr->Layout.numDataCol);
403 		rf_QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr,
404 		    old->numSector), coeff);
405 	}
406 
407 	RF_ETIMER_STOP(timer);
408 	RF_ETIMER_EVAL(timer);
409 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
410 	rf_GenericWakeupFunc(node, 0);	/*
411 					 * Call wake func explicitly since no
412 					 * I/O in this node.
413 					 */
414 	return (0);
415 }
416 
/*
 * Fault-free small write for P+Q. Hands the request to
 * rf_CommonCreateSmallWriteDAG() with the P and Q old-new function tables
 * defined in this file. The identifiers raidPtr, asmap, dag_h, bp, flags
 * and allocList are supplied by the RF_CREATE_DAG_FUNC_DECL() expansion.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
{
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, &rf_pFuncs, &rf_qFuncs);
}
422 
423 
424 void rf_RegularQSubr(RF_DagNode_t *, char *);
425 
426 void
rf_RegularQSubr(RF_DagNode_t * node,char * qbuf)427 rf_RegularQSubr(RF_DagNode_t *node, char *qbuf)
428 {
429 	int np = node->numParams;
430 	int d;
431 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
432 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
433 	int i;
434 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
435 	RF_Etimer_t timer;
436 	char *obuf, *qpbuf;
437 	RF_PhysDiskAddr_t *old;
438 	unsigned long coeff;
439 
440 	RF_ETIMER_START(timer);
441 
442 	d = (np - 1) / 2;
443 	RF_ASSERT(2 * d + 1 == np);
444 	for (i = 0; i < d; i++) {
445 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
446 		obuf = (char *) node->params[2 * i + 1].p;
447 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
448 		    old->raidAddress);
449 		/*
450 		 * Compute the data unit offset within the column, then add
451 		 * one.
452 		 */
453 		coeff = (coeff % raidPtr->Layout.numDataCol);
454 		/*
455 		 * The input buffers may not all be aligned with the start of
456 		 * the stripe. So shift by their sector offset within the
457 		 * stripe unit.
458 		 */
459 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,
460 		    old->startSector % secPerSU);
461 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
462 		    rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
463 	}
464 
465 	RF_ETIMER_STOP(timer);
466 	RF_ETIMER_EVAL(timer);
467 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
468 }
469 
470 
471 /*
472  * Used in degraded writes.
473  */
474 
475 void rf_DegrQSubr(RF_DagNode_t *);
476 
477 void
rf_DegrQSubr(RF_DagNode_t * node)478 rf_DegrQSubr(RF_DagNode_t *node)
479 {
480 	int np = node->numParams;
481 	int d;
482 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
483 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
484 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
485 	RF_Etimer_t timer;
486 	char *qbuf = node->results[1];
487 	char *obuf, *qpbuf;
488 	RF_PhysDiskAddr_t *old;
489 	unsigned long coeff;
490 	unsigned fail_start;
491 	int i, j;
492 
493 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
494 	fail_start = old->startSector % secPerSU;
495 
496 	RF_ETIMER_START(timer);
497 
498 	d = (np - 2) / 2;
499 	RF_ASSERT(2 * d + 2 == np);
500 	for (i = 0; i < d; i++) {
501 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
502 		obuf = (char *) node->params[2 * i + 1].p;
503 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
504 		    old->raidAddress);
505 		/*
506 		 * Compute the data unit offset within the column, then add
507 		 * one.
508 		 */
509 		coeff = (coeff % raidPtr->Layout.numDataCol);
510 		/*
511 		 * The input buffers may not all be aligned with the start of
512 		 * the stripe. So shift by their sector offset within the
513 		 * stripe unit.
514 		 */
515 		j = old->startSector % secPerSU;
516 		RF_ASSERT(j >= fail_start);
517 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
518 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
519 		    rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
520 	}
521 
522 	RF_ETIMER_STOP(timer);
523 	RF_ETIMER_EVAL(timer);
524 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
525 }
526 
527 
528 /*
529  * Called by large write code to compute the new parity and the new q.
530  *
531  * Structure of the params:
532  *
533  *   pda_0, buffer_0, pda_1 , buffer_1, ..., pda_d, buffer_d (d = numDataCol)
534  *   raidPtr
535  *
536  * For a total of 2d+1 arguments.
537  * The result buffers results[0], results[1] are the buffers for the p and q,
538  * respectively.
539  *
540  * We compute Q first, then compute P. The P calculation may try to reuse
541  * one of the input buffers for its output, so if we computed P first, we would
542  * corrupt the input for the q calculation.
543  */
544 
/*
 * Compute Q into results[1], then P via rf_RegularXorFunc(), whose return
 * value is passed back to the caller.
 */
int
rf_RegularPQFunc(RF_DagNode_t *node)
{
	/* Q must come first: the XOR pass may overwrite an input buffer. */
	rf_RegularQSubr(node, node->results[1]);
	return (rf_RegularXorFunc(node));	/* Does the wakeup. */
}
551 
/*
 * Q-only variant of rf_RegularPQFunc: computes Q into results[0] (not
 * results[1]) and wakes the node itself, since no XOR pass follows.
 * Always returns 0.
 */
int
rf_RegularQFunc(RF_DagNode_t *node)
{
	/* Almost ... adjust Qsubr args. */
	rf_RegularQSubr(node, node->results[0]);
	rf_GenericWakeupFunc(node, 0);	/*
					 * Call wake func explicitly since no
					 * I/O in this node.
					 */
	return (0);
}
563 
564 
565 /*
566  * Called by singly degraded write code to compute the new parity and
567  * the new q.
568  *
569  * Structure of the params:
570  *
571  *   pda_0, buffer_0, pda_1 , buffer_1, ..., pda_d, buffer_d
572  *   failedPDA raidPtr
573  *
574  * for a total of 2d+2 arguments.
575  * The result buffers results[0], results[1] are the buffers for the parity
576  * and q, respectively.
577  *
578  * We compute Q first, then compute parity. The parity calculation may try
579  * to reuse one of the input buffers for its output, so if we computed parity
580  * first, we would corrupt the input for the q calculation.
581  *
 * Q is accumulated by rf_DegrQSubr(), which uses the failedPDA argument to
 * find where the failed region starts within its stripe unit; parity is
 * then computed by rf_RecoveryXorFunc().
584  */
585 
/*
 * Degraded-write P+Q computation: Q first (rf_DegrQSubr), then P
 * (rf_RecoveryXorFunc). The return value of rf_RecoveryXorFunc() is
 * discarded because this function returns void.
 */
void
rf_Degraded_100_PQFunc(RF_DagNode_t *node)
{
	int     np = node->numParams;

	/* At least the trailing failedPDA and raidPtr must be present. */
	RF_ASSERT(np >= 2);
	rf_DegrQSubr(node);
	rf_RecoveryXorFunc(node);
}
595 
596 
597 /*
598  * The two below are used when reading a stripe with a single lost data unit.
599  * The parameters are
600  *
601  *  pda_0, buffer_0, ..., pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
602  *
603  * and results[0] contains the data buffer, which is originally zero-filled.
604  */
605 
606 /*
607  * This Q func is used by the degraded-mode dag functions to recover lost data.
608  * The second-to-last parameter is the PDA for the failed portion of the
609  * access. The code here looks at this PDA and assumes that the xor target
610  * buffer is equal in size to the number of sectors in the failed PDA. It then
611  * uses the other PDAs in the parameter list to determine where within the
612  * target buffer the corresponding data should be xored.
613  *
614  * Recall the basic equation is
615  *
616  *     Q = (data_1 + 2 * data_2 ... + k * data_k) mod 256
617  *
618  * so to recover data_j we need
619  *
620  *    J data_j = (Q - data_1 - 2 data_2 ... - k * data_k) mod 256
621  *
622  * So the coefficient for each buffer is (255 - data_col), and j should be
623  * initialized by copying Q into it. Then we need to do a table lookup to
624  * convert to solve
625  *   data_j /= J
626  *
627  */
628 
629 int
rf_RecoveryQFunc(RF_DagNode_t * node)630 rf_RecoveryQFunc(RF_DagNode_t *node)
631 {
632 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
633 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
634 	RF_PhysDiskAddr_t *failedPDA =
635 	    (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
636 	int i;
637 	RF_PhysDiskAddr_t *pda;
638 	RF_RaidAddr_t suoffset;
639 	RF_RaidAddr_t failedSUOffset =
640 	    rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
641 	char *srcbuf, *destbuf;
642 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
643 	RF_Etimer_t timer;
644 	unsigned long coeff;
645 
646 	RF_ETIMER_START(timer);
647 	/* Start by copying Q into the buffer. */
648 	bcopy(node->params[node->numParams - 3].p, node->results[0],
649 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
650 	for (i = 0; i < node->numParams - 4; i += 2) {
651 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
652 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
653 		srcbuf = (char *) node->params[i + 1].p;
654 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
655 		destbuf = ((char *) node->results[0]) +
656 		    rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
657 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
658 		    pda->raidAddress);
659 		/* Compute the data unit offset within the column. */
660 		coeff = (coeff % raidPtr->Layout.numDataCol);
661 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf,
662 		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
663 	}
664 	/* Do the nasty inversion now. */
665 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
666 	    failedPDA->startSector) % raidPtr->Layout.numDataCol);
667 	rf_InvertQ(node->results[0], node->results[0],
668 	    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
669 	RF_ETIMER_STOP(timer);
670 	RF_ETIMER_EVAL(timer);
671 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
672 	rf_GenericWakeupFunc(node, 0);
673 	return (0);
674 }
675 
/*
 * Placeholder for recovery using both P and Q: prints a "not implemented"
 * message naming the array and returns 1.
 *
 * NOTE(review): unlike the other node functions in this file, this one
 * never calls rf_GenericWakeupFunc() -- confirm that callers cope with a
 * node that is not woken.
 */
int
rf_RecoveryPQFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	printf("raid%d: Recovery from PQ not implemented.\n", raidPtr->raidid);
	return (1);
}
683 
684 
685 /*
686  * Degraded write Q subroutine.
687  * Used when P is dead.
688  * Large-write style Q computation.
689  * Parameters:
690  *
691  * (pda, buf), (pda, buf), ..., (failedPDA, bufPtr), failedPDA, raidPtr.
692  *
 * The trailing failedPDA is used only to find the sector offset of the
 * failed region within its stripe unit.
694  *
695  * This is a "simple style" recovery func.
696  */
697 
698 void
rf_PQ_DegradedWriteQFunc(RF_DagNode_t * node)699 rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node)
700 {
701 	int np = node->numParams;
702 	int d;
703 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
704 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
705 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
706 	RF_Etimer_t timer;
707 	char *qbuf = node->results[0];
708 	char *obuf, *qpbuf;
709 	RF_PhysDiskAddr_t *old;
710 	unsigned long coeff;
711 	int fail_start, i, j;
712 
713 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
714 	fail_start = old->startSector % secPerSU;
715 
716 	RF_ETIMER_START(timer);
717 
718 	d = (np - 2) / 2;
719 	RF_ASSERT(2 * d + 2 == np);
720 
721 	for (i = 0; i < d; i++) {
722 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
723 		obuf = (char *) node->params[2 * i + 1].p;
724 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
725 		    old->raidAddress);
726 		/*
727 		 * Compute the data unit offset within the column, then add
728 		 * one.
729 		 */
730 		coeff = (coeff % raidPtr->Layout.numDataCol);
731 		j = old->startSector % secPerSU;
732 		RF_ASSERT(j >= fail_start);
733 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
734 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
735 		    rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
736 	}
737 
738 	RF_ETIMER_STOP(timer);
739 	RF_ETIMER_EVAL(timer);
740 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
741 	rf_GenericWakeupFunc(node, 0);
742 }
743 
744 
745 /* Q computations. */
746 
747 /*
 * Coeff - column;
749  *
750  * Compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
751  *
752  * On 5-bit basis;
753  * Length in bytes;
754  */
755 
756 void
rf_IncQ(unsigned long * dest,unsigned long * buf,unsigned length,unsigned coeff)757 rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length,
758     unsigned coeff)
759 {
760 	unsigned long a, d, new;
761 	unsigned long a1, a2;
762 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
763 	unsigned r = rf_rn[coeff + 1];
764 
765 #define	EXTRACT(a,i)	((a >> (5L*i)) & 0x1f)
766 #define	INSERT(a,i)	(a << (5L*i))
767 
768 	length /= 8;
769 	/* 13 5 bit quants in a 64 bit word. */
770 	while (length) {
771 		a = *buf++;
772 		d = *dest;
773 		a1 = EXTRACT(a, 0) ^ r;
774 		a2 = EXTRACT(a, 1) ^ r;
775 		new = INSERT(a2, 1) | a1;
776 		a1 = EXTRACT(a, 2) ^ r;
777 		a2 = EXTRACT(a, 3) ^ r;
778 		a1 = q[a1];
779 		a2 = q[a2];
780 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
781 		a1 = EXTRACT(a, 4) ^ r;
782 		a2 = EXTRACT(a, 5) ^ r;
783 		a1 = q[a1];
784 		a2 = q[a2];
785 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
786 		a1 = EXTRACT(a, 5) ^ r;
787 		a2 = EXTRACT(a, 6) ^ r;
788 		a1 = q[a1];
789 		a2 = q[a2];
790 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
791 #if	RF_LONGSHIFT > 2
792 		a1 = EXTRACT(a, 7) ^ r;
793 		a2 = EXTRACT(a, 8) ^ r;
794 		a1 = q[a1];
795 		a2 = q[a2];
796 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
797 		a1 = EXTRACT(a, 9) ^ r;
798 		a2 = EXTRACT(a, 10) ^ r;
799 		a1 = q[a1];
800 		a2 = q[a2];
801 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
802 		a1 = EXTRACT(a, 11) ^ r;
803 		a2 = EXTRACT(a, 12) ^ r;
804 		a1 = q[a1];
805 		a2 = q[a2];
806 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
807 #endif	/* RF_LONGSHIFT > 2 */
808 		d ^= new;
809 		*dest++ = d;
810 		length--;
811 	}
812 }
813 
814 
815 /*
816  * Compute.
817  *
818  * dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new)]
819  *
820  * On a five bit basis.
821  * Optimization: compute old ^ new on 64 bit basis.
822  *
823  * Length in bytes.
824  */
825 
826 void
rf_QDelta(char * dest,char * obuf,char * nbuf,unsigned length,unsigned char coeff)827 rf_QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
828     unsigned char coeff)
829 {
830 	unsigned long a, d, new;
831 	unsigned long a1, a2;
832 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
833 	unsigned int r = rf_rn[coeff + 1];
834 
835 	r = a1 = a2 = new = d = a = 0; /* XXX For now... */
836 	q = NULL; /* XXX For now */
837 
838 #ifdef	_KERNEL
839 	/*
840 	 * PQ in kernel currently not supported because the encoding/decoding
841 	 * table is not present.
842 	 */
843 	bzero(dest, length);
844 #else	/* _KERNEL */
845 	/* This code probably doesn't work and should be rewritten. -wvcii */
846 	/* 13 5 bit quants in a 64 bit word. */
847 	length /= 8;
848 	while (length) {
849 		a = *obuf++;	/*
850 				 * XXX Need to reorg to avoid cache conflicts.
851 				 */
852 		a ^= *nbuf++;
853 		d = *dest;
854 		a1 = EXTRACT(a, 0) ^ r;
855 		a2 = EXTRACT(a, 1) ^ r;
856 		a1 = q[a1];
857 		a2 = q[a2];
858 		new = INSERT(a2, 1) | a1;
859 		a1 = EXTRACT(a, 2) ^ r;
860 		a2 = EXTRACT(a, 3) ^ r;
861 		a1 = q[a1];
862 		a2 = q[a2];
863 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
864 		a1 = EXTRACT(a, 4) ^ r;
865 		a2 = EXTRACT(a, 5) ^ r;
866 		a1 = q[a1];
867 		a2 = q[a2];
868 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
869 		a1 = EXTRACT(a, 5) ^ r;
870 		a2 = EXTRACT(a, 6) ^ r;
871 		a1 = q[a1];
872 		a2 = q[a2];
873 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
874 #if	RF_LONGSHIFT > 2
875 		a1 = EXTRACT(a, 7) ^ r;
876 		a2 = EXTRACT(a, 8) ^ r;
877 		a1 = q[a1];
878 		a2 = q[a2];
879 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
880 		a1 = EXTRACT(a, 9) ^ r;
881 		a2 = EXTRACT(a, 10) ^ r;
882 		a1 = q[a1];
883 		a2 = q[a2];
884 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
885 		a1 = EXTRACT(a, 11) ^ r;
886 		a2 = EXTRACT(a, 12) ^ r;
887 		a1 = q[a1];
888 		a2 = q[a2];
889 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
890 #endif	/* RF_LONGSHIFT > 2 */
891 		d ^= new;
892 		*dest++ = d;
893 		length--;
894 	}
895 #endif	/* _KERNEL */
896 }
897 
898 
899 /*
900  * Recover columns a and b from the given p and q into
901  * bufs abuf and bbuf. All bufs are word aligned.
902  * Length is in bytes.
903  */
904 
905 /*
906  * XXX
907  *
908  * Everything about this seems wrong.
909  */
910 
911 void
rf_PQ_recover(unsigned long * pbuf,unsigned long * qbuf,unsigned long * abuf,unsigned long * bbuf,unsigned length,unsigned coeff_a,unsigned coeff_b)912 rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf,
913     unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b)
914 {
915 	unsigned long p, q, a, a0, a1;
916 	int col = (29 * coeff_a) + coeff_b;
917 	unsigned char *q0 = &(rf_qinv[col][0]);
918 
919 	length /= 8;
920 	while (length) {
921 		p = *pbuf++;
922 		q = *qbuf++;
923 		a0 = EXTRACT(p, 0);
924 		a1 = EXTRACT(q, 0);
925 		a = q0[a0 << 5 | a1];
926 
927 #define	MF(i)								\
928 do {									\
929 	a0 = EXTRACT(p, i);						\
930 	a1 = EXTRACT(q, i);						\
931 	a  = a | INSERT(q0[a0<<5 | a1], i);				\
932 } while (0)
933 
934 		MF(1);
935 		MF(2);
936 		MF(3);
937 		MF(4);
938 		MF(5);
939 		MF(6);
940 #if 0
941 		MF(7);
942 		MF(8);
943 		MF(9);
944 		MF(10);
945 		MF(11);
946 		MF(12);
947 #endif	/* 0 */
948 		*abuf++ = a;
949 		*bbuf++ = a ^ p;
950 		length--;
951 	}
952 }
953 
954 
955 /*
956  * Lost parity and a data column. Recover that data column.
957  * Assume col coeff is lost. Let q the contents of Q after
958  * all surviving data columns have been q-xored out of it.
959  * Then we have the equation
960  *
961  *   q[28-coeff][a_i ^ r_i+1] = q
962  *
963  * but q is cyclic with period 31.
964  * So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
965  *    q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
966  *
967  * so a_i = r_{coeff+1} ^ q[3+coeff][q]
968  *
 * The routine is passed the q buffer and the buffer
 * the data is to be recovered into. They can be the same.
971  */
972 
973 void
rf_InvertQ(unsigned long * qbuf,unsigned long * abuf,unsigned length,unsigned coeff)974 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, unsigned length,
975     unsigned coeff)
976 {
977 	unsigned long a, new;
978 	unsigned long a1, a2;
979 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
980 	unsigned r = rf_rn[coeff + 1];
981 
982 	/* 13 5 bit quants in a 64 bit word. */
983 	length /= 8;
984 	while (length) {
985 		a = *qbuf++;
986 		a1 = EXTRACT(a, 0);
987 		a2 = EXTRACT(a, 1);
988 		a1 = r ^ q[a1];
989 		a2 = r ^ q[a2];
990 		new = INSERT(a2, 1) | a1;
991 
992 #define	M(i,j)								\
993 do {									\
994 	a1 = EXTRACT(a, i);						\
995 	a2 = EXTRACT(a, j);						\
996 	a1 = r ^ q[a1];							\
997 	a2 = r ^ q[a2];							\
998 	new = new | INSERT(a1, i) | INSERT(a2, j);			\
999 } while (0)
1000 
1001 		M(2, 3);
1002 		M(4, 5);
1003 		M(5, 6);
1004 #if	RF_LONGSHIFT > 2
1005 		M(7, 8);
1006 		M(9, 10);
1007 		M(11, 12);
1008 #endif	/* RF_LONGSHIFT > 2 */
1009 		*abuf++ = new;
1010 		length--;
1011 	}
1012 }
1013 #endif	/* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
1014