1 /* $OpenBSD: rf_pq.c,v 1.6 2002/12/16 07:01:04 tdeval Exp $ */
2 /* $NetBSD: rf_pq.c,v 1.7 2000/01/07 03:41:02 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Author: Daniel Stodolsky
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /*
32 * Code for RAID level 6 (P + Q) disk array architecture.
33 */
34
35 #include "rf_archs.h"
36 #include "rf_types.h"
37 #include "rf_raid.h"
38 #include "rf_dag.h"
39 #include "rf_dagffrd.h"
40 #include "rf_dagffwr.h"
41 #include "rf_dagdegrd.h"
42 #include "rf_dagdegwr.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_etimer.h"
46 #include "rf_pqdeg.h"
47 #include "rf_general.h"
48 #include "rf_map.h"
49 #include "rf_pq.h"
50
51 RF_RedFuncs_t rf_pFuncs = {
52 rf_RegularONPFunc, "Regular Old-New P",
53 rf_SimpleONPFunc, "Simple Old-New P"
54 };
55 RF_RedFuncs_t rf_pRecoveryFuncs = {
56 rf_RecoveryPFunc, "Recovery P Func",
57 rf_RecoveryPFunc, "Recovery P Func"
58 };
59
60 int
rf_RegularONPFunc(RF_DagNode_t * node)61 rf_RegularONPFunc(RF_DagNode_t *node)
62 {
63 return (rf_RegularXorFunc(node));
64 }
65
66
67 /*
68 * Same as simpleONQ func, but the coefficient is always 1.
69 */
70
71 int
rf_SimpleONPFunc(RF_DagNode_t * node)72 rf_SimpleONPFunc(RF_DagNode_t *node)
73 {
74 return (rf_SimpleXorFunc(node));
75 }
76
77 int
rf_RecoveryPFunc(RF_DagNode_t * node)78 rf_RecoveryPFunc(RF_DagNode_t *node)
79 {
80 return (rf_RecoveryXorFunc(node));
81 }
82
83 int
rf_RegularPFunc(RF_DagNode_t * node)84 rf_RegularPFunc(RF_DagNode_t *node)
85 {
86 return (rf_RegularXorFunc(node));
87 }
88
89
90 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
91
/* Prototypes for Q helpers that are defined further down in this file. */
void	rf_QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
	    unsigned char coeff);
void	rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
	    unsigned length, unsigned coeff);
96
97 RF_RedFuncs_t rf_qFuncs = {
98 rf_RegularONQFunc, "Regular Old-New Q",
99 rf_SimpleONQFunc, "Simple Old-New Q"
100 };
101 RF_RedFuncs_t rf_qRecoveryFuncs = {
102 rf_RecoveryQFunc, "Recovery Q Func",
103 rf_RecoveryQFunc, "Recovery Q Func"
104 };
105 RF_RedFuncs_t rf_pqRecoveryFuncs = {
106 rf_RecoveryPQFunc, "Recovery PQ Func",
107 rf_RecoveryPQFunc, "Recovery PQ Func"
108 };
109
110 void
rf_PQDagSelect(RF_Raid_t * raidPtr,RF_IoType_t type,RF_AccessStripeMap_t * asmap,RF_VoidFuncPtr * createFunc)111 rf_PQDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
112 RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc)
113 {
114 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
115 unsigned ndfail = asmap->numDataFailed;
116 unsigned npfail = asmap->numParityFailed;
117 unsigned ntfail = npfail + ndfail;
118
119 RF_ASSERT(RF_IO_IS_R_OR_W(type));
120 if (ntfail > 2) {
121 RF_ERRORMSG("more than two disks failed in a single group !"
122 " Aborting I/O operation.\n");
123 /* *infoFunc = */ *createFunc = NULL;
124 return;
125 }
126 /* Ok, we can do this I/O. */
127 if (type == RF_IO_TYPE_READ) {
128 switch (ndfail) {
129 case 0:
130 /* Fault free read. */
131 *createFunc = (RF_VoidFuncPtr)
132 rf_CreateFaultFreeReadDAG; /* Same as raid 5. */
133 break;
134 case 1:
135 /* Lost a single data unit. */
136 /*
137 * Two cases:
138 * (1) Parity is not lost. Do a normal raid 5
139 * reconstruct read.
140 * (2) Parity is lost. Do a reconstruct read using "q".
141 */
142 if (ntfail == 2) { /* Also lost redundancy. */
143 if (asmap->failedPDAs[1]->type ==
144 RF_PDA_TYPE_PARITY)
145 *createFunc = (RF_VoidFuncPtr)
146 rf_PQ_110_CreateReadDAG;
147 else
148 *createFunc = (RF_VoidFuncPtr)
149 rf_PQ_101_CreateReadDAG;
150 } else {
151 /*
152 * P and Q are ok. But is there a failure in
153 * some unaccessed data unit ?
154 */
155 if (rf_NumFailedDataUnitsInStripe(raidPtr,
156 asmap) == 2)
157 *createFunc = (RF_VoidFuncPtr)
158 rf_PQ_200_CreateReadDAG;
159 else
160 *createFunc = (RF_VoidFuncPtr)
161 rf_PQ_100_CreateReadDAG;
162 }
163 break;
164 case 2:
165 /* Lost two data units. */
166 /* *infoFunc = rf_PQOneTwo; */
167 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
168 break;
169 }
170 return;
171 }
172 /* A write. */
173 switch (ntfail) {
174 case 0: /* Fault free. */
175 if (rf_suppressLocksAndLargeWrites ||
176 (((asmap->numStripeUnitsAccessed <=
177 (layoutPtr->numDataCol / 2)) &&
178 (layoutPtr->numDataCol != 1)) ||
179 (asmap->parityInfo->next != NULL) ||
180 (asmap->qInfo->next != NULL) ||
181 rf_CheckStripeForFailures(raidPtr, asmap))) {
182
183 *createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
184 } else {
185 *createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
186 }
187 break;
188
189 case 1: /* Single disk fault. */
190 if (npfail == 1) {
191 RF_ASSERT((asmap->failedPDAs[0]->type ==
192 RF_PDA_TYPE_PARITY) ||
193 (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
194 if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {
195 /*
196 * Q died, treat like normal mode raid5 write.
197 */
198 if (((asmap->numStripeUnitsAccessed <=
199 (layoutPtr->numDataCol / 2)) ||
200 (asmap->numStripeUnitsAccessed == 1)) ||
201 rf_NumFailedDataUnitsInStripe(raidPtr,
202 asmap))
203 *createFunc = (RF_VoidFuncPtr)
204 rf_PQ_001_CreateSmallWriteDAG;
205 else
206 *createFunc = (RF_VoidFuncPtr)
207 rf_PQ_001_CreateLargeWriteDAG;
208 } else {/* Parity died, small write only updating Q. */
209 if (((asmap->numStripeUnitsAccessed <=
210 (layoutPtr->numDataCol / 2)) ||
211 (asmap->numStripeUnitsAccessed == 1)) ||
212 rf_NumFailedDataUnitsInStripe(raidPtr,
213 asmap))
214 *createFunc = (RF_VoidFuncPtr)
215 rf_PQ_010_CreateSmallWriteDAG;
216 else
217 *createFunc = (RF_VoidFuncPtr)
218 rf_PQ_010_CreateLargeWriteDAG;
219 }
220 } else { /*
221 * Data missing. Do a P reconstruct write if
222 * only a single data unit is lost in the
223 * stripe, otherwise a PQ reconstruct write.
224 */
225 if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
226 *createFunc = (RF_VoidFuncPtr)
227 rf_PQ_200_CreateWriteDAG;
228 else
229 *createFunc = (RF_VoidFuncPtr)
230 rf_PQ_100_CreateWriteDAG;
231 }
232 break;
233
234 case 2: /* Two disk faults. */
235 switch (npfail) {
236 case 2: /* Both p and q dead. */
237 *createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
238 break;
239 case 1: /* Either p or q and dead data. */
240 RF_ASSERT(asmap->failedPDAs[0]->type ==
241 RF_PDA_TYPE_DATA);
242 RF_ASSERT((asmap->failedPDAs[1]->type ==
243 RF_PDA_TYPE_PARITY) ||
244 (asmap->failedPDAs[1]->type ==
245 RF_PDA_TYPE_Q));
246 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
247 *createFunc = (RF_VoidFuncPtr)
248 rf_PQ_101_CreateWriteDAG;
249 else
250 *createFunc = (RF_VoidFuncPtr)
251 rf_PQ_110_CreateWriteDAG;
252 break;
253 case 0: /* Double data loss. */
254 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
255 break;
256 }
257 break;
258
259 default: /* More than 2 disk faults. */
260 *createFunc = NULL;
261 RF_PANIC();
262 }
263 return;
264 }
265
266
/*
 * Stop-gap info functions.  Compiled out; kept for reference only.
 * Each one just reports fixed successor / antecedent counts.
 */
#if 0
void
rf_PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte,
    RF_AccessStripeMap_t *asmap)
{
	*nAnte = 1;
	*nSucc = 1;
}

void
rf_PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte,
    RF_AccessStripeMap_t *asmap)
{
	*nAnte = 2;
	*nSucc = 1;
}
#endif
286
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)287 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
288 {
289 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
290 allocList, 2, rf_RegularPQFunc, RF_FALSE);
291 }
292
293 int
rf_RegularONQFunc(RF_DagNode_t * node)294 rf_RegularONQFunc(RF_DagNode_t *node)
295 {
296 int np = node->numParams;
297 int d;
298 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
299 int i;
300 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
301 RF_Etimer_t timer;
302 char *qbuf, *qpbuf;
303 char *obuf, *nbuf;
304 RF_PhysDiskAddr_t *old, *new;
305 unsigned long coeff;
306 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
307
308 RF_ETIMER_START(timer);
309
310 d = (np - 3) / 4;
311 RF_ASSERT(4 * d + 3 == np);
312 qbuf = (char *) node->params[2 * d + 1].p; /* Q buffer. */
313 for (i = 0; i < d; i++) {
314 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
315 obuf = (char *) node->params[2 * i + 1].p;
316 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
317 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
318 RF_ASSERT(new->numSector == old->numSector);
319 RF_ASSERT(new->raidAddress == old->raidAddress);
320 /*
321 * The stripe unit within the stripe tells us the coefficient
322 * to use for the multiply.
323 */
324 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
325 new->raidAddress);
326 /*
327 * Compute the data unit offset within the column, then add
328 * one.
329 */
330 coeff = (coeff % raidPtr->Layout.numDataCol);
331 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,
332 old->startSector % secPerSU);
333 rf_QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr,
334 old->numSector), coeff);
335 }
336
337 RF_ETIMER_STOP(timer);
338 RF_ETIMER_EVAL(timer);
339 tracerec->q_us += RF_ETIMER_VAL_US(timer);
340 rf_GenericWakeupFunc(node, 0); /*
341 * Call wake func explicitly since no
342 * I/O in this node.
343 */
344 return (0);
345 }
346
347
348 /*
349 * See the SimpleXORFunc for the difference between a simple and regular func.
350 * These Q functions should be used for
351 * new q = Q(data, old data, old q)
352 * style updates and not for
353 * q = (new data, new data, ...)
354 * computations.
355 *
356 * The simple q takes 2(2d+1)+1 params, where d is the number
357 * of stripes written. The order of params is
358 * old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ...
359 * old data pda_d, old data buffer_d
360 * [2d] old q pda_0, old q buffer
361 * [2d_2] new data pda_0, new data buffer_0, ...
362 * new data pda_d, new data buffer_d
363 * raidPtr
364 */
365
366 int
rf_SimpleONQFunc(RF_DagNode_t * node)367 rf_SimpleONQFunc(RF_DagNode_t *node)
368 {
369 int np = node->numParams;
370 int d;
371 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
372 int i;
373 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
374 RF_Etimer_t timer;
375 char *qbuf;
376 char *obuf, *nbuf;
377 RF_PhysDiskAddr_t *old, *new;
378 unsigned long coeff;
379
380 RF_ETIMER_START(timer);
381
382 d = (np - 3) / 4;
383 RF_ASSERT(4 * d + 3 == np);
384 qbuf = (char *) node->params[2 * d + 1].p; /* Q buffer. */
385 for (i = 0; i < d; i++) {
386 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
387 obuf = (char *) node->params[2 * i + 1].p;
388 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
389 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
390 RF_ASSERT(new->numSector == old->numSector);
391 RF_ASSERT(new->raidAddress == old->raidAddress);
392 /*
393 * The stripe unit within the stripe tells us the coefficient
394 * to use for the multiply.
395 */
396 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
397 new->raidAddress);
398 /*
399 * Compute the data unit offset within the column, then add
400 * one.
401 */
402 coeff = (coeff % raidPtr->Layout.numDataCol);
403 rf_QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr,
404 old->numSector), coeff);
405 }
406
407 RF_ETIMER_STOP(timer);
408 RF_ETIMER_EVAL(timer);
409 tracerec->q_us += RF_ETIMER_VAL_US(timer);
410 rf_GenericWakeupFunc(node, 0); /*
411 * Call wake func explicitly since no
412 * I/O in this node.
413 */
414 return (0);
415 }
416
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)417 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
418 {
419 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
420 allocList, &rf_pFuncs, &rf_qFuncs);
421 }
422
423
424 void rf_RegularQSubr(RF_DagNode_t *, char *);
425
426 void
rf_RegularQSubr(RF_DagNode_t * node,char * qbuf)427 rf_RegularQSubr(RF_DagNode_t *node, char *qbuf)
428 {
429 int np = node->numParams;
430 int d;
431 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
432 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
433 int i;
434 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
435 RF_Etimer_t timer;
436 char *obuf, *qpbuf;
437 RF_PhysDiskAddr_t *old;
438 unsigned long coeff;
439
440 RF_ETIMER_START(timer);
441
442 d = (np - 1) / 2;
443 RF_ASSERT(2 * d + 1 == np);
444 for (i = 0; i < d; i++) {
445 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
446 obuf = (char *) node->params[2 * i + 1].p;
447 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
448 old->raidAddress);
449 /*
450 * Compute the data unit offset within the column, then add
451 * one.
452 */
453 coeff = (coeff % raidPtr->Layout.numDataCol);
454 /*
455 * The input buffers may not all be aligned with the start of
456 * the stripe. So shift by their sector offset within the
457 * stripe unit.
458 */
459 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,
460 old->startSector % secPerSU);
461 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
462 rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
463 }
464
465 RF_ETIMER_STOP(timer);
466 RF_ETIMER_EVAL(timer);
467 tracerec->q_us += RF_ETIMER_VAL_US(timer);
468 }
469
470
471 /*
472 * Used in degraded writes.
473 */
474
475 void rf_DegrQSubr(RF_DagNode_t *);
476
477 void
rf_DegrQSubr(RF_DagNode_t * node)478 rf_DegrQSubr(RF_DagNode_t *node)
479 {
480 int np = node->numParams;
481 int d;
482 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
483 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
484 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
485 RF_Etimer_t timer;
486 char *qbuf = node->results[1];
487 char *obuf, *qpbuf;
488 RF_PhysDiskAddr_t *old;
489 unsigned long coeff;
490 unsigned fail_start;
491 int i, j;
492
493 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
494 fail_start = old->startSector % secPerSU;
495
496 RF_ETIMER_START(timer);
497
498 d = (np - 2) / 2;
499 RF_ASSERT(2 * d + 2 == np);
500 for (i = 0; i < d; i++) {
501 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
502 obuf = (char *) node->params[2 * i + 1].p;
503 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
504 old->raidAddress);
505 /*
506 * Compute the data unit offset within the column, then add
507 * one.
508 */
509 coeff = (coeff % raidPtr->Layout.numDataCol);
510 /*
511 * The input buffers may not all be aligned with the start of
512 * the stripe. So shift by their sector offset within the
513 * stripe unit.
514 */
515 j = old->startSector % secPerSU;
516 RF_ASSERT(j >= fail_start);
517 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
518 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
519 rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
520 }
521
522 RF_ETIMER_STOP(timer);
523 RF_ETIMER_EVAL(timer);
524 tracerec->q_us += RF_ETIMER_VAL_US(timer);
525 }
526
527
528 /*
529 * Called by large write code to compute the new parity and the new q.
530 *
531 * Structure of the params:
532 *
533 * pda_0, buffer_0, pda_1 , buffer_1, ..., pda_d, buffer_d (d = numDataCol)
534 * raidPtr
535 *
536 * For a total of 2d+1 arguments.
537 * The result buffers results[0], results[1] are the buffers for the p and q,
538 * respectively.
539 *
540 * We compute Q first, then compute P. The P calculation may try to reuse
541 * one of the input buffers for its output, so if we computed P first, we would
542 * corrupt the input for the q calculation.
543 */
544
545 int
rf_RegularPQFunc(RF_DagNode_t * node)546 rf_RegularPQFunc(RF_DagNode_t *node)
547 {
548 rf_RegularQSubr(node, node->results[1]);
549 return (rf_RegularXorFunc(node)); /* Does the wakeup. */
550 }
551
552 int
rf_RegularQFunc(RF_DagNode_t * node)553 rf_RegularQFunc(RF_DagNode_t *node)
554 {
555 /* Almost ... adjust Qsubr args. */
556 rf_RegularQSubr(node, node->results[0]);
557 rf_GenericWakeupFunc(node, 0); /*
558 * Call wake func explicitly since no
559 * I/O in this node.
560 */
561 return (0);
562 }
563
564
565 /*
566 * Called by singly degraded write code to compute the new parity and
567 * the new q.
568 *
569 * Structure of the params:
570 *
571 * pda_0, buffer_0, pda_1 , buffer_1, ..., pda_d, buffer_d
572 * failedPDA raidPtr
573 *
574 * for a total of 2d+2 arguments.
575 * The result buffers results[0], results[1] are the buffers for the parity
576 * and q, respectively.
577 *
578 * We compute Q first, then compute parity. The parity calculation may try
579 * to reuse one of the input buffers for its output, so if we computed parity
580 * first, we would corrupt the input for the q calculation.
581 *
582 * We treat this identically to the regularPQ case, ignoring the failedPDA
583 * extra argument.
584 */
585
586 void
rf_Degraded_100_PQFunc(RF_DagNode_t * node)587 rf_Degraded_100_PQFunc(RF_DagNode_t *node)
588 {
589 int np = node->numParams;
590
591 RF_ASSERT(np >= 2);
592 rf_DegrQSubr(node);
593 rf_RecoveryXorFunc(node);
594 }
595
596
597 /*
598 * The two below are used when reading a stripe with a single lost data unit.
599 * The parameters are
600 *
601 * pda_0, buffer_0, ..., pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
602 *
603 * and results[0] contains the data buffer, which is originally zero-filled.
604 */
605
606 /*
607 * This Q func is used by the degraded-mode dag functions to recover lost data.
608 * The second-to-last parameter is the PDA for the failed portion of the
609 * access. The code here looks at this PDA and assumes that the xor target
610 * buffer is equal in size to the number of sectors in the failed PDA. It then
611 * uses the other PDAs in the parameter list to determine where within the
612 * target buffer the corresponding data should be xored.
613 *
614 * Recall the basic equation is
615 *
616 * Q = (data_1 + 2 * data_2 ... + k * data_k) mod 256
617 *
618 * so to recover data_j we need
619 *
620 * J data_j = (Q - data_1 - 2 data_2 ... - k * data_k) mod 256
621 *
622 * So the coefficient for each buffer is (255 - data_col), and j should be
623 * initialized by copying Q into it. Then we need to do a table lookup to
624 * convert to solve
625 * data_j /= J
626 *
627 */
628
629 int
rf_RecoveryQFunc(RF_DagNode_t * node)630 rf_RecoveryQFunc(RF_DagNode_t *node)
631 {
632 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
633 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
634 RF_PhysDiskAddr_t *failedPDA =
635 (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
636 int i;
637 RF_PhysDiskAddr_t *pda;
638 RF_RaidAddr_t suoffset;
639 RF_RaidAddr_t failedSUOffset =
640 rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
641 char *srcbuf, *destbuf;
642 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
643 RF_Etimer_t timer;
644 unsigned long coeff;
645
646 RF_ETIMER_START(timer);
647 /* Start by copying Q into the buffer. */
648 bcopy(node->params[node->numParams - 3].p, node->results[0],
649 rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
650 for (i = 0; i < node->numParams - 4; i += 2) {
651 RF_ASSERT(node->params[i + 1].p != node->results[0]);
652 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
653 srcbuf = (char *) node->params[i + 1].p;
654 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
655 destbuf = ((char *) node->results[0]) +
656 rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
657 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
658 pda->raidAddress);
659 /* Compute the data unit offset within the column. */
660 coeff = (coeff % raidPtr->Layout.numDataCol);
661 rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf,
662 rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
663 }
664 /* Do the nasty inversion now. */
665 coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
666 failedPDA->startSector) % raidPtr->Layout.numDataCol);
667 rf_InvertQ(node->results[0], node->results[0],
668 rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
669 RF_ETIMER_STOP(timer);
670 RF_ETIMER_EVAL(timer);
671 tracerec->q_us += RF_ETIMER_VAL_US(timer);
672 rf_GenericWakeupFunc(node, 0);
673 return (0);
674 }
675
676 int
rf_RecoveryPQFunc(RF_DagNode_t * node)677 rf_RecoveryPQFunc(RF_DagNode_t *node)
678 {
679 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
680 printf("raid%d: Recovery from PQ not implemented.\n", raidPtr->raidid);
681 return (1);
682 }
683
684
685 /*
686 * Degraded write Q subroutine.
687 * Used when P is dead.
688 * Large-write style Q computation.
689 * Parameters:
690 *
691 * (pda, buf), (pda, buf), ..., (failedPDA, bufPtr), failedPDA, raidPtr.
692 *
693 * We ignore failedPDA.
694 *
695 * This is a "simple style" recovery func.
696 */
697
698 void
rf_PQ_DegradedWriteQFunc(RF_DagNode_t * node)699 rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node)
700 {
701 int np = node->numParams;
702 int d;
703 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
704 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
705 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
706 RF_Etimer_t timer;
707 char *qbuf = node->results[0];
708 char *obuf, *qpbuf;
709 RF_PhysDiskAddr_t *old;
710 unsigned long coeff;
711 int fail_start, i, j;
712
713 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
714 fail_start = old->startSector % secPerSU;
715
716 RF_ETIMER_START(timer);
717
718 d = (np - 2) / 2;
719 RF_ASSERT(2 * d + 2 == np);
720
721 for (i = 0; i < d; i++) {
722 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
723 obuf = (char *) node->params[2 * i + 1].p;
724 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
725 old->raidAddress);
726 /*
727 * Compute the data unit offset within the column, then add
728 * one.
729 */
730 coeff = (coeff % raidPtr->Layout.numDataCol);
731 j = old->startSector % secPerSU;
732 RF_ASSERT(j >= fail_start);
733 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
734 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
735 rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
736 }
737
738 RF_ETIMER_STOP(timer);
739 RF_ETIMER_EVAL(timer);
740 tracerec->q_us += RF_ETIMER_VAL_US(timer);
741 rf_GenericWakeupFunc(node, 0);
742 }
743
744
745 /* Q computations. */
746
747 /*
748 * Coeff - colummn;
749 *
750 * Compute dest ^= qfor[28-coeff][rn[coeff+1] a]
751 *
752 * On 5-bit basis;
753 * Length in bytes;
754 */
755
756 void
rf_IncQ(unsigned long * dest,unsigned long * buf,unsigned length,unsigned coeff)757 rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length,
758 unsigned coeff)
759 {
760 unsigned long a, d, new;
761 unsigned long a1, a2;
762 unsigned int *q = &(rf_qfor[28 - coeff][0]);
763 unsigned r = rf_rn[coeff + 1];
764
765 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
766 #define INSERT(a,i) (a << (5L*i))
767
768 length /= 8;
769 /* 13 5 bit quants in a 64 bit word. */
770 while (length) {
771 a = *buf++;
772 d = *dest;
773 a1 = EXTRACT(a, 0) ^ r;
774 a2 = EXTRACT(a, 1) ^ r;
775 new = INSERT(a2, 1) | a1;
776 a1 = EXTRACT(a, 2) ^ r;
777 a2 = EXTRACT(a, 3) ^ r;
778 a1 = q[a1];
779 a2 = q[a2];
780 new = new | INSERT(a1, 2) | INSERT(a2, 3);
781 a1 = EXTRACT(a, 4) ^ r;
782 a2 = EXTRACT(a, 5) ^ r;
783 a1 = q[a1];
784 a2 = q[a2];
785 new = new | INSERT(a1, 4) | INSERT(a2, 5);
786 a1 = EXTRACT(a, 5) ^ r;
787 a2 = EXTRACT(a, 6) ^ r;
788 a1 = q[a1];
789 a2 = q[a2];
790 new = new | INSERT(a1, 5) | INSERT(a2, 6);
791 #if RF_LONGSHIFT > 2
792 a1 = EXTRACT(a, 7) ^ r;
793 a2 = EXTRACT(a, 8) ^ r;
794 a1 = q[a1];
795 a2 = q[a2];
796 new = new | INSERT(a1, 7) | INSERT(a2, 8);
797 a1 = EXTRACT(a, 9) ^ r;
798 a2 = EXTRACT(a, 10) ^ r;
799 a1 = q[a1];
800 a2 = q[a2];
801 new = new | INSERT(a1, 9) | INSERT(a2, 10);
802 a1 = EXTRACT(a, 11) ^ r;
803 a2 = EXTRACT(a, 12) ^ r;
804 a1 = q[a1];
805 a2 = q[a2];
806 new = new | INSERT(a1, 11) | INSERT(a2, 12);
807 #endif /* RF_LONGSHIFT > 2 */
808 d ^= new;
809 *dest++ = d;
810 length--;
811 }
812 }
813
814
815 /*
816 * Compute.
817 *
818 * dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new)]
819 *
820 * On a five bit basis.
821 * Optimization: compute old ^ new on 64 bit basis.
822 *
823 * Length in bytes.
824 */
825
826 void
rf_QDelta(char * dest,char * obuf,char * nbuf,unsigned length,unsigned char coeff)827 rf_QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
828 unsigned char coeff)
829 {
830 unsigned long a, d, new;
831 unsigned long a1, a2;
832 unsigned int *q = &(rf_qfor[28 - coeff][0]);
833 unsigned int r = rf_rn[coeff + 1];
834
835 r = a1 = a2 = new = d = a = 0; /* XXX For now... */
836 q = NULL; /* XXX For now */
837
838 #ifdef _KERNEL
839 /*
840 * PQ in kernel currently not supported because the encoding/decoding
841 * table is not present.
842 */
843 bzero(dest, length);
844 #else /* _KERNEL */
845 /* This code probably doesn't work and should be rewritten. -wvcii */
846 /* 13 5 bit quants in a 64 bit word. */
847 length /= 8;
848 while (length) {
849 a = *obuf++; /*
850 * XXX Need to reorg to avoid cache conflicts.
851 */
852 a ^= *nbuf++;
853 d = *dest;
854 a1 = EXTRACT(a, 0) ^ r;
855 a2 = EXTRACT(a, 1) ^ r;
856 a1 = q[a1];
857 a2 = q[a2];
858 new = INSERT(a2, 1) | a1;
859 a1 = EXTRACT(a, 2) ^ r;
860 a2 = EXTRACT(a, 3) ^ r;
861 a1 = q[a1];
862 a2 = q[a2];
863 new = new | INSERT(a1, 2) | INSERT(a2, 3);
864 a1 = EXTRACT(a, 4) ^ r;
865 a2 = EXTRACT(a, 5) ^ r;
866 a1 = q[a1];
867 a2 = q[a2];
868 new = new | INSERT(a1, 4) | INSERT(a2, 5);
869 a1 = EXTRACT(a, 5) ^ r;
870 a2 = EXTRACT(a, 6) ^ r;
871 a1 = q[a1];
872 a2 = q[a2];
873 new = new | INSERT(a1, 5) | INSERT(a2, 6);
874 #if RF_LONGSHIFT > 2
875 a1 = EXTRACT(a, 7) ^ r;
876 a2 = EXTRACT(a, 8) ^ r;
877 a1 = q[a1];
878 a2 = q[a2];
879 new = new | INSERT(a1, 7) | INSERT(a2, 8);
880 a1 = EXTRACT(a, 9) ^ r;
881 a2 = EXTRACT(a, 10) ^ r;
882 a1 = q[a1];
883 a2 = q[a2];
884 new = new | INSERT(a1, 9) | INSERT(a2, 10);
885 a1 = EXTRACT(a, 11) ^ r;
886 a2 = EXTRACT(a, 12) ^ r;
887 a1 = q[a1];
888 a2 = q[a2];
889 new = new | INSERT(a1, 11) | INSERT(a2, 12);
890 #endif /* RF_LONGSHIFT > 2 */
891 d ^= new;
892 *dest++ = d;
893 length--;
894 }
895 #endif /* _KERNEL */
896 }
897
898
899 /*
900 * Recover columns a and b from the given p and q into
901 * bufs abuf and bbuf. All bufs are word aligned.
902 * Length is in bytes.
903 */
904
905 /*
906 * XXX
907 *
908 * Everything about this seems wrong.
909 */
910
911 void
rf_PQ_recover(unsigned long * pbuf,unsigned long * qbuf,unsigned long * abuf,unsigned long * bbuf,unsigned length,unsigned coeff_a,unsigned coeff_b)912 rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf,
913 unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b)
914 {
915 unsigned long p, q, a, a0, a1;
916 int col = (29 * coeff_a) + coeff_b;
917 unsigned char *q0 = &(rf_qinv[col][0]);
918
919 length /= 8;
920 while (length) {
921 p = *pbuf++;
922 q = *qbuf++;
923 a0 = EXTRACT(p, 0);
924 a1 = EXTRACT(q, 0);
925 a = q0[a0 << 5 | a1];
926
927 #define MF(i) \
928 do { \
929 a0 = EXTRACT(p, i); \
930 a1 = EXTRACT(q, i); \
931 a = a | INSERT(q0[a0<<5 | a1], i); \
932 } while (0)
933
934 MF(1);
935 MF(2);
936 MF(3);
937 MF(4);
938 MF(5);
939 MF(6);
940 #if 0
941 MF(7);
942 MF(8);
943 MF(9);
944 MF(10);
945 MF(11);
946 MF(12);
947 #endif /* 0 */
948 *abuf++ = a;
949 *bbuf++ = a ^ p;
950 length--;
951 }
952 }
953
954
955 /*
956 * Lost parity and a data column. Recover that data column.
957 * Assume col coeff is lost. Let q the contents of Q after
958 * all surviving data columns have been q-xored out of it.
959 * Then we have the equation
960 *
961 * q[28-coeff][a_i ^ r_i+1] = q
962 *
963 * but q is cyclic with period 31.
964 * So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
965 * q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
966 *
967 * so a_i = r_{coeff+1} ^ q[3+coeff][q]
968 *
969 * The routine is passed q buffer and the buffer
970 * the data is to be recoverd into. They can be the same.
971 */
972
973 void
rf_InvertQ(unsigned long * qbuf,unsigned long * abuf,unsigned length,unsigned coeff)974 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, unsigned length,
975 unsigned coeff)
976 {
977 unsigned long a, new;
978 unsigned long a1, a2;
979 unsigned int *q = &(rf_qfor[3 + coeff][0]);
980 unsigned r = rf_rn[coeff + 1];
981
982 /* 13 5 bit quants in a 64 bit word. */
983 length /= 8;
984 while (length) {
985 a = *qbuf++;
986 a1 = EXTRACT(a, 0);
987 a2 = EXTRACT(a, 1);
988 a1 = r ^ q[a1];
989 a2 = r ^ q[a2];
990 new = INSERT(a2, 1) | a1;
991
992 #define M(i,j) \
993 do { \
994 a1 = EXTRACT(a, i); \
995 a2 = EXTRACT(a, j); \
996 a1 = r ^ q[a1]; \
997 a2 = r ^ q[a2]; \
998 new = new | INSERT(a1, i) | INSERT(a2, j); \
999 } while (0)
1000
1001 M(2, 3);
1002 M(4, 5);
1003 M(5, 6);
1004 #if RF_LONGSHIFT > 2
1005 M(7, 8);
1006 M(9, 10);
1007 M(11, 12);
1008 #endif /* RF_LONGSHIFT > 2 */
1009 *abuf++ = new;
1010 length--;
1011 }
1012 }
1013 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
1014