1 /*        $NetBSD: rf_pq.c,v 1.18 2023/10/15 18:15:20 oster Exp $     */
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Daniel Stodolsky
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * Code for RAID level 6 (P + Q) disk array architecture.
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.18 2023/10/15 18:15:20 oster Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
39 
40 #include <dev/raidframe/raidframevar.h>
41 
42 #include "rf_raid.h"
43 #include "rf_dag.h"
44 #include "rf_dagffrd.h"
45 #include "rf_dagffwr.h"
46 #include "rf_dagdegrd.h"
47 #include "rf_dagdegwr.h"
48 #include "rf_dagutils.h"
49 #include "rf_dagfuncs.h"
50 #include "rf_etimer.h"
51 #include "rf_pqdeg.h"
52 #include "rf_general.h"
53 #include "rf_map.h"
54 #include "rf_pq.h"
55 
56 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
57 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
58 
59 void
rf_RegularONPFunc(RF_DagNode_t * node)60 rf_RegularONPFunc(RF_DagNode_t *node)
61 {
62           rf_RegularXorFunc(node);
63 }
64 /*
65    same as simpleONQ func, but the coefficient is always 1
66 */
67 
68 void
rf_SimpleONPFunc(RF_DagNode_t * node)69 rf_SimpleONPFunc(RF_DagNode_t *node)
70 {
71           rf_SimpleXorFunc(node);
72 }
73 
74 void
rf_RecoveryPFunc(RF_DagNode_t * node)75 rf_RecoveryPFunc(RF_DagNode_t *node)
76 {
77           rf_RecoveryXorFunc(node);
78 }
79 
80 void
rf_RegularPFunc(RF_DagNode_t * node)81 rf_RegularPFunc(RF_DagNode_t *node)
82 {
83           rf_RegularXorFunc(node);
84 }
85 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
86 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
87 
88 static void
89 QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
90     unsigned char coeff);
91 static void
92 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
93     unsigned length, unsigned coeff);
94 
95 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
96 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
97 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
98 
99 void
rf_PQDagSelect(RF_Raid_t * raidPtr,RF_IoType_t type,RF_AccessStripeMap_t * asmap,RF_VoidFuncPtr * createFunc)100 rf_PQDagSelect(
101     RF_Raid_t * raidPtr,
102     RF_IoType_t type,
103     RF_AccessStripeMap_t * asmap,
104     RF_VoidFuncPtr * createFunc)
105 {
106           RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
107           unsigned ndfail = asmap->numDataFailed;
108           unsigned npfail = asmap->numParityFailed;
109           unsigned ntfail = npfail + ndfail;
110 
111           RF_ASSERT(RF_IO_IS_R_OR_W(type));
112           if (ntfail > 2) {
113                     RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
114                     *createFunc = NULL;
115                     return;
116           }
117           /* ok, we can do this I/O */
118           if (type == RF_IO_TYPE_READ) {
119                     switch (ndfail) {
120                     case 0:
121                               /* fault free read */
122                               *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;   /* same as raid 5 */
123                               break;
124                     case 1:
125                               /* lost a single data unit */
126                               /* two cases: (1) parity is not lost. do a normal raid
127                                * 5 reconstruct read. (2) parity is lost. do a
128                                * reconstruct read using "q". */
129                               if (ntfail == 2) {  /* also lost redundancy */
130                                         if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
131                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
132                                         else
133                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
134                               } else {
135                                         /* P and Q are ok. But is there a failure in
136                                          * some unaccessed data unit? */
137                                         if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
138                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
139                                         else
140                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
141                               }
142                               break;
143                     case 2:
144                               /* lost two data units */
145                               *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
146                               break;
147                     }
148                     return;
149           }
150           /* a write */
151           switch (ntfail) {
152           case 0:             /* fault free */
153                     if (rf_suppressLocksAndLargeWrites ||
154                         (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
155                               (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
156 
157                               *createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
158                     } else {
159                               *createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
160                     }
161                     break;
162 
163           case 1:             /* single disk fault */
164                     if (npfail == 1) {
165                               RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
166                               if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {          /* q died, treat like
167                                                                                                      * normal mode raid5
168                                                                                                      * write. */
169                                         if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
170                                             || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
171                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
172                                         else
173                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
174                               } else {/* parity died, small write only updating Q */
175                                         if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
176                                             || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
177                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
178                                         else
179                                                   *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
180                               }
181                     } else {  /* data missing. Do a P reconstruct write if
182                                          * only a single data unit is lost in the
183                                          * stripe, otherwise a PQ reconstruct write. */
184                               if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
185                                         *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
186                               else
187                                         *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
188                     }
189                     break;
190 
191           case 2:             /* two disk faults */
192                     switch (npfail) {
193                     case 2:   /* both p and q dead */
194                               *createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
195                               break;
196                     case 1:   /* either p or q and dead data */
197                               RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
198                               RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
199                               if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
200                                         *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
201                               else
202                                         *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
203                               break;
204                     case 0:   /* double data loss */
205                               *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
206                               break;
207                     }
208                     break;
209 
210           default:            /* more than 2 disk faults */
211                     *createFunc = NULL;
212                     RF_PANIC();
213           }
214           return;
215 }
216 /*
217    Used as a stop gap info function
218 */
#if 0
/* Reports one successor and one antecedent; all other arguments unused. */
static void
PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nAnte = 1;
	*nSucc = 1;
}

/* Reports one successor and two antecedents; all other arguments unused. */
static void
PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
233 
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)234 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
235 {
236           rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
237               rf_RegularPQFunc, RF_FALSE);
238 }
239 
240 void
rf_RegularONQFunc(RF_DagNode_t * node)241 rf_RegularONQFunc(RF_DagNode_t *node)
242 {
243           int     np = node->numParams;
244           int     d;
245           RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
246           int     i;
247           RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
248           RF_Etimer_t timer;
249           char   *qbuf, *qpbuf;
250           char   *obuf, *nbuf;
251           RF_PhysDiskAddr_t *old, *new;
252           unsigned long coeff;
253           unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
254 
255           RF_ETIMER_START(timer);
256 
257           d = (np - 3) / 4;
258           RF_ASSERT(4 * d + 3 == np);
259           qbuf = (char *) node->params[2 * d + 1].p;        /* q buffer */
260           for (i = 0; i < d; i++) {
261                     old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
262                     obuf = (char *) node->params[2 * i + 1].p;
263                     new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
264                     nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
265                     RF_ASSERT(new->numSector == old->numSector);
266                     RF_ASSERT(new->raidAddress == old->raidAddress);
267                     /* the stripe unit within the stripe tells us the coefficient
268                      * to use for the multiply. */
269                     coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
270                     /* compute the data unit offset within the column, then add
271                      * one */
272                     coeff = (coeff % raidPtr->Layout.numDataCol);
273                     qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
274                     QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
275           }
276 
277           RF_ETIMER_STOP(timer);
278           RF_ETIMER_EVAL(timer);
279           tracerec->q_us += RF_ETIMER_VAL_US(timer);
280           rf_GenericWakeupFunc(node, 0);          /* call wake func explicitly since no
281                                                    * I/O in this node */
282 }
283 /*
284    See the SimpleXORFunc for the difference between a simple and regular func.
285    These Q functions should be used for
286 
287          new q = Q(data,old data,old q)
288 
289    style updates and not for
290 
291          q = ( new data, new data, .... )
292 
293    computations.
294 
295    The simple q takes 2(2d+1)+1 params, where d is the number
296    of stripes written. The order of params is
297    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
298    [2d] old q pda_0, old q buffer
299    [2d+2] new data pda_0, new data buffer_0, ...                                    new data pda_d, new data buffer_d
300    raidPtr
301 */
302 
303 void
rf_SimpleONQFunc(RF_DagNode_t * node)304 rf_SimpleONQFunc(RF_DagNode_t *node)
305 {
306           int     np = node->numParams;
307           int     d;
308           RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
309           int     i;
310           RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
311           RF_Etimer_t timer;
312           char   *qbuf;
313           char   *obuf, *nbuf;
314           RF_PhysDiskAddr_t *old, *new;
315           unsigned long coeff;
316 
317           RF_ETIMER_START(timer);
318 
319           d = (np - 3) / 4;
320           RF_ASSERT(4 * d + 3 == np);
321           qbuf = (char *) node->params[2 * d + 1].p;        /* q buffer */
322           for (i = 0; i < d; i++) {
323                     old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
324                     obuf = (char *) node->params[2 * i + 1].p;
325                     new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
326                     nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
327                     RF_ASSERT(new->numSector == old->numSector);
328                     RF_ASSERT(new->raidAddress == old->raidAddress);
329                     /* the stripe unit within the stripe tells us the coefficient
330                      * to use for the multiply. */
331                     coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
332                     /* compute the data unit offset within the column, then add
333                      * one */
334                     coeff = (coeff % raidPtr->Layout.numDataCol);
335                     QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
336           }
337 
338           RF_ETIMER_STOP(timer);
339           RF_ETIMER_EVAL(timer);
340           tracerec->q_us += RF_ETIMER_VAL_US(timer);
341           rf_GenericWakeupFunc(node, 0);          /* call wake func explicitly since no
342                                                    * I/O in this node */
343 }
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)344 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
345 {
346           rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
347 }
348 
349 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
350 
351 static void
RegularQSubr(RF_DagNode_t * node,char * qbuf)352 RegularQSubr(RF_DagNode_t *node, char *qbuf)
353 {
354           int     np = node->numParams;
355           int     d;
356           RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
357           unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
358           int     i;
359           RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
360           RF_Etimer_t timer;
361           char   *obuf, *qpbuf;
362           RF_PhysDiskAddr_t *old;
363           unsigned long coeff;
364 
365           RF_ETIMER_START(timer);
366 
367           d = (np - 1) / 2;
368           RF_ASSERT(2 * d + 1 == np);
369           for (i = 0; i < d; i++) {
370                     old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
371                     obuf = (char *) node->params[2 * i + 1].p;
372                     coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
373                     /* compute the data unit offset within the column, then add
374                      * one */
375                     coeff = (coeff % raidPtr->Layout.numDataCol);
376                     /* the input buffers may not all be aligned with the start of
377                      * the stripe. so shift by their sector offset within the
378                      * stripe unit */
379                     qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
380                     rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
381           }
382 
383           RF_ETIMER_STOP(timer);
384           RF_ETIMER_EVAL(timer);
385           tracerec->q_us += RF_ETIMER_VAL_US(timer);
386 }
387 /*
388    used in degraded writes.
389 */
390 
391 static void DegrQSubr(RF_DagNode_t *node);
392 
393 static void
DegrQSubr(RF_DagNode_t * node)394 DegrQSubr(RF_DagNode_t *node)
395 {
396           int     np = node->numParams;
397           int     d;
398           RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
399           unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
400           int     i;
401           RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
402           RF_Etimer_t timer;
403           char   *qbuf = node->results[1];
404           char   *obuf, *qpbuf;
405           RF_PhysDiskAddr_t *old;
406           unsigned long coeff;
407           unsigned fail_start;
408           int     j;
409 
410           old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
411           fail_start = old->startSector % secPerSU;
412 
413           RF_ETIMER_START(timer);
414 
415           d = (np - 2) / 2;
416           RF_ASSERT(2 * d + 2 == np);
417           for (i = 0; i < d; i++) {
418                     old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
419                     obuf = (char *) node->params[2 * i + 1].p;
420                     coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
421                     /* compute the data unit offset within the column, then add
422                      * one */
423                     coeff = (coeff % raidPtr->Layout.numDataCol);
424                     /* the input buffers may not all be aligned with the start of
425                      * the stripe. so shift by their sector offset within the
426                      * stripe unit */
427                     j = old->startSector % secPerSU;
428                     RF_ASSERT(j >= fail_start);
429                     qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
430                     rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
431           }
432 
433           RF_ETIMER_STOP(timer);
434           RF_ETIMER_EVAL(timer);
435           tracerec->q_us += RF_ETIMER_VAL_US(timer);
436 }
437 /*
438    Called by large write code to compute the new parity and the new q.
439 
440    structure of the params:
441 
442    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
443    raidPtr
444 
445    for a total of 2d+1 arguments.
446    The result buffers results[0], results[1] are the buffers for the p and q,
447    respectively.
448 
449    We compute Q first, then compute P. The P calculation may try to reuse
450    one of the input buffers for its output, so if we computed P first, we would
451    corrupt the input for the q calculation.
452 */
453 
454 void
rf_RegularPQFunc(RF_DagNode_t * node)455 rf_RegularPQFunc(RF_DagNode_t *node)
456 {
457           RegularQSubr(node, node->results[1]);
458           rf_RegularXorFunc(node);      /* does the wakeup */
459 }
460 
461 void
rf_RegularQFunc(RF_DagNode_t * node)462 rf_RegularQFunc(RF_DagNode_t *node)
463 {
464           /* Almost ... adjust Qsubr args */
465           RegularQSubr(node, node->results[0]);
466           rf_GenericWakeupFunc(node, 0);          /* call wake func explicitly since no
467                                                    * I/O in this node */
468 }
469 /*
470    Called by singly degraded write code to compute the new parity and the new q.
471 
472    structure of the params:
473 
474    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
475    failedPDA raidPtr
476 
477    for a total of 2d+2 arguments.
478    The result buffers results[0], results[1] are the buffers for the parity and q,
479    respectively.
480 
481    We compute Q first, then compute parity. The parity calculation may try to reuse
482    one of the input buffers for its output, so if we computed parity first, we would
483    corrupt the input for the q calculation.
484 
485    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
486 */
487 
488 void
rf_Degraded_100_PQFunc(RF_DagNode_t * node)489 rf_Degraded_100_PQFunc(RF_DagNode_t *node)
490 {
491           int     np = node->numParams;
492 
493           RF_ASSERT(np >= 2);
494           DegrQSubr(node);
495           rf_RecoveryXorFunc(node);
496 }
497 
498 
499 /*
500    The two below are used when reading a stripe with a single lost data unit.
501    The parameters are
502 
503    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
504 
505    and results[0] contains the data buffer. Which is originally zero-filled.
506 
507 */
508 
509 /* this Q func is used by the degraded-mode dag functions to recover lost data.
510  * the second-to-last parameter is the PDA for the failed portion of the access.
511  * the code here looks at this PDA and assumes that the xor target buffer is
512  * equal in size to the number of sectors in the failed PDA.  It then uses
513  * the other PDAs in the parameter list to determine where within the target
514  * buffer the corresponding data should be xored.
515  *
516  * Recall the basic equation is
517  *
518  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
519  *
520  * so to recover data_j we need
521  *
522  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
523  *
524  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
525  * copying Q into it. Then we need to do a table lookup to convert to solve
526  *   data_j /= J
527  *
528  *
529  */
530 void
rf_RecoveryQFunc(RF_DagNode_t * node)531 rf_RecoveryQFunc(RF_DagNode_t *node)
532 {
533           RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
534           RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
535           RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
536           int     i;
537           RF_PhysDiskAddr_t *pda = NULL;
538           RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
539           char   *srcbuf, *destbuf;
540           RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
541           RF_Etimer_t timer;
542           unsigned long coeff;
543 
544           RF_ETIMER_START(timer);
545           /* start by copying Q into the buffer */
546           memcpy(node->results[0], node->params[node->numParams - 3].p,
547               rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
548           for (i = 0; i < node->numParams - 4; i += 2) {
549                     RF_ASSERT(node->params[i + 1].p != node->results[0]);
550                     pda = (RF_PhysDiskAddr_t *) node->params[i].p;
551                     srcbuf = (char *) node->params[i + 1].p;
552                     suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
553                     destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
554                     coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
555                     /* compute the data unit offset within the column */
556                     coeff = (coeff % raidPtr->Layout.numDataCol);
557                     rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
558           }
559           /* Do the nasty inversion now */
560           coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
561           rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
562           RF_ETIMER_STOP(timer);
563           RF_ETIMER_EVAL(timer);
564           tracerec->q_us += RF_ETIMER_VAL_US(timer);
565           rf_GenericWakeupFunc(node, 0);
566 }
567 
568 void
rf_RecoveryPQFunc(RF_DagNode_t * node)569 rf_RecoveryPQFunc(RF_DagNode_t *node)
570 {
571           RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
572           printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
573           /* XXX: Was: */
574           /* return (1); */
575 }
576 /*
577    Degraded write Q subroutine.
578    Used when P is dead.
579    Large-write style Q computation.
580    Parameters
581 
582    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
583 
584    We ignore failedPDA.
585 
586    This is a "simple style" recovery func.
587 */
588 
589 void
rf_PQ_DegradedWriteQFunc(RF_DagNode_t * node)590 rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node)
591 {
592           int     np = node->numParams;
593           int     d;
594           RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
595           unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
596           int     i;
597           RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
598           RF_Etimer_t timer;
599           char   *qbuf = node->results[0];
600           char   *obuf, *qpbuf;
601           RF_PhysDiskAddr_t *old;
602           unsigned long coeff;
603           int     fail_start, j;
604 
605           old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
606           fail_start = old->startSector % secPerSU;
607 
608           RF_ETIMER_START(timer);
609 
610           d = (np - 2) / 2;
611           RF_ASSERT(2 * d + 2 == np);
612 
613           for (i = 0; i < d; i++) {
614                     old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
615                     obuf = (char *) node->params[2 * i + 1].p;
616                     coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
617                     /* compute the data unit offset within the column, then add
618                      * one */
619                     coeff = (coeff % raidPtr->Layout.numDataCol);
620                     j = old->startSector % secPerSU;
621                     RF_ASSERT(j >= fail_start);
622                     qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
623                     rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
624           }
625 
626           RF_ETIMER_STOP(timer);
627           RF_ETIMER_EVAL(timer);
628           tracerec->q_us += RF_ETIMER_VAL_US(timer);
629           rf_GenericWakeupFunc(node, 0);
630 }
631 
632 
633 
634 
635 /* Q computations */
636 
637 /*
638    coeff - column;
639 
640    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
641 
642    on 5-bit basis;
643    length in bytes;
644 */
645 
646 void
rf_IncQ(unsigned long * dest,unsigned long * buf,unsigned length,unsigned coeff)647 rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length, unsigned coeff)
648 {
649           unsigned long a, d, new;
650           unsigned long a1, a2;
651           unsigned int *q = &(rf_qfor[28 - coeff][0]);
652           unsigned r = rf_rn[coeff + 1];
653 
654 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
655 #define INSERT(a,i) (a << (5L*i))
656 
657           length /= 8;
658           /* 13 5 bit quants in a 64 bit word */
659           while (length) {
660                     a = *buf++;
661                     d = *dest;
662                     a1 = EXTRACT(a, 0) ^ r;
663                     a2 = EXTRACT(a, 1) ^ r;
664                     new = INSERT(a2, 1) | a1;
665                     a1 = EXTRACT(a, 2) ^ r;
666                     a2 = EXTRACT(a, 3) ^ r;
667                     a1 = q[a1];
668                     a2 = q[a2];
669                     new = new | INSERT(a1, 2) | INSERT(a2, 3);
670                     a1 = EXTRACT(a, 4) ^ r;
671                     a2 = EXTRACT(a, 5) ^ r;
672                     a1 = q[a1];
673                     a2 = q[a2];
674                     new = new | INSERT(a1, 4) | INSERT(a2, 5);
675                     a1 = EXTRACT(a, 5) ^ r;
676                     a2 = EXTRACT(a, 6) ^ r;
677                     a1 = q[a1];
678                     a2 = q[a2];
679                     new = new | INSERT(a1, 5) | INSERT(a2, 6);
680 #if RF_LONGSHIFT > 2
681                     a1 = EXTRACT(a, 7) ^ r;
682                     a2 = EXTRACT(a, 8) ^ r;
683                     a1 = q[a1];
684                     a2 = q[a2];
685                     new = new | INSERT(a1, 7) | INSERT(a2, 8);
686                     a1 = EXTRACT(a, 9) ^ r;
687                     a2 = EXTRACT(a, 10) ^ r;
688                     a1 = q[a1];
689                     a2 = q[a2];
690                     new = new | INSERT(a1, 9) | INSERT(a2, 10);
691                     a1 = EXTRACT(a, 11) ^ r;
692                     a2 = EXTRACT(a, 12) ^ r;
693                     a1 = q[a1];
694                     a2 = q[a2];
695                     new = new | INSERT(a1, 11) | INSERT(a2, 12);
696 #endif                                  /* RF_LONGSHIFT > 2 */
697                     d ^= new;
698                     *dest++ = d;
699                     length--;
700           }
701 }
702 /*
703    compute
704 
705    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
706 
707    on a five bit basis.
708    optimization: compute old ^ new on 64 bit basis.
709 
710    length in bytes.
711 */
712 
713 static void
QDelta(char * dest,char * obuf,char * nbuf,unsigned length,unsigned char coeff)714 QDelta(
715     char *dest,
716     char *obuf,
717     char *nbuf,
718     unsigned length,
719     unsigned char coeff)
720 {
721 #ifndef _KERNEL
722           unsigned long a, d, new;
723           unsigned long a1, a2;
724           unsigned int *q = &(rf_qfor[28 - coeff][0]);
725           unsigned int r = rf_rn[coeff + 1];
726 
727           r = a1 = a2 = new = d = a = 0; /* XXX for now... */
728           q = NULL; /* XXX for now */
729 #endif
730 #ifdef _KERNEL
731           /* PQ in kernel currently not supported because the encoding/decoding
732            * table is not present */
733           memset(dest, 0, length);
734 #else                                   /* KERNEL */
735           /* this code probably doesn't work and should be rewritten  -wvcii */
736           /* 13 5 bit quants in a 64 bit word */
737           length /= 8;
738           while (length) {
739                     a = *obuf++;        /* XXX need to reorg to avoid cache conflicts */
740                     a ^= *nbuf++;
741                     d = *dest;
742                     a1 = EXTRACT(a, 0) ^ r;
743                     a2 = EXTRACT(a, 1) ^ r;
744                     a1 = q[a1];
745                     a2 = q[a2];
746                     new = INSERT(a2, 1) | a1;
747                     a1 = EXTRACT(a, 2) ^ r;
748                     a2 = EXTRACT(a, 3) ^ r;
749                     a1 = q[a1];
750                     a2 = q[a2];
751                     new = new | INSERT(a1, 2) | INSERT(a2, 3);
752                     a1 = EXTRACT(a, 4) ^ r;
753                     a2 = EXTRACT(a, 5) ^ r;
754                     a1 = q[a1];
755                     a2 = q[a2];
756                     new = new | INSERT(a1, 4) | INSERT(a2, 5);
757                     a1 = EXTRACT(a, 5) ^ r;
758                     a2 = EXTRACT(a, 6) ^ r;
759                     a1 = q[a1];
760                     a2 = q[a2];
761                     new = new | INSERT(a1, 5) | INSERT(a2, 6);
762 #if RF_LONGSHIFT > 2
763                     a1 = EXTRACT(a, 7) ^ r;
764                     a2 = EXTRACT(a, 8) ^ r;
765                     a1 = q[a1];
766                     a2 = q[a2];
767                     new = new | INSERT(a1, 7) | INSERT(a2, 8);
768                     a1 = EXTRACT(a, 9) ^ r;
769                     a2 = EXTRACT(a, 10) ^ r;
770                     a1 = q[a1];
771                     a2 = q[a2];
772                     new = new | INSERT(a1, 9) | INSERT(a2, 10);
773                     a1 = EXTRACT(a, 11) ^ r;
774                     a2 = EXTRACT(a, 12) ^ r;
775                     a1 = q[a1];
776                     a2 = q[a2];
777                     new = new | INSERT(a1, 11) | INSERT(a2, 12);
778 #endif                                  /* RF_LONGSHIFT > 2 */
779                     d ^= new;
780                     *dest++ = d;
781                     length--;
782           }
783 #endif                                  /* _KERNEL */
784 }
785 /*
786    recover columns a and b from the given p and q into
787    bufs abuf and bbuf. All bufs are word aligned.
788    Length is in bytes.
789 */
790 
791 
792 /*
793  * XXX
794  *
795  * Everything about this seems wrong.
796  */
797 void
rf_PQ_recover(unsigned long * pbuf,unsigned long * qbuf,unsigned long * abuf,unsigned long * bbuf,unsigned length,unsigned coeff_a,unsigned coeff_b)798 rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf, unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b)
799 {
800           unsigned long p, q, a, a0, a1;
801           int     col = (29 * coeff_a) + coeff_b;
802           unsigned char *q0 = &(rf_qinv[col][0]);
803 
804           length /= 8;
805           while (length) {
806                     p = *pbuf++;
807                     q = *qbuf++;
808                     a0 = EXTRACT(p, 0);
809                     a1 = EXTRACT(q, 0);
810                     a = q0[a0 << 5 | a1];
811 #define MF(i) \
812       a0 = EXTRACT(p,i); \
813       a1 = EXTRACT(q,i); \
814       a  = a | INSERT(q0[a0<<5 | a1],i)
815 
816                     MF(1);
817                     MF(2);
818                     MF(3);
819                     MF(4);
820                     MF(5);
821                     MF(6);
822 #if 0
823                     MF(7);
824                     MF(8);
825                     MF(9);
826                     MF(10);
827                     MF(11);
828                     MF(12);
829 #endif                                  /* 0 */
830                     *abuf++ = a;
831                     *bbuf++ = a ^ p;
832                     length--;
833           }
834 }
835 /*
836    Lost parity and a data column. Recover that data column.
837    Assume col coeff is lost. Let q the contents of Q after
838    all surviving data columns have been q-xored out of it.
839    Then we have the equation
840 
841    q[28-coeff][a_i ^ r_i+1] = q
842 
843    but q is cyclic with period 31.
844    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
845       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
846 
847    so a_i = r_{coeff+1} ^ q[3+coeff][q]
848 
849    The routine is passed q buffer and the buffer
850    the data is to be recoverd into. They can be the same.
851 */
852 
853 
854 
855 static void
rf_InvertQ(unsigned long * qbuf,unsigned long * abuf,unsigned length,unsigned coeff)856 rf_InvertQ(
857     unsigned long *qbuf,
858     unsigned long *abuf,
859     unsigned length,
860     unsigned coeff)
861 {
862           unsigned long a, new;
863           unsigned long a1, a2;
864           unsigned int *q = &(rf_qfor[3 + coeff][0]);
865           unsigned r = rf_rn[coeff + 1];
866 
867           /* 13 5 bit quants in a 64 bit word */
868           length /= 8;
869           while (length) {
870                     a = *qbuf++;
871                     a1 = EXTRACT(a, 0);
872                     a2 = EXTRACT(a, 1);
873                     a1 = r ^ q[a1];
874                     a2 = r ^ q[a2];
875                     new = INSERT(a2, 1) | a1;
876 #define M(i,j) \
877       a1 = EXTRACT(a,i); \
878       a2 = EXTRACT(a,j); \
879       a1 = r ^ q[a1]; \
880       a2 = r ^ q[a2]; \
881       new = new | INSERT(a1,i) | INSERT(a2,j)
882 
883                     M(2, 3);
884                     M(4, 5);
885                     M(5, 6);
886 #if RF_LONGSHIFT > 2
887                     M(7, 8);
888                     M(9, 10);
889                     M(11, 12);
890 #endif                                  /* RF_LONGSHIFT > 2 */
891                     *abuf++ = new;
892                     length--;
893           }
894 }
895 #endif                                  /* (RF_INCLUDE_DECL_PQ > 0) ||
896                                          * (RF_INCLUDE_RAID6 > 0) */
897