xref: /dragonfly/sys/dev/raid/vinum/vinumraid5.c (revision 5c0a88654f01120458c4ef12ced712b93a5d6c4c)
1 /*-
2  * Copyright (c) 1997, 1998
3  *        Cybernet Corporation and Nan Yang Computer Services Limited.
4  *      All rights reserved.
5  *
6  *  This software was developed as part of the NetMAX project.
7  *
8  *  Written by Greg Lehey
9  *
10  *  This software is distributed under the so-called ``Berkeley
11  *  License'':
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *        This product includes software developed by Cybernet Corporation
24  *      and Nan Yang Computer Services Limited
25  * 4. Neither the name of the Companies nor the names of its contributors
26  *    may be used to endorse or promote products derived from this software
27  *    without specific prior written permission.
28  *
29  * This software is provided ``as is'', and any express or implied
30  * warranties, including, but not limited to, the implied warranties of
31  * merchantability and fitness for a particular purpose are disclaimed.
32  * In no event shall the company or contributors be liable for any
33  * direct, indirect, incidental, special, exemplary, or consequential
34  * damages (including, but not limited to, procurement of substitute
35  * goods or services; loss of use, data, or profits; or business
36  * interruption) however caused and on any theory of liability, whether
37  * in contract, strict liability, or tort (including negligence or
38  * otherwise) arising in any way out of the use of this software, even if
39  * advised of the possibility of such damage.
40  *
41  * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $
42  * $FreeBSD: src/sys/dev/vinum/vinumraid5.c,v 1.6.2.2 2001/03/13 02:59:43 grog Exp $
43  */
44 #include "vinumhdr.h"
45 #include "request.h"
46 #include <sys/resourcevar.h>
47 
48 /*
49  * Working values for the stripe that bre5() is
50  * currently processing.  They are only used for
51  * calculation, but are kept in one struct so that
52  * they can be handed to helpers such as setrqebounds().
53  */
54 struct metrics {
55     vinum_off_t stripebase;                                     /* plex-relative address of the start of the stripe */
56     int stripeoffset;                                           /* offset of the start address from the start of the stripe */
57     int stripesectors;                                          /* sectors still to transfer in this stripe; counted down per block */
58     vinum_off_t sdbase;                                         /* offset of the stripe start relative to each subdisk */
59     int sdcount;                                                /* data subdisks involved; bumped by one when a parity request is added */
60     vinum_off_t diskstart;                                      /* value of *diskaddr on entry, basis for useroffset */
61     int psdno;                                                  /* index in the plex of the parity subdisk */
62     int badsdno;                                                /* index in the plex of the down subdisk, -1 if there is none */
63     int firstsdno;                                              /* index in the plex of the first data subdisk */
64     /* These correspond to the fields in rqelement, sort of */
65     int useroffset;                                             /* offset of the current block in the user buffer */
66     /*
67      * Offset and length of the first data block.  These are
68      * kept so that dataoffset and datalen can be reset from them.
69      */
70     int initoffset;                                             /* offset of the start inside the first block */
71     int initlen;                                                /* sectors in the first block; int like datalen so umin() results are not narrowed */
72     /* Bounds of the data block currently being looked at */
73     int dataoffset;                                             /* offset inside the current block */
74     int datalen;                                                /* length in sectors of the current block */
75     /* Bounds of a group operation (recovery read, degraded write) */
76     int groupoffset;                                            /* subdisk offset of group operation */
77     int grouplen;                                               /* length in sectors of group operation */
78     /* Overall span of a normal write, used for the parity block */
79     int writeoffset;                                            /* lowest block offset touched by the write */
80     int writelen;                                               /* length in sectors from writeoffset to the highest end */
81     enum xferinfo flags;                                        /* XFR_* bits describing the transfers to perform */
82     int rqcount;                                                /* number of elements in the request group */
83 };
84 
85 enum requeststatus bre5(struct request *rq,                   /* build the low-level requests for one RAID-4/5 plex */
86     int plexno,
87     vinum_off_t * diskaddr,                                     /* read and advanced by bre5(); named as in the definition */
88     vinum_off_t diskend);
89 void complete_raid5_write(struct rqelement *);
90 enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); /* on a nonzero result bre5() returns REQUEST_ENOMEM */
91 void setrqebounds(struct rqelement *rqe, struct metrics *mp); /* set up the transfer bounds of rqe from *mp */
92 
93 /*
94  * define the low-level requests needed to perform
95  * a high-level I/O operation for a specific plex
96  * 'plexno'.
97  *
98  * Return 0 if all subdisks involved in the
99  * request are up, 1 if some subdisks are not up,
100  * and -1 if the request is at least partially
101  * outside the bounds of the subdisks.
102  *
103  * Modify the pointer *diskaddr to point to the
104  * end address.  On read, return on the first bad
105  * subdisk, so that the caller
106  * (build_read_request) can try alternatives.
107  *
108  * On entry to this routine, the prq structures
109  * are not assigned.  The assignment is performed
110  * by expandrq().  Strictly speaking, the elements
111  * rqe->sdno of all entries should be set to -1,
112  * since 0 (from bzero) is a valid subdisk number.
113  * We avoid this problem by initializing the ones
114  * we use, and not looking at the others (index >=
115  * prq->requests).
116  */
117 enum requeststatus
bre5(struct request * rq,int plexno,vinum_off_t * diskaddr,vinum_off_t diskend)118 bre5(struct request *rq,
119     int plexno,
120     vinum_off_t * diskaddr,
121     vinum_off_t diskend)
122 {
123     struct metrics m;                                                     /* most of the information */
124     struct sd *sd;
125     struct plex *plex;
126     struct bio *bio;                                                      /* user's bp */
127     struct buf *bp;
128     struct rqgroup *rqg;                                        /* the request group that we will create */
129     struct rqelement *rqe;                                      /* point to this request information */
130     int rsectors;                                               /* sectors remaining in this stripe */
131     int mysdno;                                                           /* another sd index in loops */
132     int rqno;                                                             /* request number */
133 
134     rqg = NULL;                                                           /* shut up, damn compiler */
135     m.diskstart = *diskaddr;                                    /* start of transfer */
136     bio = rq->bio;                                              /* buffer pointer */
137     bp = bio->bio_buf;
138     plex = &PLEX[plexno];                                       /* point to the plex */
139 
140 
141     while (*diskaddr < diskend) {                               /* until we get it all sorted out */
142           if (*diskaddr >= plex->length)                                  /* beyond the end of the plex */
143               return REQUEST_EOF;                                         /* can't continue */
144 
145           m.badsdno = -1;                                                 /* no bad subdisk yet */
146 
147           /* Part A: Define the request */
148           /*
149            * First, calculate some sizes:
150            * The offset of the start address from
151            * the start of the stripe.
152            */
153           m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));
154 
155           /*
156            * The plex-relative address of the
157            * start of the stripe.
158            */
159           m.stripebase = *diskaddr - m.stripeoffset;
160 
161           /* subdisk containing the parity stripe */
162           if (plex->organization == plex_raid5)
163               m.psdno = plex->subdisks - 1
164                     - (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
165                     % plex->subdisks;
166           else                                                            /* RAID-4 */
167               m.psdno = plex->subdisks - 1;
168 
169           /*
170            * The number of the subdisk in which
171            * the start is located.
172            */
173           m.firstsdno = m.stripeoffset / plex->stripesize;
174           if (m.firstsdno >= m.psdno)                           /* at or past parity sd */
175               m.firstsdno++;                                    /* increment it */
176 
177           /*
178            * The offset from the beginning of
179            * the stripe on this subdisk.
180            */
181           m.initoffset = m.stripeoffset % plex->stripesize;
182 
183           /* The offset of the stripe start relative to this subdisk */
184           m.sdbase = m.stripebase / (plex->subdisks - 1);
185 
186           m.useroffset = *diskaddr - m.diskstart;               /* The offset of the start in the user buffer */
187 
188           /*
189            * The number of sectors to transfer in the
190            * current (first) subdisk.
191            */
192           m.initlen = umin(diskend - *diskaddr,                 /* the amount remaining to transfer */
193               plex->stripesize - m.initoffset);                 /* and the amount left in this block */
194 
195           /*
196            * The number of sectors to transfer in this stripe
197            * is the minimum of the amount remaining to transfer
198            * and the amount left in this stripe.
199            */
200           m.stripesectors = umin(diskend - *diskaddr,
201               plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);
202 
203           /* The number of data subdisks involved in this request */
204           m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
205 
206           /* Part B: decide what kind of transfer this will be.
207 
208            * start and end addresses of the transfer in
209            * the current block.
210            *
211            * There are a number of different kinds of
212            * transfer, each of which relates to a
213            * specific subdisk:
214            *
215            * 1. Normal read.  All participating subdisks
216            *    are up, and the transfer can be made
217            *    directly to the user buffer.  The bounds
218            *    of the transfer are described by
219            *    m.dataoffset and m.datalen.  We have
220            *    already calculated m.initoffset and
221            *    m.initlen, which define the parameters
222            *    for the first data block.
223            *
224            * 2. Recovery read.  One participating
225            *    subdisk is down.  To recover data, all
226            *    the other subdisks, including the parity
227            *    subdisk, must be read.  The data is
228            *    recovered by exclusive-oring all the
229            *    other blocks.  The bounds of the
230            *    transfer are described by m.groupoffset
231            *    and m.grouplen.
232            *
233            * 3. A read request may request reading both
234            *    available data (normal read) and
235            *    non-available data (recovery read).
236            *    This can be a problem if the address
237            *    ranges of the two reads do not coincide:
238            *    in this case, the normal read needs to
239            *    be extended to cover the address range
240            *    of the recovery read, and must thus be
241            *    performed out of malloced memory.
242            *
243            * 4. Normal write.  All the participating
244            *    subdisks are up.  The bounds of the
245            *    transfer are described by m.dataoffset
246            *    and m.datalen.  Since these values
247            *    differ for each block, we calculate the
248            *    bounds for the parity block
249            *    independently as the maximum of the
250            *    individual blocks and store these values
251            *    in m.writeoffset and m.writelen.  This
252            *    write proceeds in four phases:
253            *
254            *    i.  Read the old contents of each block
255            *        and the parity block.
256            *    ii.  ``Remove'' the old contents from
257            *         the parity block with exclusive or.
258            *    iii. ``Insert'' the new contents of the
259            *          block in the parity block, again
260            *          with exclusive or.
261            *
262            *    iv.  Write the new contents of the data
263            *         blocks and the parity block.  The data
264            *         block transfers can be made directly from
265            *         the user buffer.
266            *
267            * 5. Degraded write where the data block is
268            *    not available.  The bounds of the
269            *    transfer are described by m.groupoffset
270            *    and m.grouplen. This requires the
271            *    following steps:
272            *
273            *    i.  Read in all the other data blocks,
274            *        excluding the parity block.
275            *
276            *    ii.  Recreate the parity block from the
277            *         other data blocks and the data to be
278            *         written.
279            *
280            *    iii. Write the parity block.
281            *
282            * 6. Parityless write, a write where the
283            *    parity block is not available.  This is
284            *    in fact the simplest: just write the
285            *    data blocks.  This can proceed directly
286            *    from the user buffer.  The bounds of the
287            *    transfer are described by m.dataoffset
288            *    and m.datalen.
289            *
290            * 7. Combination of degraded data block write
291            *    and normal write.  In this case the
292            *    address ranges of the reads may also
293            *    need to be extended to cover all
294            *    participating blocks.
295            *
296            * All requests in a group transfer transfer
297            * the same address range relative to their
298            * subdisk.  The individual transfers may
299            * vary, but since our group of requests is
300            * all in a single slice, we can define a
301            * range in which they all fall.
302            *
303            * In the following code section, we determine
304            * which kind of transfer we will perform.  If
305            * there is a group transfer, we also decide
306            * its bounds relative to the subdisks.  At
307            * the end, we have the following values:
308            *
309            *  m.flags indicates the kinds of transfers
310            *    we will perform.
311            *  m.initoffset indicates the offset of the
312            *    beginning of any data operation relative
313            *    to the beginning of the stripe base.
314            *  m.initlen specifies the length of any data
315            *    operation.
316            *  m.dataoffset contains the same value as
317            *    m.initoffset.
318            *  m.datalen contains the same value as
319            *    m.initlen.  Initially dataoffset and
320            *    datalen describe the parameters for the
321            *    first data block; while building the data
322            *    block requests, they are updated for each
323            *    block.
324            *  m.groupoffset indicates the offset of any
325            *    group operation relative to the beginning
326            *    of the stripe base.
327            *  m.grouplen specifies the length of any
328            *    group operation.
329            *  m.writeoffset indicates the offset of a
330            *    normal write relative to the beginning of
331            *    the stripe base.  This value differs from
332            *    m.dataoffset in that it applies to the
333            *    entire operation, and not just the first
334            *    block.
335            *  m.writelen specifies the total span of a
336            *    normal write operation.  writeoffset and
337            *    writelen are used to define the parity
338            *    block.
339            */
340           m.groupoffset = 0;                                    /* assume no group... */
341           m.grouplen = 0;                                                 /* until we know we have one */
342           m.writeoffset = m.initoffset;                         /* start offset of transfer */
343           m.writelen = 0;                                                 /* nothing to write yet */
344           m.flags = 0;                                                    /* no flags yet */
345           rsectors = m.stripesectors;                           /* remaining sectors to examine */
346           m.dataoffset = m.initoffset;                          /* start at the beginning of the transfer */
347           m.datalen = m.initlen;
348 
349           if (m.sdcount > 1) {
350               plex->multiblock++;                                         /* more than one block for the request */
351               /*
352                * If we have two transfers that don't overlap,
353                * (one at the end of the first block, the other
354                * at the beginning of the second block),
355                * it's cheaper to split them.
356                */
357               if (rsectors < plex->stripesize) {
358                     m.sdcount = 1;                                        /* just one subdisk */
359                     m.stripesectors = m.initlen;                /* and just this many sectors */
360                     rsectors = m.initlen;                                 /* and in the loop counter */
361               }
362           }
363           if (SD[plex->sdnos[m.psdno]].state < sd_reborn)       /* is our parity subdisk down? */
364               m.badsdno = m.psdno;                              /* note that it's down */
365           if (bp->b_cmd == BUF_CMD_READ) {                      /* read operation */
366               for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
367                     if (mysdno == m.psdno)                                /* ignore parity on read */
368                         mysdno++;
369                     if (mysdno == plex->subdisks)               /* wraparound */
370                         mysdno = 0;
371                     if (mysdno == m.psdno)                                /* parity, */
372                         mysdno++;                                         /* we've given already */
373 
374                     if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
375                         if (m.badsdno >= 0)                               /* we had one already, */
376                               return REQUEST_DOWN;                        /* we can't take a second */
377                         m.badsdno = mysdno;                               /* got the first */
378                         m.groupoffset = m.dataoffset;           /* define the bounds */
379                         m.grouplen = m.datalen;
380                         m.flags |= XFR_RECOVERY_READ;           /* we need recovery */
381                         plex->recovered_reads++;                /* count another one */
382                     } else
383                         m.flags |= XFR_NORMAL_READ;                       /* normal read */
384 
385                     /* Update the pointers for the next block */
386                     m.dataoffset = 0;                           /* back to the start of the stripe */
387                     rsectors -= m.datalen;                                /* remaining sectors to examine */
388                     m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
389               }
390           } else {                                              /* write operation */
391               for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
392                     if (mysdno == m.psdno)                                /* parity stripe, we've dealt with that */
393                         mysdno++;
394                     if (mysdno == plex->subdisks)               /* wraparound */
395                         mysdno = 0;
396                     if (mysdno == m.psdno)                                /* parity, */
397                         mysdno++;                                         /* we've given already */
398 
399                     sd = &SD[plex->sdnos[mysdno]];
400                     if (sd->state != sd_up) {
401                         enum requeststatus s;
402 
403                         s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
404                         if (s && (m.badsdno >= 0)) {            /* second bad disk, */
405                               int sdno;
406                               /*
407                                * If the parity disk is down, there's
408                                * no recovery.  We make all involved
409                                * subdisks stale.  Otherwise, we
410                                * should be able to recover, but it's
411                                * like pulling teeth.  Fix it later.
412                                */
413                               for (sdno = 0; sdno < m.sdcount; sdno++) {
414                                   struct sd *sd = &SD[plex->sdnos[sdno]];
415                                   if (sd->state >= sd_reborn)             /* sort of up, */
416                                         set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
417                               }
418                               return s;                         /* and crap out */
419                         }
420                         m.badsdno = mysdno;                               /* note which one is bad */
421                         m.flags |= XFR_DEGRADED_WRITE;          /* we need recovery */
422                         plex->degraded_writes++;                /* count another one */
423                         m.groupoffset = m.dataoffset;           /* define the bounds */
424                         m.grouplen = m.datalen;
425                     } else {
426                         m.flags |= XFR_NORMAL_WRITE;            /* normal write operation */
427                         if (m.writeoffset > m.dataoffset) {     /* move write operation lower */
428                               m.writelen = umax(m.writeoffset + m.writelen,
429                                   m.dataoffset + m.datalen)
430                                   - m.dataoffset;
431                               m.writeoffset = m.dataoffset;
432                         } else
433                               m.writelen = umax(m.writeoffset + m.writelen,
434                                   m.dataoffset + m.datalen)
435                                   - m.writeoffset;
436                     }
437 
438                     /* Update the pointers for the next block */
439                     m.dataoffset = 0;                           /* back to the start of the stripe */
440                     rsectors -= m.datalen;                                /* remaining sectors to examine */
441                     m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
442               }
443               if (m.badsdno == m.psdno) {                                 /* got a bad parity block, */
444                     struct sd *psd = &SD[plex->sdnos[m.psdno]];
445 
446                     if (psd->state == sd_down)
447                         set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
448                     else if (psd->state == sd_crashed)
449                         set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
450                     m.flags &= ~XFR_NORMAL_WRITE;               /* this write isn't normal, */
451                     m.flags |= XFR_PARITYLESS_WRITE;            /* it's parityless */
452                     plex->parityless_writes++;                  /* count another one */
453               }
454           }
455 
456           /* reset the initial transfer values */
457           m.dataoffset = m.initoffset;                          /* start at the beginning of the transfer */
458           m.datalen = m.initlen;
459 
460           /* decide how many requests we need */
461           if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
462               /* doing a recovery read or degraded write, */
463               m.rqcount = plex->subdisks;                                 /* all subdisks */
464           else if (m.flags & XFR_NORMAL_WRITE)                  /* normal write, */
465               m.rqcount = m.sdcount + 1;                                  /* all data blocks and the parity block */
466           else                                                            /* parityless write or normal read */
467               m.rqcount = m.sdcount;                            /* just the data blocks */
468 
469           /* Part C: build the requests */
470           rqg = allocrqg(rq, m.rqcount);                                  /* get a request group */
471           if (rqg == NULL) {                                    /* malloc failed */
472               bp->b_error = ENOMEM;
473               bp->b_flags |= B_ERROR;
474               return REQUEST_ENOMEM;
475           }
476           rqg->plexno = plexno;
477           rqg->flags = m.flags;
478           rqno = 0;                                             /* index in the request group */
479 
480           /* 1: PARITY BLOCK */
481           /*
482            * Are we performing an operation which requires parity?  In that case,
483            * work out the parameters and define the parity block.
484            * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
485            */
486           if (m.flags & XFR_PARITYOP) {                         /* need parity */
487               rqe = &rqg->rqe[rqno];                            /* point to element */
488               sd = &SD[plex->sdnos[m.psdno]];                   /* the subdisk in question */
489               rqe->rqg = rqg;                                   /* point back to group */
490               rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
491               &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE);       /* transfer flags without data op stuf */
492               setrqebounds(rqe, &m);                            /* set up the bounds of the transfer */
493               rqe->sdno = sd->sdno;                             /* subdisk number */
494               rqe->driveno = sd->driveno;
495               if (build_rq_buffer(rqe, plex))                   /* build the buffer */
496                     return REQUEST_ENOMEM;                                /* can't do it */
497               rqe->b.b_cmd = BUF_CMD_READ;                      /* we must read first */
498               m.sdcount++;                                      /* adjust the subdisk count */
499               rqno++;                                                     /* and point to the next request */
500           }
501           /*
502            * 2: DATA BLOCKS
503            * Now build up requests for the blocks required
504            * for individual transfers
505            */
506           for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
507               if (mysdno == m.psdno)                            /* parity, */
508                     mysdno++;                                   /* we've given already */
509               if (mysdno == plex->subdisks)                     /* got to the end, */
510                     mysdno = 0;                                           /* wrap around */
511               if (mysdno == m.psdno)                            /* parity, */
512                     mysdno++;                                   /* we've given already */
513 
514               rqe = &rqg->rqe[rqno];                            /* point to element */
515               sd = &SD[plex->sdnos[mysdno]];                    /* the subdisk in question */
516               rqe->rqg = rqg;                                   /* point to group */
517               if (m.flags & XFR_NEEDS_MALLOC)                   /* we need a malloced buffer first */
518                     rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
519               else
520                     rqe->flags = m.flags | XFR_DATA_BLOCK;      /* transfer flags */
521               if (mysdno == m.badsdno) {                                  /* this is the bad subdisk */
522                     rqg->badsdno = rqno;                                  /* note which one */
523                     rqe->flags |= XFR_BAD_SUBDISK;                        /* note that it's dead */
524                     /*
525                      * we can't read or write from/to it,
526                      * but we don't need to malloc
527                      */
528                     rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
529               }
530               setrqebounds(rqe, &m);                            /* set up the bounds of the transfer */
531               rqe->useroffset = m.useroffset;                   /* offset in user buffer */
532               rqe->sdno = sd->sdno;                             /* subdisk number */
533               rqe->driveno = sd->driveno;
534               if (build_rq_buffer(rqe, plex))                   /* build the buffer */
535                     return REQUEST_ENOMEM;                                /* can't do it */
536               if ((m.flags & XFR_PARITYOP)                      /* parity operation, */
537               &&((m.flags & XFR_BAD_SUBDISK) == 0))             /* and not the bad subdisk, */
538                     rqe->b.b_cmd = BUF_CMD_READ;                /* we must read first */
539 
540               /* Now update pointers for the next block */
541               *diskaddr += m.datalen;                           /* skip past what we've done */
542               m.stripesectors -= m.datalen;                     /* deduct from what's left */
543               m.useroffset += m.datalen;                                  /* and move on in the user buffer */
544               m.datalen = umin(m.stripesectors, plex->stripesize);    /* and recalculate */
545               m.dataoffset = 0;                                           /* start at the beginning of next block */
546           }
547 
548           /*
549            * 3: REMAINING BLOCKS FOR RECOVERY
550            * Finally, if we have a recovery operation, build
551            * up transfers for the other subdisks.  Follow the
552            * subdisks around until we get to where we started.
553            * These requests use only the group parameters.
554            */
555           if ((rqno < m.rqcount)                                          /* haven't done them all already */
556           &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
557               for (; rqno < m.rqcount; rqno++, mysdno++) {
558                     if (mysdno == m.psdno)                                /* parity, */
559                         mysdno++;                                         /* we've given already */
560                     if (mysdno == plex->subdisks)               /* got to the end, */
561                         mysdno = 0;                                       /* wrap around */
562                     if (mysdno == m.psdno)                                /* parity, */
563                         mysdno++;                                         /* we've given already */
564 
565                     rqe = &rqg->rqe[rqno];                                /* point to element */
566                     sd = &SD[plex->sdnos[mysdno]];                        /* the subdisk in question */
567                     rqe->rqg = rqg;                                       /* point to group */
568 
569                     rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
570                     rqe->dataoffset = 0;                                  /* for tidiness' sake */
571                     rqe->groupoffset = 0;                                 /* group starts at the beginining */
572                     rqe->datalen = 0;
573                     rqe->grouplen = m.grouplen;
574                     rqe->buflen = m.grouplen;
575                     rqe->flags = (m.flags | XFR_MALLOCED)       /* transfer flags without data op stuf */
576                     &~XFR_DATAOP;
577                     rqe->sdno = sd->sdno;                                 /* subdisk number */
578                     rqe->driveno = sd->driveno;
579                     if (build_rq_buffer(rqe, plex))                       /* build the buffer */
580                         return REQUEST_ENOMEM;                  /* can't do it */
581                     rqe->b.b_cmd = BUF_CMD_READ;                /* we must read first */
582               }
583           }
584           /*
585            * We need to lock the address range before
586            * doing anything.  We don't have to be
587            * performing a recovery operation: somebody
588            * else could be doing so, and the results could
589            * influence us.  Note the fact here, we'll perform
590            * the lock in launch_requests.
591            */
592           rqg->lockbase = m.stripebase;
593           if (*diskaddr < diskend)                              /* didn't finish the request on this stripe */
594               plex->multistripe++;                              /* count another one */
595     }
596     return REQUEST_OK;
597 }
598 
599 /*
600  * Helper function for rqe5: adjust the bounds of
601  * the transfers to minimize the buffer
602  * allocation.
603  *
604  * Each request can handle two of three different
605  * data ranges:
606  *
607  * 1.  The range described by the parameters
608  *     dataoffset and datalen, for normal read or
609  *     parityless write.
610  * 2.  The range described by the parameters
611  *     groupoffset and grouplen, for recovery read
612  *     and degraded write.
613  * 3.  For normal write, the range depends on the
614  *     kind of block.  For data blocks, the range
615  *     is defined by dataoffset and datalen.  For
616  *     parity blocks, it is defined by writeoffset
617  *     and writelen.
618  *
619  * In order not to allocate more memory than
620  * necessary, this function adjusts the bounds
621  * parameter for each request to cover just the
622  * minimum necessary for the function it performs.
623  * This will normally vary from one request to the
624  * next.
625  *
626  * Things are slightly different for the parity
627  * block.  In this case, the bounds defined by
628  * mp->writeoffset and mp->writelen also play a
629  * role.  Select this case by setting the
630  * parameter for parity != 0
631  */
632 void
setrqebounds(struct rqelement * rqe,struct metrics * mp)633 setrqebounds(struct rqelement *rqe, struct metrics *mp)
634 {
635     /* parity block of a normal write */
636     if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
637           == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) {           /* case 3 */
638           if (rqe->flags & XFR_DEGRADED_WRITE) {                /* also degraded write */
639               /*
640                * With a combined normal and degraded write, we
641                * will zero out the area of the degraded write
642                * in the second phase, so we don't need to read
643                * it in.  Unfortunately, we need a way to tell
644                * build_request_buffer the size of the buffer,
645                * and currently that's the length of the read.
646                * As a result, we read everything, even the stuff
647                * that we're going to nuke.
648                * FIXME XXX
649                */
650               if (mp->groupoffset < mp->writeoffset) {          /* group operation starts lower */
651                     rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
652                     rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
653                     rqe->groupoffset = 0;                                 /* and the group at the beginning */
654               } else {                                                    /* individual data starts first */
655                     rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
656                     rqe->dataoffset = 0;                                  /* individual data starts at the beginning */
657                     rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
658               }
659               rqe->datalen = mp->writelen;
660               rqe->grouplen = mp->grouplen;
661           } else {                                              /* just normal write (case 3) */
662               rqe->sdoffset = mp->sdbase + mp->writeoffset;   /* start of transfer */
663               rqe->dataoffset = 0;                              /* degradation starts at the beginning */
664               rqe->groupoffset = 0;                             /* for tidiness' sake */
665               rqe->datalen = mp->writelen;
666               rqe->grouplen = 0;
667           }
668     } else if (rqe->flags & XFR_DATAOP) {                       /* data operation (case 1 or 3) */
669           if (rqe->flags & XFR_GROUPOP) {                                 /* also a group operation (case 2) */
670               if (mp->groupoffset < mp->dataoffset) {           /* group operation starts lower */
671                     rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
672                     rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
673                     rqe->groupoffset = 0;                                 /* and the group at the beginning */
674               } else {                                                    /* individual data starts first */
675                     rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
676                     rqe->dataoffset = 0;                                  /* individual data starts at the beginning */
677                     rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
678               }
679               rqe->datalen = mp->datalen;
680               rqe->grouplen = mp->grouplen;
681           } else {                                              /* just data operation (case 1) */
682               rqe->sdoffset = mp->sdbase + mp->dataoffset;    /* start of transfer */
683               rqe->dataoffset = 0;                              /* degradation starts at the beginning */
684               rqe->groupoffset = 0;                             /* for tidiness' sake */
685               rqe->datalen = mp->datalen;
686               rqe->grouplen = 0;
687           }
688     } else {                                                              /* just group operations (case 2) */
689           rqe->sdoffset = mp->sdbase + mp->groupoffset;         /* start of transfer */
690           rqe->dataoffset = 0;                                            /* for tidiness' sake */
691           rqe->groupoffset = 0;                                           /* group starts at the beginining */
692           rqe->datalen = 0;
693           rqe->grouplen = mp->grouplen;
694     }
695     rqe->buflen = umax(rqe->dataoffset + rqe->datalen,          /* total buffer length */
696           rqe->groupoffset + rqe->grouplen);
697 }
698