raid/vinum/vinumraid5.c

/*-
 * Copyright (c) 1997, 1998
 *        Cybernet Corporation and Nan Yang Computer Services Limited.
 *      All rights reserved.
 *
 *  This software was developed as part of the NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by Cybernet Corporation
 *      and Nan Yang Computer Services Limited
 * 4. Neither the name of the Companies nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $
 * $FreeBSD: src/sys/dev/vinum/vinumraid5.c,v 1.6.2.2 2001/03/13 02:59:43 grog Exp $
 */
#include "vinumhdr.h"
#include "request.h"
#include <sys/resourcevar.h>

/*
 * Parameters which describe the current transfer.
 * These are only used for calculation, but they
 * need to be passed to other functions, so it's
 * tidier to put them in a struct
 */
struct metrics {
    vinum_off_t stripebase;                                               /* base address of stripe (1st subdisk) */
    int stripeoffset;                                                     /* offset in stripe */
    int stripesectors;                                                    /* total sectors to transfer in this stripe */
    vinum_off_t sdbase;                                                   /* offset in subdisk of stripe base */
    int sdcount;                                                /* number of disks involved in this transfer */
    vinum_off_t diskstart;                                                /* remember where this transfer starts */
    int psdno;                                                            /* number of parity subdisk */
    int badsdno;                                                /* number of down subdisk, if there is one */
    int firstsdno;                                              /* first data subdisk number */
    /* These correspond to the fields in rqelement, sort of */
    int useroffset;
    /*
     * Initial offset and length values for the first
     * data block
     */
    int initoffset;                                             /* start address of block to transfer */
    short initlen;                                              /* length in sectors of data transfer */
    /* Define a normal operation */
    int dataoffset;                                             /* start address of block to transfer */
    int datalen;                                                /* length in sectors of data transfer */
    /* Define a group operation */
    int groupoffset;                                                      /* subdisk offset of group operation */
    int grouplen;                                               /* length in sectors of group operation */
    /* Define a normal write operation */
    int writeoffset;                                                      /* subdisk offset of normal write */
    int writelen;                                               /* length in sectors of write operation */
    enum xferinfo flags;                                        /* to check what we're doing */
    int rqcount;                                                /* number of elements in request */
};

enum requeststatus bre5(struct request *rq,
    int plexno,
    vinum_off_t * diskstart,
    vinum_off_t diskend);
void complete_raid5_write(struct rqelement *);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
void setrqebounds(struct rqelement *rqe, struct metrics *mp);

/*
 * define the low-level requests needed to perform
 * a high-level I/O operation for a specific plex
 * 'plexno'.
 *
 * Return 0 if all subdisks involved in the
 * request are up, 1 if some subdisks are not up,
 * and -1 if the request is at least partially
 * outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the
 * end address.  On read, return on the first bad
 * subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the prq structures
 * are not assigned.  The assignment is performed
 * by expandrq().  Strictly speaking, the elements
 * rqe->sdno of all entries should be set to -1,
 * since 0 (from bzero) is a valid subdisk number.
 * We avoid this problem by initializing the ones
 * we use, and not looking at the others (index >=
 * prq->requests).
 */
enum requeststatus
bre5(struct request *rq,
    int plexno,
    vinum_off_t * diskaddr,
    vinum_off_t diskend)
{
    struct metrics m;                                                     /* most of the information */
    struct sd *sd;
    struct plex *plex;
    struct bio *bio;                                                      /* user's bp */
    struct buf *bp;
    struct rqgroup *rqg;                                        /* the request group that we will create */
    struct rqelement *rqe;                                      /* point to this request information */
    int rsectors;                                               /* sectors remaining in this stripe */
    int mysdno;                                                           /* another sd index in loops */
    int rqno;                                                             /* request number */

    rqg = NULL;                                                           /* shut up, damn compiler */
    m.diskstart = *diskaddr;                                    /* start of transfer */
    bio = rq->bio;                                              /* buffer pointer */
    bp = bio->bio_buf;
    plex = &PLEX[plexno];                                       /* point to the plex */


    while (*diskaddr < diskend) {                               /* until we get it all sorted out */
          if (*diskaddr >= plex->length)                                  /* beyond the end of the plex */
              return REQUEST_EOF;                                         /* can't continue */

          m.badsdno = -1;                                                 /* no bad subdisk yet */

          /* Part A: Define the request */
          /*
           * First, calculate some sizes:
           * The offset of the start address from
           * the start of the stripe.
           */
          m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));

          /*
           * The plex-relative address of the
           * start of the stripe.
           */
          m.stripebase = *diskaddr - m.stripeoffset;

          /* subdisk containing the parity stripe */
          if (plex->organization == plex_raid5)
              m.psdno = plex->subdisks - 1
                    - (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
                    % plex->subdisks;
          else                                                            /* RAID-4 */
              m.psdno = plex->subdisks - 1;

          /*
           * The number of the subdisk in which
           * the start is located.
           */
          m.firstsdno = m.stripeoffset / plex->stripesize;
          if (m.firstsdno >= m.psdno)                           /* at or past parity sd */
              m.firstsdno++;                                    /* increment it */

          /*
           * The offset from the beginning of
           * the stripe on this subdisk.
           */
          m.initoffset = m.stripeoffset % plex->stripesize;

          /* The offset of the stripe start relative to this subdisk */
          m.sdbase = m.stripebase / (plex->subdisks - 1);

          m.useroffset = *diskaddr - m.diskstart;               /* The offset of the start in the user buffer */

          /*
           * The number of sectors to transfer in the
           * current (first) subdisk.
           */
          m.initlen = umin(diskend - *diskaddr,                 /* the amount remaining to transfer */
              plex->stripesize - m.initoffset);                 /* and the amount left in this block */

          /*
           * The number of sectors to transfer in this stripe
           * is the minumum of the amount remaining to transfer
           * and the amount left in this stripe.
           */
          m.stripesectors = umin(diskend - *diskaddr,
              plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);

          /* The number of data subdisks involved in this request */
          m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;

          /* Part B: decide what kind of transfer this will be.

           * start and end addresses of the transfer in
           * the current block.
           *
           * There are a number of different kinds of
           * transfer, each of which relates to a
           * specific subdisk:
           *
           * 1. Normal read.  All participating subdisks
           *    are up, and the transfer can be made
           *    directly to the user buffer.  The bounds
           *    of the transfer are described by
           *    m.dataoffset and m.datalen.  We have
           *    already calculated m.initoffset and
           *    m.initlen, which define the parameters
           *    for the first data block.
           *
           * 2. Recovery read.  One participating
           *    subdisk is down.  To recover data, all
           *    the other subdisks, including the parity
           *    subdisk, must be read.  The data is
           *    recovered by exclusive-oring all the
           *    other blocks.  The bounds of the
           *    transfer are described by m.groupoffset
           *    and m.grouplen.
           *
           * 3. A read request may request reading both
           *    available data (normal read) and
           *    non-available data (recovery read).
           *    This can be a problem if the address
           *    ranges of the two reads do not coincide:
           *    in this case, the normal read needs to
           *    be extended to cover the address range
           *    of the recovery read, and must thus be
           *    performed out of malloced memory.
           *
           * 4. Normal write.  All the participating
           *    subdisks are up.  The bounds of the
           *    transfer are described by m.dataoffset
           *    and m.datalen.  Since these values
           *    differ for each block, we calculate the
           *    bounds for the parity block
           *    independently as the maximum of the
           *    individual blocks and store these values
           *    in m.writeoffset and m.writelen.  This
           *    write proceeds in four phases:
           *
           *    i.  Read the old contents of each block
           *        and the parity block.
           *    ii.  ``Remove'' the old contents from
           *         the parity block with exclusive or.
           *    iii. ``Insert'' the new contents of the
           *          block in the parity block, again
           *          with exclusive or.
           *
           *    iv.  Write the new contents of the data
           *         blocks and the parity block.  The data
           *         block transfers can be made directly from
           *         the user buffer.
           *
           * 5. Degraded write where the data block is
           *    not available.  The bounds of the
           *    transfer are described by m.groupoffset
           *    and m.grouplen. This requires the
           *    following steps:
           *
           *    i.  Read in all the other data blocks,
           *        excluding the parity block.
           *
           *    ii.  Recreate the parity block from the
           *         other data blocks and the data to be
           *         written.
           *
           *    iii. Write the parity block.
           *
           * 6. Parityless write, a write where the
           *    parity block is not available.  This is
           *    in fact the simplest: just write the
           *    data blocks.  This can proceed directly
           *    from the user buffer.  The bounds of the
           *    transfer are described by m.dataoffset
           *    and m.datalen.
           *
           * 7. Combination of degraded data block write
           *    and normal write.  In this case the
           *    address ranges of the reads may also
           *    need to be extended to cover all
           *    participating blocks.
           *
           * All requests in a group transfer transfer
           * the same address range relative to their
           * subdisk.  The individual transfers may
           * vary, but since our group of requests is
           * all in a single slice, we can define a
           * range in which they all fall.
           *
           * In the following code section, we determine
           * which kind of transfer we will perform.  If
           * there is a group transfer, we also decide
           * its bounds relative to the subdisks.  At
           * the end, we have the following values:
           *
           *  m.flags indicates the kinds of transfers
           *    we will perform.
           *  m.initoffset indicates the offset of the
           *    beginning of any data operation relative
           *    to the beginning of the stripe base.
           *  m.initlen specifies the length of any data
           *    operation.
           *  m.dataoffset contains the same value as
           *    m.initoffset.
           *  m.datalen contains the same value as
           *    m.initlen.  Initially dataoffset and
           *    datalen describe the parameters for the
           *    first data block; while building the data
           *    block requests, they are updated for each
           *    block.
           *  m.groupoffset indicates the offset of any
           *    group operation relative to the beginning
           *    of the stripe base.
           *  m.grouplen specifies the length of any
           *    group operation.
           *  m.writeoffset indicates the offset of a
           *    normal write relative to the beginning of
           *    the stripe base.  This value differs from
           *    m.dataoffset in that it applies to the
           *    entire operation, and not just the first
           *    block.
           *  m.writelen specifies the total span of a
           *    normal write operation.  writeoffset and
           *    writelen are used to define the parity
           *    block.
           */
          m.groupoffset = 0;                                    /* assume no group... */
          m.grouplen = 0;                                                 /* until we know we have one */
          m.writeoffset = m.initoffset;                         /* start offset of transfer */
          m.writelen = 0;                                                 /* nothing to write yet */
          m.flags = 0;                                                    /* no flags yet */
          rsectors = m.stripesectors;                           /* remaining sectors to examine */
          m.dataoffset = m.initoffset;                          /* start at the beginning of the transfer */
          m.datalen = m.initlen;

          if (m.sdcount > 1) {
              plex->multiblock++;                                         /* more than one block for the request */
              /*
               * If we have two transfers that don't overlap,
               * (one at the end of the first block, the other
               * at the beginning of the second block),
               * it's cheaper to split them.
               */
              if (rsectors < plex->stripesize) {
                    m.sdcount = 1;                                        /* just one subdisk */
                    m.stripesectors = m.initlen;                /* and just this many sectors */
                    rsectors = m.initlen;                                 /* and in the loop counter */
              }
          }
          if (SD[plex->sdnos[m.psdno]].state < sd_reborn)       /* is our parity subdisk down? */
              m.badsdno = m.psdno;                              /* note that it's down */
          if (bp->b_cmd == BUF_CMD_READ) {                      /* read operation */
              for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
                    if (mysdno == m.psdno)                                /* ignore parity on read */
                        mysdno++;
                    if (mysdno == plex->subdisks)               /* wraparound */
                        mysdno = 0;
                    if (mysdno == m.psdno)                                /* parity, */
                        mysdno++;                                         /* we've given already */

                    if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
                        if (m.badsdno >= 0)                               /* we had one already, */
                              return REQUEST_DOWN;                        /* we can't take a second */
                        m.badsdno = mysdno;                               /* got the first */
                        m.groupoffset = m.dataoffset;           /* define the bounds */
                        m.grouplen = m.datalen;
                        m.flags |= XFR_RECOVERY_READ;           /* we need recovery */
                        plex->recovered_reads++;                /* count another one */
                    } else
                        m.flags |= XFR_NORMAL_READ;                       /* normal read */

                    /* Update the pointers for the next block */
                    m.dataoffset = 0;                           /* back to the start of the stripe */
                    rsectors -= m.datalen;                                /* remaining sectors to examine */
                    m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
              }
          } else {                                              /* write operation */
              for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
                    if (mysdno == m.psdno)                                /* parity stripe, we've dealt with that */
                        mysdno++;
                    if (mysdno == plex->subdisks)               /* wraparound */
                        mysdno = 0;
                    if (mysdno == m.psdno)                                /* parity, */
                        mysdno++;                                         /* we've given already */

                    sd = &SD[plex->sdnos[mysdno]];
                    if (sd->state != sd_up) {
                        enum requeststatus s;

                        s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
                        if (s && (m.badsdno >= 0)) {            /* second bad disk, */
                              int sdno;
                              /*
                               * If the parity disk is down, there's
                               * no recovery.  We make all involved
                               * subdisks stale.  Otherwise, we
                               * should be able to recover, but it's
                               * like pulling teeth.  Fix it later.
                               */
                              for (sdno = 0; sdno < m.sdcount; sdno++) {
                                  struct sd *sd = &SD[plex->sdnos[sdno]];
                                  if (sd->state >= sd_reborn)             /* sort of up, */
                                        set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
                              }
                              return s;                         /* and crap out */
                        }
                        m.badsdno = mysdno;                               /* note which one is bad */
                        m.flags |= XFR_DEGRADED_WRITE;          /* we need recovery */
                        plex->degraded_writes++;                /* count another one */
                        m.groupoffset = m.dataoffset;           /* define the bounds */
                        m.grouplen = m.datalen;
                    } else {
                        m.flags |= XFR_NORMAL_WRITE;            /* normal write operation */
                        if (m.writeoffset > m.dataoffset) {     /* move write operation lower */
                              m.writelen = umax(m.writeoffset + m.writelen,
                                  m.dataoffset + m.datalen)
                                  - m.dataoffset;
                              m.writeoffset = m.dataoffset;
                        } else
                              m.writelen = umax(m.writeoffset + m.writelen,
                                  m.dataoffset + m.datalen)
                                  - m.writeoffset;
                    }

                    /* Update the pointers for the next block */
                    m.dataoffset = 0;                           /* back to the start of the stripe */
                    rsectors -= m.datalen;                                /* remaining sectors to examine */
                    m.datalen = umin(rsectors, plex->stripesize); /* amount that will fit in this block */
              }
              if (m.badsdno == m.psdno) {                                 /* got a bad parity block, */
                    struct sd *psd = &SD[plex->sdnos[m.psdno]];

                    if (psd->state == sd_down)
                        set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
                    else if (psd->state == sd_crashed)
                        set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
                    m.flags &= ~XFR_NORMAL_WRITE;               /* this write isn't normal, */
                    m.flags |= XFR_PARITYLESS_WRITE;            /* it's parityless */
                    plex->parityless_writes++;                  /* count another one */
              }
          }

          /* reset the initial transfer values */
          m.dataoffset = m.initoffset;                          /* start at the beginning of the transfer */
          m.datalen = m.initlen;

          /* decide how many requests we need */
          if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
              /* doing a recovery read or degraded write, */
              m.rqcount = plex->subdisks;                                 /* all subdisks */
          else if (m.flags & XFR_NORMAL_WRITE)                  /* normal write, */
              m.rqcount = m.sdcount + 1;                                  /* all data blocks and the parity block */
          else                                                            /* parityless write or normal read */
              m.rqcount = m.sdcount;                            /* just the data blocks */

          /* Part C: build the requests */
          rqg = allocrqg(rq, m.rqcount);                                  /* get a request group */
          if (rqg == NULL) {                                    /* malloc failed */
              bp->b_error = ENOMEM;
              bp->b_flags |= B_ERROR;
              return REQUEST_ENOMEM;
          }
          rqg->plexno = plexno;
          rqg->flags = m.flags;
          rqno = 0;                                             /* index in the request group */

          /* 1: PARITY BLOCK */
          /*
           * Are we performing an operation which requires parity?  In that case,
           * work out the parameters and define the parity block.
           * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
           */
          if (m.flags & XFR_PARITYOP) {                         /* need parity */
              rqe = &rqg->rqe[rqno];                            /* point to element */
              sd = &SD[plex->sdnos[m.psdno]];                   /* the subdisk in question */
              rqe->rqg = rqg;                                   /* point back to group */
              rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
              &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE);       /* transfer flags without data op stuf */
              setrqebounds(rqe, &m);                            /* set up the bounds of the transfer */
              rqe->sdno = sd->sdno;                             /* subdisk number */
              rqe->driveno = sd->driveno;
              if (build_rq_buffer(rqe, plex))                   /* build the buffer */
                    return REQUEST_ENOMEM;                                /* can't do it */
              rqe->b.b_cmd = BUF_CMD_READ;                      /* we must read first */
              m.sdcount++;                                      /* adjust the subdisk count */
              rqno++;                                                     /* and point to the next request */
          }
          /*
           * 2: DATA BLOCKS
           * Now build up requests for the blocks required
           * for individual transfers
           */
          for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
              if (mysdno == m.psdno)                            /* parity, */
                    mysdno++;                                   /* we've given already */
              if (mysdno == plex->subdisks)                     /* got to the end, */
                    mysdno = 0;                                           /* wrap around */
              if (mysdno == m.psdno)                            /* parity, */
                    mysdno++;                                   /* we've given already */

              rqe = &rqg->rqe[rqno];                            /* point to element */
              sd = &SD[plex->sdnos[mysdno]];                    /* the subdisk in question */
              rqe->rqg = rqg;                                   /* point to group */
              if (m.flags & XFR_NEEDS_MALLOC)                   /* we need a malloced buffer first */
                    rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
              else
                    rqe->flags = m.flags | XFR_DATA_BLOCK;      /* transfer flags */
              if (mysdno == m.badsdno) {                                  /* this is the bad subdisk */
                    rqg->badsdno = rqno;                                  /* note which one */
                    rqe->flags |= XFR_BAD_SUBDISK;                        /* note that it's dead */
                    /*
                     * we can't read or write from/to it,
                     * but we don't need to malloc
                     */
                    rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
              }
              setrqebounds(rqe, &m);                            /* set up the bounds of the transfer */
              rqe->useroffset = m.useroffset;                   /* offset in user buffer */
              rqe->sdno = sd->sdno;                             /* subdisk number */
              rqe->driveno = sd->driveno;
              if (build_rq_buffer(rqe, plex))                   /* build the buffer */
                    return REQUEST_ENOMEM;                                /* can't do it */
              if ((m.flags & XFR_PARITYOP)                      /* parity operation, */
              &&((m.flags & XFR_BAD_SUBDISK) == 0))             /* and not the bad subdisk, */
                    rqe->b.b_cmd = BUF_CMD_READ;                /* we must read first */

              /* Now update pointers for the next block */
              *diskaddr += m.datalen;                           /* skip past what we've done */
              m.stripesectors -= m.datalen;                     /* deduct from what's left */
              m.useroffset += m.datalen;                                  /* and move on in the user buffer */
              m.datalen = umin(m.stripesectors, plex->stripesize);    /* and recalculate */
              m.dataoffset = 0;                                           /* start at the beginning of next block */
          }

          /*
           * 3: REMAINING BLOCKS FOR RECOVERY
           * Finally, if we have a recovery operation, build
           * up transfers for the other subdisks.  Follow the
           * subdisks around until we get to where we started.
           * These requests use only the group parameters.
           */
          if ((rqno < m.rqcount)                                          /* haven't done them all already */
          &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
              for (; rqno < m.rqcount; rqno++, mysdno++) {
                    if (mysdno == m.psdno)                                /* parity, */
                        mysdno++;                                         /* we've given already */
                    if (mysdno == plex->subdisks)               /* got to the end, */
                        mysdno = 0;                                       /* wrap around */
                    if (mysdno == m.psdno)                                /* parity, */
                        mysdno++;                                         /* we've given already */

                    rqe = &rqg->rqe[rqno];                                /* point to element */
                    sd = &SD[plex->sdnos[mysdno]];                        /* the subdisk in question */
                    rqe->rqg = rqg;                                       /* point to group */

                    rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
                    rqe->dataoffset = 0;                                  /* for tidiness' sake */
                    rqe->groupoffset = 0;                                 /* group starts at the beginining */
                    rqe->datalen = 0;
                    rqe->grouplen = m.grouplen;
                    rqe->buflen = m.grouplen;
                    rqe->flags = (m.flags | XFR_MALLOCED)       /* transfer flags without data op stuf */
                    &~XFR_DATAOP;
                    rqe->sdno = sd->sdno;                                 /* subdisk number */
                    rqe->driveno = sd->driveno;
                    if (build_rq_buffer(rqe, plex))                       /* build the buffer */
                        return REQUEST_ENOMEM;                  /* can't do it */
                    rqe->b.b_cmd = BUF_CMD_READ;                /* we must read first */
              }
          }
          /*
           * We need to lock the address range before
           * doing anything.  We don't have to be
           * performing a recovery operation: somebody
           * else could be doing so, and the results could
           * influence us.  Note the fact here, we'll perform
           * the lock in launch_requests.
           */
          rqg->lockbase = m.stripebase;
          if (*diskaddr < diskend)                              /* didn't finish the request on this stripe */
              plex->multistripe++;                              /* count another one */
    }
    return REQUEST_OK;
}

/*
 * Helper function for rqe5: adjust the bounds of
 * the transfers to minimize the buffer
 * allocation.
 *
 * Each request can handle two of three different
 * data ranges:
 *
 * 1.  The range described by the parameters
 *     dataoffset and datalen, for normal read or
 *     parityless write.
 * 2.  The range described by the parameters
 *     groupoffset and grouplen, for recovery read
 *     and degraded write.
 * 3.  For normal write, the range depends on the
 *     kind of block.  For data blocks, the range
 *     is defined by dataoffset and datalen.  For
 *     parity blocks, it is defined by writeoffset
 *     and writelen.
 *
 * In order not to allocate more memory than
 * necessary, this function adjusts the bounds
 * parameter for each request to cover just the
 * minimum necessary for the function it performs.
 * This will normally vary from one request to the
 * next.
 *
 * Things are slightly different for the parity
 * block.  In this case, the bounds defined by
 * mp->writeoffset and mp->writelen also play a
 * role.  Select this case by setting the
 * parameter for parity != 0
 */
void
setrqebounds(struct rqelement *rqe, struct metrics *mp)
{
    /* parity block of a normal write */
    if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
          == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) {           /* case 3 */
          if (rqe->flags & XFR_DEGRADED_WRITE) {                /* also degraded write */
              /*
               * With a combined normal and degraded write, we
               * will zero out the area of the degraded write
               * in the second phase, so we don't need to read
               * it in.  Unfortunately, we need a way to tell
               * build_request_buffer the size of the buffer,
               * and currently that's the length of the read.
               * As a result, we read everything, even the stuff
               * that we're going to nuke.
               * FIXME XXX
               */
              if (mp->groupoffset < mp->writeoffset) {          /* group operation starts lower */
                    rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
                    rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
                    rqe->groupoffset = 0;                                 /* and the group at the beginning */
              } else {                                                    /* individual data starts first */
                    rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
                    rqe->dataoffset = 0;                                  /* individual data starts at the beginning */
                    rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
              }
              rqe->datalen = mp->writelen;
              rqe->grouplen = mp->grouplen;
          } else {                                              /* just normal write (case 3) */
              rqe->sdoffset = mp->sdbase + mp->writeoffset;   /* start of transfer */
              rqe->dataoffset = 0;                              /* degradation starts at the beginning */
              rqe->groupoffset = 0;                             /* for tidiness' sake */
              rqe->datalen = mp->writelen;
              rqe->grouplen = 0;
          }
    } else if (rqe->flags & XFR_DATAOP) {                       /* data operation (case 1 or 3) */
          if (rqe->flags & XFR_GROUPOP) {                                 /* also a group operation (case 2) */
              if (mp->groupoffset < mp->dataoffset) {           /* group operation starts lower */
                    rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
                    rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
                    rqe->groupoffset = 0;                                 /* and the group at the beginning */
              } else {                                                    /* individual data starts first */
                    rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
                    rqe->dataoffset = 0;                                  /* individual data starts at the beginning */
                    rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
              }
              rqe->datalen = mp->datalen;
              rqe->grouplen = mp->grouplen;
          } else {                                              /* just data operation (case 1) */
              rqe->sdoffset = mp->sdbase + mp->dataoffset;    /* start of transfer */
              rqe->dataoffset = 0;                              /* degradation starts at the beginning */
              rqe->groupoffset = 0;                             /* for tidiness' sake */
              rqe->datalen = mp->datalen;
              rqe->grouplen = 0;
          }
    } else {                                                              /* just group operations (case 2) */
          rqe->sdoffset = mp->sdbase + mp->groupoffset;         /* start of transfer */
          rqe->dataoffset = 0;                                            /* for tidiness' sake */
          rqe->groupoffset = 0;                                           /* group starts at the beginining */
          rqe->datalen = 0;
          rqe->grouplen = mp->grouplen;
    }
    rqe->buflen = umax(rqe->dataoffset + rqe->datalen,          /* total buffer length */
          rqe->groupoffset + rqe->grouplen);
}