/* vfs_bio.c (revision cb0f6a61c726519b940fe80f97ab125ba66bbd79) - OpenGrok cross reference for /dragonfly/sys/kern/vfs_bio.c */

/*
 * Copyright (c) 1994,1997 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *                  John S. Dyson.
 *
 * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.  Note that man buf(9) doesn't reflect
 * the actual buf/bio implementation in DragonFly.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/dsched.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

#include <sys/buf2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * Buffer queues.
 *
 * Each value indexes the bufqueues[] array of a struct bufpcpu and is the
 * value stored in a buffer's b_qindex.  BQUEUE_NONE means the buffer is
 * not linked on any queue.  BUFFER_QUEUES is not a queue itself; it is the
 * terminating count used to size bufqueues[].
 */
enum bufq_type {
          BQUEUE_NONE,        /* not on any queue */
          BQUEUE_LOCKED,      /* locked buffers */
          BQUEUE_CLEAN,       /* non-B_DELWRI buffers */
          BQUEUE_DIRTY,       /* B_DELWRI buffers */
          BQUEUE_DIRTY_HW,    /* B_DELWRI buffers - heavy weight */
          BQUEUE_EMPTY,       /* empty buffer headers */

          BUFFER_QUEUES                 /* number of buffer queues */
};

typedef enum bufq_type bufq_type_t;

/*
 * Size of the bd_wake_ary[] ring used by bd_wait()/bd_signal().  The mask
 * is applied with '&' to wrap bd_wake_index into the ring, so the size
 * must remain a power of two.
 */
#define BD_WAKE_SIZE          16384
#define BD_WAKE_MASK          (BD_WAKE_SIZE - 1)

/* List head type for queues of struct buf (linked through b_freelist). */
TAILQ_HEAD(bqueues, buf);

/*
 * Per-cpu set of buffer queues, one TAILQ per bufq_type value.  The
 * spinlock is held around queue removal (see bremfree()).
 */
struct bufpcpu {
          struct spinlock spin;
          struct bqueues bufqueues[BUFFER_QUEUES];
} __cachealign;

/* One queue set per possible cpu; bufinit() initializes the first ncpus. */
struct bufpcpu bufpcpu[MAXCPU];

static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");

/* Indexed 0..nbuf-1 by bufinit(). */
struct buf *buf;              /* buffer header pool */

/*
 * Forward declarations of file-local helpers.  vfs_dirty_one_page() is
 * compiled out with #if 0.
 */
static void vfs_clean_pages(struct buf *bp);
static void vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m);
#if 0
static void vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m);
#endif
static void vfs_vmio_release(struct buf *bp);
static int flushbufqueues(struct buf *marker, bufq_type_t q);
static vm_page_t bio_page_alloc(struct buf *bp, vm_object_t obj,
                                        vm_pindex_t pg, int deficit);

static void bd_signal(long totalspace);
static void buf_daemon(void);
static void buf_daemon_hw(void);

/*
 * Bogus page -- for I/O to/from partially complete buffers.
 * This is a temporary solution to the problem, but it is not
 * really that bad.  It would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 *
 * The page is allocated once, in bufinit().
 */
vm_page_t bogus_page;

/*
 * These are all static, but make the ones we export globals so we do
 * not need to use compiler magic.
 *
 * The lo/hi/max limits below are computed in bufinit().  Variables
 * marked "atomic" are updated with atomic ops and read without locks.
 */
long bufspace;                          /* atomic ops */
long maxbufspace;
long lobufspace, hibufspace;
static long lorunningspace;
static long hirunningspace;
static long dirtykvaspace;              /* atomic */
long dirtybufspace;                     /* atomic (global for systat) */
static long dirtybufcount;              /* atomic */
static long dirtybufspacehw;            /* atomic */
static long dirtybufcounthw;            /* atomic */
static long runningbufspace;            /* atomic */
static long runningbufcount;            /* atomic */
long lodirtybufspace;
long hidirtybufspace;
/* Exported read-only as vfs.getnewbufcalls. */
static int getnewbufcalls;
/* Holds VFS_BIO_NEED_* bits; also the sleep/wakeup channel for them. */
static int needsbuffer;                           /* atomic */
/* Bumped by waitrunningbufspace() waiters, zeroed by runningbufwakeup(). */
static int runningbufreq;               /* atomic */
/* Wakeup requests for the two buf daemons, raised by bd_speedup(). */
static int bd_request;                            /* atomic */
static int bd_request_hw;               /* atomic */
/* Waiter ring and its cursor, shared by bd_wait() and bd_signal(). */
static u_int bd_wake_ary[BD_WAKE_SIZE];
static u_int bd_wake_index;
static u_int vm_cycle_point = 40; /* 23-36 will migrate more act->inact */
/* Debug knobs, writable via the vfs.debug_* sysctls. */
static int debug_commit;
static int debug_bufbio;
static int debug_kvabio;
/* Default 200MB; described by its sysctl as a transfer bandwidth. */
static long bufcache_bw = 200 * 1024 * 1024;

/* Daemon threads; bd_wait() returns immediately when called by either. */
static struct thread *bufdaemon_td;
static struct thread *bufdaemonhw_td;
static u_int lowmempgallocs;
static u_int flushperqueue = 1024;

/*
 * Sysctls for operational control of the buffer cache.
 */
/*
 * Read-write (CTLFLAG_RW) tunables under the vfs sysctl tree.  Each entry
 * exposes the like-named variable directly; no handler validates writes.
 */
SYSCTL_UINT(_vfs, OID_AUTO, flushperqueue, CTLFLAG_RW, &flushperqueue, 0,
          "Number of buffers to flush from each per-cpu queue");
SYSCTL_LONG(_vfs, OID_AUTO, lodirtybufspace, CTLFLAG_RW, &lodirtybufspace, 0,
          "Number of dirty buffers to flush before bufdaemon becomes inactive");
SYSCTL_LONG(_vfs, OID_AUTO, hidirtybufspace, CTLFLAG_RW, &hidirtybufspace, 0,
          "High watermark used to trigger explicit flushing of dirty buffers");
SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
          "Minimum amount of buffer space required for active I/O");
/* Description previously read "space to usable"; grammar corrected. */
SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
          "Maximum amount of buffer space usable for active I/O");
SYSCTL_LONG(_vfs, OID_AUTO, bufcache_bw, CTLFLAG_RW, &bufcache_bw, 0,
          "Buffer-cache -> VM page cache transfer bandwidth");
SYSCTL_UINT(_vfs, OID_AUTO, lowmempgallocs, CTLFLAG_RW, &lowmempgallocs, 0,
          "Page allocations done during periods of very low free memory");
SYSCTL_UINT(_vfs, OID_AUTO, vm_cycle_point, CTLFLAG_RW, &vm_cycle_point, 0,
          "Recycle pages to active or inactive queue transition pt 0-64");
/*
 * Sysctls determining current state of the buffer cache.
 *
 * The accounting and limit entries are read-only (CTLFLAG_RD) views of
 * the like-named variables.
 */
SYSCTL_LONG(_vfs, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf, 0,
          "Total number of buffers in buffer cache");
SYSCTL_LONG(_vfs, OID_AUTO, dirtykvaspace, CTLFLAG_RD, &dirtykvaspace, 0,
          "KVA reserved by dirty buffers (all)");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufspace, CTLFLAG_RD, &dirtybufspace, 0,
          "Pending bytes of dirty buffers (all)");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufspacehw, CTLFLAG_RD, &dirtybufspacehw, 0,
          "Pending bytes of dirty buffers (heavy weight)");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufcount, CTLFLAG_RD, &dirtybufcount, 0,
          "Pending number of dirty buffers");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufcounthw, CTLFLAG_RD, &dirtybufcounthw, 0,
          "Pending number of dirty buffers (heavy weight)");
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
          "I/O bytes currently in progress due to asynchronous writes");
SYSCTL_LONG(_vfs, OID_AUTO, runningbufcount, CTLFLAG_RD, &runningbufcount, 0,
          "I/O buffers currently in progress due to asynchronous writes");
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
          "Hard limit on maximum amount of memory usable for buffer space");
SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
          "Soft limit on maximum amount of memory usable for buffer space");
SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
          "Minimum amount of memory to reserve for system buffer space");
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
          "Amount of memory available for buffers");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0,
          "New buffer header acquisition requests");
/* Debug knobs: writable, registered with empty description strings. */
SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, debug_bufbio, CTLFLAG_RW, &debug_bufbio, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, debug_kvabio, CTLFLAG_RW, &debug_kvabio, 0, "");
/* No backing variable: the constant sizeof(struct buf) is the value. */
SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf),
          "sizeof(struct buf)");

char *buf_wmesg = BUF_WMESG;

/*
 * Bits kept in needsbuffer.  Waiters sleep on &needsbuffer; the
 * *wakeup() helpers clear a bit and issue the wakeup.  The two UNUSED
 * values are placeholders and are not referenced by the helpers.
 */
#define VFS_BIO_NEED_ANY      0x01      /* any freeable buffer */
#define VFS_BIO_NEED_UNUSED02 0x02
#define VFS_BIO_NEED_UNUSED04 0x04
#define VFS_BIO_NEED_BUFSPACE 0x08      /* wait for buf space, lo hysteresis */

/*
 * bufspacewakeup:
 *
 *	Called when buffer space is potentially available for recovery,
 *	which happens when bp's get placed back in the queues.  Threads
 *	that could not free enough buffer space block on the
 *	VFS_BIO_NEED_BUFSPACE flag in needsbuffer.
 *
 *	If the flag is set it is cleared and sleepers on &needsbuffer are
 *	woken; otherwise nothing is done.  The kva has not actually been
 *	freed yet at this point, but a woken thread will be able to free it.
 *
 *	The flag is dropped with a compare-and-set so that racing updates
 *	of the other needsbuffer bits are never lost; after a lost race
 *	the word is simply sampled again.
 */
static __inline void
bufspacewakeup(void)
{
	int snapshot;

	do {
		snapshot = needsbuffer;
		cpu_ccfence();
		if ((snapshot & VFS_BIO_NEED_BUFSPACE) == 0)
			return;		/* nobody is waiting for space */
	} while (atomic_cmpset_int(&needsbuffer, snapshot,
				   snapshot & ~VFS_BIO_NEED_BUFSPACE) == 0);

	wakeup(&needsbuffer);
}

/*
 * runningbufwakeup:
 *
 *	Accounting for I/O in progress.  Removes bp's b_runningbufspace
 *	from the global runningbufspace/runningbufcount totals, zeroes the
 *	per-buffer value, releases any threads registered in runningbufreq
 *	and finally reports the released bytes to bd_signal().
 *
 *	A buffer whose b_runningbufspace is already 0 is left untouched.
 */
static __inline void
runningbufwakeup(struct buf *bp)
{
	long released;
	long pending;

	released = bp->b_runningbufspace;
	if (released == 0)
		return;

	atomic_add_long(&runningbufspace, -released);
	atomic_add_long(&runningbufcount, -1);
	bp->b_runningbufspace = 0;

	/*
	 * Reset the waiter count to zero and wake the waiters, retrying
	 * if the count changes underneath us.  See waitrunningbufspace()
	 * for the limit test.
	 */
	do {
		pending = runningbufreq;
		cpu_ccfence();
		if (pending == 0)
			break;
	} while (atomic_cmpset_int(&runningbufreq, pending, 0) == 0);

	/* non-zero here means the compare-and-set above succeeded */
	if (pending != 0)
		wakeup(&runningbufreq);

	bd_signal(released);
}

/*
 * bufcountwakeup:
 *
 *	Called when a buffer has been added to one of the free queues to
 *	account for the buffer and to wakeup anyone waiting for free buffers.
 *	This typically occurs when large amounts of metadata are being handled
 *	by the buffer cache ( else buffer space runs out first, usually ).
 *
 *	Clears VFS_BIO_NEED_ANY in needsbuffer and wakes sleepers on
 *	&needsbuffer.  A wakeup is issued whenever needsbuffer is non-zero,
 *	even if VFS_BIO_NEED_ANY itself was not set.
 */
static __inline void
bufcountwakeup(void)
{
	int flags;	/* same width as needsbuffer / atomic_cmpset_int */

	for (;;) {
		flags = needsbuffer;
		/*
		 * Compiler fence so the snapshot is loaded exactly once,
		 * matching bufspacewakeup() and runningbufwakeup().
		 */
		cpu_ccfence();
		if (flags == 0)
			break;
		if (atomic_cmpset_int(&needsbuffer, flags,
				      flags & ~VFS_BIO_NEED_ANY)) {
			wakeup(&needsbuffer);
			break;
		}
		/* retry */
	}
}

/*
 * waitrunningbufspace()
 *
 * Block while runningbufspace exceeds 4/6 of hirunningspace.  We also
 * block if another thread is already blocked here (runningbufreq is
 * non-zero) in order to be fair, even if runningbufspace is now lower
 * than the limit.
 *
 * Each sleep is bounded by a timeout of hz ticks, after which the
 * conditions are re-evaluated.  runningbufwakeup() zeroes runningbufreq
 * and wakes us whenever a buffer's running space is released.
 *
 * The caller may be using this function to block in a tight loop.
 *
 * NOTE(review): earlier wording described blocking until runningbufspace
 * drops to 3/6 of hirunningspace; the code below only tests the single
 * 4/6 threshold -- confirm which is intended.
 */
void
waitrunningbufspace(void)
{
          long limit = hirunningspace * 4 / 6;
          long flags;

          while (runningbufspace > limit || runningbufreq) {
                    /*
                     * Arm the sleep before registering as a waiter so a
                     * wakeup issued between here and tsleep() is not lost.
                     */
                    tsleep_interlock(&runningbufreq, 0);
                    flags = atomic_fetchadd_int(&runningbufreq, 1);
                    /* flags is the waiter count prior to our increment */
                    if (runningbufspace > limit || flags)
                              tsleep(&runningbufreq, PINTERLOCKED, "wdrn1", hz);
          }
}

/*
 * buf_dirty_count_severe:
 *
 *	Return true (1) if we have too many dirty buffers: either the
 *	running plus dirty kva space has reached hidirtybufspace, or at
 *	least half of all buffers (nbuf / 2) are dirty.  Otherwise
 *	return 0.
 */
int
buf_dirty_count_severe(void)
{
	if (runningbufspace + dirtykvaspace >= hidirtybufspace)
		return (1);
	if (dirtybufcount >= nbuf / 2)
		return (1);
	return (0);
}

/*
 * Return true (1) if the amount of running I/O is severe and BIOQ
 * should start bursting, i.e. runningbufspace has reached 4/6 of
 * hirunningspace.  Otherwise return 0.
 */
int
buf_runningbufspace_severe(void)
{
	long threshold = hirunningspace * 4 / 6;

	return ((runningbufspace >= threshold) ? 1 : 0);
}

/*
 * vfs_buf_test_cache:
 *
 * Called when a buffer is extended.  If the buffer is currently marked
 * B_CACHE, check the (size) bytes starting at the in-page offset of
 * (foff + off) within page (m) and clear B_CACHE when that range does
 * not contain valid data.  Buffers without B_CACHE are left alone.
 *
 * NOTE! Dirty VM pages are not processed into dirty (B_DELWRI) buffer
 * cache buffers.  The VM pages remain dirty, as someone had mmap()'d
 * them while a clean buffer was present.
 */
static __inline__
void
vfs_buf_test_cache(struct buf *bp,
		   vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		   vm_page_t m)
{
	int page_base;

	if ((bp->b_flags & B_CACHE) == 0)
		return;

	page_base = (foff + off) & PAGE_MASK;
	if (!vm_page_is_valid(m, page_base, size))
		bp->b_flags &= ~B_CACHE;
}

/*
 * bd_speedup()
 *
 * Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the
 * low water mark.
 *
 * Nothing happens unless dirtykvaspace has reached lodirtybufspace or
 * at least nbuf / 2 buffers are dirty.  Past that gate each daemon is
 * handled independently: if it has no request pending and either
 * dirtykvaspace exceeds half of lodirtybufspace or its own share of
 * the dirty buffer count has reached nbuf / 2, its request counter is
 * bumped, and the thread that moves the counter off zero issues the
 * wakeup.
 */
static __inline__
void
bd_speedup(void)
{
	if (dirtykvaspace >= lodirtybufspace || dirtybufcount >= nbuf / 2) {
		/* regular buf_daemon: count excludes heavy-weight buffers */
		if (bd_request == 0) {
			if (dirtykvaspace > lodirtybufspace / 2 ||
			    dirtybufcount - dirtybufcounthw >= nbuf / 2) {
				if (atomic_fetchadd_int(&bd_request, 1) == 0)
					wakeup(&bd_request);
			}
		}

		/* heavy-weight buf_daemon */
		if (bd_request_hw == 0) {
			if (dirtykvaspace > lodirtybufspace / 2 ||
			    dirtybufcounthw >= nbuf / 2) {
				if (atomic_fetchadd_int(&bd_request_hw, 1) == 0)
					wakeup(&bd_request_hw);
			}
		}
	}
}

/*
 * bd_heatup()
 *
 *	Get the buf_daemon heated up when the amount of running and dirty
 *	buffer space reaches the mid-point between lodirtybufspace and
 *	hidirtybufspace, or when at least nbuf / 2 buffers are dirty.
 *
 *	Return the number of bytes by which running + dirty space exceeds
 *	the second mid point (half way between the first mid point and
 *	hidirtybufspace) as a measure of how much excess dirty data there
 *	is in the system, or 0 if it does not.
 */
long
bd_heatup(void)
{
	long lower_mid;
	long upper_mid;
	long inflight;

	lower_mid = lodirtybufspace + (hidirtybufspace - lodirtybufspace) / 2;
	inflight = runningbufspace + dirtykvaspace;

	if (inflight < lower_mid && dirtybufcount < nbuf / 2)
		return (0);

	bd_speedup();

	upper_mid = lower_mid + (hidirtybufspace - lower_mid) / 2;
	if (inflight < upper_mid)
		return (0);
	return (inflight - upper_mid);
}

/*
 * bd_wait()
 *
 *	Wait for the buffer cache to flush (totalspace) bytes worth of
 *	buffers, then return.  A totalspace <= 0 returns immediately, as
 *	does a call made from either buf daemon thread.
 *
 *	Regardless of the amount requested, after each sleep the remaining
 *	wait is recomputed as the excess of runningbufspace + dirtykvaspace
 *	over hidirtybufspace, so the function keeps blocking while that
 *	total exceeds hidirtybufspace.
 *
 *	Each individual sleep is bounded by a timeout of hz ticks.
 */
void
bd_wait(long totalspace)
{
          u_int i;
          u_int j;
          u_int mi;
          int count;

          /* the daemons do the flushing; they must not wait on themselves */
          if (curthread == bufdaemonhw_td || curthread == bufdaemon_td)
                    return;

          while (totalspace > 0) {
                    bd_heatup();

                    /*
                     * Order is important.  Suppliers adjust bd_wake_index after
                     * updating runningbufspace/dirtykvaspace.  We want to fetch
                     * bd_wake_index before accessing.  Any error should thus
                     * be in our favor.
                     *
                     * The wait is clamped to what is actually outstanding and
                     * converted to a slot count of one slot per MAXBSIZE,
                     * capped at half the ring.
                     */
                    i = atomic_fetchadd_int(&bd_wake_index, 0);
                    if (totalspace > runningbufspace + dirtykvaspace)
                              totalspace = runningbufspace + dirtykvaspace;
                    count = totalspace / MAXBSIZE;
                    if (count >= BD_WAKE_SIZE / 2)
                              count = BD_WAKE_SIZE / 2;
                    i = i + count;
                    mi = i & BD_WAKE_MASK;

                    /*
                     * This is not a strict interlock, so we play a bit loose
                     * with locking access to dirtybufspace*.  We have to re-check
                     * bd_wake_index to ensure that it hasn't passed us.
                     *
                     * (int)(i - j) >= 0 is a wrap-safe test that the target
                     * slot has not yet been reached by bd_signal().
                     */
                    tsleep_interlock(&bd_wake_ary[mi], 0);
                    atomic_add_int(&bd_wake_ary[mi], 1);
                    j = atomic_fetchadd_int(&bd_wake_index, 0);
                    if ((int)(i - j) >= 0)
                              tsleep(&bd_wake_ary[mi], PINTERLOCKED, "flstik", hz);

                    /* keep waiting only for the excess over the high mark */
                    totalspace = runningbufspace + dirtykvaspace - hidirtybufspace;
          }
}

/*
 * bd_signal()
 *
 *	This function is called whenever runningbufspace or dirtykvaspace
 *	is reduced.  Track threads waiting for run+dirty buffer I/O
 *	complete.
 *
 *	bd_wake_index is advanced once per MAXBSIZE of (totalspace),
 *	rounding up, and at most BD_WAKE_SIZE times per call.  For each
 *	ring slot passed, any waiters registered there by bd_wait() are
 *	cleared and woken.  A totalspace <= 0 does nothing.
 */
static void
bd_signal(long totalspace)
{
	u_int slot;

	if (totalspace <= 0)
		return;

	/* never walk the ring more than once per call */
	if (totalspace > MAXBSIZE * BD_WAKE_SIZE)
		totalspace = MAXBSIZE * BD_WAKE_SIZE;

	for (; totalspace > 0; totalspace -= MAXBSIZE) {
		slot = atomic_fetchadd_int(&bd_wake_index, 1);
		slot &= BD_WAKE_MASK;
		if (atomic_readandclear_int(&bd_wake_ary[slot]))
			wakeup(&bd_wake_ary[slot]);
	}
}

/*
 * BIO tracking support routines.
 *
 * Release a ref on a bio_track.  Wakeup requests are atomically released
 * along with the last reference so bk_active will never wind up set to
 * only 0x80000000.
 *
 * Layout of bk_active as used here: bit 31 (0x80000000) is the "waiter
 * present" flag set by bio_track_wait(); the low 31 bits are the
 * reference count.
 */
static
void
bio_track_rel(struct bio_track *track)
{
          int       active;
          int       desired;

          /*
           * Shortcut: exactly one ref and no waiter flag, drop straight
           * to zero.  If the compare-and-set loses a race we fall through
           * to the general loop using the stale snapshot, which simply
           * fails its first compare-and-set and reloads.
           */
          active = track->bk_active;
          if (active == 1 && atomic_cmpset_int(&track->bk_active, 1, 0))
                    return;

          /*
           * Full-on.  Note that the wait flag is only atomically released on
           * the 1->0 count transition.
           *
           * We check for a negative count transition using bit 30 since bit 31
           * has a different meaning.
           */
          for (;;) {
                    /* decrement the count; keep the wait flag unless we hit 0 */
                    desired = (active & 0x7FFFFFFF) - 1;
                    if (desired)
                              desired |= active & 0x80000000;
                    if (atomic_cmpset_int(&track->bk_active, active, desired)) {
                              if (desired & 0x40000000)
                                        panic("bio_track_rel: bad count: %p", track);
                              /* a waiter was flagged in the value we replaced */
                              if (active & 0x80000000)
                                        wakeup(track);
                              break;
                    }
                    /* lost a race, reload and retry */
                    active = track->bk_active;
          }
}

/*
 * Wait for the tracking count to reach 0.
 *
 * Use atomic ops such that the wait flag is only set atomically when
 * bk_active is non-zero.
 *
 * slp_flags and slp_timo are passed through to tsleep().  Returns 0 once
 * bk_active has been observed as zero, or the first non-zero value
 * returned by tsleep(), in which case the wait is abandoned.
 */
int
bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo)
{
          int       active;
          int       desired;
          int       error;

          /*
           * Shortcut
           */
          if (track->bk_active == 0)
                    return(0);

          /*
           * Full-on.  Note that the wait flag may only be atomically set if
           * the active count is non-zero.
           *
           * NOTE: We cannot optimize active == desired since a wakeup could
           *         clear active prior to our tsleep_interlock().
           */
          error = 0;
          while ((active = track->bk_active) != 0) {
                    cpu_ccfence();
                    desired = active | 0x80000000;
                    /* arm the sleep before publishing the wait flag */
                    tsleep_interlock(track, slp_flags);
                    if (atomic_cmpset_int(&track->bk_active, active, desired)) {
                              error = tsleep(track, slp_flags | PINTERLOCKED,
                                               "trwait", slp_timo);
                              if (error)
                                        break;
                    }
                    /* compare-and-set failure: bk_active changed, re-sample */
          }
          return (error);
}

/*
 * bufinit:
 *
 *	Load time initialisation of the buffer cache, called from machine
 *	dependent initialization code (registered below via SYSINIT at
 *	SI_BOOT2_MACHDEP, SI_ORDER_FIRST).
 *
 *	Sets up the per-cpu buffer queues, initializes all nbuf buffer
 *	headers, computes the buffer space / dirty space / running space
 *	limits and allocates the bogus page.
 */
static
void
bufinit(void *dummy __unused)
{
	struct bufpcpu *pcpu;
	struct buf *bp;
	vm_offset_t bogus_offset;
	int cpu;
	int q;
	long n;

	/*
	 * Start with empty queues and an initialized spinlock on every
	 * active cpu.
	 */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		pcpu = &bufpcpu[cpu];
		spin_init(&pcpu->spin, "bufinit");
		for (q = 0; q < BUFFER_QUEUES; q++)
			TAILQ_INIT(&pcpu->bufqueues[q]);
	}

	/*
	 * Initialize each buffer header and stick it on an empty queue,
	 * handing the headers out to the cpus round-robin.  Each buffer
	 * gets its own MAXBSIZE KVA reservation inside buffer_map.
	 */
	cpu = 0;
	for (n = 0; n < nbuf; n++) {
		pcpu = &bufpcpu[cpu];
		bp = &buf[n];

		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;		/* just an empty header */
		bp->b_cmd = BUF_CMD_DONE;
		bp->b_qindex = BQUEUE_EMPTY;
		bp->b_qcpu = cpu;
		bp->b_kvabase = (void *)(vm_map_min(buffer_map) +
					 MAXBSIZE * n);
		bp->b_kvasize = MAXBSIZE;
		initbufbio(bp);
		xio_init(&bp->b_xio);
		buf_dep_init(bp);
		TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
				  bp, b_freelist);

		cpu = (cpu + 1) % ncpus;
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * Calculate hysteresis (lobufspace, hibufspace).  Don't make it
	 * too large or we might lockup a cpu for too long a period of
	 * time in our tight loop.  The gap is held to at most 64MB and at
	 * least MAXBSIZE.
	 */
	maxbufspace = nbuf * NBUFCALCSIZE;
	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
	lobufspace = hibufspace * 7 / 8;
	if (hibufspace - lobufspace > 64 * 1024 * 1024)
		lobufspace = hibufspace - 64 * 1024 * 1024;
	if (lobufspace > hibufspace - MAXBSIZE)
		lobufspace = hibufspace - MAXBSIZE;

	/*
	 * Reduce the chance of a deadlock occurring by limiting the number
	 * of delayed-write dirty buffers we allow to stack up.
	 *
	 * We don't want too much actually queued to the device at once
	 * (XXX this needs to be per-mount!), because the buffers will
	 * wind up locked for a very long period of time while the I/O
	 * drains.
	 */
	hidirtybufspace = hibufspace / 2;	/* dirty + running */
	lodirtybufspace = hidirtybufspace / 2;

	/* locked & queued to device, never less than 1MB */
	lorunningspace = 512 * 1024;
	hirunningspace = lmax(hibufspace / 16, 1024 * 1024);

	dirtykvaspace = 0;
	dirtybufspace = 0;
	dirtybufspacehw = 0;

	/*
	 * Allocate the bogus page: one page of pageable kernel address
	 * space, backed by a page from kernel_object, accounted as wired.
	 */
	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE,
					   VM_SUBSYS_BOGUS);
	vm_object_hold(kernel_object);
	bogus_page = vm_page_alloc(kernel_object,
				   (bogus_offset >> PAGE_SHIFT),
				   VM_ALLOC_NORMAL);
	vm_object_drop(kernel_object);
	vmstats.v_wire_count++;
}

SYSINIT(do_bufinit, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, bufinit, NULL);

/*
 * Initialize the embedded bio structures, typically used by
 * deprecated code which tries to allocate its own struct bufs.
 */
void
initbufbio(struct buf *bp)
{
          bp->b_bio1.bio_buf = bp;
          bp->b_bio1.bio_prev = NULL;
          bp->b_bio1.bio_offset = NOOFFSET;
          bp->b_bio1.bio_next = &bp->b_bio2;
          bp->b_bio1.bio_done = NULL;
          bp->b_bio1.bio_flags = 0;

          bp->b_bio2.bio_buf = bp;
          bp->b_bio2.bio_prev = &bp->b_bio1;
          bp->b_bio2.bio_offset = NOOFFSET;
          bp->b_bio2.bio_next = NULL;
          bp->b_bio2.bio_done = NULL;
          bp->b_bio2.bio_flags = 0;

          BUF_LOCKINIT(bp);
}

/*
 * Reinitialize the embedded bio structures as well as any additional
 * translation cache layers.
 *
 * Every bio reachable from b_bio1 through bio_next has its completion
 * callback cleared and its cached offset reset to NOOFFSET.  The
 * chain linkage itself is left as-is.
 */
void
reinitbufbio(struct buf *bp)
{
	struct bio *layer = &bp->b_bio1;

	while (layer != NULL) {
		layer->bio_done = NULL;
		layer->bio_offset = NOOFFSET;
		layer = layer->bio_next;
	}
}

/*
 * Undo the effects of an initbufbio(): let dsched release whatever it
 * attached to the buffer, then tear down the buffer lock.
 */
void
uninitbufbio(struct buf *bp)
{
	dsched_buf_exit(bp);
	BUF_LOCKFREE(bp);
}

/*
 * Push another BIO layer onto an existing BIO and return it.  The new
 * BIO layer may already exist, holding cached translation data.
 */
struct bio *
push_bio(struct bio *bio)
{
          struct bio *nbio;

          if ((nbio = bio->bio_next) == NULL) {
                    int index = bio - &bio->bio_buf->b_bio_array[0];
                    if (index >= NBUF_BIO - 1) {
                              panic("push_bio: too many layers %d for bp %p",
                                        index, bio->bio_buf);
                    }
                    nbio = &bio->bio_buf->b_bio_array[index + 1];
                    bio->bio_next = nbio;
                    nbio->bio_prev = bio;
                    nbio->bio_buf = bio->bio_buf;
                    nbio->bio_offset = NOOFFSET;
                    nbio->bio_done = NULL;
                    nbio->bio_next = NULL;
          }
          KKASSERT(nbio->bio_done == NULL);
          return(nbio);
}

/*
 * Pop a BIO translation layer, returning the previous layer.  The
 * layer must have been previously pushed.
 */
struct bio *
pop_bio(struct bio *bio)
{
	struct bio *previous = bio->bio_prev;

	return (previous);
}

void
clearbiocache(struct bio *bio)
{
          while (bio) {
                    bio->bio_offset = NOOFFSET;
                    bio = bio->bio_next;
          }
}

/*
 * Remove the buffer from the appropriate free list.
 * (caller must be locked)
 *
 * The buffer is unlinked from the queue named by b_qindex on the cpu
 * named by b_qcpu, and b_qindex becomes BQUEUE_NONE.  A buffer that is
 * already off-queue is tolerated only if its buffer lock is in use;
 * otherwise we panic.
 */
static __inline void
_bremfree(struct buf *bp)
{
	struct bufpcpu *pcpu;

	if (bp->b_qindex == BQUEUE_NONE) {
		if (!BUF_LOCKINUSE(bp))
			panic("bremfree: removing a buffer not on a queue");
		return;
	}

	KASSERT(BUF_LOCKINUSE(bp), ("bremfree: bp %p not locked", bp));
	pcpu = &bufpcpu[bp->b_qcpu];
	TAILQ_REMOVE(&pcpu->bufqueues[bp->b_qindex], bp, b_freelist);
	bp->b_qindex = BQUEUE_NONE;
}

/*
 * bremfree() - must be called with a locked buffer
 *
 * Takes the spinlock of the cpu that owns the buffer's queue around
 * the actual removal.
 */
void
bremfree(struct buf *bp)
{
	struct spinlock *qspin = &bufpcpu[bp->b_qcpu].spin;

	spin_lock(qspin);
	_bremfree(bp);
	spin_unlock(qspin);
}

/*
 * bremfree_locked - must be called with pcpu->spin locked
 *
 * Same removal as bremfree(), for callers that already hold the
 * owning cpu's queue spinlock.
 */
static void
bremfree_locked(struct buf *bp)
{
	_bremfree(bp);
}

/*
 * breadcb:
 *
 *        This version of bread issues any required I/O asynchronously and
 *        makes a callback on completion.
 *
 *        The callback must check whether BIO_DONE is set in the bio and issue
 *        the bpdone(bp, 0) if it isn't.  The callback is responsible for
 *        clearing BIO_DONE and disposing of the I/O (bqrelse()ing it).
 *
 *        If the buffer is already cached the callback (if any) is made
 *        synchronously from here with BIO_DONE set.  If the buffer is cached
 *        and no callback was supplied the buffer is simply bqrelse()'d.
 */
void
breadcb(struct vnode *vp, off_t loffset, int size, int bflags,
          void (*func)(struct bio *), void *arg)
{
          struct buf *bp;
          struct bio *bio;

          bp = getblk(vp, loffset, size, 0, 0);
          bio = &bp->b_bio1;

          if (bp->b_flags & B_CACHE) {
                    /*
                     * Found in the cache, no I/O is needed.
                     */
                    if (func == NULL) {
                              bqrelse(bp);
                              return;
                    }

                    /*
                     * Since we are issuing the callback synchronously it
                     * cannot race the BIO_DONE, so no need for atomic ops
                     * here.  Note that bio_done is not set on this path.
                     */
                    bio->bio_caller_info1.ptr = arg;
                    bio->bio_flags |= BIO_DONE;
                    func(bio);
                    return;
          }

          /*
           * Not found in the cache, issue the read.  The callback is
           * installed as the bio's completion function.
           */
          bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL | B_NOTMETA);
          bp->b_flags |= bflags;
          bp->b_cmd = BUF_CMD_READ;
          bio->bio_done = func;
          bio->bio_caller_info1.ptr = arg;
          vfs_busy_pages(vp, bp);
          BUF_KERNPROC(bp);
          vn_strategy(vp, bio);
}

/*
 * breadnx() - Terminal function for bread() and breadn().
 *
 * This function will start asynchronous I/O on read-ahead blocks as well
 * as satisfy the primary request.
 *
 * If *bpp is non-NULL it is used as the primary buffer, otherwise the
 * buffer is obtained with getblk() and stored in *bpp.  raoffset[] and
 * rabsize[] supply cnt read-ahead (offset, size) pairs.
 *
 * We must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is
 * set, the buffer is valid and we do not have to do anything.
 *
 * Returns 0 if no read had to be issued for the primary buffer, otherwise
 * the result of biowait() on it.
 */
int
breadnx(struct vnode *vp, off_t loffset, int size, int bflags,
          off_t *raoffset, int *rabsize,
          int cnt, struct buf **bpp)
{
          struct buf *bp;
          struct buf *ra;
          int need_wait = 0;
          int idx;

          /*
           * Locate or acquire the primary buffer.
           */
          bp = *bpp;
          if (bp == NULL) {
                    int getflags;

                    getflags = (bflags & B_KVABIO) ? GETBLK_KVABIO : 0;
                    bp = getblk(vp, loffset, size, getflags, 0);
                    *bpp = bp;
          }

          /*
           * Primary request: if not found in cache, start a synchronous
           * read which we wait for after the read-aheads are queued.
           */
          if ((bp->b_flags & B_CACHE) == 0) {
                    struct bio *bio = &bp->b_bio1;

                    bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL | B_NOTMETA);
                    bp->b_flags |= bflags;
                    bp->b_cmd = BUF_CMD_READ;
                    bio->bio_done = biodone_sync;
                    bio->bio_flags |= BIO_SYNC;
                    vfs_busy_pages(vp, bp);
                    vn_strategy(vp, bio);
                    need_wait = 1;
          }

          /*
           * Read-ahead requests.  Blocks already in memory are skipped and
           * blocks whose buffer is already valid are released immediately.
           * B_KVABIO from the caller's bflags is not propagated to the
           * read-ahead buffers.
           */
          for (idx = 0; idx < cnt; ++idx) {
                    if (inmem(vp, raoffset[idx]))
                              continue;
                    ra = getblk(vp, raoffset[idx], rabsize[idx],
                                GETBLK_KVABIO, 0);
                    if (ra->b_flags & B_CACHE) {
                              brelse(ra);
                              continue;
                    }
                    ra->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL | B_NOTMETA);
                    ra->b_flags |= (bflags & ~B_KVABIO);
                    ra->b_cmd = BUF_CMD_READ;
                    vfs_busy_pages(vp, ra);
                    BUF_KERNPROC(ra);
                    vn_strategy(vp, &ra->b_bio1);
          }

          if (need_wait == 0)
                    return (0);
          return (biowait(&bp->b_bio1, "biord"));
}

/*
 * bwrite:
 *
 *        Synchronous write, waits for completion.
 *
 *        Write, release buffer on completion.  (Done by iodone
 *        if async).  Do not bother writing anything if the buffer
 *        is invalid.
 *
 *        Note that we set B_CACHE here, indicating that buffer is
 *        fully valid and thus cacheable.  This is true even of NFS
 *        now so we set it generally.  This could be set either here
 *        or in biodone() since the I/O is synchronous.  We put it
 *        here.
 */
int
bwrite(struct buf *bp)
{
          int error;

          if (bp->b_flags & B_INVAL) {
                    brelse(bp);
                    return (0);
          }
          if (BUF_LOCKINUSE(bp) == 0)
                    panic("bwrite: buffer is not busy???");

          /*
           * NOTE: We no longer mark the buffer clear prior to the vn_strategy()
           *         call because it will remove the buffer from the vnode's
           *         dirty buffer list prematurely and possibly cause filesystem
           *         checks to race buffer flushes.  This is now handled in
           *         bpdone().
           *
           *         bundirty(bp); REMOVED
           */

          bp->b_flags &= ~(B_ERROR | B_EINTR);
          bp->b_flags |= B_CACHE;
          bp->b_cmd = BUF_CMD_WRITE;
          bp->b_error = 0;
          bp->b_bio1.bio_done = biodone_sync;
          bp->b_bio1.bio_flags |= BIO_SYNC;
          vfs_busy_pages(bp->b_vp, bp);

          /*
           * Normal bwrites pipeline writes.  NOTE: b_bufsize is only
           * valid for vnode-backed buffers.
           */
          bsetrunningbufspace(bp, bp->b_bufsize);
          vn_strategy(bp->b_vp, &bp->b_bio1);
          error = biowait(&bp->b_bio1, "biows");
          brelse(bp);

          return (error);
}

/*
 * bawrite:
 *
 *        Asynchronous write.  Start output on a buffer, but do not wait for
 *        it to complete.  The buffer is released when the output completes.
 *
 *        bwrite() ( or the VOP routine anyway ) is responsible for handling
 *        B_INVAL buffers.  Not us.
 */
void
bawrite(struct buf *bp)
{
          if (bp->b_flags & B_INVAL) {
                    brelse(bp);
                    return;
          }
          if (BUF_LOCKINUSE(bp) == 0)
                    panic("bawrite: buffer is not busy???");

          /*
           * NOTE: We no longer mark the buffer clear prior to the vn_strategy()
           *         call because it will remove the buffer from the vnode's
           *         dirty buffer list prematurely and possibly cause filesystem
           *         checks to race buffer flushes.  This is now handled in
           *         bpdone().
           *
           *         bundirty(bp); REMOVED
           */
          bp->b_flags &= ~(B_ERROR | B_EINTR);
          bp->b_flags |= B_CACHE;
          bp->b_cmd = BUF_CMD_WRITE;
          bp->b_error = 0;
          KKASSERT(bp->b_bio1.bio_done == NULL);
          vfs_busy_pages(bp->b_vp, bp);

          /*
           * Normal bwrites pipeline writes.  NOTE: b_bufsize is only
           * valid for vnode-backed buffers.
           */
          bsetrunningbufspace(bp, bp->b_bufsize);
          BUF_KERNPROC(bp);
          vn_strategy(bp->b_vp, &bp->b_bio1);
}

/*
 * bdwrite:
 *
 *        Delayed write. (Buffer is marked dirty).  Do not bother writing
 *        anything if the buffer is marked invalid.
 *
 *        Note that since the buffer must be completely valid, we can safely
 *        set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 *        biodone() in order to prevent getblk from writing the buffer
 *        out synchronously.
 *
 *        The buffer must be locked on entry (we panic otherwise) and is
 *        released via bqrelse(), or via brelse() if it is B_INVAL.
 */
void
bdwrite(struct buf *bp)
{
          if (BUF_LOCKINUSE(bp) == 0)
                    panic("bdwrite: buffer is not busy");

          /*
           * An invalid buffer is simply thrown away, it is not dirtied.
           */
          if (bp->b_flags & B_INVAL) {
                    brelse(bp);
                    return;
          }
          bdirty(bp);

          dsched_buf_enter(bp);         /* might stack */

          /*
           * Set B_CACHE, indicating that the buffer is fully valid.  This is
           * true even of NFS now.
           */
          bp->b_flags |= B_CACHE;

          /*
           * This bmap keeps the system from needing to do the bmap later,
           * perhaps when the system is attempting to do a sync.  Since it
           * is likely that the indirect block -- or whatever other datastructure
           * that the filesystem needs is still in memory now, it is a good
           * thing to do this.  Note also, that if the pageout daemon is
           * requesting a sync -- there might not be enough memory to do
           * the bmap then...  So, this is important to do.
           *
           * The bmap is only done if the second BIO layer does not already
           * have an offset.  The VOP_BMAP() return value is not checked.
           */
          if (bp->b_bio2.bio_offset == NOOFFSET) {
                    VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
                               NULL, NULL, BUF_CMD_WRITE);
          }

          /*
           * Because the underlying pages may still be mapped and
           * writable trying to set the dirty buffer (b_dirtyoff/end)
           * range here will be inaccurate.
           *
           * However, we must still clean the pages to satisfy the
           * vnode_pager and pageout daemon, so they think the pages
           * have been "cleaned".  What has really occurred is that
           * they've been earmarked for later writing by the buffer
           * cache.
           *
           * So we get the b_dirtyoff/end update but will not actually
           * depend on it (NFS that is) until the pages are busied for
           * writing later on.
           */
          vfs_clean_pages(bp);
          bqrelse(bp);

          /*
           * note: we cannot initiate I/O from a bdwrite even if we wanted to,
           * due to the softdep code.
           */
}

/*
 * buwrite:
 *
 *        Fake write - return pages to VM system as dirty, leave the buffer
 *        clean.  This is used by tmpfs.
 *
 *        It is important for any VFS using this routine to NOT use it for
 *        IO_SYNC or IO_ASYNC operations which occur when the system really
 *        wants to flush VM pages to backing store.
 */
void
buwrite(struct buf *bp)
{
          int idx;

          /*
           * Fast path: a VMIO buffer which is not already marked for
           * delayed-write.  Mark each of its pages as needing a commit
           * and requeue the buffer.
           */
          if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_DELWRI) == 0) {
                    for (idx = 0; idx < bp->b_xio.xio_npages; ++idx) {
                              vm_page_t pg;

                              pg = bp->b_xio.xio_pages[idx];
                              vm_page_need_commit(pg);
                    }
                    bqrelse(bp);
                    return;
          }

          /*
           * Only works for VMIO buffers.  If the buffer is already
           * marked for delayed-write we can't avoid the bdwrite().
           */
          bdwrite(bp);
}

/*
 * bdirty:
 *
 *        Turn buffer into delayed write request by marking it B_DELWRI.
 *        B_RELBUF and B_NOCACHE are cleared here (clearing B_NOCACHE is
 *        reported on the console).
 *
 *        We reassign the buffer to itself to properly update it in the
 *        dirty/clean lists.
 *
 *        Must be called from a critical section.
 *        The buffer must be on BQUEUE_NONE.
 */
void
bdirty(struct buf *bp)
{
          KASSERT(bp->b_qindex == BQUEUE_NONE,
                    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));

          if (bp->b_flags & B_NOCACHE) {
                    kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp);
                    bp->b_flags &= ~B_NOCACHE;
          }
          if (bp->b_flags & B_INVAL)
                    kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp);
          bp->b_flags &= ~B_RELBUF;

          /*
           * Already marked for delayed-write, the dirty accounting has
           * been done previously.
           */
          if (bp->b_flags & B_DELWRI)
                    return;

          /*
           * Flag the buffer and move it between the vnode's lists while
           * holding the vnode token.
           */
          lwkt_gettoken(&bp->b_vp->v_token);
          bp->b_flags |= B_DELWRI;
          reassignbuf(bp);
          lwkt_reltoken(&bp->b_vp->v_token);

          /*
           * Dirty buffer accounting, with separate counters for B_HEAVY
           * buffers.
           */
          atomic_add_long(&dirtybufcount, 1);
          atomic_add_long(&dirtykvaspace, bp->b_kvasize);
          atomic_add_long(&dirtybufspace, bp->b_bufsize);
          if (bp->b_flags & B_HEAVY) {
                    atomic_add_long(&dirtybufcounthw, 1);
                    atomic_add_long(&dirtybufspacehw, bp->b_bufsize);
          }
          bd_heatup();
}

/*
 * bheavy:
 *
 *        Set B_HEAVY, indicating that this is a heavy-weight buffer that
 *        needs to be flushed with a different buf_daemon thread to avoid
 *        deadlocks.  B_HEAVY also imposes restrictions in getnewbuf().
 *
 *        If the buffer is already B_DELWRI the heavy-weight dirty counters
 *        are adjusted to include it.  Calling this on a buffer which is
 *        already B_HEAVY does nothing.
 */
void
bheavy(struct buf *bp)
{
          if (bp->b_flags & B_HEAVY)
                    return;
          bp->b_flags |= B_HEAVY;

          if ((bp->b_flags & B_DELWRI) == 0)
                    return;
          atomic_add_long(&dirtybufcounthw, 1);
          atomic_add_long(&dirtybufspacehw, bp->b_bufsize);
}

/*
 * bundirty:
 *
 *        Clear B_DELWRI for buffer and back out the dirty buffer accounting
 *        that bdirty() added.  Does nothing to the accounting if B_DELWRI
 *        is not set.  B_DEFERRED is cleared unconditionally.
 *
 *        Must be called from a critical section.
 *
 *        The buffer is typically on BQUEUE_NONE but there is one case in
 *        brelse() that calls this function after placing the buffer on
 *        a different queue.
 */
void
bundirty(struct buf *bp)
{
          if (bp->b_flags & B_DELWRI) {
                    /*
                     * Clear the flag and reassign the buffer while holding
                     * the vnode token.
                     */
                    lwkt_gettoken(&bp->b_vp->v_token);
                    bp->b_flags &= ~B_DELWRI;
                    reassignbuf(bp);
                    lwkt_reltoken(&bp->b_vp->v_token);

                    /*
                     * Undo the dirty accounting, including the heavy-weight
                     * counters if the buffer is B_HEAVY.
                     */
                    atomic_add_long(&dirtybufcount, -1);
                    atomic_add_long(&dirtykvaspace, -bp->b_kvasize);
                    atomic_add_long(&dirtybufspace, -bp->b_bufsize);
                    if (bp->b_flags & B_HEAVY) {
                              atomic_add_long(&dirtybufcounthw, -1);
                              atomic_add_long(&dirtybufspacehw, -bp->b_bufsize);
                    }
                    bd_signal(bp->b_bufsize);
          }
          /*
           * Since it is now being written, we can clear its deferred write flag.
           */
          bp->b_flags &= ~B_DEFERRED;
}

/*
 * bsetrunningbufspace:
 *
 *        Set the b_runningbufspace field, used to track how much I/O is
 *        in progress at any given moment.
 *
 *        The global runningbufspace / runningbufcount counters are only
 *        bumped when bytes is non-zero.
 */
void
bsetrunningbufspace(struct buf *bp, int bytes)
{
          bp->b_runningbufspace = bytes;
          if (bytes == 0)
                    return;

          atomic_add_long(&runningbufspace, bytes);
          atomic_add_long(&runningbufcount, 1);
}

/*
 * brelse:
 *
 *        Release a busy buffer and, if requested, free its resources.  The
 *        buffer will be stashed in the appropriate bufqueue[] allowing it
 *        to be accessed later as a cache entity or reused for other purposes.
 *
 *        Overall sequence:
 *
 *        (1) Resolve the B_NOCACHE / B_INVAL / B_ERROR / B_DELWRI flag
 *            combinations and decide whether B_RELBUF applies.
 *        (2) Run down the VM pages (VMIO) or the buffer memory (non-VMIO).
 *        (3) Insert the buffer into a queue of its per-cpu area.
 *        (4) Issue wakeups, clear temporary flags, and unlock the buffer.
 *
 *        The buffer must not be on any queue (BQUEUE_NONE) when we get to
 *        step (3) or we panic.
 */
void
brelse(struct buf *bp)
{
          struct bufpcpu *pcpu;
#ifdef INVARIANTS
          /* flags as of entry, only reported by the KASSERTs below */
          int saved_flags = bp->b_flags;
#endif

          KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
                    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

          /*
           * If B_NOCACHE is set we are being asked to destroy the buffer and
           * its backing store.  Clear B_DELWRI.
           *
           * B_NOCACHE is set in two cases: (1) when the caller really wants
           * to destroy the buffer and backing store and (2) when the caller
           * wants to destroy the buffer and backing store after a write
           * completes.
           */
          if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) {
                    bundirty(bp);
          }

          if ((bp->b_flags & (B_INVAL | B_DELWRI)) == B_DELWRI) {
                    /*
                     * A re-dirtied buffer is only subject to destruction
                     * by B_INVAL.  B_ERROR and B_NOCACHE are ignored.
                     */
                    /* leave buffer intact */
          } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
                       (bp->b_bufsize <= 0)) {
                    /*
                     * Either a failed read or we were asked to free or not
                     * cache the buffer.  This path is reached with B_DELWRI
                     * set only if B_INVAL is already set.  B_NOCACHE governs
                     * backing store destruction.
                     *
                     * NOTE: HAMMER will set B_LOCKED in buf_deallocate if the
                     * buffer cannot be immediately freed.
                     */
                    bp->b_flags |= B_INVAL;
                    if (LIST_FIRST(&bp->b_dep) != NULL)
                              buf_deallocate(bp);
                    /*
                     * Back out the dirty accounting by hand; B_DELWRI is
                     * cleared directly below rather than via bundirty().
                     */
                    if (bp->b_flags & B_DELWRI) {
                              atomic_add_long(&dirtybufcount, -1);
                              atomic_add_long(&dirtykvaspace, -bp->b_kvasize);
                              atomic_add_long(&dirtybufspace, -bp->b_bufsize);
                              if (bp->b_flags & B_HEAVY) {
                                        atomic_add_long(&dirtybufcounthw, -1);
                                        atomic_add_long(&dirtybufspacehw,
                                                            -bp->b_bufsize);
                              }
                              bd_signal(bp->b_bufsize);
                    }
                    bp->b_flags &= ~(B_DELWRI | B_CACHE);
          }

          /*
           * We must clear B_RELBUF if B_DELWRI or B_LOCKED is set,
           * or if b_refs is non-zero.
           *
           * If vfs_vmio_release() is called with either bit set, the
           * underlying pages may wind up getting freed causing a previous
           * write (bdwrite()) to get 'lost' because pages associated with
           * a B_DELWRI bp are marked clean.  Pages associated with a
           * B_LOCKED buffer may be mapped by the filesystem.
           *
           * If we want to release the buffer ourselves (rather than the
           * originator asking us to release it), give the originator a
           * chance to countermand the release by setting B_LOCKED.
           *
           * We still allow the B_INVAL case to call vfs_vmio_release(), even
           * if B_DELWRI is set.
           *
           * If B_DELWRI is not set we may have to set B_RELBUF if we are low
           * on pages to return pages to the VM page queues.
           */
          if ((bp->b_flags & (B_DELWRI | B_LOCKED)) || bp->b_refs) {
                    bp->b_flags &= ~B_RELBUF;
          } else if (vm_paging_min()) {
                    if (LIST_FIRST(&bp->b_dep) != NULL)
                              buf_deallocate(bp);           /* can set B_LOCKED */
                    if (bp->b_flags & (B_DELWRI | B_LOCKED))
                              bp->b_flags &= ~B_RELBUF;
                    else
                              bp->b_flags |= B_RELBUF;
          }

          /*
           * Make sure b_cmd is clear.  It may have already been cleared by
           * biodone().
           *
           * At this point destroying the buffer is governed by the B_INVAL
           * or B_RELBUF flags.
           */
          bp->b_cmd = BUF_CMD_DONE;
          dsched_buf_exit(bp);

          /*
           * VMIO buffer rundown.  Make sure the VM page array is restored
           * after an I/O may have replaced some of the pages with bogus pages
           * in order to not destroy dirty pages in a fill-in read.
           *
           * Note that due to the code above, if a buffer is marked B_DELWRI
           * then the B_RELBUF and B_NOCACHE bits will always be clear.
           * B_INVAL may still be set, however.
           *
           * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer
           * but not the backing store.   B_NOCACHE will destroy the backing
           * store.
           *
           * Note that dirty NFS buffers contain byte-granular write ranges
           * and should not be destroyed w/ B_INVAL even if the backing store
           * is left intact.
           */
          if (bp->b_flags & B_VMIO) {
                    /*
                     * Rundown for VMIO buffers which are not dirty NFS buffers.
                     */
                    int i, j, resid;
                    vm_page_t m;
                    off_t foff;
                    vm_pindex_t poff;
                    vm_object_t obj;
                    struct vnode *vp;

                    vp = bp->b_vp;

                    /*
                     * Get the base offset and length of the buffer.  Note that
                     * in the VMIO case if the buffer block size is not
                     * page-aligned then b_data pointer may not be page-aligned.
                     * But our b_xio.xio_pages array *IS* page aligned.
                     *
                     * block sizes less than DEV_BSIZE (usually 512) are not
                     * supported due to the page granularity bits (m->valid,
                     * m->dirty, etc...).
                     *
                     * See man buf(9) for more information
                     */

                    resid = bp->b_bufsize;
                    foff = bp->b_loffset;

                    for (i = 0; i < bp->b_xio.xio_npages; i++) {
                              m = bp->b_xio.xio_pages[i];

                              /*
                               * If we hit a bogus page, fixup *all* of them
                               * now.  Note that we left these pages wired
                               * when we removed them so they had better exist,
                               * and they cannot be ripped out from under us so
                               * no critical section protection is necessary.
                               */
                              if (m == bogus_page) {
                                        obj = vp->v_object;
                                        poff = OFF_TO_IDX(bp->b_loffset);

                                        /*
                                         * Replace every remaining bogus entry
                                         * with the real page looked up in the
                                         * vnode's VM object.
                                         */
                                        vm_object_hold(obj);
                                        for (j = i; j < bp->b_xio.xio_npages; j++) {
                                                  vm_page_t mtmp;

                                                  mtmp = bp->b_xio.xio_pages[j];
                                                  if (mtmp == bogus_page) {
                                                            if ((bp->b_flags & B_HASBOGUS) == 0)
                                                                      panic("brelse: bp %p corrupt bogus", bp);
                                                            mtmp = vm_page_lookup(obj, poff + j);
                                                            if (!mtmp)
                                                                      panic("brelse: bp %p page %d missing", bp, j);
                                                            bp->b_xio.xio_pages[j] = mtmp;
                                                  }
                                        }
                                        vm_object_drop(obj);

                                        /*
                                         * Re-enter the corrected page array
                                         * into the buffer's KVA mapping.
                                         */
                                        if ((bp->b_flags & B_HASBOGUS) ||
                                            (bp->b_flags & B_INVAL) == 0) {
                                                  pmap_qenter_noinval(
                                                      trunc_page((vm_offset_t)bp->b_data),
                                                      bp->b_xio.xio_pages,
                                                      bp->b_xio.xio_npages);
                                                  bp->b_flags &= ~B_HASBOGUS;
                                                  bp->b_flags |= B_KVABIO;
                                                  bkvareset(bp);
                                        }
                                        /* reload, entry i was just replaced */
                                        m = bp->b_xio.xio_pages[i];
                              }

                              /*
                               * Invalidate the backing store if B_NOCACHE is set
                               * (e.g. used with vinvalbuf()).  If this is NFS
                               * we impose a requirement that the block size be
                               * a multiple of PAGE_SIZE and create a temporary
                               * hack to basically invalidate the whole page.  The
                               * problem is that NFS uses really odd buffer sizes
                               * especially when tracking piecemeal writes and
                               * it also vinvalbuf()'s a lot, which would result
                               * in only partial page validation and invalidation
                               * here.  If the file page is mmap()'d, however,
                               * all the valid bits get set so after we invalidate
                               * here we would end up with weird m->valid values
                               * like 0xfc.  nfs_getpages() can't handle this so
                               * we clear all the valid bits for the NFS case
                               * instead of just some of them.
                               *
                               * The real bug is the VM system having to set m->valid
                               * to VM_PAGE_BITS_ALL for faulted-in pages, which
                               * itself is an artifact of the whole 512-byte
                               * granular mess that exists to support odd block
                               * sizes and UFS meta-data block sizes (e.g. 6144).
                               * A complete rewrite is required.
                               *
                               * XXX
                               */
                              if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
                                        int poffset = foff & PAGE_MASK;
                                        int presid;

                                        /*
                                         * presid is the number of bytes to
                                         * invalidate within this page, clipped
                                         * to what remains of the buffer except
                                         * in the NFS regular file case.
                                         */
                                        presid = PAGE_SIZE - poffset;
                                        if (bp->b_vp->v_tag == VT_NFS &&
                                            bp->b_vp->v_type == VREG) {
                                                  ; /* entire page */
                                        } else if (presid > resid) {
                                                  presid = resid;
                                        }
                                        KASSERT(presid >= 0, ("brelse: extra page"));
                                        vm_page_set_invalid(m, poffset, presid);

                                        /*
                                         * Also make sure any swap cache is removed
                                         * as it is now stale (HAMMER in particular
                                         * uses B_NOCACHE to deal with buffer
                                         * aliasing).
                                         */
                                        swap_pager_unswapped(m);
                              }
                              /*
                               * Account for the bytes covered by this page
                               * and advance foff to the next page boundary.
                               */
                              resid -= PAGE_SIZE - (foff & PAGE_MASK);
                              foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
                    }
                    if (bp->b_flags & (B_INVAL | B_RELBUF))
                              vfs_vmio_release(bp);
          } else {
                    /*
                     * Rundown for non-VMIO buffers.
                     *
                     * XXX With B_MALLOC buffers removed, there should no longer
                     * be any situation where brelse() is called on a non B_VMIO
                     * buffer.  Recommend assertion here.  XXX
                     */
                    if (bp->b_flags & (B_INVAL | B_RELBUF)) {
                              if (bp->b_bufsize)
                                        allocbuf(bp, 0);
                              KKASSERT (LIST_FIRST(&bp->b_dep) == NULL);
                              if (bp->b_vp)
                                        brelvp(bp);
                    }
          }

          if (bp->b_qindex != BQUEUE_NONE)
                    panic("brelse: free buffer onto another queue???");

          /*
           * Figure out the correct queue to place the cleaned up buffer on.
           * Buffers placed in the EMPTY or EMPTYKVA had better already be
           * disassociated from their vnode.
           *
           * Return the buffer to its original pcpu area
           */
          pcpu = &bufpcpu[bp->b_qcpu];
          spin_lock(&pcpu->spin);

          if (bp->b_flags & B_LOCKED) {
                    /*
                     * Buffers that are locked are placed in the locked queue
                     * immediately, regardless of their state.
                     */
                    bp->b_qindex = BQUEUE_LOCKED;
                    TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
                                          bp, b_freelist);
          } else if (bp->b_bufsize == 0) {
                    /*
                     * Buffers with no memory.  Due to conditionals near the top
                     * of brelse() such buffers should probably already be
                     * marked B_INVAL and disassociated from their vnode.
                     */
                    bp->b_flags |= B_INVAL;
                    KASSERT(bp->b_vp == NULL,
                              ("bp1 %p flags %08x/%08x vnode %p "
                               "unexpectededly still associated!",
                              bp, saved_flags, bp->b_flags, bp->b_vp));
                    KKASSERT((bp->b_flags & B_HASHED) == 0);
                    bp->b_qindex = BQUEUE_EMPTY;
                    TAILQ_INSERT_HEAD(&pcpu->bufqueues[bp->b_qindex],
                                          bp, b_freelist);
          } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) {
                    /*
                     * Buffers with junk contents.   Again these buffers had better
                     * already be disassociated from their vnode.  They go to the
                     * head of the clean queue.
                     */
                    KASSERT(bp->b_vp == NULL,
                              ("bp2 %p flags %08x/%08x vnode %p unexpectededly "
                               "still associated!",
                              bp, saved_flags, bp->b_flags, bp->b_vp));
                    KKASSERT((bp->b_flags & B_HASHED) == 0);
                    bp->b_flags |= B_INVAL;
                    bp->b_qindex = BQUEUE_CLEAN;
                    TAILQ_INSERT_HEAD(&pcpu->bufqueues[bp->b_qindex],
                                          bp, b_freelist);
          } else {
                    /*
                     * Remaining buffers.  These buffers are still associated with
                     * their vnode.  Dirty buffers go to the tail of the dirty
                     * queue matching their B_HEAVY state, everything else goes
                     * to the tail of the clean queue.
                     */
                    switch(bp->b_flags & (B_DELWRI|B_HEAVY)) {
                    case B_DELWRI:
                              bp->b_qindex = BQUEUE_DIRTY;
                              TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
                                                    bp, b_freelist);
                              break;
                    case B_DELWRI | B_HEAVY:
                              bp->b_qindex = BQUEUE_DIRTY_HW;
                              TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
                                                    bp, b_freelist);
                              break;
                    default:
                              /*
                               * NOTE: Buffers are always placed at the end of the
                               * queue.  If B_AGE is not set the buffer will cycle
                               * through the queue twice.
                               */
                              bp->b_qindex = BQUEUE_CLEAN;
                              TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
                                                    bp, b_freelist);
                              break;
                    }
          }
          spin_unlock(&pcpu->spin);

          /*
           * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
           * on the correct queue but we have not yet unlocked it.
           */
          if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
                    bundirty(bp);

          /*
           * The bp is on an appropriate queue unless locked.  If it is not
           * locked or dirty we can wakeup threads waiting for buffer space.
           *
           * We've already handled the B_INVAL case ( B_DELWRI will be clear
           * if B_INVAL is set ).
           */
          if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0)
                    bufcountwakeup();

          /*
           * Something we can maybe free or reuse
           */
          if (bp->b_bufsize || bp->b_kvasize)
                    bufspacewakeup();

          /*
           * Clean up temporary flags and unlock the buffer.
           */
          bp->b_flags &= ~(B_NOCACHE | B_RELBUF | B_DIRECT);
          BUF_UNLOCK(bp);
}

/*
 * bqrelse:
 *
 *        Release a buffer back to the appropriate queue but do not try to free
 *        it.  The buffer is expected to be used again soon.
 *
 *        bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 *        biodone() to requeue an async I/O on completion.  It is also used when
 *        known good buffers need to be requeued but we think we may need the data
 *        again soon.
 *
 *        XXX we should be able to leave the B_RELBUF hint set on completion.
 */
void
bqrelse(struct buf *bp)
{
          struct bufpcpu *pcpu;

          KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
                    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

          if (bp->b_qindex != BQUEUE_NONE)
                    panic("bqrelse: free buffer onto another queue???");

          buf_act_advance(bp);

          pcpu = &bufpcpu[bp->b_qcpu];
          spin_lock(&pcpu->spin);

          if (bp->b_flags & B_LOCKED) {
                    /*
                     * Locked buffers are released to the locked queue.  However,
                     * if the buffer is dirty it will first go into the dirty
                     * queue and later on after the I/O completes successfully it
                     * will be released to the locked queue.
                     */
                    bp->b_qindex = BQUEUE_LOCKED;
                    TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
                                          bp, b_freelist);
          } else if (bp->b_flags & B_DELWRI) {
                    bp->b_qindex = (bp->b_flags & B_HEAVY) ?
                                     BQUEUE_DIRTY_HW : BQUEUE_DIRTY;
                    TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
                                          bp, b_freelist);
          } else if (vm_paging_min()) {
                    /*
                     * We are too low on memory, we have to try to free the
                     * buffer (most importantly: the wired pages making up its
                     * backing store) *now*.
                     */
                    spin_unlock(&pcpu->spin);
                    brelse(bp);
                    return;
          } else {
                    bp->b_qindex = BQUEUE_CLEAN;
                    TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
                                          bp, b_freelist);
          }
          spin_unlock(&pcpu->spin);

          /*
           * We have now placed the buffer on the proper queue, but have yet
           * to unlock it.
           */
          if ((bp->b_flags & B_LOCKED) == 0 &&
              ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) {
                    bufcountwakeup();
          }

          /*
           * Something we can maybe free or reuse.
           */
          if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
                    bufspacewakeup();

          /*
           * Final cleanup and unlock.  Clear bits that are only used while a
           * buffer is actively locked.
           */
          bp->b_flags &= ~(B_NOCACHE | B_RELBUF);
          dsched_buf_exit(bp);
          BUF_UNLOCK(bp);
}

/*
 * Hold a buffer, preventing it from being reused.  This will prevent
 * normal B_RELBUF operations on the buffer but will not prevent B_INVAL
 * operations.  If a B_INVAL operation occurs the buffer will remain held
 * but the underlying pages may get ripped out.
 *
 * These functions are typically used in VOP_READ/VOP_WRITE functions
 * to hold a buffer during a copyin or copyout, preventing deadlocks
 * or recursive lock panics when read()/write() is used over mmap()'d
 * space.
 *
 * NOTE: bqhold() requires that the buffer be locked at the time of the
 *         hold.  bqdrop() has no requirements other than the buffer having
 *         previously been held.
 */
void
bqhold(struct buf *bp)
{
          /*
           * Take one hold reference on the buffer.  getnewbuf() refuses to
           * reuse a buffer whose b_refs is non-zero.
           */
          atomic_add_int(&bp->b_refs, 1);
}

void
bqdrop(struct buf *bp)
{
          /* Dropping a hold that was never taken is a caller bug. */
          KKASSERT(bp->b_refs > 0);
          /* Release one hold reference previously taken on the buffer. */
          atomic_add_int(&bp->b_refs, -1);
}

/*
 * Give the pages backing buffer 'bp' back to the VM system and strip the
 * buffer of its VMIO state.  Called when the bp is invalidated, released,
 * or reused.
 *
 * The KVA mapping (b_data) for the underlying pages is removed here, the
 * buffer's b_bufsize is returned to the global bufspace accounting, and
 * any vnode association is dropped.
 *
 * WARNING! This routine is integral to the low memory critical path
 *          when a buffer is B_RELBUF'd.  If the system has a severe page
 *          deficit we need to get the page(s) onto the PQ_FREE or PQ_CACHE
 *          queues so they can be reused in the current pageout daemon
 *          pass.
 */
static void
vfs_vmio_release(struct buf *bp)
{
          vm_page_t pg;
          int idx;

          for (idx = 0; idx < bp->b_xio.xio_npages; ++idx) {
                    pg = bp->b_xio.xio_pages[idx];
                    bp->b_xio.xio_pages[idx] = NULL;

                    /*
                     * The page must be busied by us before it can be
                     * safely unwired.
                     */
                    vm_page_busy_wait(pg, FALSE, "vmiopg");

                    /*
                     * Propagate the VFS's statement that this is not a
                     * meta-data buffer, even when a block device backs it.
                     */
                    if (bp->b_flags & B_NOTMETA)
                              vm_page_flag_set(pg, PG_NOTMETA);

                    /*
                     * Very important: page activity is tracked whether or
                     * not the page is wired into the buffer cache, and
                     * while it is wired the bp carries the act_count.
                     *
                     * The second argument selects the inactive (0) or
                     * active (1) queue for the unwired page.  Putting too
                     * many single-use pages (tar-up, file scan) on the
                     * active queue would cycle the act_count of pages we
                     * actually want to keep.
                     */
                    vm_page_unwire(pg,
                                   (bp->b_act_count < vm_cycle_point) ? 0 : 1);

                    /*
                     * Still wired by someone else: nothing more to decide,
                     * just unbusy it.
                     */
                    if (pg->wire_count != 0) {
                              vm_page_wakeup(pg);
                              continue;
                    }

                    /*
                     * wire_count reached 0, so further action may be
                     * needed before (or instead of) a plain unbusy.
                     *
                     * WARNING: vm_page_try_*() also checks PG_NEED_COMMIT
                     *          for us.
                     */
                    if (bp->b_flags & B_DIRECT) {
                              /*
                               * B_DIRECT: the caller does not want the
                               * page cached.  Unbusy it and then attempt
                               * to free it.
                               */
                              vm_page_wakeup(pg);
                              vm_page_try_to_free(pg);
                              continue;
                    }

                    if ((bp->b_flags & (B_NOTMETA | B_TTC)) ||
                        vm_paging_min()) {
                              /*
                               * Try to push the page to PQ_CACHE.
                               *
                               * B_NOTMETA is set by HAMMER to get rid of
                               * one of the two pages that exist when double
                               * buffering is enabled.
                               *
                               * Under a severe page deficit this makes
                               * buffer cache operations tied to pageouts
                               * recycle their pages, avoiding a low memory
                               * deadlock.
                               *
                               * NOTE(review): there is no explicit wakeup
                               * on this path; vm_page_try_to_cache()
                               * presumably unbusies the page - confirm.
                               */
                              pg->act_count = bp->b_act_count;
                              vm_page_try_to_cache(pg);
                    } else {
                              /*
                               * Nominal case: the page stays on whichever
                               * queue (active or inactive) the unwire
                               * above put it on.
                               */
                              pg->act_count = bp->b_act_count;
                              vm_page_wakeup(pg);
                    }
          }

          /*
           * Clear the pmap pte's for the mapping without invalidating the
           * TLB.  The range gets properly invalidated when new pages are
           * entered into the mapping.
           *
           * This notably cuts tmpfs tear-down overhead and buffer cache
           * re-use overhead (one invalidation sequence per re-use rather
           * than two).
           */
          pmap_qremove_noinval(trunc_page((vm_offset_t) bp->b_data),
                                   bp->b_xio.xio_npages);
          CPUMASK_ASSZERO(bp->b_cpumask);

          /*
           * Return the buffer's space to the global accounting and let
           * anyone waiting on buffer space know.
           */
          if (bp->b_bufsize) {
                    atomic_add_long(&bufspace, -bp->b_bufsize);
                    bp->b_bufsize = 0;
                    bufspacewakeup();
          }

          bp->b_xio.xio_npages = 0;
          bp->b_flags &= ~(B_VMIO | B_TTC);
          KKASSERT (LIST_FIRST(&bp->b_dep) == NULL);
          if (bp->b_vp)
                    brelvp(bp);
}

/*
 * Find and initialize a new buffer header, freeing up existing buffers
 * in the bufqueues as necessary.  The new buffer is returned locked.
 *
 * Important:  B_INVAL is not set.  If the caller wishes to throw the
 * buffer away, the caller must set B_INVAL prior to calling brelse().
 *
 * We block if:
 *        We have insufficient buffer headers
 *        We have insufficient buffer space
 *
 * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 * Instead we ask the buf daemon to do it for us.  We attempt to
 * avoid piecemeal wakeups of the pageout daemon.
 *
 * Arguments:
 *        blkflags      - GETBLK_* flags.  Only two are examined here:
 *                        GETBLK_PCATCH adds PCATCH to the blocking tsleep()
 *                        and GETBLK_BHEAVY marks the returned buffer B_HEAVY.
 *        slptimeo      - timeout passed to the blocking tsleep().
 *        size, maxsize - not referenced anywhere in this function's body.
 *
 * Returns:
 *        A locked buffer that is off all queues, has no vnode association,
 *        has b_flags reset to B_BNOCLIP (plus B_HEAVY if requested) and
 *        b_data pointing at b_kvabase, or NULL.
 *
 *        NULL is returned when no buffer could be found on any cpu and we
 *        had to sleep: both when tsleep() returns non-zero and when the
 *        wait completes normally (bp is still NULL at the final return).
 *        NOTE(review): callers are presumably expected to retry on NULL -
 *        confirm against the call sites.
 *
 * The scan starts on the calling cpu's queues (EMPTY, then CLEAN) and
 * walks the other cpus' queues before giving up and sleeping.
 */
struct buf *
getnewbuf(int blkflags, int slptimeo, int size, int maxsize)
{
          struct bufpcpu *pcpu;
          struct buf *bp;
          struct buf *nbp;
          int nqindex;
          int nqcpu;
          int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
          int maxloops = 200000;
          int restart_reason = 0;
          struct buf *restart_bp = NULL;
          static char flushingbufs[MAXCPU];
          char *flushingp;

          /*
           * We can't afford to block since we might be holding a vnode lock,
           * which may prevent system daemons from running.  We deal with
           * low-memory situations by proactively returning memory and running
           * async I/O rather than sync I/O.
           */

          ++getnewbufcalls;
          nqcpu = mycpu->gd_cpuid;
          /*
           * flushingp is the hysteresis flag for the cpu we started on.
           * It is set once here (before the restart label), so it keeps
           * referring to that slot even after nqcpu advances to other cpus.
           */
          flushingp = &flushingbufs[nqcpu];
restart:
          /*
           * Leave flushing mode once bufspace has dropped below the low
           * water mark.
           */
          if (bufspace < lobufspace)
                    *flushingp = 0;

          /*
           * Runaway-loop guard.  maxloops is only decremented (and the
           * panic only possible) when debug_bufbio is non-zero.
           * restart_reason/restart_bp record why we last restarted and
           * exist purely for this panic message.
           */
          if (debug_bufbio && --maxloops == 0)
                    panic("getnewbuf, excessive loops on cpu %d restart %d (%p)",
                              mycpu->gd_cpuid, restart_reason, restart_bp);

          /*
           * Setup for scan.  If we do not have enough free buffers,
           * we setup a degenerate case that immediately fails.  Note
           * that if we are specially marked process, we are allowed to
           * dip into our reserves.
           *
           * The scanning sequence is nominally:  EMPTY->CLEAN
           */
          pcpu = &bufpcpu[nqcpu];
          spin_lock(&pcpu->spin);

          /*
           * Prime the scan for this cpu.  Locate the first buffer to
           * check.  If we are flushing buffers we must skip the
           * EMPTY queue.
           */
          nqindex = BQUEUE_EMPTY;
          nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_EMPTY]);
          if (nbp == NULL || *flushingp) {
                    nqindex = BQUEUE_CLEAN;
                    nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_CLEAN]);
          }

          /*
           * Run scan, possibly freeing data and/or kva mappings on the fly,
           * depending.
           *
           * WARNING! spin is held!
           */
          while ((bp = nbp) != NULL) {
                    int qindex = nqindex;

                    nbp = TAILQ_NEXT(bp, b_freelist);

                    /*
                     * BQUEUE_CLEAN - B_AGE special case.  If not set the bp
                     * cycles through the queue twice before being selected.
                     *
                     * The requeue is skipped when bp is the last entry
                     * (nbp == NULL), in which case it is considered for
                     * reuse right away.
                     */
                    if (qindex == BQUEUE_CLEAN &&
                        (bp->b_flags & B_AGE) == 0 && nbp) {
                              bp->b_flags |= B_AGE;
                              TAILQ_REMOVE(&pcpu->bufqueues[qindex],
                                             bp, b_freelist);
                              TAILQ_INSERT_TAIL(&pcpu->bufqueues[qindex],
                                                    bp, b_freelist);
                              continue;
                    }

                    /*
                     * Calculate next bp ( we can only use it if we do not block
                     * or do other fancy things ).
                     *
                     * When the EMPTY queue is exhausted the scan continues
                     * with the head of the CLEAN queue.
                     */
                    if (nbp == NULL) {
                              switch(qindex) {
                              case BQUEUE_EMPTY:
                                        nqindex = BQUEUE_CLEAN;
                                        if ((nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_CLEAN])))
                                                  break;
                                        /* fall through */
                              case BQUEUE_CLEAN:
                                        /*
                                         * nbp is NULL.
                                         */
                                        break;
                              }
                    }

                    /*
                     * Sanity Checks
                     */
                    KASSERT(bp->b_qindex == qindex,
                              ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));

                    /*
                     * Note: we no longer distinguish between VMIO and non-VMIO
                     * buffers.
                     */
                    KASSERT((bp->b_flags & B_DELWRI) == 0,
                              ("delwri buffer %p found in queue %d", bp, qindex));

                    /*
                     * Do not try to reuse a buffer with a non-zero b_refs.
                     * This is an unsynchronized test.  A synchronized test
                     * is also performed after we lock the buffer.
                     */
                    if (bp->b_refs)
                              continue;

                    /*
                     * Start freeing the bp.  This is somewhat involved.  nbp
                     * remains valid only for BQUEUE_EMPTY bp's.  Buffers
                     * on the clean list must be disassociated from their
                     * current vnode.  Buffers on the empty lists have
                     * already been disassociated.
                     *
                     * b_refs is checked after locking along with queue changes.
                     * We must check here to deal with zero->nonzero transitions
                     * made by the owner of the buffer lock, which is used by
                     * VFS's to hold the buffer while issuing an unlocked
                     * uiomove()s.  We cannot invalidate the buffer's pages
                     * for this case.  Once we successfully lock a buffer the
                     * only 0->1 transitions of b_refs will occur via findblk().
                     *
                     * We must also check for queue changes after successful
                     * locking as the current lock holder may dispose of the
                     * buffer and change its queue.
                     */
                    if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
                              /*
                               * Someone else holds the buffer lock.  Drop
                               * the spinlock, nap briefly, and rescan.
                               */
                              spin_unlock(&pcpu->spin);
                              tsleep(&bd_request, 0, "gnbxxx", (hz + 99) / 100);
                              restart_reason = 1;
                              restart_bp = bp;
                              goto restart;
                    }
                    if (bp->b_qindex != qindex || bp->b_refs) {
                              /*
                               * The buffer changed queues or gained a
                               * reference before we got the lock.
                               */
                              spin_unlock(&pcpu->spin);
                              BUF_UNLOCK(bp);
                              restart_reason = 2;
                              restart_bp = bp;
                              goto restart;
                    }
                    bremfree_locked(bp);
                    spin_unlock(&pcpu->spin);

                    /*
                     * Dependencies must be handled before we disassociate the
                     * vnode.
                     *
                     * NOTE: HAMMER will set B_LOCKED if the buffer cannot
                     * be immediately disassociated.  HAMMER then becomes
                     * responsible for releasing the buffer.
                     *
                     * NOTE: spin is UNLOCKED now.
                     */
                    if (LIST_FIRST(&bp->b_dep) != NULL) {
                              buf_deallocate(bp);
                              if (bp->b_flags & B_LOCKED) {
                                        bqrelse(bp);
                                        restart_reason = 3;
                                        restart_bp = bp;
                                        goto restart;
                              }
                              KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
                    }

                    /*
                     * CLEAN buffers have content or associations that must be
                     * cleaned out if not repurposing.
                     */
                    if (qindex == BQUEUE_CLEAN) {
                              if (bp->b_flags & B_VMIO)
                                        vfs_vmio_release(bp);
                              if (bp->b_vp)
                                        brelvp(bp);
                    }

                    /*
                     * NOTE:  nbp is now entirely invalid.  We can only restart
                     * the scan from this point on.
                     *
                     * Get the rest of the buffer freed up.  b_kva* is still
                     * valid after this operation.
                     */
                    KASSERT(bp->b_vp == NULL,
                              ("bp3 %p flags %08x vnode %p qindex %d "
                               "unexpectededly still associated!",
                               bp, bp->b_flags, bp->b_vp, qindex));
                    KKASSERT((bp->b_flags & B_HASHED) == 0);

                    if (bp->b_bufsize)
                              allocbuf(bp, 0);

                    /*
                     * Defensive check: none of these vnode-queue flags
                     * should remain at this point.  If one does, report
                     * it and call brelvp() again.
                     */
                if (bp->b_flags & (B_VNDIRTY | B_VNCLEAN | B_HASHED)) {
                              kprintf("getnewbuf: caught bug vp queue "
                                        "%p/%08x qidx %d\n",
                                        bp, bp->b_flags, qindex);
                              brelvp(bp);
                    }

                    /*
                     * Reset the header to a pristine state.  Note that
                     * b_flags is overwritten, not masked.
                     */
                    bp->b_flags = B_BNOCLIP;
                    bp->b_cmd = BUF_CMD_DONE;
                    bp->b_vp = NULL;
                    bp->b_error = 0;
                    bp->b_resid = 0;
                    bp->b_bcount = 0;
                    bp->b_xio.xio_npages = 0;
                    bp->b_dirtyoff = bp->b_dirtyend = 0;
                    bp->b_act_count = ACT_INIT;
                    reinitbufbio(bp);
                    KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
                    buf_dep_init(bp);
                    if (blkflags & GETBLK_BHEAVY)
                              bp->b_flags |= B_HEAVY;

                    /*
                     * Flushing hysteresis: enter flushing mode at
                     * hibufspace and leave it below lobufspace.  While
                     * flushing, the buffer we just cleaned is invalidated
                     * and released rather than handed out, and the scan
                     * restarts.
                     */
                    if (bufspace >= hibufspace)
                              *flushingp = 1;
                    if (bufspace < lobufspace)
                              *flushingp = 0;
                    if (*flushingp) {
                              bp->b_flags |= B_INVAL;
                              brelse(bp);
                              restart_reason = 5;
                              restart_bp = bp;
                              goto restart;
                    }

                    /*
                     * b_refs can transition to a non-zero value while we hold
                     * the buffer locked due to a findblk().  Our brelvp() above
                     * interlocked any future possible transitions due to
                     * findblk()s.
                     *
                     * If we find b_refs to be non-zero we can destroy the
                     * buffer's contents but we cannot yet reuse the buffer.
                     */
                    if (bp->b_refs) {
                              bp->b_flags |= B_INVAL;
                              brelse(bp);
                              restart_reason = 6;
                              restart_bp = bp;

                              goto restart;
                    }

                    /*
                     * We found our buffer!
                     */
                    break;
          }

          /*
           * If we exhausted our list, iterate other cpus.  If that fails,
           * sleep as appropriate.  We may have to wakeup various daemons
           * and write out some dirty buffers.
           *
           * Generally we are sleeping due to insufficient buffer space.
           *
           * NOTE: spin is held if bp is NULL, else it is not held.
           */
          if (bp == NULL) {
                    int flags;
                    char *waitmsg;

                    spin_unlock(&pcpu->spin);

                    /*
                     * Advance to the next cpu's queues.  Once we wrap back
                     * around to the current cpu every cpu has been tried.
                     */
                    nqcpu = (nqcpu + 1) % ncpus;
                    if (nqcpu != mycpu->gd_cpuid) {
                              restart_reason = 7;
                              restart_bp = bp;
                              goto restart;
                    }

                    if (bufspace >= hibufspace) {
                              waitmsg = "bufspc";
                              flags = VFS_BIO_NEED_BUFSPACE;
                    } else {
                              waitmsg = "newbuf";
                              flags = VFS_BIO_NEED_ANY;
                    }

                    bd_speedup();       /* heeeelp */
                    atomic_set_int(&needsbuffer, flags);
                    /*
                     * Wait until our need bits have been cleared from
                     * needsbuffer.  The interlock is armed before the bits
                     * are re-read so a wakeup between the test and the
                     * tsleep() is not lost.
                     */
                    while (needsbuffer & flags) {
                              int value;

                              tsleep_interlock(&needsbuffer, 0);
                              value = atomic_fetchadd_int(&needsbuffer, 0);
                              if (value & flags) {
                                        if (tsleep(&needsbuffer, PINTERLOCKED|slpflags,
                                                     waitmsg, slptimeo)) {
                                                  /*
                                                   * tsleep() returned
                                                   * non-zero; give up.
                                                   */
                                                  return (NULL);
                                        }
                              }
                    }
                    /*
                     * bp is still NULL here and is returned as such by the
                     * final return below; no rescan is done in this call.
                     */
          } else {
                    /*
                     * We finally have a valid bp.  Reset b_data.
                     *
                     * (spin is not held)
                     */
                    bp->b_data = bp->b_kvabase;
          }
          return(bp);
}

/*
 * buf_daemon:
 *
 *        Buffer flushing daemon.  Buffers are normally flushed by the
 *        update daemon but if it cannot keep up this process starts to
 *        take the load in an attempt to prevent getnewbuf() from blocking.
 *
 *        Once a flush is initiated it does not stop until the number
 *        of buffers falls below lodirtybuffers, but we will wake up anyone
 *        waiting at the mid-point.
 */
static struct kproc_desc buf_kp = {
          "bufdaemon",        /* name string for the daemon */
          buf_daemon,         /* entry point, see buf_daemon() */
          &bufdaemon_td       /* thread pointer; buf_daemon() passes it on as 'td' */
};
/*
 * Arrange for kproc_start() to be called with &buf_kp at the
 * SI_SUB_KTHREAD_BUF startup stage.
 * NOTE(review): presumably this is what creates the thread - confirm
 * against kproc_start().
 */
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
          kproc_start, &buf_kp);

/*
 * Descriptor for the second flushing daemon, which services the
 * heavy-weight dirty queue (see buf_daemon_hw()).
 */
static struct kproc_desc bufhw_kp = {
          "bufdaemon_hw",     /* name string for the daemon */
          buf_daemon_hw,      /* entry point, see buf_daemon_hw() */
          &bufdaemonhw_td     /* thread pointer; buf_daemon_hw() passes it on as 'td' */
};
/*
 * Arrange for kproc_start() to be called with &bufhw_kp at the
 * SI_SUB_KTHREAD_BUF startup stage.
 * NOTE(review): presumably this is what creates the thread - confirm
 * against kproc_start().
 */
SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
          kproc_start, &bufhw_kp);

/*
 * Common body of the buffer flushing daemons.  Never returns.
 *
 *        td           - thread registered with the shutdown_pre_sync
 *                       event handler.
 *        queue        - dirty queue index handed to flushbufqueues().
 *        buf_limit_fn - predicate; flushing continues while it returns
 *                       non-zero for the computed limit.
 *        bd_req       - request word; swapped to 0 each pass and slept on
 *                       when it was already 0.
 */
static void
buf_daemon1(struct thread *td, int queue, int (*buf_limit_fn)(long),
              int *bd_req)
{
          struct buf *marker;
          long limit;

          /*
           * Scan marker passed to flushbufqueues().  It starts out off
           * any queue and positioned at cpu 0.  The marker is never freed
           * because the loop below never terminates.
           */
          marker = kmalloc(sizeof(*marker), M_BIOBUF, M_WAITOK | M_ZERO);
          marker->b_qcpu = 0;
          marker->b_qindex = BQUEUE_NONE;
          marker->b_flags |= B_MARKER;

          /*
           * This process needs to be suspended prior to shutdown sync.
           */
          EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
                                    td, SHUTDOWN_PRI_LAST);
          curthread->td_flags |= TDF_SYSTHREAD;

          /*
           * This process is allowed to take the buffer cache to the limit
           */
          for (;;) {
                    kproc_suspend_loop();

                    /*
                     * Keep flushing for as long as the limit predicate
                     * says dirty (including running) buffers exceed our
                     * low water mark and flushbufqueues() keeps making
                     * progress.
                     *
                     * In-transit I/O is throttled at hirunningspace so we
                     * do not completely saturate the I/O system; waiting
                     * processes get woken before we normally would so they
                     * can run in parallel with our drain.
                     *
                     * The aggregate normal+HW low water mark is
                     * lodirtybufspace.  The work is split across two
                     * threads, so each one uses half of it.
                     */
                    waitrunningbufspace();
                    limit = lodirtybufspace / 2;
                    while (buf_limit_fn(limit) &&
                           flushbufqueues(marker, queue) != 0) {
                              if (runningbufspace >= hirunningspace)
                                        waitrunningbufspace();
                    }

                    /*
                     * Low water mark reached (or nothing left to flush).
                     * Reset the request word and sleep until needed
                     * again.  The sleep is just so the suspend code works.
                     */
                    tsleep_interlock(bd_req, 0);
                    if (atomic_swap_int(bd_req, 0) == 0)
                              tsleep(bd_req, PINTERLOCKED, "psleep", hz);
          }
          /* NOT REACHED */
}

/*
 * Flush predicate for the normal buffer daemon.  Returns 1 while flushing
 * should continue, 0 otherwise.
 */
static int
buf_daemon_limit(long limit)
{
          /* Running plus dirty space is above the caller's limit. */
          if (runningbufspace + dirtykvaspace > limit)
                    return (1);

          /* Non-heavy dirty buffers make up at least half of all buffers. */
          if (dirtybufcount - dirtybufcounthw >= nbuf / 2)
                    return (1);

          return (0);
}

/*
 * Flush predicate for the heavy-weight buffer daemon.  Returns 1 while
 * flushing should continue, 0 otherwise.
 */
static int
buf_daemon_hw_limit(long limit)
{
          /* Running plus dirty space is above the caller's limit. */
          if (runningbufspace + dirtykvaspace > limit)
                    return (1);

          /* Heavy-weight dirty buffers make up at least half of all buffers. */
          if (dirtybufcounthw >= nbuf / 2)
                    return (1);

          return (0);
}

/*
 * Entry point named in buf_kp.  Runs the shared daemon loop against
 * BQUEUE_DIRTY, using buf_daemon_limit() as the flush predicate and
 * bd_request as the request word.
 */
static void
buf_daemon(void)
{
          buf_daemon1(bufdaemon_td, BQUEUE_DIRTY, buf_daemon_limit,
                        &bd_request);
}

/*
 * Entry point named in bufhw_kp.  Runs the shared daemon loop against
 * BQUEUE_DIRTY_HW, using buf_daemon_hw_limit() as the flush predicate and
 * bd_request_hw as the request word.
 */
static void
buf_daemon_hw(void)
{
          buf_daemon1(bufdaemonhw_td, BQUEUE_DIRTY_HW, buf_daemon_hw_limit,
                        &bd_request_hw);
}

/*
 * Flush up to (flushperqueue) buffers in the dirty queue.  Each cpu has a
 * localized version of the queue.  Each call made to this function iterates
 * to another cpu.  It is desirable to flush several buffers from the same
 * cpu's queue at once, as these are likely going to be linear.
 *
 * We must be careful to free up B_INVAL buffers instead of writing them,
 * which NFS is particularly sensitive to.
 *
 * B_RELBUF may only be set by VFSs.  We do set B_AGE to indicate that we
 * really want to try to get the buffer out and reuse it due to the write
 * load on the machine.
 *
 * We must lock the buffer in order to check its validity before we can mess
 * with its contents.  spin isn't enough.
 */
static int
flushbufqueues(struct buf *marker, bufq_type_t q)
{
          struct bufpcpu *pcpu;
          struct buf *bp;
          /* r counts the flush operations performed; it is the return value */
          int r = 0;
          /* loops bounds the number of flush operations done by one call */
          u_int loops = flushperqueue;
          /* lcpu remembers the starting cpu so a full rotation can be detected */
          int lcpu = marker->b_qcpu;

          /*
           * The marker must be a B_MARKER buffer that is not currently on
           * any queue.
           */
          KKASSERT(marker->b_qindex == BQUEUE_NONE);
          KKASSERT(marker->b_flags & B_MARKER);

again:
          /*
           * Spinlock needed to perform operations on the queue and may be
           * held through a non-blocking BUF_LOCK(), but cannot be held when
           * BUF_UNLOCK()ing or through any other major operation.
           *
           * The marker is inserted at the head of the selected cpu's queue
           * and the scan starts from it.
           */
          pcpu = &bufpcpu[marker->b_qcpu];
          spin_lock(&pcpu->spin);
          marker->b_qindex = q;
          TAILQ_INSERT_HEAD(&pcpu->bufqueues[q], marker, b_freelist);
          bp = marker;

          while ((bp = TAILQ_NEXT(bp, b_freelist)) != NULL) {
                    /*
                     * NOTE: spinlock is always held at the top of the loop
                     *
                     * Skip markers, complain about and skip buffers that are
                     * not B_DELWRI, and skip buffers whose lock cannot be
                     * obtained without blocking.
                     */
                    if (bp->b_flags & B_MARKER)
                              continue;
                    if ((bp->b_flags & B_DELWRI) == 0) {
                              kprintf("Unexpected clean buffer %p\n", bp);
                              continue;
                    }
                    if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
                              continue;
                    KKASSERT(bp->b_qcpu == marker->b_qcpu && bp->b_qindex == q);

                    /*
                     * Once the buffer is locked we will have no choice but to
                     * unlock the spinlock around a later BUF_UNLOCK and re-set
                     * bp = marker when looping.  Move the marker now to make
                     * things easier.
                     */
                    TAILQ_REMOVE(&pcpu->bufqueues[q], marker, b_freelist);
                    TAILQ_INSERT_AFTER(&pcpu->bufqueues[q], bp, marker, b_freelist);

                    /*
                     * Must recheck B_DELWRI after successfully locking
                     * the buffer.  If it is no longer dirty, unlock it
                     * (with the spinlock dropped) and resume from the marker.
                     */
                    if ((bp->b_flags & B_DELWRI) == 0) {
                              spin_unlock(&pcpu->spin);
                              BUF_UNLOCK(bp);
                              spin_lock(&pcpu->spin);
                              bp = marker;
                              continue;
                    }

                    /*
                     * Remove the buffer from its queue.  We still own the
                     * spinlock here.
                     */
                    _bremfree(bp);

                    /*
                     * Disposing of an invalid buffer counts as a flush op
                     */
                    if (bp->b_flags & B_INVAL) {
                              spin_unlock(&pcpu->spin);
                              brelse(bp);
                              goto doloop;
                    }

                    /*
                     * Release the spinlock for the more complex ops we
                     * are now going to do.
                     */
                    spin_unlock(&pcpu->spin);
                    lwkt_yield();

                    /*
                     * This is a bit messy
                     *
                     * A buffer with dependencies that has not yet been
                     * deferred, and for which buf_countdeps() returns
                     * non-zero, is put back on the tail of the queue and
                     * flagged B_DEFERRED instead of being written.  This
                     * path does not count as a flush op (r and loops are
                     * not touched).
                     */
                    if (LIST_FIRST(&bp->b_dep) != NULL &&
                        (bp->b_flags & B_DEFERRED) == 0 &&
                        buf_countdeps(bp, 0)) {
                              spin_lock(&pcpu->spin);
                              TAILQ_INSERT_TAIL(&pcpu->bufqueues[q], bp, b_freelist);
                              bp->b_qindex = q;
                              bp->b_flags |= B_DEFERRED;
                              spin_unlock(&pcpu->spin);
                              BUF_UNLOCK(bp);
                              spin_lock(&pcpu->spin);
                              bp = marker;
                              continue;
                    }

                    /*
                     * spinlock not held here.
                     *
                     * If the buffer has a dependency, buf_checkwrite() must
                     * also return 0 for us to be able to initiate the write.
                     *
                     * If the buffer is flagged B_ERROR it may be requeued
                     * over and over again, we try to avoid a live lock.
                     */
                    if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {
                              brelse(bp);
                    } else if (bp->b_flags & B_ERROR) {
                              tsleep(bp, 0, "bioer", 1);
                              bp->b_flags &= ~B_AGE;
                              cluster_awrite(bp);
                    } else {
                              bp->b_flags |= B_AGE | B_KVABIO;
                              cluster_awrite(bp);
                    }
                    /* bp invalid but needs to be NULL-tested if we break out */
doloop:
                    /*
                     * A flush op was performed.  Reacquire the spinlock,
                     * account for the op, and either stop (budget used up)
                     * or resume the scan from the marker.
                     */
                    spin_lock(&pcpu->spin);
                    ++r;
                    if (--loops == 0)
                              break;
                    bp = marker;
          }
          /* bp is invalid here but can be NULL-tested to advance */

          /*
           * Take the marker back off the queue before dropping the spinlock.
           */
          TAILQ_REMOVE(&pcpu->bufqueues[q], marker, b_freelist);
          marker->b_qindex = BQUEUE_NONE;
          spin_unlock(&pcpu->spin);

          /*
           * Advance the marker to be fair.
           *
           * bp == NULL means the scan ran off the end of this cpu's queue
           * (as opposed to exhausting the loops budget), in which case we
           * continue with the next cpu's queue unless we have wrapped back
           * around to the cpu we started on.
           */
          marker->b_qcpu = (marker->b_qcpu + 1) % ncpus;
          if (bp == NULL) {
                    if (marker->b_qcpu != lcpu)
                              goto again;
          }

          return (r);
}

/*
 * inmem:
 *
 *        Returns true if no I/O is needed to access the associated VM object.
 *        This is like findblk except it also hunts around in the VM system for
 *        the data.
 *
 *        Note that we ignore vm_page_free() races from interrupts against our
 *        lookup, since if the caller is not protected our return value will not
 *        be any more valid than otherwise once we exit the critical section.
 */
int
inmem(struct vnode *vp, off_t loffset)
{
          vm_object_t object;
          vm_offset_t off, step, chunk, pgoff;
          vm_page_t pg;
          int cached;

          /* A buffer already hashed at (vp,loffset) settles the question. */
          if (findblk(vp, loffset, FINDBLK_TEST) != NULL)
                    return 1;

          /* Without a mount or a backing VM object there is nothing to scan. */
          if (vp->v_mount == NULL)
                    return 0;
          object = vp->v_object;
          if (object == NULL)
                    return 0;

          /* Validity is tested in pieces of at most min(PAGE_SIZE, f_iosize). */
          chunk = PAGE_SIZE;
          if (chunk > vp->v_mount->mnt_stat.f_iosize)
                    chunk = vp->v_mount->mnt_stat.f_iosize;

          cached = 1;
          off = 0;
          vm_object_hold(object);
          while (off < vp->v_mount->mnt_stat.f_iosize) {
                    /* A missing page means I/O would be required. */
                    pg = vm_page_lookup(object, OFF_TO_IDX(loffset + off));
                    if (pg == NULL) {
                              cached = 0;
                              break;
                    }

                    /* Never let a single test cross a page boundary. */
                    pgoff = (off + loffset) & PAGE_MASK;
                    step = chunk;
                    if (step > PAGE_SIZE - pgoff)
                              step = PAGE_SIZE - pgoff;

                    if (vm_page_is_valid(pg, pgoff, step) == 0) {
                              cached = 0;
                              break;
                    }
                    off += step;
          }
          vm_object_drop(object);
          return (cached);
}

/*
 * findblk:
 *
 *        Locate and return the specified buffer.  Unless flagged otherwise,
 *        a locked buffer will be returned if it exists or NULL if it does not.
 *
 *        findblk()'d buffers are still on the bufqueues and if you intend
 *        to use your (locked NON-TEST) buffer you need to bremfree(bp)
 *        and possibly do other stuff to it.
 *
 *        FINDBLK_TEST        - Do not lock the buffer.  The caller is responsible
 *                              for locking the buffer and ensuring that it remains
 *                              the desired buffer after locking.
 *
 *        FINDBLK_NBLOCK      - Lock the buffer non-blocking.  If we are unable
 *                              to acquire the lock we return NULL, even if the
 *                              buffer exists.
 *
 *        FINDBLK_REF         - Returns the buffer ref'd, which prevents normal
 *                              reuse by getnewbuf() but does not prevent
 *                              disassociation (B_INVAL).  Used to avoid deadlocks
 *                              against random (vp,loffset)s due to reassignment.
 *
 *        FINDBLK_KVABIO      - Only applicable when returning a locked buffer.
 *                              Indicates that the caller supports B_KVABIO.
 *
 *        (0)                 - Lock the buffer blocking.
 */
struct buf *
findblk(struct vnode *vp, off_t loffset, int flags)
{
          struct buf *bp;
          int lkflags = (flags & FINDBLK_NBLOCK) ?
                            (LK_EXCLUSIVE | LK_NOWAIT) : LK_EXCLUSIVE;

          for (;;) {
                    /*
                     * Look the buffer up under the shared vnode token and
                     * take a hold on it before the token is released.  The
                     * hold prevents reuse of the buffer but does not prevent
                     * it from being disassociated from (vp,loffset).
                     */
                    lwkt_gettoken_shared(&vp->v_token);
                    bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset);
                    if (bp != NULL)
                              bqhold(bp);
                    lwkt_reltoken(&vp->v_token);
                    if (bp == NULL)
                              return (NULL);

                    /*
                     * Test-only callers get the buffer back unlocked.
                     */
                    if (flags & FINDBLK_TEST)
                              break;

                    /*
                     * Lock the buffer.  A failure drops our reference and
                     * reports "not found" (only FINDBLK_NBLOCK can cause the
                     * lock to fail).
                     */
                    if (BUF_LOCK(bp, lkflags) != 0) {
                              atomic_subtract_int(&bp->b_refs, 1);
                              return (NULL);
                    }

                    /*
                     * The identity of the buffer may have changed while we
                     * waited for the lock.  If it no longer matches, let go
                     * of it and redo the lookup.
                     */
                    if (bp->b_vp != vp || bp->b_loffset != loffset) {
                              atomic_subtract_int(&bp->b_refs, 1);
                              BUF_UNLOCK(bp);
                              continue;
                    }

                    /*
                     * B_KVABIO is only set/cleared when locking.  Callers
                     * that do not support it need the buffer synchronized
                     * to all cpus.
                     */
                    if (flags & FINDBLK_KVABIO)
                              bp->b_flags |= B_KVABIO;
                    else
                              bkvasync_all(bp);
                    break;
          }

          /*
           * Found.  The reference is kept only for FINDBLK_REF callers.
           */
          if (!(flags & FINDBLK_REF))
                    atomic_subtract_int(&bp->b_refs, 1);
          return (bp);
}

/*
 * getcacheblk:
 *
 *        Similar to getblk() except only returns the buffer if it is
 *        B_CACHE and requires no other manipulation.  Otherwise NULL
 *        is returned.  NULL is also returned if GETBLK_NOWAIT is set
 *        and the getblk() would block.
 *
 *        If B_RAM is set the buffer might be just fine, but we return
 *        NULL anyway because we want the code to fall through to the
 *        cluster read to issue more read-aheads.  Otherwise read-ahead breaks.
 *
 *        If blksize is 0 the buffer cache buffer must already be fully
 *        cached.
 *
 *        If blksize is non-zero getblk() will be used, allowing a buffer
 *        to be reinstantiated from its VM backing store.  The buffer must
 *        still be fully cached after reinstantiation to be returned.
 */
struct buf *
getcacheblk(struct vnode *vp, off_t loffset, int blksize, int blkflags)
{
          struct buf *bp;
          int fndflags;

          if (blksize != 0) {
                    /*
                     * Sized request: go through getblk() and keep the
                     * result only if it is valid, fully cached and not
                     * flagged B_RAM.
                     */
                    bp = getblk(vp, loffset, blksize, blkflags, 0);
                    if (bp == NULL)
                              return (NULL);
                    if ((bp->b_flags & (B_INVAL | B_CACHE)) != B_CACHE) {
                              brelse(bp);
                              return (NULL);
                    }
                    bp->b_flags &= ~B_AGE;
                    if (bp->b_flags & B_RAM) {
                              bqrelse(bp);
                              return (NULL);
                    }
                    return (bp);
          }

          /*
           * Unsized request: only an existing buffer will do.  Translate
           * the getblk flags we understand into their findblk equivalents.
           */
          fndflags = 0;
          if (blkflags & GETBLK_NOWAIT)
                    fndflags |= FINDBLK_NBLOCK;
          if (blkflags & GETBLK_KVABIO)
                    fndflags |= FINDBLK_KVABIO;

          bp = findblk(vp, loffset, fndflags);
          if (bp == NULL)
                    return (NULL);

          /*
           * findblk() handed us a locked buffer that is still on its queue.
           * Either claim it or unlock it and report a miss.
           */
          if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) != B_CACHE) {
                    BUF_UNLOCK(bp);
                    return (NULL);
          }
          bp->b_flags &= ~B_AGE;
          bremfree(bp);
          return (bp);
}

/*
 * getblk:
 *
 *        Get a block given a specified block and offset into a file/device.
 *        B_INVAL may or may not be set on return.  The caller should clear
 *        B_INVAL prior to initiating a READ.
 *
 *        IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE
 *        IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ,
 *        OR SET B_INVAL BEFORE RETIRING IT.  If you retire a getblk'd buffer
 *        without doing any of those things the system will likely believe
 *        the buffer to be valid (especially if it is not B_VMIO), and the
 *        next getblk() will return the buffer with B_CACHE set.
 *
 *        For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 *        an existing buffer.
 *
 *        For a VMIO buffer, B_CACHE is modified according to the backing VM.
 *        If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 *        and then cleared based on the backing VM.  If the previous buffer is
 *        non-0-sized but invalid, B_CACHE will be cleared.
 *
 *        If getblk() must create a new buffer, the new buffer is returned with
 *        both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 *        case it is returned with B_INVAL clear and B_CACHE set based on the
 *        backing VM.
 *
 *        getblk() also forces a bwrite() for any B_DELWRI buffer whose
 *        B_CACHE bit is clear.
 *
 *        What this means, basically, is that the caller should use B_CACHE to
 *        determine whether the buffer is fully valid or not and should clear
 *        B_INVAL prior to issuing a read.  If the caller intends to validate
 *        the buffer by loading its data area with something, the caller needs
 *        to clear B_INVAL.  If the caller does this without issuing an I/O,
 *        the caller should set B_CACHE ( as an optimization ), else the caller
 *        should issue the I/O and biodone() will set B_CACHE if the I/O was
 *        a write attempt or if it was a successful read.  If the caller
 *        intends to issue a READ, the caller must clear B_INVAL and B_ERROR
 *        prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 *
 *        getblk flags:
 *
 *        GETBLK_PCATCH - catch signal if blocked, can cause NULL return
 *        GETBLK_BHEAVY - heavy-weight buffer cache buffer
 */
struct buf *
getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo)
{
          struct buf *bp;
          int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
          int error;
          int lkflags;

          /*
           * Besides the flags listed in the header comment, this function
           * also acts on:
           *
           * GETBLK_NOWAIT  - return NULL instead of sleeping when an
           *                  existing buffer cannot be locked immediately.
           * GETBLK_SZMATCH - return NULL if an existing buffer's b_bcount
           *                  differs from (size).
           * GETBLK_KVABIO  - caller supports B_KVABIO; when not given the
           *                  buffer is passed to bkvasync_all() before
           *                  being returned.
           *
           * slptimeo is handed to BUF_TIMELOCK() and getnewbuf().
           */
          if (size > MAXBSIZE)
                    panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
          if (vp->v_object == NULL)
                    panic("getblk: vnode %p has no object!", vp);

          /*
           * NOTE: findblk does not try to resolve KVABIO in REF-only mode.
           *         we still have to handle that ourselves.
           */
loop:
          if ((bp = findblk(vp, loffset, FINDBLK_REF | FINDBLK_TEST)) != NULL) {
                    /*
                     * The buffer was found in the cache, but we need to lock it.
                     * We must acquire a ref on the bp to prevent reuse, but
                     * this will not prevent disassociation (brelvp()) so we
                     * must recheck (vp,loffset) after acquiring the lock.
                     *
                     * Without the ref the buffer could potentially be reused
                     * before we acquire the lock and create a deadlock
                     * situation between the thread trying to reuse the buffer
                     * and us due to the fact that we would wind up blocking
                     * on a random (vp,loffset).
                     *
                     * A non-blocking lock attempt is made first.  If that
                     * fails we either give up (GETBLK_NOWAIT) or sleep on
                     * the lock.  ENOLCK from the sleeping attempt restarts
                     * the lookup, any other error returns NULL.  The ref is
                     * dropped on every path.
                     */
                    if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                              if (blkflags & GETBLK_NOWAIT) {
                                        bqdrop(bp);
                                        return(NULL);
                              }
                              lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;
                              if (blkflags & GETBLK_PCATCH)
                                        lkflags |= LK_PCATCH;
                              error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo);
                              if (error) {
                                        bqdrop(bp);
                                        if (error == ENOLCK)
                                                  goto loop;
                                        return (NULL);
                              }
                              /* buffer may have changed on us */
                    }
                    bqdrop(bp);

                    /*
                     * Once the buffer has been locked, make sure we didn't race
                     * a buffer recyclement.  Buffers that are no longer hashed
                     * will have b_vp == NULL, so this takes care of that check
                     * as well.
                     */
                    if (bp->b_vp != vp || bp->b_loffset != loffset) {
#if 0
                              kprintf("Warning buffer %p (vp %p loffset %lld) "
                                        "was recycled\n",
                                        bp, vp, (long long)loffset);
#endif
                              BUF_UNLOCK(bp);
                              goto loop;
                    }

                    /*
                     * If SZMATCH any pre-existing buffer must be of the requested
                     * size or NULL is returned.  The caller absolutely does not
                     * want getblk() to bwrite() the buffer on a size mismatch.
                     */
                    if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) {
                              BUF_UNLOCK(bp);
                              return(NULL);
                    }

                    /*
                     * All vnode-based buffers must be backed by a VM object.
                     *
                     * Set B_KVABIO for any incidental work, we will fix it
                     * up later.
                     */
                    KKASSERT(bp->b_flags & B_VMIO);
                    KKASSERT(bp->b_cmd == BUF_CMD_DONE);
                    bp->b_flags &= ~B_AGE;
                    bp->b_flags |= B_KVABIO;

                    /*
                     * Make sure that B_INVAL buffers do not have a cached
                     * block number translation.
                     */
                    if ((bp->b_flags & B_INVAL) &&
                        (bp->b_bio2.bio_offset != NOOFFSET)) {
                              kprintf("Warning invalid buffer %p (vp %p loffset %lld)"
                                        " did not have cleared bio_offset cache\n",
                                        bp, vp, (long long)loffset);
                              clearbiocache(&bp->b_bio2);
                    }

                    /*
                     * The buffer is locked.  B_CACHE is cleared if the buffer is
                     * invalid.
                     *
                     * After the bremfree(), disposals must use b[q]relse().
                     */
                    if (bp->b_flags & B_INVAL)
                              bp->b_flags &= ~B_CACHE;
                    bremfree(bp);

                    /*
                     * Any size inconsistency with a dirty buffer or a buffer
                     * with a softupdates dependency must be resolved.  Resizing
                     * the buffer in such circumstances can lead to problems.
                     *
                     * Dirty or dependent buffers are written synchronously.
                     * Other types of buffers are simply released and
                     * reconstituted as they may be backed by valid, dirty VM
                     * pages (but not marked B_DELWRI).
                     *
                     * In all three cases B_RELBUF is set first and the lookup
                     * is then restarted from the top.
                     *
                     * NFS NOTE: NFS buffers which straddle EOF are oddly-sized
                     * and may be left over from a prior truncation (and thus
                     * no longer represent the actual EOF point), so we
                     * definitely do not want to B_NOCACHE the backing store.
                     */
                    if (size != bp->b_bcount) {
                              if (bp->b_flags & B_DELWRI) {
                                        bp->b_flags |= B_RELBUF;
                                        bwrite(bp);
                              } else if (LIST_FIRST(&bp->b_dep)) {
                                        bp->b_flags |= B_RELBUF;
                                        bwrite(bp);
                              } else {
                                        bp->b_flags |= B_RELBUF;
                                        brelse(bp);
                              }
                              goto loop;
                    }
                    KKASSERT(size <= bp->b_kvasize);
                    KASSERT(bp->b_loffset != NOOFFSET,
                              ("getblk: no buffer offset"));

                    /*
                     * A buffer with B_DELWRI set and B_CACHE clear must
                     * be committed before we can return the buffer in
                     * order to prevent the caller from issuing a read
                     * ( due to B_CACHE not being set ) and overwriting
                     * it.
                     *
                     * Most callers, including NFS and FFS, need this to
                     * operate properly either because they assume they
                     * can issue a read if B_CACHE is not set, or because
                     * ( for example ) an uncached B_DELWRI might loop due
                     * to softupdates re-dirtying the buffer.  In the latter
                     * case, B_CACHE is set after the first write completes,
                     * preventing further loops.
                     *
                     * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
                     * above while extending the buffer, we cannot allow the
                     * buffer to remain with B_CACHE set after the write
                     * completes or it will represent a corrupt state.  To
                     * deal with this we set B_NOCACHE to scrap the buffer
                     * after the write.
                     *
                     * XXX Should this be B_RELBUF instead of B_NOCACHE?
                     *     I'm not even sure this state is still possible
                     *     now that getblk() writes out any dirty buffers
                     *     on size changes.
                     *
                     * We might be able to do something fancy, like setting
                     * B_CACHE in bwrite() except if B_DELWRI is already set,
                     * so the below call doesn't set B_CACHE, but that gets real
                     * confusing.  This is much easier.
                     */
                    if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
                              kprintf("getblk: Warning, bp %p loff=%jx DELWRI set "
                                        "and CACHE clear, b_flags %08x\n",
                                        bp, (uintmax_t)bp->b_loffset, bp->b_flags);
                              bp->b_flags |= B_NOCACHE;
                              bwrite(bp);
                              goto loop;
                    }
          } else {
                    /*
                     * Buffer is not in-core, create new buffer.  The buffer
                     * returned by getnewbuf() is locked.  Note that the returned
                     * buffer is also considered valid (not marked B_INVAL).
                     *
                     * Calculating the offset for the I/O requires figuring out
                     * the block size.  We use DEV_BSIZE for VBLK or VCHR and
                     * the mount's f_iosize otherwise.  If the vnode does not
                     * have an associated mount we assume that the passed size is
                     * the block size.
                     *
                     * Note that vn_isdisk() cannot be used here since it may
                     * return a failure for numerous reasons.   Note that the
                     * buffer size may be larger than the block size (the caller
                     * will use block numbers with the proper multiple).  Beware
                     * of using any v_* fields which are part of unions.  In
                     * particular, in DragonFly the mount point overloading
                     * mechanism uses the namecache only and the underlying
                     * directory vnode is not a special case.
                     */
                    int bsize, maxsize;

                    if (vp->v_type == VBLK || vp->v_type == VCHR)
                              bsize = DEV_BSIZE;
                    else if (vp->v_mount)
                              bsize = vp->v_mount->mnt_stat.f_iosize;
                    else
                              bsize = size;

                    /*
                     * maxsize is the requested size plus the offset of
                     * loffset within its page, but never less than the
                     * block size.
                     */
                    maxsize = size + (loffset & PAGE_MASK);
                    maxsize = imax(maxsize, bsize);

                    /*
                     * If no buffer could be obtained, return NULL when the
                     * caller asked for signal catching (GETBLK_PCATCH) or
                     * supplied a timeout; otherwise retry from the top.
                     */
                    bp = getnewbuf(blkflags, slptimeo, size, maxsize);
                    if (bp == NULL) {
                              if (slpflags || slptimeo)
                                        return NULL;
                              goto loop;
                    }

                    /*
                     * Atomically insert the buffer into the hash, so that it can
                     * be found by findblk().
                     *
                     * If bgetvp() returns non-zero a collision occurred, and the
                     * bp will not be associated with the vnode.  The new
                     * buffer is invalidated and released, and the lookup is
                     * restarted.
                     *
                     * Make sure the translation layer has been cleared.
                     */
                    bp->b_loffset = loffset;
                    bp->b_bio2.bio_offset = NOOFFSET;
                    /* bp->b_bio2.bio_next = NULL; */

                    if (bgetvp(vp, bp, size)) {
                              bp->b_flags |= B_INVAL;
                              brelse(bp);
                              goto loop;
                    }

                    /*
                     * All vnode-based buffers must be backed by a VM object.
                     *
                     * Set B_KVABIO for incidental work
                     */
                    KKASSERT(vp->v_object != NULL);
                    bp->b_flags |= B_VMIO | B_KVABIO;
                    KKASSERT(bp->b_cmd == BUF_CMD_DONE);

                    /*
                     * Size the buffer to the caller's request.
                     */
                    allocbuf(bp, size);
          }

          /*
           * Do the nasty smp broadcast (if the buffer needs it) when KVABIO
           * is not supported.
           */
          if (bp && (blkflags & GETBLK_KVABIO) == 0) {
                    bkvasync_all(bp);
          }
          return (bp);
}

/*
 * regetblk(bp)
 *
 * Reacquire a buffer that was previously released to the locked queue,
 * or reacquire a buffer which is interlocked by having bioops->io_deallocate
 * set B_LOCKED (which handles the acquisition race).
 *
 * To this end, either B_LOCKED must be set or the dependency list must be
 * non-empty.
 */
void
regetblk(struct buf *bp)
{
          /*
           * Enforce the caller contract: the buffer must either be flagged
           * B_LOCKED or have a non-empty dependency list.
           */
          KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL);

          /*
           * Take the buffer lock exclusively.  LK_RETRY is passed and the
           * return value is not checked.
           * NOTE(review): presumably the lock cannot fail in this mode --
           * confirm against BUF_LOCK().
           */
          BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY);

          /*
           * Remove the now-locked buffer from its buffer queue.
           */
          bremfree(bp);
}

/*
 * allocbuf:
 *
 *        This code constitutes the buffer memory from either anonymous system
 *        memory (in the case of non-VMIO operations) or from an associated
 *        VM object (in the case of VMIO operations).  This code is able to
 *        resize a buffer up or down.
 *
 *        Note that this code is tricky, and has many complications to resolve
 *        deadlock or inconsistent data situations.  Tread lightly!!!
 *        There are B_CACHE and B_DELWRI interactions that must be dealt with by
 *        the caller.  Calling this code willy nilly can result in the loss of
 *        data.
 *
 *        allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 *        B_CACHE for the non-VMIO case.
 *
 *        This routine does not need to be called from a critical section but you
 *        must own the buffer.
 */
void
allocbuf(struct buf *bp, int size)
{
          vm_page_t m;
          int newbsize;
          int desiredpages;
          int i;

          /*
           * Sanity checks.  Per the panic messages below the buffer must
           * be busy (locked) and the requested size must fit within the
           * buffer's b_kvasize.  Only B_VMIO buffers are handled here.
           */
          if (BUF_LOCKINUSE(bp) == 0)
                    panic("allocbuf: buffer not busy");

          if (bp->b_kvasize < size)
                    panic("allocbuf: buffer too small");

          KKASSERT(bp->b_flags & B_VMIO);

          /*
           * Round the requested size up to a DEV_BSIZE multiple and work
           * out how many pages are needed to cover it, taking into account
           * the offset of b_loffset within its first page.
           */
          newbsize = roundup2(size, DEV_BSIZE);
          desiredpages = ((int)(bp->b_loffset & PAGE_MASK) +
                              newbsize + PAGE_MASK) >> PAGE_SHIFT;
          KKASSERT(desiredpages <= XIO_INTERNAL_PAGES);

          /*
           * Set B_CACHE initially if buffer is 0 length or will become
           * 0-length.
           */
          if (size == 0 || bp->b_bufsize == 0)
                    bp->b_flags |= B_CACHE;

          if (newbsize < bp->b_bufsize) {
                    /*
                     * DEV_BSIZE aligned new buffer size is less than the
                     * DEV_BSIZE aligned existing buffer size.  Figure out
                     * if we have to remove any pages.
                     */
                    if (desiredpages < bp->b_xio.xio_npages) {
                              for (i = desiredpages; i < bp->b_xio.xio_npages; i++) {
                                        /*
                                         * the page is not freed here -- it
                                         * is the responsibility of
                                         * vnode_pager_setsize
                                         */
                                        m = bp->b_xio.xio_pages[i];
                                        KASSERT(m != bogus_page,
                                            ("allocbuf: bogus page found"));
                                        vm_page_busy_wait(m, TRUE, "biodep");
                                        bp->b_xio.xio_pages[i] = NULL;
                                        vm_page_unwire(m, 0);
                                        vm_page_wakeup(m);
                              }

                              /*
                               * Remove the KVA mappings for the pages that
                               * were just dropped from the tail of the
                               * buffer.
                               */
                              pmap_qremove_noinval((vm_offset_t)
                                              trunc_page((vm_offset_t)bp->b_data) +
                                              (desiredpages << PAGE_SHIFT),
                                             (bp->b_xio.xio_npages - desiredpages));
                              bp->b_xio.xio_npages = desiredpages;

                              /*
                               * Don't bother invalidating the pmap changes
                               * (which wastes global SMP invalidation IPIs)
                               * when setting the size to 0.  This case occurs
                               * when called via getnewbuf() during buffer
                               * recyclement.
                               */
                              if (desiredpages == 0) {
                                        CPUMASK_ASSZERO(bp->b_cpumask);
                              } else {
                                        bkvareset(bp);
                              }
                    }
          } else if (size > bp->b_bcount) {
                    /*
                     * We are growing the buffer, possibly in a
                     * byte-granular fashion.
                     */
                    struct vnode *vp;
                    vm_object_t obj;
                    vm_offset_t toff;
                    vm_offset_t tinc;

                    /*
                     * Step 1, bring in the VM pages from the object,
                     * allocating them if necessary.  We must clear
                     * B_CACHE if these pages are not valid for the
                     * range covered by the buffer.
                     */
                    vp = bp->b_vp;
                    obj = vp->v_object;

                    vm_object_hold(obj);
                    while (bp->b_xio.xio_npages < desiredpages) {
                              vm_page_t m;
                              vm_pindex_t pi;
                              int error;

                              /*
                               * Object page index of the next page to
                               * append to the buffer.
                               */
                              pi = OFF_TO_IDX(bp->b_loffset) +
                                   bp->b_xio.xio_npages;

                              /*
                               * Blocking on m->busy_count might lead to a
                               * deadlock:
                               *
                               *  vm_fault->getpages->cluster_read->allocbuf
                               */
                              m = vm_page_lookup_busy_try(obj, pi, FALSE,
                                                                 &error);
                              if (error) {
                                        /*
                                         * Could not busy the page; sleep
                                         * and then retry the same index.
                                         */
                                        vm_page_sleep_busy(m, FALSE, "pgtblk");
                                        continue;
                              }
                              if (m == NULL) {
                                        /*
                                         * note: must allocate system pages
                                         * since blocking here could interfere
                                         * with paging I/O, no matter which
                                         * process we are.
                                         *
                                         * If the allocation fails the loop
                                         * simply retries the same index.
                                         */
                                        m = bio_page_alloc(bp, obj, pi,
                                                               desiredpages -
                                                                bp->b_xio.xio_npages);
                                        if (m) {
                                                  vm_page_wire(m);
                                                  vm_page_wakeup(m);
                                                  bp->b_flags &= ~B_CACHE;
                                                  bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
                                                  ++bp->b_xio.xio_npages;
                                        }
                                        continue;
                              }

                              /*
                               * We found a page and were able to busy it.
                               */
                              vm_page_wire(m);
                              vm_page_wakeup(m);
                              bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
                              ++bp->b_xio.xio_npages;
                              if (bp->b_act_count < m->act_count)
                                        bp->b_act_count = m->act_count;
                    }
                    vm_object_drop(obj);

                    /*
                     * Step 2.  We've loaded the pages into the buffer,
                     * we have to figure out if we can still have B_CACHE
                     * set.  Note that B_CACHE is set according to the
                     * byte-granular range ( bcount and size ), not the
                     * aligned range ( newbsize ).
                     *
                     * The VM test is against m->valid, which is DEV_BSIZE
                     * aligned.  Needless to say, the validity of the data
                     * needs to also be DEV_BSIZE aligned.  Note that this
                     * fails with NFS if the server or some other client
                     * extends the file's EOF.  If our buffer is resized,
                     * B_CACHE may remain set! XXX
                     */

                    /*
                     * Walk the newly added byte range [b_bcount, size) one
                     * page-bounded piece at a time.  The first piece runs
                     * to the end of the page containing the old end of
                     * the buffer, subsequent pieces are PAGE_SIZE each, and
                     * every piece is clipped to the remaining range.
                     */
                    toff = bp->b_bcount;
                    tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK);

                    while ((bp->b_flags & B_CACHE) && toff < size) {
                              vm_pindex_t pi;

                              if (tinc > (size - toff))
                                        tinc = size - toff;

                              /* index into xio_pages[] of the page holding toff */
                              pi = ((bp->b_loffset & PAGE_MASK) + toff) >>
                                  PAGE_SHIFT;

                              vfs_buf_test_cache(
                                  bp,
                                  bp->b_loffset,
                                  toff,
                                  tinc,
                                  bp->b_xio.xio_pages[pi]
                              );
                              toff += tinc;
                              tinc = PAGE_SIZE;
                    }

                    /*
                     * Step 3, fixup the KVM pmap.  Remember that
                     * bp->b_data is relative to bp->b_loffset, but
                     * bp->b_loffset may be offset into the first page.
                     */
                    bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
                    pmap_qenter_noinval((vm_offset_t)bp->b_data,
                                  bp->b_xio.xio_pages, bp->b_xio.xio_npages);
                    bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
                                              (vm_offset_t)(bp->b_loffset & PAGE_MASK));
                    bkvareset(bp);
          }

          /*
           * Adjust the global bufspace accounting by the change in the
           * DEV_BSIZE-aligned buffer size.  This must be done before
           * b_bufsize is updated below.
           */
          atomic_add_long(&bufspace, newbsize - bp->b_bufsize);

          /* adjust space use on already-dirty buffer */
          if (bp->b_flags & B_DELWRI) {
                    /* dirtykvaspace unchanged */
                    atomic_add_long(&dirtybufspace, newbsize - bp->b_bufsize);
                    if (bp->b_flags & B_HEAVY) {
                              atomic_add_long(&dirtybufspacehw,
                                                  newbsize - bp->b_bufsize);
                    }
          }
          bp->b_bufsize = newbsize;     /* actual buffer allocation   */
          bp->b_bcount = size;                    /* requested buffer size      */
          bufspacewakeup();
}

/*
 * biowait:
 *
 *        Wait for buffer I/O completion, returning error status. B_EINTR
 *        is converted into an EINTR error but not cleared (since a chain
 *        of biowait() calls may occur).
 *
 *        On return bpdone() will have been called but the buffer will remain
 *        locked and will not have been brelse()'d.
 *
 *        NOTE!  If a timeout is specified and ETIMEDOUT occurs the I/O is
 *        likely still in progress on return.
 *
 *        NOTE!  This operation is on a BIO, not a BUF.
 *
 *        NOTE!  BIO_DONE is cleared by vn_strategy()
 */
static __inline int
_biowait(struct bio *bio, const char *wmesg, int to)
{
          struct buf *bp = bio->bio_buf;
          u_int32_t flags;
          u_int32_t nflags;
          int error;

          KKASSERT(bio == &bp->b_bio1);
          for (;;) {
                    flags = bio->bio_flags;
                    if (flags & BIO_DONE)
                              break;
                    nflags = flags | BIO_WANT;
                    tsleep_interlock(bio, 0);
                    if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) {
                              if (wmesg)
                                        error = tsleep(bio, PINTERLOCKED, wmesg, to);
                              else if (bp->b_cmd == BUF_CMD_READ)
                                        error = tsleep(bio, PINTERLOCKED, "biord", to);
                              else
                                        error = tsleep(bio, PINTERLOCKED, "biowr", to);
                              if (error) {
                                        kprintf("tsleep error biowait %d\n", error);
                                        return (error);
                              }
                    }
          }

          /*
           * Finish up.
           */
          KKASSERT(bp->b_cmd == BUF_CMD_DONE);
          bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
          if (bp->b_flags & B_EINTR)
                    return (EINTR);
          if (bp->b_flags & B_ERROR)
                    return (bp->b_error ? bp->b_error : EIO);
          return (0);
}

int
biowait(struct bio *bio, const char *wmesg)
{
          int error;

          /* A timeout argument of 0 is passed through to _biowait(). */
          error = _biowait(bio, wmesg, 0);
          return (error);
}

int
biowait_timeout(struct bio *bio, const char *wmesg, int to)
{
          int error;

          /* Same as biowait() but hands the caller's timeout to _biowait(). */
          error = _biowait(bio, wmesg, to);
          return (error);
}

/*
 * This associates a tracking count with an I/O.  vn_strategy() and
 * dev_dstrategy() do this automatically but there are a few cases
 * where a vnode or device layer is bypassed when a block translation
 * is cached.  In such cases bio_start_transaction() may be called on
 * the bypassed layers so the system gets an I/O in progress indication
 * for those higher layers.
 */
void
bio_start_transaction(struct bio *bio, struct bio_track *track)
{
          struct buf *bp = bio->bio_buf;

          /*
           * Attach the tracking structure to the bio and take a
           * reference on it.
           */
          bio->bio_track = track;
          bio_track_ref(track);

          /* might stack */
          dsched_buf_enter(bp);
}

/*
 * Initiate I/O on a vnode.
 *
 * SWAPCACHE OPERATION:
 *
 *        Real buffer cache buffers have a non-NULL bp->b_vp.  Unfortunately
 *        devfs also uses b_vp for fake buffers so we also have to check
 *        that B_PAGING is 0.  In this case the passed 'vp' is probably the
 *        underlying block device.  The swap assignments are related to the
 *        buffer cache buffer's b_vp, not the passed vp.
 *
 *        The passed vp == bp->b_vp only in the case where the strategy call
 *        is made on the vp itself for its own buffers (a regular file or
 *        block device vp).  The filesystem usually then re-calls vn_strategy()
 *        after translating the request to an underlying device.
 *
 *        Cluster buffers set B_CLUSTER and the passed vp is the vp of the
 *        underlying buffer cache buffers.
 *
 *        We can only deal with page-aligned buffers at the moment, because
 *        we can't tell what the real dirty state for pages straddling a buffer
 *        are.
 *
 *        In order to call swap_pager_strategy() we must provide the VM object
 *        and base offset for the underlying buffer cache pages so it can find
 *        the swap blocks.
 */
void
vn_strategy(struct vnode *vp, struct bio *bio)
{
          struct buf *bp = bio->bio_buf;
          struct bio_track *track;

          KKASSERT(bp->b_cmd != BUF_CMD_DONE);

          /*
           * Flag the buffer as having had an I/O issued.  Consumers
           * (aka HAMMER) clear this flag, which lets them determine
           * whether I/O actually occurred.
           */
          bp->b_flags |= B_IOISSUED;

          /*
           * Give the swapcache a chance to intercept the request.  If it
           * takes the I/O we are done.
           *
           * NOTE: The swapcache itself always supports KVABIO and will
           *         do the right thing if its underlying devices do not.
           */
          if (vn_cache_strategy(vp, bio))
                    return;

          /*
           * A KVABIO buffer handed to a vnode which does not support
           * KVABIO must have b_data synchronized to all cpus before the
           * request is dispatched.
           */
          if ((vp->v_flag & VKVABIO) == 0 && (bp->b_flags & B_KVABIO) != 0)
                    bkvasync_all(bp);

          /*
           * Not intercepted, run the operation through the filesystem.
           * Reads and writes are tracked separately on the vnode.
           */
          track = (bp->b_cmd == BUF_CMD_READ) ? &vp->v_track_read :
                                                &vp->v_track_write;
          KKASSERT((bio->bio_flags & BIO_DONE) == 0);
          bio->bio_track = track;
          bio_track_ref(track);
          dsched_buf_enter(bp);         /* might stack */
          vop_strategy(*vp->v_ops, vp, bio);
}

/*
 * vn_cache_strategy()
 *
 * Returns 1 if the intercept was successful, 0 if not.
 *
 * NOTE: This function supports the KVABIO API wherein b_data might not
 *         be synchronized to the current cpu.
 */
static void vn_cache_strategy_callback(struct bio *bio);

int
vn_cache_strategy(struct vnode *vp, struct bio *bio)
{
          struct buf *bp = bio->bio_buf;
          struct bio *nbio;
          vm_object_t object;
          vm_page_t m;
          int i;

          /*
           * The swapcache is off-limits once we have paniced, are
           * dumping, or have dumped.
           */
          if (panicstr || dumping)
                    return(0);

          /*
           * Decide whether this buffer is a candidate for a swapcache
           * read.  Each test below disqualifies the buffer:
           *
           *  - swapcache reads are disabled
           *  - the request is not a read
           *  - a non-cluster buffer has no b_vp or is a B_PAGING buffer
           *  - the offset or the byte count is not page-aligned
           */
          if (vm_swapcache_read_enable == 0)
                    return(0);
          if (bp->b_cmd != BUF_CMD_READ)
                    return(0);
          if ((bp->b_flags & B_CLUSTER) == 0 &&
              (bp->b_vp == NULL || (bp->b_flags & B_PAGING)))
                    return(0);
          if (((int)bp->b_loffset & PAGE_MASK) != 0)
                    return(0);
          if ((bp->b_bcount & PAGE_MASK) != 0)
                    return(0);

          /*
           * Locate the VM object which backs the buffer's pages.  Swap
           * cached data is indexed by page index within that object, not
           * by bio->bio_offset.  Cluster buffers use the passed vp, all
           * other buffers use their own b_vp.
           */
          if (bp->b_flags & B_CLUSTER)
                    object = vp->v_object;
          else
                    object = bp->b_vp->v_object;

          /*
           * Every page in the buffer must be marked PG_SWAPPED and none
           * of them may be the bogus page.  Stop at the first offender.
           */
          for (i = 0; i < bp->b_xio.xio_npages; ++i) {
                    m = bp->b_xio.xio_pages[i];
                    if ((m->flags & PG_SWAPPED) == 0 || m == bogus_page)
                              break;
          }

          /*
           * Give up if the buffer has no pages at all (it does not
           * support object-backed I/O) or if the scan above stopped
           * early.
           */
          if (i == 0 || i != bp->b_xio.xio_npages)
                    return(0);

          /*
           * All pages qualify, push a new bio layer whose offset is the
           * object-relative byte offset of the first page and hand it to
           * the swap pager.
           */
          m = bp->b_xio.xio_pages[0];
          nbio = push_bio(bio);
          nbio->bio_done = vn_cache_strategy_callback;
          nbio->bio_offset = ptoa(m->pindex);
          KKASSERT(m->object == object);
          swap_pager_strategy(object, nbio);

          return(1);
}

/*
 * Completion callback for I/O diverted to the swapcache.
 *
 * This is a bit of a hack.  Because vn_cache_strategy() can override a
 * VFS's strategy function, the bio used for the swap I/O (probably bio2)
 * must not leak an unexpected offset value back to the filesystem.  The
 * filesystem (e.g. UFS) might otherwise assume that the bio went through
 * its own file strategy function and that the bio2 offset is a cached
 * disk offset when, in fact, it isn't.
 */
static void
vn_cache_strategy_callback(struct bio *bio)
{
          struct bio *obio;

          /* Scrub the offset before handing completion back up the chain */
          bio->bio_offset = NOOFFSET;
          obio = pop_bio(bio);
          biodone(obio);
}

/*
 * bpdone:
 *
 *        Finish I/O on a buffer after all BIOs have been processed.
 *        Called when the bio chain is exhausted or by biowait.  If called
 *        by biowait, elseit is typically 0.
 *
 *        bpdone is also responsible for setting B_CACHE in a B_VMIO bp.
 *        In a non-VMIO bp, B_CACHE will be set on the next getblk()
 *        assuming B_INVAL is clear.
 *
 *        For the VMIO case, we set B_CACHE if the op was a read and no
 *        read error occurred, or if the op was a write.  B_CACHE is never
 *        set if the buffer is invalid or otherwise uncacheable.
 *
 *        bpdone does not mess with B_INVAL, allowing the I/O routine or the
 *        initiator to leave B_INVAL set to brelse the buffer out of existence
 *        in the biodone routine.
 *
 *        bpdone is responsible for calling bundirty() on the buffer after a
 *        successful write.  We previously did this prior to initiating the
 *        write under the assumption that the buffer might be dirtied again
 *        while the write was in progress, however doing it before-hand creates
 *        a race condition prior to the call to vn_strategy() where the
 *        filesystem may not be aware that a dirty buffer is present.
 *        It should not be possible for the buffer or its underlying pages to
 *        be redirtied prior to bpdone()'s unbusying of the underlying VM
 *        pages.
 */
void
bpdone(struct buf *bp, int elseit)
{
          buf_cmd_t cmd;

          KASSERT(BUF_LOCKINUSE(bp), ("bpdone: bp %p not busy", bp));
          KASSERT(bp->b_cmd != BUF_CMD_DONE,
                    ("bpdone: bp %p already done!", bp));

          /*
           * No more BIOs are left.  All completion functions have been dealt
           * with, now we clean up the buffer.
           *
           * The original command is saved in 'cmd' because b_cmd is
           * overwritten with BUF_CMD_DONE right here.
           */
          cmd = bp->b_cmd;
          bp->b_cmd = BUF_CMD_DONE;

          /*
           * Only reads and writes are processed past this point.
           * Other commands just release the buffer if elseit is set;
           * BUF_CMD_FREEBLKS additionally marks the buffer B_NOCACHE.
           */
          if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) {
                    if (cmd == BUF_CMD_FREEBLKS)
                              bp->b_flags |= B_NOCACHE;
                    if (elseit)
                              brelse(bp);
                    return;
          }

          /*
           * A failed write must re-dirty the buffer unless B_INVAL
           * was set.
           *
           * A successful write must clear the dirty flag.  This is done after
           * the write to ensure that the buffer remains on the vnode's dirty
           * list for filesystem interlocks / checks until the write is actually
           * complete.  HAMMER2 is sensitive to this issue.
           *
           * Only applicable to normal buffers (with VPs).  vinum buffers may
           * not have a vp.
           *
           * Must be done prior to calling buf_complete() as the callback might
           * re-dirty the buffer.
           */
          if (cmd == BUF_CMD_WRITE) {
                    if ((bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) {
                              bp->b_flags &= ~B_NOCACHE;
                              if (bp->b_vp)
                                        bdirty(bp);
                    } else {
                              if (bp->b_vp)
                                        bundirty(bp);
                    }
          }

          /*
           * Warning: softupdates may re-dirty the buffer, and HAMMER can do
           * a lot worse.  XXX - move this above the clearing of b_cmd
           */
          if (LIST_FIRST(&bp->b_dep) != NULL)
                    buf_complete(bp);

          if (bp->b_flags & B_VMIO) {
                    int i;
                    vm_ooffset_t foff;
                    vm_page_t m;
                    vm_object_t obj;
                    int iosize;
                    struct vnode *vp = bp->b_vp;

                    obj = vp->v_object;

#if defined(VFS_BIO_DEBUG)
                    if (vp->v_auxrefs == 0)
                              panic("bpdone: zero vnode hold count");
                    if ((vp->v_flag & VOBJBUF) == 0)
                              panic("bpdone: vnode is not setup for merged cache");
#endif

                    foff = bp->b_loffset;
                    KASSERT(foff != NOOFFSET, ("bpdone: no buffer offset"));
                    KASSERT(obj != NULL, ("bpdone: missing VM object"));

#if defined(VFS_BIO_DEBUG)
                    if (obj->paging_in_progress < bp->b_xio.xio_npages) {
                              kprintf("bpdone: paging in progress(%d) < "
                                        "bp->b_xio.xio_npages(%d)\n",
                                        obj->paging_in_progress,
                                        bp->b_xio.xio_npages);
                    }
#endif

                    /*
                     * Set B_CACHE if the op was a normal read and no error
                     * occurred.  B_CACHE is set for writes in the b*write()
                     * routines.
                     *
                     * iosize is the number of bytes actually transferred
                     * (requested count less the residual).
                     */
                    iosize = bp->b_bcount - bp->b_resid;
                    if (cmd == BUF_CMD_READ &&
                        (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) {
                              bp->b_flags |= B_CACHE;
                    }

                    vm_object_hold(obj);
                    for (i = 0; i < bp->b_xio.xio_npages; i++) {
                              int resid;
                              int isbogus;

                              /*
                               * resid is the number of bytes from foff to
                               * the end of its page, limited to whatever
                               * is left of iosize.
                               */
                              resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
                              if (resid > iosize)
                                        resid = iosize;

                              /*
                               * cleanup bogus pages, restoring the originals.  Since
                               * the originals should still be wired, we don't have
                               * to worry about interrupt/freeing races destroying
                               * the VM object association.
                               */
                              m = bp->b_xio.xio_pages[i];
                              if (m == bogus_page) {
                                        if ((bp->b_flags & B_HASBOGUS) == 0)
                                                  panic("bpdone: bp %p corrupt bogus", bp);
                                        m = vm_page_lookup(obj, OFF_TO_IDX(foff));
                                        if (m == NULL)
                                                  panic("bpdone: page disappeared");
                                        bp->b_xio.xio_pages[i] = m;
                                        isbogus = 1;
                              } else {
                                        isbogus = 0;
                              }
#if defined(VFS_BIO_DEBUG)
                              if (OFF_TO_IDX(foff) != m->pindex) {
                                        kprintf("bpdone: foff(%lu)/m->pindex(%ld) "
                                                  "mismatch\n",
                                                  (unsigned long)foff, (long)m->pindex);
                              }
#endif

                              /*
                               * In the write case, the valid and clean bits are
                               * already changed correctly (see bdwrite()), so we
                               * only need to do this here in the read case.
                               *
                               * Pages which were substituted by the bogus page
                               * are skipped, as are pages past the end of
                               * the transferred data (resid == 0).
                               */
                              vm_page_busy_wait(m, FALSE, "bpdpgw");
                              if (cmd == BUF_CMD_READ && isbogus == 0 && resid > 0)
                                        vfs_clean_one_page(bp, i, m);

                              /*
                               * when debugging new filesystems or buffer I/O
                               * methods, this is the most common error that pops
                               * up.  if you see this, you have not set the page
                               * busy flag correctly!!!
                               */
                              if ((m->busy_count & PBUSY_MASK) == 0) {
                                        kprintf("bpdone: page busy < 0, "
                                            "pindex: %d, foff: 0x(%x,%x), "
                                            "resid: %d, index: %d\n",
                                            (int) m->pindex, (int)(foff >> 32),
                                                            (int) foff & 0xffffffff, resid, i);
                                        if (!vn_isdisk(vp, NULL))
                                                  kprintf(" iosize: %ld, loffset: %lld, "
                                                            "flags: 0x%08x, npages: %d\n",
                                                      bp->b_vp->v_mount->mnt_stat.f_iosize,
                                                      (long long)bp->b_loffset,
                                                      bp->b_flags, bp->b_xio.xio_npages);
                                        else
                                                  kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n",
                                                      (long long)bp->b_loffset,
                                                      bp->b_flags, bp->b_xio.xio_npages);
                                        kprintf(" valid: 0x%x, dirty: 0x%x, "
                                                  "wired: %d\n",
                                                  m->valid, m->dirty,
                                                  m->wire_count);
                                        panic("bpdone: page busy < 0");
                              }
                              vm_page_io_finish(m);
                              vm_page_wakeup(m);
                              vm_object_pip_wakeup(obj);

                              /*
                               * Advance foff to the start of the next page
                               * and account for the bytes just handled.
                               */
                              foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
                              iosize -= resid;
                    }

                    /*
                     * If bogus pages were in use, re-enter the (now
                     * restored) page array into the buffer's KVA and
                     * clear the flag.
                     */
                    if (bp->b_flags & B_HASBOGUS) {
                              pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data),
                                                      bp->b_xio.xio_pages,
                                                      bp->b_xio.xio_npages);
                              bp->b_flags &= ~B_HASBOGUS;
                              bkvareset(bp);
                    }
                    vm_object_drop(obj);
          }

          /*
           * Finish up by releasing the buffer.  There are no more synchronous
           * or asynchronous completions, those were handled by bio_done
           * callbacks.
           *
           * When elseit is 0 the buffer is left as-is for the caller to
           * release.
           */
          if (elseit) {
                    if (bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR|B_RELBUF))
                              brelse(bp);
                    else
                              bqrelse(bp);
          }
}

/*
 * Normal biodone.
 */
void
biodone(struct bio *bio)
{
          struct buf *bp = bio->bio_buf;

          runningbufwakeup(bp);

          /*
           * Run up the chain of BIO's.   Leave b_cmd intact for the duration.
           */
          while (bio) {
                    biodone_t *done_func;
                    struct bio_track *track;

                    /*
                     * BIO tracking.  Most but not all BIOs are tracked.
                     */
                    if ((track = bio->bio_track) != NULL) {
                              bio_track_rel(track);
                              bio->bio_track = NULL;
                    }

                    /*
                     * A bio_done function terminates the loop.  The function
                     * will be responsible for any further chaining and/or
                     * buffer management.
                     *
                     * WARNING!  The done function can deallocate the buffer!
                     */
                    if ((done_func = bio->bio_done) != NULL) {
                              bio->bio_done = NULL;
                              done_func(bio);
                              return;
                    }
                    bio = bio->bio_prev;
          }

          /*
           * If we've run out of bio's do normal [a]synchronous completion.
           */
          bpdone(bp, 1);
}

/*
 * Synchronous biodone - this terminates a synchronous BIO.
 *
 * bpdone() is called with elseit=FALSE, leaving the buffer completed
 * but still locked.  The caller must brelse() the buffer after waiting
 * for completion.
 */
void
biodone_sync(struct bio *bio)
{
          struct buf *bp = bio->bio_buf;
          int oflags;
          int nflags;

          KKASSERT(bio == &bp->b_bio1);

          /* Complete the buffer but leave it locked (elseit == 0) */
          bpdone(bp, 0);

          /*
           * Atomically set BIO_DONE and clear BIO_WANT, retrying until
           * the compare-and-set succeeds.  oflags then holds the flags
           * as they were just before the successful update.
           */
          do {
                    oflags = bio->bio_flags;
                    nflags = (oflags | BIO_DONE) & ~BIO_WANT;
          } while (atomic_cmpset_int(&bio->bio_flags, oflags, nflags) == 0);

          /* Someone was sleeping on the bio, wake them up */
          if (oflags & BIO_WANT)
                    wakeup(bio);
}

/*
 * vfs_unbusy_pages:
 *
 *        This routine is called in lieu of iodone in the case of
 *        incomplete I/O.  This keeps the busy status for pages
 *        consistent.
 */
void
vfs_unbusy_pages(struct buf *bp)
{
          int i;

          runningbufwakeup(bp);

          if (bp->b_flags & B_VMIO) {
                    struct vnode *vp = bp->b_vp;
                    vm_object_t obj;

                    obj = vp->v_object;
                    vm_object_hold(obj);

                    for (i = 0; i < bp->b_xio.xio_npages; i++) {
                              vm_page_t m = bp->b_xio.xio_pages[i];

                              /*
                               * When restoring bogus changes the original pages
                               * should still be wired, so we are in no danger of
                               * losing the object association and do not need
                               * critical section protection particularly.
                               */
                              if (m == bogus_page) {
                                        m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i);
                                        if (!m) {
                                                  panic("vfs_unbusy_pages: page missing");
                                        }
                                        bp->b_xio.xio_pages[i] = m;
                              }
                              vm_page_busy_wait(m, FALSE, "bpdpgw");
                              vm_page_io_finish(m);
                              vm_page_wakeup(m);
                              vm_object_pip_wakeup(obj);
                    }
                    if (bp->b_flags & B_HASBOGUS) {
                              pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data),
                                                      bp->b_xio.xio_pages,
                                                      bp->b_xio.xio_npages);
                              bp->b_flags &= ~B_HASBOGUS;
                              bkvareset(bp);
                    }
                    vm_object_drop(obj);
          }
}

/*
 * vfs_busy_pages:
 *
 *        This routine is called before a device strategy routine.
 *        It is used to tell the VM system that paging I/O is in
 *        progress, and treat the pages associated with the buffer
 *        almost as being PBUSY_LOCKED.  Also the object 'paging_in_progress'
 *        flag is handled to make sure that the object doesn't become
 *        inconsistent.
 *
 *        Since I/O has not been initiated yet, certain buffer flags
 *        such as B_ERROR or B_INVAL may be in an inconsistent state
 *        and should be ignored.
 */
void
vfs_busy_pages(struct vnode *vp, struct buf *bp)
{
          int i, bogus;
          struct lwp *lp = curthread->td_lwp;

          /*
           * The buffer's I/O command must already be set.  If reading,
           * B_CACHE must be 0 (double check against callers only doing
           * I/O when B_CACHE is 0).
           */
          KKASSERT(bp->b_cmd != BUF_CMD_DONE);
          KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0);

          if (bp->b_flags & B_VMIO) {
                    vm_object_t obj;

                    obj = vp->v_object;
                    KASSERT(bp->b_loffset != NOOFFSET,
                              ("vfs_busy_pages: no buffer offset"));

                    /*
                     * Busy all the pages.  We have to busy them all at once
                     * to avoid deadlocks.
                     *
                     * If any page cannot be busied immediately we sleep on
                     * it, vm_page_wakeup() every page busied so far in this
                     * pass, and restart from the first page.
                     */
retry:
                    for (i = 0; i < bp->b_xio.xio_npages; i++) {
                              vm_page_t m = bp->b_xio.xio_pages[i];

                              if (vm_page_busy_try(m, FALSE)) {
                                        vm_page_sleep_busy(m, FALSE, "vbpage");
                                        while (--i >= 0)
                                                  vm_page_wakeup(bp->b_xio.xio_pages[i]);
                                        goto retry;
                              }
                    }

                    /*
                     * Setup for I/O, soft-busy the page right now because
                     * the next loop may block.
                     *
                     * B_CLUSTER buffers skip both the paging-in-progress
                     * count and the soft-busy here.
                     * NOTE(review): presumably the cluster code accounts for
                     * its component buffers separately -- confirm.
                     */
                    for (i = 0; i < bp->b_xio.xio_npages; i++) {
                              vm_page_t m = bp->b_xio.xio_pages[i];

                              if ((bp->b_flags & B_CLUSTER) == 0) {
                                        vm_object_pip_add(obj, 1);
                                        vm_page_io_start(m);
                              }
                    }

                    /*
                     * Adjust protections for I/O and do bogus-page mapping.
                     * Assume that vm_page_protect() can block (it can block
                     * if VM_PROT_NONE, don't take any chances regardless).
                     *
                     * In particular note that for writes we must incorporate
                     * page dirtyness from the VM system into the buffer's
                     * dirty range.
                     *
                     * For reads we theoretically must incorporate page dirtyness
                     * from the VM system to determine if the page needs bogus
                     * replacement, but we shortcut the test by simply checking
                     * that all m->valid bits are set, indicating that the page
                     * is fully valid and does not need to be re-read.  For any
                     * VM system dirtyness the page will also be fully valid
                     * since it was mapped at one point.
                     */
                    bogus = 0;
                    for (i = 0; i < bp->b_xio.xio_npages; i++) {
                              vm_page_t m = bp->b_xio.xio_pages[i];

                              if (bp->b_cmd == BUF_CMD_WRITE) {
                                        /*
                                         * When readying a vnode-backed buffer for
                                         * a write we must zero-fill any invalid
                                         * portions of the backing VM pages, mark
                                         * it valid and clear related dirty bits.
                                         *
                                         * vfs_clean_one_page() incorporates any
                                         * VM dirtyness and updates the b_dirtyoff
                                         * range (after we've made the page RO).
                                         *
                                         * It is also expected that the pmap modified
                                         * bit has already been cleared by the
                                         * vm_page_protect().  We may not be able
                                         * to clear all dirty bits for a page if it
                                         * was also memory mapped (NFS).
                                         *
                                         * Finally be sure to unassign any swap-cache
                                         * backing store as it is now stale.
                                         */
                                        vm_page_protect(m, VM_PROT_READ);
                                        vfs_clean_one_page(bp, i, m);
                                        swap_pager_unswapped(m);
                              } else if (m->valid == VM_PAGE_BITS_ALL) {
                                        /*
                                         * When readying a vnode-backed buffer for
                                         * read we must replace any dirty pages with
                                         * a bogus page so dirty data is not destroyed
                                         * when filling gaps.
                                         *
                                         * To avoid testing whether the page is
                                         * dirty we instead test that the page was
                                         * at some point mapped (m->valid fully
                                         * valid) with the understanding that
                                         * this also covers the dirty case.
                                         *
                                         * Only the b_xio slot is redirected; the
                                         * real page 'm' is still woken up below.
                                         */
                                        bp->b_xio.xio_pages[i] = bogus_page;
                                        bp->b_flags |= B_HASBOGUS;
                                        bogus++;
                              } else if (m->valid & m->dirty) {
                                        /*
                                         * This case should not occur as partial
                                         * dirtying can only happen if the buffer
                                         * is B_CACHE, and this code is not entered
                                         * if the buffer is B_CACHE.  Warn, then
                                         * treat the page like any other page that
                                         * is being read into.
                                         */
                                        kprintf("Warning: vfs_busy_pages - page not "
                                                  "fully valid! loff=%jx bpf=%08x "
                                                  "idx=%d val=%02x dir=%02x\n",
                                                  (uintmax_t)bp->b_loffset, bp->b_flags,
                                                  i, m->valid, m->dirty);
                                        vm_page_protect(m, VM_PROT_NONE);
                              } else {
                                        /*
                                         * The page is not valid and can be made
                                         * part of the read.
                                         */
                                        vm_page_protect(m, VM_PROT_NONE);
                              }
                              /*
                               * Pairs with the vm_page_busy_try() in the first
                               * loop.
                               */
                              vm_page_wakeup(m);
                    }
                    /*
                     * At least one slot now points at bogus_page: re-enter
                     * the page array into the buffer's KVA and reset the
                     * buffer's cpu synchronization state.
                     */
                    if (bogus) {
                              pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data),
                                                      bp->b_xio.xio_pages,
                                                      bp->b_xio.xio_npages);
                              bkvareset(bp);
                    }
          }

          /*
           * This is the easiest place to put the process accounting for the I/O
           * for now.  Reads are charged to ru_inblock, every other command
           * to ru_oublock.  Nothing is charged when the current thread has
           * no lwp.
           */
          if (lp != NULL) {
                    if (bp->b_cmd == BUF_CMD_READ)
                              lp->lwp_ru.ru_inblock++;
                    else
                              lp->lwp_ru.ru_oublock++;
          }
}

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 *
 * NOTE: While we only really need to clean through to b_bcount, we
 *         just go ahead and clean through to b_bufsize.
 */
static void
vfs_clean_pages(struct buf *bp)
{
          int pageno;

          /* Buffers without B_VMIO are left untouched. */
          if (bp->b_flags & B_VMIO) {
                    KASSERT(bp->b_loffset != NOOFFSET,
                              ("vfs_clean_pages: no buffer offset"));

                    /*
                     * Hand every page in the buffer's page array to
                     * vfs_clean_one_page(), which restricts the operation
                     * to the portion of each page the buffer covers.
                     */
                    for (pageno = 0; pageno < bp->b_xio.xio_npages; ++pageno) {
                              vfs_clean_one_page(bp, pageno,
                                  bp->b_xio.xio_pages[pageno]);
                    }
          }
}

/*
 * vfs_clean_one_page:
 *
 *        Set the valid bits and clear the dirty bits in a page within a
 *        buffer.  The range is restricted to the buffer's size and the
 *        buffer's logical offset might index into the first page.
 *
 *        The caller has busied or soft-busied the page and it is not mapped,
 *        test and incorporate the dirty bits into b_dirtyoff/end before
 *        clearing them.  Note that we need to clear the pmap modified bits
 *        after determining that the page was dirty, vm_page_set_validclean()
 *        does not do it for us.
 *
 *        This routine is typically called after a read completes (dirty should
 *        be zero in that case as we are not called on bogus-replace pages),
 *        or before a write is initiated.
 */
static void
vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m)
{
          int pgbase;         /* byte offset of b_loffset within page 0 */
          int limit;          /* b_bcount shifted by pgbase */
          int first;          /* start of this page's range (inclusive) */
          int last;           /* end of this page's range (exclusive) */
          int relfirst;
          int rellast;

          /*
           * Work out which bytes of page 'pageno' the buffer covers.  The
           * range is measured from the start of page 0, so page 0 begins at
           * pgbase rather than 0 when b_loffset is not page aligned.  The
           * range is clipped to the buffer's byte count; an empty range
           * means this page has nothing to clean.
           */
          pgbase = (int)bp->b_loffset & PAGE_MASK;
          limit = bp->b_bcount + pgbase;

          first = (pageno == 0) ? pgbase : (pageno << PAGE_SHIFT);
          last = (pageno == 0) ? PAGE_SIZE : first + PAGE_SIZE;
          if (last > limit)
                    last = limit;
          if (first >= last)
                    return;

          /*
           * Fold VM dirtyness into the buffer's b_dirtyoff/b_dirtyend.
           *
           * Newly incorporated dirty data invalidates any prior
           * B_NEEDCOMMIT state (NFS) because the caller has not accounted
           * for it.  A memory-mapped page may have dirty bits beyond the
           * end of the buffer, and we cannot assume a file EOF straddles
           * the buffer, so only dirty bits inside the buffer's range are
           * considered when deciding to clear B_NEEDCOMMIT (this also cuts
           * down on console spam).
           *
           * B_CLUSTEROK is cleared together with B_NEEDCOMMIT: NFS can
           * handle huge commits but not huge writes.
           */
          vm_page_test_dirty(m);
          if (m->dirty) {
                    if ((bp->b_flags & B_NEEDCOMMIT) &&
                        (m->dirty &
                         vm_page_bits(first & PAGE_MASK, last - first))) {
                              if (debug_commit) {
                                        kprintf("Warning: vfs_clean_one_page: bp %p "
                                            "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT"
                                            " cmd %d vd %02x/%02x x/s/e %d %d %d "
                                            "doff/end %d %d\n",
                                            bp, (uintmax_t)bp->b_loffset,
                                            bp->b_bcount, bp->b_flags, bp->b_cmd,
                                            m->valid, m->dirty,
                                            pgbase, first, last,
                                            bp->b_dirtyoff, bp->b_dirtyend);
                              }
                              bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
                              if (debug_commit) {
                                        print_backtrace(-1);
                              }
                    }

                    /*
                     * The pmap modified bits are only cleared when the
                     * whole page is dirty, otherwise the system might
                     * mis-clear portions of a page.
                     */
                    if ((bp->b_flags & B_NEEDCOMMIT) == 0 &&
                        m->dirty == VM_PAGE_BITS_ALL) {
                              pmap_clear_modify(m);
                    }

                    /*
                     * Widen the buffer-relative dirty range so it includes
                     * this page's portion.
                     */
                    relfirst = first - pgbase;
                    rellast = last - pgbase;
                    if (bp->b_dirtyoff > relfirst)
                              bp->b_dirtyoff = relfirst;
                    if (bp->b_dirtyend < rellast)
                              bp->b_dirtyend = rellast;
          }

          /*
           * Mark the covered range valid and clean.  This does not touch
           * the pmap modified bit.
           *
           * WARNING!  m->dirty cannot simply be zeroed here.  Buffer cache
           *             buffers may use a DEV_BSIZE'd aligned block size or
           *             have an odd size (e.g. NFS at file EOF).  The
           *             putpages code can clear m->dirty to 0.
           *
           *             When a VOP_WRITE creates a buffer covering the same
           *             space as mapped writable pages, the buffer flush
           *             might not be able to clear every dirty bit and a
           *             putpages from the VM system is still required to
           *             finish the job.
           *
           * WARNING!  vm_page_set_validclean() currently assumes vm_token
           *             is held.  The page might not be busied (bdwrite()
           *             case).  XXX remove this comment once we've validated
           *             that this is no longer an issue.
           */
          vm_page_set_validclean(m, first & PAGE_MASK, last - first);
}

#if 0
/*
 * Similar to vfs_clean_one_page() but sets the bits to valid and dirty.
 * The page data is assumed to be valid (there is no zeroing here).
 */
static void
vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m)
{
          int pgbase;         /* byte offset of b_loffset within page 0 */
          int limit;          /* b_bcount shifted by pgbase */
          int first;          /* start of this page's range (inclusive) */
          int last;           /* end of this page's range (exclusive) */

          /*
           * Same range computation as vfs_clean_one_page(): the range is
           * measured from the start of page 0 and clipped to the buffer's
           * byte count.
           */
          pgbase = (int)bp->b_loffset & PAGE_MASK;
          limit = bp->b_bcount + pgbase;

          first = (pageno == 0) ? pgbase : (pageno << PAGE_SHIFT);
          last = (pageno == 0) ? PAGE_SIZE : first + PAGE_SIZE;
          if (last > limit)
                    last = limit;

          /* Only a non-empty range is marked valid and dirty. */
          if (first < last)
                    vm_page_set_validdirty(m, first & PAGE_MASK, last - first);
}
#endif

/*
 * vfs_bio_clrbuf:
 *
 *        Clear a buffer.  This routine essentially fakes an I/O, so we need
 *        to clear B_ERROR and B_INVAL.
 *
 *        Note that while we only theoretically need to clear through b_bcount,
 *        we go ahead and clear through b_bufsize.
 */
void
vfs_bio_clrbuf(struct buf *bp)
{
          int i, mask = 0;
          caddr_t sa, ea;
          KKASSERT(bp->b_flags & B_VMIO);

          bp->b_flags &= ~(B_INVAL | B_EINTR | B_ERROR);
          bkvasync(bp);

          if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
              (bp->b_loffset & PAGE_MASK) == 0) {
                    mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
                    if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) {
                              bp->b_resid = 0;
                              return;
                    }
                    if ((bp->b_xio.xio_pages[0]->valid & mask) == 0) {
                              bzero(bp->b_data, bp->b_bufsize);
                              bp->b_xio.xio_pages[0]->valid |= mask;
                              bp->b_resid = 0;
                              return;
                    }
          }
          sa = bp->b_data;
          for(i = 0; i < bp->b_xio.xio_npages; i++, sa=ea) {
                    int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
                    ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
                    ea = (caddr_t)(vm_offset_t)ulmin(
                                  (u_long)(vm_offset_t)ea,
                                  (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
                    mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
                    if ((bp->b_xio.xio_pages[i]->valid & mask) == mask)
                              continue;
                    if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) {
                              bzero(sa, ea - sa);
                    } else {
                              for (; sa < ea; sa += DEV_BSIZE, j++) {
                                        if ((bp->b_xio.xio_pages[i]->valid &
                                            (1<<j)) == 0) {
                                                  bzero(sa, DEV_BSIZE);
                                        }
                              }
                    }
                    bp->b_xio.xio_pages[i]->valid |= mask;
          }
          bp->b_resid = 0;
}

/*
 * Allocate a page for a buffer cache buffer.
 *
 * If NULL is returned the caller is expected to retry (typically check if
 * the page already exists on retry before trying to allocate one).
 *
 * NOTE! Low-memory handling is dealt with in b[q]relse(), not here.  This
 *         function will use the system reserve with the hope that the page
 *         allocations can be returned to PQ_CACHE/PQ_FREE when the caller
 *         is done with the buffer.
 *
 * NOTE! However, TMPFS is a special case because flushing a dirty buffer
 *         to TMPFS doesn't clean the page.  For TMPFS, only the pagedaemon
 *         is capable of retiring pages (to swap).  For TMPFS we don't dig
 *         into the system reserve because doing so could stall out pretty
 *         much every process running on the system.
 */
static
vm_page_t
bio_page_alloc(struct buf *bp, vm_object_t obj, vm_pindex_t pg, int deficit)
{
          vm_page_t page;
          int vmflags;

          ASSERT_LWKT_TOKEN_HELD(vm_object_token(obj));

          /*
           * Rotate the effective cpu-base used for BIO page allocations so
           * a single cpu's page queues are not exhausted.  The goal is to
           * avoid contention, so we want all the cpus to be in lockstep with
           * different cpuids; really serious contention in the kernel page
           * allocator can occur without this.
           *
           * This is kinda anti-NUMA, but localizing file data is a really
           * hard call.  It works great in some situations (temporary files
           * in tmpfs) and horribly in others.
           *
           * XXX add some NUMA relocalization (2 zones or 4 zones).
           */
          vmflags = VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK;
          vmflags |= VM_ALLOC_CPU((mycpu->gd_cpuid + (u_short)ticks) % ncpus);

          /*
           * Attempt #1: a normal allocation.  After each failed attempt,
           * if the page now exists in the object we return NULL so the
           * caller can retry and find it.
           */
          if ((page = vm_page_alloc(obj, pg, vmflags)) != NULL)
                    return (page);
          if (vm_page_lookup(obj, pg) != NULL)
                    return (NULL);
          vm_pageout_deficit += deficit;

          /*
           * Attempt #2: dig into the system reserve.  System threads may
           * additionally use the interrupt reserve.  Buffers on TMPFS vnodes
           * get no extra flags at all.
           *
           * Trying to recover pages from the buffer cache here can deadlock
           * against other threads trying to busy underlying pages, so we
           * depend on brelse() and bqrelse() to free/cache the underlying
           * buffer cache pages when memory is low.
           */
          if (curthread->td_flags & TDF_SYSTHREAD) {
                    vmflags |= VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT;
          } else if (bp->b_vp == NULL || bp->b_vp->v_tag != VT_TMPFS) {
                    vmflags |= VM_ALLOC_SYSTEM;
          }

          if ((page = vm_page_alloc(obj, pg, vmflags)) != NULL)
                    return (page);
          if (vm_page_lookup(obj, pg) != NULL)
                    return (NULL);

          /*
           * Attempt #3: wait briefly for memory to free up, then try once
           * more with the same flags.
           */
          if (vm_paging_severe())
                    ++lowmempgallocs;
          vm_wait(hz / 20 + 1);

          if ((page = vm_page_alloc(obj, pg, vmflags)) != NULL)
                    return (page);
          if (vm_page_lookup(obj, pg) != NULL)
                    return (NULL);

          /*
           * Out of options.  Report the condition (rate limited, verbose
           * boots only), wait again -- non-system threads wait longer --
           * and let the caller retry.
           */
          if (bootverbose) {
                    static struct krate biokrate = { .freq = 1 };

                    krateprintf(&biokrate,
                        "Warning: bio_page_alloc: memory exhausted "
                        "during buffer cache page allocation from %s\n",
                        curthread->td_comm);
          }
          vm_wait((curthread->td_flags & TDF_SYSTHREAD) ?
              hz / 20 + 1 : hz / 2 + 1);
          return (NULL);
}

/*
 * The buffer's mapping has changed.  Adjust the buffer's memory
 * synchronization.  The caller is the exclusive holder of the buffer
 * and has set or cleared B_KVABIO according to preference.
 *
 * WARNING! If the caller is using B_KVABIO mode, this function will
 *            not map the data to the current cpu.  The caller must also
 *            call bkvasync(bp).
 */
void
bkvareset(struct buf *bp)
{
          /*
           * Without B_KVABIO the mapping is synchronized right here: every
           * active cpu is recorded in b_cpumask and TLB invalidations are
           * issued via smp_invltlb() and cpu_invltlb().
           */
          if ((bp->b_flags & B_KVABIO) == 0) {
                    CPUMASK_ORMASK(bp->b_cpumask, smp_active_mask);
                    smp_invltlb();
                    cpu_invltlb();
                    return;
          }

          /*
           * With B_KVABIO the mask is simply emptied; each cpu must call
           * bkvasync() before it uses the buffer's data.
           */
          CPUMASK_ASSZERO(bp->b_cpumask);
}

/*
 * The buffer will be used by the caller on the caller's cpu, synchronize
 * its data to the current cpu.  Caller must control the buffer by holding
 * its lock, but calling cpu does not necessarily have to be the owner of
 * the lock (i.e. HAMMER2's concurrent I/O accessors).
 *
 * If B_KVABIO is not set, the buffer is already fully synchronized.
 */
void
bkvasync(struct buf *bp)
{
          int cpuid = mycpu->gd_cpuid;
          char *bdata;

          if ((bp->b_flags & B_KVABIO) &&
              CPUMASK_TESTBIT(bp->b_cpumask, cpuid) == 0) {
                    bdata = bp->b_data;
                    while (bdata < bp->b_data + bp->b_bufsize) {
                              cpu_invlpg(bdata);
                              bdata += PAGE_SIZE -
                                         ((intptr_t)bdata & PAGE_MASK);
                    }
                    ATOMIC_CPUMASK_ORBIT(bp->b_cpumask, cpuid);
          }
}

/*
 * The buffer will be used by a subsystem that does not understand
 * the KVABIO API.  Make sure its data is synchronized to all cpus.
 *
 * If B_KVABIO is not set, the buffer is already fully synchronized.
 *
 * NOTE! This is the only safe way to clear B_KVABIO on a buffer.
 */
void
bkvasync_all(struct buf *bp)
{
          /*
           * Debug aid: while the debug_kvabio counter is positive, consume
           * one count and print a backtrace of the caller.
           */
          if (debug_kvabio > 0) {
                    --debug_kvabio;
                    print_backtrace(10);
          }

          /*
           * For a B_KVABIO buffer whose cpu mask differs from the active
           * cpu mask, issue TLB invalidations and merge the active mask
           * into b_cpumask.
           */
          if (bp->b_flags & B_KVABIO) {
                    if (CPUMASK_CMPMASKNEQ(bp->b_cpumask, smp_active_mask)) {
                              smp_invltlb();
                              cpu_invltlb();
                              ATOMIC_CPUMASK_ORMASK(bp->b_cpumask,
                                  smp_active_mask);
                    }
          }

          /* B_KVABIO is always cleared on the way out. */
          bp->b_flags &= ~B_KVABIO;
}

/*
 * Scan all buffers in the system and issue the callback.
 */
int
scan_all_buffers(int (*callback)(struct buf *, void *), void *info)
{
          long idx;
          int total = 0;
          int rv;

          /*
           * Invoke the callback on each of the nbuf entries of buf[].
           * Non-negative results are summed.  A negative result stops the
           * scan and is returned as-is, discarding the running sum.
           */
          for (idx = 0; idx < nbuf; ++idx) {
                    rv = callback(&buf[idx], info);
                    if (rv < 0)
                              return (rv);
                    total += rv;
          }
          return (total);
}

/*
 * nestiobuf_iodone: biodone callback for nested buffers and propagate
 * completion to the master buffer.
 */
static void
nestiobuf_iodone(struct bio *bio)
{
          struct buf *bp = bio->bio_buf;
          struct bio *mbio = bio->bio_caller_info1.ptr;
          struct devstat *stats = bio->bio_caller_info2.ptr;
          struct buf *mbp = mbio->bio_buf;
          int donebytes;
          int error;

          KKASSERT(bp->b_bcount <= bp->b_bufsize);
          KKASSERT(mbp != bp);

          /*
           * A short transfer with no error code is turned into EIO since
           * there is no way to propagate a partial completion to the
           * master buffer.
           */
          error = bp->b_error;
          if (error == 0 &&
              (bp->b_resid > 0 || bp->b_bcount < bp->b_bufsize)) {
                    error = EIO;
          }

          /* Read the size before the nested buffer is released. */
          donebytes = bp->b_bufsize;
          relpbuf(bp, NULL);

          nestiobuf_done(mbio, donebytes, error, stats);
}

void
nestiobuf_done(struct bio *mbio, int donebytes, int error, struct devstat *stats)
{
          struct buf *mbp;

          mbp = mbio->bio_buf;

          KKASSERT((int)(intptr_t)mbio->bio_driver_info > 0);

          /*
           * If an error occured, propagate it to the master buffer.
           *
           * Several biodone()s may wind up running concurrently so
           * use an atomic op to adjust b_flags.
           */
          if (error) {
                    mbp->b_error = error;
                    atomic_set_int(&mbp->b_flags, B_ERROR);
          }

          /*
           * Decrement the operations in progress counter and terminate the
           * I/O if this was the last bit.
           */
          if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) {
                    mbp->b_resid = 0;
                    if (stats)
                              devstat_end_transaction_buf(stats, mbp);
                    biodone(mbio);
          }
}

/*
 * Initialize a nestiobuf for use.  Set an initial count of 1 to prevent
 * the mbio from being biodone()'d while we are still adding sub-bios to
 * it.
 */
void
nestiobuf_init(struct bio *bio)
{
          bio->bio_driver_info = (void *)1;
}

/*
 * The BIOs added to the nestedio have already been started, remove the
 * count that placeheld our mbio and biodone() it if the count would
 * transition to 0.
 */
void
nestiobuf_start(struct bio *mbio)
{
          struct buf *mbp = mbio->bio_buf;

          /*
           * Remove the placeholder count.  If sub-bios are still
           * outstanding the last one to finish completes the master.
           */
          if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) != 1)
                    return;

          /*
           * We held the last count: finish the master here.  The whole
           * request is reported as residual if an error was flagged.
           */
          mbp->b_resid = (mbp->b_flags & B_ERROR) ? mbp->b_bcount : 0;
          biodone(mbio);
}

/*
 * Set an intermediate error prior to calling nestiobuf_start()
 */
void
nestiobuf_error(struct bio *mbio, int error)
{
          struct buf *mbp = mbio->bio_buf;

          /* A zero error leaves the master buffer untouched. */
          if (error == 0)
                    return;

          /* Record the error; B_ERROR is set with an atomic op. */
          mbp->b_error = error;
          atomic_set_int(&mbp->b_flags, B_ERROR);
}

/*
 * nestiobuf_add: setup a "nested" buffer.
 *
 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
 * => 'bp' should be a buffer allocated by getiobuf.
 * => 'offset' is a byte offset in the master buffer.
 * => 'size' is a size in bytes of this nested buffer.
 */
void
nestiobuf_add(struct bio *mbio, struct buf *bp, int offset, size_t size, struct devstat *stats)
{
          struct buf *mbp = mbio->bio_buf;
          struct vnode *vp = mbp->b_vp;

          /* The nested range must lie within the master's byte count. */
          KKASSERT(mbp->b_bcount >= offset + size);

          /* Count one more outstanding sub-operation on the master. */
          atomic_add_int((int *)&mbio->bio_driver_info, 1);

          /* kernel needs to own the lock for it to be released in biodone */
          BUF_KERNPROC(bp);

          /*
           * The nested buffer takes the master's vnode and command and
           * points at a window of the master's data.
           */
          bp->b_vp = vp;
          bp->b_cmd = mbp->b_cmd;
          bp->b_data = (char *)mbp->b_data + offset;
          bp->b_bcount = size;
          bp->b_resid = bp->b_bcount;
          bp->b_bufsize = bp->b_bcount;

          /*
           * Completion goes through nestiobuf_iodone(), which reads the
           * master bio and the devstat pointer from the caller-info slots.
           */
          bp->b_bio1.bio_done = nestiobuf_iodone;
          bp->b_bio1.bio_track = NULL;
          bp->b_bio1.bio_caller_info1.ptr = mbio;
          bp->b_bio1.bio_caller_info2.ptr = stats;
}

/*
 * Return a static, human-readable name for the buffer's current b_cmd.
 * Commands not listed here yield "(UNKNOWN)".
 */
const char *
buf_cmd_name(struct buf *bp)
{
          switch(bp->b_cmd) {
          case BUF_CMD_DONE:
                    return "(DONE)";
          case BUF_CMD_READ:
                    return "READ";
          case BUF_CMD_WRITE:
                    return "WRITE";
          case BUF_CMD_FREEBLKS:
                    return "FREEBLKS";
          case BUF_CMD_FORMAT:
                    return "FORMAT";
          case BUF_CMD_FLUSH:
                    return "FLUSH";
          default:
                    break;
          }
          return "(UNKNOWN)";
}


#ifdef DDB

/*
 * DDB "show buffer <addr>": dump the main fields of the struct buf at
 * the given address, followed by the (object, pindex, physical address)
 * of each page in its b_xio page array.
 */
DB_SHOW_COMMAND(buffer, db_show_buffer)
{
          /* get args */
          struct buf *bp = (struct buf *)addr;

          if (!have_addr) {
                    db_printf("usage: show buffer <addr>\n");
                    return;
          }

          db_printf("b_flags = 0x%pb%i\n", PRINT_BUF_FLAGS, bp->b_flags);
          db_printf("b_cmd = %d\n", bp->b_cmd);

          /*
           * The line break after b_resid used to be emitted before the
           * separating comma, so the second line began with a stray ", ".
           * The line now breaks cleanly between b_resid and b_data.
           *
           * The "phys" offset is taken from the bio chained after b_bio2,
           * or printed as -1 when there is none.
           */
          db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, "
                      "b_resid = %d\nb_data = %p, "
                      "bio_offset(disk) = %lld, bio_offset(phys) = %lld\n",
                      bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
                      bp->b_data,
                      (long long)bp->b_bio2.bio_offset,
                      (long long)(bp->b_bio2.bio_next ?
                                        bp->b_bio2.bio_next->bio_offset : (off_t)-1));
          if (bp->b_xio.xio_npages) {
                    int i;

                    db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ",
                              bp->b_xio.xio_npages);
                    for (i = 0; i < bp->b_xio.xio_npages; i++) {
                              vm_page_t m;

                              m = bp->b_xio.xio_pages[i];
                              db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
                                  (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
                              /* Comma-separate the entries, none after the last. */
                              if ((i + 1) < bp->b_xio.xio_npages)
                                        db_printf(",");
                    }
                    db_printf("\n");
          }
}
#endif /* DDB */