1 /** $MirOS: src/sys/kern/vfs_cluster.c,v 1.3 2005/07/04 00:10:43 tg Exp $ */
2 /* $OpenBSD: vfs_cluster.c,v 1.33 2004/10/26 17:16:27 pedro Exp $ */
3 /* $NetBSD: vfs_cluster.c,v 1.12 1996/04/22 01:39:05 christos Exp $ */
4
5 /*-
6 * Copyright (c) 1993
7 * The Regents of the University of California. All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)vfs_cluster.c 8.8 (Berkeley) 7/28/94
34 */
35
36 #include <sys/param.h>
37 #include <sys/proc.h>
38 #include <sys/buf.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/malloc.h>
42 #include <sys/systm.h>
43 #include <sys/resourcevar.h>
44
45 #include <uvm/uvm_extern.h>
46
47 /*
48 * Local declarations
49 */
50 void cluster_callback(struct buf *);
51 struct buf *cluster_newbuf(struct vnode *, struct buf *, long, daddr_t,
52 daddr_t, long, int);
53 struct buf *cluster_rbuild(struct vnode *, u_quad_t, struct buf *,
54 daddr_t, daddr_t, long, int, long);
55 void cluster_wbuild(struct vnode *, struct buf *, long,
56 daddr_t, int, daddr_t);
57 struct cluster_save *cluster_collectbufs(struct vnode *,
58 struct cluster_info *, struct buf *);
59
#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;

/*
 * ISSEQREAD(ci, blk): true when a read of logical block `blk' counts as
 * sequential with respect to the last block read recorded in
 * (ci)->ci_lastr, i.e. blk is either that block again or the one right
 * after it.  Block zero only qualifies when doclusterraz is non-zero.
 *
 * Both arguments are evaluated more than once; pass expressions without
 * side effects.
 */
#define	ISSEQREAD(ci, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (ci)->ci_lastr + 1 || (blk) == (ci)->ci_lastr))
#else
/*
 * ISSEQREAD(ci, blk): true when a read of logical block `blk' counts as
 * sequential with respect to the last block read recorded in
 * (ci)->ci_lastr, i.e. blk is either that block again or the one right
 * after it.  A read of block zero is never treated as sequential.
 *
 * Both arguments are evaluated more than once; pass expressions without
 * side effects.
 */
#define	ISSEQREAD(ci, blk) \
	((blk) != 0 && ((blk) == (ci)->ci_lastr + 1 || (blk) == (ci)->ci_lastr))
#endif
81
82 /*
83 * This replaces bread. If this is a bread at the beginning of a file and
84 * lastr is 0, we assume this is the first read and we'll read up to two
85 * blocks if they are sequential. After that, we'll do regular read ahead
86 * in clustered chunks.
87 *
88 * There are 4 or 5 cases depending on how you count:
89 * Desired block is in the cache:
90 * 1 Not sequential access (0 I/Os).
91 * 2 Access is sequential, do read-ahead (1 ASYNC).
92 * Desired block is not in cache:
93 * 3 Not sequential access (1 SYNC).
94 * 4 Sequential access, next block is contiguous (2 SYNC).
95 * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
96 *
97 * There are potentially two buffers that require I/O.
98 * bp is the block requested.
99 * rbp is the read-ahead block.
100 * If either is NULL, then you don't have to do the I/O.
101 */
102 int
cluster_read(vp,ci,filesize,lblkno,size,cred,bpp)103 cluster_read(vp, ci, filesize, lblkno, size, cred, bpp)
104 struct vnode *vp;
105 struct cluster_info *ci;
106 u_quad_t filesize;
107 daddr_t lblkno;
108 long size;
109 struct ucred *cred;
110 struct buf **bpp;
111 {
112 struct buf *bp, *rbp;
113 daddr_t blkno, ioblkno;
114 long flags;
115 int error, num_ra, alreadyincore;
116
117 #ifdef DIAGNOSTIC
118 if (size == 0)
119 panic("cluster_read: size = 0");
120 #endif
121
122 error = 0;
123 flags = B_READ;
124 *bpp = bp = getblk(vp, lblkno, size, 0, 0);
125 if (bp->b_flags & B_CACHE) {
126 /*
127 * Desired block is in cache; do any readahead ASYNC.
128 * Case 1, 2.
129 */
130 flags |= B_ASYNC;
131 ioblkno = lblkno + (ci->ci_ralen ? ci->ci_ralen : 1);
132 alreadyincore = incore(vp, ioblkno) != NULL;
133 bp = NULL;
134 } else {
135 /* Block wasn't in cache, case 3, 4, 5. */
136 bp->b_flags |= B_READ;
137 ioblkno = lblkno;
138 alreadyincore = 0;
139 curproc->p_stats->p_ru.ru_inblock++; /* XXX */
140 }
141 /*
142 * XXX
143 * Replace 1 with a window size based on some permutation of
144 * maxcontig and rot_delay. This will let you figure out how
145 * many blocks you should read-ahead (case 2, 4, 5).
146 *
147 * If the access isn't sequential, reset the window to 1.
148 * Note that a read to the same block is considered sequential.
149 * This catches the case where the file is being read sequentially,
150 * but at smaller than the filesystem block size.
151 */
152 rbp = NULL;
153 if (!ISSEQREAD(ci, lblkno)) {
154 ci->ci_ralen = 0;
155 ci->ci_maxra = lblkno;
156 } else if ((u_quad_t)(ioblkno + 1) * (u_quad_t)size <= filesize &&
157 !alreadyincore &&
158 !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
159 blkno != -1) {
160 /*
161 * Reading sequentially, and the next block is not in the
162 * cache. We are going to try reading ahead.
163 */
164 if (num_ra) {
165 /*
166 * If our desired readahead block had been read
167 * in a previous readahead but is no longer in
168 * core, then we may be reading ahead too far
169 * or are not using our readahead very rapidly.
170 * In this case we scale back the window.
171 */
172 if (!alreadyincore && ioblkno <= ci->ci_maxra)
173 ci->ci_ralen = max(ci->ci_ralen >> 1, 1);
174 /*
175 * There are more sequential blocks than our current
176 * window allows, scale up. Ideally we want to get
177 * in sync with the filesystem maxcontig value.
178 */
179 else if (num_ra > ci->ci_ralen && lblkno != ci->ci_lastr)
180 ci->ci_ralen = ci->ci_ralen ?
181 min(num_ra, ci->ci_ralen << 1) : 1;
182
183 if (num_ra > ci->ci_ralen)
184 num_ra = ci->ci_ralen;
185 }
186
187 if (num_ra) /* case 2, 4 */
188 rbp = cluster_rbuild(vp, filesize,
189 bp, ioblkno, blkno, size, num_ra, flags);
190 else if (ioblkno == lblkno) {
191 bp->b_blkno = blkno;
192 /* Case 5: check how many blocks to read ahead */
193 ++ioblkno;
194 if ((u_quad_t)(ioblkno + 1) * (u_quad_t)size >
195 filesize ||
196 incore(vp, ioblkno) || (error = VOP_BMAP(vp,
197 ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
198 goto skip_readahead;
199 /*
200 * Adjust readahead as above.
201 * Don't check alreadyincore, we know it is 0 from
202 * the previous conditional.
203 */
204 if (num_ra) {
205 if (ioblkno <= ci->ci_maxra)
206 ci->ci_ralen = max(ci->ci_ralen >> 1, 1);
207 else if (num_ra > ci->ci_ralen &&
208 lblkno != ci->ci_lastr)
209 ci->ci_ralen = ci->ci_ralen ?
210 min(num_ra,ci->ci_ralen<<1) : 1;
211 if (num_ra > ci->ci_ralen)
212 num_ra = ci->ci_ralen;
213 }
214 flags |= B_ASYNC;
215 if (num_ra)
216 rbp = cluster_rbuild(vp, filesize,
217 NULL, ioblkno, blkno, size, num_ra, flags);
218 else {
219 rbp = getblk(vp, ioblkno, size, 0, 0);
220 rbp->b_flags |= flags;
221 rbp->b_blkno = blkno;
222 }
223 } else {
224 /* case 2; read ahead single block */
225 rbp = getblk(vp, ioblkno, size, 0, 0);
226 rbp->b_flags |= flags;
227 rbp->b_blkno = blkno;
228 }
229
230 if (rbp == bp) /* case 4 */
231 rbp = NULL;
232 else if (rbp) /* case 2, 5 */
233 curproc->p_stats->p_ru.ru_inblock++; /* XXX */
234 }
235
236 /* XXX Kirk, do we need to make sure the bp has creds? */
237 skip_readahead:
238 if (bp) {
239 if (bp->b_flags & (B_DONE | B_DELWRI))
240 panic("cluster_read: DONE bp");
241 else
242 error = VOP_STRATEGY(bp);
243 }
244
245 if (rbp) {
246 if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
247 rbp->b_flags &= ~(B_ASYNC | B_READ);
248 brelse(rbp);
249 } else
250 (void) VOP_STRATEGY(rbp);
251 }
252
253 /*
254 * Recalculate our maximum readahead
255 */
256 if (rbp == NULL)
257 rbp = bp;
258 if (rbp)
259 ci->ci_maxra = rbp->b_lblkno + (rbp->b_bcount / size) - 1;
260
261 if (bp)
262 return(biowait(bp));
263 return(error);
264 }
265
266 /*
267 * If blocks are contiguous on disk, use this to provide clustered
268 * read ahead. We will read as many blocks as possible sequentially
269 * and then parcel them up into logical blocks in the buffer hash table.
270 */
271 struct buf *
cluster_rbuild(vp,filesize,bp,lbn,blkno,size,run,flags)272 cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
273 struct vnode *vp;
274 u_quad_t filesize;
275 struct buf *bp;
276 daddr_t lbn;
277 daddr_t blkno;
278 long size;
279 int run;
280 long flags;
281 {
282 struct cluster_save *b_save;
283 struct buf *tbp;
284 daddr_t bn;
285 int i, inc;
286
287 #ifdef DIAGNOSTIC
288 if (size != vp->v_mount->mnt_stat.f_iosize)
289 panic("cluster_rbuild: size %ld != filesize %ld",
290 size, (long)vp->v_mount->mnt_stat.f_iosize);
291 #endif
292 if ((u_quad_t)size * (u_quad_t)(lbn + run + 1) > filesize)
293 --run;
294 if (run == 0) {
295 if (!bp) {
296 bp = getblk(vp, lbn, size, 0, 0);
297 bp->b_blkno = blkno;
298 bp->b_flags |= flags;
299 }
300 return(bp);
301 }
302
303 bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
304 if (bp->b_flags & (B_DONE | B_DELWRI))
305 return (bp);
306
307 b_save = malloc(sizeof(struct buf *) * run +
308 sizeof(struct cluster_save), M_VCLUSTER, M_WAITOK);
309 b_save->bs_bufsize = b_save->bs_bcount = size;
310 b_save->bs_nchildren = 0;
311 b_save->bs_children = (struct buf **)(b_save + 1);
312 b_save->bs_saveaddr = bp->b_saveaddr;
313 bp->b_saveaddr = b_save;
314
315 inc = btodb(size);
316 for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
317 /*
318 * A component of the cluster is already in core,
319 * terminate the cluster early.
320 */
321 if (incore(vp, lbn + i))
322 break;
323 tbp = getblk(vp, lbn + i, 0, 0, 0);
324
325 /*
326 * getblk may return some memory in the buffer if there were
327 * no empty buffers to shed it to. If there is currently
328 * memory in the buffer, we move it down size bytes to make
329 * room for the valid pages that cluster_callback will insert.
330 * We do this now so we don't have to do it at interrupt time
331 * in the callback routine.
332 */
333 if (tbp->b_bufsize != 0) {
334 caddr_t bdata = tbp->b_data;
335
336 /*
337 * No room in the buffer to add another page,
338 * terminate the cluster early.
339 */
340 if (tbp->b_bufsize + size > MAXBSIZE) {
341 #ifdef DIAGNOSTIC
342 if (tbp->b_bufsize > MAXBSIZE)
343 panic("cluster_rbuild: too much memory");
344 #endif
345 /* This buffer is *not* valid. */
346 tbp->b_flags |= B_INVAL;
347 brelse(tbp);
348 break;
349 }
350 pagemove(bdata, bdata + tbp->b_bufsize, size);
351 }
352 tbp->b_blkno = bn;
353 tbp->b_flags &= ~(B_DONE | B_ERROR);
354 tbp->b_flags |= flags | B_READ | B_ASYNC;
355 b_save->bs_children[b_save->bs_nchildren++] = tbp;
356 }
357 /*
358 * The cluster may have been terminated early, adjust the cluster
359 * buffer size accordingly. If no cluster could be formed,
360 * deallocate the cluster save info.
361 */
362 if (i <= run) {
363 if (i == 1) {
364 bp->b_saveaddr = b_save->bs_saveaddr;
365 bp->b_flags &= ~B_CALL;
366 bp->b_iodone = NULL;
367 free(b_save, M_VCLUSTER);
368 }
369 allocbuf(bp, size * i);
370 }
371 return(bp);
372 }
373
374 /*
375 * Either get a new buffer or grow the existing one.
376 */
377 struct buf *
cluster_newbuf(vp,bp,flags,blkno,lblkno,size,run)378 cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
379 struct vnode *vp;
380 struct buf *bp;
381 long flags;
382 daddr_t blkno;
383 daddr_t lblkno;
384 long size;
385 int run;
386 {
387 if (!bp) {
388 bp = getblk(vp, lblkno, size, 0, 0);
389 if (bp->b_flags & (B_DONE | B_DELWRI)) {
390 bp->b_blkno = blkno;
391 return(bp);
392 }
393 }
394 allocbuf(bp, run * size);
395 bp->b_blkno = blkno;
396 bp->b_iodone = cluster_callback;
397 bp->b_flags |= flags | B_CALL;
398 return(bp);
399 }
400
401 /*
402 * Cleanup after a clustered read or write.
403 * This is complicated by the fact that any of the buffers might have
404 * extra memory (if there were no empty buffer headers at allocbuf time)
405 * that we will need to shift around.
406 */
407 void
cluster_callback(bp)408 cluster_callback(bp)
409 struct buf *bp;
410 {
411 struct cluster_save *b_save;
412 struct buf **bpp, *tbp;
413 long bsize;
414 caddr_t cp;
415 int error = 0;
416
417 splassert(IPL_BIO);
418
419 /*
420 * Must propagate errors to all the components.
421 */
422 if (bp->b_flags & B_ERROR)
423 error = bp->b_error;
424
425 b_save = (struct cluster_save *)(bp->b_saveaddr);
426 bp->b_saveaddr = b_save->bs_saveaddr;
427
428 bsize = b_save->bs_bufsize;
429 cp = (char *)bp->b_data + bsize;
430 /*
431 * Move memory from the large cluster buffer into the component
432 * buffers and mark IO as done on these.
433 */
434 for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
435 tbp = *bpp;
436 pagemove(cp, tbp->b_data, bsize);
437 tbp->b_bufsize += bsize;
438 tbp->b_bcount = bsize;
439 if (error) {
440 tbp->b_flags |= B_ERROR;
441 tbp->b_error = error;
442 }
443 biodone(tbp);
444 bp->b_bufsize -= bsize;
445 cp += bsize;
446 }
447 /*
448 * If there was excess memory in the cluster buffer,
449 * slide it up adjacent to the remaining valid data.
450 */
451 if (bp->b_bufsize != bsize) {
452 if (bp->b_bufsize < bsize)
453 panic("cluster_callback: too little memory");
454 if (bp->b_bufsize < cp - (char *)bp->b_data)
455 pagemove(cp, (char *)bp->b_data + bsize,
456 bp->b_bufsize - bsize);
457 else
458 pagemove((char *)bp->b_data + bp->b_bufsize,
459 (char *)bp->b_data + bsize,
460 cp - ((char *)bp->b_data + bsize));
461 }
462 bp->b_bcount = bsize;
463 bp->b_iodone = NULL;
464 free(b_save, M_VCLUSTER);
465 if (bp->b_flags & B_ASYNC)
466 brelse(bp);
467 else {
468 bp->b_flags &= ~B_WANTED;
469 wakeup(bp);
470 }
471 }
472
473 /*
474 * Do clustered write for FFS.
475 *
476 * Three cases:
477 * 1. Write is not sequential (write asynchronously)
478 * Write is sequential:
479 * 2. beginning of cluster - begin cluster
480 * 3. middle of a cluster - add to cluster
481 * 4. end of a cluster - asynchronously write cluster
482 */
483 void
cluster_write(bp,ci,filesize)484 cluster_write(bp, ci, filesize)
485 struct buf *bp;
486 struct cluster_info *ci;
487 u_quad_t filesize;
488 {
489 struct vnode *vp;
490 daddr_t lbn;
491 int maxclen, cursize;
492
493 vp = bp->b_vp;
494 lbn = bp->b_lblkno;
495
496 /* Initialize vnode to beginning of file. */
497 if (lbn == 0)
498 ci->ci_lasta = ci->ci_clen = ci->ci_cstart = ci->ci_lastw = 0;
499
500 if (ci->ci_clen == 0 || lbn != ci->ci_lastw + 1 ||
501 (bp->b_blkno != ci->ci_lasta + btodb(bp->b_bcount))) {
502 maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
503 if (ci->ci_clen != 0) {
504 /*
505 * Next block is not sequential.
506 *
507 * If we are not writing at end of file, the process
508 * seeked to another point in the file since its
509 * last write, or we have reached our maximum
510 * cluster size, then push the previous cluster.
511 * Otherwise try reallocating to make it sequential.
512 */
513 cursize = ci->ci_lastw - ci->ci_cstart + 1;
514 if (((u_quad_t)(lbn + 1)) * bp->b_bcount != filesize ||
515 lbn != ci->ci_lastw + 1 || ci->ci_clen <= cursize) {
516 cluster_wbuild(vp, NULL, bp->b_bcount,
517 ci->ci_cstart, cursize, lbn);
518 } else {
519 struct buf **bpp, **endbp;
520 struct cluster_save *buflist;
521
522 buflist = cluster_collectbufs(vp, ci, bp);
523 endbp = &buflist->bs_children
524 [buflist->bs_nchildren - 1];
525 if (VOP_REALLOCBLKS(vp, buflist)) {
526 /*
527 * Failed, push the previous cluster.
528 */
529 for (bpp = buflist->bs_children;
530 bpp < endbp; bpp++)
531 brelse(*bpp);
532 free(buflist, M_VCLUSTER);
533 cluster_wbuild(vp, NULL, bp->b_bcount,
534 ci->ci_cstart, cursize, lbn);
535 } else {
536 /*
537 * Succeeded, keep building cluster.
538 */
539 for (bpp = buflist->bs_children;
540 bpp <= endbp; bpp++)
541 bdwrite(*bpp);
542 free(buflist, M_VCLUSTER);
543 ci->ci_lastw = lbn;
544 ci->ci_lasta = bp->b_blkno;
545 return;
546 }
547 }
548 }
549 /*
550 * Consider beginning a cluster.
551 * If at end of file, make cluster as large as possible,
552 * otherwise find size of existing cluster.
553 */
554 if ((u_quad_t)(lbn + 1) * (u_quad_t)bp->b_bcount != filesize &&
555 (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
556 bp->b_blkno == -1)) {
557 bawrite(bp);
558 ci->ci_clen = 0;
559 ci->ci_lasta = bp->b_blkno;
560 ci->ci_cstart = lbn + 1;
561 ci->ci_lastw = lbn;
562 return;
563 }
564 ci->ci_clen = maxclen;
565 if (maxclen == 0) { /* I/O not contiguous */
566 ci->ci_cstart = lbn + 1;
567 bawrite(bp);
568 } else { /* Wait for rest of cluster */
569 ci->ci_cstart = lbn;
570 bdwrite(bp);
571 }
572 } else if (lbn == ci->ci_cstart + ci->ci_clen) {
573 /*
574 * At end of cluster, write it out.
575 */
576 cluster_wbuild(vp, bp, bp->b_bcount, ci->ci_cstart,
577 ci->ci_clen + 1, lbn);
578 ci->ci_clen = 0;
579 ci->ci_cstart = lbn + 1;
580 } else
581 /*
582 * In the middle of a cluster, so just delay the
583 * I/O for now.
584 */
585 bdwrite(bp);
586 ci->ci_lastw = lbn;
587 ci->ci_lasta = bp->b_blkno;
588 }
589
590
591 /*
592 * This is an awful lot like cluster_rbuild...wish they could be combined.
593 * The last lbn argument is the current block on which I/O is being
594 * performed. Check to see that it doesn't fall in the middle of
595 * the current block (if last_bp == NULL).
596 */
597 void
cluster_wbuild(vp,last_bp,size,start_lbn,len,lbn)598 cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
599 struct vnode *vp;
600 struct buf *last_bp;
601 long size;
602 daddr_t start_lbn;
603 int len;
604 daddr_t lbn;
605 {
606 struct cluster_save *b_save;
607 struct buf *bp, *tbp;
608 caddr_t cp;
609 int i, s;
610
611 #ifdef DIAGNOSTIC
612 if (size != vp->v_mount->mnt_stat.f_iosize)
613 panic("cluster_wbuild: size %ld != filesize %ld",
614 size, (long)vp->v_mount->mnt_stat.f_iosize);
615 #endif
616 redo:
617 while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
618 ++start_lbn;
619 --len;
620 }
621
622 /* Get more memory for current buffer */
623 if (len <= 1) {
624 if (last_bp) {
625 bawrite(last_bp);
626 } else if (len) {
627 bp = getblk(vp, start_lbn, size, 0, 0);
628 /*
629 * The buffer could have already been flushed out of
630 * the cache. If that has happened, we'll get a new
631 * buffer here with random data, just drop it.
632 */
633 if ((bp->b_flags & B_DELWRI) == 0)
634 brelse(bp);
635 else
636 bawrite(bp);
637 }
638 return;
639 }
640
641 bp = getblk(vp, start_lbn, size, 0, 0);
642 if (!(bp->b_flags & B_DELWRI)) {
643 ++start_lbn;
644 --len;
645 brelse(bp);
646 goto redo;
647 }
648
649 /*
650 * Extra memory in the buffer, punt on this buffer.
651 * XXX we could handle this in most cases, but we would have to
652 * push the extra memory down to after our max possible cluster
653 * size and then potentially pull it back up if the cluster was
654 * terminated prematurely--too much hassle.
655 */
656 if (bp->b_bcount != bp->b_bufsize) {
657 ++start_lbn;
658 --len;
659 bawrite(bp);
660 goto redo;
661 }
662
663 --len;
664 b_save = malloc(sizeof(struct buf *) * len +
665 sizeof(struct cluster_save), M_VCLUSTER, M_WAITOK);
666 b_save->bs_bcount = bp->b_bcount;
667 b_save->bs_bufsize = bp->b_bufsize;
668 b_save->bs_nchildren = 0;
669 b_save->bs_children = (struct buf **)(b_save + 1);
670 b_save->bs_saveaddr = bp->b_saveaddr;
671 bp->b_saveaddr = b_save;
672
673 bp->b_flags |= B_CALL;
674 bp->b_iodone = cluster_callback;
675 cp = (char *)bp->b_data + size;
676 for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
677 /*
678 * Block is not in core or the non-sequential block
679 * ending our cluster was part of the cluster (in which
680 * case we don't want to write it twice).
681 */
682 if (!incore(vp, start_lbn) ||
683 (last_bp == NULL && start_lbn == lbn))
684 break;
685
686 /*
687 * Get the desired block buffer (unless it is the final
688 * sequential block whose buffer was passed in explicitly
689 * as last_bp).
690 */
691 if (last_bp == NULL || start_lbn != lbn) {
692 tbp = getblk(vp, start_lbn, size, 0, 0);
693 if (!(tbp->b_flags & B_DELWRI)) {
694 brelse(tbp);
695 break;
696 }
697 } else
698 tbp = last_bp;
699
700 ++b_save->bs_nchildren;
701
702 if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
703 printf("Clustered Block: %d addr %x bufsize: %ld\n",
704 bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
705 printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
706 tbp->b_blkno);
707 panic("Clustered write to wrong blocks");
708 }
709
710 /*
711 * We might as well AGE the buffer here; it's either empty, or
712 * contains data that we couldn't get rid of (but wanted to).
713 */
714 tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
715 tbp->b_flags |= (B_ASYNC | B_AGE);
716 s = splbio();
717 buf_undirty(tbp);
718 ++tbp->b_vp->v_numoutput;
719 splx(s);
720
721 if (LIST_FIRST(&tbp->b_dep) != NULL)
722 buf_start(tbp);
723
724 /* Move memory from children to parent */
725 pagemove(tbp->b_data, cp, size);
726 bp->b_bcount += size;
727 bp->b_bufsize += size;
728
729 tbp->b_bufsize -= size;
730 b_save->bs_children[i] = tbp;
731
732 cp += size;
733 }
734
735 if (i == 0) {
736 /* None to cluster */
737 bp->b_saveaddr = b_save->bs_saveaddr;
738 bp->b_flags &= ~B_CALL;
739 bp->b_iodone = NULL;
740 free(b_save, M_VCLUSTER);
741 }
742 bawrite(bp);
743 if (i < len) {
744 len -= i + 1;
745 start_lbn += 1;
746 goto redo;
747 }
748 }
749
750 /*
751 * Collect together all the buffers in a cluster.
752 * Plus add one additional buffer.
753 */
754 struct cluster_save *
cluster_collectbufs(vp,ci,last_bp)755 cluster_collectbufs(vp, ci, last_bp)
756 struct vnode *vp;
757 struct cluster_info *ci;
758 struct buf *last_bp;
759 {
760 struct cluster_save *buflist;
761 daddr_t lbn;
762 int i, len;
763
764 len = ci->ci_lastw - ci->ci_cstart + 1;
765 buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
766 M_VCLUSTER, M_WAITOK);
767 buflist->bs_nchildren = 0;
768 buflist->bs_children = (struct buf **)(buflist + 1);
769 for (lbn = ci->ci_cstart, i = 0; i < len; lbn++, i++)
770 (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
771 &buflist->bs_children[i]);
772 buflist->bs_children[i] = last_bp;
773 buflist->bs_nchildren = i + 1;
774 return (buflist);
775 }
776