1 /**	$MirOS: src/sys/kern/vfs_cluster.c,v 1.3 2005/07/04 00:10:43 tg Exp $ */
2 /*	$OpenBSD: vfs_cluster.c,v 1.33 2004/10/26 17:16:27 pedro Exp $	*/
3 /*	$NetBSD: vfs_cluster.c,v 1.12 1996/04/22 01:39:05 christos Exp $	*/
4 
5 /*-
6  * Copyright (c) 1993
7  *	The Regents of the University of California.  All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)vfs_cluster.c	8.8 (Berkeley) 7/28/94
34  */
35 
36 #include <sys/param.h>
37 #include <sys/proc.h>
38 #include <sys/buf.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/malloc.h>
42 #include <sys/systm.h>
43 #include <sys/resourcevar.h>
44 
45 #include <uvm/uvm_extern.h>
46 
47 /*
48  * Local declarations
49  */
50 void	cluster_callback(struct buf *);
51 struct buf *cluster_newbuf(struct vnode *, struct buf *, long, daddr_t,
52 	    daddr_t, long, int);
53 struct buf *cluster_rbuild(struct vnode *, u_quad_t, struct buf *,
54 	    daddr_t, daddr_t, long, int, long);
55 void	    cluster_wbuild(struct vnode *, struct buf *, long,
56 	    daddr_t, int, daddr_t);
57 struct cluster_save *cluster_collectbufs(struct vnode *,
58 	    struct cluster_info *, struct buf *);
59 
/*
 * Policy for reads of logical block zero.
 *
 * In DIAGNOSTIC kernels the tunable doclusterraz selects the policy:
 *	1: a read of block zero may trigger readahead,
 *	0: a read of block zero is treated as a non-sequential read.
 * In all other kernels block zero never counts as sequential.
 *
 * Allowing readahead assumes that most reads of block zero are the start
 * of a sequential pass over the file (e.g. cat, sum), so the following
 * blocks will be wanted soon.  Disallowing it assumes that most of them are
 * surgical strikes to get particular info (e.g. size, file), where the
 * readahead blocks would go unused and would push other potentially useful
 * blocks out of the cache.  The former seems intuitive, but some quick
 * tests showed that the latter performed better system-wide.
 */
#ifdef DIAGNOSTIC
int	doclusterraz = 0;
#define CLUSTER_BLK_MAY_RA(blk)	((blk) != 0 || doclusterraz)
#else
#define CLUSTER_BLK_MAY_RA(blk)	((blk) != 0)
#endif

/*
 * True when a read of logical block blk continues a sequential pass:
 * blk is either the block after the last one read (ci_lastr + 1) or the
 * last one read itself, and the block-zero policy above permits it.
 */
#define ISSEQREAD(ci, blk) \
	(CLUSTER_BLK_MAY_RA(blk) && \
	 ((ci)->ci_lastr + 1 == (blk) || (ci)->ci_lastr == (blk)))
81 
82 /*
83  * This replaces bread.  If this is a bread at the beginning of a file and
84  * lastr is 0, we assume this is the first read and we'll read up to two
85  * blocks if they are sequential.  After that, we'll do regular read ahead
86  * in clustered chunks.
87  *
88  * There are 4 or 5 cases depending on how you count:
89  *	Desired block is in the cache:
90  *	    1 Not sequential access (0 I/Os).
91  *	    2 Access is sequential, do read-ahead (1 ASYNC).
92  *	Desired block is not in cache:
93  *	    3 Not sequential access (1 SYNC).
94  *	    4 Sequential access, next block is contiguous (2 SYNC).
95  *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
96  *
97  * There are potentially two buffers that require I/O.
98  * 	bp is the block requested.
99  *	rbp is the read-ahead block.
100  *	If either is NULL, then you don't have to do the I/O.
101  */
102 int
cluster_read(vp,ci,filesize,lblkno,size,cred,bpp)103 cluster_read(vp, ci, filesize, lblkno, size, cred, bpp)
104 	struct vnode *vp;
105 	struct cluster_info *ci;
106 	u_quad_t filesize;
107 	daddr_t lblkno;
108 	long size;
109 	struct ucred *cred;
110 	struct buf **bpp;
111 {
112 	struct buf *bp, *rbp;
113 	daddr_t blkno, ioblkno;
114 	long flags;
115 	int error, num_ra, alreadyincore;
116 
117 #ifdef DIAGNOSTIC
118 	if (size == 0)
119 		panic("cluster_read: size = 0");
120 #endif
121 
122 	error = 0;
123 	flags = B_READ;
124 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
125 	if (bp->b_flags & B_CACHE) {
126 		/*
127 		 * Desired block is in cache; do any readahead ASYNC.
128 		 * Case 1, 2.
129 		 */
130 		flags |= B_ASYNC;
131 		ioblkno = lblkno + (ci->ci_ralen ? ci->ci_ralen : 1);
132 		alreadyincore = incore(vp, ioblkno) != NULL;
133 		bp = NULL;
134 	} else {
135 		/* Block wasn't in cache, case 3, 4, 5. */
136 		bp->b_flags |= B_READ;
137 		ioblkno = lblkno;
138 		alreadyincore = 0;
139 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
140 	}
141 	/*
142 	 * XXX
143 	 * Replace 1 with a window size based on some permutation of
144 	 * maxcontig and rot_delay.  This will let you figure out how
145 	 * many blocks you should read-ahead (case 2, 4, 5).
146 	 *
147 	 * If the access isn't sequential, reset the window to 1.
148 	 * Note that a read to the same block is considered sequential.
149 	 * This catches the case where the file is being read sequentially,
150 	 * but at smaller than the filesystem block size.
151 	 */
152 	rbp = NULL;
153 	if (!ISSEQREAD(ci, lblkno)) {
154 		ci->ci_ralen = 0;
155 		ci->ci_maxra = lblkno;
156 	} else if ((u_quad_t)(ioblkno + 1) * (u_quad_t)size <= filesize &&
157 		   !alreadyincore &&
158 		   !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
159 		   blkno != -1) {
160 		/*
161 		 * Reading sequentially, and the next block is not in the
162 		 * cache.  We are going to try reading ahead.
163 		 */
164 		if (num_ra) {
165 			/*
166 			 * If our desired readahead block had been read
167 			 * in a previous readahead but is no longer in
168 			 * core, then we may be reading ahead too far
169 			 * or are not using our readahead very rapidly.
170 			 * In this case we scale back the window.
171 			 */
172 			if (!alreadyincore && ioblkno <= ci->ci_maxra)
173 				ci->ci_ralen = max(ci->ci_ralen >> 1, 1);
174 			/*
175 			 * There are more sequential blocks than our current
176 			 * window allows, scale up.  Ideally we want to get
177 			 * in sync with the filesystem maxcontig value.
178 			 */
179 			else if (num_ra > ci->ci_ralen && lblkno != ci->ci_lastr)
180 				ci->ci_ralen = ci->ci_ralen ?
181 					min(num_ra, ci->ci_ralen << 1) : 1;
182 
183 			if (num_ra > ci->ci_ralen)
184 				num_ra = ci->ci_ralen;
185 		}
186 
187 		if (num_ra)				/* case 2, 4 */
188 			rbp = cluster_rbuild(vp, filesize,
189 			    bp, ioblkno, blkno, size, num_ra, flags);
190 		else if (ioblkno == lblkno) {
191 			bp->b_blkno = blkno;
192 			/* Case 5: check how many blocks to read ahead */
193 			++ioblkno;
194 			if ((u_quad_t)(ioblkno + 1) * (u_quad_t)size >
195 			    filesize ||
196 			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
197 			    ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
198 				goto skip_readahead;
199 			/*
200 			 * Adjust readahead as above.
201 			 * Don't check alreadyincore, we know it is 0 from
202 			 * the previous conditional.
203 			 */
204 			if (num_ra) {
205 				if (ioblkno <= ci->ci_maxra)
206 					ci->ci_ralen = max(ci->ci_ralen >> 1, 1);
207 				else if (num_ra > ci->ci_ralen &&
208 					 lblkno != ci->ci_lastr)
209 					ci->ci_ralen = ci->ci_ralen ?
210 						min(num_ra,ci->ci_ralen<<1) : 1;
211 				if (num_ra > ci->ci_ralen)
212 					num_ra = ci->ci_ralen;
213 			}
214 			flags |= B_ASYNC;
215 			if (num_ra)
216 				rbp = cluster_rbuild(vp, filesize,
217 				    NULL, ioblkno, blkno, size, num_ra, flags);
218 			else {
219 				rbp = getblk(vp, ioblkno, size, 0, 0);
220 				rbp->b_flags |= flags;
221 				rbp->b_blkno = blkno;
222 			}
223 		} else {
224 			/* case 2; read ahead single block */
225 			rbp = getblk(vp, ioblkno, size, 0, 0);
226 			rbp->b_flags |= flags;
227 			rbp->b_blkno = blkno;
228 		}
229 
230 		if (rbp == bp)			/* case 4 */
231 			rbp = NULL;
232 		else if (rbp)			/* case 2, 5 */
233 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
234 	}
235 
236 	/* XXX Kirk, do we need to make sure the bp has creds? */
237 skip_readahead:
238 	if (bp) {
239 		if (bp->b_flags & (B_DONE | B_DELWRI))
240 			panic("cluster_read: DONE bp");
241 		else
242 			error = VOP_STRATEGY(bp);
243 	}
244 
245 	if (rbp) {
246 		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
247 			rbp->b_flags &= ~(B_ASYNC | B_READ);
248 			brelse(rbp);
249 		} else
250 			(void) VOP_STRATEGY(rbp);
251 	}
252 
253 	/*
254 	 * Recalculate our maximum readahead
255 	 */
256 	if (rbp == NULL)
257 		rbp = bp;
258 	if (rbp)
259 		ci->ci_maxra = rbp->b_lblkno + (rbp->b_bcount / size) - 1;
260 
261 	if (bp)
262 		return(biowait(bp));
263 	return(error);
264 }
265 
266 /*
267  * If blocks are contiguous on disk, use this to provide clustered
268  * read ahead.  We will read as many blocks as possible sequentially
269  * and then parcel them up into logical blocks in the buffer hash table.
270  */
271 struct buf *
cluster_rbuild(vp,filesize,bp,lbn,blkno,size,run,flags)272 cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
273 	struct vnode *vp;
274 	u_quad_t filesize;
275 	struct buf *bp;
276 	daddr_t lbn;
277 	daddr_t blkno;
278 	long size;
279 	int run;
280 	long flags;
281 {
282 	struct cluster_save *b_save;
283 	struct buf *tbp;
284 	daddr_t bn;
285 	int i, inc;
286 
287 #ifdef DIAGNOSTIC
288 	if (size != vp->v_mount->mnt_stat.f_iosize)
289 		panic("cluster_rbuild: size %ld != filesize %ld",
290 			size, (long)vp->v_mount->mnt_stat.f_iosize);
291 #endif
292 	if ((u_quad_t)size * (u_quad_t)(lbn + run + 1) > filesize)
293 		--run;
294 	if (run == 0) {
295 		if (!bp) {
296 			bp = getblk(vp, lbn, size, 0, 0);
297 			bp->b_blkno = blkno;
298 			bp->b_flags |= flags;
299 		}
300 		return(bp);
301 	}
302 
303 	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
304 	if (bp->b_flags & (B_DONE | B_DELWRI))
305 		return (bp);
306 
307 	b_save = malloc(sizeof(struct buf *) * run +
308 	    sizeof(struct cluster_save), M_VCLUSTER, M_WAITOK);
309 	b_save->bs_bufsize = b_save->bs_bcount = size;
310 	b_save->bs_nchildren = 0;
311 	b_save->bs_children = (struct buf **)(b_save + 1);
312 	b_save->bs_saveaddr = bp->b_saveaddr;
313 	bp->b_saveaddr = b_save;
314 
315 	inc = btodb(size);
316 	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
317 		/*
318 		 * A component of the cluster is already in core,
319 		 * terminate the cluster early.
320 		 */
321 		if (incore(vp, lbn + i))
322 			break;
323 		tbp = getblk(vp, lbn + i, 0, 0, 0);
324 
325 		/*
326 		 * getblk may return some memory in the buffer if there were
327 		 * no empty buffers to shed it to.  If there is currently
328 		 * memory in the buffer, we move it down size bytes to make
329 		 * room for the valid pages that cluster_callback will insert.
330 		 * We do this now so we don't have to do it at interrupt time
331 		 * in the callback routine.
332 		 */
333 		if (tbp->b_bufsize != 0) {
334 			caddr_t bdata = tbp->b_data;
335 
336 			/*
337 			 * No room in the buffer to add another page,
338 			 * terminate the cluster early.
339 			 */
340 			if (tbp->b_bufsize + size > MAXBSIZE) {
341 #ifdef DIAGNOSTIC
342 				if (tbp->b_bufsize > MAXBSIZE)
343 					panic("cluster_rbuild: too much memory");
344 #endif
345 				/* This buffer is *not* valid.  */
346 				tbp->b_flags |= B_INVAL;
347 				brelse(tbp);
348 				break;
349 			}
350 			pagemove(bdata, bdata + tbp->b_bufsize, size);
351 		}
352 		tbp->b_blkno = bn;
353 		tbp->b_flags &= ~(B_DONE | B_ERROR);
354 		tbp->b_flags |= flags | B_READ | B_ASYNC;
355 		b_save->bs_children[b_save->bs_nchildren++] = tbp;
356 	}
357 	/*
358 	 * The cluster may have been terminated early, adjust the cluster
359 	 * buffer size accordingly.  If no cluster could be formed,
360 	 * deallocate the cluster save info.
361 	 */
362 	if (i <= run) {
363 		if (i == 1) {
364 			bp->b_saveaddr = b_save->bs_saveaddr;
365 			bp->b_flags &= ~B_CALL;
366 			bp->b_iodone = NULL;
367 			free(b_save, M_VCLUSTER);
368 		}
369 		allocbuf(bp, size * i);
370 	}
371 	return(bp);
372 }
373 
374 /*
375  * Either get a new buffer or grow the existing one.
376  */
377 struct buf *
cluster_newbuf(vp,bp,flags,blkno,lblkno,size,run)378 cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
379 	struct vnode *vp;
380 	struct buf *bp;
381 	long flags;
382 	daddr_t blkno;
383 	daddr_t lblkno;
384 	long size;
385 	int run;
386 {
387 	if (!bp) {
388 		bp = getblk(vp, lblkno, size, 0, 0);
389 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
390 			bp->b_blkno = blkno;
391 			return(bp);
392 		}
393 	}
394 	allocbuf(bp, run * size);
395 	bp->b_blkno = blkno;
396 	bp->b_iodone = cluster_callback;
397 	bp->b_flags |= flags | B_CALL;
398 	return(bp);
399 }
400 
401 /*
402  * Cleanup after a clustered read or write.
403  * This is complicated by the fact that any of the buffers might have
404  * extra memory (if there were no empty buffer headers at allocbuf time)
405  * that we will need to shift around.
406  */
407 void
cluster_callback(bp)408 cluster_callback(bp)
409 	struct buf *bp;
410 {
411 	struct cluster_save *b_save;
412 	struct buf **bpp, *tbp;
413 	long bsize;
414 	caddr_t cp;
415 	int error = 0;
416 
417 	splassert(IPL_BIO);
418 
419 	/*
420 	 * Must propagate errors to all the components.
421 	 */
422 	if (bp->b_flags & B_ERROR)
423 		error = bp->b_error;
424 
425 	b_save = (struct cluster_save *)(bp->b_saveaddr);
426 	bp->b_saveaddr = b_save->bs_saveaddr;
427 
428 	bsize = b_save->bs_bufsize;
429 	cp = (char *)bp->b_data + bsize;
430 	/*
431 	 * Move memory from the large cluster buffer into the component
432 	 * buffers and mark IO as done on these.
433 	 */
434 	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
435 		tbp = *bpp;
436 		pagemove(cp, tbp->b_data, bsize);
437 		tbp->b_bufsize += bsize;
438 		tbp->b_bcount = bsize;
439 		if (error) {
440 			tbp->b_flags |= B_ERROR;
441 			tbp->b_error = error;
442 		}
443 		biodone(tbp);
444 		bp->b_bufsize -= bsize;
445 		cp += bsize;
446 	}
447 	/*
448 	 * If there was excess memory in the cluster buffer,
449 	 * slide it up adjacent to the remaining valid data.
450 	 */
451 	if (bp->b_bufsize != bsize) {
452 		if (bp->b_bufsize < bsize)
453 			panic("cluster_callback: too little memory");
454 		if (bp->b_bufsize < cp - (char *)bp->b_data)
455 			pagemove(cp, (char *)bp->b_data + bsize,
456 			    bp->b_bufsize - bsize);
457 		else
458 			pagemove((char *)bp->b_data + bp->b_bufsize,
459 			    (char *)bp->b_data + bsize,
460 			    cp - ((char *)bp->b_data + bsize));
461 	}
462 	bp->b_bcount = bsize;
463 	bp->b_iodone = NULL;
464 	free(b_save, M_VCLUSTER);
465 	if (bp->b_flags & B_ASYNC)
466 		brelse(bp);
467 	else {
468 		bp->b_flags &= ~B_WANTED;
469 		wakeup(bp);
470 	}
471 }
472 
473 /*
474  * Do clustered write for FFS.
475  *
476  * Three cases:
477  *	1. Write is not sequential (write asynchronously)
478  *	Write is sequential:
479  *	2.	beginning of cluster - begin cluster
480  *	3.	middle of a cluster - add to cluster
481  *	4.	end of a cluster - asynchronously write cluster
482  */
483 void
cluster_write(bp,ci,filesize)484 cluster_write(bp, ci, filesize)
485 	struct buf *bp;
486 	struct cluster_info *ci;
487 	u_quad_t filesize;
488 {
489 	struct vnode *vp;
490 	daddr_t lbn;
491 	int maxclen, cursize;
492 
493 	vp = bp->b_vp;
494 	lbn = bp->b_lblkno;
495 
496 	/* Initialize vnode to beginning of file. */
497 	if (lbn == 0)
498 		ci->ci_lasta = ci->ci_clen = ci->ci_cstart = ci->ci_lastw = 0;
499 
500 	if (ci->ci_clen == 0 || lbn != ci->ci_lastw + 1 ||
501 	    (bp->b_blkno != ci->ci_lasta + btodb(bp->b_bcount))) {
502 		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
503 		if (ci->ci_clen != 0) {
504 			/*
505 			 * Next block is not sequential.
506 			 *
507 			 * If we are not writing at end of file, the process
508 			 * seeked to another point in the file since its
509 			 * last write, or we have reached our maximum
510 			 * cluster size, then push the previous cluster.
511 			 * Otherwise try reallocating to make it sequential.
512 			 */
513 			cursize = ci->ci_lastw - ci->ci_cstart + 1;
514 			if (((u_quad_t)(lbn + 1)) * bp->b_bcount != filesize ||
515 			    lbn != ci->ci_lastw + 1 || ci->ci_clen <= cursize) {
516 				cluster_wbuild(vp, NULL, bp->b_bcount,
517 				    ci->ci_cstart, cursize, lbn);
518 			} else {
519 				struct buf **bpp, **endbp;
520 				struct cluster_save *buflist;
521 
522 				buflist = cluster_collectbufs(vp, ci, bp);
523 				endbp = &buflist->bs_children
524 				    [buflist->bs_nchildren - 1];
525 				if (VOP_REALLOCBLKS(vp, buflist)) {
526 					/*
527 					 * Failed, push the previous cluster.
528 					 */
529 					for (bpp = buflist->bs_children;
530 					    bpp < endbp; bpp++)
531 						brelse(*bpp);
532 					free(buflist, M_VCLUSTER);
533 					cluster_wbuild(vp, NULL, bp->b_bcount,
534 					    ci->ci_cstart, cursize, lbn);
535 				} else {
536 					/*
537 					 * Succeeded, keep building cluster.
538 					 */
539 					for (bpp = buflist->bs_children;
540 					    bpp <= endbp; bpp++)
541 						bdwrite(*bpp);
542 					free(buflist, M_VCLUSTER);
543 					ci->ci_lastw = lbn;
544 					ci->ci_lasta = bp->b_blkno;
545 					return;
546 				}
547 			}
548 		}
549 		/*
550 		 * Consider beginning a cluster.
551 		 * If at end of file, make cluster as large as possible,
552 		 * otherwise find size of existing cluster.
553 		 */
554 		if ((u_quad_t)(lbn + 1) * (u_quad_t)bp->b_bcount != filesize &&
555 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
556 		    bp->b_blkno == -1)) {
557 			bawrite(bp);
558 			ci->ci_clen = 0;
559 			ci->ci_lasta = bp->b_blkno;
560 			ci->ci_cstart = lbn + 1;
561 			ci->ci_lastw = lbn;
562 			return;
563 		}
564 		ci->ci_clen = maxclen;
565 		if (maxclen == 0) {		/* I/O not contiguous */
566 			ci->ci_cstart = lbn + 1;
567 			bawrite(bp);
568 		} else {			/* Wait for rest of cluster */
569 			ci->ci_cstart = lbn;
570 			bdwrite(bp);
571 		}
572 	} else if (lbn == ci->ci_cstart + ci->ci_clen) {
573 		/*
574 		 * At end of cluster, write it out.
575 		 */
576 		cluster_wbuild(vp, bp, bp->b_bcount, ci->ci_cstart,
577 		    ci->ci_clen + 1, lbn);
578 		ci->ci_clen = 0;
579 		ci->ci_cstart = lbn + 1;
580 	} else
581 		/*
582 		 * In the middle of a cluster, so just delay the
583 		 * I/O for now.
584 		 */
585 		bdwrite(bp);
586 	ci->ci_lastw = lbn;
587 	ci->ci_lasta = bp->b_blkno;
588 }
589 
590 
591 /*
592  * This is an awful lot like cluster_rbuild...wish they could be combined.
593  * The last lbn argument is the current block on which I/O is being
594  * performed.  Check to see that it doesn't fall in the middle of
595  * the current block (if last_bp == NULL).
596  */
597 void
cluster_wbuild(vp,last_bp,size,start_lbn,len,lbn)598 cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
599 	struct vnode *vp;
600 	struct buf *last_bp;
601 	long size;
602 	daddr_t start_lbn;
603 	int len;
604 	daddr_t	lbn;
605 {
606 	struct cluster_save *b_save;
607 	struct buf *bp, *tbp;
608 	caddr_t	cp;
609 	int i, s;
610 
611 #ifdef DIAGNOSTIC
612 	if (size != vp->v_mount->mnt_stat.f_iosize)
613 		panic("cluster_wbuild: size %ld != filesize %ld",
614 			size, (long)vp->v_mount->mnt_stat.f_iosize);
615 #endif
616 redo:
617 	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
618 		++start_lbn;
619 		--len;
620 	}
621 
622 	/* Get more memory for current buffer */
623 	if (len <= 1) {
624 		if (last_bp) {
625 			bawrite(last_bp);
626 		} else if (len) {
627 			bp = getblk(vp, start_lbn, size, 0, 0);
628 			/*
629 			 * The buffer could have already been flushed out of
630 			 * the cache. If that has happened, we'll get a new
631 			 * buffer here with random data, just drop it.
632 			 */
633 			if ((bp->b_flags & B_DELWRI) == 0)
634 				brelse(bp);
635 			else
636 				bawrite(bp);
637 		}
638 		return;
639 	}
640 
641 	bp = getblk(vp, start_lbn, size, 0, 0);
642 	if (!(bp->b_flags & B_DELWRI)) {
643 		++start_lbn;
644 		--len;
645 		brelse(bp);
646 		goto redo;
647 	}
648 
649 	/*
650 	 * Extra memory in the buffer, punt on this buffer.
651 	 * XXX we could handle this in most cases, but we would have to
652 	 * push the extra memory down to after our max possible cluster
653 	 * size and then potentially pull it back up if the cluster was
654 	 * terminated prematurely--too much hassle.
655 	 */
656 	if (bp->b_bcount != bp->b_bufsize) {
657 		++start_lbn;
658 		--len;
659 		bawrite(bp);
660 		goto redo;
661 	}
662 
663 	--len;
664 	b_save = malloc(sizeof(struct buf *) * len +
665 	    sizeof(struct cluster_save), M_VCLUSTER, M_WAITOK);
666 	b_save->bs_bcount = bp->b_bcount;
667 	b_save->bs_bufsize = bp->b_bufsize;
668 	b_save->bs_nchildren = 0;
669 	b_save->bs_children = (struct buf **)(b_save + 1);
670 	b_save->bs_saveaddr = bp->b_saveaddr;
671 	bp->b_saveaddr = b_save;
672 
673 	bp->b_flags |= B_CALL;
674 	bp->b_iodone = cluster_callback;
675 	cp = (char *)bp->b_data + size;
676 	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
677 		/*
678 		 * Block is not in core or the non-sequential block
679 		 * ending our cluster was part of the cluster (in which
680 		 * case we don't want to write it twice).
681 		 */
682 		if (!incore(vp, start_lbn) ||
683 		    (last_bp == NULL && start_lbn == lbn))
684 			break;
685 
686 		/*
687 		 * Get the desired block buffer (unless it is the final
688 		 * sequential block whose buffer was passed in explicitly
689 		 * as last_bp).
690 		 */
691 		if (last_bp == NULL || start_lbn != lbn) {
692 			tbp = getblk(vp, start_lbn, size, 0, 0);
693 			if (!(tbp->b_flags & B_DELWRI)) {
694 				brelse(tbp);
695 				break;
696 			}
697 		} else
698 			tbp = last_bp;
699 
700 		++b_save->bs_nchildren;
701 
702 		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
703 			printf("Clustered Block: %d addr %x bufsize: %ld\n",
704 			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
705 			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
706 			    tbp->b_blkno);
707 			panic("Clustered write to wrong blocks");
708 		}
709 
710 		/*
711 		 * We might as well AGE the buffer here; it's either empty, or
712 		 * contains data that we couldn't get rid of (but wanted to).
713 		 */
714 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
715 		tbp->b_flags |= (B_ASYNC | B_AGE);
716 		s = splbio();
717 		buf_undirty(tbp);
718 		++tbp->b_vp->v_numoutput;
719 		splx(s);
720 
721 		if (LIST_FIRST(&tbp->b_dep) != NULL)
722 			buf_start(tbp);
723 
724 		/* Move memory from children to parent */
725 		pagemove(tbp->b_data, cp, size);
726 		bp->b_bcount += size;
727 		bp->b_bufsize += size;
728 
729 		tbp->b_bufsize -= size;
730 		b_save->bs_children[i] = tbp;
731 
732 		cp += size;
733 	}
734 
735 	if (i == 0) {
736 		/* None to cluster */
737 		bp->b_saveaddr = b_save->bs_saveaddr;
738 		bp->b_flags &= ~B_CALL;
739 		bp->b_iodone = NULL;
740 		free(b_save, M_VCLUSTER);
741 	}
742 	bawrite(bp);
743 	if (i < len) {
744 		len -= i + 1;
745 		start_lbn += 1;
746 		goto redo;
747 	}
748 }
749 
750 /*
751  * Collect together all the buffers in a cluster.
752  * Plus add one additional buffer.
753  */
754 struct cluster_save *
cluster_collectbufs(vp,ci,last_bp)755 cluster_collectbufs(vp, ci, last_bp)
756 	struct vnode *vp;
757 	struct cluster_info *ci;
758 	struct buf *last_bp;
759 {
760 	struct cluster_save *buflist;
761 	daddr_t	lbn;
762 	int i, len;
763 
764 	len = ci->ci_lastw - ci->ci_cstart + 1;
765 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
766 	    M_VCLUSTER, M_WAITOK);
767 	buflist->bs_nchildren = 0;
768 	buflist->bs_children = (struct buf **)(buflist + 1);
769 	for (lbn = ci->ci_cstart, i = 0; i < len; lbn++, i++)
770 		(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
771 		    &buflist->bs_children[i]);
772 	buflist->bs_children[i] = last_bp;
773 	buflist->bs_nchildren = i + 1;
774 	return (buflist);
775 }
776