1 /**	$MirOS: src/sys/kern/subr_disk.c,v 1.3 2005/07/07 14:39:25 tg Exp $ */
2 /*	$OpenBSD: subr_disk.c,v 1.29 2004/12/26 21:22:13 miod Exp $	*/
3 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
4 
5 /*
6  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
7  * Copyright (c) 1982, 1986, 1988, 1993
8  *	The Regents of the University of California.  All rights reserved.
9  * (c) UNIX System Laboratories, Inc.
10  * All or some portions of this file are derived from material licensed
11  * to the University of California by American Telephone and Telegraph
12  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
13  * the permission of UNIX System Laboratories, Inc.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
40  */
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/malloc.h>
46 #include <sys/fcntl.h>
47 #include <sys/buf.h>
48 #include <sys/stat.h>
49 #include <sys/syslog.h>
50 #include <sys/time.h>
51 #include <sys/disklabel.h>
52 #include <sys/conf.h>
53 #include <sys/lock.h>
54 #include <sys/disk.h>
55 #include <sys/dkio.h>
56 #include <sys/dkstat.h>		/* XXX */
57 #include <sys/proc.h>
58 
59 #include <dev/rndvar.h>
60 
/*
 * A global list of all disks attached to the system.  May grow or
 * shrink over time: disk_attach() appends to it and disk_detach()
 * removes from it, each keeping disk_count in step and raising
 * disk_change.
 */
struct	disklist_head disklist;	/* TAILQ_HEAD */
int	disk_count;		/* number of drives in global disklist */
int	disk_change;		/* set if a disk has been attached/detached
				 * since last we looked at this variable. This
				 * is reset by hw_sysctl()
				 */
71 
/*
 * Seek sort for disks.  Requests are ordered by b_cylinder, with b_blkno
 * breaking ties between requests on the same cylinder.  (Historically the
 * driver passes the cylinder number in b_resid; presumably b_cylinder is an
 * alias for that field -- confirm in <sys/buf.h>.)
 *
 * The argument ap is a queue head: its b_actf pointer starts a singly linked
 * chain of requests on which we keep two queues, each sorted in ascending
 * cylinder order.  The first queue holds those requests which are positioned
 * at or after the current cylinder (that of the first request); the second
 * holds requests which came in after their cylinder number was passed.  The
 * boundary between the two is not stored anywhere; it is recognised as the
 * point where the cylinder number decreases from one request to the next.
 * Thus we implement a one way scan, retracting after reaching the end of the
 * drive to the first request on the second queue, at which time it becomes
 * the first queue.
 *
 * A one-way scan is natural because of the way UNIX read-ahead blocks are
 * allocated.
 *
 * ap: queue head; only ap->b_actf is used.
 * bp: request to insert; its b_actf link is overwritten.
 *
 * A new request is never placed ahead of the first request on a non-empty
 * queue.
 */

void
disksort(ap, bp)
	register struct buf *ap, *bp;
{
	register struct buf *bq;

	/* If the queue is empty, then it's easy. */
	if (ap->b_actf == NULL) {
		bp->b_actf = NULL;
		ap->b_actf = bp;
		return;
	}

	/*
	 * If we lie before the first (currently active) request, the scan
	 * has already passed our cylinder, so we must locate the second
	 * request list and add ourselves to it.
	 */
	bq = ap->b_actf;
	if (bp->b_cylinder < bq->b_cylinder) {
		while (bq->b_actf) {
			/*
			 * Check for an ``inversion'' in the normally ascending
			 * cylinder numbers, indicating the start of the second
			 * request list.
			 */
			if (bq->b_actf->b_cylinder < bq->b_cylinder) {
				/*
				 * Search the second request list for the first
				 * request at a larger cylinder number (or the
				 * same cylinder and a larger block number).
				 * We go before that; if there is no such
				 * request, we go at end.
				 */
				do {
					if (bp->b_cylinder <
					    bq->b_actf->b_cylinder)
						goto insert;
					if (bp->b_cylinder ==
					    bq->b_actf->b_cylinder &&
					    bp->b_blkno < bq->b_actf->b_blkno)
						goto insert;
					bq = bq->b_actf;
				} while (bq->b_actf);
				goto insert;		/* after last */
			}
			bq = bq->b_actf;
		}
		/*
		 * No inversions... we will go after the last, and
		 * be the first request in the second request list.
		 */
		goto insert;
	}
	/*
	 * Request is at/after the current request...
	 * sort in the first request list.
	 */
	while (bq->b_actf) {
		/*
		 * We want to go after the current request if there is an
		 * inversion after it (i.e. it is the end of the first
		 * request list), or if the next request is a larger cylinder
		 * than our request (or the same cylinder with a larger
		 * block number).
		 */
		if (bq->b_actf->b_cylinder < bq->b_cylinder ||
		    bp->b_cylinder < bq->b_actf->b_cylinder ||
		    (bp->b_cylinder == bq->b_actf->b_cylinder &&
		    bp->b_blkno < bq->b_actf->b_blkno))
			goto insert;
		bq = bq->b_actf;
	}
	/*
	 * Neither a second list nor a larger request... we go at the end of
	 * the first list, which is the same as the end of the whole schebang.
	 */
	/* Every path arrives here with bq set to the request we follow. */
insert:	bp->b_actf = bq->b_actf;
	bq->b_actf = bp;
}
165 
166 /*
167  * Compute checksum for disk label.
168  */
169 u_int
dkcksum(lp)170 dkcksum(lp)
171 	register struct disklabel *lp;
172 {
173 	register u_int16_t *start, *end;
174 	register u_int16_t sum = 0;
175 
176 	start = (u_int16_t *)lp;
177 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
178 	while (start < end)
179 		sum ^= *start++;
180 	return (sum);
181 }
182 
183 /*
184  * Disk error is the preface to plaintive error messages
185  * about failing disk transfers.  It prints messages of the form
186 
187 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
188 
189  * if the offset of the error in the transfer and a disk label
190  * are both available.  blkdone should be -1 if the position of the error
191  * is unknown; the disklabel pointer may be null from drivers that have not
192  * been converted to use them.  The message is printed with printf
193  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
194  * The message should be completed (with at least a newline) with printf
195  * or addlog, respectively.  There is no trailing space.
196  */
197 void
diskerr(bp,dname,what,pri,blkdone,lp)198 diskerr(bp, dname, what, pri, blkdone, lp)
199 	register struct buf *bp;
200 	char *dname, *what;
201 	int pri, blkdone;
202 	register struct disklabel *lp;
203 {
204 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
205 	register int (*pr)(const char *, ...);
206 	char partname = 'a' + part;
207 	int sn;
208 
209 	if (pri != LOG_PRINTF) {
210 		logpri(pri);
211 		pr = addlog;
212 	} else
213 		pr = printf;
214 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
215 	    bp->b_flags & B_READ ? "read" : "writ");
216 	sn = bp->b_blkno;
217 	if (bp->b_bcount <= DEV_BSIZE)
218 		(*pr)("%d", sn);
219 	else {
220 		if (blkdone >= 0) {
221 			sn += blkdone;
222 			(*pr)("%d of ", sn);
223 		}
224 		(*pr)("%d-%d", bp->b_blkno,
225 		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
226 	}
227 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
228 		sn += lp->d_partitions[part].p_offset;
229 		(*pr)(" (%s%d bn %d; cn %d", dname, unit, sn,
230 		    sn / lp->d_secpercyl);
231 		sn %= lp->d_secpercyl;
232 		(*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
233 	}
234 }
235 
236 /*
237  * Initialize the disklist.  Called by main() before autoconfiguration.
238  */
239 void
disk_init()240 disk_init()
241 {
242 
243 	TAILQ_INIT(&disklist);
244 	disk_count = disk_change = 0;
245 }
246 
247 /*
248  * Searches the disklist for the disk corresponding to the
249  * name provided.
250  */
251 struct disk *
disk_find(name)252 disk_find(name)
253 	char *name;
254 {
255 	struct disk *diskp;
256 
257 	if ((name == NULL) || (disk_count <= 0))
258 		return (NULL);
259 
260 	TAILQ_FOREACH(diskp, &disklist, dk_link)
261 		if (strcmp(diskp->dk_name, name) == 0)
262 			return (diskp);
263 
264 	return (NULL);
265 }
266 
267 int
disk_construct(diskp,lockname)268 disk_construct(diskp, lockname)
269 	struct disk *diskp;
270 	char *lockname;
271 {
272 	lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname,
273 		 0, LK_CANRECURSE);
274 
275 	diskp->dk_flags |= DKF_CONSTRUCTED;
276 
277 	return (0);
278 }
279 
280 /*
281  * Attach a disk.
282  */
283 void
disk_attach(diskp)284 disk_attach(diskp)
285 	struct disk *diskp;
286 {
287 	int s;
288 
289 	if (!diskp->dk_flags & DKF_CONSTRUCTED)
290 		disk_construct(diskp, diskp->dk_name);
291 
292 	/*
293 	 * Allocate and initialize the disklabel structures.  Note that
294 	 * it's not safe to sleep here, since we're probably going to be
295 	 * called during autoconfiguration.
296 	 */
297 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
298 	diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
299 	    M_NOWAIT);
300 	if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
301 		panic("disk_attach: can't allocate storage for disklabel");
302 
303 	bzero(diskp->dk_label, sizeof(struct disklabel));
304 	bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel));
305 
306 	/*
307 	 * Set the attached timestamp.
308 	 */
309 	s = splclock();
310 	diskp->dk_attachtime = mono_time;
311 	splx(s);
312 
313 	/*
314 	 * Link into the disklist.
315 	 */
316 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
317 	++disk_count;
318 	disk_change = 1;
319 }
320 
321 /*
322  * Detach a disk.
323  */
324 void
disk_detach(diskp)325 disk_detach(diskp)
326 	struct disk *diskp;
327 {
328 
329 	/*
330 	 * Free the space used by the disklabel structures.
331 	 */
332 	free(diskp->dk_label, M_DEVBUF);
333 	free(diskp->dk_cpulabel, M_DEVBUF);
334 
335 	/*
336 	 * Remove from the disklist.
337 	 */
338 	TAILQ_REMOVE(&disklist, diskp, dk_link);
339 	disk_change = 1;
340 	if (--disk_count < 0)
341 		panic("disk_detach: disk_count < 0");
342 }
343 
344 /*
345  * Increment a disk's busy counter.  If the counter is going from
346  * 0 to 1, set the timestamp.
347  */
348 void
disk_busy(diskp)349 disk_busy(diskp)
350 	struct disk *diskp;
351 {
352 	int s;
353 
354 	/*
355 	 * XXX We'd like to use something as accurate as microtime(),
356 	 * but that doesn't depend on the system TOD clock.
357 	 */
358 	if (diskp->dk_busy++ == 0) {
359 		s = splclock();
360 		diskp->dk_timestamp = mono_time;
361 		splx(s);
362 	}
363 }
364 
365 /*
366  * Decrement a disk's busy counter, increment the byte count, total busy
367  * time, and reset the timestamp.
368  */
369 void
disk_unbusy(diskp,bcount,read)370 disk_unbusy(diskp, bcount, read)
371 	struct disk *diskp;
372 	long bcount;
373 	int read;
374 {
375 	int s;
376 	struct timeval dv_time, diff_time;
377 
378 	if (diskp->dk_busy-- == 0)
379 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
380 
381 	s = splclock();
382 	dv_time = mono_time;
383 	splx(s);
384 
385 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
386 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
387 
388 	diskp->dk_timestamp = dv_time;
389 	if (bcount > 0) {
390 		if (read) {
391 			diskp->dk_rbytes += bcount;
392 			diskp->dk_rxfer++;
393 		} else {
394 			diskp->dk_wbytes += bcount;
395 			diskp->dk_wxfer++;
396 		}
397 	} else
398 		diskp->dk_seek++;
399 
400 	add_disk_randomness(bcount ^ diff_time.tv_usec);
401 }
402 
403 
404 int
disk_lock(dk)405 disk_lock(dk)
406 	struct disk *dk;
407 {
408 	int error;
409 
410 	error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, 0, curproc);
411 
412 	return (error);
413 }
414 
415 void
disk_unlock(dk)416 disk_unlock(dk)
417 	struct disk *dk;
418 {
419 	lockmgr(&dk->dk_lock, LK_RELEASE, 0, curproc);
420 }
421 
422 
423 /*
424  * Reset the metrics counters on the given disk.  Note that we cannot
425  * reset the busy counter, as it may case a panic in disk_unbusy().
426  * We also must avoid playing with the timestamp information, as it
427  * may skew any pending transfer results.
428  */
429 void
disk_resetstat(diskp)430 disk_resetstat(diskp)
431 	struct disk *diskp;
432 {
433 	int s = splbio(), t;
434 
435 	diskp->dk_rxfer = 0;
436 	diskp->dk_rbytes = 0;
437 	diskp->dk_wxfer = 0;
438 	diskp->dk_wbytes = 0;
439 	diskp->dk_seek = 0;
440 
441 	t = splclock();
442 	diskp->dk_attachtime = mono_time;
443 	splx(t);
444 
445 	timerclear(&diskp->dk_time);
446 
447 	splx(s);
448 }
449 
450 
/*
 * Mount the root filesystem from a disk.  The disklabel of the root disk is
 * read through the raw partition of its character device, the filesystem
 * type recorded for the root partition selects the matching *_mountroot()
 * routine (only those compiled into the kernel are considered), and that
 * routine's return value is passed back to the caller.
 *
 * Panics if the device cannot be opened, if the label cannot be read, if
 * the root partition has size 0, or -- in kernels built without FFS -- if
 * the filesystem type is not one of the compiled-in cases.
 */
int
dk_mountroot()
{
	dev_t rawdev, rrootdev;
	int part = DISKPART(rootdev);
	int (*mountrootfn)(void);
	struct disklabel dl;
	int error;

	/*
	 * Character-device counterpart of rootdev, and the raw partition
	 * (RAW_PART) of the same unit on that device.
	 */
	rrootdev = blktochr(rootdev);
	rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART);
	printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev,
	    rrootdev, rawdev);

	/*
	 * open device, ioctl for the disklabel, and close it.
	 */
	error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD,
	    S_IFCHR, curproc);
	if (error)
		panic("cannot open disk, 0x%x/0x%x, error %d",
		    rootdev, rrootdev, error);
	error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO,
	    (caddr_t)&dl, FREAD, curproc);
	if (error)
		panic("cannot read disk label, 0x%x/0x%x, error %d",
		    rootdev, rrootdev, error);
	/* The label has been copied into dl; a close failure is ignored. */
	(void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD,
	    S_IFCHR, curproc);

	if (dl.d_partitions[part].p_size == 0)
		panic("root filesystem has size 0");
	/* Dispatch on the filesystem type stored in the label. */
	switch (dl.d_partitions[part].p_fstype) {
#ifdef EXT2FS
	case FS_EXT2FS:
		{
		extern int ext2fs_mountroot(void);
		mountrootfn = ext2fs_mountroot;
		}
		break;
#endif
#ifdef FFS
	case FS_BSDFFS:
		{
		extern int ffs_mountroot(void);
		mountrootfn = ffs_mountroot;
		}
		break;
#endif
#ifdef LFS
	case FS_BSDLFS:
		{
		extern int lfs_mountroot(void);
		mountrootfn = lfs_mountroot;
		}
		break;
#endif
#ifdef CD9660
	case FS_ISO9660:
		{
		extern int cd9660_mountroot(void);
		mountrootfn = cd9660_mountroot;
		}
		break;
#endif
	default:
		/*
		 * Unrecognised type: fall back to ffs when it is compiled
		 * in, otherwise give up.  Either way mountrootfn is set
		 * before the call below is reached, assuming panic() does
		 * not return.
		 */
#ifdef FFS
		{
		extern int ffs_mountroot(void);

		printf("filesystem type %d not known.. assuming ffs\n",
		    dl.d_partitions[part].p_fstype);
		mountrootfn = ffs_mountroot;
		}
#else
		panic("disk 0x%x/0x%x filesystem type %d not known",
		    rootdev, rrootdev, dl.d_partitions[part].p_fstype);
#endif
	}
	return (*mountrootfn)();
}
532 
533 struct bufq *
bufq_default_alloc(void)534 bufq_default_alloc(void)
535 {
536 	struct bufq_default *bq;
537 
538 	bq = malloc(sizeof(*bq), M_DEVBUF, M_NOWAIT);
539 	if (bq == NULL)
540 		panic("bufq_default_alloc: no memory");
541 
542 	memset(bq, 0, sizeof(*bq));
543 	bq->bufq.bufq_free = bufq_default_free;
544 	bq->bufq.bufq_add = bufq_default_add;
545 	bq->bufq.bufq_get = bufq_default_get;
546 
547 	return ((struct bufq *)bq);
548 }
549 
/*
 * Release a buffer queue's storage back to the M_DEVBUF pool (the pool
 * bufq_default_alloc() allocates from).  Nothing is done about requests
 * that may still be linked on the queue.
 */
void
bufq_default_free(struct bufq *bq)
{
	free(bq, M_DEVBUF);
}
555 
556 void
bufq_default_add(struct bufq * bq,struct buf * bp)557 bufq_default_add(struct bufq *bq, struct buf *bp)
558 {
559 	struct bufq_default *bufq = (struct bufq_default *)bq;
560 	struct proc *p = bp->b_proc;
561 	struct buf *head;
562 
563 	if (p == NULL || p->p_nice < NZERO)
564 		head = &bufq->bufq_head[0];
565 	else if (p->p_nice == NZERO)
566 		head = &bufq->bufq_head[1];
567 	else
568 		head = &bufq->bufq_head[2];
569 
570 	disksort(head, bp);
571 }
572 
573 struct buf *
bufq_default_get(struct bufq * bq)574 bufq_default_get(struct bufq *bq)
575 {
576 	struct bufq_default *bufq = (struct bufq_default *)bq;
577 	struct buf *bp, *head;
578 	int i;
579 
580 	for (i = 0; i < 3; i++) {
581 		head = &bufq->bufq_head[i];
582 		if ((bp = head->b_actf))
583 			break;
584 	}
585 	if (bp == NULL)
586 		return (NULL);
587 	head->b_actf = bp->b_actf;
588 	return (bp);
589 }
590