1 /** $MirOS: src/sys/kern/subr_disk.c,v 1.3 2005/07/07 14:39:25 tg Exp $ */
2 /* $OpenBSD: subr_disk.c,v 1.29 2004/12/26 21:22:13 miod Exp $ */
3 /* $NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $ */
4
5 /*
6 * Copyright (c) 1995 Jason R. Thorpe. All rights reserved.
7 * Copyright (c) 1982, 1986, 1988, 1993
8 * The Regents of the University of California. All rights reserved.
9 * (c) UNIX System Laboratories, Inc.
10 * All or some portions of this file are derived from material licensed
11 * to the University of California by American Telephone and Telegraph
12 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
13 * the permission of UNIX System Laboratories, Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
40 */
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/malloc.h>
46 #include <sys/fcntl.h>
47 #include <sys/buf.h>
48 #include <sys/stat.h>
49 #include <sys/syslog.h>
50 #include <sys/time.h>
51 #include <sys/disklabel.h>
52 #include <sys/conf.h>
53 #include <sys/lock.h>
54 #include <sys/disk.h>
55 #include <sys/dkio.h>
56 #include <sys/dkstat.h> /* XXX */
57 #include <sys/proc.h>
58
59 #include <dev/rndvar.h>
60
61 /*
62 * A global list of all disks attached to the system. May grow or
63 * shrink over time.
64 */
65 struct disklist_head disklist; /* TAILQ_HEAD */
66 int disk_count; /* number of drives in global disklist */
67 int disk_change; /* set if a disk has been attached/detached
68 * since last we looked at this variable. This
69 * is reset by hw_sysctl()
70 */
71
72 /*
73 * Seek sort for disks. We depend on the driver which calls us using b_resid
74 * as the current cylinder number.
75 *
76 * The argument ap structure holds a b_actf activity chain pointer on which we
77 * keep two queues, sorted in ascending cylinder order. The first queue holds
78 * those requests which are positioned after the current cylinder (in the first
79 * request); the second holds requests which came in after their cylinder number
80 * was passed. Thus we implement a one way scan, retracting after reaching the
81 * end of the drive to the first request on the second queue, at which time it
82 * becomes the first queue.
83 *
84 * A one-way scan is natural because of the way UNIX read-ahead blocks are
85 * allocated.
86 */
87
88 void
disksort(ap,bp)89 disksort(ap, bp)
90 register struct buf *ap, *bp;
91 {
92 register struct buf *bq;
93
94 /* If the queue is empty, then it's easy. */
95 if (ap->b_actf == NULL) {
96 bp->b_actf = NULL;
97 ap->b_actf = bp;
98 return;
99 }
100
101 /*
102 * If we lie after the first (currently active) request, then we
103 * must locate the second request list and add ourselves to it.
104 */
105 bq = ap->b_actf;
106 if (bp->b_cylinder < bq->b_cylinder) {
107 while (bq->b_actf) {
108 /*
109 * Check for an ``inversion'' in the normally ascending
110 * cylinder numbers, indicating the start of the second
111 * request list.
112 */
113 if (bq->b_actf->b_cylinder < bq->b_cylinder) {
114 /*
115 * Search the second request list for the first
116 * request at a larger cylinder number. We go
117 * before that; if there is no such request, we
118 * go at end.
119 */
120 do {
121 if (bp->b_cylinder <
122 bq->b_actf->b_cylinder)
123 goto insert;
124 if (bp->b_cylinder ==
125 bq->b_actf->b_cylinder &&
126 bp->b_blkno < bq->b_actf->b_blkno)
127 goto insert;
128 bq = bq->b_actf;
129 } while (bq->b_actf);
130 goto insert; /* after last */
131 }
132 bq = bq->b_actf;
133 }
134 /*
135 * No inversions... we will go after the last, and
136 * be the first request in the second request list.
137 */
138 goto insert;
139 }
140 /*
141 * Request is at/after the current request...
142 * sort in the first request list.
143 */
144 while (bq->b_actf) {
145 /*
146 * We want to go after the current request if there is an
147 * inversion after it (i.e. it is the end of the first
148 * request list), or if the next request is a larger cylinder
149 * than our request.
150 */
151 if (bq->b_actf->b_cylinder < bq->b_cylinder ||
152 bp->b_cylinder < bq->b_actf->b_cylinder ||
153 (bp->b_cylinder == bq->b_actf->b_cylinder &&
154 bp->b_blkno < bq->b_actf->b_blkno))
155 goto insert;
156 bq = bq->b_actf;
157 }
158 /*
159 * Neither a second list nor a larger request... we go at the end of
160 * the first list, which is the same as the end of the whole schebang.
161 */
162 insert: bp->b_actf = bq->b_actf;
163 bq->b_actf = bp;
164 }
165
166 /*
167 * Compute checksum for disk label.
168 */
169 u_int
dkcksum(lp)170 dkcksum(lp)
171 register struct disklabel *lp;
172 {
173 register u_int16_t *start, *end;
174 register u_int16_t sum = 0;
175
176 start = (u_int16_t *)lp;
177 end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
178 while (start < end)
179 sum ^= *start++;
180 return (sum);
181 }
182
183 /*
184 * Disk error is the preface to plaintive error messages
185 * about failing disk transfers. It prints messages of the form
186
187 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
188
189 * if the offset of the error in the transfer and a disk label
190 * are both available. blkdone should be -1 if the position of the error
191 * is unknown; the disklabel pointer may be null from drivers that have not
192 * been converted to use them. The message is printed with printf
193 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
194 * The message should be completed (with at least a newline) with printf
195 * or addlog, respectively. There is no trailing space.
196 */
197 void
diskerr(bp,dname,what,pri,blkdone,lp)198 diskerr(bp, dname, what, pri, blkdone, lp)
199 register struct buf *bp;
200 char *dname, *what;
201 int pri, blkdone;
202 register struct disklabel *lp;
203 {
204 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
205 register int (*pr)(const char *, ...);
206 char partname = 'a' + part;
207 int sn;
208
209 if (pri != LOG_PRINTF) {
210 logpri(pri);
211 pr = addlog;
212 } else
213 pr = printf;
214 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
215 bp->b_flags & B_READ ? "read" : "writ");
216 sn = bp->b_blkno;
217 if (bp->b_bcount <= DEV_BSIZE)
218 (*pr)("%d", sn);
219 else {
220 if (blkdone >= 0) {
221 sn += blkdone;
222 (*pr)("%d of ", sn);
223 }
224 (*pr)("%d-%d", bp->b_blkno,
225 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
226 }
227 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
228 sn += lp->d_partitions[part].p_offset;
229 (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn,
230 sn / lp->d_secpercyl);
231 sn %= lp->d_secpercyl;
232 (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
233 }
234 }
235
236 /*
237 * Initialize the disklist. Called by main() before autoconfiguration.
238 */
239 void
disk_init()240 disk_init()
241 {
242
243 TAILQ_INIT(&disklist);
244 disk_count = disk_change = 0;
245 }
246
247 /*
248 * Searches the disklist for the disk corresponding to the
249 * name provided.
250 */
251 struct disk *
disk_find(name)252 disk_find(name)
253 char *name;
254 {
255 struct disk *diskp;
256
257 if ((name == NULL) || (disk_count <= 0))
258 return (NULL);
259
260 TAILQ_FOREACH(diskp, &disklist, dk_link)
261 if (strcmp(diskp->dk_name, name) == 0)
262 return (diskp);
263
264 return (NULL);
265 }
266
267 int
disk_construct(diskp,lockname)268 disk_construct(diskp, lockname)
269 struct disk *diskp;
270 char *lockname;
271 {
272 lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname,
273 0, LK_CANRECURSE);
274
275 diskp->dk_flags |= DKF_CONSTRUCTED;
276
277 return (0);
278 }
279
280 /*
281 * Attach a disk.
282 */
283 void
disk_attach(diskp)284 disk_attach(diskp)
285 struct disk *diskp;
286 {
287 int s;
288
289 if (!diskp->dk_flags & DKF_CONSTRUCTED)
290 disk_construct(diskp, diskp->dk_name);
291
292 /*
293 * Allocate and initialize the disklabel structures. Note that
294 * it's not safe to sleep here, since we're probably going to be
295 * called during autoconfiguration.
296 */
297 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
298 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
299 M_NOWAIT);
300 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
301 panic("disk_attach: can't allocate storage for disklabel");
302
303 bzero(diskp->dk_label, sizeof(struct disklabel));
304 bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel));
305
306 /*
307 * Set the attached timestamp.
308 */
309 s = splclock();
310 diskp->dk_attachtime = mono_time;
311 splx(s);
312
313 /*
314 * Link into the disklist.
315 */
316 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
317 ++disk_count;
318 disk_change = 1;
319 }
320
321 /*
322 * Detach a disk.
323 */
324 void
disk_detach(diskp)325 disk_detach(diskp)
326 struct disk *diskp;
327 {
328
329 /*
330 * Free the space used by the disklabel structures.
331 */
332 free(diskp->dk_label, M_DEVBUF);
333 free(diskp->dk_cpulabel, M_DEVBUF);
334
335 /*
336 * Remove from the disklist.
337 */
338 TAILQ_REMOVE(&disklist, diskp, dk_link);
339 disk_change = 1;
340 if (--disk_count < 0)
341 panic("disk_detach: disk_count < 0");
342 }
343
344 /*
345 * Increment a disk's busy counter. If the counter is going from
346 * 0 to 1, set the timestamp.
347 */
348 void
disk_busy(diskp)349 disk_busy(diskp)
350 struct disk *diskp;
351 {
352 int s;
353
354 /*
355 * XXX We'd like to use something as accurate as microtime(),
356 * but that doesn't depend on the system TOD clock.
357 */
358 if (diskp->dk_busy++ == 0) {
359 s = splclock();
360 diskp->dk_timestamp = mono_time;
361 splx(s);
362 }
363 }
364
365 /*
366 * Decrement a disk's busy counter, increment the byte count, total busy
367 * time, and reset the timestamp.
368 */
369 void
disk_unbusy(diskp,bcount,read)370 disk_unbusy(diskp, bcount, read)
371 struct disk *diskp;
372 long bcount;
373 int read;
374 {
375 int s;
376 struct timeval dv_time, diff_time;
377
378 if (diskp->dk_busy-- == 0)
379 printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
380
381 s = splclock();
382 dv_time = mono_time;
383 splx(s);
384
385 timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
386 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
387
388 diskp->dk_timestamp = dv_time;
389 if (bcount > 0) {
390 if (read) {
391 diskp->dk_rbytes += bcount;
392 diskp->dk_rxfer++;
393 } else {
394 diskp->dk_wbytes += bcount;
395 diskp->dk_wxfer++;
396 }
397 } else
398 diskp->dk_seek++;
399
400 add_disk_randomness(bcount ^ diff_time.tv_usec);
401 }
402
403
404 int
disk_lock(dk)405 disk_lock(dk)
406 struct disk *dk;
407 {
408 int error;
409
410 error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, 0, curproc);
411
412 return (error);
413 }
414
415 void
disk_unlock(dk)416 disk_unlock(dk)
417 struct disk *dk;
418 {
419 lockmgr(&dk->dk_lock, LK_RELEASE, 0, curproc);
420 }
421
422
423 /*
424 * Reset the metrics counters on the given disk. Note that we cannot
425 * reset the busy counter, as it may case a panic in disk_unbusy().
426 * We also must avoid playing with the timestamp information, as it
427 * may skew any pending transfer results.
428 */
429 void
disk_resetstat(diskp)430 disk_resetstat(diskp)
431 struct disk *diskp;
432 {
433 int s = splbio(), t;
434
435 diskp->dk_rxfer = 0;
436 diskp->dk_rbytes = 0;
437 diskp->dk_wxfer = 0;
438 diskp->dk_wbytes = 0;
439 diskp->dk_seek = 0;
440
441 t = splclock();
442 diskp->dk_attachtime = mono_time;
443 splx(t);
444
445 timerclear(&diskp->dk_time);
446
447 splx(s);
448 }
449
450
451 int
dk_mountroot()452 dk_mountroot()
453 {
454 dev_t rawdev, rrootdev;
455 int part = DISKPART(rootdev);
456 int (*mountrootfn)(void);
457 struct disklabel dl;
458 int error;
459
460 rrootdev = blktochr(rootdev);
461 rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART);
462 printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev,
463 rrootdev, rawdev);
464
465 /*
466 * open device, ioctl for the disklabel, and close it.
467 */
468 error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD,
469 S_IFCHR, curproc);
470 if (error)
471 panic("cannot open disk, 0x%x/0x%x, error %d",
472 rootdev, rrootdev, error);
473 error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO,
474 (caddr_t)&dl, FREAD, curproc);
475 if (error)
476 panic("cannot read disk label, 0x%x/0x%x, error %d",
477 rootdev, rrootdev, error);
478 (void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD,
479 S_IFCHR, curproc);
480
481 if (dl.d_partitions[part].p_size == 0)
482 panic("root filesystem has size 0");
483 switch (dl.d_partitions[part].p_fstype) {
484 #ifdef EXT2FS
485 case FS_EXT2FS:
486 {
487 extern int ext2fs_mountroot(void);
488 mountrootfn = ext2fs_mountroot;
489 }
490 break;
491 #endif
492 #ifdef FFS
493 case FS_BSDFFS:
494 {
495 extern int ffs_mountroot(void);
496 mountrootfn = ffs_mountroot;
497 }
498 break;
499 #endif
500 #ifdef LFS
501 case FS_BSDLFS:
502 {
503 extern int lfs_mountroot(void);
504 mountrootfn = lfs_mountroot;
505 }
506 break;
507 #endif
508 #ifdef CD9660
509 case FS_ISO9660:
510 {
511 extern int cd9660_mountroot(void);
512 mountrootfn = cd9660_mountroot;
513 }
514 break;
515 #endif
516 default:
517 #ifdef FFS
518 {
519 extern int ffs_mountroot(void);
520
521 printf("filesystem type %d not known.. assuming ffs\n",
522 dl.d_partitions[part].p_fstype);
523 mountrootfn = ffs_mountroot;
524 }
525 #else
526 panic("disk 0x%x/0x%x filesystem type %d not known",
527 rootdev, rrootdev, dl.d_partitions[part].p_fstype);
528 #endif
529 }
530 return (*mountrootfn)();
531 }
532
533 struct bufq *
bufq_default_alloc(void)534 bufq_default_alloc(void)
535 {
536 struct bufq_default *bq;
537
538 bq = malloc(sizeof(*bq), M_DEVBUF, M_NOWAIT);
539 if (bq == NULL)
540 panic("bufq_default_alloc: no memory");
541
542 memset(bq, 0, sizeof(*bq));
543 bq->bufq.bufq_free = bufq_default_free;
544 bq->bufq.bufq_add = bufq_default_add;
545 bq->bufq.bufq_get = bufq_default_get;
546
547 return ((struct bufq *)bq);
548 }
549
550 void
bufq_default_free(struct bufq * bq)551 bufq_default_free(struct bufq *bq)
552 {
553 free(bq, M_DEVBUF);
554 }
555
556 void
bufq_default_add(struct bufq * bq,struct buf * bp)557 bufq_default_add(struct bufq *bq, struct buf *bp)
558 {
559 struct bufq_default *bufq = (struct bufq_default *)bq;
560 struct proc *p = bp->b_proc;
561 struct buf *head;
562
563 if (p == NULL || p->p_nice < NZERO)
564 head = &bufq->bufq_head[0];
565 else if (p->p_nice == NZERO)
566 head = &bufq->bufq_head[1];
567 else
568 head = &bufq->bufq_head[2];
569
570 disksort(head, bp);
571 }
572
573 struct buf *
bufq_default_get(struct bufq * bq)574 bufq_default_get(struct bufq *bq)
575 {
576 struct bufq_default *bufq = (struct bufq_default *)bq;
577 struct buf *bp, *head;
578 int i;
579
580 for (i = 0; i < 3; i++) {
581 head = &bufq->bufq_head[i];
582 if ((bp = head->b_actf))
583 break;
584 }
585 if (bp == NULL)
586 return (NULL);
587 head->b_actf = bp->b_actf;
588 return (bp);
589 }
590