1 /*        $NetBSD: spec_vnops.c,v 1.219 2025/01/06 09:45:49 mlelstv Exp $       */
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1989, 1993
31  *        The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. Neither the name of the University nor the names of its contributors
42  *    may be used to endorse or promote products derived from this software
43  *    without specific prior written permission.
44  *
45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55  * SUCH DAMAGE.
56  *
57  *        @(#)spec_vnops.c    8.15 (Berkeley) 7/14/95
58  */
59 
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.219 2025/01/06 09:45:49 mlelstv Exp $");
62 
63 #ifdef _KERNEL_OPT
64 #include "opt_ddb.h"
65 #endif
66 
67 #include <sys/param.h>
68 #include <sys/proc.h>
69 #include <sys/systm.h>
70 #include <sys/kernel.h>
71 #include <sys/conf.h>
72 #include <sys/buf.h>
73 #include <sys/mount.h>
74 #include <sys/namei.h>
75 #include <sys/vnode_impl.h>
76 #include <sys/stat.h>
77 #include <sys/errno.h>
78 #include <sys/ioctl.h>
79 #include <sys/poll.h>
80 #include <sys/file.h>
81 #include <sys/disklabel.h>
82 #include <sys/disk.h>
83 #include <sys/lockf.h>
84 #include <sys/tty.h>
85 #include <sys/kauth.h>
86 #include <sys/fstrans.h>
87 #include <sys/module.h>
88 #include <sys/atomic.h>
89 
90 #include <miscfs/genfs/genfs.h>
91 #include <miscfs/specfs/specdev.h>
92 
93 #ifdef DDB
94 #include <ddb/ddb.h>
95 #endif
96 
97 /*
98  * Lock order:
99  *
100  *        vnode lock
101  *        -> device_lock
102  *        -> struct vnode::v_interlock
103  */
104 
/*
 * Symbolic sleep message strings for devices.  Each string is simply
 * the name of the variable that holds it.
 */
const char devopn[] = "devopn";
const char devio[] = "devio";
const char devwait[] = "devwait";
const char devin[] = "devin";
const char devout[] = "devout";
const char devioc[] = "devioc";
const char devcls[] = "devcls";
113 
/*
 * Geometry of the device vnode hash.
 *
 * SPECHSZ is the number of hash chains.  SPECHASH(rdev) folds a device
 * number into a chain index in [0, SPECHSZ): a plain mask when SPECHSZ
 * is a power of two, an unsigned modulo otherwise.
 *
 * Every use of the macro argument is parenthesized, so an expression
 * argument such as SPECHASH(a | b) hashes the value of the whole
 * expression instead of being torn apart by operator precedence.
 */
#define	SPECHSZ	64
#if	((SPECHSZ & (SPECHSZ - 1)) == 0)
#define	SPECHASH(rdev)	((((rdev) >> 5) + (rdev)) & (SPECHSZ - 1))
#else
#define	SPECHASH(rdev)	(((unsigned)(((rdev) >> 5) + (rdev))) % SPECHSZ)
#endif
120 
121 static vnode_t      *specfs_hash[SPECHSZ];
122 extern struct mount *dead_rootmount;
123 
124 /*
125  * This vnode operations vector is used for special device nodes
126  * created from whole cloth by the kernel.  For the ops vector for
127  * vnodes built from special devices found in a filesystem, see (e.g)
128  * ffs_specop_entries[] in ffs_vnops.c or the equivalent for other
129  * filesystems.
130  */
131 
132 int (**spec_vnodeop_p)(void *);
133 const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
134           { &vop_default_desc, vn_default_error },
135           { &vop_parsepath_desc, genfs_parsepath },         /* parsepath */
136           { &vop_lookup_desc, spec_lookup },                /* lookup */
137           { &vop_create_desc, genfs_badop },                /* create */
138           { &vop_mknod_desc, genfs_badop },                 /* mknod */
139           { &vop_open_desc, spec_open },                              /* open */
140           { &vop_close_desc, spec_close },                  /* close */
141           { &vop_access_desc, genfs_ebadf },                /* access */
142           { &vop_accessx_desc, genfs_ebadf },               /* accessx */
143           { &vop_getattr_desc, genfs_ebadf },               /* getattr */
144           { &vop_setattr_desc, genfs_ebadf },               /* setattr */
145           { &vop_read_desc, spec_read },                              /* read */
146           { &vop_write_desc, spec_write },                  /* write */
147           { &vop_fallocate_desc, genfs_eopnotsupp },        /* fallocate */
148           { &vop_fdiscard_desc, spec_fdiscard },            /* fdiscard */
149           { &vop_fcntl_desc, genfs_fcntl },                 /* fcntl */
150           { &vop_ioctl_desc, spec_ioctl },                  /* ioctl */
151           { &vop_poll_desc, spec_poll },                              /* poll */
152           { &vop_kqfilter_desc, spec_kqfilter },            /* kqfilter */
153           { &vop_revoke_desc, genfs_revoke },               /* revoke */
154           { &vop_mmap_desc, spec_mmap },                              /* mmap */
155           { &vop_fsync_desc, spec_fsync },                  /* fsync */
156           { &vop_seek_desc, spec_seek },                              /* seek */
157           { &vop_remove_desc, genfs_badop },                /* remove */
158           { &vop_link_desc, genfs_badop },                  /* link */
159           { &vop_rename_desc, genfs_badop },                /* rename */
160           { &vop_mkdir_desc, genfs_badop },                 /* mkdir */
161           { &vop_rmdir_desc, genfs_badop },                 /* rmdir */
162           { &vop_symlink_desc, genfs_badop },               /* symlink */
163           { &vop_readdir_desc, genfs_badop },               /* readdir */
164           { &vop_readlink_desc, genfs_badop },              /* readlink */
165           { &vop_abortop_desc, genfs_badop },               /* abortop */
166           { &vop_inactive_desc, spec_inactive },            /* inactive */
167           { &vop_reclaim_desc, spec_reclaim },              /* reclaim */
168           { &vop_lock_desc, genfs_lock },                             /* lock */
169           { &vop_unlock_desc, genfs_unlock },               /* unlock */
170           { &vop_bmap_desc, spec_bmap },                              /* bmap */
171           { &vop_strategy_desc, spec_strategy },            /* strategy */
172           { &vop_print_desc, spec_print },                  /* print */
173           { &vop_islocked_desc, genfs_islocked },           /* islocked */
174           { &vop_pathconf_desc, spec_pathconf },            /* pathconf */
175           { &vop_advlock_desc, spec_advlock },              /* advlock */
176           { &vop_bwrite_desc, vn_bwrite },                  /* bwrite */
177           { &vop_getpages_desc, genfs_getpages },           /* getpages */
178           { &vop_putpages_desc, genfs_putpages },           /* putpages */
179           { NULL, NULL }
180 };
181 const struct vnodeopv_desc spec_vnodeop_opv_desc =
182           { &spec_vnodeop_p, spec_vnodeop_entries };
183 
184 static kauth_listener_t rawio_listener;
185 static struct kcondvar specfs_iocv;
186 
187 /*
188  * Returns true if vnode is /dev/mem or /dev/kmem.
189  */
190 bool
iskmemvp(struct vnode * vp)191 iskmemvp(struct vnode *vp)
192 {
193           return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev));
194 }
195 
/*
 * iskmemdev(dev)
 *
 *	Return nonzero if dev is /dev/mem or /dev/kmem: its major number
 *	is mem_no and its minor number is 0, 1 or 14.  Return zero
 *	otherwise.
 */
int
iskmemdev(dev_t dev)
{
	/* mem_no is emitted by config(8) to generated devsw.c */
	extern const int mem_no;

	if (major(dev) != mem_no)
		return 0;

	/* minor 14 is /dev/io on i386 with COMPAT_10 */
	return minor(dev) < 2 || minor(dev) == 14;
}
208 
209 static int
rawio_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)210 rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
211     void *arg0, void *arg1, void *arg2, void *arg3)
212 {
213           int result;
214 
215           result = KAUTH_RESULT_DEFER;
216 
217           if ((action != KAUTH_DEVICE_RAWIO_SPEC) &&
218               (action != KAUTH_DEVICE_RAWIO_PASSTHRU))
219                     return result;
220 
221           /* Access is mandated by permissions. */
222           result = KAUTH_RESULT_ALLOW;
223 
224           return result;
225 }
226 
227 void
spec_init(void)228 spec_init(void)
229 {
230 
231           rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
232               rawio_listener_cb, NULL);
233           cv_init(&specfs_iocv, "specio");
234 }
235 
236 /*
237  * spec_io_enter(vp, &sn, &dev)
238  *
239  *        Enter an operation that may not hold vp's vnode lock or an
240  *        fstrans on vp's mount.  Until spec_io_exit, the vnode will not
241  *        be revoked.
242  *
243  *        On success, set sn to the specnode pointer and dev to the dev_t
244  *        number and return zero.  Caller must later call spec_io_exit
245  *        when done.
246  *
247  *        On failure, return ENXIO -- the device has been revoked and no
248  *        longer exists.
249  */
250 static int
spec_io_enter(struct vnode * vp,struct specnode ** snp,dev_t * devp)251 spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
252 {
253           dev_t dev;
254           struct specnode *sn;
255           unsigned iocnt;
256           int error = 0;
257 
258           mutex_enter(vp->v_interlock);
259 
260           /*
261            * Extract all the info we need from the vnode, unless the
262            * vnode has already been reclaimed.  This can happen if the
263            * underlying device has been removed and all the device nodes
264            * for it have been revoked.  The caller may not hold a vnode
265            * lock or fstrans to prevent this from happening before it has
266            * had an opportunity to notice the vnode is dead.
267            */
268           if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
269               (sn = vp->v_specnode) == NULL ||
270               (dev = vp->v_rdev) == NODEV) {
271                     error = ENXIO;
272                     goto out;
273           }
274 
275           /*
276            * Notify spec_close that we are doing an I/O operation which
277            * may not be not bracketed by fstrans(9) and thus is not
278            * blocked by vfs suspension.
279            *
280            * We could hold this reference with psref(9) instead, but we
281            * already have to take the interlock for vdead_check, so
282            * there's not much more cost here to another atomic operation.
283            */
284           do {
285                     iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
286                     if (__predict_false(iocnt == UINT_MAX)) {
287                               /*
288                                * The I/O count is limited by the number of
289                                * LWPs (which will never overflow this) --
290                                * unless one driver uses another driver via
291                                * specfs, which is rather unusual, but which
292                                * could happen via pud(4) userspace drivers.
293                                * We could use a 64-bit count, but can't use
294                                * atomics for that on all platforms.
295                                * (Probably better to switch to psref or
296                                * localcount instead.)
297                                */
298                               error = EBUSY;
299                               goto out;
300                     }
301           } while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
302               != iocnt);
303 
304           /* Success!  */
305           *snp = sn;
306           *devp = dev;
307           error = 0;
308 
309 out:      mutex_exit(vp->v_interlock);
310           return error;
311 }
312 
313 /*
314  * spec_io_exit(vp, sn)
315  *
316  *        Exit an operation entered with a successful spec_io_enter --
317  *        allow concurrent spec_node_revoke to proceed.  The argument sn
318  *        must match the struct specnode pointer returned by spec_io_exit
319  *        for vp.
320  */
321 static void
spec_io_exit(struct vnode * vp,struct specnode * sn)322 spec_io_exit(struct vnode *vp, struct specnode *sn)
323 {
324           struct specdev *sd = sn->sn_dev;
325           unsigned iocnt;
326 
327           KASSERT(vp->v_specnode == sn);
328 
329           /*
330            * We are done.  Notify spec_close if appropriate.  The
331            * transition of 1 -> 0 must happen under device_lock so
332            * spec_close doesn't miss a wakeup.
333            */
334           do {
335                     iocnt = atomic_load_relaxed(&sd->sd_iocnt);
336                     KASSERT(iocnt > 0);
337                     if (iocnt == 1) {
338                               mutex_enter(&device_lock);
339                               if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
340                                         cv_broadcast(&specfs_iocv);
341                               mutex_exit(&device_lock);
342                               break;
343                     }
344           } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
345 }
346 
347 /*
348  * spec_io_drain(sd)
349  *
350  *        Wait for all existing spec_io_enter/exit sections to complete.
351  *        Caller must ensure spec_io_enter will fail at this point.
352  */
353 static void
spec_io_drain(struct specdev * sd)354 spec_io_drain(struct specdev *sd)
355 {
356 
357           /*
358            * I/O at the same time as closing is unlikely -- it often
359            * indicates an application bug.
360            */
361           if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
362                     return;
363 
364           mutex_enter(&device_lock);
365           while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
366                     cv_wait(&specfs_iocv, &device_lock);
367           mutex_exit(&device_lock);
368 }
369 
370 /*
371  * Initialize a vnode that represents a device.
372  */
373 void
spec_node_init(vnode_t * vp,dev_t rdev)374 spec_node_init(vnode_t *vp, dev_t rdev)
375 {
376           specnode_t *sn;
377           specdev_t *sd;
378           vnode_t *vp2;
379           vnode_t **vpp;
380 
381           KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
382           KASSERT(vp->v_specnode == NULL);
383 
384           /*
385            * Search the hash table for this device.  If known, add a
386            * reference to the device structure.  If not known, create
387            * a new entry to represent the device.  In all cases add
388            * the vnode to the hash table.
389            */
390           sn = kmem_alloc(sizeof(*sn), KM_SLEEP);
391           sd = kmem_alloc(sizeof(*sd), KM_SLEEP);
392           mutex_enter(&device_lock);
393           vpp = &specfs_hash[SPECHASH(rdev)];
394           for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) {
395                     KASSERT(vp2->v_specnode != NULL);
396                     if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) {
397                               break;
398                     }
399           }
400           if (vp2 == NULL) {
401                     /* No existing record, create a new one. */
402                     sd->sd_mountpoint = NULL;
403                     sd->sd_lockf = NULL;
404                     sd->sd_refcnt = 1;
405                     sd->sd_opencnt = 0;
406                     sd->sd_bdevvp = NULL;
407                     sd->sd_iocnt = 0;
408                     sd->sd_opened = false;
409                     sd->sd_closing = false;
410                     sn->sn_dev = sd;
411                     sd = NULL;
412           } else {
413                     /* Use the existing record. */
414                     sn->sn_dev = vp2->v_specnode->sn_dev;
415                     sn->sn_dev->sd_refcnt++;
416           }
417           /* Insert vnode into the hash chain. */
418           sn->sn_opencnt = 0;
419           sn->sn_rdev = rdev;
420           sn->sn_gone = false;
421           vp->v_specnode = sn;
422           vp->v_specnext = *vpp;
423           *vpp = vp;
424           mutex_exit(&device_lock);
425 
426           /* Free the record we allocated if unused. */
427           if (sd != NULL) {
428                     kmem_free(sd, sizeof(*sd));
429           }
430 }
431 
432 /*
433  * Lookup a vnode by device number and return it referenced.
434  */
435 int
spec_node_lookup_by_dev(enum vtype type,dev_t dev,int flags,vnode_t ** vpp)436 spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp)
437 {
438           int error;
439           vnode_t *vp;
440 
441 top:      mutex_enter(&device_lock);
442           for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
443                     if (type == vp->v_type && dev == vp->v_rdev) {
444                               mutex_enter(vp->v_interlock);
445                               /* If clean or being cleaned, then ignore it. */
446                               if (vdead_check(vp, VDEAD_NOWAIT) == 0)
447                                         break;
448                               if ((flags & VDEAD_NOWAIT) == 0) {
449                                         mutex_exit(&device_lock);
450                                         /*
451                                          * It may be being revoked as we speak,
452                                          * and the caller wants to wait until
453                                          * all revocation has completed.  Let
454                                          * vcache_vget wait for it to finish
455                                          * dying; as a side effect, vcache_vget
456                                          * releases vp->v_interlock.  Note that
457                                          * vcache_vget cannot succeed at this
458                                          * point because vdead_check already
459                                          * failed.
460                                          */
461                                         error = vcache_vget(vp);
462                                         KASSERT(error);
463                                         goto top;
464                               }
465                               mutex_exit(vp->v_interlock);
466                     }
467           }
468           KASSERT(vp == NULL || mutex_owned(vp->v_interlock));
469           if (vp == NULL) {
470                     mutex_exit(&device_lock);
471                     return ENOENT;
472           }
473           /*
474            * If it is an opened block device return the opened vnode.
475            */
476           if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) {
477                     mutex_exit(vp->v_interlock);
478                     vp = vp->v_specnode->sn_dev->sd_bdevvp;
479                     mutex_enter(vp->v_interlock);
480           }
481           mutex_exit(&device_lock);
482           error = vcache_vget(vp);
483           if (error)
484                     return error;
485           *vpp = vp;
486 
487           return 0;
488 }
489 
490 /*
491  * Lookup a vnode by file system mounted on and return it referenced.
492  */
493 int
spec_node_lookup_by_mount(struct mount * mp,vnode_t ** vpp)494 spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp)
495 {
496           int i, error;
497           vnode_t *vp, *vq;
498 
499           mutex_enter(&device_lock);
500           for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) {
501                     for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) {
502                               if (vp->v_type != VBLK)
503                                         continue;
504                               vq = vp->v_specnode->sn_dev->sd_bdevvp;
505                               if (vq != NULL &&
506                                   vq->v_specnode->sn_dev->sd_mountpoint == mp)
507                                         break;
508                               vq = NULL;
509                     }
510           }
511           if (vq == NULL) {
512                     mutex_exit(&device_lock);
513                     return ENOENT;
514           }
515           mutex_enter(vq->v_interlock);
516           mutex_exit(&device_lock);
517           error = vcache_vget(vq);
518           if (error)
519                     return error;
520           *vpp = vq;
521 
522           return 0;
523 
524 }
525 
526 /*
527  * Get the file system mounted on this block device.
528  *
529  * XXX Caller should hold the vnode lock -- shared or exclusive -- so
530  * that this can't changed, and the vnode can't be revoked while we
531  * examine it.  But not all callers do, and they're scattered through a
532  * lot of file systems, so we can't assert this yet.
533  */
534 struct mount *
spec_node_getmountedfs(vnode_t * devvp)535 spec_node_getmountedfs(vnode_t *devvp)
536 {
537           struct mount *mp;
538 
539           KASSERT(devvp->v_type == VBLK);
540           mp = devvp->v_specnode->sn_dev->sd_mountpoint;
541 
542           return mp;
543 }
544 
545 /*
546  * Set the file system mounted on this block device.
547  *
548  * XXX Caller should hold the vnode lock exclusively so this can't be
549  * changed or assumed by spec_node_getmountedfs while we change it, and
550  * the vnode can't be revoked while we handle it.  But not all callers
551  * do, and they're scattered through a lot of file systems, so we can't
552  * assert this yet.  Instead, for now, we'll take an I/O reference so
553  * at least the ioctl doesn't race with revoke/detach.
554  *
555  * If you do change this to assert an exclusive vnode lock, you must
556  * also do vdead_check before trying bdev_ioctl, because the vnode may
557  * have been revoked by the time the caller locked it, and this is
558  * _not_ a vop -- calls to spec_node_setmountedfs don't go through
559  * v_op, so revoking the vnode doesn't prevent further calls.
560  *
561  * XXX Caller should additionally have the vnode open, at least if mp
562  * is nonnull, but I'm not sure all callers do that -- need to audit.
563  * Currently udf closes the vnode before clearing the mount.
564  */
565 void
spec_node_setmountedfs(vnode_t * devvp,struct mount * mp)566 spec_node_setmountedfs(vnode_t *devvp, struct mount *mp)
567 {
568           struct dkwedge_info dkw;
569           struct specnode *sn;
570           dev_t dev;
571           int error;
572 
573           KASSERT(devvp->v_type == VBLK);
574 
575           error = spec_io_enter(devvp, &sn, &dev);
576           if (error)
577                     return;
578 
579           KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL);
580           sn->sn_dev->sd_mountpoint = mp;
581           if (mp == NULL)
582                     goto out;
583 
584           error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp);
585           if (error)
586                     goto out;
587 
588           strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname,
589               sizeof(mp->mnt_stat.f_mntfromlabel));
590 
591 out:      spec_io_exit(devvp, sn);
592 }
593 
594 /*
595  * A vnode representing a special device is going away.  Close
596  * the device if the vnode holds it open.
597  */
598 void
spec_node_revoke(vnode_t * vp)599 spec_node_revoke(vnode_t *vp)
600 {
601           specnode_t *sn;
602           specdev_t *sd;
603           struct vnode **vpp;
604 
605           KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
606 
607           sn = vp->v_specnode;
608           sd = sn->sn_dev;
609 
610           KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
611           KASSERT(vp->v_specnode != NULL);
612           KASSERT(sn->sn_gone == false);
613 
614           mutex_enter(&device_lock);
615           KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
616               "sn_opencnt=%u > sd_opencnt=%u",
617               sn->sn_opencnt, sd->sd_opencnt);
618           sn->sn_gone = true;
619           if (sn->sn_opencnt != 0) {
620                     sd->sd_opencnt -= (sn->sn_opencnt - 1);
621                     sn->sn_opencnt = 1;
622                     mutex_exit(&device_lock);
623 
624                     VOP_CLOSE(vp, FNONBLOCK, NOCRED);
625 
626                     mutex_enter(&device_lock);
627                     KASSERT(sn->sn_opencnt == 0);
628           }
629 
630           /*
631            * We may have revoked the vnode in this thread while another
632            * thread was in the middle of spec_close, in the window when
633            * spec_close releases the vnode lock to call .d_close for the
634            * last close.  In that case, wait for the concurrent
635            * spec_close to complete.
636            */
637           while (sd->sd_closing)
638                     cv_wait(&specfs_iocv, &device_lock);
639 
640           /*
641            * Remove from the hash so lookups stop returning this
642            * specnode.  We will dissociate it from the specdev -- and
643            * possibly free the specdev -- in spec_node_destroy.
644            */
645           KASSERT(sn->sn_gone);
646           KASSERT(sn->sn_opencnt == 0);
647           for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];;
648                vpp = &(*vpp)->v_specnext) {
649                     if (*vpp == vp) {
650                               *vpp = vp->v_specnext;
651                               vp->v_specnext = NULL;
652                               break;
653                     }
654           }
655           mutex_exit(&device_lock);
656 }
657 
658 /*
659  * A vnode representing a special device is being recycled.
660  * Destroy the specfs component.
661  */
662 void
spec_node_destroy(vnode_t * vp)663 spec_node_destroy(vnode_t *vp)
664 {
665           specnode_t *sn;
666           specdev_t *sd;
667           int refcnt;
668 
669           sn = vp->v_specnode;
670           sd = sn->sn_dev;
671 
672           KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
673           KASSERT(vp->v_specnode != NULL);
674           KASSERT(sn->sn_opencnt == 0);
675 
676           mutex_enter(&device_lock);
677           sn = vp->v_specnode;
678           vp->v_specnode = NULL;
679           refcnt = sd->sd_refcnt--;
680           KASSERT(refcnt > 0);
681           mutex_exit(&device_lock);
682 
683           /* If the device is no longer in use, destroy our record. */
684           if (refcnt == 1) {
685                     KASSERT(sd->sd_iocnt == 0);
686                     KASSERT(sd->sd_opencnt == 0);
687                     KASSERT(sd->sd_bdevvp == NULL);
688                     kmem_free(sd, sizeof(*sd));
689           }
690           kmem_free(sn, sizeof(*sn));
691 }
692 
693 /*
694  * Trivial lookup routine that always fails.
695  */
696 int
spec_lookup(void * v)697 spec_lookup(void *v)
698 {
699           struct vop_lookup_v2_args /* {
700                     struct vnode *a_dvp;
701                     struct vnode **a_vpp;
702                     struct componentname *a_cnp;
703           } */ *ap = v;
704 
705           *ap->a_vpp = NULL;
706           return ENOTDIR;
707 }
708 
709 typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *);
710 
711 /*
712  * Open a special file.
713  */
714 /* ARGSUSED */
715 int
spec_open(void * v)716 spec_open(void *v)
717 {
718           struct vop_open_args /* {
719                     struct vnode *a_vp;
720                     int  a_mode;
721                     kauth_cred_t a_cred;
722           } */ *ap = v;
723           struct lwp *l = curlwp;
724           struct vnode *vp = ap->a_vp;
725           dev_t dev, dev1;
726           int error;
727           enum kauth_device_req req;
728           specnode_t *sn, *sn1;
729           specdev_t *sd;
730           int dtype;
731           spec_ioctl_t ioctl;
732           u_int gen = 0;
733           const char *name = NULL;
734           bool needclose = false;
735 
736           KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
737           KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
738               vp->v_type);
739 
740           dev = vp->v_rdev;
741           sn = vp->v_specnode;
742           sd = sn->sn_dev;
743 
744           /*
745            * Don't allow open if fs is mounted -nodev.
746            */
747           if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
748                     return ENXIO;
749 
750           switch (ap->a_mode & (FREAD | FWRITE)) {
751           case FREAD | FWRITE:
752                     req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW;
753                     break;
754           case FWRITE:
755                     req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE;
756                     break;
757           default:
758                     req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ;
759                     break;
760           }
761           error = kauth_authorize_device_spec(ap->a_cred, req, vp);
762           if (error)
763                     return error;
764 
765           /*
766            * Acquire an open reference -- as long as we hold onto it, and
767            * the vnode isn't revoked, it can't be closed, and the vnode
768            * can't be revoked until we release the vnode lock.
769            */
770           mutex_enter(&device_lock);
771           KASSERT(!sn->sn_gone);
772           switch (vp->v_type) {
773           case VCHR:
774                     /*
775                      * Character devices can accept opens from multiple
776                      * vnodes.  But first, wait for any close to finish.
777                      * Wait under the vnode lock so we don't have to worry
778                      * about the vnode being revoked while we wait.
779                      */
780                     while (sd->sd_closing) {
781                               error = cv_wait_sig(&specfs_iocv, &device_lock);
782                               if (error)
783                                         break;
784                     }
785                     if (error)
786                               break;
787                     sd->sd_opencnt++;
788                     sn->sn_opencnt++;
789                     KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
790                         "sn_opencnt=%u > sd_opencnt=%u",
791                         sn->sn_opencnt, sd->sd_opencnt);
792                     break;
793           case VBLK:
794                     /*
795                      * For block devices, permit only one open.  The buffer
796                      * cache cannot remain self-consistent with multiple
797                      * vnodes holding a block device open.
798                      *
799                      * Treat zero opencnt with non-NULL mountpoint as open.
800                      * This may happen after forced detach of a mounted device.
801                      *
802                      * Also treat sd_closing, meaning there is a concurrent
803                      * close in progress, as still open.
804                      */
805                     if (sd->sd_opencnt != 0 ||
806                         sd->sd_mountpoint != NULL ||
807                         sd->sd_closing) {
808                               error = EBUSY;
809                               break;
810                     }
811                     KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
812                         sn->sn_opencnt);
813                     sn->sn_opencnt = 1;
814                     sd->sd_opencnt = 1;
815                     sd->sd_bdevvp = vp;
816                     break;
817           default:
818                     panic("invalid specfs vnode type: %d", vp->v_type);
819           }
820           mutex_exit(&device_lock);
821           if (error)
822                     return error;
823 
824           /*
825            * Set VV_ISTTY if this is a tty cdev.
826            *
827            * XXX This does the wrong thing if the module has to be
828            * autoloaded.  We should maybe set this after autoloading
829            * modules and calling .d_open successfully, except (a) we need
830            * the vnode lock to touch it, and (b) once we acquire the
831            * vnode lock again, the vnode may have been revoked, and
832            * deadfs's dead_read needs VV_ISTTY to be already set in order
833            * to return the right answer.  So this needs some additional
834            * synchronization to be made to work correctly with tty driver
835            * module autoload.  For now, let's just hope it doesn't cause
836            * too much trouble for a tty from an autoloaded driver module
837            * to fail with EIO instead of returning EOF.
838            */
839           if (vp->v_type == VCHR) {
840                     if (cdev_type(dev) == D_TTY)
841                               vp->v_vflag |= VV_ISTTY;
842           }
843 
844           /*
845            * Because opening the device may block indefinitely, e.g. when
846            * opening a tty, and loading a module may cross into many
847            * other subsystems, we must not hold the vnode lock while
848            * calling .d_open, so release it now and reacquire it when
849            * done.
850            *
851            * Take an I/O reference so that any concurrent spec_close via
852            * spec_node_revoke will wait for us to finish calling .d_open.
853            * The vnode can't be dead at this point because we have it
854            * locked.  Note that if revoked, the driver must interrupt
855            * .d_open before spec_close starts waiting for I/O to drain so
856            * this doesn't deadlock.
857            */
858           VOP_UNLOCK(vp);
859           error = spec_io_enter(vp, &sn1, &dev1);
860           if (error) {
861                     vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
862                     return error;
863           }
864           KASSERT(sn1 == sn);
865           KASSERT(dev1 == dev);
866 
867           /*
868            * Open the device.  If .d_open returns ENXIO (device not
869            * configured), the driver may not be loaded, so try
870            * autoloading a module and then try .d_open again if anything
871            * got loaded.
872            */
873           switch (vp->v_type) {
874           case VCHR:
875                     do {
876                               const struct cdevsw *cdev;
877 
878                               gen = module_gen;
879                               error = cdev_open(dev, ap->a_mode, S_IFCHR, l);
880                               if (error != ENXIO)
881                                         break;
882 
883                               /* Check if we already have a valid driver */
884                               mutex_enter(&device_lock);
885                               cdev = cdevsw_lookup(dev);
886                               mutex_exit(&device_lock);
887                               if (cdev != NULL)
888                                         break;
889 
890                               /* Get device name from devsw_conv array */
891                               if ((name = cdevsw_getname(major(dev))) == NULL)
892                                         break;
893 
894                               /* Try to autoload device module */
895                               (void)module_autoload(name, MODULE_CLASS_DRIVER);
896                     } while (gen != module_gen);
897                     break;
898 
899           case VBLK:
900                     do {
901                               const struct bdevsw *bdev;
902 
903                               gen = module_gen;
904                               error = bdev_open(dev, ap->a_mode, S_IFBLK, l);
905                               if (error != ENXIO)
906                                         break;
907 
908                               /* Check if we already have a valid driver */
909                               mutex_enter(&device_lock);
910                               bdev = bdevsw_lookup(dev);
911                               mutex_exit(&device_lock);
912                               if (bdev != NULL)
913                                         break;
914 
915                               /* Get device name from devsw_conv array */
916                               if ((name = bdevsw_getname(major(dev))) == NULL)
917                                         break;
918 
919                               /* Try to autoload device module */
920                               (void)module_autoload(name, MODULE_CLASS_DRIVER);
921                     } while (gen != module_gen);
922                     break;
923 
924           default:
925                     __unreachable();
926           }
927 
928           /*
929            * Release the I/O reference now that we have called .d_open,
930            * and reacquire the vnode lock.  At this point, the device may
931            * have been revoked, so we must tread carefully.  However, sn
932            * and sd remain valid pointers until we drop our reference.
933            */
934           spec_io_exit(vp, sn);
935           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
936           KASSERT(vp->v_specnode == sn);
937 
938           /*
939            * If it has been revoked since we released the vnode lock and
940            * reacquired it, then spec_node_revoke has closed it, and we
941            * must fail with EBADF.
942            *
943            * Otherwise, if opening it failed, back out and release the
944            * open reference.  If it was ever successfully opened and we
945            * got the last reference this way, it's now our job to close
946            * it.  This might happen in the following scenario:
947            *
948            *        Thread 1            Thread 2
949            *        VOP_OPEN
950            *          ...
951            *          .d_open -> 0 (success)
952            *          acquire vnode lock
953            *          do stuff                    VOP_OPEN
954            *          release vnode lock          ...
955            *                                        .d_open -> EBUSY
956            *        VOP_CLOSE
957            *          acquire vnode lock
958            *          --sd_opencnt != 0
959            *          => no .d_close
960            *          release vnode lock
961            *                                        acquire vnode lock
962            *                                        --sd_opencnt == 0
963            *
964            * We can't resolve this by making spec_close wait for .d_open
965            * to complete before examining sd_opencnt, because .d_open can
966            * hang indefinitely, e.g. for a tty.
967            */
968           mutex_enter(&device_lock);
969           if (sn->sn_gone) {
970                     if (error == 0)
971                               error = EBADF;
972           } else if (error == 0) {
973                     /*
974                      * Device has not been revoked, so our opencnt can't
975                      * have gone away at this point -- transition to
976                      * sn_gone=true happens before transition to
977                      * sn_opencnt=0 in spec_node_revoke.
978                      */
979                     KASSERT(sd->sd_opencnt);
980                     KASSERT(sn->sn_opencnt);
981                     KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
982                         "sn_opencnt=%u > sd_opencnt=%u",
983                         sn->sn_opencnt, sd->sd_opencnt);
984                     KASSERT(!sd->sd_closing);
985                     sd->sd_opened = true;
986           } else if (sd->sd_opencnt == 1 && sd->sd_opened) {
987                     /*
988                      * We're the last reference to a _previous_ open even
989                      * though this one failed, so we have to close it.
990                      * Don't decrement the reference count here --
991                      * spec_close will do that.
992                      */
993                     KASSERT(sn->sn_opencnt == 1);
994                     needclose = true;
995           } else {
996                     KASSERT(sd->sd_opencnt);
997                     KASSERT(sn->sn_opencnt);
998                     KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
999                         "sn_opencnt=%u > sd_opencnt=%u",
1000                         sn->sn_opencnt, sd->sd_opencnt);
1001                     sd->sd_opencnt--;
1002                     sn->sn_opencnt--;
1003                     if (vp->v_type == VBLK)
1004                               sd->sd_bdevvp = NULL;
1005           }
1006           mutex_exit(&device_lock);
1007 
1008           /*
1009            * If this open failed, but the device was previously opened,
1010            * and another thread concurrently closed the vnode while we
1011            * were in the middle of reopening it, the other thread will
1012            * see sd_opencnt > 0 and thus decide not to call .d_close --
1013            * it is now our responsibility to do so.
1014            *
1015            * XXX The flags passed to VOP_CLOSE here are wrong, but
1016            * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider
1017            * a device opened by thread 0 with O_READ, then opened by
1018            * thread 1 with O_WRITE, then closed by thread 0, and finally
1019            * closed by thread 1; the last .d_close call will have FWRITE
1020            * but not FREAD.  We should just eliminate the FREAD/FWRITE
1021            * parameter to .d_close altogether.
1022            */
1023           if (needclose) {
1024                     KASSERT(error);
1025                     VOP_CLOSE(vp, FNONBLOCK, NOCRED);
1026           }
1027 
1028           /* If anything went wrong, we're done.  */
1029           if (error)
1030                     return error;
1031 
1032           /*
1033            * For disk devices, automagically set the vnode size to the
1034            * partition size, if we can.  This applies to block devices
1035            * and character devices alike -- every block device must have
1036            * a corresponding character device.  And if the module is
1037            * loaded it will remain loaded until we're done here (it is
1038            * forbidden to devsw_detach until closed).  So it is safe to
1039            * query cdev_type unconditionally here.
1040            */
1041           switch (vp->v_type) {
1042           case VCHR:
1043                     ioctl = cdev_ioctl;
1044                     dtype = cdev_type(dev);
1045                     break;
1046           default:
1047                     ioctl = bdev_ioctl;
1048                     dtype = bdev_type(dev);
1049                     break;
1050           }
1051           if (dtype == D_DISK) {
1052                     struct partinfo pi;
1053                     off_t sz;
1054 
1055                     error = (*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp);
1056                     if (error == 0)
1057                               sz = (off_t)pi.pi_size * pi.pi_secsize;
1058                     else if (error == ENOTTY)
1059                               error = (*ioctl)(dev, DIOCGMEDIASIZE, &sz, FREAD, curlwp);
1060 
1061                     if (error == 0)
1062                               uvm_vnp_setsize(vp, (voff_t)sz);
1063           }
1064 
1065           /* Success!  */
1066           return 0;
1067 }
1068 
1069 /*
1070  * Vnode op for read
1071  */
1072 /* ARGSUSED */
1073 int
spec_read(void * v)1074 spec_read(void *v)
1075 {
1076           struct vop_read_args /* {
1077                     struct vnode *a_vp;
1078                     struct uio *a_uio;
1079                     int  a_ioflag;
1080                     kauth_cred_t a_cred;
1081           } */ *ap = v;
1082           struct vnode *vp = ap->a_vp;
1083           struct uio *uio = ap->a_uio;
1084           struct lwp *l = curlwp;
1085           struct specnode *sn;
1086           dev_t dev;
1087           struct buf *bp;
1088           daddr_t bn;
1089           int bsize, bscale;
1090           struct partinfo pi;
1091           int n, on;
1092           int error = 0;
1093           int i, nra;
1094           daddr_t lastbn, *rablks;
1095           int *rasizes;
1096           int nrablks, ratogo;
1097 
1098           KASSERT(uio->uio_rw == UIO_READ);
1099           KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
1100                     uio->uio_vmspace == curproc->p_vmspace),
1101               "vmspace belongs to neither kernel nor curproc");
1102 
1103           if (uio->uio_resid == 0)
1104                     return 0;
1105 
1106           switch (vp->v_type) {
1107 
1108           case VCHR:
1109                     /*
1110                      * Release the lock while we sleep -- possibly
1111                      * indefinitely, if this is, e.g., a tty -- in
1112                      * cdev_read, so we don't hold up everything else that
1113                      * might want access to the vnode.
1114                      *
1115                      * But before we issue the read, take an I/O reference
1116                      * to the specnode so close will know when we're done
1117                      * reading.  Note that the moment we release the lock,
1118                      * the vnode's identity may change; hence spec_io_enter
1119                      * may fail, and the caller may have a dead vnode on
1120                      * their hands, if the file system on which vp lived
1121                      * has been unmounted.
1122                      */
1123                     VOP_UNLOCK(vp);
1124                     error = spec_io_enter(vp, &sn, &dev);
1125                     if (error)
1126                               goto out;
1127                     error = cdev_read(dev, uio, ap->a_ioflag);
1128                     spec_io_exit(vp, sn);
1129 out:                /* XXX What if the caller held an exclusive lock?  */
1130                     vn_lock(vp, LK_SHARED | LK_RETRY);
1131                     return error;
1132 
1133           case VBLK:
1134                     KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
1135                     if (uio->uio_offset < 0)
1136                               return EINVAL;
1137 
1138                     if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
1139                               bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
1140                     else
1141                               bsize = BLKDEV_IOSIZE;
1142 
1143                     bscale = bsize >> DEV_BSHIFT;
1144 
1145                     nra = uimax(16 * MAXPHYS / bsize - 1, 511);
1146                     rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP);
1147                     rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP);
1148                     lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT)
1149                         &~ (bscale - 1);
1150                     nrablks = ratogo = 0;
1151                     do {
1152                               bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
1153                               on = uio->uio_offset % bsize;
1154                               n = uimin((unsigned)(bsize - on), uio->uio_resid);
1155 
1156                               if (ratogo == 0) {
1157                                         nrablks = uimin((lastbn - bn) / bscale, nra);
1158                                         ratogo = nrablks;
1159 
1160                                         for (i = 0; i < nrablks; ++i) {
1161                                                   rablks[i] = bn + (i+1) * bscale;
1162                                                   rasizes[i] = bsize;
1163                                         }
1164 
1165                                         error = breadn(vp, bn, bsize,
1166                                             rablks, rasizes, nrablks,
1167                                             0, &bp);
1168                               } else {
1169                                         if (ratogo > 0)
1170                                                   --ratogo;
1171                                         error = bread(vp, bn, bsize, 0, &bp);
1172                               }
1173                               if (error)
1174                                         break;
1175                               n = uimin(n, bsize - bp->b_resid);
1176                               error = uiomove((char *)bp->b_data + on, n, uio);
1177                               brelse(bp, 0);
1178                     } while (error == 0 && uio->uio_resid > 0 && n != 0);
1179 
1180                     kmem_free(rablks, nra * sizeof(*rablks));
1181                     kmem_free(rasizes, nra * sizeof(*rasizes));
1182 
1183                     return error;
1184 
1185           default:
1186                     panic("spec_read type");
1187           }
1188           /* NOTREACHED */
1189 }
1190 
1191 /*
1192  * Vnode op for write
1193  */
1194 /* ARGSUSED */
1195 int
spec_write(void * v)1196 spec_write(void *v)
1197 {
1198           struct vop_write_args /* {
1199                     struct vnode *a_vp;
1200                     struct uio *a_uio;
1201                     int  a_ioflag;
1202                     kauth_cred_t a_cred;
1203           } */ *ap = v;
1204           struct vnode *vp = ap->a_vp;
1205           struct uio *uio = ap->a_uio;
1206           struct lwp *l = curlwp;
1207           struct specnode *sn;
1208           dev_t dev;
1209           struct buf *bp;
1210           daddr_t bn;
1211           int bsize, bscale;
1212           struct partinfo pi;
1213           int n, on;
1214           int error = 0;
1215 
1216           KASSERT(uio->uio_rw == UIO_WRITE);
1217           KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
1218                     uio->uio_vmspace == curproc->p_vmspace),
1219               "vmspace belongs to neither kernel nor curproc");
1220 
1221           switch (vp->v_type) {
1222 
1223           case VCHR:
1224                     /*
1225                      * Release the lock while we sleep -- possibly
1226                      * indefinitely, if this is, e.g., a tty -- in
1227                      * cdev_write, so we don't hold up everything else that
1228                      * might want access to the vnode.
1229                      *
1230                      * But before we issue the write, take an I/O reference
1231                      * to the specnode so close will know when we're done
1232                      * writing.  Note that the moment we release the lock,
1233                      * the vnode's identity may change; hence spec_io_enter
1234                      * may fail, and the caller may have a dead vnode on
1235                      * their hands, if the file system on which vp lived
1236                      * has been unmounted.
1237                      */
1238                     VOP_UNLOCK(vp);
1239                     error = spec_io_enter(vp, &sn, &dev);
1240                     if (error)
1241                               goto out;
1242                     error = cdev_write(dev, uio, ap->a_ioflag);
1243                     spec_io_exit(vp, sn);
1244 out:                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1245                     return error;
1246 
1247           case VBLK:
1248                     KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
1249                     if (uio->uio_resid == 0)
1250                               return 0;
1251                     if (uio->uio_offset < 0)
1252                               return EINVAL;
1253 
1254                     if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
1255                               bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
1256                     else
1257                               bsize = BLKDEV_IOSIZE;
1258 
1259                     bscale = bsize >> DEV_BSHIFT;
1260                     do {
1261                               bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
1262                               on = uio->uio_offset % bsize;
1263                               n = uimin((unsigned)(bsize - on), uio->uio_resid);
1264                               if (n == bsize)
1265                                         bp = getblk(vp, bn, bsize, 0, 0);
1266                               else
1267                                         error = bread(vp, bn, bsize, B_MODIFY, &bp);
1268                               if (error) {
1269                                         return error;
1270                               }
1271                               n = uimin(n, bsize - bp->b_resid);
1272                               error = uiomove((char *)bp->b_data + on, n, uio);
1273                               if (error)
1274                                         brelse(bp, 0);
1275                               else {
1276                                         if (n + on == bsize)
1277                                                   bawrite(bp);
1278                                         else
1279                                                   bdwrite(bp);
1280                                         error = bp->b_error;
1281                               }
1282                     } while (error == 0 && uio->uio_resid > 0 && n != 0);
1283                     return error;
1284 
1285           default:
1286                     panic("spec_write type");
1287           }
1288           /* NOTREACHED */
1289 }
1290 
1291 /*
1292  * fdiscard, which on disk devices becomes TRIM.
1293  */
1294 int
spec_fdiscard(void * v)1295 spec_fdiscard(void *v)
1296 {
1297           struct vop_fdiscard_args /* {
1298                     struct vnode *a_vp;
1299                     off_t a_pos;
1300                     off_t a_len;
1301           } */ *ap = v;
1302           struct vnode *vp = ap->a_vp;
1303           dev_t dev;
1304 
1305           KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1306 
1307           dev = vp->v_rdev;
1308 
1309           switch (vp->v_type) {
1310           case VCHR:
1311 #if 0               /* This is not stored for character devices. */
1312                     KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp);
1313 #endif
1314                     return cdev_discard(dev, ap->a_pos, ap->a_len);
1315           case VBLK:
1316                     KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
1317                     return bdev_discard(dev, ap->a_pos, ap->a_len);
1318           default:
1319                     panic("spec_fdiscard: not a device\n");
1320           }
1321 }
1322 
1323 /*
1324  * Device ioctl operation.
1325  */
1326 /* ARGSUSED */
1327 int
spec_ioctl(void * v)1328 spec_ioctl(void *v)
1329 {
1330           struct vop_ioctl_args /* {
1331                     struct vnode *a_vp;
1332                     u_long a_command;
1333                     void  *a_data;
1334                     int  a_fflag;
1335                     kauth_cred_t a_cred;
1336           } */ *ap = v;
1337           struct vnode *vp = ap->a_vp;
1338           struct specnode *sn;
1339           dev_t dev;
1340           int error;
1341 
1342           error = spec_io_enter(vp, &sn, &dev);
1343           if (error)
1344                     return error;
1345 
1346           switch (vp->v_type) {
1347           case VCHR:
1348                     error = cdev_ioctl(dev, ap->a_command, ap->a_data,
1349                         ap->a_fflag, curlwp);
1350                     break;
1351           case VBLK:
1352                     KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
1353                     error = bdev_ioctl(dev, ap->a_command, ap->a_data,
1354                         ap->a_fflag, curlwp);
1355                     break;
1356           default:
1357                     panic("spec_ioctl");
1358                     /* NOTREACHED */
1359           }
1360 
1361           spec_io_exit(vp, sn);
1362           return error;
1363 }
1364 
1365 /* ARGSUSED */
1366 int
spec_poll(void * v)1367 spec_poll(void *v)
1368 {
1369           struct vop_poll_args /* {
1370                     struct vnode *a_vp;
1371                     int a_events;
1372           } */ *ap = v;
1373           struct vnode *vp = ap->a_vp;
1374           struct specnode *sn;
1375           dev_t dev;
1376           int revents;
1377 
1378           if (spec_io_enter(vp, &sn, &dev) != 0)
1379                     return POLLERR;
1380 
1381           switch (vp->v_type) {
1382           case VCHR:
1383                     revents = cdev_poll(dev, ap->a_events, curlwp);
1384                     break;
1385           default:
1386                     revents = genfs_poll(v);
1387                     break;
1388           }
1389 
1390           spec_io_exit(vp, sn);
1391           return revents;
1392 }
1393 
1394 /* ARGSUSED */
1395 int
spec_kqfilter(void * v)1396 spec_kqfilter(void *v)
1397 {
1398           struct vop_kqfilter_args /* {
1399                     struct vnode        *a_vp;
1400                     struct proc         *a_kn;
1401           } */ *ap = v;
1402           struct vnode *vp = ap->a_vp;
1403           struct specnode *sn;
1404           dev_t dev;
1405           int error;
1406 
1407           error = spec_io_enter(vp, &sn, &dev);
1408           if (error)
1409                     return error;
1410 
1411           switch (vp->v_type) {
1412           case VCHR:
1413                     error = cdev_kqfilter(dev, ap->a_kn);
1414                     break;
1415           default:
1416                     /*
1417                      * Block devices don't support kqfilter, and refuse it
1418                      * for any other files (like those vflush()ed) too.
1419                      */
1420                     error = EOPNOTSUPP;
1421                     break;
1422           }
1423 
1424           spec_io_exit(vp, sn);
1425           return error;
1426 }
1427 
1428 /*
1429  * Allow mapping of only D_DISK.  This is called only for VBLK.
1430  */
1431 int
spec_mmap(void * v)1432 spec_mmap(void *v)
1433 {
1434           struct vop_mmap_args /* {
1435                     struct vnode *a_vp;
1436                     vm_prot_t a_prot;
1437                     kauth_cred_t a_cred;
1438           } */ *ap = v;
1439           struct vnode *vp = ap->a_vp;
1440           struct specnode *sn;
1441           dev_t dev;
1442           int error;
1443 
1444           KASSERT(vp->v_type == VBLK);
1445 
1446           error = spec_io_enter(vp, &sn, &dev);
1447           if (error)
1448                     return error;
1449 
1450           error = bdev_type(dev) == D_DISK ? 0 : EINVAL;
1451 
1452           spec_io_exit(vp, sn);
1453           return 0;
1454 }
1455 
1456 /*
1457  * Synch buffers associated with a block device
1458  */
1459 /* ARGSUSED */
1460 int
spec_fsync(void * v)1461 spec_fsync(void *v)
1462 {
1463           struct vop_fsync_args /* {
1464                     struct vnode *a_vp;
1465                     kauth_cred_t a_cred;
1466                     int  a_flags;
1467                     off_t offlo;
1468                     off_t offhi;
1469           } */ *ap = v;
1470           struct vnode *vp = ap->a_vp;
1471           struct mount *mp;
1472           int error;
1473 
1474           if (vp->v_type == VBLK) {
1475                     if ((mp = spec_node_getmountedfs(vp)) != NULL) {
1476                               error = VFS_FSYNC(mp, vp, ap->a_flags);
1477                               if (error != EOPNOTSUPP)
1478                                         return error;
1479                     }
1480                     return vflushbuf(vp, ap->a_flags);
1481           }
1482           return 0;
1483 }
1484 
1485 /*
1486  * Just call the device strategy routine
1487  */
1488 int
spec_strategy(void * v)1489 spec_strategy(void *v)
1490 {
1491           struct vop_strategy_args /* {
1492                     struct vnode *a_vp;
1493                     struct buf *a_bp;
1494           } */ *ap = v;
1495           struct vnode *vp = ap->a_vp;
1496           struct buf *bp = ap->a_bp;
1497           struct specnode *sn = NULL;
1498           dev_t dev;
1499           int error;
1500 
1501           error = spec_io_enter(vp, &sn, &dev);
1502           if (error)
1503                     goto out;
1504 
1505           bp->b_dev = dev;
1506 
1507           if (!(bp->b_flags & B_READ)) {
1508 #ifdef DIAGNOSTIC
1509                     if (bp->b_vp && bp->b_vp->v_type == VBLK) {
1510                               struct mount *mp = spec_node_getmountedfs(bp->b_vp);
1511 
1512                               if (mp && (mp->mnt_flag & MNT_RDONLY)) {
1513                                         printf("%s blk %"PRId64" written while ro!\n",
1514                                             mp->mnt_stat.f_mntonname, bp->b_blkno);
1515 #ifdef DDB
1516                                         db_stacktrace();
1517 #endif
1518                               }
1519                     }
1520 #endif /* DIAGNOSTIC */
1521                     error = fscow_run(bp, false);
1522                     if (error)
1523                               goto out;
1524           }
1525           bdev_strategy(bp);
1526 
1527           error = 0;
1528 
1529 out:      if (sn)
1530                     spec_io_exit(vp, sn);
1531           if (error) {
1532                     bp->b_error = error;
1533                     bp->b_resid = bp->b_bcount;
1534                     biodone(bp);
1535           }
1536           return error;
1537 }
1538 
1539 int
spec_inactive(void * v)1540 spec_inactive(void *v)
1541 {
1542           struct vop_inactive_v2_args /* {
1543                     struct vnode *a_vp;
1544                     struct bool *a_recycle;
1545           } */ *ap = v;
1546 
1547           KASSERT(ap->a_vp->v_mount == dead_rootmount);
1548           *ap->a_recycle = true;
1549 
1550           return 0;
1551 }
1552 
1553 int
spec_reclaim(void * v)1554 spec_reclaim(void *v)
1555 {
1556           struct vop_reclaim_v2_args /* {
1557                     struct vnode *a_vp;
1558           } */ *ap = v;
1559           struct vnode *vp = ap->a_vp;
1560 
1561           KASSERT(vp->v_specnode->sn_opencnt == 0);
1562 
1563           VOP_UNLOCK(vp);
1564 
1565           KASSERT(vp->v_mount == dead_rootmount);
1566           return 0;
1567 }
1568 
1569 /*
1570  * This is a noop, simply returning what one has been given.
1571  */
1572 int
spec_bmap(void * v)1573 spec_bmap(void *v)
1574 {
1575           struct vop_bmap_args /* {
1576                     struct vnode *a_vp;
1577                     daddr_t  a_bn;
1578                     struct vnode **a_vpp;
1579                     daddr_t *a_bnp;
1580                     int *a_runp;
1581           } */ *ap = v;
1582 
1583           if (ap->a_vpp != NULL)
1584                     *ap->a_vpp = ap->a_vp;
1585           if (ap->a_bnp != NULL)
1586                     *ap->a_bnp = ap->a_bn;
1587           if (ap->a_runp != NULL)
1588                     *ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1;
1589           return 0;
1590 }
1591 
/*
 * Device close routine
 *
 * Arguments, via struct vop_close_args:
 *	a_vp	vnode being closed; must be exclusively locked on entry
 *		(asserted below)
 *	a_fflag	file flags, handed to the driver's cancel and close entry
 *		points; FNONBLOCK is added if vdead_check() reports the
 *		vnode as dying
 *	a_cred	credentials, used only for vinvalbuf() on block devices
 *
 * Outline:
 *	1. Snapshot the specnode, specdev and device number under
 *	   v_interlock.
 *	2. Per-type work: for VCHR, drop the session's controlling
 *	   terminal reference when applicable; for VBLK, invalidate
 *	   in-core buffers with vinvalbuf(V_SAVE).
 *	3. Under device_lock, decrement the per-node and per-device open
 *	   counts.  If the device is still open elsewhere, return 0
 *	   without calling the driver.
 *	4. Otherwise mark the device as closing, cancel and drain I/O,
 *	   call the driver's close routine, drain again, clear sd_closing
 *	   and wake anyone waiting on specfs_iocv.
 *
 * Unless FNONBLOCK is set, the vnode lock is released around step 4 and
 * reacquired before returning.
 *
 * Returns 0 if this was not the last close of the device, the
 * vinvalbuf() error if invalidating a block device's buffers failed,
 * and otherwise the result of bdev_close()/cdev_close().
 */
/* ARGSUSED */
int
spec_close(void *v)
{
	struct vop_close_args /* {
		struct vnode *a_vp;
		int  a_fflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct session *sess;
	dev_t dev;
	int flags = ap->a_fflag;
	int mode, error, count;
	specnode_t *sn;
	specdev_t *sd;

	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);

	/*
	 * Capture the specnode, device number and specdev while holding
	 * the interlock; the rest of the function uses these copies.
	 */
	mutex_enter(vp->v_interlock);
	sn = vp->v_specnode;
	dev = vp->v_rdev;
	sd = sn->sn_dev;
	/*
	 * If we're going away soon, make this non-blocking.
	 * Also ensures that we won't wedge in vn_lock below.
	 */
	if (vdead_check(vp, VDEAD_NOWAIT) != 0)
		flags |= FNONBLOCK;
	mutex_exit(vp->v_interlock);

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.  We
		 * cannot easily tell that a character device is a
		 * controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case, if the
		 * open count is 1 release the reference from the
		 * session.  Also, remove the link from the tty back to
		 * the session and pgrp.
		 *
		 * XXX V. fishy.
		 */
		mutex_enter(&proc_lock);
		sess = curlwp->l_proc->p_session;
		if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) {
			mutex_spin_enter(&tty_lock);
			sess->s_ttyvp = NULL;
			if (sess->s_ttyp->t_session != NULL) {
				sess->s_ttyp->t_pgrp = NULL;
				sess->s_ttyp->t_session = NULL;
				mutex_spin_exit(&tty_lock);
				/* Releases proc_lock. */
				proc_sessrele(sess);
			} else {
				mutex_spin_exit(&tty_lock);
				if (sess->s_ttyp->t_pgrp != NULL)
					panic("spec_close: spurious pgrp ref");
				mutex_exit(&proc_lock);
			}
			/* Drop the vnode reference the session was holding. */
			vrele(vp);
		} else
			mutex_exit(&proc_lock);

		/*
		 * If the vnode is locked, then we are in the midst
		 * of forcibly closing the device, otherwise we only
		 * close on last reference.
		 */
		mode = S_IFCHR;
		break;

	case VBLK:
		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0);
		/*
		 * NOTE(review): this early return happens before the open
		 * counts are decremented and before sd_bdevvp is cleared
		 * below -- confirm callers cope with a failed close that
		 * leaves the device accounted as open.
		 */
		if (error)
			return error;
		/*
		 * We do not want to really close the device if it
		 * is still in use unless we are trying to close it
		 * forcibly. Since every use (buffer, vnode, swap, cmap)
		 * holds a reference to the vnode, and because we mark
		 * any other vnodes that alias this device, when the
		 * sum of the reference counts on all the aliased
		 * vnodes descends to one, we are on last close.
		 */
		mode = S_IFBLK;
		break;

	default:
		panic("spec_close: not special");
	}

	/*
	 * Decrement the open reference count of this node and the
	 * device.  For block devices, the open reference count must be
	 * 1 at this point.  If the device's open reference count goes
	 * to zero, we're the last one out so get the lights.
	 *
	 * We may find --sd->sd_opencnt gives zero, and yet
	 * sd->sd_opened is false.  This happens if the vnode is
	 * revoked at the same time as it is being opened, which can
	 * happen when opening a tty blocks indefinitely.  In that
	 * case, we still must call close -- it is the job of close to
	 * interrupt the open.  Either way, the device will be no
	 * longer opened, so we have to clear sd->sd_opened; subsequent
	 * opens will have responsibility for issuing close.
	 *
	 * This has the side effect that the sequence of opens might
	 * happen out of order -- we might end up doing open, open,
	 * close, close, instead of open, close, open, close.  This is
	 * unavoidable with the current devsw API, where open is
	 * allowed to block and close must be able to run concurrently
	 * to interrupt it.  It is the driver's responsibility to
	 * ensure that close is idempotent so that this works.  Drivers
	 * requiring per-open state and exact 1:1 correspondence
	 * between open and close can use fd_clone.
	 */
	mutex_enter(&device_lock);
	KASSERT(sn->sn_opencnt);
	KASSERT(sd->sd_opencnt);
	KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
	    "sn_opencnt=%u > sd_opencnt=%u",
	    sn->sn_opencnt, sd->sd_opencnt);
	sn->sn_opencnt--;
	count = --sd->sd_opencnt;
	if (vp->v_type == VBLK) {
		KASSERTMSG(count == 0, "block device with %u opens",
		    count + 1);
		sd->sd_bdevvp = NULL;
	}
	if (count == 0) {
		/* Last close: flag the device as closing until we finish. */
		KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
		    sn->sn_opencnt);
		KASSERT(!sd->sd_closing);
		sd->sd_opened = false;
		sd->sd_closing = true;
	}
	mutex_exit(&device_lock);

	/* The device is still open elsewhere; the driver is not called. */
	if (count != 0)
		return 0;

	/*
	 * If we're able to block, release the vnode lock & reacquire. We
	 * might end up sleeping for someone else who wants our queues. They
	 * won't get them if we hold the vnode locked.
	 */
	if (!(flags & FNONBLOCK))
		VOP_UNLOCK(vp);

	/*
	 * If we can cancel all outstanding I/O, then wait for it to
	 * drain before we call .d_close.  Drivers that split up
	 * .d_cancel and .d_close this way need not have any internal
	 * mechanism for waiting in .d_close for I/O to drain.
	 *
	 * The only cancel failure tolerated is ENODEV (asserted); in
	 * that case the drain before close is skipped.
	 */
	if (vp->v_type == VBLK)
		error = bdev_cancel(dev, flags, mode, curlwp);
	else
		error = cdev_cancel(dev, flags, mode, curlwp);
	if (error == 0)
		spec_io_drain(sd);
	else
		KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d",
		    (unsigned long)dev, error);

	/* The close result replaces the cancel result as the return value. */
	if (vp->v_type == VBLK)
		error = bdev_close(dev, flags, mode, curlwp);
	else
		error = cdev_close(dev, flags, mode, curlwp);

	/*
	 * Wait for all other devsw operations to drain.  After this
	 * point, no bdev/cdev_* can be active for this specdev.
	 */
	spec_io_drain(sd);

	/*
	 * Wake any spec_open calls waiting for close to finish -- do
	 * this before reacquiring the vnode lock, because spec_open
	 * holds the vnode lock while waiting, so doing this after
	 * reacquiring the lock would deadlock.
	 */
	mutex_enter(&device_lock);
	KASSERT(!sd->sd_opened);
	KASSERT(sd->sd_closing);
	sd->sd_closing = false;
	cv_broadcast(&specfs_iocv);
	mutex_exit(&device_lock);

	/* Retake the vnode lock if it was dropped above. */
	if (!(flags & FNONBLOCK))
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	return error;
}
1799 
1800 /*
1801  * Print out the contents of a special device vnode.
1802  */
1803 int
spec_print(void * v)1804 spec_print(void *v)
1805 {
1806           struct vop_print_args /* {
1807                     struct vnode *a_vp;
1808           } */ *ap = v;
1809 
1810           printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev),
1811               (unsigned long long)minor(ap->a_vp->v_rdev));
1812           return 0;
1813 }
1814 
1815 /*
1816  * Return POSIX pathconf information applicable to special devices.
1817  */
1818 int
spec_pathconf(void * v)1819 spec_pathconf(void *v)
1820 {
1821           struct vop_pathconf_args /* {
1822                     struct vnode *a_vp;
1823                     int a_name;
1824                     register_t *a_retval;
1825           } */ *ap = v;
1826 
1827           switch (ap->a_name) {
1828           case _PC_LINK_MAX:
1829                     *ap->a_retval = LINK_MAX;
1830                     return 0;
1831           case _PC_MAX_CANON:
1832                     *ap->a_retval = MAX_CANON;
1833                     return 0;
1834           case _PC_MAX_INPUT:
1835                     *ap->a_retval = MAX_INPUT;
1836                     return 0;
1837           case _PC_PIPE_BUF:
1838                     *ap->a_retval = PIPE_BUF;
1839                     return 0;
1840           case _PC_CHOWN_RESTRICTED:
1841                     *ap->a_retval = 1;
1842                     return 0;
1843           case _PC_VDISABLE:
1844                     *ap->a_retval = _POSIX_VDISABLE;
1845                     return 0;
1846           case _PC_SYNC_IO:
1847                     *ap->a_retval = 1;
1848                     return 0;
1849           default:
1850                     return genfs_pathconf(ap);
1851           }
1852           /* NOTREACHED */
1853 }
1854 
1855 /*
1856  * Advisory record locking support.
1857  */
1858 int
spec_advlock(void * v)1859 spec_advlock(void *v)
1860 {
1861           struct vop_advlock_args /* {
1862                     struct vnode *a_vp;
1863                     void *a_id;
1864                     int a_op;
1865                     struct flock *a_fl;
1866                     int a_flags;
1867           } */ *ap = v;
1868           struct vnode *vp = ap->a_vp;
1869 
1870           return lf_advlock(ap, &vp->v_speclockf, (off_t)0);
1871 }
1872