1 /*        $NetBSD: vfs_mount.c,v 1.110 2024/12/07 02:27:38 riastradh Exp $      */
2 
3 /*-
4  * Copyright (c) 1997-2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *        The Regents of the University of California.  All rights reserved.
36  * (c) UNIX System Laboratories, Inc.
37  * All or some portions of this file are derived from material licensed
38  * to the University of California by American Telephone and Telegraph
39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40  * the permission of UNIX System Laboratories, Inc.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *        @(#)vfs_subr.c      8.13 (Berkeley) 4/18/94
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.110 2024/12/07 02:27:38 riastradh Exp $");
71 
72 #include "veriexec.h"
73 
74 #include <sys/param.h>
75 #include <sys/kernel.h>
76 
77 #include <sys/atomic.h>
78 #include <sys/buf.h>
79 #include <sys/conf.h>
80 #include <sys/device.h>
81 #include <sys/extattr.h>
82 #include <sys/fcntl.h>
83 #include <sys/filedesc.h>
84 #include <sys/fstrans.h>
85 #include <sys/kauth.h>
86 #include <sys/kmem.h>
87 #include <sys/module.h>
88 #include <sys/mount.h>
89 #include <sys/namei.h>
90 #include <sys/sdt.h>
91 #include <sys/syscallargs.h>
92 #include <sys/sysctl.h>
93 #include <sys/systm.h>
94 #include <sys/verified_exec.h>
95 #include <sys/vfs_syscalls.h>
96 #include <sys/vnode_impl.h>
97 
98 #include <miscfs/deadfs/deadfs.h>
99 #include <miscfs/genfs/genfs.h>
100 #include <miscfs/specfs/specdev.h>
101 
102 #include <uvm/uvm_swap.h>
103 
/* Discriminates the two kinds of entry linked on the mount list. */
enum mountlist_type {
	ME_MOUNT,	/* Entry stands for a mount. */
	ME_MARKER	/* Entry is a marker. */
};
108 struct mountlist_entry {
109           TAILQ_ENTRY(mountlist_entry) me_list;   /* Mount list. */
110           struct mount *me_mount;                           /* Actual mount if ME_MOUNT,
111                                                                current mount else. */
112           enum mountlist_type me_type;            /* Mount or marker. */
113 };
114 struct mount_iterator {
115           struct mountlist_entry mi_entry;
116 };
117 
118 static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *,
119     bool (*)(void *, struct vnode *), void *, bool);
120 
121 /* Root filesystem. */
122 vnode_t *                     rootvnode;
123 
124 /* Mounted filesystem list. */
125 static TAILQ_HEAD(mountlist, mountlist_entry) mountlist;
126 static kmutex_t                         mountlist_lock __cacheline_aligned;
127 int vnode_offset_next_by_lru  /* XXX: ugly hack for pstat.c */
128     = offsetof(vnode_impl_t, vi_lrulist.tqe_next);
129 
130 kmutex_t                      vfs_list_lock __cacheline_aligned;
131 
132 static specificdata_domain_t  mount_specificdata_domain;
133 static kmutex_t                         mntid_lock;
134 
135 static kmutex_t                         mountgen_lock __cacheline_aligned;
136 static uint64_t                         mountgen;
137 
138 void
vfs_mount_sysinit(void)139 vfs_mount_sysinit(void)
140 {
141 
142           TAILQ_INIT(&mountlist);
143           mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
144           mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
145 
146           mount_specificdata_domain = specificdata_domain_create();
147           mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
148           mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
149           mountgen = 0;
150 }
151 
152 struct mount *
vfs_mountalloc(struct vfsops * vfsops,vnode_t * vp)153 vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
154 {
155           struct mount *mp;
156           int error __diagused;
157 
158           mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
159           mp->mnt_op = vfsops;
160           mp->mnt_refcnt = 1;
161           TAILQ_INIT(&mp->mnt_vnodelist);
162           mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
163           mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
164           mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
165           mp->mnt_vnodecovered = vp;
166           mount_initspecific(mp);
167 
168           error = fstrans_mount(mp);
169           KASSERT(error == 0);
170 
171           mutex_enter(&mountgen_lock);
172           mp->mnt_gen = mountgen++;
173           mutex_exit(&mountgen_lock);
174 
175           return mp;
176 }
177 
178 /*
179  * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
180  * initialize a mount structure for it.
181  *
182  * Devname is usually updated by mount(8) after booting.
183  */
184 int
vfs_rootmountalloc(const char * fstypename,const char * devname,struct mount ** mpp)185 vfs_rootmountalloc(const char *fstypename, const char *devname,
186     struct mount **mpp)
187 {
188           struct vfsops *vfsp = NULL;
189           struct mount *mp;
190           int error __diagused;
191 
192           mutex_enter(&vfs_list_lock);
193           LIST_FOREACH(vfsp, &vfs_list, vfs_list)
194                     if (!strncmp(vfsp->vfs_name, fstypename,
195                         sizeof(mp->mnt_stat.f_fstypename)))
196                               break;
197           if (vfsp == NULL) {
198                     mutex_exit(&vfs_list_lock);
199                     return SET_ERROR(ENODEV);
200           }
201           vfsp->vfs_refcount++;
202           mutex_exit(&vfs_list_lock);
203 
204           if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
205                     return SET_ERROR(ENOMEM);
206           error = vfs_busy(mp);
207           KASSERT(error == 0);
208           mp->mnt_flag = MNT_RDONLY;
209           (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
210               sizeof(mp->mnt_stat.f_fstypename));
211           mp->mnt_stat.f_mntonname[0] = '/';
212           mp->mnt_stat.f_mntonname[1] = '\0';
213           mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
214               '\0';
215           (void)copystr(devname, mp->mnt_stat.f_mntfromname,
216               sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
217           *mpp = mp;
218           return 0;
219 }
220 
221 /*
222  * vfs_getnewfsid: get a new unique fsid.
223  */
224 void
vfs_getnewfsid(struct mount * mp)225 vfs_getnewfsid(struct mount *mp)
226 {
227           static u_short xxxfs_mntid;
228           struct mountlist_entry *me;
229           fsid_t tfsid;
230           int mtype;
231 
232           mutex_enter(&mntid_lock);
233           if (xxxfs_mntid == 0)
234                     ++xxxfs_mntid;
235           mtype = makefstype(mp->mnt_op->vfs_name);
236           tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
237           tfsid.__fsid_val[1] = mtype;
238           /* Always increment to not return the same fsid to parallel mounts. */
239           xxxfs_mntid++;
240 
241           /*
242            * Directly walk mountlist to prevent deadlock through
243            * mountlist_iterator_next() -> vfs_busy().
244            */
245           mutex_enter(&mountlist_lock);
246           for (me = TAILQ_FIRST(&mountlist); me != TAILQ_END(&mountlist); ) {
247                     if (me->me_type == ME_MOUNT &&
248                         me->me_mount->mnt_stat.f_fsidx.__fsid_val[0] ==
249                         tfsid.__fsid_val[0] &&
250                         me->me_mount->mnt_stat.f_fsidx.__fsid_val[1] ==
251                         tfsid.__fsid_val[1]) {
252                               tfsid.__fsid_val[0]++;
253                               xxxfs_mntid++;
254                               me = TAILQ_FIRST(&mountlist);
255                     } else {
256                               me = TAILQ_NEXT(me, me_list);
257                     }
258           }
259           mutex_exit(&mountlist_lock);
260 
261           mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
262           mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
263           mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
264           mutex_exit(&mntid_lock);
265 }
266 
267 /*
268  * Lookup a mount point by filesystem identifier.
269  *
270  * XXX Needs to add a reference to the mount point.
271  */
272 struct mount *
vfs_getvfs(fsid_t * fsid)273 vfs_getvfs(fsid_t *fsid)
274 {
275           mount_iterator_t *iter;
276           struct mount *mp;
277 
278           mountlist_iterator_init(&iter);
279           while ((mp = mountlist_iterator_next(iter)) != NULL) {
280                     if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
281                         mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
282                               mountlist_iterator_destroy(iter);
283                               return mp;
284                     }
285           }
286           mountlist_iterator_destroy(iter);
287           return NULL;
288 }
289 
290 /*
291  * Take a reference to a mount structure.
292  */
293 void
vfs_ref(struct mount * mp)294 vfs_ref(struct mount *mp)
295 {
296 
297           KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock));
298 
299           atomic_inc_uint(&mp->mnt_refcnt);
300 }
301 
302 /*
303  * Drop a reference to a mount structure, freeing if the last reference.
304  */
305 void
vfs_rele(struct mount * mp)306 vfs_rele(struct mount *mp)
307 {
308 
309           membar_release();
310           if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
311                     return;
312           }
313           membar_acquire();
314 
315           /*
316            * Nothing else has visibility of the mount: we can now
317            * free the data structures.
318            */
319           KASSERT(mp->mnt_refcnt == 0);
320           specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
321           mutex_obj_free(mp->mnt_updating);
322           mutex_obj_free(mp->mnt_renamelock);
323           mutex_obj_free(mp->mnt_vnodelock);
324           if (mp->mnt_op != NULL) {
325                     vfs_delref(mp->mnt_op);
326           }
327           fstrans_unmount(mp);
328           /*
329            * Final free of mp gets done from fstrans_mount_dtor().
330            *
331            * Prevents this memory to be reused as a mount before
332            * fstrans releases all references to it.
333            */
334 }
335 
336 /*
337  * Mark a mount point as busy, and gain a new reference to it.  Used to
338  * prevent the file system from being unmounted during critical sections.
339  *
340  * vfs_busy can be called multiple times and by multiple threads
341  * and must be accompanied by the same number of vfs_unbusy calls.
342  *
343  * => The caller must hold a pre-existing reference to the mount.
344  * => Will fail if the file system is being unmounted, or is unmounted.
345  */
346 static inline int
_vfs_busy(struct mount * mp,bool wait)347 _vfs_busy(struct mount *mp, bool wait)
348 {
349 
350           KASSERT(mp->mnt_refcnt > 0);
351 
352           if (wait) {
353                     fstrans_start(mp);
354           } else {
355                     if (fstrans_start_nowait(mp))
356                               return SET_ERROR(EBUSY);
357           }
358           if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
359                     fstrans_done(mp);
360                     return SET_ERROR(ENOENT);
361           }
362           vfs_ref(mp);
363           return 0;
364 }
365 
/*
 * vfs_busy: busy mount `mp' through _vfs_busy() with waiting enabled.
 * Returns whatever _vfs_busy() returns.
 */
int
vfs_busy(struct mount *mp)
{
	const bool wait = true;

	return _vfs_busy(mp, wait);
}
372 
/*
 * vfs_trybusy: busy mount `mp' through _vfs_busy() with waiting
 * disabled.  Returns whatever _vfs_busy() returns.
 */
int
vfs_trybusy(struct mount *mp)
{
	const bool wait = false;

	return _vfs_busy(mp, wait);
}
379 
380 /*
381  * Unbusy a busy filesystem.
382  *
383  * Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
384  */
385 void
vfs_unbusy(struct mount * mp)386 vfs_unbusy(struct mount *mp)
387 {
388 
389           KASSERT(mp->mnt_refcnt > 0);
390 
391           fstrans_done(mp);
392           vfs_rele(mp);
393 }
394 
395 /*
396  * Change a file systems lower mount.
397  * Both the current and the new lower mount may be NULL.  The caller
398  * guarantees exclusive access to the mount and holds a pre-existing
399  * reference to the new lower mount.
400  */
401 int
vfs_set_lowermount(struct mount * mp,struct mount * lowermp)402 vfs_set_lowermount(struct mount *mp, struct mount *lowermp)
403 {
404           struct mount *oldlowermp;
405           int error;
406 
407 #ifdef DEBUG
408           /*
409            * Limit the depth of file system stack so kernel sanitizers
410            * may stress mount/unmount without exhausting the kernel stack.
411            */
412           int depth;
413           struct mount *mp2;
414 
415           for (depth = 0, mp2 = lowermp; mp2; depth++, mp2 = mp2->mnt_lower) {
416                     if (depth == 23)
417                               return SET_ERROR(EINVAL);
418           }
419 #endif
420 
421           if (lowermp) {
422                     if (lowermp == dead_rootmount)
423                               return SET_ERROR(ENOENT);
424                     error = vfs_busy(lowermp);
425                     if (error)
426                               return error;
427                     vfs_ref(lowermp);
428           }
429 
430           oldlowermp = mp->mnt_lower;
431           mp->mnt_lower = lowermp;
432 
433           if (lowermp)
434                     vfs_unbusy(lowermp);
435 
436           if (oldlowermp)
437                     vfs_rele(oldlowermp);
438 
439           return 0;
440 }
441 
442 struct vnode_iterator {
443           vnode_impl_t vi_vnode;
444 };
445 
446 void
vfs_vnode_iterator_init(struct mount * mp,struct vnode_iterator ** vnip)447 vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip)
448 {
449           vnode_t *vp;
450           vnode_impl_t *vip;
451 
452           vp = vnalloc_marker(mp);
453           vip = VNODE_TO_VIMPL(vp);
454 
455           mutex_enter(mp->mnt_vnodelock);
456           TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes);
457           vp->v_usecount = 1;
458           mutex_exit(mp->mnt_vnodelock);
459 
460           *vnip = (struct vnode_iterator *)vip;
461 }
462 
463 void
vfs_vnode_iterator_destroy(struct vnode_iterator * vni)464 vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
465 {
466           vnode_impl_t *mvip = &vni->vi_vnode;
467           vnode_t *mvp = VIMPL_TO_VNODE(mvip);
468           kmutex_t *lock;
469 
470           KASSERT(vnis_marker(mvp));
471           if (vrefcnt(mvp) != 0) {
472                     lock = mvp->v_mount->mnt_vnodelock;
473                     mutex_enter(lock);
474                     TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes);
475                     mvp->v_usecount = 0;
476                     mutex_exit(lock);
477           }
478           vnfree_marker(mvp);
479 }
480 
481 static struct vnode *
vfs_vnode_iterator_next1(struct vnode_iterator * vni,bool (* f)(void *,struct vnode *),void * cl,bool do_wait)482 vfs_vnode_iterator_next1(struct vnode_iterator *vni,
483     bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
484 {
485           vnode_impl_t *mvip = &vni->vi_vnode;
486           struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
487           vnode_t *vp;
488           vnode_impl_t *vip;
489           kmutex_t *lock;
490           int error;
491 
492           KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));
493 
494           lock = mp->mnt_vnodelock;
495           do {
496                     mutex_enter(lock);
497                     vip = TAILQ_NEXT(mvip, vi_mntvnodes);
498                     TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
499                     VIMPL_TO_VNODE(mvip)->v_usecount = 0;
500 again:
501                     if (vip == NULL) {
502                               mutex_exit(lock);
503                               return NULL;
504                     }
505                     vp = VIMPL_TO_VNODE(vip);
506                     KASSERT(vp != NULL);
507                     mutex_enter(vp->v_interlock);
508                     if (vnis_marker(vp) ||
509                         vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) ||
510                         (f && !(*f)(cl, vp))) {
511                               mutex_exit(vp->v_interlock);
512                               vip = TAILQ_NEXT(vip, vi_mntvnodes);
513                               goto again;
514                     }
515 
516                     TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip,
517                         vi_mntvnodes);
518                     VIMPL_TO_VNODE(mvip)->v_usecount = 1;
519                     mutex_exit(lock);
520                     error = vcache_vget(vp);
521                     KASSERT(error == 0 || error == ENOENT);
522           } while (error != 0);
523 
524           return vp;
525 }
526 
/*
 * vfs_vnode_iterator_next: return the next vnode of the iterator that
 * the optional selector `f' (called with `cl') accepts, or NULL when
 * there is none.  Uses the common worker with waiting disabled.
 */
struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vni,
    bool (*f)(void *, struct vnode *), void *cl)
{
	const bool do_wait = false;

	return vfs_vnode_iterator_next1(vni, f, cl, do_wait);
}
534 
535 /*
536  * Move a vnode from one mount queue to another.
537  */
538 void
vfs_insmntque(vnode_t * vp,struct mount * mp)539 vfs_insmntque(vnode_t *vp, struct mount *mp)
540 {
541           vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
542           struct mount *omp;
543           kmutex_t *lock;
544 
545           KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
546               vp->v_tag == VT_VFS);
547 
548           /*
549            * Delete from old mount point vnode list, if on one.
550            */
551           if ((omp = vp->v_mount) != NULL) {
552                     lock = omp->mnt_vnodelock;
553                     mutex_enter(lock);
554                     TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes);
555                     mutex_exit(lock);
556           }
557 
558           /*
559            * Insert into list of vnodes for the new mount point, if
560            * available.  The caller must take a reference on the mount
561            * structure and donate to the vnode.
562            */
563           if ((vp->v_mount = mp) != NULL) {
564                     lock = mp->mnt_vnodelock;
565                     mutex_enter(lock);
566                     TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes);
567                     mutex_exit(lock);
568           }
569 
570           if (omp != NULL) {
571                     /* Release reference to old mount. */
572                     vfs_rele(omp);
573           }
574 }
575 
576 /*
577  * Remove any vnodes in the vnode table belonging to mount point mp.
578  *
579  * If FORCECLOSE is not specified, there should not be any active ones,
580  * return error if any are found (nb: this is a user error, not a
581  * system error). If FORCECLOSE is specified, detach any active vnodes
582  * that are found.
583  *
584  * If WRITECLOSE is set, only flush out regular file vnodes open for
585  * writing.
586  *
587  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
588  */
#ifdef DEBUG
/* When non-zero, vflush() prints the vnodes still busy on its last retry. */
int busyprt = 0;
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
593 
594 static vnode_t *
vflushnext(struct vnode_iterator * marker,int * when)595 vflushnext(struct vnode_iterator *marker, int *when)
596 {
597           if (getticks() > *when) {
598                     yield();
599                     *when = getticks() + hz / 10;
600           }
601           preempt_point();
602           return vfs_vnode_iterator_next1(marker, NULL, NULL, true);
603 }
604 
605 /*
606  * Flush one vnode.  Referenced on entry, unreferenced on return.
607  */
608 static int
vflush_one(vnode_t * vp,vnode_t * skipvp,int flags)609 vflush_one(vnode_t *vp, vnode_t *skipvp, int flags)
610 {
611           int error;
612           struct vattr vattr;
613 
614           if (vp == skipvp ||
615               ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) {
616                     vrele(vp);
617                     return 0;
618           }
619           /*
620            * If WRITECLOSE is set, only flush out regular file
621            * vnodes open for writing or open and unlinked.
622            */
623           if ((flags & WRITECLOSE)) {
624                     if (vp->v_type != VREG) {
625                               vrele(vp);
626                               return 0;
627                     }
628                     error = vn_lock(vp, LK_EXCLUSIVE);
629                     if (error) {
630                               KASSERT(error == ENOENT);
631                               vrele(vp);
632                               return 0;
633                     }
634                     error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0);
635                     if (error == 0)
636                               error = VOP_GETATTR(vp, &vattr, curlwp->l_cred);
637                     VOP_UNLOCK(vp);
638                     if (error) {
639                               vrele(vp);
640                               return error;
641                     }
642                     if (vp->v_writecount == 0 && vattr.va_nlink > 0) {
643                               vrele(vp);
644                               return 0;
645                     }
646           }
647           /*
648            * First try to recycle the vnode.
649            */
650           if (vrecycle(vp))
651                     return 0;
652           /*
653            * If FORCECLOSE is set, forcibly close the vnode.
654            * For block or character devices, revert to an
655            * anonymous device.  For all other files, just
656            * kill them.
657            */
658           if (flags & FORCECLOSE) {
659                     if (vrefcnt(vp) > 1 &&
660                         (vp->v_type == VBLK || vp->v_type == VCHR))
661                               vcache_make_anon(vp);
662                     else
663                               vgone(vp);
664                     return 0;
665           }
666           vrele(vp);
667           return SET_ERROR(EBUSY);
668 }
669 
670 int
vflush(struct mount * mp,vnode_t * skipvp,int flags)671 vflush(struct mount *mp, vnode_t *skipvp, int flags)
672 {
673           vnode_t *vp;
674           struct vnode_iterator *marker;
675           int busy, error, when, retries = 2;
676 
677           do {
678                     busy = error = when = 0;
679 
680                     /*
681                      * First, flush out any vnode references from the
682                      * deferred vrele list.
683                      */
684                     vrele_flush(mp);
685 
686                     vfs_vnode_iterator_init(mp, &marker);
687 
688                     while ((vp = vflushnext(marker, &when)) != NULL) {
689                               error = vflush_one(vp, skipvp, flags);
690                               if (error == EBUSY) {
691                                         error = 0;
692                                         busy++;
693 #ifdef DEBUG
694                                         if (busyprt && retries == 0)
695                                                   vprint("vflush: busy vnode", vp);
696 #endif
697                               } else if (error != 0) {
698                                         break;
699                               }
700                     }
701 
702                     vfs_vnode_iterator_destroy(marker);
703           } while (error == 0 && busy > 0 && retries-- > 0);
704 
705           if (error)
706                     return error;
707           if (busy)
708                     return SET_ERROR(EBUSY);
709           return 0;
710 }
711 
712 /*
713  * Mount a file system.
714  */
715 
716 /*
717  * Scan all active processes to see if any of them have a current or root
718  * directory onto which the new filesystem has just been  mounted. If so,
719  * replace them with the new mount point.
720  */
721 static void
mount_checkdirs(vnode_t * olddp)722 mount_checkdirs(vnode_t *olddp)
723 {
724           vnode_t *newdp, *rele1, *rele2;
725           struct cwdinfo *cwdi;
726           struct proc *p;
727           bool retry;
728 
729           if (vrefcnt(olddp) == 1) {
730                     return;
731           }
732           if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp))
733                     panic("mount: lost mount");
734 
735           do {
736                     retry = false;
737                     mutex_enter(&proc_lock);
738                     PROCLIST_FOREACH(p, &allproc) {
739                               if ((cwdi = p->p_cwdi) == NULL)
740                                         continue;
741                               /*
742                                * Cannot change to the old directory any more,
743                                * so even if we see a stale value it is not a
744                                * problem.
745                                */
746                               if (cwdi->cwdi_cdir != olddp &&
747                                   cwdi->cwdi_rdir != olddp)
748                                         continue;
749                               retry = true;
750                               rele1 = NULL;
751                               rele2 = NULL;
752                               atomic_inc_uint(&cwdi->cwdi_refcnt);
753                               mutex_exit(&proc_lock);
754                               rw_enter(&cwdi->cwdi_lock, RW_WRITER);
755                               if (cwdi->cwdi_cdir == olddp) {
756                                         rele1 = cwdi->cwdi_cdir;
757                                         vref(newdp);
758                                         cwdi->cwdi_cdir = newdp;
759                               }
760                               if (cwdi->cwdi_rdir == olddp) {
761                                         rele2 = cwdi->cwdi_rdir;
762                                         vref(newdp);
763                                         cwdi->cwdi_rdir = newdp;
764                               }
765                               rw_exit(&cwdi->cwdi_lock);
766                               cwdfree(cwdi);
767                               if (rele1 != NULL)
768                                         vrele(rele1);
769                               if (rele2 != NULL)
770                                         vrele(rele2);
771                               mutex_enter(&proc_lock);
772                               break;
773                     }
774                     mutex_exit(&proc_lock);
775           } while (retry);
776 
777           if (rootvnode == olddp) {
778                     vrele(rootvnode);
779                     vref(newdp);
780                     rootvnode = newdp;
781           }
782           vput(newdp);
783 }
784 
785 /*
786  * Start extended attributes
787  */
788 static int
start_extattr(struct mount * mp)789 start_extattr(struct mount *mp)
790 {
791           int error;
792 
793           error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
794           if (error)
795                     printf("%s: failed to start extattr: error = %d\n",
796                            mp->mnt_stat.f_mntonname, error);
797 
798           return error;
799 }
800 
801 int
mount_domount(struct lwp * l,vnode_t ** vpp,struct vfsops * vfsops,const char * path,int flags,void * data,size_t * data_len)802 mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
803     const char *path, int flags, void *data, size_t *data_len)
804 {
805           vnode_t *vp = *vpp;
806           struct mount *mp;
807           struct pathbuf *pb;
808           struct nameidata nd;
809           int error, error2;
810 
811           error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
812               KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
813           if (error) {
814                     vfs_delref(vfsops);
815                     return error;
816           }
817 
818           /* Cannot make a non-dir a mount-point (from here anyway). */
819           if (vp->v_type != VDIR) {
820                     vfs_delref(vfsops);
821                     return SET_ERROR(ENOTDIR);
822           }
823 
824           if (flags & MNT_EXPORTED) {
825                     vfs_delref(vfsops);
826                     return SET_ERROR(EINVAL);
827           }
828 
829           if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
830                     vfs_delref(vfsops);
831                     return SET_ERROR(ENOMEM);
832           }
833 
834           mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
835 
836           /*
837            * The underlying file system may refuse the mount for
838            * various reasons.  Allow the user to force it to happen.
839            *
840            * Set the mount level flags.
841            */
842           mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
843 
844           error = VFS_MOUNT(mp, path, data, data_len);
845           mp->mnt_flag &= ~MNT_OP_FLAGS;
846 
847           if (error != 0) {
848                     vfs_rele(mp);
849                     return error;
850           }
851 
852           /* Suspend new file system before taking mnt_updating. */
853           do {
854                     error2 = vfs_suspend(mp, 0);
855           } while (error2 == EINTR || error2 == ERESTART);
856           KASSERT(error2 == 0 || error2 == EOPNOTSUPP);
857           mutex_enter(mp->mnt_updating);
858 
859           /*
860            * Validate and prepare the mount point.
861            */
862           error = pathbuf_copyin(path, &pb);
863           if (error != 0) {
864                     goto err_mounted;
865           }
866           NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
867           error = namei(&nd);
868           pathbuf_destroy(pb);
869           if (error != 0) {
870                     goto err_mounted;
871           }
872           if (nd.ni_vp != vp) {
873                     vput(nd.ni_vp);
874                     error = SET_ERROR(EINVAL);
875                     goto err_mounted;
876           }
877           if (vp->v_mountedhere != NULL) {
878                     vput(nd.ni_vp);
879                     error = SET_ERROR(EBUSY);
880                     goto err_mounted;
881           }
882           error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
883           if (error != 0) {
884                     vput(nd.ni_vp);
885                     goto err_mounted;
886           }
887 
888           /*
889            * Put the new filesystem on the mount list after root.
890            */
891           cache_purge(vp);
892           mp->mnt_iflag &= ~IMNT_WANTRDWR;
893 
894           mountlist_append(mp);
895           if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
896                     vfs_syncer_add_to_worklist(mp);
897           vp->v_mountedhere = mp;
898           vput(nd.ni_vp);
899 
900           mount_checkdirs(vp);
901           mutex_exit(mp->mnt_updating);
902           if (error2 == 0)
903                     vfs_resume(mp);
904 
905           /* Hold an additional reference to the mount across VFS_START(). */
906           vfs_ref(mp);
907           (void) VFS_STATVFS(mp, &mp->mnt_stat);
908           error = VFS_START(mp, 0);
909           if (error) {
910                     vrele(vp);
911           } else if (flags & MNT_EXTATTR) {
912                     if (start_extattr(mp) != 0)
913                               mp->mnt_flag &= ~MNT_EXTATTR;
914           }
915           /* Drop reference held for VFS_START(). */
916           vfs_rele(mp);
917           *vpp = NULL;
918           return error;
919 
920 err_mounted:
921           if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
922                     panic("Unmounting fresh file system failed");
923           mutex_exit(mp->mnt_updating);
924           if (error2 == 0)
925                     vfs_resume(mp);
926           vfs_set_lowermount(mp, NULL);
927           vfs_rele(mp);
928 
929           return error;
930 }
931 
932 /*
933  * Do the actual file system unmount.  File system is assumed to have
934  * been locked by the caller.
935  *
936  * => Caller hold reference to the mount, explicitly for dounmount().
937  */
938 int
dounmount(struct mount * mp,int flags,struct lwp * l)939 dounmount(struct mount *mp, int flags, struct lwp *l)
940 {
941           struct vnode *coveredvp, *vp;
942           struct vnode_impl *vip;
943           int error, async, used_syncer, used_extattr;
944           const bool was_suspended = fstrans_is_owner(mp);
945 
946 #if NVERIEXEC > 0
947           error = veriexec_unmountchk(mp);
948           if (error)
949                     return (error);
950 #endif /* NVERIEXEC > 0 */
951 
952           if (!was_suspended) {
953                     error = vfs_suspend(mp, 0);
954                     if (error) {
955                               return error;
956                     }
957           }
958 
959           KASSERT((mp->mnt_iflag & IMNT_GONE) == 0);
960 
961           used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
962           used_extattr = mp->mnt_flag & MNT_EXTATTR;
963 
964           mp->mnt_iflag |= IMNT_UNMOUNT;
965           mutex_enter(mp->mnt_updating);
966           /*
967            * Temporarily clear the MNT_ASYNC flags so that bwrite() doesn't
968            * convert the sync writes to delayed writes.
969            */
970           async = mp->mnt_flag & MNT_ASYNC;
971           mp->mnt_flag &= ~MNT_ASYNC;
972           cache_purgevfs(mp); /* remove cache entries for this file sys */
973           if (used_syncer)
974                     vfs_syncer_remove_from_worklist(mp);
975           error = 0;
976           if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
977                     error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
978           }
979           if (error == 0 || (flags & MNT_FORCE)) {
980                     error = VFS_UNMOUNT(mp, flags);
981           }
982           if (error) {
983                     mp->mnt_iflag &= ~IMNT_UNMOUNT;
984                     mp->mnt_flag |= async;
985                     if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
986                               vfs_syncer_add_to_worklist(mp);
987                     mutex_exit(mp->mnt_updating);
988                     if (!was_suspended)
989                               vfs_resume(mp);
990                     if (used_extattr) {
991                               if (start_extattr(mp) != 0)
992                                         mp->mnt_flag &= ~MNT_EXTATTR;
993                               else
994                                         mp->mnt_flag |= MNT_EXTATTR;
995                     }
996                     return (error);
997           }
998           mutex_exit(mp->mnt_updating);
999 
1000           /*
1001            * mark filesystem as gone to prevent further umounts
1002            * after mnt_umounting lock is gone, this also prevents
1003            * vfs_busy() from succeeding.
1004            */
1005           mp->mnt_iflag |= IMNT_GONE;
1006           if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
1007                     coveredvp->v_mountedhere = NULL;
1008           }
1009           if (!was_suspended)
1010                     vfs_resume(mp);
1011 
1012           mountlist_remove(mp);
1013 
1014           if ((vip = TAILQ_FIRST(&mp->mnt_vnodelist)) != NULL) {
1015                     vp = VIMPL_TO_VNODE(vip);
1016                     vprint("dangling", vp);
1017                     panic("unmount: dangling vnode");
1018           }
1019           vfs_hooks_unmount(mp);
1020 
1021           vfs_set_lowermount(mp, NULL);
1022           vfs_rele(mp);       /* reference from mount() */
1023           if (coveredvp != NULLVP) {
1024                     vrele(coveredvp);
1025           }
1026           return (0);
1027 }
1028 
1029 /*
1030  * Unmount all file systems.
1031  * We traverse the list in reverse order under the assumption that doing so
1032  * will avoid needing to worry about dependencies.
1033  */
1034 bool
vfs_unmountall(struct lwp * l)1035 vfs_unmountall(struct lwp *l)
1036 {
1037 
1038           printf("unmounting file systems...\n");
1039           return vfs_unmountall1(l, true, true);
1040 }
1041 
1042 static void
vfs_unmount_print(struct mount * mp,const char * pfx)1043 vfs_unmount_print(struct mount *mp, const char *pfx)
1044 {
1045 
1046           aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
1047               mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
1048               mp->mnt_stat.f_fstypename);
1049 }
1050 
1051 /*
1052  * Return the mount with the highest generation less than "gen".
1053  */
1054 static struct mount *
vfs_unmount_next(uint64_t gen)1055 vfs_unmount_next(uint64_t gen)
1056 {
1057           mount_iterator_t *iter;
1058           struct mount *mp, *nmp;
1059 
1060           nmp = NULL;
1061 
1062           mountlist_iterator_init(&iter);
1063           while ((mp = mountlist_iterator_next(iter)) != NULL) {
1064                     if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) &&
1065                         mp->mnt_gen < gen) {
1066                               if (nmp != NULL)
1067                                         vfs_rele(nmp);
1068                               nmp = mp;
1069                               vfs_ref(nmp);
1070                     }
1071           }
1072           mountlist_iterator_destroy(iter);
1073 
1074           return nmp;
1075 }
1076 
1077 bool
vfs_unmount_forceone(struct lwp * l)1078 vfs_unmount_forceone(struct lwp *l)
1079 {
1080           struct mount *mp;
1081           int error;
1082 
1083           mp = vfs_unmount_next(mountgen);
1084           if (mp == NULL) {
1085                     return false;
1086           }
1087 
1088 #ifdef DEBUG
1089           printf("forcefully unmounting %s (%s)...\n",
1090               mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
1091 #endif
1092           if ((error = dounmount(mp, MNT_FORCE, l)) == 0) {
1093                     vfs_unmount_print(mp, "forcefully ");
1094                     return true;
1095           } else {
1096                     vfs_rele(mp);
1097           }
1098 
1099 #ifdef DEBUG
1100           printf("forceful unmount of %s failed with error %d\n",
1101               mp->mnt_stat.f_mntonname, error);
1102 #endif
1103 
1104           return false;
1105 }
1106 
1107 bool
vfs_unmountall1(struct lwp * l,bool force,bool verbose)1108 vfs_unmountall1(struct lwp *l, bool force, bool verbose)
1109 {
1110           struct mount *mp;
1111           mount_iterator_t *iter;
1112           bool any_error = false, progress = false;
1113           uint64_t gen;
1114           int error;
1115 
1116           gen = mountgen;
1117           for (;;) {
1118                     mp = vfs_unmount_next(gen);
1119                     if (mp == NULL)
1120                               break;
1121                     gen = mp->mnt_gen;
1122 
1123 #ifdef DEBUG
1124                     printf("unmounting %p %s (%s)...\n",
1125                         (void *)mp, mp->mnt_stat.f_mntonname,
1126                         mp->mnt_stat.f_mntfromname);
1127 #endif
1128                     if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
1129                               vfs_unmount_print(mp, "");
1130                               progress = true;
1131                     } else {
1132                               vfs_rele(mp);
1133                               if (verbose) {
1134                                         printf("unmount of %s failed with error %d\n",
1135                                             mp->mnt_stat.f_mntonname, error);
1136                               }
1137                               any_error = true;
1138                     }
1139           }
1140           if (verbose) {
1141                     printf("unmounting done\n");
1142           }
1143           if (any_error && verbose) {
1144                     printf("WARNING: some file systems would not unmount\n");
1145           }
1146           /* If the mountlist is empty it is time to remove swap. */
1147           mountlist_iterator_init(&iter);
1148           if (mountlist_iterator_next(iter) == NULL) {
1149                     uvm_swap_shutdown(l);
1150           }
1151           mountlist_iterator_destroy(iter);
1152 
1153           return progress;
1154 }
1155 
1156 void
vfs_sync_all(struct lwp * l)1157 vfs_sync_all(struct lwp *l)
1158 {
1159           printf("syncing disks... ");
1160 
1161           /* remove user processes from run queue */
1162           suspendsched();
1163           (void)spl0();
1164 
1165           /* avoid coming back this way again if we panic. */
1166           doing_shutdown = 1;
1167 
1168           do_sys_sync(l);
1169 
1170           /* Wait for sync to finish. */
1171           if (vfs_syncwait() != 0) {
1172 #if defined(DDB) && defined(DEBUG_HALT_BUSY)
1173                     Debugger();
1174 #endif
1175                     printf("giving up\n");
1176                     return;
1177           } else
1178                     printf("done\n");
1179 }
1180 
1181 /*
1182  * Sync and unmount file systems before shutting down.
1183  */
1184 void
vfs_shutdown(void)1185 vfs_shutdown(void)
1186 {
1187           lwp_t *l = curlwp;
1188 
1189           vfs_sync_all(l);
1190 
1191           /*
1192            * If we have panicked - do not make the situation potentially
1193            * worse by unmounting the file systems.
1194            */
1195           if (panicstr != NULL) {
1196                     return;
1197           }
1198 
1199           /* Unmount file systems. */
1200           vfs_unmountall(l);
1201 }
1202 
1203 /*
1204  * Print a list of supported file system types (used by vfs_mountroot)
1205  */
1206 static void
vfs_print_fstypes(void)1207 vfs_print_fstypes(void)
1208 {
1209           struct vfsops *v;
1210           int cnt = 0;
1211 
1212           mutex_enter(&vfs_list_lock);
1213           LIST_FOREACH(v, &vfs_list, vfs_list)
1214                     ++cnt;
1215           mutex_exit(&vfs_list_lock);
1216 
1217           if (cnt == 0) {
1218                     printf("WARNING: No file system modules have been loaded.\n");
1219                     return;
1220           }
1221 
1222           printf("Supported file systems:");
1223           mutex_enter(&vfs_list_lock);
1224           LIST_FOREACH(v, &vfs_list, vfs_list) {
1225                     printf(" %s", v->vfs_name);
1226           }
1227           mutex_exit(&vfs_list_lock);
1228           printf("\n");
1229 }
1230 
1231 /*
1232  * Mount the root file system.  If the operator didn't specify a
1233  * file system to use, try all possible file systems until one
1234  * succeeds.
1235  */
1236 int
vfs_mountroot(void)1237 vfs_mountroot(void)
1238 {
1239           struct vfsops *v;
1240           int error = ENODEV;
1241 
1242           if (root_device == NULL)
1243                     panic("vfs_mountroot: root device unknown");
1244 
1245           switch (device_class(root_device)) {
1246           case DV_IFNET:
1247                     if (rootdev != NODEV)
1248                               panic("vfs_mountroot: rootdev set for DV_IFNET "
1249                                   "(0x%llx -> %llu,%llu)",
1250                                   (unsigned long long)rootdev,
1251                                   (unsigned long long)major(rootdev),
1252                                   (unsigned long long)minor(rootdev));
1253                     break;
1254 
1255           case DV_DISK:
1256                     if (rootdev == NODEV)
1257                               panic("vfs_mountroot: rootdev not set for DV_DISK");
1258                   if (bdevvp(rootdev, &rootvp))
1259                           panic("vfs_mountroot: can't get vnode for rootdev");
1260                     vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
1261                     error = VOP_OPEN(rootvp, FREAD, FSCRED);
1262                     VOP_UNLOCK(rootvp);
1263                     if (error) {
1264                               printf("vfs_mountroot: can't open root device\n");
1265                               return (error);
1266                     }
1267                     break;
1268 
1269           case DV_VIRTUAL:
1270                     break;
1271 
1272           default:
1273                     printf("%s: inappropriate for root file system\n",
1274                         device_xname(root_device));
1275                     return SET_ERROR(ENODEV);
1276           }
1277 
1278           /*
1279            * If user specified a root fs type, use it.  Make sure the
1280            * specified type exists and has a mount_root()
1281            */
1282           if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
1283                     v = vfs_getopsbyname(rootfstype);
1284                     error = SET_ERROR(EFTYPE);
1285                     if (v != NULL) {
1286                               if (v->vfs_mountroot != NULL) {
1287                                         error = (v->vfs_mountroot)();
1288                               }
1289                               v->vfs_refcount--;
1290                     }
1291                     goto done;
1292           }
1293 
1294           /*
1295            * Try each file system currently configured into the kernel.
1296            */
1297           mutex_enter(&vfs_list_lock);
1298           LIST_FOREACH(v, &vfs_list, vfs_list) {
1299                     if (v->vfs_mountroot == NULL)
1300                               continue;
1301 #ifdef DEBUG
1302                     aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1303 #endif
1304                     v->vfs_refcount++;
1305                     mutex_exit(&vfs_list_lock);
1306                     error = (*v->vfs_mountroot)();
1307                     mutex_enter(&vfs_list_lock);
1308                     v->vfs_refcount--;
1309                     if (!error) {
1310                               aprint_normal("root file system type: %s\n",
1311                                   v->vfs_name);
1312                               break;
1313                     }
1314           }
1315           mutex_exit(&vfs_list_lock);
1316 
1317           if (v == NULL) {
1318                     vfs_print_fstypes();
1319                     printf("no file system for %s", device_xname(root_device));
1320                     if (device_class(root_device) == DV_DISK)
1321                               printf(" (dev 0x%llx)", (unsigned long long)rootdev);
1322                     printf("\n");
1323                     error = SET_ERROR(EFTYPE);
1324           }
1325 
1326 done:
1327           if (error && device_class(root_device) == DV_DISK) {
1328                     vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
1329                     VOP_CLOSE(rootvp, FREAD, FSCRED);
1330                     VOP_UNLOCK(rootvp);
1331                     vrele(rootvp);
1332           }
1333           if (error == 0) {
1334                     mount_iterator_t *iter;
1335                     struct mount *mp;
1336 
1337                     mountlist_iterator_init(&iter);
1338                     mp = mountlist_iterator_next(iter);
1339                     KASSERT(mp != NULL);
1340                     mountlist_iterator_destroy(iter);
1341 
1342                     mp->mnt_flag |= MNT_ROOTFS;
1343                     mp->mnt_op->vfs_refcount++;
1344 
1345                     /*
1346                      * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
1347                      * reference it, and donate it the reference grabbed
1348                      * with VFS_ROOT().
1349                      */
1350                     error = VFS_ROOT(mp, LK_NONE, &rootvnode);
1351                     if (error)
1352                               panic("cannot find root vnode, error=%d", error);
1353                     cwdi0.cwdi_cdir = rootvnode;
1354                     cwdi0.cwdi_rdir = NULL;
1355 
1356                     /*
1357                      * Now that root is mounted, we can fixup initproc's CWD
1358                      * info.  All other processes are kthreads, which merely
1359                      * share proc0's CWD info.
1360                      */
1361                     initproc->p_cwdi->cwdi_cdir = rootvnode;
1362                     vref(initproc->p_cwdi->cwdi_cdir);
1363                     initproc->p_cwdi->cwdi_rdir = NULL;
1364                     /*
1365                      * Enable loading of modules from the filesystem
1366                      */
1367                     module_load_vfs_init();
1368 
1369           }
1370           return (error);
1371 }
1372 
1373 /*
1374  * mount_specific_key_create --
1375  *        Create a key for subsystem mount-specific data.
1376  */
1377 int
mount_specific_key_create(specificdata_key_t * keyp,specificdata_dtor_t dtor)1378 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1379 {
1380 
1381           return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
1382 }
1383 
1384 /*
1385  * mount_specific_key_delete --
1386  *        Delete a key for subsystem mount-specific data.
1387  */
1388 void
mount_specific_key_delete(specificdata_key_t key)1389 mount_specific_key_delete(specificdata_key_t key)
1390 {
1391 
1392           specificdata_key_delete(mount_specificdata_domain, key);
1393 }
1394 
1395 /*
1396  * mount_initspecific --
1397  *        Initialize a mount's specificdata container.
1398  */
1399 void
mount_initspecific(struct mount * mp)1400 mount_initspecific(struct mount *mp)
1401 {
1402           int error __diagused;
1403 
1404           error = specificdata_init(mount_specificdata_domain,
1405               &mp->mnt_specdataref);
1406           KASSERT(error == 0);
1407 }
1408 
1409 /*
1410  * mount_finispecific --
1411  *        Finalize a mount's specificdata container.
1412  */
1413 void
mount_finispecific(struct mount * mp)1414 mount_finispecific(struct mount *mp)
1415 {
1416 
1417           specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
1418 }
1419 
1420 /*
1421  * mount_getspecific --
1422  *        Return mount-specific data corresponding to the specified key.
1423  */
1424 void *
mount_getspecific(struct mount * mp,specificdata_key_t key)1425 mount_getspecific(struct mount *mp, specificdata_key_t key)
1426 {
1427 
1428           return specificdata_getspecific(mount_specificdata_domain,
1429               &mp->mnt_specdataref, key);
1430 }
1431 
1432 /*
1433  * mount_setspecific --
1434  *        Set mount-specific data corresponding to the specified key.
1435  */
1436 void
mount_setspecific(struct mount * mp,specificdata_key_t key,void * data)1437 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
1438 {
1439 
1440           specificdata_setspecific(mount_specificdata_domain,
1441               &mp->mnt_specdataref, key, data);
1442 }
1443 
1444 /*
1445  * Check to see if a filesystem is mounted on a block device.
1446  */
1447 int
vfs_mountedon(vnode_t * vp)1448 vfs_mountedon(vnode_t *vp)
1449 {
1450           vnode_t *vq;
1451           int error = 0;
1452 
1453           if (vp->v_type != VBLK)
1454                     return SET_ERROR(ENOTBLK);
1455           if (spec_node_getmountedfs(vp) != NULL)
1456                     return SET_ERROR(EBUSY);
1457           if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq)
1458               == 0) {
1459                     if (spec_node_getmountedfs(vq) != NULL)
1460                               error = SET_ERROR(EBUSY);
1461                     vrele(vq);
1462           }
1463 
1464           return error;
1465 }
1466 
1467 /*
1468  * Check if a device pointed to by vp is mounted.
1469  *
1470  * Returns:
1471  *   EINVAL         if it's not a disk
1472  *   EBUSY          if it's a disk and mounted
1473  *   0              if it's a disk and not mounted
1474  */
1475 int
rawdev_mounted(vnode_t * vp,vnode_t ** bvpp)1476 rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
1477 {
1478           vnode_t *bvp;
1479           dev_t dev;
1480           int d_type;
1481 
1482           bvp = NULL;
1483           d_type = D_OTHER;
1484 
1485           if (iskmemvp(vp))
1486                     return SET_ERROR(EINVAL);
1487 
1488           switch (vp->v_type) {
1489           case VCHR: {
1490                     const struct cdevsw *cdev;
1491 
1492                     dev = vp->v_rdev;
1493                     cdev = cdevsw_lookup(dev);
1494                     if (cdev != NULL) {
1495                               dev_t blkdev;
1496 
1497                               blkdev = devsw_chr2blk(dev);
1498                               if (blkdev != NODEV) {
1499                                         if (vfinddev(blkdev, VBLK, &bvp) != 0) {
1500                                                   d_type = (cdev->d_flag & D_TYPEMASK);
1501                                                   /* XXX: what if bvp disappears? */
1502                                                   vrele(bvp);
1503                                         }
1504                               }
1505                     }
1506 
1507                     break;
1508           }
1509 
1510           case VBLK: {
1511                     const struct bdevsw *bdev;
1512 
1513                     dev = vp->v_rdev;
1514                     bdev = bdevsw_lookup(dev);
1515                     if (bdev != NULL)
1516                               d_type = (bdev->d_flag & D_TYPEMASK);
1517 
1518                     bvp = vp;
1519 
1520                     break;
1521           }
1522 
1523           default:
1524                     break;
1525           }
1526 
1527           if (d_type != D_DISK)
1528                     return SET_ERROR(EINVAL);
1529 
1530           if (bvpp != NULL)
1531                     *bvpp = bvp;
1532 
1533           /*
1534            * XXX: This is bogus. We should be failing the request
1535            * XXX: not only if this specific slice is mounted, but
1536            * XXX: if it's on a disk with any other mounted slice.
1537            */
1538           if (vfs_mountedon(bvp))
1539                     return SET_ERROR(EBUSY);
1540 
1541           return 0;
1542 }
1543 
/*
 * makefstype --
 *	Make a 'unique' number from a mount type name.
 *
 *	For every character the accumulator is shifted left by two bits
 *	and the character is XORed in.  An empty name yields 0.
 *
 *	The arithmetic is carried out on an unsigned long: left-shifting
 *	a signed long into or past its sign bit is undefined behaviour,
 *	and that is reached once the name is long enough.  Results are
 *	unchanged for every name whose value fits in a long.
 */
long
makefstype(const char *type)
{
	unsigned long acc = 0;

	for (; *type != '\0'; type++) {
		acc <<= 2;
		/* Go through long first so a negative char sign-extends. */
		acc ^= (unsigned long)(long)*type;
	}
	return (long)acc;
}
1558 
1559 static struct mountlist_entry *
mountlist_alloc(enum mountlist_type type,struct mount * mp)1560 mountlist_alloc(enum mountlist_type type, struct mount *mp)
1561 {
1562           struct mountlist_entry *me;
1563 
1564           me = kmem_zalloc(sizeof(*me), KM_SLEEP);
1565           me->me_mount = mp;
1566           me->me_type = type;
1567 
1568           return me;
1569 }
1570 
1571 static void
mountlist_free(struct mountlist_entry * me)1572 mountlist_free(struct mountlist_entry *me)
1573 {
1574 
1575           kmem_free(me, sizeof(*me));
1576 }
1577 
1578 void
mountlist_iterator_init(mount_iterator_t ** mip)1579 mountlist_iterator_init(mount_iterator_t **mip)
1580 {
1581           struct mountlist_entry *me;
1582 
1583           me = mountlist_alloc(ME_MARKER, NULL);
1584           mutex_enter(&mountlist_lock);
1585           TAILQ_INSERT_HEAD(&mountlist, me, me_list);
1586           mutex_exit(&mountlist_lock);
1587           *mip = (mount_iterator_t *)me;
1588 }
1589 
1590 void
mountlist_iterator_destroy(mount_iterator_t * mi)1591 mountlist_iterator_destroy(mount_iterator_t *mi)
1592 {
1593           struct mountlist_entry *marker = &mi->mi_entry;
1594 
1595           if (marker->me_mount != NULL)
1596                     vfs_unbusy(marker->me_mount);
1597 
1598           mutex_enter(&mountlist_lock);
1599           TAILQ_REMOVE(&mountlist, marker, me_list);
1600           mutex_exit(&mountlist_lock);
1601 
1602           mountlist_free(marker);
1603 
1604 }
1605 
1606 /*
1607  * Return the next mount or NULL for this iterator.
1608  * Mark it busy on success.
1609  */
1610 static inline struct mount *
_mountlist_iterator_next(mount_iterator_t * mi,bool wait)1611 _mountlist_iterator_next(mount_iterator_t *mi, bool wait)
1612 {
1613           struct mountlist_entry *me, *marker = &mi->mi_entry;
1614           struct mount *mp;
1615           int error;
1616 
1617           if (marker->me_mount != NULL) {
1618                     vfs_unbusy(marker->me_mount);
1619                     marker->me_mount = NULL;
1620           }
1621 
1622           mutex_enter(&mountlist_lock);
1623           for (;;) {
1624                     KASSERT(marker->me_type == ME_MARKER);
1625 
1626                     me = TAILQ_NEXT(marker, me_list);
1627                     if (me == NULL) {
1628                               /* End of list: keep marker and return. */
1629                               mutex_exit(&mountlist_lock);
1630                               return NULL;
1631                     }
1632                     TAILQ_REMOVE(&mountlist, marker, me_list);
1633                     TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list);
1634 
1635                     /* Skip other markers. */
1636                     if (me->me_type != ME_MOUNT)
1637                               continue;
1638 
1639                     /* Take an initial reference for vfs_busy() below. */
1640                     mp = me->me_mount;
1641                     KASSERT(mp != NULL);
1642                     vfs_ref(mp);
1643                     mutex_exit(&mountlist_lock);
1644 
1645                     /* Try to mark this mount busy and return on success. */
1646                     if (wait)
1647                               error = vfs_busy(mp);
1648                     else
1649                               error = vfs_trybusy(mp);
1650                     if (error == 0) {
1651                               vfs_rele(mp);
1652                               marker->me_mount = mp;
1653                               return mp;
1654                     }
1655                     vfs_rele(mp);
1656                     mutex_enter(&mountlist_lock);
1657           }
1658 }
1659 
1660 struct mount *
mountlist_iterator_next(mount_iterator_t * mi)1661 mountlist_iterator_next(mount_iterator_t *mi)
1662 {
1663 
1664           return _mountlist_iterator_next(mi, true);
1665 }
1666 
1667 struct mount *
mountlist_iterator_trynext(mount_iterator_t * mi)1668 mountlist_iterator_trynext(mount_iterator_t *mi)
1669 {
1670 
1671           return _mountlist_iterator_next(mi, false);
1672 }
1673 
1674 /*
1675  * Attach new mount to the end of the mount list.
1676  */
1677 void
mountlist_append(struct mount * mp)1678 mountlist_append(struct mount *mp)
1679 {
1680           struct mountlist_entry *me;
1681 
1682           me = mountlist_alloc(ME_MOUNT, mp);
1683           mutex_enter(&mountlist_lock);
1684           TAILQ_INSERT_TAIL(&mountlist, me, me_list);
1685           mutex_exit(&mountlist_lock);
1686 }
1687 
1688 /*
1689  * Remove mount from mount list.
1690  */
1691 void
mountlist_remove(struct mount * mp)1692 mountlist_remove(struct mount *mp)
1693 {
1694           struct mountlist_entry *me;
1695 
1696           mutex_enter(&mountlist_lock);
1697           TAILQ_FOREACH(me, &mountlist, me_list)
1698                     if (me->me_type == ME_MOUNT && me->me_mount == mp)
1699                               break;
1700           KASSERT(me != NULL);
1701           TAILQ_REMOVE(&mountlist, me, me_list);
1702           mutex_exit(&mountlist_lock);
1703           mountlist_free(me);
1704 }
1705 
1706 /*
1707  * Unlocked variant to traverse the mountlist.
1708  * To be used from DDB only.
1709  */
1710 struct mount *
_mountlist_next(struct mount * mp)1711 _mountlist_next(struct mount *mp)
1712 {
1713           struct mountlist_entry *me;
1714 
1715           if (mp == NULL) {
1716                     me = TAILQ_FIRST(&mountlist);
1717           } else {
1718                     TAILQ_FOREACH(me, &mountlist, me_list)
1719                               if (me->me_type == ME_MOUNT && me->me_mount == mp)
1720                                         break;
1721                     if (me != NULL)
1722                               me = TAILQ_NEXT(me, me_list);
1723           }
1724 
1725           while (me != NULL && me->me_type != ME_MOUNT)
1726                     me = TAILQ_NEXT(me, me_list);
1727 
1728           return (me ? me->me_mount : NULL);
1729 }
1730