1 /*        $NetBSD: vfs_init.c,v 1.67 2024/12/07 02:27:38 riastradh Exp $        */
2 
3 /*-
4  * Copyright (c) 1998, 2000, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1989, 1993
35  *        The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed
38  * to Berkeley by John Heidemann of the UCLA Ficus project.
39  *
40  * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *        @(#)vfs_init.c      8.5 (Berkeley) 5/11/95
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: vfs_init.c,v 1.67 2024/12/07 02:27:38 riastradh Exp $");
71 
72 #include <sys/param.h>
73 #include <sys/types.h>
74 
75 #include <sys/buf.h>
76 #include <sys/dirhash.h>
77 #include <sys/errno.h>
78 #include <sys/kauth.h>
79 #include <sys/kmem.h>
80 #include <sys/module.h>
81 #include <sys/mount.h>
82 #include <sys/namei.h>
83 #include <sys/sdt.h>
84 #include <sys/stat.h>
85 #include <sys/sysctl.h>
86 #include <sys/systm.h>
87 #include <sys/time.h>
88 #include <sys/ucred.h>
89 #include <sys/vnode.h>
90 #include <sys/vnode_impl.h>
91 
92 #include <miscfs/deadfs/deadfs.h>
93 #include <miscfs/fifofs/fifo.h>
94 #include <miscfs/specfs/specdev.h>
95 
96 /*
97  * Sigh, such primitive tools are these...
98  */
99 #if 0
100 #define DODEBUG(A) A
101 #else
102 #define DODEBUG(A) __nothing
103 #endif
104 
105 SDT_PROVIDER_DEFINE(vfs);
106 
107 /*
108  * These vnodeopv_descs are listed here because they are not
109  * associated with any particular file system, and thus cannot
110  * be initialized by vfs_attach().
111  */
112 const struct vnodeopv_desc * const vfs_special_vnodeopv_descs[] = {
113           &dead_vnodeop_opv_desc,
114           &fifo_vnodeop_opv_desc,
115           &spec_vnodeop_opv_desc,
116           NULL,
117 };
118 
119 struct vfs_list_head vfs_list =                             /* vfs list */
120     LIST_HEAD_INITIALIZER(vfs_list);
121 
122 static kauth_listener_t mount_listener;
123 
124 /*
125  * This code doesn't work if the defn is **vnodop_defns with cc.
126  * The problem is because of the compiler sometimes putting in an
127  * extra level of indirection for arrays.  It's an interesting
128  * "feature" of C.
129  */
130 typedef int (*PFI)(void *);
131 
132 /*
133  * A miscellaneous routine.
134  * A generic "default" routine that just returns an error.
135  */
136 /*ARGSUSED*/
137 int
vn_default_error(void * v)138 vn_default_error(void *v)
139 {
140 
141           return SET_ERROR(EOPNOTSUPP);
142 }
143 
144 static struct sysctllog *vfs_sysctllog;
145 
146 /*
147  * Top level filesystem related information gathering.
148  */
149 static void
sysctl_vfs_setup(void)150 sysctl_vfs_setup(void)
151 {
152 
153           sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
154               CTLFLAG_PERMANENT,
155               CTLTYPE_NODE, "generic",
156               SYSCTL_DESCR("Non-specific vfs related information"),
157               NULL, 0, NULL, 0,
158               CTL_VFS, VFS_GENERIC, CTL_EOL);
159           sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
160               CTLFLAG_PERMANENT,
161               CTLTYPE_STRING, "fstypes",
162               SYSCTL_DESCR("List of file systems present"),
163               sysctl_vfs_generic_fstypes, 0, NULL, 0,
164               CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
165           sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
166               CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
167               CTLTYPE_INT, "magiclinks",
168               SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
169               NULL, 0, &vfs_magiclinks, 0,
170               CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
171           sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
172               CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
173               CTLTYPE_INT, "timestamp_precision",
174               SYSCTL_DESCR("File timestamp precision"),
175               NULL, 0, &vfs_timestamp_precision, 0,
176               CTL_VFS, VFS_GENERIC, VFS_TIMESTAMP_PRECISION,
177               CTL_EOL);
178 }
179 
/*
 * vfs_init.c
 *
 * Allocate and fill in operations vectors.
 *
 * An undocumented feature of this approach to defining operations is that
 * there can be multiple entries in vfs_opv_descs for the same operations
 * vector. This allows third parties to extend the set of operations
 * supported by another layer in a binary compatible way. For example,
 * assume that NFS needed to be modified to support Ficus. NFS has an entry
 * (probably nfs_vnodeop_decls) declaring all the operations NFS supports by
 * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_extensions)
 * listing those new operations Ficus adds to NFS, all without modifying the
 * NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
 * that is a(whole)nother story.) This is a feature.
 */
196 
197 /*
198  * Init the vector, if it needs it.
199  * Also handle backwards compatibility.
200  */
201 static void
vfs_opv_init_explicit(const struct vnodeopv_desc * vfs_opv_desc)202 vfs_opv_init_explicit(const struct vnodeopv_desc *vfs_opv_desc)
203 {
204           int (**opv_desc_vector)(void *);
205           const struct vnodeopv_entry_desc *opve_descp;
206 
207           opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
208 
209           for (opve_descp = vfs_opv_desc->opv_desc_ops;
210                opve_descp->opve_op;
211                opve_descp++) {
212                     /*
213                      * Sanity check:  is this operation listed
214                      * in the list of operations?  We check this
215                      * by seeing if its offset is zero.  Since
216                      * the default routine should always be listed
217                      * first, it should be the only one with a zero
218                      * offset.  Any other operation with a zero
219                      * offset is probably not listed in
220                      * vfs_op_descs, and so is probably an error.
221                      *
222                      * A panic here means the layer programmer
223                      * has committed the all-too common bug
224                      * of adding a new operation to the layer's
225                      * list of vnode operations but
226                      * not adding the operation to the system-wide
227                      * list of supported operations.
228                      */
229                     if (opve_descp->opve_op->vdesc_offset == 0 &&
230                         opve_descp->opve_op->vdesc_offset != VOFFSET(vop_default))
231                     {
232                               printf("operation %s not listed in %s.\n",
233                                   opve_descp->opve_op->vdesc_name, "vfs_op_descs");
234                               panic("vfs_opv_init: bad operation");
235                     }
236 
237                     /*
238                      * Fill in this entry.
239                      */
240                     opv_desc_vector[opve_descp->opve_op->vdesc_offset] =
241                         opve_descp->opve_impl;
242           }
243 }
244 
245 static void
vfs_opv_init_default(const struct vnodeopv_desc * vfs_opv_desc)246 vfs_opv_init_default(const struct vnodeopv_desc *vfs_opv_desc)
247 {
248           int j;
249           int (**opv_desc_vector)(void *);
250 
251           opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
252 
253           /*
254            * Force every operations vector to have a default routine.
255            */
256           if (opv_desc_vector[VOFFSET(vop_default)] == NULL)
257                     panic("vfs_opv_init: operation vector without vop_default");
258 
259           for (j = 0; j < VNODE_OPS_COUNT; j++) {
260                     if (opv_desc_vector[j] == NULL) {
261                               opv_desc_vector[j] =
262                                   opv_desc_vector[VOFFSET(vop_default)];
263                     }
264           }
265 }
266 
267 void
vfs_opv_init(const struct vnodeopv_desc * const * vopvdpp)268 vfs_opv_init(const struct vnodeopv_desc * const *vopvdpp)
269 {
270           int (**opv_desc_vector)(void *);
271           int i;
272 
273           /*
274            * Allocate the vectors.
275            */
276           for (i = 0; vopvdpp[i] != NULL; i++) {
277                     opv_desc_vector =
278                         kmem_alloc(VNODE_OPS_COUNT * sizeof(PFI), KM_SLEEP);
279                     memset(opv_desc_vector, 0, VNODE_OPS_COUNT * sizeof(PFI));
280                     *(vopvdpp[i]->opv_desc_vector_p) = opv_desc_vector;
281                     DODEBUG(printf("vector at %p allocated\n",
282                         opv_desc_vector_p));
283           }
284 
285           /*
286            * ...and fill them in.
287            */
288           for (i = 0; vopvdpp[i] != NULL; i++)
289                     vfs_opv_init_explicit(vopvdpp[i]);
290 
291           /*
292            * Finally, go back and replace unfilled routines
293            * with their default.
294            */
295           for (i = 0; vopvdpp[i] != NULL; i++)
296                     vfs_opv_init_default(vopvdpp[i]);
297 }
298 
299 void
vfs_opv_free(const struct vnodeopv_desc * const * vopvdpp)300 vfs_opv_free(const struct vnodeopv_desc * const *vopvdpp)
301 {
302           int i;
303 
304           /*
305            * Free the vectors allocated in vfs_opv_init().
306            */
307           for (i = 0; vopvdpp[i] != NULL; i++) {
308                     kmem_free(*(vopvdpp[i]->opv_desc_vector_p),
309                         VNODE_OPS_COUNT * sizeof(PFI));
310                     *(vopvdpp[i]->opv_desc_vector_p) = NULL;
311           }
312 }
313 
#ifdef DEBUG
/*
 * vfs_op_check:
 *
 *	DEBUG-only consistency check of the NULL-terminated
 *	vfs_op_descs[] table: each entry's vdesc_offset must equal its
 *	index, and the number of entries must equal VNODE_OPS_COUNT.
 *	Panics on any mismatch.
 */
static void
vfs_op_check(void)
{
	int nops = 0;

	DODEBUG(printf("Vnode_interface_init.\n"));

	/* Walk the table, checking the offset of each op. */
	while (vfs_op_descs[nops] != NULL) {
		if (vfs_op_descs[nops]->vdesc_offset != nops)
			panic("vfs_op_check: vfs_op_desc[] offset mismatch");
		nops++;
	}

	/* nops now holds the number of entries found. */
	if (nops != VNODE_OPS_COUNT) {
		panic("vfs_op_check: vnode ops count mismatch (%d != %d)",
		    nops, VNODE_OPS_COUNT);
	}

	DODEBUG(printf ("vfs_opv_numops=%d\n", VNODE_OPS_COUNT));
}
#endif /* DEBUG */
338 
339 /*
340  * Common routine to check if an unprivileged mount is allowed.
341  *
342  * We export just this part (i.e., without the access control) so that if a
343  * secmodel wants to implement finer grained user mounts it can do so without
344  * copying too much code. More elaborate policies (i.e., specific users allowed
345  * to also create devices and/or introduce set-id binaries, or export
346  * file-systems) will require a different implementation.
347  *
348  * This routine is intended to be called from listener context, and as such
349  * does not take credentials as an argument.
350  */
351 int
usermount_common_policy(struct mount * mp,u_long flags)352 usermount_common_policy(struct mount *mp, u_long flags)
353 {
354 
355           /* No exporting if unprivileged. */
356           if (flags & MNT_EXPORTED)
357                     return SET_ERROR(EPERM);
358 
359           /* Must have 'nosuid' and 'nodev'. */
360           if ((flags & MNT_NODEV) == 0 || (flags & MNT_NOSUID) == 0)
361                     return SET_ERROR(EPERM);
362 
363           /* Retain 'noexec'. */
364           if ((mp->mnt_flag & MNT_NOEXEC) && (flags & MNT_NOEXEC) == 0)
365                     return SET_ERROR(EPERM);
366 
367           return 0;
368 }
369 
370 static int
mount_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)371 mount_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
372     void *arg0, void *arg1, void *arg2, void *arg3)
373 {
374           int result;
375           enum kauth_system_req req;
376 
377           result = KAUTH_RESULT_DEFER;
378           req = (enum kauth_system_req)(uintptr_t)(uintptr_t)arg0;
379 
380           if (action != KAUTH_SYSTEM_MOUNT)
381                     return result;
382 
383           if (req == KAUTH_REQ_SYSTEM_MOUNT_GET)
384                     result = KAUTH_RESULT_ALLOW;
385           else if (req == KAUTH_REQ_SYSTEM_MOUNT_DEVICE) {
386                     vnode_t *devvp = arg2;
387                     accmode_t accmode = (accmode_t)(unsigned long)arg3;
388                     int error;
389 
390                     error = VOP_ACCESS(devvp, accmode, cred);
391                     if (!error)
392                               result = KAUTH_RESULT_ALLOW;
393           }
394 
395           return result;
396 }
397 
398 /*
399  * Initialize the vnode structures and initialize each file system type.
400  */
401 void
vfsinit(void)402 vfsinit(void)
403 {
404 
405           /*
406            * Attach sysctl nodes
407            */
408           sysctl_vfs_setup();
409 
410           /*
411            * Initialize the vnode table
412            */
413           vntblinit();
414 
415           /*
416            * Initialize the vnode name cache
417            */
418           nchinit();
419 
420 #ifdef DEBUG
421           /*
422            * Check the list of vnode operations.
423            */
424           vfs_op_check();
425 #endif
426 
427           /*
428            * Initialize the special vnode operations.
429            */
430           vfs_opv_init(vfs_special_vnodeopv_descs);
431 
432           /*
433            * Initialise generic dirhash.
434            */
435           dirhash_init();
436 
437           /*
438            * Initialise VFS hooks.
439            */
440           vfs_hooks_init();
441 
442           mount_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
443               mount_listener_cb, NULL);
444 
445           /*
446            * Establish each file system which was statically
447            * included in the kernel.
448            */
449           module_init_class(MODULE_CLASS_VFS);
450 
451           /*
452            * Initialize EVFILT_FS for kqueue.
453            */
454           vfs_evfilt_fs_init();
455 }
456 
457 /*
458  * Drop a reference to a file system type.
459  */
460 void
vfs_delref(struct vfsops * vfs)461 vfs_delref(struct vfsops *vfs)
462 {
463 
464           mutex_enter(&vfs_list_lock);
465           vfs->vfs_refcount--;
466           mutex_exit(&vfs_list_lock);
467 }
468 
469 /*
470  * Establish a file system and initialize it.
471  */
472 int
vfs_attach(struct vfsops * vfs)473 vfs_attach(struct vfsops *vfs)
474 {
475           struct vfsops *v;
476           int error = 0;
477 
478           mutex_enter(&vfs_list_lock);
479 
480           /*
481            * Make sure this file system doesn't already exist.
482            */
483           LIST_FOREACH(v, &vfs_list, vfs_list) {
484                     if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
485                               error = SET_ERROR(EEXIST);
486                               goto out;
487                     }
488           }
489 
490           /*
491            * Initialize the vnode operations for this file system.
492            */
493           vfs_opv_init(vfs->vfs_opv_descs);
494 
495           /*
496            * Now initialize the file system itself.
497            */
498           (*vfs->vfs_init)();
499 
500           /*
501            * ...and link it into the kernel's list.
502            */
503           LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
504 
505           /*
506            * Sanity: make sure the reference count is 0.
507            */
508           vfs->vfs_refcount = 0;
509 out:
510           mutex_exit(&vfs_list_lock);
511           return error;
512 }
513 
514 /*
515  * Remove a file system from the kernel.
516  */
517 int
vfs_detach(struct vfsops * vfs)518 vfs_detach(struct vfsops *vfs)
519 {
520           struct vfsops *v;
521           int error = 0;
522 
523           mutex_enter(&vfs_list_lock);
524 
525           /*
526            * Make sure no one is using the filesystem.
527            */
528           if (vfs->vfs_refcount != 0) {
529                     error = SET_ERROR(EBUSY);
530                     goto out;
531           }
532 
533           /*
534            * ...and remove it from the kernel's list.
535            */
536           LIST_FOREACH(v, &vfs_list, vfs_list) {
537                     if (v == vfs) {
538                               LIST_REMOVE(v, vfs_list);
539                               break;
540                     }
541           }
542 
543           if (v == NULL) {
544                     error = SET_ERROR(ESRCH);
545                     goto out;
546           }
547 
548           /*
549            * Now run the file system-specific cleanups.
550            */
551           (*vfs->vfs_done)();
552 
553           /*
554            * Free the vnode operations vector.
555            */
556           vfs_opv_free(vfs->vfs_opv_descs);
557 out:
558           mutex_exit(&vfs_list_lock);
559           return error;
560 }
561 
562 void
vfs_reinit(void)563 vfs_reinit(void)
564 {
565           struct vfsops *vfs;
566 
567           mutex_enter(&vfs_list_lock);
568           LIST_FOREACH(vfs, &vfs_list, vfs_list) {
569                     if (vfs->vfs_reinit) {
570                               vfs->vfs_refcount++;
571                               mutex_exit(&vfs_list_lock);
572                               (*vfs->vfs_reinit)();
573                               mutex_enter(&vfs_list_lock);
574                               vfs->vfs_refcount--;
575                     }
576           }
577           mutex_exit(&vfs_list_lock);
578 }
579