1 /*        $NetBSD: vfs_syscalls.c,v 1.570 2024/12/07 02:23:09 riastradh Exp $   */
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2019, 2020, 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1989, 1993
34  *        The Regents of the University of California.  All rights reserved.
35  * (c) UNIX System Laboratories, Inc.
36  * All or some portions of this file are derived from material licensed
37  * to the University of California by American Telephone and Telegraph
38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39  * the permission of UNIX System Laboratories, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *        @(#)vfs_syscalls.c  8.42 (Berkeley) 7/31/95
66  */
67 
68 /*
69  * Virtual File System System Calls
70  */
71 
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.570 2024/12/07 02:23:09 riastradh Exp $");
74 
75 #ifdef _KERNEL_OPT
76 #include "opt_fileassoc.h"
77 #include "veriexec.h"
78 #endif
79 
80 #include <sys/param.h>
81 #include <sys/types.h>
82 
83 #include <sys/atomic.h>
84 #include <sys/buf.h>
85 #include <sys/compat_stub.h>
86 #include <sys/dirent.h>
87 #include <sys/event.h>
88 #include <sys/extattr.h>
89 #include <sys/fcntl.h>
90 #include <sys/file.h>
91 #ifdef FILEASSOC
92 #include <sys/fileassoc.h>
93 #endif /* FILEASSOC */
94 #include <sys/filedesc.h>
95 #include <sys/fstrans.h>
96 #include <sys/kauth.h>
97 #include <sys/kernel.h>
98 #include <sys/kmem.h>
99 #include <sys/ktrace.h>
100 #include <sys/module.h>
101 #include <sys/mount.h>
102 #include <sys/namei.h>
103 #include <sys/proc.h>
104 #include <sys/quota.h>
105 #include <sys/quotactl.h>
106 #include <sys/stat.h>
107 #include <sys/syscallargs.h>
108 #include <sys/sysctl.h>
109 #include <sys/systm.h>
110 #include <sys/uio.h>
111 #include <sys/verified_exec.h>
112 #include <sys/vfs_syscalls.h>
113 #include <sys/vnode.h>
114 
115 #include <miscfs/genfs/genfs.h>
116 #include <miscfs/specfs/specdev.h>
117 
118 #include <nfs/nfs.h>
119 #include <nfs/nfs_var.h>
120 #include <nfs/nfsproto.h>
121 #include <nfs/rpcv2.h>
122 
123 /* XXX this shouldn't be here */
124 #ifndef OFF_T_MAX
125 #define OFF_T_MAX __type_max(off_t)
126 #endif
127 
128 static int change_flags(struct vnode *, u_long, struct lwp *);
129 static int change_mode(struct vnode *, int, struct lwp *);
130 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
131 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
132 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
133     enum uio_seg);
134 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
135 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
136     enum uio_seg);
137 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
138     enum uio_seg, int);
139 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
140     size_t, register_t *);
141 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
142 
143 static int fd_nameiat(struct lwp *, int, struct nameidata *);
144 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
145     namei_simple_flags_t, struct vnode **);
146 
147 /*
148  * This table is used to maintain compatibility with 4.3BSD
149  * and NetBSD 0.9 mount syscalls - and possibly other systems.
150  * Note, the order is important!
151  *
152  * Do not modify this table. It should only contain filesystems
153  * supported by NetBSD 0.9 and 4.3BSD.
154  */
155 const char * const mountcompatnames[] = {
156           NULL,               /* 0 = MOUNT_NONE */
157           MOUNT_FFS,          /* 1 = MOUNT_UFS */
158           MOUNT_NFS,          /* 2 */
159           MOUNT_MFS,          /* 3 */
160           MOUNT_MSDOS,        /* 4 */
161           MOUNT_CD9660,       /* 5 = MOUNT_ISOFS */
162           MOUNT_FDESC,        /* 6 */
163           MOUNT_KERNFS,       /* 7 */
164           NULL,               /* 8 = MOUNT_DEVFS */
165           MOUNT_AFS,          /* 9 */
166 };
167 
168 const u_int nmountcompatnames = __arraycount(mountcompatnames);
169 
170 /*
171  * Filter event method for EVFILT_FS.
172  */
173 static struct klist fs_klist;
174 static kmutex_t fs_klist_lock;
175 
176 CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0);
177 CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0);
178 
179 void
vfs_evfilt_fs_init(void)180 vfs_evfilt_fs_init(void)
181 {
182 
183           klist_init(&fs_klist);
184           mutex_init(&fs_klist_lock, MUTEX_DEFAULT, IPL_NONE);
185 }
186 
187 static int
filt_fsattach(struct knote * kn)188 filt_fsattach(struct knote *kn)
189 {
190 
191           mutex_enter(&fs_klist_lock);
192           kn->kn_flags |= EV_CLEAR;
193           klist_insert(&fs_klist, kn);
194           mutex_exit(&fs_klist_lock);
195 
196           return 0;
197 }
198 
199 static void
filt_fsdetach(struct knote * kn)200 filt_fsdetach(struct knote *kn)
201 {
202 
203           mutex_enter(&fs_klist_lock);
204           klist_remove(&fs_klist, kn);
205           mutex_exit(&fs_klist_lock);
206 }
207 
208 static int
filt_fs(struct knote * kn,long hint)209 filt_fs(struct knote *kn, long hint)
210 {
211           int rv;
212 
213           if (hint & NOTE_SUBMIT) {
214                     KASSERT(mutex_owned(&fs_klist_lock));
215                     kn->kn_fflags |= hint & ~NOTE_SUBMIT;
216           } else {
217                     mutex_enter(&fs_klist_lock);
218           }
219 
220           rv = (kn->kn_fflags != 0);
221 
222           if ((hint & NOTE_SUBMIT) == 0) {
223                     mutex_exit(&fs_klist_lock);
224           }
225 
226           return rv;
227 }
228 
229 /* referenced in kern_event.c */
230 const struct filterops fs_filtops = {
231           .f_flags = FILTEROP_MPSAFE,
232           .f_attach = filt_fsattach,
233           .f_detach = filt_fsdetach,
234           .f_event = filt_fs,
235 };
236 
237 static int
fd_nameiat(struct lwp * l,int fdat,struct nameidata * ndp)238 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
239 {
240           file_t *dfp;
241           int error;
242           const char *path = pathbuf_stringcopy_get(ndp->ni_pathbuf);
243 
244           if (fdat != AT_FDCWD && path[0] != '/') {
245                     if ((error = fd_getvnode(fdat, &dfp)) != 0)
246                               goto out;
247 
248                     NDAT(ndp, dfp->f_vnode);
249           }
250 
251           error = namei(ndp);
252 
253           if (fdat != AT_FDCWD && path[0] != '/')
254                     fd_putfile(fdat);
255 out:
256           pathbuf_stringcopy_put(ndp->ni_pathbuf, path);
257           return error;
258 }
259 
260 static int
fd_nameiat_simple_user(struct lwp * l,int fdat,const char * path,namei_simple_flags_t sflags,struct vnode ** vp_ret)261 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
262     namei_simple_flags_t sflags, struct vnode **vp_ret)
263 {
264           file_t *dfp;
265           struct vnode *dvp;
266           int error;
267           struct pathbuf *pb;
268           const char *p;
269 
270           error = pathbuf_copyin(path, &pb);
271           if (error) {
272                     return error;
273           }
274           p = pathbuf_stringcopy_get(pb);
275 
276           if (fdat != AT_FDCWD && p[0] != '/') {
277                     if ((error = fd_getvnode(fdat, &dfp)) != 0)
278                               goto out;
279 
280                     dvp = dfp->f_vnode;
281           } else {
282                     dvp = NULL;
283           }
284 
285           error = nameiat_simple(dvp, pb, sflags, vp_ret);
286 
287           if (fdat != AT_FDCWD && p[0] != '/')
288                     fd_putfile(fdat);
289 
290 out:
291           pathbuf_stringcopy_put(pb, p);
292           pathbuf_destroy(pb);
293 
294           return error;
295 }
296 
297 static int
open_setfp(struct lwp * l,file_t * fp,struct vnode * vp,int indx,int flags)298 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
299 {
300           int error;
301 
302           fp->f_flag = flags & FMASK;
303           fp->f_type = DTYPE_VNODE;
304           fp->f_ops = &vnops;
305           fp->f_vnode = vp;
306 
307           if (flags & (O_EXLOCK | O_SHLOCK)) {
308                     struct flock lf;
309                     int type;
310 
311                     lf.l_whence = SEEK_SET;
312                     lf.l_start = 0;
313                     lf.l_len = 0;
314                     if (flags & O_EXLOCK)
315                               lf.l_type = F_WRLCK;
316                     else
317                               lf.l_type = F_RDLCK;
318                     type = F_FLOCK;
319                     if ((flags & FNONBLOCK) == 0)
320                               type |= F_WAIT;
321                     VOP_UNLOCK(vp);
322                     error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
323                     if (error) {
324                               (void) vn_close(vp, fp->f_flag, fp->f_cred);
325                               fd_abort(l->l_proc, fp, indx);
326                               return error;
327                     }
328                     vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
329                     atomic_or_uint(&fp->f_flag, FHASLOCK);
330           }
331           if (flags & O_CLOEXEC)
332                     fd_set_exclose(l, indx, true);
333           return 0;
334 }
335 
336 static int
mount_update(struct lwp * l,struct vnode * vp,const char * path,int flags,void * data,size_t * data_len)337 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
338     void *data, size_t *data_len)
339 {
340           struct mount *mp;
341           int error = 0, saved_flags;
342 
343           mp = vp->v_mount;
344           saved_flags = mp->mnt_flag;
345 
346           /* We can operate only on VV_ROOT nodes. */
347           if ((vp->v_vflag & VV_ROOT) == 0) {
348                     error = EINVAL;
349                     goto out;
350           }
351 
352           /*
353            * We only allow the filesystem to be reloaded if it
354            * is currently mounted read-only.  Additionally, we
355            * prevent read-write to read-only downgrades.
356            */
357           if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
358               (mp->mnt_flag & MNT_RDONLY) == 0 &&
359               (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
360                     error = EOPNOTSUPP; /* Needs translation */
361                     goto out;
362           }
363 
364           /*
365            * Enabling MNT_UNION requires a covered mountpoint and
366            * must not happen on the root mount.
367            */
368           if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
369                     error = EOPNOTSUPP;
370                     goto out;
371           }
372 
373           error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
374               KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
375           if (error)
376                     goto out;
377 
378           error = vfs_suspend(mp, 0);
379           if (error)
380                     goto out;
381 
382           mutex_enter(mp->mnt_updating);
383 
384           mp->mnt_flag &= ~MNT_OP_FLAGS;
385           mp->mnt_flag |= flags & MNT_OP_FLAGS;
386 
387           /*
388            * Set the mount level flags.
389            */
390           if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
391                     if ((flags & MNT_RDONLY))
392                               mp->mnt_iflag |= IMNT_WANTRDONLY;
393                     else
394                               mp->mnt_iflag |= IMNT_WANTRDWR;
395           }
396           mp->mnt_flag &= ~MNT_BASIC_FLAGS;
397           mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
398           if ((mp->mnt_iflag & IMNT_WANTRDONLY))
399                     mp->mnt_flag &= ~MNT_RDONLY;
400 
401           error = VFS_MOUNT(mp, path, data, data_len);
402 
403           if (error && data != NULL) {
404                     int error2;
405 
406                     /*
407                      * Update failed; let's try and see if it was an
408                      * export request.  For compat with 3.0 and earlier.
409                      */
410                     error2 = vfs_hooks_reexport(mp, path, data);
411 
412                     /*
413                      * Only update error code if the export request was
414                      * understood but some problem occurred while
415                      * processing it.
416                      */
417                     if (error2 != EJUSTRETURN)
418                               error = error2;
419           }
420 
421           if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
422                     mp->mnt_flag |= MNT_RDONLY;
423           if (error)
424                     mp->mnt_flag = saved_flags;
425           mp->mnt_flag &= ~MNT_OP_FLAGS;
426           mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
427           if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
428                     if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
429                               vfs_syncer_add_to_worklist(mp);
430           } else {
431                     if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
432                               vfs_syncer_remove_from_worklist(mp);
433           }
434           mutex_exit(mp->mnt_updating);
435           vfs_resume(mp);
436 
437           if ((error == 0) && !(saved_flags & MNT_EXTATTR) &&
438               (flags & MNT_EXTATTR)) {
439                     if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
440                               NULL, 0, NULL) != 0) {
441                               printf("%s: failed to start extattr, error = %d",
442                                   mp->mnt_stat.f_mntonname, error);
443                               mp->mnt_flag &= ~MNT_EXTATTR;
444                     }
445           }
446 
447           if ((error == 0) && (saved_flags & MNT_EXTATTR) &&
448               !(flags & MNT_EXTATTR)) {
449                     if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
450                               NULL, 0, NULL) != 0) {
451                               printf("%s: failed to stop extattr, error = %d",
452                                   mp->mnt_stat.f_mntonname, error);
453                               mp->mnt_flag |= MNT_RDONLY;
454                     }
455           }
456 out:
457           return (error);
458 }
459 
460 static int
mount_get_vfsops(const char * fstype,enum uio_seg type_seg,struct vfsops ** vfsops)461 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
462     struct vfsops **vfsops)
463 {
464           char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
465           int error;
466 
467           if (type_seg == UIO_USERSPACE) {
468                     /* Copy file-system type from userspace.  */
469                     error = copyinstr(fstype, fstypename, sizeof(fstypename),
470                         NULL);
471           } else {
472                     error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
473                     KASSERT(error == 0);
474           }
475 
476           if (error) {
477                     /*
478                      * Historically, filesystem types were identified by numbers.
479                      * If we get an integer for the filesystem type instead of a
480                      * string, we check to see if it matches one of the historic
481                      * filesystem types.
482                      */
483                     u_long fsindex = (u_long)fstype;
484                     if (fsindex >= nmountcompatnames ||
485                         mountcompatnames[fsindex] == NULL)
486                               return ENODEV;
487                     strlcpy(fstypename, mountcompatnames[fsindex],
488                         sizeof(fstypename));
489           }
490 
491           /* Accept `ufs' as an alias for `ffs', for compatibility. */
492           if (strcmp(fstypename, "ufs") == 0)
493                     fstypename[0] = 'f';
494 
495           if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
496                     return 0;
497 
498           /* If we can autoload a vfs module, try again */
499           (void)module_autoload(fstypename, MODULE_CLASS_VFS);
500 
501           if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
502                     return 0;
503 
504           return ENODEV;
505 }
506 
507 static int
mount_getargs(struct lwp * l,struct vnode * vp,const char * path,int flags,void * data,size_t * data_len)508 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
509     void *data, size_t *data_len)
510 {
511           struct mount *mp;
512           int error;
513 
514           /* If MNT_GETARGS is specified, it should be the only flag. */
515           if (flags & ~MNT_GETARGS)
516                     return EINVAL;
517 
518           mp = vp->v_mount;
519 
520           /* XXX: probably some notion of "can see" here if we want isolation. */
521           error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
522               KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
523           if (error)
524                     return error;
525 
526           if ((vp->v_vflag & VV_ROOT) == 0)
527                     return EINVAL;
528 
529           if (vfs_busy(mp))
530                     return EPERM;
531 
532           mutex_enter(mp->mnt_updating);
533           mp->mnt_flag &= ~MNT_OP_FLAGS;
534           mp->mnt_flag |= MNT_GETARGS;
535           error = VFS_MOUNT(mp, path, data, data_len);
536           mp->mnt_flag &= ~MNT_OP_FLAGS;
537           mutex_exit(mp->mnt_updating);
538 
539           vfs_unbusy(mp);
540           return (error);
541 }
542 
543 int
sys___mount50(struct lwp * l,const struct sys___mount50_args * uap,register_t * retval)544 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap,
545     register_t *retval)
546 {
547           /* {
548                     syscallarg(const char *) type;
549                     syscallarg(const char *) path;
550                     syscallarg(int) flags;
551                     syscallarg(void *) data;
552                     syscallarg(size_t) data_len;
553           } */
554 
555           return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE,
556               SCARG(uap, path), SCARG(uap, flags),
557               SCARG(uap, data), UIO_USERSPACE, SCARG(uap, data_len),
558               retval);
559 }
560 
561 int
do_sys_mount(struct lwp * l,const char * type,enum uio_seg type_seg,const char * path,int flags,void * data,enum uio_seg data_seg,size_t data_len,register_t * retval)562 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
563     const char *path, int flags,
564     void *data, enum uio_seg data_seg, size_t data_len,
565     register_t *retval)
566 {
567           struct vfsops *vfsops = NULL; /* XXX gcc4.8 */
568           struct vnode *vp;
569           void *data_buf = data;
570           bool vfsopsrele = false;
571           size_t alloc_sz = 0;
572           int error;
573 
574           /*
575            * Get vnode to be covered
576            */
577           error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
578           if (error != 0) {
579                     vp = NULL;
580                     goto done;
581           }
582 
583           if (flags & (MNT_GETARGS | MNT_UPDATE)) {
584                     vfsops = vp->v_mount->mnt_op;
585           } else {
586                     /* 'type' is userspace */
587                     error = mount_get_vfsops(type, type_seg, &vfsops);
588                     if (error != 0)
589                               goto done;
590                     vfsopsrele = true;
591           }
592 
593           /*
594            * We allow data to be NULL, even for userspace. Some fs's don't need
595            * it. The others will handle NULL.
596            */
597           if (data != NULL && data_seg == UIO_USERSPACE) {
598                     if (data_len == 0) {
599                               /* No length supplied, use default for filesystem */
600                               data_len = vfsops->vfs_min_mount_data;
601 
602                               /*
603                                * Hopefully a longer buffer won't make copyin() fail.
604                                * For compatibility with 3.0 and earlier.
605                                */
606                               if (flags & MNT_UPDATE
607                                   && data_len < sizeof (struct mnt_export_args30))
608                                         data_len = sizeof (struct mnt_export_args30);
609                     }
610                     if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
611                               error = EINVAL;
612                               goto done;
613                     }
614                     alloc_sz = data_len;
615                     data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
616 
617                     /* NFS needs the buffer even for mnt_getargs .... */
618                     error = copyin(data, data_buf, data_len);
619                     if (error != 0)
620                               goto done;
621           }
622 
623           if (flags & MNT_GETARGS) {
624                     if (data_len == 0) {
625                               error = EINVAL;
626                               goto done;
627                     }
628                     error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
629                     if (error != 0)
630                               goto done;
631                     if (data_seg == UIO_USERSPACE)
632                               error = copyout(data_buf, data, data_len);
633                     *retval = data_len;
634           } else if (flags & MNT_UPDATE) {
635                     error = mount_update(l, vp, path, flags, data_buf, &data_len);
636           } else {
637                     /* Locking is handled internally in mount_domount(). */
638                     KASSERT(vfsopsrele == true);
639                     error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
640                         &data_len);
641                     vfsopsrele = false;
642           }
643           if (!error) {
644                     mutex_enter(&fs_klist_lock);
645                     KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT);
646                     mutex_exit(&fs_klist_lock);
647           }
648 
649 done:
650           if (vfsopsrele)
651                     vfs_delref(vfsops);
652           if (vp != NULL) {
653                     vrele(vp);
654           }
655           if (data_buf != data)
656                     kmem_free(data_buf, alloc_sz);
657           return (error);
658 }
659 
660 /*
661  * Unmount a file system.
662  *
663  * Note: unmount takes a path to the vnode mounted on as argument,
664  * not special file (as before).
665  */
666 /* ARGSUSED */
667 int
sys_unmount(struct lwp * l,const struct sys_unmount_args * uap,register_t * retval)668 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap,
669     register_t *retval)
670 {
671           /* {
672                     syscallarg(const char *) path;
673                     syscallarg(int) flags;
674           } */
675           struct vnode *vp;
676           struct mount *mp;
677           int error;
678           struct pathbuf *pb;
679           struct nameidata nd;
680 
681           error = pathbuf_copyin(SCARG(uap, path), &pb);
682           if (error) {
683                     return error;
684           }
685 
686           NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
687           if ((error = namei(&nd)) != 0) {
688                     pathbuf_destroy(pb);
689                     return error;
690           }
691           vp = nd.ni_vp;
692           pathbuf_destroy(pb);
693 
694           mp = vp->v_mount;
695           vfs_ref(mp);
696           VOP_UNLOCK(vp);
697 
698           error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
699               KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
700           if (error) {
701                     vrele(vp);
702                     vfs_rele(mp);
703                     return (error);
704           }
705 
706           /*
707            * Don't allow unmounting the root file system.
708            */
709           if (mp->mnt_flag & MNT_ROOTFS) {
710                     vrele(vp);
711                     vfs_rele(mp);
712                     return (EINVAL);
713           }
714 
715           /*
716            * Must be the root of the filesystem
717            */
718           if ((vp->v_vflag & VV_ROOT) == 0) {
719                     vrele(vp);
720                     vfs_rele(mp);
721                     return (EINVAL);
722           }
723 
724           vrele(vp);
725           error = dounmount(mp, SCARG(uap, flags), l);
726           vfs_rele(mp);
727           if (!error) {
728                     mutex_enter(&fs_klist_lock);
729                     KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT);
730                     mutex_exit(&fs_klist_lock);
731           }
732           return error;
733 }
734 
735 /*
736  * Sync each mounted filesystem.
737  */
738 #ifdef DEBUG
739 int syncprt = 0;
740 struct ctldebug debug0 = { "syncprt", &syncprt };
741 #endif
742 
743 void
do_sys_sync(struct lwp * l)744 do_sys_sync(struct lwp *l)
745 {
746           mount_iterator_t *iter;
747           struct mount *mp;
748           int asyncflag;
749 
750           mountlist_iterator_init(&iter);
751           while ((mp = mountlist_iterator_next(iter)) != NULL) {
752                     mutex_enter(mp->mnt_updating);
753                     if ((mp->mnt_flag & MNT_RDONLY) == 0) {
754                               /*
755                                * Temporarily clear the MNT_ASYNC flags so that
756                                * bwrite() doesnt convert the sync writes to
757                                * delayed writes.
758                                */
759                               asyncflag = mp->mnt_flag & MNT_ASYNC;
760                               mp->mnt_flag &= ~MNT_ASYNC;
761                               VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
762                               mp->mnt_flag |= asyncflag;
763                     }
764                     mutex_exit(mp->mnt_updating);
765           }
766           mountlist_iterator_destroy(iter);
767 #ifdef DEBUG
768           if (syncprt)
769                     vfs_bufstats();
770 #endif /* DEBUG */
771 }
772 
773 static bool
sync_vnode_filter(void * cookie,vnode_t * vp)774 sync_vnode_filter(void *cookie, vnode_t *vp)
775 {
776 
777           if (vp->v_numoutput > 0) {
778                     ++*(int *)cookie;
779           }
780           return false;
781 }
782 
783 int
vfs_syncwait(void)784 vfs_syncwait(void)
785 {
786           int nbusy, nbusy_prev, iter;
787           struct vnode_iterator *vniter;
788           mount_iterator_t *mpiter;
789           struct mount *mp;
790 
791           for (nbusy_prev = 0, iter = 0; iter < 20;) {
792                     nbusy = 0;
793                     mountlist_iterator_init(&mpiter);
794                     while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
795                               vnode_t *vp __diagused;
796                               vfs_vnode_iterator_init(mp, &vniter);
797                               vp = vfs_vnode_iterator_next(vniter,
798                                   sync_vnode_filter, &nbusy);
799                               KASSERT(vp == NULL);
800                               vfs_vnode_iterator_destroy(vniter);
801                     }
802                     mountlist_iterator_destroy(mpiter);
803 
804                     if (nbusy == 0)
805                               break;
806                     if (nbusy_prev == 0)
807                               nbusy_prev = nbusy;
808                     printf("%d ", nbusy);
809                     kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
810                     if (nbusy >= nbusy_prev) /* we didn't flush anything */
811                               iter++;
812                     else
813                               nbusy_prev = nbusy;
814           }
815 
816           if (nbusy) {
817 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
818                     printf("giving up\nPrinting vnodes for busy buffers\n");
819                     mountlist_iterator_init(&mpiter);
820                     while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
821                               vnode_t *vp;
822                               vfs_vnode_iterator_init(mp, &vniter);
823                               vp = vfs_vnode_iterator_next(vniter,
824                                   NULL, NULL);
825                               mutex_enter(vp->v_interlock);
826                               if (vp->v_numoutput > 0)
827                                         vprint(NULL, vp);
828                               mutex_exit(vp->v_interlock);
829                               vrele(vp);
830                               vfs_vnode_iterator_destroy(vniter);
831                     }
832                     mountlist_iterator_destroy(mpiter);
833 #endif
834           }
835 
836           return nbusy;
837 }
838 
839 /* ARGSUSED */
840 int
sys_sync(struct lwp * l,const void * v,register_t * retval)841 sys_sync(struct lwp *l, const void *v, register_t *retval)
842 {
843 
844           do_sys_sync(l);
845           return (0);
846 }
847 
848 /*
849  * Access or change filesystem quotas.
850  *
851  * (this is really 14 different calls bundled into one)
852  */
853 
854 static int
do_sys_quotactl_stat(struct mount * mp,struct quotastat * info_u)855 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
856 {
857           struct quotastat info_k;
858           int error;
859 
860           /* ensure any padding bytes are cleared */
861           memset(&info_k, 0, sizeof(info_k));
862 
863           error = vfs_quotactl_stat(mp, &info_k);
864           if (error) {
865                     return error;
866           }
867 
868           return copyout(&info_k, info_u, sizeof(info_k));
869 }
870 
871 static int
do_sys_quotactl_idtypestat(struct mount * mp,int idtype,struct quotaidtypestat * info_u)872 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
873     struct quotaidtypestat *info_u)
874 {
875           struct quotaidtypestat info_k;
876           int error;
877 
878           /* ensure any padding bytes are cleared */
879           memset(&info_k, 0, sizeof(info_k));
880 
881           error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
882           if (error) {
883                     return error;
884           }
885 
886           return copyout(&info_k, info_u, sizeof(info_k));
887 }
888 
889 static int
do_sys_quotactl_objtypestat(struct mount * mp,int objtype,struct quotaobjtypestat * info_u)890 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
891     struct quotaobjtypestat *info_u)
892 {
893           struct quotaobjtypestat info_k;
894           int error;
895 
896           /* ensure any padding bytes are cleared */
897           memset(&info_k, 0, sizeof(info_k));
898 
899           error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
900           if (error) {
901                     return error;
902           }
903 
904           return copyout(&info_k, info_u, sizeof(info_k));
905 }
906 
907 static int
do_sys_quotactl_get(struct mount * mp,const struct quotakey * key_u,struct quotaval * val_u)908 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
909     struct quotaval *val_u)
910 {
911           struct quotakey key_k;
912           struct quotaval val_k;
913           int error;
914 
915           /* ensure any padding bytes are cleared */
916           memset(&val_k, 0, sizeof(val_k));
917 
918           error = copyin(key_u, &key_k, sizeof(key_k));
919           if (error) {
920                     return error;
921           }
922 
923           error = vfs_quotactl_get(mp, &key_k, &val_k);
924           if (error) {
925                     return error;
926           }
927 
928           return copyout(&val_k, val_u, sizeof(val_k));
929 }
930 
931 static int
do_sys_quotactl_put(struct mount * mp,const struct quotakey * key_u,const struct quotaval * val_u)932 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
933     const struct quotaval *val_u)
934 {
935           struct quotakey key_k;
936           struct quotaval val_k;
937           int error;
938 
939           error = copyin(key_u, &key_k, sizeof(key_k));
940           if (error) {
941                     return error;
942           }
943 
944           error = copyin(val_u, &val_k, sizeof(val_k));
945           if (error) {
946                     return error;
947           }
948 
949           return vfs_quotactl_put(mp, &key_k, &val_k);
950 }
951 
952 static int
do_sys_quotactl_del(struct mount * mp,const struct quotakey * key_u)953 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
954 {
955           struct quotakey key_k;
956           int error;
957 
958           error = copyin(key_u, &key_k, sizeof(key_k));
959           if (error) {
960                     return error;
961           }
962 
963           return vfs_quotactl_del(mp, &key_k);
964 }
965 
966 static int
do_sys_quotactl_cursoropen(struct mount * mp,struct quotakcursor * cursor_u)967 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
968 {
969           struct quotakcursor cursor_k;
970           int error;
971 
972           /* ensure any padding bytes are cleared */
973           memset(&cursor_k, 0, sizeof(cursor_k));
974 
975           error = vfs_quotactl_cursoropen(mp, &cursor_k);
976           if (error) {
977                     return error;
978           }
979 
980           return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
981 }
982 
983 static int
do_sys_quotactl_cursorclose(struct mount * mp,struct quotakcursor * cursor_u)984 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
985 {
986           struct quotakcursor cursor_k;
987           int error;
988 
989           error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
990           if (error) {
991                     return error;
992           }
993 
994           return vfs_quotactl_cursorclose(mp, &cursor_k);
995 }
996 
997 static int
do_sys_quotactl_cursorskipidtype(struct mount * mp,struct quotakcursor * cursor_u,int idtype)998 do_sys_quotactl_cursorskipidtype(struct mount *mp,
999     struct quotakcursor *cursor_u, int idtype)
1000 {
1001           struct quotakcursor cursor_k;
1002           int error;
1003 
1004           error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1005           if (error) {
1006                     return error;
1007           }
1008 
1009           error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
1010           if (error) {
1011                     return error;
1012           }
1013 
1014           return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1015 }
1016 
1017 static int
do_sys_quotactl_cursorget(struct mount * mp,struct quotakcursor * cursor_u,struct quotakey * keys_u,struct quotaval * vals_u,unsigned maxnum,unsigned * ret_u)1018 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
1019     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
1020     unsigned *ret_u)
1021 {
1022 #define CGET_STACK_MAX 8
1023           struct quotakcursor cursor_k;
1024           struct quotakey stackkeys[CGET_STACK_MAX];
1025           struct quotaval stackvals[CGET_STACK_MAX];
1026           struct quotakey *keys_k;
1027           struct quotaval *vals_k;
1028           unsigned ret_k;
1029           int error;
1030 
1031           if (maxnum > 128) {
1032                     maxnum = 128;
1033           }
1034 
1035           error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1036           if (error) {
1037                     return error;
1038           }
1039 
1040           if (maxnum <= CGET_STACK_MAX) {
1041                     keys_k = stackkeys;
1042                     vals_k = stackvals;
1043                     /* ensure any padding bytes are cleared */
1044                     memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
1045                     memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
1046           } else {
1047                     keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
1048                     vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
1049           }
1050 
1051           error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
1052               &ret_k);
1053           if (error) {
1054                     goto fail;
1055           }
1056 
1057           error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
1058           if (error) {
1059                     goto fail;
1060           }
1061 
1062           error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
1063           if (error) {
1064                     goto fail;
1065           }
1066 
1067           error = copyout(&ret_k, ret_u, sizeof(ret_k));
1068           if (error) {
1069                     goto fail;
1070           }
1071 
1072           /* do last to maximize the chance of being able to recover a failure */
1073           error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1074 
1075 fail:
1076           if (keys_k != stackkeys) {
1077                     kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
1078           }
1079           if (vals_k != stackvals) {
1080                     kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
1081           }
1082           return error;
1083 }
1084 
1085 static int
do_sys_quotactl_cursoratend(struct mount * mp,struct quotakcursor * cursor_u,int * ret_u)1086 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
1087     int *ret_u)
1088 {
1089           struct quotakcursor cursor_k;
1090           int ret_k;
1091           int error;
1092 
1093           error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1094           if (error) {
1095                     return error;
1096           }
1097 
1098           error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
1099           if (error) {
1100                     return error;
1101           }
1102 
1103           error = copyout(&ret_k, ret_u, sizeof(ret_k));
1104           if (error) {
1105                     return error;
1106           }
1107 
1108           return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1109 }
1110 
1111 static int
do_sys_quotactl_cursorrewind(struct mount * mp,struct quotakcursor * cursor_u)1112 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
1113 {
1114           struct quotakcursor cursor_k;
1115           int error;
1116 
1117           error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
1118           if (error) {
1119                     return error;
1120           }
1121 
1122           error = vfs_quotactl_cursorrewind(mp, &cursor_k);
1123           if (error) {
1124                     return error;
1125           }
1126 
1127           return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
1128 }
1129 
1130 static int
do_sys_quotactl_quotaon(struct mount * mp,int idtype,const char * path_u)1131 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
1132 {
1133           char *path_k;
1134           int error;
1135 
1136           /* XXX this should probably be a struct pathbuf */
1137           path_k = PNBUF_GET();
1138           error = copyin(path_u, path_k, PATH_MAX);
1139           if (error) {
1140                     PNBUF_PUT(path_k);
1141                     return error;
1142           }
1143 
1144           error = vfs_quotactl_quotaon(mp, idtype, path_k);
1145 
1146           PNBUF_PUT(path_k);
1147           return error;
1148 }
1149 
/*
 * QUOTACTL_QUOTAOFF: no user memory is involved, so this simply
 * forwards to vfs_quotactl_quotaoff() and returns its result.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	int rv;

	rv = vfs_quotactl_quotaoff(mp, idtype);
	return rv;
}
1156 
1157 int
do_sys_quotactl(const char * path_u,const struct quotactl_args * args)1158 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
1159 {
1160           struct mount *mp;
1161           struct vnode *vp;
1162           int error;
1163 
1164           error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
1165           if (error != 0)
1166                     return (error);
1167           mp = vp->v_mount;
1168 
1169           switch (args->qc_op) {
1170           case QUOTACTL_STAT:
1171                     error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
1172                     break;
1173           case QUOTACTL_IDTYPESTAT:
1174                     error = do_sys_quotactl_idtypestat(mp,
1175                         args->u.idtypestat.qc_idtype,
1176                         args->u.idtypestat.qc_info);
1177                     break;
1178           case QUOTACTL_OBJTYPESTAT:
1179                     error = do_sys_quotactl_objtypestat(mp,
1180                         args->u.objtypestat.qc_objtype,
1181                         args->u.objtypestat.qc_info);
1182                     break;
1183           case QUOTACTL_GET:
1184                     error = do_sys_quotactl_get(mp,
1185                         args->u.get.qc_key,
1186                         args->u.get.qc_val);
1187                     break;
1188           case QUOTACTL_PUT:
1189                     error = do_sys_quotactl_put(mp,
1190                         args->u.put.qc_key,
1191                         args->u.put.qc_val);
1192                     break;
1193           case QUOTACTL_DEL:
1194                     error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1195                     break;
1196           case QUOTACTL_CURSOROPEN:
1197                     error = do_sys_quotactl_cursoropen(mp,
1198                         args->u.cursoropen.qc_cursor);
1199                     break;
1200           case QUOTACTL_CURSORCLOSE:
1201                     error = do_sys_quotactl_cursorclose(mp,
1202                         args->u.cursorclose.qc_cursor);
1203                     break;
1204           case QUOTACTL_CURSORSKIPIDTYPE:
1205                     error = do_sys_quotactl_cursorskipidtype(mp,
1206                         args->u.cursorskipidtype.qc_cursor,
1207                         args->u.cursorskipidtype.qc_idtype);
1208                     break;
1209           case QUOTACTL_CURSORGET:
1210                     error = do_sys_quotactl_cursorget(mp,
1211                         args->u.cursorget.qc_cursor,
1212                         args->u.cursorget.qc_keys,
1213                         args->u.cursorget.qc_vals,
1214                         args->u.cursorget.qc_maxnum,
1215                         args->u.cursorget.qc_ret);
1216                     break;
1217           case QUOTACTL_CURSORATEND:
1218                     error = do_sys_quotactl_cursoratend(mp,
1219                         args->u.cursoratend.qc_cursor,
1220                         args->u.cursoratend.qc_ret);
1221                     break;
1222           case QUOTACTL_CURSORREWIND:
1223                     error = do_sys_quotactl_cursorrewind(mp,
1224                         args->u.cursorrewind.qc_cursor);
1225                     break;
1226           case QUOTACTL_QUOTAON:
1227                     error = do_sys_quotactl_quotaon(mp,
1228                         args->u.quotaon.qc_idtype,
1229                         args->u.quotaon.qc_quotafile);
1230                     break;
1231           case QUOTACTL_QUOTAOFF:
1232                     error = do_sys_quotactl_quotaoff(mp,
1233                         args->u.quotaoff.qc_idtype);
1234                     break;
1235           default:
1236                     error = EINVAL;
1237                     break;
1238           }
1239 
1240           vrele(vp);
1241           return error;
1242 }
1243 
1244 /* ARGSUSED */
1245 int
sys___quotactl(struct lwp * l,const struct sys___quotactl_args * uap,register_t * retval)1246 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
1247     register_t *retval)
1248 {
1249           /* {
1250                     syscallarg(const char *) path;
1251                     syscallarg(struct quotactl_args *) args;
1252           } */
1253           struct quotactl_args args;
1254           int error;
1255 
1256           error = copyin(SCARG(uap, args), &args, sizeof(args));
1257           if (error) {
1258                     return error;
1259           }
1260 
1261           return do_sys_quotactl(SCARG(uap, path), &args);
1262 }
1263 
1264 int
dostatvfs(struct mount * mp,struct statvfs * sp,struct lwp * l,int flags,int root)1265 dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
1266     int root)
1267 {
1268           struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1269           bool chrooted;
1270           int error = 0;
1271 
1272           KASSERT(l == curlwp);
1273 
1274           /*
1275            * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
1276            * since it would imply chroots can be escaped.  Just make sure this
1277            * routine is self-consistent.
1278            */
1279           chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);
1280 
1281           /*
1282            * If MNT_NOWAIT or MNT_LAZY is specified, do not
1283            * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1284            * overrides MNT_NOWAIT.
1285            */
1286           if (flags == MNT_NOWAIT       || flags == MNT_LAZY ||
1287               (flags != MNT_WAIT && flags != 0)) {
1288                     memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1289           } else {
1290                     /* Get the filesystem stats now */
1291                     memset(sp, 0, sizeof(*sp));
1292                     if ((error = VFS_STATVFS(mp, sp)) != 0)
1293                               return error;
1294                     if (!chrooted)
1295                               (void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1296           }
1297 
1298           if (chrooted) {
1299                     size_t len;
1300                     char *bp;
1301                     char c;
1302                     char *path = PNBUF_GET();
1303 
1304                     bp = path + MAXPATHLEN;
1305                     *--bp = '\0';
1306                     rw_enter(&cwdi->cwdi_lock, RW_READER);
1307                     error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1308                         MAXPATHLEN / 2, 0, l);
1309                     rw_exit(&cwdi->cwdi_lock);
1310                     if (error) {
1311                               PNBUF_PUT(path);
1312                               return error;
1313                     }
1314                     len = strlen(bp);
1315                     if (len != 1) {
1316                               /*
1317                                * for mount points that are below our root, we can see
1318                                * them, so we fix up the pathname and return them. The
1319                                * rest we cannot see, so we don't allow viewing the
1320                                * data.
1321                                */
1322                               if (strncmp(bp, sp->f_mntonname, len) == 0 &&
1323                                   ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
1324                                         (void)strlcpy(sp->f_mntonname,
1325                                             c == '\0' ? "/" : &sp->f_mntonname[len],
1326                                             sizeof(sp->f_mntonname));
1327                               } else {
1328                                         if (root)
1329                                                   (void)strlcpy(sp->f_mntonname, "/",
1330                                                       sizeof(sp->f_mntonname));
1331                                         else
1332                                                   error = EPERM;
1333                               }
1334                     }
1335                     PNBUF_PUT(path);
1336           }
1337           sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1338           return error;
1339 }
1340 
1341 /*
1342  * Get filesystem statistics by path.
1343  */
1344 int
do_sys_pstatvfs(struct lwp * l,const char * path,int flags,struct statvfs * sb)1345 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
1346 {
1347           struct mount *mp;
1348           int error;
1349           struct vnode *vp;
1350 
1351           error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1352           if (error != 0)
1353                     return error;
1354           mp = vp->v_mount;
1355           error = dostatvfs(mp, sb, l, flags, 1);
1356           vrele(vp);
1357           return error;
1358 }
1359 
1360 /* ARGSUSED */
1361 int
sys___statvfs190(struct lwp * l,const struct sys___statvfs190_args * uap,register_t * retval)1362 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap,
1363     register_t *retval)
1364 {
1365           /* {
1366                     syscallarg(const char *) path;
1367                     syscallarg(struct statvfs *) buf;
1368                     syscallarg(int) flags;
1369           } */
1370           struct statvfs *sb;
1371           int error;
1372 
1373           sb = STATVFSBUF_GET();
1374           error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1375           if (error == 0)
1376                     error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1377           STATVFSBUF_PUT(sb);
1378           return error;
1379 }
1380 
1381 /*
1382  * Get filesystem statistics by fd.
1383  */
1384 int
do_sys_fstatvfs(struct lwp * l,int fd,int flags,struct statvfs * sb)1385 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
1386 {
1387           file_t *fp;
1388           struct mount *mp;
1389           int error;
1390 
1391           /* fd_getvnode() will use the descriptor for us */
1392           if ((error = fd_getvnode(fd, &fp)) != 0)
1393                     return (error);
1394           mp = fp->f_vnode->v_mount;
1395           error = dostatvfs(mp, sb, curlwp, flags, 1);
1396           fd_putfile(fd);
1397           return error;
1398 }
1399 
1400 /* ARGSUSED */
1401 int
sys___fstatvfs190(struct lwp * l,const struct sys___fstatvfs190_args * uap,register_t * retval)1402 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap,
1403     register_t *retval)
1404 {
1405           /* {
1406                     syscallarg(int) fd;
1407                     syscallarg(struct statvfs *) buf;
1408                     syscallarg(int) flags;
1409           } */
1410           struct statvfs *sb;
1411           int error;
1412 
1413           sb = STATVFSBUF_GET();
1414           error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1415           if (error == 0)
1416                     error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1417           STATVFSBUF_PUT(sb);
1418           return error;
1419 }
1420 
1421 /*
1422  * Get statistics on all filesystems.
1423  */
1424 int
do_sys_getvfsstat(struct lwp * l,void * sfsp,size_t bufsize,int flags,int (* copyfn)(const void *,void *,size_t),size_t entry_sz,register_t * retval)1425 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
1426     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
1427     register_t *retval)
1428 {
1429           int root = 0;
1430           mount_iterator_t *iter;
1431           struct proc *p = l->l_proc;
1432           struct mount *mp;
1433           struct statvfs *sb;
1434           size_t count, maxcount;
1435           int error = 0;
1436 
1437           sb = STATVFSBUF_GET();
1438           maxcount = bufsize / entry_sz;
1439           count = 0;
1440           mountlist_iterator_init(&iter);
1441           while ((mp = mountlist_iterator_next(iter)) != NULL) {
1442                     if (sfsp && count < maxcount) {
1443                               error = dostatvfs(mp, sb, l, flags, 0);
1444                               if (error) {
1445                                         error = 0;
1446                                         continue;
1447                               }
1448                               error = copyfn(sb, sfsp, entry_sz);
1449                               if (error)
1450                                         goto out;
1451                               sfsp = (char *)sfsp + entry_sz;
1452                               root |= strcmp(sb->f_mntonname, "/") == 0;
1453                     }
1454                     count++;
1455           }
1456 
1457           if (root == 0 && p->p_cwdi->cwdi_rdir) {
1458                     /*
1459                      * fake a root entry
1460                      */
1461                     error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1462                         sb, l, flags, 1);
1463                     if (error != 0)
1464                               goto out;
1465                     if (sfsp) {
1466                               error = copyfn(sb, sfsp, entry_sz);
1467                               if (error != 0)
1468                                         goto out;
1469                     }
1470                     count++;
1471           }
1472           if (sfsp && count > maxcount)
1473                     *retval = maxcount;
1474           else
1475                     *retval = count;
1476 out:
1477           mountlist_iterator_destroy(iter);
1478           STATVFSBUF_PUT(sb);
1479           return error;
1480 }
1481 
1482 int
sys___getvfsstat90(struct lwp * l,const struct sys___getvfsstat90_args * uap,register_t * retval)1483 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
1484     register_t *retval)
1485 {
1486           /* {
1487                     syscallarg(struct statvfs *) buf;
1488                     syscallarg(size_t) bufsize;
1489                     syscallarg(int) flags;
1490           } */
1491 
1492           return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1493               SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1494 }
1495 
1496 /*
1497  * Change current working directory to a given file descriptor.
1498  */
1499 int
do_sys_fchdir(struct lwp * l,int fd,register_t * retval)1500 do_sys_fchdir(struct lwp *l, int fd, register_t *retval)
1501 {
1502           struct proc *p = l->l_proc;
1503           struct cwdinfo *cwdi;
1504           struct vnode *vp, *tdp;
1505           struct mount *mp;
1506           file_t *fp;
1507           int error;
1508 
1509           /* fd_getvnode() will use the descriptor for us */
1510           if ((error = fd_getvnode(fd, &fp)) != 0)
1511                     return error;
1512           vp = fp->f_vnode;
1513 
1514           vref(vp);
1515           vn_lock(vp, LK_SHARED | LK_RETRY);
1516           if (vp->v_type != VDIR)
1517                     error = ENOTDIR;
1518           else
1519                     error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1520           if (error) {
1521                     vput(vp);
1522                     goto out;
1523           }
1524           while ((mp = vp->v_mountedhere) != NULL) {
1525                     error = vfs_busy(mp);
1526                     vput(vp);
1527                     if (error != 0)
1528                               goto out;
1529                     error = VFS_ROOT(mp, LK_SHARED, &tdp);
1530                     vfs_unbusy(mp);
1531                     if (error)
1532                               goto out;
1533                     vp = tdp;
1534           }
1535           VOP_UNLOCK(vp);
1536 
1537           /*
1538            * Disallow changing to a directory not under the process's
1539            * current root directory (if there is one).
1540            */
1541           cwdi = p->p_cwdi;
1542           rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1543           if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1544                     vrele(vp);
1545                     error = EPERM;      /* operation not permitted */
1546           } else {
1547                     vrele(cwdi->cwdi_cdir);
1548                     cwdi->cwdi_cdir = vp;
1549           }
1550           rw_exit(&cwdi->cwdi_lock);
1551 
1552 out:
1553           fd_putfile(fd);
1554           return error;
1555 }
1556 
1557 /*
1558  * Change current working directory to a given file descriptor.
1559  */
1560 /* ARGSUSED */
1561 int
sys_fchdir(struct lwp * l,const struct sys_fchdir_args * uap,register_t * retval)1562 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap,
1563     register_t *retval)
1564 {
1565           /* {
1566                     syscallarg(int) fd;
1567           } */
1568 
1569           return do_sys_fchdir(l, SCARG(uap, fd), retval);
1570 }
1571 
1572 /*
1573  * Change this process's notion of the root directory to a given file
1574  * descriptor.
1575  */
1576 int
sys_fchroot(struct lwp * l,const struct sys_fchroot_args * uap,register_t * retval)1577 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap,
1578     register_t *retval)
1579 {
1580           struct vnode        *vp;
1581           file_t    *fp;
1582           int                  error, fd = SCARG(uap, fd);
1583 
1584           if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1585                         KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
1586                     return error;
1587           /* fd_getvnode() will use the descriptor for us */
1588           if ((error = fd_getvnode(fd, &fp)) != 0)
1589                     return error;
1590           vp = fp->f_vnode;
1591           vn_lock(vp, LK_SHARED | LK_RETRY);
1592           if (vp->v_type != VDIR)
1593                     error = ENOTDIR;
1594           else
1595                     error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1596           VOP_UNLOCK(vp);
1597           if (error)
1598                     goto out;
1599           vref(vp);
1600           change_root(vp);
1601 
1602 out:
1603           fd_putfile(fd);
1604           return (error);
1605 }
1606 
1607 /*
1608  * Change current working directory (``.'').
1609  */
1610 int
do_sys_chdir(struct lwp * l,const char * path,enum uio_seg seg,register_t * retval)1611 do_sys_chdir(struct lwp *l, const char *path, enum uio_seg seg,
1612     register_t *retval)
1613 {
1614           struct proc *p = l->l_proc;
1615           struct cwdinfo * cwdi;
1616           int error;
1617           struct vnode *vp;
1618 
1619           if ((error = chdir_lookup(path, seg, &vp, l)) != 0)
1620                     return error;
1621           cwdi = p->p_cwdi;
1622           rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1623           vrele(cwdi->cwdi_cdir);
1624           cwdi->cwdi_cdir = vp;
1625           rw_exit(&cwdi->cwdi_lock);
1626           return 0;
1627 }
1628 
1629 /*
1630  * Change current working directory (``.'').
1631  */
1632 /* ARGSUSED */
1633 int
sys_chdir(struct lwp * l,const struct sys_chdir_args * uap,register_t * retval)1634 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
1635 {
1636           /* {
1637                     syscallarg(const char *) path;
1638           } */
1639 
1640           return do_sys_chdir(l, SCARG(uap, path), UIO_USERSPACE, retval);
1641 }
1642 
1643 /*
1644  * Change notion of root (``/'') directory.
1645  */
1646 /* ARGSUSED */
1647 int
sys_chroot(struct lwp * l,const struct sys_chroot_args * uap,register_t * retval)1648 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap,
1649     register_t *retval)
1650 {
1651           /* {
1652                     syscallarg(const char *) path;
1653           } */
1654           int error;
1655           struct vnode *vp;
1656 
1657           if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1658                         KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
1659                     return (error);
1660 
1661           error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
1662           if (error == 0)
1663                     change_root(vp);
1664           return error;
1665 }
1666 
1667 /*
1668  * Common routine for chroot and fchroot.
1669  * NB: callers need to properly authorize the change root operation.
1670  */
1671 void
change_root(struct vnode * vp)1672 change_root(struct vnode *vp)
1673 {
1674           kauth_cred_t ncred;
1675           struct lwp *l = curlwp;
1676           struct proc *p = l->l_proc;
1677           struct cwdinfo *cwdi = p->p_cwdi;
1678 
1679           ncred = kauth_cred_alloc();
1680 
1681           rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1682           if (cwdi->cwdi_rdir != NULL)
1683                     vrele(cwdi->cwdi_rdir);
1684           cwdi->cwdi_rdir = vp;
1685 
1686           /*
1687            * Prevent escaping from chroot by putting the root under
1688            * the working directory.  Silently chdir to / if we aren't
1689            * already there.
1690            */
1691           if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1692                     /*
1693                      * XXX would be more failsafe to change directory to a
1694                      * deadfs node here instead
1695                      */
1696                     vrele(cwdi->cwdi_cdir);
1697                     vref(vp);
1698                     cwdi->cwdi_cdir = vp;
1699           }
1700           rw_exit(&cwdi->cwdi_lock);
1701 
1702           /* Get a write lock on the process credential. */
1703           proc_crmod_enter();
1704 
1705           kauth_cred_clone(p->p_cred, ncred);
1706           kauth_proc_chroot(ncred, p->p_cwdi);
1707 
1708           /* Broadcast our credentials to the process and other LWPs. */
1709           proc_crmod_leave(ncred, p->p_cred, true);
1710 }
1711 
1712 /*
1713  * Common routine for chroot and chdir.
1714  * XXX "where" should be enum uio_seg
1715  */
1716 int
chdir_lookup(const char * path,int where,struct vnode ** vpp,struct lwp * l)1717 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
1718 {
1719           struct pathbuf *pb;
1720           struct nameidata nd;
1721           int error;
1722 
1723           error = pathbuf_maybe_copyin(path, where, &pb);
1724           if (error) {
1725                     return error;
1726           }
1727           NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
1728           if ((error = namei(&nd)) != 0) {
1729                     pathbuf_destroy(pb);
1730                     return error;
1731           }
1732           *vpp = nd.ni_vp;
1733           pathbuf_destroy(pb);
1734 
1735           if ((*vpp)->v_type != VDIR)
1736                     error = ENOTDIR;
1737           else
1738                     error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1739 
1740           if (error)
1741                     vput(*vpp);
1742           else
1743                     VOP_UNLOCK(*vpp);
1744           return (error);
1745 }
1746 
1747 /*
1748  * Internals of sys_open - path has already been converted into a pathbuf
1749  * (so we can easily reuse this function from other parts of the kernel,
1750  * like posix_spawn post-processing).
1751  */
1752 int
do_open(lwp_t * l,struct vnode * dvp,struct pathbuf * pb,int open_flags,int open_mode,int * fd)1753 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags,
1754     int open_mode, int *fd)
1755 {
1756           struct proc *p = l->l_proc;
1757           struct cwdinfo *cwdi = p->p_cwdi;
1758           file_t *fp;
1759           struct vnode *vp;
1760           int dupfd;
1761           bool dupfd_move;
1762           int flags, cmode;
1763           int indx, error;
1764 
1765           if (open_flags & O_SEARCH) {
1766                     open_flags &= ~(int)O_SEARCH;
1767           }
1768 
1769           /*
1770            * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1771            * may be specified.
1772            */
1773           if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
1774                     return EINVAL;
1775 
1776           flags = FFLAGS(open_flags);
1777           if ((flags & (FREAD | FWRITE)) == 0)
1778                     return EINVAL;
1779 
1780           if ((error = fd_allocfile(&fp, &indx)) != 0) {
1781                     return error;
1782           }
1783 
1784           /* We're going to read cwdi->cwdi_cmask unlocked here. */
1785           cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1786 
1787           error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
1788               &vp, &dupfd_move, &dupfd);
1789           if (error != 0) {
1790                     fd_abort(p, fp, indx);
1791                     return error;
1792           }
1793 
1794           if (vp == NULL) {
1795                     fd_abort(p, fp, indx);
1796                     error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
1797                     if (error)
1798                               return error;
1799                     *fd = indx;
1800           } else {
1801                     error = open_setfp(l, fp, vp, indx, flags);
1802                     if (error)
1803                               return error;
1804                     VOP_UNLOCK(vp);
1805                     *fd = indx;
1806                     fd_affix(p, fp, indx);
1807           }
1808 
1809           return 0;
1810 }
1811 
1812 int
fd_open(const char * path,int open_flags,int open_mode,int * fd)1813 fd_open(const char *path, int open_flags, int open_mode, int *fd)
1814 {
1815           struct pathbuf *pb;
1816           int error, oflags;
1817 
1818           oflags = FFLAGS(open_flags);
1819           if ((oflags & (FREAD | FWRITE)) == 0)
1820                     return EINVAL;
1821 
1822           pb = pathbuf_create(path);
1823           if (pb == NULL)
1824                     return ENOMEM;
1825 
1826           error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1827           pathbuf_destroy(pb);
1828 
1829           return error;
1830 }
1831 
1832 static int
do_sys_openat(lwp_t * l,int fdat,const char * path,int flags,int mode,int * fd)1833 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
1834     int mode, int *fd)
1835 {
1836           file_t *dfp = NULL;
1837           struct vnode *dvp = NULL;
1838           struct pathbuf *pb;
1839           const char *pathstring = NULL;
1840           int error;
1841 
1842           if (path == NULL) {
1843                     MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
1844                     if (error == ENOSYS)
1845                               goto no_compat;
1846                     if (error)
1847                               return error;
1848           } else {
1849 no_compat:
1850                     error = pathbuf_copyin(path, &pb);
1851                     if (error)
1852                               return error;
1853           }
1854 
1855           pathstring = pathbuf_stringcopy_get(pb);
1856 
1857           /*
1858            * fdat is ignored if:
1859            * 1) if fdat is AT_FDCWD, which means use current directory as base.
1860            * 2) if path is absolute, then fdat is useless.
1861            */
1862           if (fdat != AT_FDCWD && pathstring[0] != '/') {
1863                     /* fd_getvnode() will use the descriptor for us */
1864                     if ((error = fd_getvnode(fdat, &dfp)) != 0)
1865                               goto out;
1866 
1867                     dvp = dfp->f_vnode;
1868           }
1869 
1870           error = do_open(l, dvp, pb, flags, mode, fd);
1871 
1872           if (dfp != NULL)
1873                     fd_putfile(fdat);
1874 out:
1875           pathbuf_stringcopy_put(pb, pathstring);
1876           pathbuf_destroy(pb);
1877           return error;
1878 }
1879 
1880 int
sys_open(struct lwp * l,const struct sys_open_args * uap,register_t * retval)1881 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
1882 {
1883           /* {
1884                     syscallarg(const char *) path;
1885                     syscallarg(int) flags;
1886                     syscallarg(int) mode;
1887           } */
1888           int error;
1889           int fd;
1890 
1891           error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1892                                     SCARG(uap, flags), SCARG(uap, mode), &fd);
1893 
1894           if (error == 0)
1895                     *retval = fd;
1896 
1897           return error;
1898 }
1899 
1900 int
sys_openat(struct lwp * l,const struct sys_openat_args * uap,register_t * retval)1901 sys_openat(struct lwp *l, const struct sys_openat_args *uap,
1902     register_t *retval)
1903 {
1904           /* {
1905                     syscallarg(int) fd;
1906                     syscallarg(const char *) path;
1907                     syscallarg(int) oflags;
1908                     syscallarg(int) mode;
1909           } */
1910           int error;
1911           int fd;
1912 
1913           error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1914               SCARG(uap, oflags), SCARG(uap, mode), &fd);
1915 
1916           if (error == 0)
1917                     *retval = fd;
1918 
1919           return error;
1920 }
1921 
1922 static void
vfs__fhfree(fhandle_t * fhp)1923 vfs__fhfree(fhandle_t *fhp)
1924 {
1925           size_t fhsize;
1926 
1927           fhsize = FHANDLE_SIZE(fhp);
1928           kmem_free(fhp, fhsize);
1929 }
1930 
1931 /*
1932  * vfs_composefh: compose a filehandle.
1933  */
1934 
1935 int
vfs_composefh(struct vnode * vp,fhandle_t * fhp,size_t * fh_size)1936 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
1937 {
1938           struct mount *mp;
1939           struct fid *fidp;
1940           int error;
1941           size_t needfhsize;
1942           size_t fidsize;
1943 
1944           mp = vp->v_mount;
1945           fidp = NULL;
1946           if (*fh_size < FHANDLE_SIZE_MIN) {
1947                     fidsize = 0;
1948           } else {
1949                     fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1950                     if (fhp != NULL) {
1951                               memset(fhp, 0, *fh_size);
1952                               fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1953                               fidp = &fhp->fh_fid;
1954                     }
1955           }
1956           error = VFS_VPTOFH(vp, fidp, &fidsize);
1957           needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1958           if (error == 0 && *fh_size < needfhsize) {
1959                     error = E2BIG;
1960           }
1961           *fh_size = needfhsize;
1962           return error;
1963 }
1964 
1965 int
vfs_composefh_alloc(struct vnode * vp,fhandle_t ** fhpp)1966 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
1967 {
1968           struct mount *mp;
1969           fhandle_t *fhp;
1970           size_t fhsize;
1971           size_t fidsize;
1972           int error;
1973 
1974           mp = vp->v_mount;
1975           fidsize = 0;
1976           error = VFS_VPTOFH(vp, NULL, &fidsize);
1977           KASSERT(error != 0);
1978           if (error != E2BIG) {
1979                     goto out;
1980           }
1981           fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1982           fhp = kmem_zalloc(fhsize, KM_SLEEP);
1983           fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1984           error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1985           if (error == 0) {
1986                     KASSERT(FHANDLE_SIZE(fhp) == fhsize);
1987                     KASSERT(FHANDLE_FILEID(fhp)->fid_len == fidsize);
1988                     *fhpp = fhp;
1989           } else {
1990                     kmem_free(fhp, fhsize);
1991           }
1992 out:
1993           return error;
1994 }
1995 
1996 void
vfs_composefh_free(fhandle_t * fhp)1997 vfs_composefh_free(fhandle_t *fhp)
1998 {
1999 
2000           vfs__fhfree(fhp);
2001 }
2002 
2003 /*
2004  * vfs_fhtovp: lookup a vnode by a filehandle.
2005  */
2006 
2007 int
vfs_fhtovp(fhandle_t * fhp,struct vnode ** vpp)2008 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
2009 {
2010           struct mount *mp;
2011           int error;
2012 
2013           *vpp = NULL;
2014           mp = vfs_getvfs(FHANDLE_FSID(fhp));
2015           if (mp == NULL) {
2016                     error = ESTALE;
2017                     goto out;
2018           }
2019           if (mp->mnt_op->vfs_fhtovp == NULL) {
2020                     error = EOPNOTSUPP;
2021                     goto out;
2022           }
2023           error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
2024 out:
2025           return error;
2026 }
2027 
2028 /*
2029  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
2030  * the needed size.
2031  */
2032 
2033 int
vfs_copyinfh_alloc(const void * ufhp,size_t fhsize,fhandle_t ** fhpp)2034 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
2035 {
2036           fhandle_t *fhp;
2037           int error;
2038 
2039           if (fhsize > FHANDLE_SIZE_MAX) {
2040                     return EINVAL;
2041           }
2042           if (fhsize < FHANDLE_SIZE_MIN) {
2043                     return EINVAL;
2044           }
2045 again:
2046           fhp = kmem_alloc(fhsize, KM_SLEEP);
2047           error = copyin(ufhp, fhp, fhsize);
2048           if (error == 0) {
2049                     /* XXX this check shouldn't be here */
2050                     if (FHANDLE_SIZE(fhp) == fhsize) {
2051                               *fhpp = fhp;
2052                               return 0;
2053                     } else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
2054                               /*
2055                                * a kludge for nfsv2 padded handles.
2056                                */
2057                               size_t sz;
2058 
2059                               sz = FHANDLE_SIZE(fhp);
2060                               kmem_free(fhp, fhsize);
2061                               fhsize = sz;
2062                               goto again;
2063                     } else {
2064                               /*
2065                                * userland told us wrong size.
2066                                */
2067                               error = EINVAL;
2068                     }
2069           }
2070           kmem_free(fhp, fhsize);
2071           return error;
2072 }
2073 
2074 void
vfs_copyinfh_free(fhandle_t * fhp)2075 vfs_copyinfh_free(fhandle_t *fhp)
2076 {
2077 
2078           vfs__fhfree(fhp);
2079 }
2080 
2081 /*
2082  * Get file handle system call
2083  */
2084 int
sys___getfh30(struct lwp * l,const struct sys___getfh30_args * uap,register_t * retval)2085 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap,
2086     register_t *retval)
2087 {
2088           /* {
2089                     syscallarg(char *) fname;
2090                     syscallarg(fhandle_t *) fhp;
2091                     syscallarg(size_t *) fh_size;
2092           } */
2093           struct vnode *vp;
2094           fhandle_t *fh;
2095           int error;
2096           struct pathbuf *pb;
2097           struct nameidata nd;
2098           size_t sz;
2099           size_t usz;
2100 
2101           /*
2102            * Must be super user
2103            */
2104           error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2105               0, NULL, NULL, NULL);
2106           if (error)
2107                     return (error);
2108 
2109           error = pathbuf_copyin(SCARG(uap, fname), &pb);
2110           if (error) {
2111                     return error;
2112           }
2113           NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
2114           error = namei(&nd);
2115           if (error) {
2116                     pathbuf_destroy(pb);
2117                     return error;
2118           }
2119           vp = nd.ni_vp;
2120           pathbuf_destroy(pb);
2121 
2122           error = vfs_composefh_alloc(vp, &fh);
2123           vput(vp);
2124           if (error != 0) {
2125                     return error;
2126           }
2127           error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
2128           if (error != 0) {
2129                     goto out;
2130           }
2131           sz = FHANDLE_SIZE(fh);
2132           error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
2133           if (error != 0) {
2134                     goto out;
2135           }
2136           if (usz >= sz) {
2137                     error = copyout(fh, SCARG(uap, fhp), sz);
2138           } else {
2139                     error = E2BIG;
2140           }
2141 out:
2142           vfs_composefh_free(fh);
2143           return (error);
2144 }
2145 
2146 /*
2147  * Open a file given a file handle.
2148  *
2149  * Check permissions, allocate an open file structure,
2150  * and call the device open routine if any.
2151  */
2152 
2153 int
dofhopen(struct lwp * l,const void * ufhp,size_t fhsize,int oflags,register_t * retval)2154 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
2155     register_t *retval)
2156 {
2157           file_t *fp;
2158           struct vnode *vp = NULL;
2159           kauth_cred_t cred = l->l_cred;
2160           file_t *nfp;
2161           int indx, error;
2162           struct vattr va;
2163           fhandle_t *fh;
2164           int flags;
2165           proc_t *p;
2166 
2167           p = curproc;
2168 
2169           /*
2170            * Must be super user
2171            */
2172           if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2173                         0, NULL, NULL, NULL)))
2174                     return (error);
2175 
2176           if (oflags & O_SEARCH) {
2177                     oflags &= ~(int)O_SEARCH;
2178           }
2179 
2180           flags = FFLAGS(oflags);
2181           if ((flags & (FREAD | FWRITE)) == 0)
2182                     return (EINVAL);
2183           if ((flags & O_CREAT))
2184                     return (EINVAL);
2185           if ((error = fd_allocfile(&nfp, &indx)) != 0)
2186                     return (error);
2187           fp = nfp;
2188           error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2189           if (error != 0) {
2190                     goto bad;
2191           }
2192           error = vfs_fhtovp(fh, &vp);
2193           vfs_copyinfh_free(fh);
2194           if (error != 0) {
2195                     goto bad;
2196           }
2197 
2198           /* Now do an effective vn_open */
2199 
2200           if (vp->v_type == VSOCK) {
2201                     error = EOPNOTSUPP;
2202                     goto bad;
2203           }
2204           error = vn_openchk(vp, cred, flags);
2205           if (error != 0)
2206                     goto bad;
2207           if (flags & O_TRUNC) {
2208                     VOP_UNLOCK(vp);                         /* XXX */
2209                     vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
2210                     vattr_null(&va);
2211                     va.va_size = 0;
2212                     error = VOP_SETATTR(vp, &va, cred);
2213                     if (error)
2214                               goto bad;
2215           }
2216           if ((error = VOP_OPEN(vp, flags, cred)) != 0)
2217                     goto bad;
2218           if (flags & FWRITE) {
2219                     mutex_enter(vp->v_interlock);
2220                     vp->v_writecount++;
2221                     mutex_exit(vp->v_interlock);
2222           }
2223 
2224           /* done with modified vn_open, now finish what sys_open does. */
2225           if ((error = open_setfp(l, fp, vp, indx, flags)))
2226                     return error;
2227 
2228           VOP_UNLOCK(vp);
2229           *retval = indx;
2230           fd_affix(p, fp, indx);
2231           return (0);
2232 
2233 bad:
2234           fd_abort(p, fp, indx);
2235           if (vp != NULL)
2236                     vput(vp);
2237           if (error == EDUPFD || error == EMOVEFD) {
2238                     /* XXX should probably close curlwp->l_dupfd */
2239                     error = EOPNOTSUPP;
2240           }
2241           return (error);
2242 }
2243 
2244 int
sys___fhopen40(struct lwp * l,const struct sys___fhopen40_args * uap,register_t * retval)2245 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap,
2246     register_t *retval)
2247 {
2248           /* {
2249                     syscallarg(const void *) fhp;
2250                     syscallarg(size_t) fh_size;
2251                     syscallarg(int) flags;
2252           } */
2253 
2254           return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2255               SCARG(uap, flags), retval);
2256 }
2257 
2258 int
do_fhstat(struct lwp * l,const void * ufhp,size_t fhsize,struct stat * sb)2259 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
2260 {
2261           int error;
2262           fhandle_t *fh;
2263           struct vnode *vp;
2264 
2265           /*
2266            * Must be super user
2267            */
2268           if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2269                         0, NULL, NULL, NULL)))
2270                     return error;
2271 
2272           error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2273           if (error != 0)
2274                     return error;
2275 
2276           error = vfs_fhtovp(fh, &vp);
2277           vfs_copyinfh_free(fh);
2278           if (error != 0)
2279                     return error;
2280 
2281           error = vn_stat(vp, sb);
2282           vput(vp);
2283           return error;
2284 }
2285 
2286 /* ARGSUSED */
2287 int
sys___fhstat50(struct lwp * l,const struct sys___fhstat50_args * uap,register_t * retval)2288 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap,
2289     register_t *retval)
2290 {
2291           /* {
2292                     syscallarg(const void *) fhp;
2293                     syscallarg(size_t) fh_size;
2294                     syscallarg(struct stat *) sb;
2295           } */
2296           struct stat sb;
2297           int error;
2298 
2299           error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2300           if (error)
2301                     return error;
2302           return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2303 }
2304 
2305 int
do_fhstatvfs(struct lwp * l,const void * ufhp,size_t fhsize,struct statvfs * sb,int flags)2306 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize,
2307     struct statvfs *sb, int flags)
2308 {
2309           fhandle_t *fh;
2310           struct mount *mp;
2311           struct vnode *vp;
2312           int error;
2313 
2314           /*
2315            * Must be super user
2316            */
2317           if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2318                         0, NULL, NULL, NULL)))
2319                     return error;
2320 
2321           error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2322           if (error != 0)
2323                     return error;
2324 
2325           error = vfs_fhtovp(fh, &vp);
2326           vfs_copyinfh_free(fh);
2327           if (error != 0)
2328                     return error;
2329 
2330           mp = vp->v_mount;
2331           error = dostatvfs(mp, sb, l, flags, 1);
2332           vput(vp);
2333           return error;
2334 }
2335 
2336 /* ARGSUSED */
2337 int
sys___fhstatvfs190(struct lwp * l,const struct sys___fhstatvfs190_args * uap,register_t * retval)2338 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap,
2339     register_t *retval)
2340 {
2341           /* {
2342                     syscallarg(const void *) fhp;
2343                     syscallarg(size_t) fh_size;
2344                     syscallarg(struct statvfs *) buf;
2345                     syscallarg(int)     flags;
2346           } */
2347           struct statvfs *sb = STATVFSBUF_GET();
2348           int error;
2349 
2350           error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2351               SCARG(uap, flags));
2352           if (error == 0)
2353                     error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2354           STATVFSBUF_PUT(sb);
2355           return error;
2356 }
2357 
2358 int
do_posix_mknodat(struct lwp * l,int fdat,const char * pathname,mode_t mode,dev_t dev)2359 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2360     dev_t dev)
2361 {
2362 
2363           /*
2364            * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
2365            * in mode and dev=0.
2366            *
2367            * In all the other cases it's implementation defined behavior.
2368            */
2369 
2370           if ((mode & S_IFIFO) && dev == 0)
2371                     return do_sys_mkfifoat(l, fdat, pathname, mode);
2372           else
2373                     return do_sys_mknodat(l, fdat, pathname, mode, dev,
2374                         UIO_USERSPACE);
2375 }
2376 
2377 /*
2378  * Create a special file.
2379  */
2380 /* ARGSUSED */
2381 int
sys___mknod50(struct lwp * l,const struct sys___mknod50_args * uap,register_t * retval)2382 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
2383     register_t *retval)
2384 {
2385           /* {
2386                     syscallarg(const char *) path;
2387                     syscallarg(mode_t) mode;
2388                     syscallarg(dev_t) dev;
2389           } */
2390           return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
2391               SCARG(uap, mode), SCARG(uap, dev));
2392 }
2393 
2394 int
sys_mknodat(struct lwp * l,const struct sys_mknodat_args * uap,register_t * retval)2395 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
2396     register_t *retval)
2397 {
2398           /* {
2399                     syscallarg(int) fd;
2400                     syscallarg(const char *) path;
2401                     syscallarg(mode_t) mode;
2402                     syscallarg(int) pad;
2403                     syscallarg(dev_t) dev;
2404           } */
2405 
2406           return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2407               SCARG(uap, mode), SCARG(uap, dev));
2408 }
2409 
2410 int
do_sys_mknod(struct lwp * l,const char * pathname,mode_t mode,dev_t dev,enum uio_seg seg)2411 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
2412     enum uio_seg seg)
2413 {
2414           return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
2415 }
2416 
2417 int
do_sys_mknodat(struct lwp * l,int fdat,const char * pathname,mode_t mode,dev_t dev,enum uio_seg seg)2418 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
2419     dev_t dev, enum uio_seg seg)
2420 {
2421           struct proc *p = l->l_proc;
2422           struct vnode *vp;
2423           struct vattr vattr;
2424           int error, optype;
2425           struct pathbuf *pb;
2426           struct nameidata nd;
2427           const char *pathstring;
2428 
2429           if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2430                         0, NULL, NULL, NULL)) != 0)
2431                     return (error);
2432 
2433           optype = VOP_MKNOD_DESCOFFSET;
2434 
2435           error = pathbuf_maybe_copyin(pathname, seg, &pb);
2436           if (error) {
2437                     return error;
2438           }
2439           pathstring = pathbuf_stringcopy_get(pb);
2440           if (pathstring == NULL) {
2441                     pathbuf_destroy(pb);
2442                     return ENOMEM;
2443           }
2444 
2445           NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2446 
2447           if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2448                     goto out;
2449           vp = nd.ni_vp;
2450 
2451           if (vp != NULL)
2452                     error = EEXIST;
2453           else {
2454                     vattr_null(&vattr);
2455                     /* We will read cwdi->cwdi_cmask unlocked. */
2456                     vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2457                     vattr.va_rdev = dev;
2458 
2459                     switch (mode & S_IFMT) {
2460                     case S_IFMT:        /* used by badsect to flag bad sectors */
2461                               vattr.va_type = VBAD;
2462                               break;
2463                     case S_IFCHR:
2464                               vattr.va_type = VCHR;
2465                               break;
2466                     case S_IFBLK:
2467                               vattr.va_type = VBLK;
2468                               break;
2469                     case S_IFWHT:
2470                               optype = VOP_WHITEOUT_DESCOFFSET;
2471                               break;
2472                     case S_IFREG:
2473 #if NVERIEXEC > 0
2474                               error = veriexec_openchk(l, nd.ni_vp, pathstring,
2475                                   O_CREAT);
2476 #endif /* NVERIEXEC > 0 */
2477                               vattr.va_type = VREG;
2478                               vattr.va_rdev = VNOVAL;
2479                               optype = VOP_CREATE_DESCOFFSET;
2480                               break;
2481                     default:
2482                               error = EINVAL;
2483                               break;
2484                     }
2485 
2486                     if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
2487                         vattr.va_rdev == VNOVAL)
2488                               error = EINVAL;
2489           }
2490 
2491           if (!error) {
2492                     switch (optype) {
2493                     case VOP_WHITEOUT_DESCOFFSET:
2494                               error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2495                               if (error)
2496                                         VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2497                               vput(nd.ni_dvp);
2498                               break;
2499 
2500                     case VOP_MKNOD_DESCOFFSET:
2501                               error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2502                                   &nd.ni_cnd, &vattr);
2503                               if (error == 0)
2504                                         vrele(nd.ni_vp);
2505                               vput(nd.ni_dvp);
2506                               break;
2507 
2508                     case VOP_CREATE_DESCOFFSET:
2509                               error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2510                                   &nd.ni_cnd, &vattr);
2511                               if (error == 0)
2512                                         vrele(nd.ni_vp);
2513                               vput(nd.ni_dvp);
2514                               break;
2515                     }
2516           } else {
2517                     VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2518                     if (nd.ni_dvp == vp)
2519                               vrele(nd.ni_dvp);
2520                     else
2521                               vput(nd.ni_dvp);
2522                     if (vp)
2523                               vrele(vp);
2524           }
2525 out:
2526           pathbuf_stringcopy_put(pb, pathstring);
2527           pathbuf_destroy(pb);
2528           return (error);
2529 }
2530 
2531 /*
2532  * Create a named pipe.
2533  */
2534 /* ARGSUSED */
2535 int
sys_mkfifo(struct lwp * l,const struct sys_mkfifo_args * uap,register_t * retval)2536 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap,
2537     register_t *retval)
2538 {
2539           /* {
2540                     syscallarg(const char *) path;
2541                     syscallarg(int) mode;
2542           } */
2543 
2544           return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path),
2545               SCARG(uap, mode));
2546 }
2547 
2548 int
sys_mkfifoat(struct lwp * l,const struct sys_mkfifoat_args * uap,register_t * retval)2549 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap,
2550     register_t *retval)
2551 {
2552           /* {
2553                     syscallarg(int) fd;
2554                     syscallarg(const char *) path;
2555                     syscallarg(int) mode;
2556           } */
2557 
2558           return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2559               SCARG(uap, mode));
2560 }
2561 
2562 static int
do_sys_mkfifoat(struct lwp * l,int fdat,const char * path,mode_t mode)2563 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
2564 {
2565           struct proc *p = l->l_proc;
2566           struct vattr vattr;
2567           int error;
2568           struct pathbuf *pb;
2569           struct nameidata nd;
2570 
2571           error = pathbuf_copyin(path, &pb);
2572           if (error) {
2573                     return error;
2574           }
2575           NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
2576 
2577           if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
2578                     pathbuf_destroy(pb);
2579                     return error;
2580           }
2581           if (nd.ni_vp != NULL) {
2582                     VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2583                     if (nd.ni_dvp == nd.ni_vp)
2584                               vrele(nd.ni_dvp);
2585                     else
2586                               vput(nd.ni_dvp);
2587                     vrele(nd.ni_vp);
2588                     pathbuf_destroy(pb);
2589                     return (EEXIST);
2590           }
2591           vattr_null(&vattr);
2592           vattr.va_type = VFIFO;
2593           /* We will read cwdi->cwdi_cmask unlocked. */
2594           vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2595           error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2596           if (error == 0)
2597                     vrele(nd.ni_vp);
2598           vput(nd.ni_dvp);
2599           pathbuf_destroy(pb);
2600           return (error);
2601 }
2602 
2603 /*
2604  * Make a hard file link.
2605  */
2606 /* ARGSUSED */
2607 int
do_sys_linkat(struct lwp * l,int fdpath,const char * path,int fdlink,const char * link,int follow,register_t * retval)2608 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
2609     const char *link, int follow, register_t *retval)
2610 {
2611           struct vnode *vp;
2612           struct pathbuf *linkpb;
2613           struct nameidata nd;
2614           namei_simple_flags_t ns_flags;
2615           int error;
2616 
2617           if (follow & AT_SYMLINK_FOLLOW)
2618                     ns_flags = NSM_FOLLOW_TRYEMULROOT;
2619           else
2620                     ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2621 
2622           error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2623           if (error != 0)
2624                     return (error);
2625           error = pathbuf_copyin(link, &linkpb);
2626           if (error) {
2627                     goto out1;
2628           }
2629           NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2630           if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
2631                     goto out2;
2632           if (nd.ni_vp) {
2633                     error = EEXIST;
2634                     goto abortop;
2635           }
2636           /* Prevent hard links on directories. */
2637           if (vp->v_type == VDIR) {
2638                     error = EPERM;
2639                     goto abortop;
2640           }
2641           /* Prevent cross-mount operation. */
2642           if (nd.ni_dvp->v_mount != vp->v_mount) {
2643                     error = EXDEV;
2644                     goto abortop;
2645           }
2646           error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2647           VOP_UNLOCK(nd.ni_dvp);
2648           vrele(nd.ni_dvp);
2649 out2:
2650           pathbuf_destroy(linkpb);
2651 out1:
2652           vrele(vp);
2653           return (error);
2654 abortop:
2655           VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2656           if (nd.ni_dvp == nd.ni_vp)
2657                     vrele(nd.ni_dvp);
2658           else
2659                     vput(nd.ni_dvp);
2660           if (nd.ni_vp != NULL)
2661                     vrele(nd.ni_vp);
2662           goto out2;
2663 }
2664 
2665 int
sys_link(struct lwp * l,const struct sys_link_args * uap,register_t * retval)2666 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
2667 {
2668           /* {
2669                     syscallarg(const char *) path;
2670                     syscallarg(const char *) link;
2671           } */
2672           const char *path = SCARG(uap, path);
2673           const char *link = SCARG(uap, link);
2674 
2675           return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2676               AT_SYMLINK_FOLLOW, retval);
2677 }
2678 
2679 int
sys_linkat(struct lwp * l,const struct sys_linkat_args * uap,register_t * retval)2680 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
2681     register_t *retval)
2682 {
2683           /* {
2684                     syscallarg(int) fd1;
2685                     syscallarg(const char *) name1;
2686                     syscallarg(int) fd2;
2687                     syscallarg(const char *) name2;
2688                     syscallarg(int) flags;
2689           } */
2690           int fd1 = SCARG(uap, fd1);
2691           const char *name1 = SCARG(uap, name1);
2692           int fd2 = SCARG(uap, fd2);
2693           const char *name2 = SCARG(uap, name2);
2694           int follow;
2695 
2696           follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2697 
2698           return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2699 }
2700 
2701 int
do_sys_symlink(const char * patharg,const char * link,enum uio_seg seg)2702 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
2703 {
2704 
2705           return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2706 }
2707 
2708 static int
do_sys_symlinkat(struct lwp * l,const char * patharg,int fdat,const char * link,enum uio_seg seg)2709 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
2710     const char *link, enum uio_seg seg)
2711 {
2712           struct proc *p = curproc;
2713           struct vattr vattr;
2714           char *path;
2715           int error;
2716           size_t len;
2717           struct pathbuf *linkpb;
2718           struct nameidata nd;
2719 
2720           KASSERT(l != NULL || fdat == AT_FDCWD);
2721 
2722           path = PNBUF_GET();
2723           if (seg == UIO_USERSPACE) {
2724                     if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
2725                               goto out1;
2726                     if ((error = pathbuf_copyin(link, &linkpb)) != 0)
2727                               goto out1;
2728           } else {
2729                     len = strlen(patharg) + 1;
2730                     KASSERT(len <= MAXPATHLEN);
2731                     memcpy(path, patharg, len);
2732                     linkpb = pathbuf_create(link);
2733                     if (linkpb == NULL) {
2734                               error = ENOMEM;
2735                               goto out1;
2736                     }
2737           }
2738           ktrkuser("symlink-target", path, len - 1);
2739 
2740           NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
2741           if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2742                     goto out2;
2743           if (nd.ni_vp) {
2744                     VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2745                     if (nd.ni_dvp == nd.ni_vp)
2746                               vrele(nd.ni_dvp);
2747                     else
2748                               vput(nd.ni_dvp);
2749                     vrele(nd.ni_vp);
2750                     error = EEXIST;
2751                     goto out2;
2752           }
2753           vattr_null(&vattr);
2754           vattr.va_type = VLNK;
2755           /* We will read cwdi->cwdi_cmask unlocked. */
2756           vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2757           error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2758           if (error == 0)
2759                     vrele(nd.ni_vp);
2760           vput(nd.ni_dvp);
2761 out2:
2762           pathbuf_destroy(linkpb);
2763 out1:
2764           PNBUF_PUT(path);
2765           return (error);
2766 }
2767 
2768 /*
2769  * Make a symbolic link.
2770  */
2771 /* ARGSUSED */
2772 int
sys_symlink(struct lwp * l,const struct sys_symlink_args * uap,register_t * retval)2773 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
2774 {
2775           /* {
2776                     syscallarg(const char *) path;
2777                     syscallarg(const char *) link;
2778           } */
2779 
2780           return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2781               UIO_USERSPACE);
2782 }
2783 
2784 int
sys_symlinkat(struct lwp * l,const struct sys_symlinkat_args * uap,register_t * retval)2785 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap,
2786     register_t *retval)
2787 {
2788           /* {
2789                     syscallarg(const char *) path1;
2790                     syscallarg(int) fd;
2791                     syscallarg(const char *) path2;
2792           } */
2793 
2794           return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2795               SCARG(uap, path2), UIO_USERSPACE);
2796 }
2797 
2798 /*
2799  * Delete a whiteout from the filesystem.
2800  */
2801 /* ARGSUSED */
2802 int
sys_undelete(struct lwp * l,const struct sys_undelete_args * uap,register_t * retval)2803 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap,
2804     register_t *retval)
2805 {
2806           /* {
2807                     syscallarg(const char *) path;
2808           } */
2809           int error;
2810           struct pathbuf *pb;
2811           struct nameidata nd;
2812 
2813           error = pathbuf_copyin(SCARG(uap, path), &pb);
2814           if (error) {
2815                     return error;
2816           }
2817 
2818           NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
2819           error = namei(&nd);
2820           if (error) {
2821                     pathbuf_destroy(pb);
2822                     return (error);
2823           }
2824 
2825           if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2826                     VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2827                     if (nd.ni_dvp == nd.ni_vp)
2828                               vrele(nd.ni_dvp);
2829                     else
2830                               vput(nd.ni_dvp);
2831                     if (nd.ni_vp)
2832                               vrele(nd.ni_vp);
2833                     pathbuf_destroy(pb);
2834                     return (EEXIST);
2835           }
2836           if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
2837                     VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2838           vput(nd.ni_dvp);
2839           pathbuf_destroy(pb);
2840           return (error);
2841 }
2842 
2843 /*
2844  * Delete a name from the filesystem.
2845  */
2846 /* ARGSUSED */
2847 int
sys_unlink(struct lwp * l,const struct sys_unlink_args * uap,register_t * retval)2848 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap,
2849     register_t *retval)
2850 {
2851           /* {
2852                     syscallarg(const char *) path;
2853           } */
2854 
2855           return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0,
2856               UIO_USERSPACE);
2857 }
2858 
2859 int
sys_unlinkat(struct lwp * l,const struct sys_unlinkat_args * uap,register_t * retval)2860 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
2861     register_t *retval)
2862 {
2863           /* {
2864                     syscallarg(int) fd;
2865                     syscallarg(const char *) path;
2866                     syscallarg(int) flag;
2867           } */
2868 
2869           return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2870               SCARG(uap, flag), UIO_USERSPACE);
2871 }
2872 
2873 int
do_sys_unlink(const char * arg,enum uio_seg seg)2874 do_sys_unlink(const char *arg, enum uio_seg seg)
2875 {
2876 
2877           return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
2878 }
2879 
2880 static int
do_sys_unlinkat(struct lwp * l,int fdat,const char * arg,int flags,enum uio_seg seg)2881 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
2882     enum uio_seg seg)
2883 {
2884           struct vnode *vp;
2885           int error;
2886           struct pathbuf *pb;
2887           struct nameidata nd;
2888           const char *pathstring;
2889 
2890           KASSERT(l != NULL || fdat == AT_FDCWD);
2891 
2892           error = pathbuf_maybe_copyin(arg, seg, &pb);
2893           if (error) {
2894                     return error;
2895           }
2896           pathstring = pathbuf_stringcopy_get(pb);
2897           if (pathstring == NULL) {
2898                     pathbuf_destroy(pb);
2899                     return ENOMEM;
2900           }
2901 
2902           NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
2903           if ((error = fd_nameiat(l, fdat, &nd)) != 0)
2904                     goto out;
2905           vp = nd.ni_vp;
2906 
2907           /*
2908            * The root of a mounted filesystem cannot be deleted.
2909            */
2910           if ((vp->v_vflag & VV_ROOT) != 0) {
2911                     error = EBUSY;
2912                     goto abort;
2913           }
2914 
2915           if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2916                     error = EBUSY;
2917                     goto abort;
2918           }
2919 
2920           /*
2921            * No rmdir "." please.
2922            */
2923           if (nd.ni_dvp == vp) {
2924                     error = EINVAL;
2925                     goto abort;
2926           }
2927 
2928           /*
2929            * AT_REMOVEDIR is required to remove a directory
2930            */
2931           if (vp->v_type == VDIR) {
2932                     if (!(flags & AT_REMOVEDIR)) {
2933                               error = EPERM;
2934                               goto abort;
2935                     } else {
2936                               error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2937                               vput(nd.ni_dvp);
2938                               goto out;
2939                     }
2940           }
2941 
2942           /*
2943            * Starting here we only deal with non directories.
2944            */
2945           if (flags & AT_REMOVEDIR) {
2946                     error = ENOTDIR;
2947                     goto abort;
2948           }
2949 
2950 #if NVERIEXEC > 0
2951           /* Handle remove requests for veriexec entries. */
2952           if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
2953                     goto abort;
2954           }
2955 #endif /* NVERIEXEC > 0 */
2956 
2957 #ifdef FILEASSOC
2958           (void)fileassoc_file_delete(vp);
2959 #endif /* FILEASSOC */
2960           error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2961           vput(nd.ni_dvp);
2962           goto out;
2963 
2964 abort:
2965           VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2966           if (nd.ni_dvp == vp)
2967                     vrele(nd.ni_dvp);
2968           else
2969                     vput(nd.ni_dvp);
2970           vput(vp);
2971 
2972 out:
2973           pathbuf_stringcopy_put(pb, pathstring);
2974           pathbuf_destroy(pb);
2975           return (error);
2976 }
2977 
2978 /*
2979  * Reposition read/write file offset.
2980  */
2981 int
sys_lseek(struct lwp * l,const struct sys_lseek_args * uap,register_t * retval)2982 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
2983 {
2984           /* {
2985                     syscallarg(int) fd;
2986                     syscallarg(int) pad;
2987                     syscallarg(off_t) offset;
2988                     syscallarg(int) whence;
2989           } */
2990           file_t *fp;
2991           int error, fd;
2992 
2993           switch (SCARG(uap, whence)) {
2994           case SEEK_CUR:
2995           case SEEK_END:
2996           case SEEK_SET:
2997                     break;
2998           default:
2999                     return EINVAL;
3000           }
3001 
3002           fd = SCARG(uap, fd);
3003 
3004           if ((fp = fd_getfile(fd)) == NULL)
3005                     return (EBADF);
3006 
3007           if (fp->f_ops->fo_seek == NULL) {
3008                     error = ESPIPE;
3009                     goto out;
3010           }
3011 
3012           error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset),
3013               SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET);
3014 out:
3015           fd_putfile(fd);
3016           return (error);
3017 }
3018 
3019 /*
3020  * Positional read system call.
3021  */
3022 int
sys_pread(struct lwp * l,const struct sys_pread_args * uap,register_t * retval)3023 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
3024 {
3025           /* {
3026                     syscallarg(int) fd;
3027                     syscallarg(void *) buf;
3028                     syscallarg(size_t) nbyte;
3029                     syscallarg(off_t) offset;
3030           } */
3031           file_t *fp;
3032           off_t offset;
3033           int error, fd = SCARG(uap, fd);
3034 
3035           if ((fp = fd_getfile(fd)) == NULL)
3036                     return (EBADF);
3037 
3038           if ((fp->f_flag & FREAD) == 0) {
3039                     fd_putfile(fd);
3040                     return (EBADF);
3041           }
3042 
3043           if (fp->f_ops->fo_seek == NULL) {
3044                     error = ESPIPE;
3045                     goto out;
3046           }
3047 
3048           offset = SCARG(uap, offset);
3049           error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3050           if (error)
3051                     goto out;
3052 
3053           /* dofileread() will unuse the descriptor for us */
3054           return dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3055               &offset, 0, retval);
3056 
3057 out:
3058           fd_putfile(fd);
3059           return (error);
3060 }
3061 
3062 /*
3063  * Positional scatter read system call.
3064  */
3065 int
sys_preadv(struct lwp * l,const struct sys_preadv_args * uap,register_t * retval)3066 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap,
3067     register_t *retval)
3068 {
3069           /* {
3070                     syscallarg(int) fd;
3071                     syscallarg(const struct iovec *) iovp;
3072                     syscallarg(int) iovcnt;
3073                     syscallarg(off_t) offset;
3074           } */
3075           off_t offset = SCARG(uap, offset);
3076 
3077           return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
3078               SCARG(uap, iovcnt), &offset, 0, retval);
3079 }
3080 
3081 /*
3082  * Positional write system call.
3083  */
3084 int
sys_pwrite(struct lwp * l,const struct sys_pwrite_args * uap,register_t * retval)3085 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap,
3086     register_t *retval)
3087 {
3088           /* {
3089                     syscallarg(int) fd;
3090                     syscallarg(const void *) buf;
3091                     syscallarg(size_t) nbyte;
3092                     syscallarg(off_t) offset;
3093           } */
3094           file_t *fp;
3095           off_t offset;
3096           int error, fd = SCARG(uap, fd);
3097 
3098           if ((fp = fd_getfile(fd)) == NULL)
3099                     return (EBADF);
3100 
3101           if ((fp->f_flag & FWRITE) == 0) {
3102                     fd_putfile(fd);
3103                     return (EBADF);
3104           }
3105 
3106           if (fp->f_ops->fo_seek == NULL) {
3107                     error = ESPIPE;
3108                     goto out;
3109           }
3110 
3111           offset = SCARG(uap, offset);
3112           error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
3113           if (error)
3114                     goto out;
3115 
3116           /* dofilewrite() will unuse the descriptor for us */
3117           return dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
3118               &offset, 0, retval);
3119 
3120 out:
3121           fd_putfile(fd);
3122           return (error);
3123 }
3124 
3125 /*
3126  * Positional gather write system call.
3127  */
3128 int
sys_pwritev(struct lwp * l,const struct sys_pwritev_args * uap,register_t * retval)3129 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap,
3130     register_t *retval)
3131 {
3132           /* {
3133                     syscallarg(int) fd;
3134                     syscallarg(const struct iovec *) iovp;
3135                     syscallarg(int) iovcnt;
3136                     syscallarg(off_t) offset;
3137           } */
3138           off_t offset = SCARG(uap, offset);
3139 
3140           return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
3141               SCARG(uap, iovcnt), &offset, 0, retval);
3142 }
3143 
3144 /*
3145  * Check access permissions.
3146  */
3147 int
sys_access(struct lwp * l,const struct sys_access_args * uap,register_t * retval)3148 sys_access(struct lwp *l, const struct sys_access_args *uap,
3149     register_t *retval)
3150 {
3151           /* {
3152                     syscallarg(const char *) path;
3153                     syscallarg(int) flags;
3154           } */
3155 
3156           return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
3157               SCARG(uap, flags), 0);
3158 }
3159 
3160 int
do_sys_accessat(struct lwp * l,int fdat,const char * path,int mode,int flags)3161 do_sys_accessat(struct lwp *l, int fdat, const char *path,
3162     int mode, int flags)
3163 {
3164           kauth_cred_t cred;
3165           struct vnode *vp;
3166           int error, nd_flag, vmode;
3167           struct pathbuf *pb;
3168           struct nameidata nd;
3169 
3170           CTASSERT(F_OK == 0);
3171           if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
3172                     /* nonsense mode */
3173                     return EINVAL;
3174           }
3175 
3176           nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
3177           if (flags & AT_SYMLINK_NOFOLLOW)
3178                     nd_flag &= ~FOLLOW;
3179 
3180           error = pathbuf_copyin(path, &pb);
3181           if (error)
3182                     return error;
3183 
3184           NDINIT(&nd, LOOKUP, nd_flag, pb);
3185 
3186           /* Override default credentials */
3187           if (!(flags & AT_EACCESS)) {
3188                     cred = kauth_cred_dup(l->l_cred);
3189                     kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
3190                     kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
3191           } else
3192                     cred = l->l_cred;
3193           nd.ni_cnd.cn_cred = cred;
3194 
3195           if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3196                     pathbuf_destroy(pb);
3197                     goto out;
3198           }
3199           vp = nd.ni_vp;
3200           pathbuf_destroy(pb);
3201 
3202           /* Flags == 0 means only check for existence. */
3203           if (mode) {
3204                     vmode = 0;
3205                     if (mode & R_OK)
3206                               vmode |= VREAD;
3207                     if (mode & W_OK)
3208                               vmode |= VWRITE;
3209                     if (mode & X_OK)
3210                               vmode |= VEXEC;
3211 
3212                     error = VOP_ACCESS(vp, vmode, cred);
3213                     if (!error && (vmode & VWRITE))
3214                               error = vn_writechk(vp);
3215           }
3216           vput(vp);
3217 out:
3218           if (!(flags & AT_EACCESS))
3219                     kauth_cred_free(cred);
3220           return (error);
3221 }
3222 
3223 int
sys_faccessat(struct lwp * l,const struct sys_faccessat_args * uap,register_t * retval)3224 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
3225     register_t *retval)
3226 {
3227           /* {
3228                     syscallarg(int) fd;
3229                     syscallarg(const char *) path;
3230                     syscallarg(int) amode;
3231                     syscallarg(int) flag;
3232           } */
3233 
3234           return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3235               SCARG(uap, amode), SCARG(uap, flag));
3236 }
3237 
3238 /*
3239  * Common code for all sys_stat functions, including compat versions.
3240  */
3241 int
do_sys_stat(const char * userpath,unsigned int nd_flag,struct stat * sb)3242 do_sys_stat(const char *userpath, unsigned int nd_flag, struct stat *sb)
3243 {
3244 
3245           return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3246 }
3247 
3248 int
do_sys_statat(struct lwp * l,int fdat,const char * userpath,unsigned int nd_flag,struct stat * sb)3249 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
3250     unsigned int nd_flag, struct stat *sb)
3251 {
3252           int error;
3253           struct pathbuf *pb;
3254           struct nameidata nd;
3255 
3256           KASSERT(l != NULL || fdat == AT_FDCWD);
3257 
3258           error = pathbuf_copyin(userpath, &pb);
3259           if (error) {
3260                     return error;
3261           }
3262 
3263           NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
3264 
3265           error = fd_nameiat(l, fdat, &nd);
3266           if (error != 0) {
3267                     pathbuf_destroy(pb);
3268                     return error;
3269           }
3270           error = vn_stat(nd.ni_vp, sb);
3271           vput(nd.ni_vp);
3272           pathbuf_destroy(pb);
3273           return error;
3274 }
3275 
3276 /*
3277  * Get file status; this version follows links.
3278  */
3279 /* ARGSUSED */
3280 int
sys___stat50(struct lwp * l,const struct sys___stat50_args * uap,register_t * retval)3281 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap,
3282     register_t *retval)
3283 {
3284           /* {
3285                     syscallarg(const char *) path;
3286                     syscallarg(struct stat *) ub;
3287           } */
3288           struct stat sb;
3289           int error;
3290 
3291           error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3292           if (error)
3293                     return error;
3294           return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3295 }
3296 
3297 /*
3298  * Get file status; this version does not follow links.
3299  */
3300 /* ARGSUSED */
3301 int
sys___lstat50(struct lwp * l,const struct sys___lstat50_args * uap,register_t * retval)3302 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap,
3303     register_t *retval)
3304 {
3305           /* {
3306                     syscallarg(const char *) path;
3307                     syscallarg(struct stat *) ub;
3308           } */
3309           struct stat sb;
3310           int error;
3311 
3312           error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3313           if (error)
3314                     return error;
3315           return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3316 }
3317 
3318 int
sys_fstatat(struct lwp * l,const struct sys_fstatat_args * uap,register_t * retval)3319 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
3320     register_t *retval)
3321 {
3322           /* {
3323                     syscallarg(int) fd;
3324                     syscallarg(const char *) path;
3325                     syscallarg(struct stat *) buf;
3326                     syscallarg(int) flag;
3327           } */
3328           unsigned int nd_flag;
3329           struct stat sb;
3330           int error;
3331 
3332           if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3333                     nd_flag = NOFOLLOW;
3334           else
3335                     nd_flag = FOLLOW;
3336 
3337           error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3338               &sb);
3339           if (error)
3340                     return error;
3341           return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3342 }
3343 
3344 static int
kern_pathconf(register_t * retval,const char * path,int name,int flag)3345 kern_pathconf(register_t *retval, const char *path, int name, int flag)
3346 {
3347           int error;
3348           struct pathbuf *pb;
3349           struct nameidata nd;
3350 
3351           error = pathbuf_copyin(path, &pb);
3352           if (error) {
3353                     return error;
3354           }
3355           NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
3356           if ((error = namei(&nd)) != 0) {
3357                     pathbuf_destroy(pb);
3358                     return error;
3359           }
3360           error = VOP_PATHCONF(nd.ni_vp, name, retval);
3361           vput(nd.ni_vp);
3362           pathbuf_destroy(pb);
3363           return error;
3364 }
3365 
3366 /*
3367  * Get configurable pathname variables.
3368  */
3369 /* ARGSUSED */
3370 int
sys_pathconf(struct lwp * l,const struct sys_pathconf_args * uap,register_t * retval)3371 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
3372     register_t *retval)
3373 {
3374           /* {
3375                     syscallarg(const char *) path;
3376                     syscallarg(int) name;
3377           } */
3378 
3379           return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3380               FOLLOW);
3381 }
3382 
3383 /* ARGSUSED */
3384 int
sys_lpathconf(struct lwp * l,const struct sys_lpathconf_args * uap,register_t * retval)3385 sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
3386     register_t *retval)
3387 {
3388           /* {
3389                     syscallarg(const char *) path;
3390                     syscallarg(int) name;
3391           } */
3392 
3393           return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name),
3394               NOFOLLOW);
3395 }
3396 
3397 /*
3398  * Return target name of a symbolic link.
3399  */
3400 /* ARGSUSED */
3401 int
sys_readlink(struct lwp * l,const struct sys_readlink_args * uap,register_t * retval)3402 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
3403     register_t *retval)
3404 {
3405           /* {
3406                     syscallarg(const char *) path;
3407                     syscallarg(char *) buf;
3408                     syscallarg(size_t) count;
3409           } */
3410 
3411           return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3412               SCARG(uap, buf), SCARG(uap, count), retval);
3413 }
3414 
3415 static int
do_sys_readlinkat(struct lwp * l,int fdat,const char * path,char * buf,size_t count,register_t * retval)3416 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
3417     size_t count, register_t *retval)
3418 {
3419           struct vnode *vp;
3420           struct iovec aiov;
3421           struct uio auio;
3422           int error;
3423           struct pathbuf *pb;
3424           struct nameidata nd;
3425 
3426           error = pathbuf_copyin(path, &pb);
3427           if (error) {
3428                     return error;
3429           }
3430           NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT,
3431               pb);
3432           if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
3433                     pathbuf_destroy(pb);
3434                     return error;
3435           }
3436           vp = nd.ni_vp;
3437           pathbuf_destroy(pb);
3438           if (vp->v_type != VLNK)
3439                     error = EINVAL;
3440           else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
3441               (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
3442                     aiov.iov_base = buf;
3443                     aiov.iov_len = count;
3444                     auio.uio_iov = &aiov;
3445                     auio.uio_iovcnt = 1;
3446                     auio.uio_offset = 0;
3447                     auio.uio_rw = UIO_READ;
3448                     KASSERT(l == curlwp);
3449                     auio.uio_vmspace = l->l_proc->p_vmspace;
3450                     auio.uio_resid = count;
3451                     if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
3452                               *retval = count - auio.uio_resid;
3453           }
3454           vput(vp);
3455           return (error);
3456 }
3457 
3458 int
sys_readlinkat(struct lwp * l,const struct sys_readlinkat_args * uap,register_t * retval)3459 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
3460     register_t *retval)
3461 {
3462           /* {
3463                     syscallarg(int) fd;
3464                     syscallarg(const char *) path;
3465                     syscallarg(char *) buf;
3466                     syscallarg(size_t) bufsize;
3467           } */
3468 
3469           return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3470               SCARG(uap, buf), SCARG(uap, bufsize), retval);
3471 }
3472 
3473 /*
3474  * Change flags of a file given a path name.
3475  */
3476 /* ARGSUSED */
3477 int
sys_chflags(struct lwp * l,const struct sys_chflags_args * uap,register_t * retval)3478 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap,
3479     register_t *retval)
3480 {
3481           /* {
3482                     syscallarg(const char *) path;
3483                     syscallarg(u_long) flags;
3484           } */
3485           struct vnode *vp;
3486           int error;
3487 
3488           error = namei_simple_user(SCARG(uap, path),
3489               NSM_FOLLOW_TRYEMULROOT, &vp);
3490           if (error != 0)
3491                     return (error);
3492           error = change_flags(vp, SCARG(uap, flags), l);
3493           vput(vp);
3494           return (error);
3495 }
3496 
3497 /*
3498  * Change flags of a file given a file descriptor.
3499  */
3500 /* ARGSUSED */
3501 int
sys_fchflags(struct lwp * l,const struct sys_fchflags_args * uap,register_t * retval)3502 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap,
3503     register_t *retval)
3504 {
3505           /* {
3506                     syscallarg(int) fd;
3507                     syscallarg(u_long) flags;
3508           } */
3509           struct vnode *vp;
3510           file_t *fp;
3511           int error;
3512 
3513           /* fd_getvnode() will use the descriptor for us */
3514           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3515                     return (error);
3516           vp = fp->f_vnode;
3517           error = change_flags(vp, SCARG(uap, flags), l);
3518           VOP_UNLOCK(vp);
3519           fd_putfile(SCARG(uap, fd));
3520           return (error);
3521 }
3522 
3523 /*
3524  * Change flags of a file given a path name; this version does
3525  * not follow links.
3526  */
3527 int
sys_lchflags(struct lwp * l,const struct sys_lchflags_args * uap,register_t * retval)3528 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap,
3529     register_t *retval)
3530 {
3531           /* {
3532                     syscallarg(const char *) path;
3533                     syscallarg(u_long) flags;
3534           } */
3535           struct vnode *vp;
3536           int error;
3537 
3538           error = namei_simple_user(SCARG(uap, path),
3539               NSM_NOFOLLOW_TRYEMULROOT, &vp);
3540           if (error != 0)
3541                     return (error);
3542           error = change_flags(vp, SCARG(uap, flags), l);
3543           vput(vp);
3544           return (error);
3545 }
3546 
3547 /*
3548  * Common routine to change flags of a file.
3549  */
3550 int
change_flags(struct vnode * vp,u_long flags,struct lwp * l)3551 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
3552 {
3553           struct vattr vattr;
3554           int error;
3555 
3556           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3557 
3558           vattr_null(&vattr);
3559           vattr.va_flags = flags;
3560           error = VOP_SETATTR(vp, &vattr, l->l_cred);
3561 
3562           return (error);
3563 }
3564 
3565 /*
3566  * Change mode of a file given path name; this version follows links.
3567  */
3568 /* ARGSUSED */
3569 int
sys_chmod(struct lwp * l,const struct sys_chmod_args * uap,register_t * retval)3570 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
3571 {
3572           /* {
3573                     syscallarg(const char *) path;
3574                     syscallarg(int) mode;
3575           } */
3576 
3577           return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3578               SCARG(uap, mode), 0);
3579 }
3580 
3581 int
do_sys_chmodat(struct lwp * l,int fdat,const char * path,int mode,int flags)3582 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
3583 {
3584           int error;
3585           struct vnode *vp;
3586           namei_simple_flags_t ns_flag;
3587 
3588           if (flags & AT_SYMLINK_NOFOLLOW)
3589                     ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3590           else
3591                     ns_flag = NSM_FOLLOW_TRYEMULROOT;
3592 
3593           error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3594           if (error != 0)
3595                     return error;
3596 
3597           error = change_mode(vp, mode, l);
3598 
3599           vrele(vp);
3600 
3601           return (error);
3602 }
3603 
3604 /*
3605  * Change mode of a file given a file descriptor.
3606  */
3607 /* ARGSUSED */
3608 int
sys_fchmod(struct lwp * l,const struct sys_fchmod_args * uap,register_t * retval)3609 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap,
3610     register_t *retval)
3611 {
3612           /* {
3613                     syscallarg(int) fd;
3614                     syscallarg(int) mode;
3615           } */
3616           file_t *fp;
3617           int error;
3618 
3619           /* fd_getvnode() will use the descriptor for us */
3620           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3621                     return (error);
3622           error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3623           fd_putfile(SCARG(uap, fd));
3624           return (error);
3625 }
3626 
3627 int
sys_fchmodat(struct lwp * l,const struct sys_fchmodat_args * uap,register_t * retval)3628 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
3629     register_t *retval)
3630 {
3631           /* {
3632                     syscallarg(int) fd;
3633                     syscallarg(const char *) path;
3634                     syscallarg(int) mode;
3635                     syscallarg(int) flag;
3636           } */
3637 
3638           return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3639               SCARG(uap, mode), SCARG(uap, flag));
3640 }
3641 
3642 /*
3643  * Change mode of a file given path name; this version does not follow links.
3644  */
3645 /* ARGSUSED */
3646 int
sys_lchmod(struct lwp * l,const struct sys_lchmod_args * uap,register_t * retval)3647 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap,
3648     register_t *retval)
3649 {
3650           /* {
3651                     syscallarg(const char *) path;
3652                     syscallarg(int) mode;
3653           } */
3654           int error;
3655           struct vnode *vp;
3656 
3657           error = namei_simple_user(SCARG(uap, path),
3658               NSM_NOFOLLOW_TRYEMULROOT, &vp);
3659           if (error != 0)
3660                     return (error);
3661 
3662           error = change_mode(vp, SCARG(uap, mode), l);
3663 
3664           vrele(vp);
3665           return (error);
3666 }
3667 
3668 /*
3669  * Common routine to set mode given a vnode.
3670  */
3671 static int
change_mode(struct vnode * vp,int mode,struct lwp * l)3672 change_mode(struct vnode *vp, int mode, struct lwp *l)
3673 {
3674           struct vattr vattr;
3675           int error;
3676 
3677           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3678           vattr_null(&vattr);
3679           vattr.va_mode = mode & ALLPERMS;
3680           error = VOP_SETATTR(vp, &vattr, l->l_cred);
3681           VOP_UNLOCK(vp);
3682           return (error);
3683 }
3684 
3685 /*
3686  * Set ownership given a path name; this version follows links.
3687  */
3688 /* ARGSUSED */
3689 int
sys_chown(struct lwp * l,const struct sys_chown_args * uap,register_t * retval)3690 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
3691 {
3692           /* {
3693                     syscallarg(const char *) path;
3694                     syscallarg(uid_t) uid;
3695                     syscallarg(gid_t) gid;
3696           } */
3697           return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3698               SCARG(uap, gid), 0);
3699 }
3700 
3701 int
do_sys_chownat(struct lwp * l,int fdat,const char * path,uid_t uid,gid_t gid,int flags)3702 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
3703    gid_t gid, int flags)
3704 {
3705           int error;
3706           struct vnode *vp;
3707           namei_simple_flags_t ns_flag;
3708 
3709           if (flags & AT_SYMLINK_NOFOLLOW)
3710                     ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3711           else
3712                     ns_flag = NSM_FOLLOW_TRYEMULROOT;
3713 
3714           error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3715           if (error != 0)
3716                     return error;
3717 
3718           error = change_owner(vp, uid, gid, l, 0);
3719 
3720           vrele(vp);
3721 
3722           return (error);
3723 }
3724 
3725 /*
3726  * Set ownership given a path name; this version follows links.
3727  * Provides POSIX semantics.
3728  */
3729 /* ARGSUSED */
3730 int
sys___posix_chown(struct lwp * l,const struct sys___posix_chown_args * uap,register_t * retval)3731 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap,
3732     register_t *retval)
3733 {
3734           /* {
3735                     syscallarg(const char *) path;
3736                     syscallarg(uid_t) uid;
3737                     syscallarg(gid_t) gid;
3738           } */
3739           int error;
3740           struct vnode *vp;
3741 
3742           error = namei_simple_user(SCARG(uap, path),
3743               NSM_FOLLOW_TRYEMULROOT, &vp);
3744           if (error != 0)
3745                     return (error);
3746 
3747           error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3748 
3749           vrele(vp);
3750           return (error);
3751 }
3752 
3753 /*
3754  * Set ownership given a file descriptor.
3755  */
3756 /* ARGSUSED */
3757 int
sys_fchown(struct lwp * l,const struct sys_fchown_args * uap,register_t * retval)3758 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap,
3759     register_t *retval)
3760 {
3761           /* {
3762                     syscallarg(int) fd;
3763                     syscallarg(uid_t) uid;
3764                     syscallarg(gid_t) gid;
3765           } */
3766           int error;
3767           file_t *fp;
3768 
3769           /* fd_getvnode() will use the descriptor for us */
3770           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3771                     return (error);
3772           error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3773               l, 0);
3774           fd_putfile(SCARG(uap, fd));
3775           return (error);
3776 }
3777 
3778 int
sys_fchownat(struct lwp * l,const struct sys_fchownat_args * uap,register_t * retval)3779 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
3780     register_t *retval)
3781 {
3782           /* {
3783                     syscallarg(int) fd;
3784                     syscallarg(const char *) path;
3785                     syscallarg(uid_t) owner;
3786                     syscallarg(gid_t) group;
3787                     syscallarg(int) flag;
3788           } */
3789 
3790           return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3791               SCARG(uap, owner), SCARG(uap, group),
3792               SCARG(uap, flag));
3793 }
3794 
3795 /*
3796  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
3797  */
3798 /* ARGSUSED */
3799 int
sys___posix_fchown(struct lwp * l,const struct sys___posix_fchown_args * uap,register_t * retval)3800 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap,
3801     register_t *retval)
3802 {
3803           /* {
3804                     syscallarg(int) fd;
3805                     syscallarg(uid_t) uid;
3806                     syscallarg(gid_t) gid;
3807           } */
3808           int error;
3809           file_t *fp;
3810 
3811           /* fd_getvnode() will use the descriptor for us */
3812           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3813                     return (error);
3814           error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3815               l, 1);
3816           fd_putfile(SCARG(uap, fd));
3817           return (error);
3818 }
3819 
3820 /*
3821  * Set ownership given a path name; this version does not follow links.
3822  */
3823 /* ARGSUSED */
3824 int
sys_lchown(struct lwp * l,const struct sys_lchown_args * uap,register_t * retval)3825 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap,
3826     register_t *retval)
3827 {
3828           /* {
3829                     syscallarg(const char *) path;
3830                     syscallarg(uid_t) uid;
3831                     syscallarg(gid_t) gid;
3832           } */
3833           int error;
3834           struct vnode *vp;
3835 
3836           error = namei_simple_user(SCARG(uap, path),
3837               NSM_NOFOLLOW_TRYEMULROOT, &vp);
3838           if (error != 0)
3839                     return (error);
3840 
3841           error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
3842 
3843           vrele(vp);
3844           return (error);
3845 }
3846 
3847 /*
3848  * Set ownership given a path name; this version does not follow links.
3849  * Provides POSIX/XPG semantics.
3850  */
3851 /* ARGSUSED */
3852 int
sys___posix_lchown(struct lwp * l,const struct sys___posix_lchown_args * uap,register_t * retval)3853 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap,
3854     register_t *retval)
3855 {
3856           /* {
3857                     syscallarg(const char *) path;
3858                     syscallarg(uid_t) uid;
3859                     syscallarg(gid_t) gid;
3860           } */
3861           int error;
3862           struct vnode *vp;
3863 
3864           error = namei_simple_user(SCARG(uap, path),
3865               NSM_NOFOLLOW_TRYEMULROOT, &vp);
3866           if (error != 0)
3867                     return (error);
3868 
3869           error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
3870 
3871           vrele(vp);
3872           return (error);
3873 }
3874 
3875 /*
3876  * Common routine to set ownership given a vnode.
3877  */
3878 static int
change_owner(struct vnode * vp,uid_t uid,gid_t gid,struct lwp * l,int posix_semantics)3879 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
3880     int posix_semantics)
3881 {
3882           struct vattr vattr;
3883           mode_t newmode;
3884           int error;
3885 
3886           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3887           if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
3888                     goto out;
3889 
3890 #define CHANGED(x) ((int)(x) != -1)
3891           newmode = vattr.va_mode;
3892           if (posix_semantics) {
3893                     /*
3894                      * POSIX/XPG semantics: if the caller is not the super-user,
3895                      * clear set-user-id and set-group-id bits.  Both POSIX and
3896                      * the XPG consider the behaviour for calls by the super-user
3897                      * implementation-defined; we leave the set-user-id and set-
3898                      * group-id settings intact in that case.
3899                      */
3900                     if (vattr.va_mode & S_ISUID) {
3901                               if (kauth_authorize_vnode(l->l_cred,
3902                                         KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
3903                                         newmode &= ~S_ISUID;
3904                     }
3905                     if (vattr.va_mode & S_ISGID) {
3906                               if (kauth_authorize_vnode(l->l_cred,
3907                                         KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
3908                                         newmode &= ~S_ISGID;
3909                     }
3910           } else {
3911                     /*
3912                      * NetBSD semantics: when changing owner and/or group,
3913                      * clear the respective bit(s).
3914                      */
3915                     if (CHANGED(uid))
3916                               newmode &= ~S_ISUID;
3917                     if (CHANGED(gid))
3918                               newmode &= ~S_ISGID;
3919           }
3920           /* Update va_mode iff altered. */
3921           if (vattr.va_mode == newmode)
3922                     newmode = VNOVAL;
3923 
3924           vattr_null(&vattr);
3925           vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3926           vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3927           vattr.va_mode = newmode;
3928           error = VOP_SETATTR(vp, &vattr, l->l_cred);
3929 #undef CHANGED
3930 
3931 out:
3932           VOP_UNLOCK(vp);
3933           return (error);
3934 }
3935 
3936 /*
3937  * Set the access and modification times given a path name; this
3938  * version follows links.
3939  */
3940 /* ARGSUSED */
3941 int
sys___utimes50(struct lwp * l,const struct sys___utimes50_args * uap,register_t * retval)3942 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
3943     register_t *retval)
3944 {
3945           /* {
3946                     syscallarg(const char *) path;
3947                     syscallarg(const struct timeval *) tptr;
3948           } */
3949 
3950           return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3951               SCARG(uap, tptr), UIO_USERSPACE);
3952 }
3953 
3954 /*
3955  * Set the access and modification times given a file descriptor.
3956  */
3957 /* ARGSUSED */
3958 int
sys___futimes50(struct lwp * l,const struct sys___futimes50_args * uap,register_t * retval)3959 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
3960     register_t *retval)
3961 {
3962           /* {
3963                     syscallarg(int) fd;
3964                     syscallarg(const struct timeval *) tptr;
3965           } */
3966           int error;
3967           file_t *fp;
3968 
3969           /* fd_getvnode() will use the descriptor for us */
3970           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3971                     return (error);
3972           error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
3973               UIO_USERSPACE);
3974           fd_putfile(SCARG(uap, fd));
3975           return (error);
3976 }
3977 
3978 int
sys_futimens(struct lwp * l,const struct sys_futimens_args * uap,register_t * retval)3979 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
3980     register_t *retval)
3981 {
3982           /* {
3983                     syscallarg(int) fd;
3984                     syscallarg(const struct timespec *) tptr;
3985           } */
3986           int error;
3987           file_t *fp;
3988 
3989           /* fd_getvnode() will use the descriptor for us */
3990           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
3991                     return (error);
3992           error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
3993               SCARG(uap, tptr), UIO_USERSPACE);
3994           fd_putfile(SCARG(uap, fd));
3995           return (error);
3996 }
3997 
3998 /*
3999  * Set the access and modification times given a path name; this
4000  * version does not follow links.
4001  */
4002 int
sys___lutimes50(struct lwp * l,const struct sys___lutimes50_args * uap,register_t * retval)4003 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
4004     register_t *retval)
4005 {
4006           /* {
4007                     syscallarg(const char *) path;
4008                     syscallarg(const struct timeval *) tptr;
4009           } */
4010 
4011           return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
4012               SCARG(uap, tptr), UIO_USERSPACE);
4013 }
4014 
4015 int
sys_utimensat(struct lwp * l,const struct sys_utimensat_args * uap,register_t * retval)4016 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
4017     register_t *retval)
4018 {
4019           /* {
4020                     syscallarg(int) fd;
4021                     syscallarg(const char *) path;
4022                     syscallarg(const struct timespec *) tptr;
4023                     syscallarg(int) flag;
4024           } */
4025           int follow;
4026           const struct timespec *tptr;
4027           int error;
4028 
4029           tptr = SCARG(uap, tptr);
4030           follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
4031 
4032           error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
4033               SCARG(uap, path), follow, tptr, UIO_USERSPACE);
4034 
4035           return error;
4036 }
4037 
4038 /*
4039  * Common routine to set access and modification times given a vnode.
4040  */
4041 int
do_sys_utimens(struct lwp * l,struct vnode * vp,const char * path,int flag,const struct timespec * tptr,enum uio_seg seg)4042 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
4043     const struct timespec *tptr, enum uio_seg seg)
4044 {
4045 
4046           return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
4047 }
4048 
4049 int
do_sys_utimensat(struct lwp * l,int fdat,struct vnode * vp,const char * path,int flag,const struct timespec * tptr,enum uio_seg seg)4050 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
4051     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
4052 {
4053           struct vattr vattr;
4054           int error, dorele = 0;
4055           namei_simple_flags_t sflags;
4056           bool vanull, setbirthtime;
4057           struct timespec ts[2];
4058 
4059           KASSERT(l != NULL || fdat == AT_FDCWD);
4060 
4061           /*
4062            * I have checked all callers and they pass either FOLLOW,
4063            * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
4064            * is 0. More to the point, they don't pass anything else.
4065            * Let's keep it that way at least until the namei interfaces
4066            * are fully sanitized.
4067            */
4068           KASSERT(flag == NOFOLLOW || flag == FOLLOW);
4069           sflags = (flag == FOLLOW) ?
4070               NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
4071 
4072           if (tptr == NULL) {
4073                     vanull = true;
4074                     nanotime(&ts[0]);
4075                     ts[1] = ts[0];
4076           } else {
4077                     vanull = false;
4078                     if (seg != UIO_SYSSPACE) {
4079                               error = copyin(tptr, ts, sizeof (ts));
4080                               if (error != 0)
4081                                         return error;
4082                     } else {
4083                               ts[0] = tptr[0];
4084                               ts[1] = tptr[1];
4085                     }
4086           }
4087 
4088           if (ts[0].tv_nsec == UTIME_NOW) {
4089                     nanotime(&ts[0]);
4090                     if (ts[1].tv_nsec == UTIME_NOW) {
4091                               vanull = true;
4092                               ts[1] = ts[0];
4093                     }
4094           } else if (ts[1].tv_nsec == UTIME_NOW)
4095                     nanotime(&ts[1]);
4096 
4097           if (vp == NULL) {
4098                     /* note: SEG describes TPTR, not PATH; PATH is always user */
4099                     error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
4100                     if (error != 0)
4101                               return error;
4102                     dorele = 1;
4103           }
4104 
4105           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4106           setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
4107               timespeccmp(&ts[1], &vattr.va_birthtime, <));
4108           vattr_null(&vattr);
4109 
4110           if (ts[0].tv_nsec != UTIME_OMIT)
4111                     vattr.va_atime = ts[0];
4112 
4113           if (ts[1].tv_nsec != UTIME_OMIT) {
4114                     vattr.va_mtime = ts[1];
4115                     if (setbirthtime)
4116                               vattr.va_birthtime = ts[1];
4117           }
4118 
4119           if (vanull)
4120                     vattr.va_vaflags |= VA_UTIMES_NULL;
4121           error = VOP_SETATTR(vp, &vattr, l->l_cred);
4122           VOP_UNLOCK(vp);
4123 
4124           if (dorele != 0)
4125                     vrele(vp);
4126 
4127           return error;
4128 }
4129 
4130 int
do_sys_utimes(struct lwp * l,struct vnode * vp,const char * path,int flag,const struct timeval * tptr,enum uio_seg seg)4131 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
4132     const struct timeval *tptr, enum uio_seg seg)
4133 {
4134           struct timespec ts[2];
4135           struct timespec *tsptr = NULL;
4136           int error;
4137 
4138           if (tptr != NULL) {
4139                     struct timeval tv[2];
4140 
4141                     if (seg != UIO_SYSSPACE) {
4142                               error = copyin(tptr, tv, sizeof(tv));
4143                               if (error != 0)
4144                                         return error;
4145                               tptr = tv;
4146                     }
4147 
4148                     if ((tptr[0].tv_usec == UTIME_NOW) ||
4149                         (tptr[0].tv_usec == UTIME_OMIT))
4150                               ts[0].tv_nsec = tptr[0].tv_usec;
4151                     else {
4152                               if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
4153                                         return EINVAL;
4154 
4155                               TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
4156                     }
4157 
4158                     if ((tptr[1].tv_usec == UTIME_NOW) ||
4159                         (tptr[1].tv_usec == UTIME_OMIT))
4160                               ts[1].tv_nsec = tptr[1].tv_usec;
4161                     else {
4162                               if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
4163                                         return EINVAL;
4164 
4165                               TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
4166                     }
4167 
4168                     tsptr = &ts[0];
4169           }
4170 
4171           return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
4172 }
4173 
4174 /*
4175  * Truncate a file given its path name.
4176  */
4177 /* ARGSUSED */
4178 int
sys_truncate(struct lwp * l,const struct sys_truncate_args * uap,register_t * retval)4179 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap,
4180     register_t *retval)
4181 {
4182           /* {
4183                     syscallarg(const char *) path;
4184                     syscallarg(int) pad;
4185                     syscallarg(off_t) length;
4186           } */
4187           struct vnode *vp;
4188           struct vattr vattr;
4189           int error;
4190 
4191           if (SCARG(uap, length) < 0)
4192                     return EINVAL;
4193 
4194           error = namei_simple_user(SCARG(uap, path),
4195               NSM_FOLLOW_TRYEMULROOT, &vp);
4196           if (error != 0)
4197                     return (error);
4198           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4199           if (vp->v_type == VDIR)
4200                     error = EISDIR;
4201           else if ((error = vn_writechk(vp)) == 0 &&
4202               (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
4203                     vattr_null(&vattr);
4204                     vattr.va_size = SCARG(uap, length);
4205                     error = VOP_SETATTR(vp, &vattr, l->l_cred);
4206           }
4207           vput(vp);
4208           return (error);
4209 }
4210 
4211 /*
4212  * Truncate a file given a file descriptor.
4213  */
4214 /* ARGSUSED */
4215 int
sys_ftruncate(struct lwp * l,const struct sys_ftruncate_args * uap,register_t * retval)4216 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap,
4217     register_t *retval)
4218 {
4219           /* {
4220                     syscallarg(int) fd;
4221                     syscallarg(int) pad;
4222                     syscallarg(off_t) length;
4223           } */
4224           file_t *fp;
4225           int error, fd = SCARG(uap, fd);
4226 
4227           fp = fd_getfile(fd);
4228           if (fp == NULL)
4229                     return EBADF;
4230           if (fp->f_ops->fo_truncate == NULL)
4231                     error = EOPNOTSUPP;
4232           else
4233                     error = (*fp->f_ops->fo_truncate)(fp, SCARG(uap, length));
4234 
4235           fd_putfile(fd);
4236           return error;
4237 }
4238 
4239 /*
4240  * Sync an open file.
4241  */
4242 /* ARGSUSED */
4243 int
sys_fsync(struct lwp * l,const struct sys_fsync_args * uap,register_t * retval)4244 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
4245 {
4246           /* {
4247                     syscallarg(int) fd;
4248           } */
4249           struct vnode *vp;
4250           file_t *fp;
4251           int error;
4252 
4253           /* fd_getvnode() will use the descriptor for us */
4254           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4255                     return (error);
4256           vp = fp->f_vnode;
4257           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4258           error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
4259           VOP_UNLOCK(vp);
4260           fd_putfile(SCARG(uap, fd));
4261           return (error);
4262 }
4263 
4264 /*
4265  * Sync a range of file data.  API modeled after that found in AIX.
4266  *
4267  * FDATASYNC indicates that we need only save enough metadata to be able
4268  * to re-read the written data.
4269  */
4270 /* ARGSUSED */
4271 int
sys_fsync_range(struct lwp * l,const struct sys_fsync_range_args * uap,register_t * retval)4272 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap,
4273     register_t *retval)
4274 {
4275           /* {
4276                     syscallarg(int) fd;
4277                     syscallarg(int) flags;
4278                     syscallarg(off_t) start;
4279                     syscallarg(off_t) length;
4280           } */
4281           struct vnode *vp;
4282           file_t *fp;
4283           int flags, nflags;
4284           off_t s, e, len;
4285           int error;
4286 
4287           /* fd_getvnode() will use the descriptor for us */
4288           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4289                     return (error);
4290 
4291           if ((fp->f_flag & FWRITE) == 0) {
4292                     error = EBADF;
4293                     goto out;
4294           }
4295 
4296           flags = SCARG(uap, flags);
4297           if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
4298               ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
4299                     error = EINVAL;
4300                     goto out;
4301           }
4302           /* Now set up the flags for value(s) to pass to VOP_FSYNC() */
4303           if (flags & FDATASYNC)
4304                     nflags = FSYNC_DATAONLY | FSYNC_WAIT;
4305           else
4306                     nflags = FSYNC_WAIT;
4307           if (flags & FDISKSYNC)
4308                     nflags |= FSYNC_CACHE;
4309 
4310           len = SCARG(uap, length);
4311           /* If length == 0, we do the whole file, and s = e = 0 will do that */
4312           if (len) {
4313                     s = SCARG(uap, start);
4314                     if (s < 0 || len < 0 || len > OFF_T_MAX - s) {
4315                               error = EINVAL;
4316                               goto out;
4317                     }
4318                     e = s + len;
4319                     KASSERT(s <= e);
4320           } else {
4321                     e = 0;
4322                     s = 0;
4323           }
4324 
4325           vp = fp->f_vnode;
4326           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4327           error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4328           VOP_UNLOCK(vp);
4329 out:
4330           fd_putfile(SCARG(uap, fd));
4331           return (error);
4332 }
4333 
4334 /*
4335  * Sync the data of an open file.
4336  */
4337 /* ARGSUSED */
4338 int
sys_fdatasync(struct lwp * l,const struct sys_fdatasync_args * uap,register_t * retval)4339 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap,
4340     register_t *retval)
4341 {
4342           /* {
4343                     syscallarg(int) fd;
4344           } */
4345           struct vnode *vp;
4346           file_t *fp;
4347           int error;
4348 
4349           /* fd_getvnode() will use the descriptor for us */
4350           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4351                     return (error);
4352           vp = fp->f_vnode;
4353           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4354           error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
4355           VOP_UNLOCK(vp);
4356           fd_putfile(SCARG(uap, fd));
4357           return (error);
4358 }
4359 
4360 /*
4361  * Rename files, (standard) BSD semantics frontend.
4362  */
4363 /* ARGSUSED */
4364 int
sys_rename(struct lwp * l,const struct sys_rename_args * uap,register_t * retval)4365 sys_rename(struct lwp *l, const struct sys_rename_args *uap,
4366     register_t *retval)
4367 {
4368           /* {
4369                     syscallarg(const char *) from;
4370                     syscallarg(const char *) to;
4371           } */
4372 
4373           return do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4374               SCARG(uap, to), UIO_USERSPACE, 0);
4375 }
4376 
4377 int
sys_renameat(struct lwp * l,const struct sys_renameat_args * uap,register_t * retval)4378 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap,
4379     register_t *retval)
4380 {
4381           /* {
4382                     syscallarg(int) fromfd;
4383                     syscallarg(const char *) from;
4384                     syscallarg(int) tofd;
4385                     syscallarg(const char *) to;
4386           } */
4387 
4388           return do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4389               SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0);
4390 }
4391 
4392 /*
4393  * Rename files, POSIX semantics frontend.
4394  */
4395 /* ARGSUSED */
4396 int
sys___posix_rename(struct lwp * l,const struct sys___posix_rename_args * uap,register_t * retval)4397 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap,
4398     register_t *retval)
4399 {
4400           /* {
4401                     syscallarg(const char *) from;
4402                     syscallarg(const char *) to;
4403           } */
4404 
4405           return do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4406               SCARG(uap, to), UIO_USERSPACE, 1);
4407 }
4408 
4409 /*
4410  * Rename files.  Source and destination must either both be directories,
4411  * or both not be directories.  If target is a directory, it must be empty.
4412  * If `from' and `to' refer to the same object, the value of the `retain'
4413  * argument is used to determine whether `from' will be
4414  *
4415  * (retain == 0)    deleted unless `from' and `to' refer to the same
4416  *                            object in the file system's name space (BSD).
4417  * (retain == 1)    always retained (POSIX).
4418  *
4419  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
4420  */
4421 int
do_sys_rename(const char * from,const char * to,enum uio_seg seg,int retain)4422 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
4423 {
4424 
4425           return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg,
4426               retain);
4427 }
4428 
4429 static int
do_sys_renameat(struct lwp * l,int fromfd,const char * from,int tofd,const char * to,enum uio_seg seg,int retain)4430 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
4431     const char *to, enum uio_seg seg, int retain)
4432 {
4433           struct pathbuf *fpb, *tpb;
4434           struct nameidata fnd, tnd;
4435           struct vnode *fdvp, *fvp;
4436           struct vnode *tdvp, *tvp;
4437           struct mount *mp, *tmp;
4438           int error;
4439 
4440           KASSERT(l != NULL || fromfd == AT_FDCWD);
4441           KASSERT(l != NULL || tofd == AT_FDCWD);
4442 
4443           error = pathbuf_maybe_copyin(from, seg, &fpb);
4444           if (error)
4445                     goto out0;
4446           KASSERT(fpb != NULL);
4447 
4448           error = pathbuf_maybe_copyin(to, seg, &tpb);
4449           if (error)
4450                     goto out1;
4451           KASSERT(tpb != NULL);
4452 
4453           /*
4454            * Lookup from.
4455            *
4456            * XXX LOCKPARENT is wrong because we don't actually want it
4457            * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
4458            * insane, so for the time being we need to leave it like this.
4459            */
4460           NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
4461           if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
4462                     goto out2;
4463 
4464           /*
4465            * Pull out the important results of the lookup, fdvp and fvp.
4466            * Of course, fvp is bogus because we're about to unlock fdvp.
4467            */
4468           fdvp = fnd.ni_dvp;
4469           fvp = fnd.ni_vp;
4470           mp = fdvp->v_mount;
4471           KASSERT(fdvp != NULL);
4472           KASSERT(fvp != NULL);
4473           KASSERT(fdvp == fvp || VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
4474           /*
4475            * Bracket the operation with fstrans_start()/fstrans_done().
4476            *
4477            * Inside the bracket this file system cannot be unmounted so
4478            * a vnode on this file system cannot change its v_mount.
4479            * A vnode on another file system may still change to dead mount.
4480            */
4481           fstrans_start(mp);
4482 
4483           /*
4484            * Make sure neither fdvp nor fvp is locked.
4485            */
4486           if (fdvp != fvp)
4487                     VOP_UNLOCK(fdvp);
4488           /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4489           /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4490 
4491           /*
4492            * Reject renaming `.' and `..'.  Can't do this until after
4493            * namei because we need namei's parsing to find the final
4494            * component name.  (namei should just leave us with the final
4495            * component name and not look it up itself, but anyway...)
4496            *
4497            * This was here before because we used to relookup from
4498            * instead of to and relookup requires the caller to check
4499            * this, but now file systems may depend on this check, so we
4500            * must retain it until the file systems are all rototilled.
4501            */
4502           if ((fnd.ni_cnd.cn_namelen == 1 &&
4503                     fnd.ni_cnd.cn_nameptr[0] == '.') ||
4504               (fnd.ni_cnd.cn_namelen == 2 &&
4505                     fnd.ni_cnd.cn_nameptr[0] == '.' &&
4506                     fnd.ni_cnd.cn_nameptr[1] == '.')) {
4507                     error = EINVAL;     /* XXX EISDIR?  */
4508                     goto abort0;
4509           }
4510 
4511           /*
4512            * Lookup to.
4513            *
4514            * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
4515            * fvp here to decide whether to add CREATEDIR is a load of
4516            * bollocks because fvp might be the wrong node by now, since
4517            * fdvp is unlocked.
4518            *
4519            * XXX Why not pass CREATEDIR always?
4520            */
4521           NDINIT(&tnd, RENAME,
4522               (LOCKPARENT | NOCACHE | TRYEMULROOT |
4523                     ((fvp->v_type == VDIR)? CREATEDIR : 0)),
4524               tpb);
4525           if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
4526                     goto abort0;
4527 
4528           /*
4529            * Pull out the important results of the lookup, tdvp and tvp.
4530            * Of course, tvp is bogus because we're about to unlock tdvp.
4531            */
4532           tdvp = tnd.ni_dvp;
4533           tvp = tnd.ni_vp;
4534           KASSERT(tdvp != NULL);
4535           KASSERT(tdvp == tvp || VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4536 
4537           if (fvp->v_type == VDIR)
4538                     tnd.ni_cnd.cn_flags |= WILLBEDIR;
4539           /*
4540            * Make sure neither tdvp nor tvp is locked.
4541            */
4542           if (tdvp != tvp)
4543                     VOP_UNLOCK(tdvp);
4544           /* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4545           /* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4546 
4547           /*
4548            * Reject renaming onto `.' or `..'.  relookup is unhappy with
4549            * these, which is why we must do this here.  Once upon a time
4550            * we relooked up from instead of to, and consequently didn't
4551            * need this check, but now that we relookup to instead of
4552            * from, we need this; and we shall need it forever forward
4553            * until the VOP_RENAME protocol changes, because file systems
4554            * will no doubt begin to depend on this check.
4555            */
4556           if (tnd.ni_cnd.cn_namelen == 1 && tnd.ni_cnd.cn_nameptr[0] == '.') {
4557                     error = EISDIR;
4558                     goto abort1;
4559           }
4560           if (tnd.ni_cnd.cn_namelen == 2 &&
4561               tnd.ni_cnd.cn_nameptr[0] == '.' &&
4562               tnd.ni_cnd.cn_nameptr[1] == '.') {
4563                     error = EINVAL;
4564                     goto abort1;
4565           }
4566 
4567           /*
4568            * Make sure the mount points match.  Although we don't hold
4569            * any vnode locks, the v_mount on fdvp file system are stable.
4570            *
4571            * Unmounting another file system at an inopportune moment may
4572            * cause tdvp to disappear and change its v_mount to dead.
4573            *
4574            * So in either case different v_mount means cross-device rename.
4575            */
4576           KASSERT(mp != NULL);
4577           tmp = tdvp->v_mount;
4578 
4579           if (mp != tmp) {
4580                     error = EXDEV;
4581                     goto abort1;
4582           }
4583 
4584           /*
4585            * Take the vfs rename lock to avoid cross-directory screw cases.
4586            * Nothing is locked currently, so taking this lock is safe.
4587            */
4588           error = VFS_RENAMELOCK_ENTER(mp);
4589           if (error)
4590                     goto abort1;
4591 
4592           /*
4593            * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
4594            * and nothing is locked except for the vfs rename lock.
4595            *
4596            * The next step is a little rain dance to conform to the
4597            * insane lock protocol, even though it does nothing to ward
4598            * off race conditions.
4599            *
4600            * We need tdvp and tvp to be locked.  However, because we have
4601            * unlocked tdvp in order to hold no locks while we take the
4602            * vfs rename lock, tvp may be wrong here, and we can't safely
4603            * lock it even if the sensible file systems will just unlock
4604            * it straight away.  Consequently, we must lock tdvp and then
4605            * relookup tvp to get it locked.
4606            *
4607            * Finally, because the VOP_RENAME protocol is brain-damaged
4608            * and various file systems insanely depend on the semantics of
4609            * this brain damage, the lookup of to must be the last lookup
4610            * before VOP_RENAME.
4611            */
4612           vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
4613           error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
4614           if (error)
4615                     goto abort2;
4616 
4617           /*
4618            * Drop the old tvp and pick up the new one -- which might be
4619            * the same, but that doesn't matter to us.  After this, tdvp
4620            * and tvp should both be locked.
4621            */
4622           if (tvp != NULL)
4623                     vrele(tvp);
4624           tvp = tnd.ni_vp;
4625           KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4626           KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
4627 
4628           /*
4629            * The old do_sys_rename had various consistency checks here
4630            * involving fvp and tvp.  fvp is bogus already here, and tvp
4631            * will become bogus soon in any sensible file system, so the
4632            * only purpose in putting these checks here is to give lip
4633            * service to these screw cases and to acknowledge that they
4634            * exist, not actually to handle them, but here you go
4635            * anyway...
4636            */
4637 
4638           /*
4639            * Acknowledge that directories and non-directories aren't
4640            * supposed to mix.
4641            */
4642           if (tvp != NULL) {
4643                     if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
4644                               error = ENOTDIR;
4645                               goto abort3;
4646                     } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
4647                               error = EISDIR;
4648                               goto abort3;
4649                     }
4650           }
4651 
4652           /*
4653            * Acknowledge some random screw case, among the dozens that
4654            * might arise.
4655            */
4656           if (fvp == tdvp) {
4657                     error = EINVAL;
4658                     goto abort3;
4659           }
4660 
4661           /*
4662            * Acknowledge that POSIX has a wacky screw case.
4663            *
4664            * XXX Eventually the retain flag needs to be passed on to
4665            * VOP_RENAME.
4666            */
4667           if (fvp == tvp) {
4668                     if (retain) {
4669                               error = 0;
4670                               goto abort3;
4671                     } else if (fdvp == tdvp &&
4672                         fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen &&
4673                         0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
4674                               fnd.ni_cnd.cn_namelen)) {
4675                               error = 0;
4676                               goto abort3;
4677                     }
4678           }
4679 
4680           /*
4681            * Make sure veriexec can screw us up.  (But a race can screw
4682            * up veriexec, of course -- remember, fvp and (soon) tvp are
4683            * bogus.)
4684            */
4685 #if NVERIEXEC > 0
4686           {
4687                     char *f1, *f2;
4688                     size_t f1_len;
4689                     size_t f2_len;
4690 
4691                     f1_len = fnd.ni_cnd.cn_namelen + 1;
4692                     f1 = kmem_alloc(f1_len, KM_SLEEP);
4693                     strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
4694 
4695                     f2_len = tnd.ni_cnd.cn_namelen + 1;
4696                     f2 = kmem_alloc(f2_len, KM_SLEEP);
4697                     strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
4698 
4699                     error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
4700 
4701                     kmem_free(f1, f1_len);
4702                     kmem_free(f2, f2_len);
4703 
4704                     if (error)
4705                               goto abort3;
4706           }
4707 #endif /* NVERIEXEC > 0 */
4708 
4709           /*
4710            * All ready.  Incant the rename vop.
4711            */
4712           /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4713           /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4714           KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4715           KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
4716           error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
4717 
4718           /*
4719            * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
4720            * tdvp and tvp.  But we can't assert any of that.
4721            */
4722           /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
4723           /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
4724           /* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
4725           /* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
4726 
4727           /*
4728            * So all we have left to do is to drop the rename lock and
4729            * destroy the pathbufs.
4730            */
4731           VFS_RENAMELOCK_EXIT(mp);
4732           fstrans_done(mp);
4733           goto out2;
4734 
4735 abort3:   if (tvp != NULL && tvp != tdvp)
4736                     VOP_UNLOCK(tvp);
4737 abort2:   VOP_UNLOCK(tdvp);
4738           VFS_RENAMELOCK_EXIT(mp);
4739 abort1:   VOP_ABORTOP(tdvp, &tnd.ni_cnd);
4740           vrele(tdvp);
4741           if (tvp != NULL)
4742                     vrele(tvp);
4743 abort0:   VOP_ABORTOP(fdvp, &fnd.ni_cnd);
4744           vrele(fdvp);
4745           vrele(fvp);
4746           fstrans_done(mp);
4747 out2:     pathbuf_destroy(tpb);
4748 out1:     pathbuf_destroy(fpb);
4749 out0:     return error;
4750 }
4751 
4752 /*
4753  * Make a directory file.
4754  */
4755 /* ARGSUSED */
4756 int
sys_mkdir(struct lwp * l,const struct sys_mkdir_args * uap,register_t * retval)4757 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
4758 {
4759           /* {
4760                     syscallarg(const char *) path;
4761                     syscallarg(int) mode;
4762           } */
4763 
4764           return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4765               SCARG(uap, mode), UIO_USERSPACE);
4766 }
4767 
4768 int
sys_mkdirat(struct lwp * l,const struct sys_mkdirat_args * uap,register_t * retval)4769 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
4770     register_t *retval)
4771 {
4772           /* {
4773                     syscallarg(int) fd;
4774                     syscallarg(const char *) path;
4775                     syscallarg(int) mode;
4776           } */
4777 
4778           return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4779               SCARG(uap, mode), UIO_USERSPACE);
4780 }
4781 
4782 int
do_sys_mkdir(const char * path,mode_t mode,enum uio_seg seg)4783 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
4784 {
4785 
4786           return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
4787 }
4788 
4789 static int
do_sys_mkdirat(struct lwp * l,int fdat,const char * path,mode_t mode,enum uio_seg seg)4790 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
4791     enum uio_seg seg)
4792 {
4793           struct proc *p = curlwp->l_proc;
4794           struct vnode *vp;
4795           struct vattr vattr;
4796           int error;
4797           struct pathbuf *pb;
4798           struct nameidata nd;
4799 
4800           KASSERT(l != NULL || fdat == AT_FDCWD);
4801 
4802           /* XXX bollocks, should pass in a pathbuf */
4803           error = pathbuf_maybe_copyin(path, seg, &pb);
4804           if (error) {
4805                     return error;
4806           }
4807 
4808           NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
4809 
4810           if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
4811                     pathbuf_destroy(pb);
4812                     return (error);
4813           }
4814           vp = nd.ni_vp;
4815           if (vp != NULL) {
4816                     VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4817                     if (nd.ni_dvp == vp)
4818                               vrele(nd.ni_dvp);
4819                     else
4820                               vput(nd.ni_dvp);
4821                     vrele(vp);
4822                     pathbuf_destroy(pb);
4823                     return (EEXIST);
4824           }
4825           vattr_null(&vattr);
4826           vattr.va_type = VDIR;
4827           /* We will read cwdi->cwdi_cmask unlocked. */
4828           vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4829           nd.ni_cnd.cn_flags |= WILLBEDIR;
4830           error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4831           if (!error)
4832                     vrele(nd.ni_vp);
4833           vput(nd.ni_dvp);
4834           pathbuf_destroy(pb);
4835           return (error);
4836 }
4837 
4838 /*
4839  * Remove a directory file.
4840  */
4841 /* ARGSUSED */
4842 int
sys_rmdir(struct lwp * l,const struct sys_rmdir_args * uap,register_t * retval)4843 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
4844 {
4845           /* {
4846                     syscallarg(char *) path;
4847           } */
4848 
4849           return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), AT_REMOVEDIR,
4850               UIO_USERSPACE);
4851 }
4852 
4853 /*
4854  * Read a block of directory entries in a file system independent format.
4855  */
4856 int
sys___getdents30(struct lwp * l,const struct sys___getdents30_args * uap,register_t * retval)4857 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap,
4858     register_t *retval)
4859 {
4860           /* {
4861                     syscallarg(int) fd;
4862                     syscallarg(char *) buf;
4863                     syscallarg(size_t) count;
4864           } */
4865           file_t *fp;
4866           int error, done;
4867 
4868           /* fd_getvnode() will use the descriptor for us */
4869           if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
4870                     return (error);
4871           if ((fp->f_flag & FREAD) == 0) {
4872                     error = EBADF;
4873                     goto out;
4874           }
4875           error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4876               SCARG(uap, count), &done, l, 0, 0);
4877           ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4878           *retval = done;
4879 out:
4880           fd_putfile(SCARG(uap, fd));
4881           return (error);
4882 }
4883 
4884 /*
4885  * Set the mode mask for creation of filesystem nodes.
4886  */
4887 int
sys_umask(struct lwp * l,const struct sys_umask_args * uap,register_t * retval)4888 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
4889 {
4890           /* {
4891                     syscallarg(mode_t) newmask;
4892           } */
4893 
4894           /*
4895            * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
4896            * serialization with those reads is required.  It's important to
4897            * return a coherent answer for the caller of umask() though, and
4898            * the atomic operation accomplishes that.
4899            */
4900           *retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
4901               SCARG(uap, newmask) & ALLPERMS);
4902 
4903           return (0);
4904 }
4905 
4906 int
dorevoke(struct vnode * vp,kauth_cred_t cred)4907 dorevoke(struct vnode *vp, kauth_cred_t cred)
4908 {
4909           struct vattr vattr;
4910           int error, fs_decision;
4911 
4912           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4913           error = VOP_GETATTR(vp, &vattr, cred);
4914           VOP_UNLOCK(vp);
4915           if (error != 0)
4916                     return error;
4917           fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
4918           error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4919               fs_decision);
4920           if (!error)
4921                     VOP_REVOKE(vp, REVOKEALL);
4922           return (error);
4923 }
4924 
4925 /*
4926  * Void all references to file by ripping underlying filesystem
4927  * away from vnode.
4928  */
4929 /* ARGSUSED */
4930 int
sys_revoke(struct lwp * l,const struct sys_revoke_args * uap,register_t * retval)4931 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap,
4932     register_t *retval)
4933 {
4934           /* {
4935                     syscallarg(const char *) path;
4936           } */
4937           struct vnode *vp;
4938           int error;
4939 
4940           error = namei_simple_user(SCARG(uap, path), NSM_FOLLOW_TRYEMULROOT,
4941               &vp);
4942           if (error != 0)
4943                     return (error);
4944           error = dorevoke(vp, l->l_cred);
4945           vrele(vp);
4946           return (error);
4947 }
4948 
4949 /*
4950  * Allocate backing store for a file, filling a hole without having to
4951  * explicitly write anything out.
4952  */
4953 /* ARGSUSED */
4954 int
sys_posix_fallocate(struct lwp * l,const struct sys_posix_fallocate_args * uap,register_t * retval)4955 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
4956     register_t *retval)
4957 {
4958           /* {
4959                     syscallarg(int) fd;
4960                     syscallarg(off_t) pos;
4961                     syscallarg(off_t) len;
4962           } */
4963           int fd;
4964           off_t pos, len;
4965           struct file *fp;
4966           struct vnode *vp;
4967           int error;
4968 
4969           fd = SCARG(uap, fd);
4970           pos = SCARG(uap, pos);
4971           len = SCARG(uap, len);
4972 
4973           if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
4974                     *retval = EINVAL;
4975                     return 0;
4976           }
4977 
4978           error = fd_getvnode(fd, &fp);
4979           if (error) {
4980                     *retval = error;
4981                     return 0;
4982           }
4983           if ((fp->f_flag & FWRITE) == 0) {
4984                     error = EBADF;
4985                     goto fail;
4986           }
4987           vp = fp->f_vnode;
4988 
4989           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4990           if (vp->v_type == VDIR) {
4991                     error = EISDIR;
4992           } else {
4993                     error = VOP_FALLOCATE(vp, pos, len);
4994           }
4995           VOP_UNLOCK(vp);
4996 
4997 fail:
4998           fd_putfile(fd);
4999           *retval = error;
5000           return 0;
5001 }
5002 
5003 /*
5004  * Deallocate backing store for a file, creating a hole. Also used for
5005  * invoking TRIM on disks.
5006  */
5007 /* ARGSUSED */
5008 int
sys_fdiscard(struct lwp * l,const struct sys_fdiscard_args * uap,register_t * retval)5009 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
5010     register_t *retval)
5011 {
5012           /* {
5013                     syscallarg(int) fd;
5014                     syscallarg(off_t) pos;
5015                     syscallarg(off_t) len;
5016           } */
5017           int fd;
5018           off_t pos, len;
5019           struct file *fp;
5020           struct vnode *vp;
5021           int error;
5022 
5023           fd = SCARG(uap, fd);
5024           pos = SCARG(uap, pos);
5025           len = SCARG(uap, len);
5026 
5027           if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
5028                     return EINVAL;
5029           }
5030 
5031           error = fd_getvnode(fd, &fp);
5032           if (error) {
5033                     return error;
5034           }
5035           if ((fp->f_flag & FWRITE) == 0) {
5036                     error = EBADF;
5037                     goto fail;
5038           }
5039           vp = fp->f_vnode;
5040 
5041           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5042           if (vp->v_type == VDIR) {
5043                     error = EISDIR;
5044           } else {
5045                     error = VOP_FDISCARD(vp, pos, len);
5046           }
5047           VOP_UNLOCK(vp);
5048 
5049 fail:
5050           fd_putfile(fd);
5051           return error;
5052 }
5053