1 /*        $NetBSD: linux_file.c,v 1.133 2024/10/01 17:46:51 riastradh Exp $     */
2 
3 /*-
4  * Copyright (c) 1995, 1998, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Frank van der Linden and Eric Haszlakiewicz.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Functions in multiarch:
34  *        linux_sys_llseek    : linux_llseek.c
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: linux_file.c,v 1.133 2024/10/01 17:46:51 riastradh Exp $");
39 
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/namei.h>
44 #include <sys/proc.h>
45 #include <sys/file.h>
46 #include <sys/fcntl.h>
47 #include <sys/stat.h>
48 #include <sys/vfs_syscalls.h>
49 #include <sys/filedesc.h>
50 #include <sys/ioctl.h>
51 #include <sys/kernel.h>
52 #include <sys/mount.h>
53 #include <sys/namei.h>
54 #include <sys/vnode.h>
55 #include <sys/tty.h>
56 #include <sys/socketvar.h>
57 #include <sys/conf.h>
58 #include <sys/pipe.h>
59 #include <sys/fstrans.h>
60 #include <sys/syscallargs.h>
61 #include <sys/vfs_syscalls.h>
62 
63 #include <compat/linux/common/linux_types.h>
64 #include <compat/linux/common/linux_signal.h>
65 #include <compat/linux/common/linux_fcntl.h>
66 #include <compat/linux/common/linux_util.h>
67 #include <compat/linux/common/linux_machdep.h>
68 #include <compat/linux/common/linux_ipc.h>
69 #include <compat/linux/common/linux_sem.h>
70 
71 #include <compat/linux/linux_syscallargs.h>
72 
73 #ifdef DEBUG_LINUX
74 #define DPRINTF(a, ...)       uprintf(a, __VA_ARGS__)
75 #else
76 #define DPRINTF(a, ...)
77 #endif
78 
79 #define LINUX_COPY_FILE_RANGE_MAX_CHUNK 8192
80 
81 static int bsd_to_linux_ioflags(int);
82 #if !defined(__aarch64__) && !defined(__amd64__)
83 static void bsd_to_linux_stat(struct stat *, struct linux_stat *);
84 #endif
85 
conv_linux_flock(linux,flock)86 conv_linux_flock(linux, flock)
87 
/*
 * Some file-related calls are handled here. The usual flag conversion
 * and structure conversion is done, and alternate emul path searching.
 */
92 
93 /*
94  * The next two functions convert between the Linux and NetBSD values
95  * of the flags used in open(2) and fcntl(2).
96  */
97 int
98 linux_to_bsd_ioflags(int lflags)
99 {
100           int res = 0;
101 
102           res |= cvtto_bsd_mask(lflags, LINUX_O_WRONLY, O_WRONLY);
103           res |= cvtto_bsd_mask(lflags, LINUX_O_RDONLY, O_RDONLY);
104           res |= cvtto_bsd_mask(lflags, LINUX_O_RDWR, O_RDWR);
105 
106           res |= cvtto_bsd_mask(lflags, LINUX_O_CREAT, O_CREAT);
107           res |= cvtto_bsd_mask(lflags, LINUX_O_EXCL, O_EXCL);
108           res |= cvtto_bsd_mask(lflags, LINUX_O_NOCTTY, O_NOCTTY);
109           res |= cvtto_bsd_mask(lflags, LINUX_O_TRUNC, O_TRUNC);
110           res |= cvtto_bsd_mask(lflags, LINUX_O_APPEND, O_APPEND);
111           res |= cvtto_bsd_mask(lflags, LINUX_O_NONBLOCK, O_NONBLOCK);
112           res |= cvtto_bsd_mask(lflags, LINUX_O_NDELAY, O_NDELAY);
113           res |= cvtto_bsd_mask(lflags, LINUX_O_SYNC, O_FSYNC);
114           res |= cvtto_bsd_mask(lflags, LINUX_FASYNC, O_ASYNC);
115           res |= cvtto_bsd_mask(lflags, LINUX_O_DIRECT, O_DIRECT);
116           res |= cvtto_bsd_mask(lflags, LINUX_O_DIRECTORY, O_DIRECTORY);
117           res |= cvtto_bsd_mask(lflags, LINUX_O_NOFOLLOW, O_NOFOLLOW);
118           res |= cvtto_bsd_mask(lflags, LINUX_O_CLOEXEC, O_CLOEXEC);
119 
120           return res;
121 }
122 
123 static int
bsd_to_linux_ioflags(int bflags)124 bsd_to_linux_ioflags(int bflags)
125 {
126           int res = 0;
127 
128           res |= cvtto_linux_mask(bflags, O_WRONLY, LINUX_O_WRONLY);
129           res |= cvtto_linux_mask(bflags, O_RDONLY, LINUX_O_RDONLY);
130           res |= cvtto_linux_mask(bflags, O_RDWR, LINUX_O_RDWR);
131 
132           res |= cvtto_linux_mask(bflags, O_CREAT, LINUX_O_CREAT);
133           res |= cvtto_linux_mask(bflags, O_EXCL, LINUX_O_EXCL);
134           res |= cvtto_linux_mask(bflags, O_NOCTTY, LINUX_O_NOCTTY);
135           res |= cvtto_linux_mask(bflags, O_TRUNC, LINUX_O_TRUNC);
136           res |= cvtto_linux_mask(bflags, O_APPEND, LINUX_O_APPEND);
137           res |= cvtto_linux_mask(bflags, O_NONBLOCK, LINUX_O_NONBLOCK);
138           res |= cvtto_linux_mask(bflags, O_NDELAY, LINUX_O_NDELAY);
139           res |= cvtto_linux_mask(bflags, O_FSYNC, LINUX_O_SYNC);
140           res |= cvtto_linux_mask(bflags, O_ASYNC, LINUX_FASYNC);
141           res |= cvtto_linux_mask(bflags, O_DIRECT, LINUX_O_DIRECT);
142           res |= cvtto_linux_mask(bflags, O_DIRECTORY, LINUX_O_DIRECTORY);
143           res |= cvtto_linux_mask(bflags, O_NOFOLLOW, LINUX_O_NOFOLLOW);
144           res |= cvtto_linux_mask(bflags, O_CLOEXEC, LINUX_O_CLOEXEC);
145 
146           return res;
147 }
148 
/*
 * Combine the two "unsigned long" halves of a file offset into a single
 * off_t.
 *
 *	hi	upper half of the offset (ignored on LP64, see below)
 *	lo	lower half of the offset; the whole offset on LP64
 *
 * Returns the combined offset.
 */
static inline off_t
linux_hilo_to_off_t(unsigned long hi, unsigned long lo)
{
#ifdef _LP64
	/*
	 * Linux discards the "hi" portion on LP64 platforms; even though
	 * glibc puts the upper 32 bits of the offset into the "hi"
	 * argument regardless, the "lo" argument has all the bits in
	 * this case.
	 */
	(void)hi;
	return (off_t)lo;
#else
	/*
	 * Assemble the value as an unsigned 64-bit quantity and convert
	 * once at the end: left-shifting a signed off_t so that a set bit
	 * reaches the sign position is undefined behaviour.
	 */
	return (off_t)(((uint64_t)hi << 32) | (uint64_t)lo);
#endif /* _LP64 */
}
165 
166 #if !defined(__aarch64__)
167 /*
168  * creat(2) is an obsolete function, but it's present as a Linux
169  * system call, so let's deal with it.
170  *
171  * Note: On the Alpha this doesn't really exist in Linux, but it's defined
172  * in syscalls.master anyway so this doesn't have to be special cased.
173  *
174  * Just call open(2) with the TRUNC, CREAT and WRONLY flags.
175  */
176 int
linux_sys_creat(struct lwp * l,const struct linux_sys_creat_args * uap,register_t * retval)177 linux_sys_creat(struct lwp *l, const struct linux_sys_creat_args *uap,
178     register_t *retval)
179 {
180           /* {
181                     syscallarg(const char *) path;
182                     syscallarg(linux_umode_t) mode;
183           } */
184           struct sys_open_args oa;
185 
186           SCARG(&oa, path) = SCARG(uap, path);
187           SCARG(&oa, flags) = O_CREAT | O_TRUNC | O_WRONLY;
188           SCARG(&oa, mode) = SCARG(uap, mode);
189 
190           return sys_open(l, &oa, retval);
191 }
192 #endif
193 
194 static void
linux_open_ctty(struct lwp * l,int flags,int fd)195 linux_open_ctty(struct lwp *l, int flags, int fd)
196 {
197           struct proc *p = l->l_proc;
198 
199           /*
200            * this bit from sunos_misc.c (and svr4_fcntl.c).
201            * If we are a session leader, and we don't have a controlling
202            * terminal yet, and the O_NOCTTY flag is not set, try to make
203            * this the controlling terminal.
204            */
205         if (!(flags & O_NOCTTY) && SESS_LEADER(p) && !(p->p_lflag & PL_CONTROLT)) {
206                 file_t *fp;
207 
208                     fp = fd_getfile(fd);
209 
210                 /* ignore any error, just give it a try */
211                 if (fp != NULL) {
212                               if (fp->f_type == DTYPE_VNODE) {
213                                         (fp->f_ops->fo_ioctl) (fp, TIOCSCTTY, NULL);
214                               }
215                               fd_putfile(fd);
216                     }
217         }
218 }
219 
220 /*
221  * open(2). Take care of the different flag values, and let the
222  * NetBSD syscall do the real work. See if this operation
223  * gives the current process a controlling terminal.
224  * (XXX is this necessary?)
225  */
226 int
linux_sys_open(struct lwp * l,const struct linux_sys_open_args * uap,register_t * retval)227 linux_sys_open(struct lwp *l, const struct linux_sys_open_args *uap,
228     register_t *retval)
229 {
230           /* {
231                     syscallarg(const char *) path;
232                     syscallarg(int) flags;
233                     syscallarg(linux_umode_t) mode;
234           } */
235           int error, fl;
236           struct sys_open_args boa;
237 
238           fl = linux_to_bsd_ioflags(SCARG(uap, flags));
239 
240           SCARG(&boa, path) = SCARG(uap, path);
241           SCARG(&boa, flags) = fl;
242           SCARG(&boa, mode) = SCARG(uap, mode);
243 
244           if ((error = sys_open(l, &boa, retval)))
245                     return (error == EFTYPE) ? ELOOP : error;
246 
247           linux_open_ctty(l, fl, *retval);
248           return 0;
249 }
250 
251 int
linux_sys_openat(struct lwp * l,const struct linux_sys_openat_args * uap,register_t * retval)252 linux_sys_openat(struct lwp *l, const struct linux_sys_openat_args *uap,
253     register_t *retval)
254 {
255           /* {
256                     syscallarg(int) fd;
257                     syscallarg(const char *) path;
258                     syscallarg(int) flags;
259                     syscallarg(linux_umode_t) mode;
260           } */
261           int error, fl;
262           struct sys_openat_args boa;
263 
264           fl = linux_to_bsd_ioflags(SCARG(uap, flags));
265 
266           SCARG(&boa, fd) = SCARG(uap, fd);
267           SCARG(&boa, path) = SCARG(uap, path);
268           SCARG(&boa, oflags) = fl;
269           SCARG(&boa, mode) = SCARG(uap, mode);
270 
271           if ((error = sys_openat(l, &boa, retval)))
272                     return (error == EFTYPE) ? ELOOP : error;
273 
274           linux_open_ctty(l, fl, *retval);
275           return 0;
276 }
277 
278 /*
279  * Most actions in the fcntl() call are straightforward; simply
280  * pass control to the NetBSD system call. A few commands need
281  * conversions after the actual system call has done its work,
282  * because the flag values and lock structure are different.
283  */
284 int
linux_sys_fcntl(struct lwp * l,const struct linux_sys_fcntl_args * uap,register_t * retval)285 linux_sys_fcntl(struct lwp *l, const struct linux_sys_fcntl_args *uap,
286     register_t *retval)
287 {
288           /* {
289                     syscallarg(int) fd;
290                     syscallarg(int) cmd;
291                     syscallarg(void *) arg;
292           } */
293           struct proc *p = l->l_proc;
294           int fd, cmd, error;
295           u_long val;
296           void *arg;
297           struct sys_fcntl_args fca;
298           file_t *fp;
299           struct vnode *vp;
300           struct vattr va;
301           long pgid;
302           struct pgrp *pgrp;
303           struct tty *tp;
304 
305           fd = SCARG(uap, fd);
306           cmd = SCARG(uap, cmd);
307           arg = SCARG(uap, arg);
308 
309           switch (cmd) {
310 
311           case LINUX_F_DUPFD:
312                     cmd = F_DUPFD;
313                     break;
314 
315           case LINUX_F_GETFD:
316                     cmd = F_GETFD;
317                     break;
318 
319           case LINUX_F_SETFD:
320                     cmd = F_SETFD;
321                     break;
322 
323           case LINUX_F_GETFL:
324                     SCARG(&fca, fd) = fd;
325                     SCARG(&fca, cmd) = F_GETFL;
326                     SCARG(&fca, arg) = arg;
327                     if ((error = sys_fcntl(l, &fca, retval)))
328                               return error;
329                     retval[0] = bsd_to_linux_ioflags(retval[0]);
330                     return 0;
331 
332           case LINUX_F_SETFL: {
333                     file_t    *fp1 = NULL;
334 
335                     val = linux_to_bsd_ioflags((unsigned long)SCARG(uap, arg));
336                     /*
337                      * Linux seems to have same semantics for sending SIGIO to the
338                      * read side of socket, but slightly different semantics
339                      * for SIGIO to the write side.  Rather than sending the SIGIO
340                      * every time it's possible to write (directly) more data, it
341                      * only sends SIGIO if last write(2) failed due to insufficient
342                      * memory to hold the data. This is compatible enough
343                      * with NetBSD semantics to not do anything about the
344                      * difference.
345                      *
346                      * Linux does NOT send SIGIO for pipes. Deal with socketpair
347                      * ones and DTYPE_PIPE ones. For these, we don't set
348                      * the underlying flags (we don't pass O_ASYNC flag down
349                      * to sys_fcntl()), but set the FASYNC flag for file descriptor,
350                      * so that F_GETFL would report the ASYNC i/o is on.
351                      */
352                     if (val & O_ASYNC) {
353                               if (((fp1 = fd_getfile(fd)) == NULL))
354                                   return (EBADF);
355                               if (((fp1->f_type == DTYPE_SOCKET) && fp1->f_data
356                                     && ((struct socket *)fp1->f_data)->so_state & SS_ISAPIPE)
357                                   || (fp1->f_type == DTYPE_PIPE))
358                                         val &= ~O_ASYNC;
359                               else {
360                                         /* not a pipe, do not modify anything */
361                                         fd_putfile(fd);
362                                         fp1 = NULL;
363                               }
364                     }
365 
366                     SCARG(&fca, fd) = fd;
367                     SCARG(&fca, cmd) = F_SETFL;
368                     SCARG(&fca, arg) = (void *) val;
369 
370                     error = sys_fcntl(l, &fca, retval);
371 
372                     /* Now set the FASYNC flag for pipes */
373                     if (fp1) {
374                               if (!error) {
375                                         mutex_enter(&fp1->f_lock);
376                                         fp1->f_flag |= FASYNC;
377                                         mutex_exit(&fp1->f_lock);
378                               }
379                               fd_putfile(fd);
380                     }
381 
382                     return (error);
383               }
384 
385           case LINUX_F_GETLK:
386                     do_linux_getlk(fd, cmd, arg, linux, flock);
387 
388           case LINUX_F_SETLK:
389           case LINUX_F_SETLKW:
390                     do_linux_setlk(fd, cmd, arg, linux, flock, LINUX_F_SETLK);
391 
392           case LINUX_F_SETOWN:
393           case LINUX_F_GETOWN:
394                     /*
395                      * We need to route fcntl() for tty descriptors around normal
396                      * fcntl(), since NetBSD tty TIOC{G,S}PGRP semantics is too
397                      * restrictive for Linux F_{G,S}ETOWN. For non-tty descriptors,
398                      * this is not a problem.
399                      */
400                     if ((fp = fd_getfile(fd)) == NULL)
401                               return EBADF;
402 
403                     /* Check it's a character device vnode */
404                     if (fp->f_type != DTYPE_VNODE
405                         || (vp = (struct vnode *)fp->f_data) == NULL
406                         || vp->v_type != VCHR) {
407                               fd_putfile(fd);
408 
409               not_tty:
410                               /* Not a tty, proceed with common fcntl() */
411                               cmd = cmd == LINUX_F_SETOWN ? F_SETOWN : F_GETOWN;
412                               break;
413                     }
414 
415                     vn_lock(vp, LK_SHARED | LK_RETRY);
416                     error = VOP_GETATTR(vp, &va, l->l_cred);
417                     VOP_UNLOCK(vp);
418 
419                     fd_putfile(fd);
420 
421                     if (error)
422                               return error;
423 
424                     if ((tp = cdev_tty(va.va_rdev)) == NULL)
425                               goto not_tty;
426 
427                     /* set tty pg_id appropriately */
428                     mutex_enter(&proc_lock);
429                     if (cmd == LINUX_F_GETOWN) {
430                               retval[0] = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
431                               mutex_exit(&proc_lock);
432                               return 0;
433                     }
434                     if ((long)arg <= 0) {
435                               pgid = -(long)arg;
436                     } else {
437                               struct proc *p1 = proc_find((long)arg);
438                               if (p1 == NULL) {
439                                         mutex_exit(&proc_lock);
440                                         return (ESRCH);
441                               }
442                               pgid = (long)p1->p_pgrp->pg_id;
443                     }
444                     pgrp = pgrp_find(pgid);
445                     if (pgrp == NULL || pgrp->pg_session != p->p_session) {
446                               mutex_exit(&proc_lock);
447                               return EPERM;
448                     }
449                     tp->t_pgrp = pgrp;
450                     mutex_exit(&proc_lock);
451                     return 0;
452 
453           case LINUX_F_DUPFD_CLOEXEC:
454                     cmd = F_DUPFD_CLOEXEC;
455                     break;
456 
457           case LINUX_F_ADD_SEALS:
458                     cmd = F_ADD_SEALS;
459                     break;
460 
461           case LINUX_F_GET_SEALS:
462                     cmd = F_GET_SEALS;
463                     break;
464 
465           default:
466                     return EOPNOTSUPP;
467           }
468 
469           SCARG(&fca, fd) = fd;
470           SCARG(&fca, cmd) = cmd;
471           SCARG(&fca, arg) = arg;
472 
473           return sys_fcntl(l, &fca, retval);
474 }
475 
476 #if !defined(__aarch64__) && !defined(__amd64__)
477 /*
478  * Convert a NetBSD stat structure to a Linux stat structure.
479  * Only the order of the fields and the padding in the structure
480  * is different. linux_fakedev is a machine-dependent function
481  * which optionally converts device driver major/minor numbers
482  * (XXX horrible, but what can you do against code that compares
483  * things against constant major device numbers? sigh)
484  */
485 static void
bsd_to_linux_stat(struct stat * bsp,struct linux_stat * lsp)486 bsd_to_linux_stat(struct stat *bsp, struct linux_stat *lsp)
487 {
488 
489           memset(lsp, 0, sizeof(*lsp));
490           lsp->lst_dev     = linux_fakedev(bsp->st_dev, 0);
491           lsp->lst_ino     = bsp->st_ino;
492           lsp->lst_mode    = (linux_mode_t)bsp->st_mode;
493           if (bsp->st_nlink >= (1 << 15))
494                     lsp->lst_nlink = (1 << 15) - 1;
495           else
496                     lsp->lst_nlink = (linux_nlink_t)bsp->st_nlink;
497           lsp->lst_uid     = bsp->st_uid;
498           lsp->lst_gid     = bsp->st_gid;
499           lsp->lst_rdev    = linux_fakedev(bsp->st_rdev, 1);
500           lsp->lst_size    = bsp->st_size;
501           lsp->lst_blksize = bsp->st_blksize;
502           lsp->lst_blocks  = bsp->st_blocks;
503           lsp->lst_atime   = bsp->st_atime;
504           lsp->lst_mtime   = bsp->st_mtime;
505           lsp->lst_ctime   = bsp->st_ctime;
506 #ifdef LINUX_STAT_HAS_NSEC
507           lsp->lst_atime_nsec   = bsp->st_atimensec;
508           lsp->lst_mtime_nsec   = bsp->st_mtimensec;
509           lsp->lst_ctime_nsec   = bsp->st_ctimensec;
510 #endif
511 }
512 
513 /*
514  * The stat functions below are plain sailing. stat and lstat are handled
515  * by one function to avoid code duplication.
516  */
517 int
linux_sys_fstat(struct lwp * l,const struct linux_sys_fstat_args * uap,register_t * retval)518 linux_sys_fstat(struct lwp *l, const struct linux_sys_fstat_args *uap,
519     register_t *retval)
520 {
521           /* {
522                     syscallarg(int) fd;
523                     syscallarg(linux_stat *) sp;
524           } */
525           struct linux_stat tmplst;
526           struct stat tmpst;
527           int error;
528 
529           error = do_sys_fstat(SCARG(uap, fd), &tmpst);
530           if (error != 0)
531                     return error;
532           bsd_to_linux_stat(&tmpst, &tmplst);
533 
534           return copyout(&tmplst, SCARG(uap, sp), sizeof tmplst);
535 }
536 
537 static int
linux_stat1(const struct linux_sys_stat_args * uap,register_t * retval,int flags)538 linux_stat1(const struct linux_sys_stat_args *uap, register_t *retval,
539     int flags)
540 {
541           struct linux_stat tmplst;
542           struct stat tmpst;
543           int error;
544 
545           error = do_sys_stat(SCARG(uap, path), flags, &tmpst);
546           if (error != 0)
547                     return error;
548 
549           bsd_to_linux_stat(&tmpst, &tmplst);
550 
551           return copyout(&tmplst, SCARG(uap, sp), sizeof tmplst);
552 }
553 
554 int
linux_sys_stat(struct lwp * l,const struct linux_sys_stat_args * uap,register_t * retval)555 linux_sys_stat(struct lwp *l, const struct linux_sys_stat_args *uap,
556     register_t *retval)
557 {
558           /* {
559                     syscallarg(const char *) path;
560                     syscallarg(struct linux_stat *) sp;
561           } */
562 
563           return linux_stat1(uap, retval, FOLLOW);
564 }
565 
566 /* Note: this is "newlstat" in the Linux sources */
567 /*        (we don't bother with the old lstat currently) */
568 int
linux_sys_lstat(struct lwp * l,const struct linux_sys_lstat_args * uap,register_t * retval)569 linux_sys_lstat(struct lwp *l, const struct linux_sys_lstat_args *uap,
570     register_t *retval)
571 {
572           /* {
573                     syscallarg(const char *) path;
574                     syscallarg(struct linux_stat *) sp;
575           } */
576 
577           return linux_stat1((const void *)uap, retval, NOFOLLOW);
578 }
579 #endif /* !__aarch64__ && !__amd64__ */
580 
581 /*
582  * The following syscalls are mostly here because of the alternate path check.
583  */
584 
585 int
linux_sys_linkat(struct lwp * l,const struct linux_sys_linkat_args * uap,register_t * retval)586 linux_sys_linkat(struct lwp *l, const struct linux_sys_linkat_args *uap,
587     register_t *retval)
588 {
589           /* {
590                     syscallarg(int) fd1;
591                     syscallarg(const char *) name1;
592                     syscallarg(int) fd2;
593                     syscallarg(const char *) name2;
594                     syscallarg(int) flags;
595           } */
596           int fd1 = SCARG(uap, fd1);
597           const char *name1 = SCARG(uap, name1);
598           int fd2 = SCARG(uap, fd2);
599           const char *name2 = SCARG(uap, name2);
600           int follow;
601 
602           follow = SCARG(uap, flags) & LINUX_AT_SYMLINK_FOLLOW;
603 
604           return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
605 }
606 
607 static int
linux_unlink_dircheck(const char * path)608 linux_unlink_dircheck(const char *path)
609 {
610           struct nameidata nd;
611           struct pathbuf *pb;
612           int error;
613 
614           /*
615            * Linux returns EISDIR if unlink(2) is called on a directory.
616            * We return EPERM in such cases. To emulate correct behaviour,
617            * check if the path points to directory and return EISDIR if this
618            * is the case.
619            *
620            * XXX this should really not copy in the path buffer twice...
621            */
622           error = pathbuf_copyin(path, &pb);
623           if (error) {
624                     return error;
625           }
626           NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
627           if (namei(&nd) == 0) {
628                     struct stat sb;
629 
630                     if (vn_stat(nd.ni_vp, &sb) == 0
631                         && S_ISDIR(sb.st_mode))
632                               error = EISDIR;
633 
634                     vput(nd.ni_vp);
635           }
636           pathbuf_destroy(pb);
637           return error ? error : EPERM;
638 }
639 
640 int
linux_sys_unlink(struct lwp * l,const struct linux_sys_unlink_args * uap,register_t * retval)641 linux_sys_unlink(struct lwp *l, const struct linux_sys_unlink_args *uap,
642     register_t *retval)
643 {
644           /* {
645                     syscallarg(const char *) path;
646           } */
647           int error;
648 
649           error = sys_unlink(l, (const void *)uap, retval);
650           if (error == EPERM)
651                     error = linux_unlink_dircheck(SCARG(uap, path));
652 
653           return error;
654 }
655 
656 int
linux_sys_unlinkat(struct lwp * l,const struct linux_sys_unlinkat_args * uap,register_t * retval)657 linux_sys_unlinkat(struct lwp *l, const struct linux_sys_unlinkat_args *uap,
658     register_t *retval)
659 {
660           /* {
661                     syscallarg(int) fd;
662                     syscallarg(const char *) path;
663                     syscallarg(int) flag;
664           } */
665           struct sys_unlinkat_args ua;
666           int error;
667 
668           SCARG(&ua, fd) = SCARG(uap, fd);
669           SCARG(&ua, path) = SCARG(uap, path);
670           SCARG(&ua, flag) = linux_to_bsd_atflags(SCARG(uap, flag));
671 
672           error = sys_unlinkat(l, &ua, retval);
673           if (error == EPERM)
674                     error = linux_unlink_dircheck(SCARG(uap, path));
675 
676           return error;
677 }
678 
679 int
linux_sys_mknod(struct lwp * l,const struct linux_sys_mknod_args * uap,register_t * retval)680 linux_sys_mknod(struct lwp *l, const struct linux_sys_mknod_args *uap,
681     register_t *retval)
682 {
683           /* {
684                     syscallarg(const char *) path;
685                     syscallarg(linux_umode_t) mode;
686                     syscallarg(unsigned) dev;
687           } */
688           struct linux_sys_mknodat_args ua;
689 
690           SCARG(&ua, fd) = LINUX_AT_FDCWD;
691           SCARG(&ua, path) = SCARG(uap, path);
692           SCARG(&ua, mode) = SCARG(uap, mode);
693           SCARG(&ua, dev) = SCARG(uap, dev);
694 
695           return linux_sys_mknodat(l, &ua, retval);
696 }
697 
698 int
linux_sys_mknodat(struct lwp * l,const struct linux_sys_mknodat_args * uap,register_t * retval)699 linux_sys_mknodat(struct lwp *l, const struct linux_sys_mknodat_args *uap,
700     register_t *retval)
701 {
702           /* {
703                     syscallarg(int) fd;
704                     syscallarg(const char *) path;
705                     syscallarg(linux_umode_t) mode;
706                     syscallarg(unsigned) dev;
707           } */
708 
709           /*
710            * BSD handles FIFOs separately
711            */
712           if (S_ISFIFO(SCARG(uap, mode))) {
713                     struct sys_mkfifoat_args bma;
714 
715                     SCARG(&bma, fd) = SCARG(uap, fd);
716                     SCARG(&bma, path) = SCARG(uap, path);
717                     SCARG(&bma, mode) = SCARG(uap, mode);
718                     return sys_mkfifoat(l, &bma, retval);
719           } else {
720 
721                     /*
722                      * Linux device numbers uses 8 bits for minor and 8 bits
723                      * for major. Due to how we map our major and minor,
724                      * this just fits into our dev_t. Just mask off the
725                      * upper 16bit to remove any random junk.
726                      */
727 
728                     return do_sys_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
729                         SCARG(uap, mode), SCARG(uap, dev) & 0xffff, UIO_USERSPACE);
730           }
731 }
732 
733 int
linux_sys_fchmodat(struct lwp * l,const struct linux_sys_fchmodat_args * uap,register_t * retval)734 linux_sys_fchmodat(struct lwp *l, const struct linux_sys_fchmodat_args *uap,
735     register_t *retval)
736 {
737           /* {
738                     syscallarg(int) fd;
739                     syscallarg(const char *) path;
740                     syscallarg(linux_umode_t) mode;
741           } */
742 
743           return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
744                                     SCARG(uap, mode), AT_SYMLINK_FOLLOW);
745 }
746 
747 int
linux_sys_fchownat(struct lwp * l,const struct linux_sys_fchownat_args * uap,register_t * retval)748 linux_sys_fchownat(struct lwp *l, const struct linux_sys_fchownat_args *uap,
749     register_t *retval)
750 {
751           /* {
752                     syscallarg(int) fd;
753                     syscallarg(const char *) path;
754                     syscallarg(uid_t) owner;
755                     syscallarg(gid_t) group;
756                     syscallarg(int) flag;
757           } */
758           int flag;
759 
760           flag = linux_to_bsd_atflags(SCARG(uap, flag));
761           return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
762                                     SCARG(uap, owner), SCARG(uap, group), flag);
763 }
764 
765 int
linux_sys_faccessat(struct lwp * l,const struct linux_sys_faccessat_args * uap,register_t * retval)766 linux_sys_faccessat(struct lwp *l, const struct linux_sys_faccessat_args *uap,
767     register_t *retval)
768 {
769           /* {
770                     syscallarg(int) fd;
771                     syscallarg(const char *) path;
772                     syscallarg(int) amode;
773           } */
774 
775           return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
776                SCARG(uap, amode), AT_SYMLINK_FOLLOW);
777 }
778 
779 /*
780  * This is just fsync() for now (just as it is in the Linux kernel)
781  * Note: this is not implemented under Linux on Alpha and Arm
782  *        but should still be defined in our syscalls.master.
783  *        (syscall #148 on the arm)
784  */
785 int
linux_sys_fdatasync(struct lwp * l,const struct linux_sys_fdatasync_args * uap,register_t * retval)786 linux_sys_fdatasync(struct lwp *l, const struct linux_sys_fdatasync_args *uap,
787     register_t *retval)
788 {
789           /* {
790                     syscallarg(int) fd;
791           } */
792 
793           return sys_fsync(l, (const void *)uap, retval);
794 }
795 
796 /*
797  * pread(2).
798  */
799 int
linux_sys_pread(struct lwp * l,const struct linux_sys_pread_args * uap,register_t * retval)800 linux_sys_pread(struct lwp *l, const struct linux_sys_pread_args *uap,
801     register_t *retval)
802 {
803           /* {
804                     syscallarg(int) fd;
805                     syscallarg(void *) buf;
806                     syscallarg(size_t) nbyte;
807                     syscallarg(off_t) offset;
808           } */
809           struct sys_pread_args pra;
810 
811           SCARG(&pra, fd) = SCARG(uap, fd);
812           SCARG(&pra, buf) = SCARG(uap, buf);
813           SCARG(&pra, nbyte) = SCARG(uap, nbyte);
814           SCARG(&pra, PAD) = 0;
815           SCARG(&pra, offset) = SCARG(uap, offset);
816 
817           return sys_pread(l, &pra, retval);
818 }
819 
820 /*
821  * pwrite(2).
822  */
823 int
linux_sys_pwrite(struct lwp * l,const struct linux_sys_pwrite_args * uap,register_t * retval)824 linux_sys_pwrite(struct lwp *l, const struct linux_sys_pwrite_args *uap,
825     register_t *retval)
826 {
827           /* {
828                     syscallarg(int) fd;
829                     syscallarg(void *) buf;
830                     syscallarg(size_t) nbyte;
831                     syscallarg(off_t) offset;
832           } */
833           struct sys_pwrite_args pra;
834 
835           SCARG(&pra, fd) = SCARG(uap, fd);
836           SCARG(&pra, buf) = SCARG(uap, buf);
837           SCARG(&pra, nbyte) = SCARG(uap, nbyte);
838           SCARG(&pra, PAD) = 0;
839           SCARG(&pra, offset) = SCARG(uap, offset);
840 
841           return sys_pwrite(l, &pra, retval);
842 }
843 
844 /*
845  * preadv(2)
846  */
847 int
linux_sys_preadv(struct lwp * l,const struct linux_sys_preadv_args * uap,register_t * retval)848 linux_sys_preadv(struct lwp *l, const struct linux_sys_preadv_args *uap,
849     register_t *retval)
850 {
851           /* {
852                     syscallarg(int) fd;
853                     syscallarg(const struct iovec *) iovp;
854                     syscallarg(int) iovcnt;
855                     syscallarg(unsigned long) off_lo;
856                     syscallarg(unsigned long) off_hi;
857           } */
858           struct sys_preadv_args ua;
859 
860           SCARG(&ua, fd) = SCARG(uap, fd);
861           SCARG(&ua, iovp) = SCARG(uap, iovp);
862           SCARG(&ua, iovcnt) = SCARG(uap, iovcnt);
863           SCARG(&ua, PAD) = 0;
864           SCARG(&ua, offset) = linux_hilo_to_off_t(SCARG(uap, off_hi),
865                                                              SCARG(uap, off_lo));
866           return sys_preadv(l, &ua, retval);
867 }
868 
869 /*
870  * pwritev(2)
871  */
872 int
linux_sys_pwritev(struct lwp * l,const struct linux_sys_pwritev_args * uap,register_t * retval)873 linux_sys_pwritev(struct lwp *l, const struct linux_sys_pwritev_args *uap,
874     register_t *retval)
875 {
876           /* {
877                     syscallarg(int) fd;
878                     syscallarg(const struct iovec *) iovp;
879                     syscallarg(int) iovcnt;
880                     syscallarg(unsigned long) off_lo;
881                     syscallarg(unsigned long) off_hi;
882           } */
883           struct sys_pwritev_args ua;
884 
885           SCARG(&ua, fd) = SCARG(uap, fd);
886           SCARG(&ua, iovp) = (const void *)SCARG(uap, iovp);
887           SCARG(&ua, iovcnt) = SCARG(uap, iovcnt);
888           SCARG(&ua, PAD) = 0;
889           SCARG(&ua, offset) = linux_hilo_to_off_t(SCARG(uap, off_hi),
890                                                              SCARG(uap, off_lo));
891           return sys_pwritev(l, &ua, retval);
892 }
893 
894 int
linux_sys_dup3(struct lwp * l,const struct linux_sys_dup3_args * uap,register_t * retval)895 linux_sys_dup3(struct lwp *l, const struct linux_sys_dup3_args *uap,
896     register_t *retval)
897 {
898           /* {
899                     syscallarg(int) from;
900                     syscallarg(int) to;
901                     syscallarg(int) flags;
902           } */
903           int flags;
904 
905           flags = linux_to_bsd_ioflags(SCARG(uap, flags));
906           if ((flags & ~O_CLOEXEC) != 0)
907                     return EINVAL;
908 
909           if (SCARG(uap, from) == SCARG(uap, to))
910                     return EINVAL;
911 
912           return dodup(l, SCARG(uap, from), SCARG(uap, to), flags, retval);
913 }
914 
915 int
linux_to_bsd_atflags(int lflags)916 linux_to_bsd_atflags(int lflags)
917 {
918           int bflags = 0;
919 
920           if (lflags & LINUX_AT_SYMLINK_NOFOLLOW)
921                     bflags |= AT_SYMLINK_NOFOLLOW;
922           if (lflags & LINUX_AT_REMOVEDIR)
923                     bflags |= AT_REMOVEDIR;
924           if (lflags & LINUX_AT_SYMLINK_FOLLOW)
925                     bflags |= AT_SYMLINK_FOLLOW;
926 
927           return bflags;
928 }
929 
930 int
linux_sys_faccessat2(lwp_t * l,const struct linux_sys_faccessat2_args * uap,register_t * retval)931 linux_sys_faccessat2(lwp_t *l, const struct linux_sys_faccessat2_args *uap,
932     register_t *retval)
933 {
934           /* {
935                     syscallarg(int) fd;
936                     syscallarg(const char *) path;
937                     syscallarg(int) amode;
938                     syscallarg(int) flags;
939           } */
940           int flag = linux_to_bsd_atflags(SCARG(uap, flags));
941           int mode = SCARG(uap, amode);
942           int fd = SCARG(uap, fd);
943           const char *path = SCARG(uap, path);
944 
945           return do_sys_accessat(l, fd, path, mode, flag);
946 }
947 
948 int
linux_sys_sync_file_range(lwp_t * l,const struct linux_sys_sync_file_range_args * uap,register_t * retval)949 linux_sys_sync_file_range(lwp_t *l,
950     const struct linux_sys_sync_file_range_args *uap, register_t *retval)
951 {
952           /* {
953                     syscallarg(int) fd;
954                     syscallarg(off_t) offset;
955                     syscallarg(off_t) nbytes;
956                     syscallarg(unsigned int) flags;
957           } */
958 
959           struct sys_fsync_range_args ua;
960 
961           if (SCARG(uap, offset) < 0 || SCARG(uap, nbytes) < 0 ||
962               ((SCARG(uap, flags) & ~LINUX_SYNC_FILE_RANGE_ALL) != 0))
963                     return EINVAL;
964 
965           /* Fill ua with uap */
966           SCARG(&ua, fd) = SCARG(uap, fd);
967           SCARG(&ua, flags) = SCARG(uap, flags);
968 
969           /* Round down offset to page boundary */
970           SCARG(&ua, start) = rounddown(SCARG(uap, offset), PAGE_SIZE);
971           SCARG(&ua, length) = SCARG(uap, nbytes);
972           if (SCARG(&ua, length) != 0) {
973                     /* Round up length to nbytes+offset to page boundary */
974                     SCARG(&ua, length) = roundup(SCARG(uap, nbytes)
975                         + SCARG(uap, offset) - SCARG(&ua, start), PAGE_SIZE);
976           }
977 
978           return sys_fsync_range(l, &ua, retval);
979 }
980 
981 int
linux_sys_syncfs(lwp_t * l,const struct linux_sys_syncfs_args * uap,register_t * retval)982 linux_sys_syncfs(lwp_t *l, const struct linux_sys_syncfs_args *uap,
983     register_t *retval)
984 {
985           /* {
986                     syscallarg(int) fd;
987           } */
988 
989           struct mount *mp;
990           struct vnode *vp;
991           file_t *fp;
992           int error, fd;
993           fd = SCARG(uap, fd);
994 
995           /* Get file pointer */
996           if ((error = fd_getvnode(fd, &fp)) != 0)
997                     return error;
998 
999           /* Get vnode and mount point */
1000           vp = fp->f_vnode;
1001           mp = vp->v_mount;
1002 
1003           mutex_enter(mp->mnt_updating);
1004           if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1005                     int asyncflag = mp->mnt_flag & MNT_ASYNC;
1006                     mp->mnt_flag &= ~MNT_ASYNC;
1007                     VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
1008                     if (asyncflag)
1009                               mp->mnt_flag |= MNT_ASYNC;
1010           }
1011           mutex_exit(mp->mnt_updating);
1012 
1013           /* Cleanup vnode and file pointer */
1014           vrele(vp);
1015           fd_putfile(fd);
1016           return 0;
1017 
1018 }
1019 
1020 int
linux_sys_renameat2(struct lwp * l,const struct linux_sys_renameat2_args * uap,register_t * retval)1021 linux_sys_renameat2(struct lwp *l, const struct linux_sys_renameat2_args *uap,
1022     register_t *retval)
1023 {
1024           /* {
1025                     syscallarg(int) fromfd;
1026                     syscallarg(const char *) from;
1027                     syscallarg(int) tofd;
1028                     syscallarg(const char *) to;
1029                     syscallarg(unsigned int) flags;
1030           } */
1031 
1032           struct sys_renameat_args ua;
1033           SCARG(&ua, fromfd) = SCARG(uap, fromfd);
1034           SCARG(&ua, from) = SCARG(uap, from);
1035           SCARG(&ua, tofd) = SCARG(uap, tofd);
1036           SCARG(&ua, to) = SCARG(uap, to);
1037 
1038           unsigned int flags = SCARG(uap, flags);
1039           int error;
1040 
1041           if (flags != 0) {
1042                     if (flags & ~LINUX_RENAME_ALL)
1043                               return EINVAL;
1044                     if ((flags & LINUX_RENAME_EXCHANGE) != 0 &&
1045                         (flags & (LINUX_RENAME_NOREPLACE | LINUX_RENAME_WHITEOUT))
1046                         != 0)
1047                               return EINVAL;
1048                     /*
1049                      * Suppoting renameat2 flags without support from file systems
1050                      * becomes a messy affair cause of locks and how VOP_RENAME
1051                      * protocol is implemented. So, return EOPNOTSUPP for now.
1052                      */
1053                     return EOPNOTSUPP;
1054           }
1055 
1056           error = sys_renameat(l, &ua, retval);
1057           return error;
1058 }
1059 
1060 int
linux_sys_copy_file_range(lwp_t * l,const struct linux_sys_copy_file_range_args * uap,register_t * retval)1061 linux_sys_copy_file_range(lwp_t *l,
1062     const struct linux_sys_copy_file_range_args *uap, register_t *retval)
1063 {
1064           /* {
1065                     syscallarg(int) fd_in;
1066                     syscallarg(unsigned long) off_in;
1067                     syscallarg(int) fd_out;
1068                     syscallarg(unsigned long) off_out;
1069                     syscallarg(size_t) len;
1070                     syscallarg(unsigned int) flags;
1071           } */
1072           const off_t OFF_MAX = __type_max(off_t);
1073           int fd_in, fd_out;
1074           file_t *fp_in, *fp_out;
1075           struct vnode *invp, *outvp;
1076           off_t off_in = 0, off_out = 0;
1077           struct vattr vattr_in, vattr_out;
1078           ssize_t total_copied = 0;
1079           size_t bytes_left, to_copy;
1080           bool have_off_in = false, have_off_out = false;
1081           int error = 0;
1082           size_t len = SCARG(uap, len);
1083           unsigned int flags = SCARG(uap, flags);
1084           /* Structures for actual copy */
1085           char *buffer = NULL;
1086           struct uio auio;
1087           struct iovec aiov;
1088 
1089           if (len > SSIZE_MAX) {
1090                     DPRINTF("%s: len is greater than SSIZE_MAX\n",
1091                         __func__);
1092                     return EOVERFLOW;
1093           }
1094 
1095           if (flags != 0) {
1096                     DPRINTF("%s: unsupported flags %#x\n", __func__, flags);
1097                     return EINVAL;
1098           }
1099 
1100           fd_in = SCARG(uap, fd_in);
1101           fd_out = SCARG(uap, fd_out);
1102           error = fd_getvnode(fd_in, &fp_in);
1103           if (error) {
1104                     return error;
1105           }
1106 
1107           error = fd_getvnode(fd_out, &fp_out);
1108           if (error) {
1109                     fd_putfile(fd_in);
1110                     return error;
1111           }
1112 
1113           invp = fp_in->f_vnode;
1114           outvp = fp_out->f_vnode;
1115 
1116           /* Get attributes of input and output files */
1117           VOP_GETATTR(invp, &vattr_in, l->l_cred);
1118           VOP_GETATTR(outvp, &vattr_out, l->l_cred);
1119 
1120           /* Check if input and output files are regular files */
1121           if (vattr_in.va_type == VDIR || vattr_out.va_type == VDIR) {
1122                     error = EISDIR;
1123                     DPRINTF("%s: Input or output is a directory\n", __func__);
1124                     goto out;
1125           }
1126           if ((SCARG(uap, off_in) != NULL && *SCARG(uap, off_in) < 0) ||
1127               (SCARG(uap, off_out) != NULL && *SCARG(uap, off_out) < 0) ||
1128               vattr_in.va_type != VREG || vattr_out.va_type != VREG) {
1129                     error = EINVAL;
1130                     DPRINTF("%s: Invalid offset or file type\n", __func__);
1131                     goto out;
1132           }
1133 
1134           if ((fp_in->f_flag & FREAD) == 0 ||
1135               (fp_out->f_flag & FWRITE) == 0 ||
1136               (fp_out->f_flag & FAPPEND) != 0) {
1137                     DPRINTF("%s: input file can't be read or output file "
1138                         "can't be written\n", __func__);
1139                     error = EBADF;
1140                     goto out;
1141           }
1142           /* Retrieve and validate offsets if provided */
1143           if (SCARG(uap, off_in) != NULL) {
1144                     error = copyin(SCARG(uap, off_in), &off_in, sizeof(off_in));
1145                     if (error) {
1146                               goto out;
1147                     }
1148                     have_off_in = true;
1149           }
1150 
1151           if (SCARG(uap, off_out) != NULL) {
1152                     error = copyin(SCARG(uap, off_out), &off_out, sizeof(off_out));
1153                     if (error) {
1154                               goto out;
1155                     }
1156                     have_off_out = true;
1157           }
1158 
1159           if (off_out < 0 || len > OFF_MAX - off_out ||
1160               off_in < 0 || len > OFF_MAX - off_in) {
1161                     DPRINTF("%s: New size is greater than OFF_MAX\n", __func__);
1162                     error = EFBIG;
1163                     goto out;
1164           }
1165 
1166           /* Identify overlapping ranges */
1167           if ((invp == outvp) &&
1168               ((off_in <= off_out && off_in + (off_t)len > off_out) ||
1169                     (off_in > off_out && off_out + (off_t)len > off_in))) {
1170                     DPRINTF("%s: Ranges overlap\n", __func__);
1171                     error = EINVAL;
1172                     goto out;
1173           }
1174 
1175           buffer = kmem_alloc(LINUX_COPY_FILE_RANGE_MAX_CHUNK, KM_SLEEP);
1176 
1177           bytes_left = len;
1178 
1179           while (bytes_left > 0) {
1180                     to_copy = MIN(bytes_left, LINUX_COPY_FILE_RANGE_MAX_CHUNK);
1181 
1182                     /* Lock the input vnode for reading */
1183                     vn_lock(fp_in->f_vnode, LK_SHARED | LK_RETRY);
1184                     /* Set up iovec and uio for reading */
1185                     aiov.iov_base = buffer;
1186                     aiov.iov_len = to_copy;
1187                     auio.uio_iov = &aiov;
1188                     auio.uio_iovcnt = 1;
1189                     auio.uio_offset = have_off_in ? off_in : fp_in->f_offset;
1190                     auio.uio_resid = to_copy;
1191                     auio.uio_rw = UIO_READ;
1192                     auio.uio_vmspace = l->l_proc->p_vmspace;
1193                     UIO_SETUP_SYSSPACE(&auio);
1194 
1195                     /* Perform read using vn_read */
1196                     error = VOP_READ(fp_in->f_vnode, &auio, 0, l->l_cred);
1197                     VOP_UNLOCK(fp_in->f_vnode);
1198                     if (error) {
1199                               DPRINTF("%s: Read error %d\n", __func__, error);
1200                               break;
1201                     }
1202 
1203                     size_t read_bytes = to_copy - auio.uio_resid;
1204                     if (read_bytes == 0) {
1205                               /* EOF reached */
1206                               break;
1207                     }
1208 
1209                     /* Lock the output vnode for writing */
1210                     vn_lock(fp_out->f_vnode, LK_EXCLUSIVE | LK_RETRY);
1211                     /* Set up iovec and uio for writing */
1212                     aiov.iov_base = buffer;
1213                     aiov.iov_len = read_bytes;
1214                     auio.uio_iov = &aiov;
1215                     auio.uio_iovcnt = 1;
1216                     auio.uio_offset = have_off_out ? off_out : fp_out->f_offset;
1217                     auio.uio_resid = read_bytes;
1218                     auio.uio_rw = UIO_WRITE;
1219                     auio.uio_vmspace = l->l_proc->p_vmspace;
1220                     UIO_SETUP_SYSSPACE(&auio);
1221 
1222                     /* Perform the write */
1223                     error = VOP_WRITE(fp_out->f_vnode, &auio, 0, l->l_cred);
1224                     VOP_UNLOCK(fp_out->f_vnode);
1225                     if (error) {
1226                               DPRINTF("%s: Write error %d\n", __func__, error);
1227                               break;
1228                     }
1229                     size_t written_bytes = read_bytes - auio.uio_resid;
1230                     total_copied += written_bytes;
1231                     bytes_left -= written_bytes;
1232 
1233                     /* Update offsets if provided */
1234                     if (have_off_in) {
1235                               off_in += written_bytes;
1236                     } else {
1237                               fp_in->f_offset += written_bytes;
1238                     }
1239                     if (have_off_out) {
1240                               off_out += written_bytes;
1241                     } else {
1242                               fp_out->f_offset += written_bytes;
1243                     }
1244           }
1245 
1246           if (have_off_in) {
1247                     /* Adjust user space offset */
1248                     error = copyout(&off_in, SCARG(uap, off_in), sizeof(off_t));
1249                     if (error) {
1250                               DPRINTF("%s: Error adjusting user space offset\n",
1251                                   __func__);
1252                     }
1253                     goto out;
1254           }
1255 
1256           if (have_off_out) {
1257                     /* Adjust user space offset */
1258                     error = copyout(&off_out, SCARG(uap, off_out), sizeof(off_t));
1259                     if (error) {
1260                               DPRINTF("%s: Error adjusting user space offset\n",
1261                                   __func__);
1262                     }
1263           }
1264 
1265           *retval = total_copied;
1266 out:
1267           if (buffer) {
1268                     kmem_free(buffer, LINUX_COPY_FILE_RANGE_MAX_CHUNK);
1269           }
1270           if (fp_out) {
1271                     fd_putfile(fd_out);
1272           }
1273           if (fp_in) {
1274                     fd_putfile(fd_in);
1275           }
1276           return error;
1277 }
1278 
/*
 * Define a syscall handler named `fun', taking the usual
 * (lwp, fun_args, retval) triple, that ignores its arguments and
 * fails with EOPNOTSUPP.
 */
#define LINUX_NOT_SUPPORTED(fun) \
int \
fun(struct lwp *l, const struct fun##_args *uap, register_t *retval) \
{ \
          return EOPNOTSUPP; \
}

/* Extended attributes: set */
LINUX_NOT_SUPPORTED(linux_sys_setxattr)
LINUX_NOT_SUPPORTED(linux_sys_lsetxattr)
LINUX_NOT_SUPPORTED(linux_sys_fsetxattr)

/* Extended attributes: get */
LINUX_NOT_SUPPORTED(linux_sys_getxattr)
LINUX_NOT_SUPPORTED(linux_sys_lgetxattr)
LINUX_NOT_SUPPORTED(linux_sys_fgetxattr)

/* Extended attributes: list */
LINUX_NOT_SUPPORTED(linux_sys_listxattr)
LINUX_NOT_SUPPORTED(linux_sys_llistxattr)
LINUX_NOT_SUPPORTED(linux_sys_flistxattr)

/* Extended attributes: remove */
LINUX_NOT_SUPPORTED(linux_sys_removexattr)
LINUX_NOT_SUPPORTED(linux_sys_lremovexattr)
LINUX_NOT_SUPPORTED(linux_sys_fremovexattr)

/*
 * For now just return EOPNOTSUPP; this makes glibc's posix_fallocate()
 * fall back to its emulation.
 * XXX Right now no file system actually implements fallocate support,
 * so there is no need for a mapping.
 */
LINUX_NOT_SUPPORTED(linux_sys_fallocate)
1309