1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  */
27 
28 /* Portions Copyright 2007 Jeremy Teo */
29 /* Portions Copyright 2010 Robert Milkowski */
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/vfs.h>
38 #include <sys/vm.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/stat.h>
42 #include <sys/kmem.h>
43 #include <sys/taskq.h>
44 #include <sys/uio.h>
45 #include <sys/atomic.h>
46 #include <sys/namei.h>
47 #include <sys/mman.h>
48 #include <sys/cmn_err.h>
49 #include <sys/errno.h>
50 #include <sys/unistd.h>
51 #include <sys/zfs_dir.h>
52 #include <sys/zfs_ioctl.h>
53 #include <sys/fs/zfs.h>
54 #include <sys/dmu.h>
55 #include <sys/dmu_objset.h>
56 #include <sys/spa.h>
57 #include <sys/txg.h>
58 #include <sys/dbuf.h>
59 #include <sys/zap.h>
60 #include <sys/sa.h>
61 #include <sys/dirent.h>
62 #include <sys/policy.h>
63 #include <sys/sunddi.h>
64 #include <sys/filio.h>
65 #include <sys/sid.h>
66 #include <sys/zfs_ctldir.h>
67 #include <sys/zfs_fuid.h>
68 #include <sys/zfs_sa.h>
69 #include <sys/dnlc.h>
70 #include <sys/zfs_rlock.h>
71 #include <sys/buf.h>
72 #include <sys/sched.h>
73 #include <sys/acl.h>
74 #include <sys/extdirent.h>
75 
76 #ifdef __FreeBSD__
77 #include <sys/kidmap.h>
78 #include <sys/bio.h>
79 #include <vm/vm_param.h>
80 #endif
81 
82 #ifdef __NetBSD__
83 #include <dev/mm.h>
84 #include <miscfs/fifofs/fifo.h>
85 #include <miscfs/genfs/genfs.h>
86 #include <miscfs/genfs/genfs_node.h>
87 #include <uvm/uvm_extern.h>
88 #include <sys/fstrans.h>
89 #include <sys/malloc.h>
90 
91 uint_t zfs_putpage_key;
92 #endif
93 
94 /*
95  * Programming rules.
96  *
97  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
98  * properly lock its in-core state, create a DMU transaction, do the work,
99  * record this work in the intent log (ZIL), commit the DMU transaction,
100  * and wait for the intent log to commit if it is a synchronous operation.
101  * Moreover, the vnode ops must work in both normal and log replay context.
102  * The ordering of events is important to avoid deadlocks and references
103  * to freed memory.  The example below illustrates the following Big Rules:
104  *
105  *  (1)   A check must be made in each zfs thread for a mounted file system.
106  *        This is done avoiding races using ZFS_ENTER(zfsvfs).
107  *        A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
108  *        must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
109  *        can return EIO from the calling function.
110  *
111  *  (2)   VN_RELE() should always be the last thing except for zil_commit()
112  *        (if necessary) and ZFS_EXIT(). This is for 3 reasons:
113  *        First, if it's the last reference, the vnode/znode
114  *        can be freed, so the zp may point to freed memory.  Second, the last
115  *        reference will call zfs_zinactive(), which may induce a lot of work --
116  *        pushing cached pages (which acquires range locks) and syncing out
117  *        cached atime changes.  Third, zfs_zinactive() may require a new tx,
118  *        which could deadlock the system if you were already holding one.
119  *        If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
120  *
121  *  (3)   All range locks must be grabbed before calling dmu_tx_assign(),
122  *        as they can span dmu_tx_assign() calls.
123  *
124  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
125  *      dmu_tx_assign().  This is critical because we don't want to block
126  *      while holding locks.
127  *
128  *        If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
129  *        reduces lock contention and CPU usage when we must wait (note that if
130  *        throughput is constrained by the storage, nearly every transaction
131  *        must wait).
132  *
133  *      Note, in particular, that if a lock is sometimes acquired before
134  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
135  *      to use a non-blocking assign can deadlock the system.  The scenario:
136  *
137  *        Thread A has grabbed a lock before calling dmu_tx_assign().
138  *        Thread B is in an already-assigned tx, and blocks for this lock.
139  *        Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
140  *        forever, because the previous txg can't quiesce until B's tx commits.
141  *
142  *        If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
143  *        then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
144  *        calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
145  *        to indicate that this operation has already called dmu_tx_wait().
146  *        This will ensure that we don't retry forever, waiting a short bit
147  *        each time.
148  *
149  *  (5)   If the operation succeeded, generate the intent log entry for it
150  *        before dropping locks.  This ensures that the ordering of events
151  *        in the intent log matches the order in which they actually occurred.
152  *        During ZIL replay the zfs_log_* functions will update the sequence
153  *        number to indicate the zil transaction has replayed.
154  *
155  *  (6)   At the end of each vnode op, the DMU tx must always commit,
156  *        regardless of whether there were any errors.
157  *
158  *  (7)   After dropping all locks, invoke zil_commit(zilog, foid)
159  *        to ensure that synchronous semantics are provided when necessary.
160  *
161  * In general, this is how things should be ordered in each vnode op:
162  *
163  *        ZFS_ENTER(zfsvfs);            // exit if unmounted
164  * top:
165  *        zfs_dirent_lookup(&dl, ...)   // lock directory entry (may VN_HOLD())
166  *        rw_enter(...);                          // grab any other locks you need
167  *        tx = dmu_tx_create(...);      // get DMU tx
168  *        dmu_tx_hold_*();              // hold each object you might modify
169  *        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
170  *        if (error) {
171  *                  rw_exit(...);                 // drop locks
172  *                  zfs_dirent_unlock(dl);        // unlock directory entry
173  *                  VN_RELE(...);                 // release held vnodes
174  *                  if (error == ERESTART) {
175  *                            waited = B_TRUE;
176  *                            dmu_tx_wait(tx);
177  *                            dmu_tx_abort(tx);
178  *                            goto top;
179  *                  }
180  *                  dmu_tx_abort(tx);   // abort DMU tx
181  *                  ZFS_EXIT(zfsvfs);   // finished in zfs
182  *                  return (error);               // really out of space
183  *        }
184  *        error = do_real_work();                 // do whatever this VOP does
185  *        if (error == 0)
186  *                  zfs_log_*(...);               // on success, make ZIL entry
187  *        dmu_tx_commit(tx);            // commit DMU tx -- error or not
188  *        rw_exit(...);                           // drop locks
189  *        zfs_dirent_unlock(dl);                  // unlock directory entry
190  *        VN_RELE(...);                           // release held vnodes
191  *        zil_commit(zilog, foid);      // synchronous when necessary
192  *        ZFS_EXIT(zfsvfs);             // finished in zfs
193  *        return (error);                         // done, report error
194  */
195 
196 /* ARGSUSED */
197 static int
zfs_open(vnode_t ** vpp,int flag,cred_t * cr,caller_context_t * ct)198 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
199 {
200           znode_t   *zp = VTOZ(*vpp);
201           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
202 
203           ZFS_ENTER(zfsvfs);
204           ZFS_VERIFY_ZP(zp);
205 
206           if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
207               ((flag & FAPPEND) == 0)) {
208                     ZFS_EXIT(zfsvfs);
209                     return (SET_ERROR(EPERM));
210           }
211 
212           if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
213               ZTOV(zp)->v_type == VREG &&
214               !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
215                     if (fs_vscan(*vpp, cr, 0) != 0) {
216                               ZFS_EXIT(zfsvfs);
217                               return (SET_ERROR(EACCES));
218                     }
219           }
220 
221           /* Keep a count of the synchronous opens in the znode */
222           if (flag & (FSYNC | FDSYNC))
223                     atomic_inc_32(&zp->z_sync_cnt);
224 
225           ZFS_EXIT(zfsvfs);
226           return (0);
227 }
228 
229 /* ARGSUSED */
230 static int
zfs_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr,caller_context_t * ct)231 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
232     caller_context_t *ct)
233 {
234           znode_t   *zp = VTOZ(vp);
235           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
236 
237           /*
238            * Clean up any locks held by this process on the vp.
239            */
240           cleanlocks(vp, ddi_get_pid(), 0);
241           cleanshares(vp, ddi_get_pid());
242 
243           ZFS_ENTER(zfsvfs);
244           ZFS_VERIFY_ZP(zp);
245 
246           /* Decrement the synchronous opens in the znode */
247           if ((flag & (FSYNC | FDSYNC)) && (count == 1))
248                     atomic_dec_32(&zp->z_sync_cnt);
249 
250           if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
251               ZTOV(zp)->v_type == VREG &&
252               !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
253                     VERIFY(fs_vscan(vp, cr, 1) == 0);
254 
255           ZFS_EXIT(zfsvfs);
256           return (0);
257 }
258 
259 /*
260  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
261  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
262  */
263 static int
zfs_holey(vnode_t * vp,u_long cmd,offset_t * off)264 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
265 {
266           znode_t   *zp = VTOZ(vp);
267           uint64_t noff = (uint64_t)*off; /* new offset */
268           uint64_t file_sz;
269           int error;
270           boolean_t hole;
271 
272           file_sz = zp->z_size;
273           if (noff >= file_sz)  {
274                     return (SET_ERROR(ENXIO));
275           }
276 
277           if (cmd == _FIO_SEEK_HOLE)
278                     hole = B_TRUE;
279           else
280                     hole = B_FALSE;
281 
282           error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
283 
284           if (error == ESRCH)
285                     return (SET_ERROR(ENXIO));
286 
287           /*
288            * We could find a hole that begins after the logical end-of-file,
289            * because dmu_offset_next() only works on whole blocks.  If the
290            * EOF falls mid-block, then indicate that the "virtual hole"
291            * at the end of the file begins at the logical EOF, rather than
292            * at the end of the last block.
293            */
294           if (noff > file_sz) {
295                     ASSERT(hole);
296                     noff = file_sz;
297           }
298 
299           if (noff < *off)
300                     return (error);
301           *off = noff;
302           return (error);
303 }
304 
305 /* ARGSUSED */
306 static int
zfs_ioctl(vnode_t * vp,u_long com,intptr_t data,int flag,cred_t * cred,int * rvalp,caller_context_t * ct)307 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
308     int *rvalp, caller_context_t *ct)
309 {
310           offset_t off;
311           offset_t ndata;
312           dmu_object_info_t doi;
313           int error;
314           zfsvfs_t *zfsvfs;
315           znode_t *zp;
316 
317           switch (com) {
318           case _FIOFFS:
319           {
320                     return (0);
321 
322                     /*
323                      * The following two ioctls are used by bfu.  Faking out,
324                      * necessary to avoid bfu errors.
325                      */
326           }
327           case _FIOGDIO:
328           case _FIOSDIO:
329           {
330                     return (0);
331           }
332 
333           case _FIO_SEEK_DATA:
334           case _FIO_SEEK_HOLE:
335           {
336 #ifdef illumos
337                     if (ddi_copyin((void *)data, &off, sizeof (off), flag))
338                               return (SET_ERROR(EFAULT));
339 #else
340                     off = *(offset_t *)data;
341 #endif
342                     zp = VTOZ(vp);
343                     zfsvfs = zp->z_zfsvfs;
344                     ZFS_ENTER(zfsvfs);
345                     ZFS_VERIFY_ZP(zp);
346 
347                     /* offset parameter is in/out */
348                     error = zfs_holey(vp, com, &off);
349                     ZFS_EXIT(zfsvfs);
350                     if (error)
351                               return (error);
352 #ifdef illumos
353                     if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
354                               return (SET_ERROR(EFAULT));
355 #else
356                     *(offset_t *)data = off;
357 #endif
358                     return (0);
359           }
360 #ifdef illumos
361           case _FIO_COUNT_FILLED:
362           {
363                     /*
364                      * _FIO_COUNT_FILLED adds a new ioctl command which
365                      * exposes the number of filled blocks in a
366                      * ZFS object.
367                      */
368                     zp = VTOZ(vp);
369                     zfsvfs = zp->z_zfsvfs;
370                     ZFS_ENTER(zfsvfs);
371                     ZFS_VERIFY_ZP(zp);
372 
373                     /*
374                      * Wait for all dirty blocks for this object
375                      * to get synced out to disk, and the DMU info
376                      * updated.
377                      */
378                     error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
379                     if (error) {
380                               ZFS_EXIT(zfsvfs);
381                               return (error);
382                     }
383 
384                     /*
385                      * Retrieve fill count from DMU object.
386                      */
387                     error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
388                     if (error) {
389                               ZFS_EXIT(zfsvfs);
390                               return (error);
391                     }
392 
393                     ndata = doi.doi_fill_count;
394 
395                     ZFS_EXIT(zfsvfs);
396                     if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
397                               return (SET_ERROR(EFAULT));
398                     return (0);
399           }
400 #endif
401           }
402           return (SET_ERROR(ENOTTY));
403 }
404 
405 #ifdef __FreeBSD__
406 static vm_page_t
page_busy(vnode_t * vp,int64_t start,int64_t off,int64_t nbytes)407 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
408 {
409           vm_object_t obj;
410           vm_page_t pp;
411           int64_t end;
412 
413           /*
414            * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
415            * aligned boundaries, if the range is not aligned.  As a result a
416            * DEV_BSIZE subrange with partially dirty data may get marked as clean.
417            * It may happen that all DEV_BSIZE subranges are marked clean and thus
418            * the whole page would be considred clean despite have some dirty data.
419            * For this reason we should shrink the range to DEV_BSIZE aligned
420            * boundaries before calling vm_page_clear_dirty.
421            */
422           end = rounddown2(off + nbytes, DEV_BSIZE);
423           off = roundup2(off, DEV_BSIZE);
424           nbytes = end - off;
425 
426           obj = vp->v_object;
427           zfs_vmobject_assert_wlocked(obj);
428 
429           for (;;) {
430                     if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
431                         pp->valid) {
432                               if (vm_page_xbusied(pp)) {
433                                         /*
434                                          * Reference the page before unlocking and
435                                          * sleeping so that the page daemon is less
436                                          * likely to reclaim it.
437                                          */
438                                         vm_page_reference(pp);
439                                         vm_page_lock(pp);
440                                         zfs_vmobject_wunlock(obj);
441                                         vm_page_busy_sleep(pp, "zfsmwb", true);
442                                         zfs_vmobject_wlock(obj);
443                                         continue;
444                               }
445                               vm_page_sbusy(pp);
446                     } else if (pp != NULL) {
447                               ASSERT(!pp->valid);
448                               pp = NULL;
449                     }
450 
451                     if (pp != NULL) {
452                               ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
453                               vm_object_pip_add(obj, 1);
454                               pmap_remove_write(pp);
455                               if (nbytes != 0)
456                                         vm_page_clear_dirty(pp, off, nbytes);
457                     }
458                     break;
459           }
460           return (pp);
461 }
462 
463 static void
page_unbusy(vm_page_t pp)464 page_unbusy(vm_page_t pp)
465 {
466 
467           vm_page_sunbusy(pp);
468           vm_object_pip_subtract(pp->object, 1);
469 }
470 
471 static vm_page_t
page_hold(vnode_t * vp,int64_t start)472 page_hold(vnode_t *vp, int64_t start)
473 {
474           vm_object_t obj;
475           vm_page_t pp;
476 
477           obj = vp->v_object;
478           zfs_vmobject_assert_wlocked(obj);
479 
480           for (;;) {
481                     if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
482                         pp->valid) {
483                               if (vm_page_xbusied(pp)) {
484                                         /*
485                                          * Reference the page before unlocking and
486                                          * sleeping so that the page daemon is less
487                                          * likely to reclaim it.
488                                          */
489                                         vm_page_reference(pp);
490                                         vm_page_lock(pp);
491                                         zfs_vmobject_wunlock(obj);
492                                         vm_page_busy_sleep(pp, "zfsmwb", true);
493                                         zfs_vmobject_wlock(obj);
494                                         continue;
495                               }
496 
497                               ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
498                               vm_page_lock(pp);
499                               vm_page_hold(pp);
500                               vm_page_unlock(pp);
501 
502                     } else
503                               pp = NULL;
504                     break;
505           }
506           return (pp);
507 }
508 
509 static void
page_unhold(vm_page_t pp)510 page_unhold(vm_page_t pp)
511 {
512 
513           vm_page_lock(pp);
514           vm_page_unhold(pp);
515           vm_page_unlock(pp);
516 }
517 
518 /*
519  * When a file is memory mapped, we must keep the IO data synchronized
520  * between the DMU cache and the memory mapped pages.  What this means:
521  *
522  * On Write:        If we find a memory mapped page, we write to *both*
523  *                  the page and the dmu buffer.
524  */
525 static void
update_pages(vnode_t * vp,int64_t start,int len,objset_t * os,uint64_t oid,int segflg,dmu_tx_t * tx)526 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
527     int segflg, dmu_tx_t *tx)
528 {
529           vm_object_t obj;
530           struct sf_buf *sf;
531           caddr_t va;
532           int off;
533 
534           ASSERT(segflg != UIO_NOCOPY);
535           ASSERT(vp->v_mount != NULL);
536           obj = vp->v_object;
537           ASSERT(obj != NULL);
538 
539           off = start & PAGEOFFSET;
540           zfs_vmobject_wlock(obj);
541           for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
542                     vm_page_t pp;
543                     int nbytes = imin(PAGESIZE - off, len);
544 
545                     if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
546                               zfs_vmobject_wunlock(obj);
547 
548                               va = zfs_map_page(pp, &sf);
549                               (void) dmu_read(os, oid, start+off, nbytes,
550                                   va+off, DMU_READ_PREFETCH);;
551                               zfs_unmap_page(sf);
552 
553                               zfs_vmobject_wlock(obj);
554                               page_unbusy(pp);
555                     }
556                     len -= nbytes;
557                     off = 0;
558           }
559           vm_object_pip_wakeupn(obj, 0);
560           zfs_vmobject_wunlock(obj);
561 }
562 
563 /*
564  * Read with UIO_NOCOPY flag means that sendfile(2) requests
565  * ZFS to populate a range of page cache pages with data.
566  *
567  * NOTE: this function could be optimized to pre-allocate
568  * all pages in advance, drain exclusive busy on all of them,
569  * map them into contiguous KVA region and populate them
570  * in one single dmu_read() call.
571  */
572 static int
mappedread_sf(vnode_t * vp,int nbytes,uio_t * uio)573 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
574 {
575           znode_t *zp = VTOZ(vp);
576           objset_t *os = zp->z_zfsvfs->z_os;
577           struct sf_buf *sf;
578           vm_object_t obj;
579           vm_page_t pp;
580           int64_t start;
581           caddr_t va;
582           int len = nbytes;
583           int off;
584           int error = 0;
585 
586           ASSERT(uio->uio_segflg == UIO_NOCOPY);
587           ASSERT(vp->v_mount != NULL);
588           obj = vp->v_object;
589           ASSERT(obj != NULL);
590           ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
591 
592           zfs_vmobject_wlock(obj);
593           for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
594                     int bytes = MIN(PAGESIZE, len);
595 
596                     pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
597                         VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
598                     if (pp->valid == 0) {
599                               zfs_vmobject_wunlock(obj);
600                               va = zfs_map_page(pp, &sf);
601                               error = dmu_read(os, zp->z_id, start, bytes, va,
602                                   DMU_READ_PREFETCH);
603                               if (bytes != PAGESIZE && error == 0)
604                                         bzero(va + bytes, PAGESIZE - bytes);
605                               zfs_unmap_page(sf);
606                               zfs_vmobject_wlock(obj);
607                               vm_page_sunbusy(pp);
608                               vm_page_lock(pp);
609                               if (error) {
610                                         if (pp->wire_count == 0 && pp->valid == 0 &&
611                                             !vm_page_busied(pp))
612                                                   vm_page_free(pp);
613                               } else {
614                                         pp->valid = VM_PAGE_BITS_ALL;
615                                         vm_page_activate(pp);
616                               }
617                               vm_page_unlock(pp);
618                     } else {
619                               ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
620                               vm_page_sunbusy(pp);
621                     }
622                     if (error)
623                               break;
624                     uio->uio_resid -= bytes;
625                     uio->uio_offset += bytes;
626                     len -= bytes;
627           }
628           zfs_vmobject_wunlock(obj);
629           return (error);
630 }
631 
632 /*
633  * When a file is memory mapped, we must keep the IO data synchronized
634  * between the DMU cache and the memory mapped pages.  What this means:
635  *
636  * On Read:         We "read" preferentially from memory mapped pages,
637  *                  else we default from the dmu buffer.
638  *
639  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
640  *         the file is memory mapped.
641  */
642 static int
mappedread(vnode_t * vp,int nbytes,uio_t * uio)643 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
644 {
645           znode_t *zp = VTOZ(vp);
646           vm_object_t obj;
647           int64_t start;
648           caddr_t va;
649           int len = nbytes;
650           int off;
651           int error = 0;
652 
653           ASSERT(vp->v_mount != NULL);
654           obj = vp->v_object;
655           ASSERT(obj != NULL);
656 
657           start = uio->uio_loffset;
658           off = start & PAGEOFFSET;
659           zfs_vmobject_wlock(obj);
660           for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
661                     vm_page_t pp;
662                     uint64_t bytes = MIN(PAGESIZE - off, len);
663 
664                     if (pp = page_hold(vp, start)) {
665                               struct sf_buf *sf;
666                               caddr_t va;
667 
668                               zfs_vmobject_wunlock(obj);
669                               va = zfs_map_page(pp, &sf);
670 #ifdef illumos
671                               error = uiomove(va + off, bytes, UIO_READ, uio);
672 #else
673                               error = vn_io_fault_uiomove(va + off, bytes, uio);
674 #endif
675                               zfs_unmap_page(sf);
676                               zfs_vmobject_wlock(obj);
677                               page_unhold(pp);
678                     } else {
679                               zfs_vmobject_wunlock(obj);
680                               error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
681                                   uio, bytes);
682                               zfs_vmobject_wlock(obj);
683                     }
684                     len -= bytes;
685                     off = 0;
686                     if (error)
687                               break;
688           }
689           zfs_vmobject_wunlock(obj);
690           return (error);
691 }
692 #endif /* __FreeBSD__ */
693 
694 #ifdef __NetBSD__
695 
696 caddr_t
zfs_map_page(page_t * pp,enum seg_rw rw)697 zfs_map_page(page_t *pp, enum seg_rw rw)
698 {
699           vaddr_t va;
700           int flags;
701 
702 #ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
703           if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
704                     return (caddr_t)va;
705 #endif
706 
707           flags = UVMPAGER_MAPIN_WAITOK |
708                     (rw == S_READ ? UVMPAGER_MAPIN_WRITE : UVMPAGER_MAPIN_READ);
709           va = uvm_pagermapin(&pp, 1, flags);
710           return (caddr_t)va;
711 }
712 
713 void
zfs_unmap_page(page_t * pp,caddr_t addr)714 zfs_unmap_page(page_t *pp, caddr_t addr)
715 {
716 
717 #ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
718           vaddr_t va;
719 
720           if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
721                     return;
722 #endif
723           uvm_pagermapout((vaddr_t)addr, 1);
724 }
725 
726 static int
mappedread(vnode_t * vp,int nbytes,uio_t * uio)727 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
728 {
729           znode_t *zp = VTOZ(vp);
730           struct uvm_object *uobj = &vp->v_uobj;
731           krwlock_t *rw = uobj->vmobjlock;
732           int64_t start;
733           caddr_t va;
734           size_t len = nbytes;
735           int off;
736           int error = 0;
737           int npages, found;
738 
739           start = uio->uio_loffset;
740           off = start & PAGEOFFSET;
741 
742           for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
743                     page_t *pp;
744                     uint64_t bytes = MIN(PAGESIZE - off, len);
745 
746                     pp = NULL;
747                     npages = 1;
748                     rw_enter(rw, RW_WRITER);
749                     found = uvn_findpages(uobj, start, &npages, &pp, NULL,
750                         UFP_NOALLOC);
751                     rw_exit(rw);
752 
753                     /* XXXNETBSD shouldn't access userspace with the page busy */
754                     if (found) {
755                               va = zfs_map_page(pp, S_READ);
756                               error = uiomove(va + off, bytes, UIO_READ, uio);
757                               zfs_unmap_page(pp, va);
758                               rw_enter(rw, RW_WRITER);
759                               uvm_page_unbusy(&pp, 1);
760                               rw_exit(rw);
761                     } else {
762                               error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
763                                   uio, bytes);
764                     }
765 
766                     len -= bytes;
767                     off = 0;
768                     if (error)
769                               break;
770           }
771           return (error);
772 }
773 
774 static void
update_pages(vnode_t * vp,int64_t start,int len,objset_t * os,uint64_t oid,int segflg,dmu_tx_t * tx)775 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
776     int segflg, dmu_tx_t *tx)
777 {
778           struct uvm_object *uobj = &vp->v_uobj;
779           krwlock_t *rw = uobj->vmobjlock;
780           caddr_t va;
781           int off, status;
782 
783           ASSERT(vp->v_mount != NULL);
784 
785           rw_enter(rw, RW_WRITER);
786 
787           off = start & PAGEOFFSET;
788           for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
789                     page_t *pp;
790                     int nbytes = MIN(PAGESIZE - off, len);
791                     int npages, found;
792 
793                     pp = NULL;
794                     npages = 1;
795                     found = uvn_findpages(uobj, start, &npages, &pp, NULL,
796                         UFP_NOALLOC);
797                     if (found) {
798                               /*
799                                * We're about to zap the page's contents and don't
800                                * care about any existing modifications.  We must
801                                * keep track of any new modifications past this
802                                * point.  Clear the modified bit in the pmap, and
803                                * if the page is marked dirty revert to tracking
804                                * the modified bit.
805                                */
806                               switch (uvm_pagegetdirty(pp)) {
807                               case UVM_PAGE_STATUS_DIRTY:
808                                         /* Does pmap_clear_modify(). */
809                                         uvm_pagemarkdirty(pp, UVM_PAGE_STATUS_UNKNOWN);
810                                         break;
811                               case UVM_PAGE_STATUS_UNKNOWN:
812                                         pmap_clear_modify(pp);
813                                         break;
814                               case UVM_PAGE_STATUS_CLEAN:
815                                         /* Nothing to do. */
816                                         break;
817                               }
818                               rw_exit(rw);
819 
820                               va = zfs_map_page(pp, S_WRITE);
821                               (void) dmu_read(os, oid, start + off, nbytes,
822                                   va + off, DMU_READ_PREFETCH);
823                               zfs_unmap_page(pp, va);
824 
825                               rw_enter(rw, RW_WRITER);
826                               uvm_page_unbusy(&pp, 1);
827                     }
828                     len -= nbytes;
829                     off = 0;
830           }
831           rw_exit(rw);
832 }
833 #endif /* __NetBSD__ */
834 
835 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
836 
837 /*
838  * Read bytes from specified file into supplied buffer.
839  *
840  *        IN:       vp        - vnode of file to be read from.
841  *                  uio       - structure supplying read location, range info,
842  *                              and return buffer.
843  *                  ioflag    - SYNC flags; used to provide FRSYNC semantics.
844  *                  cr        - credentials of caller.
845  *                  ct        - caller context
846  *
847  *        OUT:      uio       - updated offset and range, buffer filled.
848  *
849  *        RETURN:   0 on success, error code on failure.
850  *
851  * Side Effects:
852  *        vp - atime updated if byte count > 0
853  */
854 /* ARGSUSED */
855 static int
zfs_read(vnode_t * vp,uio_t * uio,int ioflag,cred_t * cr,caller_context_t * ct)856 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
857 {
858           znode_t             *zp = VTOZ(vp);
859           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
860           ssize_t             n, nbytes;
861           int                 error = 0;
862           rl_t                *rl;
863           xuio_t              *xuio = NULL;
864 
865           ZFS_ENTER(zfsvfs);
866           ZFS_VERIFY_ZP(zp);
867 
868           if (zp->z_pflags & ZFS_AV_QUARANTINED) {
869                     ZFS_EXIT(zfsvfs);
870                     return (SET_ERROR(EACCES));
871           }
872 
873           /*
874            * Validate file offset
875            */
876           if (uio->uio_loffset < (offset_t)0) {
877                     ZFS_EXIT(zfsvfs);
878                     return (SET_ERROR(EINVAL));
879           }
880 
881           /*
882            * Fasttrack empty reads
883            */
884           if (uio->uio_resid == 0) {
885                     ZFS_EXIT(zfsvfs);
886                     return (0);
887           }
888 
889           /*
890            * Check for mandatory locks
891            */
892           if (MANDMODE(zp->z_mode)) {
893                     if (error = chklock(vp, FREAD,
894                         uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
895                               ZFS_EXIT(zfsvfs);
896                               return (error);
897                     }
898           }
899 
900           /*
901            * If we're in FRSYNC mode, sync out this znode before reading it.
902            */
903           if (zfsvfs->z_log &&
904               (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
905                     zil_commit(zfsvfs->z_log, zp->z_id);
906 
907           /*
908            * Lock the range against changes.
909            */
910           rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
911 
912           /*
913            * If we are reading past end-of-file we can skip
914            * to the end; but we might still need to set atime.
915            */
916           if (uio->uio_loffset >= zp->z_size) {
917                     error = 0;
918                     goto out;
919           }
920 
921           ASSERT(uio->uio_loffset < zp->z_size);
922           n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
923 
924 #ifdef illumos
925           if ((uio->uio_extflg == UIO_XUIO) &&
926               (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
927                     int nblk;
928                     int blksz = zp->z_blksz;
929                     uint64_t offset = uio->uio_loffset;
930 
931                     xuio = (xuio_t *)uio;
932                     if ((ISP2(blksz))) {
933                               nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
934                                   blksz)) / blksz;
935                     } else {
936                               ASSERT(offset + n <= blksz);
937                               nblk = 1;
938                     }
939                     (void) dmu_xuio_init(xuio, nblk);
940 
941                     if (vn_has_cached_data(vp)) {
942                               /*
943                                * For simplicity, we always allocate a full buffer
944                                * even if we only expect to read a portion of a block.
945                                */
946                               while (--nblk >= 0) {
947                                         (void) dmu_xuio_add(xuio,
948                                             dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
949                                             blksz), 0, blksz);
950                               }
951                     }
952           }
953 #endif    /* illumos */
954 
955           while (n > 0) {
956                     nbytes = MIN(n, zfs_read_chunk_size -
957                         P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
958 
959 #ifdef __FreeBSD__
960                     if (uio->uio_segflg == UIO_NOCOPY)
961                               error = mappedread_sf(vp, nbytes, uio);
962                     else
963 #endif /* __FreeBSD__ */
964                     if (vn_has_cached_data(vp)) {
965                               error = mappedread(vp, nbytes, uio);
966                     } else {
967                               error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
968                                   uio, nbytes);
969                     }
970                     if (error) {
971                               /* convert checksum errors into IO errors */
972                               if (error == ECKSUM)
973                                         error = SET_ERROR(EIO);
974                               break;
975                     }
976 
977                     n -= nbytes;
978           }
979 out:
980           zfs_range_unlock(rl);
981 
982           ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
983           ZFS_EXIT(zfsvfs);
984           return (error);
985 }
986 
987 /*
988  * Write the bytes to a file.
989  *
990  *        IN:       vp        - vnode of file to be written to.
991  *                  uio       - structure supplying write location, range info,
992  *                              and data buffer.
993  *                  ioflag    - FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
994  *                              set if in append mode.
995  *                  cr        - credentials of caller.
996  *                  ct        - caller context (NFS/CIFS fem monitor only)
997  *
998  *        OUT:      uio       - updated offset and range.
999  *
1000  *        RETURN:   0 on success, error code on failure.
1001  *
1002  * Timestamps:
1003  *        vp - ctime|mtime updated if byte count > 0
1004  */
1005 
1006 /* ARGSUSED */
1007 static int
zfs_write(vnode_t * vp,uio_t * uio,int ioflag,cred_t * cr,caller_context_t * ct)1008 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
1009 {
1010           znode_t             *zp = VTOZ(vp);
1011           rlim64_t  limit = MAXOFFSET_T;
1012           ssize_t             start_resid = uio->uio_resid;
1013           ssize_t             tx_bytes;
1014           uint64_t  end_size;
1015           dmu_tx_t  *tx;
1016           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
1017           zilog_t             *zilog;
1018           offset_t  woff;
1019           ssize_t             n, nbytes;
1020           rl_t                *rl;
1021           int                 max_blksz = zfsvfs->z_max_blksz;
1022           int                 error = 0;
1023           arc_buf_t *abuf;
1024           iovec_t             *aiov = NULL;
1025           xuio_t              *xuio = NULL;
1026           int                 i_iov = 0;
1027           int                 iovcnt = uio->uio_iovcnt;
1028           iovec_t             *iovp = uio->uio_iov;
1029           int                 write_eof;
1030           int                 count = 0;
1031           sa_bulk_attr_t      bulk[4];
1032           uint64_t  mtime[2], ctime[2];
1033           int                 segflg;
1034 
1035 #ifdef __NetBSD__
1036           segflg = VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ?
1037                     UIO_SYSSPACE : UIO_USERSPACE;
1038 #else
1039           segflg = uio->uio_segflg;
1040 #endif
1041 
1042           /*
1043            * Fasttrack empty write
1044            */
1045           n = start_resid;
1046           if (n == 0)
1047                     return (0);
1048 
1049           if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
1050                     limit = MAXOFFSET_T;
1051 
1052           ZFS_ENTER(zfsvfs);
1053           ZFS_VERIFY_ZP(zp);
1054 
1055           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1056           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1057           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1058               &zp->z_size, 8);
1059           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1060               &zp->z_pflags, 8);
1061 
1062           /*
1063            * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
1064            * callers might not be able to detect properly that we are read-only,
1065            * so check it explicitly here.
1066            */
1067           if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
1068                     ZFS_EXIT(zfsvfs);
1069                     return (SET_ERROR(EROFS));
1070           }
1071 
1072           /*
1073            * If immutable or not appending then return EPERM
1074            */
1075           if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
1076               ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
1077               (uio->uio_loffset < zp->z_size))) {
1078                     ZFS_EXIT(zfsvfs);
1079                     return (SET_ERROR(EPERM));
1080           }
1081 
1082           zilog = zfsvfs->z_log;
1083 
1084           /*
1085            * Validate file offset
1086            */
1087           woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
1088           if (woff < 0) {
1089                     ZFS_EXIT(zfsvfs);
1090                     return (SET_ERROR(EINVAL));
1091           }
1092 
1093           /*
1094            * Check for mandatory locks before calling zfs_range_lock()
1095            * in order to prevent a deadlock with locks set via fcntl().
1096            */
1097           if (MANDMODE((mode_t)zp->z_mode) &&
1098               (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
1099                     ZFS_EXIT(zfsvfs);
1100                     return (error);
1101           }
1102 
1103 #ifdef illumos
1104           /*
1105            * Pre-fault the pages to ensure slow (eg NFS) pages
1106            * don't hold up txg.
1107            * Skip this if uio contains loaned arc_buf.
1108            */
1109           if ((uio->uio_extflg == UIO_XUIO) &&
1110               (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
1111                     xuio = (xuio_t *)uio;
1112           else
1113                     uio_prefaultpages(MIN(n, max_blksz), uio);
1114 #endif
1115 
1116           /*
1117            * If in append mode, set the io offset pointer to eof.
1118            */
1119           if (ioflag & FAPPEND) {
1120                     /*
1121                      * Obtain an appending range lock to guarantee file append
1122                      * semantics.  We reset the write offset once we have the lock.
1123                      */
1124                     rl = zfs_range_lock(zp, 0, n, RL_APPEND);
1125                     woff = rl->r_off;
1126                     if (rl->r_len == UINT64_MAX) {
1127                               /*
1128                                * We overlocked the file because this write will cause
1129                                * the file block size to increase.
1130                                * Note that zp_size cannot change with this lock held.
1131                                */
1132                               woff = zp->z_size;
1133                     }
1134                     uio->uio_loffset = woff;
1135           } else {
1136                     /*
1137                      * Note that if the file block size will change as a result of
1138                      * this write, then this range lock will lock the entire file
1139                      * so that we can re-write the block safely.
1140                      */
1141                     rl = zfs_range_lock(zp, woff, n, RL_WRITER);
1142           }
1143 
1144 #ifdef illumos
1145           if (woff >= limit) {
1146                     zfs_range_unlock(rl);
1147                     ZFS_EXIT(zfsvfs);
1148                     return (SET_ERROR(EFBIG));
1149           }
1150 
1151 #endif
1152 #ifdef __FreeBSD__
1153           if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
1154                     zfs_range_unlock(rl);
1155                     ZFS_EXIT(zfsvfs);
1156                     return (SET_ERROR(EFBIG));
1157           }
1158 #endif
1159 #ifdef __NetBSD__
1160           /* XXXNETBSD we might need vn_rlimit_fsize() too here eventually */
1161 #endif
1162 
1163           if ((woff + n) > limit || woff > (limit - n))
1164                     n = limit - woff;
1165 
1166           /* Will this write extend the file length? */
1167           write_eof = (woff + n > zp->z_size);
1168 
1169           end_size = MAX(zp->z_size, woff + n);
1170 
1171           /*
1172            * Write the file in reasonable size chunks.  Each chunk is written
1173            * in a separate transaction; this keeps the intent log records small
1174            * and allows us to do more fine-grained space accounting.
1175            */
1176           while (n > 0) {
1177                     abuf = NULL;
1178                     woff = uio->uio_loffset;
1179                     if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1180                         zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1181                               if (abuf != NULL)
1182                                         dmu_return_arcbuf(abuf);
1183                               error = SET_ERROR(EDQUOT);
1184                               break;
1185                     }
1186 
1187                     if (xuio && abuf == NULL) {
1188                               ASSERT(i_iov < iovcnt);
1189                               aiov = &iovp[i_iov];
1190                               abuf = dmu_xuio_arcbuf(xuio, i_iov);
1191                               dmu_xuio_clear(xuio, i_iov);
1192                               DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1193                                   iovec_t *, aiov, arc_buf_t *, abuf);
1194                               ASSERT((aiov->iov_base == abuf->b_data) ||
1195                                   ((char *)aiov->iov_base - (char *)abuf->b_data +
1196                                   aiov->iov_len == arc_buf_size(abuf)));
1197                               i_iov++;
1198                     } else if (abuf == NULL && n >= max_blksz &&
1199                         woff >= zp->z_size &&
1200                         P2PHASE(woff, max_blksz) == 0 &&
1201                         zp->z_blksz == max_blksz) {
1202                               /*
1203                                * This write covers a full block.  "Borrow" a buffer
1204                                * from the dmu so that we can fill it before we enter
1205                                * a transaction.  This avoids the possibility of
1206                                * holding up the transaction if the data copy hangs
1207                                * up on a pagefault (e.g., from an NFS server mapping).
1208                                */
1209 #if defined(illumos) || defined(__NetBSD__)
1210                               size_t cbytes;
1211 #endif
1212 
1213                               abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1214                                   max_blksz);
1215                               ASSERT(abuf != NULL);
1216                               ASSERT(arc_buf_size(abuf) == max_blksz);
1217 #if defined(illumos) || defined(__NetBSD__)
1218                               if (error = uiocopy(abuf->b_data, max_blksz,
1219                                   UIO_WRITE, uio, &cbytes)) {
1220                                         dmu_return_arcbuf(abuf);
1221                                         break;
1222                               }
1223                               ASSERT(cbytes == max_blksz);
1224 #endif
1225 #ifdef __FreeBSD__
1226                               ssize_t resid = uio->uio_resid;
1227 
1228                               error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
1229                               if (error != 0) {
1230                                         uio->uio_offset -= resid - uio->uio_resid;
1231                                         uio->uio_resid = resid;
1232                                         dmu_return_arcbuf(abuf);
1233                                         break;
1234                               }
1235 #endif
1236                     }
1237 
1238                     /*
1239                      * Start a transaction.
1240                      */
1241                     tx = dmu_tx_create(zfsvfs->z_os);
1242                     dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1243                     dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1244                     zfs_sa_upgrade_txholds(tx, zp);
1245                     error = dmu_tx_assign(tx, TXG_WAIT);
1246                     if (error) {
1247                               dmu_tx_abort(tx);
1248                               if (abuf != NULL)
1249                                         dmu_return_arcbuf(abuf);
1250                               break;
1251                     }
1252 
1253                     /*
1254                      * If zfs_range_lock() over-locked we grow the blocksize
1255                      * and then reduce the lock range.  This will only happen
1256                      * on the first iteration since zfs_range_reduce() will
1257                      * shrink down r_len to the appropriate size.
1258                      */
1259                     if (rl->r_len == UINT64_MAX) {
1260                               uint64_t new_blksz;
1261 
1262                               if (zp->z_blksz > max_blksz) {
1263                                         /*
1264                                          * File's blocksize is already larger than the
1265                                          * "recordsize" property.  Only let it grow to
1266                                          * the next power of 2.
1267                                          */
1268                                         ASSERT(!ISP2(zp->z_blksz));
1269                                         new_blksz = MIN(end_size,
1270                                             1 << highbit64(zp->z_blksz));
1271                               } else {
1272                                         new_blksz = MIN(end_size, max_blksz);
1273                               }
1274                               zfs_grow_blocksize(zp, new_blksz, tx);
1275                               zfs_range_reduce(rl, woff, n);
1276                     }
1277 
1278                     /*
1279                      * XXX - should we really limit each write to z_max_blksz?
1280                      * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1281                      */
1282                     nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1283 
1284                     if (woff + nbytes > zp->z_size)
1285                               vnode_pager_setsize(vp, woff + nbytes);
1286 
1287                     if (abuf == NULL) {
1288                               tx_bytes = uio->uio_resid;
1289                               error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1290                                   uio, nbytes, tx);
1291                               tx_bytes -= uio->uio_resid;
1292                     } else {
1293                               tx_bytes = nbytes;
1294                               ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1295                               /*
1296                                * If this is not a full block write, but we are
1297                                * extending the file past EOF and this data starts
1298                                * block-aligned, use assign_arcbuf().  Otherwise,
1299                                * write via dmu_write().
1300                                */
1301                               if (tx_bytes < max_blksz && (!write_eof ||
1302                                   aiov->iov_base != abuf->b_data)) {
1303                                         ASSERT(xuio);
1304                                         dmu_write(zfsvfs->z_os, zp->z_id, woff,
1305                                             aiov->iov_len, aiov->iov_base, tx);
1306                                         dmu_return_arcbuf(abuf);
1307                                         xuio_stat_wbuf_copied();
1308                               } else {
1309                                         ASSERT(xuio || tx_bytes == max_blksz);
1310                                         dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1311                                             woff, abuf, tx);
1312                               }
1313 #if defined(illumos) || defined(__NetBSD__)
1314                               ASSERT(tx_bytes <= uio->uio_resid);
1315                               uioskip(uio, tx_bytes);
1316 #endif
1317                     }
1318                     if (tx_bytes && vn_has_cached_data(vp)) {
1319                               update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1320                                   zp->z_id, segflg, tx);
1321                     }
1322 
1323                     /*
1324                      * If we made no progress, we're done.  If we made even
1325                      * partial progress, update the znode and ZIL accordingly.
1326                      */
1327                     if (tx_bytes == 0) {
1328                               (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1329                                   (void *)&zp->z_size, sizeof (uint64_t), tx);
1330                               dmu_tx_commit(tx);
1331                               ASSERT(error != 0);
1332                               break;
1333                     }
1334 
1335                     /*
1336                      * Clear Set-UID/Set-GID bits on successful write if not
1337                      * privileged and at least one of the excute bits is set.
1338                      *
1339                      * It would be nice to to this after all writes have
1340                      * been done, but that would still expose the ISUID/ISGID
1341                      * to another app after the partial write is committed.
1342                      *
1343                      * Note: we don't call zfs_fuid_map_id() here because
1344                      * user 0 is not an ephemeral uid.
1345                      */
1346                     mutex_enter(&zp->z_acl_lock);
1347                     if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1348                         (S_IXUSR >> 6))) != 0 &&
1349                         (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1350                         secpolicy_vnode_setid_retain(vp, cr,
1351                         (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1352                               uint64_t newmode;
1353                               zp->z_mode &= ~(S_ISUID | S_ISGID);
1354                               newmode = zp->z_mode;
1355                               (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1356                                   (void *)&newmode, sizeof (uint64_t), tx);
1357 #ifdef __NetBSD__
1358                               cache_enter_id(vp, zp->z_mode, zp->z_uid, zp->z_gid,
1359                                   true);
1360 #endif
1361                     }
1362                     mutex_exit(&zp->z_acl_lock);
1363 
1364                     zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1365                         B_TRUE);
1366 
1367                     /*
1368                      * Update the file size (zp_size) if it has changed;
1369                      * account for possible concurrent updates.
1370                      */
1371                     while ((end_size = zp->z_size) < uio->uio_loffset) {
1372                               (void) atomic_cas_64(&zp->z_size, end_size,
1373                                   uio->uio_loffset);
1374 #ifdef illumos
1375                               ASSERT(error == 0);
1376 #else
1377                               ASSERT(error == 0 || error == EFAULT);
1378 #endif
1379                     }
1380                     /*
1381                      * If we are replaying and eof is non zero then force
1382                      * the file size to the specified eof. Note, there's no
1383                      * concurrency during replay.
1384                      */
1385                     if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1386                               zp->z_size = zfsvfs->z_replay_eof;
1387 
1388                     if (error == 0)
1389                               error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1390                     else
1391                               (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1392 
1393                     zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1394                     dmu_tx_commit(tx);
1395 
1396                     if (error != 0)
1397                               break;
1398                     ASSERT(tx_bytes == nbytes);
1399                     n -= nbytes;
1400 
1401 #ifdef illumos
1402                     if (!xuio && n > 0)
1403                               uio_prefaultpages(MIN(n, max_blksz), uio);
1404 #endif
1405           }
1406 
1407           zfs_range_unlock(rl);
1408 
1409           /*
1410            * If we're in replay mode, or we made no progress, return error.
1411            * Otherwise, it's at least a partial write, so it's successful.
1412            */
1413           if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1414                     ZFS_EXIT(zfsvfs);
1415                     return (error);
1416           }
1417 
1418 #ifdef __FreeBSD__
1419           /*
1420            * EFAULT means that at least one page of the source buffer was not
1421            * available.  VFS will re-try remaining I/O upon this error.
1422            */
1423           if (error == EFAULT) {
1424                     ZFS_EXIT(zfsvfs);
1425                     return (error);
1426           }
1427 #endif
1428 
1429           if (ioflag & (FSYNC | FDSYNC) ||
1430               zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1431                     zil_commit(zilog, zp->z_id);
1432 
1433           ZFS_EXIT(zfsvfs);
1434           return (0);
1435 }
1436 
1437 void
zfs_get_done(zgd_t * zgd,int error)1438 zfs_get_done(zgd_t *zgd, int error)
1439 {
1440           znode_t *zp = zgd->zgd_private;
1441           objset_t *os = zp->z_zfsvfs->z_os;
1442 
1443           if (zgd->zgd_db)
1444                     dmu_buf_rele(zgd->zgd_db, zgd);
1445 
1446           zfs_range_unlock(zgd->zgd_rl);
1447 
1448           /*
1449            * Release the vnode asynchronously as we currently have the
1450            * txg stopped from syncing.
1451            */
1452           VN_RELE_CLEANER(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1453 
1454           if (error == 0 && zgd->zgd_bp)
1455                     zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1456 
1457           kmem_free(zgd, sizeof (zgd_t));
1458 }
1459 
#ifdef DEBUG
/*
 * DEBUG-only flag, initially 0.
 * NOTE(review): the name suggests ZIL I/O fault injection; confirm against
 * the code that reads it.
 */
static int zil_fault_io = 0;
#endif
1463 
1464 /*
1465  * Get data to generate a TX_WRITE intent log record.
1466  */
1467 int
zfs_get_data(void * arg,lr_write_t * lr,char * buf,zio_t * zio)1468 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1469 {
1470           zfsvfs_t *zfsvfs = arg;
1471           objset_t *os = zfsvfs->z_os;
1472           znode_t *zp;
1473           uint64_t object = lr->lr_foid;
1474           uint64_t offset = lr->lr_offset;
1475           uint64_t size = lr->lr_length;
1476           blkptr_t *bp = &lr->lr_blkptr;
1477           dmu_buf_t *db;
1478           zgd_t *zgd;
1479           int error = 0;
1480 
1481           ASSERT(zio != NULL);
1482           ASSERT(size != 0);
1483 
1484           /*
1485            * Nothing to do if the file has been removed
1486            */
1487           if (zfs_zget_cleaner(zfsvfs, object, &zp) != 0)
1488                     return (SET_ERROR(ENOENT));
1489           if (zp->z_unlinked) {
1490                     /*
1491                      * Release the vnode asynchronously as we currently have the
1492                      * txg stopped from syncing.
1493                      */
1494                     VN_RELE_CLEANER(ZTOV(zp),
1495                         dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1496                     return (SET_ERROR(ENOENT));
1497           }
1498 
1499           zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1500           zgd->zgd_zilog = zfsvfs->z_log;
1501           zgd->zgd_private = zp;
1502 
1503           /*
1504            * Write records come in two flavors: immediate and indirect.
1505            * For small writes it's cheaper to store the data with the
1506            * log record (immediate); for large writes it's cheaper to
1507            * sync the data and get a pointer to it (indirect) so that
1508            * we don't have to write the data twice.
1509            */
1510           if (buf != NULL) { /* immediate write */
1511                     zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1512                     /* test for truncation needs to be done while range locked */
1513                     if (offset >= zp->z_size) {
1514                               error = SET_ERROR(ENOENT);
1515                     } else {
1516                               error = dmu_read(os, object, offset, size, buf,
1517                                   DMU_READ_NO_PREFETCH);
1518                     }
1519                     ASSERT(error == 0 || error == ENOENT);
1520           } else { /* indirect write */
1521                     /*
1522                      * Have to lock the whole block to ensure when it's
1523                      * written out and it's checksum is being calculated
1524                      * that no one can change the data. We need to re-check
1525                      * blocksize after we get the lock in case it's changed!
1526                      */
1527                     for (;;) {
1528                               uint64_t blkoff;
1529                               size = zp->z_blksz;
1530                               blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1531                               offset -= blkoff;
1532                               zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1533                                   RL_READER);
1534                               if (zp->z_blksz == size)
1535                                         break;
1536                               offset += blkoff;
1537                               zfs_range_unlock(zgd->zgd_rl);
1538                     }
1539                     /* test for truncation needs to be done while range locked */
1540                     if (lr->lr_offset >= zp->z_size)
1541                               error = SET_ERROR(ENOENT);
1542 #ifdef DEBUG
1543                     if (zil_fault_io) {
1544                               error = SET_ERROR(EIO);
1545                               zil_fault_io = 0;
1546                     }
1547 #endif
1548                     if (error == 0)
1549                               error = dmu_buf_hold(os, object, offset, zgd, &db,
1550                                   DMU_READ_NO_PREFETCH);
1551 
1552                     if (error == 0) {
1553                               blkptr_t *obp = dmu_buf_get_blkptr(db);
1554                               if (obp) {
1555                                         ASSERT(BP_IS_HOLE(bp));
1556                                         *bp = *obp;
1557                               }
1558 
1559                               zgd->zgd_db = db;
1560                               zgd->zgd_bp = bp;
1561 
1562                               ASSERT(db->db_offset == offset);
1563                               ASSERT(db->db_size == size);
1564 
1565                               error = dmu_sync(zio, lr->lr_common.lrc_txg,
1566                                   zfs_get_done, zgd);
1567                               ASSERT(error || lr->lr_length <= zp->z_blksz);
1568 
1569                               /*
1570                                * On success, we need to wait for the write I/O
1571                                * initiated by dmu_sync() to complete before we can
1572                                * release this dbuf.  We will finish everything up
1573                                * in the zfs_get_done() callback.
1574                                */
1575                               if (error == 0)
1576                                         return (0);
1577 
1578                               if (error == EALREADY) {
1579                                         lr->lr_common.lrc_txtype = TX_WRITE2;
1580                                         error = 0;
1581                               }
1582                     }
1583           }
1584 
1585           zfs_get_done(zgd, error);
1586 
1587           return (error);
1588 }
1589 
1590 /*ARGSUSED*/
1591 static int
zfs_access(vnode_t * vp,int mode,int flag,cred_t * cr,caller_context_t * ct)1592 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1593     caller_context_t *ct)
1594 {
1595           znode_t *zp = VTOZ(vp);
1596           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1597           int error;
1598 
1599           ZFS_ENTER(zfsvfs);
1600           ZFS_VERIFY_ZP(zp);
1601 
1602           if (flag & V_ACE_MASK)
1603                     error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1604           else
1605                     error = zfs_zaccess_rwx(zp, mode, flag, cr);
1606 
1607           ZFS_EXIT(zfsvfs);
1608           return (error);
1609 }
1610 
1611 #ifdef __FreeBSD__
/*
 * Callback handed to vn_vget_ino_gen() by zfs_lookup_lock() for ".."
 * lookups.  The vnode to return is passed in through arg; it is locked
 * with lkflags and stored in *vpp.  If locking fails the reference on the
 * vnode is dropped and the locking error is returned.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}
1623 
1624 static int
zfs_lookup_lock(vnode_t * dvp,vnode_t * vp,const char * name,int lkflags)1625 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1626 {
1627           znode_t *zdp = VTOZ(dvp);
1628           zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1629           int error;
1630           int ltype;
1631 
1632           ASSERT_VOP_LOCKED(dvp, __func__);
1633 #ifdef DIAGNOSTIC
1634           if ((zdp->z_pflags & ZFS_XATTR) == 0)
1635                     VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1636 #endif
1637 
1638           if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1639                     ASSERT3P(dvp, ==, vp);
1640                     vref(dvp);
1641                     ltype = lkflags & LK_TYPE_MASK;
1642                     if (ltype != VOP_ISLOCKED(dvp)) {
1643                               if (ltype == LK_EXCLUSIVE)
1644                                         vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1645                               else /* if (ltype == LK_SHARED) */
1646                                         vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1647 
1648                               /*
1649                                * Relock for the "." case could leave us with
1650                                * reclaimed vnode.
1651                                */
1652                               if (dvp->v_iflag & VI_DOOMED) {
1653                                         vrele(dvp);
1654                                         return (SET_ERROR(ENOENT));
1655                               }
1656                     }
1657                     return (0);
1658           } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1659                     /*
1660                      * Note that in this case, dvp is the child vnode, and we
1661                      * are looking up the parent vnode - exactly reverse from
1662                      * normal operation.  Unlocking dvp requires some rather
1663                      * tricky unlock/relock dance to prevent mp from being freed;
1664                      * use vn_vget_ino_gen() which takes care of all that.
1665                      *
1666                      * XXX Note that there is a time window when both vnodes are
1667                      * unlocked.  It is possible, although highly unlikely, that
1668                      * during that window the parent-child relationship between
1669                      * the vnodes may change, for example, get reversed.
1670                      * In that case we would have a wrong lock order for the vnodes.
1671                      * All other filesystems seem to ignore this problem, so we
1672                      * do the same here.
1673                      * A potential solution could be implemented as follows:
1674                      * - using LK_NOWAIT when locking the second vnode and retrying
1675                      *   if necessary
1676                      * - checking that the parent-child relationship still holds
1677                      *   after locking both vnodes and retrying if it doesn't
1678                      */
1679                     error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1680                     return (error);
1681           } else {
1682                     error = vn_lock(vp, lkflags);
1683                     if (error != 0)
1684                               vrele(vp);
1685                     return (error);
1686           }
1687 }
1688 
1689 /*
1690  * Lookup an entry in a directory, or an extended attribute directory.
1691  * If it exists, return a held vnode reference for it.
1692  *
1693  *        IN:       dvp       - vnode of directory to search.
1694  *                  nm        - name of entry to lookup.
1695  *                  pnp       - full pathname to lookup [UNUSED].
1696  *                  flags     - LOOKUP_XATTR set if looking for an attribute.
1697  *                  rdir      - root directory vnode [UNUSED].
1698  *                  cr        - credentials of caller.
1699  *                  ct        - caller context
1700  *
1701  *        OUT:      vpp       - vnode of located entry, NULL if not found.
1702  *
1703  *        RETURN:   0 on success, error code on failure.
1704  *
1705  * Timestamps:
1706  *        NA
1707  */
1708 /* ARGSUSED */
1709 static int
zfs_lookup(vnode_t * dvp,char * nm,vnode_t ** vpp,struct componentname * cnp,int nameiop,cred_t * cr,kthread_t * td,int flags)1710 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1711     int nameiop, cred_t *cr, kthread_t *td, int flags)
1712 {
1713           znode_t *zdp = VTOZ(dvp);
1714           znode_t *zp;
1715           zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1716           int       error = 0;
1717 
1718           /* fast path (should be redundant with vfs namecache) */
1719           if (!(flags & LOOKUP_XATTR)) {
1720                     if (dvp->v_type != VDIR) {
1721                               return (SET_ERROR(ENOTDIR));
1722                     } else if (zdp->z_sa_hdl == NULL) {
1723                               return (SET_ERROR(EIO));
1724                     }
1725           }
1726 
1727           DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1728 
1729           ZFS_ENTER(zfsvfs);
1730           ZFS_VERIFY_ZP(zdp);
1731 
1732           *vpp = NULL;
1733 
1734           if (flags & LOOKUP_XATTR) {
1735 #ifdef TODO
1736                     /*
1737                      * If the xattr property is off, refuse the lookup request.
1738                      */
1739                     if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1740                               ZFS_EXIT(zfsvfs);
1741                               return (SET_ERROR(EINVAL));
1742                     }
1743 #endif
1744 
1745                     /*
1746                      * We don't allow recursive attributes..
1747                      * Maybe someday we will.
1748                      */
1749                     if (zdp->z_pflags & ZFS_XATTR) {
1750                               ZFS_EXIT(zfsvfs);
1751                               return (SET_ERROR(EINVAL));
1752                     }
1753 
1754                     if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1755                               ZFS_EXIT(zfsvfs);
1756                               return (error);
1757                     }
1758 
1759                     /*
1760                      * Do we have permission to get into attribute directory?
1761                      */
1762                     if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1763                         B_FALSE, cr)) {
1764                               vrele(*vpp);
1765                               *vpp = NULL;
1766                     }
1767 
1768                     ZFS_EXIT(zfsvfs);
1769                     return (error);
1770           }
1771 
1772           /*
1773            * Check accessibility of directory.
1774            */
1775           if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1776                     ZFS_EXIT(zfsvfs);
1777                     return (error);
1778           }
1779 
1780           if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1781               NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1782                     ZFS_EXIT(zfsvfs);
1783                     return (SET_ERROR(EILSEQ));
1784           }
1785 
1786 
1787           /*
1788            * First handle the special cases.
1789            */
1790           if ((cnp->cn_flags & ISDOTDOT) != 0) {
1791                     /*
1792                      * If we are a snapshot mounted under .zfs, return
1793                      * the vp for the snapshot directory.
1794                      */
1795                     if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1796                               struct componentname cn;
1797                               vnode_t *zfsctl_vp;
1798                               int ltype;
1799 
1800                               ZFS_EXIT(zfsvfs);
1801                               ltype = VOP_ISLOCKED(dvp);
1802                               VOP_UNLOCK(dvp, 0);
1803                               error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1804                                   &zfsctl_vp);
1805                               if (error == 0) {
1806                                         cn.cn_nameptr = "snapshot";
1807                                         cn.cn_namelen = strlen(cn.cn_nameptr);
1808                                         cn.cn_nameiop = cnp->cn_nameiop;
1809                                         cn.cn_flags = cnp->cn_flags;
1810                                         cn.cn_lkflags = cnp->cn_lkflags;
1811                                         error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1812                                         vput(zfsctl_vp);
1813                               }
1814                               vn_lock(dvp, ltype | LK_RETRY);
1815                               return (error);
1816                     }
1817           }
1818           if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1819                     ZFS_EXIT(zfsvfs);
1820                     if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1821                               return (SET_ERROR(ENOTSUP));
1822                     error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1823                     return (error);
1824           }
1825 
1826           /*
1827            * The loop is retry the lookup if the parent-child relationship
1828            * changes during the dot-dot locking complexities.
1829            */
1830           for (;;) {
1831                     uint64_t parent;
1832 
1833                     error = zfs_dirlook(zdp, nm, &zp);
1834                     if (error == 0)
1835                               *vpp = ZTOV(zp);
1836 
1837                     ZFS_EXIT(zfsvfs);
1838                     if (error != 0)
1839                               break;
1840 
1841                     error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1842                     if (error != 0) {
1843                               /*
1844                                * If we've got a locking error, then the vnode
1845                                * got reclaimed because of a force unmount.
1846                                * We never enter doomed vnodes into the name cache.
1847                                */
1848                               *vpp = NULL;
1849                               return (error);
1850                     }
1851 
1852                     if ((cnp->cn_flags & ISDOTDOT) == 0)
1853                               break;
1854 
1855                     ZFS_ENTER(zfsvfs);
1856                     if (zdp->z_sa_hdl == NULL) {
1857                               error = SET_ERROR(EIO);
1858                     } else {
1859                               error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1860                                   &parent, sizeof (parent));
1861                     }
1862                     if (error != 0) {
1863                               ZFS_EXIT(zfsvfs);
1864                               vput(ZTOV(zp));
1865                               break;
1866                     }
1867                     if (zp->z_id == parent) {
1868                               ZFS_EXIT(zfsvfs);
1869                               break;
1870                     }
1871                     vput(ZTOV(zp));
1872           }
1873 
1874 out:
1875           if (error != 0)
1876                     *vpp = NULL;
1877 
1878           /* Translate errors and add SAVENAME when needed. */
1879           if (cnp->cn_flags & ISLASTCN) {
1880                     switch (nameiop) {
1881                     case CREATE:
1882                     case RENAME:
1883                               if (error == ENOENT) {
1884                                         error = EJUSTRETURN;
1885                                         cnp->cn_flags |= SAVENAME;
1886                                         break;
1887                               }
1888                               /* FALLTHROUGH */
1889                     case DELETE:
1890                               if (error == 0)
1891                                         cnp->cn_flags |= SAVENAME;
1892                               break;
1893                     }
1894           }
1895 
1896           /* Insert name into cache (as non-existent) if appropriate. */
1897           if (zfsvfs->z_use_namecache &&
1898               error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1899                     cache_enter(dvp, NULL, cnp);
1900 
1901           /* Insert name into cache if appropriate. */
1902           if (zfsvfs->z_use_namecache &&
1903               error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1904                     if (!(cnp->cn_flags & ISLASTCN) ||
1905                         (nameiop != DELETE && nameiop != RENAME)) {
1906                               cache_enter(dvp, *vpp, cnp);
1907                     }
1908           }
1909 
1910           return (error);
1911 }
1912 #endif /* __FreeBSD__ */
1913 
1914 #ifdef __NetBSD__
1915 /*
1916  * If vnode is for a device return a specfs vnode instead.
1917  */
1918 static int
specvp_check(vnode_t ** vpp,cred_t * cr)1919 specvp_check(vnode_t **vpp, cred_t *cr)
1920 {
1921           int error = 0;
1922 
1923           if (IS_DEVVP(*vpp)) {
1924                     struct vnode *svp;
1925 
1926                     svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1927                     VN_RELE(*vpp);
1928                     if (svp == NULL)
1929                               error = ENOSYS;
1930                     *vpp = svp;
1931           }
1932           return (error);
1933 }
1934 
1935 /*
1936  * Lookup an entry in a directory, or an extended attribute directory.
1937  * If it exists, return a held vnode reference for it.
1938  *
1939  *        IN:       dvp       - vnode of directory to search.
1940  *                  nm        - name of entry to lookup.
1941  *                  pnp       - full pathname to lookup [UNUSED].
1942  *                  flags     - LOOKUP_XATTR set if looking for an attribute.
1943  *                  rdir      - root directory vnode [UNUSED].
1944  *                  cr        - credentials of caller.
1945  *                  ct        - caller context
1946  *                  direntflags - directory lookup flags
1947  *                  realpnp - returned pathname.
1948  *
1949  *        OUT:      vpp       - vnode of located entry, NULL if not found.
1950  *
1951  *        RETURN:   0 if success
1952  *                  error code if failure
1953  *
1954  * Timestamps:
1955  *        NA
1956  */
1957 /* ARGSUSED */
1958 static int
zfs_lookup(vnode_t * dvp,char * nm,vnode_t ** vpp,int flags,struct componentname * cnp,int nameiop,cred_t * cr)1959 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, int flags,
1960     struct componentname *cnp, int nameiop, cred_t *cr)
1961 {
1962           znode_t *zdp = VTOZ(dvp);
1963           znode_t *zp;
1964           zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1965           int       error = 0;
1966 
1967           /* fast path */
1968           if (!(flags & LOOKUP_XATTR)) {
1969                     if (dvp->v_type != VDIR) {
1970                               return (ENOTDIR);
1971                     } else if (zdp->z_sa_hdl == NULL) {
1972                               return (SET_ERROR(EIO));
1973                     }
1974 
1975                     if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1976                               error = zfs_fastaccesschk_execute(zdp, cr);
1977                               if (!error) {
1978                                         *vpp = dvp;
1979                                         VN_HOLD(*vpp);
1980                                         return (0);
1981                               }
1982                               return (error);
1983                     } else {
1984                               vnode_t *tvp = dnlc_lookup(dvp, nm);
1985 
1986                               if (tvp) {
1987                                         error = zfs_fastaccesschk_execute(zdp, cr);
1988                                         if (error) {
1989                                                   VN_RELE(tvp);
1990                                                   return (error);
1991                                         }
1992                                         if (tvp == DNLC_NO_VNODE) {
1993                                                   VN_RELE(tvp);
1994                                                   return (ENOENT);
1995                                         } else {
1996                                                   *vpp = tvp;
1997                                                   return (specvp_check(vpp, cr));
1998                                         }
1999                               }
2000                     }
2001           }
2002 
2003           DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
2004 
2005           ZFS_ENTER(zfsvfs);
2006           ZFS_VERIFY_ZP(zdp);
2007 
2008           *vpp = NULL;
2009 
2010           if (flags & LOOKUP_XATTR) {
2011 #ifdef TODO
2012                     /*
2013                      * If the xattr property is off, refuse the lookup request.
2014                      */
2015                     if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
2016                               ZFS_EXIT(zfsvfs);
2017                               return (EINVAL);
2018                     }
2019 #endif
2020 
2021                     /*
2022                      * We don't allow recursive attributes..
2023                      * Maybe someday we will.
2024                      */
2025                     if (zdp->z_pflags & ZFS_XATTR) {
2026                               ZFS_EXIT(zfsvfs);
2027                               return (EINVAL);
2028                     }
2029 
2030                     if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
2031                               ZFS_EXIT(zfsvfs);
2032                               return (error);
2033                     }
2034 
2035                     /*
2036                      * Do we have permission to get into attribute directory?
2037                      */
2038                     if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
2039                         B_FALSE, cr)) {
2040                               VN_RELE(*vpp);
2041                               *vpp = NULL;
2042                     }
2043 
2044                     ZFS_EXIT(zfsvfs);
2045                     return (error);
2046           }
2047 
2048           if (dvp->v_type != VDIR) {
2049                     ZFS_EXIT(zfsvfs);
2050                     return (ENOTDIR);
2051           }
2052 
2053           /*
2054            * Check accessibility of directory.
2055            */
2056           if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
2057                     ZFS_EXIT(zfsvfs);
2058                     return (error);
2059           }
2060 
2061           if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
2062               NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2063                     ZFS_EXIT(zfsvfs);
2064                     return (EILSEQ);
2065           }
2066 
2067           /*
2068            * First handle the special cases.
2069            */
2070           if ((cnp->cn_flags & ISDOTDOT) != 0) {
2071                     /*
2072                      * If we are a snapshot mounted under .zfs, return
2073                      * the vp for the snapshot directory.
2074                      */
2075                     if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
2076                               ZFS_EXIT(zfsvfs);
2077                               error = zfsctl_snapshot(zfsvfs->z_parent, vpp);
2078 
2079                               return (error);
2080                     }
2081           }
2082           if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
2083                     ZFS_EXIT(zfsvfs);
2084                     if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
2085                               return (SET_ERROR(ENOTSUP));
2086                     error = zfsctl_root(zfsvfs, vpp);
2087                     return (error);
2088           }
2089 
2090           error = zfs_dirlook(zdp, nm, &zp);
2091           if (error == 0) {
2092                     *vpp = ZTOV(zp);
2093                     error = specvp_check(vpp, cr);
2094           }
2095 
2096           ZFS_EXIT(zfsvfs);
2097           return (error);
2098 }
2099 #endif
2100 
2101 /*
2102  * Attempt to create a new entry in a directory.  If the entry
2103  * already exists, truncate the file if permissible, else return
2104  * an error.  Return the vp of the created or trunc'd file.
2105  *
2106  *        IN:       dvp       - vnode of directory to put new file entry in.
2107  *                  name      - name of new file entry.
2108  *                  vap       - attributes of new file.
2109  *                  excl      - flag indicating exclusive or non-exclusive mode.
2110  *                  mode      - mode to open file with.
2111  *                  cr        - credentials of caller.
2112  *                  flag      - large file flag [UNUSED].
2113  *                  ct        - caller context
2114  *                  vsecp     - ACL to be set
2115  *
2116  *        OUT:      vpp       - vnode of created or trunc'd entry.
2117  *
2118  *        RETURN:   0 on success, error code on failure.
2119  *
2120  * Timestamps:
2121  *        dvp - ctime|mtime updated if new entry created
2122  *         vp - ctime|mtime always, atime if new
2123  */
2124 
2125 /* ARGSUSED */
2126 static int
zfs_create(vnode_t * dvp,char * name,vattr_t * vap,int excl,int mode,vnode_t ** vpp,cred_t * cr,kthread_t * td)2127 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
2128     vnode_t **vpp, cred_t *cr, kthread_t *td)
2129 {
2130           znode_t             *zp, *dzp = VTOZ(dvp);
2131           zfsvfs_t  *zfsvfs = dzp->z_zfsvfs;
2132           zilog_t             *zilog;
2133           objset_t  *os;
2134           dmu_tx_t  *tx;
2135           int                 error;
2136           ksid_t              *ksid;
2137           uid_t               uid;
2138           gid_t               gid = crgetgid(cr);
2139           zfs_acl_ids_t   acl_ids;
2140           boolean_t fuid_dirtied;
2141           void                *vsecp = NULL;
2142           int                 flag = 0;
2143           uint64_t  txtype;
2144 
2145           /*
2146            * If we have an ephemeral id, ACL, or XVATTR then
2147            * make sure file system is at proper version
2148            */
2149 
2150           ksid = crgetsid(cr, KSID_OWNER);
2151           if (ksid)
2152                     uid = ksid_getid(ksid);
2153           else
2154                     uid = crgetuid(cr);
2155 
2156           if (zfsvfs->z_use_fuids == B_FALSE &&
2157               (vsecp || (vap->va_mask & AT_XVATTR) ||
2158               IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2159                     return (SET_ERROR(EINVAL));
2160 
2161           ZFS_ENTER(zfsvfs);
2162           ZFS_VERIFY_ZP(dzp);
2163           os = zfsvfs->z_os;
2164           zilog = zfsvfs->z_log;
2165 
2166           if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
2167               NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2168                     ZFS_EXIT(zfsvfs);
2169                     return (SET_ERROR(EILSEQ));
2170           }
2171 
2172           if (vap->va_mask & AT_XVATTR) {
2173                     if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2174                         crgetuid(cr), cr, vap->va_type)) != 0) {
2175                               ZFS_EXIT(zfsvfs);
2176                               return (error);
2177                     }
2178           }
2179 
2180           *vpp = NULL;
2181 
2182           if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
2183                     vap->va_mode &= ~S_ISVTX;
2184 
2185           error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
2186           if (error) {
2187                     ZFS_EXIT(zfsvfs);
2188                     return (error);
2189           }
2190           ASSERT3P(zp, ==, NULL);
2191 
2192           /*
2193            * Create a new file object and update the directory
2194            * to reference it.
2195            */
2196           if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
2197                     goto out;
2198           }
2199 
2200           /*
2201            * We only support the creation of regular files in
2202            * extended attribute directories.
2203            */
2204 
2205           if ((dzp->z_pflags & ZFS_XATTR) &&
2206               (vap->va_type != VREG)) {
2207                     error = SET_ERROR(EINVAL);
2208                     goto out;
2209           }
2210 
2211           if ((error = zfs_acl_ids_create(dzp, 0, vap,
2212               cr, vsecp, &acl_ids)) != 0)
2213                     goto out;
2214 
2215           if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2216                     zfs_acl_ids_free(&acl_ids);
2217                     error = SET_ERROR(EDQUOT);
2218                     goto out;
2219           }
2220 
2221           getnewvnode_reserve(1);
2222 
2223           tx = dmu_tx_create(os);
2224 
2225           dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2226               ZFS_SA_BASE_ATTR_SIZE);
2227 
2228           fuid_dirtied = zfsvfs->z_fuid_dirty;
2229           if (fuid_dirtied)
2230                     zfs_fuid_txhold(zfsvfs, tx);
2231           dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2232           dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
2233           if (!zfsvfs->z_use_sa &&
2234               acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2235                     dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2236                         0, acl_ids.z_aclp->z_acl_bytes);
2237           }
2238           error = dmu_tx_assign(tx, TXG_WAIT);
2239           if (error) {
2240                     zfs_acl_ids_free(&acl_ids);
2241                     dmu_tx_abort(tx);
2242                     getnewvnode_drop_reserve();
2243                     ZFS_EXIT(zfsvfs);
2244                     return (error);
2245           }
2246           zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2247 
2248           if (fuid_dirtied)
2249                     zfs_fuid_sync(zfsvfs, tx);
2250 
2251           (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
2252           txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
2253           zfs_log_create(zilog, tx, txtype, dzp, zp, name,
2254               vsecp, acl_ids.z_fuidp, vap);
2255           zfs_acl_ids_free(&acl_ids);
2256           dmu_tx_commit(tx);
2257 
2258           getnewvnode_drop_reserve();
2259 
2260 out:
2261           if (error == 0) {
2262                     *vpp = ZTOV(zp);
2263           }
2264 
2265           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2266                     zil_commit(zilog, 0);
2267 
2268           ZFS_EXIT(zfsvfs);
2269           return (error);
2270 }
2271 
2272 /*
2273  * Remove an entry from a directory.
2274  *
2275  *        IN:       dvp       - vnode of directory to remove entry from.
2276  *                  name      - name of entry to remove.
2277  *                  cr        - credentials of caller.
2278  *                  ct        - caller context
2279  *                  flags     - case flags
2280  *
2281  *        RETURN:   0 on success, error code on failure.
2282  *
2283  * Timestamps:
2284  *        dvp - ctime|mtime
2285  *         vp - ctime (if nlink > 0)
2286  */
2287 
2288 /*ARGSUSED*/
2289 static int
zfs_remove(vnode_t * dvp,vnode_t * vp,char * name,cred_t * cr)2290 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2291 {
2292           znode_t             *dzp = VTOZ(dvp);
2293           znode_t             *zp = VTOZ(vp);
2294           znode_t             *xzp;
2295           zfsvfs_t  *zfsvfs = dzp->z_zfsvfs;
2296           zilog_t             *zilog;
2297           uint64_t  acl_obj, xattr_obj;
2298           uint64_t  obj = 0;
2299           dmu_tx_t  *tx;
2300           boolean_t unlinked, toobig = FALSE;
2301           uint64_t  txtype;
2302           int                 error;
2303 
2304           ZFS_ENTER(zfsvfs);
2305           ZFS_VERIFY_ZP(dzp);
2306           ZFS_VERIFY_ZP(zp);
2307           zilog = zfsvfs->z_log;
2308           zp = VTOZ(vp);
2309 
2310           xattr_obj = 0;
2311           xzp = NULL;
2312 
2313           if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2314                     goto out;
2315           }
2316 
2317           /*
2318            * Need to use rmdir for removing directories.
2319            */
2320           if (vp->v_type == VDIR) {
2321                     error = SET_ERROR(EPERM);
2322                     goto out;
2323           }
2324 
2325           vnevent_remove(vp, dvp, name, ct);
2326 
2327           obj = zp->z_id;
2328 
2329           /* are there any extended attributes? */
2330           error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2331               &xattr_obj, sizeof (xattr_obj));
2332           if (error == 0 && xattr_obj) {
2333                     error = zfs_zget(zfsvfs, xattr_obj, &xzp);
2334                     ASSERT0(error);
2335           }
2336 
2337           /*
2338            * We may delete the znode now, or we may put it in the unlinked set;
2339            * it depends on whether we're the last link, and on whether there are
2340            * other holds on the vnode.  So we dmu_tx_hold() the right things to
2341            * allow for either case.
2342            */
2343           tx = dmu_tx_create(zfsvfs->z_os);
2344           dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2345           dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2346           zfs_sa_upgrade_txholds(tx, zp);
2347           zfs_sa_upgrade_txholds(tx, dzp);
2348 
2349           if (xzp) {
2350                     dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2351                     dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
2352           }
2353 
2354           /* charge as an update -- would be nice not to charge at all */
2355           dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2356 
2357           /*
2358            * Mark this transaction as typically resulting in a net free of space
2359            */
2360           dmu_tx_mark_netfree(tx);
2361 
2362           error = dmu_tx_assign(tx, TXG_WAIT);
2363           if (error) {
2364                     dmu_tx_abort(tx);
2365                     ZFS_EXIT(zfsvfs);
2366                     return (error);
2367           }
2368 
2369           /*
2370            * Remove the directory entry.
2371            */
2372           error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2373 
2374           if (error) {
2375                     dmu_tx_commit(tx);
2376                     goto out;
2377           }
2378 
2379           if (unlinked) {
2380                     zfs_unlinked_add(zp, tx);
2381                     vp->v_vflag |= VV_NOSYNC;
2382           }
2383 
2384           txtype = TX_REMOVE;
2385           zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2386 
2387           dmu_tx_commit(tx);
2388 out:
2389 
2390           if (xzp)
2391                     vrele(ZTOV(xzp));
2392 
2393           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2394                     zil_commit(zilog, 0);
2395 
2396           ZFS_EXIT(zfsvfs);
2397           return (error);
2398 }
2399 
2400 /*
2401  * Create a new directory and insert it into dvp using the name
2402  * provided.  Return a pointer to the inserted directory.
2403  *
2404  *        IN:       dvp       - vnode of directory to add subdir to.
2405  *                  dirname   - name of new directory.
2406  *                  vap       - attributes of new directory.
2407  *                  cr        - credentials of caller.
2408  *                  ct        - caller context
2409  *                  flags     - case flags
2410  *                  vsecp     - ACL to be set
2411  *
2412  *        OUT:      vpp       - vnode of created directory.
2413  *
2414  *        RETURN:   0 on success, error code on failure.
2415  *
2416  * Timestamps:
2417  *        dvp - ctime|mtime updated
2418  *         vp - ctime|mtime|atime updated
2419  */
2420 /*ARGSUSED*/
2421 static int
zfs_mkdir(vnode_t * dvp,char * dirname,vattr_t * vap,vnode_t ** vpp,cred_t * cr)2422 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2423 {
2424           znode_t             *zp, *dzp = VTOZ(dvp);
2425           zfsvfs_t  *zfsvfs = dzp->z_zfsvfs;
2426           zilog_t             *zilog;
2427           uint64_t  txtype;
2428           dmu_tx_t  *tx;
2429           int                 error;
2430           ksid_t              *ksid;
2431           uid_t               uid;
2432           gid_t               gid = crgetgid(cr);
2433           zfs_acl_ids_t   acl_ids;
2434           boolean_t fuid_dirtied;
2435 
2436           ASSERT(vap->va_type == VDIR);
2437 
2438           /*
2439            * If we have an ephemeral id, ACL, or XVATTR then
2440            * make sure file system is at proper version
2441            */
2442 
2443           ksid = crgetsid(cr, KSID_OWNER);
2444           if (ksid)
2445                     uid = ksid_getid(ksid);
2446           else
2447                     uid = crgetuid(cr);
2448           if (zfsvfs->z_use_fuids == B_FALSE &&
2449               ((vap->va_mask & AT_XVATTR) ||
2450               IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2451                     return (SET_ERROR(EINVAL));
2452 
2453           ZFS_ENTER(zfsvfs);
2454           ZFS_VERIFY_ZP(dzp);
2455           zilog = zfsvfs->z_log;
2456 
2457           if (dzp->z_pflags & ZFS_XATTR) {
2458                     ZFS_EXIT(zfsvfs);
2459                     return (SET_ERROR(EINVAL));
2460           }
2461 
2462           if (zfsvfs->z_utf8 && u8_validate(dirname,
2463               strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2464                     ZFS_EXIT(zfsvfs);
2465                     return (SET_ERROR(EILSEQ));
2466           }
2467 
2468           if (vap->va_mask & AT_XVATTR) {
2469                     if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2470                         crgetuid(cr), cr, vap->va_type)) != 0) {
2471                               ZFS_EXIT(zfsvfs);
2472                               return (error);
2473                     }
2474           }
2475 
2476           if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2477               NULL, &acl_ids)) != 0) {
2478                     ZFS_EXIT(zfsvfs);
2479                     return (error);
2480           }
2481 
2482           /*
2483            * First make sure the new directory doesn't exist.
2484            *
2485            * Existence is checked first to make sure we don't return
2486            * EACCES instead of EEXIST which can cause some applications
2487            * to fail.
2488            */
2489           *vpp = NULL;
2490 
2491           if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2492                     zfs_acl_ids_free(&acl_ids);
2493                     ZFS_EXIT(zfsvfs);
2494                     return (error);
2495           }
2496           ASSERT3P(zp, ==, NULL);
2497 
2498           if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2499                     zfs_acl_ids_free(&acl_ids);
2500                     ZFS_EXIT(zfsvfs);
2501                     return (error);
2502           }
2503 
2504           if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2505                     zfs_acl_ids_free(&acl_ids);
2506                     ZFS_EXIT(zfsvfs);
2507                     return (SET_ERROR(EDQUOT));
2508           }
2509 
2510           /*
2511            * Add a new entry to the directory.
2512            */
2513           getnewvnode_reserve(1);
2514           tx = dmu_tx_create(zfsvfs->z_os);
2515           dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2516           dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2517           fuid_dirtied = zfsvfs->z_fuid_dirty;
2518           if (fuid_dirtied)
2519                     zfs_fuid_txhold(zfsvfs, tx);
2520           if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2521                     dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2522                         acl_ids.z_aclp->z_acl_bytes);
2523           }
2524 
2525           dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2526               ZFS_SA_BASE_ATTR_SIZE);
2527 
2528           error = dmu_tx_assign(tx, TXG_WAIT);
2529           if (error) {
2530                     zfs_acl_ids_free(&acl_ids);
2531                     dmu_tx_abort(tx);
2532                     getnewvnode_drop_reserve();
2533                     ZFS_EXIT(zfsvfs);
2534                     return (error);
2535           }
2536 
2537           /*
2538            * Create new node.
2539            */
2540           zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2541 
2542           if (fuid_dirtied)
2543                     zfs_fuid_sync(zfsvfs, tx);
2544 
2545           /*
2546            * Now put new name in parent dir.
2547            */
2548           (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2549 
2550           *vpp = ZTOV(zp);
2551 
2552           txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2553           zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2554               acl_ids.z_fuidp, vap);
2555 
2556           zfs_acl_ids_free(&acl_ids);
2557 
2558           dmu_tx_commit(tx);
2559 
2560           getnewvnode_drop_reserve();
2561 
2562           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2563                     zil_commit(zilog, 0);
2564 
2565           ZFS_EXIT(zfsvfs);
2566           return (0);
2567 }
2568 
2569 /*
2570  * Remove a directory subdir entry.  If the current working
2571  * directory is the same as the subdir to be removed, the
2572  * remove will fail.
2573  *
2574  *        IN:       dvp       - vnode of directory to remove from.
2575  *                  name      - name of directory to be removed.
2576  *                  cwd       - vnode of current working directory.
2577  *                  cr        - credentials of caller.
2578  *                  ct        - caller context
2579  *                  flags     - case flags
2580  *
2581  *        RETURN:   0 on success, error code on failure.
2582  *
2583  * Timestamps:
2584  *        dvp - ctime|mtime updated
2585  */
2586 /*ARGSUSED*/
2587 static int
zfs_rmdir(vnode_t * dvp,vnode_t * vp,char * name,cred_t * cr)2588 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2589 {
2590           znode_t             *dzp = VTOZ(dvp);
2591           znode_t             *zp = VTOZ(vp);
2592           zfsvfs_t  *zfsvfs = dzp->z_zfsvfs;
2593           zilog_t             *zilog;
2594           dmu_tx_t  *tx;
2595           int                 error;
2596 
2597           ZFS_ENTER(zfsvfs);
2598           ZFS_VERIFY_ZP(dzp);
2599           ZFS_VERIFY_ZP(zp);
2600           zilog = zfsvfs->z_log;
2601 
2602 
2603           if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2604                     goto out;
2605           }
2606 
2607           if (vp->v_type != VDIR) {
2608                     error = SET_ERROR(ENOTDIR);
2609                     goto out;
2610           }
2611 
2612           vnevent_rmdir(vp, dvp, name, ct);
2613 
2614           tx = dmu_tx_create(zfsvfs->z_os);
2615           dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2616           dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2617           dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2618           zfs_sa_upgrade_txholds(tx, zp);
2619           zfs_sa_upgrade_txholds(tx, dzp);
2620           dmu_tx_mark_netfree(tx);
2621           error = dmu_tx_assign(tx, TXG_WAIT);
2622           if (error) {
2623                     dmu_tx_abort(tx);
2624                     ZFS_EXIT(zfsvfs);
2625                     return (error);
2626           }
2627 
2628           cache_purge(dvp);
2629 
2630           error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2631 
2632           if (error == 0) {
2633                     uint64_t txtype = TX_RMDIR;
2634                     zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2635           }
2636 
2637           dmu_tx_commit(tx);
2638 
2639           cache_purge(vp);
2640 out:
2641           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2642                     zil_commit(zilog, 0);
2643 
2644           ZFS_EXIT(zfsvfs);
2645           return (error);
2646 }
2647 
2648 /*
2649  * Read as many directory entries as will fit into the provided
2650  * buffer from the given directory cursor position (specified in
2651  * the uio structure).
2652  *
2653  *        IN:       vp        - vnode of directory to read.
2654  *                  uio       - structure supplying read location, range info,
2655  *                              and return buffer.
2656  *                  cr        - credentials of caller.
2657  *                  ct        - caller context
2658  *                  flags     - case flags
2659  *
2660  *        OUT:      uio       - updated offset and range, buffer filled.
2661  *                  eofp      - set to true if end-of-file detected.
2662  *
2663  *        RETURN:   0 on success, error code on failure.
2664  *
2665  * Timestamps:
2666  *        vp - atime updated
2667  *
2668  * Note that the low 4 bits of the cookie returned by zap is always zero.
2669  * This allows us to use the low range for "special" directory entries:
2670  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2671  * we use the offset 2 for the '.zfs' directory.
2672  */
2673 /* ARGSUSED */
2674 static int
zfs_readdir(vnode_t * vp,uio_t * uio,cred_t * cr,int * eofp,int * ncookies,off_t ** cookies)2675 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, off_t **cookies)
2676 {
2677           znode_t             *zp = VTOZ(vp);
2678           iovec_t             *iovp;
2679           edirent_t *eodp;
2680           dirent64_t          *odp;
2681           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
2682           objset_t  *os;
2683           caddr_t             outbuf;
2684           size_t              bufsize;
2685           zap_cursor_t        zc;
2686           zap_attribute_t     zap;
2687           uint_t              bytes_wanted;
2688           uint64_t  offset; /* must be unsigned; checks for < 1 */
2689           uint64_t  parent;
2690           int                 local_eof;
2691           int                 outcount;
2692           int                 error;
2693           uint8_t             prefetch;
2694           boolean_t check_sysattrs;
2695           uint8_t             type;
2696           int                 ncooks = 0;
2697           off_t               *cooks = NULL;
2698           int                 flags = 0;
2699 #ifdef __FreeBSD__
2700           boolean_t user = uio->uio_segflg != UIO_SYSSPACE;
2701 #endif
2702 #ifdef __NetBSD__
2703           boolean_t user = !VMSPACE_IS_KERNEL_P(uio->uio_vmspace);
2704 #endif
2705 
2706           ZFS_ENTER(zfsvfs);
2707           ZFS_VERIFY_ZP(zp);
2708 
2709           if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2710               &parent, sizeof (parent))) != 0) {
2711                     ZFS_EXIT(zfsvfs);
2712                     return (error);
2713           }
2714 
2715           /*
2716            * If we are not given an eof variable,
2717            * use a local one.
2718            */
2719           if (eofp == NULL)
2720                     eofp = &local_eof;
2721 
2722           /*
2723            * Check for valid iov_len.
2724            */
2725           if (uio->uio_iov->iov_len <= 0) {
2726                     ZFS_EXIT(zfsvfs);
2727                     return (SET_ERROR(EINVAL));
2728           }
2729 
2730           /*
2731            * Quit if directory has been removed (posix)
2732            */
2733           if ((*eofp = zp->z_unlinked) != 0) {
2734                     ZFS_EXIT(zfsvfs);
2735                     return (0);
2736           }
2737 
2738           error = 0;
2739           os = zfsvfs->z_os;
2740           offset = uio->uio_loffset;
2741           prefetch = zp->z_zn_prefetch;
2742 
2743           /*
2744            * Initialize the iterator cursor.
2745            */
2746           if (offset <= 3) {
2747                     /*
2748                      * Start iteration from the beginning of the directory.
2749                      */
2750                     zap_cursor_init(&zc, os, zp->z_id);
2751           } else {
2752                     /*
2753                      * The offset is a serialized cursor.
2754                      */
2755                     zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2756           }
2757 
2758           /*
2759            * Get space to change directory entries into fs independent format.
2760            */
2761           iovp = uio->uio_iov;
2762           bytes_wanted = iovp->iov_len;
2763           if (user || uio->uio_iovcnt != 1) {
2764                     bufsize = bytes_wanted;
2765                     outbuf = kmem_alloc(bufsize, KM_SLEEP);
2766                     odp = (struct dirent64 *)outbuf;
2767           } else {
2768                     bufsize = bytes_wanted;
2769                     outbuf = NULL;
2770                     odp = (struct dirent64 *)iovp->iov_base;
2771           }
2772           eodp = (struct edirent *)odp;
2773 
2774           if (ncookies != NULL) {
2775                     /*
2776                      * Minimum entry size is dirent size and 1 byte for a file name.
2777                      */
2778 #ifdef __FreeBSD__
2779                     ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2780                     cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2781 #endif
2782 #ifdef __NetBSD__
2783                     ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
2784                     cooks = malloc(ncooks * sizeof(off_t), M_TEMP, M_WAITOK);
2785 #endif
2786                     *cookies = cooks;
2787                     *ncookies = ncooks;
2788           }
2789 
2790           /*
2791            * If this VFS supports the system attribute view interface; and
2792            * we're looking at an extended attribute directory; and we care
2793            * about normalization conflicts on this vfs; then we must check
2794            * for normalization conflicts with the sysattr name space.
2795            */
2796 #ifdef TODO
2797           check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2798               (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2799               (flags & V_RDDIR_ENTFLAGS);
2800 #else
2801           check_sysattrs = 0;
2802 #endif
2803 
2804           /*
2805            * Transform to file-system independent format
2806            */
2807           outcount = 0;
2808           while (outcount < bytes_wanted) {
2809                     ino64_t objnum;
2810                     ushort_t reclen;
2811                     off64_t *next = NULL;
2812 
2813                     /*
2814                      * Special case `.', `..', and `.zfs'.
2815                      */
2816                     if (offset == 0) {
2817                               (void) strcpy(zap.za_name, ".");
2818                               zap.za_normalization_conflict = 0;
2819                               objnum = zp->z_id;
2820                               type = DT_DIR;
2821                     } else if (offset == 1) {
2822                               (void) strcpy(zap.za_name, "..");
2823                               zap.za_normalization_conflict = 0;
2824                               objnum = parent;
2825                               type = DT_DIR;
2826                     } else if (offset == 2 && zfs_show_ctldir(zp)) {
2827                               (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2828                               zap.za_normalization_conflict = 0;
2829                               objnum = ZFSCTL_INO_ROOT;
2830                               type = DT_DIR;
2831                     } else {
2832                               /*
2833                                * Grab next entry.
2834                                */
2835                               if (error = zap_cursor_retrieve(&zc, &zap)) {
2836                                         if ((*eofp = (error == ENOENT)) != 0)
2837                                                   break;
2838                                         else
2839                                                   goto update;
2840                               }
2841 
2842                               if (zap.za_integer_length != 8 ||
2843                                   zap.za_num_integers != 1) {
2844                                         cmn_err(CE_WARN, "zap_readdir: bad directory "
2845                                             "entry, obj = %lld, offset = %lld\n",
2846                                             (u_longlong_t)zp->z_id,
2847                                             (u_longlong_t)offset);
2848                                         error = SET_ERROR(ENXIO);
2849                                         goto update;
2850                               }
2851 
2852                               objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2853                               /*
2854                                * MacOS X can extract the object type here such as:
2855                                * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2856                                */
2857                               type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2858 
2859                               if (check_sysattrs && !zap.za_normalization_conflict) {
2860 #ifdef TODO
2861                                         zap.za_normalization_conflict =
2862                                             xattr_sysattr_casechk(zap.za_name);
2863 #else
2864                                         panic("%s:%u: TODO", __func__, __LINE__);
2865 #endif
2866                               }
2867                     }
2868 
2869                     if (flags & V_RDDIR_ACCFILTER) {
2870                               /*
2871                                * If we have no access at all, don't include
2872                                * this entry in the returned information
2873                                */
2874                               znode_t   *ezp;
2875                               if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2876                                         goto skip_entry;
2877                               if (!zfs_has_access(ezp, cr)) {
2878                                         vrele(ZTOV(ezp));
2879                                         goto skip_entry;
2880                               }
2881                               vrele(ZTOV(ezp));
2882                     }
2883 
2884                     if (flags & V_RDDIR_ENTFLAGS)
2885                               reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2886                     else
2887                               reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2888 
2889                     /*
2890                      * Will this entry fit in the buffer?
2891                      */
2892                     if (outcount + reclen > bufsize) {
2893                               /*
2894                                * Did we manage to fit anything in the buffer?
2895                                */
2896                               if (!outcount) {
2897                                         error = SET_ERROR(EINVAL);
2898                                         goto update;
2899                               }
2900                               break;
2901                     }
2902                     if (flags & V_RDDIR_ENTFLAGS) {
2903                               /*
2904                                * Add extended flag entry:
2905                                */
2906                               eodp->ed_ino = objnum;
2907                               eodp->ed_reclen = reclen;
2908                               /* NOTE: ed_off is the offset for the *next* entry */
2909                               next = &(eodp->ed_off);
2910                               eodp->ed_eflags = zap.za_normalization_conflict ?
2911                                   ED_CASE_CONFLICT : 0;
2912                               (void) strncpy(eodp->ed_name, zap.za_name,
2913                                   EDIRENT_NAMELEN(reclen));
2914                               eodp = (edirent_t *)((intptr_t)eodp + reclen);
2915                     } else {
2916                               /*
2917                                * Add normal entry:
2918                                */
2919                               odp->d_ino = objnum;
2920                               odp->d_reclen = reclen;
2921                               odp->d_namlen = strlen(zap.za_name);
2922                               (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2923                               odp->d_type = type;
2924                               odp = (dirent64_t *)((intptr_t)odp + reclen);
2925                     }
2926                     outcount += reclen;
2927 
2928                     ASSERT(outcount <= bufsize);
2929 
2930                     /* Prefetch znode */
2931                     if (prefetch)
2932                               dmu_prefetch(os, objnum, 0, 0, 0,
2933                                   ZIO_PRIORITY_SYNC_READ);
2934 
2935           skip_entry:
2936                     /*
2937                      * Move to the next entry, fill in the previous offset.
2938                      */
2939                     if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2940                               zap_cursor_advance(&zc);
2941                               offset = zap_cursor_serialize(&zc);
2942                     } else {
2943                               offset += 1;
2944                     }
2945 
2946                     if (cooks != NULL) {
2947                               *cooks++ = offset;
2948                               ncooks--;
2949 #ifdef __FreeBSD__
2950                               KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2951 #endif
2952 #ifdef __NetBSD__
2953                               KASSERTMSG(ncooks >= 0, "ncooks=%d", ncooks);
2954 #endif
2955                     }
2956           }
2957           zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2958 
2959           /* Subtract unused cookies */
2960           if (ncookies != NULL)
2961                     *ncookies -= ncooks;
2962 
2963           if (!user && uio->uio_iovcnt == 1) {
2964                     iovp->iov_base += outcount;
2965                     iovp->iov_len -= outcount;
2966                     uio->uio_resid -= outcount;
2967           } else if (error = uiomove(outbuf, (size_t)outcount, UIO_READ, uio)) {
2968                     /*
2969                      * Reset the pointer.
2970                      */
2971                     offset = uio->uio_loffset;
2972           }
2973 
2974 update:
2975           zap_cursor_fini(&zc);
2976           if (user || uio->uio_iovcnt != 1)
2977                     kmem_free(outbuf, bufsize);
2978 
2979           if (error == ENOENT)
2980                     error = 0;
2981 
2982           ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2983 
2984           uio->uio_loffset = offset;
2985           ZFS_EXIT(zfsvfs);
2986           if (error != 0 && cookies != NULL) {
2987 #ifdef __FreeBSD__
2988                     free(*cookies, M_TEMP);
2989 #endif
2990 #ifdef __NetBSD__
2991                     kmem_free(*cookies, ncooks * sizeof(off_t));
2992 #endif
2993                     *cookies = NULL;
2994                     *ncookies = 0;
2995           }
2996           return (error);
2997 }
2998 
/*
 * Value that zfs_fsync() stores in the calling thread's thread-specific
 * data under zfs_fsyncer_key on every call.
 * NOTE(review): presumably consumed elsewhere as a per-thread hint that
 * the thread has recently called fsync -- confirm against the readers of
 * zfs_fsyncer_key.
 */
ulong_t zfs_fsync_sync_cnt = 4;
3000 
3001 static int
zfs_fsync(vnode_t * vp,int syncflag,cred_t * cr,caller_context_t * ct)3002 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
3003 {
3004           znode_t   *zp = VTOZ(vp);
3005           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3006 
3007           (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
3008 
3009           if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
3010                     ZFS_ENTER(zfsvfs);
3011                     ZFS_VERIFY_ZP(zp);
3012 
3013 #ifdef __NetBSD__
3014                     if (!zp->z_unlinked)
3015 #endif
3016                     zil_commit(zfsvfs->z_log, zp->z_id);
3017                     ZFS_EXIT(zfsvfs);
3018           }
3019           return (0);
3020 }
3021 
3022 
3023 /*
3024  * Get the requested file attributes and place them in the provided
3025  * vattr structure.
3026  *
3027  *        IN:       vp        - vnode of file.
3028  *                  vap       - va_mask identifies requested attributes.
3029  *                              If AT_XVATTR set, then optional attrs are requested
3030  *                  flags     - ATTR_NOACLCHECK (CIFS server context)
3031  *                  cr        - credentials of caller.
3032  *                  ct        - caller context
3033  *
3034  *        OUT:      vap       - attribute values.
3035  *
 *	RETURN:	0 on success, error code on failure.
3037  */
3038 /* ARGSUSED */
3039 static int
zfs_getattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr,caller_context_t * ct)3040 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3041     caller_context_t *ct)
3042 {
3043           znode_t *zp = VTOZ(vp);
3044           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3045           int       error = 0;
3046           uint32_t blksize;
3047           u_longlong_t nblocks;
3048           uint64_t links;
3049           uint64_t mtime[2], ctime[2], crtime[2], rdev;
3050           xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
3051           xoptattr_t *xoap = NULL;
3052           boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3053           sa_bulk_attr_t bulk[4];
3054           int count = 0;
3055 
3056           ZFS_ENTER(zfsvfs);
3057           ZFS_VERIFY_ZP(zp);
3058 
3059           zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
3060 
3061           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3062           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3063           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
3064           if (vp->v_type == VBLK || vp->v_type == VCHR)
3065                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
3066                         &rdev, 8);
3067 
3068           if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
3069                     ZFS_EXIT(zfsvfs);
3070                     return (error);
3071           }
3072 
3073           /*
3074            * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
3075            * Also, if we are the owner don't bother, since owner should
3076            * always be allowed to read basic attributes of file.
3077            */
3078           if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
3079               (vap->va_uid != crgetuid(cr))) {
3080                     if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
3081                         skipaclchk, cr)) {
3082                               ZFS_EXIT(zfsvfs);
3083                               return (error);
3084                     }
3085           }
3086 
3087           /*
3088            * Return all attributes.  It's cheaper to provide the answer
3089            * than to determine whether we were asked the question.
3090            */
3091 
3092           vap->va_type = IFTOVT(zp->z_mode);
3093           vap->va_mode = zp->z_mode & ~S_IFMT;
3094 #ifdef illumos
3095           vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
3096 #endif
3097 #ifdef __FreeBSD__
3098           vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
3099           vap->va_nodeid = zp->z_id;
3100 #endif
3101 #ifdef __NetBSD__
3102           vap->va_fsid = vp->v_mount->mnt_stat.f_fsid;
3103           vap->va_nodeid = zp->z_id;
3104           /*
3105            * If we are a snapshot mounted under .zfs, return
3106            * the object id of the snapshot to make getcwd happy.
3107            */
3108           if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
3109                     vnode_t *cvp = vp->v_mount->mnt_vnodecovered;
3110 
3111                     if (cvp && zfsctl_is_node(cvp))
3112                               vap->va_nodeid = dmu_objset_id(zfsvfs->z_os);
3113           }
3114 #endif
3115           if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
3116                     links = zp->z_links + 1;
3117           else
3118                     links = zp->z_links;
3119           /* XXX NetBSD: use LINK_MAX when that value matches 32-bit nlink_t */
3120           vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
3121           vap->va_size = zp->z_size;
3122 #ifdef illumos
3123           vap->va_rdev = vp->v_rdev;
3124 #else
3125           if (vp->v_type == VBLK || vp->v_type == VCHR)
3126                     vap->va_rdev = zfs_cmpldev(rdev);
3127 #endif
3128           vap->va_seq = zp->z_seq;
3129           vap->va_flags = 0;  /* FreeBSD: Reset chflags(2) flags. */
3130           vap->va_filerev = zp->z_seq;
3131 
3132           /*
3133            * Add in any requested optional attributes and the create time.
3134            * Also set the corresponding bits in the returned attribute bitmap.
3135            */
3136           if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
3137                     if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
3138                               xoap->xoa_archive =
3139                                   ((zp->z_pflags & ZFS_ARCHIVE) != 0);
3140                               XVA_SET_RTN(xvap, XAT_ARCHIVE);
3141                     }
3142 
3143                     if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
3144                               xoap->xoa_readonly =
3145                                   ((zp->z_pflags & ZFS_READONLY) != 0);
3146                               XVA_SET_RTN(xvap, XAT_READONLY);
3147                     }
3148 
3149                     if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
3150                               xoap->xoa_system =
3151                                   ((zp->z_pflags & ZFS_SYSTEM) != 0);
3152                               XVA_SET_RTN(xvap, XAT_SYSTEM);
3153                     }
3154 
3155                     if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
3156                               xoap->xoa_hidden =
3157                                   ((zp->z_pflags & ZFS_HIDDEN) != 0);
3158                               XVA_SET_RTN(xvap, XAT_HIDDEN);
3159                     }
3160 
3161                     if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3162                               xoap->xoa_nounlink =
3163                                   ((zp->z_pflags & ZFS_NOUNLINK) != 0);
3164                               XVA_SET_RTN(xvap, XAT_NOUNLINK);
3165                     }
3166 
3167                     if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3168                               xoap->xoa_immutable =
3169                                   ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
3170                               XVA_SET_RTN(xvap, XAT_IMMUTABLE);
3171                     }
3172 
3173                     if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3174                               xoap->xoa_appendonly =
3175                                   ((zp->z_pflags & ZFS_APPENDONLY) != 0);
3176                               XVA_SET_RTN(xvap, XAT_APPENDONLY);
3177                     }
3178 
3179                     if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3180                               xoap->xoa_nodump =
3181                                   ((zp->z_pflags & ZFS_NODUMP) != 0);
3182                               XVA_SET_RTN(xvap, XAT_NODUMP);
3183                     }
3184 
3185                     if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
3186                               xoap->xoa_opaque =
3187                                   ((zp->z_pflags & ZFS_OPAQUE) != 0);
3188                               XVA_SET_RTN(xvap, XAT_OPAQUE);
3189                     }
3190 
3191                     if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3192                               xoap->xoa_av_quarantined =
3193                                   ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
3194                               XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
3195                     }
3196 
3197                     if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3198                               xoap->xoa_av_modified =
3199                                   ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
3200                               XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
3201                     }
3202 
3203                     if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
3204                         vp->v_type == VREG) {
3205                               zfs_sa_get_scanstamp(zp, xvap);
3206                     }
3207 
3208                     if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3209                               xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
3210                               XVA_SET_RTN(xvap, XAT_REPARSE);
3211                     }
3212                     if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
3213                               xoap->xoa_generation = zp->z_gen;
3214                               XVA_SET_RTN(xvap, XAT_GEN);
3215                     }
3216 
3217                     if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
3218                               xoap->xoa_offline =
3219                                   ((zp->z_pflags & ZFS_OFFLINE) != 0);
3220                               XVA_SET_RTN(xvap, XAT_OFFLINE);
3221                     }
3222 
3223                     if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
3224                               xoap->xoa_sparse =
3225                                   ((zp->z_pflags & ZFS_SPARSE) != 0);
3226                               XVA_SET_RTN(xvap, XAT_SPARSE);
3227                     }
3228           }
3229 
3230           ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
3231           ZFS_TIME_DECODE(&vap->va_mtime, mtime);
3232           ZFS_TIME_DECODE(&vap->va_ctime, ctime);
3233           ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
3234 
3235 
3236           sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
3237           vap->va_blksize = blksize;
3238           vap->va_bytes = nblocks << 9; /* nblocks * 512 */
3239 
3240           if (zp->z_blksz == 0) {
3241                     /*
3242                      * Block size hasn't been set; suggest maximal I/O transfers.
3243                      */
3244                     vap->va_blksize = zfsvfs->z_max_blksz;
3245           }
3246 
3247           ZFS_EXIT(zfsvfs);
3248           return (0);
3249 }
3250 
3251 /*
3252  * Set the file attributes to the values contained in the
3253  * vattr structure.
3254  *
3255  *        IN:       vp        - vnode of file to be modified.
3256  *                  vap       - new attribute values.
3257  *                              If AT_XVATTR set, then optional attrs are being set
3258  *                  flags     - ATTR_UTIME set if non-default time values provided.
3259  *                            - ATTR_NOACLCHECK (CIFS context only).
3260  *                  cr        - credentials of caller.
3261  *                  ct        - caller context
3262  *
3263  *        RETURN:   0 on success, error code on failure.
3264  *
3265  * Timestamps:
3266  *        vp - ctime updated, mtime updated if size changed.
3267  */
3268 /* ARGSUSED */
3269 static int
zfs_setattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr,caller_context_t * ct)3270 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3271     caller_context_t *ct)
3272 {
3273           znode_t             *zp = VTOZ(vp);
3274           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
3275           zilog_t             *zilog;
3276           dmu_tx_t  *tx;
3277           vattr_t             oldva;
3278           xvattr_t  tmpxvattr;
3279           uint_t              mask = vap->va_mask;
3280           uint_t              saved_mask = 0;
3281           uint64_t  saved_mode;
3282           int                 trim_mask = 0;
3283           uint64_t  new_mode;
3284           uint64_t  new_uid, new_gid;
3285           uint64_t  xattr_obj;
3286           uint64_t  mtime[2], ctime[2];
3287           znode_t             *attrzp;
3288           int                 need_policy = FALSE;
3289           int                 err, err2;
3290           zfs_fuid_info_t *fuidp = NULL;
3291           xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
3292           xoptattr_t          *xoap;
3293           zfs_acl_t *aclp;
3294           boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3295           boolean_t fuid_dirtied = B_FALSE;
3296           sa_bulk_attr_t      bulk[7], xattr_bulk[7];
3297           int                 count = 0, xattr_count = 0;
3298 
3299           if (mask == 0)
3300                     return (0);
3301 
3302           if (mask & AT_NOSET)
3303                     return (SET_ERROR(EINVAL));
3304 
3305           ZFS_ENTER(zfsvfs);
3306           ZFS_VERIFY_ZP(zp);
3307 
3308           zilog = zfsvfs->z_log;
3309 
3310           /*
3311            * Make sure that if we have ephemeral uid/gid or xvattr specified
3312            * that file system is at proper version level
3313            */
3314 
3315           if (zfsvfs->z_use_fuids == B_FALSE &&
3316               (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3317               ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3318               (mask & AT_XVATTR))) {
3319                     ZFS_EXIT(zfsvfs);
3320                     return (SET_ERROR(EINVAL));
3321           }
3322 
3323           if (mask & AT_SIZE && vp->v_type == VDIR) {
3324                     ZFS_EXIT(zfsvfs);
3325                     return (SET_ERROR(EISDIR));
3326           }
3327 
3328           if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3329                     ZFS_EXIT(zfsvfs);
3330                     return (SET_ERROR(EINVAL));
3331           }
3332 
3333           /*
3334            * If this is an xvattr_t, then get a pointer to the structure of
3335            * optional attributes.  If this is NULL, then we have a vattr_t.
3336            */
3337           xoap = xva_getxoptattr(xvap);
3338 
3339           xva_init(&tmpxvattr);
3340 
3341           /*
3342            * Immutable files can only alter immutable bit and atime
3343            */
3344           if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3345               ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3346               ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3347                     ZFS_EXIT(zfsvfs);
3348                     return (SET_ERROR(EPERM));
3349           }
3350 
3351           if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3352                     ZFS_EXIT(zfsvfs);
3353                     return (SET_ERROR(EPERM));
3354           }
3355 
3356           /*
3357            * Verify timestamps doesn't overflow 32 bits.
3358            * ZFS can handle large timestamps, but 32bit syscalls can't
3359            * handle times greater than 2039.  This check should be removed
3360            * once large timestamps are fully supported.
3361            */
3362           if (mask & (AT_ATIME | AT_MTIME)) {
3363                     if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3364                         ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3365                               ZFS_EXIT(zfsvfs);
3366                               return (SET_ERROR(EOVERFLOW));
3367                     }
3368           }
3369           if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
3370               TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
3371                     ZFS_EXIT(zfsvfs);
3372                     return (SET_ERROR(EOVERFLOW));
3373           }
3374 
3375           attrzp = NULL;
3376           aclp = NULL;
3377 
3378           /* Can this be moved to before the top label? */
3379           if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3380                     ZFS_EXIT(zfsvfs);
3381                     return (SET_ERROR(EROFS));
3382           }
3383 
3384           /*
3385            * First validate permissions
3386            */
3387 
3388           if (mask & AT_SIZE) {
3389                     /*
3390                      * XXX - Note, we are not providing any open
3391                      * mode flags here (like FNDELAY), so we may
3392                      * block if there are locks present... this
3393                      * should be addressed in openat().
3394                      */
3395                     /* XXX - would it be OK to generate a log record here? */
3396                     err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3397                     if (err) {
3398                               ZFS_EXIT(zfsvfs);
3399                               return (err);
3400                     }
3401           }
3402 
3403           if (mask & (AT_ATIME|AT_MTIME) ||
3404               ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3405               XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3406               XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3407               XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3408               XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3409               XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3410               XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3411                     need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3412                         skipaclchk, cr);
3413           }
3414 
3415           if (mask & (AT_UID|AT_GID)) {
3416                     int       idmask = (mask & (AT_UID|AT_GID));
3417                     int       take_owner;
3418                     int       take_group;
3419 
3420                     /*
3421                      * NOTE: even if a new mode is being set,
3422                      * we may clear S_ISUID/S_ISGID bits.
3423                      */
3424 
3425                     if (!(mask & AT_MODE))
3426                               vap->va_mode = zp->z_mode;
3427 
3428                     /*
3429                      * Take ownership or chgrp to group we are a member of
3430                      */
3431 
3432                     take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3433                     take_group = (mask & AT_GID) &&
3434                         zfs_groupmember(zfsvfs, vap->va_gid, cr);
3435 
3436                     /*
3437                      * If both AT_UID and AT_GID are set then take_owner and
3438                      * take_group must both be set in order to allow taking
3439                      * ownership.
3440                      *
3441                      * Otherwise, send the check through secpolicy_vnode_setattr()
3442                      *
3443                      */
3444 
3445                     if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3446                         ((idmask == AT_UID) && take_owner) ||
3447                         ((idmask == AT_GID) && take_group)) {
3448                               if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3449                                   skipaclchk, cr) == 0) {
3450                                         /*
3451                                          * Remove setuid/setgid for non-privileged users
3452                                          */
3453                                         secpolicy_setid_clear(vap, vp, cr);
3454                                         trim_mask = (mask & (AT_UID|AT_GID));
3455                               } else {
3456                                         need_policy =  TRUE;
3457                               }
3458                     } else {
3459                               need_policy =  TRUE;
3460                     }
3461           }
3462 
3463           oldva.va_mode = zp->z_mode;
3464           zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3465           if (mask & AT_XVATTR) {
3466                     /*
3467                      * Update xvattr mask to include only those attributes
3468                      * that are actually changing.
3469                      *
3470                      * the bits will be restored prior to actually setting
3471                      * the attributes so the caller thinks they were set.
3472                      */
3473                     if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3474                               if (xoap->xoa_appendonly !=
3475                                   ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3476                                         need_policy = TRUE;
3477                               } else {
3478                                         XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3479                                         XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3480                               }
3481                     }
3482 
3483                     if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3484                               if (xoap->xoa_nounlink !=
3485                                   ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3486                                         need_policy = TRUE;
3487                               } else {
3488                                         XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3489                                         XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3490                               }
3491                     }
3492 
3493                     if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3494                               if (xoap->xoa_immutable !=
3495                                   ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3496                                         need_policy = TRUE;
3497                               } else {
3498                                         XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3499                                         XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3500                               }
3501                     }
3502 
3503                     if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3504                               if (xoap->xoa_nodump !=
3505                                   ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3506 #if 0
3507                                         /*
3508                                  * XXXSB - zfs_netbsd_setattr()
3509                                  * has already checked if this
3510                                  * request is authorised, and our
3511                                  * secpolicy_xvattr() doesn't check
3512                                  * kauth chflags.  Fix this when we
3513                                  * migrate to openzfs.
3514                                          */
3515                                         need_policy = TRUE;
3516 #endif
3517                               } else {
3518                                         XVA_CLR_REQ(xvap, XAT_NODUMP);
3519                                         XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3520                               }
3521                     }
3522 
3523                     if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3524                               if (xoap->xoa_av_modified !=
3525                                   ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3526                                         need_policy = TRUE;
3527                               } else {
3528                                         XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3529                                         XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3530                               }
3531                     }
3532 
3533                     if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3534                               if ((vp->v_type != VREG &&
3535                                   xoap->xoa_av_quarantined) ||
3536                                   xoap->xoa_av_quarantined !=
3537                                   ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3538                                         need_policy = TRUE;
3539                               } else {
3540                                         XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3541                                         XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3542                               }
3543                     }
3544 
3545                     if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3546                               ZFS_EXIT(zfsvfs);
3547                               return (SET_ERROR(EPERM));
3548                     }
3549 
3550                     if (need_policy == FALSE &&
3551                         (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3552                         XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3553                               need_policy = TRUE;
3554                     }
3555           }
3556 
3557           if (mask & AT_MODE) {
3558                     if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3559                               err = secpolicy_setid_setsticky_clear(vp, vap,
3560                                   &oldva, cr);
3561                               if (err) {
3562                                         ZFS_EXIT(zfsvfs);
3563                                         return (err);
3564                               }
3565                               trim_mask |= AT_MODE;
3566                     } else {
3567                               need_policy = TRUE;
3568                     }
3569           }
3570 
3571           if (need_policy) {
3572                     /*
3573                      * If trim_mask is set then take ownership
3574                      * has been granted or write_acl is present and user
3575                      * has the ability to modify mode.  In that case remove
3576                      * UID|GID and or MODE from mask so that
3577                      * secpolicy_vnode_setattr() doesn't revoke it.
3578                      */
3579 
3580                     if (trim_mask) {
3581                               saved_mask = vap->va_mask;
3582                               vap->va_mask &= ~trim_mask;
3583                               if (trim_mask & AT_MODE) {
3584                                         /*
3585                                          * Save the mode, as secpolicy_vnode_setattr()
3586                                          * will overwrite it with ova.va_mode.
3587                                          */
3588                                         saved_mode = vap->va_mode;
3589                               }
3590                     }
3591                     err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3592                         (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3593                     if (err) {
3594                               ZFS_EXIT(zfsvfs);
3595                               return (err);
3596                     }
3597 
3598                     if (trim_mask) {
3599                               vap->va_mask |= saved_mask;
3600                               if (trim_mask & AT_MODE) {
3601                                         /*
3602                                          * Recover the mode after
3603                                          * secpolicy_vnode_setattr().
3604                                          */
3605                                         vap->va_mode = saved_mode;
3606                               }
3607                     }
3608           }
3609 
3610           /*
3611            * secpolicy_vnode_setattr, or take ownership may have
3612            * changed va_mask
3613            */
3614           mask = vap->va_mask;
3615 
3616           if ((mask & (AT_UID | AT_GID))) {
3617                     err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3618                         &xattr_obj, sizeof (xattr_obj));
3619 
3620                     if (err == 0 && xattr_obj) {
3621                               err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3622                               if (err == 0) {
3623                                         err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3624                                         if (err != 0)
3625                                                   vrele(ZTOV(attrzp));
3626                               }
3627                               if (err)
3628                                         goto out2;
3629                     }
3630                     if (mask & AT_UID) {
3631                               new_uid = zfs_fuid_create(zfsvfs,
3632                                   (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3633                               if (new_uid != zp->z_uid &&
3634                                   zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3635                                         if (attrzp)
3636                                                   vput(ZTOV(attrzp));
3637                                         err = SET_ERROR(EDQUOT);
3638                                         goto out2;
3639                               }
3640                     }
3641 
3642                     if (mask & AT_GID) {
3643                               new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3644                                   cr, ZFS_GROUP, &fuidp);
3645                               if (new_gid != zp->z_gid &&
3646                                   zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3647                                         if (attrzp)
3648                                                   vput(ZTOV(attrzp));
3649                                         err = SET_ERROR(EDQUOT);
3650                                         goto out2;
3651                               }
3652                     }
3653           }
3654           tx = dmu_tx_create(zfsvfs->z_os);
3655 
3656           if (mask & AT_MODE) {
3657                     uint64_t pmode = zp->z_mode;
3658                     uint64_t acl_obj;
3659                     new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3660 
3661                     if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3662                         !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3663                               err = SET_ERROR(EPERM);
3664                               goto out;
3665                     }
3666 
3667                     if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3668                               goto out;
3669 
3670                     if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3671                               /*
3672                                * Are we upgrading ACL from old V0 format
3673                                * to V1 format?
3674                                */
3675                               if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3676                                   zfs_znode_acl_version(zp) ==
3677                                   ZFS_ACL_VERSION_INITIAL) {
3678                                         dmu_tx_hold_free(tx, acl_obj, 0,
3679                                             DMU_OBJECT_END);
3680                                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3681                                             0, aclp->z_acl_bytes);
3682                               } else {
3683                                         dmu_tx_hold_write(tx, acl_obj, 0,
3684                                             aclp->z_acl_bytes);
3685                               }
3686                     } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3687                               dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3688                                   0, aclp->z_acl_bytes);
3689                     }
3690                     dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3691           } else {
3692                     if ((mask & AT_XVATTR) &&
3693                         XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3694                               dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3695                     else
3696                               dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3697           }
3698 
3699           if (attrzp) {
3700                     dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3701           }
3702 
3703           fuid_dirtied = zfsvfs->z_fuid_dirty;
3704           if (fuid_dirtied)
3705                     zfs_fuid_txhold(zfsvfs, tx);
3706 
3707           zfs_sa_upgrade_txholds(tx, zp);
3708 
3709           err = dmu_tx_assign(tx, TXG_WAIT);
3710           if (err)
3711                     goto out;
3712 
3713           count = 0;
3714           /*
3715            * Set each attribute requested.
3716            * We group settings according to the locks they need to acquire.
3717            *
3718            * Note: you cannot set ctime directly, although it will be
3719            * updated as a side-effect of calling this function.
3720            */
3721 
3722           if (mask & (AT_UID|AT_GID|AT_MODE))
3723                     mutex_enter(&zp->z_acl_lock);
3724 
3725           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3726               &zp->z_pflags, sizeof (zp->z_pflags));
3727 
3728           if (attrzp) {
3729                     if (mask & (AT_UID|AT_GID|AT_MODE))
3730                               mutex_enter(&attrzp->z_acl_lock);
3731                     SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3732                         SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3733                         sizeof (attrzp->z_pflags));
3734           }
3735 
3736           if (mask & (AT_UID|AT_GID)) {
3737 
3738                     if (mask & AT_UID) {
3739                               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3740                                   &new_uid, sizeof (new_uid));
3741                               zp->z_uid = new_uid;
3742                               if (attrzp) {
3743                                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3744                                             SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3745                                             sizeof (new_uid));
3746                                         attrzp->z_uid = new_uid;
3747                               }
3748                     }
3749 
3750                     if (mask & AT_GID) {
3751                               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3752                                   NULL, &new_gid, sizeof (new_gid));
3753                               zp->z_gid = new_gid;
3754                               if (attrzp) {
3755                                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3756                                             SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3757                                             sizeof (new_gid));
3758                                         attrzp->z_gid = new_gid;
3759                               }
3760                     }
3761                     if (!(mask & AT_MODE)) {
3762                               SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3763                                   NULL, &new_mode, sizeof (new_mode));
3764                               new_mode = zp->z_mode;
3765                     }
3766                     err = zfs_acl_chown_setattr(zp);
3767                     ASSERT(err == 0);
3768                     if (attrzp) {
3769                               err = zfs_acl_chown_setattr(attrzp);
3770                               ASSERT(err == 0);
3771                     }
3772           }
3773 
3774           if (mask & AT_MODE) {
3775                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3776                         &new_mode, sizeof (new_mode));
3777                     zp->z_mode = new_mode;
3778                     ASSERT3U((uintptr_t)aclp, !=, 0);
3779                     err = zfs_aclset_common(zp, aclp, cr, tx);
3780                     ASSERT0(err);
3781                     if (zp->z_acl_cached)
3782                               zfs_acl_free(zp->z_acl_cached);
3783                     zp->z_acl_cached = aclp;
3784                     aclp = NULL;
3785           }
3786 
3787 
3788           if (mask & AT_ATIME) {
3789                     ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3790                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3791                         &zp->z_atime, sizeof (zp->z_atime));
3792           }
3793 
3794           if (mask & AT_MTIME) {
3795                     ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3796                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3797                         mtime, sizeof (mtime));
3798           }
3799 
3800           /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3801           if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3802                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3803                         NULL, mtime, sizeof (mtime));
3804                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3805                         &ctime, sizeof (ctime));
3806                     zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3807                         B_TRUE);
3808           } else if (mask != 0) {
3809                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3810                         &ctime, sizeof (ctime));
3811                     zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3812                         B_TRUE);
3813                     if (attrzp) {
3814                               SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3815                                   SA_ZPL_CTIME(zfsvfs), NULL,
3816                                   &ctime, sizeof (ctime));
3817                               zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3818                                   mtime, ctime, B_TRUE);
3819                     }
3820           }
3821           /*
3822            * Do this after setting timestamps to prevent timestamp
3823            * update from toggling bit
3824            */
3825 
3826           if (xoap && (mask & AT_XVATTR)) {
3827 
3828                     if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3829                               xoap->xoa_createtime = vap->va_birthtime;
3830                     /*
3831                      * restore trimmed off masks
3832                      * so that return masks can be set for caller.
3833                      */
3834 
3835                     if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3836                               XVA_SET_REQ(xvap, XAT_APPENDONLY);
3837                     }
3838                     if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3839                               XVA_SET_REQ(xvap, XAT_NOUNLINK);
3840                     }
3841                     if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3842                               XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3843                     }
3844                     if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3845                               XVA_SET_REQ(xvap, XAT_NODUMP);
3846                     }
3847                     if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3848                               XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3849                     }
3850                     if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3851                               XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3852                     }
3853 
3854                     if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3855                               ASSERT(vp->v_type == VREG);
3856 
3857                     zfs_xvattr_set(zp, xvap, tx);
3858           }
3859 
3860           if (fuid_dirtied)
3861                     zfs_fuid_sync(zfsvfs, tx);
3862 
3863           if (mask != 0)
3864                     zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3865 
3866           if (mask & (AT_UID|AT_GID|AT_MODE))
3867                     mutex_exit(&zp->z_acl_lock);
3868 
3869           if (attrzp) {
3870                     if (mask & (AT_UID|AT_GID|AT_MODE))
3871                               mutex_exit(&attrzp->z_acl_lock);
3872           }
3873 out:
3874           if (err == 0 && attrzp) {
3875                     err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3876                         xattr_count, tx);
3877                     ASSERT(err2 == 0);
3878           }
3879 
3880           if (attrzp)
3881                     vput(ZTOV(attrzp));
3882 
3883           if (aclp)
3884                     zfs_acl_free(aclp);
3885 
3886           if (fuidp) {
3887                     zfs_fuid_info_free(fuidp);
3888                     fuidp = NULL;
3889           }
3890 
3891           if (err) {
3892                     dmu_tx_abort(tx);
3893           } else {
3894                     err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3895                     dmu_tx_commit(tx);
3896           }
3897 
3898 out2:
3899           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3900                     zil_commit(zilog, 0);
3901 
3902           ZFS_EXIT(zfsvfs);
3903           return (err);
3904 }
3905 
3906 /*
3907  * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3908  * fail to acquire any lock in the path we will drop all held locks,
3909  * acquire the new lock in a blocking fashion, and then release it and
3910  * restart the rename.  This acquire/release step ensures that we do not
3911  * spin on a lock waiting for release.  On error release all vnode locks
3912  * and decrement references the way tmpfs_rename() would do.
3913  */
3914 static int
zfs_rename_relock(struct vnode * sdvp,struct vnode ** svpp,struct vnode * tdvp,struct vnode ** tvpp,const struct componentname * scnp,const struct componentname * tcnp)3915 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3916     struct vnode *tdvp, struct vnode **tvpp,
3917     const struct componentname *scnp, const struct componentname *tcnp)
3918 {
3919           zfsvfs_t  *zfsvfs;
3920           struct vnode        *nvp, *svp, *tvp;
3921           znode_t             *sdzp, *tdzp, *szp, *tzp;
3922 #ifdef __FreeBSD__
3923           const char          *snm = scnp->cn_nameptr;
3924           const char          *tnm = tcnp->cn_nameptr;
3925 #endif
3926 #ifdef __NetBSD__
3927           char *snm, *tnm;
3928 #endif
3929           int error;
3930 
3931 #ifdef __FreeBSD__
3932           VOP_UNLOCK(tdvp, 0);
3933           if (*tvpp != NULL && *tvpp != tdvp)
3934                     VOP_UNLOCK(*tvpp, 0);
3935 #endif
3936 
3937 relock:
3938           error = vn_lock(sdvp, LK_EXCLUSIVE);
3939           if (error)
3940                     goto out;
3941           sdzp = VTOZ(sdvp);
3942 
3943 #ifdef __NetBSD__
3944           if (tdvp == sdvp) {
3945           } else {
3946 #endif
3947           error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3948           if (error != 0) {
3949                     VOP_UNLOCK(sdvp, 0);
3950                     if (error != EBUSY)
3951                               goto out;
3952                     error = vn_lock(tdvp, LK_EXCLUSIVE);
3953                     if (error)
3954                               goto out;
3955                     VOP_UNLOCK(tdvp, 0);
3956                     goto relock;
3957           }
3958 #ifdef __NetBSD__
3959           } /* end if (tdvp == sdvp) */
3960 #endif
3961 
3962           tdzp = VTOZ(tdvp);
3963 
3964           /*
3965            * Before using sdzp and tdzp we must ensure that they are live.
3966            * As a porting legacy from illumos we have two things to worry
3967            * about.  One is typical for FreeBSD and it is that the vnode is
3968            * not reclaimed (doomed).  The other is that the znode is live.
3969            * The current code can invalidate the znode without acquiring the
3970            * corresponding vnode lock if the object represented by the znode
3971            * and vnode is no longer valid after a rollback or receive operation.
3972            * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3973            * that protects the znodes from the invalidation.
3974            */
3975           zfsvfs = sdzp->z_zfsvfs;
3976           ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3977           ZFS_ENTER(zfsvfs);
3978 
3979           /*
3980            * We can not use ZFS_VERIFY_ZP() here because it could directly return
3981            * bypassing the cleanup code in the case of an error.
3982            */
3983           if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3984                     ZFS_EXIT(zfsvfs);
3985                     VOP_UNLOCK(sdvp, 0);
3986 #ifdef __NetBSD__
3987                     if (tdvp != sdvp)
3988 #endif
3989                     VOP_UNLOCK(tdvp, 0);
3990                     error = SET_ERROR(EIO);
3991                     goto out;
3992           }
3993 
3994           /*
3995            * Re-resolve svp to be certain it still exists and fetch the
3996            * correct vnode.
3997            */
3998 #ifdef __NetBSD__
3999           /* ZFS wants a null-terminated name. */
4000           snm = PNBUF_GET();
4001           strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
4002 #endif
4003           error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
4004 #ifdef __NetBSD__
4005           PNBUF_PUT(snm);
4006 #endif
4007           if (error != 0) {
4008                     /* Source entry invalid or not there. */
4009                     ZFS_EXIT(zfsvfs);
4010                     VOP_UNLOCK(sdvp, 0);
4011 #ifdef __NetBSD__
4012                     if (tdvp != sdvp)
4013 #endif
4014                     VOP_UNLOCK(tdvp, 0);
4015                     if ((scnp->cn_flags & ISDOTDOT) != 0 ||
4016                         (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
4017                               error = SET_ERROR(EINVAL);
4018                     goto out;
4019           }
4020           svp = ZTOV(szp);
4021 
4022           /*
4023            * Re-resolve tvp, if it disappeared we just carry on.
4024            */
4025 #ifdef __NetBSD__
4026           /* ZFS wants a null-terminated name. */
4027           tnm = PNBUF_GET();
4028           strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
4029 #endif
4030           error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
4031 #ifdef __NetBSD__
4032           PNBUF_PUT(tnm);
4033 #endif
4034           if (error != 0) {
4035                     ZFS_EXIT(zfsvfs);
4036                     VOP_UNLOCK(sdvp, 0);
4037 #ifdef __NetBSD__
4038                     if (tdvp != sdvp)
4039 #endif
4040                     VOP_UNLOCK(tdvp, 0);
4041                     vrele(svp);
4042                     if ((tcnp->cn_flags & ISDOTDOT) != 0)
4043                               error = SET_ERROR(EINVAL);
4044                     goto out;
4045           }
4046           if (tzp != NULL)
4047                     tvp = ZTOV(tzp);
4048           else
4049                     tvp = NULL;
4050 
4051           /*
4052            * At present the vnode locks must be acquired before z_teardown_lock,
4053            * although it would be more logical to use the opposite order.
4054            */
4055           ZFS_EXIT(zfsvfs);
4056 
4057           /*
4058            * Now try acquire locks on svp and tvp.
4059            */
4060           nvp = svp;
4061           error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4062           if (error != 0) {
4063                     VOP_UNLOCK(sdvp, 0);
4064 #ifdef __NetBSD__
4065                     if (tdvp != sdvp)
4066 #endif
4067                     VOP_UNLOCK(tdvp, 0);
4068                     if (tvp != NULL)
4069                               vrele(tvp);
4070                     if (error != EBUSY) {
4071                               vrele(nvp);
4072                               goto out;
4073                     }
4074                     error = vn_lock(nvp, LK_EXCLUSIVE);
4075                     if (error != 0) {
4076                               vrele(nvp);
4077                               goto out;
4078                     }
4079                     VOP_UNLOCK(nvp, 0);
4080                     /*
4081                      * Concurrent rename race.
4082                      * XXX ?
4083                      */
4084                     if (nvp == tdvp) {
4085                               vrele(nvp);
4086                               error = SET_ERROR(EINVAL);
4087                               goto out;
4088                     }
4089 #ifdef __NetBSD__
4090                     if (*svpp != NULL)
4091 #endif
4092                     vrele(*svpp);
4093                     *svpp = nvp;
4094                     goto relock;
4095           }
4096 #ifdef __NetBSD__
4097           if (*svpp != NULL)
4098 #endif
4099           vrele(*svpp);
4100           *svpp = nvp;
4101 
4102           if (*tvpp != NULL)
4103                     vrele(*tvpp);
4104           *tvpp = NULL;
4105           if (tvp != NULL) {
4106                     nvp = tvp;
4107 
4108 #ifdef __NetBSD__
4109                     if (tvp == svp || tvp == sdvp) {
4110                     } else {
4111 #endif
4112                     error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4113                     if (error != 0) {
4114                               VOP_UNLOCK(sdvp, 0);
4115 #ifdef __NetBSD__
4116                               if (tdvp != sdvp)
4117 #endif
4118                               VOP_UNLOCK(tdvp, 0);
4119 #ifdef __NetBSD__
4120                               if (*svpp != tdvp)
4121 #endif
4122                               VOP_UNLOCK(*svpp, 0);
4123                               if (error != EBUSY) {
4124                                         vrele(nvp);
4125                                         goto out;
4126                               }
4127                               error = vn_lock(nvp, LK_EXCLUSIVE);
4128                               if (error != 0) {
4129                                         vrele(nvp);
4130                                         goto out;
4131                               }
4132                               vput(nvp);
4133                               goto relock;
4134                     }
4135 #ifdef __NetBSD__
4136                     } /* end if (tvp == svp || tvp == sdvp) */
4137 #endif
4138 
4139                     *tvpp = nvp;
4140           }
4141 
4142           KASSERT(VOP_ISLOCKED(sdvp) == LK_EXCLUSIVE);
4143           KASSERT(VOP_ISLOCKED(*svpp) == LK_EXCLUSIVE);
4144           KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4145           KASSERT(*tvpp == NULL || VOP_ISLOCKED(*tvpp) == LK_EXCLUSIVE);
4146 
4147           return (0);
4148 
4149 out:
4150           return (error);
4151 }
4152 
4153 /*
4154  * Note that we must use VRELE_ASYNC in this function as it walks
4155  * up the directory tree and vrele may need to acquire an exclusive
4156  * lock if a last reference to a vnode is dropped.
4157  */
4158 static int
zfs_rename_check(znode_t * szp,znode_t * sdzp,znode_t * tdzp)4159 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
4160 {
4161           zfsvfs_t  *zfsvfs;
4162           znode_t             *zp, *zp1;
4163           uint64_t  parent;
4164           int                 error;
4165 
4166           zfsvfs = tdzp->z_zfsvfs;
4167           if (tdzp == szp)
4168                     return (SET_ERROR(EINVAL));
4169           if (tdzp == sdzp)
4170                     return (0);
4171           if (tdzp->z_id == zfsvfs->z_root)
4172                     return (0);
4173           zp = tdzp;
4174           for (;;) {
4175                     ASSERT(!zp->z_unlinked);
4176                     if ((error = sa_lookup(zp->z_sa_hdl,
4177                         SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
4178                               break;
4179 
4180                     if (parent == szp->z_id) {
4181                               error = SET_ERROR(EINVAL);
4182                               break;
4183                     }
4184                     if (parent == zfsvfs->z_root)
4185                               break;
4186                     if (parent == sdzp->z_id)
4187                               break;
4188 
4189                     error = zfs_zget(zfsvfs, parent, &zp1);
4190                     if (error != 0)
4191                               break;
4192 
4193                     if (zp != tdzp)
4194                               VN_RELE_ASYNC(ZTOV(zp),
4195                                   dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4196                     zp = zp1;
4197           }
4198 
4199           if (error == ENOTDIR)
4200                     panic("checkpath: .. not a directory\n");
4201           if (zp != tdzp)
4202                     VN_RELE_ASYNC(ZTOV(zp),
4203                         dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4204           return (error);
4205 }
4206 
4207 /*
4208  * Move an entry from the provided source directory to the target
4209  * directory.  Change the entry name as indicated.
4210  *
4211  *        IN:       sdvp      - Source directory containing the "old entry".
4212  *                  snm       - Old entry name.
4213  *                  tdvp      - Target directory to contain the "new entry".
4214  *                  tnm       - New entry name.
4215  *                  cr        - credentials of caller.
4216  *                  ct        - caller context
4217  *                  flags     - case flags
4218  *
4219  *        RETURN:   0 on success, error code on failure.
4220  *
4221  * Timestamps:
4222  *        sdvp,tdvp - ctime|mtime updated
4223  */
4224 /*ARGSUSED*/
4225 static int
zfs_rename(vnode_t * sdvp,vnode_t ** svpp,struct componentname * scnp,vnode_t * tdvp,vnode_t ** tvpp,struct componentname * tcnp,cred_t * cr)4226 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
4227     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
4228     cred_t *cr)
4229 {
4230           zfsvfs_t  *zfsvfs;
4231           znode_t             *sdzp, *tdzp, *szp, *tzp;
4232           zilog_t             *zilog = NULL;
4233           dmu_tx_t  *tx;
4234 #ifdef __FreeBSD__
4235           char                *snm = __UNCONST(scnp->cn_nameptr);
4236           char                *tnm = __UNCONST(tcnp->cn_nameptr);
4237 #endif
4238 #ifdef __NetBSD__
4239           char *snm, *tnm;
4240 #endif
4241           int                 error = 0;
4242 
4243           /* Reject renames across filesystems. */
4244           if (((*svpp) != NULL && (*svpp)->v_mount != tdvp->v_mount) ||
4245               ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
4246                     error = SET_ERROR(EXDEV);
4247                     goto out;
4248           }
4249 
4250           if (zfsctl_is_node(tdvp)) {
4251                     error = SET_ERROR(EXDEV);
4252                     goto out;
4253           }
4254 
4255           /*
4256            * Lock all four vnodes to ensure safety and semantics of renaming.
4257            */
4258           error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
4259           if (error != 0) {
4260                     /* no vnodes are locked in the case of error here */
4261                     return (error);
4262           }
4263 
4264           tdzp = VTOZ(tdvp);
4265           sdzp = VTOZ(sdvp);
4266           zfsvfs = tdzp->z_zfsvfs;
4267           zilog = zfsvfs->z_log;
4268 #ifdef __NetBSD__
4269           /* ZFS wants a null-terminated name. */
4270           snm = PNBUF_GET();
4271           strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
4272           tnm = PNBUF_GET();
4273           strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
4274 #endif
4275 
4276           /*
4277            * After we re-enter ZFS_ENTER() we will have to revalidate all
4278            * znodes involved.
4279            */
4280           ZFS_ENTER(zfsvfs);
4281 
4282           if (zfsvfs->z_utf8 && u8_validate(tnm,
4283               strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4284                     error = SET_ERROR(EILSEQ);
4285                     goto unlockout;
4286           }
4287 
4288 #ifndef __NetBSD__
4289           /* If source and target are the same file, there is nothing to do. */
4290           if ((*svpp) == (*tvpp)) {
4291                     error = 0;
4292                     goto unlockout;
4293           }
4294 #endif
4295 
4296           if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
4297               ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
4298               (*tvpp)->v_mountedhere != NULL)) {
4299                     error = SET_ERROR(EXDEV);
4300                     goto unlockout;
4301           }
4302 
4303           /*
4304            * We can not use ZFS_VERIFY_ZP() here because it could directly return
4305            * bypassing the cleanup code in the case of an error.
4306            */
4307           if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
4308                     error = SET_ERROR(EIO);
4309                     goto unlockout;
4310           }
4311 
4312           szp = VTOZ(*svpp);
4313           tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
4314           if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
4315                     error = SET_ERROR(EIO);
4316                     goto unlockout;
4317           }
4318 
4319           /*
4320            * This is to prevent the creation of links into attribute space
4321            * by renaming a linked file into/outof an attribute directory.
4322            * See the comment in zfs_link() for why this is considered bad.
4323            */
4324           if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
4325                     error = SET_ERROR(EINVAL);
4326                     goto unlockout;
4327           }
4328 
4329           /*
4330            * Must have write access at the source to remove the old entry
4331            * and write access at the target to create the new entry.
4332            * Note that if target and source are the same, this can be
4333            * done in a single check.
4334            */
4335           if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
4336                     goto unlockout;
4337 
4338           if ((*svpp)->v_type == VDIR) {
4339                     /*
4340                      * Avoid ".", "..", and aliases of "." for obvious reasons.
4341                      */
4342                     if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
4343                         sdzp == szp ||
4344                         (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
4345                               error = SET_ERROR(EINVAL);
4346                               goto unlockout;
4347                     }
4348 
4349                     /*
4350                      * Check to make sure rename is valid.
4351                      * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
4352                      */
4353                     if (error = zfs_rename_check(szp, sdzp, tdzp))
4354                               goto unlockout;
4355           }
4356 
4357           /*
4358            * Does target exist?
4359            */
4360           if (tzp) {
4361                     /*
4362                      * Source and target must be the same type.
4363                      */
4364                     if ((*svpp)->v_type == VDIR) {
4365                               if ((*tvpp)->v_type != VDIR) {
4366                                         error = SET_ERROR(ENOTDIR);
4367                                         goto unlockout;
4368                               } else {
4369                                         cache_purge(tdvp);
4370                                         if (sdvp != tdvp)
4371                                                   cache_purge(sdvp);
4372                               }
4373                     } else {
4374                               if ((*tvpp)->v_type == VDIR) {
4375                                         error = SET_ERROR(EISDIR);
4376                                         goto unlockout;
4377                               }
4378                     }
4379 
4380                     /*
4381                      * POSIX dictates that when the source and target
4382                      * entries refer to the same file object, rename
4383                      * must do nothing and exit without error.
4384                      */
4385 #ifndef __NetBSD__
4386                     /*
4387                      * But on NetBSD we have a different system call to do
4388                      * this, posix_rename, which sorta kinda handles this
4389                      * case (modulo races), and our tests expect BSD
4390                      * semantics for rename, so we'll do that until we can
4391                      * push the choice between BSD and POSIX semantics into
4392                      * the VOP_RENAME protocol as a flag.
4393                      */
4394                     if (szp->z_id == tzp->z_id) {
4395                               error = 0;
4396                               goto unlockout;
4397                     }
4398 #endif
4399           }
4400 
4401           vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
4402           if (tzp)
4403                     vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
4404 
4405           /*
4406            * notify the target directory if it is not the same
4407            * as source directory.
4408            */
4409           if (tdvp != sdvp) {
4410                     vnevent_rename_dest_dir(tdvp, ct);
4411           }
4412 
4413           tx = dmu_tx_create(zfsvfs->z_os);
4414           dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4415           dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
4416           dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
4417           dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
4418           if (sdzp != tdzp) {
4419                     dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
4420                     zfs_sa_upgrade_txholds(tx, tdzp);
4421           }
4422           if (tzp) {
4423                     dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
4424                     zfs_sa_upgrade_txholds(tx, tzp);
4425           }
4426 
4427           zfs_sa_upgrade_txholds(tx, szp);
4428           dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4429           error = dmu_tx_assign(tx, TXG_WAIT);
4430           if (error) {
4431                     dmu_tx_abort(tx);
4432                     goto unlockout;
4433           }
4434 
4435 
4436           if (tzp && (tzp->z_id != szp->z_id))
4437                     /* Attempt to remove the existing target */
4438                     error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
4439 
4440           if (error == 0) {
4441                     if (!tzp || (tzp->z_id != szp->z_id))
4442                               error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
4443                     if (error == 0) {
4444                               szp->z_pflags |= ZFS_AV_MODIFIED;
4445 
4446                               error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4447                                   (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4448                               ASSERT0(error);
4449 
4450                               error = zfs_link_destroy(sdzp, snm, szp, tx,
4451                                   /* Kludge for BSD rename semantics.  */
4452                                   tzp && tzp->z_id == szp->z_id ? 0: ZRENAMING, NULL);
4453                               if (error == 0) {
4454                                         zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
4455                                             snm, tdzp, tnm, szp);
4456 
4457                                         /*
4458                                          * Update path information for the target vnode
4459                                          */
4460                                         vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
4461                               } else {
4462                                         /*
4463                                          * At this point, we have successfully created
4464                                          * the target name, but have failed to remove
4465                                          * the source name.  Since the create was done
4466                                          * with the ZRENAMING flag, there are
4467                                          * complications; for one, the link count is
4468                                          * wrong.  The easiest way to deal with this
4469                                          * is to remove the newly created target, and
4470                                          * return the original error.  This must
4471                                          * succeed; fortunately, it is very unlikely to
4472                                          * fail, since we just created it.
4473                                          */
4474                                         VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
4475                                             ZRENAMING, NULL), ==, 0);
4476                               }
4477                     }
4478                     if (error == 0) {
4479                               cache_purge(*svpp);
4480                               if (*tvpp != NULL)
4481                                         cache_purge(*tvpp);
4482                               cache_purge_negative(tdvp);
4483 #ifdef __NetBSD__
4484                               if (*svpp == *tvpp) {
4485                                         VN_KNOTE(sdvp, NOTE_WRITE);
4486                                         VN_KNOTE(*svpp, (szp->z_links == 0 ?
4487                                             NOTE_DELETE : NOTE_LINK));
4488                               } else {
4489                                         genfs_rename_knote(sdvp, *svpp, tdvp, *tvpp,
4490                                             tzp != NULL ? tzp->z_links : 0);
4491                               }
4492 #endif
4493                     }
4494           }
4495 
4496           dmu_tx_commit(tx);
4497 
4498           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4499                     zil_commit(zilog, 0);
4500 
4501 unlockout:                              /* all 4 vnodes are locked, ZFS_ENTER called */
4502           ZFS_EXIT(zfsvfs);
4503 
4504           VOP_UNLOCK(*svpp, 0);
4505           VOP_UNLOCK(sdvp, 0);
4506 #ifdef __NetBSD__
4507           PNBUF_PUT(snm);
4508           PNBUF_PUT(tnm);
4509 #endif
4510 
4511           if (*tvpp != sdvp && *tvpp != *svpp)
4512           if (*tvpp != NULL)
4513                     VOP_UNLOCK(*tvpp, 0);
4514           if (tdvp != sdvp && tdvp != *svpp)
4515           if (tdvp != *tvpp)
4516                     VOP_UNLOCK(tdvp, 0);
4517 
4518 out:
4519           return (error);
4520 }
4521 
4522 /*
4523  * Insert the indicated symbolic reference entry into the directory.
4524  *
4525  *        IN:       dvp       - Directory to contain new symbolic link.
4526  *                  link      - Name for new symlink entry.
4527  *                  vap       - Attributes of new entry.
4528  *                  cr        - credentials of caller.
4529  *                  ct        - caller context
4530  *                  flags     - case flags
4531  *
4532  *        RETURN:   0 on success, error code on failure.
4533  *
4534  * Timestamps:
4535  *        dvp - ctime|mtime updated
4536  */
4537 /*ARGSUSED*/
4538 static int
zfs_symlink(vnode_t * dvp,vnode_t ** vpp,char * name,vattr_t * vap,char * link,cred_t * cr,kthread_t * td)4539 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4540     cred_t *cr, kthread_t *td)
4541 {
4542           znode_t             *zp, *dzp = VTOZ(dvp);
4543           dmu_tx_t  *tx;
4544           zfsvfs_t  *zfsvfs = dzp->z_zfsvfs;
4545           zilog_t             *zilog;
4546           uint64_t  len = strlen(link);
4547           int                 error;
4548           zfs_acl_ids_t       acl_ids;
4549           boolean_t fuid_dirtied;
4550           uint64_t  txtype = TX_SYMLINK;
4551           int                 flags = 0;
4552 
4553           ASSERT(vap->va_type == VLNK);
4554 
4555           ZFS_ENTER(zfsvfs);
4556           ZFS_VERIFY_ZP(dzp);
4557           zilog = zfsvfs->z_log;
4558 
4559           if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4560               NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4561                     ZFS_EXIT(zfsvfs);
4562                     return (SET_ERROR(EILSEQ));
4563           }
4564 
4565           if (len > MAXPATHLEN) {
4566                     ZFS_EXIT(zfsvfs);
4567                     return (SET_ERROR(ENAMETOOLONG));
4568           }
4569 
4570           if ((error = zfs_acl_ids_create(dzp, 0,
4571               vap, cr, NULL, &acl_ids)) != 0) {
4572                     ZFS_EXIT(zfsvfs);
4573                     return (error);
4574           }
4575 
4576           /*
4577            * Attempt to lock directory; fail if entry already exists.
4578            */
4579           error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4580           if (error) {
4581                     zfs_acl_ids_free(&acl_ids);
4582                     ZFS_EXIT(zfsvfs);
4583                     return (error);
4584           }
4585 
4586           if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4587                     zfs_acl_ids_free(&acl_ids);
4588                     ZFS_EXIT(zfsvfs);
4589                     return (error);
4590           }
4591 
4592           if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4593                     zfs_acl_ids_free(&acl_ids);
4594                     ZFS_EXIT(zfsvfs);
4595                     return (SET_ERROR(EDQUOT));
4596           }
4597 
4598           getnewvnode_reserve(1);
4599           tx = dmu_tx_create(zfsvfs->z_os);
4600           fuid_dirtied = zfsvfs->z_fuid_dirty;
4601           dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4602           dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4603           dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4604               ZFS_SA_BASE_ATTR_SIZE + len);
4605           dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4606           if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4607                     dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4608                         acl_ids.z_aclp->z_acl_bytes);
4609           }
4610           if (fuid_dirtied)
4611                     zfs_fuid_txhold(zfsvfs, tx);
4612           error = dmu_tx_assign(tx, TXG_WAIT);
4613           if (error) {
4614                     zfs_acl_ids_free(&acl_ids);
4615                     dmu_tx_abort(tx);
4616                     getnewvnode_drop_reserve();
4617                     ZFS_EXIT(zfsvfs);
4618                     return (error);
4619           }
4620 
4621           /*
4622            * Create a new object for the symlink.
4623            * for version 4 ZPL datsets the symlink will be an SA attribute
4624            */
4625           zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4626 
4627           if (fuid_dirtied)
4628                     zfs_fuid_sync(zfsvfs, tx);
4629 
4630           if (zp->z_is_sa)
4631                     error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4632                         link, len, tx);
4633           else
4634                     zfs_sa_symlink(zp, link, len, tx);
4635 
4636           zp->z_size = len;
4637           (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4638               &zp->z_size, sizeof (zp->z_size), tx);
4639           /*
4640            * Insert the new object into the directory.
4641            */
4642           (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4643 
4644           zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4645           *vpp = ZTOV(zp);
4646 
4647           zfs_acl_ids_free(&acl_ids);
4648 
4649           dmu_tx_commit(tx);
4650 
4651           getnewvnode_drop_reserve();
4652 
4653           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4654                     zil_commit(zilog, 0);
4655 
4656           ZFS_EXIT(zfsvfs);
4657           return (error);
4658 }
4659 
4660 /*
4661  * Return, in the buffer contained in the provided uio structure,
4662  * the symbolic path referred to by vp.
4663  *
4664  *        IN:       vp        - vnode of symbolic link.
4665  *                  uio       - structure to contain the link path.
4666  *                  cr        - credentials of caller.
4667  *                  ct        - caller context
4668  *
4669  *        OUT:      uio       - structure containing the link path.
4670  *
4671  *        RETURN:   0 on success, error code on failure.
4672  *
4673  * Timestamps:
4674  *        vp - atime updated
4675  */
4676 /* ARGSUSED */
4677 static int
zfs_readlink(vnode_t * vp,uio_t * uio,cred_t * cr,caller_context_t * ct)4678 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4679 {
4680           znode_t             *zp = VTOZ(vp);
4681           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
4682           int                 error;
4683 
4684           ZFS_ENTER(zfsvfs);
4685           ZFS_VERIFY_ZP(zp);
4686 
4687           if (zp->z_is_sa)
4688                     error = sa_lookup_uio(zp->z_sa_hdl,
4689                         SA_ZPL_SYMLINK(zfsvfs), uio);
4690           else
4691                     error = zfs_sa_readlink(zp, uio);
4692 
4693           ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4694 
4695           ZFS_EXIT(zfsvfs);
4696           return (error);
4697 }
4698 
4699 /*
4700  * Insert a new entry into directory tdvp referencing svp.
4701  *
4702  *        IN:       tdvp      - Directory to contain new entry.
4703  *                  svp       - vnode of new entry.
4704  *                  name      - name of new entry.
4705  *                  cr        - credentials of caller.
4706  *                  ct        - caller context
4707  *
4708  *        RETURN:   0 on success, error code on failure.
4709  *
4710  * Timestamps:
4711  *        tdvp - ctime|mtime updated
4712  *         svp - ctime updated
4713  */
4714 /* ARGSUSED */
4715 static int
zfs_link(vnode_t * tdvp,vnode_t * svp,char * name,cred_t * cr,caller_context_t * ct,int flags)4716 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4717     caller_context_t *ct, int flags)
4718 {
4719           znode_t             *dzp = VTOZ(tdvp);
4720           znode_t             *tzp, *szp;
4721           zfsvfs_t  *zfsvfs = dzp->z_zfsvfs;
4722           zilog_t             *zilog;
4723           dmu_tx_t  *tx;
4724           int                 error;
4725           uint64_t  parent;
4726           uid_t               owner;
4727 
4728           ASSERT(tdvp->v_type == VDIR);
4729 
4730           ZFS_ENTER(zfsvfs);
4731           ZFS_VERIFY_ZP(dzp);
4732           zilog = zfsvfs->z_log;
4733 
4734           /*
4735            * POSIX dictates that we return EPERM here.
4736            * Better choices include ENOTSUP or EISDIR.
4737            */
4738           if (svp->v_type == VDIR) {
4739                     ZFS_EXIT(zfsvfs);
4740                     return (SET_ERROR(EPERM));
4741           }
4742 
4743           szp = VTOZ(svp);
4744           ZFS_VERIFY_ZP(szp);
4745 
4746           if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4747                     ZFS_EXIT(zfsvfs);
4748                     return (SET_ERROR(EPERM));
4749           }
4750 
4751           /* Prevent links to .zfs/shares files */
4752 
4753           if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4754               &parent, sizeof (uint64_t))) != 0) {
4755                     ZFS_EXIT(zfsvfs);
4756                     return (error);
4757           }
4758           if (parent == zfsvfs->z_shares_dir) {
4759                     ZFS_EXIT(zfsvfs);
4760                     return (SET_ERROR(EPERM));
4761           }
4762 
4763           if (zfsvfs->z_utf8 && u8_validate(name,
4764               strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4765                     ZFS_EXIT(zfsvfs);
4766                     return (SET_ERROR(EILSEQ));
4767           }
4768 
4769           /*
4770            * We do not support links between attributes and non-attributes
4771            * because of the potential security risk of creating links
4772            * into "normal" file space in order to circumvent restrictions
4773            * imposed in attribute space.
4774            */
4775           if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4776                     ZFS_EXIT(zfsvfs);
4777                     return (SET_ERROR(EINVAL));
4778           }
4779 
4780 
4781           owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4782           if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4783                     ZFS_EXIT(zfsvfs);
4784                     return (SET_ERROR(EPERM));
4785           }
4786 
4787           if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4788                     ZFS_EXIT(zfsvfs);
4789                     return (error);
4790           }
4791 
4792           /*
4793            * Attempt to lock directory; fail if entry already exists.
4794            */
4795           error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4796           if (error) {
4797                     ZFS_EXIT(zfsvfs);
4798                     return (error);
4799           }
4800 
4801           tx = dmu_tx_create(zfsvfs->z_os);
4802           dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4803           dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4804           zfs_sa_upgrade_txholds(tx, szp);
4805           zfs_sa_upgrade_txholds(tx, dzp);
4806           error = dmu_tx_assign(tx, TXG_WAIT);
4807           if (error) {
4808                     dmu_tx_abort(tx);
4809                     ZFS_EXIT(zfsvfs);
4810                     return (error);
4811           }
4812 
4813           error = zfs_link_create(dzp, name, szp, tx, 0);
4814 
4815           if (error == 0) {
4816                     uint64_t txtype = TX_LINK;
4817                     zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4818           }
4819 
4820           dmu_tx_commit(tx);
4821 
4822           if (error == 0) {
4823                     vnevent_link(svp, ct);
4824           }
4825 
4826           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4827                     zil_commit(zilog, 0);
4828 
4829           ZFS_EXIT(zfsvfs);
4830           return (error);
4831 }
4832 
4833 
4834 /*ARGSUSED*/
4835 void
zfs_inactive(vnode_t * vp,cred_t * cr,caller_context_t * ct)4836 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4837 {
4838           znode_t   *zp = VTOZ(vp);
4839           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4840           int error;
4841 
4842           rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4843           if (zp->z_sa_hdl == NULL) {
4844                     /*
4845                      * The fs has been unmounted, or we did a
4846                      * suspend/resume and this file no longer exists.
4847                      */
4848                     rw_exit(&zfsvfs->z_teardown_inactive_lock);
4849                     vrecycle(vp);
4850                     return;
4851           }
4852 
4853           if (zp->z_unlinked) {
4854                     /*
4855                      * Fast path to recycle a vnode of a removed file.
4856                      */
4857                     rw_exit(&zfsvfs->z_teardown_inactive_lock);
4858                     vrecycle(vp);
4859                     return;
4860           }
4861 
4862           if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4863                     dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4864 
4865                     dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4866                     zfs_sa_upgrade_txholds(tx, zp);
4867                     error = dmu_tx_assign(tx, TXG_WAIT);
4868                     if (error) {
4869                               dmu_tx_abort(tx);
4870                     } else {
4871                               (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4872                                   (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4873                               zp->z_atime_dirty = 0;
4874                               dmu_tx_commit(tx);
4875                     }
4876           }
4877           rw_exit(&zfsvfs->z_teardown_inactive_lock);
4878 }
4879 
4880 
4881 #ifdef __FreeBSD__
4882 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4883 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4884 #endif
4885 
4886 /*ARGSUSED*/
4887 static int
zfs_fid(vnode_t * vp,fid_t * fidp,caller_context_t * ct)4888 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4889 {
4890           znode_t             *zp = VTOZ(vp);
4891           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
4892           uint32_t  gen;
4893           uint64_t  gen64;
4894           uint64_t  object = zp->z_id;
4895           zfid_short_t        *zfid;
4896           int                 size, i, error;
4897 
4898           ZFS_ENTER(zfsvfs);
4899           ZFS_VERIFY_ZP(zp);
4900 
4901           if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4902               &gen64, sizeof (uint64_t))) != 0) {
4903                     ZFS_EXIT(zfsvfs);
4904                     return (error);
4905           }
4906 
4907           gen = (uint32_t)gen64;
4908 
4909           size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4910 
4911 #ifdef illumos
4912           if (fidp->fid_len < size) {
4913                     fidp->fid_len = size;
4914                     ZFS_EXIT(zfsvfs);
4915                     return (SET_ERROR(ENOSPC));
4916           }
4917 #else
4918           fidp->fid_len = size;
4919 #endif
4920 
4921           zfid = (zfid_short_t *)fidp;
4922 
4923           zfid->zf_len = size;
4924 
4925           for (i = 0; i < sizeof (zfid->zf_object); i++)
4926                     zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4927 
4928           /* Must have a non-zero generation number to distinguish from .zfs */
4929           if (gen == 0)
4930                     gen = 1;
4931           for (i = 0; i < sizeof (zfid->zf_gen); i++)
4932                     zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4933 
4934           if (size == LONG_FID_LEN) {
4935                     uint64_t  objsetid = dmu_objset_id(zfsvfs->z_os);
4936                     zfid_long_t         *zlfid;
4937 
4938                     zlfid = (zfid_long_t *)fidp;
4939 
4940                     for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4941                               zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4942 
4943                     /* XXX - this should be the generation number for the objset */
4944                     for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4945                               zlfid->zf_setgen[i] = 0;
4946           }
4947 
4948           ZFS_EXIT(zfsvfs);
4949           return (0);
4950 }
4951 
4952 static int
zfs_pathconf(vnode_t * vp,int cmd,ulong_t * valp,cred_t * cr,caller_context_t * ct)4953 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4954     caller_context_t *ct)
4955 {
4956           znode_t             *zp, *xzp;
4957           zfsvfs_t  *zfsvfs;
4958           int                 error;
4959 
4960           switch (cmd) {
4961           case _PC_LINK_MAX:
4962                     *valp = INT_MAX;
4963                     return (0);
4964 
4965           case _PC_FILESIZEBITS:
4966                     *valp = 64;
4967                     return (0);
4968 #ifdef illumos
4969           case _PC_XATTR_EXISTS:
4970                     zp = VTOZ(vp);
4971                     zfsvfs = zp->z_zfsvfs;
4972                     ZFS_ENTER(zfsvfs);
4973                     ZFS_VERIFY_ZP(zp);
4974                     *valp = 0;
4975                     error = zfs_dirent_lookup(zp, "", &xzp,
4976                         ZXATTR | ZEXISTS | ZSHARED);
4977                     if (error == 0) {
4978                               if (!zfs_dirempty(xzp))
4979                                         *valp = 1;
4980                               vrele(ZTOV(xzp));
4981                     } else if (error == ENOENT) {
4982                               /*
4983                                * If there aren't extended attributes, it's the
4984                                * same as having zero of them.
4985                                */
4986                               error = 0;
4987                     }
4988                     ZFS_EXIT(zfsvfs);
4989                     return (error);
4990 
4991           case _PC_SATTR_ENABLED:
4992           case _PC_SATTR_EXISTS:
4993                     *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4994                         (vp->v_type == VREG || vp->v_type == VDIR);
4995                     return (0);
4996 
4997           case _PC_ACCESS_FILTERING:
4998                     *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4999                         vp->v_type == VDIR;
5000                     return (0);
5001 
5002           case _PC_ACL_ENABLED:
5003                     *valp = _ACL_ACE_ENABLED;
5004                     return (0);
5005 #endif    /* illumos */
5006           case _PC_MIN_HOLE_SIZE:
5007                     *valp = (int)SPA_MINBLOCKSIZE;
5008                     return (0);
5009 #ifdef illumos
5010           case _PC_TIMESTAMP_RESOLUTION:
5011                     /* nanosecond timestamp resolution */
5012                     *valp = 1L;
5013                     return (0);
5014 #endif
5015           case _PC_ACL_EXTENDED:
5016                     *valp = 0;
5017                     return (0);
5018 
5019 #ifndef __NetBSD__
5020           case _PC_ACL_NFS4:
5021                     *valp = 1;
5022                     return (0);
5023 
5024           case _PC_ACL_PATH_MAX:
5025                     *valp = ACL_MAX_ENTRIES;
5026                     return (0);
5027 #endif
5028 
5029           default:
5030                     return (EOPNOTSUPP);
5031           }
5032 }
5033 
5034 /*ARGSUSED*/
5035 static int
zfs_getsecattr(vnode_t * vp,vsecattr_t * vsecp,int flag,cred_t * cr,caller_context_t * ct)5036 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5037     caller_context_t *ct)
5038 {
5039           znode_t *zp = VTOZ(vp);
5040           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5041           int error;
5042           boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5043 
5044           ZFS_ENTER(zfsvfs);
5045           ZFS_VERIFY_ZP(zp);
5046           error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5047           ZFS_EXIT(zfsvfs);
5048 
5049           return (error);
5050 }
5051 
5052 /*ARGSUSED*/
5053 int
zfs_setsecattr(vnode_t * vp,vsecattr_t * vsecp,int flag,cred_t * cr,caller_context_t * ct)5054 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5055     caller_context_t *ct)
5056 {
5057           znode_t *zp = VTOZ(vp);
5058           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5059           int error;
5060           boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5061           zilog_t   *zilog = zfsvfs->z_log;
5062 
5063           ZFS_ENTER(zfsvfs);
5064           ZFS_VERIFY_ZP(zp);
5065 
5066           error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5067 
5068           if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5069                     zil_commit(zilog, 0);
5070 
5071           ZFS_EXIT(zfsvfs);
5072           return (error);
5073 }
5074 
5075 static int
ioflags(int ioflags)5076 ioflags(int ioflags)
5077 {
5078           int flags = 0;
5079 
5080           if (ioflags & IO_APPEND)
5081                     flags |= FAPPEND;
5082           if (ioflags & IO_NDELAY)
5083                     flags |= FNONBLOCK;
5084           if (ioflags & IO_SYNC)
5085                     flags |= (FSYNC | FDSYNC | FRSYNC);
5086 
5087           return (flags);
5088 }
5089 
5090 #ifdef __NetBSD__
5091 
5092 static int
zfs_netbsd_open(void * v)5093 zfs_netbsd_open(void *v)
5094 {
5095           struct vop_open_args *ap = v;
5096 
5097           return (zfs_open(&ap->a_vp, ap->a_mode, ap->a_cred, NULL));
5098 }
5099 
5100 static int
zfs_netbsd_close(void * v)5101 zfs_netbsd_close(void *v)
5102 {
5103           struct vop_close_args *ap = v;
5104 
5105           return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
5106 }
5107 
5108 static int
zfs_netbsd_ioctl(void * v)5109 zfs_netbsd_ioctl(void *v)
5110 {
5111           struct vop_ioctl_args *ap = v;
5112 
5113           return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5114                     ap->a_fflag, ap->a_cred, NULL, NULL));
5115 }
5116 
5117 
5118 static int
zfs_netbsd_read(void * v)5119 zfs_netbsd_read(void *v)
5120 {
5121           struct vop_read_args *ap = v;
5122           vnode_t *vp = ap->a_vp;
5123           znode_t *zp = VTOZ(vp);
5124 
5125           switch (vp->v_type) {
5126           case VBLK:
5127           case VCHR:
5128                     ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5129                     return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap));
5130           case VFIFO:
5131                     ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5132                     return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap));
5133           }
5134 
5135           return (zfs_read(vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL));
5136 }
5137 
5138 static int
zfs_netbsd_write(void * v)5139 zfs_netbsd_write(void *v)
5140 {
5141           struct vop_write_args *ap = v;
5142           vnode_t *vp = ap->a_vp;
5143           znode_t *zp = VTOZ(vp);
5144           struct uio *uio = ap->a_uio;
5145           off_t osize = zp->z_size;
5146           int error, resid;
5147 
5148           switch (vp->v_type) {
5149           case VBLK:
5150           case VCHR:
5151                     GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5152                     return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap));
5153           case VFIFO:
5154                     GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5155                     return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap));
5156           }
5157 
5158           resid = uio->uio_resid;
5159           error = zfs_write(vp, uio, ioflags(ap->a_ioflag), ap->a_cred, NULL);
5160 
5161           return error;
5162 }
5163 
5164 static int
zfs_netbsd_access(void * v)5165 zfs_netbsd_access(void *v)
5166 {
5167           struct vop_access_args /* {
5168                     struct vnode *a_vp;
5169                     accmode_t a_accmode;
5170                     kauth_cred_t a_cred;
5171           } */ *ap = v;
5172           vnode_t *vp = ap->a_vp;
5173           znode_t *zp = VTOZ(vp);
5174           accmode_t accmode;
5175           kauth_cred_t cred = ap->a_cred;
5176           int error = 0;
5177 
5178           /*
5179            * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
5180            */
5181           accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5182           if (accmode != 0)
5183                     error = zfs_access(vp, accmode, 0, cred, NULL);
5184 
5185           /*
5186            * VADMIN has to be handled by kauth_authorize_vnode().
5187            */
5188           if (error == 0) {
5189                     accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5190                     if (accmode != 0) {
5191                               error = kauth_authorize_vnode(cred,
5192                                   KAUTH_ACCESS_ACTION(accmode, vp->v_type,
5193                                   zp->z_mode & ALLPERMS), vp, NULL,
5194                                   genfs_can_access(vp, cred, zp->z_uid,
5195                                   zp->z_gid, zp->z_mode & ALLPERMS, NULL, accmode));
5196                     }
5197           }
5198 
5199           /*
5200            * For VEXEC, ensure that at least one execute bit is set for
5201            * non-directories.
5202            */
5203           if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
5204               (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
5205                     error = EACCES;
5206           }
5207 
5208           /* We expect EACCES as common error. */
5209           if (error == EPERM)
5210                     error = EACCES;
5211 
5212           return error;
5213 }
5214 
5215 static int
zfs_netbsd_lookup(void * v)5216 zfs_netbsd_lookup(void *v)
5217 {
5218           struct vop_lookup_v2_args /* {
5219                     struct vnode *a_dvp;
5220                     struct vnode **a_vpp;
5221                     struct componentname *a_cnp;
5222           } */ *ap = v;
5223           struct vnode *dvp = ap->a_dvp;
5224           struct vnode **vpp = ap->a_vpp;
5225           struct componentname *cnp = ap->a_cnp;
5226           char *nm, short_nm[31];
5227           int error;
5228           int iswhiteout;
5229 
5230           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5231 
5232           *vpp = NULL;
5233 
5234           /*
5235            * Do an access check before the cache lookup.  zfs_lookup does
5236            * an access check too, but it's too scary to contemplate
5237            * injecting our namecache stuff into zfs internals.
5238            *
5239            * XXX Is this the correct access check?
5240            */
5241           if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
5242                     goto out;
5243 
5244           /*
5245            * Check the namecache before entering zfs_lookup.
5246            * cache_lookup does the locking dance for us.
5247            */
5248           if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
5249               cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
5250                     if (iswhiteout) {
5251                               cnp->cn_flags |= ISWHITEOUT;
5252                     }
5253                     return *vpp == NULL ? ENOENT : 0;
5254           }
5255 
5256           /*
5257            * zfs_lookup wants a null-terminated component name, but namei
5258            * gives us a pointer into the full pathname.
5259            */
5260           ASSERT(cnp->cn_namelen < PATH_MAX - 1);
5261           if (cnp->cn_namelen + 1 > sizeof(short_nm))
5262                     nm = PNBUF_GET();
5263           else
5264                     nm = short_nm;
5265           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5266 
5267           error = zfs_lookup(dvp, nm, vpp, 0, cnp, cnp->cn_nameiop, cnp->cn_cred);
5268 
5269           if (nm != short_nm)
5270                     PNBUF_PUT(nm);
5271 
5272           /*
5273            * Translate errors to match our namei insanity.  Also, if the
5274            * caller wants to create an entry here, it's apparently our
5275            * responsibility as lookup to make sure that's permissible.
5276            * Go figure.
5277            */
5278           if (cnp->cn_flags & ISLASTCN) {
5279                     switch (cnp->cn_nameiop) {
5280                     case CREATE:
5281                     case RENAME:
5282                               if (error == ENOENT) {
5283                                         error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5284                                         if (error)
5285                                                   break;
5286                                         error = EJUSTRETURN;
5287                                         break;
5288                               }
5289                               break;
5290                     case DELETE:
5291                               if (error == 0) {
5292                                         error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5293                                         if (error) {
5294                                                   VN_RELE(*vpp);
5295                                                   *vpp = NULL;
5296                                         }
5297                               }
5298                               break;
5299                     }
5300           }
5301 
5302           if (error) {
5303                     KASSERT(*vpp == NULL);
5304                     goto out;
5305           }
5306           KASSERT(*vpp != NULL);
5307 
5308           if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
5309                     KASSERT(!(cnp->cn_flags & ISDOTDOT));
5310                     KASSERT(dvp == *vpp);
5311           } else if ((cnp->cn_namelen == 2) &&
5312               (cnp->cn_nameptr[0] == '.') &&
5313               (cnp->cn_nameptr[1] == '.')) {
5314                     KASSERT(cnp->cn_flags & ISDOTDOT);
5315           } else {
5316                     KASSERT(!(cnp->cn_flags & ISDOTDOT));
5317           }
5318 
5319 out:
5320           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5321 
5322           /*
5323            * Insert name into cache if appropriate.
5324            */
5325 
5326           if (error == 0 || (error == ENOENT && cnp->cn_nameiop != CREATE))
5327                     cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
5328                         cnp->cn_flags);
5329 
5330           return (error);
5331 }
5332 
5333 static int
zfs_netbsd_create(void * v)5334 zfs_netbsd_create(void *v)
5335 {
5336           struct vop_create_v3_args /* {
5337                     struct vnode *a_dvp;
5338                     struct vnode **a_vpp;
5339                     struct componentname *a_cnp;
5340                     struct vattr *a_vap;
5341           } */ *ap = v;
5342           struct vnode *dvp = ap->a_dvp;
5343           struct vnode **vpp = ap->a_vpp;
5344           struct componentname *cnp = ap->a_cnp;
5345           struct vattr *vap = ap->a_vap;
5346           char *nm;
5347           int mode;
5348           int error;
5349 
5350           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5351 
5352           vattr_init_mask(vap);
5353           mode = vap->va_mode & ALLPERMS;
5354 
5355           /* ZFS wants a null-terminated name. */
5356           nm = PNBUF_GET();
5357           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5358 
5359           /* XXX !EXCL is wrong here...  */
5360           error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5361 
5362           PNBUF_PUT(nm);
5363 
5364           KASSERT((error == 0) == (*vpp != NULL));
5365           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5366           if (*vpp != NULL)
5367                     VOP_UNLOCK(*vpp, 0);
5368 
5369           return (error);
5370 }
5371 
5372 static int
zfs_netbsd_mknod(void * v)5373 zfs_netbsd_mknod(void *v)
5374 {
5375           struct vop_mknod_v3_args /* {
5376                     struct vnode *a_dvp;
5377                     struct vnode **a_vpp;
5378                     struct componentname *a_cnp;
5379                     struct vattr *a_vap;
5380           } */ *ap = v;
5381           struct vnode *dvp = ap->a_dvp;
5382           struct vnode **vpp = ap->a_vpp;
5383           struct componentname *cnp = ap->a_cnp;
5384           struct vattr *vap = ap->a_vap;
5385           char *nm;
5386           int mode;
5387           int error;
5388 
5389           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5390 
5391           vattr_init_mask(vap);
5392           mode = vap->va_mode & ALLPERMS;
5393 
5394           /* ZFS wants a null-terminated name. */
5395           nm = PNBUF_GET();
5396           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5397 
5398           /* XXX !EXCL is wrong here...  */
5399           error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5400 
5401           PNBUF_PUT(nm);
5402 
5403           KASSERT((error == 0) == (*vpp != NULL));
5404           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5405           if (*vpp != NULL)
5406                     VOP_UNLOCK(*vpp, 0);
5407 
5408           return (error);
5409 }
5410 
5411 static int
zfs_netbsd_remove(void * v)5412 zfs_netbsd_remove(void *v)
5413 {
5414           struct vop_remove_v3_args /* {
5415                     struct vnode *a_dvp;
5416                     struct vnode *a_vp;
5417                     struct componentname *a_cnp;
5418                     nlink_t ctx_vp_new_nlink;
5419           } */ *ap = v;
5420           struct vnode *dvp = ap->a_dvp;
5421           struct vnode *vp = ap->a_vp;
5422           struct componentname *cnp = ap->a_cnp;
5423           char *nm;
5424           int error;
5425 
5426           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5427           KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5428 
5429           /* ZFS wants a null-terminated name. */
5430           nm = PNBUF_GET();
5431           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5432 
5433           error = zfs_remove(dvp, vp, nm, cnp->cn_cred);
5434 
5435           /*
5436            * XXX Should update ctx_vp_new_nlink, but for now the
5437            * XXX the kevent sent on "vp"  matches historical behavior.
5438            */
5439 
5440           PNBUF_PUT(nm);
5441           vput(vp);
5442           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5443           return (error);
5444 }
5445 
5446 static int
zfs_netbsd_mkdir(void * v)5447 zfs_netbsd_mkdir(void *v)
5448 {
5449           struct vop_mkdir_v3_args /* {
5450                     struct vnode *a_dvp;
5451                     struct vnode **a_vpp;
5452                     struct componentname *a_cnp;
5453                     struct vattr *a_vap;
5454           } */ *ap = v;
5455           struct vnode *dvp = ap->a_dvp;
5456           struct vnode **vpp = ap->a_vpp;
5457           struct componentname *cnp = ap->a_cnp;
5458           struct vattr *vap = ap->a_vap;
5459           char *nm;
5460           int error;
5461 
5462           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5463 
5464           vattr_init_mask(vap);
5465 
5466           /* ZFS wants a null-terminated name. */
5467           nm = PNBUF_GET();
5468           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5469 
5470           error = zfs_mkdir(dvp, nm, vap, vpp, cnp->cn_cred);
5471 
5472           PNBUF_PUT(nm);
5473 
5474           KASSERT((error == 0) == (*vpp != NULL));
5475           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5476           if (*vpp != NULL)
5477                     VOP_UNLOCK(*vpp, 0);
5478 
5479           return (error);
5480 }
5481 
5482 static int
zfs_netbsd_rmdir(void * v)5483 zfs_netbsd_rmdir(void *v)
5484 {
5485           struct vop_rmdir_v2_args /* {
5486                     struct vnode *a_dvp;
5487                     struct vnode *a_vp;
5488                     struct componentname *a_cnp;
5489           } */ *ap = v;
5490           struct vnode *dvp = ap->a_dvp;
5491           struct vnode *vp = ap->a_vp;
5492           struct componentname *cnp = ap->a_cnp;
5493           char *nm;
5494           int error;
5495 
5496           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5497           KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5498 
5499           /* ZFS wants a null-terminated name. */
5500           nm = PNBUF_GET();
5501           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5502 
5503           error = zfs_rmdir(dvp, vp, nm, cnp->cn_cred);
5504 
5505           PNBUF_PUT(nm);
5506           vput(vp);
5507           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5508           return error;
5509 }
5510 
5511 static int
zfs_netbsd_readdir(void * v)5512 zfs_netbsd_readdir(void *v)
5513 {
5514           struct vop_readdir_args *ap = v;
5515 
5516           return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5517                     ap->a_ncookies, ap->a_cookies));
5518 }
5519 
5520 static int
zfs_netbsd_fsync(void * v)5521 zfs_netbsd_fsync(void *v)
5522 {
5523           struct vop_fsync_args *ap = v;
5524 
5525           return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
5526 }
5527 
5528 static int
zfs_spec_fsync(void * v)5529 zfs_spec_fsync(void *v)
5530 {
5531           struct vop_fsync_args *ap = v;
5532           int error;
5533 
5534           error = spec_fsync(v);
5535           if (error)
5536                     return error;
5537 
5538           return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
5539 }
5540 
5541 static int
zfs_netbsd_getattr(void * v)5542 zfs_netbsd_getattr(void *v)
5543 {
5544           struct vop_getattr_args *ap = v;
5545           vattr_t *vap = ap->a_vap;
5546           xvattr_t xvap;
5547           u_long fflags = 0;
5548           int error;
5549 
5550           xva_init(&xvap);
5551           xvap.xva_vattr = *vap;
5552           xvap.xva_vattr.va_mask |= AT_XVATTR;
5553 
5554           /* Convert chflags into ZFS-type flags. */
5555           /* XXX: what about SF_SETTABLE?. */
5556           XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5557           XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5558           XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5559           XVA_SET_REQ(&xvap, XAT_NODUMP);
5560           error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5561           if (error != 0)
5562                     return (error);
5563 
5564           /* Convert ZFS xattr into chflags. */
5565 #define   FLAG_CHECK(fflag, xflag, xfield)        do {                          \
5566           if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)                   \
5567                     fflags |= (fflag);                                          \
5568 } while (0)
5569           FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5570               xvap.xva_xoptattrs.xoa_immutable);
5571           FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5572               xvap.xva_xoptattrs.xoa_appendonly);
5573           FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5574               xvap.xva_xoptattrs.xoa_nounlink);
5575           FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5576               xvap.xva_xoptattrs.xoa_nodump);
5577 #undef    FLAG_CHECK
5578           *vap = xvap.xva_vattr;
5579           vap->va_flags = fflags;
5580           return (0);
5581 }
5582 
5583 static int
zfs_netbsd_setattr(void * v)5584 zfs_netbsd_setattr(void *v)
5585 {
5586           struct vop_setattr_args *ap = v;
5587           vnode_t *vp = ap->a_vp;
5588           vattr_t *vap = ap->a_vap;
5589           cred_t *cred = ap->a_cred;
5590           znode_t *zp = VTOZ(vp);
5591           xvattr_t xvap;
5592           kauth_action_t action;
5593           u_long fflags, sfflags = 0;
5594           uint64_t zflags;
5595           int error, flags = 0;
5596           bool changing_sysflags;
5597 
5598           vattr_init_mask(vap);
5599           vap->va_mask &= ~AT_NOSET;
5600           if (ISSET(vap->va_vaflags, VA_UTIMES_NULL))
5601                     flags |= ATTR_UTIME;
5602 
5603           xva_init(&xvap);
5604           xvap.xva_vattr = *vap;
5605 
5606           zflags = VTOZ(vp)->z_pflags;
5607 
5608           /* Ignore size changes on device nodes. */
5609           if (vp->v_type == VBLK || vp->v_type == VCHR)
5610                     xvap.xva_vattr.va_mask &= ~AT_SIZE;
5611           if (vap->va_flags != VNOVAL) {
5612                     int error;
5613 
5614                     fflags = vap->va_flags;
5615                     if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
5616                               return (EOPNOTSUPP);
5617 
5618 #define   FLAG_CHANGE(fflag, zflag, xflag, xfield)          do {                \
5619           if (((fflags & (fflag)) && !(zflags & (zflag))) ||                    \
5620               ((zflags & (zflag)) && !(fflags & (fflag)))) {                    \
5621                     XVA_SET_REQ(&xvap, (xflag));                                \
5622                     (xfield) = ((fflags & (fflag)) != 0);                       \
5623                     if (((fflag) & SF_SETTABLE) != 0)                           \
5624                               sfflags |= (fflag);                               \
5625           }                                                                               \
5626 } while (0)
5627                     /* Convert chflags into ZFS-type flags. */
5628                     /* XXX: what about SF_SETTABLE?. */
5629                     FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5630                         xvap.xva_xoptattrs.xoa_immutable);
5631                     FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5632                         xvap.xva_xoptattrs.xoa_appendonly);
5633                     FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5634                         xvap.xva_xoptattrs.xoa_nounlink);
5635                     FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5636                         xvap.xva_xoptattrs.xoa_nodump);
5637 #undef    FLAG_CHANGE
5638 
5639                     action = KAUTH_VNODE_WRITE_FLAGS;
5640                     changing_sysflags = false;
5641 
5642                     if (zflags & (ZFS_IMMUTABLE|ZFS_APPENDONLY|ZFS_NOUNLINK)) {
5643                               action |= KAUTH_VNODE_HAS_SYSFLAGS;
5644                     }
5645                     if (sfflags != 0) {
5646                               action |= KAUTH_VNODE_WRITE_SYSFLAGS;
5647                               changing_sysflags = true;
5648                     }
5649 
5650                     error = kauth_authorize_vnode(cred, action, vp, NULL,
5651                         genfs_can_chflags(vp, cred, zp->z_uid, changing_sysflags));
5652                     if (error)
5653                               return error;
5654           }
5655 
5656           if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
5657               vap->va_birthtime.tv_sec != VNOVAL) {
5658                     error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp,
5659                          NULL, genfs_can_chtimes(vp, cred, zp->z_uid,
5660                          vap->va_vaflags));
5661                     if (error)
5662                               return error;
5663           }
5664 
5665           error = zfs_setattr(vp, (vattr_t *)&xvap, flags, cred, NULL);
5666           if (error)
5667                     return error;
5668 
5669           cache_enter_id(vp, zp->z_mode, zp->z_uid, zp->z_gid, true);
5670 
5671           return error;
5672 }
5673 
5674 static int
zfs_netbsd_rename(void * v)5675 zfs_netbsd_rename(void *v)
5676 {
5677           struct vop_rename_args /* {
5678                     struct vnode *a_fdvp;
5679                     struct vnode *a_fvp;
5680                     struct componentname *a_fcnp;
5681                     struct vnode *a_tdvp;
5682                     struct vnode *a_tvp;
5683                     struct componentname *a_tcnp;
5684           } */ *ap = v;
5685           vnode_t *fdvp = ap->a_fdvp;
5686           vnode_t *fvp = ap->a_fvp;
5687           struct componentname *fcnp = ap->a_fcnp;
5688           vnode_t *tdvp = ap->a_tdvp;
5689           vnode_t *tvp = ap->a_tvp;
5690           struct componentname *tcnp = ap->a_tcnp;
5691           kauth_cred_t cred;
5692           int error;
5693 
5694           KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
5695           KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
5696           KASSERT(fdvp->v_type == VDIR);
5697           KASSERT(tdvp->v_type == VDIR);
5698 
5699           cred = fcnp->cn_cred;
5700 
5701           /*
5702            * XXX Want a better equality test.  `tcnp->cn_cred == cred'
5703            * hoses p2k because puffs transmits the creds separately and
5704            * allocates distinct but equivalent structures for them.
5705            */
5706           KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
5707 
5708           /*
5709            * Drop the insane locks.
5710            */
5711           VOP_UNLOCK(tdvp, 0);
5712           if (tvp != NULL && tvp != tdvp)
5713                     VOP_UNLOCK(tvp, 0);
5714 
5715           /*
5716            * Release the source and target nodes; zfs_rename will look
5717            * them up again once the locking situation is sane.
5718            */
5719           VN_RELE(fvp);
5720           if (tvp != NULL)
5721                     VN_RELE(tvp);
5722           fvp = NULL;
5723           tvp = NULL;
5724 
5725           /*
5726            * Do the rename ZFSly.
5727            */
5728           error = zfs_rename(fdvp, &fvp, fcnp, tdvp, &tvp, tcnp, cred);
5729 
5730           /*
5731            * Release the directories now too, because the VOP_RENAME
5732            * protocol is insane.
5733            */
5734 
5735           VN_RELE(fdvp);
5736           VN_RELE(tdvp);
5737           if (fvp != NULL)
5738                     VN_RELE(fvp);
5739           if (tvp != NULL)
5740                     VN_RELE(tvp);
5741 
5742           return (error);
5743 }
5744 
5745 static int
zfs_netbsd_symlink(void * v)5746 zfs_netbsd_symlink(void *v)
5747 {
5748           struct vop_symlink_v3_args /* {
5749                     struct vnode *a_dvp;
5750                     struct vnode **a_vpp;
5751                     struct componentname *a_cnp;
5752                     struct vattr *a_vap;
5753                     char *a_target;
5754           } */ *ap = v;
5755           struct vnode *dvp = ap->a_dvp;
5756           struct vnode **vpp = ap->a_vpp;
5757           struct componentname *cnp = ap->a_cnp;
5758           struct vattr *vap = ap->a_vap;
5759           char *target = ap->a_target;
5760           char *nm;
5761           int error;
5762 
5763           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5764 
5765           vap->va_type = VLNK;          /* Netbsd: Syscall only sets va_mode. */
5766           vattr_init_mask(vap);
5767 
5768           /* ZFS wants a null-terminated name. */
5769           nm = PNBUF_GET();
5770           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5771 
5772           error = zfs_symlink(dvp, vpp, nm, vap, target, cnp->cn_cred, 0);
5773 
5774           PNBUF_PUT(nm);
5775           KASSERT((error == 0) == (*vpp != NULL));
5776           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5777           if (*vpp != NULL)
5778                     VOP_UNLOCK(*vpp, 0);
5779 
5780           return (error);
5781 }
5782 
5783 static int
zfs_netbsd_readlink(void * v)5784 zfs_netbsd_readlink(void *v)
5785 {
5786           struct vop_readlink_args *ap = v;
5787 
5788           return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5789 }
5790 
5791 static int
zfs_netbsd_link(void * v)5792 zfs_netbsd_link(void *v)
5793 {
5794           struct vop_link_v2_args /* {
5795                     struct vnode *a_dvp;
5796                     struct vnode *a_vp;
5797                     struct componentname *a_cnp;
5798           } */ *ap = v;
5799           struct vnode *dvp = ap->a_dvp;
5800           struct vnode *vp = ap->a_vp;
5801           struct componentname *cnp = ap->a_cnp;
5802           char *nm;
5803           int error;
5804 
5805           KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5806 
5807           /* ZFS wants a null-terminated name. */
5808           nm = PNBUF_GET();
5809           (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5810 
5811           if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) {
5812                     /* XXX: No ABORTOP? */
5813                     PNBUF_PUT(nm);
5814                     return error;
5815           }
5816           error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp,
5817               dvp, 0);
5818           if (error)
5819                     goto out;
5820           error = zfs_link(dvp, vp, nm, cnp->cn_cred,
5821               NULL, 0);
5822 
5823 out:
5824           PNBUF_PUT(nm);
5825           VOP_UNLOCK(vp, 0);
5826           return error;
5827 }
5828 
5829 static int
zfs_netbsd_inactive(void * v)5830 zfs_netbsd_inactive(void *v)
5831 {
5832           struct vop_inactive_v2_args *ap = v;
5833           vnode_t *vp = ap->a_vp;
5834           znode_t   *zp = VTOZ(vp);
5835 
5836           /*
5837            * NetBSD: nothing to do here, other than indicate if the
5838            * vnode should be reclaimed.  No need to lock, if we race
5839            * vrele() will call us again.
5840            */
5841           *ap->a_recycle = (zp->z_unlinked != 0);
5842 
5843           return (0);
5844 }
5845 
5846 static int
zfs_netbsd_reclaim(void * v)5847 zfs_netbsd_reclaim(void *v)
5848 {
5849           struct vop_reclaim_v2_args /* {
5850                     struct vnode *a_vp;
5851           } */ *ap = v;
5852           struct vnode *vp = ap->a_vp;
5853           znode_t   *zp;
5854           zfsvfs_t *zfsvfs;
5855           int error;
5856 
5857           VOP_UNLOCK(vp, 0);
5858           zp = VTOZ(vp);
5859           zfsvfs = zp->z_zfsvfs;
5860 
5861           KASSERTMSG(!vn_has_cached_data(vp), "vp %p", vp);
5862 
5863           rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5864 
5865           /*
5866            * Process a deferred atime update.
5867            */
5868           if (zp->z_atime_dirty && zp->z_unlinked == 0 && zp->z_sa_hdl != NULL) {
5869                     dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
5870 
5871                     dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
5872                     zfs_sa_upgrade_txholds(tx, zp);
5873                     error = dmu_tx_assign(tx, TXG_WAIT);
5874                     if (error) {
5875                               dmu_tx_abort(tx);
5876                     } else {
5877                               (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
5878                                   (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
5879                               zp->z_atime_dirty = 0;
5880                               dmu_tx_commit(tx);
5881                     }
5882           }
5883 
5884           /*
5885            * Operation zfs_znode.c::zfs_zget_cleaner() depends on this
5886            * zil_commit() as a barrier to guarantee the znode cannot
5887            * get freed before its log entries are resolved.
5888            */
5889           if (zfsvfs->z_log)
5890                     zil_commit(zfsvfs->z_log, zp->z_id);
5891 
5892           if (zp->z_sa_hdl == NULL)
5893                     zfs_znode_free(zp);
5894           else
5895                     zfs_zinactive(zp);
5896           rw_exit(&zfsvfs->z_teardown_inactive_lock);
5897           return 0;
5898 }
5899 
5900 static int
zfs_netbsd_fid(void * v)5901 zfs_netbsd_fid(void *v)
5902 {
5903           struct vop_fid_args *ap = v;
5904 
5905           return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5906 }
5907 
5908 static int
zfs_netbsd_pathconf(void * v)5909 zfs_netbsd_pathconf(void *v)
5910 {
5911           struct vop_pathconf_args *ap = v;
5912           ulong_t val;
5913           int error;
5914 
5915           error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL);
5916           if (error == 0)
5917                     *ap->a_retval = val;
5918           else if (error == EOPNOTSUPP) {
5919                     switch (ap->a_name) {
5920                     case _PC_NAME_MAX:
5921                               *ap->a_retval = NAME_MAX;
5922                               return (0);
5923                     case _PC_PATH_MAX:
5924                               *ap->a_retval = PATH_MAX;
5925                               return (0);
5926                     case _PC_LINK_MAX:
5927                               *ap->a_retval = LINK_MAX;
5928                               return (0);
5929                     case _PC_MAX_CANON:
5930                               *ap->a_retval = MAX_CANON;
5931                               return (0);
5932                     case _PC_MAX_INPUT:
5933                               *ap->a_retval = MAX_INPUT;
5934                               return (0);
5935                     case _PC_PIPE_BUF:
5936                               *ap->a_retval = PIPE_BUF;
5937                               return (0);
5938                     case _PC_CHOWN_RESTRICTED:
5939                               *ap->a_retval = 1;
5940                               return (0);
5941                     case _PC_NO_TRUNC:
5942                               *ap->a_retval = 1;
5943                               return (0);
5944                     case _PC_VDISABLE:
5945                               *ap->a_retval = _POSIX_VDISABLE;
5946                               return (0);
5947                     default:
5948                               return (EINVAL);
5949                     }
5950                     /* NOTREACHED */
5951           }
5952           return (error);
5953 }
5954 
5955 static int
zfs_netbsd_advlock(void * v)5956 zfs_netbsd_advlock(void *v)
5957 {
5958           struct vop_advlock_args /* {
5959                     struct vnode *a_vp;
5960                     void *a_id;
5961                     int a_op;
5962                     struct flock *a_fl;
5963                     int a_flags;
5964           } */ *ap = v;
5965           struct vnode *vp;
5966           struct znode *zp;
5967           struct zfsvfs *zfsvfs;
5968           int error;
5969 
5970           vp = ap->a_vp;
5971           zp = VTOZ(vp);
5972           zfsvfs = zp->z_zfsvfs;
5973 
5974           ZFS_ENTER(zfsvfs);
5975           ZFS_VERIFY_ZP(zp);
5976           error = lf_advlock(ap, &zp->z_lockf, zp->z_size);
5977           ZFS_EXIT(zfsvfs);
5978 
5979           return error;
5980 }
5981 
5982 static int
zfs_netbsd_getpages(void * v)5983 zfs_netbsd_getpages(void *v)
5984 {
5985           struct vop_getpages_args /* {
5986                     struct vnode *a_vp;
5987                     voff_t a_offset;
5988                     struct vm_page **a_m;
5989                     int *a_count;
5990                     int a_centeridx;
5991                     vm_prot_t a_access_type;
5992                     int a_advice;
5993                     int a_flags;
5994           } */ * const ap = v;
5995 
5996           vnode_t *const vp = ap->a_vp;
5997           const int flags = ap->a_flags;
5998           const bool async = (flags & PGO_SYNCIO) == 0;
5999           const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
6000 
6001           struct uvm_object * const uobj = &vp->v_uobj;
6002           krwlock_t * const rw = uobj->vmobjlock;
6003           znode_t *zp = VTOZ(vp);
6004           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6005           vfs_t *mp;
6006           struct vm_page *pg;
6007           caddr_t va;
6008           int npages = *ap->a_count, found, err = 0;
6009 
6010           if (flags & PGO_LOCKED) {
6011                     uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL,
6012                         UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
6013                         (memwrite ? UFP_NORDONLY : 0));
6014                     KASSERT(npages == *ap->a_count);
6015                     if (memwrite) {
6016                               KASSERT(rw_write_held(uobj->vmobjlock));
6017                               for (int i = 0; i < npages; i++) {
6018                                         pg = ap->a_m[i];
6019                                         if (pg == NULL || pg == PGO_DONTCARE) {
6020                                                   continue;
6021                                         }
6022                                         if (uvm_pagegetdirty(pg) ==
6023                                             UVM_PAGE_STATUS_CLEAN) {
6024                                                   uvm_pagemarkdirty(pg,
6025                                                       UVM_PAGE_STATUS_UNKNOWN);
6026                                         }
6027                               }
6028                     }
6029                     return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
6030           }
6031           rw_exit(rw);
6032 
6033           if (async) {
6034                     return 0;
6035           }
6036 
6037           mp = vp->v_mount;
6038           fstrans_start(mp);
6039           if (vp->v_mount != mp) {
6040                     fstrans_done(mp);
6041                     return ENOENT;
6042           }
6043           ZFS_ENTER(zfsvfs);
6044           ZFS_VERIFY_ZP(zp);
6045 
6046           rw_enter(rw, RW_WRITER);
6047           if (ap->a_offset + (npages << PAGE_SHIFT) > round_page(vp->v_size)) {
6048                     rw_exit(rw);
6049                     ZFS_EXIT(zfsvfs);
6050                     fstrans_done(mp);
6051                     return EINVAL;
6052           }
6053           uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL, UFP_ALL);
6054           KASSERT(npages == *ap->a_count);
6055 
6056           for (int i = 0; i < npages; i++) {
6057                     pg = ap->a_m[i];
6058                     if (pg->flags & PG_FAKE) {
6059                               voff_t offset = pg->offset;
6060                               KASSERT(pg->offset == ap->a_offset + (i << PAGE_SHIFT));
6061                               rw_exit(rw);
6062 
6063                               va = zfs_map_page(pg, S_WRITE);
6064                               err = dmu_read(zfsvfs->z_os, zp->z_id, offset,
6065                                   PAGE_SIZE, va, DMU_READ_PREFETCH);
6066                               zfs_unmap_page(pg, va);
6067 
6068                               if (err != 0) {
6069                                         uvm_aio_aiodone_pages(ap->a_m, npages, false, err);
6070                                         memset(ap->a_m, 0, sizeof(ap->a_m[0]) *
6071                                             npages);
6072                                         break;
6073                               }
6074                               rw_enter(rw, RW_WRITER);
6075                               pg->flags &= ~(PG_FAKE);
6076                     }
6077 
6078                     if (memwrite && uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
6079                               /* For write faults, start dirtiness tracking. */
6080                               uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
6081                     }
6082           }
6083           rw_exit(rw);
6084 
6085           ZFS_EXIT(zfsvfs);
6086           fstrans_done(mp);
6087 
6088           return (err);
6089 }
6090 
6091 static int
zfs_putapage(vnode_t * vp,page_t ** pp,int count,int flags)6092 zfs_putapage(vnode_t *vp, page_t **pp, int count, int flags)
6093 {
6094           znode_t             *zp = VTOZ(vp);
6095           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
6096           dmu_tx_t  *tx;
6097           voff_t              off, koff;
6098           voff_t              len, klen;
6099           int                 err;
6100 
6101           bool *cleanedp;
6102           struct uvm_object *uobj = &vp->v_uobj;
6103           krwlock_t *rw = uobj->vmobjlock;
6104 
6105           if (zp->z_sa_hdl == NULL) {
6106                     err = 0;
6107                     goto out;
6108           }
6109 
6110           /*
6111            * Calculate the length and assert that no whole pages are past EOF.
6112            * This check is equivalent to "off + len <= round_page(zp->z_size)",
6113            * with gyrations to avoid signed integer overflow.
6114            */
6115 
6116           off = pp[0]->offset;
6117           len = count * PAGESIZE;
6118           KASSERT(off <= zp->z_size);
6119           KASSERT(len <= round_page(zp->z_size));
6120           KASSERT(off <= round_page(zp->z_size) - len);
6121 
6122           /*
6123            * If EOF is within the last page, reduce len to avoid writing past
6124            * the file size in the ZFS buffer.  Assert that
6125            * "off + len <= zp->z_size", again avoiding signed integer overflow.
6126            */
6127 
6128           if (len > zp->z_size - off) {
6129                     len = zp->z_size - off;
6130           }
6131           KASSERT(len <= zp->z_size);
6132           KASSERT(off <= zp->z_size - len);
6133 
6134           if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
6135               zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
6136                     err = SET_ERROR(EDQUOT);
6137                     goto out;
6138           }
6139           tx = dmu_tx_create(zfsvfs->z_os);
6140           dmu_tx_hold_write(tx, zp->z_id, off, len);
6141 
6142           dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
6143           zfs_sa_upgrade_txholds(tx, zp);
6144           err = dmu_tx_assign(tx, TXG_WAIT);
6145           if (err != 0) {
6146                     dmu_tx_abort(tx);
6147                     goto out;
6148           }
6149 
6150           if (zp->z_blksz <= PAGESIZE) {
6151                     KASSERTMSG(count == 1, "vp %p pp %p count %d", vp, pp, count);
6152                     caddr_t va = zfs_map_page(*pp, S_READ);
6153                     ASSERT3U(len, <=, PAGESIZE);
6154                     dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
6155                     zfs_unmap_page(*pp, va);
6156           } else {
6157                     err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
6158           }
6159           cleanedp = tsd_get(zfs_putpage_key);
6160           *cleanedp = true;
6161 
6162           if (err == 0) {
6163                     uint64_t mtime[2], ctime[2];
6164                     sa_bulk_attr_t bulk[3];
6165                     int count = 0;
6166 
6167                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
6168                         &mtime, 16);
6169                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
6170                         &ctime, 16);
6171                     SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
6172                         &zp->z_pflags, 8);
6173                     zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
6174                         B_TRUE);
6175                     err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
6176                     ASSERT0(err);
6177                     zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
6178           }
6179           dmu_tx_commit(tx);
6180 
6181 out:
6182           uvm_aio_aiodone_pages(pp, count, true, err);
6183           return (err);
6184 }
6185 
6186 static void
zfs_netbsd_gop_markupdate(vnode_t * vp,int flags)6187 zfs_netbsd_gop_markupdate(vnode_t *vp, int flags)
6188 {
6189           znode_t             *zp = VTOZ(vp);
6190           zfsvfs_t  *zfsvfs = zp->z_zfsvfs;
6191           dmu_tx_t  *tx;
6192           sa_bulk_attr_t      bulk[2];
6193           uint64_t  mtime[2], ctime[2];
6194           int                 count = 0, err;
6195 
6196           KASSERT(flags == GOP_UPDATE_MODIFIED);
6197 
6198           tx = dmu_tx_create(zfsvfs->z_os);
6199           err = dmu_tx_assign(tx, TXG_WAIT);
6200           if (err != 0) {
6201                     dmu_tx_abort(tx);
6202                     return;
6203           }
6204           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
6205           SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
6206           zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
6207           dmu_tx_commit(tx);
6208 }
6209 
6210 static int
zfs_netbsd_putpages(void * v)6211 zfs_netbsd_putpages(void *v)
6212 {
6213           struct vop_putpages_args /* {
6214                     struct vnode *a_vp;
6215                     voff_t a_offlo;
6216                     voff_t a_offhi;
6217                     int a_flags;
6218           } */ * const ap = v;
6219 
6220           struct vnode *vp = ap->a_vp;
6221           voff_t offlo = ap->a_offlo;
6222           voff_t offhi = ap->a_offhi;
6223           int flags = ap->a_flags;
6224 
6225           znode_t *zp = VTOZ(vp);
6226           zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6227           rl_t *rl = NULL;
6228           uint64_t len;
6229           int error;
6230           bool cleaned = false;
6231 
6232           bool async = (flags & PGO_SYNCIO) == 0;
6233           bool cleaning = (flags & PGO_CLEANIT) != 0;
6234 
6235           if (cleaning) {
6236                     ASSERT((offlo & PAGE_MASK) == 0 && (offhi & PAGE_MASK) == 0);
6237                     ASSERT(offlo < offhi || offhi == 0);
6238                     if (offhi == 0)
6239                               len = UINT64_MAX;
6240                     else
6241                               len = offhi - offlo;
6242                     rw_exit(vp->v_uobj.vmobjlock);
6243                     if (curlwp == uvm.pagedaemon_lwp) {
6244                               error = fstrans_start_nowait(vp->v_mount);
6245                               if (error)
6246                                         return error;
6247                     } else {
6248                               vfs_t *mp = vp->v_mount;
6249                               fstrans_start(mp);
6250                               if (vp->v_mount != mp) {
6251                                         fstrans_done(mp);
6252                                         ASSERT(!vn_has_cached_data(vp));
6253                                         return 0;
6254                               }
6255                     }
6256                     /*
6257                      * Cannot use ZFS_ENTER() here as it returns with error
6258                      * if z_unmounted.  The next statement is equivalent.
6259                      */
6260                     rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
6261 
6262                     rl = zfs_range_lock(zp, offlo, len, RL_WRITER);
6263                     rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
6264                     tsd_set(zfs_putpage_key, &cleaned);
6265           }
6266           error = genfs_putpages(v);
6267           if (cleaning) {
6268                     tsd_set(zfs_putpage_key, NULL);
6269                     zfs_range_unlock(rl);
6270 
6271                     /*
6272                      * Only zil_commit() if we cleaned something.  This avoids
6273                      * deadlock if we're called from zfs_netbsd_setsize().
6274                      */
6275 
6276                     if (cleaned)
6277                     if (!async || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
6278                               zil_commit(zfsvfs->z_log, zp->z_id);
6279                     ZFS_EXIT(zfsvfs);
6280                     fstrans_done(vp->v_mount);
6281           }
6282           return error;
6283 }
6284 
6285 /*
6286  * Restrict the putpages range to the ZFS block containing the offset.
6287  */
6288 static void
zfs_netbsd_gop_putrange(struct vnode * vp,off_t off,off_t * lop,off_t * hip)6289 zfs_netbsd_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
6290 {
6291           znode_t *zp = VTOZ(vp);
6292 
6293           *lop = trunc_page(rounddown2(off, zp->z_blksz));
6294           *hip = round_page(*lop + zp->z_blksz);
6295 }
6296 
6297 void
zfs_netbsd_setsize(vnode_t * vp,off_t size)6298 zfs_netbsd_setsize(vnode_t *vp, off_t size)
6299 {
6300           struct uvm_object *uobj = &vp->v_uobj;
6301           krwlock_t *rw = uobj->vmobjlock;
6302           page_t *pg;
6303           int count, pgoff;
6304           caddr_t va;
6305           off_t tsize;
6306 
6307           uvm_vnp_setsize(vp, size);
6308           if (!vn_has_cached_data(vp))
6309                     return;
6310 
6311           tsize = trunc_page(size);
6312           if (tsize == size)
6313                     return;
6314 
6315           /*
6316            * If there's a partial page, we need to zero the tail.
6317            */
6318 
6319           rw_enter(rw, RW_WRITER);
6320           count = 1;
6321           pg = NULL;
6322           if (uvn_findpages(uobj, tsize, &count, &pg, NULL, UFP_NOALLOC)) {
6323                     va = zfs_map_page(pg, S_WRITE);
6324                     pgoff = size - tsize;
6325                     memset(va + pgoff, 0, PAGESIZE - pgoff);
6326                     zfs_unmap_page(pg, va);
6327                     uvm_page_unbusy(&pg, 1);
6328           }
6329 
6330           rw_exit(rw);
6331 }
6332 
6333 static int
zfs_netbsd_print(void * v)6334 zfs_netbsd_print(void *v)
6335 {
6336           struct vop_print_args /* {
6337                     struct vnode        *a_vp;
6338           } */ *ap = v;
6339           vnode_t   *vp;
6340           znode_t   *zp;
6341 
6342           vp = ap->a_vp;
6343           zp = VTOZ(vp);
6344 
6345           printf("\tino %" PRIu64 " size %" PRIu64 "\n",
6346                  zp->z_id, zp->z_size);
6347           return 0;
6348 }
6349 
6350 const struct genfs_ops zfs_genfsops = {
6351         .gop_write = zfs_putapage,
6352           .gop_markupdate = zfs_netbsd_gop_markupdate,
6353           .gop_putrange = zfs_netbsd_gop_putrange,
6354 };
6355 
6356 int (**zfs_vnodeop_p)(void *);
6357 const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = {
6358           { &vop_default_desc,                    vn_default_error },
6359           { &vop_parsepath_desc,                  genfs_parsepath },
6360           { &vop_lookup_desc,           zfs_netbsd_lookup },
6361           { &vop_create_desc,           zfs_netbsd_create },
6362           { &vop_mknod_desc,            zfs_netbsd_mknod },
6363           { &vop_open_desc,             zfs_netbsd_open },
6364           { &vop_close_desc,            zfs_netbsd_close },
6365           { &vop_access_desc,           zfs_netbsd_access },
6366           { &vop_accessx_desc,                    genfs_accessx },
6367           { &vop_getattr_desc,                    zfs_netbsd_getattr },
6368           { &vop_setattr_desc,                    zfs_netbsd_setattr },
6369           { &vop_read_desc,             zfs_netbsd_read },
6370           { &vop_write_desc,            zfs_netbsd_write },
6371           { &vop_ioctl_desc,            zfs_netbsd_ioctl },
6372           { &vop_poll_desc,             genfs_poll },
6373           { &vop_kqfilter_desc,                   genfs_kqfilter },
6374           { &vop_revoke_desc,           genfs_revoke },
6375           { &vop_fsync_desc,            zfs_netbsd_fsync },
6376           { &vop_remove_desc,           zfs_netbsd_remove },
6377           { &vop_link_desc,             zfs_netbsd_link },
6378           { &vop_lock_desc,             genfs_lock },
6379           { &vop_unlock_desc,           genfs_unlock },
6380           { &vop_rename_desc,           zfs_netbsd_rename },
6381           { &vop_mkdir_desc,            zfs_netbsd_mkdir },
6382           { &vop_rmdir_desc,            zfs_netbsd_rmdir },
6383           { &vop_symlink_desc,                    zfs_netbsd_symlink },
6384           { &vop_readdir_desc,                    zfs_netbsd_readdir },
6385           { &vop_readlink_desc,                   zfs_netbsd_readlink },
6386           { &vop_inactive_desc,                   zfs_netbsd_inactive },
6387           { &vop_reclaim_desc,                    zfs_netbsd_reclaim },
6388           { &vop_pathconf_desc,                   zfs_netbsd_pathconf },
6389           { &vop_seek_desc,             genfs_seek },
6390           { &vop_getpages_desc,                   zfs_netbsd_getpages },
6391           { &vop_putpages_desc,                   zfs_netbsd_putpages },
6392           { &vop_mmap_desc,             genfs_mmap },
6393           { &vop_islocked_desc,                   genfs_islocked },
6394           { &vop_advlock_desc,                    zfs_netbsd_advlock },
6395           { &vop_print_desc,            zfs_netbsd_print },
6396           { &vop_fcntl_desc,            genfs_fcntl },
6397           { NULL, NULL }
6398 };
6399 
6400 const struct vnodeopv_desc zfs_vnodeop_opv_desc =
6401           { &zfs_vnodeop_p, zfs_vnodeop_entries };
6402 
6403 int (**zfs_specop_p)(void *);
6404 const struct vnodeopv_entry_desc zfs_specop_entries[] = {
6405           { &vop_default_desc,                    vn_default_error },
6406           GENFS_SPECOP_ENTRIES,
6407           { &vop_close_desc,            spec_close },
6408           { &vop_access_desc,           zfs_netbsd_access },
6409           { &vop_accessx_desc,                    genfs_accessx },
6410           { &vop_getattr_desc,                    zfs_netbsd_getattr },
6411           { &vop_setattr_desc,                    zfs_netbsd_setattr },
6412           { &vop_read_desc,             /**/zfs_netbsd_read },
6413           { &vop_write_desc,            /**/zfs_netbsd_write },
6414           { &vop_fsync_desc,            zfs_spec_fsync },
6415           { &vop_lock_desc,             genfs_lock },
6416           { &vop_unlock_desc,           genfs_unlock },
6417           { &vop_inactive_desc,                   zfs_netbsd_inactive },
6418           { &vop_reclaim_desc,                    zfs_netbsd_reclaim },
6419           { &vop_islocked_desc,                   genfs_islocked },
6420           { &vop_bwrite_desc,           vn_bwrite },
6421           { &vop_print_desc,            zfs_netbsd_print },
6422           { &vop_fcntl_desc,            genfs_fcntl },
6423           { NULL, NULL }
6424 };
6425 
6426 const struct vnodeopv_desc zfs_specop_opv_desc =
6427           { &zfs_specop_p, zfs_specop_entries };
6428 
6429 int (**zfs_fifoop_p)(void *);
6430 const struct vnodeopv_entry_desc zfs_fifoop_entries[] = {
6431           { &vop_default_desc,                    vn_default_error },
6432           GENFS_FIFOOP_ENTRIES,
6433           { &vop_close_desc,            vn_fifo_bypass },
6434           { &vop_access_desc,           zfs_netbsd_access },
6435           { &vop_accessx_desc,                    genfs_accessx },
6436           { &vop_getattr_desc,                    zfs_netbsd_getattr },
6437           { &vop_setattr_desc,                    zfs_netbsd_setattr },
6438           { &vop_read_desc,             /**/zfs_netbsd_read },
6439           { &vop_write_desc,            /**/zfs_netbsd_write },
6440           { &vop_fsync_desc,            zfs_netbsd_fsync },
6441           { &vop_lock_desc,             genfs_lock },
6442           { &vop_unlock_desc,           genfs_unlock },
6443           { &vop_inactive_desc,                   zfs_netbsd_inactive },
6444           { &vop_reclaim_desc,                    zfs_netbsd_reclaim },
6445           { &vop_islocked_desc,                   genfs_islocked },
6446           { &vop_bwrite_desc,           vn_bwrite },
6447           { &vop_strategy_desc,                   vn_fifo_bypass },
6448           { &vop_print_desc,            zfs_netbsd_print },
6449           { &vop_fcntl_desc,            genfs_fcntl },
6450           { NULL, NULL }
6451 };
6452 
6453 const struct vnodeopv_desc zfs_fifoop_opv_desc =
6454           { &zfs_fifoop_p, zfs_fifoop_entries };
6455 
6456 #endif /* __NetBSD__ */
6457