1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28 /* Portions Copyright 2007 Jeremy Teo */
29 /* Portions Copyright 2010 Robert Milkowski */
30
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/vfs.h>
38 #include <sys/vm.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/stat.h>
42 #include <sys/kmem.h>
43 #include <sys/taskq.h>
44 #include <sys/uio.h>
45 #include <sys/atomic.h>
46 #include <sys/namei.h>
47 #include <sys/mman.h>
48 #include <sys/cmn_err.h>
49 #include <sys/errno.h>
50 #include <sys/unistd.h>
51 #include <sys/zfs_dir.h>
52 #include <sys/zfs_ioctl.h>
53 #include <sys/fs/zfs.h>
54 #include <sys/dmu.h>
55 #include <sys/dmu_objset.h>
56 #include <sys/spa.h>
57 #include <sys/txg.h>
58 #include <sys/dbuf.h>
59 #include <sys/zap.h>
60 #include <sys/sa.h>
61 #include <sys/dirent.h>
62 #include <sys/policy.h>
63 #include <sys/sunddi.h>
64 #include <sys/filio.h>
65 #include <sys/sid.h>
66 #include <sys/zfs_ctldir.h>
67 #include <sys/zfs_fuid.h>
68 #include <sys/zfs_sa.h>
69 #include <sys/zfs_rlock.h>
70 #include <sys/extdirent.h>
71 #include <sys/kidmap.h>
72 #include <sys/bio.h>
73 #include <sys/buf.h>
74 #include <sys/sched.h>
75 #include <sys/acl.h>
76 #include <vm/vm_param.h>
77
78 /*
79 * Programming rules.
80 *
81 * Each vnode op performs some logical unit of work. To do this, the ZPL must
82 * properly lock its in-core state, create a DMU transaction, do the work,
83 * record this work in the intent log (ZIL), commit the DMU transaction,
84 * and wait for the intent log to commit if it is a synchronous operation.
85 * Moreover, the vnode ops must work in both normal and log replay context.
86 * The ordering of events is important to avoid deadlocks and references
87 * to freed memory. The example below illustrates the following Big Rules:
88 *
89 * (1) A check must be made in each zfs thread for a mounted file system.
90 * This is done avoiding races using ZFS_ENTER(zfsvfs).
91 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
92 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
93 * can return EIO from the calling function.
94 *
95 * (2) VN_RELE() should always be the last thing except for zil_commit()
96 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
97 * First, if it's the last reference, the vnode/znode
98 * can be freed, so the zp may point to freed memory. Second, the last
99 * reference will call zfs_zinactive(), which may induce a lot of work --
100 * pushing cached pages (which acquires range locks) and syncing out
101 * cached atime changes. Third, zfs_zinactive() may require a new tx,
102 * which could deadlock the system if you were already holding one.
103 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
104 *
105 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
106 * as they can span dmu_tx_assign() calls.
107 *
108 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
109 * dmu_tx_assign(). This is critical because we don't want to block
110 * while holding locks.
111 *
112 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
113 * reduces lock contention and CPU usage when we must wait (note that if
114 * throughput is constrained by the storage, nearly every transaction
115 * must wait).
116 *
117 * Note, in particular, that if a lock is sometimes acquired before
118 * the tx assigns, and sometimes after (e.g. z_lock), then failing
119 * to use a non-blocking assign can deadlock the system. The scenario:
120 *
121 * Thread A has grabbed a lock before calling dmu_tx_assign().
122 * Thread B is in an already-assigned tx, and blocks for this lock.
123 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
124 * forever, because the previous txg can't quiesce until B's tx commits.
125 *
126 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
127 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
128 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
129 * to indicate that this operation has already called dmu_tx_wait().
130 * This will ensure that we don't retry forever, waiting a short bit
131 * each time.
132 *
133 * (5) If the operation succeeded, generate the intent log entry for it
134 * before dropping locks. This ensures that the ordering of events
135 * in the intent log matches the order in which they actually occurred.
136 * During ZIL replay the zfs_log_* functions will update the sequence
137 * number to indicate the zil transaction has replayed.
138 *
139 * (6) At the end of each vnode op, the DMU tx must always commit,
140 * regardless of whether there were any errors.
141 *
142 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
143 * to ensure that synchronous semantics are provided when necessary.
144 *
145 * In general, this is how things should be ordered in each vnode op:
146 *
147 * ZFS_ENTER(zfsvfs); // exit if unmounted
148 * top:
149 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
150 * rw_enter(...); // grab any other locks you need
151 * tx = dmu_tx_create(...); // get DMU tx
152 * dmu_tx_hold_*(); // hold each object you might modify
153 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
154 * if (error) {
155 * rw_exit(...); // drop locks
156 * zfs_dirent_unlock(dl); // unlock directory entry
157 * VN_RELE(...); // release held vnodes
158 * if (error == ERESTART) {
159 * waited = B_TRUE;
160 * dmu_tx_wait(tx);
161 * dmu_tx_abort(tx);
162 * goto top;
163 * }
164 * dmu_tx_abort(tx); // abort DMU tx
165 * ZFS_EXIT(zfsvfs); // finished in zfs
166 * return (error); // really out of space
167 * }
168 * error = do_real_work(); // do whatever this VOP does
169 * if (error == 0)
170 * zfs_log_*(...); // on success, make ZIL entry
171 * dmu_tx_commit(tx); // commit DMU tx -- error or not
172 * rw_exit(...); // drop locks
173 * zfs_dirent_unlock(dl); // unlock directory entry
174 * VN_RELE(...); // release held vnodes
175 * zil_commit(zilog, foid); // synchronous when necessary
176 * ZFS_EXIT(zfsvfs); // finished in zfs
177 * return (error); // done, report error
178 */
179
180 /* ARGSUSED */
181 static int
zfs_open(vnode_t ** vpp,int flag,cred_t * cr,caller_context_t * ct)182 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
183 {
184 znode_t *zp = VTOZ(*vpp);
185 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
186
187 ZFS_ENTER(zfsvfs);
188 ZFS_VERIFY_ZP(zp);
189
190 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
191 ((flag & FAPPEND) == 0)) {
192 ZFS_EXIT(zfsvfs);
193 return (SET_ERROR(EPERM));
194 }
195
196 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
197 ZTOV(zp)->v_type == VREG &&
198 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
199 if (fs_vscan(*vpp, cr, 0) != 0) {
200 ZFS_EXIT(zfsvfs);
201 return (SET_ERROR(EACCES));
202 }
203 }
204
205 /* Keep a count of the synchronous opens in the znode */
206 if (flag & (FSYNC | FDSYNC))
207 atomic_inc_32(&zp->z_sync_cnt);
208
209 ZFS_EXIT(zfsvfs);
210 return (0);
211 }
212
213 /* ARGSUSED */
214 static int
zfs_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr,caller_context_t * ct)215 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
216 caller_context_t *ct)
217 {
218 znode_t *zp = VTOZ(vp);
219 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
220
221 /*
222 * Clean up any locks held by this process on the vp.
223 */
224 cleanlocks(vp, ddi_get_pid(), 0);
225 cleanshares(vp, ddi_get_pid());
226
227 ZFS_ENTER(zfsvfs);
228 ZFS_VERIFY_ZP(zp);
229
230 /* Decrement the synchronous opens in the znode */
231 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
232 atomic_dec_32(&zp->z_sync_cnt);
233
234 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
235 ZTOV(zp)->v_type == VREG &&
236 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
237 VERIFY(fs_vscan(vp, cr, 1) == 0);
238
239 ZFS_EXIT(zfsvfs);
240 return (0);
241 }
242
243 /*
244 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
245 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
246 */
247 static int
zfs_holey(vnode_t * vp,u_long cmd,offset_t * off)248 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
249 {
250 znode_t *zp = VTOZ(vp);
251 uint64_t noff = (uint64_t)*off; /* new offset */
252 uint64_t file_sz;
253 int error;
254 boolean_t hole;
255
256 file_sz = zp->z_size;
257 if (noff >= file_sz) {
258 return (SET_ERROR(ENXIO));
259 }
260
261 if (cmd == _FIO_SEEK_HOLE)
262 hole = B_TRUE;
263 else
264 hole = B_FALSE;
265
266 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
267
268 if (error == ESRCH)
269 return (SET_ERROR(ENXIO));
270
271 /*
272 * We could find a hole that begins after the logical end-of-file,
273 * because dmu_offset_next() only works on whole blocks. If the
274 * EOF falls mid-block, then indicate that the "virtual hole"
275 * at the end of the file begins at the logical EOF, rather than
276 * at the end of the last block.
277 */
278 if (noff > file_sz) {
279 ASSERT(hole);
280 noff = file_sz;
281 }
282
283 if (noff < *off)
284 return (error);
285 *off = noff;
286 return (error);
287 }
288
289 /* ARGSUSED */
290 static int
zfs_ioctl(vnode_t * vp,u_long com,intptr_t data,int flag,cred_t * cred,int * rvalp,caller_context_t * ct)291 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
292 int *rvalp, caller_context_t *ct)
293 {
294 offset_t off;
295 offset_t ndata;
296 dmu_object_info_t doi;
297 int error;
298 zfsvfs_t *zfsvfs;
299 znode_t *zp;
300
301 switch (com) {
302 case _FIOFFS:
303 {
304 return (0);
305
306 /*
307 * The following two ioctls are used by bfu. Faking out,
308 * necessary to avoid bfu errors.
309 */
310 }
311 case _FIOGDIO:
312 case _FIOSDIO:
313 {
314 return (0);
315 }
316
317 case _FIO_SEEK_DATA:
318 case _FIO_SEEK_HOLE:
319 {
320 #ifdef illumos
321 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
322 return (SET_ERROR(EFAULT));
323 #else
324 off = *(offset_t *)data;
325 #endif
326 zp = VTOZ(vp);
327 zfsvfs = zp->z_zfsvfs;
328 ZFS_ENTER(zfsvfs);
329 ZFS_VERIFY_ZP(zp);
330
331 /* offset parameter is in/out */
332 error = zfs_holey(vp, com, &off);
333 ZFS_EXIT(zfsvfs);
334 if (error)
335 return (error);
336 #ifdef illumos
337 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
338 return (SET_ERROR(EFAULT));
339 #else
340 *(offset_t *)data = off;
341 #endif
342 return (0);
343 }
344 #ifdef illumos
345 case _FIO_COUNT_FILLED:
346 {
347 /*
348 * _FIO_COUNT_FILLED adds a new ioctl command which
349 * exposes the number of filled blocks in a
350 * ZFS object.
351 */
352 zp = VTOZ(vp);
353 zfsvfs = zp->z_zfsvfs;
354 ZFS_ENTER(zfsvfs);
355 ZFS_VERIFY_ZP(zp);
356
357 /*
358 * Wait for all dirty blocks for this object
359 * to get synced out to disk, and the DMU info
360 * updated.
361 */
362 error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
363 if (error) {
364 ZFS_EXIT(zfsvfs);
365 return (error);
366 }
367
368 /*
369 * Retrieve fill count from DMU object.
370 */
371 error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
372 if (error) {
373 ZFS_EXIT(zfsvfs);
374 return (error);
375 }
376
377 ndata = doi.doi_fill_count;
378
379 ZFS_EXIT(zfsvfs);
380 if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
381 return (SET_ERROR(EFAULT));
382 return (0);
383 }
384 #endif
385 }
386 return (SET_ERROR(ENOTTY));
387 }
388
389 static vm_page_t
page_busy(vnode_t * vp,int64_t start,int64_t off,int64_t nbytes)390 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
391 {
392 vm_object_t obj;
393 vm_page_t pp;
394 int64_t end;
395
396 /*
397 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
398 * aligned boundaries, if the range is not aligned. As a result a
399 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
400 * It may happen that all DEV_BSIZE subranges are marked clean and thus
401 * the whole page would be considred clean despite have some dirty data.
402 * For this reason we should shrink the range to DEV_BSIZE aligned
403 * boundaries before calling vm_page_clear_dirty.
404 */
405 end = rounddown2(off + nbytes, DEV_BSIZE);
406 off = roundup2(off, DEV_BSIZE);
407 nbytes = end - off;
408
409 obj = vp->v_object;
410 zfs_vmobject_assert_wlocked(obj);
411
412 for (;;) {
413 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
414 pp->valid) {
415 if (vm_page_xbusied(pp)) {
416 /*
417 * Reference the page before unlocking and
418 * sleeping so that the page daemon is less
419 * likely to reclaim it.
420 */
421 vm_page_reference(pp);
422 vm_page_lock(pp);
423 zfs_vmobject_wunlock(obj);
424 vm_page_busy_sleep(pp, "zfsmwb", true);
425 zfs_vmobject_wlock(obj);
426 continue;
427 }
428 vm_page_sbusy(pp);
429 } else if (pp == NULL) {
430 pp = vm_page_alloc(obj, OFF_TO_IDX(start),
431 VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
432 VM_ALLOC_SBUSY);
433 } else {
434 ASSERT(pp != NULL && !pp->valid);
435 pp = NULL;
436 }
437
438 if (pp != NULL) {
439 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
440 vm_object_pip_add(obj, 1);
441 pmap_remove_write(pp);
442 if (nbytes != 0)
443 vm_page_clear_dirty(pp, off, nbytes);
444 }
445 break;
446 }
447 return (pp);
448 }
449
450 static void
page_unbusy(vm_page_t pp)451 page_unbusy(vm_page_t pp)
452 {
453
454 vm_page_sunbusy(pp);
455 vm_object_pip_subtract(pp->object, 1);
456 }
457
458 static vm_page_t
page_hold(vnode_t * vp,int64_t start)459 page_hold(vnode_t *vp, int64_t start)
460 {
461 vm_object_t obj;
462 vm_page_t pp;
463
464 obj = vp->v_object;
465 zfs_vmobject_assert_wlocked(obj);
466
467 for (;;) {
468 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
469 pp->valid) {
470 if (vm_page_xbusied(pp)) {
471 /*
472 * Reference the page before unlocking and
473 * sleeping so that the page daemon is less
474 * likely to reclaim it.
475 */
476 vm_page_reference(pp);
477 vm_page_lock(pp);
478 zfs_vmobject_wunlock(obj);
479 vm_page_busy_sleep(pp, "zfsmwb", true);
480 zfs_vmobject_wlock(obj);
481 continue;
482 }
483
484 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
485 vm_page_lock(pp);
486 vm_page_hold(pp);
487 vm_page_unlock(pp);
488
489 } else
490 pp = NULL;
491 break;
492 }
493 return (pp);
494 }
495
496 static void
page_unhold(vm_page_t pp)497 page_unhold(vm_page_t pp)
498 {
499
500 vm_page_lock(pp);
501 vm_page_unhold(pp);
502 vm_page_unlock(pp);
503 }
504
505 /*
506 * When a file is memory mapped, we must keep the IO data synchronized
507 * between the DMU cache and the memory mapped pages. What this means:
508 *
509 * On Write: If we find a memory mapped page, we write to *both*
510 * the page and the dmu buffer.
511 */
512 static void
update_pages(vnode_t * vp,int64_t start,int len,objset_t * os,uint64_t oid,int segflg,dmu_tx_t * tx)513 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
514 int segflg, dmu_tx_t *tx)
515 {
516 vm_object_t obj;
517 struct sf_buf *sf;
518 caddr_t va;
519 int off;
520
521 ASSERT(segflg != UIO_NOCOPY);
522 ASSERT(vp->v_mount != NULL);
523 obj = vp->v_object;
524 ASSERT(obj != NULL);
525
526 off = start & PAGEOFFSET;
527 zfs_vmobject_wlock(obj);
528 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
529 vm_page_t pp;
530 int nbytes = imin(PAGESIZE - off, len);
531
532 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
533 zfs_vmobject_wunlock(obj);
534
535 va = zfs_map_page(pp, &sf);
536 (void) dmu_read(os, oid, start+off, nbytes,
537 va+off, DMU_READ_PREFETCH);;
538 zfs_unmap_page(sf);
539
540 zfs_vmobject_wlock(obj);
541 page_unbusy(pp);
542 }
543 len -= nbytes;
544 off = 0;
545 }
546 vm_object_pip_wakeupn(obj, 0);
547 zfs_vmobject_wunlock(obj);
548 }
549
550 /*
551 * Read with UIO_NOCOPY flag means that sendfile(2) requests
552 * ZFS to populate a range of page cache pages with data.
553 *
554 * NOTE: this function could be optimized to pre-allocate
555 * all pages in advance, drain exclusive busy on all of them,
556 * map them into contiguous KVA region and populate them
557 * in one single dmu_read() call.
558 */
559 static int
mappedread_sf(vnode_t * vp,int nbytes,uio_t * uio)560 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
561 {
562 znode_t *zp = VTOZ(vp);
563 objset_t *os = zp->z_zfsvfs->z_os;
564 struct sf_buf *sf;
565 vm_object_t obj;
566 vm_page_t pp;
567 int64_t start;
568 caddr_t va;
569 int len = nbytes;
570 int off;
571 int error = 0;
572
573 ASSERT(uio->uio_segflg == UIO_NOCOPY);
574 ASSERT(vp->v_mount != NULL);
575 obj = vp->v_object;
576 ASSERT(obj != NULL);
577 ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
578
579 zfs_vmobject_wlock(obj);
580 for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
581 int bytes = MIN(PAGESIZE, len);
582
583 pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
584 VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
585 if (pp->valid == 0) {
586 zfs_vmobject_wunlock(obj);
587 va = zfs_map_page(pp, &sf);
588 error = dmu_read(os, zp->z_id, start, bytes, va,
589 DMU_READ_PREFETCH);
590 if (bytes != PAGESIZE && error == 0)
591 bzero(va + bytes, PAGESIZE - bytes);
592 zfs_unmap_page(sf);
593 zfs_vmobject_wlock(obj);
594 vm_page_sunbusy(pp);
595 vm_page_lock(pp);
596 if (error) {
597 if (pp->wire_count == 0 && pp->valid == 0 &&
598 !vm_page_busied(pp))
599 vm_page_free(pp);
600 } else {
601 pp->valid = VM_PAGE_BITS_ALL;
602 vm_page_activate(pp);
603 }
604 vm_page_unlock(pp);
605 } else {
606 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
607 vm_page_sunbusy(pp);
608 }
609 if (error)
610 break;
611 uio->uio_resid -= bytes;
612 uio->uio_offset += bytes;
613 len -= bytes;
614 }
615 zfs_vmobject_wunlock(obj);
616 return (error);
617 }
618
619 /*
620 * When a file is memory mapped, we must keep the IO data synchronized
621 * between the DMU cache and the memory mapped pages. What this means:
622 *
623 * On Read: We "read" preferentially from memory mapped pages,
624 * else we default from the dmu buffer.
625 *
626 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
627 * the file is memory mapped.
628 */
629 static int
mappedread(vnode_t * vp,int nbytes,uio_t * uio)630 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
631 {
632 znode_t *zp = VTOZ(vp);
633 vm_object_t obj;
634 int64_t start;
635 caddr_t va;
636 int len = nbytes;
637 int off;
638 int error = 0;
639
640 ASSERT(vp->v_mount != NULL);
641 obj = vp->v_object;
642 ASSERT(obj != NULL);
643
644 start = uio->uio_loffset;
645 off = start & PAGEOFFSET;
646 zfs_vmobject_wlock(obj);
647 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
648 vm_page_t pp;
649 uint64_t bytes = MIN(PAGESIZE - off, len);
650
651 if (pp = page_hold(vp, start)) {
652 struct sf_buf *sf;
653 caddr_t va;
654
655 zfs_vmobject_wunlock(obj);
656 va = zfs_map_page(pp, &sf);
657 #ifdef illumos
658 error = uiomove(va + off, bytes, UIO_READ, uio);
659 #else
660 error = vn_io_fault_uiomove(va + off, bytes, uio);
661 #endif
662 zfs_unmap_page(sf);
663 zfs_vmobject_wlock(obj);
664 page_unhold(pp);
665 } else {
666 zfs_vmobject_wunlock(obj);
667 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
668 uio, bytes);
669 zfs_vmobject_wlock(obj);
670 }
671 len -= bytes;
672 off = 0;
673 if (error)
674 break;
675 }
676 zfs_vmobject_wunlock(obj);
677 return (error);
678 }
679
680 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
681
682 /*
683 * Read bytes from specified file into supplied buffer.
684 *
685 * IN: vp - vnode of file to be read from.
686 * uio - structure supplying read location, range info,
687 * and return buffer.
688 * ioflag - SYNC flags; used to provide FRSYNC semantics.
689 * cr - credentials of caller.
690 * ct - caller context
691 *
692 * OUT: uio - updated offset and range, buffer filled.
693 *
694 * RETURN: 0 on success, error code on failure.
695 *
696 * Side Effects:
697 * vp - atime updated if byte count > 0
698 */
699 /* ARGSUSED */
700 static int
zfs_read(vnode_t * vp,uio_t * uio,int ioflag,cred_t * cr,caller_context_t * ct)701 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
702 {
703 znode_t *zp = VTOZ(vp);
704 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
705 ssize_t n, nbytes;
706 int error = 0;
707 rl_t *rl;
708 xuio_t *xuio = NULL;
709
710 ZFS_ENTER(zfsvfs);
711 ZFS_VERIFY_ZP(zp);
712
713 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
714 ZFS_EXIT(zfsvfs);
715 return (SET_ERROR(EACCES));
716 }
717
718 /*
719 * Validate file offset
720 */
721 if (uio->uio_loffset < (offset_t)0) {
722 ZFS_EXIT(zfsvfs);
723 return (SET_ERROR(EINVAL));
724 }
725
726 /*
727 * Fasttrack empty reads
728 */
729 if (uio->uio_resid == 0) {
730 ZFS_EXIT(zfsvfs);
731 return (0);
732 }
733
734 /*
735 * Check for mandatory locks
736 */
737 if (MANDMODE(zp->z_mode)) {
738 if (error = chklock(vp, FREAD,
739 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
740 ZFS_EXIT(zfsvfs);
741 return (error);
742 }
743 }
744
745 /*
746 * If we're in FRSYNC mode, sync out this znode before reading it.
747 */
748 if (zfsvfs->z_log &&
749 (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
750 zil_commit(zfsvfs->z_log, zp->z_id);
751
752 /*
753 * Lock the range against changes.
754 */
755 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
756
757 /*
758 * If we are reading past end-of-file we can skip
759 * to the end; but we might still need to set atime.
760 */
761 if (uio->uio_loffset >= zp->z_size) {
762 error = 0;
763 goto out;
764 }
765
766 ASSERT(uio->uio_loffset < zp->z_size);
767 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
768
769 #ifdef illumos
770 if ((uio->uio_extflg == UIO_XUIO) &&
771 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
772 int nblk;
773 int blksz = zp->z_blksz;
774 uint64_t offset = uio->uio_loffset;
775
776 xuio = (xuio_t *)uio;
777 if ((ISP2(blksz))) {
778 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
779 blksz)) / blksz;
780 } else {
781 ASSERT(offset + n <= blksz);
782 nblk = 1;
783 }
784 (void) dmu_xuio_init(xuio, nblk);
785
786 if (vn_has_cached_data(vp)) {
787 /*
788 * For simplicity, we always allocate a full buffer
789 * even if we only expect to read a portion of a block.
790 */
791 while (--nblk >= 0) {
792 (void) dmu_xuio_add(xuio,
793 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
794 blksz), 0, blksz);
795 }
796 }
797 }
798 #endif /* illumos */
799
800 while (n > 0) {
801 nbytes = MIN(n, zfs_read_chunk_size -
802 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
803
804 #ifdef __FreeBSD__
805 if (uio->uio_segflg == UIO_NOCOPY)
806 error = mappedread_sf(vp, nbytes, uio);
807 else
808 #endif /* __FreeBSD__ */
809 if (vn_has_cached_data(vp)) {
810 error = mappedread(vp, nbytes, uio);
811 } else {
812 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
813 uio, nbytes);
814 }
815 if (error) {
816 /* convert checksum errors into IO errors */
817 if (error == ECKSUM)
818 error = SET_ERROR(EIO);
819 break;
820 }
821
822 n -= nbytes;
823 }
824 out:
825 zfs_range_unlock(rl);
826
827 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
828 ZFS_EXIT(zfsvfs);
829 return (error);
830 }
831
832 /*
833 * Write the bytes to a file.
834 *
835 * IN: vp - vnode of file to be written to.
836 * uio - structure supplying write location, range info,
837 * and data buffer.
838 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
839 * set if in append mode.
840 * cr - credentials of caller.
841 * ct - caller context (NFS/CIFS fem monitor only)
842 *
843 * OUT: uio - updated offset and range.
844 *
845 * RETURN: 0 on success, error code on failure.
846 *
847 * Timestamps:
848 * vp - ctime|mtime updated if byte count > 0
849 */
850
851 /* ARGSUSED */
852 static int
zfs_write(vnode_t * vp,uio_t * uio,int ioflag,cred_t * cr,caller_context_t * ct)853 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
854 {
855 znode_t *zp = VTOZ(vp);
856 rlim64_t limit = MAXOFFSET_T;
857 ssize_t start_resid = uio->uio_resid;
858 ssize_t tx_bytes;
859 uint64_t end_size;
860 dmu_tx_t *tx;
861 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
862 zilog_t *zilog;
863 offset_t woff;
864 ssize_t n, nbytes;
865 rl_t *rl;
866 int max_blksz = zfsvfs->z_max_blksz;
867 int error = 0;
868 arc_buf_t *abuf;
869 iovec_t *aiov = NULL;
870 xuio_t *xuio = NULL;
871 int i_iov = 0;
872 int iovcnt = uio->uio_iovcnt;
873 iovec_t *iovp = uio->uio_iov;
874 int write_eof;
875 int count = 0;
876 sa_bulk_attr_t bulk[4];
877 uint64_t mtime[2], ctime[2];
878
879 /*
880 * Fasttrack empty write
881 */
882 n = start_resid;
883 if (n == 0)
884 return (0);
885
886 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
887 limit = MAXOFFSET_T;
888
889 ZFS_ENTER(zfsvfs);
890 ZFS_VERIFY_ZP(zp);
891
892 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
893 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
894 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
895 &zp->z_size, 8);
896 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
897 &zp->z_pflags, 8);
898
899 /*
900 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
901 * callers might not be able to detect properly that we are read-only,
902 * so check it explicitly here.
903 */
904 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
905 ZFS_EXIT(zfsvfs);
906 return (SET_ERROR(EROFS));
907 }
908
909 /*
910 * If immutable or not appending then return EPERM
911 */
912 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
913 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
914 (uio->uio_loffset < zp->z_size))) {
915 ZFS_EXIT(zfsvfs);
916 return (SET_ERROR(EPERM));
917 }
918
919 zilog = zfsvfs->z_log;
920
921 /*
922 * Validate file offset
923 */
924 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
925 if (woff < 0) {
926 ZFS_EXIT(zfsvfs);
927 return (SET_ERROR(EINVAL));
928 }
929
930 /*
931 * Check for mandatory locks before calling zfs_range_lock()
932 * in order to prevent a deadlock with locks set via fcntl().
933 */
934 if (MANDMODE((mode_t)zp->z_mode) &&
935 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
936 ZFS_EXIT(zfsvfs);
937 return (error);
938 }
939
940 #ifdef illumos
941 /*
942 * Pre-fault the pages to ensure slow (eg NFS) pages
943 * don't hold up txg.
944 * Skip this if uio contains loaned arc_buf.
945 */
946 if ((uio->uio_extflg == UIO_XUIO) &&
947 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
948 xuio = (xuio_t *)uio;
949 else
950 uio_prefaultpages(MIN(n, max_blksz), uio);
951 #endif
952
953 /*
954 * If in append mode, set the io offset pointer to eof.
955 */
956 if (ioflag & FAPPEND) {
957 /*
958 * Obtain an appending range lock to guarantee file append
959 * semantics. We reset the write offset once we have the lock.
960 */
961 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
962 woff = rl->r_off;
963 if (rl->r_len == UINT64_MAX) {
964 /*
965 * We overlocked the file because this write will cause
966 * the file block size to increase.
967 * Note that zp_size cannot change with this lock held.
968 */
969 woff = zp->z_size;
970 }
971 uio->uio_loffset = woff;
972 } else {
973 /*
974 * Note that if the file block size will change as a result of
975 * this write, then this range lock will lock the entire file
976 * so that we can re-write the block safely.
977 */
978 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
979 }
980
981 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
982 zfs_range_unlock(rl);
983 ZFS_EXIT(zfsvfs);
984 return (EFBIG);
985 }
986
987 if (woff >= limit) {
988 zfs_range_unlock(rl);
989 ZFS_EXIT(zfsvfs);
990 return (SET_ERROR(EFBIG));
991 }
992
993 if ((woff + n) > limit || woff > (limit - n))
994 n = limit - woff;
995
996 /* Will this write extend the file length? */
997 write_eof = (woff + n > zp->z_size);
998
999 end_size = MAX(zp->z_size, woff + n);
1000
1001 /*
1002 * Write the file in reasonable size chunks. Each chunk is written
1003 * in a separate transaction; this keeps the intent log records small
1004 * and allows us to do more fine-grained space accounting.
1005 */
1006 while (n > 0) {
1007 abuf = NULL;
1008 woff = uio->uio_loffset;
1009 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1010 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1011 if (abuf != NULL)
1012 dmu_return_arcbuf(abuf);
1013 error = SET_ERROR(EDQUOT);
1014 break;
1015 }
1016
1017 if (xuio && abuf == NULL) {
1018 ASSERT(i_iov < iovcnt);
1019 aiov = &iovp[i_iov];
1020 abuf = dmu_xuio_arcbuf(xuio, i_iov);
1021 dmu_xuio_clear(xuio, i_iov);
1022 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1023 iovec_t *, aiov, arc_buf_t *, abuf);
1024 ASSERT((aiov->iov_base == abuf->b_data) ||
1025 ((char *)aiov->iov_base - (char *)abuf->b_data +
1026 aiov->iov_len == arc_buf_size(abuf)));
1027 i_iov++;
1028 } else if (abuf == NULL && n >= max_blksz &&
1029 woff >= zp->z_size &&
1030 P2PHASE(woff, max_blksz) == 0 &&
1031 zp->z_blksz == max_blksz) {
1032 /*
1033 * This write covers a full block. "Borrow" a buffer
1034 * from the dmu so that we can fill it before we enter
1035 * a transaction. This avoids the possibility of
1036 * holding up the transaction if the data copy hangs
1037 * up on a pagefault (e.g., from an NFS server mapping).
1038 */
1039 size_t cbytes;
1040
1041 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1042 max_blksz);
1043 ASSERT(abuf != NULL);
1044 ASSERT(arc_buf_size(abuf) == max_blksz);
1045 if (error = uiocopy(abuf->b_data, max_blksz,
1046 UIO_WRITE, uio, &cbytes)) {
1047 dmu_return_arcbuf(abuf);
1048 break;
1049 }
1050 ASSERT(cbytes == max_blksz);
1051 }
1052
1053 /*
1054 * Start a transaction.
1055 */
1056 tx = dmu_tx_create(zfsvfs->z_os);
1057 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1058 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1059 zfs_sa_upgrade_txholds(tx, zp);
1060 error = dmu_tx_assign(tx, TXG_WAIT);
1061 if (error) {
1062 dmu_tx_abort(tx);
1063 if (abuf != NULL)
1064 dmu_return_arcbuf(abuf);
1065 break;
1066 }
1067
1068 /*
1069 * If zfs_range_lock() over-locked we grow the blocksize
1070 * and then reduce the lock range. This will only happen
1071 * on the first iteration since zfs_range_reduce() will
1072 * shrink down r_len to the appropriate size.
1073 */
1074 if (rl->r_len == UINT64_MAX) {
1075 uint64_t new_blksz;
1076
1077 if (zp->z_blksz > max_blksz) {
1078 /*
1079 * File's blocksize is already larger than the
1080 * "recordsize" property. Only let it grow to
1081 * the next power of 2.
1082 */
1083 ASSERT(!ISP2(zp->z_blksz));
1084 new_blksz = MIN(end_size,
1085 1 << highbit64(zp->z_blksz));
1086 } else {
1087 new_blksz = MIN(end_size, max_blksz);
1088 }
1089 zfs_grow_blocksize(zp, new_blksz, tx);
1090 zfs_range_reduce(rl, woff, n);
1091 }
1092
1093 /*
1094 * XXX - should we really limit each write to z_max_blksz?
1095 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1096 */
1097 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1098
1099 if (woff + nbytes > zp->z_size)
1100 vnode_pager_setsize(vp, woff + nbytes);
1101
1102 if (abuf == NULL) {
1103 tx_bytes = uio->uio_resid;
1104 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1105 uio, nbytes, tx);
1106 tx_bytes -= uio->uio_resid;
1107 } else {
1108 tx_bytes = nbytes;
1109 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1110 /*
1111 * If this is not a full block write, but we are
1112 * extending the file past EOF and this data starts
1113 * block-aligned, use assign_arcbuf(). Otherwise,
1114 * write via dmu_write().
1115 */
1116 if (tx_bytes < max_blksz && (!write_eof ||
1117 aiov->iov_base != abuf->b_data)) {
1118 ASSERT(xuio);
1119 dmu_write(zfsvfs->z_os, zp->z_id, woff,
1120 aiov->iov_len, aiov->iov_base, tx);
1121 dmu_return_arcbuf(abuf);
1122 xuio_stat_wbuf_copied();
1123 } else {
1124 ASSERT(xuio || tx_bytes == max_blksz);
1125 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1126 woff, abuf, tx);
1127 }
1128 ASSERT(tx_bytes <= uio->uio_resid);
1129 uioskip(uio, tx_bytes);
1130 }
1131 if (tx_bytes && vn_has_cached_data(vp)) {
1132 update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1133 zp->z_id, uio->uio_segflg, tx);
1134 }
1135
1136 /*
1137 * If we made no progress, we're done. If we made even
1138 * partial progress, update the znode and ZIL accordingly.
1139 */
1140 if (tx_bytes == 0) {
1141 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1142 (void *)&zp->z_size, sizeof (uint64_t), tx);
1143 dmu_tx_commit(tx);
1144 ASSERT(error != 0);
1145 break;
1146 }
1147
1148 /*
1149 * Clear Set-UID/Set-GID bits on successful write if not
1150 * privileged and at least one of the excute bits is set.
1151 *
1152 * It would be nice to to this after all writes have
1153 * been done, but that would still expose the ISUID/ISGID
1154 * to another app after the partial write is committed.
1155 *
1156 * Note: we don't call zfs_fuid_map_id() here because
1157 * user 0 is not an ephemeral uid.
1158 */
1159 mutex_enter(&zp->z_acl_lock);
1160 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1161 (S_IXUSR >> 6))) != 0 &&
1162 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1163 secpolicy_vnode_setid_retain(vp, cr,
1164 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1165 uint64_t newmode;
1166 zp->z_mode &= ~(S_ISUID | S_ISGID);
1167 newmode = zp->z_mode;
1168 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1169 (void *)&newmode, sizeof (uint64_t), tx);
1170 }
1171 mutex_exit(&zp->z_acl_lock);
1172
1173 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1174 B_TRUE);
1175
1176 /*
1177 * Update the file size (zp_size) if it has changed;
1178 * account for possible concurrent updates.
1179 */
1180 while ((end_size = zp->z_size) < uio->uio_loffset) {
1181 (void) atomic_cas_64(&zp->z_size, end_size,
1182 uio->uio_loffset);
1183 #ifdef illumos
1184 ASSERT(error == 0);
1185 #else
1186 ASSERT(error == 0 || error == EFAULT);
1187 #endif
1188 }
1189 /*
1190 * If we are replaying and eof is non zero then force
1191 * the file size to the specified eof. Note, there's no
1192 * concurrency during replay.
1193 */
1194 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1195 zp->z_size = zfsvfs->z_replay_eof;
1196
1197 if (error == 0)
1198 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1199 else
1200 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1201
1202 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1203 dmu_tx_commit(tx);
1204
1205 if (error != 0)
1206 break;
1207 ASSERT(tx_bytes == nbytes);
1208 n -= nbytes;
1209
1210 #ifdef illumos
1211 if (!xuio && n > 0)
1212 uio_prefaultpages(MIN(n, max_blksz), uio);
1213 #endif
1214 }
1215
1216 zfs_range_unlock(rl);
1217
1218 /*
1219 * If we're in replay mode, or we made no progress, return error.
1220 * Otherwise, it's at least a partial write, so it's successful.
1221 */
1222 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1223 ZFS_EXIT(zfsvfs);
1224 return (error);
1225 }
1226
1227 #ifdef __FreeBSD__
1228 /*
1229 * EFAULT means that at least one page of the source buffer was not
1230 * available. VFS will re-try remaining I/O upon this error.
1231 */
1232 if (error == EFAULT) {
1233 ZFS_EXIT(zfsvfs);
1234 return (error);
1235 }
1236 #endif
1237
1238 if (ioflag & (FSYNC | FDSYNC) ||
1239 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1240 zil_commit(zilog, zp->z_id);
1241
1242 ZFS_EXIT(zfsvfs);
1243 return (0);
1244 }
1245
1246 void
zfs_get_done(zgd_t * zgd,int error)1247 zfs_get_done(zgd_t *zgd, int error)
1248 {
1249 znode_t *zp = zgd->zgd_private;
1250 objset_t *os = zp->z_zfsvfs->z_os;
1251
1252 if (zgd->zgd_db)
1253 dmu_buf_rele(zgd->zgd_db, zgd);
1254
1255 zfs_range_unlock(zgd->zgd_rl);
1256
1257 /*
1258 * Release the vnode asynchronously as we currently have the
1259 * txg stopped from syncing.
1260 */
1261 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1262
1263 if (error == 0 && zgd->zgd_bp)
1264 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1265
1266 kmem_free(zgd, sizeof (zgd_t));
1267 }
1268
1269 #ifdef DEBUG
1270 static int zil_fault_io = 0;
1271 #endif
1272
1273 /*
1274 * Get data to generate a TX_WRITE intent log record.
1275 */
1276 int
zfs_get_data(void * arg,lr_write_t * lr,char * buf,zio_t * zio)1277 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1278 {
1279 zfsvfs_t *zfsvfs = arg;
1280 objset_t *os = zfsvfs->z_os;
1281 znode_t *zp;
1282 uint64_t object = lr->lr_foid;
1283 uint64_t offset = lr->lr_offset;
1284 uint64_t size = lr->lr_length;
1285 blkptr_t *bp = &lr->lr_blkptr;
1286 dmu_buf_t *db;
1287 zgd_t *zgd;
1288 int error = 0;
1289
1290 ASSERT(zio != NULL);
1291 ASSERT(size != 0);
1292
1293 /*
1294 * Nothing to do if the file has been removed
1295 */
1296 if (zfs_zget(zfsvfs, object, &zp) != 0)
1297 return (SET_ERROR(ENOENT));
1298 if (zp->z_unlinked) {
1299 /*
1300 * Release the vnode asynchronously as we currently have the
1301 * txg stopped from syncing.
1302 */
1303 VN_RELE_ASYNC(ZTOV(zp),
1304 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1305 return (SET_ERROR(ENOENT));
1306 }
1307
1308 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1309 zgd->zgd_zilog = zfsvfs->z_log;
1310 zgd->zgd_private = zp;
1311
1312 /*
1313 * Write records come in two flavors: immediate and indirect.
1314 * For small writes it's cheaper to store the data with the
1315 * log record (immediate); for large writes it's cheaper to
1316 * sync the data and get a pointer to it (indirect) so that
1317 * we don't have to write the data twice.
1318 */
1319 if (buf != NULL) { /* immediate write */
1320 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1321 /* test for truncation needs to be done while range locked */
1322 if (offset >= zp->z_size) {
1323 error = SET_ERROR(ENOENT);
1324 } else {
1325 error = dmu_read(os, object, offset, size, buf,
1326 DMU_READ_NO_PREFETCH);
1327 }
1328 ASSERT(error == 0 || error == ENOENT);
1329 } else { /* indirect write */
1330 /*
1331 * Have to lock the whole block to ensure when it's
1332 * written out and its checksum is being calculated
1333 * that no one can change the data. We need to re-check
1334 * blocksize after we get the lock in case it's changed!
1335 */
1336 for (;;) {
1337 uint64_t blkoff;
1338 size = zp->z_blksz;
1339 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1340 offset -= blkoff;
1341 zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1342 RL_READER);
1343 if (zp->z_blksz == size)
1344 break;
1345 offset += blkoff;
1346 zfs_range_unlock(zgd->zgd_rl);
1347 }
1348 /* test for truncation needs to be done while range locked */
1349 if (lr->lr_offset >= zp->z_size)
1350 error = SET_ERROR(ENOENT);
1351 #ifdef DEBUG
1352 if (zil_fault_io) {
1353 error = SET_ERROR(EIO);
1354 zil_fault_io = 0;
1355 }
1356 #endif
1357 if (error == 0)
1358 error = dmu_buf_hold(os, object, offset, zgd, &db,
1359 DMU_READ_NO_PREFETCH);
1360
1361 if (error == 0) {
1362 blkptr_t *obp = dmu_buf_get_blkptr(db);
1363 if (obp) {
1364 ASSERT(BP_IS_HOLE(bp));
1365 *bp = *obp;
1366 }
1367
1368 zgd->zgd_db = db;
1369 zgd->zgd_bp = bp;
1370
1371 ASSERT(db->db_offset == offset);
1372 ASSERT(db->db_size == size);
1373
1374 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1375 zfs_get_done, zgd);
1376 ASSERT(error || lr->lr_length <= zp->z_blksz);
1377
1378 /*
1379 * On success, we need to wait for the write I/O
1380 * initiated by dmu_sync() to complete before we can
1381 * release this dbuf. We will finish everything up
1382 * in the zfs_get_done() callback.
1383 */
1384 if (error == 0)
1385 return (0);
1386
1387 if (error == EALREADY) {
1388 lr->lr_common.lrc_txtype = TX_WRITE2;
1389 error = 0;
1390 }
1391 }
1392 }
1393
1394 zfs_get_done(zgd, error);
1395
1396 return (error);
1397 }
1398
1399 /*ARGSUSED*/
1400 static int
zfs_access(vnode_t * vp,int mode,int flag,cred_t * cr,caller_context_t * ct)1401 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1402 caller_context_t *ct)
1403 {
1404 znode_t *zp = VTOZ(vp);
1405 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1406 int error;
1407
1408 ZFS_ENTER(zfsvfs);
1409 ZFS_VERIFY_ZP(zp);
1410
1411 if (flag & V_ACE_MASK)
1412 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1413 else
1414 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1415
1416 ZFS_EXIT(zfsvfs);
1417 return (error);
1418 }
1419
/*
 * Callback handed to vn_vget_ino_gen() by zfs_lookup_lock(): arg is the
 * vnode to return.  It is stored in *vpp and locked with lkflags; if the
 * lock fails the reference on it is dropped and the error is returned.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	struct vnode *target = arg;
	int error;

	*vpp = target;
	error = vn_lock(target, lkflags);
	if (error != 0)
		vrele(target);
	return (error);
}
1431
1432 static int
zfs_lookup_lock(vnode_t * dvp,vnode_t * vp,const char * name,int lkflags)1433 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1434 {
1435 znode_t *zdp = VTOZ(dvp);
1436 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1437 int error;
1438 int ltype;
1439
1440 ASSERT_VOP_LOCKED(dvp, __func__);
1441 #ifdef DIAGNOSTIC
1442 if ((zdp->z_pflags & ZFS_XATTR) == 0)
1443 VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1444 #endif
1445
1446 if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1447 ASSERT3P(dvp, ==, vp);
1448 vref(dvp);
1449 ltype = lkflags & LK_TYPE_MASK;
1450 if (ltype != VOP_ISLOCKED(dvp)) {
1451 if (ltype == LK_EXCLUSIVE)
1452 vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1453 else /* if (ltype == LK_SHARED) */
1454 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1455
1456 /*
1457 * Relock for the "." case could leave us with
1458 * reclaimed vnode.
1459 */
1460 if (dvp->v_iflag & VI_DOOMED) {
1461 vrele(dvp);
1462 return (SET_ERROR(ENOENT));
1463 }
1464 }
1465 return (0);
1466 } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1467 /*
1468 * Note that in this case, dvp is the child vnode, and we
1469 * are looking up the parent vnode - exactly reverse from
1470 * normal operation. Unlocking dvp requires some rather
1471 * tricky unlock/relock dance to prevent mp from being freed;
1472 * use vn_vget_ino_gen() which takes care of all that.
1473 *
1474 * XXX Note that there is a time window when both vnodes are
1475 * unlocked. It is possible, although highly unlikely, that
1476 * during that window the parent-child relationship between
1477 * the vnodes may change, for example, get reversed.
1478 * In that case we would have a wrong lock order for the vnodes.
1479 * All other filesystems seem to ignore this problem, so we
1480 * do the same here.
1481 * A potential solution could be implemented as follows:
1482 * - using LK_NOWAIT when locking the second vnode and retrying
1483 * if necessary
1484 * - checking that the parent-child relationship still holds
1485 * after locking both vnodes and retrying if it doesn't
1486 */
1487 error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1488 return (error);
1489 } else {
1490 error = vn_lock(vp, lkflags);
1491 if (error != 0)
1492 vrele(vp);
1493 return (error);
1494 }
1495 }
1496
1497 /*
1498 * Lookup an entry in a directory, or an extended attribute directory.
1499 * If it exists, return a held vnode reference for it.
1500 *
1501 * IN: dvp - vnode of directory to search.
1502 * nm - name of entry to lookup.
1503 * pnp - full pathname to lookup [UNUSED].
1504 * flags - LOOKUP_XATTR set if looking for an attribute.
1505 * rdir - root directory vnode [UNUSED].
1506 * cr - credentials of caller.
1507 * ct - caller context
1508 *
1509 * OUT: vpp - vnode of located entry, NULL if not found.
1510 *
1511 * RETURN: 0 on success, error code on failure.
1512 *
1513 * Timestamps:
1514 * NA
1515 */
1516 /* ARGSUSED */
1517 static int
zfs_lookup(vnode_t * dvp,char * nm,vnode_t ** vpp,struct componentname * cnp,int nameiop,cred_t * cr,kthread_t * td,int flags)1518 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1519 int nameiop, cred_t *cr, kthread_t *td, int flags)
1520 {
1521 znode_t *zdp = VTOZ(dvp);
1522 znode_t *zp;
1523 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1524 int error = 0;
1525
1526 /* fast path (should be redundant with vfs namecache) */
1527 if (!(flags & LOOKUP_XATTR)) {
1528 if (dvp->v_type != VDIR) {
1529 return (SET_ERROR(ENOTDIR));
1530 } else if (zdp->z_sa_hdl == NULL) {
1531 return (SET_ERROR(EIO));
1532 }
1533 }
1534
1535 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1536
1537 ZFS_ENTER(zfsvfs);
1538 ZFS_VERIFY_ZP(zdp);
1539
1540 *vpp = NULL;
1541
1542 if (flags & LOOKUP_XATTR) {
1543 #ifdef TODO
1544 /*
1545 * If the xattr property is off, refuse the lookup request.
1546 */
1547 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1548 ZFS_EXIT(zfsvfs);
1549 return (SET_ERROR(EINVAL));
1550 }
1551 #endif
1552
1553 /*
1554 * We don't allow recursive attributes..
1555 * Maybe someday we will.
1556 */
1557 if (zdp->z_pflags & ZFS_XATTR) {
1558 ZFS_EXIT(zfsvfs);
1559 return (SET_ERROR(EINVAL));
1560 }
1561
1562 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1563 ZFS_EXIT(zfsvfs);
1564 return (error);
1565 }
1566
1567 /*
1568 * Do we have permission to get into attribute directory?
1569 */
1570 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1571 B_FALSE, cr)) {
1572 vrele(*vpp);
1573 *vpp = NULL;
1574 }
1575
1576 ZFS_EXIT(zfsvfs);
1577 return (error);
1578 }
1579
1580 /*
1581 * Check accessibility of directory.
1582 */
1583 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1584 ZFS_EXIT(zfsvfs);
1585 return (error);
1586 }
1587
1588 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1589 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1590 ZFS_EXIT(zfsvfs);
1591 return (SET_ERROR(EILSEQ));
1592 }
1593
1594
1595 /*
1596 * First handle the special cases.
1597 */
1598 if ((cnp->cn_flags & ISDOTDOT) != 0) {
1599 /*
1600 * If we are a snapshot mounted under .zfs, return
1601 * the vp for the snapshot directory.
1602 */
1603 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1604 struct componentname cn;
1605 vnode_t *zfsctl_vp;
1606 int ltype;
1607
1608 ZFS_EXIT(zfsvfs);
1609 ltype = VOP_ISLOCKED(dvp);
1610 VOP_UNLOCK(dvp, 0);
1611 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1612 &zfsctl_vp);
1613 if (error == 0) {
1614 cn.cn_nameptr = "snapshot";
1615 cn.cn_namelen = strlen(cn.cn_nameptr);
1616 cn.cn_nameiop = cnp->cn_nameiop;
1617 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1618 cn.cn_lkflags = cnp->cn_lkflags;
1619 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1620 vput(zfsctl_vp);
1621 }
1622 vn_lock(dvp, ltype | LK_RETRY);
1623 return (error);
1624 }
1625 }
1626 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1627 ZFS_EXIT(zfsvfs);
1628 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1629 return (SET_ERROR(ENOTSUP));
1630 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1631 return (error);
1632 }
1633
1634 /*
1635 * The loop is retry the lookup if the parent-child relationship
1636 * changes during the dot-dot locking complexities.
1637 */
1638 for (;;) {
1639 uint64_t parent;
1640
1641 error = zfs_dirlook(zdp, nm, &zp);
1642 if (error == 0)
1643 *vpp = ZTOV(zp);
1644
1645 ZFS_EXIT(zfsvfs);
1646 if (error != 0)
1647 break;
1648
1649 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1650 if (error != 0) {
1651 /*
1652 * If we've got a locking error, then the vnode
1653 * got reclaimed because of a force unmount.
1654 * We never enter doomed vnodes into the name cache.
1655 */
1656 *vpp = NULL;
1657 return (error);
1658 }
1659
1660 if ((cnp->cn_flags & ISDOTDOT) == 0)
1661 break;
1662
1663 ZFS_ENTER(zfsvfs);
1664 if (zdp->z_sa_hdl == NULL) {
1665 error = SET_ERROR(EIO);
1666 } else {
1667 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1668 &parent, sizeof (parent));
1669 }
1670 if (error != 0) {
1671 ZFS_EXIT(zfsvfs);
1672 vput(ZTOV(zp));
1673 break;
1674 }
1675 if (zp->z_id == parent) {
1676 ZFS_EXIT(zfsvfs);
1677 break;
1678 }
1679 vput(ZTOV(zp));
1680 }
1681
1682 out:
1683 if (error != 0)
1684 *vpp = NULL;
1685
1686 /* Translate errors and add SAVENAME when needed. */
1687 if (cnp->cn_flags & ISLASTCN) {
1688 switch (nameiop) {
1689 case CREATE:
1690 case RENAME:
1691 if (error == ENOENT) {
1692 error = EJUSTRETURN;
1693 cnp->cn_flags |= SAVENAME;
1694 break;
1695 }
1696 /* FALLTHROUGH */
1697 case DELETE:
1698 if (error == 0)
1699 cnp->cn_flags |= SAVENAME;
1700 break;
1701 }
1702 }
1703
1704 /* Insert name into cache (as non-existent) if appropriate. */
1705 if (zfsvfs->z_use_namecache &&
1706 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1707 cache_enter(dvp, NULL, cnp);
1708
1709 /* Insert name into cache if appropriate. */
1710 if (zfsvfs->z_use_namecache &&
1711 error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1712 if (!(cnp->cn_flags & ISLASTCN) ||
1713 (nameiop != DELETE && nameiop != RENAME)) {
1714 cache_enter(dvp, *vpp, cnp);
1715 }
1716 }
1717
1718 return (error);
1719 }
1720
1721 /*
1722 * Attempt to create a new entry in a directory. If the entry
1723 * already exists, truncate the file if permissible, else return
1724 * an error. Return the vp of the created or trunc'd file.
1725 *
1726 * IN: dvp - vnode of directory to put new file entry in.
1727 * name - name of new file entry.
1728 * vap - attributes of new file.
1729 * excl - flag indicating exclusive or non-exclusive mode.
1730 * mode - mode to open file with.
1731 * cr - credentials of caller.
1732 * flag - large file flag [UNUSED].
1733 * ct - caller context
1734 * vsecp - ACL to be set
1735 *
1736 * OUT: vpp - vnode of created or trunc'd entry.
1737 *
1738 * RETURN: 0 on success, error code on failure.
1739 *
1740 * Timestamps:
1741 * dvp - ctime|mtime updated if new entry created
1742 * vp - ctime|mtime always, atime if new
1743 */
1744
1745 /* ARGSUSED */
1746 static int
zfs_create(vnode_t * dvp,char * name,vattr_t * vap,int excl,int mode,vnode_t ** vpp,cred_t * cr,kthread_t * td)1747 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1748 vnode_t **vpp, cred_t *cr, kthread_t *td)
1749 {
1750 znode_t *zp, *dzp = VTOZ(dvp);
1751 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1752 zilog_t *zilog;
1753 objset_t *os;
1754 dmu_tx_t *tx;
1755 int error;
1756 ksid_t *ksid;
1757 uid_t uid;
1758 gid_t gid = crgetgid(cr);
1759 zfs_acl_ids_t acl_ids;
1760 boolean_t fuid_dirtied;
1761 void *vsecp = NULL;
1762 int flag = 0;
1763 uint64_t txtype;
1764
1765 /*
1766 * If we have an ephemeral id, ACL, or XVATTR then
1767 * make sure file system is at proper version
1768 */
1769
1770 ksid = crgetsid(cr, KSID_OWNER);
1771 if (ksid)
1772 uid = ksid_getid(ksid);
1773 else
1774 uid = crgetuid(cr);
1775
1776 if (zfsvfs->z_use_fuids == B_FALSE &&
1777 (vsecp || (vap->va_mask & AT_XVATTR) ||
1778 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1779 return (SET_ERROR(EINVAL));
1780
1781 ZFS_ENTER(zfsvfs);
1782 ZFS_VERIFY_ZP(dzp);
1783 os = zfsvfs->z_os;
1784 zilog = zfsvfs->z_log;
1785
1786 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1787 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1788 ZFS_EXIT(zfsvfs);
1789 return (SET_ERROR(EILSEQ));
1790 }
1791
1792 if (vap->va_mask & AT_XVATTR) {
1793 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1794 crgetuid(cr), cr, vap->va_type)) != 0) {
1795 ZFS_EXIT(zfsvfs);
1796 return (error);
1797 }
1798 }
1799
1800 *vpp = NULL;
1801
1802 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1803 vap->va_mode &= ~S_ISVTX;
1804
1805 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1806 if (error) {
1807 ZFS_EXIT(zfsvfs);
1808 return (error);
1809 }
1810 ASSERT3P(zp, ==, NULL);
1811
1812 /*
1813 * Create a new file object and update the directory
1814 * to reference it.
1815 */
1816 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1817 goto out;
1818 }
1819
1820 /*
1821 * We only support the creation of regular files in
1822 * extended attribute directories.
1823 */
1824
1825 if ((dzp->z_pflags & ZFS_XATTR) &&
1826 (vap->va_type != VREG)) {
1827 error = SET_ERROR(EINVAL);
1828 goto out;
1829 }
1830
1831 if ((error = zfs_acl_ids_create(dzp, 0, vap,
1832 cr, vsecp, &acl_ids)) != 0)
1833 goto out;
1834
1835 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1836 zfs_acl_ids_free(&acl_ids);
1837 error = SET_ERROR(EDQUOT);
1838 goto out;
1839 }
1840
1841 getnewvnode_reserve(1);
1842
1843 tx = dmu_tx_create(os);
1844
1845 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1846 ZFS_SA_BASE_ATTR_SIZE);
1847
1848 fuid_dirtied = zfsvfs->z_fuid_dirty;
1849 if (fuid_dirtied)
1850 zfs_fuid_txhold(zfsvfs, tx);
1851 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1852 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1853 if (!zfsvfs->z_use_sa &&
1854 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1855 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1856 0, acl_ids.z_aclp->z_acl_bytes);
1857 }
1858 error = dmu_tx_assign(tx, TXG_WAIT);
1859 if (error) {
1860 zfs_acl_ids_free(&acl_ids);
1861 dmu_tx_abort(tx);
1862 getnewvnode_drop_reserve();
1863 ZFS_EXIT(zfsvfs);
1864 return (error);
1865 }
1866 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1867
1868 if (fuid_dirtied)
1869 zfs_fuid_sync(zfsvfs, tx);
1870
1871 (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1872 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1873 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1874 vsecp, acl_ids.z_fuidp, vap);
1875 zfs_acl_ids_free(&acl_ids);
1876 dmu_tx_commit(tx);
1877
1878 getnewvnode_drop_reserve();
1879
1880 out:
1881 if (error == 0) {
1882 *vpp = ZTOV(zp);
1883 }
1884
1885 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1886 zil_commit(zilog, 0);
1887
1888 ZFS_EXIT(zfsvfs);
1889 return (error);
1890 }
1891
1892 /*
1893 * Remove an entry from a directory.
1894 *
1895 * IN: dvp - vnode of directory to remove entry from.
1896 * name - name of entry to remove.
1897 * cr - credentials of caller.
1898 * ct - caller context
1899 * flags - case flags
1900 *
1901 * RETURN: 0 on success, error code on failure.
1902 *
1903 * Timestamps:
1904 * dvp - ctime|mtime
1905 * vp - ctime (if nlink > 0)
1906 */
1907
1908 /*ARGSUSED*/
1909 static int
zfs_remove(vnode_t * dvp,vnode_t * vp,char * name,cred_t * cr)1910 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1911 {
1912 znode_t *dzp = VTOZ(dvp);
1913 znode_t *zp = VTOZ(vp);
1914 znode_t *xzp;
1915 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1916 zilog_t *zilog;
1917 uint64_t acl_obj, xattr_obj;
1918 uint64_t obj = 0;
1919 dmu_tx_t *tx;
1920 boolean_t unlinked, toobig = FALSE;
1921 uint64_t txtype;
1922 int error;
1923
1924 ZFS_ENTER(zfsvfs);
1925 ZFS_VERIFY_ZP(dzp);
1926 ZFS_VERIFY_ZP(zp);
1927 zilog = zfsvfs->z_log;
1928 zp = VTOZ(vp);
1929
1930 xattr_obj = 0;
1931 xzp = NULL;
1932
1933 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1934 goto out;
1935 }
1936
1937 /*
1938 * Need to use rmdir for removing directories.
1939 */
1940 if (vp->v_type == VDIR) {
1941 error = SET_ERROR(EPERM);
1942 goto out;
1943 }
1944
1945 vnevent_remove(vp, dvp, name, ct);
1946
1947 obj = zp->z_id;
1948
1949 /* are there any extended attributes? */
1950 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1951 &xattr_obj, sizeof (xattr_obj));
1952 if (error == 0 && xattr_obj) {
1953 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1954 ASSERT0(error);
1955 }
1956
1957 /*
1958 * We may delete the znode now, or we may put it in the unlinked set;
1959 * it depends on whether we're the last link, and on whether there are
1960 * other holds on the vnode. So we dmu_tx_hold() the right things to
1961 * allow for either case.
1962 */
1963 tx = dmu_tx_create(zfsvfs->z_os);
1964 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1965 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1966 zfs_sa_upgrade_txholds(tx, zp);
1967 zfs_sa_upgrade_txholds(tx, dzp);
1968
1969 if (xzp) {
1970 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1971 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1972 }
1973
1974 /* charge as an update -- would be nice not to charge at all */
1975 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1976
1977 /*
1978 * Mark this transaction as typically resulting in a net free of space
1979 */
1980 dmu_tx_mark_netfree(tx);
1981
1982 error = dmu_tx_assign(tx, TXG_WAIT);
1983 if (error) {
1984 dmu_tx_abort(tx);
1985 ZFS_EXIT(zfsvfs);
1986 return (error);
1987 }
1988
1989 /*
1990 * Remove the directory entry.
1991 */
1992 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1993
1994 if (error) {
1995 dmu_tx_commit(tx);
1996 goto out;
1997 }
1998
1999 if (unlinked) {
2000 zfs_unlinked_add(zp, tx);
2001 vp->v_vflag |= VV_NOSYNC;
2002 }
2003
2004 txtype = TX_REMOVE;
2005 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2006
2007 dmu_tx_commit(tx);
2008 out:
2009
2010 if (xzp)
2011 vrele(ZTOV(xzp));
2012
2013 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2014 zil_commit(zilog, 0);
2015
2016 ZFS_EXIT(zfsvfs);
2017 return (error);
2018 }
2019
2020 /*
2021 * Create a new directory and insert it into dvp using the name
2022 * provided. Return a pointer to the inserted directory.
2023 *
2024 * IN: dvp - vnode of directory to add subdir to.
2025 * dirname - name of new directory.
2026 * vap - attributes of new directory.
2027 * cr - credentials of caller.
2028 * ct - caller context
2029 * flags - case flags
2030 * vsecp - ACL to be set
2031 *
2032 * OUT: vpp - vnode of created directory.
2033 *
2034 * RETURN: 0 on success, error code on failure.
2035 *
2036 * Timestamps:
2037 * dvp - ctime|mtime updated
2038 * vp - ctime|mtime|atime updated
2039 */
2040 /*ARGSUSED*/
2041 static int
zfs_mkdir(vnode_t * dvp,char * dirname,vattr_t * vap,vnode_t ** vpp,cred_t * cr)2042 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2043 {
2044 znode_t *zp, *dzp = VTOZ(dvp);
2045 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2046 zilog_t *zilog;
2047 uint64_t txtype;
2048 dmu_tx_t *tx;
2049 int error;
2050 ksid_t *ksid;
2051 uid_t uid;
2052 gid_t gid = crgetgid(cr);
2053 zfs_acl_ids_t acl_ids;
2054 boolean_t fuid_dirtied;
2055
2056 ASSERT(vap->va_type == VDIR);
2057
2058 /*
2059 * If we have an ephemeral id, ACL, or XVATTR then
2060 * make sure file system is at proper version
2061 */
2062
2063 ksid = crgetsid(cr, KSID_OWNER);
2064 if (ksid)
2065 uid = ksid_getid(ksid);
2066 else
2067 uid = crgetuid(cr);
2068 if (zfsvfs->z_use_fuids == B_FALSE &&
2069 ((vap->va_mask & AT_XVATTR) ||
2070 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2071 return (SET_ERROR(EINVAL));
2072
2073 ZFS_ENTER(zfsvfs);
2074 ZFS_VERIFY_ZP(dzp);
2075 zilog = zfsvfs->z_log;
2076
2077 if (dzp->z_pflags & ZFS_XATTR) {
2078 ZFS_EXIT(zfsvfs);
2079 return (SET_ERROR(EINVAL));
2080 }
2081
2082 if (zfsvfs->z_utf8 && u8_validate(dirname,
2083 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2084 ZFS_EXIT(zfsvfs);
2085 return (SET_ERROR(EILSEQ));
2086 }
2087
2088 if (vap->va_mask & AT_XVATTR) {
2089 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2090 crgetuid(cr), cr, vap->va_type)) != 0) {
2091 ZFS_EXIT(zfsvfs);
2092 return (error);
2093 }
2094 }
2095
2096 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2097 NULL, &acl_ids)) != 0) {
2098 ZFS_EXIT(zfsvfs);
2099 return (error);
2100 }
2101
2102 /*
2103 * First make sure the new directory doesn't exist.
2104 *
2105 * Existence is checked first to make sure we don't return
2106 * EACCES instead of EEXIST which can cause some applications
2107 * to fail.
2108 */
2109 *vpp = NULL;
2110
2111 if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2112 zfs_acl_ids_free(&acl_ids);
2113 ZFS_EXIT(zfsvfs);
2114 return (error);
2115 }
2116 ASSERT3P(zp, ==, NULL);
2117
2118 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2119 zfs_acl_ids_free(&acl_ids);
2120 ZFS_EXIT(zfsvfs);
2121 return (error);
2122 }
2123
2124 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2125 zfs_acl_ids_free(&acl_ids);
2126 ZFS_EXIT(zfsvfs);
2127 return (SET_ERROR(EDQUOT));
2128 }
2129
2130 /*
2131 * Add a new entry to the directory.
2132 */
2133 getnewvnode_reserve(1);
2134 tx = dmu_tx_create(zfsvfs->z_os);
2135 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2136 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2137 fuid_dirtied = zfsvfs->z_fuid_dirty;
2138 if (fuid_dirtied)
2139 zfs_fuid_txhold(zfsvfs, tx);
2140 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2141 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2142 acl_ids.z_aclp->z_acl_bytes);
2143 }
2144
2145 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2146 ZFS_SA_BASE_ATTR_SIZE);
2147
2148 error = dmu_tx_assign(tx, TXG_WAIT);
2149 if (error) {
2150 zfs_acl_ids_free(&acl_ids);
2151 dmu_tx_abort(tx);
2152 getnewvnode_drop_reserve();
2153 ZFS_EXIT(zfsvfs);
2154 return (error);
2155 }
2156
2157 /*
2158 * Create new node.
2159 */
2160 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2161
2162 if (fuid_dirtied)
2163 zfs_fuid_sync(zfsvfs, tx);
2164
2165 /*
2166 * Now put new name in parent dir.
2167 */
2168 (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2169
2170 *vpp = ZTOV(zp);
2171
2172 txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2173 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2174 acl_ids.z_fuidp, vap);
2175
2176 zfs_acl_ids_free(&acl_ids);
2177
2178 dmu_tx_commit(tx);
2179
2180 getnewvnode_drop_reserve();
2181
2182 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2183 zil_commit(zilog, 0);
2184
2185 ZFS_EXIT(zfsvfs);
2186 return (0);
2187 }
2188
2189 /*
2190 * Remove a directory subdir entry. If the current working
2191 * directory is the same as the subdir to be removed, the
2192 * remove will fail.
2193 *
2194 * IN: dvp - vnode of directory to remove from.
2195 * name - name of directory to be removed.
2196 * cwd - vnode of current working directory.
2197 * cr - credentials of caller.
2198 * ct - caller context
2199 * flags - case flags
2200 *
2201 * RETURN: 0 on success, error code on failure.
2202 *
2203 * Timestamps:
2204 * dvp - ctime|mtime updated
2205 */
2206 /*ARGSUSED*/
2207 static int
zfs_rmdir(vnode_t * dvp,vnode_t * vp,char * name,cred_t * cr)2208 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2209 {
2210 znode_t *dzp = VTOZ(dvp);
2211 znode_t *zp = VTOZ(vp);
2212 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2213 zilog_t *zilog;
2214 dmu_tx_t *tx;
2215 int error;
2216
2217 ZFS_ENTER(zfsvfs);
2218 ZFS_VERIFY_ZP(dzp);
2219 ZFS_VERIFY_ZP(zp);
2220 zilog = zfsvfs->z_log;
2221
2222
2223 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2224 goto out;
2225 }
2226
2227 if (vp->v_type != VDIR) {
2228 error = SET_ERROR(ENOTDIR);
2229 goto out;
2230 }
2231
2232 vnevent_rmdir(vp, dvp, name, ct);
2233
2234 tx = dmu_tx_create(zfsvfs->z_os);
2235 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2236 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2237 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2238 zfs_sa_upgrade_txholds(tx, zp);
2239 zfs_sa_upgrade_txholds(tx, dzp);
2240 dmu_tx_mark_netfree(tx);
2241 error = dmu_tx_assign(tx, TXG_WAIT);
2242 if (error) {
2243 dmu_tx_abort(tx);
2244 ZFS_EXIT(zfsvfs);
2245 return (error);
2246 }
2247
2248 cache_purge(dvp);
2249
2250 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2251
2252 if (error == 0) {
2253 uint64_t txtype = TX_RMDIR;
2254 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2255 }
2256
2257 dmu_tx_commit(tx);
2258
2259 cache_purge(vp);
2260 out:
2261 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2262 zil_commit(zilog, 0);
2263
2264 ZFS_EXIT(zfsvfs);
2265 return (error);
2266 }
2267
2268 /*
2269 * Read as many directory entries as will fit into the provided
2270 * buffer from the given directory cursor position (specified in
2271 * the uio structure).
2272 *
2273 * IN: vp - vnode of directory to read.
2274 * uio - structure supplying read location, range info,
2275 * and return buffer.
2276 * cr - credentials of caller.
2277 * ct - caller context
2278 * flags - case flags
2279 *
2280 * OUT: uio - updated offset and range, buffer filled.
2281 * eofp - set to true if end-of-file detected.
2282 *
2283 * RETURN: 0 on success, error code on failure.
2284 *
2285 * Timestamps:
2286 * vp - atime updated
2287 *
2288 * Note that the low 4 bits of the cookie returned by zap is always zero.
2289 * This allows us to use the low range for "special" directory entries:
2290 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2291 * we use the offset 2 for the '.zfs' directory.
2292 */
2293 /* ARGSUSED */
2294 static int
zfs_readdir(vnode_t * vp,uio_t * uio,cred_t * cr,int * eofp,int * ncookies,u_long ** cookies)2295 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2296 {
2297 znode_t *zp = VTOZ(vp);
2298 iovec_t *iovp;
2299 edirent_t *eodp;
2300 dirent64_t *odp;
2301 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2302 objset_t *os;
2303 caddr_t outbuf;
2304 size_t bufsize;
2305 zap_cursor_t zc;
2306 zap_attribute_t zap;
2307 uint_t bytes_wanted;
2308 uint64_t offset; /* must be unsigned; checks for < 1 */
2309 uint64_t parent;
2310 int local_eof;
2311 int outcount;
2312 int error;
2313 uint8_t prefetch;
2314 boolean_t check_sysattrs;
2315 uint8_t type;
2316 int ncooks;
2317 u_long *cooks = NULL;
2318 int flags = 0;
2319
2320 ZFS_ENTER(zfsvfs);
2321 ZFS_VERIFY_ZP(zp);
2322
2323 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2324 &parent, sizeof (parent))) != 0) {
2325 ZFS_EXIT(zfsvfs);
2326 return (error);
2327 }
2328
2329 /*
2330 * If we are not given an eof variable,
2331 * use a local one.
2332 */
2333 if (eofp == NULL)
2334 eofp = &local_eof;
2335
2336 /*
2337 * Check for valid iov_len.
2338 */
2339 if (uio->uio_iov->iov_len <= 0) {
2340 ZFS_EXIT(zfsvfs);
2341 return (SET_ERROR(EINVAL));
2342 }
2343
2344 /*
2345 * Quit if directory has been removed (posix)
2346 */
2347 if ((*eofp = zp->z_unlinked) != 0) {
2348 ZFS_EXIT(zfsvfs);
2349 return (0);
2350 }
2351
2352 error = 0;
2353 os = zfsvfs->z_os;
2354 offset = uio->uio_loffset;
2355 prefetch = zp->z_zn_prefetch;
2356
2357 /*
2358 * Initialize the iterator cursor.
2359 */
2360 if (offset <= 3) {
2361 /*
2362 * Start iteration from the beginning of the directory.
2363 */
2364 zap_cursor_init(&zc, os, zp->z_id);
2365 } else {
2366 /*
2367 * The offset is a serialized cursor.
2368 */
2369 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2370 }
2371
2372 /*
2373 * Get space to change directory entries into fs independent format.
2374 */
2375 iovp = uio->uio_iov;
2376 bytes_wanted = iovp->iov_len;
2377 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2378 bufsize = bytes_wanted;
2379 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2380 odp = (struct dirent64 *)outbuf;
2381 } else {
2382 bufsize = bytes_wanted;
2383 outbuf = NULL;
2384 odp = (struct dirent64 *)iovp->iov_base;
2385 }
2386 eodp = (struct edirent *)odp;
2387
2388 if (ncookies != NULL) {
2389 /*
2390 * Minimum entry size is dirent size and 1 byte for a file name.
2391 */
2392 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2393 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2394 *cookies = cooks;
2395 *ncookies = ncooks;
2396 }
2397 /*
2398 * If this VFS supports the system attribute view interface; and
2399 * we're looking at an extended attribute directory; and we care
2400 * about normalization conflicts on this vfs; then we must check
2401 * for normalization conflicts with the sysattr name space.
2402 */
2403 #ifdef TODO
2404 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2405 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2406 (flags & V_RDDIR_ENTFLAGS);
2407 #else
2408 check_sysattrs = 0;
2409 #endif
2410
2411 /*
2412 * Transform to file-system independent format
2413 */
2414 outcount = 0;
2415 while (outcount < bytes_wanted) {
2416 ino64_t objnum;
2417 ushort_t reclen;
2418 off64_t *next = NULL;
2419
2420 /*
2421 * Special case `.', `..', and `.zfs'.
2422 */
2423 if (offset == 0) {
2424 (void) strcpy(zap.za_name, ".");
2425 zap.za_normalization_conflict = 0;
2426 objnum = zp->z_id;
2427 type = DT_DIR;
2428 } else if (offset == 1) {
2429 (void) strcpy(zap.za_name, "..");
2430 zap.za_normalization_conflict = 0;
2431 objnum = parent;
2432 type = DT_DIR;
2433 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2434 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2435 zap.za_normalization_conflict = 0;
2436 objnum = ZFSCTL_INO_ROOT;
2437 type = DT_DIR;
2438 } else {
2439 /*
2440 * Grab next entry.
2441 */
2442 if (error = zap_cursor_retrieve(&zc, &zap)) {
2443 if ((*eofp = (error == ENOENT)) != 0)
2444 break;
2445 else
2446 goto update;
2447 }
2448
2449 if (zap.za_integer_length != 8 ||
2450 zap.za_num_integers != 1) {
2451 cmn_err(CE_WARN, "zap_readdir: bad directory "
2452 "entry, obj = %lld, offset = %lld\n",
2453 (u_longlong_t)zp->z_id,
2454 (u_longlong_t)offset);
2455 error = SET_ERROR(ENXIO);
2456 goto update;
2457 }
2458
2459 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2460 /*
2461 * MacOS X can extract the object type here such as:
2462 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2463 */
2464 type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2465
2466 if (check_sysattrs && !zap.za_normalization_conflict) {
2467 #ifdef TODO
2468 zap.za_normalization_conflict =
2469 xattr_sysattr_casechk(zap.za_name);
2470 #else
2471 panic("%s:%u: TODO", __func__, __LINE__);
2472 #endif
2473 }
2474 }
2475
2476 if (flags & V_RDDIR_ACCFILTER) {
2477 /*
2478 * If we have no access at all, don't include
2479 * this entry in the returned information
2480 */
2481 znode_t *ezp;
2482 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2483 goto skip_entry;
2484 if (!zfs_has_access(ezp, cr)) {
2485 vrele(ZTOV(ezp));
2486 goto skip_entry;
2487 }
2488 vrele(ZTOV(ezp));
2489 }
2490
2491 if (flags & V_RDDIR_ENTFLAGS)
2492 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2493 else
2494 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2495
2496 /*
2497 * Will this entry fit in the buffer?
2498 */
2499 if (outcount + reclen > bufsize) {
2500 /*
2501 * Did we manage to fit anything in the buffer?
2502 */
2503 if (!outcount) {
2504 error = SET_ERROR(EINVAL);
2505 goto update;
2506 }
2507 break;
2508 }
2509 if (flags & V_RDDIR_ENTFLAGS) {
2510 /*
2511 * Add extended flag entry:
2512 */
2513 eodp->ed_ino = objnum;
2514 eodp->ed_reclen = reclen;
2515 /* NOTE: ed_off is the offset for the *next* entry */
2516 next = &(eodp->ed_off);
2517 eodp->ed_eflags = zap.za_normalization_conflict ?
2518 ED_CASE_CONFLICT : 0;
2519 (void) strncpy(eodp->ed_name, zap.za_name,
2520 EDIRENT_NAMELEN(reclen));
2521 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2522 } else {
2523 /*
2524 * Add normal entry:
2525 */
2526 odp->d_ino = objnum;
2527 odp->d_reclen = reclen;
2528 odp->d_namlen = strlen(zap.za_name);
2529 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2530 odp->d_type = type;
2531 odp = (dirent64_t *)((intptr_t)odp + reclen);
2532 }
2533 outcount += reclen;
2534
2535 ASSERT(outcount <= bufsize);
2536
2537 /* Prefetch znode */
2538 if (prefetch)
2539 dmu_prefetch(os, objnum, 0, 0, 0,
2540 ZIO_PRIORITY_SYNC_READ);
2541
2542 skip_entry:
2543 /*
2544 * Move to the next entry, fill in the previous offset.
2545 */
2546 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2547 zap_cursor_advance(&zc);
2548 offset = zap_cursor_serialize(&zc);
2549 } else {
2550 offset += 1;
2551 }
2552
2553 if (cooks != NULL) {
2554 *cooks++ = offset;
2555 ncooks--;
2556 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2557 }
2558 }
2559 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2560
2561 /* Subtract unused cookies */
2562 if (ncookies != NULL)
2563 *ncookies -= ncooks;
2564
2565 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2566 iovp->iov_base += outcount;
2567 iovp->iov_len -= outcount;
2568 uio->uio_resid -= outcount;
2569 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2570 /*
2571 * Reset the pointer.
2572 */
2573 offset = uio->uio_loffset;
2574 }
2575
2576 update:
2577 zap_cursor_fini(&zc);
2578 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2579 kmem_free(outbuf, bufsize);
2580
2581 if (error == ENOENT)
2582 error = 0;
2583
2584 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2585
2586 uio->uio_loffset = offset;
2587 ZFS_EXIT(zfsvfs);
2588 if (error != 0 && cookies != NULL) {
2589 free(*cookies, M_TEMP);
2590 *cookies = NULL;
2591 *ncookies = 0;
2592 }
2593 return (error);
2594 }
2595
2596 ulong_t zfs_fsync_sync_cnt = 4;
2597
2598 static int
zfs_fsync(vnode_t * vp,int syncflag,cred_t * cr,caller_context_t * ct)2599 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2600 {
2601 znode_t *zp = VTOZ(vp);
2602 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2603
2604 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2605
2606 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2607 ZFS_ENTER(zfsvfs);
2608 ZFS_VERIFY_ZP(zp);
2609 zil_commit(zfsvfs->z_log, zp->z_id);
2610 ZFS_EXIT(zfsvfs);
2611 }
2612 return (0);
2613 }
2614
2615
2616 /*
2617 * Get the requested file attributes and place them in the provided
2618 * vattr structure.
2619 *
2620 * IN: vp - vnode of file.
2621 * vap - va_mask identifies requested attributes.
2622 * If AT_XVATTR set, then optional attrs are requested
2623 * flags - ATTR_NOACLCHECK (CIFS server context)
2624 * cr - credentials of caller.
2625 * ct - caller context
2626 *
2627 * OUT: vap - attribute values.
2628 *
2629 * RETURN: 0 (always succeeds).
2630 */
2631 /* ARGSUSED */
2632 static int
zfs_getattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr,caller_context_t * ct)2633 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2634 caller_context_t *ct)
2635 {
2636 znode_t *zp = VTOZ(vp);
2637 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2638 int error = 0;
2639 uint32_t blksize;
2640 u_longlong_t nblocks;
2641 uint64_t links;
2642 uint64_t mtime[2], ctime[2], crtime[2], rdev;
2643 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2644 xoptattr_t *xoap = NULL;
2645 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2646 sa_bulk_attr_t bulk[4];
2647 int count = 0;
2648
2649 ZFS_ENTER(zfsvfs);
2650 ZFS_VERIFY_ZP(zp);
2651
2652 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2653
2654 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2655 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2656 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2657 if (vp->v_type == VBLK || vp->v_type == VCHR)
2658 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2659 &rdev, 8);
2660
2661 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2662 ZFS_EXIT(zfsvfs);
2663 return (error);
2664 }
2665
2666 /*
2667 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2668 * Also, if we are the owner don't bother, since owner should
2669 * always be allowed to read basic attributes of file.
2670 */
2671 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2672 (vap->va_uid != crgetuid(cr))) {
2673 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2674 skipaclchk, cr)) {
2675 ZFS_EXIT(zfsvfs);
2676 return (error);
2677 }
2678 }
2679
2680 /*
2681 * Return all attributes. It's cheaper to provide the answer
2682 * than to determine whether we were asked the question.
2683 */
2684
2685 vap->va_type = IFTOVT(zp->z_mode);
2686 vap->va_mode = zp->z_mode & ~S_IFMT;
2687 #ifdef illumos
2688 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2689 #else
2690 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2691 #endif
2692 vap->va_nodeid = zp->z_id;
2693 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2694 links = zp->z_links + 1;
2695 else
2696 links = zp->z_links;
2697 vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */
2698 vap->va_size = zp->z_size;
2699 #ifdef illumos
2700 vap->va_rdev = vp->v_rdev;
2701 #else
2702 if (vp->v_type == VBLK || vp->v_type == VCHR)
2703 vap->va_rdev = zfs_cmpldev(rdev);
2704 #endif
2705 vap->va_seq = zp->z_seq;
2706 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
2707 vap->va_filerev = zp->z_seq;
2708
2709 /*
2710 * Add in any requested optional attributes and the create time.
2711 * Also set the corresponding bits in the returned attribute bitmap.
2712 */
2713 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2714 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2715 xoap->xoa_archive =
2716 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2717 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2718 }
2719
2720 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2721 xoap->xoa_readonly =
2722 ((zp->z_pflags & ZFS_READONLY) != 0);
2723 XVA_SET_RTN(xvap, XAT_READONLY);
2724 }
2725
2726 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2727 xoap->xoa_system =
2728 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2729 XVA_SET_RTN(xvap, XAT_SYSTEM);
2730 }
2731
2732 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2733 xoap->xoa_hidden =
2734 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2735 XVA_SET_RTN(xvap, XAT_HIDDEN);
2736 }
2737
2738 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2739 xoap->xoa_nounlink =
2740 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2741 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2742 }
2743
2744 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2745 xoap->xoa_immutable =
2746 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2747 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2748 }
2749
2750 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2751 xoap->xoa_appendonly =
2752 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2753 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2754 }
2755
2756 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2757 xoap->xoa_nodump =
2758 ((zp->z_pflags & ZFS_NODUMP) != 0);
2759 XVA_SET_RTN(xvap, XAT_NODUMP);
2760 }
2761
2762 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2763 xoap->xoa_opaque =
2764 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2765 XVA_SET_RTN(xvap, XAT_OPAQUE);
2766 }
2767
2768 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2769 xoap->xoa_av_quarantined =
2770 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2771 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2772 }
2773
2774 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2775 xoap->xoa_av_modified =
2776 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2777 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2778 }
2779
2780 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2781 vp->v_type == VREG) {
2782 zfs_sa_get_scanstamp(zp, xvap);
2783 }
2784
2785 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2786 uint64_t times[2];
2787
2788 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2789 times, sizeof (times));
2790 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2791 XVA_SET_RTN(xvap, XAT_CREATETIME);
2792 }
2793
2794 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2795 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2796 XVA_SET_RTN(xvap, XAT_REPARSE);
2797 }
2798 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2799 xoap->xoa_generation = zp->z_gen;
2800 XVA_SET_RTN(xvap, XAT_GEN);
2801 }
2802
2803 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2804 xoap->xoa_offline =
2805 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2806 XVA_SET_RTN(xvap, XAT_OFFLINE);
2807 }
2808
2809 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2810 xoap->xoa_sparse =
2811 ((zp->z_pflags & ZFS_SPARSE) != 0);
2812 XVA_SET_RTN(xvap, XAT_SPARSE);
2813 }
2814 }
2815
2816 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2817 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2818 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2819 ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2820
2821
2822 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2823 vap->va_blksize = blksize;
2824 vap->va_bytes = nblocks << 9; /* nblocks * 512 */
2825
2826 if (zp->z_blksz == 0) {
2827 /*
2828 * Block size hasn't been set; suggest maximal I/O transfers.
2829 */
2830 vap->va_blksize = zfsvfs->z_max_blksz;
2831 }
2832
2833 ZFS_EXIT(zfsvfs);
2834 return (0);
2835 }
2836
2837 /*
2838 * Set the file attributes to the values contained in the
2839 * vattr structure.
2840 *
2841 * IN: vp - vnode of file to be modified.
2842 * vap - new attribute values.
2843 * If AT_XVATTR set, then optional attrs are being set
2844 * flags - ATTR_UTIME set if non-default time values provided.
2845 * - ATTR_NOACLCHECK (CIFS context only).
2846 * cr - credentials of caller.
2847 * ct - caller context
2848 *
2849 * RETURN: 0 on success, error code on failure.
2850 *
2851 * Timestamps:
2852 * vp - ctime updated, mtime updated if size changed.
2853 */
2854 /* ARGSUSED */
2855 static int
zfs_setattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr,caller_context_t * ct)2856 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2857 caller_context_t *ct)
2858 {
2859 znode_t *zp = VTOZ(vp);
2860 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2861 zilog_t *zilog;
2862 dmu_tx_t *tx;
2863 vattr_t oldva;
2864 xvattr_t tmpxvattr;
2865 uint_t mask = vap->va_mask;
2866 uint_t saved_mask = 0;
2867 uint64_t saved_mode;
2868 int trim_mask = 0;
2869 uint64_t new_mode;
2870 uint64_t new_uid, new_gid;
2871 uint64_t xattr_obj;
2872 uint64_t mtime[2], ctime[2];
2873 znode_t *attrzp;
2874 int need_policy = FALSE;
2875 int err, err2;
2876 zfs_fuid_info_t *fuidp = NULL;
2877 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2878 xoptattr_t *xoap;
2879 zfs_acl_t *aclp;
2880 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2881 boolean_t fuid_dirtied = B_FALSE;
2882 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2883 int count = 0, xattr_count = 0;
2884
2885 if (mask == 0)
2886 return (0);
2887
2888 if (mask & AT_NOSET)
2889 return (SET_ERROR(EINVAL));
2890
2891 ZFS_ENTER(zfsvfs);
2892 ZFS_VERIFY_ZP(zp);
2893
2894 zilog = zfsvfs->z_log;
2895
2896 /*
2897 * Make sure that if we have ephemeral uid/gid or xvattr specified
2898 * that file system is at proper version level
2899 */
2900
2901 if (zfsvfs->z_use_fuids == B_FALSE &&
2902 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2903 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2904 (mask & AT_XVATTR))) {
2905 ZFS_EXIT(zfsvfs);
2906 return (SET_ERROR(EINVAL));
2907 }
2908
2909 if (mask & AT_SIZE && vp->v_type == VDIR) {
2910 ZFS_EXIT(zfsvfs);
2911 return (SET_ERROR(EISDIR));
2912 }
2913
2914 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2915 ZFS_EXIT(zfsvfs);
2916 return (SET_ERROR(EINVAL));
2917 }
2918
2919 /*
2920 * If this is an xvattr_t, then get a pointer to the structure of
2921 * optional attributes. If this is NULL, then we have a vattr_t.
2922 */
2923 xoap = xva_getxoptattr(xvap);
2924
2925 xva_init(&tmpxvattr);
2926
2927 /*
2928 * Immutable files can only alter immutable bit and atime
2929 */
2930 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2931 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2932 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2933 ZFS_EXIT(zfsvfs);
2934 return (SET_ERROR(EPERM));
2935 }
2936
2937 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2938 ZFS_EXIT(zfsvfs);
2939 return (SET_ERROR(EPERM));
2940 }
2941
2942 /*
2943 * Verify timestamps doesn't overflow 32 bits.
2944 * ZFS can handle large timestamps, but 32bit syscalls can't
2945 * handle times greater than 2039. This check should be removed
2946 * once large timestamps are fully supported.
2947 */
2948 if (mask & (AT_ATIME | AT_MTIME)) {
2949 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2950 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2951 ZFS_EXIT(zfsvfs);
2952 return (SET_ERROR(EOVERFLOW));
2953 }
2954 }
2955
2956 attrzp = NULL;
2957 aclp = NULL;
2958
2959 /* Can this be moved to before the top label? */
2960 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2961 ZFS_EXIT(zfsvfs);
2962 return (SET_ERROR(EROFS));
2963 }
2964
2965 /*
2966 * First validate permissions
2967 */
2968
2969 if (mask & AT_SIZE) {
2970 /*
2971 * XXX - Note, we are not providing any open
2972 * mode flags here (like FNDELAY), so we may
2973 * block if there are locks present... this
2974 * should be addressed in openat().
2975 */
2976 /* XXX - would it be OK to generate a log record here? */
2977 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2978 if (err) {
2979 ZFS_EXIT(zfsvfs);
2980 return (err);
2981 }
2982 }
2983
2984 if (mask & (AT_ATIME|AT_MTIME) ||
2985 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2986 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2987 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2988 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2989 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2990 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2991 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2992 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2993 skipaclchk, cr);
2994 }
2995
2996 if (mask & (AT_UID|AT_GID)) {
2997 int idmask = (mask & (AT_UID|AT_GID));
2998 int take_owner;
2999 int take_group;
3000
3001 /*
3002 * NOTE: even if a new mode is being set,
3003 * we may clear S_ISUID/S_ISGID bits.
3004 */
3005
3006 if (!(mask & AT_MODE))
3007 vap->va_mode = zp->z_mode;
3008
3009 /*
3010 * Take ownership or chgrp to group we are a member of
3011 */
3012
3013 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3014 take_group = (mask & AT_GID) &&
3015 zfs_groupmember(zfsvfs, vap->va_gid, cr);
3016
3017 /*
3018 * If both AT_UID and AT_GID are set then take_owner and
3019 * take_group must both be set in order to allow taking
3020 * ownership.
3021 *
3022 * Otherwise, send the check through secpolicy_vnode_setattr()
3023 *
3024 */
3025
3026 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3027 ((idmask == AT_UID) && take_owner) ||
3028 ((idmask == AT_GID) && take_group)) {
3029 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3030 skipaclchk, cr) == 0) {
3031 /*
3032 * Remove setuid/setgid for non-privileged users
3033 */
3034 secpolicy_setid_clear(vap, vp, cr);
3035 trim_mask = (mask & (AT_UID|AT_GID));
3036 } else {
3037 need_policy = TRUE;
3038 }
3039 } else {
3040 need_policy = TRUE;
3041 }
3042 }
3043
3044 oldva.va_mode = zp->z_mode;
3045 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3046 if (mask & AT_XVATTR) {
3047 /*
3048 * Update xvattr mask to include only those attributes
3049 * that are actually changing.
3050 *
3051 * the bits will be restored prior to actually setting
3052 * the attributes so the caller thinks they were set.
3053 */
3054 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3055 if (xoap->xoa_appendonly !=
3056 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3057 need_policy = TRUE;
3058 } else {
3059 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3060 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3061 }
3062 }
3063
3064 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3065 if (xoap->xoa_nounlink !=
3066 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3067 need_policy = TRUE;
3068 } else {
3069 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3070 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3071 }
3072 }
3073
3074 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3075 if (xoap->xoa_immutable !=
3076 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3077 need_policy = TRUE;
3078 } else {
3079 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3080 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3081 }
3082 }
3083
3084 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3085 if (xoap->xoa_nodump !=
3086 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3087 need_policy = TRUE;
3088 } else {
3089 XVA_CLR_REQ(xvap, XAT_NODUMP);
3090 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3091 }
3092 }
3093
3094 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3095 if (xoap->xoa_av_modified !=
3096 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3097 need_policy = TRUE;
3098 } else {
3099 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3100 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3101 }
3102 }
3103
3104 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3105 if ((vp->v_type != VREG &&
3106 xoap->xoa_av_quarantined) ||
3107 xoap->xoa_av_quarantined !=
3108 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3109 need_policy = TRUE;
3110 } else {
3111 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3112 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3113 }
3114 }
3115
3116 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3117 ZFS_EXIT(zfsvfs);
3118 return (SET_ERROR(EPERM));
3119 }
3120
3121 if (need_policy == FALSE &&
3122 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3123 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3124 need_policy = TRUE;
3125 }
3126 }
3127
3128 if (mask & AT_MODE) {
3129 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3130 err = secpolicy_setid_setsticky_clear(vp, vap,
3131 &oldva, cr);
3132 if (err) {
3133 ZFS_EXIT(zfsvfs);
3134 return (err);
3135 }
3136 trim_mask |= AT_MODE;
3137 } else {
3138 need_policy = TRUE;
3139 }
3140 }
3141
3142 if (need_policy) {
3143 /*
3144 * If trim_mask is set then take ownership
3145 * has been granted or write_acl is present and user
3146 * has the ability to modify mode. In that case remove
3147 * UID|GID and or MODE from mask so that
3148 * secpolicy_vnode_setattr() doesn't revoke it.
3149 */
3150
3151 if (trim_mask) {
3152 saved_mask = vap->va_mask;
3153 vap->va_mask &= ~trim_mask;
3154 if (trim_mask & AT_MODE) {
3155 /*
3156 * Save the mode, as secpolicy_vnode_setattr()
3157 * will overwrite it with ova.va_mode.
3158 */
3159 saved_mode = vap->va_mode;
3160 }
3161 }
3162 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3163 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3164 if (err) {
3165 ZFS_EXIT(zfsvfs);
3166 return (err);
3167 }
3168
3169 if (trim_mask) {
3170 vap->va_mask |= saved_mask;
3171 if (trim_mask & AT_MODE) {
3172 /*
3173 * Recover the mode after
3174 * secpolicy_vnode_setattr().
3175 */
3176 vap->va_mode = saved_mode;
3177 }
3178 }
3179 }
3180
3181 /*
3182 * secpolicy_vnode_setattr, or take ownership may have
3183 * changed va_mask
3184 */
3185 mask = vap->va_mask;
3186
3187 if ((mask & (AT_UID | AT_GID))) {
3188 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3189 &xattr_obj, sizeof (xattr_obj));
3190
3191 if (err == 0 && xattr_obj) {
3192 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3193 if (err == 0) {
3194 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3195 if (err != 0)
3196 vrele(ZTOV(attrzp));
3197 }
3198 if (err)
3199 goto out2;
3200 }
3201 if (mask & AT_UID) {
3202 new_uid = zfs_fuid_create(zfsvfs,
3203 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3204 if (new_uid != zp->z_uid &&
3205 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3206 if (attrzp)
3207 vput(ZTOV(attrzp));
3208 err = SET_ERROR(EDQUOT);
3209 goto out2;
3210 }
3211 }
3212
3213 if (mask & AT_GID) {
3214 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3215 cr, ZFS_GROUP, &fuidp);
3216 if (new_gid != zp->z_gid &&
3217 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3218 if (attrzp)
3219 vput(ZTOV(attrzp));
3220 err = SET_ERROR(EDQUOT);
3221 goto out2;
3222 }
3223 }
3224 }
3225 tx = dmu_tx_create(zfsvfs->z_os);
3226
3227 if (mask & AT_MODE) {
3228 uint64_t pmode = zp->z_mode;
3229 uint64_t acl_obj;
3230 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3231
3232 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3233 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3234 err = SET_ERROR(EPERM);
3235 goto out;
3236 }
3237
3238 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3239 goto out;
3240
3241 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3242 /*
3243 * Are we upgrading ACL from old V0 format
3244 * to V1 format?
3245 */
3246 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3247 zfs_znode_acl_version(zp) ==
3248 ZFS_ACL_VERSION_INITIAL) {
3249 dmu_tx_hold_free(tx, acl_obj, 0,
3250 DMU_OBJECT_END);
3251 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3252 0, aclp->z_acl_bytes);
3253 } else {
3254 dmu_tx_hold_write(tx, acl_obj, 0,
3255 aclp->z_acl_bytes);
3256 }
3257 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3258 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3259 0, aclp->z_acl_bytes);
3260 }
3261 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3262 } else {
3263 if ((mask & AT_XVATTR) &&
3264 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3265 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3266 else
3267 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3268 }
3269
3270 if (attrzp) {
3271 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3272 }
3273
3274 fuid_dirtied = zfsvfs->z_fuid_dirty;
3275 if (fuid_dirtied)
3276 zfs_fuid_txhold(zfsvfs, tx);
3277
3278 zfs_sa_upgrade_txholds(tx, zp);
3279
3280 err = dmu_tx_assign(tx, TXG_WAIT);
3281 if (err)
3282 goto out;
3283
3284 count = 0;
3285 /*
3286 * Set each attribute requested.
3287 * We group settings according to the locks they need to acquire.
3288 *
3289 * Note: you cannot set ctime directly, although it will be
3290 * updated as a side-effect of calling this function.
3291 */
3292
3293 if (mask & (AT_UID|AT_GID|AT_MODE))
3294 mutex_enter(&zp->z_acl_lock);
3295
3296 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3297 &zp->z_pflags, sizeof (zp->z_pflags));
3298
3299 if (attrzp) {
3300 if (mask & (AT_UID|AT_GID|AT_MODE))
3301 mutex_enter(&attrzp->z_acl_lock);
3302 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3303 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3304 sizeof (attrzp->z_pflags));
3305 }
3306
3307 if (mask & (AT_UID|AT_GID)) {
3308
3309 if (mask & AT_UID) {
3310 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3311 &new_uid, sizeof (new_uid));
3312 zp->z_uid = new_uid;
3313 if (attrzp) {
3314 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3315 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3316 sizeof (new_uid));
3317 attrzp->z_uid = new_uid;
3318 }
3319 }
3320
3321 if (mask & AT_GID) {
3322 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3323 NULL, &new_gid, sizeof (new_gid));
3324 zp->z_gid = new_gid;
3325 if (attrzp) {
3326 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3327 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3328 sizeof (new_gid));
3329 attrzp->z_gid = new_gid;
3330 }
3331 }
3332 if (!(mask & AT_MODE)) {
3333 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3334 NULL, &new_mode, sizeof (new_mode));
3335 new_mode = zp->z_mode;
3336 }
3337 err = zfs_acl_chown_setattr(zp);
3338 ASSERT(err == 0);
3339 if (attrzp) {
3340 err = zfs_acl_chown_setattr(attrzp);
3341 ASSERT(err == 0);
3342 }
3343 }
3344
3345 if (mask & AT_MODE) {
3346 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3347 &new_mode, sizeof (new_mode));
3348 zp->z_mode = new_mode;
3349 ASSERT3U((uintptr_t)aclp, !=, 0);
3350 err = zfs_aclset_common(zp, aclp, cr, tx);
3351 ASSERT0(err);
3352 if (zp->z_acl_cached)
3353 zfs_acl_free(zp->z_acl_cached);
3354 zp->z_acl_cached = aclp;
3355 aclp = NULL;
3356 }
3357
3358
3359 if (mask & AT_ATIME) {
3360 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3361 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3362 &zp->z_atime, sizeof (zp->z_atime));
3363 }
3364
3365 if (mask & AT_MTIME) {
3366 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3367 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3368 mtime, sizeof (mtime));
3369 }
3370
3371 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3372 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3373 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3374 NULL, mtime, sizeof (mtime));
3375 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3376 &ctime, sizeof (ctime));
3377 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3378 B_TRUE);
3379 } else if (mask != 0) {
3380 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3381 &ctime, sizeof (ctime));
3382 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3383 B_TRUE);
3384 if (attrzp) {
3385 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3386 SA_ZPL_CTIME(zfsvfs), NULL,
3387 &ctime, sizeof (ctime));
3388 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3389 mtime, ctime, B_TRUE);
3390 }
3391 }
3392 /*
3393 * Do this after setting timestamps to prevent timestamp
3394 * update from toggling bit
3395 */
3396
3397 if (xoap && (mask & AT_XVATTR)) {
3398
3399 /*
3400 * restore trimmed off masks
3401 * so that return masks can be set for caller.
3402 */
3403
3404 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3405 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3406 }
3407 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3408 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3409 }
3410 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3411 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3412 }
3413 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3414 XVA_SET_REQ(xvap, XAT_NODUMP);
3415 }
3416 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3417 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3418 }
3419 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3420 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3421 }
3422
3423 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3424 ASSERT(vp->v_type == VREG);
3425
3426 zfs_xvattr_set(zp, xvap, tx);
3427 }
3428
3429 if (fuid_dirtied)
3430 zfs_fuid_sync(zfsvfs, tx);
3431
3432 if (mask != 0)
3433 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3434
3435 if (mask & (AT_UID|AT_GID|AT_MODE))
3436 mutex_exit(&zp->z_acl_lock);
3437
3438 if (attrzp) {
3439 if (mask & (AT_UID|AT_GID|AT_MODE))
3440 mutex_exit(&attrzp->z_acl_lock);
3441 }
3442 out:
3443 if (err == 0 && attrzp) {
3444 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3445 xattr_count, tx);
3446 ASSERT(err2 == 0);
3447 }
3448
3449 if (attrzp)
3450 vput(ZTOV(attrzp));
3451
3452 if (aclp)
3453 zfs_acl_free(aclp);
3454
3455 if (fuidp) {
3456 zfs_fuid_info_free(fuidp);
3457 fuidp = NULL;
3458 }
3459
3460 if (err) {
3461 dmu_tx_abort(tx);
3462 } else {
3463 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3464 dmu_tx_commit(tx);
3465 }
3466
3467 out2:
3468 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3469 zil_commit(zilog, 0);
3470
3471 ZFS_EXIT(zfsvfs);
3472 return (err);
3473 }
3474
/*
 * Take the vnode locks needed by a rename on sdvp, tdvp, *svpp and *tvpp.
 *
 * sdvp is locked with a blocking vn_lock(); every other lock is attempted
 * with LK_NOWAIT.  If we fail to acquire any lock in the path we will drop
 * all held locks, acquire the new lock in a blocking fashion, and then
 * release it and restart the rename.  This acquire/release step ensures
 * that we do not spin on a lock waiting for release.
 *
 * Before any lock is taken, tdvp (and *tvpp when it is non-NULL and is not
 * tdvp itself) is unlocked.
 *
 * The source and target names are looked up again once both directories
 * are locked, so *svpp and *tvpp may be replaced by the freshly resolved
 * vnodes; the references previously stored there are dropped with vrele().
 * *tvpp ends up NULL when the target lookup yields no znode.
 *
 * Returns 0 with sdvp, tdvp, *svpp and (if non-NULL) *tvpp locked.  On
 * error release all vnode locks and decrement references the way
 * tmpfs_rename() would do, then return the error code.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/* Start from a state in which none of the four vnodes is locked. */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* The source directory is the only lock we are willing to sleep on. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * tdvp is contended: wait for it with nothing else held,
		 * then let go of it and start over.
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* A source of "." or ".." is reported as EINVAL instead. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* Drop the reference obtained by the source lookup above. */
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		/* Wait for the source vnode with no other lock held. */
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		/* Hand the re-resolved source back to the caller and retry. */
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Source is locked: replace the caller's vnode with the fresh one. */
	vrele(*svpp);
	*svpp = nvp;

	/* The caller's old target reference is always dropped at this point. */
	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			/*
			 * Unlock and release the target; it is looked up
			 * again after the restart.
			 */
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	return (0);

out:
	return (error);
}
3652
3653 /*
3654 * Note that we must use VRELE_ASYNC in this function as it walks
3655 * up the directory tree and vrele may need to acquire an exclusive
3656 * lock if a last reference to a vnode is dropped.
3657 */
3658 static int
zfs_rename_check(znode_t * szp,znode_t * sdzp,znode_t * tdzp)3659 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3660 {
3661 zfsvfs_t *zfsvfs;
3662 znode_t *zp, *zp1;
3663 uint64_t parent;
3664 int error;
3665
3666 zfsvfs = tdzp->z_zfsvfs;
3667 if (tdzp == szp)
3668 return (SET_ERROR(EINVAL));
3669 if (tdzp == sdzp)
3670 return (0);
3671 if (tdzp->z_id == zfsvfs->z_root)
3672 return (0);
3673 zp = tdzp;
3674 for (;;) {
3675 ASSERT(!zp->z_unlinked);
3676 if ((error = sa_lookup(zp->z_sa_hdl,
3677 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3678 break;
3679
3680 if (parent == szp->z_id) {
3681 error = SET_ERROR(EINVAL);
3682 break;
3683 }
3684 if (parent == zfsvfs->z_root)
3685 break;
3686 if (parent == sdzp->z_id)
3687 break;
3688
3689 error = zfs_zget(zfsvfs, parent, &zp1);
3690 if (error != 0)
3691 break;
3692
3693 if (zp != tdzp)
3694 VN_RELE_ASYNC(ZTOV(zp),
3695 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3696 zp = zp1;
3697 }
3698
3699 if (error == ENOTDIR)
3700 panic("checkpath: .. not a directory\n");
3701 if (zp != tdzp)
3702 VN_RELE_ASYNC(ZTOV(zp),
3703 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3704 return (error);
3705 }
3706
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		svpp	- vnode of the entry being renamed.
 *		scnp	- Component name of the old entry.
 *		tdvp	- Target directory to contain the "new entry".
 *		tvpp	- vnode currently at the target name, or NULL.
 *		tcnp	- Component name of the new entry.
 *		cr	- credentials of caller.
 *
 *	OUT:	svpp,
 *		tvpp	- may be replaced by zfs_rename_relock() with the
 *			  vnodes it re-resolved.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * All vnode locks taken or held for the rename are released before
 * returning.
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Neither end of the rename may be a directory that is mounted on. */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/out of an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				/*
				 * Directory replacing a directory: purge the
				 * name cache entries of both parents.
				 */
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	/*
	 * NOTE(review): `ct` is neither a parameter nor a local of this
	 * function, so these calls only build if the vnevent_*() hooks are
	 * macros that discard their arguments on this platform -- confirm.
	 */
	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Declare everything the transaction may touch: the SA of the source,
	 * of both directories and of an existing target, the two directory
	 * ZAPs, and the unlinked set.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			/* The rename went through: drop stale name cache data. */
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	/* The transaction is committed whether or not the steps succeeded. */
	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	/*
	 * NOTE(review): zfsvfs is still uninitialized when this label is
	 * reached through the early `goto out` statements; it is only read
	 * when error == 0, and those paths store SET_ERROR(EXDEV) first --
	 * this relies on SET_ERROR() yielding a non-zero value.
	 */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}
3966
3967 /*
3968 * Insert the indicated symbolic reference entry into the directory.
3969 *
3970 * IN: dvp - Directory to contain new symbolic link.
3971 * link - Name for new symlink entry.
3972 * vap - Attributes of new entry.
3973 * cr - credentials of caller.
3974 * ct - caller context
3975 * flags - case flags
3976 *
3977 * RETURN: 0 on success, error code on failure.
3978 *
3979 * Timestamps:
3980 * dvp - ctime|mtime updated
3981 */
3982 /*ARGSUSED*/
3983 static int
zfs_symlink(vnode_t * dvp,vnode_t ** vpp,char * name,vattr_t * vap,char * link,cred_t * cr,kthread_t * td)3984 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3985 cred_t *cr, kthread_t *td)
3986 {
3987 znode_t *zp, *dzp = VTOZ(dvp);
3988 dmu_tx_t *tx;
3989 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3990 zilog_t *zilog;
3991 uint64_t len = strlen(link);
3992 int error;
3993 zfs_acl_ids_t acl_ids;
3994 boolean_t fuid_dirtied;
3995 uint64_t txtype = TX_SYMLINK;
3996 int flags = 0;
3997
3998 ASSERT(vap->va_type == VLNK);
3999
4000 ZFS_ENTER(zfsvfs);
4001 ZFS_VERIFY_ZP(dzp);
4002 zilog = zfsvfs->z_log;
4003
4004 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4005 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4006 ZFS_EXIT(zfsvfs);
4007 return (SET_ERROR(EILSEQ));
4008 }
4009
4010 if (len > MAXPATHLEN) {
4011 ZFS_EXIT(zfsvfs);
4012 return (SET_ERROR(ENAMETOOLONG));
4013 }
4014
4015 if ((error = zfs_acl_ids_create(dzp, 0,
4016 vap, cr, NULL, &acl_ids)) != 0) {
4017 ZFS_EXIT(zfsvfs);
4018 return (error);
4019 }
4020
4021 /*
4022 * Attempt to lock directory; fail if entry already exists.
4023 */
4024 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4025 if (error) {
4026 zfs_acl_ids_free(&acl_ids);
4027 ZFS_EXIT(zfsvfs);
4028 return (error);
4029 }
4030
4031 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4032 zfs_acl_ids_free(&acl_ids);
4033 ZFS_EXIT(zfsvfs);
4034 return (error);
4035 }
4036
4037 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4038 zfs_acl_ids_free(&acl_ids);
4039 ZFS_EXIT(zfsvfs);
4040 return (SET_ERROR(EDQUOT));
4041 }
4042
4043 getnewvnode_reserve(1);
4044 tx = dmu_tx_create(zfsvfs->z_os);
4045 fuid_dirtied = zfsvfs->z_fuid_dirty;
4046 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4047 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4048 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4049 ZFS_SA_BASE_ATTR_SIZE + len);
4050 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4051 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4052 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4053 acl_ids.z_aclp->z_acl_bytes);
4054 }
4055 if (fuid_dirtied)
4056 zfs_fuid_txhold(zfsvfs, tx);
4057 error = dmu_tx_assign(tx, TXG_WAIT);
4058 if (error) {
4059 zfs_acl_ids_free(&acl_ids);
4060 dmu_tx_abort(tx);
4061 getnewvnode_drop_reserve();
4062 ZFS_EXIT(zfsvfs);
4063 return (error);
4064 }
4065
4066 /*
4067 * Create a new object for the symlink.
4068 * for version 4 ZPL datsets the symlink will be an SA attribute
4069 */
4070 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4071
4072 if (fuid_dirtied)
4073 zfs_fuid_sync(zfsvfs, tx);
4074
4075 if (zp->z_is_sa)
4076 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4077 link, len, tx);
4078 else
4079 zfs_sa_symlink(zp, link, len, tx);
4080
4081 zp->z_size = len;
4082 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4083 &zp->z_size, sizeof (zp->z_size), tx);
4084 /*
4085 * Insert the new object into the directory.
4086 */
4087 (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4088
4089 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4090 *vpp = ZTOV(zp);
4091
4092 zfs_acl_ids_free(&acl_ids);
4093
4094 dmu_tx_commit(tx);
4095
4096 getnewvnode_drop_reserve();
4097
4098 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4099 zil_commit(zilog, 0);
4100
4101 ZFS_EXIT(zfsvfs);
4102 return (error);
4103 }
4104
4105 /*
4106 * Return, in the buffer contained in the provided uio structure,
4107 * the symbolic path referred to by vp.
4108 *
4109 * IN: vp - vnode of symbolic link.
4110 * uio - structure to contain the link path.
4111 * cr - credentials of caller.
4112 * ct - caller context
4113 *
4114 * OUT: uio - structure containing the link path.
4115 *
4116 * RETURN: 0 on success, error code on failure.
4117 *
4118 * Timestamps:
4119 * vp - atime updated
4120 */
4121 /* ARGSUSED */
4122 static int
zfs_readlink(vnode_t * vp,uio_t * uio,cred_t * cr,caller_context_t * ct)4123 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4124 {
4125 znode_t *zp = VTOZ(vp);
4126 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4127 int error;
4128
4129 ZFS_ENTER(zfsvfs);
4130 ZFS_VERIFY_ZP(zp);
4131
4132 if (zp->z_is_sa)
4133 error = sa_lookup_uio(zp->z_sa_hdl,
4134 SA_ZPL_SYMLINK(zfsvfs), uio);
4135 else
4136 error = zfs_sa_readlink(zp, uio);
4137
4138 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4139
4140 ZFS_EXIT(zfsvfs);
4141 return (error);
4142 }
4143
4144 /*
4145 * Insert a new entry into directory tdvp referencing svp.
4146 *
4147 * IN: tdvp - Directory to contain new entry.
4148 * svp - vnode of new entry.
4149 * name - name of new entry.
4150 * cr - credentials of caller.
4151 * ct - caller context
4152 *
4153 * RETURN: 0 on success, error code on failure.
4154 *
4155 * Timestamps:
4156 * tdvp - ctime|mtime updated
4157 * svp - ctime updated
4158 */
4159 /* ARGSUSED */
4160 static int
zfs_link(vnode_t * tdvp,vnode_t * svp,char * name,cred_t * cr,caller_context_t * ct,int flags)4161 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4162 caller_context_t *ct, int flags)
4163 {
4164 znode_t *dzp = VTOZ(tdvp);
4165 znode_t *tzp, *szp;
4166 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
4167 zilog_t *zilog;
4168 dmu_tx_t *tx;
4169 int error;
4170 uint64_t parent;
4171 uid_t owner;
4172
4173 ASSERT(tdvp->v_type == VDIR);
4174
4175 ZFS_ENTER(zfsvfs);
4176 ZFS_VERIFY_ZP(dzp);
4177 zilog = zfsvfs->z_log;
4178
4179 /*
4180 * POSIX dictates that we return EPERM here.
4181 * Better choices include ENOTSUP or EISDIR.
4182 */
4183 if (svp->v_type == VDIR) {
4184 ZFS_EXIT(zfsvfs);
4185 return (SET_ERROR(EPERM));
4186 }
4187
4188 szp = VTOZ(svp);
4189 ZFS_VERIFY_ZP(szp);
4190
4191 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4192 ZFS_EXIT(zfsvfs);
4193 return (SET_ERROR(EPERM));
4194 }
4195
4196 /* Prevent links to .zfs/shares files */
4197
4198 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4199 &parent, sizeof (uint64_t))) != 0) {
4200 ZFS_EXIT(zfsvfs);
4201 return (error);
4202 }
4203 if (parent == zfsvfs->z_shares_dir) {
4204 ZFS_EXIT(zfsvfs);
4205 return (SET_ERROR(EPERM));
4206 }
4207
4208 if (zfsvfs->z_utf8 && u8_validate(name,
4209 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4210 ZFS_EXIT(zfsvfs);
4211 return (SET_ERROR(EILSEQ));
4212 }
4213
4214 /*
4215 * We do not support links between attributes and non-attributes
4216 * because of the potential security risk of creating links
4217 * into "normal" file space in order to circumvent restrictions
4218 * imposed in attribute space.
4219 */
4220 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4221 ZFS_EXIT(zfsvfs);
4222 return (SET_ERROR(EINVAL));
4223 }
4224
4225
4226 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4227 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4228 ZFS_EXIT(zfsvfs);
4229 return (SET_ERROR(EPERM));
4230 }
4231
4232 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4233 ZFS_EXIT(zfsvfs);
4234 return (error);
4235 }
4236
4237 /*
4238 * Attempt to lock directory; fail if entry already exists.
4239 */
4240 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4241 if (error) {
4242 ZFS_EXIT(zfsvfs);
4243 return (error);
4244 }
4245
4246 tx = dmu_tx_create(zfsvfs->z_os);
4247 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4248 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4249 zfs_sa_upgrade_txholds(tx, szp);
4250 zfs_sa_upgrade_txholds(tx, dzp);
4251 error = dmu_tx_assign(tx, TXG_WAIT);
4252 if (error) {
4253 dmu_tx_abort(tx);
4254 ZFS_EXIT(zfsvfs);
4255 return (error);
4256 }
4257
4258 error = zfs_link_create(dzp, name, szp, tx, 0);
4259
4260 if (error == 0) {
4261 uint64_t txtype = TX_LINK;
4262 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4263 }
4264
4265 dmu_tx_commit(tx);
4266
4267 if (error == 0) {
4268 vnevent_link(svp, ct);
4269 }
4270
4271 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4272 zil_commit(zilog, 0);
4273
4274 ZFS_EXIT(zfsvfs);
4275 return (error);
4276 }
4277
4278
4279 /*ARGSUSED*/
4280 void
zfs_inactive(vnode_t * vp,cred_t * cr,caller_context_t * ct)4281 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4282 {
4283 znode_t *zp = VTOZ(vp);
4284 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4285 int error;
4286
4287 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4288 if (zp->z_sa_hdl == NULL) {
4289 /*
4290 * The fs has been unmounted, or we did a
4291 * suspend/resume and this file no longer exists.
4292 */
4293 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4294 vrecycle(vp);
4295 return;
4296 }
4297
4298 if (zp->z_unlinked) {
4299 /*
4300 * Fast path to recycle a vnode of a removed file.
4301 */
4302 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4303 vrecycle(vp);
4304 return;
4305 }
4306
4307 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4308 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4309
4310 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4311 zfs_sa_upgrade_txholds(tx, zp);
4312 error = dmu_tx_assign(tx, TXG_WAIT);
4313 if (error) {
4314 dmu_tx_abort(tx);
4315 } else {
4316 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4317 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4318 zp->z_atime_dirty = 0;
4319 dmu_tx_commit(tx);
4320 }
4321 }
4322 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4323 }
4324
4325
/*
 * zfs_fid() fills in the caller's fid structure through zfid_short_t and
 * zfid_long_t pointers, so neither of those layouts may be larger than
 * struct fid.  Checked at compile time.
 */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4328
4329 /*ARGSUSED*/
4330 static int
zfs_fid(vnode_t * vp,fid_t * fidp,caller_context_t * ct)4331 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4332 {
4333 znode_t *zp = VTOZ(vp);
4334 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4335 uint32_t gen;
4336 uint64_t gen64;
4337 uint64_t object = zp->z_id;
4338 zfid_short_t *zfid;
4339 int size, i, error;
4340
4341 ZFS_ENTER(zfsvfs);
4342 ZFS_VERIFY_ZP(zp);
4343
4344 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4345 &gen64, sizeof (uint64_t))) != 0) {
4346 ZFS_EXIT(zfsvfs);
4347 return (error);
4348 }
4349
4350 gen = (uint32_t)gen64;
4351
4352 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4353
4354 #ifdef illumos
4355 if (fidp->fid_len < size) {
4356 fidp->fid_len = size;
4357 ZFS_EXIT(zfsvfs);
4358 return (SET_ERROR(ENOSPC));
4359 }
4360 #else
4361 fidp->fid_len = size;
4362 #endif
4363
4364 zfid = (zfid_short_t *)fidp;
4365
4366 zfid->zf_len = size;
4367
4368 for (i = 0; i < sizeof (zfid->zf_object); i++)
4369 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4370
4371 /* Must have a non-zero generation number to distinguish from .zfs */
4372 if (gen == 0)
4373 gen = 1;
4374 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4375 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4376
4377 if (size == LONG_FID_LEN) {
4378 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4379 zfid_long_t *zlfid;
4380
4381 zlfid = (zfid_long_t *)fidp;
4382
4383 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4384 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4385
4386 /* XXX - this should be the generation number for the objset */
4387 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4388 zlfid->zf_setgen[i] = 0;
4389 }
4390
4391 ZFS_EXIT(zfsvfs);
4392 return (0);
4393 }
4394
4395 static int
zfs_pathconf(vnode_t * vp,int cmd,ulong_t * valp,cred_t * cr,caller_context_t * ct)4396 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4397 caller_context_t *ct)
4398 {
4399 znode_t *zp, *xzp;
4400 zfsvfs_t *zfsvfs;
4401 int error;
4402
4403 switch (cmd) {
4404 case _PC_LINK_MAX:
4405 *valp = INT_MAX;
4406 return (0);
4407
4408 case _PC_FILESIZEBITS:
4409 *valp = 64;
4410 return (0);
4411 #ifdef illumos
4412 case _PC_XATTR_EXISTS:
4413 zp = VTOZ(vp);
4414 zfsvfs = zp->z_zfsvfs;
4415 ZFS_ENTER(zfsvfs);
4416 ZFS_VERIFY_ZP(zp);
4417 *valp = 0;
4418 error = zfs_dirent_lookup(zp, "", &xzp,
4419 ZXATTR | ZEXISTS | ZSHARED);
4420 if (error == 0) {
4421 if (!zfs_dirempty(xzp))
4422 *valp = 1;
4423 vrele(ZTOV(xzp));
4424 } else if (error == ENOENT) {
4425 /*
4426 * If there aren't extended attributes, it's the
4427 * same as having zero of them.
4428 */
4429 error = 0;
4430 }
4431 ZFS_EXIT(zfsvfs);
4432 return (error);
4433
4434 case _PC_SATTR_ENABLED:
4435 case _PC_SATTR_EXISTS:
4436 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4437 (vp->v_type == VREG || vp->v_type == VDIR);
4438 return (0);
4439
4440 case _PC_ACCESS_FILTERING:
4441 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4442 vp->v_type == VDIR;
4443 return (0);
4444
4445 case _PC_ACL_ENABLED:
4446 *valp = _ACL_ACE_ENABLED;
4447 return (0);
4448 #endif /* illumos */
4449 case _PC_MIN_HOLE_SIZE:
4450 *valp = (int)SPA_MINBLOCKSIZE;
4451 return (0);
4452 #ifdef illumos
4453 case _PC_TIMESTAMP_RESOLUTION:
4454 /* nanosecond timestamp resolution */
4455 *valp = 1L;
4456 return (0);
4457 #endif
4458 case _PC_ACL_EXTENDED:
4459 *valp = 0;
4460 return (0);
4461
4462 case _PC_ACL_NFS4:
4463 *valp = 1;
4464 return (0);
4465
4466 case _PC_ACL_PATH_MAX:
4467 *valp = ACL_MAX_ENTRIES;
4468 return (0);
4469
4470 default:
4471 return (EOPNOTSUPP);
4472 }
4473 }
4474
4475 /*ARGSUSED*/
4476 static int
zfs_getsecattr(vnode_t * vp,vsecattr_t * vsecp,int flag,cred_t * cr,caller_context_t * ct)4477 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4478 caller_context_t *ct)
4479 {
4480 znode_t *zp = VTOZ(vp);
4481 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4482 int error;
4483 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4484
4485 ZFS_ENTER(zfsvfs);
4486 ZFS_VERIFY_ZP(zp);
4487 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4488 ZFS_EXIT(zfsvfs);
4489
4490 return (error);
4491 }
4492
4493 /*ARGSUSED*/
4494 int
zfs_setsecattr(vnode_t * vp,vsecattr_t * vsecp,int flag,cred_t * cr,caller_context_t * ct)4495 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4496 caller_context_t *ct)
4497 {
4498 znode_t *zp = VTOZ(vp);
4499 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4500 int error;
4501 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4502 zilog_t *zilog = zfsvfs->z_log;
4503
4504 ZFS_ENTER(zfsvfs);
4505 ZFS_VERIFY_ZP(zp);
4506
4507 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4508
4509 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4510 zil_commit(zilog, 0);
4511
4512 ZFS_EXIT(zfsvfs);
4513 return (error);
4514 }
4515
4516 static int
zfs_getpages(struct vnode * vp,vm_page_t * m,int count,int reqpage)4517 zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
4518 {
4519 znode_t *zp = VTOZ(vp);
4520 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4521 objset_t *os = zp->z_zfsvfs->z_os;
4522 vm_page_t mfirst, mlast, mreq;
4523 vm_object_t object;
4524 caddr_t va;
4525 struct sf_buf *sf;
4526 off_t startoff, endoff;
4527 int i, error;
4528 vm_pindex_t reqstart, reqend;
4529 int pcount, lsize, reqsize, size;
4530
4531 ZFS_ENTER(zfsvfs);
4532 ZFS_VERIFY_ZP(zp);
4533
4534 pcount = OFF_TO_IDX(round_page(count));
4535 mreq = m[reqpage];
4536 object = mreq->object;
4537 error = 0;
4538
4539 KASSERT(vp->v_object == object, ("mismatching object"));
4540
4541 if (pcount > 1 && zp->z_blksz > PAGESIZE) {
4542 startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
4543 reqstart = OFF_TO_IDX(round_page(startoff));
4544 if (reqstart < m[0]->pindex)
4545 reqstart = 0;
4546 else
4547 reqstart = reqstart - m[0]->pindex;
4548 endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
4549 zp->z_blksz);
4550 reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
4551 if (reqend > m[pcount - 1]->pindex)
4552 reqend = m[pcount - 1]->pindex;
4553 reqsize = reqend - m[reqstart]->pindex + 1;
4554 KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
4555 ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
4556 } else {
4557 reqstart = reqpage;
4558 reqsize = 1;
4559 }
4560 mfirst = m[reqstart];
4561 mlast = m[reqstart + reqsize - 1];
4562
4563 zfs_vmobject_wlock(object);
4564
4565 for (i = 0; i < reqstart; i++) {
4566 vm_page_lock(m[i]);
4567 vm_page_free(m[i]);
4568 vm_page_unlock(m[i]);
4569 }
4570 for (i = reqstart + reqsize; i < pcount; i++) {
4571 vm_page_lock(m[i]);
4572 vm_page_free(m[i]);
4573 vm_page_unlock(m[i]);
4574 }
4575
4576 if (mreq->valid && reqsize == 1) {
4577 if (mreq->valid != VM_PAGE_BITS_ALL)
4578 vm_page_zero_invalid(mreq, TRUE);
4579 zfs_vmobject_wunlock(object);
4580 ZFS_EXIT(zfsvfs);
4581 return (zfs_vm_pagerret_ok);
4582 }
4583
4584 PCPU_INC(cnt.v_vnodein);
4585 PCPU_ADD(cnt.v_vnodepgsin, reqsize);
4586
4587 if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
4588 for (i = reqstart; i < reqstart + reqsize; i++) {
4589 if (i != reqpage) {
4590 vm_page_lock(m[i]);
4591 vm_page_free(m[i]);
4592 vm_page_unlock(m[i]);
4593 }
4594 }
4595 zfs_vmobject_wunlock(object);
4596 ZFS_EXIT(zfsvfs);
4597 return (zfs_vm_pagerret_bad);
4598 }
4599
4600 lsize = PAGE_SIZE;
4601 if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
4602 lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
4603
4604 zfs_vmobject_wunlock(object);
4605
4606 for (i = reqstart; i < reqstart + reqsize; i++) {
4607 size = PAGE_SIZE;
4608 if (i == (reqstart + reqsize - 1))
4609 size = lsize;
4610 va = zfs_map_page(m[i], &sf);
4611 error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
4612 size, va, DMU_READ_PREFETCH);
4613 if (size != PAGE_SIZE)
4614 bzero(va + size, PAGE_SIZE - size);
4615 zfs_unmap_page(sf);
4616 if (error != 0)
4617 break;
4618 }
4619
4620 zfs_vmobject_wlock(object);
4621
4622 for (i = reqstart; i < reqstart + reqsize; i++) {
4623 if (!error)
4624 m[i]->valid = VM_PAGE_BITS_ALL;
4625 KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
4626 if (i != reqpage)
4627 vm_page_readahead_finish(m[i]);
4628 }
4629
4630 zfs_vmobject_wunlock(object);
4631
4632 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4633 ZFS_EXIT(zfsvfs);
4634 return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
4635 }
4636
4637 static int
zfs_freebsd_getpages(ap)4638 zfs_freebsd_getpages(ap)
4639 struct vop_getpages_args /* {
4640 struct vnode *a_vp;
4641 vm_page_t *a_m;
4642 int a_count;
4643 int a_reqpage;
4644 vm_ooffset_t a_offset;
4645 } */ *ap;
4646 {
4647
4648 return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
4649 }
4650
4651 static int
zfs_putpages(struct vnode * vp,vm_page_t * ma,size_t len,int flags,int * rtvals)4652 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4653 int *rtvals)
4654 {
4655 znode_t *zp = VTOZ(vp);
4656 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4657 rl_t *rl;
4658 dmu_tx_t *tx;
4659 struct sf_buf *sf;
4660 vm_object_t object;
4661 vm_page_t m;
4662 caddr_t va;
4663 size_t tocopy;
4664 size_t lo_len;
4665 vm_ooffset_t lo_off;
4666 vm_ooffset_t off;
4667 uint_t blksz;
4668 int ncount;
4669 int pcount;
4670 int err;
4671 int i;
4672
4673 ZFS_ENTER(zfsvfs);
4674 ZFS_VERIFY_ZP(zp);
4675
4676 object = vp->v_object;
4677 pcount = btoc(len);
4678 ncount = pcount;
4679
4680 KASSERT(ma[0]->object == object, ("mismatching object"));
4681 KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4682
4683 for (i = 0; i < pcount; i++)
4684 rtvals[i] = zfs_vm_pagerret_error;
4685
4686 off = IDX_TO_OFF(ma[0]->pindex);
4687 blksz = zp->z_blksz;
4688 lo_off = rounddown(off, blksz);
4689 lo_len = roundup(len + (off - lo_off), blksz);
4690 rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4691
4692 zfs_vmobject_wlock(object);
4693 if (len + off > object->un_pager.vnp.vnp_size) {
4694 if (object->un_pager.vnp.vnp_size > off) {
4695 int pgoff;
4696
4697 len = object->un_pager.vnp.vnp_size - off;
4698 ncount = btoc(len);
4699 if ((pgoff = (int)len & PAGE_MASK) != 0) {
4700 /*
4701 * If the object is locked and the following
4702 * conditions hold, then the page's dirty
4703 * field cannot be concurrently changed by a
4704 * pmap operation.
4705 */
4706 m = ma[ncount - 1];
4707 vm_page_assert_sbusied(m);
4708 KASSERT(!pmap_page_is_write_mapped(m),
4709 ("zfs_putpages: page %p is not read-only", m));
4710 vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4711 pgoff);
4712 }
4713 } else {
4714 len = 0;
4715 ncount = 0;
4716 }
4717 if (ncount < pcount) {
4718 for (i = ncount; i < pcount; i++) {
4719 rtvals[i] = zfs_vm_pagerret_bad;
4720 }
4721 }
4722 }
4723 zfs_vmobject_wunlock(object);
4724
4725 if (ncount == 0)
4726 goto out;
4727
4728 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4729 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4730 goto out;
4731 }
4732
4733 tx = dmu_tx_create(zfsvfs->z_os);
4734 dmu_tx_hold_write(tx, zp->z_id, off, len);
4735
4736 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4737 zfs_sa_upgrade_txholds(tx, zp);
4738 err = dmu_tx_assign(tx, TXG_WAIT);
4739 if (err != 0) {
4740 dmu_tx_abort(tx);
4741 goto out;
4742 }
4743
4744 if (zp->z_blksz < PAGE_SIZE) {
4745 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4746 tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4747 va = zfs_map_page(ma[i], &sf);
4748 dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4749 zfs_unmap_page(sf);
4750 }
4751 } else {
4752 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4753 }
4754
4755 if (err == 0) {
4756 uint64_t mtime[2], ctime[2];
4757 sa_bulk_attr_t bulk[3];
4758 int count = 0;
4759
4760 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4761 &mtime, 16);
4762 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4763 &ctime, 16);
4764 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4765 &zp->z_pflags, 8);
4766 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4767 B_TRUE);
4768 (void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4769 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4770
4771 zfs_vmobject_wlock(object);
4772 for (i = 0; i < ncount; i++) {
4773 rtvals[i] = zfs_vm_pagerret_ok;
4774 vm_page_undirty(ma[i]);
4775 }
4776 zfs_vmobject_wunlock(object);
4777 PCPU_INC(cnt.v_vnodeout);
4778 PCPU_ADD(cnt.v_vnodepgsout, ncount);
4779 }
4780 dmu_tx_commit(tx);
4781
4782 out:
4783 zfs_range_unlock(rl);
4784 if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4785 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4786 zil_commit(zfsvfs->z_log, zp->z_id);
4787 ZFS_EXIT(zfsvfs);
4788 return (rtvals[0]);
4789 }
4790
4791 int
zfs_freebsd_putpages(ap)4792 zfs_freebsd_putpages(ap)
4793 struct vop_putpages_args /* {
4794 struct vnode *a_vp;
4795 vm_page_t *a_m;
4796 int a_count;
4797 int a_sync;
4798 int *a_rtvals;
4799 vm_ooffset_t a_offset;
4800 } */ *ap;
4801 {
4802
4803 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4804 ap->a_rtvals));
4805 }
4806
4807 static int
zfs_freebsd_bmap(ap)4808 zfs_freebsd_bmap(ap)
4809 struct vop_bmap_args /* {
4810 struct vnode *a_vp;
4811 daddr_t a_bn;
4812 struct bufobj **a_bop;
4813 daddr_t *a_bnp;
4814 int *a_runp;
4815 int *a_runb;
4816 } */ *ap;
4817 {
4818
4819 if (ap->a_bop != NULL)
4820 *ap->a_bop = &ap->a_vp->v_bufobj;
4821 if (ap->a_bnp != NULL)
4822 *ap->a_bnp = ap->a_bn;
4823 if (ap->a_runp != NULL)
4824 *ap->a_runp = 0;
4825 if (ap->a_runb != NULL)
4826 *ap->a_runb = 0;
4827
4828 return (0);
4829 }
4830
4831 static int
zfs_freebsd_open(ap)4832 zfs_freebsd_open(ap)
4833 struct vop_open_args /* {
4834 struct vnode *a_vp;
4835 int a_mode;
4836 struct ucred *a_cred;
4837 struct thread *a_td;
4838 } */ *ap;
4839 {
4840 vnode_t *vp = ap->a_vp;
4841 znode_t *zp = VTOZ(vp);
4842 int error;
4843
4844 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4845 if (error == 0)
4846 vnode_create_vobject(vp, zp->z_size, ap->a_td);
4847 return (error);
4848 }
4849
4850 static int
zfs_freebsd_close(ap)4851 zfs_freebsd_close(ap)
4852 struct vop_close_args /* {
4853 struct vnode *a_vp;
4854 int a_fflag;
4855 struct ucred *a_cred;
4856 struct thread *a_td;
4857 } */ *ap;
4858 {
4859
4860 return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4861 }
4862
4863 static int
zfs_freebsd_ioctl(ap)4864 zfs_freebsd_ioctl(ap)
4865 struct vop_ioctl_args /* {
4866 struct vnode *a_vp;
4867 u_long a_command;
4868 caddr_t a_data;
4869 int a_fflag;
4870 struct ucred *cred;
4871 struct thread *td;
4872 } */ *ap;
4873 {
4874
4875 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4876 ap->a_fflag, ap->a_cred, NULL, NULL));
4877 }
4878
4879 static int
ioflags(int ioflags)4880 ioflags(int ioflags)
4881 {
4882 int flags = 0;
4883
4884 if (ioflags & IO_APPEND)
4885 flags |= FAPPEND;
4886 if (ioflags & IO_NDELAY)
4887 flags |= FNONBLOCK;
4888 if (ioflags & IO_SYNC)
4889 flags |= (FSYNC | FDSYNC | FRSYNC);
4890
4891 return (flags);
4892 }
4893
4894 static int
zfs_freebsd_read(ap)4895 zfs_freebsd_read(ap)
4896 struct vop_read_args /* {
4897 struct vnode *a_vp;
4898 struct uio *a_uio;
4899 int a_ioflag;
4900 struct ucred *a_cred;
4901 } */ *ap;
4902 {
4903
4904 return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4905 ap->a_cred, NULL));
4906 }
4907
4908 static int
zfs_freebsd_write(ap)4909 zfs_freebsd_write(ap)
4910 struct vop_write_args /* {
4911 struct vnode *a_vp;
4912 struct uio *a_uio;
4913 int a_ioflag;
4914 struct ucred *a_cred;
4915 } */ *ap;
4916 {
4917
4918 return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4919 ap->a_cred, NULL));
4920 }
4921
4922 static int
zfs_freebsd_access(ap)4923 zfs_freebsd_access(ap)
4924 struct vop_access_args /* {
4925 struct vnode *a_vp;
4926 accmode_t a_accmode;
4927 struct ucred *a_cred;
4928 struct thread *a_td;
4929 } */ *ap;
4930 {
4931 vnode_t *vp = ap->a_vp;
4932 znode_t *zp = VTOZ(vp);
4933 accmode_t accmode;
4934 int error = 0;
4935
4936 /*
4937 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4938 */
4939 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4940 if (accmode != 0)
4941 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4942
4943 /*
4944 * VADMIN has to be handled by vaccess().
4945 */
4946 if (error == 0) {
4947 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4948 if (accmode != 0) {
4949 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4950 zp->z_gid, accmode, ap->a_cred, NULL);
4951 }
4952 }
4953
4954 /*
4955 * For VEXEC, ensure that at least one execute bit is set for
4956 * non-directories.
4957 */
4958 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4959 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4960 error = EACCES;
4961 }
4962
4963 return (error);
4964 }
4965
4966 static int
zfs_freebsd_lookup(ap)4967 zfs_freebsd_lookup(ap)
4968 struct vop_lookup_args /* {
4969 struct vnode *a_dvp;
4970 struct vnode **a_vpp;
4971 struct componentname *a_cnp;
4972 } */ *ap;
4973 {
4974 struct componentname *cnp = ap->a_cnp;
4975 char nm[NAME_MAX + 1];
4976
4977 ASSERT(cnp->cn_namelen < sizeof(nm));
4978 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4979
4980 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4981 cnp->cn_cred, cnp->cn_thread, 0));
4982 }
4983
4984 static int
zfs_cache_lookup(ap)4985 zfs_cache_lookup(ap)
4986 struct vop_lookup_args /* {
4987 struct vnode *a_dvp;
4988 struct vnode **a_vpp;
4989 struct componentname *a_cnp;
4990 } */ *ap;
4991 {
4992 zfsvfs_t *zfsvfs;
4993
4994 zfsvfs = ap->a_dvp->v_mount->mnt_data;
4995 if (zfsvfs->z_use_namecache)
4996 return (vfs_cache_lookup(ap));
4997 else
4998 return (zfs_freebsd_lookup(ap));
4999 }
5000
5001 static int
zfs_freebsd_create(ap)5002 zfs_freebsd_create(ap)
5003 struct vop_create_args /* {
5004 struct vnode *a_dvp;
5005 struct vnode **a_vpp;
5006 struct componentname *a_cnp;
5007 struct vattr *a_vap;
5008 } */ *ap;
5009 {
5010 zfsvfs_t *zfsvfs;
5011 struct componentname *cnp = ap->a_cnp;
5012 vattr_t *vap = ap->a_vap;
5013 int error, mode;
5014
5015 ASSERT(cnp->cn_flags & SAVENAME);
5016
5017 vattr_init_mask(vap);
5018 mode = vap->va_mode & ALLPERMS;
5019 zfsvfs = ap->a_dvp->v_mount->mnt_data;
5020
5021 error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5022 ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5023 if (zfsvfs->z_use_namecache &&
5024 error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5025 cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5026 return (error);
5027 }
5028
5029 static int
zfs_freebsd_remove(ap)5030 zfs_freebsd_remove(ap)
5031 struct vop_remove_args /* {
5032 struct vnode *a_dvp;
5033 struct vnode *a_vp;
5034 struct componentname *a_cnp;
5035 } */ *ap;
5036 {
5037
5038 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5039
5040 return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5041 ap->a_cnp->cn_cred));
5042 }
5043
5044 static int
zfs_freebsd_mkdir(ap)5045 zfs_freebsd_mkdir(ap)
5046 struct vop_mkdir_args /* {
5047 struct vnode *a_dvp;
5048 struct vnode **a_vpp;
5049 struct componentname *a_cnp;
5050 struct vattr *a_vap;
5051 } */ *ap;
5052 {
5053 vattr_t *vap = ap->a_vap;
5054
5055 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5056
5057 vattr_init_mask(vap);
5058
5059 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5060 ap->a_cnp->cn_cred));
5061 }
5062
5063 static int
zfs_freebsd_rmdir(ap)5064 zfs_freebsd_rmdir(ap)
5065 struct vop_rmdir_args /* {
5066 struct vnode *a_dvp;
5067 struct vnode *a_vp;
5068 struct componentname *a_cnp;
5069 } */ *ap;
5070 {
5071 struct componentname *cnp = ap->a_cnp;
5072
5073 ASSERT(cnp->cn_flags & SAVENAME);
5074
5075 return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5076 }
5077
5078 static int
zfs_freebsd_readdir(ap)5079 zfs_freebsd_readdir(ap)
5080 struct vop_readdir_args /* {
5081 struct vnode *a_vp;
5082 struct uio *a_uio;
5083 struct ucred *a_cred;
5084 int *a_eofflag;
5085 int *a_ncookies;
5086 u_long **a_cookies;
5087 } */ *ap;
5088 {
5089
5090 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5091 ap->a_ncookies, ap->a_cookies));
5092 }
5093
5094 static int
zfs_freebsd_fsync(ap)5095 zfs_freebsd_fsync(ap)
5096 struct vop_fsync_args /* {
5097 struct vnode *a_vp;
5098 int a_waitfor;
5099 struct thread *a_td;
5100 } */ *ap;
5101 {
5102
5103 vop_stdfsync(ap);
5104 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5105 }
5106
5107 static int
zfs_freebsd_getattr(ap)5108 zfs_freebsd_getattr(ap)
5109 struct vop_getattr_args /* {
5110 struct vnode *a_vp;
5111 struct vattr *a_vap;
5112 struct ucred *a_cred;
5113 } */ *ap;
5114 {
5115 vattr_t *vap = ap->a_vap;
5116 xvattr_t xvap;
5117 u_long fflags = 0;
5118 int error;
5119
5120 xva_init(&xvap);
5121 xvap.xva_vattr = *vap;
5122 xvap.xva_vattr.va_mask |= AT_XVATTR;
5123
5124 /* Convert chflags into ZFS-type flags. */
5125 /* XXX: what about SF_SETTABLE?. */
5126 XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5127 XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5128 XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5129 XVA_SET_REQ(&xvap, XAT_NODUMP);
5130 XVA_SET_REQ(&xvap, XAT_READONLY);
5131 XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5132 XVA_SET_REQ(&xvap, XAT_SYSTEM);
5133 XVA_SET_REQ(&xvap, XAT_HIDDEN);
5134 XVA_SET_REQ(&xvap, XAT_REPARSE);
5135 XVA_SET_REQ(&xvap, XAT_OFFLINE);
5136 XVA_SET_REQ(&xvap, XAT_SPARSE);
5137
5138 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5139 if (error != 0)
5140 return (error);
5141
5142 /* Convert ZFS xattr into chflags. */
5143 #define FLAG_CHECK(fflag, xflag, xfield) do { \
5144 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
5145 fflags |= (fflag); \
5146 } while (0)
5147 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5148 xvap.xva_xoptattrs.xoa_immutable);
5149 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5150 xvap.xva_xoptattrs.xoa_appendonly);
5151 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5152 xvap.xva_xoptattrs.xoa_nounlink);
5153 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5154 xvap.xva_xoptattrs.xoa_archive);
5155 FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5156 xvap.xva_xoptattrs.xoa_nodump);
5157 FLAG_CHECK(UF_READONLY, XAT_READONLY,
5158 xvap.xva_xoptattrs.xoa_readonly);
5159 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5160 xvap.xva_xoptattrs.xoa_system);
5161 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5162 xvap.xva_xoptattrs.xoa_hidden);
5163 FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5164 xvap.xva_xoptattrs.xoa_reparse);
5165 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5166 xvap.xva_xoptattrs.xoa_offline);
5167 FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5168 xvap.xva_xoptattrs.xoa_sparse);
5169
5170 #undef FLAG_CHECK
5171 *vap = xvap.xva_vattr;
5172 vap->va_flags = fflags;
5173 return (0);
5174 }
5175
5176 static int
zfs_freebsd_setattr(ap)5177 zfs_freebsd_setattr(ap)
5178 struct vop_setattr_args /* {
5179 struct vnode *a_vp;
5180 struct vattr *a_vap;
5181 struct ucred *a_cred;
5182 } */ *ap;
5183 {
5184 vnode_t *vp = ap->a_vp;
5185 vattr_t *vap = ap->a_vap;
5186 cred_t *cred = ap->a_cred;
5187 xvattr_t xvap;
5188 u_long fflags;
5189 uint64_t zflags;
5190
5191 vattr_init_mask(vap);
5192 vap->va_mask &= ~AT_NOSET;
5193
5194 xva_init(&xvap);
5195 xvap.xva_vattr = *vap;
5196
5197 zflags = VTOZ(vp)->z_pflags;
5198
5199 if (vap->va_flags != VNOVAL) {
5200 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5201 int error;
5202
5203 if (zfsvfs->z_use_fuids == B_FALSE)
5204 return (EOPNOTSUPP);
5205
5206 fflags = vap->va_flags;
5207 /*
5208 * XXX KDM
5209 * We need to figure out whether it makes sense to allow
5210 * UF_REPARSE through, since we don't really have other
5211 * facilities to handle reparse points and zfs_setattr()
5212 * doesn't currently allow setting that attribute anyway.
5213 */
5214 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5215 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5216 UF_OFFLINE|UF_SPARSE)) != 0)
5217 return (EOPNOTSUPP);
5218 /*
5219 * Unprivileged processes are not permitted to unset system
5220 * flags, or modify flags if any system flags are set.
5221 * Privileged non-jail processes may not modify system flags
5222 * if securelevel > 0 and any existing system flags are set.
5223 * Privileged jail processes behave like privileged non-jail
5224 * processes if the security.jail.chflags_allowed sysctl is
5225 * is non-zero; otherwise, they behave like unprivileged
5226 * processes.
5227 */
5228 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5229 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5230 if (zflags &
5231 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5232 error = securelevel_gt(cred, 0);
5233 if (error != 0)
5234 return (error);
5235 }
5236 } else {
5237 /*
5238 * Callers may only modify the file flags on objects they
5239 * have VADMIN rights for.
5240 */
5241 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5242 return (error);
5243 if (zflags &
5244 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5245 return (EPERM);
5246 }
5247 if (fflags &
5248 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5249 return (EPERM);
5250 }
5251 }
5252
5253 #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
5254 if (((fflags & (fflag)) && !(zflags & (zflag))) || \
5255 ((zflags & (zflag)) && !(fflags & (fflag)))) { \
5256 XVA_SET_REQ(&xvap, (xflag)); \
5257 (xfield) = ((fflags & (fflag)) != 0); \
5258 } \
5259 } while (0)
5260 /* Convert chflags into ZFS-type flags. */
5261 /* XXX: what about SF_SETTABLE?. */
5262 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5263 xvap.xva_xoptattrs.xoa_immutable);
5264 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5265 xvap.xva_xoptattrs.xoa_appendonly);
5266 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5267 xvap.xva_xoptattrs.xoa_nounlink);
5268 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5269 xvap.xva_xoptattrs.xoa_archive);
5270 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5271 xvap.xva_xoptattrs.xoa_nodump);
5272 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5273 xvap.xva_xoptattrs.xoa_readonly);
5274 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5275 xvap.xva_xoptattrs.xoa_system);
5276 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5277 xvap.xva_xoptattrs.xoa_hidden);
5278 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5279 xvap.xva_xoptattrs.xoa_hidden);
5280 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5281 xvap.xva_xoptattrs.xoa_offline);
5282 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5283 xvap.xva_xoptattrs.xoa_sparse);
5284 #undef FLAG_CHANGE
5285 }
5286 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5287 }
5288
5289 static int
zfs_freebsd_rename(ap)5290 zfs_freebsd_rename(ap)
5291 struct vop_rename_args /* {
5292 struct vnode *a_fdvp;
5293 struct vnode *a_fvp;
5294 struct componentname *a_fcnp;
5295 struct vnode *a_tdvp;
5296 struct vnode *a_tvp;
5297 struct componentname *a_tcnp;
5298 } */ *ap;
5299 {
5300 vnode_t *fdvp = ap->a_fdvp;
5301 vnode_t *fvp = ap->a_fvp;
5302 vnode_t *tdvp = ap->a_tdvp;
5303 vnode_t *tvp = ap->a_tvp;
5304 int error;
5305
5306 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5307 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5308
5309 error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5310 ap->a_tcnp, ap->a_fcnp->cn_cred);
5311
5312 vrele(fdvp);
5313 vrele(fvp);
5314 vrele(tdvp);
5315 if (tvp != NULL)
5316 vrele(tvp);
5317
5318 return (error);
5319 }
5320
5321 static int
zfs_freebsd_symlink(ap)5322 zfs_freebsd_symlink(ap)
5323 struct vop_symlink_args /* {
5324 struct vnode *a_dvp;
5325 struct vnode **a_vpp;
5326 struct componentname *a_cnp;
5327 struct vattr *a_vap;
5328 char *a_target;
5329 } */ *ap;
5330 {
5331 struct componentname *cnp = ap->a_cnp;
5332 vattr_t *vap = ap->a_vap;
5333
5334 ASSERT(cnp->cn_flags & SAVENAME);
5335
5336 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
5337 vattr_init_mask(vap);
5338
5339 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5340 ap->a_target, cnp->cn_cred, cnp->cn_thread));
5341 }
5342
5343 static int
zfs_freebsd_readlink(ap)5344 zfs_freebsd_readlink(ap)
5345 struct vop_readlink_args /* {
5346 struct vnode *a_vp;
5347 struct uio *a_uio;
5348 struct ucred *a_cred;
5349 } */ *ap;
5350 {
5351
5352 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5353 }
5354
5355 static int
zfs_freebsd_link(ap)5356 zfs_freebsd_link(ap)
5357 struct vop_link_args /* {
5358 struct vnode *a_tdvp;
5359 struct vnode *a_vp;
5360 struct componentname *a_cnp;
5361 } */ *ap;
5362 {
5363 struct componentname *cnp = ap->a_cnp;
5364 vnode_t *vp = ap->a_vp;
5365 vnode_t *tdvp = ap->a_tdvp;
5366
5367 if (tdvp->v_mount != vp->v_mount)
5368 return (EXDEV);
5369
5370 ASSERT(cnp->cn_flags & SAVENAME);
5371
5372 return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5373 }
5374
5375 static int
zfs_freebsd_inactive(ap)5376 zfs_freebsd_inactive(ap)
5377 struct vop_inactive_args /* {
5378 struct vnode *a_vp;
5379 struct thread *a_td;
5380 } */ *ap;
5381 {
5382 vnode_t *vp = ap->a_vp;
5383
5384 zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5385 return (0);
5386 }
5387
5388 static int
zfs_freebsd_reclaim(ap)5389 zfs_freebsd_reclaim(ap)
5390 struct vop_reclaim_args /* {
5391 struct vnode *a_vp;
5392 struct thread *a_td;
5393 } */ *ap;
5394 {
5395 vnode_t *vp = ap->a_vp;
5396 znode_t *zp = VTOZ(vp);
5397 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5398
5399 ASSERT(zp != NULL);
5400
5401 /* Destroy the vm object and flush associated pages. */
5402 vnode_destroy_vobject(vp);
5403
5404 /*
5405 * z_teardown_inactive_lock protects from a race with
5406 * zfs_znode_dmu_fini in zfsvfs_teardown during
5407 * force unmount.
5408 */
5409 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5410 if (zp->z_sa_hdl == NULL)
5411 zfs_znode_free(zp);
5412 else
5413 zfs_zinactive(zp);
5414 rw_exit(&zfsvfs->z_teardown_inactive_lock);
5415
5416 vp->v_data = NULL;
5417 return (0);
5418 }
5419
5420 static int
zfs_freebsd_fid(ap)5421 zfs_freebsd_fid(ap)
5422 struct vop_fid_args /* {
5423 struct vnode *a_vp;
5424 struct fid *a_fid;
5425 } */ *ap;
5426 {
5427
5428 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5429 }
5430
5431 static int
zfs_freebsd_pathconf(ap)5432 zfs_freebsd_pathconf(ap)
5433 struct vop_pathconf_args /* {
5434 struct vnode *a_vp;
5435 int a_name;
5436 register_t *a_retval;
5437 } */ *ap;
5438 {
5439 ulong_t val;
5440 int error;
5441
5442 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5443 if (error == 0)
5444 *ap->a_retval = val;
5445 else if (error == EOPNOTSUPP)
5446 error = vop_stdpathconf(ap);
5447 return (error);
5448 }
5449
5450 static int
zfs_freebsd_fifo_pathconf(ap)5451 zfs_freebsd_fifo_pathconf(ap)
5452 struct vop_pathconf_args /* {
5453 struct vnode *a_vp;
5454 int a_name;
5455 register_t *a_retval;
5456 } */ *ap;
5457 {
5458
5459 switch (ap->a_name) {
5460 case _PC_ACL_EXTENDED:
5461 case _PC_ACL_NFS4:
5462 case _PC_ACL_PATH_MAX:
5463 case _PC_MAC_PRESENT:
5464 return (zfs_freebsd_pathconf(ap));
5465 default:
5466 return (fifo_specops.vop_pathconf(ap));
5467 }
5468 }
5469
5470 /*
5471 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5472 * extended attribute name:
5473 *
5474 * NAMESPACE PREFIX
5475 * system freebsd:system:
5476 * user (none, can be used to access ZFS fsattr(5) attributes
5477 * created on Solaris)
5478 */
5479 static int
zfs_create_attrname(int attrnamespace,const char * name,char * attrname,size_t size)5480 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5481 size_t size)
5482 {
5483 const char *namespace, *prefix, *suffix;
5484
5485 /* We don't allow '/' character in attribute name. */
5486 if (strchr(name, '/') != NULL)
5487 return (EINVAL);
5488 /* We don't allow attribute names that start with "freebsd:" string. */
5489 if (strncmp(name, "freebsd:", 8) == 0)
5490 return (EINVAL);
5491
5492 bzero(attrname, size);
5493
5494 switch (attrnamespace) {
5495 case EXTATTR_NAMESPACE_USER:
5496 #if 0
5497 prefix = "freebsd:";
5498 namespace = EXTATTR_NAMESPACE_USER_STRING;
5499 suffix = ":";
5500 #else
5501 /*
5502 * This is the default namespace by which we can access all
5503 * attributes created on Solaris.
5504 */
5505 prefix = namespace = suffix = "";
5506 #endif
5507 break;
5508 case EXTATTR_NAMESPACE_SYSTEM:
5509 prefix = "freebsd:";
5510 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5511 suffix = ":";
5512 break;
5513 case EXTATTR_NAMESPACE_EMPTY:
5514 default:
5515 return (EINVAL);
5516 }
5517 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5518 name) >= size) {
5519 return (ENAMETOOLONG);
5520 }
5521 return (0);
5522 }
5523
5524 /*
5525 * Vnode operating to retrieve a named extended attribute.
5526 */
5527 static int
zfs_getextattr(struct vop_getextattr_args * ap)5528 zfs_getextattr(struct vop_getextattr_args *ap)
5529 /*
5530 vop_getextattr {
5531 IN struct vnode *a_vp;
5532 IN int a_attrnamespace;
5533 IN const char *a_name;
5534 INOUT struct uio *a_uio;
5535 OUT size_t *a_size;
5536 IN struct ucred *a_cred;
5537 IN struct thread *a_td;
5538 };
5539 */
5540 {
5541 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5542 struct thread *td = ap->a_td;
5543 struct nameidata nd;
5544 char attrname[255];
5545 struct vattr va;
5546 vnode_t *xvp = NULL, *vp;
5547 int error, flags;
5548
5549 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5550 ap->a_cred, ap->a_td, VREAD);
5551 if (error != 0)
5552 return (error);
5553
5554 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5555 sizeof(attrname));
5556 if (error != 0)
5557 return (error);
5558
5559 ZFS_ENTER(zfsvfs);
5560
5561 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5562 LOOKUP_XATTR);
5563 if (error != 0) {
5564 ZFS_EXIT(zfsvfs);
5565 return (error);
5566 }
5567
5568 flags = FREAD;
5569 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5570 xvp, td);
5571 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5572 vp = nd.ni_vp;
5573 NDFREE(&nd, NDF_ONLY_PNBUF);
5574 if (error != 0) {
5575 ZFS_EXIT(zfsvfs);
5576 if (error == ENOENT)
5577 error = ENOATTR;
5578 return (error);
5579 }
5580
5581 if (ap->a_size != NULL) {
5582 error = VOP_GETATTR(vp, &va, ap->a_cred);
5583 if (error == 0)
5584 *ap->a_size = (size_t)va.va_size;
5585 } else if (ap->a_uio != NULL)
5586 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5587
5588 VOP_UNLOCK(vp, 0);
5589 vn_close(vp, flags, ap->a_cred, td);
5590 ZFS_EXIT(zfsvfs);
5591
5592 return (error);
5593 }
5594
5595 /*
5596 * Vnode operation to remove a named attribute.
5597 */
5598 int
zfs_deleteextattr(struct vop_deleteextattr_args * ap)5599 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5600 /*
5601 vop_deleteextattr {
5602 IN struct vnode *a_vp;
5603 IN int a_attrnamespace;
5604 IN const char *a_name;
5605 IN struct ucred *a_cred;
5606 IN struct thread *a_td;
5607 };
5608 */
5609 {
5610 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5611 struct thread *td = ap->a_td;
5612 struct nameidata nd;
5613 char attrname[255];
5614 struct vattr va;
5615 vnode_t *xvp = NULL, *vp;
5616 int error, flags;
5617
5618 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5619 ap->a_cred, ap->a_td, VWRITE);
5620 if (error != 0)
5621 return (error);
5622
5623 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5624 sizeof(attrname));
5625 if (error != 0)
5626 return (error);
5627
5628 ZFS_ENTER(zfsvfs);
5629
5630 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5631 LOOKUP_XATTR);
5632 if (error != 0) {
5633 ZFS_EXIT(zfsvfs);
5634 return (error);
5635 }
5636
5637 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5638 UIO_SYSSPACE, attrname, xvp, td);
5639 error = namei(&nd);
5640 vp = nd.ni_vp;
5641 if (error != 0) {
5642 ZFS_EXIT(zfsvfs);
5643 NDFREE(&nd, NDF_ONLY_PNBUF);
5644 if (error == ENOENT)
5645 error = ENOATTR;
5646 return (error);
5647 }
5648
5649 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5650 NDFREE(&nd, NDF_ONLY_PNBUF);
5651
5652 vput(nd.ni_dvp);
5653 if (vp == nd.ni_dvp)
5654 vrele(vp);
5655 else
5656 vput(vp);
5657 ZFS_EXIT(zfsvfs);
5658
5659 return (error);
5660 }
5661
5662 /*
5663 * Vnode operation to set a named attribute.
5664 */
5665 static int
zfs_setextattr(struct vop_setextattr_args * ap)5666 zfs_setextattr(struct vop_setextattr_args *ap)
5667 /*
5668 vop_setextattr {
5669 IN struct vnode *a_vp;
5670 IN int a_attrnamespace;
5671 IN const char *a_name;
5672 INOUT struct uio *a_uio;
5673 IN struct ucred *a_cred;
5674 IN struct thread *a_td;
5675 };
5676 */
5677 {
5678 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5679 struct thread *td = ap->a_td;
5680 struct nameidata nd;
5681 char attrname[255];
5682 struct vattr va;
5683 vnode_t *xvp = NULL, *vp;
5684 int error, flags;
5685
5686 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5687 ap->a_cred, ap->a_td, VWRITE);
5688 if (error != 0)
5689 return (error);
5690
5691 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5692 sizeof(attrname));
5693 if (error != 0)
5694 return (error);
5695
5696 ZFS_ENTER(zfsvfs);
5697
5698 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5699 LOOKUP_XATTR | CREATE_XATTR_DIR);
5700 if (error != 0) {
5701 ZFS_EXIT(zfsvfs);
5702 return (error);
5703 }
5704
5705 flags = FFLAGS(O_WRONLY | O_CREAT);
5706 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5707 xvp, td);
5708 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5709 vp = nd.ni_vp;
5710 NDFREE(&nd, NDF_ONLY_PNBUF);
5711 if (error != 0) {
5712 ZFS_EXIT(zfsvfs);
5713 return (error);
5714 }
5715
5716 VATTR_NULL(&va);
5717 va.va_size = 0;
5718 error = VOP_SETATTR(vp, &va, ap->a_cred);
5719 if (error == 0)
5720 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5721
5722 VOP_UNLOCK(vp, 0);
5723 vn_close(vp, flags, ap->a_cred, td);
5724 ZFS_EXIT(zfsvfs);
5725
5726 return (error);
5727 }
5728
5729 /*
5730 * Vnode operation to retrieve extended attributes on a vnode.
5731 */
5732 static int
zfs_listextattr(struct vop_listextattr_args * ap)5733 zfs_listextattr(struct vop_listextattr_args *ap)
5734 /*
5735 vop_listextattr {
5736 IN struct vnode *a_vp;
5737 IN int a_attrnamespace;
5738 INOUT struct uio *a_uio;
5739 OUT size_t *a_size;
5740 IN struct ucred *a_cred;
5741 IN struct thread *a_td;
5742 };
5743 */
5744 {
5745 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5746 struct thread *td = ap->a_td;
5747 struct nameidata nd;
5748 char attrprefix[16];
5749 u_char dirbuf[sizeof(struct dirent)];
5750 struct dirent *dp;
5751 struct iovec aiov;
5752 struct uio auio, *uio = ap->a_uio;
5753 size_t *sizep = ap->a_size;
5754 size_t plen;
5755 vnode_t *xvp = NULL, *vp;
5756 int done, error, eof, pos;
5757
5758 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5759 ap->a_cred, ap->a_td, VREAD);
5760 if (error != 0)
5761 return (error);
5762
5763 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5764 sizeof(attrprefix));
5765 if (error != 0)
5766 return (error);
5767 plen = strlen(attrprefix);
5768
5769 ZFS_ENTER(zfsvfs);
5770
5771 if (sizep != NULL)
5772 *sizep = 0;
5773
5774 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5775 LOOKUP_XATTR);
5776 if (error != 0) {
5777 ZFS_EXIT(zfsvfs);
5778 /*
5779 * ENOATTR means that the EA directory does not yet exist,
5780 * i.e. there are no extended attributes there.
5781 */
5782 if (error == ENOATTR)
5783 error = 0;
5784 return (error);
5785 }
5786
5787 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5788 UIO_SYSSPACE, ".", xvp, td);
5789 error = namei(&nd);
5790 vp = nd.ni_vp;
5791 NDFREE(&nd, NDF_ONLY_PNBUF);
5792 if (error != 0) {
5793 ZFS_EXIT(zfsvfs);
5794 return (error);
5795 }
5796
5797 auio.uio_iov = &aiov;
5798 auio.uio_iovcnt = 1;
5799 auio.uio_segflg = UIO_SYSSPACE;
5800 auio.uio_td = td;
5801 auio.uio_rw = UIO_READ;
5802 auio.uio_offset = 0;
5803
5804 do {
5805 u_char nlen;
5806
5807 aiov.iov_base = (void *)dirbuf;
5808 aiov.iov_len = sizeof(dirbuf);
5809 auio.uio_resid = sizeof(dirbuf);
5810 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5811 done = sizeof(dirbuf) - auio.uio_resid;
5812 if (error != 0)
5813 break;
5814 for (pos = 0; pos < done;) {
5815 dp = (struct dirent *)(dirbuf + pos);
5816 pos += dp->d_reclen;
5817 /*
5818 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5819 * is what we get when attribute was created on Solaris.
5820 */
5821 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5822 continue;
5823 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5824 continue;
5825 else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5826 continue;
5827 nlen = dp->d_namlen - plen;
5828 if (sizep != NULL)
5829 *sizep += 1 + nlen;
5830 else if (uio != NULL) {
5831 /*
5832 * Format of extattr name entry is one byte for
5833 * length and the rest for name.
5834 */
5835 error = uiomove(&nlen, 1, uio->uio_rw, uio);
5836 if (error == 0) {
5837 error = uiomove(dp->d_name + plen, nlen,
5838 uio->uio_rw, uio);
5839 }
5840 if (error != 0)
5841 break;
5842 }
5843 }
5844 } while (!eof && error == 0);
5845
5846 vput(vp);
5847 ZFS_EXIT(zfsvfs);
5848
5849 return (error);
5850 }
5851
5852 int
zfs_freebsd_getacl(ap)5853 zfs_freebsd_getacl(ap)
5854 struct vop_getacl_args /* {
5855 struct vnode *vp;
5856 acl_type_t type;
5857 struct acl *aclp;
5858 struct ucred *cred;
5859 struct thread *td;
5860 } */ *ap;
5861 {
5862 int error;
5863 vsecattr_t vsecattr;
5864
5865 if (ap->a_type != ACL_TYPE_NFS4)
5866 return (EINVAL);
5867
5868 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5869 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5870 return (error);
5871
5872 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5873 if (vsecattr.vsa_aclentp != NULL)
5874 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5875
5876 return (error);
5877 }
5878
5879 int
zfs_freebsd_setacl(ap)5880 zfs_freebsd_setacl(ap)
5881 struct vop_setacl_args /* {
5882 struct vnode *vp;
5883 acl_type_t type;
5884 struct acl *aclp;
5885 struct ucred *cred;
5886 struct thread *td;
5887 } */ *ap;
5888 {
5889 int error;
5890 vsecattr_t vsecattr;
5891 int aclbsize; /* size of acl list in bytes */
5892 aclent_t *aaclp;
5893
5894 if (ap->a_type != ACL_TYPE_NFS4)
5895 return (EINVAL);
5896
5897 if (ap->a_aclp == NULL)
5898 return (EINVAL);
5899
5900 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5901 return (EINVAL);
5902
5903 /*
5904 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5905 * splitting every entry into two and appending "canonical six"
5906 * entries at the end. Don't allow for setting an ACL that would
5907 * cause chmod(2) to run out of ACL entries.
5908 */
5909 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5910 return (ENOSPC);
5911
5912 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5913 if (error != 0)
5914 return (error);
5915
5916 vsecattr.vsa_mask = VSA_ACE;
5917 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5918 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5919 aaclp = vsecattr.vsa_aclentp;
5920 vsecattr.vsa_aclentsz = aclbsize;
5921
5922 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5923 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5924 kmem_free(aaclp, aclbsize);
5925
5926 return (error);
5927 }
5928
/*
 * VOP_ACLCHECK handler.  The argument is not examined; the operation is
 * always rejected with EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{

	return (EOPNOTSUPP);
}
5942
5943 static int
zfs_vptocnp(struct vop_vptocnp_args * ap)5944 zfs_vptocnp(struct vop_vptocnp_args *ap)
5945 {
5946 vnode_t *covered_vp;
5947 vnode_t *vp = ap->a_vp;;
5948 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5949 znode_t *zp = VTOZ(vp);
5950 int ltype;
5951 int error;
5952
5953 ZFS_ENTER(zfsvfs);
5954 ZFS_VERIFY_ZP(zp);
5955
5956 /*
5957 * If we are a snapshot mounted under .zfs, run the operation
5958 * on the covered vnode.
5959 */
5960 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5961 char name[MAXNAMLEN + 1];
5962 znode_t *dzp;
5963 size_t len;
5964
5965 error = zfs_znode_parent_and_name(zp, &dzp, name);
5966 if (error == 0) {
5967 len = strlen(name);
5968 if (*ap->a_buflen < len)
5969 error = SET_ERROR(ENOMEM);
5970 }
5971 if (error == 0) {
5972 *ap->a_buflen -= len;
5973 bcopy(name, ap->a_buf + *ap->a_buflen, len);
5974 *ap->a_vpp = ZTOV(dzp);
5975 }
5976 ZFS_EXIT(zfsvfs);
5977 return (error);
5978 }
5979 ZFS_EXIT(zfsvfs);
5980
5981 covered_vp = vp->v_mount->mnt_vnodecovered;
5982 vhold(covered_vp);
5983 ltype = VOP_ISLOCKED(vp);
5984 VOP_UNLOCK(vp, 0);
5985 error = vget(covered_vp, LK_SHARED, curthread);
5986 vdrop(covered_vp);
5987 if (error == 0) {
5988 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5989 ap->a_buf, ap->a_buflen);
5990 vput(covered_vp);
5991 }
5992 vn_lock(vp, ltype | LK_RETRY);
5993 if ((vp->v_iflag & VI_DOOMED) != 0)
5994 error = SET_ERROR(ENOENT);
5995 return (error);
5996 }
5997
#ifdef DIAGNOSTIC
/*
 * VOP_LOCK1 handler, compiled only under DIAGNOSTIC.
 *
 * The lock itself is taken by vop_stdlock().  After a successful
 * acquisition that was not requested with LK_NOWAIT, this verifies that
 * z_teardown_lock of the owning zfsvfs is not held according to
 * RRM_LOCK_HELD().  The check is skipped for vnodes without a mount, doomed
 * vnodes, vnodes without a znode and znodes flagged ZFS_XATTR.
 *
 * Returns whatever vop_stdlock() returned.
 */
static int
zfs_lock(struct vop_lock1_args *ap)
{
	vnode_t *vp;
	znode_t *zp;
	int err;

	err = vop_stdlock(ap);
	if (err != 0 || (ap->a_flags & LK_NOWAIT) != 0)
		return (err);

	vp = ap->a_vp;
	zp = vp->v_data;
	if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
	    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));

	return (err);
}
#endif
6023
6024 struct vop_vector zfs_vnodeops;
6025 struct vop_vector zfs_fifoops;
6026 struct vop_vector zfs_shareops;
6027
6028 struct vop_vector zfs_vnodeops = {
6029 .vop_default = &default_vnodeops,
6030 .vop_inactive = zfs_freebsd_inactive,
6031 .vop_reclaim = zfs_freebsd_reclaim,
6032 .vop_access = zfs_freebsd_access,
6033 .vop_lookup = zfs_cache_lookup,
6034 .vop_cachedlookup = zfs_freebsd_lookup,
6035 .vop_getattr = zfs_freebsd_getattr,
6036 .vop_setattr = zfs_freebsd_setattr,
6037 .vop_create = zfs_freebsd_create,
6038 .vop_mknod = zfs_freebsd_create,
6039 .vop_mkdir = zfs_freebsd_mkdir,
6040 .vop_readdir = zfs_freebsd_readdir,
6041 .vop_fsync = zfs_freebsd_fsync,
6042 .vop_open = zfs_freebsd_open,
6043 .vop_close = zfs_freebsd_close,
6044 .vop_rmdir = zfs_freebsd_rmdir,
6045 .vop_ioctl = zfs_freebsd_ioctl,
6046 .vop_link = zfs_freebsd_link,
6047 .vop_symlink = zfs_freebsd_symlink,
6048 .vop_readlink = zfs_freebsd_readlink,
6049 .vop_read = zfs_freebsd_read,
6050 .vop_write = zfs_freebsd_write,
6051 .vop_remove = zfs_freebsd_remove,
6052 .vop_rename = zfs_freebsd_rename,
6053 .vop_pathconf = zfs_freebsd_pathconf,
6054 .vop_bmap = zfs_freebsd_bmap,
6055 .vop_fid = zfs_freebsd_fid,
6056 .vop_getextattr = zfs_getextattr,
6057 .vop_deleteextattr = zfs_deleteextattr,
6058 .vop_setextattr = zfs_setextattr,
6059 .vop_listextattr = zfs_listextattr,
6060 .vop_getacl = zfs_freebsd_getacl,
6061 .vop_setacl = zfs_freebsd_setacl,
6062 .vop_aclcheck = zfs_freebsd_aclcheck,
6063 .vop_getpages = zfs_freebsd_getpages,
6064 .vop_putpages = zfs_freebsd_putpages,
6065 .vop_vptocnp = zfs_vptocnp,
6066 #ifdef DIAGNOSTIC
6067 .vop_lock1 = zfs_lock,
6068 #endif
6069 };
6070
6071 struct vop_vector zfs_fifoops = {
6072 .vop_default = &fifo_specops,
6073 .vop_fsync = zfs_freebsd_fsync,
6074 .vop_access = zfs_freebsd_access,
6075 .vop_getattr = zfs_freebsd_getattr,
6076 .vop_inactive = zfs_freebsd_inactive,
6077 .vop_read = VOP_PANIC,
6078 .vop_reclaim = zfs_freebsd_reclaim,
6079 .vop_setattr = zfs_freebsd_setattr,
6080 .vop_write = VOP_PANIC,
6081 .vop_pathconf = zfs_freebsd_fifo_pathconf,
6082 .vop_fid = zfs_freebsd_fid,
6083 .vop_getacl = zfs_freebsd_getacl,
6084 .vop_setacl = zfs_freebsd_setacl,
6085 .vop_aclcheck = zfs_freebsd_aclcheck,
6086 };
6087
6088 /*
6089 * special share hidden files vnode operations template
6090 */
6091 struct vop_vector zfs_shareops = {
6092 .vop_default = &default_vnodeops,
6093 .vop_access = zfs_freebsd_access,
6094 .vop_inactive = zfs_freebsd_inactive,
6095 .vop_reclaim = zfs_freebsd_reclaim,
6096 .vop_fid = zfs_freebsd_fid,
6097 .vop_pathconf = zfs_freebsd_pathconf,
6098 };
6099