1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  */
27 
28 /* Portions Copyright 2007 Jeremy Teo */
29 /* Portions Copyright 2010 Robert Milkowski */
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/vfs.h>
38 #include <sys/vm.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/stat.h>
42 #include <sys/kmem.h>
43 #include <sys/taskq.h>
44 #include <sys/uio.h>
45 #include <sys/atomic.h>
46 #include <sys/namei.h>
47 #include <sys/mman.h>
48 #include <sys/cmn_err.h>
49 #include <sys/errno.h>
50 #include <sys/unistd.h>
51 #include <sys/zfs_dir.h>
52 #include <sys/zfs_ioctl.h>
53 #include <sys/fs/zfs.h>
54 #include <sys/dmu.h>
55 #include <sys/dmu_objset.h>
56 #include <sys/spa.h>
57 #include <sys/txg.h>
58 #include <sys/dbuf.h>
59 #include <sys/zap.h>
60 #include <sys/sa.h>
61 #include <sys/dirent.h>
62 #include <sys/policy.h>
63 #include <sys/sunddi.h>
64 #include <sys/filio.h>
65 #include <sys/sid.h>
66 #include <sys/zfs_ctldir.h>
67 #include <sys/zfs_fuid.h>
68 #include <sys/zfs_sa.h>
69 #include <sys/zfs_rlock.h>
70 #include <sys/extdirent.h>
71 #include <sys/kidmap.h>
72 #include <sys/bio.h>
73 #include <sys/buf.h>
74 #include <sys/sched.h>
75 #include <sys/acl.h>
76 #include <vm/vm_param.h>
77 
78 /*
79  * Programming rules.
80  *
81  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
82  * properly lock its in-core state, create a DMU transaction, do the work,
83  * record this work in the intent log (ZIL), commit the DMU transaction,
84  * and wait for the intent log to commit if it is a synchronous operation.
85  * Moreover, the vnode ops must work in both normal and log replay context.
86  * The ordering of events is important to avoid deadlocks and references
87  * to freed memory.  The example below illustrates the following Big Rules:
88  *
89  *  (1)	A check must be made in each zfs thread for a mounted file system.
90  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
91  *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
92  *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
93  *	can return EIO from the calling function.
94  *
95  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
96  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
97  *	First, if it's the last reference, the vnode/znode
98  *	can be freed, so the zp may point to freed memory.  Second, the last
99  *	reference will call zfs_zinactive(), which may induce a lot of work --
100  *	pushing cached pages (which acquires range locks) and syncing out
101  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
102  *	which could deadlock the system if you were already holding one.
103  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
104  *
105  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
106  *	as they can span dmu_tx_assign() calls.
107  *
108  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
109  *      dmu_tx_assign().  This is critical because we don't want to block
110  *      while holding locks.
111  *
112  *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
113  *	reduces lock contention and CPU usage when we must wait (note that if
114  *	throughput is constrained by the storage, nearly every transaction
115  *	must wait).
116  *
117  *      Note, in particular, that if a lock is sometimes acquired before
118  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
119  *      to use a non-blocking assign can deadlock the system.  The scenario:
120  *
121  *	Thread A has grabbed a lock before calling dmu_tx_assign().
122  *	Thread B is in an already-assigned tx, and blocks for this lock.
123  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
124  *	forever, because the previous txg can't quiesce until B's tx commits.
125  *
126  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
127  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
128  *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
129  *	to indicate that this operation has already called dmu_tx_wait().
130  *	This will ensure that we don't retry forever, waiting a short bit
131  *	each time.
132  *
133  *  (5)	If the operation succeeded, generate the intent log entry for it
134  *	before dropping locks.  This ensures that the ordering of events
135  *	in the intent log matches the order in which they actually occurred.
136  *	During ZIL replay the zfs_log_* functions will update the sequence
137  *	number to indicate the zil transaction has replayed.
138  *
139  *  (6)	At the end of each vnode op, the DMU tx must always commit,
140  *	regardless of whether there were any errors.
141  *
142  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
143  *	to ensure that synchronous semantics are provided when necessary.
144  *
145  * In general, this is how things should be ordered in each vnode op:
146  *
147  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
148  * top:
149  *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
150  *	rw_enter(...);			// grab any other locks you need
151  *	tx = dmu_tx_create(...);	// get DMU tx
152  *	dmu_tx_hold_*();		// hold each object you might modify
153  *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
154  *	if (error) {
155  *		rw_exit(...);		// drop locks
156  *		zfs_dirent_unlock(dl);	// unlock directory entry
157  *		VN_RELE(...);		// release held vnodes
158  *		if (error == ERESTART) {
159  *			waited = B_TRUE;
160  *			dmu_tx_wait(tx);
161  *			dmu_tx_abort(tx);
162  *			goto top;
163  *		}
164  *		dmu_tx_abort(tx);	// abort DMU tx
165  *		ZFS_EXIT(zfsvfs);	// finished in zfs
166  *		return (error);		// really out of space
167  *	}
168  *	error = do_real_work();		// do whatever this VOP does
169  *	if (error == 0)
170  *		zfs_log_*(...);		// on success, make ZIL entry
171  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
172  *	rw_exit(...);			// drop locks
173  *	zfs_dirent_unlock(dl);		// unlock directory entry
174  *	VN_RELE(...);			// release held vnodes
175  *	zil_commit(zilog, foid);	// synchronous when necessary
176  *	ZFS_EXIT(zfsvfs);		// finished in zfs
177  *	return (error);			// done, report error
178  */
179 
180 /* ARGSUSED */
181 static int
zfs_open(vnode_t ** vpp,int flag,cred_t * cr,caller_context_t * ct)182 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
183 {
184 	znode_t	*zp = VTOZ(*vpp);
185 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
186 
187 	ZFS_ENTER(zfsvfs);
188 	ZFS_VERIFY_ZP(zp);
189 
190 	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
191 	    ((flag & FAPPEND) == 0)) {
192 		ZFS_EXIT(zfsvfs);
193 		return (SET_ERROR(EPERM));
194 	}
195 
196 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
197 	    ZTOV(zp)->v_type == VREG &&
198 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
199 		if (fs_vscan(*vpp, cr, 0) != 0) {
200 			ZFS_EXIT(zfsvfs);
201 			return (SET_ERROR(EACCES));
202 		}
203 	}
204 
205 	/* Keep a count of the synchronous opens in the znode */
206 	if (flag & (FSYNC | FDSYNC))
207 		atomic_inc_32(&zp->z_sync_cnt);
208 
209 	ZFS_EXIT(zfsvfs);
210 	return (0);
211 }
212 
213 /* ARGSUSED */
214 static int
zfs_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr,caller_context_t * ct)215 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
216     caller_context_t *ct)
217 {
218 	znode_t	*zp = VTOZ(vp);
219 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
220 
221 	/*
222 	 * Clean up any locks held by this process on the vp.
223 	 */
224 	cleanlocks(vp, ddi_get_pid(), 0);
225 	cleanshares(vp, ddi_get_pid());
226 
227 	ZFS_ENTER(zfsvfs);
228 	ZFS_VERIFY_ZP(zp);
229 
230 	/* Decrement the synchronous opens in the znode */
231 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
232 		atomic_dec_32(&zp->z_sync_cnt);
233 
234 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
235 	    ZTOV(zp)->v_type == VREG &&
236 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
237 		VERIFY(fs_vscan(vp, cr, 1) == 0);
238 
239 	ZFS_EXIT(zfsvfs);
240 	return (0);
241 }
242 
243 /*
244  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
245  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
246  */
247 static int
zfs_holey(vnode_t * vp,u_long cmd,offset_t * off)248 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
249 {
250 	znode_t	*zp = VTOZ(vp);
251 	uint64_t noff = (uint64_t)*off; /* new offset */
252 	uint64_t file_sz;
253 	int error;
254 	boolean_t hole;
255 
256 	file_sz = zp->z_size;
257 	if (noff >= file_sz)  {
258 		return (SET_ERROR(ENXIO));
259 	}
260 
261 	if (cmd == _FIO_SEEK_HOLE)
262 		hole = B_TRUE;
263 	else
264 		hole = B_FALSE;
265 
266 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
267 
268 	if (error == ESRCH)
269 		return (SET_ERROR(ENXIO));
270 
271 	/*
272 	 * We could find a hole that begins after the logical end-of-file,
273 	 * because dmu_offset_next() only works on whole blocks.  If the
274 	 * EOF falls mid-block, then indicate that the "virtual hole"
275 	 * at the end of the file begins at the logical EOF, rather than
276 	 * at the end of the last block.
277 	 */
278 	if (noff > file_sz) {
279 		ASSERT(hole);
280 		noff = file_sz;
281 	}
282 
283 	if (noff < *off)
284 		return (error);
285 	*off = noff;
286 	return (error);
287 }
288 
289 /* ARGSUSED */
290 static int
zfs_ioctl(vnode_t * vp,u_long com,intptr_t data,int flag,cred_t * cred,int * rvalp,caller_context_t * ct)291 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
292     int *rvalp, caller_context_t *ct)
293 {
294 	offset_t off;
295 	offset_t ndata;
296 	dmu_object_info_t doi;
297 	int error;
298 	zfsvfs_t *zfsvfs;
299 	znode_t *zp;
300 
301 	switch (com) {
302 	case _FIOFFS:
303 	{
304 		return (0);
305 
306 		/*
307 		 * The following two ioctls are used by bfu.  Faking out,
308 		 * necessary to avoid bfu errors.
309 		 */
310 	}
311 	case _FIOGDIO:
312 	case _FIOSDIO:
313 	{
314 		return (0);
315 	}
316 
317 	case _FIO_SEEK_DATA:
318 	case _FIO_SEEK_HOLE:
319 	{
320 #ifdef illumos
321 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
322 			return (SET_ERROR(EFAULT));
323 #else
324 		off = *(offset_t *)data;
325 #endif
326 		zp = VTOZ(vp);
327 		zfsvfs = zp->z_zfsvfs;
328 		ZFS_ENTER(zfsvfs);
329 		ZFS_VERIFY_ZP(zp);
330 
331 		/* offset parameter is in/out */
332 		error = zfs_holey(vp, com, &off);
333 		ZFS_EXIT(zfsvfs);
334 		if (error)
335 			return (error);
336 #ifdef illumos
337 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
338 			return (SET_ERROR(EFAULT));
339 #else
340 		*(offset_t *)data = off;
341 #endif
342 		return (0);
343 	}
344 #ifdef illumos
345 	case _FIO_COUNT_FILLED:
346 	{
347 		/*
348 		 * _FIO_COUNT_FILLED adds a new ioctl command which
349 		 * exposes the number of filled blocks in a
350 		 * ZFS object.
351 		 */
352 		zp = VTOZ(vp);
353 		zfsvfs = zp->z_zfsvfs;
354 		ZFS_ENTER(zfsvfs);
355 		ZFS_VERIFY_ZP(zp);
356 
357 		/*
358 		 * Wait for all dirty blocks for this object
359 		 * to get synced out to disk, and the DMU info
360 		 * updated.
361 		 */
362 		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
363 		if (error) {
364 			ZFS_EXIT(zfsvfs);
365 			return (error);
366 		}
367 
368 		/*
369 		 * Retrieve fill count from DMU object.
370 		 */
371 		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
372 		if (error) {
373 			ZFS_EXIT(zfsvfs);
374 			return (error);
375 		}
376 
377 		ndata = doi.doi_fill_count;
378 
379 		ZFS_EXIT(zfsvfs);
380 		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
381 			return (SET_ERROR(EFAULT));
382 		return (0);
383 	}
384 #endif
385 	}
386 	return (SET_ERROR(ENOTTY));
387 }
388 
389 static vm_page_t
page_busy(vnode_t * vp,int64_t start,int64_t off,int64_t nbytes)390 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
391 {
392 	vm_object_t obj;
393 	vm_page_t pp;
394 	int64_t end;
395 
396 	/*
397 	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
398 	 * aligned boundaries, if the range is not aligned.  As a result a
399 	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
400 	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
401 	 * the whole page would be considred clean despite have some dirty data.
402 	 * For this reason we should shrink the range to DEV_BSIZE aligned
403 	 * boundaries before calling vm_page_clear_dirty.
404 	 */
405 	end = rounddown2(off + nbytes, DEV_BSIZE);
406 	off = roundup2(off, DEV_BSIZE);
407 	nbytes = end - off;
408 
409 	obj = vp->v_object;
410 	zfs_vmobject_assert_wlocked(obj);
411 
412 	for (;;) {
413 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
414 		    pp->valid) {
415 			if (vm_page_xbusied(pp)) {
416 				/*
417 				 * Reference the page before unlocking and
418 				 * sleeping so that the page daemon is less
419 				 * likely to reclaim it.
420 				 */
421 				vm_page_reference(pp);
422 				vm_page_lock(pp);
423 				zfs_vmobject_wunlock(obj);
424 				vm_page_busy_sleep(pp, "zfsmwb", true);
425 				zfs_vmobject_wlock(obj);
426 				continue;
427 			}
428 			vm_page_sbusy(pp);
429 		} else if (pp == NULL) {
430 			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
431 			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
432 			    VM_ALLOC_SBUSY);
433 		} else {
434 			ASSERT(pp != NULL && !pp->valid);
435 			pp = NULL;
436 		}
437 
438 		if (pp != NULL) {
439 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
440 			vm_object_pip_add(obj, 1);
441 			pmap_remove_write(pp);
442 			if (nbytes != 0)
443 				vm_page_clear_dirty(pp, off, nbytes);
444 		}
445 		break;
446 	}
447 	return (pp);
448 }
449 
450 static void
page_unbusy(vm_page_t pp)451 page_unbusy(vm_page_t pp)
452 {
453 
454 	vm_page_sunbusy(pp);
455 	vm_object_pip_subtract(pp->object, 1);
456 }
457 
458 static vm_page_t
page_hold(vnode_t * vp,int64_t start)459 page_hold(vnode_t *vp, int64_t start)
460 {
461 	vm_object_t obj;
462 	vm_page_t pp;
463 
464 	obj = vp->v_object;
465 	zfs_vmobject_assert_wlocked(obj);
466 
467 	for (;;) {
468 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
469 		    pp->valid) {
470 			if (vm_page_xbusied(pp)) {
471 				/*
472 				 * Reference the page before unlocking and
473 				 * sleeping so that the page daemon is less
474 				 * likely to reclaim it.
475 				 */
476 				vm_page_reference(pp);
477 				vm_page_lock(pp);
478 				zfs_vmobject_wunlock(obj);
479 				vm_page_busy_sleep(pp, "zfsmwb", true);
480 				zfs_vmobject_wlock(obj);
481 				continue;
482 			}
483 
484 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
485 			vm_page_lock(pp);
486 			vm_page_hold(pp);
487 			vm_page_unlock(pp);
488 
489 		} else
490 			pp = NULL;
491 		break;
492 	}
493 	return (pp);
494 }
495 
496 static void
page_unhold(vm_page_t pp)497 page_unhold(vm_page_t pp)
498 {
499 
500 	vm_page_lock(pp);
501 	vm_page_unhold(pp);
502 	vm_page_unlock(pp);
503 }
504 
505 /*
506  * When a file is memory mapped, we must keep the IO data synchronized
507  * between the DMU cache and the memory mapped pages.  What this means:
508  *
509  * On Write:	If we find a memory mapped page, we write to *both*
510  *		the page and the dmu buffer.
511  */
512 static void
update_pages(vnode_t * vp,int64_t start,int len,objset_t * os,uint64_t oid,int segflg,dmu_tx_t * tx)513 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
514     int segflg, dmu_tx_t *tx)
515 {
516 	vm_object_t obj;
517 	struct sf_buf *sf;
518 	caddr_t va;
519 	int off;
520 
521 	ASSERT(segflg != UIO_NOCOPY);
522 	ASSERT(vp->v_mount != NULL);
523 	obj = vp->v_object;
524 	ASSERT(obj != NULL);
525 
526 	off = start & PAGEOFFSET;
527 	zfs_vmobject_wlock(obj);
528 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
529 		vm_page_t pp;
530 		int nbytes = imin(PAGESIZE - off, len);
531 
532 		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
533 			zfs_vmobject_wunlock(obj);
534 
535 			va = zfs_map_page(pp, &sf);
536 			(void) dmu_read(os, oid, start+off, nbytes,
537 			    va+off, DMU_READ_PREFETCH);;
538 			zfs_unmap_page(sf);
539 
540 			zfs_vmobject_wlock(obj);
541 			page_unbusy(pp);
542 		}
543 		len -= nbytes;
544 		off = 0;
545 	}
546 	vm_object_pip_wakeupn(obj, 0);
547 	zfs_vmobject_wunlock(obj);
548 }
549 
550 /*
551  * Read with UIO_NOCOPY flag means that sendfile(2) requests
552  * ZFS to populate a range of page cache pages with data.
553  *
554  * NOTE: this function could be optimized to pre-allocate
555  * all pages in advance, drain exclusive busy on all of them,
556  * map them into contiguous KVA region and populate them
557  * in one single dmu_read() call.
558  */
559 static int
mappedread_sf(vnode_t * vp,int nbytes,uio_t * uio)560 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
561 {
562 	znode_t *zp = VTOZ(vp);
563 	objset_t *os = zp->z_zfsvfs->z_os;
564 	struct sf_buf *sf;
565 	vm_object_t obj;
566 	vm_page_t pp;
567 	int64_t start;
568 	caddr_t va;
569 	int len = nbytes;
570 	int off;
571 	int error = 0;
572 
573 	ASSERT(uio->uio_segflg == UIO_NOCOPY);
574 	ASSERT(vp->v_mount != NULL);
575 	obj = vp->v_object;
576 	ASSERT(obj != NULL);
577 	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
578 
579 	zfs_vmobject_wlock(obj);
580 	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
581 		int bytes = MIN(PAGESIZE, len);
582 
583 		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
584 		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
585 		if (pp->valid == 0) {
586 			zfs_vmobject_wunlock(obj);
587 			va = zfs_map_page(pp, &sf);
588 			error = dmu_read(os, zp->z_id, start, bytes, va,
589 			    DMU_READ_PREFETCH);
590 			if (bytes != PAGESIZE && error == 0)
591 				bzero(va + bytes, PAGESIZE - bytes);
592 			zfs_unmap_page(sf);
593 			zfs_vmobject_wlock(obj);
594 			vm_page_sunbusy(pp);
595 			vm_page_lock(pp);
596 			if (error) {
597 				if (pp->wire_count == 0 && pp->valid == 0 &&
598 				    !vm_page_busied(pp))
599 					vm_page_free(pp);
600 			} else {
601 				pp->valid = VM_PAGE_BITS_ALL;
602 				vm_page_activate(pp);
603 			}
604 			vm_page_unlock(pp);
605 		} else {
606 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
607 			vm_page_sunbusy(pp);
608 		}
609 		if (error)
610 			break;
611 		uio->uio_resid -= bytes;
612 		uio->uio_offset += bytes;
613 		len -= bytes;
614 	}
615 	zfs_vmobject_wunlock(obj);
616 	return (error);
617 }
618 
619 /*
620  * When a file is memory mapped, we must keep the IO data synchronized
621  * between the DMU cache and the memory mapped pages.  What this means:
622  *
623  * On Read:	We "read" preferentially from memory mapped pages,
624  *		else we default from the dmu buffer.
625  *
626  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
627  *	 the file is memory mapped.
628  */
629 static int
mappedread(vnode_t * vp,int nbytes,uio_t * uio)630 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
631 {
632 	znode_t *zp = VTOZ(vp);
633 	vm_object_t obj;
634 	int64_t start;
635 	caddr_t va;
636 	int len = nbytes;
637 	int off;
638 	int error = 0;
639 
640 	ASSERT(vp->v_mount != NULL);
641 	obj = vp->v_object;
642 	ASSERT(obj != NULL);
643 
644 	start = uio->uio_loffset;
645 	off = start & PAGEOFFSET;
646 	zfs_vmobject_wlock(obj);
647 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
648 		vm_page_t pp;
649 		uint64_t bytes = MIN(PAGESIZE - off, len);
650 
651 		if (pp = page_hold(vp, start)) {
652 			struct sf_buf *sf;
653 			caddr_t va;
654 
655 			zfs_vmobject_wunlock(obj);
656 			va = zfs_map_page(pp, &sf);
657 #ifdef illumos
658 			error = uiomove(va + off, bytes, UIO_READ, uio);
659 #else
660 			error = vn_io_fault_uiomove(va + off, bytes, uio);
661 #endif
662 			zfs_unmap_page(sf);
663 			zfs_vmobject_wlock(obj);
664 			page_unhold(pp);
665 		} else {
666 			zfs_vmobject_wunlock(obj);
667 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
668 			    uio, bytes);
669 			zfs_vmobject_wlock(obj);
670 		}
671 		len -= bytes;
672 		off = 0;
673 		if (error)
674 			break;
675 	}
676 	zfs_vmobject_wunlock(obj);
677 	return (error);
678 }
679 
680 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
681 
682 /*
683  * Read bytes from specified file into supplied buffer.
684  *
685  *	IN:	vp	- vnode of file to be read from.
686  *		uio	- structure supplying read location, range info,
687  *			  and return buffer.
688  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
689  *		cr	- credentials of caller.
690  *		ct	- caller context
691  *
692  *	OUT:	uio	- updated offset and range, buffer filled.
693  *
694  *	RETURN:	0 on success, error code on failure.
695  *
696  * Side Effects:
697  *	vp - atime updated if byte count > 0
698  */
699 /* ARGSUSED */
700 static int
zfs_read(vnode_t * vp,uio_t * uio,int ioflag,cred_t * cr,caller_context_t * ct)701 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
702 {
703 	znode_t		*zp = VTOZ(vp);
704 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
705 	ssize_t		n, nbytes;
706 	int		error = 0;
707 	rl_t		*rl;
708 	xuio_t		*xuio = NULL;
709 
710 	ZFS_ENTER(zfsvfs);
711 	ZFS_VERIFY_ZP(zp);
712 
713 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
714 		ZFS_EXIT(zfsvfs);
715 		return (SET_ERROR(EACCES));
716 	}
717 
718 	/*
719 	 * Validate file offset
720 	 */
721 	if (uio->uio_loffset < (offset_t)0) {
722 		ZFS_EXIT(zfsvfs);
723 		return (SET_ERROR(EINVAL));
724 	}
725 
726 	/*
727 	 * Fasttrack empty reads
728 	 */
729 	if (uio->uio_resid == 0) {
730 		ZFS_EXIT(zfsvfs);
731 		return (0);
732 	}
733 
734 	/*
735 	 * Check for mandatory locks
736 	 */
737 	if (MANDMODE(zp->z_mode)) {
738 		if (error = chklock(vp, FREAD,
739 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
740 			ZFS_EXIT(zfsvfs);
741 			return (error);
742 		}
743 	}
744 
745 	/*
746 	 * If we're in FRSYNC mode, sync out this znode before reading it.
747 	 */
748 	if (zfsvfs->z_log &&
749 	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
750 		zil_commit(zfsvfs->z_log, zp->z_id);
751 
752 	/*
753 	 * Lock the range against changes.
754 	 */
755 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
756 
757 	/*
758 	 * If we are reading past end-of-file we can skip
759 	 * to the end; but we might still need to set atime.
760 	 */
761 	if (uio->uio_loffset >= zp->z_size) {
762 		error = 0;
763 		goto out;
764 	}
765 
766 	ASSERT(uio->uio_loffset < zp->z_size);
767 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
768 
769 #ifdef illumos
770 	if ((uio->uio_extflg == UIO_XUIO) &&
771 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
772 		int nblk;
773 		int blksz = zp->z_blksz;
774 		uint64_t offset = uio->uio_loffset;
775 
776 		xuio = (xuio_t *)uio;
777 		if ((ISP2(blksz))) {
778 			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
779 			    blksz)) / blksz;
780 		} else {
781 			ASSERT(offset + n <= blksz);
782 			nblk = 1;
783 		}
784 		(void) dmu_xuio_init(xuio, nblk);
785 
786 		if (vn_has_cached_data(vp)) {
787 			/*
788 			 * For simplicity, we always allocate a full buffer
789 			 * even if we only expect to read a portion of a block.
790 			 */
791 			while (--nblk >= 0) {
792 				(void) dmu_xuio_add(xuio,
793 				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
794 				    blksz), 0, blksz);
795 			}
796 		}
797 	}
798 #endif	/* illumos */
799 
800 	while (n > 0) {
801 		nbytes = MIN(n, zfs_read_chunk_size -
802 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
803 
804 #ifdef __FreeBSD__
805 		if (uio->uio_segflg == UIO_NOCOPY)
806 			error = mappedread_sf(vp, nbytes, uio);
807 		else
808 #endif /* __FreeBSD__ */
809 		if (vn_has_cached_data(vp)) {
810 			error = mappedread(vp, nbytes, uio);
811 		} else {
812 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
813 			    uio, nbytes);
814 		}
815 		if (error) {
816 			/* convert checksum errors into IO errors */
817 			if (error == ECKSUM)
818 				error = SET_ERROR(EIO);
819 			break;
820 		}
821 
822 		n -= nbytes;
823 	}
824 out:
825 	zfs_range_unlock(rl);
826 
827 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
828 	ZFS_EXIT(zfsvfs);
829 	return (error);
830 }
831 
832 /*
833  * Write the bytes to a file.
834  *
835  *	IN:	vp	- vnode of file to be written to.
836  *		uio	- structure supplying write location, range info,
837  *			  and data buffer.
838  *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
839  *			  set if in append mode.
840  *		cr	- credentials of caller.
841  *		ct	- caller context (NFS/CIFS fem monitor only)
842  *
843  *	OUT:	uio	- updated offset and range.
844  *
845  *	RETURN:	0 on success, error code on failure.
846  *
847  * Timestamps:
848  *	vp - ctime|mtime updated if byte count > 0
849  */
850 
851 /* ARGSUSED */
852 static int
zfs_write(vnode_t * vp,uio_t * uio,int ioflag,cred_t * cr,caller_context_t * ct)853 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
854 {
855 	znode_t		*zp = VTOZ(vp);
856 	rlim64_t	limit = MAXOFFSET_T;
857 	ssize_t		start_resid = uio->uio_resid;
858 	ssize_t		tx_bytes;
859 	uint64_t	end_size;
860 	dmu_tx_t	*tx;
861 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
862 	zilog_t		*zilog;
863 	offset_t	woff;
864 	ssize_t		n, nbytes;
865 	rl_t		*rl;
866 	int		max_blksz = zfsvfs->z_max_blksz;
867 	int		error = 0;
868 	arc_buf_t	*abuf;
869 	iovec_t		*aiov = NULL;
870 	xuio_t		*xuio = NULL;
871 	int		i_iov = 0;
872 	int		iovcnt = uio->uio_iovcnt;
873 	iovec_t		*iovp = uio->uio_iov;
874 	int		write_eof;
875 	int		count = 0;
876 	sa_bulk_attr_t	bulk[4];
877 	uint64_t	mtime[2], ctime[2];
878 
879 	/*
880 	 * Fasttrack empty write
881 	 */
882 	n = start_resid;
883 	if (n == 0)
884 		return (0);
885 
886 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
887 		limit = MAXOFFSET_T;
888 
889 	ZFS_ENTER(zfsvfs);
890 	ZFS_VERIFY_ZP(zp);
891 
892 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
893 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
894 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
895 	    &zp->z_size, 8);
896 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
897 	    &zp->z_pflags, 8);
898 
899 	/*
900 	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
901 	 * callers might not be able to detect properly that we are read-only,
902 	 * so check it explicitly here.
903 	 */
904 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
905 		ZFS_EXIT(zfsvfs);
906 		return (SET_ERROR(EROFS));
907 	}
908 
909 	/*
910 	 * If immutable or not appending then return EPERM
911 	 */
912 	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
913 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
914 	    (uio->uio_loffset < zp->z_size))) {
915 		ZFS_EXIT(zfsvfs);
916 		return (SET_ERROR(EPERM));
917 	}
918 
919 	zilog = zfsvfs->z_log;
920 
921 	/*
922 	 * Validate file offset
923 	 */
924 	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
925 	if (woff < 0) {
926 		ZFS_EXIT(zfsvfs);
927 		return (SET_ERROR(EINVAL));
928 	}
929 
930 	/*
931 	 * Check for mandatory locks before calling zfs_range_lock()
932 	 * in order to prevent a deadlock with locks set via fcntl().
933 	 */
934 	if (MANDMODE((mode_t)zp->z_mode) &&
935 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
936 		ZFS_EXIT(zfsvfs);
937 		return (error);
938 	}
939 
940 #ifdef illumos
941 	/*
942 	 * Pre-fault the pages to ensure slow (eg NFS) pages
943 	 * don't hold up txg.
944 	 * Skip this if uio contains loaned arc_buf.
945 	 */
946 	if ((uio->uio_extflg == UIO_XUIO) &&
947 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
948 		xuio = (xuio_t *)uio;
949 	else
950 		uio_prefaultpages(MIN(n, max_blksz), uio);
951 #endif
952 
953 	/*
954 	 * If in append mode, set the io offset pointer to eof.
955 	 */
956 	if (ioflag & FAPPEND) {
957 		/*
958 		 * Obtain an appending range lock to guarantee file append
959 		 * semantics.  We reset the write offset once we have the lock.
960 		 */
961 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
962 		woff = rl->r_off;
963 		if (rl->r_len == UINT64_MAX) {
964 			/*
965 			 * We overlocked the file because this write will cause
966 			 * the file block size to increase.
967 			 * Note that zp_size cannot change with this lock held.
968 			 */
969 			woff = zp->z_size;
970 		}
971 		uio->uio_loffset = woff;
972 	} else {
973 		/*
974 		 * Note that if the file block size will change as a result of
975 		 * this write, then this range lock will lock the entire file
976 		 * so that we can re-write the block safely.
977 		 */
978 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
979 	}
980 
981 	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
982 		zfs_range_unlock(rl);
983 		ZFS_EXIT(zfsvfs);
984 		return (EFBIG);
985 	}
986 
987 	if (woff >= limit) {
988 		zfs_range_unlock(rl);
989 		ZFS_EXIT(zfsvfs);
990 		return (SET_ERROR(EFBIG));
991 	}
992 
993 	if ((woff + n) > limit || woff > (limit - n))
994 		n = limit - woff;
995 
996 	/* Will this write extend the file length? */
997 	write_eof = (woff + n > zp->z_size);
998 
999 	end_size = MAX(zp->z_size, woff + n);
1000 
1001 	/*
1002 	 * Write the file in reasonable size chunks.  Each chunk is written
1003 	 * in a separate transaction; this keeps the intent log records small
1004 	 * and allows us to do more fine-grained space accounting.
1005 	 */
1006 	while (n > 0) {
1007 		abuf = NULL;
1008 		woff = uio->uio_loffset;
1009 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1010 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1011 			if (abuf != NULL)
1012 				dmu_return_arcbuf(abuf);
1013 			error = SET_ERROR(EDQUOT);
1014 			break;
1015 		}
1016 
1017 		if (xuio && abuf == NULL) {
1018 			ASSERT(i_iov < iovcnt);
1019 			aiov = &iovp[i_iov];
1020 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1021 			dmu_xuio_clear(xuio, i_iov);
1022 			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1023 			    iovec_t *, aiov, arc_buf_t *, abuf);
1024 			ASSERT((aiov->iov_base == abuf->b_data) ||
1025 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1026 			    aiov->iov_len == arc_buf_size(abuf)));
1027 			i_iov++;
1028 		} else if (abuf == NULL && n >= max_blksz &&
1029 		    woff >= zp->z_size &&
1030 		    P2PHASE(woff, max_blksz) == 0 &&
1031 		    zp->z_blksz == max_blksz) {
1032 			/*
1033 			 * This write covers a full block.  "Borrow" a buffer
1034 			 * from the dmu so that we can fill it before we enter
1035 			 * a transaction.  This avoids the possibility of
1036 			 * holding up the transaction if the data copy hangs
1037 			 * up on a pagefault (e.g., from an NFS server mapping).
1038 			 */
1039 			size_t cbytes;
1040 
1041 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1042 			    max_blksz);
1043 			ASSERT(abuf != NULL);
1044 			ASSERT(arc_buf_size(abuf) == max_blksz);
1045 			if (error = uiocopy(abuf->b_data, max_blksz,
1046 			    UIO_WRITE, uio, &cbytes)) {
1047 				dmu_return_arcbuf(abuf);
1048 				break;
1049 			}
1050 			ASSERT(cbytes == max_blksz);
1051 		}
1052 
1053 		/*
1054 		 * Start a transaction.
1055 		 */
1056 		tx = dmu_tx_create(zfsvfs->z_os);
1057 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1058 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1059 		zfs_sa_upgrade_txholds(tx, zp);
1060 		error = dmu_tx_assign(tx, TXG_WAIT);
1061 		if (error) {
1062 			dmu_tx_abort(tx);
1063 			if (abuf != NULL)
1064 				dmu_return_arcbuf(abuf);
1065 			break;
1066 		}
1067 
1068 		/*
1069 		 * If zfs_range_lock() over-locked we grow the blocksize
1070 		 * and then reduce the lock range.  This will only happen
1071 		 * on the first iteration since zfs_range_reduce() will
1072 		 * shrink down r_len to the appropriate size.
1073 		 */
1074 		if (rl->r_len == UINT64_MAX) {
1075 			uint64_t new_blksz;
1076 
1077 			if (zp->z_blksz > max_blksz) {
1078 				/*
1079 				 * File's blocksize is already larger than the
1080 				 * "recordsize" property.  Only let it grow to
1081 				 * the next power of 2.
1082 				 */
1083 				ASSERT(!ISP2(zp->z_blksz));
1084 				new_blksz = MIN(end_size,
1085 				    1 << highbit64(zp->z_blksz));
1086 			} else {
1087 				new_blksz = MIN(end_size, max_blksz);
1088 			}
1089 			zfs_grow_blocksize(zp, new_blksz, tx);
1090 			zfs_range_reduce(rl, woff, n);
1091 		}
1092 
1093 		/*
1094 		 * XXX - should we really limit each write to z_max_blksz?
1095 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1096 		 */
1097 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1098 
1099 		if (woff + nbytes > zp->z_size)
1100 			vnode_pager_setsize(vp, woff + nbytes);
1101 
1102 		if (abuf == NULL) {
1103 			tx_bytes = uio->uio_resid;
1104 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1105 			    uio, nbytes, tx);
1106 			tx_bytes -= uio->uio_resid;
1107 		} else {
1108 			tx_bytes = nbytes;
1109 			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1110 			/*
1111 			 * If this is not a full block write, but we are
1112 			 * extending the file past EOF and this data starts
1113 			 * block-aligned, use assign_arcbuf().  Otherwise,
1114 			 * write via dmu_write().
1115 			 */
1116 			if (tx_bytes < max_blksz && (!write_eof ||
1117 			    aiov->iov_base != abuf->b_data)) {
1118 				ASSERT(xuio);
1119 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1120 				    aiov->iov_len, aiov->iov_base, tx);
1121 				dmu_return_arcbuf(abuf);
1122 				xuio_stat_wbuf_copied();
1123 			} else {
1124 				ASSERT(xuio || tx_bytes == max_blksz);
1125 				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1126 				    woff, abuf, tx);
1127 			}
1128 			ASSERT(tx_bytes <= uio->uio_resid);
1129 			uioskip(uio, tx_bytes);
1130 		}
1131 		if (tx_bytes && vn_has_cached_data(vp)) {
1132 			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1133 			    zp->z_id, uio->uio_segflg, tx);
1134 		}
1135 
1136 		/*
1137 		 * If we made no progress, we're done.  If we made even
1138 		 * partial progress, update the znode and ZIL accordingly.
1139 		 */
1140 		if (tx_bytes == 0) {
1141 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1142 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1143 			dmu_tx_commit(tx);
1144 			ASSERT(error != 0);
1145 			break;
1146 		}
1147 
1148 		/*
1149 		 * Clear Set-UID/Set-GID bits on successful write if not
1150 		 * privileged and at least one of the excute bits is set.
1151 		 *
1152 		 * It would be nice to to this after all writes have
1153 		 * been done, but that would still expose the ISUID/ISGID
1154 		 * to another app after the partial write is committed.
1155 		 *
1156 		 * Note: we don't call zfs_fuid_map_id() here because
1157 		 * user 0 is not an ephemeral uid.
1158 		 */
1159 		mutex_enter(&zp->z_acl_lock);
1160 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1161 		    (S_IXUSR >> 6))) != 0 &&
1162 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1163 		    secpolicy_vnode_setid_retain(vp, cr,
1164 		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1165 			uint64_t newmode;
1166 			zp->z_mode &= ~(S_ISUID | S_ISGID);
1167 			newmode = zp->z_mode;
1168 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1169 			    (void *)&newmode, sizeof (uint64_t), tx);
1170 		}
1171 		mutex_exit(&zp->z_acl_lock);
1172 
1173 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1174 		    B_TRUE);
1175 
1176 		/*
1177 		 * Update the file size (zp_size) if it has changed;
1178 		 * account for possible concurrent updates.
1179 		 */
1180 		while ((end_size = zp->z_size) < uio->uio_loffset) {
1181 			(void) atomic_cas_64(&zp->z_size, end_size,
1182 			    uio->uio_loffset);
1183 #ifdef illumos
1184 			ASSERT(error == 0);
1185 #else
1186 			ASSERT(error == 0 || error == EFAULT);
1187 #endif
1188 		}
1189 		/*
1190 		 * If we are replaying and eof is non zero then force
1191 		 * the file size to the specified eof. Note, there's no
1192 		 * concurrency during replay.
1193 		 */
1194 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1195 			zp->z_size = zfsvfs->z_replay_eof;
1196 
1197 		if (error == 0)
1198 			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1199 		else
1200 			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1201 
1202 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1203 		dmu_tx_commit(tx);
1204 
1205 		if (error != 0)
1206 			break;
1207 		ASSERT(tx_bytes == nbytes);
1208 		n -= nbytes;
1209 
1210 #ifdef illumos
1211 		if (!xuio && n > 0)
1212 			uio_prefaultpages(MIN(n, max_blksz), uio);
1213 #endif
1214 	}
1215 
1216 	zfs_range_unlock(rl);
1217 
1218 	/*
1219 	 * If we're in replay mode, or we made no progress, return error.
1220 	 * Otherwise, it's at least a partial write, so it's successful.
1221 	 */
1222 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1223 		ZFS_EXIT(zfsvfs);
1224 		return (error);
1225 	}
1226 
1227 #ifdef __FreeBSD__
1228 	/*
1229 	 * EFAULT means that at least one page of the source buffer was not
1230 	 * available.  VFS will re-try remaining I/O upon this error.
1231 	 */
1232 	if (error == EFAULT) {
1233 		ZFS_EXIT(zfsvfs);
1234 		return (error);
1235 	}
1236 #endif
1237 
1238 	if (ioflag & (FSYNC | FDSYNC) ||
1239 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1240 		zil_commit(zilog, zp->z_id);
1241 
1242 	ZFS_EXIT(zfsvfs);
1243 	return (0);
1244 }
1245 
1246 void
zfs_get_done(zgd_t * zgd,int error)1247 zfs_get_done(zgd_t *zgd, int error)
1248 {
1249 	znode_t *zp = zgd->zgd_private;
1250 	objset_t *os = zp->z_zfsvfs->z_os;
1251 
1252 	if (zgd->zgd_db)
1253 		dmu_buf_rele(zgd->zgd_db, zgd);
1254 
1255 	zfs_range_unlock(zgd->zgd_rl);
1256 
1257 	/*
1258 	 * Release the vnode asynchronously as we currently have the
1259 	 * txg stopped from syncing.
1260 	 */
1261 	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1262 
1263 	if (error == 0 && zgd->zgd_bp)
1264 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1265 
1266 	kmem_free(zgd, sizeof (zgd_t));
1267 }
1268 
#ifdef DEBUG
/*
 * Debug-only fault injection knob.  When non-zero, the next indirect write
 * processed by zfs_get_data() fails with EIO and the knob is reset to 0.
 */
static int zil_fault_io = 0;
#endif
1272 
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t of the file system being logged.
 *		lr	- write log record; lr_foid, lr_offset and lr_length
 *			  select the object and byte range.  For an indirect
 *			  write lr_blkptr may be filled in, and the record
 *			  type may be changed to TX_WRITE2.
 *		buf	- destination for the data of an immediate write,
 *			  or NULL to request an indirect write.
 *		zio	- zio handed to dmu_sync() for an indirect write.
 *
 *	RETURN:	0 on success.  ENOENT if the object cannot be obtained,
 *		has been unlinked, or the record starts at or beyond the
 *		current file size.  Otherwise the error produced by
 *		dmu_read(), dmu_buf_hold() or dmu_sync() (EALREADY from
 *		dmu_sync() is converted to success).
 *
 * Once the zgd_t has been allocated, every path except a successful
 * dmu_sync() finishes by calling zfs_get_done() directly.  After a
 * successful dmu_sync() the cleanup is left to the zfs_get_done()
 * callback that was passed to it.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	/* Zero-filled, so zgd_db and zgd_bp start out NULL for zfs_get_done */
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			/*
			 * Round offset down to the start of its block; with
			 * a non-power-of-2 block size the lock starts at 0.
			 */
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/* Block size changed: undo the rounding and retry. */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		/* One-shot injected failure; see zil_fault_io. */
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			/* Seed the log record with the dbuf's current bp. */
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			/*
			 * EALREADY is not reported as a failure: the record
			 * is retyped to TX_WRITE2 and we fall through to the
			 * normal cleanup with error cleared.
			 */
			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}
1398 
1399 /*ARGSUSED*/
1400 static int
zfs_access(vnode_t * vp,int mode,int flag,cred_t * cr,caller_context_t * ct)1401 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1402     caller_context_t *ct)
1403 {
1404 	znode_t *zp = VTOZ(vp);
1405 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1406 	int error;
1407 
1408 	ZFS_ENTER(zfsvfs);
1409 	ZFS_VERIFY_ZP(zp);
1410 
1411 	if (flag & V_ACE_MASK)
1412 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1413 	else
1414 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1415 
1416 	ZFS_EXIT(zfsvfs);
1417 	return (error);
1418 }
1419 
/*
 * Callback handed to vn_vget_ino_gen() by zfs_lookup_lock(): publish the
 * vnode passed in arg through vpp and lock it with lkflags.  If the lock
 * cannot be taken the vnode reference is dropped and the lock error is
 * returned.  mp is not used.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	struct vnode *target = arg;
	int rc;

	*vpp = target;
	rc = vn_lock(target, lkflags);
	if (rc == 0)
		return (0);

	vrele(target);
	return (rc);
}
1431 
1432 static int
zfs_lookup_lock(vnode_t * dvp,vnode_t * vp,const char * name,int lkflags)1433 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1434 {
1435 	znode_t *zdp = VTOZ(dvp);
1436 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1437 	int error;
1438 	int ltype;
1439 
1440 	ASSERT_VOP_LOCKED(dvp, __func__);
1441 #ifdef DIAGNOSTIC
1442 	if ((zdp->z_pflags & ZFS_XATTR) == 0)
1443 		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1444 #endif
1445 
1446 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1447 		ASSERT3P(dvp, ==, vp);
1448 		vref(dvp);
1449 		ltype = lkflags & LK_TYPE_MASK;
1450 		if (ltype != VOP_ISLOCKED(dvp)) {
1451 			if (ltype == LK_EXCLUSIVE)
1452 				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1453 			else /* if (ltype == LK_SHARED) */
1454 				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1455 
1456 			/*
1457 			 * Relock for the "." case could leave us with
1458 			 * reclaimed vnode.
1459 			 */
1460 			if (dvp->v_iflag & VI_DOOMED) {
1461 				vrele(dvp);
1462 				return (SET_ERROR(ENOENT));
1463 			}
1464 		}
1465 		return (0);
1466 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1467 		/*
1468 		 * Note that in this case, dvp is the child vnode, and we
1469 		 * are looking up the parent vnode - exactly reverse from
1470 		 * normal operation.  Unlocking dvp requires some rather
1471 		 * tricky unlock/relock dance to prevent mp from being freed;
1472 		 * use vn_vget_ino_gen() which takes care of all that.
1473 		 *
1474 		 * XXX Note that there is a time window when both vnodes are
1475 		 * unlocked.  It is possible, although highly unlikely, that
1476 		 * during that window the parent-child relationship between
1477 		 * the vnodes may change, for example, get reversed.
1478 		 * In that case we would have a wrong lock order for the vnodes.
1479 		 * All other filesystems seem to ignore this problem, so we
1480 		 * do the same here.
1481 		 * A potential solution could be implemented as follows:
1482 		 * - using LK_NOWAIT when locking the second vnode and retrying
1483 		 *   if necessary
1484 		 * - checking that the parent-child relationship still holds
1485 		 *   after locking both vnodes and retrying if it doesn't
1486 		 */
1487 		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1488 		return (error);
1489 	} else {
1490 		error = vn_lock(vp, lkflags);
1491 		if (error != 0)
1492 			vrele(vp);
1493 		return (error);
1494 	}
1495 }
1496 
1497 /*
1498  * Lookup an entry in a directory, or an extended attribute directory.
1499  * If it exists, return a held vnode reference for it.
1500  *
1501  *	IN:	dvp	- vnode of directory to search.
1502  *		nm	- name of entry to lookup.
1503  *		pnp	- full pathname to lookup [UNUSED].
1504  *		flags	- LOOKUP_XATTR set if looking for an attribute.
1505  *		rdir	- root directory vnode [UNUSED].
1506  *		cr	- credentials of caller.
1507  *		ct	- caller context
1508  *
1509  *	OUT:	vpp	- vnode of located entry, NULL if not found.
1510  *
1511  *	RETURN:	0 on success, error code on failure.
1512  *
1513  * Timestamps:
1514  *	NA
1515  */
1516 /* ARGSUSED */
1517 static int
zfs_lookup(vnode_t * dvp,char * nm,vnode_t ** vpp,struct componentname * cnp,int nameiop,cred_t * cr,kthread_t * td,int flags)1518 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1519     int nameiop, cred_t *cr, kthread_t *td, int flags)
1520 {
1521 	znode_t *zdp = VTOZ(dvp);
1522 	znode_t *zp;
1523 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1524 	int	error = 0;
1525 
1526 	/* fast path (should be redundant with vfs namecache) */
1527 	if (!(flags & LOOKUP_XATTR)) {
1528 		if (dvp->v_type != VDIR) {
1529 			return (SET_ERROR(ENOTDIR));
1530 		} else if (zdp->z_sa_hdl == NULL) {
1531 			return (SET_ERROR(EIO));
1532 		}
1533 	}
1534 
1535 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1536 
1537 	ZFS_ENTER(zfsvfs);
1538 	ZFS_VERIFY_ZP(zdp);
1539 
1540 	*vpp = NULL;
1541 
1542 	if (flags & LOOKUP_XATTR) {
1543 #ifdef TODO
1544 		/*
1545 		 * If the xattr property is off, refuse the lookup request.
1546 		 */
1547 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1548 			ZFS_EXIT(zfsvfs);
1549 			return (SET_ERROR(EINVAL));
1550 		}
1551 #endif
1552 
1553 		/*
1554 		 * We don't allow recursive attributes..
1555 		 * Maybe someday we will.
1556 		 */
1557 		if (zdp->z_pflags & ZFS_XATTR) {
1558 			ZFS_EXIT(zfsvfs);
1559 			return (SET_ERROR(EINVAL));
1560 		}
1561 
1562 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1563 			ZFS_EXIT(zfsvfs);
1564 			return (error);
1565 		}
1566 
1567 		/*
1568 		 * Do we have permission to get into attribute directory?
1569 		 */
1570 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1571 		    B_FALSE, cr)) {
1572 			vrele(*vpp);
1573 			*vpp = NULL;
1574 		}
1575 
1576 		ZFS_EXIT(zfsvfs);
1577 		return (error);
1578 	}
1579 
1580 	/*
1581 	 * Check accessibility of directory.
1582 	 */
1583 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1584 		ZFS_EXIT(zfsvfs);
1585 		return (error);
1586 	}
1587 
1588 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1589 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1590 		ZFS_EXIT(zfsvfs);
1591 		return (SET_ERROR(EILSEQ));
1592 	}
1593 
1594 
1595 	/*
1596 	 * First handle the special cases.
1597 	 */
1598 	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1599 		/*
1600 		 * If we are a snapshot mounted under .zfs, return
1601 		 * the vp for the snapshot directory.
1602 		 */
1603 		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1604 			struct componentname cn;
1605 			vnode_t *zfsctl_vp;
1606 			int ltype;
1607 
1608 			ZFS_EXIT(zfsvfs);
1609 			ltype = VOP_ISLOCKED(dvp);
1610 			VOP_UNLOCK(dvp, 0);
1611 			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1612 			    &zfsctl_vp);
1613 			if (error == 0) {
1614 				cn.cn_nameptr = "snapshot";
1615 				cn.cn_namelen = strlen(cn.cn_nameptr);
1616 				cn.cn_nameiop = cnp->cn_nameiop;
1617 				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1618 				cn.cn_lkflags = cnp->cn_lkflags;
1619 				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1620 				vput(zfsctl_vp);
1621 			}
1622 			vn_lock(dvp, ltype | LK_RETRY);
1623 			return (error);
1624 		}
1625 	}
1626 	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1627 		ZFS_EXIT(zfsvfs);
1628 		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1629 			return (SET_ERROR(ENOTSUP));
1630 		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1631 		return (error);
1632 	}
1633 
1634 	/*
1635 	 * The loop is retry the lookup if the parent-child relationship
1636 	 * changes during the dot-dot locking complexities.
1637 	 */
1638 	for (;;) {
1639 		uint64_t parent;
1640 
1641 		error = zfs_dirlook(zdp, nm, &zp);
1642 		if (error == 0)
1643 			*vpp = ZTOV(zp);
1644 
1645 		ZFS_EXIT(zfsvfs);
1646 		if (error != 0)
1647 			break;
1648 
1649 		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1650 		if (error != 0) {
1651 			/*
1652 			 * If we've got a locking error, then the vnode
1653 			 * got reclaimed because of a force unmount.
1654 			 * We never enter doomed vnodes into the name cache.
1655 			 */
1656 			*vpp = NULL;
1657 			return (error);
1658 		}
1659 
1660 		if ((cnp->cn_flags & ISDOTDOT) == 0)
1661 			break;
1662 
1663 		ZFS_ENTER(zfsvfs);
1664 		if (zdp->z_sa_hdl == NULL) {
1665 			error = SET_ERROR(EIO);
1666 		} else {
1667 			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1668 			    &parent, sizeof (parent));
1669 		}
1670 		if (error != 0) {
1671 			ZFS_EXIT(zfsvfs);
1672 			vput(ZTOV(zp));
1673 			break;
1674 		}
1675 		if (zp->z_id == parent) {
1676 			ZFS_EXIT(zfsvfs);
1677 			break;
1678 		}
1679 		vput(ZTOV(zp));
1680 	}
1681 
1682 out:
1683 	if (error != 0)
1684 		*vpp = NULL;
1685 
1686 	/* Translate errors and add SAVENAME when needed. */
1687 	if (cnp->cn_flags & ISLASTCN) {
1688 		switch (nameiop) {
1689 		case CREATE:
1690 		case RENAME:
1691 			if (error == ENOENT) {
1692 				error = EJUSTRETURN;
1693 				cnp->cn_flags |= SAVENAME;
1694 				break;
1695 			}
1696 			/* FALLTHROUGH */
1697 		case DELETE:
1698 			if (error == 0)
1699 				cnp->cn_flags |= SAVENAME;
1700 			break;
1701 		}
1702 	}
1703 
1704 	/* Insert name into cache (as non-existent) if appropriate. */
1705 	if (zfsvfs->z_use_namecache &&
1706 	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1707 		cache_enter(dvp, NULL, cnp);
1708 
1709 	/* Insert name into cache if appropriate. */
1710 	if (zfsvfs->z_use_namecache &&
1711 	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1712 		if (!(cnp->cn_flags & ISLASTCN) ||
1713 		    (nameiop != DELETE && nameiop != RENAME)) {
1714 			cache_enter(dvp, *vpp, cnp);
1715 		}
1716 	}
1717 
1718 	return (error);
1719 }
1720 
1721 /*
1722  * Attempt to create a new entry in a directory.  If the entry
1723  * already exists, truncate the file if permissible, else return
1724  * an error.  Return the vp of the created or trunc'd file.
1725  *
1726  *	IN:	dvp	- vnode of directory to put new file entry in.
1727  *		name	- name of new file entry.
1728  *		vap	- attributes of new file.
1729  *		excl	- flag indicating exclusive or non-exclusive mode.
1730  *		mode	- mode to open file with.
1731  *		cr	- credentials of caller.
1732  *		flag	- large file flag [UNUSED].
1733  *		ct	- caller context
1734  *		vsecp	- ACL to be set
1735  *
1736  *	OUT:	vpp	- vnode of created or trunc'd entry.
1737  *
1738  *	RETURN:	0 on success, error code on failure.
1739  *
1740  * Timestamps:
1741  *	dvp - ctime|mtime updated if new entry created
1742  *	 vp - ctime|mtime always, atime if new
1743  */
1744 
1745 /* ARGSUSED */
1746 static int
zfs_create(vnode_t * dvp,char * name,vattr_t * vap,int excl,int mode,vnode_t ** vpp,cred_t * cr,kthread_t * td)1747 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1748     vnode_t **vpp, cred_t *cr, kthread_t *td)
1749 {
1750 	znode_t		*zp, *dzp = VTOZ(dvp);
1751 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1752 	zilog_t		*zilog;
1753 	objset_t	*os;
1754 	dmu_tx_t	*tx;
1755 	int		error;
1756 	ksid_t		*ksid;
1757 	uid_t		uid;
1758 	gid_t		gid = crgetgid(cr);
1759 	zfs_acl_ids_t   acl_ids;
1760 	boolean_t	fuid_dirtied;
1761 	void		*vsecp = NULL;
1762 	int		flag = 0;
1763 	uint64_t	txtype;
1764 
1765 	/*
1766 	 * If we have an ephemeral id, ACL, or XVATTR then
1767 	 * make sure file system is at proper version
1768 	 */
1769 
1770 	ksid = crgetsid(cr, KSID_OWNER);
1771 	if (ksid)
1772 		uid = ksid_getid(ksid);
1773 	else
1774 		uid = crgetuid(cr);
1775 
1776 	if (zfsvfs->z_use_fuids == B_FALSE &&
1777 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1778 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1779 		return (SET_ERROR(EINVAL));
1780 
1781 	ZFS_ENTER(zfsvfs);
1782 	ZFS_VERIFY_ZP(dzp);
1783 	os = zfsvfs->z_os;
1784 	zilog = zfsvfs->z_log;
1785 
1786 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1787 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1788 		ZFS_EXIT(zfsvfs);
1789 		return (SET_ERROR(EILSEQ));
1790 	}
1791 
1792 	if (vap->va_mask & AT_XVATTR) {
1793 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1794 		    crgetuid(cr), cr, vap->va_type)) != 0) {
1795 			ZFS_EXIT(zfsvfs);
1796 			return (error);
1797 		}
1798 	}
1799 
1800 	*vpp = NULL;
1801 
1802 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1803 		vap->va_mode &= ~S_ISVTX;
1804 
1805 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1806 	if (error) {
1807 		ZFS_EXIT(zfsvfs);
1808 		return (error);
1809 	}
1810 	ASSERT3P(zp, ==, NULL);
1811 
1812 	/*
1813 	 * Create a new file object and update the directory
1814 	 * to reference it.
1815 	 */
1816 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1817 		goto out;
1818 	}
1819 
1820 	/*
1821 	 * We only support the creation of regular files in
1822 	 * extended attribute directories.
1823 	 */
1824 
1825 	if ((dzp->z_pflags & ZFS_XATTR) &&
1826 	    (vap->va_type != VREG)) {
1827 		error = SET_ERROR(EINVAL);
1828 		goto out;
1829 	}
1830 
1831 	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1832 	    cr, vsecp, &acl_ids)) != 0)
1833 		goto out;
1834 
1835 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1836 		zfs_acl_ids_free(&acl_ids);
1837 		error = SET_ERROR(EDQUOT);
1838 		goto out;
1839 	}
1840 
1841 	getnewvnode_reserve(1);
1842 
1843 	tx = dmu_tx_create(os);
1844 
1845 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1846 	    ZFS_SA_BASE_ATTR_SIZE);
1847 
1848 	fuid_dirtied = zfsvfs->z_fuid_dirty;
1849 	if (fuid_dirtied)
1850 		zfs_fuid_txhold(zfsvfs, tx);
1851 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1852 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1853 	if (!zfsvfs->z_use_sa &&
1854 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1855 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1856 		    0, acl_ids.z_aclp->z_acl_bytes);
1857 	}
1858 	error = dmu_tx_assign(tx, TXG_WAIT);
1859 	if (error) {
1860 		zfs_acl_ids_free(&acl_ids);
1861 		dmu_tx_abort(tx);
1862 		getnewvnode_drop_reserve();
1863 		ZFS_EXIT(zfsvfs);
1864 		return (error);
1865 	}
1866 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1867 
1868 	if (fuid_dirtied)
1869 		zfs_fuid_sync(zfsvfs, tx);
1870 
1871 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1872 	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1873 	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1874 	    vsecp, acl_ids.z_fuidp, vap);
1875 	zfs_acl_ids_free(&acl_ids);
1876 	dmu_tx_commit(tx);
1877 
1878 	getnewvnode_drop_reserve();
1879 
1880 out:
1881 	if (error == 0) {
1882 		*vpp = ZTOV(zp);
1883 	}
1884 
1885 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1886 		zil_commit(zilog, 0);
1887 
1888 	ZFS_EXIT(zfsvfs);
1889 	return (error);
1890 }
1891 
1892 /*
1893  * Remove an entry from a directory.
1894  *
1895  *	IN:	dvp	- vnode of directory to remove entry from.
1896  *		name	- name of entry to remove.
1897  *		cr	- credentials of caller.
1898  *		ct	- caller context
1899  *		flags	- case flags
1900  *
1901  *	RETURN:	0 on success, error code on failure.
1902  *
1903  * Timestamps:
1904  *	dvp - ctime|mtime
1905  *	 vp - ctime (if nlink > 0)
1906  */
1907 
1908 /*ARGSUSED*/
1909 static int
zfs_remove(vnode_t * dvp,vnode_t * vp,char * name,cred_t * cr)1910 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1911 {
1912 	znode_t		*dzp = VTOZ(dvp);
1913 	znode_t		*zp = VTOZ(vp);
1914 	znode_t		*xzp;
1915 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1916 	zilog_t		*zilog;
1917 	uint64_t	acl_obj, xattr_obj;
1918 	uint64_t	obj = 0;
1919 	dmu_tx_t	*tx;
1920 	boolean_t	unlinked, toobig = FALSE;
1921 	uint64_t	txtype;
1922 	int		error;
1923 
1924 	ZFS_ENTER(zfsvfs);
1925 	ZFS_VERIFY_ZP(dzp);
1926 	ZFS_VERIFY_ZP(zp);
1927 	zilog = zfsvfs->z_log;
1928 	zp = VTOZ(vp);
1929 
1930 	xattr_obj = 0;
1931 	xzp = NULL;
1932 
1933 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1934 		goto out;
1935 	}
1936 
1937 	/*
1938 	 * Need to use rmdir for removing directories.
1939 	 */
1940 	if (vp->v_type == VDIR) {
1941 		error = SET_ERROR(EPERM);
1942 		goto out;
1943 	}
1944 
1945 	vnevent_remove(vp, dvp, name, ct);
1946 
1947 	obj = zp->z_id;
1948 
1949 	/* are there any extended attributes? */
1950 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1951 	    &xattr_obj, sizeof (xattr_obj));
1952 	if (error == 0 && xattr_obj) {
1953 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1954 		ASSERT0(error);
1955 	}
1956 
1957 	/*
1958 	 * We may delete the znode now, or we may put it in the unlinked set;
1959 	 * it depends on whether we're the last link, and on whether there are
1960 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1961 	 * allow for either case.
1962 	 */
1963 	tx = dmu_tx_create(zfsvfs->z_os);
1964 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1965 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1966 	zfs_sa_upgrade_txholds(tx, zp);
1967 	zfs_sa_upgrade_txholds(tx, dzp);
1968 
1969 	if (xzp) {
1970 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1971 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1972 	}
1973 
1974 	/* charge as an update -- would be nice not to charge at all */
1975 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1976 
1977 	/*
1978 	 * Mark this transaction as typically resulting in a net free of space
1979 	 */
1980 	dmu_tx_mark_netfree(tx);
1981 
1982 	error = dmu_tx_assign(tx, TXG_WAIT);
1983 	if (error) {
1984 		dmu_tx_abort(tx);
1985 		ZFS_EXIT(zfsvfs);
1986 		return (error);
1987 	}
1988 
1989 	/*
1990 	 * Remove the directory entry.
1991 	 */
1992 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1993 
1994 	if (error) {
1995 		dmu_tx_commit(tx);
1996 		goto out;
1997 	}
1998 
1999 	if (unlinked) {
2000 		zfs_unlinked_add(zp, tx);
2001 		vp->v_vflag |= VV_NOSYNC;
2002 	}
2003 
2004 	txtype = TX_REMOVE;
2005 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2006 
2007 	dmu_tx_commit(tx);
2008 out:
2009 
2010 	if (xzp)
2011 		vrele(ZTOV(xzp));
2012 
2013 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2014 		zil_commit(zilog, 0);
2015 
2016 	ZFS_EXIT(zfsvfs);
2017 	return (error);
2018 }
2019 
2020 /*
2021  * Create a new directory and insert it into dvp using the name
2022  * provided.  Return a pointer to the inserted directory.
2023  *
2024  *	IN:	dvp	- vnode of directory to add subdir to.
2025  *		dirname	- name of new directory.
2026  *		vap	- attributes of new directory.
2027  *		cr	- credentials of caller.
2028  *		ct	- caller context
2029  *		flags	- case flags
2030  *		vsecp	- ACL to be set
2031  *
2032  *	OUT:	vpp	- vnode of created directory.
2033  *
2034  *	RETURN:	0 on success, error code on failure.
2035  *
2036  * Timestamps:
2037  *	dvp - ctime|mtime updated
2038  *	 vp - ctime|mtime|atime updated
2039  */
2040 /*ARGSUSED*/
2041 static int
zfs_mkdir(vnode_t * dvp,char * dirname,vattr_t * vap,vnode_t ** vpp,cred_t * cr)2042 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2043 {
2044 	znode_t		*zp, *dzp = VTOZ(dvp);
2045 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2046 	zilog_t		*zilog;
2047 	uint64_t	txtype;
2048 	dmu_tx_t	*tx;
2049 	int		error;
2050 	ksid_t		*ksid;
2051 	uid_t		uid;
2052 	gid_t		gid = crgetgid(cr);
2053 	zfs_acl_ids_t   acl_ids;
2054 	boolean_t	fuid_dirtied;
2055 
2056 	ASSERT(vap->va_type == VDIR);
2057 
2058 	/*
2059 	 * If we have an ephemeral id, ACL, or XVATTR then
2060 	 * make sure file system is at proper version
2061 	 */
2062 
2063 	ksid = crgetsid(cr, KSID_OWNER);
2064 	if (ksid)
2065 		uid = ksid_getid(ksid);
2066 	else
2067 		uid = crgetuid(cr);
2068 	if (zfsvfs->z_use_fuids == B_FALSE &&
2069 	    ((vap->va_mask & AT_XVATTR) ||
2070 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2071 		return (SET_ERROR(EINVAL));
2072 
2073 	ZFS_ENTER(zfsvfs);
2074 	ZFS_VERIFY_ZP(dzp);
2075 	zilog = zfsvfs->z_log;
2076 
2077 	if (dzp->z_pflags & ZFS_XATTR) {
2078 		ZFS_EXIT(zfsvfs);
2079 		return (SET_ERROR(EINVAL));
2080 	}
2081 
2082 	if (zfsvfs->z_utf8 && u8_validate(dirname,
2083 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2084 		ZFS_EXIT(zfsvfs);
2085 		return (SET_ERROR(EILSEQ));
2086 	}
2087 
2088 	if (vap->va_mask & AT_XVATTR) {
2089 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2090 		    crgetuid(cr), cr, vap->va_type)) != 0) {
2091 			ZFS_EXIT(zfsvfs);
2092 			return (error);
2093 		}
2094 	}
2095 
2096 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2097 	    NULL, &acl_ids)) != 0) {
2098 		ZFS_EXIT(zfsvfs);
2099 		return (error);
2100 	}
2101 
2102 	/*
2103 	 * First make sure the new directory doesn't exist.
2104 	 *
2105 	 * Existence is checked first to make sure we don't return
2106 	 * EACCES instead of EEXIST which can cause some applications
2107 	 * to fail.
2108 	 */
2109 	*vpp = NULL;
2110 
2111 	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2112 		zfs_acl_ids_free(&acl_ids);
2113 		ZFS_EXIT(zfsvfs);
2114 		return (error);
2115 	}
2116 	ASSERT3P(zp, ==, NULL);
2117 
2118 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2119 		zfs_acl_ids_free(&acl_ids);
2120 		ZFS_EXIT(zfsvfs);
2121 		return (error);
2122 	}
2123 
2124 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2125 		zfs_acl_ids_free(&acl_ids);
2126 		ZFS_EXIT(zfsvfs);
2127 		return (SET_ERROR(EDQUOT));
2128 	}
2129 
2130 	/*
2131 	 * Add a new entry to the directory.
2132 	 */
2133 	getnewvnode_reserve(1);
2134 	tx = dmu_tx_create(zfsvfs->z_os);
2135 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2136 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2137 	fuid_dirtied = zfsvfs->z_fuid_dirty;
2138 	if (fuid_dirtied)
2139 		zfs_fuid_txhold(zfsvfs, tx);
2140 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2141 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2142 		    acl_ids.z_aclp->z_acl_bytes);
2143 	}
2144 
2145 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2146 	    ZFS_SA_BASE_ATTR_SIZE);
2147 
2148 	error = dmu_tx_assign(tx, TXG_WAIT);
2149 	if (error) {
2150 		zfs_acl_ids_free(&acl_ids);
2151 		dmu_tx_abort(tx);
2152 		getnewvnode_drop_reserve();
2153 		ZFS_EXIT(zfsvfs);
2154 		return (error);
2155 	}
2156 
2157 	/*
2158 	 * Create new node.
2159 	 */
2160 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2161 
2162 	if (fuid_dirtied)
2163 		zfs_fuid_sync(zfsvfs, tx);
2164 
2165 	/*
2166 	 * Now put new name in parent dir.
2167 	 */
2168 	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2169 
2170 	*vpp = ZTOV(zp);
2171 
2172 	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2173 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2174 	    acl_ids.z_fuidp, vap);
2175 
2176 	zfs_acl_ids_free(&acl_ids);
2177 
2178 	dmu_tx_commit(tx);
2179 
2180 	getnewvnode_drop_reserve();
2181 
2182 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2183 		zil_commit(zilog, 0);
2184 
2185 	ZFS_EXIT(zfsvfs);
2186 	return (0);
2187 }
2188 
2189 /*
2190  * Remove a directory subdir entry.  If the current working
2191  * directory is the same as the subdir to be removed, the
2192  * remove will fail.
2193  *
2194  *	IN:	dvp	- vnode of directory to remove from.
2195  *		name	- name of directory to be removed.
2196  *		cwd	- vnode of current working directory.
2197  *		cr	- credentials of caller.
2198  *		ct	- caller context
2199  *		flags	- case flags
2200  *
2201  *	RETURN:	0 on success, error code on failure.
2202  *
2203  * Timestamps:
2204  *	dvp - ctime|mtime updated
2205  */
2206 /*ARGSUSED*/
2207 static int
zfs_rmdir(vnode_t * dvp,vnode_t * vp,char * name,cred_t * cr)2208 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2209 {
2210 	znode_t		*dzp = VTOZ(dvp);
2211 	znode_t		*zp = VTOZ(vp);
2212 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2213 	zilog_t		*zilog;
2214 	dmu_tx_t	*tx;
2215 	int		error;
2216 
2217 	ZFS_ENTER(zfsvfs);
2218 	ZFS_VERIFY_ZP(dzp);
2219 	ZFS_VERIFY_ZP(zp);
2220 	zilog = zfsvfs->z_log;
2221 
2222 
2223 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2224 		goto out;
2225 	}
2226 
2227 	if (vp->v_type != VDIR) {
2228 		error = SET_ERROR(ENOTDIR);
2229 		goto out;
2230 	}
2231 
2232 	vnevent_rmdir(vp, dvp, name, ct);
2233 
2234 	tx = dmu_tx_create(zfsvfs->z_os);
2235 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2236 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2237 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2238 	zfs_sa_upgrade_txholds(tx, zp);
2239 	zfs_sa_upgrade_txholds(tx, dzp);
2240 	dmu_tx_mark_netfree(tx);
2241 	error = dmu_tx_assign(tx, TXG_WAIT);
2242 	if (error) {
2243 		dmu_tx_abort(tx);
2244 		ZFS_EXIT(zfsvfs);
2245 		return (error);
2246 	}
2247 
2248 	cache_purge(dvp);
2249 
2250 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2251 
2252 	if (error == 0) {
2253 		uint64_t txtype = TX_RMDIR;
2254 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2255 	}
2256 
2257 	dmu_tx_commit(tx);
2258 
2259 	cache_purge(vp);
2260 out:
2261 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2262 		zil_commit(zilog, 0);
2263 
2264 	ZFS_EXIT(zfsvfs);
2265 	return (error);
2266 }
2267 
2268 /*
2269  * Read as many directory entries as will fit into the provided
2270  * buffer from the given directory cursor position (specified in
2271  * the uio structure).
2272  *
2273  *	IN:	vp	- vnode of directory to read.
2274  *		uio	- structure supplying read location, range info,
2275  *			  and return buffer.
2276  *		cr	- credentials of caller.
2277  *		ct	- caller context
2278  *		flags	- case flags
2279  *
2280  *	OUT:	uio	- updated offset and range, buffer filled.
2281  *		eofp	- set to true if end-of-file detected.
2282  *
2283  *	RETURN:	0 on success, error code on failure.
2284  *
2285  * Timestamps:
2286  *	vp - atime updated
2287  *
2288  * Note that the low 4 bits of the cookie returned by zap is always zero.
2289  * This allows us to use the low range for "special" directory entries:
2290  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2291  * we use the offset 2 for the '.zfs' directory.
2292  */
2293 /* ARGSUSED */
2294 static int
zfs_readdir(vnode_t * vp,uio_t * uio,cred_t * cr,int * eofp,int * ncookies,u_long ** cookies)2295 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2296 {
2297 	znode_t		*zp = VTOZ(vp);
2298 	iovec_t		*iovp;
2299 	edirent_t	*eodp;
2300 	dirent64_t	*odp;
2301 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2302 	objset_t	*os;
2303 	caddr_t		outbuf;
2304 	size_t		bufsize;
2305 	zap_cursor_t	zc;
2306 	zap_attribute_t	zap;
2307 	uint_t		bytes_wanted;
2308 	uint64_t	offset; /* must be unsigned; checks for < 1 */
2309 	uint64_t	parent;
2310 	int		local_eof;
2311 	int		outcount;
2312 	int		error;
2313 	uint8_t		prefetch;
2314 	boolean_t	check_sysattrs;
2315 	uint8_t		type;
2316 	int		ncooks;
2317 	u_long		*cooks = NULL;
2318 	int		flags = 0;
2319 
2320 	ZFS_ENTER(zfsvfs);
2321 	ZFS_VERIFY_ZP(zp);
2322 
2323 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2324 	    &parent, sizeof (parent))) != 0) {
2325 		ZFS_EXIT(zfsvfs);
2326 		return (error);
2327 	}
2328 
2329 	/*
2330 	 * If we are not given an eof variable,
2331 	 * use a local one.
2332 	 */
2333 	if (eofp == NULL)
2334 		eofp = &local_eof;
2335 
2336 	/*
2337 	 * Check for valid iov_len.
2338 	 */
2339 	if (uio->uio_iov->iov_len <= 0) {
2340 		ZFS_EXIT(zfsvfs);
2341 		return (SET_ERROR(EINVAL));
2342 	}
2343 
2344 	/*
2345 	 * Quit if directory has been removed (posix)
2346 	 */
2347 	if ((*eofp = zp->z_unlinked) != 0) {
2348 		ZFS_EXIT(zfsvfs);
2349 		return (0);
2350 	}
2351 
2352 	error = 0;
2353 	os = zfsvfs->z_os;
2354 	offset = uio->uio_loffset;
2355 	prefetch = zp->z_zn_prefetch;
2356 
2357 	/*
2358 	 * Initialize the iterator cursor.
2359 	 */
2360 	if (offset <= 3) {
2361 		/*
2362 		 * Start iteration from the beginning of the directory.
2363 		 */
2364 		zap_cursor_init(&zc, os, zp->z_id);
2365 	} else {
2366 		/*
2367 		 * The offset is a serialized cursor.
2368 		 */
2369 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2370 	}
2371 
2372 	/*
2373 	 * Get space to change directory entries into fs independent format.
2374 	 */
2375 	iovp = uio->uio_iov;
2376 	bytes_wanted = iovp->iov_len;
2377 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2378 		bufsize = bytes_wanted;
2379 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2380 		odp = (struct dirent64 *)outbuf;
2381 	} else {
2382 		bufsize = bytes_wanted;
2383 		outbuf = NULL;
2384 		odp = (struct dirent64 *)iovp->iov_base;
2385 	}
2386 	eodp = (struct edirent *)odp;
2387 
2388 	if (ncookies != NULL) {
2389 		/*
2390 		 * Minimum entry size is dirent size and 1 byte for a file name.
2391 		 */
2392 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2393 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2394 		*cookies = cooks;
2395 		*ncookies = ncooks;
2396 	}
2397 	/*
2398 	 * If this VFS supports the system attribute view interface; and
2399 	 * we're looking at an extended attribute directory; and we care
2400 	 * about normalization conflicts on this vfs; then we must check
2401 	 * for normalization conflicts with the sysattr name space.
2402 	 */
2403 #ifdef TODO
2404 	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2405 	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2406 	    (flags & V_RDDIR_ENTFLAGS);
2407 #else
2408 	check_sysattrs = 0;
2409 #endif
2410 
2411 	/*
2412 	 * Transform to file-system independent format
2413 	 */
2414 	outcount = 0;
2415 	while (outcount < bytes_wanted) {
2416 		ino64_t objnum;
2417 		ushort_t reclen;
2418 		off64_t *next = NULL;
2419 
2420 		/*
2421 		 * Special case `.', `..', and `.zfs'.
2422 		 */
2423 		if (offset == 0) {
2424 			(void) strcpy(zap.za_name, ".");
2425 			zap.za_normalization_conflict = 0;
2426 			objnum = zp->z_id;
2427 			type = DT_DIR;
2428 		} else if (offset == 1) {
2429 			(void) strcpy(zap.za_name, "..");
2430 			zap.za_normalization_conflict = 0;
2431 			objnum = parent;
2432 			type = DT_DIR;
2433 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2434 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2435 			zap.za_normalization_conflict = 0;
2436 			objnum = ZFSCTL_INO_ROOT;
2437 			type = DT_DIR;
2438 		} else {
2439 			/*
2440 			 * Grab next entry.
2441 			 */
2442 			if (error = zap_cursor_retrieve(&zc, &zap)) {
2443 				if ((*eofp = (error == ENOENT)) != 0)
2444 					break;
2445 				else
2446 					goto update;
2447 			}
2448 
2449 			if (zap.za_integer_length != 8 ||
2450 			    zap.za_num_integers != 1) {
2451 				cmn_err(CE_WARN, "zap_readdir: bad directory "
2452 				    "entry, obj = %lld, offset = %lld\n",
2453 				    (u_longlong_t)zp->z_id,
2454 				    (u_longlong_t)offset);
2455 				error = SET_ERROR(ENXIO);
2456 				goto update;
2457 			}
2458 
2459 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2460 			/*
2461 			 * MacOS X can extract the object type here such as:
2462 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2463 			 */
2464 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2465 
2466 			if (check_sysattrs && !zap.za_normalization_conflict) {
2467 #ifdef TODO
2468 				zap.za_normalization_conflict =
2469 				    xattr_sysattr_casechk(zap.za_name);
2470 #else
2471 				panic("%s:%u: TODO", __func__, __LINE__);
2472 #endif
2473 			}
2474 		}
2475 
2476 		if (flags & V_RDDIR_ACCFILTER) {
2477 			/*
2478 			 * If we have no access at all, don't include
2479 			 * this entry in the returned information
2480 			 */
2481 			znode_t	*ezp;
2482 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2483 				goto skip_entry;
2484 			if (!zfs_has_access(ezp, cr)) {
2485 				vrele(ZTOV(ezp));
2486 				goto skip_entry;
2487 			}
2488 			vrele(ZTOV(ezp));
2489 		}
2490 
2491 		if (flags & V_RDDIR_ENTFLAGS)
2492 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2493 		else
2494 			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2495 
2496 		/*
2497 		 * Will this entry fit in the buffer?
2498 		 */
2499 		if (outcount + reclen > bufsize) {
2500 			/*
2501 			 * Did we manage to fit anything in the buffer?
2502 			 */
2503 			if (!outcount) {
2504 				error = SET_ERROR(EINVAL);
2505 				goto update;
2506 			}
2507 			break;
2508 		}
2509 		if (flags & V_RDDIR_ENTFLAGS) {
2510 			/*
2511 			 * Add extended flag entry:
2512 			 */
2513 			eodp->ed_ino = objnum;
2514 			eodp->ed_reclen = reclen;
2515 			/* NOTE: ed_off is the offset for the *next* entry */
2516 			next = &(eodp->ed_off);
2517 			eodp->ed_eflags = zap.za_normalization_conflict ?
2518 			    ED_CASE_CONFLICT : 0;
2519 			(void) strncpy(eodp->ed_name, zap.za_name,
2520 			    EDIRENT_NAMELEN(reclen));
2521 			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2522 		} else {
2523 			/*
2524 			 * Add normal entry:
2525 			 */
2526 			odp->d_ino = objnum;
2527 			odp->d_reclen = reclen;
2528 			odp->d_namlen = strlen(zap.za_name);
2529 			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2530 			odp->d_type = type;
2531 			odp = (dirent64_t *)((intptr_t)odp + reclen);
2532 		}
2533 		outcount += reclen;
2534 
2535 		ASSERT(outcount <= bufsize);
2536 
2537 		/* Prefetch znode */
2538 		if (prefetch)
2539 			dmu_prefetch(os, objnum, 0, 0, 0,
2540 			    ZIO_PRIORITY_SYNC_READ);
2541 
2542 	skip_entry:
2543 		/*
2544 		 * Move to the next entry, fill in the previous offset.
2545 		 */
2546 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2547 			zap_cursor_advance(&zc);
2548 			offset = zap_cursor_serialize(&zc);
2549 		} else {
2550 			offset += 1;
2551 		}
2552 
2553 		if (cooks != NULL) {
2554 			*cooks++ = offset;
2555 			ncooks--;
2556 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2557 		}
2558 	}
2559 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2560 
2561 	/* Subtract unused cookies */
2562 	if (ncookies != NULL)
2563 		*ncookies -= ncooks;
2564 
2565 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2566 		iovp->iov_base += outcount;
2567 		iovp->iov_len -= outcount;
2568 		uio->uio_resid -= outcount;
2569 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2570 		/*
2571 		 * Reset the pointer.
2572 		 */
2573 		offset = uio->uio_loffset;
2574 	}
2575 
2576 update:
2577 	zap_cursor_fini(&zc);
2578 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2579 		kmem_free(outbuf, bufsize);
2580 
2581 	if (error == ENOENT)
2582 		error = 0;
2583 
2584 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2585 
2586 	uio->uio_loffset = offset;
2587 	ZFS_EXIT(zfsvfs);
2588 	if (error != 0 && cookies != NULL) {
2589 		free(*cookies, M_TEMP);
2590 		*cookies = NULL;
2591 		*ncookies = 0;
2592 	}
2593 	return (error);
2594 }
2595 
2596 ulong_t zfs_fsync_sync_cnt = 4;
2597 
2598 static int
zfs_fsync(vnode_t * vp,int syncflag,cred_t * cr,caller_context_t * ct)2599 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2600 {
2601 	znode_t	*zp = VTOZ(vp);
2602 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2603 
2604 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2605 
2606 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2607 		ZFS_ENTER(zfsvfs);
2608 		ZFS_VERIFY_ZP(zp);
2609 		zil_commit(zfsvfs->z_log, zp->z_id);
2610 		ZFS_EXIT(zfsvfs);
2611 	}
2612 	return (0);
2613 }
2614 
2615 
2616 /*
2617  * Get the requested file attributes and place them in the provided
2618  * vattr structure.
2619  *
2620  *	IN:	vp	- vnode of file.
2621  *		vap	- va_mask identifies requested attributes.
2622  *			  If AT_XVATTR set, then optional attrs are requested
2623  *		flags	- ATTR_NOACLCHECK (CIFS server context)
2624  *		cr	- credentials of caller.
2625  *		ct	- caller context
2626  *
2627  *	OUT:	vap	- attribute values.
2628  *
2629  *	RETURN:	0 (always succeeds).
2630  */
2631 /* ARGSUSED */
2632 static int
zfs_getattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr,caller_context_t * ct)2633 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2634     caller_context_t *ct)
2635 {
2636 	znode_t *zp = VTOZ(vp);
2637 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2638 	int	error = 0;
2639 	uint32_t blksize;
2640 	u_longlong_t nblocks;
2641 	uint64_t links;
2642 	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2643 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2644 	xoptattr_t *xoap = NULL;
2645 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2646 	sa_bulk_attr_t bulk[4];
2647 	int count = 0;
2648 
2649 	ZFS_ENTER(zfsvfs);
2650 	ZFS_VERIFY_ZP(zp);
2651 
2652 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2653 
2654 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2655 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2656 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2657 	if (vp->v_type == VBLK || vp->v_type == VCHR)
2658 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2659 		    &rdev, 8);
2660 
2661 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2662 		ZFS_EXIT(zfsvfs);
2663 		return (error);
2664 	}
2665 
2666 	/*
2667 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2668 	 * Also, if we are the owner don't bother, since owner should
2669 	 * always be allowed to read basic attributes of file.
2670 	 */
2671 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2672 	    (vap->va_uid != crgetuid(cr))) {
2673 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2674 		    skipaclchk, cr)) {
2675 			ZFS_EXIT(zfsvfs);
2676 			return (error);
2677 		}
2678 	}
2679 
2680 	/*
2681 	 * Return all attributes.  It's cheaper to provide the answer
2682 	 * than to determine whether we were asked the question.
2683 	 */
2684 
2685 	vap->va_type = IFTOVT(zp->z_mode);
2686 	vap->va_mode = zp->z_mode & ~S_IFMT;
2687 #ifdef illumos
2688 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2689 #else
2690 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2691 #endif
2692 	vap->va_nodeid = zp->z_id;
2693 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2694 		links = zp->z_links + 1;
2695 	else
2696 		links = zp->z_links;
2697 	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2698 	vap->va_size = zp->z_size;
2699 #ifdef illumos
2700 	vap->va_rdev = vp->v_rdev;
2701 #else
2702 	if (vp->v_type == VBLK || vp->v_type == VCHR)
2703 		vap->va_rdev = zfs_cmpldev(rdev);
2704 #endif
2705 	vap->va_seq = zp->z_seq;
2706 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2707 	vap->va_filerev = zp->z_seq;
2708 
2709 	/*
2710 	 * Add in any requested optional attributes and the create time.
2711 	 * Also set the corresponding bits in the returned attribute bitmap.
2712 	 */
2713 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2714 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2715 			xoap->xoa_archive =
2716 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2717 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2718 		}
2719 
2720 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2721 			xoap->xoa_readonly =
2722 			    ((zp->z_pflags & ZFS_READONLY) != 0);
2723 			XVA_SET_RTN(xvap, XAT_READONLY);
2724 		}
2725 
2726 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2727 			xoap->xoa_system =
2728 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2729 			XVA_SET_RTN(xvap, XAT_SYSTEM);
2730 		}
2731 
2732 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2733 			xoap->xoa_hidden =
2734 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2735 			XVA_SET_RTN(xvap, XAT_HIDDEN);
2736 		}
2737 
2738 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2739 			xoap->xoa_nounlink =
2740 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2741 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2742 		}
2743 
2744 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2745 			xoap->xoa_immutable =
2746 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2747 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2748 		}
2749 
2750 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2751 			xoap->xoa_appendonly =
2752 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2753 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2754 		}
2755 
2756 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2757 			xoap->xoa_nodump =
2758 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2759 			XVA_SET_RTN(xvap, XAT_NODUMP);
2760 		}
2761 
2762 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2763 			xoap->xoa_opaque =
2764 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2765 			XVA_SET_RTN(xvap, XAT_OPAQUE);
2766 		}
2767 
2768 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2769 			xoap->xoa_av_quarantined =
2770 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2771 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2772 		}
2773 
2774 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2775 			xoap->xoa_av_modified =
2776 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2777 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2778 		}
2779 
2780 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2781 		    vp->v_type == VREG) {
2782 			zfs_sa_get_scanstamp(zp, xvap);
2783 		}
2784 
2785 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2786 			uint64_t times[2];
2787 
2788 			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2789 			    times, sizeof (times));
2790 			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2791 			XVA_SET_RTN(xvap, XAT_CREATETIME);
2792 		}
2793 
2794 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2795 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2796 			XVA_SET_RTN(xvap, XAT_REPARSE);
2797 		}
2798 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2799 			xoap->xoa_generation = zp->z_gen;
2800 			XVA_SET_RTN(xvap, XAT_GEN);
2801 		}
2802 
2803 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2804 			xoap->xoa_offline =
2805 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2806 			XVA_SET_RTN(xvap, XAT_OFFLINE);
2807 		}
2808 
2809 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2810 			xoap->xoa_sparse =
2811 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2812 			XVA_SET_RTN(xvap, XAT_SPARSE);
2813 		}
2814 	}
2815 
2816 	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2817 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2818 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2819 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2820 
2821 
2822 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2823 	vap->va_blksize = blksize;
2824 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2825 
2826 	if (zp->z_blksz == 0) {
2827 		/*
2828 		 * Block size hasn't been set; suggest maximal I/O transfers.
2829 		 */
2830 		vap->va_blksize = zfsvfs->z_max_blksz;
2831 	}
2832 
2833 	ZFS_EXIT(zfsvfs);
2834 	return (0);
2835 }
2836 
2837 /*
2838  * Set the file attributes to the values contained in the
2839  * vattr structure.
2840  *
2841  *	IN:	vp	- vnode of file to be modified.
2842  *		vap	- new attribute values.
2843  *			  If AT_XVATTR set, then optional attrs are being set
2844  *		flags	- ATTR_UTIME set if non-default time values provided.
2845  *			- ATTR_NOACLCHECK (CIFS context only).
2846  *		cr	- credentials of caller.
2847  *		ct	- caller context
2848  *
2849  *	RETURN:	0 on success, error code on failure.
2850  *
2851  * Timestamps:
2852  *	vp - ctime updated, mtime updated if size changed.
2853  */
2854 /* ARGSUSED */
2855 static int
zfs_setattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr,caller_context_t * ct)2856 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2857     caller_context_t *ct)
2858 {
2859 	znode_t		*zp = VTOZ(vp);
2860 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2861 	zilog_t		*zilog;
2862 	dmu_tx_t	*tx;
2863 	vattr_t		oldva;
2864 	xvattr_t	tmpxvattr;
2865 	uint_t		mask = vap->va_mask;
2866 	uint_t		saved_mask = 0;
2867 	uint64_t	saved_mode;
2868 	int		trim_mask = 0;
2869 	uint64_t	new_mode;
2870 	uint64_t	new_uid, new_gid;
2871 	uint64_t	xattr_obj;
2872 	uint64_t	mtime[2], ctime[2];
2873 	znode_t		*attrzp;
2874 	int		need_policy = FALSE;
2875 	int		err, err2;
2876 	zfs_fuid_info_t *fuidp = NULL;
2877 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2878 	xoptattr_t	*xoap;
2879 	zfs_acl_t	*aclp;
2880 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2881 	boolean_t	fuid_dirtied = B_FALSE;
2882 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2883 	int		count = 0, xattr_count = 0;
2884 
2885 	if (mask == 0)
2886 		return (0);
2887 
2888 	if (mask & AT_NOSET)
2889 		return (SET_ERROR(EINVAL));
2890 
2891 	ZFS_ENTER(zfsvfs);
2892 	ZFS_VERIFY_ZP(zp);
2893 
2894 	zilog = zfsvfs->z_log;
2895 
2896 	/*
2897 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2898 	 * that file system is at proper version level
2899 	 */
2900 
2901 	if (zfsvfs->z_use_fuids == B_FALSE &&
2902 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2903 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2904 	    (mask & AT_XVATTR))) {
2905 		ZFS_EXIT(zfsvfs);
2906 		return (SET_ERROR(EINVAL));
2907 	}
2908 
2909 	if (mask & AT_SIZE && vp->v_type == VDIR) {
2910 		ZFS_EXIT(zfsvfs);
2911 		return (SET_ERROR(EISDIR));
2912 	}
2913 
2914 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2915 		ZFS_EXIT(zfsvfs);
2916 		return (SET_ERROR(EINVAL));
2917 	}
2918 
2919 	/*
2920 	 * If this is an xvattr_t, then get a pointer to the structure of
2921 	 * optional attributes.  If this is NULL, then we have a vattr_t.
2922 	 */
2923 	xoap = xva_getxoptattr(xvap);
2924 
2925 	xva_init(&tmpxvattr);
2926 
2927 	/*
2928 	 * Immutable files can only alter immutable bit and atime
2929 	 */
2930 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2931 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2932 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2933 		ZFS_EXIT(zfsvfs);
2934 		return (SET_ERROR(EPERM));
2935 	}
2936 
2937 	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2938 		ZFS_EXIT(zfsvfs);
2939 		return (SET_ERROR(EPERM));
2940 	}
2941 
2942 	/*
2943 	 * Verify timestamps doesn't overflow 32 bits.
2944 	 * ZFS can handle large timestamps, but 32bit syscalls can't
2945 	 * handle times greater than 2039.  This check should be removed
2946 	 * once large timestamps are fully supported.
2947 	 */
2948 	if (mask & (AT_ATIME | AT_MTIME)) {
2949 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2950 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2951 			ZFS_EXIT(zfsvfs);
2952 			return (SET_ERROR(EOVERFLOW));
2953 		}
2954 	}
2955 
2956 	attrzp = NULL;
2957 	aclp = NULL;
2958 
2959 	/* Can this be moved to before the top label? */
2960 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2961 		ZFS_EXIT(zfsvfs);
2962 		return (SET_ERROR(EROFS));
2963 	}
2964 
2965 	/*
2966 	 * First validate permissions
2967 	 */
2968 
2969 	if (mask & AT_SIZE) {
2970 		/*
2971 		 * XXX - Note, we are not providing any open
2972 		 * mode flags here (like FNDELAY), so we may
2973 		 * block if there are locks present... this
2974 		 * should be addressed in openat().
2975 		 */
2976 		/* XXX - would it be OK to generate a log record here? */
2977 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2978 		if (err) {
2979 			ZFS_EXIT(zfsvfs);
2980 			return (err);
2981 		}
2982 	}
2983 
2984 	if (mask & (AT_ATIME|AT_MTIME) ||
2985 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2986 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2987 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2988 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2989 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2990 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2991 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2992 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2993 		    skipaclchk, cr);
2994 	}
2995 
2996 	if (mask & (AT_UID|AT_GID)) {
2997 		int	idmask = (mask & (AT_UID|AT_GID));
2998 		int	take_owner;
2999 		int	take_group;
3000 
3001 		/*
3002 		 * NOTE: even if a new mode is being set,
3003 		 * we may clear S_ISUID/S_ISGID bits.
3004 		 */
3005 
3006 		if (!(mask & AT_MODE))
3007 			vap->va_mode = zp->z_mode;
3008 
3009 		/*
3010 		 * Take ownership or chgrp to group we are a member of
3011 		 */
3012 
3013 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3014 		take_group = (mask & AT_GID) &&
3015 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3016 
3017 		/*
3018 		 * If both AT_UID and AT_GID are set then take_owner and
3019 		 * take_group must both be set in order to allow taking
3020 		 * ownership.
3021 		 *
3022 		 * Otherwise, send the check through secpolicy_vnode_setattr()
3023 		 *
3024 		 */
3025 
3026 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3027 		    ((idmask == AT_UID) && take_owner) ||
3028 		    ((idmask == AT_GID) && take_group)) {
3029 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3030 			    skipaclchk, cr) == 0) {
3031 				/*
3032 				 * Remove setuid/setgid for non-privileged users
3033 				 */
3034 				secpolicy_setid_clear(vap, vp, cr);
3035 				trim_mask = (mask & (AT_UID|AT_GID));
3036 			} else {
3037 				need_policy =  TRUE;
3038 			}
3039 		} else {
3040 			need_policy =  TRUE;
3041 		}
3042 	}
3043 
3044 	oldva.va_mode = zp->z_mode;
3045 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3046 	if (mask & AT_XVATTR) {
3047 		/*
3048 		 * Update xvattr mask to include only those attributes
3049 		 * that are actually changing.
3050 		 *
3051 		 * the bits will be restored prior to actually setting
3052 		 * the attributes so the caller thinks they were set.
3053 		 */
3054 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3055 			if (xoap->xoa_appendonly !=
3056 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3057 				need_policy = TRUE;
3058 			} else {
3059 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3060 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3061 			}
3062 		}
3063 
3064 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3065 			if (xoap->xoa_nounlink !=
3066 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3067 				need_policy = TRUE;
3068 			} else {
3069 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3070 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3071 			}
3072 		}
3073 
3074 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3075 			if (xoap->xoa_immutable !=
3076 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3077 				need_policy = TRUE;
3078 			} else {
3079 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3080 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3081 			}
3082 		}
3083 
3084 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3085 			if (xoap->xoa_nodump !=
3086 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3087 				need_policy = TRUE;
3088 			} else {
3089 				XVA_CLR_REQ(xvap, XAT_NODUMP);
3090 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3091 			}
3092 		}
3093 
3094 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3095 			if (xoap->xoa_av_modified !=
3096 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3097 				need_policy = TRUE;
3098 			} else {
3099 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3100 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3101 			}
3102 		}
3103 
3104 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3105 			if ((vp->v_type != VREG &&
3106 			    xoap->xoa_av_quarantined) ||
3107 			    xoap->xoa_av_quarantined !=
3108 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3109 				need_policy = TRUE;
3110 			} else {
3111 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3112 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3113 			}
3114 		}
3115 
3116 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3117 			ZFS_EXIT(zfsvfs);
3118 			return (SET_ERROR(EPERM));
3119 		}
3120 
3121 		if (need_policy == FALSE &&
3122 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3123 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3124 			need_policy = TRUE;
3125 		}
3126 	}
3127 
3128 	if (mask & AT_MODE) {
3129 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3130 			err = secpolicy_setid_setsticky_clear(vp, vap,
3131 			    &oldva, cr);
3132 			if (err) {
3133 				ZFS_EXIT(zfsvfs);
3134 				return (err);
3135 			}
3136 			trim_mask |= AT_MODE;
3137 		} else {
3138 			need_policy = TRUE;
3139 		}
3140 	}
3141 
3142 	if (need_policy) {
3143 		/*
3144 		 * If trim_mask is set then take ownership
3145 		 * has been granted or write_acl is present and user
3146 		 * has the ability to modify mode.  In that case remove
3147 		 * UID|GID and or MODE from mask so that
3148 		 * secpolicy_vnode_setattr() doesn't revoke it.
3149 		 */
3150 
3151 		if (trim_mask) {
3152 			saved_mask = vap->va_mask;
3153 			vap->va_mask &= ~trim_mask;
3154 			if (trim_mask & AT_MODE) {
3155 				/*
3156 				 * Save the mode, as secpolicy_vnode_setattr()
3157 				 * will overwrite it with ova.va_mode.
3158 				 */
3159 				saved_mode = vap->va_mode;
3160 			}
3161 		}
3162 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3163 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3164 		if (err) {
3165 			ZFS_EXIT(zfsvfs);
3166 			return (err);
3167 		}
3168 
3169 		if (trim_mask) {
3170 			vap->va_mask |= saved_mask;
3171 			if (trim_mask & AT_MODE) {
3172 				/*
3173 				 * Recover the mode after
3174 				 * secpolicy_vnode_setattr().
3175 				 */
3176 				vap->va_mode = saved_mode;
3177 			}
3178 		}
3179 	}
3180 
3181 	/*
3182 	 * secpolicy_vnode_setattr, or take ownership may have
3183 	 * changed va_mask
3184 	 */
3185 	mask = vap->va_mask;
3186 
3187 	if ((mask & (AT_UID | AT_GID))) {
3188 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3189 		    &xattr_obj, sizeof (xattr_obj));
3190 
3191 		if (err == 0 && xattr_obj) {
3192 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3193 			if (err == 0) {
3194 				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3195 				if (err != 0)
3196 					vrele(ZTOV(attrzp));
3197 			}
3198 			if (err)
3199 				goto out2;
3200 		}
3201 		if (mask & AT_UID) {
3202 			new_uid = zfs_fuid_create(zfsvfs,
3203 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3204 			if (new_uid != zp->z_uid &&
3205 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3206 				if (attrzp)
3207 					vput(ZTOV(attrzp));
3208 				err = SET_ERROR(EDQUOT);
3209 				goto out2;
3210 			}
3211 		}
3212 
3213 		if (mask & AT_GID) {
3214 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3215 			    cr, ZFS_GROUP, &fuidp);
3216 			if (new_gid != zp->z_gid &&
3217 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3218 				if (attrzp)
3219 					vput(ZTOV(attrzp));
3220 				err = SET_ERROR(EDQUOT);
3221 				goto out2;
3222 			}
3223 		}
3224 	}
3225 	tx = dmu_tx_create(zfsvfs->z_os);
3226 
3227 	if (mask & AT_MODE) {
3228 		uint64_t pmode = zp->z_mode;
3229 		uint64_t acl_obj;
3230 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3231 
3232 		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3233 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3234 			err = SET_ERROR(EPERM);
3235 			goto out;
3236 		}
3237 
3238 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3239 			goto out;
3240 
3241 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3242 			/*
3243 			 * Are we upgrading ACL from old V0 format
3244 			 * to V1 format?
3245 			 */
3246 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3247 			    zfs_znode_acl_version(zp) ==
3248 			    ZFS_ACL_VERSION_INITIAL) {
3249 				dmu_tx_hold_free(tx, acl_obj, 0,
3250 				    DMU_OBJECT_END);
3251 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3252 				    0, aclp->z_acl_bytes);
3253 			} else {
3254 				dmu_tx_hold_write(tx, acl_obj, 0,
3255 				    aclp->z_acl_bytes);
3256 			}
3257 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3258 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3259 			    0, aclp->z_acl_bytes);
3260 		}
3261 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3262 	} else {
3263 		if ((mask & AT_XVATTR) &&
3264 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3265 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3266 		else
3267 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3268 	}
3269 
3270 	if (attrzp) {
3271 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3272 	}
3273 
3274 	fuid_dirtied = zfsvfs->z_fuid_dirty;
3275 	if (fuid_dirtied)
3276 		zfs_fuid_txhold(zfsvfs, tx);
3277 
3278 	zfs_sa_upgrade_txholds(tx, zp);
3279 
3280 	err = dmu_tx_assign(tx, TXG_WAIT);
3281 	if (err)
3282 		goto out;
3283 
3284 	count = 0;
3285 	/*
3286 	 * Set each attribute requested.
3287 	 * We group settings according to the locks they need to acquire.
3288 	 *
3289 	 * Note: you cannot set ctime directly, although it will be
3290 	 * updated as a side-effect of calling this function.
3291 	 */
3292 
3293 	if (mask & (AT_UID|AT_GID|AT_MODE))
3294 		mutex_enter(&zp->z_acl_lock);
3295 
3296 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3297 	    &zp->z_pflags, sizeof (zp->z_pflags));
3298 
3299 	if (attrzp) {
3300 		if (mask & (AT_UID|AT_GID|AT_MODE))
3301 			mutex_enter(&attrzp->z_acl_lock);
3302 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3303 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3304 		    sizeof (attrzp->z_pflags));
3305 	}
3306 
3307 	if (mask & (AT_UID|AT_GID)) {
3308 
3309 		if (mask & AT_UID) {
3310 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3311 			    &new_uid, sizeof (new_uid));
3312 			zp->z_uid = new_uid;
3313 			if (attrzp) {
3314 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3315 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3316 				    sizeof (new_uid));
3317 				attrzp->z_uid = new_uid;
3318 			}
3319 		}
3320 
3321 		if (mask & AT_GID) {
3322 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3323 			    NULL, &new_gid, sizeof (new_gid));
3324 			zp->z_gid = new_gid;
3325 			if (attrzp) {
3326 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3327 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3328 				    sizeof (new_gid));
3329 				attrzp->z_gid = new_gid;
3330 			}
3331 		}
3332 		if (!(mask & AT_MODE)) {
3333 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3334 			    NULL, &new_mode, sizeof (new_mode));
3335 			new_mode = zp->z_mode;
3336 		}
3337 		err = zfs_acl_chown_setattr(zp);
3338 		ASSERT(err == 0);
3339 		if (attrzp) {
3340 			err = zfs_acl_chown_setattr(attrzp);
3341 			ASSERT(err == 0);
3342 		}
3343 	}
3344 
3345 	if (mask & AT_MODE) {
3346 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3347 		    &new_mode, sizeof (new_mode));
3348 		zp->z_mode = new_mode;
3349 		ASSERT3U((uintptr_t)aclp, !=, 0);
3350 		err = zfs_aclset_common(zp, aclp, cr, tx);
3351 		ASSERT0(err);
3352 		if (zp->z_acl_cached)
3353 			zfs_acl_free(zp->z_acl_cached);
3354 		zp->z_acl_cached = aclp;
3355 		aclp = NULL;
3356 	}
3357 
3358 
3359 	if (mask & AT_ATIME) {
3360 		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3361 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3362 		    &zp->z_atime, sizeof (zp->z_atime));
3363 	}
3364 
3365 	if (mask & AT_MTIME) {
3366 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3367 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3368 		    mtime, sizeof (mtime));
3369 	}
3370 
3371 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3372 	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3373 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3374 		    NULL, mtime, sizeof (mtime));
3375 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3376 		    &ctime, sizeof (ctime));
3377 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3378 		    B_TRUE);
3379 	} else if (mask != 0) {
3380 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3381 		    &ctime, sizeof (ctime));
3382 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3383 		    B_TRUE);
3384 		if (attrzp) {
3385 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3386 			    SA_ZPL_CTIME(zfsvfs), NULL,
3387 			    &ctime, sizeof (ctime));
3388 			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3389 			    mtime, ctime, B_TRUE);
3390 		}
3391 	}
3392 	/*
3393 	 * Do this after setting timestamps to prevent timestamp
3394 	 * update from toggling bit
3395 	 */
3396 
3397 	if (xoap && (mask & AT_XVATTR)) {
3398 
3399 		/*
3400 		 * restore trimmed off masks
3401 		 * so that return masks can be set for caller.
3402 		 */
3403 
3404 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3405 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3406 		}
3407 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3408 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3409 		}
3410 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3411 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3412 		}
3413 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3414 			XVA_SET_REQ(xvap, XAT_NODUMP);
3415 		}
3416 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3417 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3418 		}
3419 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3420 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3421 		}
3422 
3423 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3424 			ASSERT(vp->v_type == VREG);
3425 
3426 		zfs_xvattr_set(zp, xvap, tx);
3427 	}
3428 
3429 	if (fuid_dirtied)
3430 		zfs_fuid_sync(zfsvfs, tx);
3431 
3432 	if (mask != 0)
3433 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3434 
3435 	if (mask & (AT_UID|AT_GID|AT_MODE))
3436 		mutex_exit(&zp->z_acl_lock);
3437 
3438 	if (attrzp) {
3439 		if (mask & (AT_UID|AT_GID|AT_MODE))
3440 			mutex_exit(&attrzp->z_acl_lock);
3441 	}
3442 out:
3443 	if (err == 0 && attrzp) {
3444 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3445 		    xattr_count, tx);
3446 		ASSERT(err2 == 0);
3447 	}
3448 
3449 	if (attrzp)
3450 		vput(ZTOV(attrzp));
3451 
3452 	if (aclp)
3453 		zfs_acl_free(aclp);
3454 
3455 	if (fuidp) {
3456 		zfs_fuid_info_free(fuidp);
3457 		fuidp = NULL;
3458 	}
3459 
3460 	if (err) {
3461 		dmu_tx_abort(tx);
3462 	} else {
3463 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3464 		dmu_tx_commit(tx);
3465 	}
3466 
3467 out2:
3468 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3469 		zil_commit(zilog, 0);
3470 
3471 	ZFS_EXIT(zfsvfs);
3472 	return (err);
3473 }
3474 
/*
 * Lock the four vnodes involved in a rename (source directory, source
 * entry, target directory and, if present, target entry), re-resolving the
 * source and target names while doing so.
 *
 * The function starts by dropping the locks on tdvp and on *tvpp (when
 * *tvpp is non-NULL and distinct from tdvp) and then takes the locks anew,
 * beginning with sdvp.
 *
 * We acquire all but the sdvp lock using non-blocking acquisitions.  If we
 * fail to acquire any lock in the path we will drop all held locks,
 * acquire the new lock in a blocking fashion, and then release it and
 * restart the rename.  This acquire/release step ensures that we do not
 * spin on a lock waiting for release.  On error release all vnode locks
 * and decrement references the way tmpfs_rename() would do.
 *
 *	IN:	sdvp	- source directory vnode.
 *		tdvp	- target directory vnode.
 *		scnp	- component name of the source entry.
 *		tcnp	- component name of the target entry.
 *
 *	IN/OUT:	svpp	- source vnode; on success replaced by the vnode the
 *			  source name currently resolves to (the reference on
 *			  the previous vnode is released).
 *		tvpp	- target vnode or NULL; on success replaced by the
 *			  vnode the target name currently resolves to, or
 *			  set to NULL if the target does not exist.
 *
 *	RETURN:	0 with sdvp, tdvp, *svpp and *tvpp (if not NULL) locked;
 *		otherwise an error code, with every lock taken here dropped
 *		again on the explicit error paths.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/*
	 * Give up the target-side locks so that locking can be redone
	 * from scratch, starting with the source directory.
	 */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* The source directory is the only lock we block on up front. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * tdvp is busy: wait for it with nothing else held, then
		 * drop it again and retry the whole sequence.
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 *
	 * NOTE(review): if ZFS_ENTER() can return directly, as the comment
	 * below says ZFS_VERIFY_ZP() can, that return would leave sdvp and
	 * tdvp locked -- confirm against the macro definition.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* A failed lookup of "." or ".." is reported as EINVAL. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* Drop the reference obtained by the source lookup above. */
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		/* Back out: unlock both directories, drop the tvp reference. */
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		/* Wait for the busy source vnode with nothing else held. */
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		/* Hand the freshly looked-up source vnode to the caller. */
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Source vnode is locked; it replaces the caller's previous one. */
	vrele(*svpp);
	*svpp = nvp;

	/* The caller's old target reference is dropped in every case. */
	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			/* Back out all three locks taken so far. */
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			/* Wait for the busy target, then release and retry. */
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	return (0);

out:
	return (error);
}
3652 
3653 /*
3654  * Note that we must use VRELE_ASYNC in this function as it walks
3655  * up the directory tree and vrele may need to acquire an exclusive
3656  * lock if a last reference to a vnode is dropped.
3657  */
3658 static int
zfs_rename_check(znode_t * szp,znode_t * sdzp,znode_t * tdzp)3659 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3660 {
3661 	zfsvfs_t	*zfsvfs;
3662 	znode_t		*zp, *zp1;
3663 	uint64_t	parent;
3664 	int		error;
3665 
3666 	zfsvfs = tdzp->z_zfsvfs;
3667 	if (tdzp == szp)
3668 		return (SET_ERROR(EINVAL));
3669 	if (tdzp == sdzp)
3670 		return (0);
3671 	if (tdzp->z_id == zfsvfs->z_root)
3672 		return (0);
3673 	zp = tdzp;
3674 	for (;;) {
3675 		ASSERT(!zp->z_unlinked);
3676 		if ((error = sa_lookup(zp->z_sa_hdl,
3677 		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3678 			break;
3679 
3680 		if (parent == szp->z_id) {
3681 			error = SET_ERROR(EINVAL);
3682 			break;
3683 		}
3684 		if (parent == zfsvfs->z_root)
3685 			break;
3686 		if (parent == sdzp->z_id)
3687 			break;
3688 
3689 		error = zfs_zget(zfsvfs, parent, &zp1);
3690 		if (error != 0)
3691 			break;
3692 
3693 		if (zp != tdzp)
3694 			VN_RELE_ASYNC(ZTOV(zp),
3695 			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3696 		zp = zp1;
3697 	}
3698 
3699 	if (error == ENOTDIR)
3700 		panic("checkpath: .. not a directory\n");
3701 	if (zp != tdzp)
3702 		VN_RELE_ASYNC(ZTOV(zp),
3703 		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3704 	return (error);
3705 }
3706 
3707 /*
3708  * Move an entry from the provided source directory to the target
3709  * directory.  Change the entry name as indicated.
3710  *
3711  *	IN:	sdvp	- Source directory containing the "old entry".
3712  *		snm	- Old entry name.
3713  *		tdvp	- Target directory to contain the "new entry".
3714  *		tnm	- New entry name.
3715  *		cr	- credentials of caller.
3716  *		ct	- caller context
3717  *		flags	- case flags
3718  *
3719  *	RETURN:	0 on success, error code on failure.
3720  *
3721  * Timestamps:
3722  *	sdvp,tdvp - ctime|mtime updated
3723  */
3724 /*ARGSUSED*/
3725 static int
zfs_rename(vnode_t * sdvp,vnode_t ** svpp,struct componentname * scnp,vnode_t * tdvp,vnode_t ** tvpp,struct componentname * tcnp,cred_t * cr)3726 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3727     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3728     cred_t *cr)
3729 {
3730 	zfsvfs_t	*zfsvfs;
3731 	znode_t		*sdzp, *tdzp, *szp, *tzp;
3732 	zilog_t		*zilog = NULL;
3733 	dmu_tx_t	*tx;
3734 	char		*snm = scnp->cn_nameptr;
3735 	char		*tnm = tcnp->cn_nameptr;
3736 	int		error = 0;
3737 
3738 	/* Reject renames across filesystems. */
3739 	if ((*svpp)->v_mount != tdvp->v_mount ||
3740 	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3741 		error = SET_ERROR(EXDEV);
3742 		goto out;
3743 	}
3744 
3745 	if (zfsctl_is_node(tdvp)) {
3746 		error = SET_ERROR(EXDEV);
3747 		goto out;
3748 	}
3749 
3750 	/*
3751 	 * Lock all four vnodes to ensure safety and semantics of renaming.
3752 	 */
3753 	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3754 	if (error != 0) {
3755 		/* no vnodes are locked in the case of error here */
3756 		return (error);
3757 	}
3758 
3759 	tdzp = VTOZ(tdvp);
3760 	sdzp = VTOZ(sdvp);
3761 	zfsvfs = tdzp->z_zfsvfs;
3762 	zilog = zfsvfs->z_log;
3763 
3764 	/*
3765 	 * After we re-enter ZFS_ENTER() we will have to revalidate all
3766 	 * znodes involved.
3767 	 */
3768 	ZFS_ENTER(zfsvfs);
3769 
3770 	if (zfsvfs->z_utf8 && u8_validate(tnm,
3771 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3772 		error = SET_ERROR(EILSEQ);
3773 		goto unlockout;
3774 	}
3775 
3776 	/* If source and target are the same file, there is nothing to do. */
3777 	if ((*svpp) == (*tvpp)) {
3778 		error = 0;
3779 		goto unlockout;
3780 	}
3781 
3782 	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3783 	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3784 	    (*tvpp)->v_mountedhere != NULL)) {
3785 		error = SET_ERROR(EXDEV);
3786 		goto unlockout;
3787 	}
3788 
3789 	/*
3790 	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3791 	 * bypassing the cleanup code in the case of an error.
3792 	 */
3793 	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3794 		error = SET_ERROR(EIO);
3795 		goto unlockout;
3796 	}
3797 
3798 	szp = VTOZ(*svpp);
3799 	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3800 	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3801 		error = SET_ERROR(EIO);
3802 		goto unlockout;
3803 	}
3804 
3805 	/*
3806 	 * This is to prevent the creation of links into attribute space
3807 	 * by renaming a linked file into/outof an attribute directory.
3808 	 * See the comment in zfs_link() for why this is considered bad.
3809 	 */
3810 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3811 		error = SET_ERROR(EINVAL);
3812 		goto unlockout;
3813 	}
3814 
3815 	/*
3816 	 * Must have write access at the source to remove the old entry
3817 	 * and write access at the target to create the new entry.
3818 	 * Note that if target and source are the same, this can be
3819 	 * done in a single check.
3820 	 */
3821 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3822 		goto unlockout;
3823 
3824 	if ((*svpp)->v_type == VDIR) {
3825 		/*
3826 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
3827 		 */
3828 		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3829 		    sdzp == szp ||
3830 		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3831 			error = EINVAL;
3832 			goto unlockout;
3833 		}
3834 
3835 		/*
3836 		 * Check to make sure rename is valid.
3837 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3838 		 */
3839 		if (error = zfs_rename_check(szp, sdzp, tdzp))
3840 			goto unlockout;
3841 	}
3842 
3843 	/*
3844 	 * Does target exist?
3845 	 */
3846 	if (tzp) {
3847 		/*
3848 		 * Source and target must be the same type.
3849 		 */
3850 		if ((*svpp)->v_type == VDIR) {
3851 			if ((*tvpp)->v_type != VDIR) {
3852 				error = SET_ERROR(ENOTDIR);
3853 				goto unlockout;
3854 			} else {
3855 				cache_purge(tdvp);
3856 				if (sdvp != tdvp)
3857 					cache_purge(sdvp);
3858 			}
3859 		} else {
3860 			if ((*tvpp)->v_type == VDIR) {
3861 				error = SET_ERROR(EISDIR);
3862 				goto unlockout;
3863 			}
3864 		}
3865 	}
3866 
3867 	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3868 	if (tzp)
3869 		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3870 
3871 	/*
3872 	 * notify the target directory if it is not the same
3873 	 * as source directory.
3874 	 */
3875 	if (tdvp != sdvp) {
3876 		vnevent_rename_dest_dir(tdvp, ct);
3877 	}
3878 
3879 	tx = dmu_tx_create(zfsvfs->z_os);
3880 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3881 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3882 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3883 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3884 	if (sdzp != tdzp) {
3885 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3886 		zfs_sa_upgrade_txholds(tx, tdzp);
3887 	}
3888 	if (tzp) {
3889 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3890 		zfs_sa_upgrade_txholds(tx, tzp);
3891 	}
3892 
3893 	zfs_sa_upgrade_txholds(tx, szp);
3894 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3895 	error = dmu_tx_assign(tx, TXG_WAIT);
3896 	if (error) {
3897 		dmu_tx_abort(tx);
3898 		goto unlockout;
3899 	}
3900 
3901 
3902 	if (tzp)	/* Attempt to remove the existing target */
3903 		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3904 
3905 	if (error == 0) {
3906 		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3907 		if (error == 0) {
3908 			szp->z_pflags |= ZFS_AV_MODIFIED;
3909 
3910 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3911 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3912 			ASSERT0(error);
3913 
3914 			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3915 			    NULL);
3916 			if (error == 0) {
3917 				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3918 				    snm, tdzp, tnm, szp);
3919 
3920 				/*
3921 				 * Update path information for the target vnode
3922 				 */
3923 				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3924 			} else {
3925 				/*
3926 				 * At this point, we have successfully created
3927 				 * the target name, but have failed to remove
3928 				 * the source name.  Since the create was done
3929 				 * with the ZRENAMING flag, there are
3930 				 * complications; for one, the link count is
3931 				 * wrong.  The easiest way to deal with this
3932 				 * is to remove the newly created target, and
3933 				 * return the original error.  This must
3934 				 * succeed; fortunately, it is very unlikely to
3935 				 * fail, since we just created it.
3936 				 */
3937 				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3938 				    ZRENAMING, NULL), ==, 0);
3939 			}
3940 		}
3941 		if (error == 0) {
3942 			cache_purge(*svpp);
3943 			if (*tvpp != NULL)
3944 				cache_purge(*tvpp);
3945 			cache_purge_negative(tdvp);
3946 		}
3947 	}
3948 
3949 	dmu_tx_commit(tx);
3950 
3951 unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
3952 	ZFS_EXIT(zfsvfs);
3953 	VOP_UNLOCK(*svpp, 0);
3954 	VOP_UNLOCK(sdvp, 0);
3955 
3956 out:				/* original two vnodes are locked */
3957 	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3958 		zil_commit(zilog, 0);
3959 
3960 	if (*tvpp != NULL)
3961 		VOP_UNLOCK(*tvpp, 0);
3962 	if (tdvp != *tvpp)
3963 		VOP_UNLOCK(tdvp, 0);
3964 	return (error);
3965 }
3966 
3967 /*
3968  * Insert the indicated symbolic reference entry into the directory.
3969  *
3970  *	IN:	dvp	- Directory to contain new symbolic link.
3971  *		link	- Name for new symlink entry.
3972  *		vap	- Attributes of new entry.
3973  *		cr	- credentials of caller.
3974  *		ct	- caller context
3975  *		flags	- case flags
3976  *
3977  *	RETURN:	0 on success, error code on failure.
3978  *
3979  * Timestamps:
3980  *	dvp - ctime|mtime updated
3981  */
3982 /*ARGSUSED*/
3983 static int
zfs_symlink(vnode_t * dvp,vnode_t ** vpp,char * name,vattr_t * vap,char * link,cred_t * cr,kthread_t * td)3984 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3985     cred_t *cr, kthread_t *td)
3986 {
3987 	znode_t		*zp, *dzp = VTOZ(dvp);
3988 	dmu_tx_t	*tx;
3989 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3990 	zilog_t		*zilog;
3991 	uint64_t	len = strlen(link);
3992 	int		error;
3993 	zfs_acl_ids_t	acl_ids;
3994 	boolean_t	fuid_dirtied;
3995 	uint64_t	txtype = TX_SYMLINK;
3996 	int		flags = 0;
3997 
3998 	ASSERT(vap->va_type == VLNK);
3999 
4000 	ZFS_ENTER(zfsvfs);
4001 	ZFS_VERIFY_ZP(dzp);
4002 	zilog = zfsvfs->z_log;
4003 
4004 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4005 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4006 		ZFS_EXIT(zfsvfs);
4007 		return (SET_ERROR(EILSEQ));
4008 	}
4009 
4010 	if (len > MAXPATHLEN) {
4011 		ZFS_EXIT(zfsvfs);
4012 		return (SET_ERROR(ENAMETOOLONG));
4013 	}
4014 
4015 	if ((error = zfs_acl_ids_create(dzp, 0,
4016 	    vap, cr, NULL, &acl_ids)) != 0) {
4017 		ZFS_EXIT(zfsvfs);
4018 		return (error);
4019 	}
4020 
4021 	/*
4022 	 * Attempt to lock directory; fail if entry already exists.
4023 	 */
4024 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4025 	if (error) {
4026 		zfs_acl_ids_free(&acl_ids);
4027 		ZFS_EXIT(zfsvfs);
4028 		return (error);
4029 	}
4030 
4031 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4032 		zfs_acl_ids_free(&acl_ids);
4033 		ZFS_EXIT(zfsvfs);
4034 		return (error);
4035 	}
4036 
4037 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4038 		zfs_acl_ids_free(&acl_ids);
4039 		ZFS_EXIT(zfsvfs);
4040 		return (SET_ERROR(EDQUOT));
4041 	}
4042 
4043 	getnewvnode_reserve(1);
4044 	tx = dmu_tx_create(zfsvfs->z_os);
4045 	fuid_dirtied = zfsvfs->z_fuid_dirty;
4046 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4047 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4048 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4049 	    ZFS_SA_BASE_ATTR_SIZE + len);
4050 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4051 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4052 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4053 		    acl_ids.z_aclp->z_acl_bytes);
4054 	}
4055 	if (fuid_dirtied)
4056 		zfs_fuid_txhold(zfsvfs, tx);
4057 	error = dmu_tx_assign(tx, TXG_WAIT);
4058 	if (error) {
4059 		zfs_acl_ids_free(&acl_ids);
4060 		dmu_tx_abort(tx);
4061 		getnewvnode_drop_reserve();
4062 		ZFS_EXIT(zfsvfs);
4063 		return (error);
4064 	}
4065 
4066 	/*
4067 	 * Create a new object for the symlink.
4068 	 * for version 4 ZPL datsets the symlink will be an SA attribute
4069 	 */
4070 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4071 
4072 	if (fuid_dirtied)
4073 		zfs_fuid_sync(zfsvfs, tx);
4074 
4075 	if (zp->z_is_sa)
4076 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4077 		    link, len, tx);
4078 	else
4079 		zfs_sa_symlink(zp, link, len, tx);
4080 
4081 	zp->z_size = len;
4082 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4083 	    &zp->z_size, sizeof (zp->z_size), tx);
4084 	/*
4085 	 * Insert the new object into the directory.
4086 	 */
4087 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4088 
4089 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4090 	*vpp = ZTOV(zp);
4091 
4092 	zfs_acl_ids_free(&acl_ids);
4093 
4094 	dmu_tx_commit(tx);
4095 
4096 	getnewvnode_drop_reserve();
4097 
4098 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4099 		zil_commit(zilog, 0);
4100 
4101 	ZFS_EXIT(zfsvfs);
4102 	return (error);
4103 }
4104 
4105 /*
4106  * Return, in the buffer contained in the provided uio structure,
4107  * the symbolic path referred to by vp.
4108  *
4109  *	IN:	vp	- vnode of symbolic link.
4110  *		uio	- structure to contain the link path.
4111  *		cr	- credentials of caller.
4112  *		ct	- caller context
4113  *
4114  *	OUT:	uio	- structure containing the link path.
4115  *
4116  *	RETURN:	0 on success, error code on failure.
4117  *
4118  * Timestamps:
4119  *	vp - atime updated
4120  */
4121 /* ARGSUSED */
4122 static int
zfs_readlink(vnode_t * vp,uio_t * uio,cred_t * cr,caller_context_t * ct)4123 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4124 {
4125 	znode_t		*zp = VTOZ(vp);
4126 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4127 	int		error;
4128 
4129 	ZFS_ENTER(zfsvfs);
4130 	ZFS_VERIFY_ZP(zp);
4131 
4132 	if (zp->z_is_sa)
4133 		error = sa_lookup_uio(zp->z_sa_hdl,
4134 		    SA_ZPL_SYMLINK(zfsvfs), uio);
4135 	else
4136 		error = zfs_sa_readlink(zp, uio);
4137 
4138 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4139 
4140 	ZFS_EXIT(zfsvfs);
4141 	return (error);
4142 }
4143 
4144 /*
4145  * Insert a new entry into directory tdvp referencing svp.
4146  *
4147  *	IN:	tdvp	- Directory to contain new entry.
4148  *		svp	- vnode of new entry.
4149  *		name	- name of new entry.
4150  *		cr	- credentials of caller.
4151  *		ct	- caller context
4152  *
4153  *	RETURN:	0 on success, error code on failure.
4154  *
4155  * Timestamps:
4156  *	tdvp - ctime|mtime updated
4157  *	 svp - ctime updated
4158  */
4159 /* ARGSUSED */
4160 static int
zfs_link(vnode_t * tdvp,vnode_t * svp,char * name,cred_t * cr,caller_context_t * ct,int flags)4161 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4162     caller_context_t *ct, int flags)
4163 {
4164 	znode_t		*dzp = VTOZ(tdvp);
4165 	znode_t		*tzp, *szp;
4166 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4167 	zilog_t		*zilog;
4168 	dmu_tx_t	*tx;
4169 	int		error;
4170 	uint64_t	parent;
4171 	uid_t		owner;
4172 
4173 	ASSERT(tdvp->v_type == VDIR);
4174 
4175 	ZFS_ENTER(zfsvfs);
4176 	ZFS_VERIFY_ZP(dzp);
4177 	zilog = zfsvfs->z_log;
4178 
4179 	/*
4180 	 * POSIX dictates that we return EPERM here.
4181 	 * Better choices include ENOTSUP or EISDIR.
4182 	 */
4183 	if (svp->v_type == VDIR) {
4184 		ZFS_EXIT(zfsvfs);
4185 		return (SET_ERROR(EPERM));
4186 	}
4187 
4188 	szp = VTOZ(svp);
4189 	ZFS_VERIFY_ZP(szp);
4190 
4191 	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4192 		ZFS_EXIT(zfsvfs);
4193 		return (SET_ERROR(EPERM));
4194 	}
4195 
4196 	/* Prevent links to .zfs/shares files */
4197 
4198 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4199 	    &parent, sizeof (uint64_t))) != 0) {
4200 		ZFS_EXIT(zfsvfs);
4201 		return (error);
4202 	}
4203 	if (parent == zfsvfs->z_shares_dir) {
4204 		ZFS_EXIT(zfsvfs);
4205 		return (SET_ERROR(EPERM));
4206 	}
4207 
4208 	if (zfsvfs->z_utf8 && u8_validate(name,
4209 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4210 		ZFS_EXIT(zfsvfs);
4211 		return (SET_ERROR(EILSEQ));
4212 	}
4213 
4214 	/*
4215 	 * We do not support links between attributes and non-attributes
4216 	 * because of the potential security risk of creating links
4217 	 * into "normal" file space in order to circumvent restrictions
4218 	 * imposed in attribute space.
4219 	 */
4220 	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4221 		ZFS_EXIT(zfsvfs);
4222 		return (SET_ERROR(EINVAL));
4223 	}
4224 
4225 
4226 	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4227 	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4228 		ZFS_EXIT(zfsvfs);
4229 		return (SET_ERROR(EPERM));
4230 	}
4231 
4232 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4233 		ZFS_EXIT(zfsvfs);
4234 		return (error);
4235 	}
4236 
4237 	/*
4238 	 * Attempt to lock directory; fail if entry already exists.
4239 	 */
4240 	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4241 	if (error) {
4242 		ZFS_EXIT(zfsvfs);
4243 		return (error);
4244 	}
4245 
4246 	tx = dmu_tx_create(zfsvfs->z_os);
4247 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4248 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4249 	zfs_sa_upgrade_txholds(tx, szp);
4250 	zfs_sa_upgrade_txholds(tx, dzp);
4251 	error = dmu_tx_assign(tx, TXG_WAIT);
4252 	if (error) {
4253 		dmu_tx_abort(tx);
4254 		ZFS_EXIT(zfsvfs);
4255 		return (error);
4256 	}
4257 
4258 	error = zfs_link_create(dzp, name, szp, tx, 0);
4259 
4260 	if (error == 0) {
4261 		uint64_t txtype = TX_LINK;
4262 		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4263 	}
4264 
4265 	dmu_tx_commit(tx);
4266 
4267 	if (error == 0) {
4268 		vnevent_link(svp, ct);
4269 	}
4270 
4271 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4272 		zil_commit(zilog, 0);
4273 
4274 	ZFS_EXIT(zfsvfs);
4275 	return (error);
4276 }
4277 
4278 
4279 /*ARGSUSED*/
4280 void
zfs_inactive(vnode_t * vp,cred_t * cr,caller_context_t * ct)4281 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4282 {
4283 	znode_t	*zp = VTOZ(vp);
4284 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4285 	int error;
4286 
4287 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4288 	if (zp->z_sa_hdl == NULL) {
4289 		/*
4290 		 * The fs has been unmounted, or we did a
4291 		 * suspend/resume and this file no longer exists.
4292 		 */
4293 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4294 		vrecycle(vp);
4295 		return;
4296 	}
4297 
4298 	if (zp->z_unlinked) {
4299 		/*
4300 		 * Fast path to recycle a vnode of a removed file.
4301 		 */
4302 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4303 		vrecycle(vp);
4304 		return;
4305 	}
4306 
4307 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4308 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4309 
4310 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4311 		zfs_sa_upgrade_txholds(tx, zp);
4312 		error = dmu_tx_assign(tx, TXG_WAIT);
4313 		if (error) {
4314 			dmu_tx_abort(tx);
4315 		} else {
4316 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4317 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4318 			zp->z_atime_dirty = 0;
4319 			dmu_tx_commit(tx);
4320 		}
4321 	}
4322 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4323 }
4324 
4325 
4326 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4327 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4328 
4329 /*ARGSUSED*/
4330 static int
zfs_fid(vnode_t * vp,fid_t * fidp,caller_context_t * ct)4331 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4332 {
4333 	znode_t		*zp = VTOZ(vp);
4334 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4335 	uint32_t	gen;
4336 	uint64_t	gen64;
4337 	uint64_t	object = zp->z_id;
4338 	zfid_short_t	*zfid;
4339 	int		size, i, error;
4340 
4341 	ZFS_ENTER(zfsvfs);
4342 	ZFS_VERIFY_ZP(zp);
4343 
4344 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4345 	    &gen64, sizeof (uint64_t))) != 0) {
4346 		ZFS_EXIT(zfsvfs);
4347 		return (error);
4348 	}
4349 
4350 	gen = (uint32_t)gen64;
4351 
4352 	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4353 
4354 #ifdef illumos
4355 	if (fidp->fid_len < size) {
4356 		fidp->fid_len = size;
4357 		ZFS_EXIT(zfsvfs);
4358 		return (SET_ERROR(ENOSPC));
4359 	}
4360 #else
4361 	fidp->fid_len = size;
4362 #endif
4363 
4364 	zfid = (zfid_short_t *)fidp;
4365 
4366 	zfid->zf_len = size;
4367 
4368 	for (i = 0; i < sizeof (zfid->zf_object); i++)
4369 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4370 
4371 	/* Must have a non-zero generation number to distinguish from .zfs */
4372 	if (gen == 0)
4373 		gen = 1;
4374 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4375 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4376 
4377 	if (size == LONG_FID_LEN) {
4378 		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4379 		zfid_long_t	*zlfid;
4380 
4381 		zlfid = (zfid_long_t *)fidp;
4382 
4383 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4384 			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4385 
4386 		/* XXX - this should be the generation number for the objset */
4387 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4388 			zlfid->zf_setgen[i] = 0;
4389 	}
4390 
4391 	ZFS_EXIT(zfsvfs);
4392 	return (0);
4393 }
4394 
4395 static int
zfs_pathconf(vnode_t * vp,int cmd,ulong_t * valp,cred_t * cr,caller_context_t * ct)4396 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4397     caller_context_t *ct)
4398 {
4399 	znode_t		*zp, *xzp;
4400 	zfsvfs_t	*zfsvfs;
4401 	int		error;
4402 
4403 	switch (cmd) {
4404 	case _PC_LINK_MAX:
4405 		*valp = INT_MAX;
4406 		return (0);
4407 
4408 	case _PC_FILESIZEBITS:
4409 		*valp = 64;
4410 		return (0);
4411 #ifdef illumos
4412 	case _PC_XATTR_EXISTS:
4413 		zp = VTOZ(vp);
4414 		zfsvfs = zp->z_zfsvfs;
4415 		ZFS_ENTER(zfsvfs);
4416 		ZFS_VERIFY_ZP(zp);
4417 		*valp = 0;
4418 		error = zfs_dirent_lookup(zp, "", &xzp,
4419 		    ZXATTR | ZEXISTS | ZSHARED);
4420 		if (error == 0) {
4421 			if (!zfs_dirempty(xzp))
4422 				*valp = 1;
4423 			vrele(ZTOV(xzp));
4424 		} else if (error == ENOENT) {
4425 			/*
4426 			 * If there aren't extended attributes, it's the
4427 			 * same as having zero of them.
4428 			 */
4429 			error = 0;
4430 		}
4431 		ZFS_EXIT(zfsvfs);
4432 		return (error);
4433 
4434 	case _PC_SATTR_ENABLED:
4435 	case _PC_SATTR_EXISTS:
4436 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4437 		    (vp->v_type == VREG || vp->v_type == VDIR);
4438 		return (0);
4439 
4440 	case _PC_ACCESS_FILTERING:
4441 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4442 		    vp->v_type == VDIR;
4443 		return (0);
4444 
4445 	case _PC_ACL_ENABLED:
4446 		*valp = _ACL_ACE_ENABLED;
4447 		return (0);
4448 #endif	/* illumos */
4449 	case _PC_MIN_HOLE_SIZE:
4450 		*valp = (int)SPA_MINBLOCKSIZE;
4451 		return (0);
4452 #ifdef illumos
4453 	case _PC_TIMESTAMP_RESOLUTION:
4454 		/* nanosecond timestamp resolution */
4455 		*valp = 1L;
4456 		return (0);
4457 #endif
4458 	case _PC_ACL_EXTENDED:
4459 		*valp = 0;
4460 		return (0);
4461 
4462 	case _PC_ACL_NFS4:
4463 		*valp = 1;
4464 		return (0);
4465 
4466 	case _PC_ACL_PATH_MAX:
4467 		*valp = ACL_MAX_ENTRIES;
4468 		return (0);
4469 
4470 	default:
4471 		return (EOPNOTSUPP);
4472 	}
4473 }
4474 
4475 /*ARGSUSED*/
4476 static int
zfs_getsecattr(vnode_t * vp,vsecattr_t * vsecp,int flag,cred_t * cr,caller_context_t * ct)4477 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4478     caller_context_t *ct)
4479 {
4480 	znode_t *zp = VTOZ(vp);
4481 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4482 	int error;
4483 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4484 
4485 	ZFS_ENTER(zfsvfs);
4486 	ZFS_VERIFY_ZP(zp);
4487 	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4488 	ZFS_EXIT(zfsvfs);
4489 
4490 	return (error);
4491 }
4492 
4493 /*ARGSUSED*/
4494 int
zfs_setsecattr(vnode_t * vp,vsecattr_t * vsecp,int flag,cred_t * cr,caller_context_t * ct)4495 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4496     caller_context_t *ct)
4497 {
4498 	znode_t *zp = VTOZ(vp);
4499 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4500 	int error;
4501 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4502 	zilog_t	*zilog = zfsvfs->z_log;
4503 
4504 	ZFS_ENTER(zfsvfs);
4505 	ZFS_VERIFY_ZP(zp);
4506 
4507 	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4508 
4509 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4510 		zil_commit(zilog, 0);
4511 
4512 	ZFS_EXIT(zfsvfs);
4513 	return (error);
4514 }
4515 
4516 static int
zfs_getpages(struct vnode * vp,vm_page_t * m,int count,int reqpage)4517 zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
4518 {
4519 	znode_t *zp = VTOZ(vp);
4520 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4521 	objset_t *os = zp->z_zfsvfs->z_os;
4522 	vm_page_t mfirst, mlast, mreq;
4523 	vm_object_t object;
4524 	caddr_t va;
4525 	struct sf_buf *sf;
4526 	off_t startoff, endoff;
4527 	int i, error;
4528 	vm_pindex_t reqstart, reqend;
4529 	int pcount, lsize, reqsize, size;
4530 
4531 	ZFS_ENTER(zfsvfs);
4532 	ZFS_VERIFY_ZP(zp);
4533 
4534 	pcount = OFF_TO_IDX(round_page(count));
4535 	mreq = m[reqpage];
4536 	object = mreq->object;
4537 	error = 0;
4538 
4539 	KASSERT(vp->v_object == object, ("mismatching object"));
4540 
4541 	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
4542 		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
4543 		reqstart = OFF_TO_IDX(round_page(startoff));
4544 		if (reqstart < m[0]->pindex)
4545 			reqstart = 0;
4546 		else
4547 			reqstart = reqstart - m[0]->pindex;
4548 		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
4549 		    zp->z_blksz);
4550 		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
4551 		if (reqend > m[pcount - 1]->pindex)
4552 			reqend = m[pcount - 1]->pindex;
4553 		reqsize = reqend - m[reqstart]->pindex + 1;
4554 		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
4555 		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
4556 	} else {
4557 		reqstart = reqpage;
4558 		reqsize = 1;
4559 	}
4560 	mfirst = m[reqstart];
4561 	mlast = m[reqstart + reqsize - 1];
4562 
4563 	zfs_vmobject_wlock(object);
4564 
4565 	for (i = 0; i < reqstart; i++) {
4566 		vm_page_lock(m[i]);
4567 		vm_page_free(m[i]);
4568 		vm_page_unlock(m[i]);
4569 	}
4570 	for (i = reqstart + reqsize; i < pcount; i++) {
4571 		vm_page_lock(m[i]);
4572 		vm_page_free(m[i]);
4573 		vm_page_unlock(m[i]);
4574 	}
4575 
4576 	if (mreq->valid && reqsize == 1) {
4577 		if (mreq->valid != VM_PAGE_BITS_ALL)
4578 			vm_page_zero_invalid(mreq, TRUE);
4579 		zfs_vmobject_wunlock(object);
4580 		ZFS_EXIT(zfsvfs);
4581 		return (zfs_vm_pagerret_ok);
4582 	}
4583 
4584 	PCPU_INC(cnt.v_vnodein);
4585 	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
4586 
4587 	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
4588 		for (i = reqstart; i < reqstart + reqsize; i++) {
4589 			if (i != reqpage) {
4590 				vm_page_lock(m[i]);
4591 				vm_page_free(m[i]);
4592 				vm_page_unlock(m[i]);
4593 			}
4594 		}
4595 		zfs_vmobject_wunlock(object);
4596 		ZFS_EXIT(zfsvfs);
4597 		return (zfs_vm_pagerret_bad);
4598 	}
4599 
4600 	lsize = PAGE_SIZE;
4601 	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
4602 		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
4603 
4604 	zfs_vmobject_wunlock(object);
4605 
4606 	for (i = reqstart; i < reqstart + reqsize; i++) {
4607 		size = PAGE_SIZE;
4608 		if (i == (reqstart + reqsize - 1))
4609 			size = lsize;
4610 		va = zfs_map_page(m[i], &sf);
4611 		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
4612 		    size, va, DMU_READ_PREFETCH);
4613 		if (size != PAGE_SIZE)
4614 			bzero(va + size, PAGE_SIZE - size);
4615 		zfs_unmap_page(sf);
4616 		if (error != 0)
4617 			break;
4618 	}
4619 
4620 	zfs_vmobject_wlock(object);
4621 
4622 	for (i = reqstart; i < reqstart + reqsize; i++) {
4623 		if (!error)
4624 			m[i]->valid = VM_PAGE_BITS_ALL;
4625 		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
4626 		if (i != reqpage)
4627 			vm_page_readahead_finish(m[i]);
4628 	}
4629 
4630 	zfs_vmobject_wunlock(object);
4631 
4632 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4633 	ZFS_EXIT(zfsvfs);
4634 	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
4635 }
4636 
4637 static int
zfs_freebsd_getpages(ap)4638 zfs_freebsd_getpages(ap)
4639 	struct vop_getpages_args /* {
4640 		struct vnode *a_vp;
4641 		vm_page_t *a_m;
4642 		int a_count;
4643 		int a_reqpage;
4644 		vm_ooffset_t a_offset;
4645 	} */ *ap;
4646 {
4647 
4648 	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
4649 }
4650 
4651 static int
zfs_putpages(struct vnode * vp,vm_page_t * ma,size_t len,int flags,int * rtvals)4652 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4653     int *rtvals)
4654 {
4655 	znode_t		*zp = VTOZ(vp);
4656 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4657 	rl_t		*rl;
4658 	dmu_tx_t	*tx;
4659 	struct sf_buf	*sf;
4660 	vm_object_t	object;
4661 	vm_page_t	m;
4662 	caddr_t		va;
4663 	size_t		tocopy;
4664 	size_t		lo_len;
4665 	vm_ooffset_t	lo_off;
4666 	vm_ooffset_t	off;
4667 	uint_t		blksz;
4668 	int		ncount;
4669 	int		pcount;
4670 	int		err;
4671 	int		i;
4672 
4673 	ZFS_ENTER(zfsvfs);
4674 	ZFS_VERIFY_ZP(zp);
4675 
4676 	object = vp->v_object;
4677 	pcount = btoc(len);
4678 	ncount = pcount;
4679 
4680 	KASSERT(ma[0]->object == object, ("mismatching object"));
4681 	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4682 
4683 	for (i = 0; i < pcount; i++)
4684 		rtvals[i] = zfs_vm_pagerret_error;
4685 
4686 	off = IDX_TO_OFF(ma[0]->pindex);
4687 	blksz = zp->z_blksz;
4688 	lo_off = rounddown(off, blksz);
4689 	lo_len = roundup(len + (off - lo_off), blksz);
4690 	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4691 
4692 	zfs_vmobject_wlock(object);
4693 	if (len + off > object->un_pager.vnp.vnp_size) {
4694 		if (object->un_pager.vnp.vnp_size > off) {
4695 			int pgoff;
4696 
4697 			len = object->un_pager.vnp.vnp_size - off;
4698 			ncount = btoc(len);
4699 			if ((pgoff = (int)len & PAGE_MASK) != 0) {
4700 				/*
4701 				 * If the object is locked and the following
4702 				 * conditions hold, then the page's dirty
4703 				 * field cannot be concurrently changed by a
4704 				 * pmap operation.
4705 				 */
4706 				m = ma[ncount - 1];
4707 				vm_page_assert_sbusied(m);
4708 				KASSERT(!pmap_page_is_write_mapped(m),
4709 				    ("zfs_putpages: page %p is not read-only", m));
4710 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4711 				    pgoff);
4712 			}
4713 		} else {
4714 			len = 0;
4715 			ncount = 0;
4716 		}
4717 		if (ncount < pcount) {
4718 			for (i = ncount; i < pcount; i++) {
4719 				rtvals[i] = zfs_vm_pagerret_bad;
4720 			}
4721 		}
4722 	}
4723 	zfs_vmobject_wunlock(object);
4724 
4725 	if (ncount == 0)
4726 		goto out;
4727 
4728 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4729 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4730 		goto out;
4731 	}
4732 
4733 	tx = dmu_tx_create(zfsvfs->z_os);
4734 	dmu_tx_hold_write(tx, zp->z_id, off, len);
4735 
4736 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4737 	zfs_sa_upgrade_txholds(tx, zp);
4738 	err = dmu_tx_assign(tx, TXG_WAIT);
4739 	if (err != 0) {
4740 		dmu_tx_abort(tx);
4741 		goto out;
4742 	}
4743 
4744 	if (zp->z_blksz < PAGE_SIZE) {
4745 		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4746 			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4747 			va = zfs_map_page(ma[i], &sf);
4748 			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4749 			zfs_unmap_page(sf);
4750 		}
4751 	} else {
4752 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4753 	}
4754 
4755 	if (err == 0) {
4756 		uint64_t mtime[2], ctime[2];
4757 		sa_bulk_attr_t bulk[3];
4758 		int count = 0;
4759 
4760 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4761 		    &mtime, 16);
4762 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4763 		    &ctime, 16);
4764 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4765 		    &zp->z_pflags, 8);
4766 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4767 		    B_TRUE);
4768 		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4769 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4770 
4771 		zfs_vmobject_wlock(object);
4772 		for (i = 0; i < ncount; i++) {
4773 			rtvals[i] = zfs_vm_pagerret_ok;
4774 			vm_page_undirty(ma[i]);
4775 		}
4776 		zfs_vmobject_wunlock(object);
4777 		PCPU_INC(cnt.v_vnodeout);
4778 		PCPU_ADD(cnt.v_vnodepgsout, ncount);
4779 	}
4780 	dmu_tx_commit(tx);
4781 
4782 out:
4783 	zfs_range_unlock(rl);
4784 	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4785 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4786 		zil_commit(zfsvfs->z_log, zp->z_id);
4787 	ZFS_EXIT(zfsvfs);
4788 	return (rtvals[0]);
4789 }
4790 
4791 int
zfs_freebsd_putpages(ap)4792 zfs_freebsd_putpages(ap)
4793 	struct vop_putpages_args /* {
4794 		struct vnode *a_vp;
4795 		vm_page_t *a_m;
4796 		int a_count;
4797 		int a_sync;
4798 		int *a_rtvals;
4799 		vm_ooffset_t a_offset;
4800 	} */ *ap;
4801 {
4802 
4803 	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4804 	    ap->a_rtvals));
4805 }
4806 
4807 static int
zfs_freebsd_bmap(ap)4808 zfs_freebsd_bmap(ap)
4809 	struct vop_bmap_args /* {
4810 		struct vnode *a_vp;
4811 		daddr_t  a_bn;
4812 		struct bufobj **a_bop;
4813 		daddr_t *a_bnp;
4814 		int *a_runp;
4815 		int *a_runb;
4816 	} */ *ap;
4817 {
4818 
4819 	if (ap->a_bop != NULL)
4820 		*ap->a_bop = &ap->a_vp->v_bufobj;
4821 	if (ap->a_bnp != NULL)
4822 		*ap->a_bnp = ap->a_bn;
4823 	if (ap->a_runp != NULL)
4824 		*ap->a_runp = 0;
4825 	if (ap->a_runb != NULL)
4826 		*ap->a_runb = 0;
4827 
4828 	return (0);
4829 }
4830 
4831 static int
zfs_freebsd_open(ap)4832 zfs_freebsd_open(ap)
4833 	struct vop_open_args /* {
4834 		struct vnode *a_vp;
4835 		int a_mode;
4836 		struct ucred *a_cred;
4837 		struct thread *a_td;
4838 	} */ *ap;
4839 {
4840 	vnode_t	*vp = ap->a_vp;
4841 	znode_t *zp = VTOZ(vp);
4842 	int error;
4843 
4844 	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4845 	if (error == 0)
4846 		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4847 	return (error);
4848 }
4849 
4850 static int
zfs_freebsd_close(ap)4851 zfs_freebsd_close(ap)
4852 	struct vop_close_args /* {
4853 		struct vnode *a_vp;
4854 		int  a_fflag;
4855 		struct ucred *a_cred;
4856 		struct thread *a_td;
4857 	} */ *ap;
4858 {
4859 
4860 	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4861 }
4862 
4863 static int
zfs_freebsd_ioctl(ap)4864 zfs_freebsd_ioctl(ap)
4865 	struct vop_ioctl_args /* {
4866 		struct vnode *a_vp;
4867 		u_long a_command;
4868 		caddr_t a_data;
4869 		int a_fflag;
4870 		struct ucred *cred;
4871 		struct thread *td;
4872 	} */ *ap;
4873 {
4874 
4875 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4876 	    ap->a_fflag, ap->a_cred, NULL, NULL));
4877 }
4878 
4879 static int
ioflags(int ioflags)4880 ioflags(int ioflags)
4881 {
4882 	int flags = 0;
4883 
4884 	if (ioflags & IO_APPEND)
4885 		flags |= FAPPEND;
4886 	if (ioflags & IO_NDELAY)
4887 		flags |= FNONBLOCK;
4888 	if (ioflags & IO_SYNC)
4889 		flags |= (FSYNC | FDSYNC | FRSYNC);
4890 
4891 	return (flags);
4892 }
4893 
4894 static int
zfs_freebsd_read(ap)4895 zfs_freebsd_read(ap)
4896 	struct vop_read_args /* {
4897 		struct vnode *a_vp;
4898 		struct uio *a_uio;
4899 		int a_ioflag;
4900 		struct ucred *a_cred;
4901 	} */ *ap;
4902 {
4903 
4904 	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4905 	    ap->a_cred, NULL));
4906 }
4907 
4908 static int
zfs_freebsd_write(ap)4909 zfs_freebsd_write(ap)
4910 	struct vop_write_args /* {
4911 		struct vnode *a_vp;
4912 		struct uio *a_uio;
4913 		int a_ioflag;
4914 		struct ucred *a_cred;
4915 	} */ *ap;
4916 {
4917 
4918 	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4919 	    ap->a_cred, NULL));
4920 }
4921 
4922 static int
zfs_freebsd_access(ap)4923 zfs_freebsd_access(ap)
4924 	struct vop_access_args /* {
4925 		struct vnode *a_vp;
4926 		accmode_t a_accmode;
4927 		struct ucred *a_cred;
4928 		struct thread *a_td;
4929 	} */ *ap;
4930 {
4931 	vnode_t *vp = ap->a_vp;
4932 	znode_t *zp = VTOZ(vp);
4933 	accmode_t accmode;
4934 	int error = 0;
4935 
4936 	/*
4937 	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4938 	 */
4939 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4940 	if (accmode != 0)
4941 		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4942 
4943 	/*
4944 	 * VADMIN has to be handled by vaccess().
4945 	 */
4946 	if (error == 0) {
4947 		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4948 		if (accmode != 0) {
4949 			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4950 			    zp->z_gid, accmode, ap->a_cred, NULL);
4951 		}
4952 	}
4953 
4954 	/*
4955 	 * For VEXEC, ensure that at least one execute bit is set for
4956 	 * non-directories.
4957 	 */
4958 	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4959 	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4960 		error = EACCES;
4961 	}
4962 
4963 	return (error);
4964 }
4965 
4966 static int
zfs_freebsd_lookup(ap)4967 zfs_freebsd_lookup(ap)
4968 	struct vop_lookup_args /* {
4969 		struct vnode *a_dvp;
4970 		struct vnode **a_vpp;
4971 		struct componentname *a_cnp;
4972 	} */ *ap;
4973 {
4974 	struct componentname *cnp = ap->a_cnp;
4975 	char nm[NAME_MAX + 1];
4976 
4977 	ASSERT(cnp->cn_namelen < sizeof(nm));
4978 	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4979 
4980 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4981 	    cnp->cn_cred, cnp->cn_thread, 0));
4982 }
4983 
4984 static int
zfs_cache_lookup(ap)4985 zfs_cache_lookup(ap)
4986 	struct vop_lookup_args /* {
4987 		struct vnode *a_dvp;
4988 		struct vnode **a_vpp;
4989 		struct componentname *a_cnp;
4990 	} */ *ap;
4991 {
4992 	zfsvfs_t *zfsvfs;
4993 
4994 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4995 	if (zfsvfs->z_use_namecache)
4996 		return (vfs_cache_lookup(ap));
4997 	else
4998 		return (zfs_freebsd_lookup(ap));
4999 }
5000 
5001 static int
zfs_freebsd_create(ap)5002 zfs_freebsd_create(ap)
5003 	struct vop_create_args /* {
5004 		struct vnode *a_dvp;
5005 		struct vnode **a_vpp;
5006 		struct componentname *a_cnp;
5007 		struct vattr *a_vap;
5008 	} */ *ap;
5009 {
5010 	zfsvfs_t *zfsvfs;
5011 	struct componentname *cnp = ap->a_cnp;
5012 	vattr_t *vap = ap->a_vap;
5013 	int error, mode;
5014 
5015 	ASSERT(cnp->cn_flags & SAVENAME);
5016 
5017 	vattr_init_mask(vap);
5018 	mode = vap->va_mode & ALLPERMS;
5019 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5020 
5021 	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5022 	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5023 	if (zfsvfs->z_use_namecache &&
5024 	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5025 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5026 	return (error);
5027 }
5028 
5029 static int
zfs_freebsd_remove(ap)5030 zfs_freebsd_remove(ap)
5031 	struct vop_remove_args /* {
5032 		struct vnode *a_dvp;
5033 		struct vnode *a_vp;
5034 		struct componentname *a_cnp;
5035 	} */ *ap;
5036 {
5037 
5038 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5039 
5040 	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5041 	    ap->a_cnp->cn_cred));
5042 }
5043 
5044 static int
zfs_freebsd_mkdir(ap)5045 zfs_freebsd_mkdir(ap)
5046 	struct vop_mkdir_args /* {
5047 		struct vnode *a_dvp;
5048 		struct vnode **a_vpp;
5049 		struct componentname *a_cnp;
5050 		struct vattr *a_vap;
5051 	} */ *ap;
5052 {
5053 	vattr_t *vap = ap->a_vap;
5054 
5055 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5056 
5057 	vattr_init_mask(vap);
5058 
5059 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5060 	    ap->a_cnp->cn_cred));
5061 }
5062 
5063 static int
zfs_freebsd_rmdir(ap)5064 zfs_freebsd_rmdir(ap)
5065 	struct vop_rmdir_args /* {
5066 		struct vnode *a_dvp;
5067 		struct vnode *a_vp;
5068 		struct componentname *a_cnp;
5069 	} */ *ap;
5070 {
5071 	struct componentname *cnp = ap->a_cnp;
5072 
5073 	ASSERT(cnp->cn_flags & SAVENAME);
5074 
5075 	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5076 }
5077 
5078 static int
zfs_freebsd_readdir(ap)5079 zfs_freebsd_readdir(ap)
5080 	struct vop_readdir_args /* {
5081 		struct vnode *a_vp;
5082 		struct uio *a_uio;
5083 		struct ucred *a_cred;
5084 		int *a_eofflag;
5085 		int *a_ncookies;
5086 		u_long **a_cookies;
5087 	} */ *ap;
5088 {
5089 
5090 	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5091 	    ap->a_ncookies, ap->a_cookies));
5092 }
5093 
5094 static int
zfs_freebsd_fsync(ap)5095 zfs_freebsd_fsync(ap)
5096 	struct vop_fsync_args /* {
5097 		struct vnode *a_vp;
5098 		int a_waitfor;
5099 		struct thread *a_td;
5100 	} */ *ap;
5101 {
5102 
5103 	vop_stdfsync(ap);
5104 	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5105 }
5106 
5107 static int
zfs_freebsd_getattr(ap)5108 zfs_freebsd_getattr(ap)
5109 	struct vop_getattr_args /* {
5110 		struct vnode *a_vp;
5111 		struct vattr *a_vap;
5112 		struct ucred *a_cred;
5113 	} */ *ap;
5114 {
5115 	vattr_t *vap = ap->a_vap;
5116 	xvattr_t xvap;
5117 	u_long fflags = 0;
5118 	int error;
5119 
5120 	xva_init(&xvap);
5121 	xvap.xva_vattr = *vap;
5122 	xvap.xva_vattr.va_mask |= AT_XVATTR;
5123 
5124 	/* Convert chflags into ZFS-type flags. */
5125 	/* XXX: what about SF_SETTABLE?. */
5126 	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5127 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5128 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5129 	XVA_SET_REQ(&xvap, XAT_NODUMP);
5130 	XVA_SET_REQ(&xvap, XAT_READONLY);
5131 	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5132 	XVA_SET_REQ(&xvap, XAT_SYSTEM);
5133 	XVA_SET_REQ(&xvap, XAT_HIDDEN);
5134 	XVA_SET_REQ(&xvap, XAT_REPARSE);
5135 	XVA_SET_REQ(&xvap, XAT_OFFLINE);
5136 	XVA_SET_REQ(&xvap, XAT_SPARSE);
5137 
5138 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5139 	if (error != 0)
5140 		return (error);
5141 
5142 	/* Convert ZFS xattr into chflags. */
5143 #define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5144 	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5145 		fflags |= (fflag);					\
5146 } while (0)
5147 	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5148 	    xvap.xva_xoptattrs.xoa_immutable);
5149 	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5150 	    xvap.xva_xoptattrs.xoa_appendonly);
5151 	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5152 	    xvap.xva_xoptattrs.xoa_nounlink);
5153 	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5154 	    xvap.xva_xoptattrs.xoa_archive);
5155 	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5156 	    xvap.xva_xoptattrs.xoa_nodump);
5157 	FLAG_CHECK(UF_READONLY, XAT_READONLY,
5158 	    xvap.xva_xoptattrs.xoa_readonly);
5159 	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5160 	    xvap.xva_xoptattrs.xoa_system);
5161 	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5162 	    xvap.xva_xoptattrs.xoa_hidden);
5163 	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5164 	    xvap.xva_xoptattrs.xoa_reparse);
5165 	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5166 	    xvap.xva_xoptattrs.xoa_offline);
5167 	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5168 	    xvap.xva_xoptattrs.xoa_sparse);
5169 
5170 #undef	FLAG_CHECK
5171 	*vap = xvap.xva_vattr;
5172 	vap->va_flags = fflags;
5173 	return (0);
5174 }
5175 
5176 static int
zfs_freebsd_setattr(ap)5177 zfs_freebsd_setattr(ap)
5178 	struct vop_setattr_args /* {
5179 		struct vnode *a_vp;
5180 		struct vattr *a_vap;
5181 		struct ucred *a_cred;
5182 	} */ *ap;
5183 {
5184 	vnode_t *vp = ap->a_vp;
5185 	vattr_t *vap = ap->a_vap;
5186 	cred_t *cred = ap->a_cred;
5187 	xvattr_t xvap;
5188 	u_long fflags;
5189 	uint64_t zflags;
5190 
5191 	vattr_init_mask(vap);
5192 	vap->va_mask &= ~AT_NOSET;
5193 
5194 	xva_init(&xvap);
5195 	xvap.xva_vattr = *vap;
5196 
5197 	zflags = VTOZ(vp)->z_pflags;
5198 
5199 	if (vap->va_flags != VNOVAL) {
5200 		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5201 		int error;
5202 
5203 		if (zfsvfs->z_use_fuids == B_FALSE)
5204 			return (EOPNOTSUPP);
5205 
5206 		fflags = vap->va_flags;
5207 		/*
5208 		 * XXX KDM
5209 		 * We need to figure out whether it makes sense to allow
5210 		 * UF_REPARSE through, since we don't really have other
5211 		 * facilities to handle reparse points and zfs_setattr()
5212 		 * doesn't currently allow setting that attribute anyway.
5213 		 */
5214 		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5215 		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5216 		     UF_OFFLINE|UF_SPARSE)) != 0)
5217 			return (EOPNOTSUPP);
5218 		/*
5219 		 * Unprivileged processes are not permitted to unset system
5220 		 * flags, or modify flags if any system flags are set.
5221 		 * Privileged non-jail processes may not modify system flags
5222 		 * if securelevel > 0 and any existing system flags are set.
5223 		 * Privileged jail processes behave like privileged non-jail
5224 		 * processes if the security.jail.chflags_allowed sysctl is
5225 		 * is non-zero; otherwise, they behave like unprivileged
5226 		 * processes.
5227 		 */
5228 		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5229 		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5230 			if (zflags &
5231 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5232 				error = securelevel_gt(cred, 0);
5233 				if (error != 0)
5234 					return (error);
5235 			}
5236 		} else {
5237 			/*
5238 			 * Callers may only modify the file flags on objects they
5239 			 * have VADMIN rights for.
5240 			 */
5241 			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5242 				return (error);
5243 			if (zflags &
5244 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5245 				return (EPERM);
5246 			}
5247 			if (fflags &
5248 			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5249 				return (EPERM);
5250 			}
5251 		}
5252 
5253 #define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5254 	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5255 	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5256 		XVA_SET_REQ(&xvap, (xflag));				\
5257 		(xfield) = ((fflags & (fflag)) != 0);			\
5258 	}								\
5259 } while (0)
5260 		/* Convert chflags into ZFS-type flags. */
5261 		/* XXX: what about SF_SETTABLE?. */
5262 		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5263 		    xvap.xva_xoptattrs.xoa_immutable);
5264 		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5265 		    xvap.xva_xoptattrs.xoa_appendonly);
5266 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5267 		    xvap.xva_xoptattrs.xoa_nounlink);
5268 		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5269 		    xvap.xva_xoptattrs.xoa_archive);
5270 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5271 		    xvap.xva_xoptattrs.xoa_nodump);
5272 		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5273 		    xvap.xva_xoptattrs.xoa_readonly);
5274 		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5275 		    xvap.xva_xoptattrs.xoa_system);
5276 		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5277 		    xvap.xva_xoptattrs.xoa_hidden);
5278 		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5279 		    xvap.xva_xoptattrs.xoa_hidden);
5280 		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5281 		    xvap.xva_xoptattrs.xoa_offline);
5282 		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5283 		    xvap.xva_xoptattrs.xoa_sparse);
5284 #undef	FLAG_CHANGE
5285 	}
5286 	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5287 }
5288 
5289 static int
zfs_freebsd_rename(ap)5290 zfs_freebsd_rename(ap)
5291 	struct vop_rename_args  /* {
5292 		struct vnode *a_fdvp;
5293 		struct vnode *a_fvp;
5294 		struct componentname *a_fcnp;
5295 		struct vnode *a_tdvp;
5296 		struct vnode *a_tvp;
5297 		struct componentname *a_tcnp;
5298 	} */ *ap;
5299 {
5300 	vnode_t *fdvp = ap->a_fdvp;
5301 	vnode_t *fvp = ap->a_fvp;
5302 	vnode_t *tdvp = ap->a_tdvp;
5303 	vnode_t *tvp = ap->a_tvp;
5304 	int error;
5305 
5306 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5307 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5308 
5309 	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5310 	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5311 
5312 	vrele(fdvp);
5313 	vrele(fvp);
5314 	vrele(tdvp);
5315 	if (tvp != NULL)
5316 		vrele(tvp);
5317 
5318 	return (error);
5319 }
5320 
5321 static int
zfs_freebsd_symlink(ap)5322 zfs_freebsd_symlink(ap)
5323 	struct vop_symlink_args /* {
5324 		struct vnode *a_dvp;
5325 		struct vnode **a_vpp;
5326 		struct componentname *a_cnp;
5327 		struct vattr *a_vap;
5328 		char *a_target;
5329 	} */ *ap;
5330 {
5331 	struct componentname *cnp = ap->a_cnp;
5332 	vattr_t *vap = ap->a_vap;
5333 
5334 	ASSERT(cnp->cn_flags & SAVENAME);
5335 
5336 	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5337 	vattr_init_mask(vap);
5338 
5339 	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5340 	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5341 }
5342 
5343 static int
zfs_freebsd_readlink(ap)5344 zfs_freebsd_readlink(ap)
5345 	struct vop_readlink_args /* {
5346 		struct vnode *a_vp;
5347 		struct uio *a_uio;
5348 		struct ucred *a_cred;
5349 	} */ *ap;
5350 {
5351 
5352 	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5353 }
5354 
5355 static int
zfs_freebsd_link(ap)5356 zfs_freebsd_link(ap)
5357 	struct vop_link_args /* {
5358 		struct vnode *a_tdvp;
5359 		struct vnode *a_vp;
5360 		struct componentname *a_cnp;
5361 	} */ *ap;
5362 {
5363 	struct componentname *cnp = ap->a_cnp;
5364 	vnode_t *vp = ap->a_vp;
5365 	vnode_t *tdvp = ap->a_tdvp;
5366 
5367 	if (tdvp->v_mount != vp->v_mount)
5368 		return (EXDEV);
5369 
5370 	ASSERT(cnp->cn_flags & SAVENAME);
5371 
5372 	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5373 }
5374 
5375 static int
zfs_freebsd_inactive(ap)5376 zfs_freebsd_inactive(ap)
5377 	struct vop_inactive_args /* {
5378 		struct vnode *a_vp;
5379 		struct thread *a_td;
5380 	} */ *ap;
5381 {
5382 	vnode_t *vp = ap->a_vp;
5383 
5384 	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5385 	return (0);
5386 }
5387 
5388 static int
zfs_freebsd_reclaim(ap)5389 zfs_freebsd_reclaim(ap)
5390 	struct vop_reclaim_args /* {
5391 		struct vnode *a_vp;
5392 		struct thread *a_td;
5393 	} */ *ap;
5394 {
5395 	vnode_t	*vp = ap->a_vp;
5396 	znode_t	*zp = VTOZ(vp);
5397 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5398 
5399 	ASSERT(zp != NULL);
5400 
5401 	/* Destroy the vm object and flush associated pages. */
5402 	vnode_destroy_vobject(vp);
5403 
5404 	/*
5405 	 * z_teardown_inactive_lock protects from a race with
5406 	 * zfs_znode_dmu_fini in zfsvfs_teardown during
5407 	 * force unmount.
5408 	 */
5409 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5410 	if (zp->z_sa_hdl == NULL)
5411 		zfs_znode_free(zp);
5412 	else
5413 		zfs_zinactive(zp);
5414 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5415 
5416 	vp->v_data = NULL;
5417 	return (0);
5418 }
5419 
5420 static int
zfs_freebsd_fid(ap)5421 zfs_freebsd_fid(ap)
5422 	struct vop_fid_args /* {
5423 		struct vnode *a_vp;
5424 		struct fid *a_fid;
5425 	} */ *ap;
5426 {
5427 
5428 	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5429 }
5430 
5431 static int
zfs_freebsd_pathconf(ap)5432 zfs_freebsd_pathconf(ap)
5433 	struct vop_pathconf_args /* {
5434 		struct vnode *a_vp;
5435 		int a_name;
5436 		register_t *a_retval;
5437 	} */ *ap;
5438 {
5439 	ulong_t val;
5440 	int error;
5441 
5442 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5443 	if (error == 0)
5444 		*ap->a_retval = val;
5445 	else if (error == EOPNOTSUPP)
5446 		error = vop_stdpathconf(ap);
5447 	return (error);
5448 }
5449 
5450 static int
zfs_freebsd_fifo_pathconf(ap)5451 zfs_freebsd_fifo_pathconf(ap)
5452 	struct vop_pathconf_args /* {
5453 		struct vnode *a_vp;
5454 		int a_name;
5455 		register_t *a_retval;
5456 	} */ *ap;
5457 {
5458 
5459 	switch (ap->a_name) {
5460 	case _PC_ACL_EXTENDED:
5461 	case _PC_ACL_NFS4:
5462 	case _PC_ACL_PATH_MAX:
5463 	case _PC_MAC_PRESENT:
5464 		return (zfs_freebsd_pathconf(ap));
5465 	default:
5466 		return (fifo_specops.vop_pathconf(ap));
5467 	}
5468 }
5469 
5470 /*
5471  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5472  * extended attribute name:
5473  *
5474  *	NAMESPACE	PREFIX
5475  *	system		freebsd:system:
5476  *	user		(none, can be used to access ZFS fsattr(5) attributes
5477  *			created on Solaris)
5478  */
5479 static int
zfs_create_attrname(int attrnamespace,const char * name,char * attrname,size_t size)5480 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5481     size_t size)
5482 {
5483 	const char *namespace, *prefix, *suffix;
5484 
5485 	/* We don't allow '/' character in attribute name. */
5486 	if (strchr(name, '/') != NULL)
5487 		return (EINVAL);
5488 	/* We don't allow attribute names that start with "freebsd:" string. */
5489 	if (strncmp(name, "freebsd:", 8) == 0)
5490 		return (EINVAL);
5491 
5492 	bzero(attrname, size);
5493 
5494 	switch (attrnamespace) {
5495 	case EXTATTR_NAMESPACE_USER:
5496 #if 0
5497 		prefix = "freebsd:";
5498 		namespace = EXTATTR_NAMESPACE_USER_STRING;
5499 		suffix = ":";
5500 #else
5501 		/*
5502 		 * This is the default namespace by which we can access all
5503 		 * attributes created on Solaris.
5504 		 */
5505 		prefix = namespace = suffix = "";
5506 #endif
5507 		break;
5508 	case EXTATTR_NAMESPACE_SYSTEM:
5509 		prefix = "freebsd:";
5510 		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5511 		suffix = ":";
5512 		break;
5513 	case EXTATTR_NAMESPACE_EMPTY:
5514 	default:
5515 		return (EINVAL);
5516 	}
5517 	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5518 	    name) >= size) {
5519 		return (ENAMETOOLONG);
5520 	}
5521 	return (0);
5522 }
5523 
5524 /*
5525  * Vnode operating to retrieve a named extended attribute.
5526  */
5527 static int
zfs_getextattr(struct vop_getextattr_args * ap)5528 zfs_getextattr(struct vop_getextattr_args *ap)
5529 /*
5530 vop_getextattr {
5531 	IN struct vnode *a_vp;
5532 	IN int a_attrnamespace;
5533 	IN const char *a_name;
5534 	INOUT struct uio *a_uio;
5535 	OUT size_t *a_size;
5536 	IN struct ucred *a_cred;
5537 	IN struct thread *a_td;
5538 };
5539 */
5540 {
5541 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5542 	struct thread *td = ap->a_td;
5543 	struct nameidata nd;
5544 	char attrname[255];
5545 	struct vattr va;
5546 	vnode_t *xvp = NULL, *vp;
5547 	int error, flags;
5548 
5549 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5550 	    ap->a_cred, ap->a_td, VREAD);
5551 	if (error != 0)
5552 		return (error);
5553 
5554 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5555 	    sizeof(attrname));
5556 	if (error != 0)
5557 		return (error);
5558 
5559 	ZFS_ENTER(zfsvfs);
5560 
5561 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5562 	    LOOKUP_XATTR);
5563 	if (error != 0) {
5564 		ZFS_EXIT(zfsvfs);
5565 		return (error);
5566 	}
5567 
5568 	flags = FREAD;
5569 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5570 	    xvp, td);
5571 	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5572 	vp = nd.ni_vp;
5573 	NDFREE(&nd, NDF_ONLY_PNBUF);
5574 	if (error != 0) {
5575 		ZFS_EXIT(zfsvfs);
5576 		if (error == ENOENT)
5577 			error = ENOATTR;
5578 		return (error);
5579 	}
5580 
5581 	if (ap->a_size != NULL) {
5582 		error = VOP_GETATTR(vp, &va, ap->a_cred);
5583 		if (error == 0)
5584 			*ap->a_size = (size_t)va.va_size;
5585 	} else if (ap->a_uio != NULL)
5586 		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5587 
5588 	VOP_UNLOCK(vp, 0);
5589 	vn_close(vp, flags, ap->a_cred, td);
5590 	ZFS_EXIT(zfsvfs);
5591 
5592 	return (error);
5593 }
5594 
5595 /*
5596  * Vnode operation to remove a named attribute.
5597  */
5598 int
zfs_deleteextattr(struct vop_deleteextattr_args * ap)5599 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5600 /*
5601 vop_deleteextattr {
5602 	IN struct vnode *a_vp;
5603 	IN int a_attrnamespace;
5604 	IN const char *a_name;
5605 	IN struct ucred *a_cred;
5606 	IN struct thread *a_td;
5607 };
5608 */
5609 {
5610 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5611 	struct thread *td = ap->a_td;
5612 	struct nameidata nd;
5613 	char attrname[255];
5614 	struct vattr va;
5615 	vnode_t *xvp = NULL, *vp;
5616 	int error, flags;
5617 
5618 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5619 	    ap->a_cred, ap->a_td, VWRITE);
5620 	if (error != 0)
5621 		return (error);
5622 
5623 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5624 	    sizeof(attrname));
5625 	if (error != 0)
5626 		return (error);
5627 
5628 	ZFS_ENTER(zfsvfs);
5629 
5630 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5631 	    LOOKUP_XATTR);
5632 	if (error != 0) {
5633 		ZFS_EXIT(zfsvfs);
5634 		return (error);
5635 	}
5636 
5637 	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5638 	    UIO_SYSSPACE, attrname, xvp, td);
5639 	error = namei(&nd);
5640 	vp = nd.ni_vp;
5641 	if (error != 0) {
5642 		ZFS_EXIT(zfsvfs);
5643 		NDFREE(&nd, NDF_ONLY_PNBUF);
5644 		if (error == ENOENT)
5645 			error = ENOATTR;
5646 		return (error);
5647 	}
5648 
5649 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5650 	NDFREE(&nd, NDF_ONLY_PNBUF);
5651 
5652 	vput(nd.ni_dvp);
5653 	if (vp == nd.ni_dvp)
5654 		vrele(vp);
5655 	else
5656 		vput(vp);
5657 	ZFS_EXIT(zfsvfs);
5658 
5659 	return (error);
5660 }
5661 
5662 /*
5663  * Vnode operation to set a named attribute.
5664  */
5665 static int
zfs_setextattr(struct vop_setextattr_args * ap)5666 zfs_setextattr(struct vop_setextattr_args *ap)
5667 /*
5668 vop_setextattr {
5669 	IN struct vnode *a_vp;
5670 	IN int a_attrnamespace;
5671 	IN const char *a_name;
5672 	INOUT struct uio *a_uio;
5673 	IN struct ucred *a_cred;
5674 	IN struct thread *a_td;
5675 };
5676 */
5677 {
5678 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5679 	struct thread *td = ap->a_td;
5680 	struct nameidata nd;
5681 	char attrname[255];
5682 	struct vattr va;
5683 	vnode_t *xvp = NULL, *vp;
5684 	int error, flags;
5685 
5686 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5687 	    ap->a_cred, ap->a_td, VWRITE);
5688 	if (error != 0)
5689 		return (error);
5690 
5691 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5692 	    sizeof(attrname));
5693 	if (error != 0)
5694 		return (error);
5695 
5696 	ZFS_ENTER(zfsvfs);
5697 
5698 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5699 	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5700 	if (error != 0) {
5701 		ZFS_EXIT(zfsvfs);
5702 		return (error);
5703 	}
5704 
5705 	flags = FFLAGS(O_WRONLY | O_CREAT);
5706 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5707 	    xvp, td);
5708 	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5709 	vp = nd.ni_vp;
5710 	NDFREE(&nd, NDF_ONLY_PNBUF);
5711 	if (error != 0) {
5712 		ZFS_EXIT(zfsvfs);
5713 		return (error);
5714 	}
5715 
5716 	VATTR_NULL(&va);
5717 	va.va_size = 0;
5718 	error = VOP_SETATTR(vp, &va, ap->a_cred);
5719 	if (error == 0)
5720 		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5721 
5722 	VOP_UNLOCK(vp, 0);
5723 	vn_close(vp, flags, ap->a_cred, td);
5724 	ZFS_EXIT(zfsvfs);
5725 
5726 	return (error);
5727 }
5728 
5729 /*
5730  * Vnode operation to retrieve extended attributes on a vnode.
5731  */
5732 static int
zfs_listextattr(struct vop_listextattr_args * ap)5733 zfs_listextattr(struct vop_listextattr_args *ap)
5734 /*
5735 vop_listextattr {
5736 	IN struct vnode *a_vp;
5737 	IN int a_attrnamespace;
5738 	INOUT struct uio *a_uio;
5739 	OUT size_t *a_size;
5740 	IN struct ucred *a_cred;
5741 	IN struct thread *a_td;
5742 };
5743 */
5744 {
5745 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5746 	struct thread *td = ap->a_td;
5747 	struct nameidata nd;
5748 	char attrprefix[16];
5749 	u_char dirbuf[sizeof(struct dirent)];
5750 	struct dirent *dp;
5751 	struct iovec aiov;
5752 	struct uio auio, *uio = ap->a_uio;
5753 	size_t *sizep = ap->a_size;
5754 	size_t plen;
5755 	vnode_t *xvp = NULL, *vp;
5756 	int done, error, eof, pos;
5757 
5758 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5759 	    ap->a_cred, ap->a_td, VREAD);
5760 	if (error != 0)
5761 		return (error);
5762 
5763 	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5764 	    sizeof(attrprefix));
5765 	if (error != 0)
5766 		return (error);
5767 	plen = strlen(attrprefix);
5768 
5769 	ZFS_ENTER(zfsvfs);
5770 
5771 	if (sizep != NULL)
5772 		*sizep = 0;
5773 
5774 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5775 	    LOOKUP_XATTR);
5776 	if (error != 0) {
5777 		ZFS_EXIT(zfsvfs);
5778 		/*
5779 		 * ENOATTR means that the EA directory does not yet exist,
5780 		 * i.e. there are no extended attributes there.
5781 		 */
5782 		if (error == ENOATTR)
5783 			error = 0;
5784 		return (error);
5785 	}
5786 
5787 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5788 	    UIO_SYSSPACE, ".", xvp, td);
5789 	error = namei(&nd);
5790 	vp = nd.ni_vp;
5791 	NDFREE(&nd, NDF_ONLY_PNBUF);
5792 	if (error != 0) {
5793 		ZFS_EXIT(zfsvfs);
5794 		return (error);
5795 	}
5796 
5797 	auio.uio_iov = &aiov;
5798 	auio.uio_iovcnt = 1;
5799 	auio.uio_segflg = UIO_SYSSPACE;
5800 	auio.uio_td = td;
5801 	auio.uio_rw = UIO_READ;
5802 	auio.uio_offset = 0;
5803 
5804 	do {
5805 		u_char nlen;
5806 
5807 		aiov.iov_base = (void *)dirbuf;
5808 		aiov.iov_len = sizeof(dirbuf);
5809 		auio.uio_resid = sizeof(dirbuf);
5810 		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5811 		done = sizeof(dirbuf) - auio.uio_resid;
5812 		if (error != 0)
5813 			break;
5814 		for (pos = 0; pos < done;) {
5815 			dp = (struct dirent *)(dirbuf + pos);
5816 			pos += dp->d_reclen;
5817 			/*
5818 			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5819 			 * is what we get when attribute was created on Solaris.
5820 			 */
5821 			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5822 				continue;
5823 			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5824 				continue;
5825 			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5826 				continue;
5827 			nlen = dp->d_namlen - plen;
5828 			if (sizep != NULL)
5829 				*sizep += 1 + nlen;
5830 			else if (uio != NULL) {
5831 				/*
5832 				 * Format of extattr name entry is one byte for
5833 				 * length and the rest for name.
5834 				 */
5835 				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5836 				if (error == 0) {
5837 					error = uiomove(dp->d_name + plen, nlen,
5838 					    uio->uio_rw, uio);
5839 				}
5840 				if (error != 0)
5841 					break;
5842 			}
5843 		}
5844 	} while (!eof && error == 0);
5845 
5846 	vput(vp);
5847 	ZFS_EXIT(zfsvfs);
5848 
5849 	return (error);
5850 }
5851 
5852 int
zfs_freebsd_getacl(ap)5853 zfs_freebsd_getacl(ap)
5854 	struct vop_getacl_args /* {
5855 		struct vnode *vp;
5856 		acl_type_t type;
5857 		struct acl *aclp;
5858 		struct ucred *cred;
5859 		struct thread *td;
5860 	} */ *ap;
5861 {
5862 	int		error;
5863 	vsecattr_t      vsecattr;
5864 
5865 	if (ap->a_type != ACL_TYPE_NFS4)
5866 		return (EINVAL);
5867 
5868 	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5869 	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5870 		return (error);
5871 
5872 	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5873 	if (vsecattr.vsa_aclentp != NULL)
5874 		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5875 
5876 	return (error);
5877 }
5878 
5879 int
zfs_freebsd_setacl(ap)5880 zfs_freebsd_setacl(ap)
5881 	struct vop_setacl_args /* {
5882 		struct vnode *vp;
5883 		acl_type_t type;
5884 		struct acl *aclp;
5885 		struct ucred *cred;
5886 		struct thread *td;
5887 	} */ *ap;
5888 {
5889 	int		error;
5890 	vsecattr_t      vsecattr;
5891 	int		aclbsize;	/* size of acl list in bytes */
5892 	aclent_t	*aaclp;
5893 
5894 	if (ap->a_type != ACL_TYPE_NFS4)
5895 		return (EINVAL);
5896 
5897 	if (ap->a_aclp == NULL)
5898 		return (EINVAL);
5899 
5900 	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5901 		return (EINVAL);
5902 
5903 	/*
5904 	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5905 	 * splitting every entry into two and appending "canonical six"
5906 	 * entries at the end.  Don't allow for setting an ACL that would
5907 	 * cause chmod(2) to run out of ACL entries.
5908 	 */
5909 	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5910 		return (ENOSPC);
5911 
5912 	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5913 	if (error != 0)
5914 		return (error);
5915 
5916 	vsecattr.vsa_mask = VSA_ACE;
5917 	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5918 	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5919 	aaclp = vsecattr.vsa_aclentp;
5920 	vsecattr.vsa_aclentsz = aclbsize;
5921 
5922 	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5923 	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5924 	kmem_free(aaclp, aclbsize);
5925 
5926 	return (error);
5927 }
5928 
5929 int
zfs_freebsd_aclcheck(ap)5930 zfs_freebsd_aclcheck(ap)
5931 	struct vop_aclcheck_args /* {
5932 		struct vnode *vp;
5933 		acl_type_t type;
5934 		struct acl *aclp;
5935 		struct ucred *cred;
5936 		struct thread *td;
5937 	} */ *ap;
5938 {
5939 
5940 	return (EOPNOTSUPP);
5941 }
5942 
5943 static int
zfs_vptocnp(struct vop_vptocnp_args * ap)5944 zfs_vptocnp(struct vop_vptocnp_args *ap)
5945 {
5946 	vnode_t *covered_vp;
5947 	vnode_t *vp = ap->a_vp;;
5948 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5949 	znode_t *zp = VTOZ(vp);
5950 	int ltype;
5951 	int error;
5952 
5953 	ZFS_ENTER(zfsvfs);
5954 	ZFS_VERIFY_ZP(zp);
5955 
5956 	/*
5957 	 * If we are a snapshot mounted under .zfs, run the operation
5958 	 * on the covered vnode.
5959 	 */
5960 	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5961 		char name[MAXNAMLEN + 1];
5962 		znode_t *dzp;
5963 		size_t len;
5964 
5965 		error = zfs_znode_parent_and_name(zp, &dzp, name);
5966 		if (error == 0) {
5967 			len = strlen(name);
5968 			if (*ap->a_buflen < len)
5969 				error = SET_ERROR(ENOMEM);
5970 		}
5971 		if (error == 0) {
5972 			*ap->a_buflen -= len;
5973 			bcopy(name, ap->a_buf + *ap->a_buflen, len);
5974 			*ap->a_vpp = ZTOV(dzp);
5975 		}
5976 		ZFS_EXIT(zfsvfs);
5977 		return (error);
5978 	}
5979 	ZFS_EXIT(zfsvfs);
5980 
5981 	covered_vp = vp->v_mount->mnt_vnodecovered;
5982 	vhold(covered_vp);
5983 	ltype = VOP_ISLOCKED(vp);
5984 	VOP_UNLOCK(vp, 0);
5985 	error = vget(covered_vp, LK_SHARED, curthread);
5986 	vdrop(covered_vp);
5987 	if (error == 0) {
5988 		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5989 		    ap->a_buf, ap->a_buflen);
5990 		vput(covered_vp);
5991 	}
5992 	vn_lock(vp, ltype | LK_RETRY);
5993 	if ((vp->v_iflag & VI_DOOMED) != 0)
5994 		error = SET_ERROR(ENOENT);
5995 	return (error);
5996 }
5997 
5998 #ifdef DIAGNOSTIC
5999 static int
zfs_lock(ap)6000 zfs_lock(ap)
6001 	struct vop_lock1_args /* {
6002 		struct vnode *a_vp;
6003 		int a_flags;
6004 		char *file;
6005 		int line;
6006 	} */ *ap;
6007 {
6008 	vnode_t *vp;
6009 	znode_t *zp;
6010 	int err;
6011 
6012 	err = vop_stdlock(ap);
6013 	if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
6014 		vp = ap->a_vp;
6015 		zp = vp->v_data;
6016 		if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
6017 		    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
6018 			VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
6019 	}
6020 	return (err);
6021 }
6022 #endif
6023 
/*
 * Declarations of the three vnode operation vectors that are defined,
 * with their initializers, further down in this file.
 */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
6027 
6028 struct vop_vector zfs_vnodeops = {
6029 	.vop_default =		&default_vnodeops,
6030 	.vop_inactive =		zfs_freebsd_inactive,
6031 	.vop_reclaim =		zfs_freebsd_reclaim,
6032 	.vop_access =		zfs_freebsd_access,
6033 	.vop_lookup =		zfs_cache_lookup,
6034 	.vop_cachedlookup =	zfs_freebsd_lookup,
6035 	.vop_getattr =		zfs_freebsd_getattr,
6036 	.vop_setattr =		zfs_freebsd_setattr,
6037 	.vop_create =		zfs_freebsd_create,
6038 	.vop_mknod =		zfs_freebsd_create,
6039 	.vop_mkdir =		zfs_freebsd_mkdir,
6040 	.vop_readdir =		zfs_freebsd_readdir,
6041 	.vop_fsync =		zfs_freebsd_fsync,
6042 	.vop_open =		zfs_freebsd_open,
6043 	.vop_close =		zfs_freebsd_close,
6044 	.vop_rmdir =		zfs_freebsd_rmdir,
6045 	.vop_ioctl =		zfs_freebsd_ioctl,
6046 	.vop_link =		zfs_freebsd_link,
6047 	.vop_symlink =		zfs_freebsd_symlink,
6048 	.vop_readlink =		zfs_freebsd_readlink,
6049 	.vop_read =		zfs_freebsd_read,
6050 	.vop_write =		zfs_freebsd_write,
6051 	.vop_remove =		zfs_freebsd_remove,
6052 	.vop_rename =		zfs_freebsd_rename,
6053 	.vop_pathconf =		zfs_freebsd_pathconf,
6054 	.vop_bmap =		zfs_freebsd_bmap,
6055 	.vop_fid =		zfs_freebsd_fid,
6056 	.vop_getextattr =	zfs_getextattr,
6057 	.vop_deleteextattr =	zfs_deleteextattr,
6058 	.vop_setextattr =	zfs_setextattr,
6059 	.vop_listextattr =	zfs_listextattr,
6060 	.vop_getacl =		zfs_freebsd_getacl,
6061 	.vop_setacl =		zfs_freebsd_setacl,
6062 	.vop_aclcheck =		zfs_freebsd_aclcheck,
6063 	.vop_getpages =		zfs_freebsd_getpages,
6064 	.vop_putpages =		zfs_freebsd_putpages,
6065 	.vop_vptocnp =		zfs_vptocnp,
6066 #ifdef DIAGNOSTIC
6067 	.vop_lock1 =		zfs_lock,
6068 #endif
6069 };
6070 
6071 struct vop_vector zfs_fifoops = {
6072 	.vop_default =		&fifo_specops,
6073 	.vop_fsync =		zfs_freebsd_fsync,
6074 	.vop_access =		zfs_freebsd_access,
6075 	.vop_getattr =		zfs_freebsd_getattr,
6076 	.vop_inactive =		zfs_freebsd_inactive,
6077 	.vop_read =		VOP_PANIC,
6078 	.vop_reclaim =		zfs_freebsd_reclaim,
6079 	.vop_setattr =		zfs_freebsd_setattr,
6080 	.vop_write =		VOP_PANIC,
6081 	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6082 	.vop_fid =		zfs_freebsd_fid,
6083 	.vop_getacl =		zfs_freebsd_getacl,
6084 	.vop_setacl =		zfs_freebsd_setacl,
6085 	.vop_aclcheck =		zfs_freebsd_aclcheck,
6086 };
6087 
6088 /*
6089  * special share hidden files vnode operations template
6090  */
6091 struct vop_vector zfs_shareops = {
6092 	.vop_default =		&default_vnodeops,
6093 	.vop_access =		zfs_freebsd_access,
6094 	.vop_inactive =		zfs_freebsd_inactive,
6095 	.vop_reclaim =		zfs_freebsd_reclaim,
6096 	.vop_fid =		zfs_freebsd_fid,
6097 	.vop_pathconf =		zfs_freebsd_pathconf,
6098 };
6099