1 /*        $NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $         */
2 
3 /*
4  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
5  *
6  * Further information about snapshots can be obtained from:
7  *
8  *        Marshall Kirk McKusick                  http://www.mckusick.com/softdep/
9  *        1614 Oxford Street            mckusick@mckusick.com
10  *        Berkeley, CA 94709-1608                 +1-510-843-9542
11  *        USA
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  *
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  *
23  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
24  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
27  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *        @(#)ffs_snapshot.c  8.11 (McKusick) 7/23/00
36  *
37  *        from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
38  */
39 
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $");
42 
43 #if defined(_KERNEL_OPT)
44 #include "opt_ffs.h"
45 #include "opt_quota.h"
46 #endif
47 
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/systm.h>
51 #include <sys/conf.h>
52 #include <sys/buf.h>
53 #include <sys/proc.h>
54 #include <sys/namei.h>
55 #include <sys/sched.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/mount.h>
59 #include <sys/resource.h>
60 #include <sys/resourcevar.h>
61 #include <sys/vnode.h>
62 #include <sys/kauth.h>
63 #include <sys/fstrans.h>
64 #include <sys/wapbl.h>
65 
66 #include <miscfs/specfs/specdev.h>
67 
68 #include <ufs/ufs/quota.h>
69 #include <ufs/ufs/ufsmount.h>
70 #include <ufs/ufs/inode.h>
71 #include <ufs/ufs/ufs_extern.h>
72 #include <ufs/ufs/ufs_bswap.h>
73 #include <ufs/ufs/ufs_wapbl.h>
74 
75 #include <ufs/ffs/fs.h>
76 #include <ufs/ffs/ffs_extern.h>
77 
78 #include <uvm/uvm.h>
79 
80 TAILQ_HEAD(inodelst, inode);                      /* List of active snapshots */
81 
82 struct snap_info {
83           kmutex_t si_lock;                       /* Lock this snapinfo */
84           kmutex_t si_snaplock;                             /* Snapshot vnode common lock */
85           lwp_t *si_owner;                        /* Snaplock owner */
86           struct inodelst si_snapshots;           /* List of active snapshots */
87           daddr_t *si_snapblklist;                /* Snapshot block hints list */
88           uint32_t si_gen;                        /* Incremented on change */
89 };
90 
91 #if !defined(FFS_NO_SNAPSHOT)
92 typedef int (*acctfunc_t)
93     (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
94 
95 static int snapshot_setup(struct mount *, struct vnode *);
96 static int snapshot_copyfs(struct mount *, struct vnode *, void **);
97 static int snapshot_expunge(struct mount *, struct vnode *,
98     struct fs *, daddr_t *, daddr_t **);
99 static int snapshot_expunge_snap(struct mount *, struct vnode *,
100     struct fs *, daddr_t);
101 static int snapshot_writefs(struct mount *, struct vnode *, void *);
102 static int cgaccount(struct vnode *, int, int *);
103 static int cgaccount1(int, struct vnode *, void *, int);
104 static int expunge(struct vnode *, struct inode *, struct fs *,
105     acctfunc_t, int);
106 static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
107     daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
108 static int fullacct(struct vnode *, void *, int, int, struct fs *,
109     daddr_t, int);
110 static int snapacct(struct vnode *, void *, int, int, struct fs *,
111     daddr_t, int);
112 static int mapacct(struct vnode *, void *, int, int, struct fs *,
113     daddr_t, int);
114 #endif /* !defined(FFS_NO_SNAPSHOT) */
115 
116 static int ffs_copyonwrite(void *, struct buf *, bool);
117 static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
118 static int rwfsblk(struct vnode *, int, void *, daddr_t);
119 static int syncsnap(struct vnode *);
120 static int wrsnapblk(struct vnode *, void *, daddr_t);
121 #if !defined(FFS_NO_SNAPSHOT)
122 static int blocks_in_journal(struct fs *);
123 #endif
124 
125 static inline bool is_active_snapshot(struct snap_info *, struct inode *);
126 static inline daddr_t db_get(struct inode *, int);
127 static inline void db_assign(struct inode *, int, daddr_t);
128 static inline daddr_t ib_get(struct inode *, int);
129 static inline daddr_t idb_get(struct inode *, void *, int);
130 static inline void idb_assign(struct inode *, void *, int, daddr_t);
131 
132 #ifdef DEBUG
133 static int snapdebug = 0;
134 #endif
135 
136 int
ffs_snapshot_init(struct ufsmount * ump)137 ffs_snapshot_init(struct ufsmount *ump)
138 {
139           struct snap_info *si;
140 
141           si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
142           TAILQ_INIT(&si->si_snapshots);
143           mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
144           mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
145           si->si_owner = NULL;
146           si->si_gen = 0;
147           si->si_snapblklist = NULL;
148 
149           return 0;
150 }
151 
152 void
ffs_snapshot_fini(struct ufsmount * ump)153 ffs_snapshot_fini(struct ufsmount *ump)
154 {
155           struct snap_info *si;
156 
157           si = ump->um_snapinfo;
158           ump->um_snapinfo = NULL;
159 
160           KASSERT(TAILQ_EMPTY(&si->si_snapshots));
161           mutex_destroy(&si->si_lock);
162           mutex_destroy(&si->si_snaplock);
163           KASSERT(si->si_snapblklist == NULL);
164           kmem_free(si, sizeof(*si));
165 }
166 
167 /*
168  * Create a snapshot file and initialize it for the filesystem.
169  * Vnode is locked on entry and return.
170  */
171 int
ffs_snapshot(struct mount * mp,struct vnode * vp,struct timespec * ctime)172 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
173 {
174 #if defined(FFS_NO_SNAPSHOT)
175           return EOPNOTSUPP;
176 }
177 #else /* defined(FFS_NO_SNAPSHOT) */
178           bool suspended = false;
179           int error, redo = 0, snaploc;
180           void *sbbuf = NULL;
181           daddr_t *snaplist = NULL, snaplistsize = 0;
182           struct buf *bp, *nbp;
183           struct fs *copy_fs = NULL;
184           struct fs *fs = VFSTOUFS(mp)->um_fs;
185           struct inode *ip = VTOI(vp);
186           struct lwp *l = curlwp;
187           struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
188           struct timespec ts;
189           struct timeval starttime;
190 #ifdef DEBUG
191           struct timeval endtime;
192 #endif
193           struct vnode *devvp = ip->i_devvp;
194 
195           /*
196            * If the vnode already is a snapshot, return.
197            */
198           if ((ip->i_flags & SF_SNAPSHOT)) {
199                     if ((ip->i_flags & SF_SNAPINVAL))
200                               return EINVAL;
201                     if (ctime) {
202                               ctime->tv_sec = DIP(ip, mtime);
203                               ctime->tv_nsec = DIP(ip, mtimensec);
204                     }
205                     return 0;
206           }
207           /*
208            * Check for free snapshot slot in the superblock.
209            */
210           for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
211                     if (fs->fs_snapinum[snaploc] == 0)
212                               break;
213           if (snaploc == FSMAXSNAP)
214                     return (ENOSPC);
215           /*
216            * Prepare the vnode to become a snapshot.
217            */
218           error = snapshot_setup(mp, vp);
219           if (error)
220                     goto out;
221 
222           /*
223            * Copy all the cylinder group maps. Although the
224            * filesystem is still active, we hope that only a few
225            * cylinder groups will change between now and when we
226            * suspend operations. Thus, we will be able to quickly
227            * touch up the few cylinder groups that changed during
228            * the suspension period.
229            */
230           error = cgaccount(vp, 1, NULL);
231           if (error)
232                     goto out;
233 
234           /*
235            * snapshot is now valid
236            */
237           ip->i_flags &= ~SF_SNAPINVAL;
238           DIP_ASSIGN(ip, flags, ip->i_flags);
239           ip->i_flag |= IN_CHANGE | IN_UPDATE;
240 
241           /*
242            * Ensure that the snapshot is completely on disk.
243            * Since we have marked it as a snapshot it is safe to
244            * unlock it as no process will be allowed to write to it.
245            */
246           error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
247           if (error)
248                     goto out;
249           VOP_UNLOCK(vp);
250           /*
251            * All allocations are done, so we can now suspend the filesystem.
252            */
253           error = vfs_suspend(vp->v_mount, 0);
254           if (error == 0) {
255                     suspended = true;
256                     vrele_flush(vp->v_mount);
257                     error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred);
258           }
259           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
260           if (error)
261                     goto out;
262           getmicrotime(&starttime);
263           /*
264            * First, copy all the cylinder group maps that have changed.
265            */
266           error = cgaccount(vp, 2, &redo);
267           if (error)
268                     goto out;
269           /*
270            * Create a copy of the superblock and its summary information.
271            */
272           error = snapshot_copyfs(mp, vp, &sbbuf);
273           if (error)
274                     goto out;
275           copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
276           /*
277            * Expunge unlinked files from our view.
278            */
279           error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
280           if (error)
281                     goto out;
282           /*
283            * Record snapshot inode. Since this is the newest snapshot,
284            * it must be placed at the end of the list.
285            */
286           if (ip->i_nlink > 0)
287                     fs->fs_snapinum[snaploc] = ip->i_number;
288 
289           mutex_enter(&si->si_lock);
290           if (is_active_snapshot(si, ip))
291                     panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
292           TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
293           if (TAILQ_FIRST(&si->si_snapshots) == ip) {
294                     /*
295                      * If this is the first snapshot on this filesystem, put the
296                      * preliminary list in place and establish the cow handler.
297                      */
298                     si->si_snapblklist = snaplist;
299                     fscow_establish(mp, ffs_copyonwrite, devvp);
300           }
301           si->si_gen++;
302           mutex_exit(&si->si_lock);
303 
304           vp->v_vflag |= VV_SYSTEM;
305           /*
306            * Set the mtime to the time the snapshot has been taken.
307            */
308           TIMEVAL_TO_TIMESPEC(&starttime, &ts);
309           if (ctime)
310                     *ctime = ts;
311           DIP_ASSIGN(ip, mtime, ts.tv_sec);
312           DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
313           ip->i_flag |= IN_CHANGE | IN_UPDATE;
314           /*
315            * Copy allocation information from all snapshots and then
316            * expunge them from our view.
317            */
318           error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
319           if (error)
320                     goto out;
321           /*
322            * Write the superblock and its summary information to the snapshot.
323            */
324           error = snapshot_writefs(mp, vp, sbbuf);
325           if (error)
326                     goto out;
327           /*
328            * We're nearly done, ensure that the snapshot is completely on disk.
329            */
330           error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
331           if (error)
332                     goto out;
333           /*
334            * Invalidate and free all pages on the snapshot vnode.
335            * We will read and write through the buffercache.
336            */
337           rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
338           error = VOP_PUTPAGES(vp, 0, 0,
339                         PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
340           if (error)
341                     goto out;
342           /*
343            * Invalidate short ( < fs_bsize ) buffers.  We will always read
344            * full size buffers later.
345            */
346           mutex_enter(&bufcache_lock);
347           KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
348           for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
349                     nbp = LIST_NEXT(bp, b_vnbufs);
350                     if (bp->b_bcount == fs->fs_bsize)
351                               continue;
352                     error = bbusy(bp, false, 0, NULL);
353                     if (error != 0) {
354                               if (error == EPASSTHROUGH) {
355                                         nbp = LIST_FIRST(&vp->v_cleanblkhd);
356                                         continue;
357                               }
358                               break;
359                     }
360                     brelsel(bp, BC_INVAL | BC_VFLUSH);
361           }
362           mutex_exit(&bufcache_lock);
363 
364 out:
365           if (sbbuf != NULL) {
366                     free(copy_fs->fs_csp, M_UFSMNT);
367                     free(sbbuf, M_UFSMNT);
368           }
369           if (fs->fs_active != NULL) {
370                     free(fs->fs_active, M_DEVBUF);
371                     fs->fs_active = NULL;
372           }
373 
374           mutex_enter(&si->si_lock);
375           if (snaplist != NULL) {
376                     if (si->si_snapblklist == snaplist)
377                               si->si_snapblklist = NULL;
378                     free(snaplist, M_UFSMNT);
379           }
380           if (error) {
381                     fs->fs_snapinum[snaploc] = 0;
382           } else {
383                     /*
384                      * As this is the newest list, it is the most inclusive, so
385                      * should replace the previous list.
386                      */
387                     si->si_snapblklist = ip->i_snapblklist;
388           }
389           si->si_gen++;
390           mutex_exit(&si->si_lock);
391 
392           if (suspended) {
393                     VOP_UNLOCK(vp);
394                     vfs_resume(vp->v_mount);
395                     vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
396 #ifdef DEBUG
397                     getmicrotime(&endtime);
398                     timersub(&endtime, &starttime, &endtime);
399                     printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
400                         mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
401                         endtime.tv_usec / 1000, redo, fs->fs_ncg);
402 #endif
403           }
404           if (error) {
405                     if (UFS_WAPBL_BEGIN(mp) == 0) {
406                               /*
407                                * We depend on ffs_truncate() to call ffs_snapremove()
408                                * before it may return an error. On failed
409                                * ffs_truncate() we have normal file with leaked
410                                * (meta-) data, but no snapshot to use.
411                                */
412                               (void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
413                               UFS_WAPBL_END(mp);
414                     }
415           } else if (ip->i_nlink > 0)
416                     vref(vp);
417           return (error);
418 }
419 
420 /*
421  * Prepare vnode to become a snapshot.
422  */
423 static int
424 snapshot_setup(struct mount *mp, struct vnode *vp)
425 {
426           int error, n, len, loc, cg;
427           daddr_t blkno, numblks;
428           struct buf *ibp, *nbp;
429           struct fs *fs = VFSTOUFS(mp)->um_fs;
430           struct lwp *l = curlwp;
431           const int wbreak = blocks_in_journal(fs)/8;
432           struct inode *ip = VTOI(vp);
433 
434           /*
435            * Check mount, readonly reference and owner.
436            */
437           if (vp->v_mount != mp)
438                     return EXDEV;
439           if (vp->v_writecount != 0)
440                     return EBUSY;
441           error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
442               0, mp, vp, NULL);
443           if (error)
444                     return EACCES;
445 
446           /*
447            * Must completely truncate the file here. Allocated
448            * blocks on a snapshot mean that block has been copied
449            * on write, see ffs_copyonwrite() testing "blkno != 0"
450            */
451           error = ufs_truncate_all(vp);
452           if (error)
453                     return error;
454 
455           /* Change inode to snapshot type file. */
456           error = UFS_WAPBL_BEGIN(mp);
457           if (error)
458                     return error;
459 #if defined(QUOTA) || defined(QUOTA2)
460           /* snapshot inodes are not accounted in quotas */
461           chkiq(ip, -1, l->l_cred, 0);
462 #endif
463           ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
464           DIP_ASSIGN(ip, flags, ip->i_flags);
465           ip->i_flag |= IN_CHANGE | IN_UPDATE;
466           ffs_update(vp, NULL, NULL, UPDATE_WAIT);
467           UFS_WAPBL_END(mp);
468 
469           KASSERT(ip->i_flags & SF_SNAPSHOT);
470           /*
471            * Write an empty list of preallocated blocks to the end of
472            * the snapshot to set size to at least that of the filesystem.
473            */
474           numblks = howmany(fs->fs_size, fs->fs_frag);
475           blkno = 1;
476           blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
477           error = vn_rdwr(UIO_WRITE, vp,
478               (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
479               UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
480           if (error)
481                     return error;
482           /*
483            * Preallocate critical data structures so that we can copy
484            * them in without further allocation after we suspend all
485            * operations on the filesystem. We would like to just release
486            * the allocated buffers without writing them since they will
487            * be filled in below once we are ready to go, but this upsets
488            * the soft update code, so we go ahead and write the new buffers.
489            *
490            * Allocate all indirect blocks and mark all of them as not
491            * needing to be copied.
492            */
493           error = UFS_WAPBL_BEGIN(mp);
494           if (error)
495                     return error;
496           for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
497                     error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
498                         fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
499                     if (error)
500                               goto out;
501                     brelse(ibp, 0);
502                     if (wbreak > 0 && (++n % wbreak) == 0) {
503                               UFS_WAPBL_END(mp);
504                               error = UFS_WAPBL_BEGIN(mp);
505                               if (error)
506                                         return error;
507                     }
508           }
509           /*
510            * Allocate copies for the superblock and its summary information.
511            */
512           error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
513               0, &nbp);
514           if (error)
515                     goto out;
516           bawrite(nbp);
517           blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
518           len = howmany(fs->fs_cssize, fs->fs_bsize);
519           for (loc = 0; loc < len; loc++) {
520                     error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
521                         fs->fs_bsize, l->l_cred, 0, &nbp);
522                     if (error)
523                               goto out;
524                     bawrite(nbp);
525                     if (wbreak > 0 && (++n % wbreak) == 0) {
526                               UFS_WAPBL_END(mp);
527                               error = UFS_WAPBL_BEGIN(mp);
528                               if (error)
529                                         return error;
530                     }
531           }
532           /*
533            * Allocate all cylinder group blocks.
534            */
535           for (cg = 0; cg < fs->fs_ncg; cg++) {
536                     error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
537                         fs->fs_bsize, l->l_cred, 0, &nbp);
538                     if (error)
539                               goto out;
540                     bawrite(nbp);
541                     if (wbreak > 0 && (++n % wbreak) == 0) {
542                               UFS_WAPBL_END(mp);
543                               error = UFS_WAPBL_BEGIN(mp);
544                               if (error)
545                                         return error;
546                     }
547           }
548 
549 out:
550           UFS_WAPBL_END(mp);
551           return error;
552 }
553 
554 /*
555  * Create a copy of the superblock and its summary information.
556  * It is up to the caller to free copyfs and copy_fs->fs_csp.
557  */
558 static int
559 snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
560 {
561           int error, i, len, loc, size;
562           void *space;
563           int32_t *lp;
564           struct buf *bp;
565           struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
566           struct vnode *devvp = VTOI(vp)->i_devvp;
567 
568           /*
569            * Grab a copy of the superblock and its summary information.
570            * We delay writing it until the suspension is released below.
571            */
572           *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
573           loc = ffs_blkoff(fs, fs->fs_sblockloc);
574           if (loc > 0)
575                     memset(*sbbuf, 0, loc);
576           copyfs = (struct fs *)((char *)(*sbbuf) + loc);
577           memcpy(copyfs, fs, fs->fs_sbsize);
578           size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
579           if (fs->fs_sbsize < size)
580                     memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
581                         size - fs->fs_sbsize);
582           size = ffs_blkroundup(fs, fs->fs_cssize);
583           if (fs->fs_contigsumsize > 0)
584                     size += fs->fs_ncg * sizeof(int32_t);
585           space = malloc(size, M_UFSMNT, M_WAITOK);
586           copyfs->fs_csp = space;
587           memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
588           space = (char *)space + fs->fs_cssize;
589           loc = howmany(fs->fs_cssize, fs->fs_fsize);
590           i = fs->fs_frag - loc % fs->fs_frag;
591           len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
592           if (len > 0) {
593                     if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
594                         len, 0, &bp)) != 0) {
595                               free(copyfs->fs_csp, M_UFSMNT);
596                               free(*sbbuf, M_UFSMNT);
597                               *sbbuf = NULL;
598                               return error;
599                     }
600                     memcpy(space, bp->b_data, (u_int)len);
601                     space = (char *)space + len;
602                     brelse(bp, BC_INVAL | BC_NOCACHE);
603           }
604           if (fs->fs_contigsumsize > 0) {
605                     copyfs->fs_maxcluster = lp = space;
606                     for (i = 0; i < fs->fs_ncg; i++)
607                               *lp++ = fs->fs_contigsumsize;
608           }
609           if (mp->mnt_wapbl)
610                     copyfs->fs_flags &= ~FS_DOWAPBL;
611           return 0;
612 }
613 
/*
 * Argument block passed through the vnode iterator to
 * snapshot_expunge_selector(); filled in by snapshot_expunge().
 */
struct snapshot_expunge_ctx {
	struct vnode *logvp;		/* log (journal) vnode, or NULL */
	struct vnode *vp;		/* vnode of the snapshot being made */
	struct fs *copy_fs;		/* the snapshot's superblock copy */
};
619 
620 static bool
621 snapshot_expunge_selector(void *cl, struct vnode *xvp)
622 {
623           struct snapshot_expunge_ctx *c = cl;
624           struct inode *xp;
625 
626           KASSERT(mutex_owned(xvp->v_interlock));
627 
628           xp = VTOI(xvp);
629           if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
630               (xp->i_flags & SF_SNAPSHOT))
631                     return false;
632 #ifdef DEBUG
633           if (snapdebug)
634                     vprint("ffs_snapshot: busy vnode", xvp);
635 #endif
636 
637           if (xvp == c->logvp)
638                     return true;
639 
640           if (xp->i_nlink > 0)
641                     return false;
642 
643           if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
644                     return false;
645 
646           return true;
647 }
648 
649 /*
650  * We must check for active files that have been unlinked (e.g., with a zero
651  * link count). We have to expunge all trace of these files from the snapshot
652  * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
653  * Note that we skip unlinked snapshot files as they will be handled separately.
654  * Calculate the snapshot list size and create a preliminary list.
655  */
656 static int
657 snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
658     daddr_t *snaplistsize, daddr_t **snaplist)
659 {
660           int cg, error = 0, len, loc;
661           daddr_t blkno, *blkp;
662           struct fs *fs = VFSTOUFS(mp)->um_fs;
663           struct inode *xp;
664           struct vnode *logvp = NULL, *xvp;
665           struct vnode_iterator *marker;
666           struct snapshot_expunge_ctx ctx;
667 
668           *snaplist = NULL;
669           /*
670            * Get the log inode if any.
671            */
672           if ((fs->fs_flags & FS_DOWAPBL) &&
673               fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
674                     error = VFS_VGET(mp, fs->fs_journallocs[UFS_WAPBL_INFS_INO],
675                         LK_EXCLUSIVE, &logvp);
676                     if (error)
677                               goto out;
678           }
679           /*
680            * We also calculate the needed size for the snapshot list.
681            */
682           *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
683               FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
684 
685           vfs_vnode_iterator_init(mp, &marker);
686           ctx.logvp = logvp;
687           ctx.vp = vp;
688           ctx.copy_fs = copy_fs;
689           while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
690               &ctx)))
691           {
692                     /*
693                      * If there is a fragment, clear it here.
694                      */
695                     xp = VTOI(xvp);
696                     blkno = 0;
697                     loc = howmany(xp->i_size, fs->fs_bsize) - 1;
698                     if (loc < UFS_NDADDR) {
699                               len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
700                               if (len > 0 && len < fs->fs_bsize) {
701                                         error = UFS_WAPBL_BEGIN(mp);
702                                         if (error) {
703                                                   vrele(xvp);
704                                                   vfs_vnode_iterator_destroy(marker);
705                                                   goto out;
706                                         }
707                                         ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
708                                             len, xp->i_number);
709                                         blkno = db_get(xp, loc);
710                                         db_assign(xp, loc, 0);
711                                         UFS_WAPBL_END(mp);
712                               }
713                     }
714                     *snaplistsize += 1;
715                     error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
716                     if (blkno)
717                               db_assign(xp, loc, blkno);
718                     if (!error) {
719                               error = UFS_WAPBL_BEGIN(mp);
720                               if (!error) {
721                                         error = ffs_freefile_snap(copy_fs, vp,
722                                             xp->i_number, xp->i_mode);
723                                         UFS_WAPBL_END(mp);
724                               }
725                     }
726                     vrele(xvp);
727                     if (error) {
728                               vfs_vnode_iterator_destroy(marker);
729                               goto out;
730                     }
731           }
732           vfs_vnode_iterator_destroy(marker);
733 
734           /*
735            * Create a preliminary list of preallocated snapshot blocks.
736            */
737           *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
738           blkp = &(*snaplist)[1];
739           *blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
740           blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
741           for (cg = 0; cg < fs->fs_ncg; cg++) {
742                     if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
743                               break;
744                     *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
745           }
746           len = howmany(fs->fs_cssize, fs->fs_bsize);
747           for (loc = 0; loc < len; loc++)
748                     *blkp++ = blkno + loc;
749           for (; cg < fs->fs_ncg; cg++)
750                     *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
751           (*snaplist)[0] = blkp - &(*snaplist)[0];
752 
753 out:
754           if (logvp != NULL)
755                     vput(logvp);
756           if (error && *snaplist != NULL) {
757                     free(*snaplist, M_UFSMNT);
758                     *snaplist = NULL;
759           }
760 
761           return error;
762 }
763 
764 /*
765  * Copy allocation information from all the snapshots in this snapshot and
766  * then expunge them from its view. Also, collect the list of allocated
767  * blocks in i_snapblklist.
768  */
769 static int
770 snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
771     struct fs *copy_fs, daddr_t snaplistsize)
772 {
773           int error = 0, i;
774           daddr_t numblks, *snaplist = NULL;
775           struct fs *fs = VFSTOUFS(mp)->um_fs;
776           struct inode *ip = VTOI(vp), *xp;
777           struct lwp *l = curlwp;
778           struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
779 
780           TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
781                     if (xp != ip) {
782                               error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
783                               if (error)
784                                         break;
785                     }
786                     if (xp->i_nlink != 0)
787                               continue;
788                     error = UFS_WAPBL_BEGIN(mp);
789                     if (error)
790                               break;
791                     error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
792                     UFS_WAPBL_END(mp);
793                     if (error)
794                               break;
795           }
796           if (error)
797                     goto out;
798           /*
799            * Allocate space for the full list of preallocated snapshot blocks.
800            */
801           snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
802           ip->i_snapblklist = &snaplist[1];
803           /*
804            * Expunge the blocks used by the snapshots from the set of
805            * blocks marked as used in the snapshot bitmaps. Also, collect
806            * the list of allocated blocks in i_snapblklist.
807            */
808           error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
809           if (error)
810                     goto out;
811           if (snaplistsize < ip->i_snapblklist - snaplist)
812                     panic("ffs_snapshot: list too small");
813           snaplistsize = ip->i_snapblklist - snaplist;
814           snaplist[0] = snaplistsize;
815           ip->i_snapblklist = &snaplist[0];
816           /*
817            * Write out the list of allocated blocks to the end of the snapshot.
818            */
819           numblks = howmany(fs->fs_size, fs->fs_frag);
820           for (i = 0; i < snaplistsize; i++)
821                     snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
822           error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
823               snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
824               UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
825           for (i = 0; i < snaplistsize; i++)
826                     snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
827 out:
828           if (error && snaplist != NULL) {
829                     free(snaplist, M_UFSMNT);
830                     ip->i_snapblklist = NULL;
831           }
832           return error;
833 }
834 
835 /*
836  * Write the superblock and its summary information to the snapshot.
837  * Make sure, the first UFS_NDADDR blocks get copied to the snapshot.
838  */
839 static int
840 snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
841 {
842           int error, len, loc;
843           void *space;
844           daddr_t blkno;
845           struct buf *bp;
846           struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
847           struct inode *ip = VTOI(vp);
848           struct lwp *l = curlwp;
849 
850           copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
851 
852           /*
853            * Write the superblock and its summary information
854            * to the snapshot.
855            */
856           blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
857           len = howmany(fs->fs_cssize, fs->fs_bsize);
858           space = copyfs->fs_csp;
859 #ifdef FFS_EI
860           if (UFS_FSNEEDSWAP(fs)) {
861                     ffs_sb_swap(copyfs, copyfs);
862                     ffs_csum_swap(space, space, fs->fs_cssize);
863           }
864 #endif
865           error = UFS_WAPBL_BEGIN(mp);
866           if (error)
867                     return error;
868           for (loc = 0; loc < len; loc++) {
869                     error = bread(vp, blkno + loc, fs->fs_bsize,
870                         B_MODIFY, &bp);
871                     if (error) {
872                               break;
873                     }
874                     memcpy(bp->b_data, space, fs->fs_bsize);
875                     space = (char *)space + fs->fs_bsize;
876                     bawrite(bp);
877           }
878           if (error)
879                     goto out;
880           error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
881               fs->fs_bsize, B_MODIFY, &bp);
882           if (error) {
883                     goto out;
884           } else {
885                     memcpy(bp->b_data, sbbuf, fs->fs_bsize);
886                     bawrite(bp);
887           }
888           /*
889            * Copy the first UFS_NDADDR blocks to the snapshot so
890            * ffs_copyonwrite() and ffs_snapblkfree() will always work on
891            * indirect blocks.
892            */
893           for (loc = 0; loc < UFS_NDADDR; loc++) {
894                     if (db_get(ip, loc) != 0)
895                               continue;
896                     error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
897                         fs->fs_bsize, l->l_cred, 0, &bp);
898                     if (error)
899                               break;
900                     error = rwfsblk(vp, B_READ, bp->b_data, loc);
901                     if (error) {
902                               brelse(bp, 0);
903                               break;
904                     }
905                     bawrite(bp);
906           }
907 
908 out:
909           UFS_WAPBL_END(mp);
910           return error;
911 }
912 
913 /*
914  * Copy all cylinder group maps.
915  */
916 static int
917 cgaccount(struct vnode *vp, int passno, int *redo)
918 {
919           int cg, error = 0;
920           struct buf *nbp;
921           struct fs *fs = VTOI(vp)->i_fs;
922 
923           if (redo != NULL)
924                     *redo = 0;
925           if (passno == 1)
926                     fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
927                         M_DEVBUF, M_WAITOK | M_ZERO);
928           for (cg = 0; cg < fs->fs_ncg; cg++) {
929                     if (passno == 2 && ACTIVECG_ISSET(fs, cg))
930                               continue;
931 
932                     if (redo != NULL)
933                               *redo += 1;
934                     error = UFS_WAPBL_BEGIN(vp->v_mount);
935                     if (error)
936                               return error;
937                     error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
938                         fs->fs_bsize, curlwp->l_cred, 0, &nbp);
939                     if (error) {
940                               UFS_WAPBL_END(vp->v_mount);
941                               break;
942                     }
943                     error = cgaccount1(cg, vp, nbp->b_data, passno);
944                     bawrite(nbp);
945                     UFS_WAPBL_END(vp->v_mount);
946                     if (error)
947                               break;
948           }
949           return error;
950 }
951 
952 /*
953  * Copy a cylinder group map. All the unallocated blocks are marked
954  * BLK_NOCOPY so that the snapshot knows that it need not copy them
955  * if they are later written. If passno is one, then this is a first
956  * pass, so only setting needs to be done. If passno is 2, then this
957  * is a revision to a previous pass which must be undone as the
958  * replacement pass is done.
959  */
960 static int
961 cgaccount1(int cg, struct vnode *vp, void *data, int passno)
962 {
963           struct buf *bp, *ibp;
964           struct inode *ip;
965           struct cg *cgp;
966           struct fs *fs;
967           struct lwp *l = curlwp;
968           daddr_t base, numblks;
969           int error, len, loc, ns __unused, indiroff;
970 
971           ip = VTOI(vp);
972           fs = ip->i_fs;
973           ns = UFS_FSNEEDSWAP(fs);
974           error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
975                     (int)fs->fs_cgsize, 0, &bp);
976           if (error) {
977                     return (error);
978           }
979           cgp = (struct cg *)bp->b_data;
980           if (!cg_chkmagic(cgp, ns)) {
981                     brelse(bp, 0);
982                     return (EIO);
983           }
984           ACTIVECG_SET(fs, cg);
985 
986           memcpy(data, bp->b_data, fs->fs_cgsize);
987           brelse(bp, 0);
988           if (fs->fs_cgsize < fs->fs_bsize)
989                     memset((char *)data + fs->fs_cgsize, 0,
990                         fs->fs_bsize - fs->fs_cgsize);
991           numblks = howmany(fs->fs_size, fs->fs_frag);
992           len = howmany(fs->fs_fpg, fs->fs_frag);
993           base = cgbase(fs, cg) / fs->fs_frag;
994           if (base + len >= numblks)
995                     len = numblks - base - 1;
996           loc = 0;
997           if (base < UFS_NDADDR) {
998                     for ( ; loc < UFS_NDADDR; loc++) {
999                               if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
1000                                         db_assign(ip, loc, BLK_NOCOPY);
1001                               else if (db_get(ip, loc) == BLK_NOCOPY) {
1002                                         if (passno == 2)
1003                                                   db_assign(ip, loc, 0);
1004                                         else if (passno == 1)
1005                                                   panic("ffs_snapshot: lost direct block");
1006                               }
1007                     }
1008           }
1009           if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
1010               fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
1011                     return (error);
1012           indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
1013           for ( ; loc < len; loc++, indiroff++) {
1014                     if (indiroff >= FFS_NINDIR(fs)) {
1015                               bawrite(ibp);
1016                               if ((error = ffs_balloc(vp,
1017                                   ffs_lblktosize(fs, (off_t)(base + loc)),
1018                                   fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
1019                                         return (error);
1020                               indiroff = 0;
1021                     }
1022                     if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
1023                               idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
1024                     else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
1025                               if (passno == 2)
1026                                         idb_assign(ip, ibp->b_data, indiroff, 0);
1027                               else if (passno == 1)
1028                                         panic("ffs_snapshot: lost indirect block");
1029                     }
1030           }
1031           bdwrite(ibp);
1032           return (0);
1033 }
1034 
1035 /*
1036  * Before expunging a snapshot inode, note all the
1037  * blocks that it claims with BLK_SNAP so that fsck will
1038  * be able to account for those blocks properly and so
1039  * that this snapshot knows that it need not copy them
1040  * if the other snapshot holding them is freed.
1041  */
1042 static int
1043 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
1044     acctfunc_t acctfunc, int expungetype)
1045 {
1046           int i, error, ns __unused;
1047           daddr_t lbn, rlbn;
1048           daddr_t len, blkno, numblks, blksperindir;
1049           struct ufs1_dinode *dip1;
1050           struct ufs2_dinode *dip2;
1051           struct lwp *l = curlwp;
1052           void *bap;
1053           struct buf *bp;
1054           struct mount *mp;
1055 
1056           ns = UFS_FSNEEDSWAP(fs);
1057           mp = snapvp->v_mount;
1058 
1059           error = UFS_WAPBL_BEGIN(mp);
1060           if (error)
1061                     return error;
1062           /*
1063            * Prepare to expunge the inode. If its inode block has not
1064            * yet been copied, then allocate and fill the copy.
1065            */
1066           lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1067           error = snapblkaddr(snapvp, lbn, &blkno);
1068           if (error)
1069                     return error;
1070           if (blkno != 0) {
1071                     error = bread(snapvp, lbn, fs->fs_bsize,
1072                         B_MODIFY, &bp);
1073           } else {
1074                     error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
1075                         fs->fs_bsize, l->l_cred, 0, &bp);
1076                     if (! error)
1077                               error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
1078           }
1079           if (error) {
1080                     UFS_WAPBL_END(mp);
1081                     return error;
1082           }
1083           /*
1084            * Set a snapshot inode to be a zero length file, regular files
1085            * or unlinked snapshots to be completely unallocated.
1086            */
1087           if (fs->fs_magic == FS_UFS1_MAGIC) {
1088                     dip1 = (struct ufs1_dinode *)bp->b_data +
1089                         ino_to_fsbo(fs, cancelip->i_number);
1090                     if (cancelip->i_flags & SF_SNAPSHOT) {
1091                               dip1->di_flags =
1092                                   ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
1093                                   SF_SNAPINVAL, ns);
1094                     }
1095                     if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1096                               dip1->di_mode = 0;
1097                     dip1->di_size = 0;
1098                     dip1->di_blocks = 0;
1099                     memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
1100           } else {
1101                     dip2 = (struct ufs2_dinode *)bp->b_data +
1102                         ino_to_fsbo(fs, cancelip->i_number);
1103                     if (cancelip->i_flags & SF_SNAPSHOT) {
1104                               dip2->di_flags =
1105                                   ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
1106                                   SF_SNAPINVAL, ns);
1107                     }
1108                     if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1109                               dip2->di_mode = 0;
1110                     dip2->di_size = 0;
1111                     dip2->di_blocks = 0;
1112                     memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
1113           }
1114           bdwrite(bp);
1115           UFS_WAPBL_END(mp);
1116           /*
1117            * Now go through and expunge all the blocks in the file
1118            * using the function requested.
1119            */
1120           numblks = howmany(cancelip->i_size, fs->fs_bsize);
1121           if (fs->fs_magic == FS_UFS1_MAGIC)
1122                     bap = &cancelip->i_ffs1_db[0];
1123           else
1124                     bap = &cancelip->i_ffs2_db[0];
1125           error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
1126           if (error)
1127                     return (error);
1128           if (fs->fs_magic == FS_UFS1_MAGIC)
1129                     bap = &cancelip->i_ffs1_ib[0];
1130           else
1131                     bap = &cancelip->i_ffs2_ib[0];
1132           error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
1133           if (error)
1134                     return (error);
1135           blksperindir = 1;
1136           lbn = -UFS_NDADDR;
1137           len = numblks - UFS_NDADDR;
1138           rlbn = UFS_NDADDR;
1139           for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1140                     error = indiracct(snapvp, ITOV(cancelip), i,
1141                         ib_get(cancelip, i), lbn, rlbn, len,
1142                         blksperindir, fs, acctfunc, expungetype);
1143                     if (error)
1144                               return (error);
1145                     blksperindir *= FFS_NINDIR(fs);
1146                     lbn -= blksperindir + 1;
1147                     len -= blksperindir;
1148                     rlbn += blksperindir;
1149           }
1150           return (0);
1151 }
1152 
1153 /*
1154  * Descend an indirect block chain for vnode cancelvp accounting for all
1155  * its indirect blocks in snapvp.
1156  */
1157 static int
1158 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
1159     daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
1160     daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
1161 {
1162           int error, num, i;
1163           daddr_t subblksperindir;
1164           struct indir indirs[UFS_NIADDR + 2];
1165           daddr_t last;
1166           void *bap;
1167           struct buf *bp;
1168 
1169           if (blkno == 0) {
1170                     if (expungetype == BLK_NOCOPY)
1171                               return (0);
1172                     panic("indiracct: missing indir");
1173           }
1174           if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1175                     return (error);
1176           if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1177                     panic("indiracct: botched params");
1178           /*
1179            * We have to expand bread here since it will deadlock looking
1180            * up the block number for any blocks that are not in the cache.
1181            */
1182           error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
1183               false, &bp);
1184           if (error)
1185                     return error;
1186           if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
1187               rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
1188                     brelse(bp, 0);
1189                     return (error);
1190           }
1191           /*
1192            * Account for the block pointers in this indirect block.
1193            */
1194           last = howmany(remblks, blksperindir);
1195           if (last > FFS_NINDIR(fs))
1196                     last = FFS_NINDIR(fs);
1197           bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
1198           memcpy((void *)bap, bp->b_data, fs->fs_bsize);
1199           brelse(bp, 0);
1200           error = (*acctfunc)(snapvp, bap, 0, last,
1201               fs, level == 0 ? rlbn : -1, expungetype);
1202           if (error || level == 0)
1203                     goto out;
1204           /*
1205            * Account for the block pointers in each of the indirect blocks
1206            * in the levels below us.
1207            */
1208           subblksperindir = blksperindir / FFS_NINDIR(fs);
1209           for (lbn++, level--, i = 0; i < last; i++) {
1210                     error = indiracct(snapvp, cancelvp, level,
1211                         idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
1212                         subblksperindir, fs, acctfunc, expungetype);
1213                     if (error)
1214                               goto out;
1215                     rlbn += blksperindir;
1216                     lbn -= blksperindir;
1217                     remblks -= blksperindir;
1218           }
1219 out:
1220           free(bap, M_DEVBUF);
1221           return (error);
1222 }
1223 
1224 /*
1225  * Do both snap accounting and map accounting.
1226  */
1227 static int
1228 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1229     struct fs *fs, daddr_t lblkno,
1230     int exptype /* BLK_SNAP or BLK_NOCOPY */)
1231 {
1232           int error;
1233 
1234           if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
1235                     return (error);
1236           return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
1237 }
1238 
1239 /*
1240  * Identify a set of blocks allocated in a snapshot inode.
1241  */
1242 static int
1243 snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1244     struct fs *fs, daddr_t lblkno,
1245     int expungetype /* BLK_SNAP or BLK_NOCOPY */)
1246 {
1247           struct inode *ip = VTOI(vp);
1248           struct lwp *l = curlwp;
1249           struct mount *mp = vp->v_mount;
1250           daddr_t blkno;
1251           daddr_t lbn;
1252           struct buf *ibp;
1253           int error, n;
1254           const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1255 
1256           error = UFS_WAPBL_BEGIN(mp);
1257           if (error)
1258                     return error;
1259           for ( n = 0; oldblkp < lastblkp; oldblkp++) {
1260                     blkno = idb_get(ip, bap, oldblkp);
1261                     if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1262                               continue;
1263                     lbn = ffs_fragstoblks(fs, blkno);
1264                     if (lbn < UFS_NDADDR) {
1265                               blkno = db_get(ip, lbn);
1266                               ip->i_flag |= IN_CHANGE | IN_UPDATE;
1267                     } else {
1268                               error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
1269                                   fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1270                               if (error)
1271                                         break;
1272                               blkno = idb_get(ip, ibp->b_data,
1273                                   (lbn - UFS_NDADDR) % FFS_NINDIR(fs));
1274                     }
1275                     /*
1276                      * If we are expunging a snapshot vnode and we
1277                      * find a block marked BLK_NOCOPY, then it is
1278                      * one that has been allocated to this snapshot after
1279                      * we took our current snapshot and can be ignored.
1280                      */
1281                     if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
1282                               if (lbn >= UFS_NDADDR)
1283                                         brelse(ibp, 0);
1284                     } else {
1285                               if (blkno != 0)
1286                                         panic("snapacct: bad block");
1287                               if (lbn < UFS_NDADDR)
1288                                         db_assign(ip, lbn, expungetype);
1289                               else {
1290                                         idb_assign(ip, ibp->b_data,
1291                                             (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
1292                                         bdwrite(ibp);
1293                               }
1294                     }
1295                     if (wbreak > 0 && (++n % wbreak) == 0) {
1296                               UFS_WAPBL_END(mp);
1297                               error = UFS_WAPBL_BEGIN(mp);
1298                               if (error)
1299                                         return error;
1300                     }
1301           }
1302           UFS_WAPBL_END(mp);
1303           return error;
1304 }
1305 
1306 /*
1307  * Account for a set of blocks allocated in a snapshot inode.
1308  */
1309 static int
1310 mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1311     struct fs *fs, daddr_t lblkno, int expungetype)
1312 {
1313           daddr_t blkno;
1314           struct inode *ip;
1315           struct mount *mp = vp->v_mount;
1316           ino_t inum;
1317           int acctit, error, n;
1318           const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1319 
1320           error = UFS_WAPBL_BEGIN(mp);
1321           if (error)
1322                     return error;
1323           ip = VTOI(vp);
1324           inum = ip->i_number;
1325           if (lblkno == -1)
1326                     acctit = 0;
1327           else
1328                     acctit = 1;
1329           for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
1330                     blkno = idb_get(ip, bap, oldblkp);
1331                     if (blkno == 0 || blkno == BLK_NOCOPY)
1332                               continue;
1333                     if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1334                               *ip->i_snapblklist++ = lblkno;
1335                     if (blkno == BLK_SNAP)
1336                               blkno = ffs_blkstofrags(fs, lblkno);
1337                     ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
1338                     if (wbreak > 0 && (++n % wbreak) == 0) {
1339                               UFS_WAPBL_END(mp);
1340                               error = UFS_WAPBL_BEGIN(mp);
1341                               if (error)
1342                                         return error;
1343                     }
1344           }
1345           UFS_WAPBL_END(mp);
1346           return (0);
1347 }
1348 
1349 /*
1350  * Number of blocks that fit into the journal or zero if not logging.
1351  */
1352 static int
1353 blocks_in_journal(struct fs *fs)
1354 {
1355           off_t bpj;
1356 
1357           if ((fs->fs_flags & FS_DOWAPBL) == 0)
1358                     return 0;
1359           bpj = 1;
1360           if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
1361                     switch (fs->fs_journal_location) {
1362                     case UFS_WAPBL_JOURNALLOC_END_PARTITION:
1363                               bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
1364                                   fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
1365                               break;
1366                     case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
1367                               bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
1368                                   fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
1369                               break;
1370                     }
1371           }
1372           bpj /= fs->fs_bsize;
1373           return (bpj > 0 ? bpj : 1);
1374 }
1375 #endif /* defined(FFS_NO_SNAPSHOT) */
1376 
1377 /*
1378  * Decrement extra reference on snapshot when last name is removed.
1379  * It will not be freed until the last open reference goes away.
1380  */
1381 void
ffs_snapgone(struct vnode * vp)1382 ffs_snapgone(struct vnode *vp)
1383 {
1384           struct inode *xp, *ip = VTOI(vp);
1385           struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
1386           struct fs *fs;
1387           struct snap_info *si;
1388           int snaploc;
1389 
1390           si = VFSTOUFS(mp)->um_snapinfo;
1391 
1392           /*
1393            * Find snapshot in incore list.
1394            */
1395           mutex_enter(&si->si_lock);
1396           TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
1397                     if (xp == ip)
1398                               break;
1399           mutex_exit(&si->si_lock);
1400           if (xp != NULL)
1401                     vrele(ITOV(ip));
1402 #ifdef DEBUG
1403           else if (snapdebug)
1404                     printf("ffs_snapgone: lost snapshot vnode %llu\n",
1405                         (unsigned long long)ip->i_number);
1406 #endif
1407           /*
1408            * Delete snapshot inode from superblock. Keep list dense.
1409            */
1410           mutex_enter(&si->si_lock);
1411           fs = ip->i_fs;
1412           for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1413                     if (fs->fs_snapinum[snaploc] == ip->i_number)
1414                               break;
1415           if (snaploc < FSMAXSNAP) {
1416                     for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1417                               if (fs->fs_snapinum[snaploc] == 0)
1418                                         break;
1419                               fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1420                     }
1421                     fs->fs_snapinum[snaploc - 1] = 0;
1422           }
1423           si->si_gen++;
1424           mutex_exit(&si->si_lock);
1425 }
1426 
1427 /*
1428  * Prepare a snapshot file for being removed.
1429  */
1430 void
ffs_snapremove(struct vnode * vp)1431 ffs_snapremove(struct vnode *vp)
1432 {
1433           struct inode *ip = VTOI(vp), *xp;
1434           struct vnode *devvp = ip->i_devvp;
1435           struct fs *fs = ip->i_fs;
1436           struct mount *mp = spec_node_getmountedfs(devvp);
1437           struct buf *ibp;
1438           struct snap_info *si;
1439           struct lwp *l = curlwp;
1440           daddr_t numblks, blkno, dblk;
1441           int error, loc, last;
1442 
1443           si = VFSTOUFS(mp)->um_snapinfo;
1444           /*
1445            * If active, delete from incore list (this snapshot may
1446            * already have been in the process of being deleted, so
1447            * would not have been active).
1448            *
1449            * Clear copy-on-write flag if last snapshot.
1450            */
1451           mutex_enter(&si->si_snaplock);
1452           mutex_enter(&si->si_lock);
1453           if (is_active_snapshot(si, ip)) {
1454                     TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
1455                     if (TAILQ_FIRST(&si->si_snapshots) != 0) {
1456                               /* Roll back the list of preallocated blocks. */
1457                               xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1458                               si->si_snapblklist = xp->i_snapblklist;
1459                               si->si_gen++;
1460                               mutex_exit(&si->si_lock);
1461                               mutex_exit(&si->si_snaplock);
1462                     } else {
1463                               si->si_snapblklist = 0;
1464                               si->si_gen++;
1465                               mutex_exit(&si->si_lock);
1466                               mutex_exit(&si->si_snaplock);
1467                               fscow_disestablish(mp, ffs_copyonwrite, devvp);
1468                     }
1469                     if (ip->i_snapblklist != NULL) {
1470                               free(ip->i_snapblklist, M_UFSMNT);
1471                               ip->i_snapblklist = NULL;
1472                     }
1473           } else {
1474                     mutex_exit(&si->si_lock);
1475                     mutex_exit(&si->si_snaplock);
1476           }
1477           /*
1478            * Clear all BLK_NOCOPY fields. Pass any block claims to other
1479            * snapshots that want them (see ffs_snapblkfree below).
1480            */
1481           for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
1482                     dblk = db_get(ip, blkno);
1483                     if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1484                               db_assign(ip, blkno, 0);
1485                     else if ((dblk == ffs_blkstofrags(fs, blkno) &&
1486                          ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1487                          ip->i_number))) {
1488                               DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1489                               db_assign(ip, blkno, 0);
1490                     }
1491           }
1492           numblks = howmany(ip->i_size, fs->fs_bsize);
1493           for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
1494                     error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
1495                         fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1496                     if (error)
1497                               continue;
1498                     if (fs->fs_size - blkno > FFS_NINDIR(fs))
1499                               last = FFS_NINDIR(fs);
1500                     else
1501                               last = fs->fs_size - blkno;
1502                     for (loc = 0; loc < last; loc++) {
1503                               dblk = idb_get(ip, ibp->b_data, loc);
1504                               if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1505                                         idb_assign(ip, ibp->b_data, loc, 0);
1506                               else if (dblk == ffs_blkstofrags(fs, blkno) &&
1507                                   ffs_snapblkfree(fs, ip->i_devvp, dblk,
1508                                   fs->fs_bsize, ip->i_number)) {
1509                                         DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1510                                         idb_assign(ip, ibp->b_data, loc, 0);
1511                               }
1512                     }
1513                     bawrite(ibp);
1514                     UFS_WAPBL_END(mp);
1515                     error = UFS_WAPBL_BEGIN(mp);
1516                     KASSERT(error == 0);
1517           }
1518           /*
1519            * Clear snapshot flag and drop reference.
1520            */
1521           ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
1522           DIP_ASSIGN(ip, flags, ip->i_flags);
1523           ip->i_flag |= IN_CHANGE | IN_UPDATE;
1524 #if defined(QUOTA) || defined(QUOTA2)
1525           chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
1526           chkiq(ip, 1, l->l_cred, FORCE);
1527 #endif
1528 }
1529 
1530 /*
1531  * Notification that a block is being freed. Return zero if the free
1532  * should be allowed to proceed. Return non-zero if the snapshot file
1533  * wants to claim the block. The block will be claimed if it is an
1534  * uncopied part of one of the snapshots. It will be freed if it is
1535  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1536  * If a fragment is being freed, then all snapshots that care about
1537  * it must make a copy since a snapshot file can only claim full sized
1538  * blocks. Note that if more than one snapshot file maps the block,
1539  * we can pick one at random to claim it. Since none of the snapshots
1540  * can change, we are assurred that they will all see the same unmodified
1541  * image. When deleting a snapshot file (see ffs_snapremove above), we
1542  * must push any of these claimed blocks to one of the other snapshots
1543  * that maps it. These claimed blocks are easily identified as they will
1544  * have a block number equal to their logical block number within the
1545  * snapshot. A copied block can never have this property because they
1546  * must always have been allocated from a BLK_NOCOPY location.
1547  */
1548 int
ffs_snapblkfree(struct fs * fs,struct vnode * devvp,daddr_t bno,long size,ino_t inum)1549 ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
1550     long size, ino_t inum)
1551 {
1552           struct mount *mp = spec_node_getmountedfs(devvp);
1553           struct buf *ibp;
1554           struct inode *ip;
1555           struct vnode *vp = NULL;
1556           struct snap_info *si;
1557           void *saved_data = NULL;
1558           daddr_t lbn;
1559           daddr_t blkno;
1560           uint32_t gen;
1561           int indiroff = 0, error = 0, claimedblk = 0;
1562 
1563           si = VFSTOUFS(mp)->um_snapinfo;
1564           lbn = ffs_fragstoblks(fs, bno);
1565           mutex_enter(&si->si_snaplock);
1566           mutex_enter(&si->si_lock);
1567           si->si_owner = curlwp;
1568 
1569 retry:
1570           gen = si->si_gen;
1571           TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1572                     vp = ITOV(ip);
1573                     /*
1574                      * Lookup block being written.
1575                      */
1576                     if (lbn < UFS_NDADDR) {
1577                               blkno = db_get(ip, lbn);
1578                     } else {
1579                               mutex_exit(&si->si_lock);
1580                               error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
1581                                   fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
1582                               if (error) {
1583                                         mutex_enter(&si->si_lock);
1584                                         break;
1585                               }
1586                               indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
1587                               blkno = idb_get(ip, ibp->b_data, indiroff);
1588                               mutex_enter(&si->si_lock);
1589                               if (gen != si->si_gen) {
1590                                         brelse(ibp, 0);
1591                                         goto retry;
1592                               }
1593                     }
1594                     /*
1595                      * Check to see if block needs to be copied.
1596                      */
1597                     if (blkno == 0) {
1598                               /*
1599                                * A block that we map is being freed. If it has not
1600                                * been claimed yet, we will claim or copy it (below).
1601                                */
1602                               claimedblk = 1;
1603                     } else if (blkno == BLK_SNAP) {
1604                               /*
1605                                * No previous snapshot claimed the block,
1606                                * so it will be freed and become a BLK_NOCOPY
1607                                * (don't care) for us.
1608                                */
1609                               if (claimedblk)
1610                                         panic("snapblkfree: inconsistent block type");
1611                               if (lbn < UFS_NDADDR) {
1612                                         db_assign(ip, lbn, BLK_NOCOPY);
1613                                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
1614                               } else {
1615                                         idb_assign(ip, ibp->b_data, indiroff,
1616                                             BLK_NOCOPY);
1617                                         mutex_exit(&si->si_lock);
1618                                         if (ip->i_nlink > 0)
1619                                                   bwrite(ibp);
1620                                         else
1621                                                   bdwrite(ibp);
1622                                         mutex_enter(&si->si_lock);
1623                                         if (gen != si->si_gen)
1624                                                   goto retry;
1625                               }
1626                               continue;
1627                     } else /* BLK_NOCOPY or default */ {
1628                               /*
1629                                * If the snapshot has already copied the block
1630                                * (default), or does not care about the block,
1631                                * it is not needed.
1632                                */
1633                               if (lbn >= UFS_NDADDR)
1634                                         brelse(ibp, 0);
1635                               continue;
1636                     }
1637                     /*
1638                      * If this is a full size block, we will just grab it
1639                      * and assign it to the snapshot inode. Otherwise we
1640                      * will proceed to copy it. See explanation for this
1641                      * routine as to why only a single snapshot needs to
1642                      * claim this block.
1643                      */
1644                     if (size == fs->fs_bsize) {
1645 #ifdef DEBUG
1646                               if (snapdebug)
1647                                         printf("%s %llu lbn %" PRId64
1648                                             "from inum %llu\n",
1649                                             "Grabonremove: snapino",
1650                                             (unsigned long long)ip->i_number,
1651                                             lbn, (unsigned long long)inum);
1652 #endif
1653                               mutex_exit(&si->si_lock);
1654                               if (lbn < UFS_NDADDR) {
1655                                         db_assign(ip, lbn, bno);
1656                               } else {
1657                                         idb_assign(ip, ibp->b_data, indiroff, bno);
1658                                         if (ip->i_nlink > 0)
1659                                                   bwrite(ibp);
1660                                         else
1661                                                   bdwrite(ibp);
1662                               }
1663                               DIP_ADD(ip, blocks, btodb(size));
1664                               ip->i_flag |= IN_CHANGE | IN_UPDATE;
1665                               if (ip->i_nlink > 0 && mp->mnt_wapbl)
1666                                         error = syncsnap(vp);
1667                               else
1668                                         error = 0;
1669                               mutex_enter(&si->si_lock);
1670                               si->si_owner = NULL;
1671                               mutex_exit(&si->si_lock);
1672                               mutex_exit(&si->si_snaplock);
1673                               return (error == 0);
1674                     }
1675                     if (lbn >= UFS_NDADDR)
1676                               brelse(ibp, 0);
1677 #ifdef DEBUG
1678                     if (snapdebug)
1679                               printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1680                                   "Copyonremove: snapino ",
1681                                   (unsigned long long)ip->i_number,
1682                                   lbn, "for inum", (unsigned long long)inum, size);
1683 #endif
1684                     /*
1685                      * If we have already read the old block contents, then
1686                      * simply copy them to the new block. Note that we need
1687                      * to synchronously write snapshots that have not been
1688                      * unlinked, and hence will be visible after a crash,
1689                      * to ensure their integrity.
1690                      */
1691                     mutex_exit(&si->si_lock);
1692                     if (saved_data == NULL) {
1693                               saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1694                               error = rwfsblk(vp, B_READ, saved_data, lbn);
1695                               if (error) {
1696                                         free(saved_data, M_UFSMNT);
1697                                         saved_data = NULL;
1698                                         mutex_enter(&si->si_lock);
1699                                         break;
1700                               }
1701                     }
1702                     error = wrsnapblk(vp, saved_data, lbn);
1703                     if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1704                               error = syncsnap(vp);
1705                     mutex_enter(&si->si_lock);
1706                     if (error)
1707                               break;
1708                     if (gen != si->si_gen)
1709                               goto retry;
1710           }
1711           si->si_owner = NULL;
1712           mutex_exit(&si->si_lock);
1713           mutex_exit(&si->si_snaplock);
1714           if (saved_data)
1715                     free(saved_data, M_UFSMNT);
1716           /*
1717            * If we have been unable to allocate a block in which to do
1718            * the copy, then return non-zero so that the fragment will
1719            * not be freed. Although space will be lost, the snapshot
1720            * will stay consistent.
1721            */
1722           return (error);
1723 }
1724 
1725 /*
1726  * Associate snapshot files when mounting.
1727  */
1728 void
ffs_snapshot_mount(struct mount * mp)1729 ffs_snapshot_mount(struct mount *mp)
1730 {
1731           struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1732           struct fs *fs = VFSTOUFS(mp)->um_fs;
1733           struct lwp *l = curlwp;
1734           struct vnode *vp;
1735           struct inode *ip, *xp;
1736           struct snap_info *si;
1737           daddr_t snaplistsize, *snapblklist;
1738           int i, error, ns __unused, snaploc, loc;
1739 
1740           /*
1741            * No persistent snapshots on apple ufs file systems.
1742            */
1743           if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1744                     return;
1745 
1746           si = VFSTOUFS(mp)->um_snapinfo;
1747           ns = UFS_FSNEEDSWAP(fs);
1748           /*
1749            * XXX The following needs to be set before ffs_truncate or
1750            * VOP_READ can be called.
1751            */
1752           mp->mnt_stat.f_iosize = fs->fs_bsize;
1753           /*
1754            * Process each snapshot listed in the superblock.
1755            */
1756           vp = NULL;
1757           mutex_enter(&si->si_lock);
1758           for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1759                     if (fs->fs_snapinum[snaploc] == 0)
1760                               break;
1761                     if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1762                         LK_EXCLUSIVE, &vp)) != 0) {
1763                               printf("ffs_snapshot_mount: vget failed %d\n", error);
1764                               continue;
1765                     }
1766                     ip = VTOI(vp);
1767                     if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
1768                         SF_SNAPSHOT) {
1769                               printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1770                                   fs->fs_snapinum[snaploc]);
1771                               vput(vp);
1772                               vp = NULL;
1773                               for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1774                                         if (fs->fs_snapinum[loc] == 0)
1775                                                   break;
1776                                         fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1777                               }
1778                               fs->fs_snapinum[loc - 1] = 0;
1779                               snaploc--;
1780                               continue;
1781                     }
1782 
1783                     /*
1784                      * Read the block hints list. Use an empty list on
1785                      * read errors.
1786                      */
1787                     error = vn_rdwr(UIO_READ, vp,
1788                         (void *)&snaplistsize, sizeof(snaplistsize),
1789                         ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1790                         UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1791                         l->l_cred, NULL, NULL);
1792                     if (error) {
1793                               printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1794                               snaplistsize = 1;
1795                     } else
1796                               snaplistsize = ufs_rw64(snaplistsize, ns);
1797                     snapblklist = malloc(
1798                         snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1799                     if (error)
1800                               snapblklist[0] = 1;
1801                     else {
1802                               error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1803                                   snaplistsize * sizeof(daddr_t),
1804                                   ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1805                                   UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1806                                   l->l_cred, NULL, NULL);
1807                               for (i = 0; i < snaplistsize; i++)
1808                                         snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1809                               if (error) {
1810                                         printf("ffs_snapshot_mount: read_2 failed %d\n",
1811                                             error);
1812                                         snapblklist[0] = 1;
1813                               }
1814                     }
1815                     ip->i_snapblklist = &snapblklist[0];
1816 
1817                     /*
1818                      * Link it onto the active snapshot list.
1819                      */
1820                     if (is_active_snapshot(si, ip))
1821                               panic("ffs_snapshot_mount: %"PRIu64" already on list",
1822                                   ip->i_number);
1823                     else
1824                               TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1825                     vp->v_vflag |= VV_SYSTEM;
1826                     VOP_UNLOCK(vp);
1827           }
1828           /*
1829            * No usable snapshots found.
1830            */
1831           if (vp == NULL) {
1832                     mutex_exit(&si->si_lock);
1833                     return;
1834           }
1835           /*
1836            * Attach the block hints list. We always want to
1837            * use the list from the newest snapshot.
1838           */
1839           xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1840           si->si_snapblklist = xp->i_snapblklist;
1841           fscow_establish(mp, ffs_copyonwrite, devvp);
1842           si->si_gen++;
1843           mutex_exit(&si->si_lock);
1844 }
1845 
1846 /*
1847  * Disassociate snapshot files when unmounting.
1848  */
1849 void
ffs_snapshot_unmount(struct mount * mp)1850 ffs_snapshot_unmount(struct mount *mp)
1851 {
1852           struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1853           struct inode *xp;
1854           struct vnode *vp = NULL;
1855           struct snap_info *si;
1856 
1857           si = VFSTOUFS(mp)->um_snapinfo;
1858           mutex_enter(&si->si_lock);
1859           while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1860                     vp = ITOV(xp);
1861                     TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1862                     if (xp->i_snapblklist == si->si_snapblklist)
1863                               si->si_snapblklist = NULL;
1864                     free(xp->i_snapblklist, M_UFSMNT);
1865                     if (xp->i_nlink > 0) {
1866                               si->si_gen++;
1867                               mutex_exit(&si->si_lock);
1868                               vrele(vp);
1869                               mutex_enter(&si->si_lock);
1870                     }
1871           }
1872           si->si_gen++;
1873           mutex_exit(&si->si_lock);
1874           if (vp)
1875                     fscow_disestablish(mp, ffs_copyonwrite, devvp);
1876 }
1877 
1878 /*
1879  * Check for need to copy block that is about to be written,
1880  * copying the block if necessary.
1881  */
1882 static int
ffs_copyonwrite(void * v,struct buf * bp,bool data_valid)1883 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
1884 {
1885           struct fs *fs;
1886           struct inode *ip;
1887           struct vnode *devvp = v, *vp = NULL;
1888           struct mount *mp = spec_node_getmountedfs(devvp);
1889           struct snap_info *si;
1890           void *saved_data = NULL;
1891           daddr_t lbn, blkno, *snapblklist;
1892           uint32_t gen;
1893           int lower, upper, mid, snapshot_locked = 0, error = 0;
1894 
1895           /*
1896            * Check for valid snapshots.
1897            */
1898           si = VFSTOUFS(mp)->um_snapinfo;
1899           mutex_enter(&si->si_lock);
1900           ip = TAILQ_FIRST(&si->si_snapshots);
1901           if (ip == NULL) {
1902                     mutex_exit(&si->si_lock);
1903                     return 0;
1904           }
1905           /*
1906            * First check to see if it is after the file system,
1907            * in the journal or in the preallocated list.
1908            * By doing these checks we avoid several potential deadlocks.
1909            */
1910           fs = ip->i_fs;
1911           lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
1912           if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
1913                     mutex_exit(&si->si_lock);
1914                     return 0;
1915           }
1916           if ((fs->fs_flags & FS_DOWAPBL) &&
1917               fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
1918                     off_t blk_off, log_start, log_end;
1919 
1920                     log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
1921                         fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1922                     log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
1923                         fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1924                     blk_off = dbtob(bp->b_blkno);
1925                     if (blk_off >= log_start && blk_off < log_end) {
1926                               mutex_exit(&si->si_lock);
1927                               return 0;
1928                     }
1929           }
1930           snapblklist = si->si_snapblklist;
1931           upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
1932           lower = 1;
1933           while (lower <= upper) {
1934                     mid = (lower + upper) / 2;
1935                     if (snapblklist[mid] == lbn)
1936                               break;
1937                     if (snapblklist[mid] < lbn)
1938                               lower = mid + 1;
1939                     else
1940                               upper = mid - 1;
1941           }
1942           if (lower <= upper) {
1943                     mutex_exit(&si->si_lock);
1944                     return 0;
1945           }
1946           /*
1947            * Not in the precomputed list, so check the snapshots.
1948            */
1949            if (si->si_owner != curlwp) {
1950                     if (!mutex_tryenter(&si->si_snaplock)) {
1951                               mutex_exit(&si->si_lock);
1952                               mutex_enter(&si->si_snaplock);
1953                               mutex_enter(&si->si_lock);
1954                     }
1955                     si->si_owner = curlwp;
1956                     snapshot_locked = 1;
1957            }
1958            if (data_valid && bp->b_bcount == fs->fs_bsize)
1959                     saved_data = bp->b_data;
1960 retry:
1961           gen = si->si_gen;
1962           TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1963                     vp = ITOV(ip);
1964                     /*
1965                      * We ensure that everything of our own that needs to be
1966                      * copied will be done at the time that ffs_snapshot is
1967                      * called. Thus we can skip the check here which can
1968                      * deadlock in doing the lookup in ffs_balloc.
1969                      */
1970                     if (bp->b_vp == vp)
1971                               continue;
1972                     /*
1973                      * Check to see if block needs to be copied.
1974                      */
1975                     if (lbn < UFS_NDADDR) {
1976                               blkno = db_get(ip, lbn);
1977                     } else {
1978                               mutex_exit(&si->si_lock);
1979                               blkno = 0; /* XXX: GCC */
1980                               if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
1981                                         mutex_enter(&si->si_lock);
1982                                         break;
1983                               }
1984                               mutex_enter(&si->si_lock);
1985                               if (gen != si->si_gen)
1986                                         goto retry;
1987                     }
1988                     KASSERTMSG((blkno != BLK_SNAP || bp->b_lblkno < 0),
1989                         "ffs_copyonwrite: bad copy block: blkno %jd, lblkno %jd",
1990                         (intmax_t)blkno, (intmax_t)bp->b_lblkno);
1991                     if (blkno != 0)
1992                               continue;
1993 
1994                     if (curlwp == uvm.pagedaemon_lwp) {
1995                               error = ENOMEM;
1996                               break;
1997                     }
1998                     /* Only one level of recursion allowed. */
1999                     KASSERT(snapshot_locked);
2000                     /*
2001                      * Allocate the block into which to do the copy. Since
2002                      * multiple processes may all try to copy the same block,
2003                      * we have to recheck our need to do a copy if we sleep
2004                      * waiting for the lock.
2005                      *
2006                      * Because all snapshots on a filesystem share a single
2007                      * lock, we ensure that we will never be in competition
2008                      * with another process to allocate a block.
2009                      */
2010 #ifdef DEBUG
2011                     if (snapdebug) {
2012                               printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
2013                                   (unsigned long long)ip->i_number, lbn);
2014                               if (bp->b_vp == devvp)
2015                                         printf("fs metadata");
2016                               else
2017                                         printf("inum %llu", (unsigned long long)
2018                                             VTOI(bp->b_vp)->i_number);
2019                               printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
2020                     }
2021 #endif
2022                     /*
2023                      * If we have already read the old block contents, then
2024                      * simply copy them to the new block. Note that we need
2025                      * to synchronously write snapshots that have not been
2026                      * unlinked, and hence will be visible after a crash,
2027                      * to ensure their integrity.
2028                      */
2029                     mutex_exit(&si->si_lock);
2030                     if (saved_data == NULL) {
2031                               saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
2032                               error = rwfsblk(vp, B_READ, saved_data, lbn);
2033                               if (error) {
2034                                         free(saved_data, M_UFSMNT);
2035                                         saved_data = NULL;
2036                                         mutex_enter(&si->si_lock);
2037                                         break;
2038                               }
2039                     }
2040                     error = wrsnapblk(vp, saved_data, lbn);
2041                     if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
2042                               error = syncsnap(vp);
2043                     mutex_enter(&si->si_lock);
2044                     if (error)
2045                               break;
2046                     if (gen != si->si_gen)
2047                               goto retry;
2048           }
2049           /*
2050            * Note that we need to synchronously write snapshots that
2051            * have not been unlinked, and hence will be visible after
2052            * a crash, to ensure their integrity.
2053            */
2054           if (snapshot_locked) {
2055                     si->si_owner = NULL;
2056                     mutex_exit(&si->si_lock);
2057                     mutex_exit(&si->si_snaplock);
2058           } else
2059                     mutex_exit(&si->si_lock);
2060           if (saved_data && saved_data != bp->b_data)
2061                     free(saved_data, M_UFSMNT);
2062           return error;
2063 }
2064 
2065 /*
2066  * Read from a snapshot.
2067  */
2068 int
ffs_snapshot_read(struct vnode * vp,struct uio * uio,int ioflag)2069 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
2070 {
2071           struct inode *ip = VTOI(vp);
2072           struct fs *fs = ip->i_fs;
2073           struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
2074           struct buf *bp;
2075           daddr_t lbn, nextlbn;
2076           off_t fsbytes, bytesinfile;
2077           long size, xfersize, blkoffset;
2078           int error;
2079 
2080           mutex_enter(&si->si_snaplock);
2081 
2082           if (ioflag & IO_ALTSEMANTICS)
2083                     fsbytes = ip->i_size;
2084           else
2085                     fsbytes = ffs_lfragtosize(fs, fs->fs_size);
2086           for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2087                     bytesinfile = fsbytes - uio->uio_offset;
2088                     if (bytesinfile <= 0)
2089                               break;
2090                     lbn = ffs_lblkno(fs, uio->uio_offset);
2091                     nextlbn = lbn + 1;
2092                     size = fs->fs_bsize;
2093                     blkoffset = ffs_blkoff(fs, uio->uio_offset);
2094                     xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2095                         bytesinfile);
2096 
2097                     if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
2098                               if (ffs_lblktosize(fs, lbn) + size > fsbytes)
2099                                         size = ffs_fragroundup(fs,
2100                                             fsbytes - ffs_lblktosize(fs, lbn));
2101                               error = bread(vp, lbn, size, 0, &bp);
2102                     } else {
2103                               int nextsize = fs->fs_bsize;
2104                               error = breadn(vp, lbn,
2105                                   size, &nextlbn, &nextsize, 1, 0, &bp);
2106                     }
2107                     if (error)
2108                               break;
2109 
2110                     /*
2111                      * We should only get non-zero b_resid when an I/O error
2112                      * has occurred, which should cause us to break above.
2113                      * However, if the short read did not cause an error,
2114                      * then we want to ensure that we do not uiomove bad
2115                      * or uninitialized data.
2116                      */
2117                     size -= bp->b_resid;
2118                     if (size < blkoffset + xfersize) {
2119                               xfersize = size - blkoffset;
2120                               if (xfersize <= 0)
2121                                         break;
2122                     }
2123                     error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2124                     if (error)
2125                               break;
2126                     brelse(bp, BC_AGE);
2127           }
2128           if (bp != NULL)
2129                     brelse(bp, BC_AGE);
2130 
2131           mutex_exit(&si->si_snaplock);
2132           return error;
2133 }
2134 
2135 /*
2136  * Lookup a snapshots data block address.
2137  * Simpler than UFS_BALLOC() as we know all metadata is already allocated
2138  * and safe even for the pagedaemon where we cannot bread().
2139  */
2140 static int
snapblkaddr(struct vnode * vp,daddr_t lbn,daddr_t * res)2141 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
2142 {
2143           struct indir indirs[UFS_NIADDR + 2];
2144           struct inode *ip = VTOI(vp);
2145           struct fs *fs = ip->i_fs;
2146           struct buf *bp;
2147           int error, num;
2148 
2149           KASSERT(lbn >= 0);
2150 
2151           if (lbn < UFS_NDADDR) {
2152                     *res = db_get(ip, lbn);
2153                     return 0;
2154           }
2155           if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
2156                     return error;
2157           if (curlwp == uvm.pagedaemon_lwp) {
2158                     mutex_enter(&bufcache_lock);
2159                     bp = incore(vp, indirs[num-1].in_lbn);
2160                     if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
2161                               *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2162                               error = 0;
2163                     } else
2164                               error = ENOMEM;
2165                     mutex_exit(&bufcache_lock);
2166                     return error;
2167           }
2168           error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp);
2169           if (error == 0) {
2170                     *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2171                     brelse(bp, 0);
2172           }
2173 
2174           return error;
2175 }
2176 
2177 /*
2178  * Read or write the specified block of the filesystem vp resides on
2179  * from or to the disk bypassing the buffer cache.
2180  */
2181 static int
rwfsblk(struct vnode * vp,int flags,void * data,daddr_t lbn)2182 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2183 {
2184           int error;
2185           struct inode *ip = VTOI(vp);
2186           struct fs *fs = ip->i_fs;
2187           struct buf *nbp;
2188 
2189           nbp = getiobuf(NULL, true);
2190           nbp->b_flags = flags;
2191           nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2192           nbp->b_error = 0;
2193           nbp->b_data = data;
2194           nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
2195           nbp->b_proc = NULL;
2196           nbp->b_dev = ip->i_devvp->v_rdev;
2197           SET(nbp->b_cflags, BC_BUSY);  /* mark buffer busy */
2198 
2199           bdev_strategy(nbp);
2200 
2201           error = biowait(nbp);
2202 
2203           putiobuf(nbp);
2204 
2205           return error;
2206 }
2207 
2208 /*
2209  * Write all dirty buffers to disk and invalidate them.
2210  */
2211 static int
syncsnap(struct vnode * vp)2212 syncsnap(struct vnode *vp)
2213 {
2214           int error;
2215           buf_t *bp;
2216           struct fs *fs = VTOI(vp)->i_fs;
2217 
2218           mutex_enter(&bufcache_lock);
2219           while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2220                     error = bbusy(bp, false, 0, NULL);
2221                     if (error == EPASSTHROUGH)
2222                               continue;
2223                     else if (error != 0) {
2224                               mutex_exit(&bufcache_lock);
2225                               return error;
2226                     }
2227                     KASSERT(bp->b_bcount == fs->fs_bsize);
2228                     mutex_exit(&bufcache_lock);
2229                     error = rwfsblk(vp, B_WRITE, bp->b_data,
2230                         ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
2231                     brelse(bp, BC_INVAL | BC_VFLUSH);
2232                     if (error)
2233                               return error;
2234                     mutex_enter(&bufcache_lock);
2235           }
2236           mutex_exit(&bufcache_lock);
2237 
2238           return 0;
2239 }
2240 
2241 /*
2242  * Write the specified block to a snapshot.
2243  */
2244 static int
wrsnapblk(struct vnode * vp,void * data,daddr_t lbn)2245 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2246 {
2247           struct inode *ip = VTOI(vp);
2248           struct fs *fs = ip->i_fs;
2249           struct buf *bp;
2250           int error;
2251 
2252           error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2253               FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2254           if (error)
2255                     return error;
2256           memcpy(bp->b_data, data, fs->fs_bsize);
2257           if (ip->i_nlink > 0)
2258                     error = bwrite(bp);
2259           else
2260                     bawrite(bp);
2261 
2262           return error;
2263 }
2264 
2265 /*
2266  * Check if this inode is present on the active snapshot list.
2267  * Must be called with snapinfo locked.
2268  */
2269 static inline bool
is_active_snapshot(struct snap_info * si,struct inode * ip)2270 is_active_snapshot(struct snap_info *si, struct inode *ip)
2271 {
2272           struct inode *xp;
2273 
2274           KASSERT(mutex_owned(&si->si_lock));
2275 
2276           TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2277                     if (xp == ip)
2278                               return true;
2279           return false;
2280 }
2281 
/*
 * Get/put a block address from/to the inode's block pointer arrays or a
 * buffer containing disk addresses. Take care of the fs type (UFS1/UFS2)
 * and byte swapping. These functions should go into a global include.
 */
2287 static inline daddr_t
db_get(struct inode * ip,int loc)2288 db_get(struct inode *ip, int loc)
2289 {
2290           if (ip->i_ump->um_fstype == UFS1)
2291                     return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2292           else
2293                     return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2294 }
2295 
2296 static inline void
db_assign(struct inode * ip,int loc,daddr_t val)2297 db_assign(struct inode *ip, int loc, daddr_t val)
2298 {
2299           if (ip->i_ump->um_fstype == UFS1)
2300                     ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2301           else
2302                     ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2303 }
2304 
2305 __unused static inline daddr_t
ib_get(struct inode * ip,int loc)2306 ib_get(struct inode *ip, int loc)
2307 {
2308           if (ip->i_ump->um_fstype == UFS1)
2309                     return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2310           else
2311                     return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2312 }
2313 
2314 static inline daddr_t
idb_get(struct inode * ip,void * bf,int loc)2315 idb_get(struct inode *ip, void *bf, int loc)
2316 {
2317           if (ip->i_ump->um_fstype == UFS1)
2318                     return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2319           else
2320                     return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2321 }
2322 
2323 static inline void
idb_assign(struct inode * ip,void * bf,int loc,daddr_t val)2324 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2325 {
2326           if (ip->i_ump->um_fstype == UFS1)
2327                     ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2328           else
2329                     ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2330 }
2331