1 /*        $NetBSD: ffs_wapbl.c,v 1.50 2024/12/30 09:03:07 hannken Exp $         */
2 
3 /*-
4  * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Wasabi Systems, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.50 2024/12/30 09:03:07 hannken Exp $");
34 
35 #define WAPBL_INTERNAL
36 
37 #if defined(_KERNEL_OPT)
38 #include "opt_ffs.h"
39 #endif
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode.h>
45 #include <sys/mount.h>
46 #include <sys/file.h>
47 #include <sys/disk.h>
48 #include <sys/ioctl.h>
49 #include <sys/errno.h>
50 #include <sys/kauth.h>
51 #include <sys/wapbl.h>
52 
53 #include <ufs/ufs/inode.h>
54 #include <ufs/ufs/quota.h>
55 #include <ufs/ufs/ufsmount.h>
56 #include <ufs/ufs/ufs_bswap.h>
57 #include <ufs/ufs/ufs_extern.h>
58 #include <ufs/ufs/ufs_wapbl.h>
59 
60 #include <ufs/ffs/fs.h>
61 #include <ufs/ffs/ffs_extern.h>
62 
/*
 * Debug tracing for this file.  WAPBL_DEBUG is unconditionally
 * undefined here, so DPRINTF() normally expands to an empty statement.
 * Removing the #undef enables the tracing variant, which prefixes each
 * message with the calling function and line number and is gated at
 * run time by ffs_wapbl_debug.
 */
#undef	WAPBL_DEBUG
#ifndef WAPBL_DEBUG
#define	DPRINTF(fmt, args...)	do { } while (/* CONSTCOND */0)
#else
int ffs_wapbl_debug = 1;
#define	DPRINTF(fmt, args...)						\
do {									\
	if (ffs_wapbl_debug)						\
		printf("%s:%d " fmt, __func__, __LINE__, ##args);	\
} while (/* CONSTCOND */0)
#endif
77 
78 static int ffs_superblock_layout(struct fs *);
79 static int wapbl_log_position(struct mount *, struct fs *, struct vnode *,
80     daddr_t *, size_t *, size_t *, uint64_t *);
81 static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *,
82     daddr_t *, size_t *, uint64_t *);
83 static void wapbl_find_log_start(struct mount *, struct vnode *, off_t,
84     daddr_t *, daddr_t *, size_t *);
85 static int wapbl_remove_log(struct mount *);
86 static int wapbl_allocate_log_file(struct mount *, struct vnode *,
87     daddr_t *, size_t *, uint64_t *);
88 
89 /*
90  * Return the super block layout format - UFS1 or UFS2.
91  * WAPBL only works with UFS2 layout (which is still available
92  * with FFSv1).
93  *
94  * XXX Should this be in ufs/ffs/fs.h?  Same style of check is
95  * also used in ffs_alloc.c in a few places.
96  */
97 static int
ffs_superblock_layout(struct fs * fs)98 ffs_superblock_layout(struct fs *fs)
99 {
100           if ((fs->fs_magic == FS_UFS1_MAGIC) &&
101               ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))
102                     return 1;
103           else
104                     return 2;
105 }
106 
107 /*
108  * This function is invoked after a log is replayed to
109  * disk to perform logical cleanup actions as described by
110  * the log
111  */
112 void
ffs_wapbl_replay_finish(struct mount * mp)113 ffs_wapbl_replay_finish(struct mount *mp)
114 {
115           struct wapbl_replay *wr = mp->mnt_wapbl_replay;
116           int i;
117           int error;
118 
119           if (!wr)
120                     return;
121 
122           KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
123 
124           for (i = 0; i < wr->wr_inodescnt; i++) {
125                     struct vnode *vp;
126                     struct inode *ip;
127                     error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber,
128                         LK_EXCLUSIVE, &vp);
129                     if (error) {
130                               printf("%s: %s: unable to cleanup inode %" PRIu32 "\n",
131                                   __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt,
132                                   wr->wr_inodes[i].wr_inumber);
133                               continue;
134                     }
135                     ip = VTOI(vp);
136                     KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number);
137 #ifdef WAPBL_DEBUG
138                     printf("%s%s: %s: cleaning inode %" PRIu64 " size=%" PRIu64
139                         " mode=%o nlink=%d\n",
140                         __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt,
141                         ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink);
142 #endif
143                     KASSERT(ip->i_nlink == 0);
144 
145                     /*
146                      * The journal may have left partially allocated inodes in mode
147                      * zero.  This may occur if a crash occurs between the node
148                      * allocation in ffs_nodeallocg and when the node is properly
149                      * initialized in ufs_makeinode.  If so, just deallocate them.
150                      */
151                     if (ip->i_mode == 0) {
152                               error = UFS_WAPBL_BEGIN(mp);
153                               if (error) {
154                                         printf("%s: %s: "
155                                             "unable to cleanup inode %" PRIu32 "\n",
156                                             __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt,
157                                             wr->wr_inodes[i].wr_inumber);
158                               } else {
159                                         ffs_vfree(vp, ip->i_number,
160                                             wr->wr_inodes[i].wr_imode);
161                                         UFS_WAPBL_END(mp);
162                               }
163                     }
164                     vput(vp);
165           }
166           wapbl_replay_stop(wr);
167           wapbl_replay_free(wr);
168           mp->mnt_wapbl_replay = NULL;
169 }
170 
171 /* Callback for wapbl */
172 void
ffs_wapbl_sync_metadata(struct mount * mp,struct wapbl_dealloc * fdealloc)173 ffs_wapbl_sync_metadata(struct mount *mp, struct wapbl_dealloc *fdealloc)
174 {
175           struct ufsmount *ump = VFSTOUFS(mp);
176           struct fs *fs = ump->um_fs;
177           int error __diagused;
178           struct wapbl_dealloc *wd;
179 
180           UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp);
181 
182           for (wd = fdealloc; wd != NULL; wd = TAILQ_NEXT(wd, wd_entries)) {
183                     /*
184                      * blkfree errors are unreported, might silently fail
185                      * if it cannot read the cylinder group block
186                      */
187                     ffs_blkfree(fs, ump->um_devvp,
188                         FFS_DBTOFSB(fs, wd->wd_blkno), wd->wd_len, -1);
189           }
190 
191           mutex_enter(&ump->um_lock);
192           if (fs->fs_fmod != 0) {
193                     fs->fs_fmod = 0;
194                     fs->fs_time = time_second;
195                     mutex_exit(&ump->um_lock);
196                     error = ffs_cgupdate(ump, 0);
197                     KASSERT(error == 0);
198           } else {
199                     mutex_exit(&ump->um_lock);
200           }
201 }
202 
203 void
ffs_wapbl_abort_sync_metadata(struct mount * mp,struct wapbl_dealloc * fdealloc)204 ffs_wapbl_abort_sync_metadata(struct mount *mp, struct wapbl_dealloc *fdealloc)
205 {
206           struct ufsmount *ump = VFSTOUFS(mp);
207           struct fs *fs = ump->um_fs;
208           struct wapbl_dealloc *wd;
209 
210           for (wd = fdealloc; wd != NULL; wd = TAILQ_NEXT(wd, wd_entries)) {
211                     /*
212                      * Since the above blkfree may have failed, this blkalloc might
213                      * fail as well, so don't check its error.  Note that if the
214                      * blkfree succeeded above, then this shouldn't fail because
215                      * the buffer will be locked in the current transaction.
216                      */
217                     ffs_blkalloc_ump(ump, FFS_DBTOFSB(fs, wd->wd_blkno),
218                         wd->wd_len);
219           }
220 }
221 
222 static int
wapbl_remove_log(struct mount * mp)223 wapbl_remove_log(struct mount *mp)
224 {
225           struct ufsmount *ump = VFSTOUFS(mp);
226           struct fs *fs = ump->um_fs;
227           struct vnode *vp;
228           struct inode *ip;
229           ino_t log_ino;
230           int error;
231 
232           /* If super block layout is too old to support WAPBL, return */
233           if (ffs_superblock_layout(fs) < 2)
234                     return 0;
235 
236           /* If all the log locators are 0, just clean up */
237           if (fs->fs_journallocs[0] == 0 &&
238               fs->fs_journallocs[1] == 0 &&
239               fs->fs_journallocs[2] == 0 &&
240               fs->fs_journallocs[3] == 0) {
241                     DPRINTF("empty locators, just clear\n");
242                     goto done;
243           }
244 
245           switch (fs->fs_journal_location) {
246           case UFS_WAPBL_JOURNALLOC_NONE:
247                     /* nothing! */
248                     DPRINTF("no log\n");
249                     break;
250 
251           case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
252                     log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO];
253                     DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino);
254 
255                     /* if no existing log inode, just clear all fields and bail */
256                     if (log_ino == 0)
257                               goto done;
258                     error = VFS_VGET(mp, log_ino, LK_EXCLUSIVE, &vp);
259                     if (error != 0) {
260                               printf("%s: %s: vget failed %d\n", __func__,
261                                   fs->fs_fsmnt, error);
262                               /* clear out log info on error */
263                               goto done;
264                     }
265                     ip = VTOI(vp);
266                     KASSERT(log_ino == ip->i_number);
267                     if ((ip->i_flags & SF_LOG) == 0) {
268                               printf("%s: %s: try to clear non-log inode "
269                                   "%" PRId64 "\n", __func__, fs->fs_fsmnt, log_ino);
270                               vput(vp);
271                               /* clear out log info on error */
272                               goto done;
273                     }
274 
275                     /*
276                      * remove the log inode by setting its link count back
277                      * to zero and bail.
278                      */
279                     ip->i_nlink = 0;
280                     DIP_ASSIGN(ip, nlink, 0);
281                     vput(vp);
282                     break;
283 
284           case UFS_WAPBL_JOURNALLOC_END_PARTITION:
285                     DPRINTF("end-of-partition log\n");
286                     /* no extra work required */
287                     break;
288 
289           default:
290                     printf("%s: %s: unknown journal type %d\n", __func__,
291                         fs->fs_fsmnt, fs->fs_journal_location);
292                     break;
293           }
294 
295 
296 done:
297           /* Clear out all previous knowledge of journal */
298           fs->fs_journal_version = 0;
299           fs->fs_journal_location = 0;
300           fs->fs_journal_flags = 0;
301           fs->fs_journallocs[0] = 0;
302           fs->fs_journallocs[1] = 0;
303           fs->fs_journallocs[2] = 0;
304           fs->fs_journallocs[3] = 0;
305           (void) ffs_sbupdate(ump, MNT_WAIT);
306 
307           return 0;
308 }
309 
310 int
ffs_wapbl_start(struct mount * mp)311 ffs_wapbl_start(struct mount *mp)
312 {
313           struct ufsmount *ump = VFSTOUFS(mp);
314           struct fs *fs = ump->um_fs;
315           struct vnode *devvp = ump->um_devvp;
316           daddr_t off;
317           size_t count;
318           size_t blksize;
319           uint64_t extradata;
320           int error;
321 
322           if (mp->mnt_wapbl == NULL) {
323                     if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) {
324                               /* Clear out any existing journal file */
325                               error = wapbl_remove_log(mp);
326                               if (error != 0)
327                                         return error;
328                     }
329 
330                     if (mp->mnt_flag & MNT_LOG) {
331                               KDASSERT(fs->fs_ronly == 0);
332 
333                               /* WAPBL needs UFS2 format super block */
334                               if (ffs_superblock_layout(fs) < 2) {
335                                         printf("%s: %s: fs superblock in old format, "
336                                            "not journaling\n", __func__,
337                                            VFSTOUFS(mp)->um_fs->fs_fsmnt);
338                                         mp->mnt_flag &= ~MNT_LOG;
339                                         return EINVAL;
340                               }
341 
342                               error = wapbl_log_position(mp, fs, devvp, &off,
343                                   &count, &blksize, &extradata);
344                               if (error)
345                                         return error;
346 
347                               /*
348                                * Make sure we don't carry over any delayed write
349                                * buffers when updating to log. Need to turn off
350                                * async termporarily, to prevent ffs_sync() writes
351                                * themselves being turned into delayed writes.
352                                */
353                               if (mp->mnt_flag & MNT_UPDATE) {
354                                         int saveflag = mp->mnt_flag & MNT_ASYNC;
355                                         mp->mnt_flag &= ~MNT_ASYNC;
356                                         ffs_sync(mp, MNT_WAIT, FSCRED);
357                                         mp->mnt_flag |= saveflag;
358                               }
359 
360                               error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off,
361                                   count, blksize, mp->mnt_wapbl_replay,
362                                   ffs_wapbl_sync_metadata,
363                                   ffs_wapbl_abort_sync_metadata);
364                               if (error)
365                                         return error;
366 
367                               mp->mnt_wapbl_op = &wapbl_ops;
368 
369 #ifdef WAPBL_DEBUG
370                               printf("%s: %s: enabling logging\n", __func__,
371                                   fs->fs_fsmnt);
372 #endif
373 
374                               if ((fs->fs_flags & FS_DOWAPBL) == 0) {
375                                         fs->fs_flags |= FS_DOWAPBL;
376                                         if ((error = UFS_WAPBL_BEGIN(mp)) != 0)
377                                                   goto out;
378                                         error = ffs_sbupdate(ump, MNT_WAIT);
379                                         if (error) {
380                                                   UFS_WAPBL_END(mp);
381                                                   goto out;
382                                         }
383                                         UFS_WAPBL_END(mp);
384                                         error = wapbl_flush(mp->mnt_wapbl, 1);
385                                         if (error)
386                                                   goto out;
387                               }
388 
389                               /*
390                                * XXX discard interferes with block deallocation
391                                * registration and hence log consistency
392                                */
393                               if (mp->mnt_flag & MNT_DISCARD) {
394                                         CLR(mp->mnt_flag, MNT_DISCARD);
395                                         printf("%s: %s: disabling discard to preserve log consistency\n", __func__,
396                                             fs->fs_fsmnt);
397 
398                                         if (ump->um_discarddata != NULL) {
399                                         ffs_discard_finish(ump->um_discarddata,
400                                                       0);
401                                         ump->um_discarddata = NULL;
402                                         }
403                               }
404 
405                     } else if (fs->fs_flags & FS_DOWAPBL) {
406                               fs->fs_fmod = 1;
407                               fs->fs_flags &= ~FS_DOWAPBL;
408                     }
409           }
410 
411           /*
412            * It is recommended that you finish replay with logging enabled.
413            * However, even if logging is not enabled, the remaining log
414            * replay should be safely recoverable with an fsck, so perform
415            * it anyway.
416            */
417           if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) {
418                     int saveflag = mp->mnt_flag & MNT_RDONLY;
419                     /*
420                      * Make sure MNT_RDONLY is not set so that the inode
421                      * cleanup in ufs_inactive will actually do its work.
422                      */
423                     mp->mnt_flag &= ~MNT_RDONLY;
424                     ffs_wapbl_replay_finish(mp);
425                     mp->mnt_flag |= saveflag;
426                     KASSERT(fs->fs_ronly == 0);
427           }
428 
429           return 0;
430 out:
431           ffs_wapbl_stop(mp, MNT_FORCE);
432           return error;
433 }
434 
435 int
ffs_wapbl_stop(struct mount * mp,int force)436 ffs_wapbl_stop(struct mount *mp, int force)
437 {
438           struct ufsmount *ump = VFSTOUFS(mp);
439           struct fs *fs = ump->um_fs;
440           int error;
441 
442           if (mp->mnt_wapbl) {
443                     KDASSERT(fs->fs_ronly == 0);
444 
445                     /*
446                      * Make sure turning off FS_DOWAPBL is only removed
447                      * as the only change in the final flush since otherwise
448                      * a transaction may reorder writes.
449                      */
450                     error = wapbl_flush(mp->mnt_wapbl, 1);
451                     if (error && !force)
452                               return error;
453                     if (error && force)
454                               goto forceout;
455                     error = UFS_WAPBL_BEGIN(mp);
456                     if (error && !force)
457                               return error;
458                     if (error && force)
459                               goto forceout;
460                     KASSERT(fs->fs_flags & FS_DOWAPBL);
461 
462                     fs->fs_flags &= ~FS_DOWAPBL;
463                     error = ffs_sbupdate(ump, MNT_WAIT);
464                     KASSERT(error == 0);          /* XXX a bit drastic! */
465                     UFS_WAPBL_END(mp);
466           forceout:
467                     error = wapbl_stop(mp->mnt_wapbl, force);
468                     if (error) {
469                               KASSERT(!force);
470                               fs->fs_flags |= FS_DOWAPBL;
471                               return error;
472                     }
473                     fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */
474                     mp->mnt_wapbl = NULL;
475 
476 #ifdef WAPBL_DEBUG
477                     printf("%s: %s: disabled logging\n", __func__, fs->fs_fsmnt);
478 #endif
479           }
480 
481           return 0;
482 }
483 
484 int
ffs_wapbl_replay_start(struct mount * mp,struct fs * fs,struct vnode * devvp)485 ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp)
486 {
487           int error;
488           daddr_t off;
489           size_t count;
490           size_t blksize;
491           uint64_t extradata;
492 
493           /*
494            * WAPBL needs UFS2 format super block, if we got here with a
495            * UFS1 format super block something is amiss...
496            */
497           if (ffs_superblock_layout(fs) < 2)
498                     return EINVAL;
499 
500           error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize,
501               &extradata);
502 
503           if (error)
504                     return error;
505 
506           error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off,
507                     count, blksize);
508           if (error)
509                     return error;
510 
511           mp->mnt_wapbl_op = &wapbl_ops;
512 
513           return 0;
514 }
515 
516 /*
517  * If the superblock doesn't already have a recorded journal location
518  * then we allocate the journal in one of two positions:
519  *
520  *  - At the end of the partition after the filesystem if there's
521  *    enough space.  "Enough space" is defined as >= 1MB of journal
522  *    per 1GB of filesystem or 64MB, whichever is smaller.
523  *
524  *  - Inside the filesystem.  We try to allocate a contiguous journal
525  *    based on the total filesystem size - the target is 1MB of journal
526  *    per 1GB of filesystem, up to a maximum journal size of 64MB.  As
527  *    a worst case allowing for fragmentation, we'll allocate a journal
528  *    1/4 of the desired size but never smaller than 1MB.
529  *
530  *    XXX In the future if we allow for non-contiguous journal files we
531  *    can tighten the above restrictions.
532  *
533  * XXX
534  * These seems like a lot of duplication both here and in some of
535  * the userland tools (fsck_ffs, dumpfs, tunefs) with similar
536  * "switch (fs_journal_location)" constructs.  Can we centralise
537  * this sort of code somehow/somewhere?
538  */
539 static int
wapbl_log_position(struct mount * mp,struct fs * fs,struct vnode * devvp,daddr_t * startp,size_t * countp,size_t * blksizep,uint64_t * extradatap)540 wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp,
541     daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap)
542 {
543           struct ufsmount *ump = VFSTOUFS(mp);
544           daddr_t logstart, logend, desired_logsize;
545           uint64_t numsecs;
546           unsigned secsize;
547           int error, location;
548 
549           if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
550                     switch (fs->fs_journal_location) {
551                     case UFS_WAPBL_JOURNALLOC_END_PARTITION:
552                               DPRINTF("found existing end-of-partition log\n");
553                               *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR];
554                               *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
555                               *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ];
556                               DPRINTF(" start = %" PRId64 ", size = %zu, "
557                                   "blksize = %zu\n", *startp, *countp, *blksizep);
558                               return 0;
559 
560                     case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
561                               DPRINTF("found existing in-filesystem log\n");
562                               *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR];
563                               *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
564                               *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
565                               DPRINTF(" start = %" PRId64 ", size = %zu, "
566                                   "blksize = %zu\n", *startp, *countp, *blksizep);
567                               return 0;
568 
569                     default:
570                               printf("%s: %s: unknown journal type %d\n", __func__,
571                                   fs->fs_fsmnt, fs->fs_journal_location);
572                               return EINVAL;
573                     }
574           }
575 
576           desired_logsize =
577               ffs_lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE;
578           DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024);
579           desired_logsize = uimax(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
580           desired_logsize = uimin(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
581           DPRINTF("adjusted desired log size = %" PRId64 " kB\n",
582               desired_logsize / 1024);
583 
584           /* Is there space after after filesystem on partition for log? */
585           logstart = FFS_FSBTODB(fs, fs->fs_size);
586           error = getdisksize(devvp, &numsecs, &secsize);
587           if (error)
588                     return error;
589           KDASSERT(secsize != 0);
590           logend = btodb(numsecs * secsize);
591 
592           if (dbtob(logend - logstart) >= desired_logsize) {
593                     DPRINTF("enough space, use end-of-partition log\n");
594 
595                     location = UFS_WAPBL_JOURNALLOC_END_PARTITION;
596                     *blksizep = secsize;
597 
598                     *startp = logstart;
599                     *countp = (logend - logstart);
600                     *extradatap = 0;
601 
602                     /* convert to physical block numbers */
603                     *startp = dbtob(*startp) / secsize;
604                     *countp = dbtob(*countp) / secsize;
605 
606                     fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp;
607                     fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp;
608                     fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep;
609                     fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap;
610           } else {
611                     DPRINTF("end-of-partition has only %" PRId64 " free\n",
612                         logend - logstart);
613 
614                     location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM;
615                     *blksizep = secsize;
616 
617                     error = wapbl_create_infs_log(mp, fs, devvp,
618                                       startp, countp, extradatap);
619                     ffs_sync(mp, MNT_WAIT, FSCRED);
620 
621                     /* convert to physical block numbers */
622                     *startp = dbtob(*startp) / secsize;
623                     *countp = dbtob(*countp) / secsize;
624 
625                     fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = *startp;
626                     fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = *countp;
627                     fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = *blksizep;
628                     fs->fs_journallocs[UFS_WAPBL_INFS_INO] = *extradatap;
629           }
630 
631           if (error == 0) {
632                     /* update superblock with log location */
633                     fs->fs_journal_version = UFS_WAPBL_VERSION;
634                     fs->fs_journal_location = location;
635                     fs->fs_journal_flags = 0;
636 
637                     error = ffs_sbupdate(ump, MNT_WAIT);
638           }
639 
640           return error;
641 }
642 
643 /*
644  * Try to create a journal log inside the filesystem.
645  */
646 static int
wapbl_create_infs_log(struct mount * mp,struct fs * fs,struct vnode * devvp,daddr_t * startp,size_t * countp,uint64_t * extradatap)647 wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp,
648     daddr_t *startp, size_t *countp, uint64_t *extradatap)
649 {
650           struct vnode *vp, *rvp;
651           struct vattr va;
652           struct inode *ip;
653           int error;
654 
655           if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp)) != 0)
656                     return error;
657 
658           vattr_null(&va);
659           va.va_type = VREG;
660           va.va_mode = 0;
661 
662           error = vcache_new(mp, rvp, &va, NOCRED, NULL, &vp);
663           vput(rvp);
664           if (error)
665                     return error;
666 
667           error = vn_lock(vp, LK_EXCLUSIVE);
668           if (error) {
669                     vrele(vp);
670                     return error;
671           }
672 
673           ip = VTOI(vp);
674           ip->i_flags = SF_LOG;
675           DIP_ASSIGN(ip, flags, ip->i_flags);
676           ip->i_nlink = 1;
677           DIP_ASSIGN(ip, nlink, 1);
678           ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
679           ffs_update(vp, NULL, NULL, UPDATE_WAIT);
680 
681           if ((error = wapbl_allocate_log_file(mp, vp,
682                            startp, countp, extradatap)) != 0) {
683                     /*
684                      * If we couldn't allocate the space for the log file,
685                      * remove the inode by setting its link count back to
686                      * zero and bail.
687                      */
688                     ip->i_nlink = 0;
689                     DIP_ASSIGN(ip, nlink, 0);
690                     vput(vp);
691 
692                     return error;
693           }
694 
695           /*
696            * Now that we have the place-holder inode for the journal,
697            * we don't need the vnode ever again.
698            */
699           vput(vp);
700 
701           return 0;
702 }
703 
704 int
wapbl_allocate_log_file(struct mount * mp,struct vnode * vp,daddr_t * startp,size_t * countp,uint64_t * extradatap)705 wapbl_allocate_log_file(struct mount *mp, struct vnode *vp,
706     daddr_t *startp, size_t *countp, uint64_t *extradatap)
707 {
708           struct ufsmount *ump = VFSTOUFS(mp);
709           struct fs *fs = ump->um_fs;
710           daddr_t addr, indir_addr;
711           off_t logsize;
712           size_t size;
713           int error;
714 
715           logsize = 0;
716           /* check if there's a suggested log size */
717           if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG &&
718               fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM)
719                     logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
720 
721           if (vp->v_size > 0) {
722                     printf("%s: %s: file size (%" PRId64 ") non zero\n", __func__,
723                         fs->fs_fsmnt, vp->v_size);
724                     return EEXIST;
725           }
726           wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size);
727           if (addr == 0) {
728                     printf("%s: %s: log not allocated, largest extent is "
729                         "%" PRId64 "MB\n", __func__, fs->fs_fsmnt,
730                         ffs_lblktosize(fs, size) / (1024 * 1024));
731                     return ENOSPC;
732           }
733 
734           logsize = ffs_lblktosize(fs, size);     /* final log size */
735 
736           VTOI(vp)->i_ffs_first_data_blk = addr;
737           VTOI(vp)->i_ffs_first_indir_blk = indir_addr;
738 
739           error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED);
740           if (error) {
741                     printf("%s: %s: GOP_ALLOC error %d\n", __func__, fs->fs_fsmnt,
742                         error);
743                     return error;
744           }
745 
746           *startp     = FFS_FSBTODB(fs, addr);
747           *countp     = btodb(logsize);
748           *extradatap = VTOI(vp)->i_number;
749 
750           return 0;
751 }
752 
753 /*
754  * Find a suitable location for the journal in the filesystem.
755  *
756  * Our strategy here is to look for a contiguous block of free space
757  * at least "logfile" MB in size (plus room for any indirect blocks).
758  * We start at the middle of the filesystem and check each cylinder
759  * group working outwards.  If "logfile" MB is not available as a
760  * single contiguous chunk, then return the address and size of the
761  * largest chunk found.
762  *
763  * XXX
764  * At what stage does the search fail?  Is if the largest space we could
765  * find is less than a quarter the requested space reasonable?  If the
766  * search fails entirely, return a block address if "0" it indicate this.
767  */
768 static void
wapbl_find_log_start(struct mount * mp,struct vnode * vp,off_t logsize,daddr_t * addr,daddr_t * indir_addr,size_t * size)769 wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize,
770     daddr_t *addr, daddr_t *indir_addr, size_t *size)
771 {
772           struct ufsmount *ump = VFSTOUFS(mp);
773           struct fs *fs = ump->um_fs;
774           struct vnode *devvp = ump->um_devvp;
775           struct cg *cgp;
776           struct buf *bp;
777           uint8_t *blksfree;
778           daddr_t blkno, best_addr, start_addr;
779           daddr_t desired_blks, min_desired_blks;
780           daddr_t freeblks, best_blks;
781           int bpcg, cg, error, fixedsize, indir_blks, n, s;
782           const int needswap = UFS_FSNEEDSWAP(fs);
783 
784           if (logsize == 0) {
785                     fixedsize = 0;      /* We can adjust the size if tight */
786                     logsize = ffs_lfragtosize(fs, fs->fs_dsize) /
787                         UFS_WAPBL_JOURNAL_SCALE;
788                     DPRINTF("suggested log size = %" PRId64 "\n", logsize);
789                     logsize = uimax(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
790                     logsize = uimin(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
791                     DPRINTF("adjusted log size = %" PRId64 "\n", logsize);
792           } else {
793                     fixedsize = 1;
794                     DPRINTF("fixed log size = %" PRId64 "\n", logsize);
795           }
796 
797           desired_blks = logsize / fs->fs_bsize;
798           DPRINTF("desired blocks = %" PRId64 "\n", desired_blks);
799 
800           /* add in number of indirect blocks needed */
801           indir_blks = 0;
802           if (desired_blks >= UFS_NDADDR) {
803                     struct indir indirs[UFS_NIADDR + 2];
804                     int num;
805 
806                     error = ufs_getlbns(vp, desired_blks, indirs, &num);
807                     if (error) {
808                               printf("%s: %s:  ufs_getlbns failed, error %d!\n",
809                                   __func__, fs->fs_fsmnt, error);
810                               goto bad;
811                     }
812 
813                     switch (num) {
814                     case 2:
815                               indir_blks = 1;               /* 1st level indirect */
816                               break;
817                     case 3:
818                               indir_blks = 1 +    /* 1st level indirect */
819                                   1 +                       /* 2nd level indirect */
820                                   indirs[1].in_off + 1; /* extra 1st level indirect */
821                               break;
822                     default:
823                               printf("%s: %s: unexpected numlevels %d from "
824                                   "ufs_getlbns\n", __func__, fs->fs_fsmnt, num);
825                               *size = 0;
826                               goto bad;
827                     }
828                     desired_blks += indir_blks;
829           }
830           DPRINTF("desired blocks = %" PRId64 " (including indirect)\n",
831               desired_blks);
832 
833           /*
834            * If a specific size wasn't requested, allow for a smaller log
835            * if we're really tight for space...
836            */
837           min_desired_blks = desired_blks;
838           if (!fixedsize)
839                     min_desired_blks = desired_blks / 4;
840 
841           /* Look at number of blocks per CG.  If it's too small, bail early. */
842           bpcg = ffs_fragstoblks(fs, fs->fs_fpg);
843           if (min_desired_blks > bpcg) {
844                     printf("%s: %s: cylinder group size of %" PRId64 " MB "
845                         " is not big enough for journal\n", __func__, fs->fs_fsmnt,
846                         ffs_lblktosize(fs, bpcg) / (1024 * 1024));
847                     goto bad;
848           }
849 
850           /*
851            * Start with the middle cylinder group, and search outwards in
852            * both directions until we either find the requested log size
853            * or reach the start/end of the file system.  If we reach the
854            * start/end without finding enough space for the full requested
855            * log size, use the largest extent found if it is large enough
856            * to satisfy the our minimum size.
857            *
858            * XXX
859            * Can we just use the cluster contigsum stuff (esp on UFS2)
860            * here to simplify this search code?
861            */
862           best_addr = 0;
863           best_blks = 0;
864           for (cg = fs->fs_ncg / 2, s = 0, n = 1;
865               best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg;
866               s++, n = -n, cg += n * s) {
867                     DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg);
868                     error = bread(devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
869                         fs->fs_cgsize, 0, &bp);
870                     if (error) {
871                               continue;
872                     }
873                     cgp = (struct cg *)bp->b_data;
874                     if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
875                               brelse(bp, 0);
876                               continue;
877                     }
878 
879                     blksfree = cg_blksfree(cgp, needswap);
880 
881                     for (blkno = 0; blkno < bpcg;) {
882                               /* look for next free block */
883                               /* XXX use scanc() and fragtbl[] here? */
884                               for (; blkno < bpcg - min_desired_blks; blkno++)
885                                         if (ffs_isblock(fs, blksfree, blkno))
886                                                   break;
887 
888                               /* past end of search space in this CG? */
889                               if (blkno >= bpcg - min_desired_blks)
890                                         break;
891 
892                               /* count how many free blocks in this extent */
893                               start_addr = blkno;
894                               for (freeblks = 0; blkno < bpcg; blkno++, freeblks++)
895                                         if (!ffs_isblock(fs, blksfree, blkno))
896                                                   break;
897 
898                               if (freeblks > best_blks) {
899                                         best_blks = freeblks;
900                                         best_addr = ffs_blkstofrags(fs, start_addr) +
901                                             cgbase(fs, cg);
902 
903                                         if (freeblks >= desired_blks) {
904                                                   DPRINTF("found len %" PRId64
905                                                       " at offset %" PRId64 " in gc\n",
906                                                       freeblks, start_addr);
907                                                   break;
908                                         }
909                               }
910                     }
911                     brelse(bp, 0);
912           }
913           DPRINTF("best found len = %" PRId64 ", wanted %" PRId64
914               " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr);
915 
916           if (best_blks < min_desired_blks) {
917                     *addr = 0;
918                     *indir_addr = 0;
919           } else {
920                     /* put indirect blocks at start, and data blocks after */
921                     *addr = best_addr + ffs_blkstofrags(fs, indir_blks);
922                     *indir_addr = best_addr;
923           }
924           *size = uimin(desired_blks, best_blks) - indir_blks;
925           return;
926 
927 bad:
928           *addr = 0;
929           *indir_addr = 0;
930           *size = 0;
931           return;
932 }
933