1 /*        $NetBSD: vfs_trans.c,v 1.73 2024/12/07 02:27:38 riastradh Exp $       */
2 
3 /*-
4  * Copyright (c) 2007, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Juergen Hannken-Illjes.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: vfs_trans.c,v 1.73 2024/12/07 02:27:38 riastradh Exp $");
34 
35 /*
36  * File system transaction operations.
37  */
38 
39 #ifdef _KERNEL_OPT
40 #include "opt_ddb.h"
41 #endif
42 
43 #include <sys/param.h>
44 #include <sys/types.h>
45 
46 #include <sys/atomic.h>
47 #include <sys/buf.h>
48 #include <sys/fstrans.h>
49 #include <sys/hash.h>
50 #include <sys/kmem.h>
51 #include <sys/mount.h>
52 #include <sys/pool.h>
53 #include <sys/proc.h>
54 #include <sys/pserialize.h>
55 #include <sys/sdt.h>
56 #include <sys/systm.h>
57 #include <sys/vnode.h>
58 
59 #include <miscfs/deadfs/deadfs.h>
60 #include <miscfs/specfs/specdev.h>
61 
/* Bucket count requested from hashinit() for the mount info hash table. */
#define FSTRANS_MOUNT_HASHSIZE	32

/*
 * Kind of transaction lock an lwp requests.  The two kinds differ only
 * in the file system states at which grant_lock() admits them.
 */
enum fstrans_lock_type {
	FSTRANS_LAZY,		/* Granted while not suspended */
	FSTRANS_SHARED		/* Granted while not suspending */
};
68 
69 struct fscow_handler {
70           LIST_ENTRY(fscow_handler) ch_list;
71           int (*ch_func)(void *, struct buf *, bool);
72           void *ch_arg;
73 };
74 struct fstrans_lwp_info {
75           struct fstrans_lwp_info *fli_succ;
76           struct lwp *fli_self;
77           struct mount *fli_mount;
78           struct fstrans_lwp_info *fli_alias;
79           struct fstrans_mount_info *fli_mountinfo;
80           int fli_trans_cnt;
81           int fli_alias_cnt;
82           int fli_cow_cnt;
83           enum fstrans_lock_type fli_lock_type;
84           LIST_ENTRY(fstrans_lwp_info) fli_list;
85 };
86 struct fstrans_mount_info {
87           enum fstrans_state fmi_state;
88           unsigned int fmi_ref_cnt;
89           bool fmi_gone;
90           bool fmi_cow_change;
91           SLIST_ENTRY(fstrans_mount_info) fmi_hash;
92           LIST_HEAD(, fscow_handler) fmi_cow_handler;
93           struct mount *fmi_mount;
94           struct fstrans_mount_info *fmi_lower_info;
95           struct lwp *fmi_owner;
96 };
97 SLIST_HEAD(fstrans_mount_hashhead, fstrans_mount_info);
98 
99 static kmutex_t vfs_suspend_lock        /* Serialize suspensions. */
100     __cacheline_aligned;
101 static kmutex_t fstrans_lock            /* Fstrans big lock. */
102     __cacheline_aligned;
103 static kcondvar_t fstrans_state_cv;     /* Fstrans or cow state changed. */
104 static kcondvar_t fstrans_count_cv;     /* Fstrans or cow count changed. */
105 static pserialize_t fstrans_psz;        /* Pserialize state. */
106 static LIST_HEAD(fstrans_lwp_head, fstrans_lwp_info) fstrans_fli_head;
107                                                   /* List of all fstrans_lwp_info. */
108 static pool_cache_t fstrans_lwp_cache;  /* Cache of fstrans_lwp_info. */
109 
110 static u_long fstrans_mount_hashmask;
111 static struct fstrans_mount_hashhead *fstrans_mount_hashtab;
112 static int fstrans_gone_count;                    /* Number of fstrans_mount_info gone. */
113 
114 static inline uint32_t fstrans_mount_hash(struct mount *);
115 static inline struct fstrans_mount_info *fstrans_mount_get(struct mount *);
116 static void fstrans_mount_dtor(struct fstrans_mount_info *);
117 static void fstrans_clear_lwp_info(void);
118 static inline struct fstrans_lwp_info *
119     fstrans_get_lwp_info(struct mount *, bool);
120 static struct fstrans_lwp_info *fstrans_alloc_lwp_info(struct mount *);
121 static int fstrans_lwp_pcc(void *, void *, int);
122 static void fstrans_lwp_pcd(void *, void *);
123 static inline int _fstrans_start(struct mount *, enum fstrans_lock_type, int);
124 static bool grant_lock(const struct fstrans_mount_info *,
125     const enum fstrans_lock_type);
126 static bool state_change_done(const struct fstrans_mount_info *);
127 static bool cow_state_change_done(const struct fstrans_mount_info *);
128 static void cow_change_enter(struct fstrans_mount_info *);
129 static void cow_change_done(struct fstrans_mount_info *);
130 
131 /*
132  * Initialize.
133  */
134 void
fstrans_init(void)135 fstrans_init(void)
136 {
137 
138           mutex_init(&vfs_suspend_lock, MUTEX_DEFAULT, IPL_NONE);
139           mutex_init(&fstrans_lock, MUTEX_DEFAULT, IPL_NONE);
140           cv_init(&fstrans_state_cv, "fstchg");
141           cv_init(&fstrans_count_cv, "fstcnt");
142           fstrans_psz = pserialize_create();
143           LIST_INIT(&fstrans_fli_head);
144           fstrans_lwp_cache = pool_cache_init(sizeof(struct fstrans_lwp_info),
145               coherency_unit, 0, 0, "fstlwp", NULL, IPL_NONE,
146               fstrans_lwp_pcc, fstrans_lwp_pcd, NULL);
147           KASSERT(fstrans_lwp_cache != NULL);
148           fstrans_mount_hashtab = hashinit(FSTRANS_MOUNT_HASHSIZE, HASH_SLIST,
149               true, &fstrans_mount_hashmask);
150 }
151 
152 /*
153  * pool_cache constructor for fstrans_lwp_info.  Updating the global list
154  * produces cache misses on MP.  Minimise by keeping free entries on list.
155  */
156 int
fstrans_lwp_pcc(void * arg,void * obj,int flags)157 fstrans_lwp_pcc(void *arg, void *obj, int flags)
158 {
159           struct fstrans_lwp_info *fli = obj;
160 
161           memset(fli, 0, sizeof(*fli));
162 
163           mutex_enter(&fstrans_lock);
164           LIST_INSERT_HEAD(&fstrans_fli_head, fli, fli_list);
165           mutex_exit(&fstrans_lock);
166 
167           return 0;
168 }
169 
170 /*
171  * pool_cache destructor
172  */
173 void
fstrans_lwp_pcd(void * arg,void * obj)174 fstrans_lwp_pcd(void *arg, void *obj)
175 {
176           struct fstrans_lwp_info *fli = obj;
177 
178           mutex_enter(&fstrans_lock);
179           LIST_REMOVE(fli, fli_list);
180           mutex_exit(&fstrans_lock);
181 }
182 
183 /*
184  * Deallocate lwp state.
185  */
186 void
fstrans_lwp_dtor(lwp_t * l)187 fstrans_lwp_dtor(lwp_t *l)
188 {
189           struct fstrans_lwp_info *fli, *fli_next;
190 
191           if (l->l_fstrans == NULL)
192                     return;
193 
194           mutex_enter(&fstrans_lock);
195           for (fli = l->l_fstrans; fli; fli = fli_next) {
196                     KASSERT(fli->fli_trans_cnt == 0);
197                     KASSERT(fli->fli_cow_cnt == 0);
198                     KASSERT(fli->fli_self == l);
199                     if (fli->fli_mount != NULL)
200                               fstrans_mount_dtor(fli->fli_mountinfo);
201                     fli_next = fli->fli_succ;
202                     fli->fli_alias_cnt = 0;
203                     fli->fli_mount = NULL;
204                     fli->fli_alias = NULL;
205                     fli->fli_mountinfo = NULL;
206                     fli->fli_self = NULL;
207           }
208           mutex_exit(&fstrans_lock);
209 
210           for (fli = l->l_fstrans; fli; fli = fli_next) {
211                     fli_next = fli->fli_succ;
212                     pool_cache_put(fstrans_lwp_cache, fli);
213           }
214           l->l_fstrans = NULL;
215 }
216 
217 /*
218  * mount pointer to hash
219  */
220 static inline uint32_t
fstrans_mount_hash(struct mount * mp)221 fstrans_mount_hash(struct mount *mp)
222 {
223 
224           return hash32_buf(&mp, sizeof(mp), HASH32_BUF_INIT) &
225               fstrans_mount_hashmask;
226 }
227 
228 /*
229  * retrieve fstrans_mount_info by mount or NULL
230  */
231 static inline struct fstrans_mount_info *
fstrans_mount_get(struct mount * mp)232 fstrans_mount_get(struct mount *mp)
233 {
234           uint32_t indx;
235           struct fstrans_mount_info *fmi, *fmi_lower;
236 
237           KASSERT(mutex_owned(&fstrans_lock));
238 
239           indx = fstrans_mount_hash(mp);
240           SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash) {
241                     if (fmi->fmi_mount == mp) {
242                               if (__predict_false(mp->mnt_lower != NULL &&
243                                   fmi->fmi_lower_info == NULL)) {
244                                         /*
245                                          * Intern the lower/lowest mount into
246                                          * this mount info on first lookup.
247                                          */
248                                         KASSERT(fmi->fmi_ref_cnt == 1);
249 
250                                         fmi_lower = fstrans_mount_get(mp->mnt_lower);
251                                         if (fmi_lower && fmi_lower->fmi_lower_info)
252                                                   fmi_lower = fmi_lower->fmi_lower_info;
253                                         if (fmi_lower == NULL)
254                                                   return NULL;
255                                         fmi->fmi_lower_info = fmi_lower;
256                                         fmi->fmi_lower_info->fmi_ref_cnt += 1;
257                               }
258                               return fmi;
259                     }
260           }
261 
262           return NULL;
263 }
264 
265 /*
266  * Dereference mount state.
267  */
268 static void
fstrans_mount_dtor(struct fstrans_mount_info * fmi)269 fstrans_mount_dtor(struct fstrans_mount_info *fmi)
270 {
271 
272           KASSERT(mutex_owned(&fstrans_lock));
273 
274           KASSERT(fmi != NULL);
275           fmi->fmi_ref_cnt -= 1;
276           if (__predict_true(fmi->fmi_ref_cnt > 0)) {
277                     return;
278           }
279 
280           KASSERT(fmi->fmi_state == FSTRANS_NORMAL);
281           KASSERT(LIST_FIRST(&fmi->fmi_cow_handler) == NULL);
282           KASSERT(fmi->fmi_owner == NULL);
283 
284           if (fmi->fmi_lower_info)
285                     fstrans_mount_dtor(fmi->fmi_lower_info);
286 
287           KASSERT(fstrans_gone_count > 0);
288           fstrans_gone_count -= 1;
289 
290           KASSERT(fmi->fmi_mount->mnt_lower == NULL);
291 
292           kmem_free(fmi->fmi_mount, sizeof(*fmi->fmi_mount));
293           kmem_free(fmi, sizeof(*fmi));
294 }
295 
296 /*
297  * Allocate mount state.
298  */
299 int
fstrans_mount(struct mount * mp)300 fstrans_mount(struct mount *mp)
301 {
302           uint32_t indx;
303           struct fstrans_mount_info *newfmi;
304 
305           indx = fstrans_mount_hash(mp);
306 
307           newfmi = kmem_alloc(sizeof(*newfmi), KM_SLEEP);
308           newfmi->fmi_state = FSTRANS_NORMAL;
309           newfmi->fmi_ref_cnt = 1;
310           newfmi->fmi_gone = false;
311           LIST_INIT(&newfmi->fmi_cow_handler);
312           newfmi->fmi_cow_change = false;
313           newfmi->fmi_mount = mp;
314           newfmi->fmi_lower_info = NULL;
315           newfmi->fmi_owner = NULL;
316 
317           mutex_enter(&fstrans_lock);
318           SLIST_INSERT_HEAD(&fstrans_mount_hashtab[indx], newfmi, fmi_hash);
319           mutex_exit(&fstrans_lock);
320 
321           return 0;
322 }
323 
324 /*
325  * Deallocate mount state.
326  */
327 void
fstrans_unmount(struct mount * mp)328 fstrans_unmount(struct mount *mp)
329 {
330           uint32_t indx;
331           struct fstrans_mount_info *fmi;
332 
333           indx = fstrans_mount_hash(mp);
334 
335           mutex_enter(&fstrans_lock);
336           fmi = fstrans_mount_get(mp);
337           KASSERT(fmi != NULL);
338           fmi->fmi_gone = true;
339           SLIST_REMOVE(&fstrans_mount_hashtab[indx],
340               fmi, fstrans_mount_info, fmi_hash);
341           fstrans_gone_count += 1;
342           fstrans_mount_dtor(fmi);
343           mutex_exit(&fstrans_lock);
344 }
345 
346 /*
347  * Clear mount entries whose mount is gone.
348  */
349 static void
fstrans_clear_lwp_info(void)350 fstrans_clear_lwp_info(void)
351 {
352           struct fstrans_lwp_info **p, *fli, *tofree = NULL;
353 
354           /*
355            * Scan our list clearing entries whose mount is gone.
356            */
357           mutex_enter(&fstrans_lock);
358           for (p = &curlwp->l_fstrans; *p; ) {
359                     fli = *p;
360                     if (fli->fli_mount != NULL &&
361                         fli->fli_mountinfo->fmi_gone &&
362                         fli->fli_trans_cnt == 0 &&
363                         fli->fli_cow_cnt == 0 &&
364                         fli->fli_alias_cnt == 0) {
365                               *p = (*p)->fli_succ;
366                               fstrans_mount_dtor(fli->fli_mountinfo);
367                               if (fli->fli_alias) {
368                                         KASSERT(fli->fli_alias->fli_alias_cnt > 0);
369                                         fli->fli_alias->fli_alias_cnt--;
370                               }
371                               fli->fli_mount = NULL;
372                               fli->fli_alias = NULL;
373                               fli->fli_mountinfo = NULL;
374                               fli->fli_self = NULL;
375                               p = &curlwp->l_fstrans;
376                               fli->fli_succ = tofree;
377                               tofree = fli;
378                     } else {
379                               p = &(*p)->fli_succ;
380                     }
381           }
382 #ifdef DIAGNOSTIC
383           for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ)
384                     if (fli->fli_alias != NULL)
385                               KASSERT(fli->fli_alias->fli_self == curlwp);
386 #endif /* DIAGNOSTIC */
387           mutex_exit(&fstrans_lock);
388 
389           while (tofree != NULL) {
390                     fli = tofree;
391                     tofree = fli->fli_succ;
392                     pool_cache_put(fstrans_lwp_cache, fli);
393           }
394 }
395 
396 /*
397  * Allocate and return per lwp info for this mount.
398  */
399 static struct fstrans_lwp_info *
fstrans_alloc_lwp_info(struct mount * mp)400 fstrans_alloc_lwp_info(struct mount *mp)
401 {
402           struct fstrans_lwp_info *fli, *fli_lower;
403           struct fstrans_mount_info *fmi;
404 
405           for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
406                     if (fli->fli_mount == mp)
407                               return fli;
408           }
409 
410           /*
411            * Lookup mount info and get lower mount per lwp info.
412            */
413           mutex_enter(&fstrans_lock);
414           fmi = fstrans_mount_get(mp);
415           if (fmi == NULL) {
416                     mutex_exit(&fstrans_lock);
417                     return NULL;
418           }
419           fmi->fmi_ref_cnt += 1;
420           mutex_exit(&fstrans_lock);
421 
422           if (fmi->fmi_lower_info) {
423                     fli_lower =
424                         fstrans_alloc_lwp_info(fmi->fmi_lower_info->fmi_mount);
425                     if (fli_lower == NULL) {
426                               mutex_enter(&fstrans_lock);
427                               fstrans_mount_dtor(fmi);
428                               mutex_exit(&fstrans_lock);
429 
430                               return NULL;
431                     }
432           } else {
433                     fli_lower = NULL;
434           }
435 
436           /*
437            * Allocate a new entry.
438            */
439           fli = pool_cache_get(fstrans_lwp_cache, PR_WAITOK);
440           KASSERT(fli->fli_trans_cnt == 0);
441           KASSERT(fli->fli_cow_cnt == 0);
442           KASSERT(fli->fli_alias_cnt == 0);
443           KASSERT(fli->fli_mount == NULL);
444           KASSERT(fli->fli_alias == NULL);
445           KASSERT(fli->fli_mountinfo == NULL);
446           KASSERT(fli->fli_self == NULL);
447 
448           /*
449            * Attach the mount info and alias.
450            */
451 
452           fli->fli_self = curlwp;
453           fli->fli_mount = mp;
454           fli->fli_mountinfo = fmi;
455 
456           fli->fli_succ = curlwp->l_fstrans;
457           curlwp->l_fstrans = fli;
458 
459           if (fli_lower) {
460                     fli->fli_alias = fli_lower;
461                     fli->fli_alias->fli_alias_cnt++;
462                     fli = fli->fli_alias;
463           }
464 
465           return fli;
466 }
467 
468 /*
469  * Retrieve the per lwp info for this mount allocating if necessary.
470  */
471 static inline struct fstrans_lwp_info *
fstrans_get_lwp_info(struct mount * mp,bool do_alloc)472 fstrans_get_lwp_info(struct mount *mp, bool do_alloc)
473 {
474           struct fstrans_lwp_info *fli;
475 
476           /*
477            * Scan our list for a match.
478            */
479           for (fli = curlwp->l_fstrans; fli; fli = fli->fli_succ) {
480                     if (fli->fli_mount == mp) {
481                               KASSERT(mp->mnt_lower == NULL ||
482                                   fli->fli_alias != NULL);
483                               if (fli->fli_alias != NULL)
484                                         fli = fli->fli_alias;
485                               break;
486                     }
487           }
488 
489           if (do_alloc) {
490                     if (__predict_false(fli == NULL))
491                               fli = fstrans_alloc_lwp_info(mp);
492           }
493 
494           return fli;
495 }
496 
497 /*
498  * Check if this lock type is granted at this state.
499  */
500 static bool
grant_lock(const struct fstrans_mount_info * fmi,const enum fstrans_lock_type type)501 grant_lock(const struct fstrans_mount_info *fmi,
502     const enum fstrans_lock_type type)
503 {
504 
505           if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL))
506                     return true;
507           if (fmi->fmi_owner == curlwp)
508                     return true;
509           if  (fmi->fmi_state == FSTRANS_SUSPENDING && type == FSTRANS_LAZY)
510                     return true;
511 
512           return false;
513 }
514 
515 /*
516  * Start a transaction.  If this thread already has a transaction on this
517  * file system increment the reference counter.
518  */
519 static inline int
_fstrans_start(struct mount * mp,enum fstrans_lock_type lock_type,int wait)520 _fstrans_start(struct mount *mp, enum fstrans_lock_type lock_type, int wait)
521 {
522           int s;
523           struct fstrans_lwp_info *fli;
524           struct fstrans_mount_info *fmi;
525 
526           ASSERT_SLEEPABLE();
527 
528           fli = fstrans_get_lwp_info(mp, true);
529           if (fli == NULL)
530                     return 0;
531           fmi = fli->fli_mountinfo;
532 
533           if (fli->fli_trans_cnt > 0) {
534                     fli->fli_trans_cnt += 1;
535 
536                     return 0;
537           }
538 
539           s = pserialize_read_enter();
540           if (__predict_true(grant_lock(fmi, lock_type))) {
541                     fli->fli_trans_cnt = 1;
542                     fli->fli_lock_type = lock_type;
543                     pserialize_read_exit(s);
544 
545                     return 0;
546           }
547           pserialize_read_exit(s);
548 
549           if (! wait)
550                     return SET_ERROR(EBUSY);
551 
552           mutex_enter(&fstrans_lock);
553           while (! grant_lock(fmi, lock_type))
554                     cv_wait(&fstrans_state_cv, &fstrans_lock);
555           fli->fli_trans_cnt = 1;
556           fli->fli_lock_type = lock_type;
557           mutex_exit(&fstrans_lock);
558 
559           return 0;
560 }
561 
562 void
fstrans_start(struct mount * mp)563 fstrans_start(struct mount *mp)
564 {
565           int error __diagused;
566 
567           error = _fstrans_start(mp, FSTRANS_SHARED, 1);
568           KASSERT(error == 0);
569 }
570 
571 int
fstrans_start_nowait(struct mount * mp)572 fstrans_start_nowait(struct mount *mp)
573 {
574 
575           return _fstrans_start(mp, FSTRANS_SHARED, 0);
576 }
577 
578 void
fstrans_start_lazy(struct mount * mp)579 fstrans_start_lazy(struct mount *mp)
580 {
581           int error __diagused;
582 
583           error = _fstrans_start(mp, FSTRANS_LAZY, 1);
584           KASSERT(error == 0);
585 }
586 
587 /*
588  * Finish a transaction.
589  */
590 void
fstrans_done(struct mount * mp)591 fstrans_done(struct mount *mp)
592 {
593           int s;
594           struct fstrans_lwp_info *fli;
595           struct fstrans_mount_info *fmi;
596 
597           fli = fstrans_get_lwp_info(mp, false);
598           if (fli == NULL)
599                     return;
600           fmi = fli->fli_mountinfo;
601           KASSERT(fli->fli_trans_cnt > 0);
602 
603           if (fli->fli_trans_cnt > 1) {
604                     fli->fli_trans_cnt -= 1;
605 
606                     return;
607           }
608 
609           if (__predict_false(fstrans_gone_count > 0))
610                     fstrans_clear_lwp_info();
611 
612           s = pserialize_read_enter();
613           if (__predict_true(fmi->fmi_state == FSTRANS_NORMAL)) {
614                     fli->fli_trans_cnt = 0;
615                     pserialize_read_exit(s);
616 
617                     return;
618           }
619           pserialize_read_exit(s);
620 
621           mutex_enter(&fstrans_lock);
622           fli->fli_trans_cnt = 0;
623           cv_signal(&fstrans_count_cv);
624           mutex_exit(&fstrans_lock);
625 }
626 
627 /*
628  * Check if we hold an lock.
629  */
630 int
fstrans_held(struct mount * mp)631 fstrans_held(struct mount *mp)
632 {
633           struct fstrans_lwp_info *fli;
634           struct fstrans_mount_info *fmi;
635 
636           KASSERT(mp != dead_rootmount);
637 
638           fli = fstrans_get_lwp_info(mp, false);
639           if (fli == NULL)
640                     return 0;
641           fmi = fli->fli_mountinfo;
642 
643           return (fli->fli_trans_cnt > 0 || fmi->fmi_owner == curlwp);
644 }
645 
646 /*
647  * Check if this thread has an exclusive lock.
648  */
649 int
fstrans_is_owner(struct mount * mp)650 fstrans_is_owner(struct mount *mp)
651 {
652           struct fstrans_lwp_info *fli;
653           struct fstrans_mount_info *fmi;
654 
655           KASSERT(mp != dead_rootmount);
656 
657           fli = fstrans_get_lwp_info(mp, false);
658           if (fli == NULL)
659                     return 0;
660           fmi = fli->fli_mountinfo;
661 
662           return (fmi->fmi_owner == curlwp);
663 }
664 
665 /*
666  * True, if no thread is in a transaction not granted at the current state.
667  */
668 static bool
state_change_done(const struct fstrans_mount_info * fmi)669 state_change_done(const struct fstrans_mount_info *fmi)
670 {
671           struct fstrans_lwp_info *fli;
672 
673           KASSERT(mutex_owned(&fstrans_lock));
674 
675           LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
676                     if (fli->fli_mountinfo != fmi)
677                               continue;
678                     if (fli->fli_trans_cnt == 0)
679                               continue;
680                     if (fli->fli_self == curlwp)
681                               continue;
682                     if (grant_lock(fmi, fli->fli_lock_type))
683                               continue;
684 
685                     return false;
686           }
687 
688           return true;
689 }
690 
691 /*
692  * Set new file system state.
693  */
694 int
fstrans_setstate(struct mount * mp,enum fstrans_state new_state)695 fstrans_setstate(struct mount *mp, enum fstrans_state new_state)
696 {
697           int error;
698           enum fstrans_state old_state;
699           struct fstrans_lwp_info *fli;
700           struct fstrans_mount_info *fmi;
701 
702           KASSERT(mp != dead_rootmount);
703 
704           fli = fstrans_get_lwp_info(mp, true);
705           if (fli == NULL)
706                     return SET_ERROR(ENOENT);
707           fmi = fli->fli_mountinfo;
708           old_state = fmi->fmi_state;
709           if (old_state == new_state)
710                     return 0;
711 
712           mutex_enter(&fstrans_lock);
713           fmi->fmi_state = new_state;
714           pserialize_perform(fstrans_psz);
715 
716           /*
717            * All threads see the new state now.
718            * Wait for transactions invalid at this state to leave.
719            */
720           error = 0;
721           while (! state_change_done(fmi)) {
722                     error = cv_wait_sig(&fstrans_count_cv, &fstrans_lock);
723                     if (error) {
724                               new_state = fmi->fmi_state = FSTRANS_NORMAL;
725                               break;
726                     }
727           }
728           if (old_state != new_state) {
729                     if (old_state == FSTRANS_NORMAL) {
730                               KASSERT(fmi->fmi_owner == NULL);
731                               fmi->fmi_owner = curlwp;
732                     }
733                     if (new_state == FSTRANS_NORMAL) {
734                               KASSERT(fmi->fmi_owner == curlwp);
735                               fmi->fmi_owner = NULL;
736                     }
737           }
738           cv_broadcast(&fstrans_state_cv);
739           mutex_exit(&fstrans_lock);
740 
741           return error;
742 }
743 
744 /*
745  * Get current file system state.
746  */
747 enum fstrans_state
fstrans_getstate(struct mount * mp)748 fstrans_getstate(struct mount *mp)
749 {
750           struct fstrans_lwp_info *fli;
751           struct fstrans_mount_info *fmi;
752 
753           KASSERT(mp != dead_rootmount);
754 
755           fli = fstrans_get_lwp_info(mp, true);
756           KASSERT(fli != NULL);
757           fmi = fli->fli_mountinfo;
758 
759           return fmi->fmi_state;
760 }
761 
762 /*
763  * Request a filesystem to suspend all operations.
764  */
765 int
vfs_suspend(struct mount * mp,int nowait)766 vfs_suspend(struct mount *mp, int nowait)
767 {
768           struct fstrans_lwp_info *fli;
769           int error;
770 
771           if (mp == dead_rootmount)
772                     return SET_ERROR(EOPNOTSUPP);
773 
774           fli = fstrans_get_lwp_info(mp, true);
775           if (fli == NULL)
776                     return SET_ERROR(ENOENT);
777 
778           if (nowait) {
779                     if (!mutex_tryenter(&vfs_suspend_lock))
780                               return SET_ERROR(EWOULDBLOCK);
781           } else
782                     mutex_enter(&vfs_suspend_lock);
783 
784           if ((error = VFS_SUSPENDCTL(fli->fli_mount, SUSPEND_SUSPEND)) != 0) {
785                     mutex_exit(&vfs_suspend_lock);
786                     return error;
787           }
788 
789           if ((mp->mnt_iflag & IMNT_GONE) != 0) {
790                     vfs_resume(mp);
791                     return SET_ERROR(ENOENT);
792           }
793 
794           return 0;
795 }
796 
797 /*
798  * Request a filesystem to resume all operations.
799  */
800 void
vfs_resume(struct mount * mp)801 vfs_resume(struct mount *mp)
802 {
803           struct fstrans_lwp_info *fli;
804 
805           KASSERT(mp != dead_rootmount);
806 
807           fli = fstrans_get_lwp_info(mp, false);
808           mp = fli->fli_mount;
809 
810           VFS_SUSPENDCTL(mp, SUSPEND_RESUME);
811           mutex_exit(&vfs_suspend_lock);
812 }
813 
814 /*
815  * True, if no thread is running a cow handler.
816  */
817 static bool
cow_state_change_done(const struct fstrans_mount_info * fmi)818 cow_state_change_done(const struct fstrans_mount_info *fmi)
819 {
820           struct fstrans_lwp_info *fli;
821 
822           KASSERT(mutex_owned(&fstrans_lock));
823           KASSERT(fmi->fmi_cow_change);
824 
825           LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
826                     if (fli->fli_mount != fmi->fmi_mount)
827                               continue;
828                     if (fli->fli_cow_cnt == 0)
829                               continue;
830 
831                     return false;
832           }
833 
834           return true;
835 }
836 
837 /*
838  * Prepare for changing this mounts cow list.
839  * Returns with fstrans_lock locked.
840  */
841 static void
cow_change_enter(struct fstrans_mount_info * fmi)842 cow_change_enter(struct fstrans_mount_info *fmi)
843 {
844 
845           mutex_enter(&fstrans_lock);
846 
847           /*
848            * Wait for other threads changing the list.
849            */
850           while (fmi->fmi_cow_change)
851                     cv_wait(&fstrans_state_cv, &fstrans_lock);
852 
853           /*
854            * Wait until all threads are aware of a state change.
855            */
856           fmi->fmi_cow_change = true;
857           pserialize_perform(fstrans_psz);
858 
859           while (! cow_state_change_done(fmi))
860                     cv_wait(&fstrans_count_cv, &fstrans_lock);
861 }
862 
863 /*
864  * Done changing this mounts cow list.
865  */
866 static void
cow_change_done(struct fstrans_mount_info * fmi)867 cow_change_done(struct fstrans_mount_info *fmi)
868 {
869 
870           KASSERT(mutex_owned(&fstrans_lock));
871 
872           fmi->fmi_cow_change = false;
873           pserialize_perform(fstrans_psz);
874 
875           cv_broadcast(&fstrans_state_cv);
876 
877           mutex_exit(&fstrans_lock);
878 }
879 
880 /*
881  * Add a handler to this mount.
882  */
883 int
fscow_establish(struct mount * mp,int (* func)(void *,struct buf *,bool),void * arg)884 fscow_establish(struct mount *mp, int (*func)(void *, struct buf *, bool),
885     void *arg)
886 {
887           struct fstrans_mount_info *fmi;
888           struct fscow_handler *newch;
889 
890           KASSERT(mp != dead_rootmount);
891 
892           mutex_enter(&fstrans_lock);
893           fmi = fstrans_mount_get(mp);
894           KASSERT(fmi != NULL);
895           fmi->fmi_ref_cnt += 1;
896           mutex_exit(&fstrans_lock);
897 
898           newch = kmem_alloc(sizeof(*newch), KM_SLEEP);
899           newch->ch_func = func;
900           newch->ch_arg = arg;
901 
902           cow_change_enter(fmi);
903           LIST_INSERT_HEAD(&fmi->fmi_cow_handler, newch, ch_list);
904           cow_change_done(fmi);
905 
906           return 0;
907 }
908 
909 /*
910  * Remove a handler from this mount.
911  */
912 int
fscow_disestablish(struct mount * mp,int (* func)(void *,struct buf *,bool),void * arg)913 fscow_disestablish(struct mount *mp, int (*func)(void *, struct buf *, bool),
914     void *arg)
915 {
916           struct fstrans_mount_info *fmi;
917           struct fscow_handler *hp = NULL;
918 
919           KASSERT(mp != dead_rootmount);
920 
921           mutex_enter(&fstrans_lock);
922           fmi = fstrans_mount_get(mp);
923           KASSERT(fmi != NULL);
924           mutex_exit(&fstrans_lock);
925 
926           cow_change_enter(fmi);
927           LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
928                     if (hp->ch_func == func && hp->ch_arg == arg)
929                               break;
930           if (hp != NULL) {
931                     LIST_REMOVE(hp, ch_list);
932                     kmem_free(hp, sizeof(*hp));
933           }
934           fstrans_mount_dtor(fmi);
935           cow_change_done(fmi);
936 
937           return hp ? 0 : SET_ERROR(EINVAL);
938 }
939 
940 /*
941  * Check for need to copy block that is about to be written.
942  */
943 int
fscow_run(struct buf * bp,bool data_valid)944 fscow_run(struct buf *bp, bool data_valid)
945 {
946           int error, s;
947           struct mount *mp;
948           struct fstrans_lwp_info *fli;
949           struct fstrans_mount_info *fmi;
950           struct fscow_handler *hp;
951 
952           /*
953            * First check if we need run the copy-on-write handler.
954            */
955           if ((bp->b_flags & B_COWDONE))
956                     return 0;
957           if (bp->b_vp == NULL) {
958                     bp->b_flags |= B_COWDONE;
959                     return 0;
960           }
961           if (bp->b_vp->v_type == VBLK)
962                     mp = spec_node_getmountedfs(bp->b_vp);
963           else
964                     mp = bp->b_vp->v_mount;
965           if (mp == NULL || mp == dead_rootmount) {
966                     bp->b_flags |= B_COWDONE;
967                     return 0;
968           }
969 
970           fli = fstrans_get_lwp_info(mp, true);
971           KASSERT(fli != NULL);
972           fmi = fli->fli_mountinfo;
973 
974           /*
975            * On non-recursed run check if other threads
976            * want to change the list.
977            */
978           if (fli->fli_cow_cnt == 0) {
979                     s = pserialize_read_enter();
980                     if (__predict_false(fmi->fmi_cow_change)) {
981                               pserialize_read_exit(s);
982                               mutex_enter(&fstrans_lock);
983                               while (fmi->fmi_cow_change)
984                                         cv_wait(&fstrans_state_cv, &fstrans_lock);
985                               fli->fli_cow_cnt = 1;
986                               mutex_exit(&fstrans_lock);
987                     } else {
988                               fli->fli_cow_cnt = 1;
989                               pserialize_read_exit(s);
990                     }
991           } else
992                     fli->fli_cow_cnt += 1;
993 
994           /*
995            * Run all copy-on-write handlers, stop on error.
996            */
997           error = 0;
998           LIST_FOREACH(hp, &fmi->fmi_cow_handler, ch_list)
999                     if ((error = (*hp->ch_func)(hp->ch_arg, bp, data_valid)) != 0)
1000                               break;
1001           if (error == 0)
1002                     bp->b_flags |= B_COWDONE;
1003 
1004           /*
1005            * Check if other threads want to change the list.
1006            */
1007           if (fli->fli_cow_cnt > 1) {
1008                     fli->fli_cow_cnt -= 1;
1009           } else {
1010                     s = pserialize_read_enter();
1011                     if (__predict_false(fmi->fmi_cow_change)) {
1012                               pserialize_read_exit(s);
1013                               mutex_enter(&fstrans_lock);
1014                               fli->fli_cow_cnt = 0;
1015                               cv_signal(&fstrans_count_cv);
1016                               mutex_exit(&fstrans_lock);
1017                     } else {
1018                               fli->fli_cow_cnt = 0;
1019                               pserialize_read_exit(s);
1020                     }
1021           }
1022 
1023           return error;
1024 }
1025 
1026 #if defined(DDB)
/* Debugger entry point, defined at the end of this DDB-only section. */
void fstrans_dump(int full);
1028 
1029 static void
fstrans_print_lwp(struct proc * p,struct lwp * l,int verbose)1030 fstrans_print_lwp(struct proc *p, struct lwp *l, int verbose)
1031 {
1032           char prefix[9];
1033           struct fstrans_lwp_info *fli;
1034 
1035           snprintf(prefix, sizeof(prefix), "%d.%d", p->p_pid, l->l_lid);
1036           LIST_FOREACH(fli, &fstrans_fli_head, fli_list) {
1037                     if (fli->fli_self != l)
1038                               continue;
1039                     if (fli->fli_trans_cnt == 0 && fli->fli_cow_cnt == 0) {
1040                               if (! verbose)
1041                                         continue;
1042                     }
1043                     printf("%-8s", prefix);
1044                     if (verbose)
1045                               printf(" @%p", fli);
1046                     if (fli->fli_mount == dead_rootmount)
1047                               printf(" <dead>");
1048                     else if (fli->fli_mount != NULL)
1049                               printf(" (%s)", fli->fli_mount->mnt_stat.f_mntonname);
1050                     else
1051                               printf(" NULL");
1052                     if (fli->fli_alias != NULL) {
1053                               struct mount *amp = fli->fli_alias->fli_mount;
1054 
1055                               printf(" alias");
1056                               if (verbose)
1057                                         printf(" @%p", fli->fli_alias);
1058                               if (amp == NULL)
1059                                         printf(" NULL");
1060                               else
1061                                         printf(" (%s)", amp->mnt_stat.f_mntonname);
1062                     }
1063                     if (fli->fli_mountinfo && fli->fli_mountinfo->fmi_gone)
1064                               printf(" gone");
1065                     if (fli->fli_trans_cnt == 0) {
1066                               printf(" -");
1067                     } else {
1068                               switch (fli->fli_lock_type) {
1069                               case FSTRANS_LAZY:
1070                                         printf(" lazy");
1071                                         break;
1072                               case FSTRANS_SHARED:
1073                                         printf(" shared");
1074                                         break;
1075                               default:
1076                                         printf(" %#x", fli->fli_lock_type);
1077                                         break;
1078                               }
1079                     }
1080                     printf(" %d cow %d alias %d\n",
1081                         fli->fli_trans_cnt, fli->fli_cow_cnt, fli->fli_alias_cnt);
1082                     prefix[0] = '\0';
1083           }
1084 }
1085 
1086 static void
fstrans_print_mount(struct mount * mp,int verbose)1087 fstrans_print_mount(struct mount *mp, int verbose)
1088 {
1089           uint32_t indx;
1090           struct fstrans_mount_info *fmi;
1091 
1092           indx = fstrans_mount_hash(mp);
1093           SLIST_FOREACH(fmi, &fstrans_mount_hashtab[indx], fmi_hash)
1094                     if (fmi->fmi_mount == mp)
1095                               break;
1096 
1097           if (!verbose && (fmi == NULL || fmi->fmi_state == FSTRANS_NORMAL))
1098                     return;
1099 
1100           printf("%-16s ", mp->mnt_stat.f_mntonname);
1101           if (fmi == NULL) {
1102                     printf("(null)\n");
1103                     return;
1104           }
1105           printf("owner %p ", fmi->fmi_owner);
1106           switch (fmi->fmi_state) {
1107           case FSTRANS_NORMAL:
1108                     printf("state normal\n");
1109                     break;
1110           case FSTRANS_SUSPENDING:
1111                     printf("state suspending\n");
1112                     break;
1113           case FSTRANS_SUSPENDED:
1114                     printf("state suspended\n");
1115                     break;
1116           default:
1117                     printf("state %#x\n", fmi->fmi_state);
1118                     break;
1119           }
1120 }
1121 
1122 void
fstrans_dump(int full)1123 fstrans_dump(int full)
1124 {
1125           const struct proclist_desc *pd;
1126           struct proc *p;
1127           struct lwp *l;
1128           struct mount *mp;
1129 
1130           printf("Fstrans locks by lwp:\n");
1131           for (pd = proclists; pd->pd_list != NULL; pd++)
1132                     PROCLIST_FOREACH(p, pd->pd_list)
1133                               LIST_FOREACH(l, &p->p_lwps, l_sibling)
1134                                         fstrans_print_lwp(p, l, full == 1);
1135 
1136           printf("Fstrans state by mount:\n");
1137           for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
1138                     fstrans_print_mount(mp, full == 1);
1139 }
1140 #endif /* defined(DDB) */
1141