xref: /dragonfly/sys/vfs/hammer/hammer_redo.c (revision 33e8e9be6b7c4055fd4732d96a344332c3347e43)
1 /*
2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
37  *
38  * See also hammer_undo.c
39  */
40 
41 #include "hammer.h"
42 
/*
 * Instantiate the red-black tree support code for hammer_redo_rb_tree.
 * The tree links hammer_inode structures through their rb_redonode field
 * and orders them with hammer_redo_rb_compare().  This file hangs the tree
 * off hmp->rb_redo_root and uses it to track inodes with active REDOs.
 *
 * NOTE(review): the trailing (hammer_off_t, redo_fifo_start) pair presumably
 * requests a lookup routine keyed on the inode's redo_fifo_start field;
 * confirm against the RB_GENERATE2 definition.
 */
RB_GENERATE2(hammer_redo_rb_tree,
	     hammer_inode,
	     rb_redonode,
	     hammer_redo_rb_compare,
	     hammer_off_t,
	     redo_fifo_start);
45 
46 /*
47  * HAMMER version 4+ REDO support.
48  *
49  * REDO records are used to improve fsync() performance.  Instead of having
50  * to go through a complete double-flush cycle involving at least two disk
51  * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
52  * the related REDO records, which is a single synchronization requiring
53  * no track seeking.  If a recovery becomes necessary the recovery code
54  * will generate logical data writes based on the REDO records encountered.
55  * That is, the recovery code will UNDO any partial meta-data/data writes
56  * at the raw disk block level and then REDO the data writes at the logical
57  * level.
58  */
59 int
hammer_generate_redo(hammer_transaction_t trans,hammer_inode_t ip,hammer_off_t file_off,uint32_t flags,void * base,int len)60 hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
61                          hammer_off_t file_off, uint32_t flags,
62                          void *base, int len)
63 {
64           hammer_mount_t hmp;
65           hammer_volume_t root_volume;
66           hammer_blockmap_t undomap;
67           hammer_buffer_t buffer = NULL;
68           hammer_fifo_redo_t redo;
69           hammer_fifo_tail_t tail;
70           hammer_off_t next_offset;
71           int error;
72           int bytes;
73           int n;
74 
75           /*
76            * Setup
77            */
78           hmp = trans->hmp;
79 
80           root_volume = trans->rootvol;
81           undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
82 
83           /*
84            * No undo recursion when modifying the root volume
85            */
86           hammer_modify_volume_noundo(NULL, root_volume);
87           hammer_lock_ex(&hmp->undo_lock);
88 
89           /* undo had better not roll over (loose test) */
90           if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
91                     hpanic("insufficient UNDO/REDO FIFO space for redo!");
92 
93           /*
94            * Loop until the undo for the entire range has been laid down.
95            * Loop at least once (len might be 0 as a degenerate case).
96            */
97           for (;;) {
98                     /*
99                      * Fetch the layout offset in the UNDO FIFO, wrap it as
100                      * necessary.
101                      */
102                     if (undomap->next_offset == undomap->alloc_offset)
103                               undomap->next_offset = HAMMER_ENCODE_UNDO(0);
104                     next_offset = undomap->next_offset;
105 
106                     /*
107                      * This is a tail-chasing FIFO, when we hit the start of a new
108                      * buffer we don't have to read it in.
109                      */
110                     if ((next_offset & HAMMER_BUFMASK) == 0) {
111                               redo = hammer_bnew(hmp, next_offset, &error, &buffer);
112                               hammer_format_undo(hmp,
113                                                      redo, hmp->undo_seqno ^ 0x40000000);
114                     } else {
115                               redo = hammer_bread(hmp, next_offset, &error, &buffer);
116                     }
117                     if (error)
118                               break;
119                     hammer_modify_buffer_noundo(NULL, buffer);
120 
121                     /*
122                      * Calculate how big a media structure fits up to the next
123                      * alignment point and how large a data payload we can
124                      * accomodate.
125                      *
126                      * If n calculates to 0 or negative there is no room for
127                      * anything but a PAD.
128                      */
129                     bytes = HAMMER_UNDO_ALIGN -
130                               ((int)next_offset & HAMMER_UNDO_MASK);
131                     n = bytes -
132                         (int)sizeof(struct hammer_fifo_redo) -
133                         (int)sizeof(struct hammer_fifo_tail);
134 
135                     /*
136                      * If available space is insufficient for any payload
137                      * we have to lay down a PAD.
138                      *
139                      * The minimum PAD is 8 bytes and the head and tail will
140                      * overlap each other in that case.  PADs do not have
141                      * sequence numbers or CRCs.
142                      *
143                      * A PAD may not start on a boundary.  That is, every
144                      * 512-byte block in the UNDO/REDO FIFO must begin with
145                      * a record containing a sequence number.
146                      */
147                     if (n <= 0) {
148                               KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
149                               KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
150                               tail = (void *)((char *)redo + bytes - sizeof(*tail));
151                               if ((void *)redo != (void *)tail) {
152                                         tail->tail_signature = HAMMER_TAIL_SIGNATURE;
153                                         tail->tail_type = HAMMER_HEAD_TYPE_PAD;
154                                         tail->tail_size = bytes;
155                               }
156                               redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
157                               redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
158                               redo->head.hdr_size = bytes;
159                               /* NO CRC OR SEQ NO */
160                               undomap->next_offset += bytes;
161                               hammer_modify_buffer_done(buffer);
162                               hammer_stats_redo += bytes;
163                               continue;
164                     }
165 
166                     /*
167                      * When generating an inode-related REDO record we track
168                      * the point in the UNDO/REDO FIFO containing the inode's
169                      * earliest REDO record.  See hammer_generate_redo_sync().
170                      *
171                      * redo_fifo_next is cleared when an inode is staged to
172                      * the backend and then used to determine how to reassign
173                      * redo_fifo_start after the inode flush completes.
174                      */
175                     if (ip) {
176                               redo->redo_objid = ip->obj_id;
177                               redo->redo_localization = ip->obj_localization;
178                               if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
179                                         ip->redo_fifo_start = next_offset;
180                                         if (RB_INSERT(hammer_redo_rb_tree,
181                                                         &hmp->rb_redo_root, ip)) {
182                                                   hpanic("cannot insert inode %p on "
183                                                         "redo FIFO", ip);
184                                         }
185                                         ip->flags |= HAMMER_INODE_RDIRTY;
186                               }
187                               if (ip->redo_fifo_next == 0)
188                                         ip->redo_fifo_next = next_offset;
189                     } else {
190                               redo->redo_objid = 0;
191                               redo->redo_localization = 0;
192                     }
193 
194                     /*
195                      * Calculate the actual payload and recalculate the size
196                      * of the media structure as necessary.  If no data buffer
197                      * is supplied there is no payload.
198                      */
199                     if (base == NULL) {
200                               n = 0;
201                     } else if (n > len) {
202                               n = len;
203                     }
204                     bytes = HAMMER_HEAD_DOALIGN(n) +
205                               (int)sizeof(struct hammer_fifo_redo) +
206                               (int)sizeof(struct hammer_fifo_tail);
207                     if (hammer_debug_general & 0x0080) {
208                               hdkprintf("redo %016jx %d %d\n",
209                                         (intmax_t)next_offset, bytes, n);
210                     }
211 
212                     redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
213                     redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
214                     redo->head.hdr_size = bytes;
215                     redo->head.hdr_seq = hmp->undo_seqno++;
216                     redo->head.hdr_crc = 0;
217                     redo->redo_offset = file_off;
218                     redo->redo_flags = flags;
219 
220                     /*
221                      * Incremental payload.  If no payload we throw the entire
222                      * len into redo_data_bytes and will not loop.
223                      */
224                     if (base) {
225                               redo->redo_data_bytes = n;
226                               bcopy(base, redo + 1, n);
227                               len -= n;
228                               base = (char *)base + n;
229                               file_off += n;
230                     } else {
231                               redo->redo_data_bytes = len;
232                               file_off += len;
233                               len = 0;
234                     }
235 
236                     tail = (void *)((char *)redo + bytes - sizeof(*tail));
237                     tail->tail_signature = HAMMER_TAIL_SIGNATURE;
238                     tail->tail_type = HAMMER_HEAD_TYPE_REDO;
239                     tail->tail_size = bytes;
240 
241                     KKASSERT(bytes >= sizeof(redo->head));
242                     hammer_crc_set_fifo_head(hmp->version, &redo->head, bytes);
243                     undomap->next_offset += bytes;
244                     hammer_stats_redo += bytes;
245 
246                     /*
247                      * Before we finish off the buffer we have to deal with any
248                      * junk between the end of the media structure we just laid
249                      * down and the UNDO alignment boundary.  We do this by laying
250                      * down a dummy PAD.  Even though we will probably overwrite
251                      * it almost immediately we have to do this so recovery runs
252                      * can iterate the UNDO space without having to depend on
253                      * the indices in the volume header.
254                      *
255                      * This dummy PAD will be overwritten on the next undo so
256                      * we do not adjust undomap->next_offset.
257                      */
258                     bytes = HAMMER_UNDO_ALIGN -
259                               ((int)undomap->next_offset & HAMMER_UNDO_MASK);
260                     if (bytes != HAMMER_UNDO_ALIGN) {
261                               KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
262                               redo = (void *)(tail + 1);
263                               tail = (void *)((char *)redo + bytes - sizeof(*tail));
264                               if ((void *)redo != (void *)tail) {
265                                         tail->tail_signature = HAMMER_TAIL_SIGNATURE;
266                                         tail->tail_type = HAMMER_HEAD_TYPE_PAD;
267                                         tail->tail_size = bytes;
268                               }
269                               redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
270                               redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
271                               redo->head.hdr_size = bytes;
272                               /* NO CRC OR SEQ NO */
273                     }
274                     hammer_modify_buffer_done(buffer);
275                     if (len == 0)
276                               break;
277           }
278           hammer_modify_volume_done(root_volume);
279           hammer_unlock(&hmp->undo_lock);
280 
281           if (buffer)
282                     hammer_rel_buffer(buffer, 0);
283 
284           /*
285            * Make sure the nominal undo span contains at least one REDO_SYNC,
286            * otherwise the REDO recovery will not be triggered.
287            */
288           if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
289               flags != HAMMER_REDO_SYNC) {
290                     hammer_generate_redo_sync(trans);
291           }
292 
293           return(error);
294 }
295 
296 /*
297  * Generate a REDO SYNC record.  At least one such record must be generated
298  * in the nominal recovery span for the recovery code to be able to run
299  * REDOs outside of the span.
300  *
301  * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
302  * for all inodes with active REDOs.  This changes dynamically as inodes
303  * get flushed.
304  *
305  * During recovery stage2 any new flush cycles must specify the original
306  * redo sync offset.  That way a crash will re-run the REDOs, at least
307  * up to the point where the UNDO FIFO does not overwrite the area.
308  */
309 void
hammer_generate_redo_sync(hammer_transaction_t trans)310 hammer_generate_redo_sync(hammer_transaction_t trans)
311 {
312           hammer_mount_t hmp = trans->hmp;
313           hammer_inode_t ip;
314           hammer_off_t redo_fifo_start;
315 
316           if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
317                     ip = NULL;
318                     redo_fifo_start = hmp->recover_stage2_offset;
319           } else {
320                     ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
321                     if (ip)
322                               redo_fifo_start = ip->redo_fifo_start;
323                     else
324                               redo_fifo_start = 0;
325           }
326           if (redo_fifo_start) {
327                     if (hammer_debug_io & 0x0004) {
328                               hdkprintf("SYNC IP %p %016jx\n",
329                                         ip, (intmax_t)redo_fifo_start);
330                     }
331                     hammer_generate_redo(trans, NULL, redo_fifo_start,
332                                              HAMMER_REDO_SYNC, NULL, 0);
333                     trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
334           }
335 }
336 
337 /*
338  * This is called when an inode is queued to the backend.
339  */
340 void
hammer_redo_fifo_start_flush(hammer_inode_t ip)341 hammer_redo_fifo_start_flush(hammer_inode_t ip)
342 {
343           ip->redo_fifo_next = 0;
344 }
345 
346 /*
347  * This is called when an inode backend flush is finished.  We have to make
348  * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
349  * can get destroyed through operations such as truncations and leave
350  * us with a stale redo_fifo_next.
351  */
352 void
hammer_redo_fifo_end_flush(hammer_inode_t ip)353 hammer_redo_fifo_end_flush(hammer_inode_t ip)
354 {
355           hammer_mount_t hmp = ip->hmp;
356 
357           hammer_lock_ex(&hmp->undo_lock);
358           if (ip->flags & HAMMER_INODE_RDIRTY) {
359                     RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
360                     ip->flags &= ~HAMMER_INODE_RDIRTY;
361           }
362           if ((ip->flags & HAMMER_INODE_BUFS) == 0)
363                     ip->redo_fifo_next = 0;
364           if (ip->redo_fifo_next) {
365                     ip->redo_fifo_start = ip->redo_fifo_next;
366                     if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
367                               hpanic("cannot reinsert inode %p on redo FIFO", ip);
368                     }
369                     ip->flags |= HAMMER_INODE_RDIRTY;
370           }
371           hammer_unlock(&hmp->undo_lock);
372 }
373