1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
24 */
25
26 #include <sys/dmu_objset.h>
27 #include <sys/metaslab.h>
28 #include <sys/metaslab_impl.h>
29 #include <sys/spa.h>
30 #include <sys/spa_impl.h>
31 #include <sys/spa_log_spacemap.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zap.h>
34
35 /*
36 * Log Space Maps
37 *
38 * Log space maps are an optimization in ZFS metadata allocations for pools
39 * whose workloads are primarily random-writes. Random-write workloads are also
40 * typically random-free, meaning that they are freeing from locations scattered
41 * throughout the pool. This means that each TXG we will have to append some
42 * FREE records to almost every metaslab. With log space maps, we hold their
43 * changes in memory and log them altogether in one pool-wide space map on-disk
44 * for persistence. As more blocks are accumulated in the log space maps and
45 * more unflushed changes are accounted in memory, we flush a selected group
46 * of metaslabs every TXG to relieve memory pressure and potential overheads
47 * when loading the pool. Flushing a metaslab to disk relieves memory as we
48 * flush any unflushed changes from memory to disk (i.e. the metaslab's space
49 * map) and saves import time by making old log space maps obsolete and
50 * eventually destroying them. [A log space map is said to be obsolete when all
51 * its entries have made it to their corresponding metaslab space maps].
52 *
53 * == On disk data structures used ==
54 *
55 * - The pool has a new feature flag and a new entry in the MOS. The feature
56 * is activated when we create the first log space map and remains active
57 * for the lifetime of the pool. The new entry in the MOS Directory [refer
58 * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
59 * pairs are of the form <key: txg, value: log space map object for that txg>.
60 * This entry is our on-disk reference of the log space maps that exist in
61 * the pool for each TXG and it is used during import to load all the
62 * metaslab unflushed changes in memory. To see how this structure is first
63 * created and later populated refer to spa_generate_syncing_log_sm(). To see
64 * how it is used during import time refer to spa_ld_log_sm_metadata().
65 *
66 * - Each vdev has a new entry in its vdev_top_zap (see field
67 * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
68 * each metaslab in this vdev. This field is the on-disk counterpart of the
69 * in-memory field ms_unflushed_txg which tells us from which TXG and onwards
 *   the metaslab hasn't had its changes flushed. During import, we use this
71 * to ignore any entries in the space map log that are for this metaslab but
72 * from a TXG before msp_unflushed_txg. At that point, we also populate its
73 * in-memory counterpart and from there both fields are updated every time
74 * we flush that metaslab.
75 *
76 * - A space map is created every TXG and, during that TXG, it is used to log
77 * all incoming changes (the log space map). When created, the log space map
78 * is referenced in memory by spa_syncing_log_sm and its object ID is inserted
79 * to the space map ZAP mentioned above. The log space map is closed at the
80 * end of the TXG and will be destroyed when it becomes fully obsolete. We
81 * know when a log space map has become obsolete by looking at the oldest
82 * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
83 * than the log space map's TXG, then it means that there is no metaslab who
84 * doesn't have the changes from that log and we can therefore destroy it.
85 * [see spa_cleanup_old_sm_logs()].
86 *
87 * == Important in-memory structures ==
88 *
89 * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
90 * the pool by their ms_unflushed_txg field. It is primarily used for three
91 * reasons. First of all, it is used during flushing where we try to flush
92 * metaslabs in-order from the oldest-flushed to the most recently flushed
93 * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
94 * oldest flushed metaslab to distinguish which log space maps have become
95 * obsolete and which ones are still relevant. Finally it tells us which
96 * metaslabs have unflushed changes in a pool where this feature was just
97 * enabled, as we don't immediately add all of the pool's metaslabs but we
98 * add them over time as they go through metaslab_sync(). The reason that
99 * we do that is to ease these pools into the behavior of the flushing
100 * algorithm (described later on).
101 *
 * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
103 * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
104 * nodes represent the log space maps in the pool. This in-memory
105 * representation of log space maps in the pool sorts the log space maps by
106 * the TXG that they were created (which is also the TXG of their unflushed
107 * changes). It also contains the following extra information for each
108 * space map:
109 * [1] The number of metaslabs that were last flushed on that TXG. This is
110 * important because if that counter is zero and this is the oldest
111 * log then it means that it is also obsolete.
112 * [2] The number of blocks of that space map. This field is used by the
113 * block heuristic of our flushing algorithm (described later on).
114 * It represents how many blocks of metadata changes ZFS had to write
115 * to disk for that TXG.
116 *
117 * - The per-spa field spa_log_summary is a list of entries that summarizes
118 * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
119 * AVL tree mentioned above. The reason this exists is that our flushing
120 * algorithm (described later) tries to estimate how many metaslabs to flush
121 * in each TXG by iterating over all the log space maps and looking at their
 *   block counts. Summarizing that information means that we don't have to
123 * iterate through each space map, minimizing the runtime overhead of the
124 * flushing algorithm which would be induced in syncing context. In terms of
125 * implementation the log summary is used as a queue:
126 * * we modify or pop entries from its head when we flush metaslabs
127 * * we modify or append entries to its tail when we sync changes.
128 *
129 * - Each metaslab has two new range trees that hold its unflushed changes,
130 * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
131 *
132 * == Flushing algorithm ==
133 *
 * The decision of how many metaslabs to flush on a given TXG is guided by
135 * two heuristics:
136 *
137 * [1] The memory heuristic -
138 * We keep track of the memory used by the unflushed trees from all the
139 * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
140 * stays below a certain threshold which is determined by an arbitrary hard
141 * limit and an arbitrary percentage of the system's memory [see
142 * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
143 * unflushed changes are passing that threshold, we flush metaslabs, which
144 * empties their unflushed range trees, reducing the memory used.
145 *
146 * [2] The block heuristic -
147 * We try to keep the total number of blocks in the log space maps in check
148 * so the log doesn't grow indefinitely and we don't induce a lot of overhead
149 * when loading the pool. At the same time we don't want to flush a lot of
150 * metaslabs too often as this would defeat the purpose of the log space map.
151 * As a result we set a limit in the amount of blocks that we think it's
152 * acceptable for the log space maps to have and try not to cross it.
153 * [see sus_blocklimit from spa_unflushed_stats].
154 *
155 * In order to stay below the block limit every TXG we have to estimate how
156 * many metaslabs we need to flush based on the current rate of incoming blocks
157 * and our history of log space map blocks. The main idea here is to answer
 * the question of how many metaslabs do we need to flush in order to get rid
 * of at least an X amount of log space map blocks. We can answer this question
160 * by iterating backwards from the oldest log space map to the newest one
161 * and looking at their metaslab and block counts. At this point the log summary
162 * mentioned above comes handy as it reduces the amount of things that we have
163 * to iterate (even though it may reduce the preciseness of our estimates due
164 * to its aggregation of data). So with that in mind, we project the incoming
165 * rate of the current TXG into the future and attempt to approximate how many
166 * metaslabs would we need to flush from now in order to avoid exceeding our
167 * block limit in different points in the future (granted that we would keep
168 * flushing the same number of metaslabs for every TXG). Then we take the
169 * maximum number from all these estimates to be on the safe side. For the
 * exact implementation details of the algorithm refer to
 * spa_estimate_metaslabs_to_flush().
172 */
173
/*
 * This is used as the block size for the space maps used for the
 * log space map feature. These space maps benefit from a bigger
 * block size as we expect to be writing a lot of data to them at
 * once.
 *
 * The default of 1ULL << 17 is 131072.
 */
unsigned long zfs_log_sm_blksz = 1ULL << 17;
181
/*
 * Percentage of the overall system's memory that ZFS allows to be
 * used for unflushed changes (e.g. the sum of size of all the nodes
 * in the unflushed trees).
 *
 * Note that this value is calculated over 1000000 for finer granularity
 * (thus the _ppm suffix; reads as "parts per million"). As an example,
 * the default of 1000 allows 0.1% of memory to be used.
 *
 * The value is consumed by spa_log_exceeds_memlimit(), which multiplies
 * (physmem * PAGESIZE) by it and divides the result by 1000000.
 */
unsigned long zfs_unflushed_max_mem_ppm = 1000;
192
/*
 * Specific hard-limit in memory that ZFS allows to be used for
 * unflushed changes.
 *
 * spa_log_exceeds_memlimit() compares spa_log_sm_memused() against this
 * value. The default of 1ULL << 30 is 1073741824.
 */
unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
198
/*
 * The following tunable determines the number of blocks that can be used for
 * the log space maps. It is expressed as a percentage of the total number of
 * metaslabs in the pool (i.e. the default of 400 means that the number of log
 * blocks is capped at 4 times the number of metaslabs).
 *
 * NOTE(review): spa_log_sm_set_blocklimit() applies this percentage to the
 * sum of lse_msdcount over the log summary entries rather than to a count of
 * all the metaslabs in the pool - confirm which of the two the paragraph
 * above is meant to describe.
 *
 * This value exists to tune our flushing algorithm, with higher values
 * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
 * flushing metaslabs more aggressively with the upside of saving overheads
 * when loading the pool. Another factor in this tradeoff is that flushing
 * less often can potentially lead to better utilization of the metaslab space
 * map's block size as we accumulate more changes per flush.
 *
 * This tunable indirectly controls the flush rate (metaslabs flushed per
 * txg), which is why expressing it as a percentage of the number of
 * metaslabs in the pool makes sense here.
 *
 * As a rule of thumb we default this tunable to 400% based on the following:
 *
 * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
 *    it is reasonable to expect that the amount of obsolete entries changes
 *    linearly from txg to txg (e.g. the oldest log should have the most
 *    obsolete entries, and the most recent one the least). With this we could
 *    say that, at any given time, about half of the entries in the whole space
 *    map log are obsolete. Thus for every two entries for a metaslab in the
 *    log space map, only one of them is valid and actually makes it to the
 *    metaslab's space map.
 *    [factor of 2]
 * 2] Each entry in the log space map is guaranteed to be two words while
 *    entries in metaslab space maps are generally single-word.
 *    [an extra factor of 2 - 400% overall]
 * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
 *    account any consolidation of segments from the log space map to the
 *    unflushed range trees nor their history (e.g. a segment being allocated,
 *    then freed, then allocated again means 3 log space map entries but 0
 *    metaslab space map entries). Depending on the workload, we've seen ~1.8
 *    non-obsolete log space map entries per metaslab entry, for a total of
 *    ~600%. Since most of these estimates though are workload dependent, we
 *    default on 400% to be conservative.
 *
 * Thus we could say that even in the worst case of [1] and [2], the factor
 * should end up being 4.
 *
 * That said, regardless of the number of metaslabs in the pool we need to
 * provide upper and lower bounds for the log block limit.
 * [see zfs_unflushed_log_block_{min,max}]
 */
unsigned long zfs_unflushed_log_block_pct = 400;
247
/*
 * If the number of metaslabs is small and our incoming rate is high, we could
 * get into a situation that we are flushing all our metaslabs every TXG. Thus
 * we always allow at least this many log blocks.
 *
 * spa_log_sm_set_blocklimit() uses this value as the lower clamp of the
 * computed block limit, before the upper clamp is applied.
 */
unsigned long zfs_unflushed_log_block_min = 1000;
254
/*
 * If the log becomes too big, the import time of the pool can take a hit in
 * terms of performance. Thus we have a hard limit in the size of the log in
 * terms of blocks.
 *
 * spa_log_sm_set_blocklimit() uses this value as the upper clamp of the
 * computed block limit. The default of 1ULL << 17 is 131072 blocks.
 */
static unsigned long zfs_unflushed_log_block_max = (1ULL << 17);
261
/*
 * Also we have a hard limit in the size of the log in terms of dirty TXGs.
 *
 * summary_entry_is_full() divides this value by
 * zfs_max_logsm_summary_length to bound the TXGs of a single summary entry,
 * and spa_estimate_metaslabs_to_flush() starts its TXG budget from it.
 */
static unsigned long zfs_unflushed_log_txg_max = 1000;
266
/*
 * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
 * stability of the flushing algorithm (longer summary) vs its runtime overhead
 * (smaller summary is faster to traverse).
 *
 * summary_entry_is_full() uses this value as the divisor when working out
 * how many TXGs and how many blocks a single summary entry may hold.
 */
unsigned long zfs_max_logsm_summary_length = 10;
273
/*
 * Tunable that sets the lower bound on the metaslabs to flush every TXG.
 *
 * Setting this to 0 has no effect since if the pool is idle we won't even be
 * creating log space maps and therefore we won't be flushing. On the other
 * hand if the pool has any incoming workload our block heuristic will start
 * flushing metaslabs anyway.
 *
 * The point of this tunable is to be used in extreme cases where we really
 * want to flush more metaslabs than our adaptable heuristic plans to flush.
 *
 * spa_estimate_metaslabs_to_flush() uses this value as the starting point
 * of the running maximum that it returns.
 */
unsigned long zfs_min_metaslabs_to_flush = 1;
286
/*
 * Tunable that specifies how far in the past do we want to look when trying to
 * estimate the incoming log blocks for the current TXG.
 *
 * Setting this too high may not only increase runtime but also minimize the
 * effect of the incoming rates from the most recent TXGs as we take the
 * average over all the blocks that we walk
 * [see spa_estimate_incoming_log_blocks].
 *
 * The value caps the number of log space maps that get sampled; the log of
 * the syncing TXG is skipped and does not count towards it.
 */
unsigned long zfs_max_log_walking = 5;
297
/*
 * This tunable exists solely for testing purposes. It ensures that the log
 * spacemaps are not flushed and destroyed during export in order for the
 * relevant log spacemap import code paths to be tested (effectively simulating
 * a crash).
 *
 * NOTE(review): the default is 0; presumably any non-zero value enables the
 * behavior described above - verify against the code that reads it.
 */
int zfs_keep_log_spacemaps_at_export = 0;
305
306 static uint64_t
spa_estimate_incoming_log_blocks(spa_t * spa)307 spa_estimate_incoming_log_blocks(spa_t *spa)
308 {
309 ASSERT3U(spa_sync_pass(spa), ==, 1);
310 uint64_t steps = 0, sum = 0;
311 for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
312 sls != NULL && steps < zfs_max_log_walking;
313 sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
314 if (sls->sls_txg == spa_syncing_txg(spa)) {
315 /*
316 * skip the log created in this TXG as this would
317 * make our estimations inaccurate.
318 */
319 continue;
320 }
321 sum += sls->sls_nblocks;
322 steps++;
323 }
324 return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
325 }
326
327 uint64_t
spa_log_sm_blocklimit(spa_t * spa)328 spa_log_sm_blocklimit(spa_t *spa)
329 {
330 return (spa->spa_unflushed_stats.sus_blocklimit);
331 }
332
333 void
spa_log_sm_set_blocklimit(spa_t * spa)334 spa_log_sm_set_blocklimit(spa_t *spa)
335 {
336 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
337 ASSERT0(spa_log_sm_blocklimit(spa));
338 return;
339 }
340
341 uint64_t msdcount = 0;
342 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
343 e; e = list_next(&spa->spa_log_summary, e))
344 msdcount += e->lse_msdcount;
345
346 uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
347 spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
348 zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
349 }
350
351 uint64_t
spa_log_sm_nblocks(spa_t * spa)352 spa_log_sm_nblocks(spa_t *spa)
353 {
354 return (spa->spa_unflushed_stats.sus_nblocks);
355 }
356
357 /*
358 * Ensure that the in-memory log space map structures and the summary
359 * have the same block and metaslab counts.
360 */
361 static void
spa_log_summary_verify_counts(spa_t * spa)362 spa_log_summary_verify_counts(spa_t *spa)
363 {
364 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
365
366 if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
367 return;
368
369 uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
370
371 uint64_t ms_in_summary = 0, blk_in_summary = 0;
372 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
373 e; e = list_next(&spa->spa_log_summary, e)) {
374 ms_in_summary += e->lse_mscount;
375 blk_in_summary += e->lse_blkcount;
376 }
377
378 uint64_t ms_in_logs = 0, blk_in_logs = 0;
379 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
380 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
381 ms_in_logs += sls->sls_mscount;
382 blk_in_logs += sls->sls_nblocks;
383 }
384
385 VERIFY3U(ms_in_logs, ==, ms_in_summary);
386 VERIFY3U(ms_in_logs, ==, ms_in_avl);
387 VERIFY3U(blk_in_logs, ==, blk_in_summary);
388 VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
389 }
390
391 static boolean_t
summary_entry_is_full(spa_t * spa,log_summary_entry_t * e,uint64_t txg)392 summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
393 {
394 if (e->lse_end == txg)
395 return (0);
396 if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
397 zfs_max_logsm_summary_length))
398 return (1);
399 uint64_t blocks_per_row = MAX(1,
400 DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
401 zfs_max_logsm_summary_length));
402 return (blocks_per_row <= e->lse_blkcount);
403 }
404
405 /*
406 * Update the log summary information to reflect the fact that a metaslab
407 * was flushed or destroyed (e.g due to device removal or pool export/destroy).
408 *
409 * We typically flush the oldest flushed metaslab so the first (and oldest)
410 * entry of the summary is updated. However if that metaslab is getting loaded
411 * we may flush the second oldest one which may be part of an entry later in
412 * the summary. Moreover, if we call into this function from metaslab_fini()
413 * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
414 * for a txg as an argument so we can locate the appropriate summary entry for
415 * the metaslab.
416 */
417 void
spa_log_summary_decrement_mscount(spa_t * spa,uint64_t txg,boolean_t dirty)418 spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
419 {
420 /*
421 * We don't track summary data for read-only pools and this function
422 * can be called from metaslab_fini(). In that case return immediately.
423 */
424 if (!spa_writeable(spa))
425 return;
426
427 log_summary_entry_t *target = NULL;
428 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
429 e != NULL; e = list_next(&spa->spa_log_summary, e)) {
430 if (e->lse_start > txg)
431 break;
432 target = e;
433 }
434
435 if (target == NULL || target->lse_mscount == 0) {
436 /*
437 * We didn't find a summary entry for this metaslab. We must be
438 * at the teardown of a spa_load() attempt that got an error
439 * while reading the log space maps.
440 */
441 VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
442 return;
443 }
444
445 target->lse_mscount--;
446 if (dirty)
447 target->lse_msdcount--;
448 }
449
450 /*
451 * Update the log summary information to reflect the fact that we destroyed
452 * old log space maps. Since we can only destroy the oldest log space maps,
453 * we decrement the block count of the oldest summary entry and potentially
454 * destroy it when that count hits 0.
455 *
456 * This function is called after a metaslab is flushed and typically that
457 * metaslab is the oldest flushed, which means that this function will
458 * typically decrement the block count of the first entry of the summary and
459 * potentially free it if the block count gets to zero (its metaslab count
460 * should be zero too at that point).
461 *
462 * There are certain scenarios though that don't work exactly like that so we
463 * need to account for them:
464 *
465 * Scenario [1]: It is possible that after we flushed the oldest flushed
466 * metaslab and we destroyed the oldest log space map, more recent logs had 0
467 * metaslabs pointing to them so we got rid of them too. This can happen due
468 * to metaslabs being destroyed through device removal, or because the oldest
469 * flushed metaslab was loading but we kept flushing more recently flushed
470 * metaslabs due to the memory pressure of unflushed changes. Because of that,
471 * we always iterate from the beginning of the summary and if blocks_gone is
472 * bigger than the block_count of the current entry we free that entry (we
473 * expect its metaslab count to be zero), we decrement blocks_gone and on to
474 * the next entry repeating this procedure until blocks_gone gets decremented
475 * to 0. Doing this also works for the typical case mentioned above.
476 *
477 * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
478 * the first (and oldest) entry in the summary. If the first few entries of
479 * the summary were only accounting metaslabs from a device that was just
480 * removed, then the current oldest flushed metaslab could be accounted by an
481 * entry somewhere in the middle of the summary. Moreover flushing that
482 * metaslab will destroy all the log space maps older than its ms_unflushed_txg
483 * because they became obsolete after the removal. Thus, iterating as we did
484 * for scenario [1] works out for this case too.
485 *
486 * Scenario [3]: At times we decide to flush all the metaslabs in the pool
487 * in one TXG (either because we are exporting the pool or because our flushing
488 * heuristics decided to do so). When that happens all the log space maps get
489 * destroyed except the one created for the current TXG which doesn't have
490 * any log blocks yet. As log space maps get destroyed with every metaslab that
491 * we flush, entries in the summary are also destroyed. This brings a weird
492 * corner-case when we flush the last metaslab and the log space map of the
493 * current TXG is in the same summary entry with other log space maps that
494 * are older. When that happens we are eventually left with this one last
495 * summary entry whose blocks are gone (blocks_gone equals the entry's block
496 * count) but its metaslab count is non-zero (because it accounts all the
497 * metaslabs in the pool as they all got flushed). Under this scenario we can't
498 * free this last summary entry as it's referencing all the metaslabs in the
499 * pool and its block count will get incremented at the end of this sync (when
500 * we close the syncing log space map). Thus we just decrement its current
501 * block count and leave it alone. In the case that the pool gets exported,
502 * its metaslab count will be decremented over time as we call metaslab_fini()
503 * for all the metaslabs in the pool and the entry will be freed at
504 * spa_unload_log_sm_metadata().
505 */
506 void
spa_log_summary_decrement_blkcount(spa_t * spa,uint64_t blocks_gone)507 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
508 {
509 log_summary_entry_t *e = list_head(&spa->spa_log_summary);
510 if (e->lse_txgcount > 0)
511 e->lse_txgcount--;
512 for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
513 if (e->lse_blkcount > blocks_gone) {
514 e->lse_blkcount -= blocks_gone;
515 blocks_gone = 0;
516 break;
517 } else if (e->lse_mscount == 0) {
518 /* remove obsolete entry */
519 blocks_gone -= e->lse_blkcount;
520 list_remove(&spa->spa_log_summary, e);
521 kmem_free(e, sizeof (log_summary_entry_t));
522 } else {
523 /* Verify that this is scenario [3] mentioned above. */
524 VERIFY3U(blocks_gone, ==, e->lse_blkcount);
525
526 /*
527 * Assert that this is scenario [3] further by ensuring
528 * that this is the only entry in the summary.
529 */
530 VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
531 ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
532
533 blocks_gone = e->lse_blkcount = 0;
534 break;
535 }
536 }
537
538 /*
539 * Ensure that there is no way we are trying to remove more blocks
540 * than the # of blocks in the summary.
541 */
542 ASSERT0(blocks_gone);
543 }
544
545 void
spa_log_sm_decrement_mscount(spa_t * spa,uint64_t txg)546 spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
547 {
548 spa_log_sm_t target = { .sls_txg = txg };
549 spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
550 &target, NULL);
551
552 if (sls == NULL) {
553 /*
554 * We must be at the teardown of a spa_load() attempt that
555 * got an error while reading the log space maps.
556 */
557 VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
558 return;
559 }
560
561 ASSERT(sls->sls_mscount > 0);
562 sls->sls_mscount--;
563 }
564
565 void
spa_log_sm_increment_current_mscount(spa_t * spa)566 spa_log_sm_increment_current_mscount(spa_t *spa)
567 {
568 spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
569 ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
570 last_sls->sls_mscount++;
571 }
572
573 static void
summary_add_data(spa_t * spa,uint64_t txg,uint64_t metaslabs_flushed,uint64_t metaslabs_dirty,uint64_t nblocks)574 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
575 uint64_t metaslabs_dirty, uint64_t nblocks)
576 {
577 log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
578
579 if (e == NULL || summary_entry_is_full(spa, e, txg)) {
580 e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
581 e->lse_start = e->lse_end = txg;
582 e->lse_txgcount = 1;
583 list_insert_tail(&spa->spa_log_summary, e);
584 }
585
586 ASSERT3U(e->lse_start, <=, txg);
587 if (e->lse_end < txg) {
588 e->lse_end = txg;
589 e->lse_txgcount++;
590 }
591 e->lse_mscount += metaslabs_flushed;
592 e->lse_msdcount += metaslabs_dirty;
593 e->lse_blkcount += nblocks;
594 }
595
596 static void
spa_log_summary_add_incoming_blocks(spa_t * spa,uint64_t nblocks)597 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
598 {
599 summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
600 }
601
602 void
spa_log_summary_add_flushed_metaslab(spa_t * spa,boolean_t dirty)603 spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
604 {
605 summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
606 }
607
608 void
spa_log_summary_dirty_flushed_metaslab(spa_t * spa,uint64_t txg)609 spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
610 {
611 log_summary_entry_t *target = NULL;
612 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
613 e != NULL; e = list_next(&spa->spa_log_summary, e)) {
614 if (e->lse_start > txg)
615 break;
616 target = e;
617 }
618 ASSERT3P(target, !=, NULL);
619 ASSERT3U(target->lse_mscount, !=, 0);
620 target->lse_msdcount++;
621 }
622
623 /*
624 * This function attempts to estimate how many metaslabs should
625 * we flush to satisfy our block heuristic for the log spacemap
626 * for the upcoming TXGs.
627 *
628 * Specifically, it first tries to estimate the number of incoming
629 * blocks in this TXG. Then by projecting that incoming rate to
630 * future TXGs and using the log summary, it figures out how many
631 * flushes we would need to do for future TXGs individually to
632 * stay below our block limit and returns the maximum number of
633 * flushes from those estimates.
634 */
635 static uint64_t
spa_estimate_metaslabs_to_flush(spa_t * spa)636 spa_estimate_metaslabs_to_flush(spa_t *spa)
637 {
638 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
639 ASSERT3U(spa_sync_pass(spa), ==, 1);
640 ASSERT(spa_log_sm_blocklimit(spa) != 0);
641
642 /*
643 * This variable contains the incoming rate that will be projected
644 * and used for our flushing estimates in the future.
645 */
646 uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
647
648 /*
649 * At any point in time this variable tells us how many
650 * TXGs in the future we are so we can make our estimations.
651 */
652 uint64_t txgs_in_future = 1;
653
654 /*
655 * This variable tells us how much room do we have until we hit
656 * our limit. When it goes negative, it means that we've exceeded
657 * our limit and we need to flush.
658 *
659 * Note that since we start at the first TXG in the future (i.e.
660 * txgs_in_future starts from 1) we already decrement this
661 * variable by the incoming rate.
662 */
663 int64_t available_blocks =
664 spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
665
666 int64_t available_txgs = zfs_unflushed_log_txg_max;
667 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
668 e; e = list_next(&spa->spa_log_summary, e))
669 available_txgs -= e->lse_txgcount;
670
671 /*
672 * This variable tells us the total number of flushes needed to
673 * keep the log size within the limit when we reach txgs_in_future.
674 */
675 uint64_t total_flushes = 0;
676
677 /* Holds the current maximum of our estimates so far. */
678 uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
679
680 /*
681 * For our estimations we only look as far in the future
682 * as the summary allows us.
683 */
684 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
685 e; e = list_next(&spa->spa_log_summary, e)) {
686
687 /*
688 * If there is still room before we exceed our limit
689 * then keep skipping TXGs accumulating more blocks
690 * based on the incoming rate until we exceed it.
691 */
692 if (available_blocks >= 0 && available_txgs >= 0) {
693 uint64_t skip_txgs = (incoming == 0) ?
694 available_txgs + 1 : MIN(available_txgs + 1,
695 (available_blocks / incoming) + 1);
696 available_blocks -= (skip_txgs * incoming);
697 available_txgs -= skip_txgs;
698 txgs_in_future += skip_txgs;
699 ASSERT3S(available_blocks, >=, -incoming);
700 ASSERT3S(available_txgs, >=, -1);
701 }
702
703 /*
704 * At this point we're far enough into the future where
705 * the limit was just exceeded and we flush metaslabs
706 * based on the current entry in the summary, updating
707 * our available_blocks.
708 */
709 ASSERT(available_blocks < 0 || available_txgs < 0);
710 available_blocks += e->lse_blkcount;
711 available_txgs += e->lse_txgcount;
712 total_flushes += e->lse_msdcount;
713
714 /*
715 * Keep the running maximum of the total_flushes that
716 * we've done so far over the number of TXGs in the
717 * future that we are. The idea here is to estimate
718 * the average number of flushes that we should do
719 * every TXG so that when we are that many TXGs in the
720 * future we stay under the limit.
721 */
722 max_flushes_pertxg = MAX(max_flushes_pertxg,
723 DIV_ROUND_UP(total_flushes, txgs_in_future));
724 }
725 return (max_flushes_pertxg);
726 }
727
728 uint64_t
spa_log_sm_memused(spa_t * spa)729 spa_log_sm_memused(spa_t *spa)
730 {
731 return (spa->spa_unflushed_stats.sus_memused);
732 }
733
734 static boolean_t
spa_log_exceeds_memlimit(spa_t * spa)735 spa_log_exceeds_memlimit(spa_t *spa)
736 {
737 if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
738 return (B_TRUE);
739
740 uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
741 zfs_unflushed_max_mem_ppm) / 1000000;
742 if (spa_log_sm_memused(spa) > system_mem_allowed)
743 return (B_TRUE);
744
745 return (B_FALSE);
746 }
747
748 boolean_t
spa_flush_all_logs_requested(spa_t * spa)749 spa_flush_all_logs_requested(spa_t *spa)
750 {
751 return (spa->spa_log_flushall_txg != 0);
752 }
753
/*
 * Flush metaslabs with unflushed changes for the TXG that tx belongs
 * to. Work is only done in the first sync pass, and only when the log
 * space map feature is active and spa_metaslabs_by_flushed is not
 * empty. The number of metaslabs flushed is driven by
 * spa_estimate_metaslabs_to_flush() (or is unbounded when a "flush
 * all" request is pending) and by the memory limit checked in
 * spa_log_exceeds_memlimit().
 */
void
spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);

	/* Flushing only takes place in the first sync pass. */
	if (spa_sync_pass(spa) != 1)
		return;

	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	/*
	 * If we don't have any metaslabs with unflushed changes
	 * return immediately.
	 */
	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
		return;

	/*
	 * During SPA export we leave a few empty TXGs to go by [see
	 * spa_final_dirty_txg() to understand why]. For this specific
	 * case, it is important to not flush any metaslabs as that
	 * would dirty this TXG.
	 *
	 * That said, during one of these dirty TXGs that is less or
	 * equal to spa_final_dirty_txg(), spa_unload() will request that
	 * we try to flush all the metaslabs for that TXG before
	 * exporting the pool, thus we ensure that we didn't get a
	 * request of flushing everything before we attempt to return
	 * immediately.
	 */
	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
	    !spa_flush_all_logs_requested(spa))
		return;

	/*
	 * We need to generate a log space map before flushing because this
	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
	 * for this TXG's flushed metaslab count (aka sls_mscount which is
	 * manipulated in many ways down the metaslab_flush() codepath).
	 *
	 * That is not to say that we may generate a log space map when we
	 * don't need it. If we are flushing metaslabs, that means that we
	 * were going to write changes to disk anyway, so even if we were
	 * not flushing, a log space map would have been created anyway in
	 * metaslab_sync().
	 */
	spa_generate_syncing_log_sm(spa, tx);

	/*
	 * This variable tells us how many metaslabs we want to flush based
	 * on the block-heuristic of our flushing algorithm (see block comment
	 * of log space map feature). We also decrement this as we flush
	 * metaslabs and attempt to destroy old log space maps.
	 */
	uint64_t want_to_flush;
	if (spa_flush_all_logs_requested(spa)) {
		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
		want_to_flush = UINT64_MAX;
	} else {
		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
	}

	/* Used purely for verification purposes */
	uint64_t visited = 0;

	/*
	 * Ideally we would only iterate through spa_metaslabs_by_flushed
	 * using only one variable (curr). We can't do that because
	 * metaslab_flush() mutates position of curr in the AVL when
	 * it flushes that metaslab by moving it to the end of the tree.
	 * Thus we always keep track of the original next node of the
	 * current node (curr) in another variable (next).
	 */
	metaslab_t *next = NULL;
	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
	    curr != NULL; curr = next) {
		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);

		/*
		 * If this metaslab has been flushed this txg then we've done
		 * a full circle over the metaslabs.
		 */
		if (metaslab_unflushed_txg(curr) == txg)
			break;

		/*
		 * If we are done flushing for the block heuristic and the
		 * unflushed changes don't exceed the memory limit just stop.
		 */
		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
			break;

		/*
		 * Only dirty metaslabs are actually flushed (and counted
		 * against want_to_flush); clean ones are just bumped.
		 */
		if (metaslab_unflushed_dirty(curr)) {
			mutex_enter(&curr->ms_sync_lock);
			mutex_enter(&curr->ms_lock);
			metaslab_flush(curr, tx);
			mutex_exit(&curr->ms_lock);
			mutex_exit(&curr->ms_sync_lock);
			if (want_to_flush > 0)
				want_to_flush--;
		} else
			metaslab_unflushed_bump(curr, tx, B_FALSE);

		visited++;
	}
	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);

	spa_log_sm_set_blocklimit(spa);
}
865
866 /*
867 * Close the log space map for this TXG and update the block counts
868 * for the log's in-memory structure and the summary.
869 */
870 void
spa_sync_close_syncing_log_sm(spa_t * spa)871 spa_sync_close_syncing_log_sm(spa_t *spa)
872 {
873 if (spa_syncing_log_sm(spa) == NULL)
874 return;
875 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
876
877 spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
878 ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
879
880 sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
881 spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
882
883 /*
884 * Note that we can't assert that sls_mscount is not 0,
885 * because there is the case where the first metaslab
886 * in spa_metaslabs_by_flushed is loading and we were
887 * not able to flush any metaslabs the current TXG.
888 */
889 ASSERT(sls->sls_nblocks != 0);
890
891 spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
892 spa_log_summary_verify_counts(spa);
893
894 space_map_close(spa->spa_syncing_log_sm);
895 spa->spa_syncing_log_sm = NULL;
896
897 /*
898 * At this point we tried to flush as many metaslabs as we
899 * can as the pool is getting exported. Reset the "flush all"
900 * so the last few TXGs before closing the pool can be empty
901 * (e.g. not dirty).
902 */
903 if (spa_flush_all_logs_requested(spa)) {
904 ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
905 spa->spa_log_flushall_txg = 0;
906 }
907 }
908
909 void
spa_cleanup_old_sm_logs(spa_t * spa,dmu_tx_t * tx)910 spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
911 {
912 objset_t *mos = spa_meta_objset(spa);
913
914 uint64_t spacemap_zap;
915 int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
916 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
917 if (error == ENOENT) {
918 ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
919 return;
920 }
921 VERIFY0(error);
922
923 metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
924 uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
925
926 /* Free all log space maps older than the oldest_flushed_txg. */
927 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
928 sls && sls->sls_txg < oldest_flushed_txg;
929 sls = avl_first(&spa->spa_sm_logs_by_txg)) {
930 ASSERT0(sls->sls_mscount);
931 avl_remove(&spa->spa_sm_logs_by_txg, sls);
932 space_map_free_obj(mos, sls->sls_sm_obj, tx);
933 VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
934 spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
935 spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
936 kmem_free(sls, sizeof (spa_log_sm_t));
937 }
938 }
939
940 static spa_log_sm_t *
spa_log_sm_alloc(uint64_t sm_obj,uint64_t txg)941 spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
942 {
943 spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
944 sls->sls_sm_obj = sm_obj;
945 sls->sls_txg = txg;
946 return (sls);
947 }
948
949 void
spa_generate_syncing_log_sm(spa_t * spa,dmu_tx_t * tx)950 spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
951 {
952 uint64_t txg = dmu_tx_get_txg(tx);
953 objset_t *mos = spa_meta_objset(spa);
954
955 if (spa_syncing_log_sm(spa) != NULL)
956 return;
957
958 if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
959 return;
960
961 uint64_t spacemap_zap;
962 int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
963 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
964 if (error == ENOENT) {
965 ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
966
967 error = 0;
968 spacemap_zap = zap_create(mos,
969 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
970 VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
971 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
972 &spacemap_zap, tx));
973 spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
974 }
975 VERIFY0(error);
976
977 uint64_t sm_obj;
978 ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
979 ==, ENOENT);
980 sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
981 VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
982 avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
983
984 /*
985 * We pass UINT64_MAX as the space map's representation size
986 * and SPA_MINBLOCKSHIFT as the shift, to make the space map
987 * accept any sorts of segments since there's no real advantage
988 * to being more restrictive (given that we're already going
989 * to be using 2-word entries).
990 */
991 VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
992 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
993
994 spa_log_sm_set_blocklimit(spa);
995 }
996
997 /*
998 * Find all the log space maps stored in the space map ZAP and sort
999 * them by their TXG in spa_sm_logs_by_txg.
1000 */
1001 static int
spa_ld_log_sm_metadata(spa_t * spa)1002 spa_ld_log_sm_metadata(spa_t *spa)
1003 {
1004 int error;
1005 uint64_t spacemap_zap;
1006
1007 ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
1008
1009 error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
1010 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
1011 if (error == ENOENT) {
1012 /* the space map ZAP doesn't exist yet */
1013 return (0);
1014 } else if (error != 0) {
1015 spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1016 "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
1017 error);
1018 return (error);
1019 }
1020
1021 zap_cursor_t zc;
1022 zap_attribute_t za;
1023 for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
1024 (error = zap_cursor_retrieve(&zc, &za)) == 0;
1025 zap_cursor_advance(&zc)) {
1026 uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
1027 spa_log_sm_t *sls =
1028 spa_log_sm_alloc(za.za_first_integer, log_txg);
1029 avl_add(&spa->spa_sm_logs_by_txg, sls);
1030 }
1031 zap_cursor_fini(&zc);
1032 if (error != ENOENT) {
1033 spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1034 "zap_cursor_retrieve(spacemap_zap) [error %d]",
1035 error);
1036 return (error);
1037 }
1038
1039 for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1040 m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1041 spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
1042 spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
1043 &target, NULL);
1044
1045 /*
1046 * At this point if sls is zero it means that a bug occurred
1047 * in ZFS the last time the pool was open or earlier in the
1048 * import code path. In general, we would have placed a
1049 * VERIFY() here or in this case just let the kernel panic
1050 * with NULL pointer dereference when incrementing sls_mscount,
1051 * but since this is the import code path we can be a bit more
1052 * lenient. Thus, for DEBUG bits we always cause a panic, while
1053 * in production we log the error and just fail the import.
1054 */
1055 ASSERT(sls != NULL);
1056 if (sls == NULL) {
1057 spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
1058 "encountered: could not find log spacemap for "
1059 "TXG %ld [error %d]",
1060 metaslab_unflushed_txg(m), ENOENT);
1061 return (ENOENT);
1062 }
1063 sls->sls_mscount++;
1064 }
1065
1066 return (0);
1067 }
1068
/*
 * Argument handed to spa_ld_log_sm_cb() through space_map_iterate()
 * while a log space map is being read.
 */
typedef struct spa_ld_log_sm_arg {
	spa_t *slls_spa;	/* pool whose log space maps are loaded */
	uint64_t slls_txg;	/* TXG of the log space map being read */
} spa_ld_log_sm_arg_t;
1073
1074 static int
spa_ld_log_sm_cb(space_map_entry_t * sme,void * arg)1075 spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
1076 {
1077 uint64_t offset = sme->sme_offset;
1078 uint64_t size = sme->sme_run;
1079 uint32_t vdev_id = sme->sme_vdev;
1080
1081 spa_ld_log_sm_arg_t *slls = arg;
1082 spa_t *spa = slls->slls_spa;
1083
1084 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
1085
1086 /*
1087 * If the vdev has been removed (i.e. it is indirect or a hole)
1088 * skip this entry. The contents of this vdev have already moved
1089 * elsewhere.
1090 */
1091 if (!vdev_is_concrete(vd))
1092 return (0);
1093
1094 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1095 ASSERT(!ms->ms_loaded);
1096
1097 /*
1098 * If we have already flushed entries for this TXG to this
1099 * metaslab's space map, then ignore it. Note that we flush
1100 * before processing any allocations/frees for that TXG, so
1101 * the metaslab's space map only has entries from *before*
1102 * the unflushed TXG.
1103 */
1104 if (slls->slls_txg < metaslab_unflushed_txg(ms))
1105 return (0);
1106
1107 switch (sme->sme_type) {
1108 case SM_ALLOC:
1109 range_tree_remove_xor_add_segment(offset, offset + size,
1110 ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
1111 break;
1112 case SM_FREE:
1113 range_tree_remove_xor_add_segment(offset, offset + size,
1114 ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
1115 break;
1116 default:
1117 panic("invalid maptype_t");
1118 break;
1119 }
1120 if (!metaslab_unflushed_dirty(ms)) {
1121 metaslab_set_unflushed_dirty(ms, B_TRUE);
1122 spa_log_summary_dirty_flushed_metaslab(spa,
1123 metaslab_unflushed_txg(ms));
1124 }
1125 return (0);
1126 }
1127
1128 static int
spa_ld_log_sm_data(spa_t * spa)1129 spa_ld_log_sm_data(spa_t *spa)
1130 {
1131 spa_log_sm_t *sls, *psls;
1132 int error = 0;
1133
1134 /*
1135 * If we are not going to do any writes there is no need
1136 * to read the log space maps.
1137 */
1138 if (!spa_writeable(spa))
1139 return (0);
1140
1141 ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
1142 ASSERT0(spa->spa_unflushed_stats.sus_memused);
1143
1144 hrtime_t read_logs_starttime = gethrtime();
1145
1146 /* Prefetch log spacemaps dnodes. */
1147 for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
1148 sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1149 dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
1150 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
1151 }
1152
1153 uint_t pn = 0;
1154 uint64_t ps = 0;
1155 psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
1156 while (sls != NULL) {
1157 /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
1158 if (psls != NULL && pn < 16 &&
1159 (pn < 2 || ps < 2 * dmu_prefetch_max)) {
1160 error = space_map_open(&psls->sls_sm,
1161 spa_meta_objset(spa), psls->sls_sm_obj, 0,
1162 UINT64_MAX, SPA_MINBLOCKSHIFT);
1163 if (error != 0) {
1164 spa_load_failed(spa, "spa_ld_log_sm_data(): "
1165 "failed at space_map_open(obj=%llu) "
1166 "[error %d]",
1167 (u_longlong_t)sls->sls_sm_obj, error);
1168 goto out;
1169 }
1170 dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
1171 0, 0, space_map_length(psls->sls_sm),
1172 ZIO_PRIORITY_ASYNC_READ);
1173 pn++;
1174 ps += space_map_length(psls->sls_sm);
1175 psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
1176 continue;
1177 }
1178
1179 /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
1180 cond_resched();
1181 ASSERT0(sls->sls_nblocks);
1182 sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
1183 spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
1184 summary_add_data(spa, sls->sls_txg,
1185 sls->sls_mscount, 0, sls->sls_nblocks);
1186
1187 struct spa_ld_log_sm_arg vla = {
1188 .slls_spa = spa,
1189 .slls_txg = sls->sls_txg
1190 };
1191 error = space_map_iterate(sls->sls_sm,
1192 space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
1193 if (error != 0) {
1194 spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
1195 "at space_map_iterate(obj=%llu) [error %d]",
1196 (u_longlong_t)sls->sls_sm_obj, error);
1197 goto out;
1198 }
1199
1200 pn--;
1201 ps -= space_map_length(sls->sls_sm);
1202 space_map_close(sls->sls_sm);
1203 sls->sls_sm = NULL;
1204 sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
1205
1206 /* Update log block limits considering just loaded. */
1207 spa_log_sm_set_blocklimit(spa);
1208 }
1209
1210 hrtime_t read_logs_endtime = gethrtime();
1211 spa_load_note(spa,
1212 "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
1213 "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
1214 (u_longlong_t)spa_log_sm_nblocks(spa),
1215 (u_longlong_t)zfs_log_sm_blksz,
1216 (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
1217
1218 out:
1219 if (error != 0) {
1220 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1221 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1222 if (sls->sls_sm) {
1223 space_map_close(sls->sls_sm);
1224 sls->sls_sm = NULL;
1225 }
1226 }
1227 } else {
1228 ASSERT0(pn);
1229 ASSERT0(ps);
1230 }
1231 /*
1232 * Now that the metaslabs contain their unflushed changes:
1233 * [1] recalculate their actual allocated space
1234 * [2] recalculate their weights
1235 * [3] sum up the memory usage of their unflushed range trees
1236 * [4] optionally load them, if debug_load is set
1237 *
1238 * Note that even in the case where we get here because of an
1239 * error (e.g. error != 0), we still want to update the fields
1240 * below in order to have a proper teardown in spa_unload().
1241 */
1242 for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1243 m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1244 mutex_enter(&m->ms_lock);
1245 m->ms_allocated_space = space_map_allocated(m->ms_sm) +
1246 range_tree_space(m->ms_unflushed_allocs) -
1247 range_tree_space(m->ms_unflushed_frees);
1248
1249 vdev_t *vd = m->ms_group->mg_vd;
1250 metaslab_space_update(vd, m->ms_group->mg_class,
1251 range_tree_space(m->ms_unflushed_allocs), 0, 0);
1252 metaslab_space_update(vd, m->ms_group->mg_class,
1253 -range_tree_space(m->ms_unflushed_frees), 0, 0);
1254
1255 ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
1256 metaslab_recalculate_weight_and_sort(m);
1257
1258 spa->spa_unflushed_stats.sus_memused +=
1259 metaslab_unflushed_changes_memused(m);
1260
1261 if (metaslab_debug_load && m->ms_sm != NULL) {
1262 VERIFY0(metaslab_load(m));
1263 metaslab_set_selected_txg(m, 0);
1264 }
1265 mutex_exit(&m->ms_lock);
1266 }
1267
1268 return (error);
1269 }
1270
1271 static int
spa_ld_unflushed_txgs(vdev_t * vd)1272 spa_ld_unflushed_txgs(vdev_t *vd)
1273 {
1274 spa_t *spa = vd->vdev_spa;
1275 objset_t *mos = spa_meta_objset(spa);
1276
1277 if (vd->vdev_top_zap == 0)
1278 return (0);
1279
1280 uint64_t object = 0;
1281 int error = zap_lookup(mos, vd->vdev_top_zap,
1282 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
1283 sizeof (uint64_t), 1, &object);
1284 if (error == ENOENT)
1285 return (0);
1286 else if (error != 0) {
1287 spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
1288 "zap_lookup(vdev_top_zap=%llu) [error %d]",
1289 (u_longlong_t)vd->vdev_top_zap, error);
1290 return (error);
1291 }
1292
1293 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1294 metaslab_t *ms = vd->vdev_ms[m];
1295 ASSERT(ms != NULL);
1296
1297 metaslab_unflushed_phys_t entry;
1298 uint64_t entry_size = sizeof (entry);
1299 uint64_t entry_offset = ms->ms_id * entry_size;
1300
1301 error = dmu_read(mos, object,
1302 entry_offset, entry_size, &entry, 0);
1303 if (error != 0) {
1304 spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
1305 "failed at dmu_read(obj=%llu) [error %d]",
1306 (u_longlong_t)object, error);
1307 return (error);
1308 }
1309
1310 ms->ms_unflushed_txg = entry.msp_unflushed_txg;
1311 ms->ms_unflushed_dirty = B_FALSE;
1312 ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
1313 ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
1314 if (ms->ms_unflushed_txg != 0) {
1315 mutex_enter(&spa->spa_flushed_ms_lock);
1316 avl_add(&spa->spa_metaslabs_by_flushed, ms);
1317 mutex_exit(&spa->spa_flushed_ms_lock);
1318 }
1319 }
1320 return (0);
1321 }
1322
1323 /*
1324 * Read all the log space map entries into their respective
1325 * metaslab unflushed trees and keep them sorted by TXG in the
1326 * SPA's metadata. In addition, setup all the metadata for the
1327 * memory and the block heuristics.
1328 */
1329 int
spa_ld_log_spacemaps(spa_t * spa)1330 spa_ld_log_spacemaps(spa_t *spa)
1331 {
1332 int error;
1333
1334 spa_log_sm_set_blocklimit(spa);
1335
1336 for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1337 vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
1338 error = spa_ld_unflushed_txgs(vd);
1339 if (error != 0)
1340 return (error);
1341 }
1342
1343 error = spa_ld_log_sm_metadata(spa);
1344 if (error != 0)
1345 return (error);
1346
1347 /*
1348 * Note: we don't actually expect anything to change at this point
1349 * but we grab the config lock so we don't fail any assertions
1350 * when using vdev_lookup_top().
1351 */
1352 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1353 error = spa_ld_log_sm_data(spa);
1354 spa_config_exit(spa, SCL_CONFIG, FTAG);
1355
1356 return (error);
1357 }
1358
/*
 * Module parameter declarations for the tunables of the log space map
 * code. Each entry names a zfs_-prefixed variable, its type, its
 * access mode and a human-readable description.
 * NOTE(review): the exact expansion of ZFS_MODULE_PARAM is platform
 * specific and is not visible in this file.
 */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW,
    "Specific hard-limit in memory that ZFS allows to be used for "
    "unflushed changes");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW,
    "Percentage of the overall system memory that ZFS allows to be "
    "used for unflushed changes (value is calculated over 1000000 for "
    "finer granularity)");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW,
    "Hard limit (upper-bound) in the size of the space map log "
    "in terms of blocks.");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
    "Lower-bound limit for the maximum amount of blocks allowed in "
    "log spacemap (see zfs_unflushed_log_block_max)");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW,
    "Hard limit (upper-bound) in the size of the space map log "
    "in terms of dirty TXGs.");

ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
    "Tunable used to determine the number of blocks that can be used for "
    "the spacemap log, expressed as a percentage of the total number of "
    "metaslabs in the pool (e.g. 400 means the number of log blocks is "
    "capped at 4 times the number of metaslabs)");

ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW,
    "The number of past TXGs that the flushing algorithm of the log "
    "spacemap feature uses to estimate incoming log blocks");

ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW,
    "Maximum number of rows allowed in the summary of the spacemap log");

ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW,
    "Minimum number of metaslabs to flush per dirty TXG");

ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
    "Prevent the log spacemaps from being flushed and destroyed "
    "during pool export/destroy");
/* END CSTYLED */
1401