diff --git a/include/sys/spa.h b/include/sys/spa.h index 0de8a1867a12..60067a6f3419 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1150,7 +1150,8 @@ extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_l2cache(spa_t *, uint64_t guid); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva, + uint64_t birth_txg); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_dedup(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 62cf196eeaa4..7934ef813fad 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -346,6 +346,7 @@ struct spa { vdev_raidz_expand_t *spa_raidz_expand; zthr_t *spa_raidz_expand_zthr; + uint64_t spa_raidz_expand_acct_txg; /* txg variable deflate on */ uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 131cfc9cd16b..4428d100fa82 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -133,6 +133,7 @@ extern void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_get_deflate_ratio(vdev_t *vd, uint64_t birth_txg); extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 634594aca124..d7e357222234 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -275,6 +275,7 @@ struct vdev { list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio 
(x512) */ + uint64_t vdev_deflate_ratio_current; /* current geometry */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 56382ca85b55..86c16c0e93f7 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -84,6 +84,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING, SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, SPA_FEATURE_LARGE_MICROZAP, diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index bed2c7979a1b..980adf13a0ff 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -690,7 +690,7 @@ - + @@ -1668,8 +1668,103 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1681,6 +1776,14 @@ + + + + + + + + @@ -4908,6 +5011,12 @@ + + + + + + @@ -5154,6 +5263,19 @@ + + + + + + + + + + + + + @@ -6563,13 +6685,14 @@ - - - - - - - + + + + + + + + @@ -8332,6 +8455,7 @@ + @@ -9948,8 +10072,8 @@ - - + + @@ -10010,7 +10134,7 @@ - + diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 4cd003a718b2..f60eb9ce77dd 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -1489,6 +1489,12 @@ + + + + + + @@ -1503,6 +1509,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1577,6 +1670,8 @@ + + @@ -1592,6 +1687,14 @@ + + + + + + + + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b4404a6eb58d..3308358cf410 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -903,6 +903,22 @@ amount usable 
space in the pool. See .Xr zpool-attach 8 . . +.feature org.openzfs raidz_expansion_accounting no enabled_txg +This feature enables correct per-block space accounting after RAIDZ +expansion. +When a RAIDZ vdev is expanded, the deflation ratio changes because +there are more data disks per parity disk. +Without this feature, all blocks use a single deflation ratio regardless +of when they were written, which causes inaccurate capacity reporting. +With this feature, each block's deflation ratio is determined by the +geometry that was active when it was written, identified via its birth +transaction group. +.Pp +This feature becomes +.Sy active +when a RAIDZ expansion is initiated and will never return to being +.Sy enabled . +. .feature com.delphix redaction_bookmarks no bookmarks extensible_dataset This feature enables the use of redacted .Nm zfs Cm send Ns s , diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6ba9892eeb64..34d6f43113a0 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -761,6 +761,19 @@ zpool_feature_init(void) "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t raidz_expand_acct_deps[] = { + SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING, + "org.openzfs:raidz_expansion_accounting", + "raidz_expansion_accounting", + "Per-block deflation ratio after raidz expansion", + ZFEATURE_FLAG_MOS, + ZFEATURE_TYPE_BOOLEAN, raidz_expand_acct_deps, sfeatures); + } + zfeature_register(SPA_FEATURE_FAST_DEDUP, "com.klarasystems:fast_dedup", "fast_dedup", "Support for advanced deduplication", diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 77b8e814dfd0..017b69583ae0 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -56,8 +56,17 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, ddp->ddp_flat.ddp_dva : 
ddp->ddp_trad[p].ddp_dva; uint64_t dsize = 0; + /* + * Use ddp_phys_birth for the deflation ratio lookup. + * If DDT later added copies after an expansion, those + * new DVAs use the new geometry but share this birth; + * the resulting dsize is slightly approximate for the + * extra copies but this only affects DDT statistics, + * not persistent accounting. + */ + uint64_t birth = ddt_phys_birth(ddp, v); for (int d = 0; d < ndvas; d++) - dsize += dva_get_dsize_sync(spa, &dvas[d]); + dsize += dva_get_dsize_sync(spa, &dvas[d], birth); uint64_t refcnt = ddt_phys_refcnt(ddp, v); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 4397c14b5c77..e6546ab8f4c9 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -422,12 +422,12 @@ spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, */ static void spa_prop_add_metaslab_class(nvlist_t *nv, metaslab_class_t *mc, - zpool_mc_props_t mcp, uint64_t *sizep, uint64_t *allocp, uint64_t *usablep, - uint64_t *usedp) + zpool_mc_props_t mcp, int64_t dspace_correction, + uint64_t *sizep, uint64_t *allocp, uint64_t *usablep, uint64_t *usedp) { uint64_t size = metaslab_class_get_space(mc); uint64_t alloc = metaslab_class_get_alloc(mc); - uint64_t dsize = metaslab_class_get_dspace(mc); + uint64_t dsize = metaslab_class_get_dspace(mc) + dspace_correction; uint64_t dalloc = metaslab_class_get_dalloc(mc); uint64_t cap = (size == 0) ? 0 : (alloc * 100 / size); const zprop_source_t src = ZPROP_SRC_NONE; @@ -489,20 +489,55 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv) if (rvd != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + /* + * For expanded RAIDZ vdevs, the metaslab class dspace + * was computed using the original (txg 0) deflation + * ratio, which understates usable capacity. Compute + * a correction for the normal class using only the + * FREE portion of each vdev. See spa_update_dspace() + * for the rationale. 
+ */ + int64_t normal_dspace_corr = 0; + if (spa_deflate(spa)) { + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + if (vd->vdev_mg == NULL || + vd->vdev_mg->mg_class != mc) + continue; + if (vd->vdev_deflate_ratio_current != 0 && + vd->vdev_deflate_ratio_current != + vd->vdev_deflate_ratio) { + uint64_t space = + vd->vdev_stat.vs_space; + uint64_t aspace = + vd->vdev_stat.vs_alloc; + uint64_t fspace = space - aspace; + int64_t old_fs = (fspace >> + SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + int64_t new_fs = (fspace >> + SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio_current; + normal_dspace_corr += + (new_fs - old_fs); + } + } + } + size = alloc = usable = used = 0; spa_prop_add_metaslab_class(nv, mc, ZPOOL_MC_PROPS_NORMAL, - &size, &alloc, &usable, &used); + normal_dspace_corr, &size, &alloc, &usable, &used); spa_prop_add_metaslab_class(nv, spa_special_class(spa), - ZPOOL_MC_PROPS_SPECIAL, &size, &alloc, &usable, &used); + ZPOOL_MC_PROPS_SPECIAL, 0, &size, &alloc, &usable, &used); spa_prop_add_metaslab_class(nv, spa_dedup_class(spa), - ZPOOL_MC_PROPS_DEDUP, &size, &alloc, &usable, &used); + ZPOOL_MC_PROPS_DEDUP, 0, &size, &alloc, &usable, &used); spa_prop_add_metaslab_class(nv, spa_log_class(spa), - ZPOOL_MC_PROPS_LOG, NULL, NULL, NULL, NULL); + ZPOOL_MC_PROPS_LOG, 0, NULL, NULL, NULL, NULL); spa_prop_add_metaslab_class(nv, spa_embedded_log_class(spa), - ZPOOL_MC_PROPS_ELOG, &size, &alloc, &usable, &used); + ZPOOL_MC_PROPS_ELOG, 0, &size, &alloc, &usable, &used); spa_prop_add_metaslab_class(nv, spa_special_embedded_log_class(spa), ZPOOL_MC_PROPS_SELOG, - &size, &alloc, &usable, &used); + 0, &size, &alloc, &usable, &used); spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); @@ -5297,6 +5332,18 @@ spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } + /* + * Cache the txg at which 
per-block deflate ratio accounting was + * enabled. Blocks born before this txg use the legacy fixed ratio; + * blocks born at or after this txg use per-birth-txg ratios. + */ + if (spa_feature_is_active(spa, + SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING)) { + (void) spa_feature_enabled_txg(spa, + SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING, + &spa->spa_raidz_expand_acct_txg); + } + /* * Encryption was added before bookmark_v2, even though bookmark_v2 * is now a dependency. If this pool has encryption enabled without diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9b110f31f9ed..ec48cef42cdc 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2015,6 +2015,53 @@ void spa_update_dspace(spa_t *spa) { spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa)); + + /* + * For expanded RAIDZ vdevs, the metaslab class dspace was computed + * using the original (txg 0) deflation ratio, which understates the + * usable capacity. Apply a correction using the current geometry + * ratio, but only for the FREE portion of the vdev. + * + * The allocated portion must remain at the old ratio because + * dd_used_bytes (tracked per-block via bp_get_dsize_sync) also + * uses the old ratio for blocks born before the expansion + * accounting feature was enabled. Correcting only the free + * portion keeps spa_rdspace - dd_used_bytes = true available. + * + * As old blocks are rewritten with the new geometry, vs_alloc + * shrinks (less physical space per block) and the correction + * automatically grows, converging to the full correction after + * a complete rewrite. + * + * Only correct vdevs in the normal class, matching spa_rdspace's + * derivation from spa_normal_class(). RAIDZ vdevs are always in + * the normal class, but this keeps the accounting self-consistent. 
+ */ + if (spa_deflate(spa)) { + metaslab_class_t *mc = spa_normal_class(spa); + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + if (vd->vdev_mg == NULL || + vd->vdev_mg->mg_class != mc) + continue; + if (vd->vdev_deflate_ratio_current != 0 && + vd->vdev_deflate_ratio_current != + vd->vdev_deflate_ratio) { + uint64_t space = vd->vdev_stat.vs_space; + uint64_t aspace = vd->vdev_stat.vs_alloc; + uint64_t fspace = space - aspace; + int64_t old_fs = (fspace >> + SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + int64_t new_fs = (fspace >> + SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio_current; + spa->spa_rdspace += (new_fs - old_fs); + } + } + } + if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that @@ -2288,8 +2335,17 @@ spa_set_deadman_synctime(hrtime_t ns) } } +/* + * Compute the deflated size of a DVA. The birth_txg parameter is used to + * determine the correct deflation ratio for RAIDZ vdevs that have been + * expanded. Blocks born under an older (narrower) geometry used more + * physical space per unit of data; using the birth-txg-appropriate ratio + * ensures accurate per-block accounting. For callers that do not have a + * birth txg (or for non-expanded vdevs), the cached vdev_deflate_ratio + * based on the original (txg 0) geometry is used. 
+ */ uint64_t -dva_get_dsize_sync(spa_t *spa, const dva_t *dva) +dva_get_dsize_sync(spa_t *spa, const dva_t *dva, uint64_t birth_txg) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; @@ -2298,9 +2354,10 @@ dva_get_dsize_sync(spa_t *spa, const dva_t *dva) if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); - if (vd != NULL) - dsize = (asize >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; + if (vd != NULL) { + uint64_t ratio = vdev_get_deflate_ratio(vd, birth_txg); + dsize = (asize >> SPA_MINBLOCKSHIFT) * ratio; + } } return (dsize); @@ -2311,8 +2368,16 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; + /* + * Use physical birth: this reflects when the DVAs were actually + * allocated, and thus which RAIDZ geometry was in effect. For + * dedup/clones the logical birth may differ, but the physical + * birth matches the on-disk allocation geometry. + */ + uint64_t birth_txg = BP_GET_PHYSICAL_BIRTH(bp); + for (int d = 0; d < BP_GET_NDVAS(bp); d++) - dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d], birth_txg); return (dsize); } @@ -2321,11 +2386,12 @@ uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; + uint64_t birth_txg = BP_GET_PHYSICAL_BIRTH(bp); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) - dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d], birth_txg); spa_config_exit(spa, SCL_VDEV, FTAG); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index de4161559b4e..51d199d03723 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1390,7 +1390,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; + tvd->vdev_deflate_ratio_current = svd->vdev_deflate_ratio_current; svd->vdev_deflate_ratio = 0; + svd->vdev_deflate_ratio_current = 0; tvd->vdev_islog = 
svd->vdev_islog; svd->vdev_islog = 0; @@ -2076,6 +2078,11 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. + * + * We also compute vdev_deflate_ratio_current using UINT64_MAX (current + * geometry) for use in capacity reporting (vs_dspace). This reflects + * the actual usable capacity for new writes after RAIDZ expansion. + * The txg-0-based ratio is preserved for persistent per-block accounting. */ static void vdev_set_deflate_ratio(vdev_t *vd) @@ -2084,6 +2091,9 @@ vdev_set_deflate_ratio(vdev_t *vd) vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); + vd->vdev_deflate_ratio_current = (1 << 17) / + (vdev_psize_to_asize_txg(vd, 1 << 17, UINT64_MAX) >> + SPA_MINBLOCKSHIFT); } } @@ -5285,6 +5295,60 @@ vdev_deflated_space(vdev_t *vd, int64_t space) return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } +/* + * Return the deflation ratio appropriate for a block born at the given txg. + * For vdevs that have never been expanded (or non-RAIDZ vdevs), this is + * simply the cached vdev_deflate_ratio (based on txg 0 geometry). + * + * For expanded RAIDZ vdevs, the ratio depends on the logical width at the + * time the block was written. The ratio is computed from the block's birth + * txg so that a given block always produces the same deflated size, + * preserving the consistency of persistent accounting (DN_USED_BYTES). + * + * The raidz_expansion_accounting feature records the txg at which per-block + * ratio tracking was enabled. Blocks born before that txg were accounted + * using the legacy fixed ratio, so we must continue using it for those + * blocks to prevent born/free mismatches. 
+ */ +uint64_t +vdev_get_deflate_ratio(vdev_t *vd, uint64_t birth_txg) +{ + /* + * Fast path: if birth_txg is 0 or UINT64_MAX, use the cached ratio + * (based on txg 0 / original geometry). + */ + if (birth_txg == 0 || birth_txg == UINT64_MAX) + return (vd->vdev_deflate_ratio); + + /* + * If the raidz_expansion_accounting feature is not active, or + * this block was born before the feature was enabled, use the + * legacy fixed ratio. This ensures blocks that were accounted + * under the old code are freed with the same ratio they were born + * with. + */ + uint64_t acct_txg = vd->vdev_spa->spa_raidz_expand_acct_txg; + if (acct_txg == 0 || birth_txg < acct_txg) + return (vd->vdev_deflate_ratio); + + uint64_t asize_at_birth = vdev_psize_to_asize_txg(vd, 1 << 17, + birth_txg); + uint64_t asize_at_txg0 = vdev_psize_to_asize_txg(vd, 1 << 17, 0); + + /* + * If the asize at birth matches the original (txg 0) geometry, + * this block was born before any expansion. Use the cached ratio. + */ + if (asize_at_birth == asize_at_txg0) + return (vd->vdev_deflate_ratio); + + /* + * This block was born after an expansion changed the geometry. + * Compute the ratio from the birth-txg-specific asize. + */ + return ((1 << 17) / (asize_at_birth >> SPA_MINBLOCKSHIFT)); +} + /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. 
@@ -6807,6 +6871,7 @@ EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); +EXPORT_SYMBOL(vdev_get_deflate_ratio); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 520ddd692bda..81e398b26f51 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -5129,6 +5129,22 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + /* + * Enable per-block deflation ratio accounting. The enabled_txg + * recorded here marks the boundary: blocks born before it use the + * legacy fixed ratio, blocks born at or after use per-birth-txg + * ratios. Only enable on the first expansion with this code; the + * feature stays active across subsequent expansions. + */ + if (!spa_feature_is_active(spa, + SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING)) { + spa_feature_enable(spa, + SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING, tx); + spa_feature_incr(spa, + SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING, tx); + spa->spa_raidz_expand_acct_txg = dmu_tx_get_txg(tx); + } + vdrz->vd_physical_width++; VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f22f3c759e9b..5163f7ec91c5 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -908,7 +908,9 @@ tags = ['functional', 'redacted_send'] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos', 'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos', 'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg', - 'raidz_expand_007_neg'] + 'raidz_expand_007_neg', 'raidz_expand_008_pos', + 'raidz_expand_009_pos', 'raidz_expand_010_pos', + 'raidz_expand_011_pos', 'raidz_expand_012_pos'] tags = ['functional', 'raidz'] timeout = 1200 diff --git a/tests/zfs-tests/tests/Makefile.am 
b/tests/zfs-tests/tests/Makefile.am index fdf211877e62..374b3e4e3af2 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1875,6 +1875,11 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/raidz/raidz_expand_005_pos.ksh \ functional/raidz/raidz_expand_006_neg.ksh \ functional/raidz/raidz_expand_007_neg.ksh \ + functional/raidz/raidz_expand_008_pos.ksh \ + functional/raidz/raidz_expand_009_pos.ksh \ + functional/raidz/raidz_expand_010_pos.ksh \ + functional/raidz/raidz_expand_011_pos.ksh \ + functional/raidz/raidz_expand_012_pos.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index dcb2b92cc0e1..760d839543ee 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -172,6 +172,7 @@ if is_linux || is_freebsd; then "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" + "feature@raidz_expansion_accounting" "feature@fast_dedup" "feature@longname" "feature@large_microzap" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_008_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_008_pos.ksh new file mode 100755 index 000000000000..5e150265288f --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_008_pos.ksh @@ -0,0 +1,304 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Skountz. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that space accounting is correct after RAIDZ expansion. +# After expanding a RAIDZ vdev, the reported pool capacity should +# reflect the current (post-expansion) geometry, not the original. +# +# STRATEGY: +# 1. Create a RAIDZ2 pool with 3 disks +# 2. Write test data and record the pool size and free space +# 3. Attach a 4th disk to expand the RAIDZ2 vdev +# 4. Wait for expansion to complete +# 5. Verify that the reported pool capacity increased to reflect the +# new geometry (2 data disks instead of 1) +# 6. Verify that reported free space is consistent: +# free space should be approximately (new_capacity - data_used) +# 7. Verify zfs dataset available also reflects the correction +# (exercises spa_update_dspace() code path) +# 8. 
Verify accounting survives export/import +# + +typeset -r devs=4 +typeset -r dev_size_mb=768 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Create disk files +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +pool=$TESTPOOL +opts="-o cachefile=none" + +# Create a RAIDZ2 pool with 3 disks (1 data + 2 parity) +log_must zpool create -f $opts $pool raidz2 ${disks[1..3]} +log_must zfs set primarycache=metadata $pool + +# Write test data +log_must zfs create $pool/fs +log_must dd if=/dev/urandom of=/$pool/fs/testfile bs=1M count=300 +log_must sync_pool $pool + +# Record pre-expansion accounting +typeset pre_size=$(zpool list -Hpo size $pool) +typeset pre_alloc=$(zpool list -Hpo allocated $pool) +typeset pre_free=$(zpool list -Hpo free $pool) +typeset pre_usable=$(zpool list -Hpo usable $pool) +typeset pre_avail=$(zfs get -Hpo value available $pool/fs) + +log_note "Pre-expansion: size=$pre_size alloc=$pre_alloc free=$pre_free usable=$pre_usable avail=$pre_avail" + +# Verify pre-expansion consistency: free should be approximately size - alloc +typeset pre_sum=$(($pre_alloc + $pre_free)) +typeset pre_diff +if [[ $pre_sum -gt $pre_size ]]; then + pre_diff=$(($pre_sum - $pre_size)) +else + pre_diff=$(($pre_size - $pre_sum)) +fi +# Allow 5% tolerance for metadata overhead +typeset tolerance=$(($pre_size / 20)) +if [[ $pre_diff -gt $tolerance ]]; then + log_fail "Pre-expansion: alloc+free differs from size by $pre_diff" \ + "(tolerance: $tolerance)" +fi + +# Expand: attach 4th disk +log_must zpool attach -w $pool raidz2-0 ${disks[4]} + +# Wait for scrub to 
complete if one was triggered +is_pool_scrubbing $pool && wait_scrubbed $pool +sleep 3 + +# Record post-expansion accounting +typeset post_size=$(zpool list -Hpo size $pool) +typeset post_alloc=$(zpool list -Hpo allocated $pool) +typeset post_free=$(zpool list -Hpo free $pool) +typeset post_usable=$(zpool list -Hpo usable $pool) +typeset post_avail=$(zfs get -Hpo value available $pool/fs) + +log_note "Post-expansion: size=$post_size alloc=$post_alloc free=$post_free usable=$post_usable avail=$post_avail" + +# Verify that raw pool size increased +if [[ $post_size -le $pre_size ]]; then + log_fail "Pool size did not increase after expansion" \ + "(before=$pre_size after=$post_size)" +fi + +# Verify the deflation ratio correction via the USABLE property. +# USABLE reflects deflated (parity-adjusted) capacity; SIZE is raw physical. +# +# With 4 disks of ~768MB each: total raw ~ 3072MB +# Old geometry (3 disks RAIDZ2): usable ratio = 1/3 = 33% +# New geometry (4 disks RAIDZ2): usable ratio = 2/4 = 50% +# Expected new usable ~ 3072MB * 0.5 = 1536MB +# Old usable was ~ 2304MB * 0.33 = 768MB (3 disks) +# Usable increase should be ~100%, well above the raw 33%. +# +# Use USABLE (not SIZE) because SIZE is raw physical space that grows +# linearly with the number of disks. USABLE incorporates the deflation +# ratio correction and should reflect the improved data-to-parity ratio. +typeset usable_increase=$(( ($post_usable - $pre_usable) * 100 / $pre_usable )) +log_note "Usable capacity increase: ${usable_increase}%" + +# With RAIDZ2 expanding from 3->4 disks: +# Raw space increases by 33% (1 disk / 3 disks) +# But usable space should increase by more than 33% because the +# data-to-parity ratio improves (from 1/3 to 2/4). If the increase +# is only 33% or less, the deflation ratio is stuck at old geometry. +if [[ $usable_increase -le 33 ]]; then + log_fail "Usable capacity increase too small (${usable_increase}%)." \ + "Expected >33% for RAIDZ2 3->4 expansion." 
\ + "This suggests deflation ratio is using old geometry." +fi + +# Verify the spa_update_dspace() correction via zfs available. +# This exercises a different code path than ZPOOL_PROP_USABLE above: +# zfs available -> dsl_pool_adjustedsize() -> spa_get_dspace() -> +# spa_update_dspace() which applies the same deflation correction. +typeset avail_increase=$(( ($post_avail - $pre_avail) * 100 / $pre_avail )) +log_note "Dataset available increase: ${avail_increase}%" + +if [[ $avail_increase -le 33 ]]; then + log_fail "Dataset available increase too small (${avail_increase}%)." \ + "Expected >33% for RAIDZ2 3->4 expansion." \ + "This suggests spa_update_dspace() correction is not applied." +fi + +# Verify post-expansion consistency: free should be approximately size - alloc +typeset post_sum=$(($post_alloc + $post_free)) +typeset post_diff +if [[ $post_sum -gt $post_size ]]; then + post_diff=$(($post_sum - $post_size)) +else + post_diff=$(($post_size - $post_sum)) +fi +typeset post_tolerance=$(($post_size / 20)) +if [[ $post_diff -gt $post_tolerance ]]; then + log_fail "Post-expansion: alloc+free differs from size by $post_diff" \ + "(tolerance: $post_tolerance)" +fi + +# Verify accounting survives export/import (vdev_deflate_ratio_current +# is recomputed in vdev_set_deflate_ratio() during vdev_open()) +log_must zpool export $pool +log_must zpool import -d $TEST_BASE_DIR $pool + +typeset reimport_size=$(zpool list -Hpo size $pool) +typeset reimport_alloc=$(zpool list -Hpo allocated $pool) +typeset reimport_free=$(zpool list -Hpo free $pool) +typeset reimport_usable=$(zpool list -Hpo usable $pool) +typeset reimport_avail=$(zfs get -Hpo value available $pool/fs) + +log_note "After reimport: size=$reimport_size alloc=$reimport_alloc free=$reimport_free usable=$reimport_usable avail=$reimport_avail" + +if [[ $reimport_size -ne $post_size ]]; then + log_fail "Pool size changed after reimport" \ + "(before=$post_size after=$reimport_size)" +fi + +# Verify that usable 
capacity survives export/import. +# vdev_deflate_ratio_current is not persisted on disk — it is recomputed +# in vdev_set_deflate_ratio() during vdev_open(). The correction in +# spa_prop_get_config() depends on this recomputed value and vs_alloc, +# which can shift slightly across export/import (deferred frees settle). +typeset usable_reimport_diff +if [[ $reimport_usable -gt $post_usable ]]; then + usable_reimport_diff=$(($reimport_usable - $post_usable)) +else + usable_reimport_diff=$(($post_usable - $reimport_usable)) +fi +typeset usable_reimport_tol=$(($post_usable / 100)) +if [[ $usable_reimport_diff -gt $usable_reimport_tol ]]; then + log_fail "Usable capacity changed significantly after reimport" \ + "(before=$post_usable after=$reimport_usable" \ + "diff=$usable_reimport_diff)" +fi + +# Allow 1% tolerance for zfs available — it includes deferred frees +# and other transient state that can shift slightly across export/import. +typeset avail_reimport_diff +if [[ $reimport_avail -gt $post_avail ]]; then + avail_reimport_diff=$(($reimport_avail - $post_avail)) +else + avail_reimport_diff=$(($post_avail - $reimport_avail)) +fi +typeset avail_reimport_tol=$(($post_avail / 100)) +if [[ $avail_reimport_diff -gt $avail_reimport_tol ]]; then + log_fail "Dataset available changed significantly after reimport" \ + "(before=$post_avail after=$reimport_avail diff=$avail_reimport_diff)" +fi + +typeset reimport_sum=$(($reimport_alloc + $reimport_free)) +typeset reimport_diff +if [[ $reimport_sum -gt $reimport_size ]]; then + reimport_diff=$(($reimport_sum - $reimport_size)) +else + reimport_diff=$(($reimport_size - $reimport_sum)) +fi +if [[ $reimport_diff -gt $post_tolerance ]]; then + log_fail "After reimport: alloc+free differs from size by" \ + "$reimport_diff (tolerance: $post_tolerance)" +fi + +verify_pool $pool + +# ============================================================ +# USABLE accuracy: verify the dspace correction excludes allocated space. 
+# +# After expansion, the dspace correction adjusts USABLE to reflect +# the new geometry. If the correction covers total vdev space +# (allocated + free) instead of just the free portion, USABLE is +# inflated and approaches total * R_new even when the pool has +# significant old-geometry data. +# +# With the correct (free-only) correction, USABLE should be +# noticeably below total * R_new because old-geometry blocks +# deflate at the lower R_old. We verify this by computing the +# geometric maximum (total * (ndisks-nparity)/ndisks) and checking +# that USABLE is below it by at least the expected alloc * delta. +# ============================================================ + +typeset post2_alloc=$(zpool list -Hpo allocated $pool) +typeset post2_size=$(zpool list -Hpo size $pool) +typeset post2_usable=$(zpool list -Hpo usable $pool) + +# Geometric max: the usable capacity if correction applied to total. +# RAIDZ2 with 4 disks after expansion: (4 - 2) / 4 = 1/2. +typeset usable_max=$(($post2_size / 2)) + +# The over-correction (alloc * (R_new - R_old)) makes usable approach +# usable_max. With correct free-only correction and >20% pool +# utilization, usable should be well below usable_max. +typeset fill_pct=$(($post2_alloc * 100 / $post2_size)) + +log_note "USABLE check: usable=$post2_usable usable_max=$usable_max" \ + "alloc=$post2_alloc fill=${fill_pct}%" + +if [[ $fill_pct -gt 20 ]]; then + # With >20% fill, the gap between usable and usable_max should + # be at least 5% of usable_max. The buggy total correction + # leaves a gap of <1%; the correct free-only correction leaves + # a gap proportional to fill_pct * (R_new - R_old). + typeset usable_gap=$(($usable_max - $post2_usable)) + typeset min_gap=$(($usable_max * 5 / 100)) + log_note "USABLE gap: gap=$usable_gap min_expected=$min_gap" + if [[ $usable_gap -lt $min_gap ]]; then + log_fail "USABLE ($post2_usable) is within ${usable_gap}" \ + "bytes of geometric max ($usable_max)." 
\ + "Expected gap >= $min_gap for ${fill_pct}% fill." \ + "dspace correction is applied to total instead" \ + "of free space." + fi +fi + +log_pass "RAIDZ expansion space accounting is correct." diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_009_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_009_pos.ksh new file mode 100755 index 000000000000..f12599b501e2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_009_pos.ksh @@ -0,0 +1,216 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026 by Skountz. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that space accounting remains accurate across multiple +# sequential RAIDZ expansions. After each expansion, write data and +# confirm that the reported freespace, consumed space, and write size +# all agree within an acceptable tolerance. +# +# STRATEGY: +# 1. Create a RAIDZ1 pool with 3 disks +# 2. For each additional disk (4th, 5th, 6th): +# a. Record free space before writing +# b. Write a known amount of data +# c. 
Verify that consumed space increased by approximately the +# write size (accounting for parity and metadata overhead) +# d. Verify alloc + free ≈ size +# e. Attach the next disk and wait for expansion +# f. Verify accounting consistency after expansion +# 3. Verify the pool +# + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +# +# Verify that alloc + free ≈ size within 10% tolerance. +# A wider tolerance is needed here because multiple expansions with +# mixed-geometry blocks create more metadata overhead variance. +# +function verify_accounting #