diff --git a/include/sys/spa.h b/include/sys/spa.h
index 0de8a1867a12..60067a6f3419 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1150,7 +1150,8 @@ extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
boolean_t l2cache);
extern boolean_t spa_has_l2cache(spa_t *, uint64_t guid);
extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva,
+ uint64_t birth_txg);
extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
extern boolean_t spa_has_dedup(spa_t *spa);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 62cf196eeaa4..7934ef813fad 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -346,6 +346,7 @@ struct spa {
vdev_raidz_expand_t *spa_raidz_expand;
zthr_t *spa_raidz_expand_zthr;
+ uint64_t spa_raidz_expand_acct_txg; /* txg per-block deflate enabled */
uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */
spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 131cfc9cd16b..4428d100fa82 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -133,6 +133,7 @@ extern void vdev_space_update(vdev_t *vd,
int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
+extern uint64_t vdev_get_deflate_ratio(vdev_t *vd, uint64_t birth_txg);
extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize,
uint64_t txg);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 634594aca124..d7e357222234 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -275,6 +275,7 @@ struct vdev {
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
+ uint64_t vdev_deflate_ratio_current; /* current geometry */
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_noalloc; /* device is passivated? */
uint64_t vdev_removing; /* device is being removed? */
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 56382ca85b55..86c16c0e93f7 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -84,6 +84,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
+ SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING,
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURE_LONGNAME,
SPA_FEATURE_LARGE_MICROZAP,
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index bed2c7979a1b..980adf13a0ff 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -690,7 +690,7 @@
-
+
@@ -1668,8 +1668,103 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -1681,6 +1776,14 @@
+
+
+
+
+
+
+
+
@@ -4908,6 +5011,12 @@
+
+
+
+
+
+
@@ -5154,6 +5263,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -6563,13 +6685,14 @@
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
@@ -8332,6 +8455,7 @@
+
@@ -9948,8 +10072,8 @@
-
-
+
+
@@ -10010,7 +10134,7 @@
-
+
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 4cd003a718b2..f60eb9ce77dd 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -1489,6 +1489,12 @@
+
+
+
+
+
+
@@ -1503,6 +1509,93 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -1577,6 +1670,8 @@
+
+
@@ -1592,6 +1687,14 @@
+
+
+
+
+
+
+
+
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index b4404a6eb58d..3308358cf410 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -903,6 +903,22 @@ amount usable space in the pool.
See
.Xr zpool-attach 8 .
.
+.feature org.openzfs raidz_expansion_accounting no enabled_txg
+This feature enables correct per-block space accounting after RAIDZ
+expansion.
+When a RAIDZ vdev is expanded, the deflation ratio changes because
+there are more data disks per parity disk.
+Without this feature, all blocks use a single deflation ratio regardless
+of when they were written, which causes inaccurate capacity reporting.
+With this feature, each block's deflation ratio is determined by the
+geometry that was active when it was written, identified via its birth
+transaction group.
+.Pp
+This feature becomes
+.Sy active
+when a RAIDZ expansion is initiated and will never return to being
+.Sy enabled .
+.
.feature com.delphix redaction_bookmarks no bookmarks extensible_dataset
This feature enables the use of redacted
.Nm zfs Cm send Ns s ,
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 6ba9892eeb64..34d6f43113a0 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -761,6 +761,19 @@ zpool_feature_init(void)
"Support for raidz expansion",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
+ {
+ static const spa_feature_t raidz_expand_acct_deps[] = {
+ SPA_FEATURE_ENABLED_TXG,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING,
+ "org.openzfs:raidz_expansion_accounting",
+ "raidz_expansion_accounting",
+ "Per-block deflation ratio after raidz expansion",
+ ZFEATURE_FLAG_MOS,
+ ZFEATURE_TYPE_BOOLEAN, raidz_expand_acct_deps, sfeatures);
+ }
+
zfeature_register(SPA_FEATURE_FAST_DEDUP,
"com.klarasystems:fast_dedup", "fast_dedup",
"Support for advanced deduplication",
diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c
index 77b8e814dfd0..017b69583ae0 100644
--- a/module/zfs/ddt_stats.c
+++ b/module/zfs/ddt_stats.c
@@ -56,8 +56,17 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
uint64_t dsize = 0;
+ /*
+ * Use ddp_phys_birth for the deflation ratio lookup.
+ * If DDT later added copies after an expansion, those
+ * new DVAs use the new geometry but share this birth;
+ * the resulting dsize is slightly approximate for the
+ * extra copies but this only affects DDT statistics,
+ * not persistent accounting.
+ */
+ uint64_t birth = ddt_phys_birth(ddp, v);
for (int d = 0; d < ndvas; d++)
- dsize += dva_get_dsize_sync(spa, &dvas[d]);
+ dsize += dva_get_dsize_sync(spa, &dvas[d], birth);
uint64_t refcnt = ddt_phys_refcnt(ddp, v);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 4397c14b5c77..e6546ab8f4c9 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -422,12 +422,12 @@ spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props,
*/
static void
spa_prop_add_metaslab_class(nvlist_t *nv, metaslab_class_t *mc,
- zpool_mc_props_t mcp, uint64_t *sizep, uint64_t *allocp, uint64_t *usablep,
- uint64_t *usedp)
+ zpool_mc_props_t mcp, int64_t dspace_correction,
+ uint64_t *sizep, uint64_t *allocp, uint64_t *usablep, uint64_t *usedp)
{
uint64_t size = metaslab_class_get_space(mc);
uint64_t alloc = metaslab_class_get_alloc(mc);
- uint64_t dsize = metaslab_class_get_dspace(mc);
+ uint64_t dsize = metaslab_class_get_dspace(mc) + dspace_correction;
uint64_t dalloc = metaslab_class_get_dalloc(mc);
uint64_t cap = (size == 0) ? 0 : (alloc * 100 / size);
const zprop_source_t src = ZPROP_SRC_NONE;
@@ -489,20 +489,55 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv)
if (rvd != NULL) {
spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
+ /*
+ * For expanded RAIDZ vdevs, the metaslab class dspace
+ * was computed using the original (txg 0) deflation
+ * ratio, which understates usable capacity. Compute
+ * a correction for the normal class using only the
+ * FREE portion of each vdev. See spa_update_dspace()
+ * for the rationale.
+ */
+ int64_t normal_dspace_corr = 0;
+ if (spa_deflate(spa)) {
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ if (vd->vdev_mg == NULL ||
+ vd->vdev_mg->mg_class != mc)
+ continue;
+ if (vd->vdev_deflate_ratio_current != 0 &&
+ vd->vdev_deflate_ratio_current !=
+ vd->vdev_deflate_ratio) {
+ uint64_t space =
+ vd->vdev_stat.vs_space;
+ uint64_t aspace =
+ vd->vdev_stat.vs_alloc;
+ uint64_t fspace = space - aspace;
+ int64_t old_fs = (fspace >>
+ SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ int64_t new_fs = (fspace >>
+ SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio_current;
+ normal_dspace_corr +=
+ (new_fs - old_fs);
+ }
+ }
+ }
+
size = alloc = usable = used = 0;
spa_prop_add_metaslab_class(nv, mc, ZPOOL_MC_PROPS_NORMAL,
- &size, &alloc, &usable, &used);
+ normal_dspace_corr, &size, &alloc, &usable, &used);
spa_prop_add_metaslab_class(nv, spa_special_class(spa),
- ZPOOL_MC_PROPS_SPECIAL, &size, &alloc, &usable, &used);
+ ZPOOL_MC_PROPS_SPECIAL, 0, &size, &alloc, &usable, &used);
spa_prop_add_metaslab_class(nv, spa_dedup_class(spa),
- ZPOOL_MC_PROPS_DEDUP, &size, &alloc, &usable, &used);
+ ZPOOL_MC_PROPS_DEDUP, 0, &size, &alloc, &usable, &used);
spa_prop_add_metaslab_class(nv, spa_log_class(spa),
- ZPOOL_MC_PROPS_LOG, NULL, NULL, NULL, NULL);
+ ZPOOL_MC_PROPS_LOG, 0, NULL, NULL, NULL, NULL);
spa_prop_add_metaslab_class(nv, spa_embedded_log_class(spa),
- ZPOOL_MC_PROPS_ELOG, &size, &alloc, &usable, &used);
+ ZPOOL_MC_PROPS_ELOG, 0, &size, &alloc, &usable, &used);
spa_prop_add_metaslab_class(nv,
spa_special_embedded_log_class(spa), ZPOOL_MC_PROPS_SELOG,
- &size, &alloc, &usable, &used);
+ 0, &size, &alloc, &usable, &used);
spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
@@ -5297,6 +5332,18 @@ spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
+ /*
+ * Cache the txg at which per-block deflate ratio accounting was
+ * enabled. Blocks born before this txg use the legacy fixed ratio;
+ * blocks born at or after this txg use per-birth-txg ratios.
+ */
+ if (spa_feature_is_active(spa,
+ SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING)) {
+ (void) spa_feature_enabled_txg(spa,
+ SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING,
+ &spa->spa_raidz_expand_acct_txg);
+ }
+
/*
* Encryption was added before bookmark_v2, even though bookmark_v2
* is now a dependency. If this pool has encryption enabled without
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 9b110f31f9ed..ec48cef42cdc 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2015,6 +2015,53 @@ void
spa_update_dspace(spa_t *spa)
{
spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa));
+
+ /*
+ * For expanded RAIDZ vdevs, the metaslab class dspace was computed
+ * using the original (txg 0) deflation ratio, which understates the
+ * usable capacity. Apply a correction using the current geometry
+ * ratio, but only for the FREE portion of the vdev.
+ *
+ * The allocated portion must remain at the old ratio because
+ * dd_used_bytes (tracked per-block via bp_get_dsize_sync) also
+ * uses the old ratio for blocks born before the expansion
+ * accounting feature was enabled. Correcting only the free
+ * portion keeps spa_rdspace - dd_used_bytes = true available.
+ *
+ * As old blocks are rewritten with the new geometry, vs_alloc
+ * shrinks (less physical space per block) and the correction
+ * automatically grows, converging to the full correction after
+ * a complete rewrite.
+ *
+ * Only correct vdevs in the normal class, matching spa_rdspace's
+ * derivation from spa_normal_class(). RAIDZ vdevs are always in
+ * the normal class, but this keeps the accounting self-consistent.
+ */
+ if (spa_deflate(spa)) {
+ metaslab_class_t *mc = spa_normal_class(spa);
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ if (vd->vdev_mg == NULL ||
+ vd->vdev_mg->mg_class != mc)
+ continue;
+ if (vd->vdev_deflate_ratio_current != 0 &&
+ vd->vdev_deflate_ratio_current !=
+ vd->vdev_deflate_ratio) {
+ uint64_t space = vd->vdev_stat.vs_space;
+ uint64_t aspace = vd->vdev_stat.vs_alloc;
+ uint64_t fspace = space - aspace;
+ int64_t old_fs = (fspace >>
+ SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ int64_t new_fs = (fspace >>
+ SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio_current;
+ spa->spa_rdspace += (new_fs - old_fs);
+ }
+ }
+ }
+
if (spa->spa_nonallocating_dspace > 0) {
/*
* Subtract the space provided by all non-allocating vdevs that
@@ -2288,8 +2335,17 @@ spa_set_deadman_synctime(hrtime_t ns)
}
}
+/*
+ * Compute the deflated size of a DVA. The birth_txg parameter is used to
+ * determine the correct deflation ratio for RAIDZ vdevs that have been
+ * expanded. Blocks born under an older (narrower) geometry used more
+ * physical space per unit of data; using the birth-txg-appropriate ratio
+ * ensures accurate per-block accounting. For callers that do not have a
+ * birth txg (or for non-expanded vdevs), the cached vdev_deflate_ratio
+ * based on current geometry is used.
+ */
uint64_t
-dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva, uint64_t birth_txg)
{
uint64_t asize = DVA_GET_ASIZE(dva);
uint64_t dsize = asize;
@@ -2298,9 +2354,10 @@ dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
if (asize != 0 && spa->spa_deflate) {
vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
- if (vd != NULL)
- dsize = (asize >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
+ if (vd != NULL) {
+ uint64_t ratio = vdev_get_deflate_ratio(vd, birth_txg);
+ dsize = (asize >> SPA_MINBLOCKSHIFT) * ratio;
+ }
}
return (dsize);
@@ -2311,8 +2368,16 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
uint64_t dsize = 0;
+ /*
+ * Use physical birth: this reflects when the DVAs were actually
+ * allocated, and thus which RAIDZ geometry was in effect. For
+ * dedup/clones the logical birth may differ, but the physical
+ * birth matches the on-disk allocation geometry.
+ */
+ uint64_t birth_txg = BP_GET_PHYSICAL_BIRTH(bp);
+
for (int d = 0; d < BP_GET_NDVAS(bp); d++)
- dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d], birth_txg);
return (dsize);
}
@@ -2321,11 +2386,12 @@ uint64_t
bp_get_dsize(spa_t *spa, const blkptr_t *bp)
{
uint64_t dsize = 0;
+ uint64_t birth_txg = BP_GET_PHYSICAL_BIRTH(bp);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (int d = 0; d < BP_GET_NDVAS(bp); d++)
- dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d], birth_txg);
spa_config_exit(spa, SCL_VDEV, FTAG);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index de4161559b4e..51d199d03723 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1390,7 +1390,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
}
tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
+ tvd->vdev_deflate_ratio_current = svd->vdev_deflate_ratio_current;
svd->vdev_deflate_ratio = 0;
+ svd->vdev_deflate_ratio_current = 0;
tvd->vdev_islog = svd->vdev_islog;
svd->vdev_islog = 0;
@@ -2076,6 +2078,11 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
* account for existing bp's. We also hard-code txg 0 for the same reason
* since expanded RAIDZ vdevs can use a different asize for different birth
* txg's.
+ *
+ * We also compute vdev_deflate_ratio_current using UINT64_MAX (current
+ * geometry) for use in capacity reporting (vs_dspace). This reflects
+ * the actual usable capacity for new writes after RAIDZ expansion.
+ * The txg-0-based ratio is preserved for persistent per-block accounting.
*/
static void
vdev_set_deflate_ratio(vdev_t *vd)
@@ -2084,6 +2091,9 @@ vdev_set_deflate_ratio(vdev_t *vd)
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
SPA_MINBLOCKSHIFT);
+ vd->vdev_deflate_ratio_current = (1 << 17) /
+ (vdev_psize_to_asize_txg(vd, 1 << 17, UINT64_MAX) >>
+ SPA_MINBLOCKSHIFT);
}
}
@@ -5285,6 +5295,60 @@ vdev_deflated_space(vdev_t *vd, int64_t space)
return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
}
+/*
+ * Return the deflation ratio appropriate for a block born at the given txg.
+ * For vdevs that have never been expanded (or non-RAIDZ vdevs), this is
+ * simply the cached vdev_deflate_ratio (based on txg 0 geometry).
+ *
+ * For expanded RAIDZ vdevs, the ratio depends on the logical width at the
+ * time the block was written. The ratio is computed from the block's birth
+ * txg so that a given block always produces the same deflated size,
+ * preserving the consistency of persistent accounting (DN_USED_BYTES).
+ *
+ * The raidz_expansion_accounting feature records the txg at which per-block
+ * ratio tracking was enabled. Blocks born before that txg were accounted
+ * using the legacy fixed ratio, so we must continue using it for those
+ * blocks to prevent born/free mismatches.
+ */
+uint64_t
+vdev_get_deflate_ratio(vdev_t *vd, uint64_t birth_txg)
+{
+ /*
+ * Fast path: if birth_txg is 0 or UINT64_MAX, use the cached ratio
+ * (based on txg 0 / original geometry).
+ */
+ if (birth_txg == 0 || birth_txg == UINT64_MAX)
+ return (vd->vdev_deflate_ratio);
+
+ /*
+ * If the raidz_expansion_accounting feature is not active, or
+ * this block was born before the feature was enabled, use the
+ * legacy fixed ratio. This ensures blocks that were accounted
+ * under the old code are freed with the same ratio they were born
+ * with.
+ */
+ uint64_t acct_txg = vd->vdev_spa->spa_raidz_expand_acct_txg;
+ if (acct_txg == 0 || birth_txg < acct_txg)
+ return (vd->vdev_deflate_ratio);
+
+ uint64_t asize_at_birth = vdev_psize_to_asize_txg(vd, 1 << 17,
+ birth_txg);
+ uint64_t asize_at_txg0 = vdev_psize_to_asize_txg(vd, 1 << 17, 0);
+
+ /*
+ * If the asize at birth matches the original (txg 0) geometry,
+ * this block was born before any expansion. Use the cached ratio.
+ */
+ if (asize_at_birth == asize_at_txg0)
+ return (vd->vdev_deflate_ratio);
+
+ /*
+ * This block was born after an expansion changed the geometry.
+ * Compute the ratio from the birth-txg-specific asize.
+ */
+ return ((1 << 17) / (asize_at_birth >> SPA_MINBLOCKSHIFT));
+}
+
/*
* Update the in-core space usage stats for this vdev, its metaslab class,
* and the root vdev.
@@ -6807,6 +6871,7 @@ EXPORT_SYMBOL(vdev_degrade);
EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
+EXPORT_SYMBOL(vdev_get_deflate_ratio);
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
"Target number of metaslabs per top-level vdev");
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 520ddd692bda..81e398b26f51 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -5129,6 +5129,22 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
+ /*
+ * Enable per-block deflation ratio accounting. The enabled_txg
+ * recorded here marks the boundary: blocks born before it use the
+ * legacy fixed ratio, blocks born at or after use per-birth-txg
+ * ratios. Only enable on the first expansion with this code; the
+ * feature stays active across subsequent expansions.
+ */
+ if (!spa_feature_is_active(spa,
+ SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING)) {
+ spa_feature_enable(spa,
+ SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING, tx);
+ spa_feature_incr(spa,
+ SPA_FEATURE_RAIDZ_EXPANSION_ACCOUNTING, tx);
+ spa->spa_raidz_expand_acct_txg = dmu_tx_get_txg(tx);
+ }
+
vdrz->vd_physical_width++;
VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index f22f3c759e9b..5163f7ec91c5 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -908,7 +908,9 @@ tags = ['functional', 'redacted_send']
tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos',
'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos',
'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg',
- 'raidz_expand_007_neg']
+ 'raidz_expand_007_neg', 'raidz_expand_008_pos',
+ 'raidz_expand_009_pos', 'raidz_expand_010_pos',
+ 'raidz_expand_011_pos', 'raidz_expand_012_pos']
tags = ['functional', 'raidz']
timeout = 1200
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index fdf211877e62..374b3e4e3af2 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1875,6 +1875,11 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/raidz/raidz_expand_005_pos.ksh \
functional/raidz/raidz_expand_006_neg.ksh \
functional/raidz/raidz_expand_007_neg.ksh \
+ functional/raidz/raidz_expand_008_pos.ksh \
+ functional/raidz/raidz_expand_009_pos.ksh \
+ functional/raidz/raidz_expand_010_pos.ksh \
+ functional/raidz/raidz_expand_011_pos.ksh \
+ functional/raidz/raidz_expand_012_pos.ksh \
functional/raidz/setup.ksh \
functional/redacted_send/cleanup.ksh \
functional/redacted_send/redacted_compressed.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index dcb2b92cc0e1..760d839543ee 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -172,6 +172,7 @@ if is_linux || is_freebsd; then
"feature@block_cloning"
"feature@vdev_zaps_v2"
"feature@raidz_expansion"
+ "feature@raidz_expansion_accounting"
"feature@fast_dedup"
"feature@longname"
"feature@large_microzap"
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_008_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_008_pos.ksh
new file mode 100755
index 000000000000..5e150265288f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_008_pos.ksh
@@ -0,0 +1,304 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Skountz. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify that space accounting is correct after RAIDZ expansion.
+# After expanding a RAIDZ vdev, the reported pool capacity should
+# reflect the current (post-expansion) geometry, not the original.
+#
+# STRATEGY:
+# 1. Create a RAIDZ2 pool with 3 disks
+# 2. Write test data and record the pool size and free space
+# 3. Attach a 4th disk to expand the RAIDZ2 vdev
+# 4. Wait for expansion to complete
+# 5. Verify that the reported pool capacity increased to reflect the
+# new geometry (2 data disks instead of 1)
+# 6. Verify that reported free space is consistent:
+# free space should be approximately (new_capacity - data_used)
+# 7. Verify zfs dataset available also reflects the correction
+# (exercises spa_update_dspace() code path)
+# 8. Verify accounting survives export/import
+#
+
+typeset -r devs=4
+typeset -r dev_size_mb=768
+
+typeset -a disks
+
+prefetch_disable=$(get_tunable PREFETCH_DISABLE)
+
+function cleanup
+{
+ poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"
+
+ for i in {0..$devs}; do
+ log_must rm -f "$TEST_BASE_DIR/dev-$i"
+ done
+
+ log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable
+}
+
+log_onexit cleanup
+
+log_must set_tunable32 PREFETCH_DISABLE 1
+
+# Create disk files
+for i in {0..$(($devs))}; do
+ device=$TEST_BASE_DIR/dev-$i
+ log_must truncate -s ${dev_size_mb}M $device
+ disks[${#disks[*]}+1]=$device
+done
+
+pool=$TESTPOOL
+opts="-o cachefile=none"
+
+# Create a RAIDZ2 pool with 3 disks (1 data + 2 parity)
+log_must zpool create -f $opts $pool raidz2 ${disks[1..3]}
+log_must zfs set primarycache=metadata $pool
+
+# Write test data
+log_must zfs create $pool/fs
+log_must dd if=/dev/urandom of=/$pool/fs/testfile bs=1M count=300
+log_must sync_pool $pool
+
+# Record pre-expansion accounting
+typeset pre_size=$(zpool list -Hpo size $pool)
+typeset pre_alloc=$(zpool list -Hpo allocated $pool)
+typeset pre_free=$(zpool list -Hpo free $pool)
+typeset pre_usable=$(zpool list -Hpo usable $pool)
+typeset pre_avail=$(zfs get -Hpo value available $pool/fs)
+
+log_note "Pre-expansion: size=$pre_size alloc=$pre_alloc free=$pre_free usable=$pre_usable avail=$pre_avail"
+
+# Verify pre-expansion consistency: free should be approximately size - alloc
+typeset pre_sum=$(($pre_alloc + $pre_free))
+typeset pre_diff
+if [[ $pre_sum -gt $pre_size ]]; then
+ pre_diff=$(($pre_sum - $pre_size))
+else
+ pre_diff=$(($pre_size - $pre_sum))
+fi
+# Allow 5% tolerance for metadata overhead
+typeset tolerance=$(($pre_size / 20))
+if [[ $pre_diff -gt $tolerance ]]; then
+ log_fail "Pre-expansion: alloc+free differs from size by $pre_diff" \
+ "(tolerance: $tolerance)"
+fi
+
+# Expand: attach 4th disk
+log_must zpool attach -w $pool raidz2-0 ${disks[4]}
+
+# Wait for scrub to complete if one was triggered
+is_pool_scrubbing $pool && wait_scrubbed $pool
+sleep 3
+
+# Record post-expansion accounting
+typeset post_size=$(zpool list -Hpo size $pool)
+typeset post_alloc=$(zpool list -Hpo allocated $pool)
+typeset post_free=$(zpool list -Hpo free $pool)
+typeset post_usable=$(zpool list -Hpo usable $pool)
+typeset post_avail=$(zfs get -Hpo value available $pool/fs)
+
+log_note "Post-expansion: size=$post_size alloc=$post_alloc free=$post_free usable=$post_usable avail=$post_avail"
+
+# Verify that raw pool size increased
+if [[ $post_size -le $pre_size ]]; then
+ log_fail "Pool size did not increase after expansion" \
+ "(before=$pre_size after=$post_size)"
+fi
+
+# Verify the deflation ratio correction via the USABLE property.
+# USABLE reflects deflated (parity-adjusted) capacity; SIZE is raw physical.
+#
+# With 4 disks of ~768MB each: total raw ~ 3072MB
+# Old geometry (3 disks RAIDZ2): usable ratio = 1/3 = 33%
+# New geometry (4 disks RAIDZ2): usable ratio = 2/4 = 50%
+# Expected new usable ~ 3072MB * 0.5 = 1536MB
+# Old usable was ~ 2304MB * 0.33 = 768MB (3 disks)
+# Usable increase should be ~100%, well above the raw 33%.
+#
+# Use USABLE (not SIZE) because SIZE is raw physical space that grows
+# linearly with the number of disks. USABLE incorporates the deflation
+# ratio correction and should reflect the improved data-to-parity ratio.
+typeset usable_increase=$(( ($post_usable - $pre_usable) * 100 / $pre_usable ))
+log_note "Usable capacity increase: ${usable_increase}%"
+
+# With RAIDZ2 expanding from 3->4 disks:
+# Raw space increases by 33% (1 disk / 3 disks)
+# But usable space should increase by more than 33% because the
+# data-to-parity ratio improves (from 1/3 to 2/4). If the increase
+# is only 33% or less, the deflation ratio is stuck at old geometry.
+if [[ $usable_increase -le 33 ]]; then
+ log_fail "Usable capacity increase too small (${usable_increase}%)." \
+ "Expected >33% for RAIDZ2 3->4 expansion." \
+ "This suggests deflation ratio is using old geometry."
+fi
+
+# Verify the spa_update_dspace() correction via zfs available.
+# This exercises a different code path than ZPOOL_PROP_USABLE above:
+# zfs available -> dsl_pool_adjustedsize() -> spa_get_dspace() ->
+# spa_update_dspace() which applies the same deflation correction.
+typeset avail_increase=$(( ($post_avail - $pre_avail) * 100 / $pre_avail ))
+log_note "Dataset available increase: ${avail_increase}%"
+
+if [[ $avail_increase -le 33 ]]; then
+ log_fail "Dataset available increase too small (${avail_increase}%)." \
+ "Expected >33% for RAIDZ2 3->4 expansion." \
+ "This suggests spa_update_dspace() correction is not applied."
+fi
+
+# Verify post-expansion consistency: free should be approximately size - alloc
+typeset post_sum=$(($post_alloc + $post_free))
+typeset post_diff
+if [[ $post_sum -gt $post_size ]]; then
+ post_diff=$(($post_sum - $post_size))
+else
+ post_diff=$(($post_size - $post_sum))
+fi
+typeset post_tolerance=$(($post_size / 20))
+if [[ $post_diff -gt $post_tolerance ]]; then
+ log_fail "Post-expansion: alloc+free differs from size by $post_diff" \
+ "(tolerance: $post_tolerance)"
+fi
+
+# Verify accounting survives export/import (vdev_deflate_ratio_current
+# is recomputed in vdev_set_deflate_ratio() during vdev_open())
+log_must zpool export $pool
+log_must zpool import -d $TEST_BASE_DIR $pool
+
+typeset reimport_size=$(zpool list -Hpo size $pool)
+typeset reimport_alloc=$(zpool list -Hpo allocated $pool)
+typeset reimport_free=$(zpool list -Hpo free $pool)
+typeset reimport_usable=$(zpool list -Hpo usable $pool)
+typeset reimport_avail=$(zfs get -Hpo value available $pool/fs)
+
+log_note "After reimport: size=$reimport_size alloc=$reimport_alloc free=$reimport_free usable=$reimport_usable avail=$reimport_avail"
+
+if [[ $reimport_size -ne $post_size ]]; then
+ log_fail "Pool size changed after reimport" \
+ "(before=$post_size after=$reimport_size)"
+fi
+
+# Verify that usable capacity survives export/import.
+# vdev_deflate_ratio_current is not persisted on disk — it is recomputed
+# in vdev_set_deflate_ratio() during vdev_open(). The correction in
+# spa_prop_get_config() depends on this recomputed value and vs_alloc,
+# which can shift slightly across export/import (deferred frees settle).
+typeset usable_reimport_diff
+if [[ $reimport_usable -gt $post_usable ]]; then
+ usable_reimport_diff=$(($reimport_usable - $post_usable))
+else
+ usable_reimport_diff=$(($post_usable - $reimport_usable))
+fi
+typeset usable_reimport_tol=$(($post_usable / 100))
+if [[ $usable_reimport_diff -gt $usable_reimport_tol ]]; then
+ log_fail "Usable capacity changed significantly after reimport" \
+ "(before=$post_usable after=$reimport_usable" \
+ "diff=$usable_reimport_diff)"
+fi
+
+# Allow 1% tolerance for zfs available — it includes deferred frees
+# and other transient state that can shift slightly across export/import.
+typeset avail_reimport_diff
+if [[ $reimport_avail -gt $post_avail ]]; then
+ avail_reimport_diff=$(($reimport_avail - $post_avail))
+else
+ avail_reimport_diff=$(($post_avail - $reimport_avail))
+fi
+typeset avail_reimport_tol=$(($post_avail / 100))
+if [[ $avail_reimport_diff -gt $avail_reimport_tol ]]; then
+ log_fail "Dataset available changed significantly after reimport" \
+ "(before=$post_avail after=$reimport_avail diff=$avail_reimport_diff)"
+fi
+
+typeset reimport_sum=$(($reimport_alloc + $reimport_free))
+typeset reimport_diff
+if [[ $reimport_sum -gt $reimport_size ]]; then
+ reimport_diff=$(($reimport_sum - $reimport_size))
+else
+ reimport_diff=$(($reimport_size - $reimport_sum))
+fi
+if [[ $reimport_diff -gt $post_tolerance ]]; then
+ log_fail "After reimport: alloc+free differs from size by" \
+ "$reimport_diff (tolerance: $post_tolerance)"
+fi
+
+verify_pool $pool
+
+# ============================================================
+# USABLE accuracy: verify the dspace correction excludes allocated space.
+#
+# After expansion, the dspace correction adjusts USABLE to reflect
+# the new geometry. If the correction covers total vdev space
+# (allocated + free) instead of just the free portion, USABLE is
+# inflated and approaches total * R_new even when the pool has
+# significant old-geometry data.
+#
+# With the correct (free-only) correction, USABLE should be
+# noticeably below total * R_new because old-geometry blocks
+# deflate at the lower R_old. We verify this by computing the
+# geometric maximum (total * (ndisks-nparity)/ndisks) and checking
+# that USABLE is below it by at least the expected alloc * delta.
+# ============================================================
+
+typeset post2_alloc=$(zpool list -Hpo allocated $pool)
+typeset post2_size=$(zpool list -Hpo size $pool)
+typeset post2_usable=$(zpool list -Hpo usable $pool)
+
+# Geometric max: the usable capacity if correction applied to total.
+# RAIDZ2 with 4 disks after expansion: (4 - 2) / 4 = 1/2.
+typeset usable_max=$(($post2_size / 2))
+
+# The over-correction (alloc * (R_new - R_old)) makes usable approach
+# usable_max. With correct free-only correction and >20% pool
+# utilization, usable should be well below usable_max.
+typeset fill_pct=$(($post2_alloc * 100 / $post2_size))
+
+log_note "USABLE check: usable=$post2_usable usable_max=$usable_max" \
+ "alloc=$post2_alloc fill=${fill_pct}%"
+
+if [[ $fill_pct -gt 20 ]]; then
+ # With >20% fill, the gap between usable and usable_max should
+ # be at least 5% of usable_max. The buggy total correction
+ # leaves a gap of <1%; the correct free-only correction leaves
+ # a gap proportional to fill_pct * (R_new - R_old).
+ typeset usable_gap=$(($usable_max - $post2_usable))
+ typeset min_gap=$(($usable_max * 5 / 100))
+ log_note "USABLE gap: gap=$usable_gap min_expected=$min_gap"
+ if [[ $usable_gap -lt $min_gap ]]; then
+ log_fail "USABLE ($post2_usable) is within ${usable_gap}" \
+ "bytes of geometric max ($usable_max)." \
+ "Expected gap >= $min_gap for ${fill_pct}% fill." \
+ "dspace correction is applied to total instead" \
+ "of free space."
+ fi
+fi
+
+log_pass "RAIDZ expansion space accounting is correct."
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_009_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_009_pos.ksh
new file mode 100755
index 000000000000..f12599b501e2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_009_pos.ksh
@@ -0,0 +1,216 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2026 by Skountz. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify that space accounting remains accurate across multiple
+# sequential RAIDZ expansions. After each expansion, write data and
+# confirm that the reported freespace, consumed space, and write size
+# all agree within an acceptable tolerance.
+#
+# STRATEGY:
+# 1. Create a RAIDZ1 pool with 3 disks
+# 2. For each additional disk (4th, 5th, 6th):
+# a. Record free space before writing
+# b. Write a known amount of data
+# c. Verify that consumed space increased by approximately the
+# write size (accounting for parity and metadata overhead)
+# d. Verify alloc + free ≈ size
+# e. Attach the next disk and wait for expansion
+# f. Verify accounting consistency after expansion
+# 3. Verify the pool
+#
+
+typeset -r devs=6
+typeset -r dev_size_mb=512
+
+typeset -a disks
+
+prefetch_disable=$(get_tunable PREFETCH_DISABLE)
+
+function cleanup
+{
+	poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"	# tear down pool if the test left it behind
+
+	for i in {0..$devs}; do	# remove every file-backed vdev (dev-0 .. dev-$devs)
+		log_must rm -f "$TEST_BASE_DIR/dev-$i"
+	done
+
+	log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable	# restore the tunable value saved at script start
+}
+
+#
+# Verify that alloc + free ≈ size within 10% tolerance.
+# A wider tolerance is needed here because multiple expansions with
+# mixed-geometry blocks create more metadata overhead variance.
+#
+function verify_accounting #