Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,19 @@ extern dnode_sums_t dnode_sums;

#endif

/*
 * Assert that we are not modifying the range tree for the syncing TXG from
 * a non-syncing thread. We verify that either the transaction group is
 * strictly newer than the one currently syncing (meaning the tree is being
 * modified in open context), OR the current thread is the pool's sync
 * thread itself (dp_tx.tx_sync_thread). If this triggers, it indicates a
 * race where dn_free_ranges is being modified while dnode_sync() may be
 * iterating over it.
 *
 * NOTE(review): callers are expected to hold dn->dn_mtx when invoking this
 * (both call sites visible here do) — confirm for any new call site.
 */
#define FREE_RANGE_VERIFY(tx, dn) \
ASSERT((tx)->tx_txg > spa_syncing_txg((dn)->dn_objset->os_spa) || \
dmu_objset_pool((dn)->dn_objset)->dp_tx.tx_sync_thread == \
curthread)

#ifdef __cplusplus
}
#endif
Expand Down
12 changes: 12 additions & 0 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -2201,6 +2201,17 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)

mutex_enter(&dn->dn_mtx);
int txgoff = tx->tx_txg & TXG_MASK;

/*
* Assert that we are not modifying the range tree for the syncing
* TXG from a non-syncing thread. We verify that the tx's
* transaction group is strictly newer than the one currently
* syncing (meaning we are in open context). If this triggers,
* it indicates a race where the syncing dn_free_ranges tree is
* being modified while dnode_sync() may be iterating over it.
*/
ASSERT(tx->tx_txg > spa_syncing_txg(dn->dn_objset->os_spa));

if (dn->dn_free_ranges[txgoff] != NULL) {
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
}
Expand Down Expand Up @@ -2388,6 +2399,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_blkid != DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
if (dn->dn_free_ranges[txgoff] != NULL) {
FREE_RANGE_VERIFY(tx, dn);
zfs_range_tree_clear(dn->dn_free_ranges[txgoff],
db->db_blkid, 1);
}
Expand Down
2 changes: 2 additions & 0 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2409,6 +2409,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
mutex_enter(&dn->dn_mtx);
{
int txgoff = tx->tx_txg & TXG_MASK;

FREE_RANGE_VERIFY(tx, dn);
if (dn->dn_free_ranges[txgoff] == NULL) {
dn->dn_free_ranges[txgoff] =
zfs_range_tree_create_flags(
Expand Down
105 changes: 60 additions & 45 deletions module/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -440,24 +440,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
}
}

/*
 * Callback state for dnode_sync_free_range(): the dnode whose ranges are
 * being freed, the syncing transaction, and whether indirect blocks
 * should be freed as well.
 */
typedef struct dnode_sync_free_range_arg {
	dnode_t *dsfra_dnode;		/* dnode being synced */
	dmu_tx_t *dsfra_tx;		/* syncing transaction */
	boolean_t dsfra_free_indirects;	/* also free indirect blocks */
} dnode_sync_free_range_arg_t;

/*
 * Range-tree walk callback: free the blocks in [blkid, blkid + nblks).
 *
 * Entered with dn_mtx held by the walking caller (see dnode_sync()); the
 * lock is dropped around dnode_sync_free_range_impl() and re-acquired
 * before returning to the walk.
 *
 * NOTE(review): dropping dn_mtx mid-walk leaves a window in which another
 * thread could modify the tree being walked — this is exactly the race
 * the replacement dnode_sync_free_ranges() is designed to close.
 */
static void
dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
{
	dnode_sync_free_range_arg_t *dsfra = arg;
	dnode_t *dn = dsfra->dsfra_dnode;

	mutex_exit(&dn->dn_mtx);
	dnode_sync_free_range_impl(dn, blkid, nblks,
	    dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
	mutex_enter(&dn->dn_mtx);
}

/*
* Try to kick all the dnode's dbufs out of the cache...
*/
Expand Down Expand Up @@ -634,6 +616,64 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
*/
}

/*
* We cannot simply detach the range tree (set dn_free_ranges to NULL)
* before processing it because dnode_block_freed() relies on it to
* correctly identify blocks that have been freed in the current TXG
* (for dbuf_read() calls on holes). If we detached it early, a concurrent
* reader might see the block as valid on disk and return stale data
* instead of zeros.
*
* We also can't use zfs_range_tree_walk() nor zfs_range_tree_vacate()
* with a callback that drops dn_mtx (dnode_sync_free_range()). This is
* unsafe because another thread (spa_sync_deferred_frees() ->
* dnode_free_range()) could acquire dn_mtx and modify the tree while the
* walk or vacate was in progress. This leads to tree corruption or panic
* when we resume.
*
* To fix the race while maintaining visibility, we process the tree
* incrementally. We pick a segment, drop the lock to sync it, and
* re-acquire the lock to remove it. By always restarting from the head
* of the tree, we ensure we are never using an invalid iterator.
* We use zfs_range_tree_clear() instead of ..._remove() because the range
* might have already been removed while the lock was dropped (specifically
* in the dbuf_dirty path mentioned above). ..._clear() handles this
* gracefully, while ..._remove() would panic on a missing segment.
*/
/*
 * Free all block ranges recorded in dn_free_ranges[] for this TXG; see the
 * long comment above for why the tree is consumed segment-by-segment while
 * repeatedly dropping and re-taking dn_mtx, rather than walked or vacated.
 */
static void
dnode_sync_free_ranges(dnode_t *dn, dmu_tx_t *tx)
{
	int txgoff = tx->tx_txg & TXG_MASK;

	mutex_enter(&dn->dn_mtx);
	zfs_range_tree_t *rt = dn->dn_free_ranges[txgoff];
	if (rt != NULL) {
		/* Whether the entire dnode is being freed in this TXG. */
		boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
		    dn->dn_free_txg <= tx->tx_txg;
		zfs_range_seg_t *rs;

		if (freeing_dnode) {
			/* A freed dnode must have its whole range queued. */
			ASSERT(zfs_range_tree_contains(rt, 0,
			    dn->dn_maxblkid + 1));
		}

		/*
		 * Always restart from the head of the tree: any iterator
		 * held across the mutex_exit() below could be invalidated
		 * by a concurrent modification of the tree.
		 */
		while ((rs = zfs_range_tree_first(rt)) != NULL) {
			uint64_t start = zfs_rs_get_start(rs, rt);
			uint64_t size = zfs_rs_get_end(rs, rt) - start;

			/* Drop dn_mtx while freeing; may block on I/O. */
			mutex_exit(&dn->dn_mtx);
			dnode_sync_free_range_impl(dn, start, size,
			    freeing_dnode, tx);
			mutex_enter(&dn->dn_mtx);

			/*
			 * ..._clear() (not ..._remove()): the segment may
			 * already have been partially removed while the
			 * lock was dropped (dbuf_dirty path); clear()
			 * tolerates a missing segment, remove() panics.
			 */
			zfs_range_tree_clear(rt, start, size);
		}
		zfs_range_tree_destroy(rt);
		/* Detach only once empty, so dnode_block_freed() stays correct. */
		dn->dn_free_ranges[txgoff] = NULL;
	}
	mutex_exit(&dn->dn_mtx);
}

/*
* Write out the dnode's dirty buffers.
* Does not wait for zio completions.
Expand Down Expand Up @@ -781,32 +821,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
}

/* process all the "freed" ranges in the file */
if (dn->dn_free_ranges[txgoff] != NULL) {
dnode_sync_free_range_arg_t dsfra;
dsfra.dsfra_dnode = dn;
dsfra.dsfra_tx = tx;
dsfra.dsfra_free_indirects = freeing_dnode;
mutex_enter(&dn->dn_mtx);
if (freeing_dnode) {
ASSERT(zfs_range_tree_contains(
dn->dn_free_ranges[txgoff], 0,
dn->dn_maxblkid + 1));
}
/*
* Because dnode_sync_free_range() must drop dn_mtx during its
* processing, using it as a callback to zfs_range_tree_vacate()
* is not safe. No other operations (besides destroy) are
* allowed once zfs_range_tree_vacate() has begun, and dropping
* dn_mtx would leave a window open for another thread to
* observe that invalid (and unsafe) state.
*/
zfs_range_tree_walk(dn->dn_free_ranges[txgoff],
dnode_sync_free_range, &dsfra);
zfs_range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL);
zfs_range_tree_destroy(dn->dn_free_ranges[txgoff]);
dn->dn_free_ranges[txgoff] = NULL;
mutex_exit(&dn->dn_mtx);
}
dnode_sync_free_ranges(dn, tx);

if (freeing_dnode) {
dn->dn_objset->os_freed_dnodes++;
Expand All @@ -828,7 +843,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
}

/*
* This must be done after dnode_sync_free_range()
* This must be done after dnode_sync_free_ranges()
* and dnode_increase_indirection(). See dnode_new_blkid()
* for an explanation of the high bit being set.
*/
Expand Down
Loading