Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 19 additions & 19 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ members = [".", "crates/sentrix-primitives", "crates/sentrix-wallet", "crates/se
# `version.workspace = true`. Same goes for edition/license/repository so
# they can't drift across crates.
[workspace.package]
version = "2.2.28"
version = "2.2.30"
edition = "2024"
license = "BUSL-1.1"
repository = "https://github.com/sentrix-labs/sentrix"
Expand Down
32 changes: 30 additions & 2 deletions crates/sentrix-trie/src/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,15 +311,43 @@ impl TrieStorage {
}

/// Garbage-collect node and value entries not present in `live_hashes`.
///
/// Prefer [`Self::gc_nodes`] + [`Self::gc_values`] for the periodic prune:
/// the nodes pass alone can run 10–20 min on a big chain.db, during which
/// the apply loop commits new leaf VALUES. Driving both passes from one
/// `live` snapshot lets the values pass delete those freshly-committed
/// (still-live) values — the 2026-06-04 testnet trie corruption. The split
/// methods let the caller refresh `live` between passes. This combined
/// method stays for non-racy callers (one-shot recovery / tests).
pub fn gc_orphaned_nodes(
&self,
live_hashes: &std::collections::HashSet<NodeHash>,
) -> SentrixResult<usize> {
let node_count = self.gc_table(tables::TABLE_TRIE_NODES, live_hashes)?;
let value_count = self.gc_table(tables::TABLE_TRIE_VALUES, live_hashes)?;
let node_count = self.gc_nodes(live_hashes)?;
let value_count = self.gc_values(live_hashes)?;
Ok(node_count + value_count)
}

/// GC only the trie_nodes table against `live_hashes`. See
/// [`Self::gc_orphaned_nodes`] for why nodes and values are split.
pub fn gc_nodes(
&self,
live_hashes: &std::collections::HashSet<NodeHash>,
) -> SentrixResult<usize> {
self.gc_table(tables::TABLE_TRIE_NODES, live_hashes)
}

/// GC only the trie_values table against `live_hashes`. The caller MUST
/// refresh `live_hashes` with any roots committed since the nodes pass
/// before calling this, or it will delete leaf values written during the
/// (long) nodes pass.
pub fn gc_values(
&self,
live_hashes: &std::collections::HashSet<NodeHash>,
) -> SentrixResult<usize> {
self.gc_table(tables::TABLE_TRIE_VALUES, live_hashes)
}

/// Shared helper: scan an MDBX table for hashes not in `live_hashes` and remove them.
///
/// Streams the table via `iter_from` so only the orphan-hash subset
Expand Down
139 changes: 108 additions & 31 deletions crates/sentrix-trie/src/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -657,46 +657,66 @@ impl SentrixTrie {
// the prune built `live` from a snapshot that pre-dated those
// writes and then deleted the freshly-committed data.
//
// Re-load the highest version currently in TABLE_TRIE_ROOTS and
// walk every root committed AFTER self.version. Shrinks the
// race window from "full walk duration" to "the gap between
// this re-load and the gc_orphaned_nodes commit" — at most one
// block's worth of writes can still slip through, and the B3b
// total_minted self-heal plus boot-time verify_integrity catch
// those as defense in depth.
// Re-load the highest version currently in TABLE_TRIE_ROOTS and walk
// every root committed AFTER self.version, BEFORE the nodes GC.
//
// A proper race-free fix requires running the live walk and the
// delete inside the same MDBX RW transaction. That's a bigger
// refactor (exposing the txn handle through TrieStorage) and
// belongs in its own PR. This patch is the minimal targeted
// closer for the observed symptom.
let on_disk_latest = self.cache.storage.latest_version()?.unwrap_or(self.version);
let mut augmented_roots = 0usize;
for version in (self.version + 1)..=on_disk_latest {
// 2026-06-04 follow-up: the original single augment above still left a
// hole. gc_orphaned_nodes GC'd nodes THEN values off this one snapshot,
// and the nodes pass alone runs 10–20 min on a big chain.db. Blocks
// committed during that pass wrote new leaf values the snapshot never
// saw, and the values pass deleted them — orphan value for a hot
// account leaf at testnet h=6,239,526. So we now split the GC and
// re-augment `live` again between the passes (see below). The complete
// race-free fix is still one RW txn around walk+delete (a TrieStorage
// refactor, tracked separately); boot-time verify_integrity is the
// backstop.
let on_disk_latest = self.augment_live_to_latest(&mut live, self.version + 1)?;

// Pass 1 — nodes. Pass 2 — values, but only after re-walking the roots
// committed DURING pass 1, so freshly-written leaf values survive.
let nodes_gc = self.cache.storage.gc_nodes(&live)?;
self.augment_live_to_latest(&mut live, on_disk_latest + 1)?;
let values_gc = self.cache.storage.gc_values(&live)?;

tracing::info!(
"trie prune: removed {} old roots, GC'd {} nodes + {} values",
roots_pruned,
nodes_gc,
values_gc
);
Ok((roots_pruned, nodes_gc + values_gc))
}

/// Walk every committed root in `[from_version, on-disk latest]` not yet in
/// `live` and add its reachable hashes. Returns the on-disk latest version
/// observed. Keeps the live set current across the long nodes/values GC
/// passes — the prune runs on a clone whose `self.version` is frozen while
/// the apply loop keeps committing newer roots.
fn augment_live_to_latest(
&self,
live: &mut std::collections::HashSet<NodeHash>,
from_version: u64,
) -> SentrixResult<u64> {
let latest = self.cache.storage.latest_version()?.unwrap_or(self.version);
let mut added = 0usize;
for version in from_version..=latest {
if let Some(root) = self.cache.storage.load_root(version)?
&& !live.contains(&root)
{
self.collect_reachable(root, 0, &mut live)?;
augmented_roots += 1;
self.collect_reachable(root, 0, live)?;
added += 1;
}
}
if augmented_roots > 0 {
if added > 0 {
tracing::info!(
"trie prune: augmented live set with {} new roots committed during walk \
(snapshot v{} → on-disk v{})",
augmented_roots,
self.version,
on_disk_latest
"trie prune: augmented live set with {} roots committed during walk \
(from v{} → on-disk v{})",
added,
from_version,
latest
);
}

let nodes_gc = self.cache.storage.gc_orphaned_nodes(&live)?;
tracing::info!(
"trie prune: removed {} old roots, GC'd {} orphaned entries",
roots_pruned,
nodes_gc
);
Ok((roots_pruned, nodes_gc))
Ok(latest)
}

/// Recursively collect all node hashes reachable from `hash` at `depth`.
Expand Down Expand Up @@ -1105,6 +1125,63 @@ mod tests {
);
}

/// Regression for the 2026-06-04 testnet trie corruption: a leaf value
/// committed DURING the (long) nodes GC pass must survive the values GC
/// pass. The prune runs on a clone whose version is frozen, so its initial
/// `live` set predates that commit; splitting node/value GC and
/// re-augmenting `live` between the passes is what keeps the fresh value.
/// Asserts both directions: the stale `live` would have deleted it (it's
/// absent), and the re-augmented `live` keeps it (it's present).
#[test]
fn test_values_gc_keeps_leaf_committed_during_nodes_pass() {
use std::collections::HashSet;
let (_dir, mdbx) = temp_mdbx();
let mut trie = SentrixTrie::open(Arc::clone(&mdbx), 0).unwrap();

// v1: account A committed — this is the prune snapshot.
let a = address_to_key("0xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
trie.insert(&a, &account_value_bytes(100, 0)).unwrap();
trie.commit(1).unwrap();

// The background prune clones the trie frozen at v1 and builds `live`
// from the v1 root only.
let pruner = SentrixTrie::open(Arc::clone(&mdbx), 1).unwrap();
let mut live: HashSet<[u8; 32]> = HashSet::new();
pruner.collect_reachable(pruner.root, 0, &mut live).unwrap();

// --- pretend the long nodes GC pass is running here ---
// Meanwhile the apply loop commits account B at v2 on the live trie.
let b = address_to_key("0xbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb");
let b_val = account_value_bytes(777, 3);
trie.insert(&b, &b_val).unwrap();
trie.commit(2).unwrap();
let b_vh = crate::node::hash_leaf(&b, &b_val);
assert!(
pruner.cache.storage.load_value(&b_vh).unwrap().is_some(),
"B's value must be on disk after its commit"
);

// The stale snapshot live set does NOT cover B — a values GC here would
// delete a live value (the bug).
assert!(
!live.contains(&b_vh),
"stale live set must not yet contain B's value (this is what caused the bug)"
);

// The fix: re-augment `live` to the on-disk latest before the values
// pass, then GC values.
pruner.augment_live_to_latest(&mut live, 2).unwrap();
assert!(
live.contains(&b_vh),
"re-augment must add B's value hash committed during the nodes pass"
);
pruner.cache.storage.gc_values(&live).unwrap();
assert!(
pruner.cache.storage.load_value(&b_vh).unwrap().is_some(),
"leaf value committed during the nodes pass must survive the values GC"
);
}

/// Updates accumulate internal-node storage between commits; prune() is
/// what reclaims it. Inline cleanup of old_internal_hashes (removed
/// 2026-04-20) was unsound — see the comment in insert() and the
Expand Down
Loading