Skip to content
Merged
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ members = [".", "crates/sentrix-primitives", "crates/sentrix-wallet", "crates/se
# `version.workspace = true`. Same goes for edition/license/repository so
# they can't drift across crates.
[workspace.package]
version = "2.2.31"
version = "2.2.35"
edition = "2024"
license = "BUSL-1.1"
repository = "https://github.com/sentrix-labs/sentrix"
Expand Down
104 changes: 75 additions & 29 deletions bin/sentrix/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1136,36 +1136,76 @@ async fn cmd_start(
let writer_storage = storage.clone();
let writer_shared = shared.clone();
tokio::spawn(async move {
while let Some(target_height) = save_rx.recv().await {
// Drain coalesced heights: if the writer is behind, multiple
// FinalizeBlock pushes can stack up. One snapshot covers all
// of them since save_blockchain writes the full state blob.
let mut latest = target_height;
while let Ok(h) = save_rx.try_recv() {
latest = h;
}
let bc = writer_shared.read().await;
let height_at_save = bc.height();
match writer_storage.save_blockchain(&bc) {
Ok(()) => {
tracing::debug!(
// POLL-driven persistence (not save_tx-signal-driven). The signal
// was only pushed in the commit path that is SKIPPED when add_block
// returns Err on the BFT apply-from-stash state_root recompute
// mismatch (the proposal carries the proposer's root; our local
// recompute differs — the separate, open determinism issue). The
// block is still canonical (2/3 precommit justification) and the
// chain advances, but the writer never fired → its block:{N} key
// was never written → it aged out of the in-memory window into a
// permanent storage gap → observer/fullnode GetBlocks sync stalled
// on the missing height. Polling the chain and persisting whatever
// is committed decouples durability from the apply result.
//
// Block:{N} keys are written via the BATCHED save_blocks (one MDBX
// txn / one fsync per tick). An earlier attempt used per-block
// save_block, whose per-block full-env mdbx.sync() contended with
// the apply path's trie write txns and stalled consensus. The full
// state blob (save_blockchain) runs on a slow cadence purely to
// bound load-time B2 replay; B2 rebuilds accounts from the block:{N}
// keys we now persist, and the graceful-shutdown path saves the
// blob on clean exit.
let mut last_persisted: u64 = { writer_shared.read().await.height() };
let mut last_blob_save = std::time::Instant::now();
let mut ticker = tokio::time::interval(std::time::Duration::from_secs(5));
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
ticker.tick().await;
// Drain the legacy signal channel so producers' try_send does
// not accumulate; persistence no longer keys off it.
while save_rx.try_recv().is_ok() {}

// Clone the newly-committed block range under a short read lock,
// then release before the disk write so the lock is never held
// across I/O (would stall the validator's write lock).
let new_blocks: Vec<sentrix::core::block::Block> = {
let bc = writer_shared.read().await;
let h = bc.height();
(last_persisted.saturating_add(1)..=h)
.filter_map(|i| bc.get_block(i).cloned())
.collect()
};
if let Some(top) = new_blocks.last().map(|b| b.index) {
match writer_storage.save_blocks(&new_blocks) {
Ok(()) => last_persisted = top,
Err(e) => tracing::error!(
target: "save_writer",
"background save_blockchain ok queued_for=h{} caught_up_to=h{}",
latest,
height_at_save,
);
"save_blocks failed for range ..={}: {}",
top,
e,
),
}
Err(e) => {
}

// Periodic full-state checkpoint (accounts blob + blob_height)
// to keep B2 replay bounded on an unclean restart. Infrequent
// (60s) — strictly less often than the previous per-finalize
// save, so the brief read-lock hold during serialize can't
// accumulate into back-pressure.
if last_blob_save.elapsed() >= std::time::Duration::from_secs(60) {
let bc = writer_shared.read().await;
if let Err(e) = writer_storage.save_blockchain(&bc) {
tracing::error!(
target: "save_writer",
"background save_blockchain failed queued_for=h{} caught_up_to=h{}: {}",
latest, height_at_save, e,
"periodic save_blockchain failed at h{}: {}",
bc.height(),
e,
);
}
last_blob_save = std::time::Instant::now();
}
drop(bc);
}
tracing::info!(target: "save_writer", "save channel closed; writer exiting");
});
}

Expand Down Expand Up @@ -2914,21 +2954,27 @@ async fn cmd_start(
}

if let Some(mut blk) = proposed_block.take() {
// Apply the finalized block we already hold, whoever proposed
// it. The hash-match guard above proved `blk` IS the
// FinalizeBlock action's block, and `justification` carries
// its 2/3 precommit certificate — so this is the canonical
// committed block. This previously broke out and waited for
// libp2p NewBlock/sync to re-deliver a peer's block; when
// gossip missed, the node sat in Finalize re-triggering sync
// while holding the block the whole time → chain crawl/stall.
// `validate_block` below still re-checks structure + the
// justification supermajority before we write.
if blk.validator != wallet.address {
tracing::info!(
target: "finalize_trace",
"BFT finalize peer-propose: h={} round={} block={:.16}… \
proposer={} is not local validator {}; waiting for \
libp2p NewBlock/sync instead of executing peer block \
in the BFT loop",
"BFT finalize: applying peer-proposed finalized block \
h={} round={} block={:.16}… proposer={} from local \
stash (valid 2/3 justification)",
height,
round,
block_hash,
blk.validator,
wallet.address,
);
lp2p_clone.trigger_sync().await;
break;
}

blk.round = round;
Expand Down
191 changes: 174 additions & 17 deletions crates/sentrix-core/src/block_executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1723,23 +1723,76 @@ impl Blockchain {
return Ok(());
}

tracing::error!(
"CRITICAL #1e: state_root mismatch at block {} — received {} \
vs computed {}. Local trie and peer's trie disagree on the \
post-block state. Rejecting.",
block_index,
hex::encode(received_root),
hex::encode(computed_root),
);
// 2026-04-23 divergence rate-alarm: per-event ERROR
// line above is truthful but gets lost in log noise
// during a real divergence (~1/s). Record the
// rejection in the rolling tracker, which emits a
// LOUD rate-limited alarm pointing at the rsync
// recovery playbook when the rate crosses threshold.
// See `DivergenceTracker` in blockchain.rs for the
// full rationale.
self.divergence_tracker.record_rejection(block_index);
// Observer-tolerant accept (gated, default OFF). An observer/
// fullnode applies EVERY block via add_block_from_peer (Peer) and
// strictly rejecting a #1e here halts it on canonical data: the
// block already passed the strict 2/3-precommit justification
// verification earlier in add_block_impl, so it IS the network-
// agreed block (consensus is on block_hash, not state_root). The
// mismatch is the chain's known imperfect state-commitment
// (recurring/oscillating state_root) that validators already
// tolerate via the apply-from-stash path. With
// SENTRIX_OBSERVER_TOLERANT_STATE_ROOT=1 set, accept the block and
// stamp the proposer's (canonical) received root so the observer's
// chain stays consistent with the committed roots; its local
// accounts diverge from that root (the same pre-existing imperfection
// every node has), so served state is no worse than a validator's.
// Default OFF → validators keep the strict #1e reject below. Only an
// observer node sets this env.
if self.source_for_current_add == BlockSource::Peer
&& std::env::var_os("SENTRIX_OBSERVER_TOLERANT_STATE_ROOT")
.is_some_and(|v| v == "1")
{
tracing::debug!(
"observer-tolerant: #1e at block {} (received {} vs computed \
{}) — accepting justified canonical block, stamping received \
root (local state diverges; chain state-commitment imperfect)",
block_index,
hex::encode(received_root),
hex::encode(computed_root),
);
last.state_root = Some(received_root);
self.maybe_prune_trie();
emit_apply_profile(
profile_t0,
profile_t1,
profile_t2,
profile_height,
profile_txs,
);
return Ok(());
}

// A SelfProduced mismatch is the BFT finalize apply-from-stash
// path: the stashed proposal carries the proposer's PRE-apply
// state_root (computed at propose time, before this block's txs),
// which never equals the freshly computed POST-apply root. That's
// expected and self-heals — the block still commits via the libp2p
// receive path, which CHECKs against the canonical committed root.
// Only a Peer-source mismatch is a real cross-node divergence, so
// keep the LOUD alarm + divergence_tracker for that case and log the
// self-apply case quietly without polluting the divergence rate.
if self.source_for_current_add == BlockSource::Peer {
tracing::error!(
"CRITICAL #1e: state_root mismatch at block {} — received {} \
vs computed {}. Local trie and peer's trie disagree on the \
post-block state. Rejecting.",
block_index,
hex::encode(received_root),
hex::encode(computed_root),
);
// Record in the rolling tracker, which emits a LOUD rate-limited
// alarm pointing at the recovery playbook when the rate crosses
// threshold. See `DivergenceTracker` in blockchain.rs.
self.divergence_tracker.record_rejection(block_index);
} else {
tracing::debug!(
"#1e self-apply mismatch at block {} (expected: stashed \
proposal carries the pre-apply root) — block commits via \
the receive path",
block_index,
);
}
return Err(SentrixError::ChainValidationFailed(format!(
"state_root mismatch at block {}: received {}, computed {}",
block_index,
Expand Down Expand Up @@ -2865,6 +2918,110 @@ mod tests {
);
}

/// Regression: a Pass-2 failure (#1e state_root mismatch) must restore
/// `stake_registry` / `epoch_manager` / `slashing`, not just AccountDB.
/// Pre-fix the C-03 snapshot omitted them, so the reward bundle's
/// `pending_rewards` increment leaked on every rejected block — and post
/// STATE_IN_TRIE that leak diverged the next block's state_root.
///
/// The test forces three fork heights to 0 through environment variables.
/// They are cleared by a drop guard rather than a trailing cleanup block, so
/// a failing assertion cannot leave the overrides set for later tests in the
/// same process.
#[test]
fn test_c03_pass2_failure_restores_staking_state() {
    use sentrix_primitives::block::{Block, STATE_ROOT_FORK_HEIGHT};
    use sentrix_primitives::justification::BlockJustification;
    use sentrix_staking::staking::ValidatorStake;
    use sentrix_storage::MdbxStorage;
    use std::sync::Arc;
    use tempfile::TempDir;

    /// Fork-height overrides this test sets for its own duration.
    const FORK_ENV_VARS: [&str; 3] = [
        "VOYAGER_REWARD_V2_HEIGHT",
        "REWARD_APPLY_PATH_HEIGHT",
        "STATE_IN_TRIE_HEIGHT",
    ];

    /// Removes every `FORK_ENV_VARS` entry when dropped, including during a
    /// panic unwind.
    struct ForkEnvReset;

    impl Drop for ForkEnvReset {
        fn drop(&mut self) {
            for var in FORK_ENV_VARS {
                // SAFETY: this guard is declared after `_guard` below, so it is
                // dropped first (locals drop in reverse declaration order) and
                // the env lock is still held here.
                // NOTE(review): assumes `env_test_lock` serializes every test
                // that reads or writes the process environment — confirm.
                unsafe { std::env::remove_var(var) };
            }
        }
    }

    let _guard = crate::test_util::env_test_lock();
    // Armed before the first `set_var` so a partial setup is also undone.
    let _env_reset = ForkEnvReset;
    // Forks that (a) run the reward bundle in apply and (b) commit staking
    // state into the root, so a #1e reject can leak it absent the fix.
    for var in FORK_ENV_VARS {
        // SAFETY: `_guard` is held for the rest of this function.
        // NOTE(review): same assumption about `env_test_lock` as above.
        unsafe { std::env::set_var(var, "0") };
    }

    let mut bc = setup();
    bc.voyager_activated = true;
    for addr in ["v1", "v2", "v3", "v4"] {
        bc.stake_registry.validators.insert(
            addr.to_string(),
            ValidatorStake {
                address: addr.to_string(),
                self_stake: 1000,
                total_delegated: 0,
                commission_rate: 1000,
                max_commission_rate: 2000,
                is_jailed: false,
                jail_until: 0,
                is_tombstoned: false,
                blocks_signed: 0,
                blocks_missed: 0,
                pending_rewards: 0,
                registration_height: 0,
                last_commission_change_height: 0,
            },
        );
    }
    bc.stake_registry.active_set =
        vec!["v1".into(), "v2".into(), "v3".into(), "v4".into()];

    // Pad past STATE_ROOT_FORK_HEIGHT so the #1e check enforces (the
    // below-fork path just stamps the root instead of rejecting).
    let pad_height = STATE_ROOT_FORK_HEIGHT + 1;
    let prev = bc.latest_block().unwrap().hash.clone();
    let mut pad = Block::new(
        pad_height,
        prev,
        vec![Transaction::new_coinbase("v1".into(), 0, pad_height, 1_777_000_000)],
        "v1".into(),
    );
    pad.timestamp = 1_777_000_000;
    bc.chain.push(pad);

    // Trie required so update_trie_for_block computes a real root to diff.
    let dir = TempDir::new().unwrap();
    let mdbx = Arc::new(MdbxStorage::open(dir.path()).unwrap());
    bc.init_trie(mdbx).unwrap();

    let height = bc.height() + 1;
    let prev_hash = bc.latest_block().unwrap().hash.clone();
    let reward = bc.get_block_reward();
    let coinbase = Transaction::new_coinbase("v1".into(), reward, height, 1_777_000_001);
    let mut block = Block::new(height, prev_hash, vec![coinbase], "v1".into());
    block.timestamp = 1_777_000_001;
    // Tamper the declared root so #1e fires AFTER the reward bundle ran.
    block.state_root = Some([0xAB; 32]);
    block.hash = block.calculate_hash();
    // Three precommits of 1000 each out of four validators staked at 1000.
    let mut just = BlockJustification::new(height, 0, block.hash.clone());
    just.add_precommit("v1".into(), vec![], 1000);
    just.add_precommit("v2".into(), vec![], 1000);
    just.add_precommit("v3".into(), vec![], 1000);
    block.justification = Some(just);

    let pending_before = bc.stake_registry.validators.get("v1").unwrap().pending_rewards;

    let err = bc
        .add_block_from_peer(block)
        .expect_err("tampered state_root must be rejected (#1e)");
    assert!(
        format!("{err:?}").contains("state_root mismatch"),
        "expected #1e state_root mismatch, got: {err:?}"
    );

    let pending_after = bc.stake_registry.validators.get("v1").unwrap().pending_rewards;
    assert_eq!(
        pending_after, pending_before,
        "pending_rewards must roll back after a #1e reject (leaked pre-fix)"
    );
    // No explicit cleanup: `_env_reset` clears the fork overrides on every
    // exit path, then `_guard` releases the env lock.
}

#[test]
fn test_add_block_succeeds_without_trie() {
// update_trie_for_block returning Ok(None) must not fail add_block.
Expand Down
Loading
Loading