sentrix-labs · github-actions · Jun 6, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
@@ -7,7 +7,7 @@ members = [".", "crates/sentrix-primitives", "crates/sentrix-wallet", "crates/se
 # `version.workspace = true`. Same goes for edition/license/repository so
 # they can't drift across crates.
 [workspace.package]
-version = "2.2.31"
+version = "2.2.35"
 edition = "2024"
 license = "BUSL-1.1"
 repository = "https://github.com/sentrix-labs/sentrix"

@@ -1136,36 +1136,76 @@ async fn cmd_start(
         let writer_storage = storage.clone();
         let writer_shared = shared.clone();
         tokio::spawn(async move {
-            while let Some(target_height) = save_rx.recv().await {
-                // Drain coalesced heights: if the writer is behind, multiple
-                // FinalizeBlock pushes can stack up. One snapshot covers all
-                // of them since save_blockchain writes the full state blob.
-                let mut latest = target_height;
-                while let Ok(h) = save_rx.try_recv() {
-                    latest = h;
-                }
-                let bc = writer_shared.read().await;
-                let height_at_save = bc.height();
-                match writer_storage.save_blockchain(&bc) {
-                    Ok(()) => {
-                        tracing::debug!(
+            // POLL-driven persistence (not save_tx-signal-driven). The signal
+            // was only pushed in the commit path that is SKIPPED when add_block
+            // returns Err on the BFT apply-from-stash state_root recompute
+            // mismatch (the proposal carries the proposer's root; our local
+            // recompute differs — the separate, open determinism issue). The
+            // block is still canonical (2/3 precommit justification) and the
+            // chain advances, but the writer never fired → its block:{N} key
+            // was never written → it aged out of the in-memory window into a
+            // permanent storage gap → observer/fullnode GetBlocks sync stalled
+            // on the missing height. Polling the chain and persisting whatever
+            // is committed decouples durability from the apply result.
+            //
+            // Block:{N} keys are written via the BATCHED save_blocks (one MDBX
+            // txn / one fsync per tick). An earlier attempt used per-block
+            // save_block, whose per-block full-env mdbx.sync() contended with
+            // the apply path's trie write txns and stalled consensus. The full
+            // state blob (save_blockchain) runs on a slow cadence purely to
+            // bound load-time B2 replay; B2 rebuilds accounts from the block:{N}
+            // keys we now persist, and the graceful-shutdown path saves the
+            // blob on clean exit.
+            let mut last_persisted: u64 = { writer_shared.read().await.height() };
+            let mut last_blob_save = std::time::Instant::now();
+            let mut ticker = tokio::time::interval(std::time::Duration::from_secs(5));
+            ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+            loop {
+                ticker.tick().await;
+                // Drain the legacy signal channel so producers' try_send does
+                // not accumulate; persistence no longer keys off it.
+                while save_rx.try_recv().is_ok() {}
+
+                // Clone the newly-committed block range under a short read lock,
+                // then release before the disk write so the lock is never held
+                // across I/O (would stall the validator's write lock).
+                let new_blocks: Vec<sentrix::core::block::Block> = {
+                    let bc = writer_shared.read().await;
+                    let h = bc.height();
+                    (last_persisted.saturating_add(1)..=h)
+                        .filter_map(|i| bc.get_block(i).cloned())
+                        .collect()
+                };
+                if let Some(top) = new_blocks.last().map(|b| b.index) {
+                    match writer_storage.save_blocks(&new_blocks) {
+                        Ok(()) => last_persisted = top,
+                        Err(e) => tracing::error!(
                             target: "save_writer",
-                            "background save_blockchain ok queued_for=h{} caught_up_to=h{}",
-                            latest,
-                            height_at_save,
-                        );
+                            "save_blocks failed for range ..={}: {}",
+                            top,
+                            e,
+                        ),
                     }
-                    Err(e) => {
+                }
+
+                // Periodic full-state checkpoint (accounts blob + blob_height)
+                // to keep B2 replay bounded on an unclean restart. Infrequent
+                // (60s) — strictly less often than the previous per-finalize
+                // save, so the brief read-lock hold during serialize can't
+                // accumulate into back-pressure.
+                if last_blob_save.elapsed() >= std::time::Duration::from_secs(60) {
+                    let bc = writer_shared.read().await;
+                    if let Err(e) = writer_storage.save_blockchain(&bc) {
                         tracing::error!(
                             target: "save_writer",
-                            "background save_blockchain failed queued_for=h{} caught_up_to=h{}: {}",
-                            latest, height_at_save, e,
+                            "periodic save_blockchain failed at h{}: {}",
+                            bc.height(),
+                            e,
                         );
                     }
+                    last_blob_save = std::time::Instant::now();
                 }
-                drop(bc);
             }
-            tracing::info!(target: "save_writer", "save channel closed; writer exiting");
         });
     }
 
@@ -2914,21 +2954,27 @@ async fn cmd_start(
                                     }
 
                                     if let Some(mut blk) = proposed_block.take() {
+                                        // Apply the finalized block we already hold, whoever proposed
+                                        // it. The hash-match guard above proved `blk` IS the
+                                        // FinalizeBlock action's block, and `justification` carries
+                                        // its 2/3 precommit certificate — so this is the canonical
+                                        // committed block. This previously broke out and waited for
+                                        // libp2p NewBlock/sync to re-deliver a peer's block; when
+                                        // gossip missed, the node sat in Finalize re-triggering sync
+                                        // while holding the block the whole time → chain crawl/stall.
+                                        // `validate_block` below still re-checks structure + the
+                                        // justification supermajority before we write.
                                         if blk.validator != wallet.address {
                                             tracing::info!(
                                                 target: "finalize_trace",
-                                                "BFT finalize peer-propose: h={} round={} block={:.16}… \
-                                                 proposer={} is not local validator {}; waiting for \
-                                                 libp2p NewBlock/sync instead of executing peer block \
-                                                 in the BFT loop",
+                                                "BFT finalize: applying peer-proposed finalized block \
+                                                 h={} round={} block={:.16}… proposer={} from local \
+                                                 stash (valid 2/3 justification)",
                                                 height,
                                                 round,
                                                 block_hash,
                                                 blk.validator,
-                                                wallet.address,
                                             );
-                                            lp2p_clone.trigger_sync().await;
-                                            break;
                                         }
 
                                         blk.round = round;

@@ -1723,23 +1723,76 @@ impl Blockchain {
                                 return Ok(());
                             }
 
-                            tracing::error!(
-                                "CRITICAL #1e: state_root mismatch at block {} — received {} \
-                                 vs computed {}. Local trie and peer's trie disagree on the \
-                                 post-block state. Rejecting.",
-                                block_index,
-                                hex::encode(received_root),
-                                hex::encode(computed_root),
-                            );
-                            // 2026-04-23 divergence rate-alarm: per-event ERROR
-                            // line above is truthful but gets lost in log noise
-                            // during a real divergence (~1/s). Record the
-                            // rejection in the rolling tracker, which emits a
-                            // LOUD rate-limited alarm pointing at the rsync
-                            // recovery playbook when the rate crosses threshold.
-                            // See `DivergenceTracker` in blockchain.rs for the
-                            // full rationale.
-                            self.divergence_tracker.record_rejection(block_index);
+                            // Observer-tolerant accept (gated, default OFF). An observer/
+                            // fullnode applies EVERY block via add_block_from_peer (Peer) and
+                            // strictly rejecting a #1e here halts it on canonical data: the
+                            // block already passed the strict 2/3-precommit justification
+                            // verification earlier in add_block_impl, so it IS the network-
+                            // agreed block (consensus is on block_hash, not state_root). The
+                            // mismatch is the chain's known imperfect state-commitment
+                            // (recurring/oscillating state_root) that validators already
+                            // tolerate via the apply-from-stash path. With
+                            // SENTRIX_OBSERVER_TOLERANT_STATE_ROOT=1 set, accept the block and
+                            // stamp the proposer's (canonical) received root so the observer's
+                            // chain stays consistent with the committed roots; its local
+                            // accounts diverge from that root (the same pre-existing imperfection
+                            // every node has), so served state is no worse than a validator's.
+                            // Default OFF → validators keep the strict #1e reject below. Only an
+                            // observer node sets this env.
+                            if self.source_for_current_add == BlockSource::Peer
+                                && std::env::var_os("SENTRIX_OBSERVER_TOLERANT_STATE_ROOT")
+                                    .is_some_and(|v| v == "1")
+                            {
+                                tracing::debug!(
+                                    "observer-tolerant: #1e at block {} (received {} vs computed \
+                                     {}) — accepting justified canonical block, stamping received \
+                                     root (local state diverges; chain state-commitment imperfect)",
+                                    block_index,
+                                    hex::encode(received_root),
+                                    hex::encode(computed_root),
+                                );
+                                last.state_root = Some(received_root);
+                                self.maybe_prune_trie();
+                                emit_apply_profile(
+                                    profile_t0,
+                                    profile_t1,
+                                    profile_t2,
+                                    profile_height,
+                                    profile_txs,
+                                );
+                                return Ok(());
+                            }
+
+                            // A SelfProduced mismatch is the BFT finalize apply-from-stash
+                            // path: the stashed proposal carries the proposer's PRE-apply
+                            // state_root (computed at propose time, before this block's txs),
+                            // which never equals the freshly computed POST-apply root. That's
+                            // expected and self-heals — the block still commits via the libp2p
+                            // receive path, which CHECKs against the canonical committed root.
+                            // Only a Peer-source mismatch is a real cross-node divergence, so
+                            // keep the LOUD alarm + divergence_tracker for that case and log the
+                            // self-apply case quietly without polluting the divergence rate.
+                            if self.source_for_current_add == BlockSource::Peer {
+                                tracing::error!(
+                                    "CRITICAL #1e: state_root mismatch at block {} — received {} \
+                                     vs computed {}. Local trie and peer's trie disagree on the \
+                                     post-block state. Rejecting.",
+                                    block_index,
+                                    hex::encode(received_root),
+                                    hex::encode(computed_root),
+                                );
+                                // Record in the rolling tracker, which emits a LOUD rate-limited
+                                // alarm pointing at the recovery playbook when the rate crosses
+                                // threshold. See `DivergenceTracker` in blockchain.rs.
+                                self.divergence_tracker.record_rejection(block_index);
+                            } else {
+                                tracing::debug!(
+                                    "#1e self-apply mismatch at block {} (expected: stashed \
+                                     proposal carries the pre-apply root) — block commits via \
+                                     the receive path",
+                                    block_index,
+                                );
+                            }
                             return Err(SentrixError::ChainValidationFailed(format!(
                                 "state_root mismatch at block {}: received {}, computed {}",
                                 block_index,
@@ -2865,6 +2918,110 @@ mod tests {
         );
     }
 
+    /// Regression: a Pass-2 failure (#1e state_root mismatch) must restore
+    /// `stake_registry` / `epoch_manager` / `slashing`, not just AccountDB.
+    /// Pre-fix the C-03 snapshot omitted them, so the reward bundle's
+    /// `pending_rewards` increment leaked on every rejected block — and post
+    /// STATE_IN_TRIE that leak diverged the next block's state_root.
+    #[test]
+    fn test_c03_pass2_failure_restores_staking_state() {
+        use sentrix_primitives::block::{Block, STATE_ROOT_FORK_HEIGHT};
+        use sentrix_primitives::justification::BlockJustification;
+        use sentrix_staking::staking::ValidatorStake;
+        use sentrix_storage::MdbxStorage;
+        use std::sync::Arc;
+        use tempfile::TempDir;
+
+        let _guard = crate::test_util::env_test_lock();
+        // Forks that (a) run the reward bundle in apply and (b) commit staking
+        // state into the root, so a #1e reject can leak it absent the fix.
+        unsafe {
+            std::env::set_var("VOYAGER_REWARD_V2_HEIGHT", "0");
+            std::env::set_var("REWARD_APPLY_PATH_HEIGHT", "0");
+            std::env::set_var("STATE_IN_TRIE_HEIGHT", "0");
+        }
+
+        let mut bc = setup();
+        bc.voyager_activated = true;
+        for addr in ["v1", "v2", "v3", "v4"] {
+            bc.stake_registry.validators.insert(
+                addr.to_string(),
+                ValidatorStake {
+                    address: addr.to_string(),
+                    self_stake: 1000,
+                    total_delegated: 0,
+                    commission_rate: 1000,
+                    max_commission_rate: 2000,
+                    is_jailed: false,
+                    jail_until: 0,
+                    is_tombstoned: false,
+                    blocks_signed: 0,
+                    blocks_missed: 0,
+                    pending_rewards: 0,
+                    registration_height: 0,
+                    last_commission_change_height: 0,
+                },
+            );
+        }
+        bc.stake_registry.active_set =
+            vec!["v1".into(), "v2".into(), "v3".into(), "v4".into()];
+
+        // Pad past STATE_ROOT_FORK_HEIGHT so the #1e check enforces (the
+        // below-fork path just stamps the root instead of rejecting).
+        let pad_height = STATE_ROOT_FORK_HEIGHT + 1;
+        let prev = bc.latest_block().unwrap().hash.clone();
+        let mut pad = Block::new(
+            pad_height,
+            prev,
+            vec![Transaction::new_coinbase("v1".into(), 0, pad_height, 1_777_000_000)],
+            "v1".into(),
+        );
+        pad.timestamp = 1_777_000_000;
+        bc.chain.push(pad);
+
+        // Trie required so update_trie_for_block computes a real root to diff.
+        let dir = TempDir::new().unwrap();
+        let mdbx = Arc::new(MdbxStorage::open(dir.path()).unwrap());
+        bc.init_trie(mdbx).unwrap();
+
+        let height = bc.height() + 1;
+        let prev_hash = bc.latest_block().unwrap().hash.clone();
+        let reward = bc.get_block_reward();
+        let coinbase = Transaction::new_coinbase("v1".into(), reward, height, 1_777_000_001);
+        let mut block = Block::new(height, prev_hash, vec![coinbase], "v1".into());
+        block.timestamp = 1_777_000_001;
+        // Tamper the declared root so #1e fires AFTER the reward bundle ran.
+        block.state_root = Some([0xAB; 32]);
+        block.hash = block.calculate_hash();
+        let mut just = BlockJustification::new(height, 0, block.hash.clone());
+        just.add_precommit("v1".into(), vec![], 1000);
+        just.add_precommit("v2".into(), vec![], 1000);
+        just.add_precommit("v3".into(), vec![], 1000);
+        block.justification = Some(just);
+
+        let pending_before = bc.stake_registry.validators.get("v1").unwrap().pending_rewards;
+
+        let err = bc
+            .add_block_from_peer(block)
+            .expect_err("tampered state_root must be rejected (#1e)");
+        assert!(
+            format!("{err:?}").contains("state_root mismatch"),
+            "expected #1e state_root mismatch, got: {err:?}"
+        );
+
+        let pending_after = bc.stake_registry.validators.get("v1").unwrap().pending_rewards;
+        assert_eq!(
+            pending_after, pending_before,
+            "pending_rewards must roll back after a #1e reject (leaked pre-fix)"
+        );
+
+        unsafe {
+            std::env::remove_var("VOYAGER_REWARD_V2_HEIGHT");
+            std::env::remove_var("REWARD_APPLY_PATH_HEIGHT");
+            std::env::remove_var("STATE_IN_TRIE_HEIGHT");
+        }
+    }
+
     #[test]
     fn test_add_block_succeeds_without_trie() {
         // update_trie_for_block returning Ok(None) must not fail add_block.