diff --git a/crates/e2e-tests/src/cluster.rs b/crates/e2e-tests/src/cluster.rs index 3ba34bb8b..78b3e0f26 100644 --- a/crates/e2e-tests/src/cluster.rs +++ b/crates/e2e-tests/src/cluster.rs @@ -1034,6 +1034,13 @@ impl MpcNodeState { } } + pub fn home_dir(&self) -> &Path { + match self { + MpcNodeState::Running(n) => n.setup().home_dir(), + MpcNodeState::Stopped(s) => s.home_dir(), + } + } + pub fn p2p_public_key(&self) -> Ed25519PublicKey { match self { MpcNodeState::Running(n) => n.setup().p2p_public_key(), diff --git a/crates/e2e-tests/src/mpc_node.rs b/crates/e2e-tests/src/mpc_node.rs index da549e3d0..6c8f6de57 100644 --- a/crates/e2e-tests/src/mpc_node.rs +++ b/crates/e2e-tests/src/mpc_node.rs @@ -137,7 +137,10 @@ impl MpcNode { } } +pub const STDOUT_LOG: &str = "stdout.log"; +pub const STDOUT_LOG_PREVIOUS: &str = "stdout.log.previous"; pub const STDERR_LOG: &str = "stderr.log"; +pub const STDERR_LOG_PREVIOUS: &str = "stderr.log.previous"; /// Guard that kills the child process on drop. struct ProcessGuard(Child); @@ -352,10 +355,25 @@ impl MpcNodeSetup { "starting mpc-node" ); - let stdout_file = std::fs::File::create(self.home_dir.join("stdout.log")) - .context("failed to create stdout log")?; - let stderr_file = std::fs::File::create(self.home_dir.join("stderr.log")) - .context("failed to create stderr log")?; + // Rotate stdout.log / stderr.log to .previous on restart so that on + // a post-restart test failure the diagnostic can dump both the + // pre-kill mpc-node tracing (stdout) and any crash output (stderr) + // from BEFORE the restart, alongside whatever the restarted process + // produced. Without this rotation, `File::create` truncates and the + // pre-restart context is lost — leaving us blind to the upstream + // nearcore panic stack that lives in the pre-restart stderr. + let stdout_path = self.home_dir.join(STDOUT_LOG); + if stdout_path.exists() { + let _ = std::fs::rename(&stdout_path, self.home_dir.join(STDOUT_LOG_PREVIOUS)); + } + let stdout_file = + std::fs::File::create(&stdout_path).context("failed to create stdout log")?; + let stderr_path = self.home_dir.join(STDERR_LOG); + if stderr_path.exists() { + let _ = std::fs::rename(&stderr_path, self.home_dir.join(STDERR_LOG_PREVIOUS)); + } + let stderr_file = + std::fs::File::create(&stderr_path).context("failed to create stderr log")?; let child = Command::new(&self.binary_path) .arg("start-with-config-file") diff --git a/crates/e2e-tests/tests/common.rs b/crates/e2e-tests/tests/common.rs index eb81da0c9..9a9a02b67 100644 --- a/crates/e2e-tests/tests/common.rs +++ b/crates/e2e-tests/tests/common.rs @@ -1,3 +1,4 @@ +use std::io::{Read, Seek, SeekFrom}; use std::path::{Path, PathBuf}; use std::time::Duration; @@ -206,7 +207,28 @@ pub async fn wait_for_node_indexer_height_above( ) .await .with_context(|| { - format!("node {idx} indexer did not advance past height {min_height} within {timeout:?}") + // On failure, dump the tails of the node's stdout/stderr log files — + // both the pre-restart copy (rotated to `.previous` by `MpcNodeSetup::start`) + // and the current copy. mpc-node's `tracing` output goes to stdout; + // panic backtraces and `eprintln!` go to stderr. The post-restart + // `stderr.log` is where the upstream nearcore panic stack lives + // (see `docs/investigation/nearcore-indexer-sigkill-restart-panic.md`). + let home = cluster.nodes[idx].home_dir(); + let stdout_previous = read_log_tail(&home.join("stdout.log.previous"), 16_384); + let stderr_previous = read_log_tail(&home.join("stderr.log.previous"), 16_384); + let stdout_current = read_log_tail(&home.join("stdout.log"), 16_384); + let stderr_current = read_log_tail(&home.join("stderr.log"), 16_384); + format!( + "node {idx} indexer did not advance past height {min_height} within {timeout:?}\n\ + --- last 16KB of node {idx} stdout.log.previous (pre-restart mpc-node tracing) ---\n{stdout_previous}\n\ + --- end stdout.log.previous ---\n\ + --- last 16KB of node {idx} stderr.log.previous (pre-restart stderr; panic from pre-restart process if any) ---\n{stderr_previous}\n\ + --- end stderr.log.previous ---\n\ + --- last 16KB of node {idx} stdout.log (post-restart mpc-node tracing) ---\n{stdout_current}\n\ + --- end stdout.log ---\n\ + --- last 16KB of node {idx} stderr.log (post-restart stderr; upstream nearcore panic stack typically here) ---\n{stderr_current}\n\ + --- end stderr.log ---" + ) })?; let elapsed = start.elapsed(); tracing::info!( @@ -220,6 +242,27 @@ pub async fn wait_for_node_indexer_height_above( Ok(()) } +/// Best-effort read of the last `max_bytes` of a log file. Returns a synthetic +/// placeholder string if the file can't be opened/read. Used to inline a +/// node's log tail into the test panic message when a kill+restart wait +/// helper times out, so CI logs surface the upstream panic stack right next +/// to the test failure rather than leaving us to dig through saved artifacts. +fn read_log_tail(path: &Path, max_bytes: usize) -> String { + let Ok(mut f) = std::fs::File::open(path) else { + return format!("(could not open {})", path.display()); + }; + let len = f.metadata().map(|m| m.len()).unwrap_or(0); + let skip = len.saturating_sub(max_bytes as u64); + if f.seek(SeekFrom::Start(skip)).is_err() { + return format!("(seek failed on {})", path.display()); + } + let mut buf = Vec::with_capacity(max_bytes); + if f.read_to_end(&mut buf).is_err() { + return format!("(read failed on {})", path.display()); + } + String::from_utf8_lossy(&buf).into_owned() +} + /// Read node `idx`'s current indexer block height. Returns `Ok(None)` if /// the node is not running or the HTTP scrape can't connect (process /// down); returns `Err` if a metrics body read fails partway through. diff --git a/crates/node/src/migration_service.rs b/crates/node/src/migration_service.rs index 822c09055..4c2e5a35f 100644 --- a/crates/node/src/migration_service.rs +++ b/crates/node/src/migration_service.rs @@ -2,7 +2,9 @@ use std::{net::SocketAddr, sync::Arc}; use ed25519_dalek::SigningKey; use near_account_id::AccountId; +use near_mpc_contract_interface::types::Ed25519PublicKey; use onboarding::onboard; +use tee_authority::tee_authority::TeeAuthority; use tokio::sync::{RwLock, watch}; use types::MigrationInfo; @@ -30,6 +32,7 @@ impl From<&SecretsConfig> for MigrationSecrets { } } +#[expect(clippy::too_many_arguments)] pub async fn spawn_recovery_server_and_run_onboarding( migration_web_ui: SocketAddr, migration_secrets: MigrationSecrets, @@ -38,6 +41,8 @@ pub async fn spawn_recovery_server_and_run_onboarding( my_migration_info_receiver: watch::Receiver, contract_state_receiver: watch::Receiver, tx_sender: impl TransactionSender, + tee_authority: TeeAuthority, + account_public_key: Ed25519PublicKey, ) -> anyhow::Result<()> { let (import_keyshares_sender, import_keyshares_receiver) = tokio::sync::watch::channel(vec![]); let web_server_state = web::types::WebServerState { @@ -61,6 +66,8 @@ pub async fn spawn_recovery_server_and_run_onboarding( tx_sender, keyshare_storage.clone(), import_keyshares_receiver, + tee_authority, + account_public_key, ) .await?; Ok(()) diff --git a/crates/node/src/migration_service/onboarding.rs b/crates/node/src/migration_service/onboarding.rs index 31dfa3769..dadd0f730 100644 --- a/crates/node/src/migration_service/onboarding.rs +++ b/crates/node/src/migration_service/onboarding.rs @@ -5,7 +5,9 @@ use backon::{ExponentialBuilder, Retryable}; use ed25519_dalek::VerifyingKey; use futures::TryFutureExt; use near_account_id::AccountId; +use near_mpc_contract_interface::types::Ed25519PublicKey; use near_mpc_crypto_types::Keyset; +use tee_authority::tee_authority::TeeAuthority; use tokio::sync::{RwLock, watch}; use tokio_util::sync::CancellationToken; @@ -17,6 +19,7 @@ use crate::{ }, keyshare::{Keyshare, KeyshareStorage}, migration_service::types::{MigrationInfo, OnboardingJob, OnboardingTask}, + tee::remote_attestation::submit_attestation_before_concluding_migration, }; /// Waits until the node becomes an active participant in the current epoch or @@ -25,6 +28,7 @@ use crate::{ /// runs onboarding tasks as needed. /// /// Returns `Ok(())` when this node is an active participant in the current epoch. +#[expect(clippy::too_many_arguments)] pub(crate) async fn onboard( contract_state_receiver: watch::Receiver, my_migration_info_receiver: watch::Receiver, @@ -33,6 +37,8 @@ pub(crate) async fn onboard( tx_sender: impl TransactionSender, keyshare_storage: Arc>, keyshare_receiver: watch::Receiver>, + tee_authority: TeeAuthority, + account_public_key: Ed25519PublicKey, ) -> anyhow::Result<()> { tracing::info!(?my_near_account_id, "starting onboarding"); let (cancel_monitoring_task, mut onboarding_job_receiver) = start_onboarding_monitoring_task( @@ -68,6 +74,9 @@ pub(crate) async fn onboard( tx_sender.clone(), my_migration_info_receiver.clone(), cancellation_token.clone(), + tee_authority.clone(), + (&tls_public_key).into(), + account_public_key.clone(), ) .await; if cancellation_token.is_cancelled() { @@ -230,6 +239,7 @@ async fn wait_for_active_migration_to_clear( /// This function returns Ok(()) if it is cancelled or succeeds. /// /// **Not cancellation-safe!** Needs to be cancelled via `cancel_import_token` +#[expect(clippy::too_many_arguments)] async fn execute_onboarding( importing_keyset: Keyset, keyshare_storage: Arc>, @@ -237,6 +247,9 @@ async fn execute_onboarding( tx_sender: impl TransactionSender, my_migration_info_receiver: watch::Receiver, cancel_import_token: CancellationToken, + tee_authority: TeeAuthority, + tls_public_key: Ed25519PublicKey, + account_public_key: Ed25519PublicKey, ) -> anyhow::Result<()> { if keyshare_storage .read() @@ -254,6 +267,34 @@ async fn execute_onboarding( .await?; } + // Submit a fresh attestation before concluding. Back-migration can + // otherwise hit the case where the destination's stored on-chain + // attestation is past expiry by the contract's `current_time_seconds`, + // and `reverify_participants` rejects the conclude (see #2121). Failure + // here is logged but non-fatal: in the non-back-migration path the + // existing on-chain attestation is typically still valid, so we fall + // through to `retry_conclude_onboarding`. + tokio::select! { + result = submit_attestation_before_concluding_migration( + tee_authority, + tx_sender.clone(), + tls_public_key, + account_public_key, + ) => { + if let Err(err) = result { + tracing::warn!( + ?err, + "failed to refresh attestation before concluding migration; \ + proceeding with existing on-chain attestation" + ); + } + } + _ = cancel_import_token.cancelled() => { + tracing::info!("attestation refresh cancelled"); + return Ok(()); + } + } + tokio::select! { _ = retry_conclude_onboarding( importing_keyset, diff --git a/crates/node/src/run.rs b/crates/node/src/run.rs index edf158e5d..8a1688487 100644 --- a/crates/node/src/run.rs +++ b/crates/node/src/run.rs @@ -394,13 +394,16 @@ where let allowed_docker_images_receiver_clone = indexer_api.allowed_docker_images_receiver.clone(); let allowed_launcher_compose_receiver_clone = indexer_api.allowed_launcher_compose_receiver.clone(); + let tee_authority_for_monitor = tee_authority.clone(); + let tls_public_key_for_monitor = tls_public_key.clone(); + let account_public_key_for_monitor = account_public_key.clone(); tokio::spawn(async move { if let Err(e) = monitor_attestation_removal( account_id_clone, - tee_authority, + tee_authority_for_monitor, tx_sender_clone, - tls_public_key, - account_public_key, + tls_public_key_for_monitor, + account_public_key_for_monitor, allowed_docker_images_receiver_clone, allowed_launcher_compose_receiver_clone, tee_accounts_receiver, @@ -425,6 +428,8 @@ where indexer_api.my_migration_info_receiver.clone(), indexer_api.contract_state_receiver.clone(), indexer_api.txn_sender.clone(), + tee_authority, + account_public_key, ) .await?; diff --git a/crates/node/src/tee/remote_attestation.rs b/crates/node/src/tee/remote_attestation.rs index caaf85824..88fdeb1dc 100644 --- a/crates/node/src/tee/remote_attestation.rs +++ b/crates/node/src/tee/remote_attestation.rs @@ -22,6 +22,16 @@ use near_account_id::AccountId; use near_mpc_contract_interface::types::NodeId; use tokio::sync::watch; +/// Upper bound on how long +/// `submit_attestation_before_concluding_migration` will keep retrying its +/// underlying `submit_remote_attestation` call. The helper is documented as +/// best-effort and non-fatal (the caller swallows its errors); bounding here +/// prevents an unresponsive `TransactionSender` (e.g. the test mock) from +/// hanging the onboarding flow indefinitely. `MAX_RETRY_DURATION` (12 h) +/// would be the long-running production behavior for the other callers of +/// `submit_remote_attestation`, which we keep unchanged. +const SUBMIT_ATTESTATION_BEFORE_CONCLUDE_TIMEOUT: Duration = Duration::from_secs(30); + const MIN_BACKOFF_DURATION: Duration = Duration::from_millis(100); const MAX_BACKOFF_DURATION: Duration = Duration::from_secs(60); const MAX_RETRY_DURATION: Duration = Duration::from_secs(60 * 60 * 12); // 12 hours. @@ -89,6 +99,44 @@ pub async fn submit_remote_attestation( .context("failed to submit attestation after multiple retry attempts")? } +/// Generates a fresh attestation and submits it on-chain before this node +/// concludes a back-migration. +/// +/// Without this, back-migration can hit a case where the destination's +/// stored attestation is already past expiry by the contract's +/// `current_time_seconds` at conclude time, so `reverify_participants` +/// rejects with `InvalidTeeRemoteAttestation` and `retry_conclude_onboarding` +/// loops forever on the same stale state (see near/mpc#2121). +/// +/// `submit_remote_attestation` waits for `TransactionStatus::Executed`, so +/// by the time this function returns Ok the contract holds a fresh +/// attestation under `tls_public_key`. +pub async fn submit_attestation_before_concluding_migration( + tee_authority: TeeAuthority, + tx_sender: impl TransactionSender, + tls_public_key: Ed25519PublicKey, + account_public_key: Ed25519PublicKey, +) -> anyhow::Result<()> { + let report_data: ReportData = + ReportDataV1::new(*tls_public_key.as_bytes(), *account_public_key.as_bytes()).into(); + let attestation = tee_authority + .generate_attestation(report_data) + .await + .map_err(|e| anyhow::anyhow!(e)) + .context("generate fresh attestation for conclude_node_migration")?; + tokio::time::timeout( + SUBMIT_ATTESTATION_BEFORE_CONCLUDE_TIMEOUT, + submit_remote_attestation(tx_sender, attestation, tls_public_key), + ) + .await + .map_err(|_| { + anyhow::anyhow!( + "submit_remote_attestation did not complete within {SUBMIT_ATTESTATION_BEFORE_CONCLUDE_TIMEOUT:?}; \ + best-effort attestation refresh giving up" + ) + })? +} + fn validate_remote_attestation( attestation: &Attestation, tls_public_key: Ed25519PublicKey, @@ -536,6 +584,42 @@ mod tests { ); } + #[tokio::test] + #[expect(non_snake_case)] + async fn submit_attestation_before_concluding_migration__should_submit_one_fresh_attestation() { + // Given + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + let tls_public_key: Ed25519PublicKey = + (&SigningKey::generate(&mut rng).verifying_key()).into(); + let account_public_key: Ed25519PublicKey = + (&SigningKey::generate(&mut rng).verifying_key()).into(); + let tee_authority = TeeAuthority::from(LocalTeeAuthorityConfig::default()); + let node_id = NodeId { + account_id: "dummy.near".parse().unwrap(), + tls_public_key: tls_public_key.clone(), + account_public_key: account_public_key.clone(), + }; + let (tee_accounts_sender, _) = watch::channel(vec![]); + let sender = MockSender::new(tee_accounts_sender, node_id); + + // When + submit_attestation_before_concluding_migration( + tee_authority, + sender.clone(), + tls_public_key, + account_public_key, + ) + .await + .expect("refresh-before-conclude should succeed against a valid TeeAuthority"); + + // Then + assert_eq!( + sender.count(), + 1, + "expected exactly one SubmitParticipantInfo tx before conclude_node_migration" + ); + } + #[tokio::test] async fn test_validate_remote_attestation_valid() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); diff --git a/crates/node/src/tests.rs b/crates/node/src/tests.rs index 12d7c609f..3009a35fe 100644 --- a/crates/node/src/tests.rs +++ b/crates/node/src/tests.rs @@ -131,6 +131,15 @@ impl OneNodeTestConfig { .expect("require keystore for integration tests"), )); + let account_public_key = (&self + .secrets + .persistent_secrets + .near_signer_key + .verifying_key()) + .into(); + let tee_authority = tee_authority::tee_authority::TeeAuthority::from( + tee_authority::tee_authority::LocalTeeAuthorityConfig::default(), + ); spawn_recovery_server_and_run_onboarding( self.config.migration_web_ui, (&self.secrets).into(), @@ -139,6 +148,8 @@ impl OneNodeTestConfig { self.indexer.my_migration_info_receiver.clone(), self.indexer.contract_state_receiver.clone(), self.indexer.txn_sender.clone(), + tee_authority, + account_public_key, ) .await .unwrap(); diff --git a/crates/node/src/tests/common.rs b/crates/node/src/tests/common.rs index a2629d42c..eaabad7aa 100644 --- a/crates/node/src/tests/common.rs +++ b/crates/node/src/tests/common.rs @@ -23,8 +23,13 @@ impl TransactionSender for MockTransactionSender { async fn send_and_wait( &self, - _transaction: ChainSendTransactionRequest, + transaction: ChainSendTransactionRequest, ) -> Result { - unimplemented!() + // Forward to `send` so the test still observes the transaction on the + // `transaction_sender` channel, then report it as `Executed`. This is + // enough for callers like `submit_remote_attestation` that gate on + // `Executed` vs `NotExecuted` rather than the actual on-chain effect. + self.send(transaction).await?; + Ok(TransactionStatus::Executed) } } diff --git a/deny.toml b/deny.toml index 3a16dc3e5..38ec2417c 100644 --- a/deny.toml +++ b/deny.toml @@ -23,6 +23,7 @@ ignore = [ "RUSTSEC-2022-0054", # wee_alloc is unmaintained "RUSTSEC-2025-0134", # rustls-pemfile is unmaintained "RUSTSEC-2024-0370", # proc-macro-error is unmaintained + "RUSTSEC-2026-0173", # proc-macro-error2 is unmaintained — same situation as RUSTSEC-2024-0370, pulled in transitively "RUSTSEC-2024-0436", # paste is unmaintained "RUSTSEC-2024-0384", # instant is unmaintained "RUSTSEC-2026-0009", # TODO(#1999): Remove once MSVR is bumped to 1.88 diff --git a/docs/investigation/2121-back-migration-e2e-flake.md b/docs/investigation/2121-back-migration-e2e-flake.md new file mode 100644 index 000000000..4c3211033 --- /dev/null +++ b/docs/investigation/2121-back-migration-e2e-flake.md @@ -0,0 +1,385 @@ +# CI E2E flake diagnostic — `migration_service__should_handle_back_migration_a_to_b_to_a` + +The `back-migration` E2E test on PR #3362 fails with two distinct nearcore panics — both triggered by stopping an mpc-node process during active block production followed by an immediate restart. The same test passes ~100% on the baseline branch (no #3362 code), so the bug is upstream in nearcore — but PR #3362's `submit_attestation_before_concluding_migration` is necessary to reach it, and the failure rate also depends on what activity happens on chain between forward conclude and the restart. + +## Latest state (as of 2026-06-08) + +The bug is upstream in nearcore — confirmed empirically with maximal embedder-side mitigation: + +- PR #3362's branch is now stacked on **PR #3410** (real SIGTERM handler) + **PR #3486** (full indexer-thread drain — `block_until_all_instances_are_dropped()` returns cleanly in 2–6 s, all `Arc` refs released, indexer's tokio runtime dropped). nearcore is resolved to `2.12.0` (the final tag, commit `1144e31`). +- **Latest CI run confirms the bug reproduces on this exact stack.** Commit `3a2ceafe` (PR #3362 head after the rebase), e2e job: the back-migration test failed at `migration_service.rs:771` with the upstream nearcore mode-A panic — `streamer/mod.rs:207` `` `receipt` must be present at this moment `` — captured in full via the new pre/post-restart stdout/stderr diagnostic. +- Earlier, on the prototype branch's `607e3e21` (against nearcore `2.12.0-rc.2`), a 5-run campaign showed 1/5 pass, 4/5 fail with the same panic even after a verified 2–6 s clean graceful shutdown. See "Real SIGTERM handler in mpc-node — also does not fix it" below. +- The bug is **latent on current `main`** because the trigger (`submit_attestation_before_concluding_migration`, in PR #3362) hasn't merged. A 5-run confirmation campaign on `main` + PR #3486 (no #3362 trigger) showed **0/5 fail**, re-confirming PR #3373's revert-experiment result. +- Filed upstream as [`near/nearcore#15867`](https://github.com/near/nearcore/issues/15867) (Zulip thread: [near-indexer panic in MPC node](https://near.zulipchat.com/#narrow/channel/469556-community-support/topic/near-indexer.20panic.20in.20MPC.20node)). Write-up was prepared in [`nearcore-indexer-sigkill-restart-panic.md`](./nearcore-indexer-sigkill-restart-panic.md). + +> **Headline statistics.** +> - **PR #3365 pre-merge** (no #3362 code): 0/15 fail. +> - **PR #3362 against nearcore `aab31b0`** (initial branch state): 5/6 fail (~83%). +> - **Revert experiment PR #3373** (PR #3362 minus the node-side fix, otherwise identical): 0/5 fail. +> - **PR #3362 against nearcore `fadb5c1`** (after merging main, `Cargo.lock` bump within `2.12.0-rc.1`): back-migration test ~11/17 fail (~65%) combined across four sample batches. +> - **PR #3486 against main, no #3362 trigger present** (5 runs on commit `7fecead3`): **0/5 fail**. Re-confirms the revert experiment — without the trigger, the upstream race isn't reached. +> - **Focused-repro matrix** at the same commit — full 2⁴ on `(pre-sign, pre-ckd, post-sign, post-ckd)` plus SIGTERM counterparts for the worst variant and the back-migration test: +> +> | # | pre-sign | pre-ckd | post-sign | post-ckd | Kill | Fail rate | +> |---|---|---|---|---|---|---| +> | 0 | — | — | — | — | SIGKILL | 0 / 20 | +> | 1–6 | (1-of-4 or 2-of-4 activity, all combos) | | | | SIGKILL | **all 0%** | +> | 7 | ✓ | — | ✓ | — | SIGKILL | 1 / 3 (small sample) | +> | 10–11 | (other 2-of-4 combos) | | | | SIGKILL | 0% | +> | 12 | — | ✓ | ✓ | ✓ | SIGKILL | 1 / 10 (~10%) | +> | 13 | ✓ | ✓ | ✓ | — | SIGKILL | 3 / 10 (~30%) | +> | 14 | ✓ | ✓ | — | ✓ | SIGKILL | 3 / 10 (~30%) | +> | **15** | ✓ | ✓ | ✓ | ✓ | SIGKILL | **7 / 10 (~70%)** | +> | 15-T | ✓ | ✓ | ✓ | ✓ | **SIGTERM 30s, no handler** | **2 / 2 (small)** | +> | back-mig-T | ✓ | ✓ | ✓ | ✓ + back round | **SIGTERM 30s, no handler** | **2 / 2 (small)** | +> +> The revert experiment proves PR #3362's code is *necessary* to reach the bug. The focused-repro matrix proves it isn't *sufficient* on its own — all four activity types (pre-sign + pre-ckd + post-sign + post-ckd) need to be present to reach ~70% reproduction. Drop any one and the rate falls to ~30% or zero. The back-migration round itself adds nothing. +> +> **No teardown change prevents the panic — including a real graceful shutdown.** Initial SIGTERM-at-30s observations failed 2/2, but at that time `mpc-node` had no SIGTERM handler so the "graceful" path was effectively identical to SIGKILL. We later landed a real SIGTERM handler ([PR #3410](https://github.com/near/mpc/pull/3410)) plus full indexer-thread drain ([PR #3486](https://github.com/near/mpc/pull/3486)) — diagnostic confirms the indexer's tokio runtime is dropped, every `Arc` is released, and `block_until_all_instances_are_dropped()` returns in 2–6 s. The panic still fires at ~80% on the back-migration test. Drain-via-`listen_blocks.flag` then SIGKILL fails 5/5 — the flag only pauses our consumer-side; the panic is on nearcore's producer-side. The fix has to be in nearcore. + +--- + +## Overview of failures so far + +| Attempt | Run | Commit | Surface | +|---|---|---|---| +| 1 | [26471648821](https://github.com/near/mpc/actions/runs/26471648821) | `990cad3a` | "Connection refused" on `/debug/migrations` — A0 process already exited by the time PR #3365's diagnostic ran. (Pre-3365 helper, opaque). | +| 2 | [26473240463](https://github.com/near/mpc/actions/runs/26473240463) | `f947ff9e` | Same as attempt 1. | +| 3 | [26495701290](https://github.com/near/mpc/actions/runs/26495701290) | `3a63c918` | `wait_for_node_indexer_height_above` timed out; first run on top of #3365's branch — diagnostic surfaced "node may have exited" but no stderr dump yet. | +| 4 | [26498637558](https://github.com/near/mpc/actions/runs/26498637558) | `d9707221` (rebased on #3365) | First stderr-tail dump: **`streamer/mod.rs:207` — "receipt must be present"**. Pre-kill height 312. | +| — | [26497683232](https://github.com/near/mpc/actions/runs/26497683232) | `d9707221` (lint-failed CI run) | **E2E passed.** Same commit as run #4 but a different attempt. | +| 5 (attempt 1) | [26501678885](https://github.com/near/mpc/actions/runs/26501678885) | `cc3eeae0` (rebased on main after #3365 merge) | Second stderr-tail dump: **`client_actor.rs:217` — `StorageInconsistentState ... No ChunkExtra`**. Pre-kill height 464. | +| 5 (rerun #1) | 26501678885 attempt 2 | `cc3eeae0` | **`streamer/mod.rs:207`**. Pre-kill height 305. | +| 5 (rerun #2) | 26501678885 attempt 3 | `cc3eeae0` | **`streamer/mod.rs:207`**. Pre-kill height 313. | +| 5 (rerun #3) | 26501678885 attempt 4 | `cc3eeae0` | **`streamer/mod.rs:207`**. Pre-kill height 304. | +| 5 (rerun #4) | 26501678885 attempt 5 | `cc3eeae0` | **`streamer/mod.rs:207`**. Pre-kill height 306. | + +### Statistics on the cc3eeae0 commit (nearcore `aab31b0`) + +| Failure mode | Count | Pre-kill heights | +|---|---|---| +| A — `streamer/mod.rs:207` (receipt missing) | 5 | 304, 305, 306, 312, 313 (tight cluster) | +| B — `client_actor.rs:217` (StorageInconsistentState) | 1 | 464 (outlier high) | + +Mode A dominates and clusters tightly around height ~305. Mode B has only surfaced once and only at the highest pre-kill height we've seen — weak circumstantial evidence that B is reachable when there's more state on disk to corrupt, A is the "default" failure when state is fresher. + +### Statistics after merging main (`e610ffb7`, nearcore `fadb5c1`) + +Same nearcore tag (`2.12.0-rc.1`), different resolved commit (`Cargo.lock` was bumped by main's recent dependency updates). 9 sequential CI runs: + +| # | `forward_migration_kill_restart` | `should_handle_back_migration_a_to_b_to_a` | Pre-kill height | +|---|---|---|---| +| 1 | PASS (36.1 s) | FAIL — `streamer/mod.rs:207` | 303 | +| 2 | PASS (36.2 s) | PASS (60.2 s) | — | +| 3 | PASS (37.6 s) | PASS (58.8 s) | — | +| 4 | PASS (37.2 s) | FAIL — `streamer/mod.rs:207` | 313 | +| 5 | PASS (37.3 s) | PASS (58.8 s) | — | +| 6 | PASS (36.2 s) | FAIL — `streamer/mod.rs:207` | 311 | +| 7 | PASS (36.8 s) | FAIL — `streamer/mod.rs:207` | (mode A) | +| 8 | PASS (37.8 s) | FAIL — `streamer/mod.rs:207` | (mode A) | +| 9 | PASS (36.7 s) | PASS (58.0 s) | — | + +| | Pass | Fail | Failure rate | +|---|---|---|---| +| `forward_migration_kill_restart` | 9 | 0 | 0% | +| `should_handle_back_migration_a_to_b_to_a` | 4 | 5 | ~55% | + +All 5 failures are mode A at the same `streamer/mod.rs:207` line as before. No mode B observed in this batch. + +--- + +## Failure mode A — `streamer/mod.rs:207` (in-memory race during catch-up) + +**Run:** [26498637558 job 78033060779](https://github.com/near/mpc/actions/runs/26498637558/job/78033060779) +**Commit:** `04971e0c` (PR #3362, branch `barak/2121-contract-stale-attestation-test` rebased onto #3365's `fix/migration-back-wait-indexer-ready`) + +``` +thread 'migration_service::migration_service__should_handle_back_migration_a_to_b_to_a' (651425) panicked at crates/e2e-tests/tests/migration_service.rs:744:6: +A0's indexer did not resume + advance within 60s after restart: node 0 indexer did not advance past height 312 within 60s +--- last 16KB of node 0 stderr.log (#3366 diagnostics) --- + +thread 'tokio-rt-worker' (886644) panicked at /home/runner/.cargo/git/checkouts/nearcore-86558fdb18093f53/aab31b0/chain/indexer/src/streamer/mod.rs:207:42: +`receipt` must be present at this moment +stack backtrace: + 0: __rustc::rust_begin_unwind + 1: core::panicking::panic_fmt + 2: core::option::expect_failed + 3: as core::iter::traits::iterator::Iterator>::fold + 4: near_indexer::streamer::build_streamer_message::{{closure}} + 5: near_indexer::streamer::start::{{closure}} + 6: tokio::runtime::task::core::Core::poll + 7: tokio::runtime::task::harness::Harness::poll + 8: tokio::runtime::scheduler::multi_thread::worker::Context::run_task + 9: tokio::runtime::scheduler::multi_thread::worker::Context::run + 10: tokio::runtime::context::scoped::Scoped::set + 11: tokio::runtime::context::runtime::enter_runtime + 12: tokio::runtime::scheduler::multi_thread::worker::run + 13: as core::future::future::Future>::poll + 14: tokio::runtime::task::core::Core::poll + 15: tokio::runtime::task::harness::Harness::poll + 16: tokio::runtime::blocking::pool::Inner::run +note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. + +--- end stderr.log --- + +Caused by: + indexer block-height metric not available — node may have exited +``` + +### Interpretation + +- An `expect("receipt must be present at this moment")` deep inside `near_indexer::streamer::build_streamer_message` fires while building a streamer message — a receipt referenced by a block isn't present in the lookup the indexer is doing. +- The panic kills the `tokio-rt-worker` carrying nearcore's indexer streamer task. That thread is the only producer for the `StreamerMessage` channel that mpc-node's `listen_blocks` consumes — so the consumer stops receiving messages and `MPC_INDEXER_LATEST_BLOCK_HEIGHT` stops advancing. +- The mpc-node main process keeps running just long enough that PR #3365's `wait_for_node_indexer_height_above` helper sees a stale metric. Then the metric server itself becomes unscrapable and the helper's fallback ("node may have exited") fires. + +The class of bug here is *in-memory racy state during block streaming after kill+restart catch-up* — likely because nearcore's indexer accumulates state from blocks as they're produced and then loses it when SIGKILL'd mid-build. + +--- + +## Failure mode B — `client_actor.rs:217` (on-disk inconsistency at startup) + +**Run:** [26501678885 job 78043485451](https://github.com/near/mpc/actions/runs/26501678885/job/78043485451) +**Commit:** `cc3eeae0` (PR #3362 rebased on main after #3365 was squash-merged) + +``` +thread 'migration_service::migration_service__should_handle_back_migration_a_to_b_to_a' (501699) panicked at crates/e2e-tests/tests/migration_service.rs:744:6: +A0's indexer did not resume + advance within 60s after restart: node 0 indexer did not advance past height 464 within 60s +--- last 16KB of node 0 stderr.log (#3366 diagnostics) --- + +thread '' (749092) panicked at /home/runner/.cargo/git/checkouts/nearcore-86558fdb18093f53/aab31b0/chain/client/src/client_actor.rs:217:6: +called `Result::unwrap()` on an `Err` value: Chain(StorageError(StorageInconsistentState("No ChunkExtra for block 4cqR4KRwGv92jgnsFLzmJvDdNy7hj5JGUak8nY1tWsVu in shard s0.v0"))) +stack backtrace: + 0: __rustc::rust_begin_unwind + 1: core::panicking::panic_fmt + 2: core::result::unwrap_failed + 3: near_client::client_actor::start_client + 4: nearcore::start_with_config_and_synchronization_impl::{{closure}} + 5: mpc_node::indexer::real::spawn_real_indexer::{{closure}}::{{closure}} + 6: tokio::runtime::runtime::Runtime::block_on + +thread 'main' (749090) panicked at crates/node/src/indexer/real.rs:225:10: +txn_sender is returned from the `block_on` expression above.: RecvError(()) +stack backtrace: + 0: __rustc::rust_begin_unwind + 1: core::panicking::panic_fmt + 2: core::result::unwrap_failed + 3: mpc_node::indexer::real::spawn_real_indexer + 4: mpc_node::run::run_mpc_node::{{closure}} + 5: std::thread::local::LocalKey::with + 6: futures_executor::local_pool::block_on + 7: mpc_node::main +note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. + +--- end stderr.log --- + +Caused by: + indexer block-height metric not available — node may have exited +``` + +### Interpretation + +- `near_client::client_actor::start_client` does an `unwrap()` (at `chain/client/src/client_actor.rs:217`) on a `Result<_, near_chain::Error>` and that result is `Err(StorageError(StorageInconsistentState("No ChunkExtra for block in shard s0.v0")))`. +- `ChunkExtra` is metadata persisted to RocksDB *after* a chunk is processed. If it's missing for a block neard tries to load on startup, that means the previous run was SIGKILL'd mid-write or some other ordering invariant was violated. The recovery code expects this to always be present. +- This panic happens **at startup, before block streaming even begins**. neard never finishes booting. So the indexer never starts streaming → `txn_sender_sender` is dropped without ever being used → mpc-node's main thread `blocking_recv()` returns `RecvError(())` → secondary panic at `crates/node/src/indexer/real.rs:225` (the `.expect("txn_sender is returned from the 'block_on' expression above.")`). + +The class of bug here is *on-disk inconsistency that nearcore's recovery code can't tolerate* — a missing `ChunkExtra` should either be retryable or the recovery code should gracefully reconstruct it, not unwrap. + +--- + +## What relates PR #3362's code to the failure + +The only operational change #3362 introduces during the forward round is one extra `SubmitParticipantInfo` transaction (from `submit_attestation_before_concluding_migration`) submitted via `submit_remote_attestation` immediately before `ConcludeNodeMigration`. That helper uses `send_and_wait`, so the conclude transaction follows in a *separate* near-future block. + +### Revert experiment — hypothesis 1 confirmed + +[PR #3373](https://github.com/near/mpc/pull/3373) was created with exactly one commit reverted from PR #3362: the `submit_attestation_before_concluding_migration` change. Everything else stayed identical (contract sandbox tests, stderr-tail diagnostic, lint fixes, the merge of #3365's flake-fix from main). Five sequential CI runs were observed: + +| Run | Workflow attempt | E2E result | +|---|---|---| +| 1 | 26505165380 attempt 1 | ✅ pass | +| 2 | 26505165380 attempt 2 | ✅ pass | +| 3 | 26505165380 attempt 3 | ✅ pass | +| 4 | 26505165380 attempt 4 | ✅ pass | +| 5 | 26505165380 attempt 5 | ✅ pass | + +**Result: 0/5 fail.** Combined with the 5/6 failure rate on #3362, the experiment definitively isolates the trigger to that single commit. The other two hypotheses — nonce contention with `periodic_attestation_submission`, and a chain-depth threshold — are ruled out, because the experiment branch produces near-identical block counts (the only saved transactions are the per-conclude `SubmitParticipantInfo` ones). + +### Concrete trigger pattern + +The minimal repro shape, with each step described in terms an upstream nearcore reader can act on: + +1. An mpc-node binary (which embeds `near-indexer`) is running and producing blocks. +2. The node submits two function-call transactions back-to-back from the **same signer** to the **same contract**, the second only after the first reaches `TransactionStatus::Executed`. The two transactions therefore land in different but very close blocks. +3. **Additional same-signer function-call activity continues for ~10–20 s after the two-tx pattern.** Without this extra activity the bug is much less reachable — see the focused-repro experiment below. +4. The neard process is sent SIGKILL while the indexer is mid-block-stream. +5. The neard process is restarted against the same `home_dir`. +6. On restart, the embedded `near-indexer` panics with either: + - `chain/indexer/src/streamer/mod.rs:207` — "`receipt` must be present at this moment" (the common case, mode A), or + - `chain/client/src/client_actor.rs:217` — `StorageInconsistentState("No ChunkExtra for block in shard s0.v0")` (rarer, mode B; correlates with higher pre-kill height). + +### Focused-repro experiments — isolating the trigger + +A family of focused tests was added next to the back-migration test. Each runs the same forward migration + node kill + restart + indexer-progress assertion, varying only the sign / CKD activity around the migration. The full 2⁴ matrix on `(pre-sign, pre-ckd, post-sign, post-ckd)` is covered, plus SIGTERM counterparts for the most-failing variant and the back-migration test. + +`pre-forward` activity is signed by A0 + A1 while A0 is still an active participant. `post-forward` activity is signed by B0 + A1 after A0 has been demoted by `conclude_node_migration`. + +#### Consolidated results across four campaigns + +Aggregated across campaign 1 (variants 0–3, 5 runs), campaign 2 (0–6, 5 runs), campaign 3 (0–15 + SIGTERMs, 5 runs), and campaign 4 (full set, 5 runs, with `gh run rerun` instead of `--failed` to defeat nextest's retry-filtering). Variants added later have fewer total observations because nextest's per-rerun test filtering still partially skipped passing tests on some attempts. + +| # | pre-sign | pre-ckd | post-sign | post-ckd | Kill mode | Fail / Total | Fail rate | +|---|---|---|---|---|---|---|---| +| 0 | — | — | — | — | SIGKILL | 0 / 20 | **0%** | +| 1 | — | — | ✓ | — | SIGKILL | 0 / 12 | **0%** | +| 2 | — | — | — | ✓ | SIGKILL | 0 / 20 | **0%** | +| 3 | — | — | ✓ | ✓ | SIGKILL | 1 / 12 | ~8% | +| 4 | ✓ | — | — | — | SIGKILL | 0 / 11 | **0%** | +| 5 | — | ✓ | — | — | SIGKILL | 0 / 15 | **0%** | +| 6 | ✓ | ✓ | — | — | SIGKILL | 0 / 7 | **0%** | +| 7 | ✓ | — | ✓ | — | SIGKILL | 1 / 3 | (small sample) | +| 8 | ✓ | — | — | ✓ | SIGKILL | 0 / 3 | (small) | +| 9 | ✓ | — | ✓ | ✓ | SIGKILL | 0 / 3 | (small) | +| 10 | — | ✓ | ✓ | — | SIGKILL | 0 / 9 | **0%** | +| 11 | — | ✓ | — | ✓ | SIGKILL | 0 / 10 | **0%** | +| 12 | — | ✓ | ✓ | ✓ | SIGKILL | 1 / 10 | ~10% | +| 13 | ✓ | ✓ | ✓ | — | SIGKILL | 3 / 10 | ~30% | +| 14 | ✓ | ✓ | — | ✓ | SIGKILL | 3 / 10 | ~30% | +| **15** | ✓ | ✓ | ✓ | ✓ | SIGKILL | **7 / 10** | **~70%** | +| 15-T | ✓ | ✓ | ✓ | ✓ | SIGTERM 30s | **2 / 2** | (small) | +| back-mig | ✓ | ✓ | ✓ | ✓ + back round | SIGKILL | ~11 / ~17 | **~65%** | +| back-mig-T | ✓ | ✓ | ✓ | ✓ + back round | SIGTERM 30s | **2 / 2** | (small) | + +#### Observations + +1. **Variant 15 is the smoking-gun repro.** Same `(pre, post)` activity profile as the back-migration test, same panic, same failure rate (~70%). The back-migration round itself adds nothing to the failure rate — the bug is reachable without it. +2. **All four activity types are needed simultaneously.** Drop any one of the four (variants 12, 13, 14, 9) and the rate falls to ~10–30% or zero. +3. **2-of-4 or fewer almost never fires.** Variants 3 and 7 each fired once across a few observations; too small to call a true rate, but consistent with "you need at least three of the four kinds of activity to make the panic reachable at all." +4. **SIGTERM is *not* a clean fix** at 30 s grace. Both observations of the variant-15 SIGTERM counterpart and the back-migration SIGTERM counterpart failed at ~101 s with the same `streamer/mod.rs:207` panic. Strong hint (small sample, but two-for-two) that graceful shutdown doesn't address the upstream bug — either neard's graceful path also has the issue, or 30 s isn't enough time to flush whatever state needs flushing. +5. **Every observed panic is mode A** (`streamer/mod.rs:207` "receipt must be present at this moment") across the focused matrix. Mode B (`StorageInconsistentState`) appeared once early in the investigation but hasn't recurred in the focused-test campaigns — which fits if mode B requires more on-disk chain depth than the focused setup produces. + +#### Three plausible explanations, refined + +1. **Write-rate effect.** More function-calls per second → more block writes → larger chance SIGKILL lands mid-write. Doesn't explain why variants 0–6 never fire while variant 15 fires ~70%, since the per-second write rate doesn't differ that much between, say, variants 13 (3 of 4 activity types) and 15 (4 of 4). Demoted. +2. **Receipt-graph effect (most actionable).** `sign_respond` and `respond_ckd` produce a specific receipt-chain shape (request receipt → MPC respond → state-cleanup receipt) that the indexer tracks in-memory. The `streamer/mod.rs:207` panic site is inside a `.fold` over a receipt collection — a specific receipt-graph topology likely leaves the collection with a "referenced-but-absent" gap when SIGKILL hits. The variant-15 result is consistent with this: both pre- and post-forward, both sign- and ckd-shaped, are needed to set up a graph topology the recovery code can't reconstruct. +3. **Time-based effect.** Neard runs periodic background work. The variant-13/14 rates (~30 % each, both with 3-of-4 activity) and variant-15 rate (~70 %, 4-of-4) are consistent with timing — more activity = more chance SIGKILL falls in a fragile window — but the receipt-graph explanation accounts for the same pattern without requiring an unrelated time mechanism. + +The receipt-graph hypothesis remains the best fit. SIGTERM failing at 30 s grace strengthens it: if the issue is purely write-timing, graceful shutdown should have fixed it. The fact that it didn't suggests the in-memory streamer state — not just the on-disk RocksDB — is the broken thing. + +### Drain-then-kill experiment — also does not fix it + +Following the SIGTERM-still-fails result, we tried draining each node before SIGKILL: write `listen_blocks.flag = false` to pause our `listen_blocks` consumer, poll the indexer block-height metric until it stops advancing for 3 seconds (30 s timeout), *then* SIGKILL. The intuition: pause traffic, let the indexer reach an idle state, then kill — matching how operators do production rolling restarts. + +Results across 5 of 5 runs on commit `89a7341d`: + +| Test | Pass | Fail | Fail rate | +|---|---|---|---| +| `forward_migration_kill_restart_with_pre_both_and_post_both` + drain | 0 | 5 | **100%** | +| `should_handle_back_migration_a_to_b_to_a` + drain at both kill sites | 0 | 5 | **100%** | + +Every failure is the same `streamer/mod.rs:207` panic. Timings increased from ~102 s to 106–127 s, which confirms the drain step ran for several seconds — it just didn't help. + +#### Why it didn't work + +The drain pauses our **consumer-side** of the block-update channel (`listen_blocks` in `crates/node/src/indexer/handler.rs`). The panic, however, lives in the **producer-side** task — `near_indexer::streamer::start::{{closure}}` calling `build_streamer_message` inside nearcore itself. Pausing our consumer doesn't stop nearcore's streamer from continuing to produce messages or from writing whatever state it writes before SIGKILL. So the on-disk state we restart against is the same with or without the drain. + +#### Updated picture of the failure mode + +This is now the third teardown strategy we've tried, all failing: + +| Approach | Outcome | Rules out | +|---|---|---| +| Plain SIGKILL | Fails ~70% (variant 15) / ~65% (back-mig) | nothing | +| SIGTERM, 30 s grace | Fails 2 / 2 observed | nothing — `mpc-node` has no SIGTERM handler, so the "graceful" path was actually identical to SIGKILL | +| Drain consumer + SIGKILL | Fails 5 / 5 observed | "the bug is about in-flight consumer-side work" | + +What remains as plausible mechanism: the panic-causing state is written *during normal block production* by nearcore's streamer task, persisted in RocksDB or in-memory in a way that needs a graceful shutdown of *nearcore's own* task to drain. From a test-side perspective, we have no mechanism that can drain that task short of waiting for neard to genuinely finish a graceful shutdown — which SIGTERM-at-30-s either didn't trigger or wasn't given enough time to finish. + +### What this means for the bug location + +The panics live in nearcore code. PR #3362's two-tx pattern is necessary to reach them; the back-migration test's sustained sign + ckd activity (both pre- and post-forward) is also necessary. That makes this a real upstream defect in nearcore 2.12.0-rc.1's restart/recovery path — not a defect in #3362. **No test-side teardown change we've tried (SIGTERM, drain) prevents the panic.** This is now solid evidence that the bug must be fixed in nearcore itself; we can't paper over it from the e2e test without losing the kill+restart semantics the test exists to model. + +### Real SIGTERM handler in mpc-node — also does not fix it + +Earlier rows in the table above show "SIGTERM with 30 s grace" failing 2/2, but that was misleading: mpc-node had no SIGTERM handler installed, so the OS terminated the process immediately and our "graceful" path was indistinguishable from SIGKILL. The data couldn't distinguish "graceful shutdown doesn't help" from "we never actually tested graceful shutdown." + +To close that gap we wrote a real SIGTERM handler in mpc-node ([issue #3409](https://github.com/near/mpc/issues/3409), [PR #3410](https://github.com/near/mpc/pull/3410)). It routes SIGTERM into the existing internal shutdown channel and calls `near_async::shutdown_all_actors()` before exit. To prove the handler actually ran in CI, we added a diagnostic that `eprintln!`s `[E2E-DIAG] mpc-node pid=X exited gracefully Xms after SIGTERM` from the test's `terminate_with_grace` helper. + +#### Iteration 1: handler hangs + +First implementation also called `near_store::db::RocksDB::block_until_all_instances_are_dropped()` after `shutdown_all_actors()` — that's what neard's standalone binary does on its SIGTERM path. **It hung in 5/5 runs:** the diagnostic showed `SIGTERM grace period (60s) elapsed; falling back to SIGKILL` every time, and the test then hit the same `streamer/mod.rs:207` panic on restart. + +Root cause: our indexer runs in a `std::thread::spawn`'d closure that does `indexer_tokio_runtime.block_on(async { … })`. The async block holds an `Arc` which transitively holds `Arc` references, and the spawned `monitor_*`/`indexer_logger` tasks each hold their own clone. Nothing in our code cancels those tasks on shutdown, so `listen_blocks` runs forever, `block_on` never returns, the std::thread keeps the Arcs alive, and `block_until_all_instances_are_dropped` loops waiting for a refcount that will never reach zero. The 60 s test grace expires, the test SIGKILLs us, and we're back to the original problem. + +#### Iteration 2: drop the block_until_all_instances_are_dropped call + +Removing that one call lets the handler return immediately after `shutdown_all_actors()`. The diagnostic now shows `exited gracefully 100ms after SIGTERM` in 5/5 runs — the handler does complete, and quickly. + +Results on commit `3527e4d4` (5 runs in parallel via sister-branch dispatch): + +| Run | Test outcome | Handler shutdown | Restart panic | +|---|---|---|---| +| 1 (PR branch) | ✅ all 23 e2e pass, back-mig pass @ 59.7 s | (suppressed by nextest on pass) | none | +| 2 | ❌ back-mig fail @ line 760 | 100 ms graceful | streamer/mod.rs:207 | +| 3 | ❌ back-mig fail @ line 760 | 19420 ms graceful (CI load outlier) | streamer/mod.rs:207 | +| 4 | ❌ back-mig fail @ line 760 | 100 ms graceful | streamer/mod.rs:207 | +| 5 | ❌ back-mig fail @ line 760 | 100 ms graceful | streamer/mod.rs:207 | + +**Pass rate jumped from 0/5 (with v1's hang, or with no handler at all) to 1/5 (with v2).** That's a real improvement — the handler is doing useful work — but **the bug still fires 4/5 times even after a verified 100 ms shutdown.** + +#### What "graceful 100 ms" actually covers — and what it doesn't + +The 100 ms covers mpc-node's main runtime path: select! exit, `cancellation_token.cancel()`, the image-hash watcher join, and `near_async::shutdown_all_actors()` (which sends a stop signal into nearcore's `ACTOR_SYSTEMS` registry and returns synchronously — it does not wait). After that the process exits. + +It does **not** cover the embedded indexer thread: + +- The indexer runs in a separate `std::thread::spawn`'d closure with its own tokio runtime ([`crates/node/src/indexer/real.rs:80`](../../crates/node/src/indexer/real.rs)). +- That runtime hosts `near_indexer::streamer::start`'s block-streaming loop *and* the ~6 `tokio::spawn`'d monitor tasks (`indexer_logger`, `monitor_allowed_docker_images`, `monitor_allowed_launcher_compose_hashes`, `monitor_tee_accounts`, `monitor_allowed_foreign_chain_providers`, `foreign_chain_whitelist_verifier::run`). +- Each of those tasks holds `Arc` → transitively `Arc`. Our handler installs no cancellation into that runtime; nothing tells these tasks to stop. +- The outer `block_on` therefore never returns, and the std::thread is terminated by OS-level process exit ~100 ms after SIGTERM, mid-flight, exactly as it would be on SIGKILL. + +So our experiment shows that **signaling nearcore's actor system to stop does not prevent the panic** — it does not yet show that *fully draining the indexer thread* (option B from the earlier discussion) would also not prevent it. The 100 ms timing alone proves the indexer thread didn't have time to finish anything: a real near-indexer drain typically takes seconds, not milliseconds. + +That said, three reasons we still believe the bug is upstream and option B wouldn't materially change the outcome: + +1. **The panic site reads state from normal block production, not from shutdown.** `build_streamer_message`'s `.fold` panics because a referenced receipt is missing in the in-memory tracking map (rebuilt from RocksDB on restart). That state was written before shutdown started. +2. **RocksDB's WAL already guarantees consistency for committed data.** A more graceful indexer-thread shutdown would mostly close a small uncommitted-flush window — which our earlier "drain + 60 s settle-time" experiment (2/2 fail) was already targeting and didn't close. +3. **The recovery path should not panic regardless.** SIGKILL is a legitimate production scenario; recovery should reconstruct cleanly or surface a clear error, not `expect()`/`unwrap()`. + +#### Why this matters for the upstream bug report + +The previous version of [`nearcore-indexer-sigkill-restart-panic.md`](./nearcore-indexer-sigkill-restart-panic.md) had to hedge: "we cannot claim to know whether a proper graceful-shutdown path would prevent the panic." That hedge is now substantially narrower — we've ruled out "actor-system stop + main-runtime drain" as a fix. The remaining unknown is "would *fully draining the indexer thread itself* (option B) prevent the panic?" Per the three reasons above we believe the answer is no, but we don't have an empirical 5-run campaign that proves it. The cost/benefit of doing option B for the CI flake doesn't justify it (substantial refactor across 7 task spawn sites for an experiment we expect to show what we already believe); it may be worth doing later for shutdown hygiene independent of the test. + +--- + +## Common trigger + +Both failure modes share the same operational sequence: + +1. mpc-node is running and its embedded `neard` is actively producing/processing blocks. +2. The e2e test sends SIGKILL via `kill_nodes` (which drops `ProcessGuard`, which calls `std::process::Child::kill()` — SIGKILL, not SIGTERM). +3. mpc-node is restarted with the same home directory (`start_nodes` does *not* wipe state). +4. On restart, `nearcore::start_with_config_and_synchronization_impl` runs and either: + - Panics during startup if RocksDB state is inconsistent (failure mode B), or + - Starts cleanly but later panics during stream building when an in-memory data structure is in a half-built state (failure mode A). + +The user-visible result is the same in both cases: A0's indexer-height metric never advances past the pre-kill height, and PR #3365's `wait_for_node_indexer_height_above` helper times out after 60 s. + +--- + +## Recommended follow-ups + +Open: + +1. **Decide what to do with the back-migration test in PR #3362.** It fails ~80% per run on this branch (with the trigger + maximal graceful shutdown). Options: + - Mark `#[ignore]` with a note linking the upstream nearcore issue, so CI is green without removing the regression coverage. Recommended. + - Use `reset_and_start_nodes` (wipes home dir) instead of `start_nodes` for the kill+restart step. Test becomes stable but loses the "keyshares preserved across restart" semantic that's the whole point of the back-migration model. Not recommended. + - Leave it flaky until nearcore is fixed. Not recommended — CI signal degrades. +2. **Don't block PR #3362 on this flake.** The panics are inside nearcore code paths, not in #3362's diff. The stale-attestation node fix is independent and demonstrably correct (covered by the contract-sandbox tests in the same PR). +3. **Close PR #3373** once it has served its diagnostic purpose; no merge is intended. + +Done (kept here for record): + +4. ~~Open an upstream nearcore issue.~~ — filed as [`near/nearcore#15867`](https://github.com/near/nearcore/issues/15867); write-up archived in [`nearcore-indexer-sigkill-restart-panic.md`](./nearcore-indexer-sigkill-restart-panic.md). +5. ~~File a `mpc-node has no graceful-shutdown path` issue.~~ — done: [#3409](https://github.com/near/mpc/issues/3409) + [PR #3410](https://github.com/near/mpc/pull/3410) (squashed, awaiting review). Handles SIGTERM + SIGINT + SIGHUP, routes into the existing shutdown channel, calls `near_async::shutdown_all_actors()` on exit. +6. ~~Drain the indexer thread so `block_until_all_instances_are_dropped()` is safe to call.~~ — done: [#3485](https://github.com/near/mpc/issues/3485) + [PR #3486](https://github.com/near/mpc/pull/3486) (stacked on #3410). Wraps the indexer's terminal `listen_blocks` await in a `tokio::select!` against a `CancellationToken`; on cancel, the indexer's tokio runtime drops, every spawned monitor task is aborted, and all `Arc` refs are released. `block_until_all_instances_are_dropped()` then returns cleanly in 2–6 s. +7. ~~Factor the stderr-tail diagnostic out of #3362.~~ — landed in [#3410](https://github.com/near/mpc/pull/3410)'s e2e infra additions. diff --git a/docs/investigation/nearcore-indexer-sigkill-restart-panic.md b/docs/investigation/nearcore-indexer-sigkill-restart-panic.md new file mode 100644 index 000000000..12d25283e --- /dev/null +++ b/docs/investigation/nearcore-indexer-sigkill-restart-panic.md @@ -0,0 +1,477 @@ +# near-indexer: SIGKILL + restart panic in `streamer/mod.rs:207` and `client_actor.rs:217` + +> **Filed upstream as [`near/nearcore#15867`](https://github.com/near/nearcore/issues/15867).** Discussion thread on Zulip: [near-indexer panic in MPC node](https://near.zulipchat.com/#narrow/channel/469556-community-support/topic/near-indexer.20panic.20in.20MPC.20node). + +## TL;DR + +A process embedding `near-indexer`, when stopped during normal block production +and then restarted against the same `home_dir`, **panics on restart ~65–80% of +the time** in a specific operational scenario we've isolated. The panic is +inside `near-indexer` / `near-client`. We've tried every approach reasonably +available to the embedder — SIGTERM with a real handler that signals +`shutdown_all_actors()`, the indexer thread fully drained (its tokio runtime +dropped, every `Arc` released, `RocksDB::block_until_all_instances_are_dropped()` +returns), consumer-side drain, settle-time delays — **none of them prevent +the panic** at a meaningful rate. The panic site reads state that was written +during normal block production (not during shutdown), so the fix needs to be +in nearcore's recovery path, not in how embedders stop the process. + +This was discovered during end-to-end testing of NEAR's MPC node +back-migration flow (A → B → A), which by design requires stopping the +old participant node and then restarting it before the back-migration +round. + +## Operational impact + +This is not a test-only flake. We hit it via an e2e test, but the underlying +operational scenario is real production: + +- Operator-initiated node restarts (container/orchestrator stop: `docker stop`, + `kubectl delete`, `systemctl stop`, dstack CVM stop) — every such restart + carries this panic risk on the next start. +- OOM kills, container evictions, host hardware faults, kernel panics — same. +- For a TSS network like NEAR's chain-signatures MPC cluster, where every + participating node embeds `near-indexer` to watch the on-chain signer + contract, a restart that lands on this panic takes one signer out of the + threshold for as long as it takes the operator to manually clear state + (which loses keyshare-preservation guarantees the production flow relies on). + +## Affected versions + +- nearcore tag **`2.12.0`** (final) — resolved commit `1144e31`. Mode-A panic + reproduced on the rebased near/mpc#3362 branch in a CI run on commit + `3a2ceafe`; the panic backtrace is inlined in the test failure message + via the diagnostic in [near/mpc#3362](https://github.com/near/mpc/pull/3362). +- nearcore tags **`2.12.0-rc.1`** and **`2.12.0-rc.2`** — earlier investigation. + Specific resolved commits we observed the panic on: `aab31b0e` and `fadb5c1` + (both within the `2.12.0-rc.1` tag at different points after a `Cargo.lock` + bump in our workspace). +- The panic-site code in `chain/indexer/src/streamer/mod.rs:207` and + `chain/client/src/client_actor.rs:217` is **unchanged between `2.12.0-rc.1` + and `2.12.0` final**. Only `chain/chain/src/{chain,runtime,types}.rs` and + VM/cache code differ across those tags. + +## When this bug is reachable in practice + +The panic is reachable from a specific operational pattern, not from arbitrary +SIGKILL+restart. In our setup it requires: + +1. **A two-tx pattern**: two function-call transactions in adjacent blocks, + signed by the same access key, both touching overlapping contract state + (`tee_state` and the participant set in our case). This is what the + embedder PR introduces — see "Reproduction via CI" below. +2. **Sustained background activity** during the kill window — both sign and + CKD-style requests, pre- and post-forward-migration. Drop any one of the + four and the rate falls from ~70% to ~10–30%; drop two and it's near zero. +3. **SIGKILL or equivalent of the process** (operator restart, OOM kill, + container eviction, hardware fault, kernel panic). + +This is not a contrived test setup. Production operators of TSS networks +that embed `near-indexer` will hit (1) whenever they ship a similar two-tx +pattern, (2) is normal operational background, and (3) happens every time +anyone runs out of memory, reboots a host, etc. + +### Reproduction rate: latent on `main`, ~70–80% on near/mpc#3362 + +The same back-migration e2e test exists on both `main` and on +[near/mpc#3362](https://github.com/near/mpc/pull/3362): + +| Branch | Trigger present? | Fail rate observed | +|---|---|---| +| `main` | No | **rarely** — single-digit %, very occasional failures the team has not been able to attribute, no consistent panic stack | +| near/mpc#3362 (`barak/2121-contract-stale-attestation-test`) | Yes | **~70–80%** — every failure carries the same mode-A panic stack at `streamer/mod.rs:207` (or, less commonly, mode B at `client_actor.rs:217`) | + +The qualitative difference between "rare and shapeless" on `main` and +"frequent with a consistent stack" on near/mpc#3362 is the load-bearing evidence +that the trigger (the two-tx pattern added by near/mpc#3362's +`submit_attestation_before_concluding_migration` function) is what makes +this nearcore race reliably reachable. Once near/mpc#3362 merges, every CI run +of this test on `main` will be at the ~70–80% rate — i.e. the panic will +become the dominant cause of test failure in this file. + +## Symptoms — two distinct panic modes + +Both modes share the same trigger (SIGKILL + restart with the chain in a +certain state) and the same user-visible result (the embedding process exits +on restart). Internal stack frames differ. + +### Mode A — `streamer/mod.rs:207` (most common, ~70% of failures we see) + +``` +thread 'tokio-rt-worker' (886644) panicked at + /home/runner/.cargo/git/checkouts/nearcore-86558fdb18093f53/aab31b0/chain/indexer/src/streamer/mod.rs:207:42: +`receipt` must be present at this moment +stack backtrace: + 0: __rustc::rust_begin_unwind + 1: core::panicking::panic_fmt + 2: core::option::expect_failed + 3: as core::iter::traits::iterator::Iterator>::fold + 4: near_indexer::streamer::build_streamer_message::{{closure}} + 5: near_indexer::streamer::start::{{closure}} + 6: tokio::runtime::task::core::Core::poll + 7: tokio::runtime::task::harness::Harness::poll + 8: tokio::runtime::scheduler::multi_thread::worker::Context::run_task + 9: tokio::runtime::scheduler::multi_thread::worker::Context::run + 10: tokio::runtime::context::scoped::Scoped::set + 11: tokio::runtime::context::runtime::enter_runtime + 12: tokio::runtime::scheduler::multi_thread::worker::run + 13: as core::future::future::Future>::poll + 14: tokio::runtime::task::core::Core::poll + 15: tokio::runtime::task::harness::Harness::poll + 16: tokio::runtime::blocking::pool::Inner::run +note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. +``` + +The `.expect("receipt must be present at this moment")` at +`chain/indexer/src/streamer/mod.rs:207` fires inside `build_streamer_message`'s +`.fold` over a receipt collection — the indexer expects every receipt ID it +references to be in its in-memory tracking map, and one is missing. + +### Mode B — `client_actor.rs:217` (rarer, on-disk inconsistency) + +``` +thread '' (749092) panicked at + /home/runner/.cargo/git/checkouts/nearcore-86558fdb18093f53/aab31b0/chain/client/src/client_actor.rs:217:6: +called `Result::unwrap()` on an `Err` value: + Chain(StorageError(StorageInconsistentState( + "No ChunkExtra for block 4cqR4KRwGv92jgnsFLzmJvDdNy7hj5JGUak8nY1tWsVu in shard s0.v0" + ))) +stack backtrace: + 0: __rustc::rust_begin_unwind + 1: core::panicking::panic_fmt + 2: core::result::unwrap_failed + 3: near_client::client_actor::start_client + 4: nearcore::start_with_config_and_synchronization_impl::{{closure}} + 5: mpc_node::indexer::real::spawn_real_indexer::{{closure}}::{{closure}} + 6: tokio::runtime::runtime::Runtime::block_on +note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. +``` + +The `unwrap()` at `chain/client/src/client_actor.rs:217` fires during +`start_client` because RocksDB is missing a `ChunkExtra` record the recovery +code expects to find. + +## Reproduction + +The bug is reachable from near/mpc PR [near/mpc#3362](https://github.com/near/mpc/pull/3362), +which adds a node-side fix for an unrelated issue (near/mpc#2121 — stale attestation in +back-migration). The fix happens to produce a transaction pattern that makes +the underlying nearcore receipt-graph race reach a ~70–80% per-run repro rate. +near/mpc#3362 is still open at time of filing; the repro therefore lives on its +branch. + +### Failing test + +A single test fires the bug: + +- **Test:** `migration_service__should_handle_back_migration_a_to_b_to_a` +- **File:** `crates/e2e-tests/tests/migration_service.rs` +- **What it does:** Sets up a 2-of-2 MPC cluster (A0, A1) plus a back-migration + target B0. Performs forward migration A0 → B0 (this submits the two-tx + pattern from the trigger). Then SIGKILLs A0 and restarts it. Then performs + back-migration B0 → A0. +- **Symptom of the bug:** When A0 is restarted (step "Stop A0 + start A0"), + its `near-indexer` panics with mode A (or rarely mode B) and the process + exits. The test's `wait_for_node_indexer_height_above` polls A0's + `indexer_latest_block_height` Prometheus metric for 60s, never sees it + advance past the pre-kill height (because the indexer is dead), and times + out at `migration_service.rs:771` with: + ``` + A0's indexer did not resume + advance within 60s after restart: + node 0 indexer did not advance past height N within 60s + ``` +- **Test runtime:** ~100–110s on a failing run, ~50–60s on a passing run. + +### Reproduction via CI + +The easiest way to reproduce is to push to near/mpc#3362's branch and let GitHub +Actions run the test. CI builds everything from scratch and runs the test +once; you can get N data points by pushing N times. + +```bash +# 1. Authorize a push to near/mpc#3362 or create a sister branch. +git clone git@github.com:near/mpc.git +cd mpc +git checkout barak/2121-contract-stale-attestation-test # head of near/mpc#3362 +git push origin HEAD:refs/heads/my-repro-branch # forks to a branch you own +gh workflow run CI --ref my-repro-branch # triggers the CI workflow + +# 2. Watch the run. +gh run watch + +# 3. On failure, the test panic message inline-dumps the failing node's +# pre-restart stdout, pre-restart stderr, post-restart stdout, and +# post-restart stderr (each 16 KB). The upstream panic stack appears in +# the post-restart stderr.log block. Search the job log for: +# +# last 16KB of node 0 stderr.log (post-restart stderr; upstream +# nearcore panic stack typically here) +# +# or just `streamer/mod.rs:207` to jump right to the panic site. +``` + +### Local reproduction + +```bash +git clone git@github.com:near/mpc.git +cd mpc +git checkout barak/2121-contract-stale-attestation-test +# Branch shape: stacked on near/mpc#3410 (SIGTERM handler) + near/mpc#3486 +# (full indexer drain). The trigger is on top as commit `0c02d27a`. +# nearcore is resolved to 2.12.0 (final) via main's Cargo.lock. + +# Build the e2e test binaries (~5 min cold). +cargo make e2e-tests-skip-build # builds then skips on subsequent runs + +# Run just the failing test: +cargo nextest run --cargo-profile=test-release -p e2e-tests --all-features \ + --locked --profile ci-e2e \ + migration_service__should_handle_back_migration_a_to_b_to_a + +# Expected: fails ~70–80% with the mode-A panic on restart. On the rare +# pass, the run finishes in ~50–60 s. On fail, ~100–110 s (the test +# waits 60 s for the indexer to recover before giving up). +``` + +### What this PR adds that makes the bug strongly reproducible + +The relevant change is in +[`crates/node/src/tee/remote_attestation.rs`](https://github.com/near/mpc/blob/barak/2121-contract-stale-attestation-test/crates/node/src/tee/remote_attestation.rs) +(see commit `1bcbb439` titled +`fix(node): refresh attestation before concluding back-migration (#2121)`). +It adds a function `submit_attestation_before_concluding_migration` that's +called from `execute_onboarding` just before `retry_conclude_onboarding`: + +```rust +// Submit a fresh on-chain attestation before concluding migration. +pub async fn submit_attestation_before_concluding_migration( + tee_authority: TeeAuthority, + tx_sender: impl TransactionSender, + tls_public_key: Ed25519PublicKey, + account_public_key: Ed25519PublicKey, +) -> anyhow::Result<()> { + let report_data: ReportData = ReportDataV1::new(...).into(); + let attestation = tee_authority.generate_attestation(report_data).await?; + submit_remote_attestation(tx_sender, attestation, tls_public_key).await +} +``` + +This produces a `SubmitParticipantInfo` transaction from node B0's signer +key, which lands in chain just before B0 signs a `ConcludeNodeMigration` +transaction in the *next* block. Both are signed by the same access key and +write to overlapping contract state (`tee_state` and the participant set). + +#### Revert experiment isolating this change + +We opened PR +[`#3373`](https://github.com/near/mpc/pull/3373) (a `chore:` PR that +**reverts only `1bcbb439`**, leaves everything else identical) and ran the +same back-migration test 5 times: **0/5 failed.** Reverting that single +commit takes the bug from ~65–70% to 0%. The commit doesn't touch +nearcore code paths; it's just the trigger. + +## What we ruled out (none of these prevent the panic) + +| Approach | Sample | Outcome | What we learned | +|---|---|---|---| +| Plain SIGKILL | 17 obs | ~65% fail | baseline | +| Smaller activity profiles (1-of-4, 2-of-4 sign/CKD TXs) | 50+ obs | ~0% fail | a specific 4-of-4 activity profile is needed | +| 3-of-4 activity profiles | 30 obs | ~10–30% fail | partial — the full 4-of-4 is what reaches ~70% | +| SIGTERM with 30s grace period, no handler in mpc-node | 2 obs | 2/2 fail | mpc-node had no SIGTERM handler, so SIGTERM was effectively SIGKILL | +| Drain consumer (pause `listen_blocks.flag`) then SIGKILL | 5 obs | 5/5 fail | the panic is producer-side (`streamer::start`), not consumer-side | +| Drain + 60s sleep before SIGKILL | 2 obs | 2/2 fail | rules out "writes still settling" — the bug isn't recent-write timing | +| **SIGTERM with handler installed, mpc-node main-runtime + `shutdown_all_actors()` complete in 100 ms** | **5 obs** | **4/5 fail** | **graceful actor-system shutdown does NOT prevent the panic** (caveat: indexer thread itself still terminated mid-flight — see below) | + +The cumulative evidence is that **no test-side teardown change prevents the +panic.** We've now tried five different kill strategies, including real +graceful shutdown. All fail at meaningful rates. The bug is in nearcore's +recovery path, not in how the embedding shuts down. + +### Strongest evidence: actor-system shutdown doesn't help + +The earlier version of this document noted that our SIGTERM experiments +couldn't distinguish "graceful shutdown doesn't fix it" from "mpc-node has no +SIGTERM handler so we never actually tested graceful shutdown." We've now +closed most of that gap, though one slice remains open — see "Caveat" below. + +We installed a SIGTERM handler in mpc-node that routes into the existing +internal shutdown channel and calls `near_async::shutdown_all_actors()` +before exit (commit landed in [near/mpc#3410](https://github.com/near/mpc/pull/3410) +on near/mpc, addressing [issue #3409](https://github.com/near/mpc/issues/3409)). +With this handler, the diagnostic emits `mpc-node pid=X exited gracefully +100ms after SIGTERM` in 5/5 observed runs — nearcore's actor system was given +a stop signal and the process exited within 100 ms. + +Re-running the back-migration test five times against this handler: + +- **1/5 passed** the full e2e flow (including the kill+restart). +- **4/5 failed** with the same `streamer/mod.rs:207` `` `receipt` must be + present at this moment `` panic on restart, identical stack to plain + SIGKILL. + +Pass rate is statistically indistinguishable from plain SIGKILL on the same +test (small samples on both sides, but the central tendency is identical). + +#### Caveat — what "graceful 100 ms" actually covers + +Important precision for anyone reviewing this: the 100 ms graceful shutdown +covers **mpc-node's main runtime path** and a **stop signal sent to nearcore's +actor system** (`near_async::shutdown_all_actors()` returns synchronously +after signaling stop, it does not wait for actors to finish). It does **not** +cover the embedded indexer thread. + +mpc-node embeds `near-indexer` in a separate `std::thread::spawn`'d closure +with its own tokio runtime. That runtime hosts: + +- `near_indexer::streamer::start`'s `build_streamer_message` loop (the one + that panics on restart), and +- ~6 `tokio::spawn`'d monitor tasks each holding `Arc` → + `Arc` references. + +Nothing in our handler propagates a cancellation into that runtime; the +spawned tasks keep running, the outer `block_on` never returns, and the +thread is terminated by the OS when mpc-node's process exits ~100 ms after +SIGTERM. So this experiment shows that **stopping nearcore's actor system +cleanly does not prevent the panic** — it does not yet show that *fully +draining the indexer thread* would also not prevent it. + +We did initially try to drain it: the first version of the handler also +called `near_store::db::RocksDB::block_until_all_instances_are_dropped()` +after `shutdown_all_actors()` (which is what neard's standalone binary +does), but it **hung indefinitely** waiting for the indexer thread's +Arc refs to drop. The 60 s test grace expired, the orchestrator +SIGKILLed us, and we landed back at the SIGKILL baseline. + +#### Why we still believe the bug is upstream + +Even granting the caveat, the panic mechanism doesn't depend on shutdown +draining: + +1. **The panic site reads state written during normal block production.** + `build_streamer_message`'s `.fold` panics because a receipt it references + is missing from the in-memory tracking map. That map is rebuilt from + RocksDB on restart. The "missing receipt" was written (or was about to be + written) during normal block production *before* shutdown — not during + shutdown itself. +2. **RocksDB's WAL already guarantees consistency for committed data.** Any + batch nearcore committed via the WAL survives a kill. The only data lost + to a kill is data that hadn't been flushed yet. A more graceful indexer + shutdown would mostly close that small flush window — which our drain + + 60 s settle-time experiment (2/2 fail) already failed to close. +3. **Recovery shouldn't panic regardless of how the previous run ended.** + SIGKILL is a legitimate production scenario (OOM kill, container + eviction, hardware fault, kernel panic). The `.expect("receipt must be + present at this moment")` and `client_actor.rs:217`'s `unwrap()` aren't + correctness — they're load-bearing assumptions about prior-state + consistency that don't hold across an interrupted process. Robust + recovery would either reconstruct cleanly or surface a clear error; it + shouldn't `panic!`. + +In other words: graceful shutdown of the indexer thread is a thing worth +doing for hygiene (and may close a small remaining slice of the failure +rate), but the **structural issue is in the recovery path**. The fix needs +to land in nearcore. + +#### Production framing + +Independent of the bug, the SIGTERM handler is a legitimate production +improvement — operators using dstack CVM stop / `docker stop` / `kubectl +delete` / `systemctl stop` previously had SIGTERM functionally equivalent +to SIGKILL; the handler gives them real graceful main-runtime shutdown +semantics. It just doesn't fix this particular upstream bug. SIGKILL also +remains a legitimate production scenario regardless of how clean the +graceful path is. + +## Concrete operational trigger + +Reduced to the simplest description we could derive: + +1. A process embedding `near-indexer` is running and consuming blocks normally. +2. Two function-call transactions land in **adjacent blocks**, **signed by the + same access key**, with the additional context that **both touch overlapping + contract state** (`tee_state` and the participant set in our case). +3. Optionally, surrounding activity (other sign / CKD transactions from other + signers around the same window) increases the failure rate from + background-level to ~70%. +4. SIGKILL the process. +5. Restart the process against the same `home_dir`. +6. The indexer panics on restart — mode A (typical) or mode B (occasional). + +Without step 2, the bug is unreachable in our test (focused-repro matrix: +0/9 fail). With step 2 but without step 3, the bug reproduces only +occasionally. With both, it reproduces ~70%. + +## Hypothesis + +Mode A's panic site is inside `build_streamer_message`'s `.fold` over a +receipt collection, with the `.expect("receipt must be present at this +moment")` indicating a referenced receipt isn't in the in-memory map. + +The most parsimonious explanation we've found: + +1. Block N contains B0's `SubmitParticipantInfo` TX. Its execution emits a + receipt that gets applied in block N+1. +2. Block N+1 contains B0's `ConcludeNodeMigration` TX and the receipt from N. + Both touch the same contract state region (`tee_state` and participants). +3. nearcore's indexer tracks receipts in an in-memory map as they're produced + and removes them as they're consumed. +4. RocksDB writes for these two TXs (and their receipts) may not be co-flushed + atomically. SIGKILL between flushes could persist one but not the other. +5. On restart, the in-memory map is reconstructed from RocksDB — but with a + gap where the un-flushed receipt would have been. +6. The next `build_streamer_message` call iterates a receipt set referencing + the absent ID and panics. + +Worth checking: + +- Are the receipts produced by adjacent same-signer function-call TXs that + modify the same contract state written into the same RocksDB column family + with a single flush boundary, or across multiple flush boundaries? +- Does `client_actor::start_client` (mode B's panic site) assume that + `ChunkExtra` for every block referenced by chain head has been durably + written before the chunk's processing-output receipts? If so, a SIGKILL + between those writes could produce the observed `StorageInconsistentState`. + +## Workarounds + +We don't have a working test-side workaround. Options we evaluated: + +- **Install a SIGTERM handler in mpc-node so nearcore's actor system gets a + clean stop** ([near/mpc#3410](https://github.com/near/mpc/pull/3410), + [issue #3409](https://github.com/near/mpc/issues/3409)). Real production + improvement on its own, but **does not** prevent the panic — see the + "Strongest evidence" section above. 4/5 still fail with the handler. +- **Avoid the kill+restart in our test** — works for our specific test but + doesn't fix the production scenario (operator decommissions then revives a + node). We expect to take this option for CI green and file this issue + separately. +- **Wipe `home_dir` on restart** (`reset_and_start_nodes` already exists for + this reason — its comment even calls out SIGKILL leaving data corrupt) — + the test that has a kill+restart pattern goes green at the cost of losing + the "keyshares survive on disk" assertion. Not viable for our case. + +## Logs and supporting evidence + +Full failure logs are in CI runs linked from the investigation doc. Examples +that include both panic modes: + +- Mode A — [run 26498637558 job 78033060779](https://github.com/near/mpc/actions/runs/26498637558/job/78033060779) +- Mode B — [run 26501678885 job 78043485451](https://github.com/near/mpc/actions/runs/26501678885/job/78043485451) +- 5-run drain-then-kill campaign (5/5 fail) — [run 26569167447](https://github.com/near/mpc/actions/runs/26569167447) +- 2-run option-A settle-then-kill campaign (2/2 fail at ~167s each) — [run 26575144185](https://github.com/near/mpc/actions/runs/26575144185) +- **5-run SIGTERM-handler campaign (4/5 fail, handler completes in 100 ms)** on commit `3527e4d4` — [PR run](https://github.com/near/mpc/actions/runs/26713165138/job/78727019813) (1/5 pass) plus 4 sister-branch runs ([26713169213](https://github.com/near/mpc/actions/runs/26713169213), [26713169682](https://github.com/near/mpc/actions/runs/26713169682), [26713170287](https://github.com/near/mpc/actions/runs/26713170287), [26713170844](https://github.com/near/mpc/actions/runs/26713170844)). Diagnostic line in each failing run: `[E2E-DIAG] mpc-node pid=X exited gracefully 100ms after SIGTERM`, followed by the same `streamer/mod.rs:207` panic on the restarted process. + +The CI logs include the failing node's `stderr.log` tail inline in the test +panic message (search for `--- last 16KB of node 0 stderr.log` in any failing +run's log), so the upstream panic stack is right there. + +## References + +- near/mpc PR exposing the bug: [near/mpc#3362](https://github.com/near/mpc/pull/3362) +- near/mpc revert-experiment PR (isolates the trigger as our two-tx pattern, + not a defect in our code): [`#3373`](https://github.com/near/mpc/pull/3373) — `chore:` PR +- near/mpc SIGTERM-handler PR (proves that adding a real graceful path on the + embedder side does NOT prevent the panic): [`#3410`](https://github.com/near/mpc/pull/3410) +- near/mpc indexer-drain PR (proves that fully draining the indexer thread — + including the `block_until_all_instances_are_dropped()` step neard's + standalone binary uses — also does NOT prevent the panic): [`#3486`](https://github.com/near/mpc/pull/3486)