diff --git a/crates/e2e-tests/tests/common.rs b/crates/e2e-tests/tests/common.rs index c11b72a47..8f9644b1b 100644 --- a/crates/e2e-tests/tests/common.rs +++ b/crates/e2e-tests/tests/common.rs @@ -35,6 +35,7 @@ pub const CONTRACT_UPGRADE_COMPATIBILITY_MAINNET_PORT_SEED: u16 = 18; pub const CONTRACT_UPGRADE_COMPATIBILITY_TESTNET_PORT_SEED: u16 = 19; pub const TIMEOUT_METRIC_PORT_SEED: u16 = 20; pub const MIGRATION_BACK_PORT_SEED: u16 = 21; +pub const MIGRATION_BACK_NO_RESTART_PORT_SEED: u16 = 22; /// Start a cluster, wait for Running state and presignatures to buffer. /// diff --git a/crates/e2e-tests/tests/migration_service.rs b/crates/e2e-tests/tests/migration_service.rs index 6bad70654..82b66fb53 100644 --- a/crates/e2e-tests/tests/migration_service.rs +++ b/crates/e2e-tests/tests/migration_service.rs @@ -628,42 +628,47 @@ async fn migration_service__should_migrate_nodes_via_backup_cli() { } } -/// Back-migration (A → B → A) end-to-end via the backup CLI. Investigation -/// context for the production scenario lives at near/mpc#2121; this test -/// does NOT regress against that bug (see Caveat below). +/// What to do to A0 between the forward and back directions of a +/// round-trip back-migration test. +enum BetweenDirections { + /// A0 stays running. Exercises the in-memory state-transition path + /// between forward and back directions; doesn't model the + /// disk-persisted-restart scenario. + None, + /// A0 is killed and restarted from its on-disk home dir. Models the + /// "operator decommissioned A then resurrected from disk" scenario + /// investigated in #2121. Asserts that keyshares survive the restart. + KillAndRestartA, +} + +/// Shared scaffolding for the back-migration round-trip tests. Sets up +/// the cluster, runs forward A→B, applies `between` to A0, runs back +/// B→A, kills B0, and asserts sign+ckd work with A0+A1. /// -/// Starts a cluster with 2 participating nodes (A0, A1) + 1 migration -/// target (B0). A1 stays a participant throughout so threshold (2) holds -/// during both directions. Per direction: +/// Per direction the migration round does: /// 1. Register backup service /// 2. GET keyshares from source node /// 3. Initiate node migration /// 4. PUT keyshares to target node /// 5. Verify migration finalizes and sign + ckd requests succeed -/// Between the forward and back directions, A0's CVM is killed and -/// restarted so its keyshares stay on disk through the restart. -/// After the back direction, B0 is killed so the final sign + ckd -/// assertions prove A0 + A1 carry the workload alone. /// /// Caveat: attestation is mocked as `{"Mock": "Valid"}` throughout, so -/// this test cannot exercise the on-chain stale-attestation failure mode — -/// which is the leading suspect for #2121's production symptom. This is a -/// positive-baseline regression test for the back-migration code path, not -/// a reproducer for #2121. -#[tokio::test] -#[expect(non_snake_case)] -async fn migration_service__should_handle_back_migration_a_to_b_to_a() { +/// neither variant can exercise the on-chain stale-attestation failure +/// mode — the leading suspect for #2121's production symptom. These +/// are positive-baseline regression tests for the back-migration code +/// path, not a reproducer for #2121. +async fn run_back_migration_round_trip(port_seed: u16, between: BetweenDirections) { let backup_cli = must_get_backup_cli_path(); - // Given: 2 participants (A0, A1) + 1 migration target (B0). + // Given: 2 participants (A0, A1) + 1 migration target (B0). A1 stays a + // participant throughout so threshold (2) holds in both directions. let num_nodes = 2; - let (mut cluster, running) = - common::must_setup_cluster(common::MIGRATION_BACK_PORT_SEED, |c| { - c.num_nodes = num_nodes; - c.threshold = num_nodes; - c.migration_targets = vec![0]; - }) - .await; + let (mut cluster, running) = common::must_setup_cluster(port_seed, |c| { + c.num_nodes = num_nodes; + c.threshold = num_nodes; + c.migration_targets = vec![0]; + }) + .await; // A0 (idx 0) is the source we migrate from; B0 lives at `num_nodes` // since `migration_targets[0] = 0` puts the first target right after @@ -701,66 +706,76 @@ async fn migration_service__should_handle_back_migration_a_to_b_to_a() { .await .expect("ckd request failed after forward migration"); - // Simulate operator decommissioning the old node and bringing it back - // online later for the back-migration. `kill_nodes` + `start_nodes` - // preserves the node's home dir (keyshares stay on disk) — that disk - // state is the trigger for the suspected P1 early-return. - // - // Snapshot A0's keyshare-file listing before kill so we can verify - // it's still there after start. Keyshares live in - // `home_dir/permanent_keys/` (see `crates/node/src/keyshare/local.rs`). - let a0_keyshares_before = snapshot_permanent_keys(&cluster, a_idx); - assert!( - !a0_keyshares_before.is_empty(), - "A0's permanent_keys/ is empty before kill — test setup did not produce keyshares, \ - so the back-migration P1 precondition can't be modeled" - ); - // Snapshot A0's indexer height before the kill so the post-restart - // readiness check (below) can prove the indexer has resumed and - // advanced past where it was — not just bound the web port. - let a0_indexer_height_before_kill = common::current_node_indexer_height(&cluster, a_idx) - .await - .expect("failed to read A0's indexer height before kill") - .expect("A0's indexer should be reporting a block height before the kill"); - cluster - .kill_nodes(&[a_idx]) - .expect("failed to kill A0 before back-migration"); - cluster - .start_nodes(&[a_idx]) - .expect("failed to restart A0 for back-migration"); - cluster - .wait_for_node_healthy(a_idx) - .await - .expect("A0 did not become healthy after restart"); - // `wait_for_node_healthy` only checks the web port; wait for the - // indexer to actually resume before back-migration (see #3366). - common::wait_for_node_indexer_height_above( - &cluster, - a_idx, - a0_indexer_height_before_kill, - Duration::from_secs(60), - ) - .await - .expect("A0's indexer did not resume + advance within 60s after restart"); - // Validate the assumption the test rests on: A0's keyshares are - // still on disk after the restart. If they're missing or different, - // the test no longer models production's back-migration scenario - // (e.g. someone swapped `start_nodes` for `reset_and_start_nodes`, - // which wipes the home dir). - let a0_keyshares_after = snapshot_permanent_keys(&cluster, a_idx); - assert_eq!( - a0_keyshares_before, a0_keyshares_after, - "A0's permanent_keys/ contents changed across kill+start — \ - keyshares were NOT preserved on disk" - ); + // Hook: apply the variant-specific between-directions action. + match between { + BetweenDirections::None => {} + BetweenDirections::KillAndRestartA => { + // Simulate operator decommissioning the old node and bringing + // it back online later for the back-migration. `kill_nodes` + + // `start_nodes` preserves the node's home dir (keyshares stay + // on disk) — that disk state is the trigger for the suspected + // P1 early-return. + // + // Snapshot A0's keyshare-file listing before kill so we can + // verify it's still there after start. Keyshares live in + // `home_dir/permanent_keys/` (see + // `crates/node/src/keyshare/local.rs`). + let a0_keyshares_before = snapshot_permanent_keys(&cluster, a_idx); + assert!( + !a0_keyshares_before.is_empty(), + "A0's permanent_keys/ is empty before kill — test setup did not produce \ + keyshares, so the back-migration P1 precondition can't be modeled" + ); + // Snapshot A0's indexer height before the kill so the + // post-restart readiness check (below) can prove the indexer + // resumed and advanced past where it was — not just bound the + // web port. + let a0_indexer_height_before_kill = + common::current_node_indexer_height(&cluster, a_idx) + .await + .expect("failed to read A0's indexer height before kill") + .expect("A0's indexer should be reporting a block height before the kill"); + cluster + .kill_nodes(&[a_idx]) + .expect("failed to kill A0 before back-migration"); + cluster + .start_nodes(&[a_idx]) + .expect("failed to restart A0 for back-migration"); + cluster + .wait_for_node_healthy(a_idx) + .await + .expect("A0 did not become healthy after restart"); + // `wait_for_node_healthy` only checks the web port; wait for + // the indexer to actually resume before back-migration + // (see #3366). + common::wait_for_node_indexer_height_above( + &cluster, + a_idx, + a0_indexer_height_before_kill, + Duration::from_secs(60), + ) + .await + .expect("A0's indexer did not resume + advance within 60s after restart"); + // Validate the assumption the test rests on: A0's keyshares + // are still on disk after the restart. If they're missing or + // different, the test no longer models production's + // back-migration scenario (e.g. someone swapped `start_nodes` + // for `reset_and_start_nodes`, which wipes the home dir). + let a0_keyshares_after = snapshot_permanent_keys(&cluster, a_idx); + assert_eq!( + a0_keyshares_before, a0_keyshares_after, + "A0's permanent_keys/ contents changed across kill+start — \ + keyshares were NOT preserved on disk" + ); + } + } // When: back-migration B0 → A0 via the same helper, indices swapped. run_migration_round(&cluster, b_idx, a_idx, &backup_cli, "back").await; - // Kill B0 before the final assertions so the sign + ckd requests cannot - // be served by the now-demoted node — that proves A0 + A1 are carrying - // the workload alone, mirroring what the forward test does by killing - // the source after migration. + // Kill B0 before the final assertions so the sign + ckd requests + // cannot be served by the now-demoted node — that proves A0 + A1 are + // carrying the workload alone. cluster .kill_nodes(&[b_idx]) .expect("failed to kill B0 after back-migration"); @@ -773,3 +788,33 @@ async fn migration_service__should_handle_back_migration_a_to_b_to_a() { .await .expect("ckd request failed after back-migration"); } + +/// Back-migration A→B→A without kill+restart of A0. Stable variant that +/// exercises the back-migration code path through A0's in-memory state +/// transitions; doesn't depend on A0 surviving a sandbox-blocks-during- +/// catch-up window (see #3366). +/// +/// The kill+restart sibling `_a_to_b_to_a` owns coverage of the +/// disk-persisted-restart scenario from #2121. +#[tokio::test] +#[expect(non_snake_case)] +async fn migration_service__should_handle_back_migration_a_to_b_to_a_no_restart() { + run_back_migration_round_trip( + common::MIGRATION_BACK_NO_RESTART_PORT_SEED, + BetweenDirections::None, + ) + .await; +} + +/// Back-migration A→B→A with kill+restart of A0 between the two +/// directions, modeling the disk-persisted-state scenario investigated +/// in #2121. +#[tokio::test] +#[expect(non_snake_case)] +async fn migration_service__should_handle_back_migration_a_to_b_to_a() { + run_back_migration_round_trip( + common::MIGRATION_BACK_PORT_SEED, + BetweenDirections::KillAndRestartA, + ) + .await; +}