diff --git a/Cargo.lock b/Cargo.lock index f59965734..2b470501b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5994,6 +5994,7 @@ dependencies = [ "anyhow", "assert_matches", "async-trait", + "average", "axum", "backon", "backtrace", diff --git a/crates/node/Cargo.toml b/crates/node/Cargo.toml index 8ddaa30e6..3c8851e2d 100644 --- a/crates/node/Cargo.toml +++ b/crates/node/Cargo.toml @@ -104,6 +104,7 @@ built = { workspace = true } [dev-dependencies] assert_matches = { workspace = true } +average = { workspace = true } blstrs = { workspace = true } elliptic-curve = { workspace = true } insta = { workspace = true } diff --git a/crates/node/src/p2p.rs b/crates/node/src/p2p.rs index 9fa480270..1073acbe6 100644 --- a/crates/node/src/p2p.rs +++ b/crates/node/src/p2p.rs @@ -933,7 +933,7 @@ pub mod testing { // The base port number used, hoping the OS is not using ports in this range pub const BASE_PORT: u16 = 10000; // This constant must be equal to the total number of ports defined below - pub const TOTAL_DEFINED_PORTS: u16 = 22; + pub const TOTAL_DEFINED_PORTS: u16 = 23; // Maximum number of nodes that can be handled without port collisions pub const MAX_NODES: u16 = 10; // Maximum number of cases that can be handled without port collisions @@ -1006,6 +1006,7 @@ pub mod testing { pub const RECONNECTION_TEST: Self = Self::new(19); pub const FOREIGN_CHAIN_POLICY_TEST: Self = Self::new(20); pub const BACKUP_CLI_WEBSERVER_PUT_KEYSHARES_HOSTNAME: Self = Self::new(21); + pub const ASSET_GENERATION_SIGNING_CONTENTION_TEST: Self = Self::new(22); } pub fn generate_test_p2p_configs( diff --git a/crates/node/src/tests.rs b/crates/node/src/tests.rs index 12d7c609f..3137e9ead 100644 --- a/crates/node/src/tests.rs +++ b/crates/node/src/tests.rs @@ -51,6 +51,7 @@ use tokio::time::timeout; pub mod common; +mod asset_generation_signing_contention; mod basic_cluster; mod changing_participant_details; mod faulty; diff --git a/crates/node/src/tests/asset_generation_signing_contention.rs b/crates/node/src/tests/asset_generation_signing_contention.rs new file mode 100644 index 000000000..2bb527b4e --- /dev/null +++ b/crates/node/src/tests/asset_generation_signing_contention.rs @@ -0,0 +1,304 @@ +//! Reproduction for issue #1175 — "asset generation impacts signing performance". +//! +//! Background asset generation (triples + presignatures) and signing both run on +//! the same `cores`-limited per-epoch MPC runtime (see +//! [`crate::coordinator::Coordinator`]'s `create_runtime_and_run`) with no priority +//! separation between them. The cait-sith poke loop in +//! [`crate::protocol::run_protocol`] is CPU-bound and does not yield between +//! network rounds, and the follower/passive side of generation +//! ([`crate::mpc_client`]'s `monitor_passive_channels_inner`) is unbounded. After a +//! resharing every node's triple/presignature stores are empty at once, so the +//! whole network refills simultaneously and each node is flooded with generation +//! work — which starves signing. +//! +//! This test reproduces that locally *without* nearcore, isolating the MPC-runtime +//! contention from the "nearcore lags" theory. It runs a 4-node in-process cluster +//! and compares signing latency in two states: +//! - steady state — a small desired triple buffer that fills with a single batch +//! and then idles; +//! - post-resharing — the mainnet desired triple buffer (2^14), which starts +//! empty and so triple generation runs flat out. +//! +//! Two knobs are not literal mainnet values, both because this in-process cluster +//! has near-instant networking (see `CORES_PER_NODE` and the `*_TRIPLE_CONCURRENCY` +//! constants): mainnet reaches CPU saturation because real cross-node latency +//! keeps many generations in flight at once, whereas here protocols finish too +//! fast to pile up that way. To recreate the same saturation cheaply we (a) give +//! MPC a single core, and (b) drive triple generation harder in the post-resharing +//! scenario (higher concurrency, no stagger) while leaving the steady scenario +//! with a gentle top-off concurrency. The buffer *size* that defines +//! "post-resharing" is the real mainnet 2^14. +//! +//! It is timing-sensitive, so it is `#[ignore]`d and excluded from CI. Run it +//! manually and read the printed report. `RUST_LOG=off` silences all `tracing` +//! output so the `[#1175]` summary lines aren't buried; the report goes through +//! `println!`, not tracing, so this only mutes the noise. Drop `RUST_LOG=off` if +//! you need the logs to diagnose a failure. +//! ```text +//! RUST_LOG=off cargo nextest run --cargo-profile=test-release -p mpc-node \ +//! signing_latency__should_degrade_under_concurrent_asset_generation \ +//! --run-ignored all --no-capture +//! ``` +//! A handful of `panicked at crates/node/src/indexer/fake.rs … Result::unwrap() +//! on an Err value: Closed` lines may also appear on stderr at the end — these +//! are a pre-existing teardown race in the fake indexer's channels (the cluster +//! drops while background tasks are still using them) and don't affect the +//! result. The `[#1175]` summary still sits near the top of the test output. + +use crate::indexer::participants::ContractState; +use crate::p2p::testing::PortSeed; +use crate::tests::{ + DEFAULT_BLOCK_TIME, DEFAULT_MAX_PROTOCOL_WAIT_TIME, IntegrationTestSetup, + request_signature_and_await_response, +}; +use crate::tracking::AutoAbortTask; +use average::{Estimate, Max, Mean, Quantile}; +use mpc_node_config::{PresignatureConfig, TripleConfig}; +use mpc_primitives::domain::DomainId; +use near_mpc_contract_interface::types::{ + DomainConfig, DomainPurpose, Protocol, ReconstructionThreshold, +}; +use near_time::Clock; +use std::time::Duration; + +const NUM_PARTICIPANTS: usize = 4; +const THRESHOLD: usize = 3; +const TXN_DELAY_BLOCKS: u64 = 1; + +// The only knob that differs between the two scenarios is the desired triple +// buffer: the post-resharing run targets the mainnet size (2^14) and so generates +// continuously, while the steady run targets a tiny buffer that fills in a couple +// of batches and then leaves triple generation idle. Everything else is identical. +const CORES_PER_NODE: usize = 1; +/// Mainnet triple buffer (2^14): empty after a resharing, so a node refills toward +/// it continuously — the post-resharing state. +const REFILL_TRIPLES_TO_BUFFER: usize = 16_384; +/// Small enough to fill in a couple of batches and then idle — steady state, where +/// the buffer is already full. +const STEADY_TRIPLES_TO_BUFFER: usize = 128; +/// Triple generation is the heavy CPU op, so it is what we drive to recreate the +/// refill load. On mainnet (concurrency 2, 1s stagger) saturation builds because +/// real cross-node latency keeps many generations in flight at once; this +/// in-process cluster has near-instant networking, so to reach the same CPU +/// pressure within seconds we drop the stagger and raise concurrency in the +/// post-resharing scenario. +const REFILL_TRIPLE_CONCURRENCY: usize = 6; +/// Steady state only needs to top off as signing consumes presignatures — using +/// the post-resharing concurrency here would let the initial buffer fill spawn 6 +/// concurrent CPU-heavy batches on the single MPC core, overshooting the small +/// target and producing a multi-second latency tail on early measured signatures. +/// One batch at a time is enough to refill the small steady buffer. +const STEADY_TRIPLE_CONCURRENCY: usize = 1; +const TRIPLE_STAGGER_SEC: u64 = 0; +/// Presignatures are buffered modestly and identically in both scenarios, so they +/// stay available for signing without themselves being the variable under test. +const PRESIGNATURE_CONCURRENCY: usize = 2; +const PRESIGNATURES_TO_BUFFER: usize = 64; + +const WARMUP_SIGNATURES: usize = 2; +const MEASURED_SIGNATURES: usize = 8; +/// Generous relative to the observed steady-state tail so a loaded host doesn't +/// produce spurious timeouts in the steady-state scenario; the post-resharing +/// scenario blows past it regardless. +const PER_SIGNATURE_TIMEOUT: Duration = Duration::from_secs(15); + +/// Summary statistics over a batch of signing attempts. Latency stats are computed +/// with the `average` crate (as in the threshold-signatures benches); timeouts are +/// counted separately since they have no finite latency. +struct LatencyReport { + label: &'static str, + n_ok: usize, + timeouts: usize, + mean: Duration, + p50: Duration, + p90: Duration, + max: Duration, +} + +impl LatencyReport { + fn from_attempts(label: &'static str, attempts: &[Option]) -> Self { + let timeouts = attempts.iter().filter(|a| a.is_none()).count(); + let latencies: Vec = attempts + .iter() + .filter_map(|a| a.map(|d| d.as_secs_f64())) + .collect(); + let n_ok = latencies.len(); + + // Every request timed out: report the timeout bound as a sentinel so the + // printed report and the comparison still reflect the degradation. + if latencies.is_empty() { + return Self { + label, + n_ok, + timeouts, + mean: PER_SIGNATURE_TIMEOUT, + p50: PER_SIGNATURE_TIMEOUT, + p90: PER_SIGNATURE_TIMEOUT, + max: PER_SIGNATURE_TIMEOUT, + }; + } + + let mut mean = Mean::new(); + let mut p50 = Quantile::new(0.5); + let mut p90 = Quantile::new(0.9); + let mut max = Max::new(); + for &seconds in &latencies { + mean.add(seconds); + p50.add(seconds); + p90.add(seconds); + max.add(seconds); + } + Self { + label, + n_ok, + timeouts, + mean: Duration::from_secs_f64(mean.mean()), + p50: Duration::from_secs_f64(p50.quantile()), + p90: Duration::from_secs_f64(p90.quantile()), + max: Duration::from_secs_f64(max.max()), + } + } + + fn print(&self) { + println!( + "[#1175] {:<13} signing latency: n_ok={} timeouts={} \ + mean={:?} p50={:?} p90={:?} max={:?}", + self.label, self.n_ok, self.timeouts, self.mean, self.p50, self.p90, self.max, + ); + } +} + +/// Brings up a 4-node cluster, completes keygen, then measures end-to-end signing +/// latency. When `buffers_empty` is true the nodes use the mainnet desired buffers +/// (so generation runs flat out — the post-resharing state); otherwise they use +/// small buffers that fill and idle (steady state). +async fn measure_signing_latency(buffers_empty: bool, case: u16) -> LatencyReport { + let label = if buffers_empty { + "post-resharing" + } else { + "steady-state" + }; + let temp_dir = tempfile::tempdir().unwrap(); + let mut setup = IntegrationTestSetup::new( + Clock::real(), + temp_dir.path(), + (0..NUM_PARTICIPANTS) + .map(|i| format!("test{i}").parse().unwrap()) + .collect(), + THRESHOLD, + TXN_DELAY_BLOCKS, + PortSeed::ASSET_GENERATION_SIGNING_CONTENTION_TEST.with_case(case), + DEFAULT_BLOCK_TIME, + ); + + let ecdsa_domain = DomainConfig { + id: DomainId(0), + protocol: Protocol::CaitSith, + reconstruction_threshold: ReconstructionThreshold::new(THRESHOLD as u64), + purpose: DomainPurpose::Sign, + }; + + // Same config everywhere except triple generation: the desired buffer differs + // (empty-and-refilling vs. already-full), and concurrency matches the load the + // scenario is meant to model — aggressive refill vs. gentle top-off. + for node in &mut setup.configs { + node.config.cores = Some(CORES_PER_NODE); + node.config.triple = TripleConfig { + concurrency: if buffers_empty { + REFILL_TRIPLE_CONCURRENCY + } else { + STEADY_TRIPLE_CONCURRENCY + }, + desired_triples_to_buffer: if buffers_empty { + REFILL_TRIPLES_TO_BUFFER + } else { + STEADY_TRIPLES_TO_BUFFER + }, + parallel_triple_generation_stagger_time_sec: TRIPLE_STAGGER_SEC, + timeout_sec: 60, + }; + node.config.presignature = PresignatureConfig { + concurrency: PRESIGNATURE_CONCURRENCY, + desired_presignatures_to_buffer: PRESIGNATURES_TO_BUFFER, + timeout_sec: 60, + }; + } + + { + let mut contract = setup.indexer.contract_mut().await; + contract.initialize(setup.participants.clone()); + contract.add_domains(vec![ecdsa_domain.clone()]); + } + + let _runs = setup + .configs + .into_iter() + .map(|config| AutoAbortTask::from(tokio::spawn(config.run()))) + .collect::>(); + + setup + .indexer + .wait_for_contract_state( + |state| matches!(state, ContractState::Running(_)), + DEFAULT_MAX_PROTOCOL_WAIT_TIME, + ) + .await + .expect("timeout waiting for keygen to complete"); + + // Warm up so presignatures are flowing before we start measuring. (User names + // must be valid NEAR account IDs: lowercase alphanumeric.) + for i in 0..WARMUP_SIGNATURES { + let _ = request_signature_and_await_response( + &mut setup.indexer, + &format!("warmup{i}"), + &ecdsa_domain, + PER_SIGNATURE_TIMEOUT, + ) + .await; + } + + let mut attempts = Vec::with_capacity(MEASURED_SIGNATURES); + for i in 0..MEASURED_SIGNATURES { + let latency = request_signature_and_await_response( + &mut setup.indexer, + &format!("user{i}"), + &ecdsa_domain, + PER_SIGNATURE_TIMEOUT, + ) + .await; + attempts.push(latency); + } + + LatencyReport::from_attempts(label, &attempts) +} + +#[test_log::test(tokio::test(flavor = "multi_thread", worker_threads = 4))] +#[ignore = "timing-sensitive reproduction for #1175; run manually with --run-ignored"] +#[expect(non_snake_case)] +async fn signing_latency__should_degrade_under_concurrent_asset_generation() { + // Given a 4-node cluster signing in steady state (asset buffers already full), + let steady = measure_signing_latency(false, 0).await; + + // When the same cluster signs just after a resharing (buffers empty, so every + // node refills toward the mainnet target while signing), + let post_resharing = measure_signing_latency(true, 1).await; + + steady.print(); + post_resharing.print(); + + // Then signing latency degrades sharply during the refill: either signing + // requests start timing out, or the median latency inflates well beyond the + // steady-state baseline. (The steady-state baseline itself must stay healthy.) + assert_eq!( + steady.timeouts, 0, + "steady-state baseline should not time out; harness is unhealthy" + ); + assert!( + post_resharing.timeouts > 0 || post_resharing.p50 >= steady.p50 * 2, + "expected post-resharing refill to degrade signing: \ + steady p50 {:?}, post-resharing p50 {:?}, post-resharing timeouts {}", + steady.p50, + post_resharing.p50, + post_resharing.timeouts, + ); +} diff --git a/docs/design/signing-starvation-solution.md b/docs/design/signing-starvation-solution.md new file mode 100644 index 000000000..94c05296d --- /dev/null +++ b/docs/design/signing-starvation-solution.md @@ -0,0 +1,130 @@ +# Issue #1175 — Signing starvation after resharing + +## Problem + +After every resharing, signing latency degrades for hours-to-days until the +asset buffers refill. The ticket suspects "asset generation should run on a +separate lower-priority thread but doesn't" — and that is exactly what the +code does today. + +## Root cause + +Three architectural facts, plus two that make them fire after a resharing: + +1. **Shared runtime, equal priority.** Signing, leader gen, and follower gen + all run as peer `tracking::spawn` tasks on the same `cores`-limited + per-epoch tokio runtime (`coordinator.rs::create_runtime_and_run`). The + `cores` limit exists to protect the indexer, not signing. +2. **CPU-bound, non-yielding poke loop.** `run_protocol` in `protocol.rs` + runs `protocol.poke()` until `Action::Wait`. A 64-batch triple gen burst + is tens-to-hundreds of ms between awaits. +3. **Unbounded follower fan-out.** + `mpc_client.rs::monitor_passive_channels_inner` spawns one task per + incoming peer channel with no cap, so a node has no way to bound how much + follower work peers can push onto it. +4. **Resharing erases the asset buffers.** Triples reference the old + participant set and get cleaned up (`assets::cleanup`); presignatures + embed the old keyshare and are also wiped. Every node empties at once and + refills toward `desired_*_to_buffer`, so the load spike is synchronized + network-wide. +5. **Mainnet runs two CaitSith domains.** Presignature generation runs + per-domain (one background loop per `(provider, domain)` in + `spawn_background_tasks`), so the per-node presig pipeline doubles. + +## Solution options + +### A — Lower-OS-priority gen runtime + +A second tokio runtime (`gen_runtime`) dedicated to triple/presignature +generation, with worker threads spawned at lower OS priority via +`thread-priority` (Linux `nice`, macOS QoS, Windows-equivalent). Signing, +network routing, and indexer stay on the original `mpc_runtime` at normal +priority; the OS scheduler preempts gen whenever normal-priority work is +ready. + +Blast radius: + +- `tracking.rs` grows `spawn_on` / `spawn_checked_on` so call sites can + target a specific runtime; existing `spawn` becomes a `Handle::current()` + wrapper (no call-site churn elsewhere). +- `runtime.rs` adds a helper that builds the runtime with + `Builder::on_thread_start` setting `ThreadPriority::Min`. +- `coordinator.rs` builds `gen_runtime` next to `mpc_runtime` and passes its + handle into `MpcClient`. +- `mpc_client.rs` routes the four `*_background_tasks` spawns to + `gen_runtime`, and dispatches triple/presig follower channels in + `monitor_passive_channels_inner` to `gen_runtime` (signing/CKD/foreign-tx + follower channels stay on `mpc_runtime`). + +Caveat: when no normal-priority thread is ready (signing happens to be +awaiting), gen threads still run. On a CPU-oversubscribed host that leaves +some residual tail. + +### A.1 — `gen_cores` config knob + +`Option` on `ConfigFile`, defaulting to `cores` when unset, mirrored +into `/debug/node_config`. Lets operators size `gen_runtime` independently +of `mpc_runtime` as a defensive bound on top of OS priority — useful if +priority preemption ever proves insufficient on a particular kernel/host. + +### B — Bound follower concurrency + +Cap concurrent follower gen tasks per peer, both as DoS protection and as a +defensive bound on fan-out. + +**Must be per-peer admission, not a single global semaphore.** A small +global cap deadlocks in the threshold-N circular-wait case (A waits on B to +free a slot to accept A's gen; B waits on C; C waits on A). Concrete shape: +`try_admit(leader_id) -> Option`, non-blocking; on unavailable, drop +the channel and let the leader time out and retry with different +participants. + +Necessary only if memory pressure or scheduler queue depth turn out to be a +problem after A. If A leaves only CPU contention, B is overkill. + +### C — `yield_now()` in the poke loop + +A single `tokio::task::yield_now().await` in `run_protocol`'s outer loop +shortens the maximum CPU burst between cooperative yield points +(a comment in `protocol.rs` already documents the hazard). + +With A in place this mainly helps fairness *within* `gen_runtime`; gen tasks +yielding to each other does not free a core for signing. Cheap, harmless, +defensible independently of #1175. + +### Lower `presignature.concurrency` + +Existing `ConfigFile` knob; mainnet sets it to 16, which exceeds +`cores = 12` — leader-side presig generation alone over-subscribes the +runtime once enough triples are available. Halving or quartering this +directly shrinks the number of concurrent presig poke loops on the runtime, +at the cost of a slower refill toward +`desired_presignatures_to_buffer`. Pure operational tuning — no code change, +no protocol rollout. + +### Reduce `SUPPORTED_TRIPLE_GENERATION_BATCH_SIZE` + +Drop the `SUPPORTED_TRIPLE_GENERATION_BATCH_SIZE` const in `triple.rs` from +64 → ~16. Each follower poke burst +becomes 4× shorter; gen threads idle more often, opening more chances for +higher-priority threads to grab a core. **Protocol-affecting**: followers +validate `count == BATCH_SIZE` in `triple.rs`, so leader and follower +must agree — coordinated network-wide rollout. Worth doing only if a +protocol bump is already on the table. + +### E — `spawn_blocking` or dedicated rayon pool + +Move CPU-bound compute off async worker threads entirely. Overlaps with what +A achieves through runtime separation, but at a deeper restructuring layer. + +## Open questions + +1. **Does A fully fix mainnet?** Mechanism is correct, but host shape + matters (CPU oversubscription leaves residual tail). A testnet or + latency-injected run resolves this. +2. **Is follower fan-out a memory/scheduler problem, or only CPU?** If only + CPU, A suffices and B is unneeded. +3. **What does the indexer contribute to the latency floor?** The indexer + runtime is unbounded today (`indexer/real.rs`, tracked in #1515). Worth + profiling on a real node so it does not confound a "did we fix it?" + measurement.