From 9e0822ed7347425a676b027d74d585345e83bd92 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 04:28:02 -0300 Subject: [PATCH 01/14] feat(host): add pure keep-alive state machine module (#35) Extract the host engine's keep-alive policy (reconnect backoff, token re-mint timing, auth-error relogin path) into a pure, dependency-free state machine in src/host/keepalive.rs. Declared unconditionally so its tests run without the vendored-OpenSSL toolchain. Co-Authored-By: Claude Opus 4.8 --- src/host/keepalive.rs | 126 ++++++++++++++++++++++++++++++++++++++++++ src/host/mod.rs | 7 +++ 2 files changed, 133 insertions(+) create mode 100644 src/host/keepalive.rs diff --git a/src/host/keepalive.rs b/src/host/keepalive.rs new file mode 100644 index 0000000..ca28c94 --- /dev/null +++ b/src/host/keepalive.rs @@ -0,0 +1,126 @@ +//! Pure keep-alive state machine for the host engine (issue #35). +//! +//! `host_group` (in `engine.rs`) used to inline every keep-alive policy decision +//! — reconnect on relay drop, exponential backoff, periodic token re-mint, and +//! the auth-error → relogin path — inside an async loop fused to the SDK's +//! `RelayTunnelHost`, leaving the most failure-prone logic in the app untested. +//! +//! This module holds that policy as a pure transition function with **zero** +//! SDK, CLI, or channel dependencies: it imports only [`std::time::Duration`]. +//! The driver feeds it [`ConnEvent`]s (connection outcomes) and executes the +//! returned [`Action`]s. Because it is pure, it is unit-tested without the +//! vendored-OpenSSL toolchain — the tests run under a plain `cargo test`. + +use std::time::Duration; + +/// Re-mint the host/manage tokens before their ~24h expiry. 20h leaves headroom. +pub const REMINT_AFTER: Duration = Duration::from_secs(20 * 60 * 60); +/// Base backoff after a relay drop; doubles up to [`RECONNECT_BACKOFF_MAX`]. 
+const RECONNECT_BACKOFF_START: Duration = Duration::from_secs(2); +const RECONNECT_BACKOFF_MAX: Duration = Duration::from_secs(60); + +/// A connection outcome fed into the state machine by the driver. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConnEvent { + /// The relay connect (and port forwarding) succeeded. + Connected, + /// The live relay session dropped; the driver wants to reconnect. + RelayDropped, + /// The ~20h re-mint timer fired; reconnect with fresh tokens. + RemintDue, + /// A connect attempt failed. `auth` is true when the failure is an expired + /// or absent CLI sign-in (retrying is pointless until the user re-auths). + ConnectFailed { auth: bool }, +} + +/// What the driver should execute next, returned by [`KeepAliveState::next`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Action { + /// Hold the live connection and wait for the next outcome (no sleep). + Await, + /// Sleep for the given backoff, then (re)connect. + Sleep(Duration), + /// The sign-in is expired: emit `ReloginRequired`, surface an error, stop. + Relogin, +} + +/// Presentation phase. The driver maps it to `HostState::Connecting` (first +/// attempt) vs. `HostState::Reconnecting` (every attempt after the first). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Phase { + /// No successful connect or failed attempt yet — show "Connecting". + Initial, + /// At least one attempt has happened — show "Reconnecting". + Reconnect, +} + +/// The keep-alive policy state: the current reconnect backoff and whether this +/// is still the first connection attempt. Pure — no SDK/CLI/channel state. +pub struct KeepAliveState { + backoff: Duration, + first_attempt: bool, +} + +impl KeepAliveState { + /// A fresh state: backoff at [`RECONNECT_BACKOFF_START`], first attempt. + pub fn new() -> Self { + Self { + backoff: RECONNECT_BACKOFF_START, + first_attempt: true, + } + } + + /// Whether no attempt has completed yet (drives Connecting vs Reconnecting). 
+ pub fn first_attempt(&self) -> bool { + self.first_attempt + } + + /// The presentation phase for the next attempt. + pub fn phase(&self) -> Phase { + if self.first_attempt { + Phase::Initial + } else { + Phase::Reconnect + } + } + + /// Advances the state machine for one connection outcome and returns the + /// action the driver must execute. Mirrors the original `host_group` + /// control flow exactly (asymmetric backoff reset: a success resets the + /// backoff, consecutive connect-failures keep doubling it). + pub fn next(&mut self, event: ConnEvent) -> Action { + match event { + // Success: reset the backoff and leave the first-attempt phase. + ConnEvent::Connected => { + self.backoff = RECONNECT_BACKOFF_START; + self.first_attempt = false; + Action::Await + } + // A live session ended (drop or re-mint): sleep the current backoff, + // then double it (capped) for the next attempt. + ConnEvent::RelayDropped | ConnEvent::RemintDue => Action::Sleep(self.bump()), + // Expired sign-in: stop and ask the user to re-authenticate. + ConnEvent::ConnectFailed { auth: true } => Action::Relogin, + // Recoverable connect failure: leave the first-attempt phase and + // back off without resetting (consecutive failures keep doubling). + ConnEvent::ConnectFailed { auth: false } => { + self.first_attempt = false; + Action::Sleep(self.bump()) + } + } + } + + /// Returns the current backoff and then doubles it, capped at + /// [`RECONNECT_BACKOFF_MAX`]. + fn bump(&mut self) -> Duration { + let current = self.backoff; + self.backoff = (self.backoff * 2).min(RECONNECT_BACKOFF_MAX); + current + } +} + +impl Default for KeepAliveState { + fn default() -> Self { + Self::new() + } +} diff --git a/src/host/mod.rs b/src/host/mod.rs index f41f5c4..ccadfa3 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -15,6 +15,13 @@ use std::sync::mpsc::Sender; +// The pure keep-alive state machine (issue #35) lives in `keepalive.rs`. 
It has +// zero SDK deps, so it is declared unconditionally — its tests run under a plain +// `cargo test` without the vendored-OpenSSL toolchain. The `#![allow(dead_code)]` +// above keeps the items it exposes but the default build never calls from +// warning. +mod keepalive; + // The real SDK-backed engine (connect/keep-alive/stop) lives in `engine.rs` and // is compiled only with `--features hosting`. #[cfg(feature = "hosting")] From e5e1016630c17b507dcb7bafd1f96703bbf91b7c Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 04:30:28 -0300 Subject: [PATCH 02/14] test(host): table-driven keep-alive state machine tests (#35) Cover backoff progression (2,4,8,16,32,60,60) and reset-on-success, re-mint scheduling, auth-error to relogin, and reconnect phase change. Verified RED against a todo!() next() then GREEN once implemented. Co-Authored-By: Claude Opus 4.8 --- src/host/keepalive.rs | 77 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/host/keepalive.rs b/src/host/keepalive.rs index ca28c94..ea563a6 100644 --- a/src/host/keepalive.rs +++ b/src/host/keepalive.rs @@ -124,3 +124,80 @@ impl Default for KeepAliveState { Self::new() } } + +#[cfg(test)] +mod tests { + use super::*; + + fn secs(n: u64) -> Duration { + Duration::from_secs(n) + } + + /// Extracts the sleep duration from an [`Action::Sleep`]; panics otherwise so + /// a wrong action is an obvious test failure rather than a silent skip. 
+ fn sleep_of(action: Action) -> Duration { + match action { + Action::Sleep(d) => d, + other => panic!("expected Action::Sleep, got {other:?}"), + } + } + + #[test] + fn backoff_progression_on_repeated_connect_failures() { + let mut state = KeepAliveState::new(); + let expected = [2u64, 4, 8, 16, 32, 60, 60]; + let got: Vec = (0..expected.len()) + .map(|_| sleep_of(state.next(ConnEvent::ConnectFailed { auth: false })).as_secs()) + .collect(); + assert_eq!(got, expected); + } + + #[test] + fn success_resets_backoff_before_next_drop() { + let mut state = KeepAliveState::new(); + // Grow the backoff with two recoverable failures (2s, then 4s). + assert_eq!( + sleep_of(state.next(ConnEvent::ConnectFailed { auth: false })), + secs(2) + ); + assert_eq!( + sleep_of(state.next(ConnEvent::ConnectFailed { auth: false })), + secs(4) + ); + // A successful connect returns Await and resets the backoff. + assert_eq!(state.next(ConnEvent::Connected), Action::Await); + // The reconnect sleep after the next relay drop is back to the start. + assert_eq!(sleep_of(state.next(ConnEvent::RelayDropped)), secs(2)); + } + + #[test] + fn remint_after_success_sleeps_start_and_remint_const_is_20h() { + let mut state = KeepAliveState::new(); + assert_eq!(state.next(ConnEvent::Connected), Action::Await); + assert_eq!(sleep_of(state.next(ConnEvent::RemintDue)), secs(2)); + assert_eq!(REMINT_AFTER, Duration::from_secs(72_000)); + } + + #[test] + fn auth_error_yields_relogin() { + let mut state = KeepAliveState::new(); + assert_eq!( + state.next(ConnEvent::ConnectFailed { auth: true }), + Action::Relogin + ); + } + + #[test] + fn reconnect_after_drop_changes_phase() { + let mut state = KeepAliveState::new(); + // Fresh state: first attempt, "Connecting" phase. + assert!(state.first_attempt()); + assert_eq!(state.phase(), Phase::Initial); + // After a successful connect, later attempts present as "Reconnecting". 
+ assert_eq!(state.next(ConnEvent::Connected), Action::Await); + assert!(!state.first_attempt()); + assert_eq!(state.phase(), Phase::Reconnect); + // A relay drop schedules the reconnect sleep at the reset backoff. + assert_eq!(sleep_of(state.next(ConnEvent::RelayDropped)), secs(2)); + } +} From 16024da22c0ab66702f54d6fc701fbb83a12a435 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 04:33:20 -0300 Subject: [PATCH 03/14] refactor(host): drive keep-alive from the pure state machine (#35) Rewrite host_group as a thin driver around KeepAliveState: it maps the Phase to Connecting/Reconnecting, feeds connection outcomes as ConnEvents, and executes the returned Action. All policy constants and backoff arithmetic are removed from engine.rs. The _host lifetime invariant (must stay bound across the keep-alive select! to avoid the busy-loop) is preserved and documented inline. Co-Authored-By: Claude Opus 4.8 --- src/host/engine.rs | 74 ++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/src/host/engine.rs b/src/host/engine.rs index 47aae82..4f56cfd 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -30,7 +30,6 @@ use std::collections::HashMap; use std::sync::mpsc::Sender; use std::sync::Arc; -use std::time::Duration; use tokio::sync::Notify; use tunnels::connections::RelayTunnelHost; @@ -43,11 +42,6 @@ use crate::locale::{system_locale, Locale}; /// User-Agent reported to the tunnel management service. const USER_AGENT: &str = "devtunnel-gui/0.1"; -/// Re-mint the host/manage tokens before their ~24h expiry. 20h leaves headroom. -const REMINT_AFTER: Duration = Duration::from_secs(20 * 60 * 60); -/// Base backoff after a relay drop; doubles up to [`RECONNECT_BACKOFF_MAX`]. 
-const RECONNECT_BACKOFF_START: Duration = Duration::from_secs(2); -const RECONNECT_BACKOFF_MAX: Duration = Duration::from_secs(60); /// Starts the engine command thread and returns its command channel. The caller /// wraps the returned [`Sender`] in a [`super::TunnelHost`]. @@ -171,56 +165,61 @@ fn collect_ports(tunnel_id: &str, loc: &Locale) -> anyhow::Result> { /// `select!` ends it when the group is cancelled (Stop). Returns early only on an /// unrecoverable error (e.g. expired sign-in). async fn host_group(tunnel_id: String, ports: Vec, events: Sender) { - let mut first_attempt = true; - let mut backoff = RECONNECT_BACKOFF_START; + use super::keepalive::{Action, ConnEvent, KeepAliveState, Phase}; + + let mut state = KeepAliveState::new(); loop { emit( &events, &tunnel_id, - if first_attempt { - HostState::Connecting - } else { - HostState::Reconnecting + match state.phase() { + Phase::Initial => HostState::Connecting, + Phase::Reconnect => HostState::Reconnecting, }, ); - match connect_once(&tunnel_id, &ports).await { - // `_host` (the `RelayTunnelHost`) MUST stay bound for the whole - // connection: it owns the `ports_tx` watch::Sender that every - // client's `run_stream` task waits on. The SDK's `run_stream` - // ignores the `Result` from `ports.changed()`, so once that sender - // is dropped, `changed()` returns `Err` forever and each task spins - // a CPU core (observed: ~2.5 cores pegged → freeze under client - // churn). Holding `_host` until reconnect/stop keeps the sender - // alive so those tasks stay parked instead of busy-looping. + let action = match connect_once(&tunnel_id, &ports).await { + // INVARIANT: `_host` (the `RelayTunnelHost`) MUST stay bound across + // the keep-alive `select!` below — it owns the `ports_tx` + // watch::Sender that every client's `run_stream` task waits on. 
The + // SDK's `run_stream` ignores the `Result` from `ports.changed()`, so + // once that sender is dropped, `changed()` returns `Err` forever and + // each task spins a CPU core (observed: ~2.5 cores pegged → freeze + // under client churn). The state machine is pure and channel-free, + // so the wait stays inline here: `_host` must not be moved into a + // helper that drops it before the await. The only early `return` is + // in the `Err` arm, where no live host is bound. Ok((_host, handle)) => { - backoff = RECONNECT_BACKOFF_START; - first_attempt = false; + // Success resets the backoff and leaves the first-attempt phase. + let _ = state.next(ConnEvent::Connected); emit(&events, &tunnel_id, HostState::Hosting); // Keep alive until the relay drops or the re-mint timer fires. - tokio::select! { + let event = tokio::select! { r = handle => { log::warn!("host engine: {tunnel_id} relay disconnected: {r:?}"); - // Fall through to reconnect. + ConnEvent::RelayDropped } - _ = tokio::time::sleep(REMINT_AFTER) => { + _ = tokio::time::sleep(super::keepalive::REMINT_AFTER) => { log::info!("host engine: {tunnel_id} re-minting tokens before expiry"); - // Dropping `handle` here closes the current relay session; - // the loop reconnects with freshly minted tokens. + ConnEvent::RemintDue } - } + }; // `_host` and the unfinished `handle` both drop here on the way // to reconnect, tearing down the relay session so old // `run_stream` tasks exit via their stream-closed arm. + state.next(event) } Err(e) => { let msg = e.to_string(); + let action = state.next(ConnEvent::ConnectFailed { + auth: devtunnel::is_auth_error(&msg), + }); // Token mint / connect failed because the CLI sign-in expired: // retrying is pointless until the user re-authenticates, so end // the task (auto-resume re-hosts after a successful sign-in). 
- if devtunnel::is_auth_error(&msg) { + if action == Action::Relogin { log::warn!("host engine: {tunnel_id} login expired: {msg}"); let _ = events.send(HostEvent::ReloginRequired { tunnel_id: tunnel_id.clone(), @@ -229,13 +228,18 @@ async fn host_group(tunnel_id: String, ports: Vec, events: Sender tokio::time::sleep(d).await, + // `Await` only follows a `Connected` event, which the Ok arm + // overwrites with the keep-alive outcome before reaching here; + // `Relogin` returns in the Err arm above. Neither is reachable. + Action::Await | Action::Relogin => {} + } } } From 7bd333fd788e7f388a01eaa165af9f57ae232731 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 04:37:06 -0300 Subject: [PATCH 04/14] style: apply rustfmt to devtunnel.rs Pre-existing formatting deviations normalized by `cargo fmt` so the `cargo fmt --check` gate stays green. No behavior change. Co-Authored-By: Claude Opus 4.8 --- src/devtunnel.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/devtunnel.rs b/src/devtunnel.rs index ef5f3bf..b2b2b4e 100644 --- a/src/devtunnel.rs +++ b/src/devtunnel.rs @@ -112,10 +112,7 @@ pub fn preflight() -> Preflight { /// command/parse fails. Best-effort and read-only; safe to call off the UI /// thread to populate the Settings "Signed in as …" label. pub fn current_username() -> Option { - let out = command(&bin()) - .args(["user", "show", "-j"]) - .output() - .ok()?; + let out = command(&bin()).args(["user", "show", "-j"]).output().ok()?; if !out.status.success() { return None; } @@ -671,7 +668,11 @@ fn parse_rate_bps(s: &str) -> Option { /// Parses a leading integer from a string like `"4 client connections"`. 
fn parse_leading_int(s: &str) -> Option { - let digits: String = s.trim().chars().take_while(|c| c.is_ascii_digit()).collect(); + let digits: String = s + .trim() + .chars() + .take_while(|c| c.is_ascii_digit()) + .collect(); digits.parse().ok() } From 7aa2021f2c2e61cc091ad3fac9b857715c8e4e95 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 05:42:29 -0300 Subject: [PATCH 05/14] fix(host): forward each port's configured protocol; stop on fatal connect errors The host engine forwarded every port as `http`, ignoring the configured protocol. A port created as `https`/`auto` was rejected by the service with `400 "the tunnel port protocol cannot be changed"`, and the keep-alive loop retried forever (re-minting tokens every cycle), never reaching `Hosting`. Only `http` ports could be hosted. - connect_once: register each port with its configured protocol (fallback `auto` when absent); collect_ports now carries `(port, protocol)`, threaded through spawn_group -> host_group -> connect_once. - Harden against non-recoverable failures: classify connect errors as Auth / Fatal / Transient (devtunnel::is_fatal_connect_error) in the pure keep-alive state machine (new ConnFailure enum + Action::Fail); a fatal error now surfaces HostState::Error and stops instead of an endless backoff loop. Completes the #35 keep-alive driver this builds on. Verified end-to-end against the live service: http/https/auto all reach Hosting (https needs a TLS backend to serve); no regression on the http happy path or resilience. cargo test (75, incl. new fatal-path test), fmt, and clippy (default + --features hosting) clean. 
Closes #36 Co-Authored-By: Claude Opus 4.8 --- src/devtunnel.rs | 19 +++++++++ src/host/engine.rs | 96 +++++++++++++++++++++++++++++-------------- src/host/keepalive.rs | 55 +++++++++++++++++++++---- 3 files changed, 131 insertions(+), 39 deletions(-) diff --git a/src/devtunnel.rs b/src/devtunnel.rs index b2b2b4e..48713c0 100644 --- a/src/devtunnel.rs +++ b/src/devtunnel.rs @@ -243,6 +243,25 @@ pub fn is_auth_error(stderr: &str) -> bool { lower.contains("token") && (lower.contains("invalid") || lower.contains("revoked")) } +/// Classifies a host connect/port-forward error as **non-recoverable**: retrying +/// with the same inputs can never succeed, so the engine should surface an error +/// and stop instead of looping the reconnect/backoff forever (each cycle re-mints +/// two tokens and re-runs the relay handshake against the service). +/// +/// A `400 Bad Request` from the tunnel management API is a request-validation +/// failure — e.g. `add_port` rejected with "the tunnel port protocol cannot be +/// changed" when the forwarded protocol disagrees with the registered one. These +/// are permanent for identical inputs. Auth failures are handled separately by +/// [`is_auth_error`] (they have a recovery path: re-login), so callers should +/// check that first. +#[cfg_attr(not(feature = "hosting"), allow(dead_code))] +pub fn is_fatal_connect_error(stderr: &str) -> bool { + let lower = stderr.to_ascii_lowercase(); + lower.contains("400 bad request") + || lower.contains("cannot be changed") + || lower.contains("invalid arguments") +} + /// Runs `devtunnel user login` (interactive — opens the system browser and may /// show a device code) in its own visible console and waits for it to finish. 
/// Goes through [`interactive_command`] with inherited stdio — never the silent diff --git a/src/host/engine.rs b/src/host/engine.rs index 4f56cfd..67906c9 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -118,7 +118,11 @@ fn run(cmd_rx: std::sync::mpsc::Receiver, events: Sender /// cancellation [`Notify`]: a `Stop` signals it, `block_on` returns, and the /// runtime drop tears the group down. Isolating each group on its own runtime is /// the fix for multi-tunnel forward starvation (issue #18). -fn spawn_group(tunnel_id: String, ports: Vec, events: Sender) -> GroupHandle { +fn spawn_group( + tunnel_id: String, + ports: Vec<(u16, String)>, + events: Sender, +) -> GroupHandle { let cancel = Arc::new(Notify::new()); let cancel_signal = cancel.clone(); @@ -149,13 +153,17 @@ fn spawn_group(tunnel_id: String, ports: Vec, events: Sender) -> GroupHandle { thread, cancel } } -/// Fetches the port numbers defined for `tunnel_id` via the management CLI. -fn collect_ports(tunnel_id: &str, loc: &Locale) -> anyhow::Result> { +/// Fetches the ports defined for `tunnel_id` via the management CLI, each paired +/// with its configured protocol (`http`/`https`/`auto`). The protocol must be +/// preserved when forwarding: re-registering a port under a different protocol is +/// rejected by the service ("the tunnel port protocol cannot be changed") and +/// would block hosting entirely (issue #36). +fn collect_ports(tunnel_id: &str, loc: &Locale) -> anyhow::Result> { let rows = devtunnel::fetch_rows(loc)?; - let ports: Vec = rows + let ports: Vec<(u16, String)> = rows .into_iter() .filter(|r| r.tunnel_id == tunnel_id && r.port > 0) - .filter_map(|r| u16::try_from(r.port).ok()) + .filter_map(|r| u16::try_from(r.port).ok().map(|p| (p, r.protocol))) .collect(); Ok(ports) } @@ -164,8 +172,8 @@ fn collect_ports(tunnel_id: &str, loc: &Locale) -> anyhow::Result> { /// reconnect-on-drop and periodic token re-mint. 
Loops forever; the caller's /// `select!` ends it when the group is cancelled (Stop). Returns early only on an /// unrecoverable error (e.g. expired sign-in). -async fn host_group(tunnel_id: String, ports: Vec, events: Sender) { - use super::keepalive::{Action, ConnEvent, KeepAliveState, Phase}; +async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender) { + use super::keepalive::{Action, ConnEvent, ConnFailure, KeepAliveState, Phase}; let mut state = KeepAliveState::new(); @@ -213,22 +221,41 @@ async fn host_group(tunnel_id: String, ports: Vec, events: Sender { let msg = e.to_string(); - let action = state.next(ConnEvent::ConnectFailed { - auth: devtunnel::is_auth_error(&msg), - }); - // Token mint / connect failed because the CLI sign-in expired: - // retrying is pointless until the user re-authenticates, so end - // the task (auto-resume re-hosts after a successful sign-in). - if action == Action::Relogin { - log::warn!("host engine: {tunnel_id} login expired: {msg}"); - let _ = events.send(HostEvent::ReloginRequired { - tunnel_id: tunnel_id.clone(), - }); - emit(&events, &tunnel_id, HostState::Error(msg)); - return; + // Classify the raw error into the policy's failure kind. Auth is + // checked first because it has a dedicated recovery path; a 400 + // from the management API (e.g. a port-protocol mismatch) is + // otherwise non-recoverable and must not loop forever (issue #36). + let failure = if devtunnel::is_auth_error(&msg) { + ConnFailure::Auth + } else if devtunnel::is_fatal_connect_error(&msg) { + ConnFailure::Fatal + } else { + ConnFailure::Transient + }; + let action = state.next(ConnEvent::ConnectFailed(failure)); + match action { + // Sign-in expired: end the task and prompt re-auth (auto-resume + // re-hosts after a successful sign-in). 
+ Action::Relogin => { + log::warn!("host engine: {tunnel_id} login expired: {msg}"); + let _ = events.send(HostEvent::ReloginRequired { + tunnel_id: tunnel_id.clone(), + }); + emit(&events, &tunnel_id, HostState::Error(msg)); + return; + } + // Non-recoverable: surface the error and stop instead of + // retrying identical inputs in an endless backoff loop. + Action::Fail => { + log::warn!("host engine: {tunnel_id} non-recoverable connect error: {msg}"); + emit(&events, &tunnel_id, HostState::Error(msg)); + return; + } + _ => { + log::warn!("host engine: {tunnel_id} connect failed: {e}"); + action + } } - log::warn!("host engine: {tunnel_id} connect failed: {e}"); - action } }; @@ -237,8 +264,8 @@ async fn host_group(tunnel_id: String, ports: Vec, events: Sender tokio::time::sleep(d).await, // `Await` only follows a `Connected` event, which the Ok arm // overwrites with the keep-alive outcome before reaching here; - // `Relogin` returns in the Err arm above. Neither is reachable. - Action::Await | Action::Relogin => {} + // `Relogin`/`Fail` return in the Err arm above. None are reachable. + Action::Await | Action::Relogin | Action::Fail => {} } } } @@ -253,7 +280,7 @@ async fn host_group(tunnel_id: String, ports: Vec, events: Sender anyhow::Result<(RelayTunnelHost, tunnels::connections::RelayHandle)> { let loc = Locale::load(&system_locale()); @@ -277,16 +304,25 @@ async fn connect_once( let handle = host.connect(&host_token).await?; log::info!("connect_once[{tunnel_id}]: relay connected"); - for &port in ports { + for (port, protocol) in ports { + // Forward each port under its configured protocol. The service rejects a + // re-registration that changes the protocol, so an `https`/`auto` port + // forwarded as `http` would 400 and block hosting (issue #36). Fall back + // to `auto` only when the protocol is genuinely absent. 
+ let proto = if protocol.trim().is_empty() { + "auto" + } else { + protocol.as_str() + }; let tunnel_port = TunnelPort { - port_number: port, - protocol: Some("http".to_string()), + port_number: *port, + protocol: Some(proto.to_string()), ..Default::default() }; // `add_port` treats an already-existing port (409) as success. - log::debug!("connect_once[{tunnel_id}]: add_port {port}"); + log::debug!("connect_once[{tunnel_id}]: add_port {port} ({proto})"); host.add_port(&tunnel_port).await?; - log::info!("connect_once[{tunnel_id}]: port {port} forwarded"); + log::info!("connect_once[{tunnel_id}]: port {port} forwarded ({proto})"); } Ok((host, handle)) diff --git a/src/host/keepalive.rs b/src/host/keepalive.rs index ea563a6..1e03034 100644 --- a/src/host/keepalive.rs +++ b/src/host/keepalive.rs @@ -19,6 +19,22 @@ pub const REMINT_AFTER: Duration = Duration::from_secs(20 * 60 * 60); const RECONNECT_BACKOFF_START: Duration = Duration::from_secs(2); const RECONNECT_BACKOFF_MAX: Duration = Duration::from_secs(60); +/// Why a connect attempt failed — drives whether the driver retries, stops, or +/// asks the user to re-authenticate. The driver classifies the raw error string +/// (via the `devtunnel` helpers) into one of these so the state machine stays +/// pure and free of string parsing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConnFailure { + /// Expired or absent CLI sign-in: retrying is pointless until the user + /// re-authenticates. + Auth, + /// Non-recoverable (e.g. a `400` from the management API rejecting the + /// request): retrying with the same inputs can never succeed, so stop. + Fatal, + /// Recoverable (network/relay hiccup): back off and retry. + Transient, +} + /// A connection outcome fed into the state machine by the driver. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ConnEvent { @@ -28,9 +44,8 @@ pub enum ConnEvent { RelayDropped, /// The ~20h re-mint timer fired; reconnect with fresh tokens. 
RemintDue, - /// A connect attempt failed. `auth` is true when the failure is an expired - /// or absent CLI sign-in (retrying is pointless until the user re-auths). - ConnectFailed { auth: bool }, + /// A connect attempt failed, carrying why (see [`ConnFailure`]). + ConnectFailed(ConnFailure), } /// What the driver should execute next, returned by [`KeepAliveState::next`]. @@ -42,6 +57,8 @@ pub enum Action { Sleep(Duration), /// The sign-in is expired: emit `ReloginRequired`, surface an error, stop. Relogin, + /// A non-recoverable error: surface it and stop. No retry, no relogin prompt. + Fail, } /// Presentation phase. The driver maps it to `HostState::Connecting` (first @@ -100,10 +117,16 @@ impl KeepAliveState { // then double it (capped) for the next attempt. ConnEvent::RelayDropped | ConnEvent::RemintDue => Action::Sleep(self.bump()), // Expired sign-in: stop and ask the user to re-authenticate. - ConnEvent::ConnectFailed { auth: true } => Action::Relogin, + ConnEvent::ConnectFailed(ConnFailure::Auth) => Action::Relogin, + // Non-recoverable error: stop. Retrying identical inputs would loop + // forever (re-minting tokens each cycle) without ever succeeding. + ConnEvent::ConnectFailed(ConnFailure::Fatal) => { + self.first_attempt = false; + Action::Fail + } // Recoverable connect failure: leave the first-attempt phase and // back off without resetting (consecutive failures keep doubling). 
- ConnEvent::ConnectFailed { auth: false } => { + ConnEvent::ConnectFailed(ConnFailure::Transient) => { self.first_attempt = false; Action::Sleep(self.bump()) } @@ -147,7 +170,9 @@ mod tests { let mut state = KeepAliveState::new(); let expected = [2u64, 4, 8, 16, 32, 60, 60]; let got: Vec = (0..expected.len()) - .map(|_| sleep_of(state.next(ConnEvent::ConnectFailed { auth: false })).as_secs()) + .map(|_| { + sleep_of(state.next(ConnEvent::ConnectFailed(ConnFailure::Transient))).as_secs() + }) .collect(); assert_eq!(got, expected); } @@ -157,11 +182,11 @@ mod tests { let mut state = KeepAliveState::new(); // Grow the backoff with two recoverable failures (2s, then 4s). assert_eq!( - sleep_of(state.next(ConnEvent::ConnectFailed { auth: false })), + sleep_of(state.next(ConnEvent::ConnectFailed(ConnFailure::Transient))), secs(2) ); assert_eq!( - sleep_of(state.next(ConnEvent::ConnectFailed { auth: false })), + sleep_of(state.next(ConnEvent::ConnectFailed(ConnFailure::Transient))), secs(4) ); // A successful connect returns Await and resets the backoff. @@ -182,11 +207,23 @@ mod tests { fn auth_error_yields_relogin() { let mut state = KeepAliveState::new(); assert_eq!( - state.next(ConnEvent::ConnectFailed { auth: true }), + state.next(ConnEvent::ConnectFailed(ConnFailure::Auth)), Action::Relogin ); } + #[test] + fn fatal_error_yields_fail_and_does_not_retry() { + let mut state = KeepAliveState::new(); + // A non-recoverable failure stops the task instead of backing off. + assert_eq!( + state.next(ConnEvent::ConnectFailed(ConnFailure::Fatal)), + Action::Fail + ); + // It also leaves the first-attempt phase, like any completed attempt. 
+ assert!(!state.first_attempt()); + } + #[test] fn reconnect_after_drop_changes_phase() { let mut state = KeepAliveState::new(); From 28b13d49c4011028dfe61630cd0d948daedd3ef5 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 05:42:49 -0300 Subject: [PATCH 06/14] test(e2e): add headless host runner and blackbox resilience suite The tray GUI can't be scripted, but its hosting engine is the product's value. Add a headless entrypoint (DEVTUNNEL_HEADLESS_HOST=<id>[,<id>…]) that drives the production path (host::spawn -> engine::host_group -> keep-alive state machine) and streams every HostEvent as JSON on stdout, returning before any UI is built. Real engine only under --features hosting. tests/e2e/ is a Python blackbox suite that uses the product as a user would: creates groups on a shared local port, hosts them through the headless engine, serves a real backend, and runs resilience scenarios while sampling the host process: - S2 multiple groups, same port - S3 sustained load + latency + idle/loaded host CPU & RSS (busy-loop watch) - S1 reconnect after drop (stop->rehost proxy; real relay drop when elevated) - S4 auto-resume after process kill Emits report.md/json (gitignored) with a thresholded findings section. 
Co-Authored-By: Claude Opus 4.8 --- .gitignore | 4 + src/headless.rs | 170 +++++++++++++++++++ src/main.rs | 10 ++ tests/e2e/README.md | 54 +++++++ tests/e2e/backend.py | 94 +++++++++++ tests/e2e/harness.py | 324 +++++++++++++++++++++++++++++++++++++ tests/e2e/report_md.py | 169 +++++++++++++++++++ tests/e2e/requirements.txt | 2 + tests/e2e/run_e2e.py | 302 ++++++++++++++++++++++++++++++++++ 9 files changed, 1129 insertions(+) create mode 100644 src/headless.rs create mode 100644 tests/e2e/README.md create mode 100644 tests/e2e/backend.py create mode 100644 tests/e2e/harness.py create mode 100644 tests/e2e/report_md.py create mode 100644 tests/e2e/requirements.txt create mode 100644 tests/e2e/run_e2e.py diff --git a/.gitignore b/.gitignore index 4a2efe7..d6b3489 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,7 @@ __pycache__/ # kept locally for analysis only, never versioned. profile*.json.gz *.json.syms.json + +# E2E suite generated run artifacts (regenerated on every run, not versioned) +tests/e2e/report.json +tests/e2e/report.md diff --git a/src/headless.rs b/src/headless.rs new file mode 100644 index 0000000..ec27689 --- /dev/null +++ b/src/headless.rs @@ -0,0 +1,170 @@ +//! Headless host runner — a diagnostic/test entrypoint (no GUI, no tray) used by +//! the blackbox E2E resilience harness in `tests/e2e/`. +//! +//! It drives the **production** host engine (`host::spawn` → +//! `engine::host_group` → the keep-alive driver), so the harness exercises the +//! real connect / keep-alive / reconnect path rather than a stand-in. It is +//! activated when `DEVTUNNEL_HEADLESS_HOST=[,…]` is set; +//! `main` returns through here before building any UI. +//! +//! Observability: every [`host::HostEvent`] is written as one JSON line on +//! stdout (logs stay on stderr via the capturing logger), so an external process +//! can observe state transitions deterministically. Control: it reads simple +//! 
line commands on stdin — `stop `, `stop` (all groups), `quit` (stop all +//! and exit). EOF on stdin is treated as `quit`. +//! +//! Only the `--features hosting` build has a real engine; the default build's +//! `NoopHost` makes this a no-op, which keeps the module compiling everywhere. + +use std::io::{BufRead, Write}; +use std::time::{Duration, Instant}; + +use crate::host::{self, HostCommand, HostEvent, HostState}; + +/// A control command parsed from stdin. +enum Ctl { + /// (Re)start hosting one group by Real Tunnel ID (used to re-host after a + /// `stop`, exercising a clean teardown → reconnect cycle). + Host(String), + /// Stop one group by Real Tunnel ID. + Stop(String), + /// Stop every hosted group. + StopAll, + /// Stop everything and exit. + Quit, +} + +/// Runs the headless host loop for the comma-separated `ids_csv`. Returns once a +/// `quit` command (or stdin EOF) is received and the engine has been asked to +/// stop every group. +pub fn run(ids_csv: &str) -> anyhow::Result<()> { + let ids: Vec = ids_csv + .split(',') + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_owned) + .collect(); + if ids.is_empty() { + anyhow::bail!("DEVTUNNEL_HEADLESS_HOST is set but lists no tunnel ids"); + } + + let started = Instant::now(); + let (evt_tx, evt_rx) = std::sync::mpsc::channel::(); + let host = host::spawn(evt_tx); + + for id in &ids { + host.send(HostCommand::Host { + tunnel_id: id.clone(), + }); + } + emit_line(&serde_json::json!({ + "elapsed_ms": started.elapsed().as_millis() as u64, + "event": "started", + "tunnel_ids": ids, + })); + + // Stdin command reader → control channel. A dedicated thread keeps the main + // thread free to drain host events without blocking on a stdin read. 
+ let (ctl_tx, ctl_rx) = std::sync::mpsc::channel::(); + std::thread::spawn(move || { + let stdin = std::io::stdin(); + for line in stdin.lock().lines() { + let Ok(line) = line else { break }; + let line = line.trim(); + let cmd = if line == "quit" || line == "exit" { + Ctl::Quit + } else if line == "stop" { + Ctl::StopAll + } else if let Some(rest) = line.strip_prefix("stop ") { + Ctl::Stop(rest.trim().to_owned()) + } else if let Some(rest) = line.strip_prefix("host ") { + Ctl::Host(rest.trim().to_owned()) + } else { + continue; + }; + if ctl_tx.send(cmd).is_err() { + return; + } + } + // EOF on stdin → ask the main loop to quit. + let _ = ctl_tx.send(Ctl::Quit); + }); + + // Main loop: interleave host events (printed as JSON) with control commands. + // Poll the control channel with a short timeout so host events never starve. + loop { + loop { + match evt_rx.try_recv() { + Ok(evt) => emit_line(&event_json(started, &evt)), + Err(std::sync::mpsc::TryRecvError::Empty) => break, + // The engine thread is gone; nothing more will arrive. + Err(std::sync::mpsc::TryRecvError::Disconnected) => return Ok(()), + } + } + match ctl_rx.recv_timeout(Duration::from_millis(100)) { + Ok(Ctl::Host(id)) => host.send(HostCommand::Host { tunnel_id: id }), + Ok(Ctl::Stop(id)) => host.send(HostCommand::Stop { tunnel_id: id }), + Ok(Ctl::StopAll) => stop_all(host.as_ref(), &ids), + Ok(Ctl::Quit) => { + stop_all(host.as_ref(), &ids); + // Give the engine a moment to emit the trailing `Stopped` events + // before exiting, so the harness sees a clean teardown. + std::thread::sleep(Duration::from_millis(300)); + while let Ok(evt) = evt_rx.try_recv() { + emit_line(&event_json(started, &evt)); + } + return Ok(()); + } + // No control input this tick: loop back and drain events again. + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {} + // The reader thread exited without a final Quit (should not happen); + // keep draining events until the engine disconnects. 
+ Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {} + } + } +} + +/// Sends `Stop` for every group id. +fn stop_all(host: &dyn host::TunnelHost, ids: &[String]) { + for id in ids { + host.send(HostCommand::Stop { + tunnel_id: id.clone(), + }); + } +} + +/// Renders one [`HostEvent`] as the JSON line emitted on stdout. +fn event_json(started: Instant, evt: &HostEvent) -> serde_json::Value { + let elapsed_ms = started.elapsed().as_millis() as u64; + match evt { + HostEvent::State { tunnel_id, state } => { + let (name, message) = match state { + HostState::Idle => ("Idle", None), + HostState::Connecting => ("Connecting", None), + HostState::Hosting => ("Hosting", None), + HostState::Reconnecting => ("Reconnecting", None), + HostState::Stopped => ("Stopped", None), + HostState::Error(m) => ("Error", Some(m.clone())), + }; + serde_json::json!({ + "elapsed_ms": elapsed_ms, + "event": "state", + "tunnel_id": tunnel_id, + "state": name, + "message": message, + }) + } + HostEvent::ReloginRequired { tunnel_id } => serde_json::json!({ + "elapsed_ms": elapsed_ms, + "event": "relogin_required", + "tunnel_id": tunnel_id, + }), + } +} + +/// Writes one JSON value as a line on stdout and flushes immediately so the +/// harness observes events in real time. +fn emit_line(v: &serde_json::Value) { + println!("{v}"); + let _ = std::io::stdout().flush(); +} diff --git a/src/main.rs b/src/main.rs index 27b50a3..21ff548 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,7 @@ #[cfg(windows)] mod autostart; mod devtunnel; +mod headless; mod host; mod icon_render; #[cfg(windows)] @@ -202,6 +203,15 @@ fn main() -> anyhow::Result<()> { // (e.g. `RUST_LOG=devtunnel_gui=debug,tunnels=info`). let _ = logbuf::CaptureLogger::from_env("devtunnel_gui=info,tunnels=warn").install(); + // Headless host runner: a diagnostic/test entrypoint (no GUI, no tray) for + // the blackbox E2E resilience harness in `tests/e2e/`. 
When + // `DEVTUNNEL_HEADLESS_HOST=[,…]` is set we drive the production host + // engine directly and stream every `HostEvent` as JSON on stdout, returning + // before any UI is built. A real engine only exists with `--features hosting`. + if let Ok(ids) = std::env::var("DEVTUNNEL_HEADLESS_HOST") { + return headless::run(&ids); + } + // winit registers the window class with a null icon, so the title bar and // taskbar would show the generic default. Install the winit backend with a // hook that sets our brand icon on every window at creation time. (The diff --git a/tests/e2e/README.md b/tests/e2e/README.md new file mode 100644 index 0000000..7c1eef9 --- /dev/null +++ b/tests/e2e/README.md @@ -0,0 +1,54 @@ +# Blackbox E2E resilience suite + +Exercises DevTunnel GUI **as a product**: it creates groups (tunnels) on a +shared local port, hosts them through the *production* keep-alive engine running +headless, serves a real Python backend, hammers the public URLs, and runs +resilience scenarios while sampling the host process. The goal is **stability +and efficiency**, not usability. + +## How it drives the real engine + +The GUI tray app can't be scripted, but its hosting engine is the product's +value. `src/headless.rs` adds a headless entrypoint: when +`DEVTUNNEL_HEADLESS_HOST=[,…]` is set, the binary drives the exact +production path (`host::spawn` → `engine::host_group` → the keep-alive state +machine) instead of building any UI, and streams every `HostEvent` as one JSON +line on stdout. The harness reads that stream and sends `stop ` / `host ` +/ `quit` on stdin. So the suite measures the real connect / keep-alive / +reconnect code, observed purely from the outside. + +## Prerequisites + +- `devtunnel` CLI signed in: `devtunnel user login` +- Host binary built with the SDK engine: + ``` + cargo build --features hosting + ``` + (needs NASM + Strawberry Perl + MSVC on PATH — see `CLAUDE.md`). 
+- Python deps: `pip install -r tests/e2e/requirements.txt` + +## Run + +``` +python tests/e2e/run_e2e.py --groups 2 --port 3000 --load-secs 45 +``` + +Writes `tests/e2e/report.md` and prints a live summary. Created tunnels use the +`e2e-*` prefix and are deleted on teardown. + +## Scenarios + +| id | what it proves | +|----|----------------| +| S2 | N tunnels on one local port all forward independently (no starvation) | +| S3 | throughput, p50/p95/p99 latency, error rate, **idle + loaded host CPU/RSS** (catches the relay busy-loop regression) | +| S1 | reconnect after a drop — stop→rehost proxy always; a real relay drop via firewall block only when run elevated | +| S4 | auto-resume — kill the host process, relaunch, recover serving | + +## Limitations + +- A genuine relay drop (S1b) blocks the host binary's outbound traffic with a + Windows Firewall rule, which needs an **elevated** shell. Without it the suite + uses the stop→rehost proxy and says so in the report. +- The headless runner re-hosts only the ids it's given; GUI auto-resume (which + re-hosts the previously-active set) is approximated by S4's process kill. diff --git a/tests/e2e/backend.py b/tests/e2e/backend.py new file mode 100644 index 0000000..73cfaed --- /dev/null +++ b/tests/e2e/backend.py @@ -0,0 +1,94 @@ +"""Local HTTP test page served through the dev tunnel under test. + +This is the "produto em uso" target: a small, fast, threaded HTTP server the +harness exposes via one or more tunnels (groups) and then hammers from the +public side to measure stability, latency and throughput. 
+ +Endpoints: + GET / -> 200 text marker + a monotonic request counter + GET /health -> 200 "ok" (cheap liveness probe) + GET /echo?bytes=N-> 200 with N bytes of payload (throughput test; capped) + GET /stats -> 200 JSON with per-path counters (server-side ground truth) + +Run standalone: python backend.py [port] (default 3000) +""" + +from __future__ import annotations + +import json +import sys +import threading +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from urllib.parse import urlparse, parse_qs + +MARKER = "DEVTUNNEL_E2E_OK" +MAX_ECHO_BYTES = 4 * 1024 * 1024 # cap so a bad query can't OOM the box + +_counts_lock = threading.Lock() +_counts: dict[str, int] = {} + + +def _bump(path: str) -> int: + with _counts_lock: + total = _counts.get("__total__", 0) + 1 + _counts["__total__"] = total + _counts[path] = _counts.get(path, 0) + 1 + return total + + +class Handler(BaseHTTPRequestHandler): + # Keep the access log quiet: the harness measures from the client side and + # the per-request stderr spam would only obscure the host logs. 
+ def log_message(self, *_args): # noqa: D401 + pass + + def _send(self, code: int, body: bytes, ctype: str = "text/plain"): + self.send_response(code) + self.send_header("Content-Type", ctype) + self.send_header("Content-Length", str(len(body))) + self.send_header("Connection", "close") + self.end_headers() + try: + self.wfile.write(body) + except (BrokenPipeError, ConnectionResetError): + pass + + def do_GET(self): + parsed = urlparse(self.path) + path = parsed.path + total = _bump(path) + + if path == "/health": + self._send(200, b"ok") + return + if path == "/stats": + with _counts_lock: + snap = dict(_counts) + self._send(200, json.dumps(snap).encode(), "application/json") + return + if path == "/echo": + qs = parse_qs(parsed.query) + n = int(qs.get("bytes", ["1024"])[0]) + n = max(0, min(n, MAX_ECHO_BYTES)) + self._send(200, b"x" * n) + return + + # Default page: a stable marker + counter the harness asserts on. + self._send(200, f"{MARKER} n={total}\n".encode()) + + +def serve(port: int) -> ThreadingHTTPServer: + """Starts the threaded server on 127.0.0.1:port and returns it (not blocking).""" + httpd = ThreadingHTTPServer(("127.0.0.1", port), Handler) + threading.Thread(target=httpd.serve_forever, name=f"backend-{port}", daemon=True).start() + return httpd + + +if __name__ == "__main__": + p = int(sys.argv[1]) if len(sys.argv) > 1 else 3000 + server = serve(p) + print(f"backend listening on http://127.0.0.1:{p} (Ctrl-C to stop)") + try: + threading.Event().wait() + except KeyboardInterrupt: + server.shutdown() diff --git a/tests/e2e/harness.py b/tests/e2e/harness.py new file mode 100644 index 0000000..8bd296e --- /dev/null +++ b/tests/e2e/harness.py @@ -0,0 +1,324 @@ +"""Harness primitives for the blackbox E2E resilience suite. + +Three pieces: + * `dt` - thin wrapper over the `devtunnel` CLI (the product's own + management surface): create group, add port, anonymous access, + resolve public URL, delete. 
+ * `HostRunner`- drives the production host engine headless by launching the + `devtunnel_gui` binary with `DEVTUNNEL_HEADLESS_HOST=`, + parsing its JSON event stream and forwarding stdin commands. + * `probe` - client-side load/latency measurement + host-process sampling. + +Nothing here is product code; it only *uses* the product from the outside. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass, field + +import requests + +try: + import psutil +except ImportError: # pragma: no cover - guarded at startup + psutil = None + +DEVTUNNEL = os.environ.get("DEVTUNNEL_BIN", "devtunnel") +# Dev Tunnels shows an HTML anti-phishing interstitial for plain browser GETs; +# this header makes the relay forward straight to the backend so we measure the +# real data path, not the warning page. +SKIP_INTERSTITIAL = {"X-Tunnel-Skip-AntiPhishing-Page": "true"} + + +# --------------------------------------------------------------------------- dt +def _run(args: list[str], timeout: int = 90) -> subprocess.CompletedProcess: + return subprocess.run( + [DEVTUNNEL, *args], + capture_output=True, + text=True, + timeout=timeout, + ) + + +def _run_json(args: list[str], timeout: int = 90): + cp = _run(args, timeout) + if cp.returncode != 0: + raise RuntimeError(f"devtunnel {' '.join(args)} failed: {cp.stderr.strip()}") + out = cp.stdout + start = min((i for i in (out.find("{"), out.find("[")) if i != -1), default=-1) + if start == -1: + raise RuntimeError(f"no JSON in `devtunnel {' '.join(args)}` output: {out[:200]}") + return json.loads(out[start:]) + + +def create_group(name: str, expiration: str = "1h") -> str: + """Creates an anonymous group (tunnel) and returns its Real Tunnel ID (id.cluster).""" + created = _run_json(["create", name, "-a", "-e", expiration, "-j"]) + full_id = created["tunnel"]["tunnelId"] + # Mirror the GUI: ensure 
an anonymous ACE exists so the public URL is reachable + # without auth (create -a should suffice, but this is idempotent and safe). + _run(["access", "create", full_id, "--anonymous", "-j"]) + return full_id + + +def add_port(full_id: str, port: int, protocol: str = "http") -> None: + cp = _run(["port", "create", full_id, "-p", str(port), "--protocol", protocol, "-j"]) + # 409 (port already exists) is fine for re-runs. + if cp.returncode != 0 and "already exist" not in cp.stderr.lower(): + raise RuntimeError(f"add_port {full_id}:{port} failed: {cp.stderr.strip()}") + + +def port_uri(full_id: str, port: int) -> str | None: + show = _run_json(["show", full_id, "-j"]) + for p in show.get("tunnel", {}).get("ports", []): + if p.get("portNumber") == port: + return p.get("portUri") + return None + + +def host_connections(full_id: str) -> int: + """Live host-connection count for the tunnel (0 = nothing hosting it).""" + try: + show = _run_json(["show", full_id, "-j"]) + except RuntimeError: + return -1 + status = show.get("tunnel", {}).get("status", {}) + return status.get("hostConnectionCount", 0) or 0 + + +def delete_group(full_id: str) -> None: + _run(["delete", full_id, "-f", "-j"]) + + +def list_ids() -> list[str]: + data = _run_json(["list", "-j"]) + return [t.get("tunnelId") for t in data.get("tunnels", []) if t.get("tunnelId")] + + +# ------------------------------------------------------------------- HostRunner +@dataclass +class HostRunner: + """Drives the headless production host engine and tracks its event stream.""" + + binary: str + ids: list[str] + extra_env: dict | None = None + proc: subprocess.Popen | None = field(default=None, init=False) + events: list[dict] = field(default_factory=list, init=False) + _state: dict[str, str] = field(default_factory=dict, init=False) + _lock: threading.Lock = field(default_factory=threading.Lock, init=False) + _t0: float = field(default=0.0, init=False) + + def start(self) -> "HostRunner": + env = dict(os.environ) + 
env["DEVTUNNEL_HEADLESS_HOST"] = ",".join(self.ids) + env.setdefault("RUST_LOG", "devtunnel_gui=info,tunnels=warn") + if self.extra_env: + env.update(self.extra_env) + self._t0 = time.monotonic() + self.proc = subprocess.Popen( + [self.binary], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + text=True, + bufsize=1, + env=env, + ) + threading.Thread(target=self._pump, name="runner-stdout", daemon=True).start() + return self + + def _pump(self): + assert self.proc and self.proc.stdout + for line in self.proc.stdout: + line = line.strip() + if not line: + continue + try: + evt = json.loads(line) + except json.JSONDecodeError: + continue + evt["_recv_ms"] = int((time.monotonic() - self._t0) * 1000) + with self._lock: + self.events.append(evt) + if evt.get("event") == "state": + self._state[evt["tunnel_id"]] = evt["state"] + + def send(self, cmd: str): + if self.proc and self.proc.stdin: + self.proc.stdin.write(cmd + "\n") + self.proc.stdin.flush() + + def state(self, full_id: str) -> str | None: + with self._lock: + return self._state.get(full_id) + + def wait_state(self, full_id: str, target: str, timeout: float = 90.0) -> float | None: + """Blocks until `full_id` reaches `target`; returns seconds waited or None on timeout.""" + start = time.monotonic() + deadline = start + timeout + while time.monotonic() < deadline: + if self.state(full_id) == target: + return time.monotonic() - start + if self.proc and self.proc.poll() is not None: + return None + time.sleep(0.1) + return None + + def wait_all(self, target: str, timeout: float = 120.0) -> bool: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + with self._lock: + if all(self._state.get(i) == target for i in self.ids): + return True + time.sleep(0.2) + return False + + @property + def pid(self) -> int | None: + return self.proc.pid if self.proc else None + + def quit(self, timeout: float = 8.0): + try: + self.send("quit") + except (BrokenPipeError, 
OSError): + pass + if self.proc: + try: + self.proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: + self.kill() + + def kill(self): + if self.proc and self.proc.poll() is None: + self.proc.kill() + self.proc.wait(timeout=5) + + +# ------------------------------------------------------------------------ probe +@dataclass +class LoadResult: + requests: int + ok: int + failed: int + duration_s: float + latencies_ms: list[float] + errors: dict[str, int] = field(default_factory=dict) + + @property + def rps(self) -> float: + return self.ok / self.duration_s if self.duration_s else 0.0 + + @property + def error_rate(self) -> float: + return self.failed / self.requests if self.requests else 0.0 + + def pct(self, p: float) -> float: + if not self.latencies_ms: + return float("nan") + s = sorted(self.latencies_ms) + k = min(len(s) - 1, int(round(p / 100 * (len(s) - 1)))) + return s[k] + + +def hit(url: str, timeout: float = 10.0) -> tuple[bool, float, str]: + """One GET; returns (ok, latency_ms, err). 
ok means an HTTP 200 response; the body is not inspected."""
+    t = time.monotonic()
+    try:
+        r = requests.get(url, headers=SKIP_INTERSTITIAL, timeout=timeout)
+        dt = (time.monotonic() - t) * 1000
+        return (r.status_code == 200, dt, "" if r.status_code == 200 else f"http{r.status_code}")
+    except requests.RequestException as e:
+        return (False, (time.monotonic() - t) * 1000, type(e).__name__)
+
+
+def load(url: str, duration_s: float, concurrency: int = 8, timeout: float = 10.0) -> LoadResult:
+    """Drives `url` for `duration_s` with `concurrency` workers; collects latency/errors."""
+    lats: list[float] = []
+    errors: dict[str, int] = {}
+    ok = 0
+    total = 0
+    lock = threading.Lock()
+    stop_at = time.monotonic() + duration_s
+    start = time.monotonic()
+
+    def worker():
+        nonlocal ok, total
+        while time.monotonic() < stop_at:
+            good, dt, err = hit(url, timeout)
+            with lock:
+                total += 1
+                lats.append(dt)
+                if good:
+                    ok += 1
+                elif err:
+                    errors[err] = errors.get(err, 0) + 1
+
+    with ThreadPoolExecutor(max_workers=concurrency) as ex:
+        for _ in range(concurrency):
+            ex.submit(worker)
+    dur = time.monotonic() - start
+    return LoadResult(total, ok, total - ok, dur, lats, errors)
+
+
+@dataclass
+class ProcSamples:
+    cpu_percent: list[float] = field(default_factory=list)
+    rss_mb: list[float] = field(default_factory=list)
+
+    @property
+    def cpu_max(self) -> float:
+        return max(self.cpu_percent, default=0.0)
+
+    @property
+    def cpu_avg(self) -> float:
+        return sum(self.cpu_percent) / len(self.cpu_percent) if self.cpu_percent else 0.0
+
+    @property
+    def rss_growth_mb(self) -> float:
+        return (self.rss_mb[-1] - self.rss_mb[0]) if len(self.rss_mb) >= 2 else 0.0
+
+
+def sample_process(pid: int, duration_s: float, interval: float = 0.5) -> ProcSamples:
+    """Samples CPU% (100% == one full core) and RSS of `pid` (and its children)."""
+    out = ProcSamples()
+    if psutil is None:
+        return out
+    try:
+        proc = psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return out
+    procs = [proc]
+    try:
+        procs += proc.children(recursive=True)
+    except psutil.Error:
+        pass
+    for p in procs:
+        try:
+            p.cpu_percent(None)  # prime the per-process counter
+        except psutil.Error:
+            pass
+    # psutil's per-process cpu_percent already uses 100% == one full core; do not rescale by core count.
+    deadline = time.monotonic() + duration_s
+    while time.monotonic() < deadline:
+        time.sleep(interval)
+        cpu = 0.0
+        rss = 0.0
+        alive = []
+        for p in procs:
+            try:
+                cpu += p.cpu_percent(None)
+                rss += p.memory_info().rss
+                alive.append(p)
+            except psutil.Error:
+                continue
+        procs = alive
+        out.cpu_percent.append(cpu)  # 100% == one full core
+        out.rss_mb.append(rss / (1024 * 1024))
+    return out
diff --git a/tests/e2e/report_md.py b/tests/e2e/report_md.py
new file mode 100644
index 0000000..97fbfb9
--- /dev/null
+++ b/tests/e2e/report_md.py
@@ -0,0 +1,169 @@
+"""Renders the E2E result dict into `report.md`, including a findings section
+that flags stability/efficiency problems against fixed thresholds and proposes
+concrete product adjustments.
+"""
+
+from __future__ import annotations
+
+
+def _findings(r: dict) -> list[str]:
+    """Derives actionable findings from the metrics. Empty list == all green."""
+    out: list[str] = []
+    sc = r.get("scenarios", {})
+
+    host = sc.get("host", {}).get("time_to_hosting_s", {})
+    failed = {k: v for k, v in host.items() if v is None}
+    if failed:
+        out.append(
+            f"**Initial host failed** for {list(failed)} (never reached Hosting). "
+            f"Add a connect timeout with a clear Error state instead of an open-ended wait."
+        )
+    cold = sc.get("s4_auto_resume", {}).get("cold_recover_s", {})
+    worst = max([v for v in list(host.values()) + list(cold.values()) if v], default=0)
+    if worst > 15:
+        out.append(
+            f"**Slow connect/resume** (worst {worst:.0f}s to Hosting). The host path "
+            f"mints two tokens (`devtunnel token … --scopes host` then `… manage:ports`) "
+            f"sequentially, then `list`+`show` per group, then the relay handshake — all "
+            f"before serving. 
Proposed adjustments: mint the two tokens concurrently, " + f"cache `collect_ports` from the create step instead of a fresh `list`/`show`, " + f"and emit a `Connecting` sub-progress so a 20–35 s wait doesn't look hung." + ) + + s2 = sc.get("s2_same_port", {}) + if s2 and not s2.get("all_serving"): + bad = [k for k, g in s2.get("groups", {}).items() if not g.get("serving")] + out.append( + f"**Same-port multi-group not fully serving**: {bad} never returned the " + f"backend marker. Multiple tunnels on one local port should each forward " + f"independently (issue #18 isolates groups per runtime) — verify no " + f"forward starvation under concurrent groups." + ) + + s3 = sc.get("s3_load", {}) + if s3: + if s3.get("idle_cpu_avg", 0) > 10: + out.append( + f"**Idle CPU too high** ({s3['idle_cpu_avg']}% avg, peak " + f"{s3.get('idle_cpu_max')}%) with no traffic — strong signal of the " + f"relay busy-loop regression (a dropped `ports_tx` makes `run_stream` " + f"spin). The keep-alive `_host` lifetime invariant must hold; re-check " + f"`host_group`. A correct host parks near 0%." + ) + if s3.get("error_rate", 0) > 0.02: + out.append( + f"**Elevated error rate under load** ({s3['error_rate']:.1%}). " + f"Inspect relay backpressure / forward timeouts; consider surfacing a " + f"degraded state and bounding per-connection concurrency." + ) + if (s3.get("p99_ms") or 0) > 2000: + out.append( + f"**High tail latency** p99={s3['p99_ms']}ms under {r['meta'].get('concurrency')} " + f"clients. Acceptable for a relay hop, but watch for growth over time." + ) + if s3.get("rss_growth_mb", 0) > 50: + out.append( + f"**Memory growth under load** (+{s3['rss_growth_mb']}MB over the run) — " + f"possible per-connection leak; sample a longer run to confirm." + ) + + s1 = sc.get("s1_reconnect", {}) + sr = s1.get("stop_rehost", {}) + if sr and not sr.get("serving_again"): + out.append( + "**Re-host did not resume serving** after a stop/start cycle. 
The engine's "
+            "`run` map removes the group on Stop and should accept a fresh Host — verify "
+            "the teardown fully releases the relay session before reconnect."
+        )
+    rd = s1.get("relay_drop", {})
+    if rd and (rd.get("serving_again") is False or (rd.get("recover_to_hosting_s") is None and "skipped" not in rd)):  # {} == relay drop never ran
+        out.append(
+            "**Did not recover from a forced relay drop**: keep-alive reconnect/backoff "
+            "did not bring the group back. This is the core product promise — prioritize."
+        )
+
+    s4 = sc.get("s4_auto_resume", {})
+    if s4 and not s4.get("serving_after"):
+        out.append(
+            "**Cold restart did not resume serving** all groups. The headless path only "
+            "re-hosts what it is told; in the GUI, confirm auto-resume re-hosts the prior "
+            "active set on launch."
+        )
+
+    return out
+
+
+def render(r: dict) -> str:
+    m = r.get("meta", {})
+    sc = r.get("scenarios", {})
+    L: list[str] = []
+    L.append("# DevTunnel GUI — Blackbox E2E Resilience Report\n")
+    L.append(f"- Started: `{m.get('started')}`")
+    L.append(f"- Result: **{m.get('result')}**")
+    L.append(f"- Groups: {m.get('groups')} on port {m.get('port')} · "
+             f"load {m.get('load_secs')}s @ {m.get('concurrency')} clients · "
+             f"elevated: {m.get('admin')}")
+    L.append(f"- Binary: `{m.get('binary')}`\n")
+
+    findings = _findings(r)
+    L.append("## Findings & proposed adjustments\n")
+    if not findings:
+        L.append("No stability/efficiency problems crossed the thresholds. "
+                 "Host parked at near-idle CPU, all groups served on the shared port, "
+                 "reconnect/auto-resume recovered.\n")
+    else:
+        for i, f in enumerate(findings, 1):
+            L.append(f"{i}. 
{f}") + L.append("") + + L.append("## Host startup\n") + L.append("| group | time to Hosting (s) |") + L.append("|---|---|") + for k, v in sc.get("host", {}).get("time_to_hosting_s", {}).items(): + L.append(f"| `{k}` | {v} |") + L.append("") + + s2 = sc.get("s2_same_port", {}) + if s2: + L.append("## S2 — multiple groups, same port\n") + L.append(f"All serving: **{s2.get('all_serving')}**\n") + L.append("| group | serving | p50 ms | rps | err |") + L.append("|---|---|---|---|---|") + for k, g in s2.get("groups", {}).items(): + L.append(f"| `{k}` | {g['serving']} | {g['p50_ms']} | {g['rps']} | {g['error_rate']} |") + L.append("") + + s3 = sc.get("s3_load", {}) + if s3: + L.append("## S3 — sustained load + host efficiency\n") + L.append(f"- Requests: {s3['ok']}/{s3['requests']} ok · rps {s3['rps']} · " + f"error rate {s3['error_rate']}") + L.append(f"- Latency: p50 {s3['p50_ms']} · p95 {s3['p95_ms']} · p99 {s3['p99_ms']} ms") + L.append(f"- Host CPU: idle avg {s3['idle_cpu_avg']}% (max {s3['idle_cpu_max']}%) · " + f"loaded avg {s3['loaded_cpu_avg']}% (max {s3['loaded_cpu_max']}%)") + L.append(f"- RSS growth under load: {s3['rss_growth_mb']} MB " + f"_(100% CPU = one full core)_\n") + + s1 = sc.get("s1_reconnect", {}) + if s1: + L.append("## S1 — reconnect after drop\n") + sr = s1.get("stop_rehost", {}) + L.append(f"- stop→rehost: stopped in {sr.get('stopped_s')}s, " + f"re-Hosting in {sr.get('rehost_to_hosting_s')}s, " + f"serving again: **{sr.get('serving_again')}**") + rd = s1.get("relay_drop", {}) + if "skipped" in rd: + L.append(f"- forced relay drop: _skipped_ ({rd['skipped']})\n") + else: + L.append(f"- forced relay drop: Reconnecting observed {rd.get('reconnecting_observed')}, " + f"recovered in {rd.get('recover_to_hosting_s')}s, " + f"serving again: **{rd.get('serving_again')}**\n") + + s4 = sc.get("s4_auto_resume", {}) + if s4: + L.append("## S4 — auto-resume after process kill\n") + L.append(f"- Killed pid {s4.get('killed_pid')}; serving after relaunch: " + 
f"**{s4.get('serving_after')}**") + L.append(f"- Cold recover: {s4.get('cold_recover_s')}\n") + + return "\n".join(L) diff --git a/tests/e2e/requirements.txt b/tests/e2e/requirements.txt new file mode 100644 index 0000000..8158ab2 --- /dev/null +++ b/tests/e2e/requirements.txt @@ -0,0 +1,2 @@ +requests>=2.31 +psutil>=5.9 diff --git a/tests/e2e/run_e2e.py b/tests/e2e/run_e2e.py new file mode 100644 index 0000000..53e9c57 --- /dev/null +++ b/tests/e2e/run_e2e.py @@ -0,0 +1,302 @@ +"""Blackbox E2E resilience suite for DevTunnel GUI. + +Uses the product the way a user would: creates groups (tunnels) on the same +local port, hosts them through the *production* keep-alive engine (headless), +serves a real Python backend, hammers the public URLs and runs resilience +scenarios, sampling the host process the whole time. Emits `report.md`. + +Scenarios (chosen with the user): + S2 multiple groups, same port - N tunnels -> one backend, all serving + S3 sustained load + latency - throughput / p50-p95-p99 / error rate, + plus idle + loaded CPU/RSS of the host + (catches the relay busy-loop regression) + S1 reconnect after drop - stop->rehost proxy always; real relay drop + via firewall block only when run elevated + S4 auto-resume - kill the host process, relaunch, recover + +Run: python tests/e2e/run_e2e.py [--groups N] [--port P] [--load-secs S] +Prereqs: `devtunnel` signed in; binary built with `--features hosting`. 
+""" + +from __future__ import annotations + +import argparse +import ctypes +import os +import subprocess +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +import backend +import harness as H + +HERE = Path(__file__).resolve().parent +REPO = HERE.parents[1] +BINARY = REPO / "target" / "debug" / "devtunnel_gui.exe" +PREFIX = "e2e" + + +def is_admin() -> bool: + try: + return bool(ctypes.windll.shell32.IsUserAnAdmin()) + except Exception: + return False + + +def banner(msg: str): + print(f"\n=== {msg} ===", flush=True) + + +def wait_url_serving(url: str, attempts: int = 30, delay: float = 1.0) -> bool: + """Polls a public URL until it returns the backend marker (route propagation).""" + for _ in range(attempts): + ok, _dt, _err = H.hit(url, timeout=8) + if ok: + return True + time.sleep(delay) + return False + + +def fw_block(program: str) -> bool: + r = subprocess.run( + ["netsh", "advfirewall", "firewall", "add", "rule", "name=e2e-relay-drop", + "dir=out", "action=block", f"program={program}", "enable=yes"], + capture_output=True, text=True) + return r.returncode == 0 + + +def fw_unblock(): + subprocess.run(["netsh", "advfirewall", "firewall", "delete", "rule", "name=e2e-relay-drop"], + capture_output=True, text=True) + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--groups", type=int, default=2, help="number of tunnels on the same port") + ap.add_argument("--port", type=int, default=3000, help="local backend port") + ap.add_argument("--load-secs", type=float, default=45.0, help="sustained-load duration") + ap.add_argument("--concurrency", type=int, default=8) + args = ap.parse_args() + + if not BINARY.exists(): + print(f"ERROR: host binary not found at {BINARY}\n" + f"Build it first: cargo build --features hosting", file=sys.stderr) + return 2 + if H.psutil is None: + print("WARNING: psutil missing — CPU/RSS sampling disabled (pip install psutil)", + file=sys.stderr) + + admin = is_admin() + 
report: dict = {"meta": {}, "scenarios": {}} + report["meta"] = { + "started": datetime.now(timezone.utc).isoformat(timespec="seconds"), + "groups": args.groups, "port": args.port, "binary": str(BINARY), + "admin": admin, "load_secs": args.load_secs, "concurrency": args.concurrency, + } + + created: list[str] = [] + urls: dict[str, str] = {} + runner: H.HostRunner | None = None + httpd = None + + try: + # ---- Setup -------------------------------------------------------- + banner("Setup: backend + groups") + httpd = backend.serve(args.port) + print(f"backend on 127.0.0.1:{args.port}") + + for i in range(args.groups): + name = f"{PREFIX}-{int(time.time())}-{i}" + fid = H.create_group(name) + created.append(fid) + H.add_port(fid, args.port, "http") + print(f" group {i}: {fid}") + + # ---- Host via production engine (headless) ------------------------ + banner("Host: launch headless production engine") + runner = H.HostRunner(str(BINARY), created).start() + t_host = {} + for fid in created: + secs = runner.wait_state(fid, "Hosting", timeout=120) + t_host[fid] = secs + print(f" {fid}: Hosting after {secs:.1f}s" if secs is not None + else f" {fid}: did NOT reach Hosting (state={runner.state(fid)})") + + # The public `portUri` only materializes once a host connection exists, + # so resolve URLs now (post-Hosting). The URL is stable for the tunnel's + # life, so cache it and reuse it across the later scenarios. 
+ for fid in created: + uri = None + for _ in range(20): + uri = H.port_uri(fid, args.port) + if uri: + break + time.sleep(1.0) + ident, cluster = fid.rsplit(".", 1) + urls[fid] = uri or f"https://{ident}-{args.port}.{cluster}.devtunnels.ms/" + print(f" url {fid} -> {urls[fid]}") + report["scenarios"]["host"] = { + "time_to_hosting_s": {k: round(v, 2) if v is not None else None + for k, v in t_host.items()}, + } + + # ---- S2: multiple groups, same port ------------------------------- + banner("S2: multiple groups share one port") + s2 = {"groups": {}} + for fid, url in urls.items(): + serving = wait_url_serving(url) if url else False + res = H.load(url, duration_s=5, concurrency=4) if serving else None + s2["groups"][fid] = { + "url": url, "serving": serving, + "p50_ms": round(res.pct(50), 1) if res else None, + "rps": round(res.rps, 1) if res else None, + "error_rate": round(res.error_rate, 3) if res else None, + } + print(f" {fid}: serving={serving}" + + (f" p50={res.pct(50):.0f}ms rps={res.rps:.1f}" if res else "")) + s2["all_serving"] = all(g["serving"] for g in s2["groups"].values()) + report["scenarios"]["s2_same_port"] = s2 + + # ---- S3: sustained load + latency + busy-loop watch --------------- + banner("S3: sustained load + host CPU/RSS") + target = next((u for u in urls.values() if u), None) + # Idle baseline first: no traffic, ~8s. A correct host parks at ~0% CPU; + # the relay busy-loop regression (issue: dropped ports_tx) pegs cores. 
+ idle = H.sample_process(runner.pid, duration_s=8) if runner.pid else H.ProcSamples() + print(f" idle CPU avg={idle.cpu_avg:.1f}% max={idle.cpu_max:.1f}% " + f"rss={idle.rss_mb[-1] if idle.rss_mb else 0:.0f}MB") + + import threading + load_res = {} + + def _run_load(): + load_res["r"] = H.load(target, args.load_secs, args.concurrency) + + lt = threading.Thread(target=_run_load) + lt.start() + loaded = H.sample_process(runner.pid, duration_s=args.load_secs) if runner.pid \ + else H.ProcSamples() + lt.join() + r = load_res.get("r") + if r: + print(f" load: {r.ok}/{r.requests} ok rps={r.rps:.1f} " + f"p50={r.pct(50):.0f} p95={r.pct(95):.0f} p99={r.pct(99):.0f}ms " + f"err={r.error_rate:.3f}") + print(f" loaded CPU avg={loaded.cpu_avg:.1f}% max={loaded.cpu_max:.1f}% " + f"RSS growth={loaded.rss_growth_mb:+.1f}MB") + report["scenarios"]["s3_load"] = { + "url": target, + "requests": r.requests if r else 0, "ok": r.ok if r else 0, + "rps": round(r.rps, 1) if r else 0, "error_rate": round(r.error_rate, 3) if r else 1, + "p50_ms": round(r.pct(50), 1) if r else None, + "p95_ms": round(r.pct(95), 1) if r else None, + "p99_ms": round(r.pct(99), 1) if r else None, + "idle_cpu_avg": round(idle.cpu_avg, 1), "idle_cpu_max": round(idle.cpu_max, 1), + "loaded_cpu_avg": round(loaded.cpu_avg, 1), "loaded_cpu_max": round(loaded.cpu_max, 1), + "rss_growth_mb": round(loaded.rss_growth_mb, 1), + } + + # ---- S1: reconnect after drop ------------------------------------- + banner("S1: reconnect after drop") + s1 = {} + fid = created[0] + url = urls[fid] + # (a) Always: stop -> rehost proxy (clean teardown -> reconnect path). 
+ runner.send(f"stop {fid}") + stopped = runner.wait_state(fid, "Stopped", timeout=20) + runner.send(f"host {fid}") + t0 = time.monotonic() + rehosted = runner.wait_state(fid, "Hosting", timeout=90) + reserve = wait_url_serving(url) + s1["stop_rehost"] = { + "stopped_s": round(stopped, 2) if stopped is not None else None, + "rehost_to_hosting_s": round(rehosted, 2) if rehosted is not None else None, + "serving_again": reserve, + } + print(f" stop->rehost: stopped={stopped} rehost={rehosted} serving_again={reserve}") + + # (b) Real relay drop via firewall — only when elevated. + if admin: + print(" forcing real relay drop via firewall block…") + if fw_block(str(BINARY)): + drop_seen = False + deadline = time.monotonic() + 30 + while time.monotonic() < deadline: + if runner.state(fid) == "Reconnecting": + drop_seen = True + break + time.sleep(0.5) + time.sleep(5) + fw_unblock() + t0 = time.monotonic() + back = runner.wait_state(fid, "Hosting", timeout=120) + serving = wait_url_serving(url) + s1["relay_drop"] = { + "reconnecting_observed": drop_seen, + "recover_to_hosting_s": round(back, 2) if back is not None else None, + "serving_again": serving, + } + print(f" relay drop: reconnecting={drop_seen} recover={back} serving={serving}") + else: + s1["relay_drop"] = {"skipped": "firewall rule add failed"} + else: + s1["relay_drop"] = {"skipped": "not elevated — re-run as admin to force a real relay drop"} + print(" real relay drop SKIPPED (needs admin). 
stop/rehost proxy used instead.") + report["scenarios"]["s1_reconnect"] = s1 + + # ---- S4: auto-resume (process kill + relaunch) -------------------- + banner("S4: auto-resume after host process kill") + old_pid = runner.pid + runner.kill() + time.sleep(2) + runner = H.HostRunner(str(BINARY), created).start() + cold = {} + for fid in created: + secs = runner.wait_state(fid, "Hosting", timeout=120) + cold[fid] = secs + serving_after = all(wait_url_serving(u) for u in urls.values() if u) + report["scenarios"]["s4_auto_resume"] = { + "killed_pid": old_pid, + "cold_recover_s": {k: round(v, 2) if v is not None else None for k, v in cold.items()}, + "serving_after": serving_after, + } + print(f" killed pid {old_pid}; cold recover={ {k: round(v,1) if v else None for k,v in cold.items()} } serving={serving_after}") + + report["meta"]["result"] = "completed" + + except Exception as e: + report["meta"]["result"] = f"error: {e!r}" + print(f"\nERROR: {e!r}", file=sys.stderr) + finally: + banner("Teardown") + if admin: + fw_unblock() + if runner: + runner.quit() + for fid in created: + try: + H.delete_group(fid) + print(f" deleted {fid}") + except Exception as e: + print(f" WARN delete {fid}: {e}") + if httpd: + httpd.shutdown() + + write_report(report) + return 0 + + +def write_report(report: dict): + import json as _json + + from report_md import render + (HERE / "report.json").write_text(_json.dumps(report, indent=2), encoding="utf-8") + out = HERE / "report.md" + out.write_text(render(report), encoding="utf-8") + print(f"\nReport written to {out}") + + +if __name__ == "__main__": + sys.exit(main()) From d203dcfb51fe1d3124af993f82f3498bb8d639f4 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 05:57:10 -0300 Subject: [PATCH 07/14] perf(host): mint host + manage:ports tokens concurrently off the executor (#38) connect_once minted the two tokens sequentially with blocking subprocess calls on the group's 
current-thread runtime. That doubled the mint wait and, during a periodic re-mint, stalled the still-live relay + port-forward tasks sharing the executor -- widening the very outage the re-mint exists to avoid. Mint each token on its own spawn_blocking thread and overlap them with try_join!, so the round-trips run in parallel and the old connection keeps forwarding while new tokens mint. Cuts initial connect time and shrinks the re-mint blip without overlapping two live relay connections (which would need live validation of two-simultaneous-hosts behavior -- left as follow-up). Co-Authored-By: Claude Opus 4.8 --- src/host/engine.rs | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/host/engine.rs b/src/host/engine.rs index 67906c9..dc54702 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -282,12 +282,32 @@ async fn connect_once( tunnel_id: &str, ports: &[(u16, String)], ) -> anyhow::Result<(RelayTunnelHost, tunnels::connections::RelayHandle)> { - let loc = Locale::load(&system_locale()); - - log::debug!("connect_once[{tunnel_id}]: minting host token"); - let host_token = devtunnel::mint_token(tunnel_id, "host", &loc)?; - log::debug!("connect_once[{tunnel_id}]: minting manage:ports token"); - let manage_token = devtunnel::mint_token(tunnel_id, "manage:ports", &loc)?; + // Mint both tokens concurrently on blocking threads. `mint_token` is a + // blocking subprocess + network round-trip; running the two sequentially on + // this current-thread runtime both doubles the wait and — during a re-mint — + // stalls the *still-live* relay + port-forward tasks that share this + // executor, widening the very outage the re-mint is meant to avoid. + // `spawn_blocking` moves each mint off the executor so the old connection + // keeps forwarding while the new tokens mint, and `try_join!` overlaps the + // two round-trips. 
`Locale` is `!Send`, so each closure builds its own from + // the system locale (used only for error formatting). + log::debug!("connect_once[{tunnel_id}]: minting host + manage:ports tokens"); + let host_task = { + let id = tunnel_id.to_string(); + tokio::task::spawn_blocking(move || { + devtunnel::mint_token(&id, "host", &Locale::load(&system_locale())) + }) + }; + let manage_task = { + let id = tunnel_id.to_string(); + tokio::task::spawn_blocking(move || { + devtunnel::mint_token(&id, "manage:ports", &Locale::load(&system_locale())) + }) + }; + let (host_res, manage_res) = tokio::try_join!(host_task, manage_task) + .map_err(|e| anyhow::anyhow!("token mint task panicked: {e}"))?; + let host_token = host_res?; + let manage_token = manage_res?; let (cluster, id) = devtunnel::split_locator(tunnel_id).ok_or_else(|| { anyhow::anyhow!("tunnel id has no cluster suffix (expected 'id.cluster'): {tunnel_id}") From 9072b3df2e7b1360396ef0607b9b5ad3534c919d Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 06:20:09 -0300 Subject: [PATCH 08/14] perf(host): fetch only the hosted tunnel's ports via a single show (#44) collect_ports called fetch_rows, which enumerates the whole account: a `devtunnel list` plus a `devtunnel show` for *every* tunnel, then discards all but the one being hosted. Hosting one tunnel therefore cost 1 + N subprocess round-trips (N = total tunnels), run serially before the relay handshake -- and the live E2E showed this, not the handshake, dominated the ~14-18s connect/resume time. Replace it with a targeted `fetch_tunnel_ports`: one `devtunnel show -j` for just the hosted tunnel, mapped to (port, protocol) by a pure, unit-tested helper (protocol preserved per #36). Account size no longer affects connect time. 
Measured on the blackbox E2E (live brs cluster): connect to Hosting ~14-18s -> ~2-5s, stop->rehost ~16.5s -> ~4.4s, cold recover ~16s -> ~1.4-4.9s; serving True, error rate 0, host CPU/RSS unchanged. Co-Authored-By: Claude Opus 4.8 --- src/devtunnel.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++- src/host/engine.rs | 17 +--------------- 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/devtunnel.rs b/src/devtunnel.rs index 48713c0..197978a 100644 --- a/src/devtunnel.rs +++ b/src/devtunnel.rs @@ -751,14 +751,62 @@ pub fn fetch_rows(loc: &Locale) -> Result> { Ok(rows) } +/// Fetches the ports of a single tunnel via `devtunnel show -j`, each paired +/// with its configured protocol. Targeted single-subprocess lookup: unlike +/// [`fetch_rows`], it does not enumerate the whole account (`list` + a `show` per +/// tunnel), so hosting one tunnel costs one CLI round-trip regardless of how many +/// tunnels the account holds (issue #44). The protocol is carried through because +/// re-registering a port under a different protocol is rejected by the service +/// and would block hosting (issue #36). +/// +/// # Errors +/// Propagates the CLI/JSON failure from the underlying `show` call. +#[cfg_attr(not(feature = "hosting"), allow(dead_code))] +pub fn fetch_tunnel_ports(tunnel_id: &str, loc: &Locale) -> Result> { + let show: ShowResult = run_json(&["show", tunnel_id, "-j"], loc)?; + Ok(tunnel_ports(show)) +} + +/// Maps a `show -j` result to `(port, protocol)` pairs, dropping ports that are +/// absent (`0`) or outside the valid `u16` range. Pure: split out from +/// [`fetch_tunnel_ports`] so the mapping is unit-tested without the CLI. 
+#[cfg_attr(not(feature = "hosting"), allow(dead_code))] +fn tunnel_ports(show: ShowResult) -> Vec<(u16, String)> { + show.tunnel + .ports + .into_iter() + .filter(|p| p.port_number > 0) + .filter_map(|p| u16::try_from(p.port_number).ok().map(|n| (n, p.protocol))) + .collect() +} + #[cfg(test)] mod tests { use super::{ anonymous_ace_args, classify_anonymous_access, classify_install_result, classify_user_show, is_auth_error, parse_leading_int, parse_rate_bps, parse_size_bytes, sanitize_tunnel_id, - update_expiration_args, InstallOutcome, + tunnel_ports, update_expiration_args, InstallOutcome, ShowResult, }; + #[test] + fn tunnel_ports_filters_zero_and_preserves_protocol() { + // `show -j` of one tunnel: a plain-http port, an https port, and an + // unconfigured (`0`) entry that must be dropped. + let json = r#"{ "tunnel": { "tunnelId": "x", "ports": [ + { "portNumber": 3000, "protocol": "http" }, + { "portNumber": 8443, "protocol": "https" }, + { "portNumber": 0, "protocol": "auto" } + ] } }"#; + let show: ShowResult = serde_json::from_str(json).expect("valid show JSON"); + assert_eq!( + tunnel_ports(show), + vec![ + (3000u16, "http".to_string()), + (8443u16, "https".to_string()) + ] + ); + } + #[test] fn parse_size_bytes_handles_units_and_locales() { assert_eq!(parse_size_bytes("4402 KB"), Some(4402.0 * 1024.0)); diff --git a/src/host/engine.rs b/src/host/engine.rs index dc54702..6efade1 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -81,7 +81,7 @@ fn run(cmd_rx: std::sync::mpsc::Receiver, events: Sender log::debug!("host engine: already hosting {tunnel_id}, ignoring Host"); continue; } - let ports = match collect_ports(&tunnel_id, &loc) { + let ports = match devtunnel::fetch_tunnel_ports(&tunnel_id, &loc) { Ok(ports) => ports, Err(e) => { let msg = e.to_string(); @@ -153,21 +153,6 @@ fn spawn_group( GroupHandle { thread, cancel } } -/// Fetches the ports defined for `tunnel_id` via the management CLI, each paired -/// with its configured protocol 
(`http`/`https`/`auto`). The protocol must be -/// preserved when forwarding: re-registering a port under a different protocol is -/// rejected by the service ("the tunnel port protocol cannot be changed") and -/// would block hosting entirely (issue #36). -fn collect_ports(tunnel_id: &str, loc: &Locale) -> anyhow::Result> { - let rows = devtunnel::fetch_rows(loc)?; - let ports: Vec<(u16, String)> = rows - .into_iter() - .filter(|r| r.tunnel_id == tunnel_id && r.port > 0) - .filter_map(|r| u16::try_from(r.port).ok().map(|p| (p, r.protocol))) - .collect(); - Ok(ports) -} - /// Long-running host task for one group: connect → add ports → keep alive, with /// reconnect-on-drop and periodic token re-mint. Loops forever; the caller's /// `select!` ends it when the group is cancelled (Stop). Returns early only on an From cda746349d4eb4cd9967ec216e8945144f84308e Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 06:31:15 -0300 Subject: [PATCH 09/14] feat(host): surface connect sub-phases so a slow connect shows progress (#45) A connect spends most of its time in three phases -- minting tokens, the relay handshake, and forwarding ports -- but reported only one static "Connecting" label, making a multi-second wait indistinguishable from a hang. Add an additive HostEvent::Progress { phase } emitted by connect_once at each phase boundary. The coarse Connecting/Hosting state transitions are unchanged, so the headless JSON contract the E2E depends on is preserved (the new "progress" line is additive). The GUI maps each phase to a Fluent status-bar string (status-connect-*); the headless runner serializes it as an additive "progress" event. Verified live: the stream now interleaves Connecting -> progress(authorizing) -> progress(connecting_relay) -> progress(forwarding_ports) -> Hosting, which also shows token minting (~1.9s) is now the dominant connect cost after the #44 port-fetch fix. 
Co-Authored-By: Claude Opus 4.8 --- i18n/en-US/app.ftl | 4 ++++ src/headless.rs | 16 ++++++++++++++++ src/host/engine.rs | 20 ++++++++++++++++++-- src/host/mod.rs | 22 ++++++++++++++++++++++ src/main.rs | 15 +++++++++++++++ 5 files changed, 75 insertions(+), 2 deletions(-) diff --git a/i18n/en-US/app.ftl b/i18n/en-US/app.ftl index f48cf07..dcf6361 100644 --- a/i18n/en-US/app.ftl +++ b/i18n/en-US/app.ftl @@ -75,6 +75,10 @@ btn-host = Host btn-stop = Stop status-hosting = hosting… status-stopped = stopped +# Connect sub-phases (issue #45): shown while a Host is establishing. +status-connect-authorizing = authorizing… +status-connect-relay = connecting relay… +status-connect-ports = forwarding ports… ## Health badges badge-operational = Operational diff --git a/src/headless.rs b/src/headless.rs index ec27689..8b900d2 100644 --- a/src/headless.rs +++ b/src/headless.rs @@ -154,6 +154,22 @@ fn event_json(started: Instant, evt: &HostEvent) -> serde_json::Value { "message": message, }) } + HostEvent::Progress { tunnel_id, phase } => { + // Additive to the `state` stream (issue #45): the coarse Connecting / + // Hosting transitions still fire, so a harness keyed on those is + // unaffected; this just exposes the sub-phase for finer diagnostics. 
+ let phase = match phase { + host::ConnectPhase::Authorizing => "authorizing", + host::ConnectPhase::ConnectingRelay => "connecting_relay", + host::ConnectPhase::ForwardingPorts => "forwarding_ports", + }; + serde_json::json!({ + "elapsed_ms": elapsed_ms, + "event": "progress", + "tunnel_id": tunnel_id, + "phase": phase, + }) + } HostEvent::ReloginRequired { tunnel_id } => serde_json::json!({ "elapsed_ms": elapsed_ms, "event": "relogin_required", diff --git a/src/host/engine.rs b/src/host/engine.rs index 6efade1..099d640 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -36,7 +36,7 @@ use tunnels::connections::RelayTunnelHost; use tunnels::contracts::TunnelPort; use tunnels::management::{new_tunnel_management, Authorization, TunnelLocator}; -use super::{HostCommand, HostEvent, HostState}; +use super::{ConnectPhase, HostCommand, HostEvent, HostState}; use crate::devtunnel; use crate::locale::{system_locale, Locale}; @@ -172,7 +172,7 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender }, ); - let action = match connect_once(&tunnel_id, &ports).await { + let action = match connect_once(&tunnel_id, &ports, &events).await { // INVARIANT: `_host` (the `RelayTunnelHost`) MUST stay bound across // the keep-alive `select!` below — it owns the `ports_tx` // watch::Sender that every client's `run_stream` task waits on. The @@ -266,7 +266,11 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender async fn connect_once( tunnel_id: &str, ports: &[(u16, String)], + events: &Sender, ) -> anyhow::Result<(RelayTunnelHost, tunnels::connections::RelayHandle)> { + // Surface each connect sub-phase so a multi-second wait shows progress + // instead of one static "Connecting" label (issue #45). + progress(events, tunnel_id, ConnectPhase::Authorizing); // Mint both tokens concurrently on blocking threads. 
`mint_token` is a // blocking subprocess + network round-trip; running the two sequentially on // this current-thread runtime both doubles the wait and — during a re-mint — @@ -305,10 +309,14 @@ async fn connect_once( let locator = TunnelLocator::ID { cluster, id }; let mut host = RelayTunnelHost::new(locator, mgmt); + progress(events, tunnel_id, ConnectPhase::ConnectingRelay); log::debug!("connect_once[{tunnel_id}]: connecting to relay"); let handle = host.connect(&host_token).await?; log::info!("connect_once[{tunnel_id}]: relay connected"); + if !ports.is_empty() { + progress(events, tunnel_id, ConnectPhase::ForwardingPorts); + } for (port, protocol) in ports { // Forward each port under its configured protocol. The service rejects a // re-registration that changes the protocol, so an `https`/`auto` port @@ -340,3 +348,11 @@ fn emit(events: &Sender, tunnel_id: &str, state: HostState) { state, }); } + +/// Sends a connect sub-phase to the UI, ignoring a closed channel (UI gone). +fn progress(events: &Sender, tunnel_id: &str, phase: ConnectPhase) { + let _ = events.send(HostEvent::Progress { + tunnel_id: tunnel_id.to_string(), + phase, + }); +} diff --git a/src/host/mod.rs b/src/host/mod.rs index ccadfa3..600f7ac 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -44,6 +44,21 @@ pub enum HostState { Error(String), } +/// A sub-phase of an in-progress connect, reported via [`HostEvent::Progress`] +/// so a multi-second connect shows what it is doing instead of a single static +/// "Connecting" label (issue #45). Purely informational: it does not change the +/// coarse [`HostState`] lifecycle, so consumers that only track Connecting / +/// Hosting / Reconnecting can ignore it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConnectPhase { + /// Minting the `host` + `manage:ports` tokens. + Authorizing, + /// Establishing the relay connection (TLS/SSH handshake). + ConnectingRelay, + /// Registering the group's ports on the relay. 
+ ForwardingPorts, +} + /// A command sent to the host engine. #[derive(Debug, Clone)] pub enum HostCommand { @@ -58,6 +73,13 @@ pub enum HostCommand { pub enum HostEvent { /// A group's hosting state changed. State { tunnel_id: String, state: HostState }, + /// A group's connect advanced to a new sub-phase (issue #45). Additive to + /// [`HostEvent::State`]: the coarse Connecting/Hosting transitions still + /// fire, so a consumer can ignore this without missing any lifecycle change. + Progress { + tunnel_id: String, + phase: ConnectPhase, + }, /// The CLI sign-in is expired or absent; hosting cannot proceed until the /// user re-authenticates via `devtunnel user login`. ReloginRequired { tunnel_id: String }, diff --git a/src/main.rs b/src/main.rs index 21ff548..0e13595 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1012,6 +1012,21 @@ fn main() -> anyhow::Result<()> { } host_changed = true; } + host::HostEvent::Progress { tunnel_id, phase } => { + // Show the connect sub-phase in the status bar so a + // multi-second connect reads as progress, not a hang + // (issue #45). Coarse host state is updated by the + // State arm; this only drives the transient label. 
+ log::debug!("host progress: {tunnel_id} -> {phase:?}"); + if let Some(a) = weak.upgrade() { + let key = match phase { + host::ConnectPhase::Authorizing => "status-connect-authorizing", + host::ConnectPhase::ConnectingRelay => "status-connect-relay", + host::ConnectPhase::ForwardingPorts => "status-connect-ports", + }; + a.set_status(loc.t(key).into()); + } + } host::HostEvent::ReloginRequired { tunnel_id } => { log::warn!("host: re-login required (reported for {tunnel_id})"); // Enter the re-login state: banner + alert tray icon + From 605cb4d149a10c89dd057a7cb85c77d2827e3e7a Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 06:41:16 -0300 Subject: [PATCH 10/14] perf(host): reuse host/manage tokens across reconnects instead of re-minting (#47) After #44, token minting (~1.9s of two `devtunnel token` subprocess round-trips) is the dominant connect cost -- and connect_once re-minted on every attempt, including relay-drop reconnects where the previous tokens are still valid (~24h lifetime; the engine already re-mints proactively at 20h). Cache the minted (host, manage) pair driver-side in host_group and reuse it: - relay-drop reconnect -> reuse cached tokens (skip the mint and the `Authorizing` phase); - RemintDue (~20h) -> clear the cache and mint fresh before expiry; - connect failure -> cache already taken and not restored, so the next attempt re-mints (no stale-token reuse loop). No expiry parsing needed: the 20h re-mint timer bounds reuse well inside the ~24h validity. mint_tokens is split out of connect_once, which now takes an Option and returns the tokens used so the caller can cache them. The _host busy-loop invariant is unchanged. Live: first connect still mints + serves (Connecting -> authorizing -> connecting_relay -> forwarding_ports -> Hosting). The in-session relay-drop reuse path needs an elevated firewall block to force (same S1b limitation the E2E documents); reviewed by inspection. 
Gates: cargo test (76), clippy default + --features hosting, fmt --check -- all green. Co-Authored-By: Claude Opus 4.8 --- src/host/engine.rs | 108 +++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 33 deletions(-) diff --git a/src/host/engine.rs b/src/host/engine.rs index 099d640..0b74939 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -161,6 +161,11 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender use super::keepalive::{Action, ConnEvent, ConnFailure, KeepAliveState, Phase}; let mut state = KeepAliveState::new(); + // Tokens minted on a successful connect, reused on the next reconnect so a + // relay drop does not re-pay the ~2s mint cost (issue #47). Cleared on a + // `RemintDue` (force a fresh mint before expiry) and on any connect failure + // (never keep reusing tokens a failed attempt might implicate). + let mut cached: Option = None; loop { emit( @@ -172,7 +177,7 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender }, ); - let action = match connect_once(&tunnel_id, &ports, &events).await { + let action = match connect_once(&tunnel_id, &ports, &events, cached.take()).await { // INVARIANT: `_host` (the `RelayTunnelHost`) MUST stay bound across // the keep-alive `select!` below — it owns the `ports_tx` // watch::Sender that every client's `run_stream` task waits on. The @@ -183,9 +188,11 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender // so the wait stays inline here: `_host` must not be moved into a // helper that drops it before the await. The only early `return` is // in the `Err` arm, where no live host is bound. - Ok((_host, handle)) => { + Ok((_host, handle, tokens)) => { // Success resets the backoff and leaves the first-attempt phase. let _ = state.next(ConnEvent::Connected); + // Keep the still-valid tokens for the next reconnect. 
+ cached = Some(tokens); emit(&events, &tunnel_id, HostState::Hosting); // Keep alive until the relay drops or the re-mint timer fires. @@ -199,6 +206,11 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender ConnEvent::RemintDue } }; + // A re-mint must discard the cache so the next attempt mints fresh + // tokens before the old ones expire; a plain relay drop keeps them. + if matches!(event, ConnEvent::RemintDue) { + cached = None; + } // `_host` and the unfinished `handle` both drop here on the way // to reconnect, tearing down the relay session so old // `run_stream` tasks exit via their stream-closed arm. @@ -255,32 +267,25 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender } } -/// One connect attempt: mint fresh tokens, build the client, connect, add ports. -/// `Locale` is rebuilt here because it is not `Send` across the `await` points of -/// the host task. -/// -/// Returns the live [`RelayTunnelHost`] **and** its [`RelayHandle`]. The caller -/// must keep the host bound for the lifetime of the connection: it owns the -/// `ports_tx` watch::Sender that the SDK's per-client `run_stream` tasks wait on, -/// and dropping it early makes those tasks busy-loop (see [`host_group`]). -async fn connect_once( - tunnel_id: &str, - ports: &[(u16, String)], - events: &Sender, -) -> anyhow::Result<(RelayTunnelHost, tunnels::connections::RelayHandle)> { - // Surface each connect sub-phase so a multi-second wait shows progress - // instead of one static "Connecting" label (issue #45). - progress(events, tunnel_id, ConnectPhase::Authorizing); - // Mint both tokens concurrently on blocking threads. `mint_token` is a - // blocking subprocess + network round-trip; running the two sequentially on - // this current-thread runtime both doubles the wait and — during a re-mint — - // stalls the *still-live* relay + port-forward tasks that share this - // executor, widening the very outage the re-mint is meant to avoid. 
- // `spawn_blocking` moves each mint off the executor so the old connection - // keeps forwarding while the new tokens mint, and `try_join!` overlaps the - // two round-trips. `Locale` is `!Send`, so each closure builds its own from - // the system locale (used only for error formatting). - log::debug!("connect_once[{tunnel_id}]: minting host + manage:ports tokens"); +/// The two scoped tokens a host connection needs, cached across reconnects so a +/// relay drop does not re-pay the mint cost (issue #47). +struct Tokens { + /// `host` scope — authorizes the relay connection. + host: String, + /// `manage:ports` scope — authorizes `add_port`'s `create_tunnel_port`. + manage: String, +} + +/// Mints both scoped tokens concurrently on blocking threads. `mint_token` is a +/// blocking subprocess + network round-trip; running the two sequentially on the +/// group's current-thread runtime both doubles the wait and — during a re-mint — +/// stalls the *still-live* relay + port-forward tasks sharing this executor, +/// widening the very outage the re-mint is meant to avoid. `spawn_blocking` moves +/// each mint off the executor so the old connection keeps forwarding while the new +/// tokens mint, and `try_join!` overlaps the two round-trips. `Locale` is `!Send`, +/// so each closure builds its own from the system locale (error formatting only). 
+async fn mint_tokens(tunnel_id: &str) -> anyhow::Result { + log::debug!("mint_tokens[{tunnel_id}]: minting host + manage:ports tokens"); let host_task = { let id = tunnel_id.to_string(); tokio::task::spawn_blocking(move || { @@ -295,8 +300,43 @@ async fn connect_once( }; let (host_res, manage_res) = tokio::try_join!(host_task, manage_task) .map_err(|e| anyhow::anyhow!("token mint task panicked: {e}"))?; - let host_token = host_res?; - let manage_token = manage_res?; + Ok(Tokens { + host: host_res?, + manage: manage_res?, + }) +} + +/// One connect attempt: obtain tokens (reuse `cached` or mint fresh), build the +/// client, connect, add ports. +/// +/// `cached` carries the tokens from the previous successful connect; when present +/// they are reused — skipping the ~2s mint (and the `Authorizing` phase) — and +/// otherwise a fresh pair is minted (issue #47). On success the tokens used are +/// returned in the tuple so the caller can cache them for the next reconnect. +/// +/// Returns the live [`RelayTunnelHost`] **and** its [`RelayHandle`]. The caller +/// must keep the host bound for the lifetime of the connection: it owns the +/// `ports_tx` watch::Sender that the SDK's per-client `run_stream` tasks wait on, +/// and dropping it early makes those tasks busy-loop (see [`host_group`]). +async fn connect_once( + tunnel_id: &str, + ports: &[(u16, String)], + events: &Sender, + cached: Option, +) -> anyhow::Result<(RelayTunnelHost, tunnels::connections::RelayHandle, Tokens)> { + // Reuse the previous connect's still-valid tokens when available; only mint + // (surfaced as the `Authorizing` phase, issue #45) when there is no cached + // pair — i.e. the first connect, a re-mint, or after a failed attempt. + let tokens = match cached { + Some(tokens) => { + log::debug!("connect_once[{tunnel_id}]: reusing cached tokens"); + tokens + } + None => { + progress(events, tunnel_id, ConnectPhase::Authorizing); + mint_tokens(tunnel_id).await? 
+ } + }; let (cluster, id) = devtunnel::split_locator(tunnel_id).ok_or_else(|| { anyhow::anyhow!("tunnel id has no cluster suffix (expected 'id.cluster'): {tunnel_id}") @@ -304,14 +344,16 @@ async fn connect_once( log::debug!("connect_once[{tunnel_id}]: locator cluster={cluster} id={id} ports={ports:?}"); let mut builder = new_tunnel_management(USER_AGENT); - builder.authorization(Authorization::Tunnel(manage_token)); + // Clone into the client so the original stays in `tokens`, which is returned + // for the caller to cache and reuse on the next reconnect. + builder.authorization(Authorization::Tunnel(tokens.manage.clone())); let mgmt = builder.into(); let locator = TunnelLocator::ID { cluster, id }; let mut host = RelayTunnelHost::new(locator, mgmt); progress(events, tunnel_id, ConnectPhase::ConnectingRelay); log::debug!("connect_once[{tunnel_id}]: connecting to relay"); - let handle = host.connect(&host_token).await?; + let handle = host.connect(&tokens.host).await?; log::info!("connect_once[{tunnel_id}]: relay connected"); if !ports.is_empty() { @@ -338,7 +380,7 @@ async fn connect_once( log::info!("connect_once[{tunnel_id}]: port {port} forwarded ({proto})"); } - Ok((host, handle)) + Ok((host, handle, tokens)) } /// Sends a state transition to the UI, ignoring a closed channel (UI gone). From 9dc3a262ed5911589bf04cb18e90f93bcc9652c3 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 07:04:45 -0300 Subject: [PATCH 11/14] test(host): add a diagnostic forced relay-drop hook to verify token reuse (#47) A genuine in-session relay drop (the path that exercises #47's token reuse) could only be forced with an elevated firewall block, which is slow and flaky: an outbound block does not sever the established relay socket until a long keepalive timeout, and a held block makes the reconnect attempts fail (which by design clears the cache and re-mints), so it never cleanly demonstrates reuse. 
Add a HostCommand::DropRelay that signals a per-group Notify raced in the keep-alive select!, producing a RelayDropped without tearing the group down. The headless runner exposes it as a `drop <id>` stdin command. This forces a deterministic reconnect with no network outage, firewall, or admin. Verified reuse with it (non-elevated): after `drop`, the reconnect goes straight to connecting_relay with NO `authorizing` phase and reaches Hosting in ~0.5s (vs ~2.4s on first connect) -- the relay accepts the reused token and the ~1.9s mint is skipped. Closes the open verification item on #47. Gates: cargo test (76), clippy default + --features hosting, fmt --check. Co-Authored-By: Claude Opus 4.8 --- src/headless.rs | 13 +++++++++++-- src/host/engine.rs | 40 ++++++++++++++++++++++++++++++++++------ src/host/mod.rs | 6 ++++++ 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/src/headless.rs b/src/headless.rs index 8b900d2..08f979f 100644 --- a/src/headless.rs +++ b/src/headless.rs @@ -10,8 +10,11 @@ //! Observability: every [`host::HostEvent`] is written as one JSON line on //! stdout (logs stay on stderr via the capturing logger), so an external process //! can observe state transitions deterministically. Control: it reads simple -//! line commands on stdin — `stop <id>`, `stop` (all groups), `quit` (stop all -//! and exit). EOF on stdin is treated as `quit`. +//! line commands on stdin — `host <id>` (re-host), `stop <id>`, `stop` (all +//! groups), `drop <id>` (force a relay drop + reconnect without tearing the +//! group down — exercises the reconnect / token-reuse path of issue #47 +//! deterministically, no firewall/admin needed), and `quit` (stop all and exit). +//! EOF on stdin is treated as `quit`. //! //! Only the `--features hosting` build has a real engine; the default build's //! `NoopHost` makes this a no-op, which keeps the module compiling everywhere. @@ -28,6 +31,9 @@ enum Ctl { Host(String), /// Stop one group by Real Tunnel ID. 
Stop(String), + /// Force one group's relay to drop and reconnect without tearing it down + /// (exercises the real reconnect / token-reuse path; issue #47). + Drop(String), /// Stop every hosted group. StopAll, /// Stop everything and exit. @@ -77,6 +83,8 @@ pub fn run(ids_csv: &str) -> anyhow::Result<()> { Ctl::StopAll } else if let Some(rest) = line.strip_prefix("stop ") { Ctl::Stop(rest.trim().to_owned()) + } else if let Some(rest) = line.strip_prefix("drop ") { + Ctl::Drop(rest.trim().to_owned()) } else if let Some(rest) = line.strip_prefix("host ") { Ctl::Host(rest.trim().to_owned()) } else { @@ -104,6 +112,7 @@ pub fn run(ids_csv: &str) -> anyhow::Result<()> { match ctl_rx.recv_timeout(Duration::from_millis(100)) { Ok(Ctl::Host(id)) => host.send(HostCommand::Host { tunnel_id: id }), Ok(Ctl::Stop(id)) => host.send(HostCommand::Stop { tunnel_id: id }), + Ok(Ctl::Drop(id)) => host.send(HostCommand::DropRelay { tunnel_id: id }), Ok(Ctl::StopAll) => stop_all(host.as_ref(), &ids), Ok(Ctl::Quit) => { stop_all(host.as_ref(), &ids); diff --git a/src/host/engine.rs b/src/host/engine.rs index 0b74939..7292a1b 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -57,11 +57,13 @@ pub fn start(events: Sender) -> std::sync::mpsc::Sender } /// Handle to a per-group worker thread: its join handle (used only to check -/// liveness on a repeat `Host`) and a cancellation [`Notify`] that, when -/// signalled, ends the group's `block_on` so its runtime drops. +/// liveness on a repeat `Host`), a cancellation [`Notify`] that ends the group's +/// `block_on` so its runtime drops, and a `drop_relay` [`Notify`] that forces a +/// reconnect without tearing the group down (diagnostic, issue #47). struct GroupHandle { thread: std::thread::JoinHandle<()>, cancel: Arc, + drop_relay: Arc, } /// Engine command loop. 
Runs on its own OS thread with no async runtime of its @@ -108,6 +110,16 @@ fn run(cmd_rx: std::sync::mpsc::Receiver, events: Sender } emit(&events, &tunnel_id, HostState::Stopped); } + HostCommand::DropRelay { tunnel_id } => { + // Force the live group to reconnect (it sees a RelayDropped) while + // staying hosted. Ignored if the group is gone or not yet up. + if let Some(group) = groups.get(&tunnel_id) { + if !group.thread.is_finished() { + log::debug!("host engine: forcing relay drop for {tunnel_id}"); + group.drop_relay.notify_one(); + } + } + } } } } @@ -125,6 +137,8 @@ fn spawn_group( ) -> GroupHandle { let cancel = Arc::new(Notify::new()); let cancel_signal = cancel.clone(); + let drop_relay = Arc::new(Notify::new()); + let drop_signal = drop_relay.clone(); let thread = std::thread::Builder::new() .name(format!("devtunnel-host-{tunnel_id}")) @@ -143,21 +157,30 @@ fn spawn_group( let local = tokio::task::LocalSet::new(); local.block_on(&rt, async { tokio::select! { - _ = host_group(tunnel_id, ports, events) => {} + _ = host_group(tunnel_id, ports, events, drop_signal) => {} _ = cancel_signal.notified() => {} } }); }) .expect("spawning a per-group host thread should not fail"); - GroupHandle { thread, cancel } + GroupHandle { + thread, + cancel, + drop_relay, + } } /// Long-running host task for one group: connect → add ports → keep alive, with /// reconnect-on-drop and periodic token re-mint. Loops forever; the caller's /// `select!` ends it when the group is cancelled (Stop). Returns early only on an /// unrecoverable error (e.g. expired sign-in). 
-async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender) { +async fn host_group( + tunnel_id: String, + ports: Vec<(u16, String)>, + events: Sender, + drop_relay: Arc, +) { use super::keepalive::{Action, ConnEvent, ConnFailure, KeepAliveState, Phase}; let mut state = KeepAliveState::new(); @@ -195,7 +218,8 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender cached = Some(tokens); emit(&events, &tunnel_id, HostState::Hosting); - // Keep alive until the relay drops or the re-mint timer fires. + // Keep alive until the relay drops, the re-mint timer fires, or a + // diagnostic `DropRelay` forces a reconnect (issue #47). let event = tokio::select! { r = handle => { log::warn!("host engine: {tunnel_id} relay disconnected: {r:?}"); @@ -205,6 +229,10 @@ async fn host_group(tunnel_id: String, ports: Vec<(u16, String)>, events: Sender log::info!("host engine: {tunnel_id} re-minting tokens before expiry"); ConnEvent::RemintDue } + _ = drop_relay.notified() => { + log::info!("host engine: {tunnel_id} forced relay drop (diagnostic)"); + ConnEvent::RelayDropped + } }; // A re-mint must discard the cache so the next attempt mints fresh // tokens before the old ones expire; a plain relay drop keeps them. diff --git a/src/host/mod.rs b/src/host/mod.rs index 600f7ac..c4696f7 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -66,6 +66,12 @@ pub enum HostCommand { Host { tunnel_id: String }, /// Stop hosting the given group; its definition is left intact. Stop { tunnel_id: String }, + /// Diagnostic: force the group's live relay connection to drop and reconnect, + /// *without* tearing the group down — exercises the real reconnect path + /// (including token reuse, issue #47) deterministically and without a network + /// outage / firewall block. Emitted only by the headless test runner; the GUI + /// never sends it. + DropRelay { tunnel_id: String }, } /// An event emitted by the host engine for the UI to consume. 
From 16b7af12d72174d5ea6c780dab2edbc9a9965446 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 07:22:58 -0300 Subject: [PATCH 12/14] feat(host): add probe-down watchdog policy to the keep-alive state machine (#39) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bridge the public-URL health probe into the keep-alive policy so a zombie tunnel (relay session the SDK still believes live, but whose public URL is dead) forces a reconnect instead of hanging in `Hosting` forever. This commit lands the *pure, unit-tested* half of #39: the policy. The driver wiring (feeding probe ticks into the engine's keep-alive `select!`) stays out, gated on the #37 zombie-tunnel go-decision — `Action::Reconnect` is therefore never emitted by `engine.rs` yet, only handled. - `ProbeOutcome { Healthy, Down, ServiceDown }` and `ConnEvent::Probe(_)`: the streak is counted inside the state machine so the false-positive guard is pure and testable. Only a `Down` streak reaching `PROBE_DOWN_THRESHOLD` (3) on a live `Hosting` session yields `Action::Reconnect`; `ServiceDown` (relay alive, local upstream down — e.g. a server restart) never triggers, per the #39 acceptance criterion. - Probes before the first connect, or after a session-ending event, are absorbed as `Await` — the watchdog only arms between `Connected` and the next teardown. - `Reconnect` reconnects immediately with no extra backoff, funnelling into the existing `connect_once` path (no parallel reconnect logic). 8 new state-machine tests cover the streak threshold, the ServiceDown guard, streak resets, the not-connected windows, and re-arming after a reconnect. 
Co-Authored-By: Claude Opus 4.8 --- src/host/engine.rs | 5 + src/host/keepalive.rs | 267 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 268 insertions(+), 4 deletions(-) diff --git a/src/host/engine.rs b/src/host/engine.rs index 7292a1b..cd76ee7 100644 --- a/src/host/engine.rs +++ b/src/host/engine.rs @@ -287,6 +287,11 @@ async fn host_group( // Execute the policy's decision for the next (re)connect attempt. match action { Action::Sleep(d) => tokio::time::sleep(d).await, + // `Reconnect` (zombie watchdog, issue #39) reconnects now with no + // sleep — the loop falls straight back to `connect_once`. It is only + // emitted once the public probe is wired into the keep-alive `select!` + // (gated on the #37 go-decision); until then it is never produced here. + Action::Reconnect => {} // `Await` only follows a `Connected` event, which the Ok arm // overwrites with the keep-alive outcome before reaching here; // `Relogin`/`Fail` return in the Err arm above. None are reachable. diff --git a/src/host/keepalive.rs b/src/host/keepalive.rs index 1e03034..fed4915 100644 --- a/src/host/keepalive.rs +++ b/src/host/keepalive.rs @@ -19,6 +19,13 @@ pub const REMINT_AFTER: Duration = Duration::from_secs(20 * 60 * 60); const RECONNECT_BACKOFF_START: Duration = Duration::from_secs(2); const RECONNECT_BACKOFF_MAX: Duration = Duration::from_secs(60); +/// Consecutive public-probe `Down` cycles, on a still-`Hosting` group, that force +/// a watchdog reconnect (issue #39). Requiring a streak — not a single `Down` — +/// rides out a one-off probe blip; at the Health probe's cadence this is a few +/// seconds of a confirmed-dead public URL before the engine tears the (apparently +/// live but zombie) relay session down and reconnects. +pub const PROBE_DOWN_THRESHOLD: u32 = 3; + /// Why a connect attempt failed — drives whether the driver retries, stops, or /// asks the user to re-authenticate. 
The driver classifies the raw error string /// (via the `devtunnel` helpers) into one of these so the state machine stays @@ -35,6 +42,24 @@ pub enum ConnFailure { Transient, } +/// Outcome of a public-URL health probe, fed into the watchdog (issue #39). The +/// kinds mirror the Health probe's own distinction and exist so the false-positive +/// guard lives in the pure policy: only [`ProbeOutcome::Down`] (relay unreachable) +/// can drive a reconnect — [`ProbeOutcome::ServiceDown`] (relay answered 5xx, so +/// it is alive but the local upstream is down, e.g. a server restart) never does. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ProbeOutcome { + /// The public URL served a healthy response; the tunnel is up. + Healthy, + /// The relay was unreachable (network error/timeout) — a possible zombie + /// tunnel. A streak of these on a `Hosting` group forces a reconnect. + Down, + /// The relay answered but with a 5xx: the relay is alive, the local upstream + /// is down. Never a reconnect trigger — reconnecting would not revive a + /// restarting local server and would churn a perfectly good relay session. + ServiceDown, +} + /// A connection outcome fed into the state machine by the driver. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ConnEvent { @@ -46,6 +71,11 @@ pub enum ConnEvent { RemintDue, /// A connect attempt failed, carrying why (see [`ConnFailure`]). ConnectFailed(ConnFailure), + /// A public-URL health probe reported the given outcome (issue #39). Only a + /// streak of [`ProbeOutcome::Down`] on a still-`Hosting` group yields + /// [`Action::Reconnect`]; every other outcome (and any probe before the first + /// successful connect) is absorbed as [`Action::Await`]. + Probe(ProbeOutcome), } /// What the driver should execute next, returned by [`KeepAliveState::next`]. @@ -59,6 +89,12 @@ pub enum Action { Relogin, /// A non-recoverable error: surface it and stop. No retry, no relogin prompt. 
Fail, + /// The public-URL watchdog (issue #39) judged the live session a zombie: a + /// `Down` streak reached [`PROBE_DOWN_THRESHOLD`] while still `Hosting`. The + /// driver force-drops the (apparently live) relay handle and reconnects now — + /// no extra sleep, funnelling into the same `connect_once` path so any ensuing + /// failure backs off normally (no parallel reconnect logic). + Reconnect, } /// Presentation phase. The driver maps it to `HostState::Connecting` (first @@ -76,6 +112,14 @@ pub enum Phase { pub struct KeepAliveState { backoff: Duration, first_attempt: bool, + /// Whether a live relay session is currently believed up (between a + /// [`ConnEvent::Connected`] and the next session-ending event). The watchdog + /// only counts probe `Down`s while this holds — a probe failing during a + /// connect attempt is not a zombie, just the connect not landed yet. + connected: bool, + /// Consecutive [`ProbeOutcome::Down`] cycles seen while `connected`. Any other + /// probe outcome, or a session-ending event, resets it to zero. + down_streak: u32, } impl KeepAliveState { @@ -84,6 +128,8 @@ impl KeepAliveState { Self { backoff: RECONNECT_BACKOFF_START, first_attempt: true, + connected: false, + down_streak: 0, } } @@ -107,32 +153,77 @@ impl KeepAliveState { /// backoff, consecutive connect-failures keep doubling it). pub fn next(&mut self, event: ConnEvent) -> Action { match event { - // Success: reset the backoff and leave the first-attempt phase. + // Success: reset the backoff and leave the first-attempt phase. A live + // session is now up, so the watchdog starts counting from a clean slate. ConnEvent::Connected => { self.backoff = RECONNECT_BACKOFF_START; self.first_attempt = false; + self.connected = true; + self.down_streak = 0; Action::Await } // A live session ended (drop or re-mint): sleep the current backoff, - // then double it (capped) for the next attempt. 
- ConnEvent::RelayDropped | ConnEvent::RemintDue => Action::Sleep(self.bump()), + // then double it (capped) for the next attempt. The session is no + // longer up, so the watchdog stops counting until the next connect. + ConnEvent::RelayDropped | ConnEvent::RemintDue => { + self.end_session(); + Action::Sleep(self.bump()) + } // Expired sign-in: stop and ask the user to re-authenticate. - ConnEvent::ConnectFailed(ConnFailure::Auth) => Action::Relogin, + ConnEvent::ConnectFailed(ConnFailure::Auth) => { + self.end_session(); + Action::Relogin + } // Non-recoverable error: stop. Retrying identical inputs would loop // forever (re-minting tokens each cycle) without ever succeeding. ConnEvent::ConnectFailed(ConnFailure::Fatal) => { self.first_attempt = false; + self.end_session(); Action::Fail } // Recoverable connect failure: leave the first-attempt phase and // back off without resetting (consecutive failures keep doubling). ConnEvent::ConnectFailed(ConnFailure::Transient) => { self.first_attempt = false; + self.end_session(); Action::Sleep(self.bump()) } + // Public-URL watchdog (issue #39). Only counts while a live session is + // up; outside one (during a connect attempt) a failing probe is just + // the connect not landed yet, not a zombie. + ConnEvent::Probe(outcome) => self.on_probe(outcome), } } + /// Applies a health-probe outcome to the watchdog and returns the action. + /// + /// A streak of [`ProbeOutcome::Down`] reaching [`PROBE_DOWN_THRESHOLD`] on a + /// live session yields [`Action::Reconnect`] (and resets the streak so the next + /// trigger needs a fresh full streak — no tight reconnect loop). Every other + /// outcome resets the streak; [`ProbeOutcome::ServiceDown`] in particular never + /// triggers a reconnect (relay alive, local upstream down). A probe arriving + /// while no session is up is absorbed as [`Action::Await`]. 
+ fn on_probe(&mut self, outcome: ProbeOutcome) -> Action { + if !self.connected || outcome != ProbeOutcome::Down { + self.down_streak = 0; + return Action::Await; + } + self.down_streak += 1; + if self.down_streak >= PROBE_DOWN_THRESHOLD { + self.end_session(); + Action::Reconnect + } else { + Action::Await + } + } + + /// Marks the live session as ended: clears the connected flag and the watchdog + /// streak. Called for every event that tears down or abandons the session. + fn end_session(&mut self) { + self.connected = false; + self.down_streak = 0; + } + /// Returns the current backoff and then doubles it, capped at /// [`RECONNECT_BACKOFF_MAX`]. fn bump(&mut self) -> Duration { @@ -224,6 +315,174 @@ mod tests { assert!(!state.first_attempt()); } + /// Drives the state machine to a live `Hosting` session, the precondition for + /// every watchdog test below. + fn connected_state() -> KeepAliveState { + let mut state = KeepAliveState::new(); + assert_eq!(state.next(ConnEvent::Connected), Action::Await); + state + } + + #[test] + fn probe_down_streak_reaching_threshold_triggers_reconnect() { + let mut state = connected_state(); + // The first PROBE_DOWN_THRESHOLD-1 downs are absorbed while the streak grows. + for _ in 0..PROBE_DOWN_THRESHOLD - 1 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + // The threshold-th consecutive down forces the watchdog reconnect. + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Reconnect + ); + } + + #[test] + fn service_down_never_triggers_reconnect() { + let mut state = connected_state(); + // Far past the threshold: a relay-alive/upstream-down result must never + // reconnect (it would churn a good relay and not revive a restarting server). 
+ for _ in 0..PROBE_DOWN_THRESHOLD * 3 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::ServiceDown)), + Action::Await + ); + } + } + + #[test] + fn healthy_probe_resets_the_down_streak() { + let mut state = connected_state(); + // Build the streak to one below the threshold, then a healthy probe clears it. + for _ in 0..PROBE_DOWN_THRESHOLD - 1 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Healthy)), + Action::Await + ); + // A fresh full streak is now required — the next down does not trigger. + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + + #[test] + fn service_down_in_the_middle_resets_the_down_streak() { + let mut state = connected_state(); + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + // A ServiceDown breaks the run of downs, so the streak restarts. + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::ServiceDown)), + Action::Await + ); + for _ in 0..PROBE_DOWN_THRESHOLD - 1 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Reconnect + ); + } + + #[test] + fn probe_down_before_first_connect_is_ignored() { + let mut state = KeepAliveState::new(); + // No live session yet: a failing probe is the connect not landed, not a + // zombie. Even a long streak must never reconnect. + for _ in 0..PROBE_DOWN_THRESHOLD * 2 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + } + + #[test] + fn probe_down_after_session_ends_is_ignored_until_reconnect() { + let mut state = connected_state(); + // The relay drops: the session is no longer live. 
+ assert_eq!(sleep_of(state.next(ConnEvent::RelayDropped)), secs(2)); + // Probes arriving before the reconnect lands must not count. + for _ in 0..PROBE_DOWN_THRESHOLD * 2 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + // After reconnecting, the watchdog is armed again from a clean streak. + assert_eq!(state.next(ConnEvent::Connected), Action::Await); + for _ in 0..PROBE_DOWN_THRESHOLD - 1 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Reconnect + ); + } + + #[test] + fn watchdog_reconnect_rearms_after_a_successful_reconnect() { + let mut state = connected_state(); + // First zombie reconnect. + for _ in 0..PROBE_DOWN_THRESHOLD - 1 { + let _ = state.next(ConnEvent::Probe(ProbeOutcome::Down)); + } + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Reconnect + ); + // The reconnect lands; the watchdog must require a fresh full streak again. + assert_eq!(state.next(ConnEvent::Connected), Action::Await); + for _ in 0..PROBE_DOWN_THRESHOLD - 1 { + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Await + ); + } + assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Reconnect + ); + } + + #[test] + fn watchdog_reconnect_does_not_inflate_backoff() { + let mut state = connected_state(); + for _ in 0..PROBE_DOWN_THRESHOLD - 1 { + let _ = state.next(ConnEvent::Probe(ProbeOutcome::Down)); + } + // A watchdog reconnect funnels into the normal connect path; it must not + // itself bump the backoff. After a successful reconnect, the first ensuing + // relay drop still sleeps the reset start backoff. 
+ assert_eq!( + state.next(ConnEvent::Probe(ProbeOutcome::Down)), + Action::Reconnect + ); + assert_eq!(state.next(ConnEvent::Connected), Action::Await); + assert_eq!(sleep_of(state.next(ConnEvent::RelayDropped)), secs(2)); + } + #[test] fn reconnect_after_drop_changes_phase() { let mut state = KeepAliveState::new(); From bcbf7e295844a88821ede7c2a4e3c22e471da7d3 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 07:40:39 -0300 Subject: [PATCH 13/14] feat(probe): instrument the zombie-tunnel signature for observation (#37) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe could not see a zombie tunnel. `combine` deliberately reports a Public-URL network error as `Operational` while the local port is listening (a transient WAN hiccup is not a service outage), so the exact zombie state — local upstream fine, Public URL dead, the SDK's `RelayHandle` never resolving so the engine stays `Hosting` — was invisible to every layer. The probe's `Down` is only ever set by the engine's `RelayHandle`, which in a zombie never fires. The signal #39's watchdog needs did not exist yet. Surface it without changing the badge: when the slow HTTP fallback finds the Public URL unreachable while the local port is up, the probe emits a new `ProbeEvent::PublicUnreachable`. The wiring layer logs it at WARN only when the engine still believes that group is `Hosting` (the full zombie signature), and at DEBUG otherwise (an ordinary drop the engine is already reconnecting). This is the lightweight instrumentation of #37: pure observability, no behaviour change. The recorded occurrences over real-use hosting feed the #37 go/no-go decision and, once that gate opens, the #39 reconnect bridge (whose pure policy already landed in keepalive.rs). 
Co-Authored-By: Claude Opus 4.8 --- src/main.rs | 50 +++++++++++++++++++++++++++++++++++++++----------- src/probe.rs | 19 +++++++++++++++++++ 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/src/main.rs b/src/main.rs index 0e13595..214e9b8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1082,17 +1082,45 @@ fn main() -> anyhow::Result<()> { #[cfg(feature = "hosting")] let mut probe_changed = false; #[cfg(feature = "hosting")] - while let Ok(probe::ProbeEvent::Status { - tunnel_id, - port, - state: ps, - }) = probe_evt_rx.try_recv() - { - state - .borrow_mut() - .probe - .insert((tunnel_id, port), map_probe_state(&ps).to_string()); - probe_changed = true; + while let Ok(ev) = probe_evt_rx.try_recv() { + match ev { + probe::ProbeEvent::Status { + tunnel_id, + port, + state: ps, + } => { + state + .borrow_mut() + .probe + .insert((tunnel_id, port), map_probe_state(&ps).to_string()); + probe_changed = true; + } + // Zombie-tunnel instrumentation (issue #37): the probe found the + // Public URL unreachable while the local port is up. That is a + // zombie only if the engine still believes the group is Hosting + // (its RelayHandle never resolved); otherwise it is an ordinary + // drop the engine is already reconnecting. Log/flag only — no + // behaviour change. The recorded occurrences feed the #37 + // go/no-go and, once that gate opens, the #39 reconnect bridge. 
+ probe::ProbeEvent::PublicUnreachable { tunnel_id, port } => { + let hosting = matches!( + state.borrow().host.get(&tunnel_id).map(String::as_str), + Some("hosting") + ); + if hosting { + log::warn!( + "zombie-tunnel suspect: {tunnel_id} port {port} — Public URL \ + unreachable while the local port is listening and the engine \ + state is Hosting (RelayHandle not resolved)" + ); + } else { + log::debug!( + "probe: {tunnel_id} port {port} Public URL unreachable but the \ + engine is not Hosting — ordinary drop, not a zombie" + ); + } + } + } } // Re-point the probe at the currently-hosting groups' URLs whenever diff --git a/src/probe.rs b/src/probe.rs index e4d8b98..874738e 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -54,6 +54,15 @@ pub enum ProbeEvent { port: i32, state: ProbeState, }, + /// Zombie-tunnel signal (issue #37): the HTTP fallback found the **Public URL + /// unreachable** (network error/timeout) while the **local port is still + /// listening**. [`combine`] deliberately reports this as `Operational` (a + /// transient WAN hiccup is not a service outage), so this discrepancy is + /// surfaced separately rather than folded into the badge. It is only the + /// *probe half* of the zombie signature: the wiring layer logs/acts on it + /// solely when the host engine still believes the group is `Hosting` (the + /// relay's `RelayHandle` never resolved). Emitted at the slow HTTP cadence. + PublicUnreachable { tunnel_id: String, port: i32 }, } /// Commands sent to the probe thread. @@ -231,6 +240,16 @@ pub fn spawn(events: Sender) -> Sender { Err(ureq::Error::Status(code, _)) => Some(code), Err(_) => None, }; + // Zombie signature (probe half): the Public URL is unreachable + // (network error) yet the local upstream is listening. `combine` + // swallows this as Operational, so surface it for the wiring + // layer to correlate against the engine's `Hosting` state (#37). 
+ if status.is_none() && tcp_listening { + let _ = events.send(ProbeEvent::PublicUnreachable { + tunnel_id: target.tunnel_id.clone(), + port: target.port, + }); + } if let Some(slot) = http_cache.get_mut(i) { *slot = Some(status); } From 002b9851f18431ec2645c87498aac37fc2356411 Mon Sep 17 00:00:00 2001 From: Paulo Corcino <7800501+paulocorcino@users.noreply.github.com> Date: Thu, 18 Jun 2026 21:31:51 -0300 Subject: [PATCH 14/14] refactor(view): extract pure view-fold module from main.rs (#42) Move the view reconciliation logic out of `rebuild_rows` into a new pure `src/view.rs` with one entry point, `fold(&FoldInput) -> FoldOutput`. The four sources of truth (CLI rows, probe results, host state, optimistic delete/placeholder sets) are now merged in a module free of any Slint, channel, or `Rc` dependency, returning plain `GroupViewData` / `PortViewData`. `rebuild_rows` becomes a thin adapter: feed inputs, map the plain result onto Slint structs, rebuild the tray menu, set props. `derive_status`, `derive_host_state`, the `Placeholder` struct, and `PROVISIONING_STATUS` move into the module. Adds 14 table-driven tests covering badge mapping for the 3 probe states, optimistic-delete hiding (single port / whole group / last-port-portless), placeholder folding, the hosting pill, and detail-panel reconciliation. Zero behavior change. Co-Authored-By: Claude Opus 4.8 --- src/main.rs | 279 ++++++--------------------- src/view.rs | 545 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 606 insertions(+), 218 deletions(-) create mode 100644 src/view.rs diff --git a/src/main.rs b/src/main.rs index 214e9b8..2efb35b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,7 @@ mod model; #[cfg(feature = "hosting")] mod probe; mod state; +mod view; slint::include_modules!(); @@ -49,9 +50,7 @@ enum Action { Open(String), } -/// Status id assigned to optimistic placeholder rows (see [`rebuild_rows`]). 
-/// Drives the "Provisioning…" badge and disables the row's action buttons. -const PROVISIONING_STATUS: &str = "provisioning"; +use view::Placeholder; /// A deletion awaiting user confirmation. `port == None` means delete the whole group. struct PendingDelete { @@ -59,15 +58,6 @@ struct PendingDelete { port: Option, } -/// An optimistic placeholder inserted immediately when a create-group / add-port -/// operation is dispatched. Replaced by the real row when the op's refresh lands. -struct Placeholder { - id: u64, - group: String, - port: i32, - protocol: String, -} - /// UI-thread state derived from host/probe events. Persists across reloads so a /// fresh `fetch_rows` keeps the latest health/host status per row. #[derive(Default)] @@ -125,21 +115,6 @@ impl LiveState { } } -/// Derives a row's `status` id from the latest probe + host state. -/// Probe result wins (it is the most specific); otherwise fall back to the -/// group's host state ("host" = hosting but not yet probed), then to the -/// service-reported `host_connections` count, then "idle". -fn derive_status(state: &LiveState, tunnel_id: &str, port: i32, host_connections: i64) -> String { - if let Some(s) = state.probe.get(&(tunnel_id.to_string(), port)) { - return s.clone(); - } - match state.host.get(tunnel_id).map(String::as_str) { - Some("hosting") | Some("host") => "host".to_string(), - _ if host_connections > 0 => "host".to_string(), - _ => "idle".to_string(), - } -} - /// Maps a [`host::HostState`] to the stored host-state id, or `None` when the /// group is no longer hosted (Stopped / Idle / Error -> clear). fn map_host_state(hs: &host::HostState) -> Option<&'static str> { @@ -181,18 +156,6 @@ fn hosting_targets(state: &LiveState) -> Vec { .collect() } -/// Derives the group toggle state: -/// - `"hosting"` when this session is actively hosting the group, -/// - `"external"` when the service reports active connections but this session is not hosting, -/// - `""` otherwise. 
-fn derive_host_state(state: &LiveState, tunnel_id: &str, host_connections: i64) -> String { - match state.host.get(tunnel_id).map(String::as_str) { - Some("hosting") | Some("host") => "hosting".to_string(), - _ if host_connections > 0 => "external".to_string(), - _ => String::new(), - } -} - fn main() -> anyhow::Result<()> { // Install the capturing logger in every build: it tees records to stderr // (what env_logger used to print in the hosting build). @@ -1483,14 +1446,10 @@ fn apply_rows( // Also persist the rows so the next startup paints immediately. state::save_row_cache(&rows); state.borrow_mut().rows = rows; - // The header chip counts the ports actually rendered into the cards - // (returned by rebuild_rows), not raw `rows`: an optimistically-hidden - // or stale port must not inflate the chip while its card shows portless. - let count = rebuild_rows(&app, tray, actions, state, loc); - - let mut args = FluentArgs::new(); - args.set("count", count as i64); - app.set_status(loc.t_args("status-port-count", &args).into()); + // The header chip is set inside rebuild_rows from the ports actually + // rendered into the cards (not raw `rows`): an optimistically-hidden or + // stale port must not inflate the chip while its card shows portless. + rebuild_rows(&app, tray, actions, state, loc); true } Err(e) => { @@ -1530,141 +1489,71 @@ fn rebuild_rows( loc: &Rc, ) -> usize { let st = state.borrow(); - // Count of real service ports rendered into the cards; returned for the header. - let mut rendered_ports = 0usize; - // Build a flat index space first: every visible (non-hidden) real port gets a - // stable `row-index` used to key the expandable detail panel (issue #17). The - // same index drives `selected-index` so the open panel survives reloads. - // Optimistic delete (#13) hides ports/groups awaiting their confirming refresh. 
- // Only a group-level delete (`(id, None)`) drops the whole card here; a - // port-level delete (`(id, Some(port))`) keeps the row in the index space and - // is skipped further down when attaching ports. This way deleting a group's - // last port leaves the card standing (as portless) instead of flickering the - // whole card out and back when the confirming refresh lands. - let visible_rows: Vec<&devtunnel::Row> = st - .rows - .iter() - .filter(|r| !st.hidden.contains(&(r.tunnel_id.clone(), None))) - .collect(); - // Fold the flat rows into groups (Real Tunnel ID order preserved). Ports are - // collected separately and attached as models at the end. - let mut groups: Vec = Vec::new(); - let mut ports: Vec> = Vec::new(); - let mut index: HashMap = HashMap::new(); - for (flat_idx, r) in visible_rows.iter().enumerate() { - let gi = match index.get(&r.tunnel_id) { - Some(&i) => i, - None => { - index.insert(r.tunnel_id.clone(), groups.len()); - groups.push(GroupView { - group: r.group.clone().into(), - tunnel_id: r.tunnel_id.clone().into(), - expiration: r.expiration.clone().into(), - hosting: derive_host_state(&st, &r.tunnel_id, r.host_connections) == "hosting", - // "Hosted elsewhere" pill: service reports connections but this - // session is not hosting the group (issue #15). - host_state: derive_host_state(&st, &r.tunnel_id, r.host_connections).into(), - provisioning: false, - has_port: false, - ports: ModelRc::default(), - }); - ports.push(Vec::new()); - groups.len() - 1 - } - }; - // A port==0 row is a portless group: keep the card, skip the port row. - // A port hidden by an optimistic delete (#13) likewise keeps its card but - // drops the port row until the reflush refresh confirms the deletion. 
- if r.port != 0 && !st.hidden.contains(&(r.tunnel_id.clone(), Some(r.port))) { - groups[gi].has_port = true; - rendered_ports += 1; - ports[gi].push(PortView { - port: r.port, - protocol: r.protocol.clone().into(), - url: r.url.clone().into(), - status: derive_status(&st, &r.tunnel_id, r.port, r.host_connections).into(), - row_index: flat_idx as i32, - }); - } - } + // All folding (visible-row index space, optimistic delete/placeholder + // handling, derived status/host-state, detail-panel reconciliation) lives in + // the pure `view::fold`. main.rs only feeds the inputs and maps the plain + // result onto Slint structs + the tray menu. + let out = view::fold(&view::FoldInput { + rows: &st.rows, + probe: &st.probe, + host: &st.host, + hidden: &st.hidden, + placeholders: &st.placeholders, + detail: st.detail.as_ref(), + }); - // Optimistic placeholders for in-flight creates: attach the provisioning - // port to its existing group (matched by friendly name) when possible, - // otherwise add a whole provisioning card. Placeholders are inert, so they - // carry row-index -1 (not expandable). - for p in &st.placeholders { - match groups.iter().position(|g| g.group == p.group.as_str()) { - Some(gi) if p.port != 0 => ports[gi].push(PortView { - port: p.port, - protocol: p.protocol.clone().into(), - url: SharedString::new(), - status: PROVISIONING_STATUS.into(), - row_index: -1, - }), - _ => { - groups.push(GroupView { - group: p.group.clone().into(), - tunnel_id: SharedString::new(), - expiration: SharedString::new(), - hosting: false, - host_state: SharedString::new(), - provisioning: true, - has_port: p.port != 0, - ports: ModelRc::default(), - }); - ports.push(if p.port != 0 { - vec![PortView { + // Map the plain group/port data onto the Slint models. 
+ let groups: Vec = out + .groups + .iter() + .map(|g| GroupView { + group: g.group.clone().into(), + tunnel_id: g.tunnel_id.clone().into(), + expiration: g.expiration.clone().into(), + hosting: g.hosting, + host_state: g.host_state.clone().into(), + provisioning: g.provisioning, + has_port: g.has_port, + ports: ModelRc::new(VecModel::from( + g.ports + .iter() + .map(|p| PortView { port: p.port, protocol: p.protocol.clone().into(), - url: SharedString::new(), - status: PROVISIONING_STATUS.into(), - row_index: -1, - }] - } else { - Vec::new() - }); - } - } - } - for (g, pv) in groups.iter_mut().zip(ports) { - g.ports = ModelRc::new(VecModel::from(pv)); - } - - // Recompute the expanded port's flat index: rows can reorder or disappear - // across reloads, so the selection is keyed by (tunnel_id, port), not index. - let mut selected = -1; - let mut stale_detail = false; - if let Some((tid, port)) = st.detail.as_ref() { - // A port hidden by an optimistic delete is still in `visible_rows` (to keep - // its group card alive), so check the hidden set too: deleting the expanded - // port must collapse the panel rather than leave it pointing at a gone row. - let deleting = st.hidden.contains(&(tid.clone(), Some(*port))) - || st.hidden.contains(&(tid.clone(), None)); - match visible_rows - .iter() - .position(|r| r.tunnel_id == tid.as_str() && r.port == *port) - { - Some(i) if !deleting => selected = i as i32, - _ => stale_detail = true, - } - } + url: p.url.clone().into(), + status: p.status.clone().into(), + row_index: p.row_index, + }) + .collect::>(), + )), + }) + .collect(); // Rebuild the tray menu with per-port actions from the same load (placeholders // have no URL, so they are skipped by build_tray_menu). 
let menu = build_tray_menu(&st.rows, &mut actions.borrow_mut(), loc); tray.set_menu(Some(Box::new(menu))); - app.set_selected_index(selected); + app.set_selected_index(out.selected_index); app.set_groups(ModelRc::new(VecModel::from(groups))); // The selected port no longer exists (deleted elsewhere): collapse so the // poll timer stops issuing CLI calls for it. drop(st); - if stale_detail { + if out.stale_detail { state.borrow_mut().detail = None; } - rendered_ports + + // Keep the header chip in lockstep with the cards: it is set here, at the one + // place that knows how many real ports were actually rendered, so it can never + // disagree with what the list shows. Callers that need a transient message + // (creating…, deleting…, an error) set it *after* this returns and win. + let mut args = FluentArgs::new(); + args.set("count", out.rendered_ports as i64); + app.set_status(loc.t_args("status-port-count", &args).into()); + + out.rendered_ports } /// Fires a `fetch_port_status` for the selected port on a background thread; @@ -2048,74 +1937,28 @@ mod tests { host_connections: 0, }); - // No placeholder yet — only one row, status derives to "idle". - let real_row_status = derive_status(&st, "tid1", 9000, 0); + // No placeholder yet — only one row, status derives to "idle". The + // derivation itself is exhaustively tested in `view`; here we just sanity + // check the LiveState maps feed it correctly. + let real_row_status = view::derive_status(&st.probe, &st.host, "tid1", 9000, 0); assert_eq!(real_row_status, "idle"); - // Push a placeholder; its fields are what `rebuild_rows` turns into a row. + // Push a placeholder; its fields are what `view::fold` turns into a row. 
let id = st.push_placeholder("new-group".into(), 4000, "tcp".into()); assert_eq!(st.placeholders.len(), 1); assert_eq!(st.placeholders[0].port, 4000); assert_eq!(st.placeholders[0].group, "new-group"); assert_eq!(st.placeholders[0].protocol, "tcp"); - // `rebuild_rows` assigns this id to every placeholder row, which the + // `view::fold` assigns this id to every placeholder row, which the // theme/UI render as the "Provisioning…" badge. - assert_eq!(PROVISIONING_STATUS, "provisioning"); + assert_eq!(view::PROVISIONING_STATUS, "provisioning"); // After removal the placeholder list is empty again. st.remove_placeholder(id); assert!(st.placeholders.is_empty()); } - #[test] - fn derive_host_state_session_hosting_wins_over_service_count() { - let mut st = make_state(); - st.host.insert("t1".into(), "hosting".into()); - // Even with host_connections > 0, this-session state returns "hosting". - assert_eq!(derive_host_state(&st, "t1", 3), "hosting"); - } - - #[test] - fn derive_host_state_session_connecting_wins_over_service_count() { - let mut st = make_state(); - st.host.insert("t1".into(), "host".into()); - assert_eq!(derive_host_state(&st, "t1", 1), "hosting"); - } - - #[test] - fn derive_host_state_external_when_service_has_connections() { - let st = make_state(); - // No entry in st.host (this session is not hosting), but service reports connections. 
- assert_eq!(derive_host_state(&st, "t1", 2), "external"); - } - - #[test] - fn derive_host_state_idle_when_no_connections() { - let st = make_state(); - assert_eq!(derive_host_state(&st, "t1", 0), ""); - } - - #[test] - fn derive_status_session_hosting_wins() { - let mut st = make_state(); - st.host.insert("t1".into(), "hosting".into()); - assert_eq!(derive_status(&st, "t1", 3000, 0), "host"); - } - - #[test] - fn derive_status_external_host_connections_gives_host_color() { - let st = make_state(); - // service says hosted externally — dot should use "host" color - assert_eq!(derive_status(&st, "t1", 3000, 1), "host"); - } - - #[test] - fn derive_status_zero_connections_is_idle() { - let st = make_state(); - assert_eq!(derive_status(&st, "t1", 3000, 0), "idle"); - } - fn make_row(tunnel_id: &str, port: i32) -> devtunnel::Row { devtunnel::Row { group: tunnel_id.to_string(), diff --git a/src/view.rs b/src/view.rs new file mode 100644 index 0000000..44ebbd6 --- /dev/null +++ b/src/view.rs @@ -0,0 +1,545 @@ +//! Pure view reconciliation: folds the four independent sources of truth — CLI +//! rows, Health probe results, Host state, and the optimistic create/delete sets — +//! into a flat list of per-group views with nested per-port views. +//! +//! This module is deliberately free of any Slint, channel, or `Rc` +//! dependency: it takes plain references in and returns plain data out, so the +//! reconciliation invariants ("why does this port show this badge?") can be +//! unit-tested in isolation. The thin mapping from [`GroupViewData`] / +//! [`PortViewData`] onto the Slint structs, plus the tray-menu rebuild, stays in +//! `main.rs`. + +use crate::devtunnel::Row; +use std::collections::{HashMap, HashSet}; + +/// Status id assigned to optimistic placeholder rows. Drives the +/// "Provisioning…" badge and disables the row's action buttons. +pub const PROVISIONING_STATUS: &str = "provisioning"; + +/// Per-port health status id, keyed by `(tunnel_id, port)`. 
+pub type ProbeMap = HashMap<(String, i32), String>; +/// Per-group host-state id ("host"/"hosting"/""), keyed by `tunnel_id`. +pub type HostMap = HashMap; +/// Optimistic hidden-delete keys: `(tunnel_id, None)` hides a whole group; +/// `(tunnel_id, Some(port))` hides one port. +pub type HiddenSet = HashSet<(String, Option)>; + +/// An optimistic placeholder inserted immediately when a create-group / add-port +/// operation is dispatched. Replaced by the real row when the op's refresh lands. +pub struct Placeholder { + pub id: u64, + pub group: String, + pub port: i32, + pub protocol: String, +} + +/// Plain-data mirror of the Slint `PortView` struct (no Slint types). +#[derive(Debug, Clone, PartialEq)] +pub struct PortViewData { + pub port: i32, + pub protocol: String, + pub url: String, + /// "idle" | "ok" | "warn" | "down" | "host" | "provisioning". + pub status: String, + /// Stable index into the flat visible-row space (keys the detail panel); + /// -1 for inert placeholder rows. + pub row_index: i32, +} + +/// Plain-data mirror of the Slint `GroupView` struct (no Slint types). +#[derive(Debug, Clone, PartialEq)] +pub struct GroupViewData { + pub group: String, + pub tunnel_id: String, + pub expiration: String, + pub hosting: bool, + /// "" | "hosting" (this session) | "external" (another session). + pub host_state: String, + pub provisioning: bool, + pub has_port: bool, + pub ports: Vec, +} + +/// The four sources of truth fed into [`fold`], plus the expanded-port key. +pub struct FoldInput<'a> { + /// Latest CLI data load (Real Tunnel ID order preserved). + pub rows: &'a [Row], + pub probe: &'a ProbeMap, + pub host: &'a HostMap, + pub hidden: &'a HiddenSet, + pub placeholders: &'a [Placeholder], + /// The currently-expanded port, keyed by `(tunnel_id, port)` (`None` = none). + pub detail: Option<&'a (String, i32)>, +} + +/// The reconciled result: the group list plus the few scalars `main.rs` needs to +/// drive the header chip and detail-panel selection. 
+pub struct FoldOutput { + pub groups: Vec, + /// Count of real service ports actually rendered into the cards (excludes + /// portless groups, optimistically-hidden ports, and placeholders). Drives + /// the header chip so it can never disagree with the cards. + pub rendered_ports: usize, + /// Flat index of the expanded port, recomputed against the visible rows + /// (-1 = none). + pub selected_index: i32, + /// True when the expanded port no longer exists (deleted elsewhere): the + /// caller collapses the panel so the metrics poll stops issuing CLI calls. + pub stale_detail: bool, +} + +/// Derives a port's `status` id from the latest probe + host state. +/// Probe result wins (it is the most specific); otherwise fall back to the +/// group's host state ("host" = hosting but not yet probed), then to the +/// service-reported `host_connections` count, then "idle". +pub fn derive_status( + probe: &ProbeMap, + host: &HostMap, + tunnel_id: &str, + port: i32, + host_connections: i64, +) -> String { + if let Some(s) = probe.get(&(tunnel_id.to_string(), port)) { + return s.clone(); + } + match host.get(tunnel_id).map(String::as_str) { + Some("hosting") | Some("host") => "host".to_string(), + _ if host_connections > 0 => "host".to_string(), + _ => "idle".to_string(), + } +} + +/// Derives the group toggle / pill state: +/// - `"hosting"` when this session is actively hosting the group, +/// - `"external"` when the service reports active connections but this session is not hosting, +/// - `""` otherwise. +pub fn derive_host_state(host: &HostMap, tunnel_id: &str, host_connections: i64) -> String { + match host.get(tunnel_id).map(String::as_str) { + Some("hosting") | Some("host") => "hosting".to_string(), + _ if host_connections > 0 => "external".to_string(), + _ => String::new(), + } +} + +/// Folds the flat CLI rows (plus probe/host/hidden/placeholder state) into +/// per-group views. 
**Zero behavior change** from the original inline +/// `rebuild_rows` body — only Slint construction and the tray rebuild stay in +/// `main.rs`. +pub fn fold(input: &FoldInput) -> FoldOutput { + let FoldInput { + rows, + probe, + host, + hidden, + placeholders, + detail, + } = *input; + + let mut rendered_ports = 0usize; + + // Build a flat index space first: every visible (non-group-hidden) real port + // gets a stable `row_index` used to key the expandable detail panel (#17). + // A group-level delete (`(id, None)`) drops the whole card here; a port-level + // delete (`(id, Some(port))`) keeps the row in the index space and is skipped + // below when attaching ports, so deleting a group's last port leaves the card + // standing (as portless) instead of flickering out and back. + let visible_rows: Vec<&Row> = rows + .iter() + .filter(|r| !hidden.contains(&(r.tunnel_id.clone(), None))) + .collect(); + + // Fold the flat rows into groups (Real Tunnel ID order preserved). + let mut groups: Vec = Vec::new(); + let mut index: HashMap = HashMap::new(); + for (flat_idx, r) in visible_rows.iter().enumerate() { + let gi = match index.get(&r.tunnel_id) { + Some(&i) => i, + None => { + index.insert(r.tunnel_id.clone(), groups.len()); + let host_state = derive_host_state(host, &r.tunnel_id, r.host_connections); + groups.push(GroupViewData { + group: r.group.clone(), + tunnel_id: r.tunnel_id.clone(), + expiration: r.expiration.clone(), + hosting: host_state == "hosting", + // "Hosted elsewhere" pill: service reports connections but + // this session is not hosting the group (#15). + host_state, + provisioning: false, + has_port: false, + ports: Vec::new(), + }); + groups.len() - 1 + } + }; + // A port==0 row is a portless group: keep the card, skip the port row. + // A port hidden by an optimistic delete (#13) likewise keeps its card but + // drops the port row until the reflush refresh confirms the deletion. 
+ if r.port != 0 && !hidden.contains(&(r.tunnel_id.clone(), Some(r.port))) { + groups[gi].has_port = true; + rendered_ports += 1; + groups[gi].ports.push(PortViewData { + port: r.port, + protocol: r.protocol.clone(), + url: r.url.clone(), + status: derive_status(probe, host, &r.tunnel_id, r.port, r.host_connections), + row_index: flat_idx as i32, + }); + } + } + + // Optimistic placeholders for in-flight creates: attach the provisioning port + // to its existing group (matched by friendly name) when possible, otherwise + // add a whole provisioning card. Placeholders are inert, so they carry + // row-index -1 (not expandable). + for p in placeholders { + match groups.iter().position(|g| g.group == p.group) { + Some(gi) if p.port != 0 => groups[gi].ports.push(PortViewData { + port: p.port, + protocol: p.protocol.clone(), + url: String::new(), + status: PROVISIONING_STATUS.to_string(), + row_index: -1, + }), + _ => { + let ports = if p.port != 0 { + vec![PortViewData { + port: p.port, + protocol: p.protocol.clone(), + url: String::new(), + status: PROVISIONING_STATUS.to_string(), + row_index: -1, + }] + } else { + Vec::new() + }; + groups.push(GroupViewData { + group: p.group.clone(), + tunnel_id: String::new(), + expiration: String::new(), + hosting: false, + host_state: String::new(), + provisioning: true, + has_port: p.port != 0, + ports, + }); + } + } + } + + // Recompute the expanded port's flat index: rows can reorder or disappear + // across reloads, so the selection is keyed by (tunnel_id, port), not index. + let mut selected_index = -1; + let mut stale_detail = false; + if let Some((tid, port)) = detail { + // A port hidden by an optimistic delete is still in `visible_rows` (to + // keep its group card alive), so check the hidden set too: deleting the + // expanded port must collapse the panel rather than point at a gone row. 
+ let deleting = + hidden.contains(&(tid.clone(), Some(*port))) || hidden.contains(&(tid.clone(), None)); + match visible_rows + .iter() + .position(|r| r.tunnel_id == tid.as_str() && r.port == *port) + { + Some(i) if !deleting => selected_index = i as i32, + _ => stale_detail = true, + } + } + + FoldOutput { + groups, + rendered_ports, + selected_index, + stale_detail, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn row(tunnel_id: &str, port: i32) -> Row { + Row { + group: tunnel_id.to_string(), + tunnel_id: tunnel_id.to_string(), + port, + protocol: "http".into(), + url: "https://example.com".into(), + expiration: "30d".into(), + host_connections: 0, + } + } + + fn fold_rows( + rows: &[Row], + probe: &ProbeMap, + host: &HostMap, + hidden: &HiddenSet, + placeholders: &[Placeholder], + detail: Option<&(String, i32)>, + ) -> FoldOutput { + fold(&FoldInput { + rows, + probe, + host, + hidden, + placeholders, + detail, + }) + } + + // ---- derive_status: badge mapping for the 3 probe states + fallbacks ----- + + #[test] + fn derive_status_maps_each_probe_state() { + let host = HostMap::new(); + for (probe_id, expected) in [("ok", "ok"), ("warn", "warn"), ("down", "down")] { + let mut probe = ProbeMap::new(); + probe.insert(("t1".into(), 3000), probe_id.to_string()); + assert_eq!( + derive_status(&probe, &host, "t1", 3000, 0), + expected, + "probe state {probe_id} should win" + ); + } + } + + #[test] + fn derive_status_probe_wins_over_host_and_connections() { + let mut probe = ProbeMap::new(); + probe.insert(("t1".into(), 3000), "down".into()); + let mut host = HostMap::new(); + host.insert("t1".into(), "hosting".into()); + // Probe is most specific: it wins even while hosting with connections. 
+ assert_eq!(derive_status(&probe, &host, "t1", 3000, 5), "down"); + } + + #[test] + fn derive_status_host_then_connections_then_idle() { + let probe = ProbeMap::new(); + let mut host = HostMap::new(); + host.insert("t1".into(), "hosting".into()); + assert_eq!(derive_status(&probe, &host, "t1", 3000, 0), "host"); + host.insert("t1".into(), "host".into()); + assert_eq!(derive_status(&probe, &host, "t1", 3000, 0), "host"); + + let empty = HostMap::new(); + // No host entry, but the service reports connections. + assert_eq!(derive_status(&probe, &empty, "t1", 3000, 1), "host"); + // Nothing at all → idle. + assert_eq!(derive_status(&probe, &empty, "t1", 3000, 0), "idle"); + } + + // ---- derive_host_state: hosting pill ------------------------------------ + + #[test] + fn derive_host_state_session_wins_over_service_count() { + let mut host = HostMap::new(); + host.insert("t1".into(), "hosting".into()); + assert_eq!(derive_host_state(&host, "t1", 3), "hosting"); + host.insert("t1".into(), "host".into()); + assert_eq!(derive_host_state(&host, "t1", 1), "hosting"); + } + + #[test] + fn derive_host_state_external_then_empty() { + let host = HostMap::new(); + assert_eq!(derive_host_state(&host, "t1", 2), "external"); + assert_eq!(derive_host_state(&host, "t1", 0), ""); + } + + // ---- fold: host state → hosting pill ------------------------------------ + + #[test] + fn fold_sets_group_hosting_pill_from_host_state() { + let rows = vec![row("t1", 3000)]; + let mut host = HostMap::new(); + host.insert("t1".into(), "hosting".into()); + let out = fold_rows(&rows, &ProbeMap::new(), &host, &HiddenSet::new(), &[], None); + assert_eq!(out.groups.len(), 1); + assert!(out.groups[0].hosting); + assert_eq!(out.groups[0].host_state, "hosting"); + assert_eq!(out.groups[0].ports[0].status, "host"); + } + + #[test] + fn fold_external_pill_not_hosting() { + let mut rows = vec![row("t1", 3000)]; + rows[0].host_connections = 2; + let out = fold_rows( + &rows, + &ProbeMap::new(), + 
&HostMap::new(), + &HiddenSet::new(), + &[], + None, + ); + assert!(!out.groups[0].hosting); + assert_eq!(out.groups[0].host_state, "external"); + } + + // ---- fold: optimistic-delete hiding ------------------------------------- + + #[test] + fn fold_hides_single_port_keeps_card() { + let rows = vec![row("t1", 3000), row("t1", 8080)]; + let mut hidden = HiddenSet::new(); + hidden.insert(("t1".into(), Some(3000))); + let out = fold_rows(&rows, &ProbeMap::new(), &HostMap::new(), &hidden, &[], None); + // One group card, only the un-hidden port rendered. + assert_eq!(out.groups.len(), 1); + assert_eq!(out.groups[0].ports.len(), 1); + assert_eq!(out.groups[0].ports[0].port, 8080); + assert_eq!(out.rendered_ports, 1); + } + + #[test] + fn fold_hides_whole_group() { + let rows = vec![row("t1", 3000), row("t2", 9000)]; + let mut hidden = HiddenSet::new(); + hidden.insert(("t1".into(), None)); + let out = fold_rows(&rows, &ProbeMap::new(), &HostMap::new(), &hidden, &[], None); + assert_eq!(out.groups.len(), 1); + assert_eq!(out.groups[0].tunnel_id, "t2"); + assert_eq!(out.rendered_ports, 1); + } + + #[test] + fn fold_hiding_last_port_leaves_portless_card() { + let rows = vec![row("t1", 3000)]; + let mut hidden = HiddenSet::new(); + hidden.insert(("t1".into(), Some(3000))); + let out = fold_rows(&rows, &ProbeMap::new(), &HostMap::new(), &hidden, &[], None); + // Card stands, but has no port and is excluded from the header count. 
+ assert_eq!(out.groups.len(), 1); + assert!(!out.groups[0].has_port); + assert!(out.groups[0].ports.is_empty()); + assert_eq!(out.rendered_ports, 0); + } + + // ---- fold: placeholder folding ------------------------------------------ + + #[test] + fn fold_attaches_placeholder_port_to_existing_group() { + let rows = vec![row("t1", 3000)]; + let placeholders = vec![Placeholder { + id: 1, + group: "t1".into(), // matches the friendly name of the existing group + port: 4000, + protocol: "tcp".into(), + }]; + let out = fold_rows( + &rows, + &ProbeMap::new(), + &HostMap::new(), + &HiddenSet::new(), + &placeholders, + None, + ); + assert_eq!(out.groups.len(), 1); + assert_eq!(out.groups[0].ports.len(), 2); + let prov = &out.groups[0].ports[1]; + assert_eq!(prov.port, 4000); + assert_eq!(prov.status, PROVISIONING_STATUS); + assert_eq!(prov.row_index, -1); + // Placeholders never inflate the real-port header count. + assert_eq!(out.rendered_ports, 1); + } + + #[test] + fn fold_adds_new_provisioning_card_for_new_group() { + let placeholders = vec![Placeholder { + id: 1, + group: "brand-new".into(), + port: 5000, + protocol: "http".into(), + }]; + let out = fold_rows( + &[], + &ProbeMap::new(), + &HostMap::new(), + &HiddenSet::new(), + &placeholders, + None, + ); + assert_eq!(out.groups.len(), 1); + assert!(out.groups[0].provisioning); + assert!(out.groups[0].tunnel_id.is_empty()); + assert_eq!(out.groups[0].ports[0].status, PROVISIONING_STATUS); + assert_eq!(out.rendered_ports, 0); + } + + #[test] + fn fold_portless_placeholder_group_has_no_port() { + let placeholders = vec![Placeholder { + id: 1, + group: "new-group".into(), + port: 0, + protocol: String::new(), + }]; + let out = fold_rows( + &[], + &ProbeMap::new(), + &HostMap::new(), + &HiddenSet::new(), + &placeholders, + None, + ); + assert_eq!(out.groups.len(), 1); + assert!(out.groups[0].provisioning); + assert!(!out.groups[0].has_port); + assert!(out.groups[0].ports.is_empty()); + } + + // ---- fold: detail-panel 
selection reconciliation ------------------------ + + #[test] + fn fold_selects_expanded_port_by_key() { + let rows = vec![row("t1", 3000), row("t1", 8080)]; + let detail = ("t1".to_string(), 8080); + let out = fold_rows( + &rows, + &ProbeMap::new(), + &HostMap::new(), + &HiddenSet::new(), + &[], + Some(&detail), + ); + assert_eq!(out.selected_index, 1); + assert!(!out.stale_detail); + } + + #[test] + fn fold_collapses_when_expanded_port_deleted() { + let rows = vec![row("t1", 3000)]; + let mut hidden = HiddenSet::new(); + hidden.insert(("t1".into(), Some(3000))); + let detail = ("t1".to_string(), 3000); + let out = fold_rows( + &rows, + &ProbeMap::new(), + &HostMap::new(), + &hidden, + &[], + Some(&detail), + ); + assert_eq!(out.selected_index, -1); + assert!(out.stale_detail); + } + + #[test] + fn fold_collapses_when_expanded_port_absent() { + let rows = vec![row("t1", 3000)]; + let detail = ("t1".to_string(), 9999); // never existed + let out = fold_rows( + &rows, + &ProbeMap::new(), + &HostMap::new(), + &HiddenSet::new(), + &[], + Some(&detail), + ); + assert_eq!(out.selected_index, -1); + assert!(out.stale_detail); + } +}