From 49fa165a94aa45406bebe21d186002e8c49a918c Mon Sep 17 00:00:00 2001 From: obchain Date: Sun, 3 May 2026 22:52:55 +0530 Subject: [PATCH 1/3] feat(cli): add --hold-secs to replay so Grafana sees the demo (closes #422) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `replay` exits in well under one Prometheus scrape interval, so the Grafana dashboard never sees any replay data — Queue depth, Profit predicted, Executor simulations/min all show "No data" or zeros even when the bot identified $39k+ of liquidatable opportunities. Add a `--hold-secs <N>` flag (default 0, current behaviour) that keeps the metrics exporter open for N seconds after the one-shot pipeline emits. Also start the metrics exporter at the top of the replay path so the held-open window actually serves `/metrics` — previously only the `listen` path installed the exporter. Local validation against block 91323624: curl http://127.0.0.1:9091/metrics | grep profit_usd_cents_sum charon_executor_profit_usd_cents_sum{chain="bnb"} 3952448 Matches the four JSON records' `predicted_net_profit_usd_cents` sum ($39,524.48). Default-zero hold preserves CI / smoke-test behaviour. --- crates/charon-cli/src/main.rs | 72 ++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/crates/charon-cli/src/main.rs b/crates/charon-cli/src/main.rs index 793125c..f8d3e2e 100644 --- a/crates/charon-cli/src/main.rs +++ b/crates/charon-cli/src/main.rs @@ -251,6 +251,17 @@ enum Command { /// scan set must come entirely from the file. #[arg(long = "borrower-file")] borrower_file: PathBuf, + + /// Hold the metrics exporter open for this many seconds + /// after the one-shot pipeline emits. Prometheus's default + /// scrape interval is 15s; replay runs in well under a + /// second on a warm cache, so without a hold window the + /// Grafana dashboard never sees any replay data. 
Pass + /// `--hold-secs 30` (or higher) for a panel-friendly demo; + /// keep the default `0` for CI / smoke tests where extra + /// process lifetime would just slow the suite. See #422. + #[arg(long = "hold-secs", default_value_t = 0)] + hold_secs: u64, }, } @@ -417,8 +428,9 @@ async fn main() -> Result<()> { Command::Replay { block, borrower_file, + hold_secs, } => { - run_replay(&config, block, borrower_file).await?; + run_replay(&config, block, borrower_file, hold_secs).await?; } } @@ -1618,7 +1630,47 @@ async fn run_listen( /// required because the simulation gate's `eth_call` impersonates the /// `CharonLiquidator.owner()` (TxBuilder wires that), but no signed /// transaction is ever produced. -async fn run_replay(config: &Config, block: u64, borrower_file: PathBuf) -> Result<()> { +async fn run_replay( + config: &Config, + block: u64, + borrower_file: PathBuf, + hold_secs: u64, +) -> Result<()> { + // Bring up the metrics exporter early so any counters bumped + // during the one-shot pipeline are exposed on /metrics for + // Prometheus to scrape during the optional `--hold-secs` + // window. Without this the dashboard never sees replay data — + // the binary exits in well under one scrape interval (#422). 
+ let _metrics_task: Option<tokio::task::JoinHandle<()>> = if config.metrics.enabled { + match charon_metrics::install(config.metrics.bind) { + Ok(Some(exporter)) => { + charon_metrics::set_build_info( + env!("CARGO_PKG_VERSION"), + option_env!("CHARON_GIT_SHA").unwrap_or("unknown"), + ); + info!( + bind = %config.metrics.bind, + "replay: metrics exporter listening on /metrics" + ); + Some(tokio::spawn(async move { + if let Err(err) = exporter.await { + warn!(?err, "replay: metrics exporter task ended"); + } + })) + } + Ok(None) => None, + Err(err) => { + warn!( + ?err, + "replay: metrics exporter install failed — continuing without /metrics" + ); + None + } + } + } else { + None + }; + let venus_cfg = config.protocol.get("venus").context( "replay: [protocol.venus] missing — replay requires a configured Venus protocol", )?; @@ -1825,6 +1877,22 @@ async fn run_replay(config: &Config, block: u64, borrower_file: PathBuf) -> Resu emitted, "replay: complete" ); + + // Hold the metrics exporter open for at least one Prometheus + // scrape interval if the operator asked for it. Default 0 + // (current CI / smoke-test friendly behaviour). Operators + // running the local Notion cheat sheet pass `--hold-secs 30` + // to let the dashboard show Queue depth / Profit predicted / + // simulations-per-minute against the replay's emitted set + // (#422). 
+ if hold_secs > 0 { + info!( + hold_secs, + bind = %config.metrics.bind, + "replay: holding metrics exporter open — Ctrl-C to exit early" + ); + tokio::time::sleep(std::time::Duration::from_secs(hold_secs)).await; + } Ok(()) } From 2dcba7c05628fb05c6c38abda4c26801443d9f05 Mon Sep 17 00:00:00 2001 From: obchain Date: Sun, 3 May 2026 23:18:26 +0530 Subject: [PATCH 2/3] fix(grafana): show cumulative profit instead of increase() (#422 follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Profit (predicted/realised) stat panels used `sum(increase(...[$__range])) / 100` which only fires when Prometheus sees a 0->bump transition. `replay` bumps the counter once in <1s, well below Prometheus's 15s scrape interval, so the panel always showed 0 even when the bot identified $39k+ of opportunities. Switch the two profit-stat panels to the gauge form `sum(charon_executor_*profit_usd_cents_sum) / 100` so the dashboard shows the cumulative total since the bot started. For replay this is the replay's emitted profit; for production listen runs it is the running total since the latest bot start, which is the more useful number anyway (operators read realised profit per range from the heatmap below). Legend updated to 'cumulative' to match the new semantics. Bumped dashboard version so provisioning re-imports. Pairs with #422's --hold-secs flag — together they let the dashboard serve a useful demo number against any pinned-block fork run. 
--- deploy/grafana/charon.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/deploy/grafana/charon.json b/deploy/grafana/charon.json index 95dd743..79e4d88 100644 --- a/deploy/grafana/charon.json +++ b/deploy/grafana/charon.json @@ -223,8 +223,8 @@ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(increase(charon_executor_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\",job=~\"$job\"}[$__range])) / 100", - "legendFormat": "profit (selected range)", + "expr": "sum(charon_executor_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\",job=~\"$job\"}) / 100", + "legendFormat": "profit (cumulative)", "range": true, "refId": "A" } @@ -264,8 +264,8 @@ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(increase(charon_executor_realised_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\",job=~\"$job\"}[$__range])) / 100", - "legendFormat": "realised profit (selected range)", + "expr": "sum(charon_executor_realised_profit_usd_cents_sum{instance=~\"$instance\",chain=~\"$chain\",job=~\"$job\"}) / 100", + "legendFormat": "realised profit (cumulative)", "range": true, "refId": "A" } @@ -611,6 +611,6 @@ "timezone": "", "title": "Charon Bot", "uid": "charon-v0", - "version": 1, + "version": 3, "weekStart": "" } From 52cf7d14a53b3687b2c9a3a80085921b28bdc9e1 Mon Sep 17 00:00:00 2001 From: obchain Date: Sun, 3 May 2026 23:21:52 +0530 Subject: [PATCH 3/3] test(cli): include hold_secs in replay subcommand parse test --- crates/charon-cli/src/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/charon-cli/src/main.rs b/crates/charon-cli/src/main.rs index f8d3e2e..b183b9e 100644 --- a/crates/charon-cli/src/main.rs +++ b/crates/charon-cli/src/main.rs @@ -3382,9 +3382,11 @@ mod replay_cli_parse_tests { Command::Replay { block, borrower_file, + hold_secs, } => { assert_eq!(block, 41_000_000); 
assert_eq!(borrower_file, PathBuf::from("/tmp/seed.txt")); + assert_eq!(hold_secs, 0); } other => panic!("expected Command::Replay, got {other:?}"), }