diff --git a/deploy/otel/grafana/dashboards/helix-internals.json b/deploy/otel/grafana/dashboards/helix-internals.json new file mode 100644 index 0000000..62dc6ab --- /dev/null +++ b/deploy/otel/grafana/dashboards/helix-internals.json @@ -0,0 +1,491 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Internals & research surface: the #209 phase-1 tuning-signal instruments (dense cosine calibration, shard-router fan-out/discrimination, know/miss decisions, session-elision savings, splice ratio). Companion to helix-overview.json and helix-pipeline-observatory.json.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Tuning signals (#209 phase 1)", + "type": "row" + }, + { + "datasource": "Prometheus", + "description": "helix_dense_cosine — raw cosine of every dense-tier hit at its computation site. arm=hot is the BGE-M3 dense-recall merge in query_genes (pre-weight, pre-floor); arm=cold is the cold-tier ΣĒMA scan. 
Calibration data for dense_additive_weight / dense_additive_min_cosine (#203, #209).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Dense cosine distribution (p50 / p95 by arm)", + "type": "timeseries", + "targets": [ + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.50, sum by (le, arm) (rate(helix_dense_cosine_bucket[5m])))", + "legendFormat": "{{arm}} p50", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.95, sum by (le, arm) (rate(helix_dense_cosine_bucket[5m])))", + "legendFormat": "{{arm}} p95", + "refId": "B" + } + ] + }, + { + "datasource": "Prometheus", + "description": "helix_shard_fanout — number of shards ShardRouter consulted per query. 
Mean and p95 of the per-query distribution; tracks the #165 router-degeneracy finding.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Shard fan-out (shards consulted per query)", + "type": "timeseries", + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(helix_shard_fanout_sum[5m]) / rate(helix_shard_fanout_count[5m])", + "legendFormat": "mean", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "histogram_quantile(0.95, sum by (le) (rate(helix_shard_fanout_bucket[5m])))", + "legendFormat": "p95", + "refId": "B" + } + ] + }, + { + "datasource": "Prometheus", + "description": "helix_shard_discrimination — routed / known healthy shards per query, mean over 5m. 1.0 = the router consults every shard (zero discrimination, #165 degeneracy); lower is a more selective route. 
Acceptance metric for the AND-mode router.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.7 + }, + { + "color": "red", + "value": 0.95 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.0.0", + "title": "Shard discrimination (fraction of shards hit)", + "type": "stat", + "targets": [ + { + "datasource": "Prometheus", + "expr": "rate(helix_shard_discrimination_sum[5m]) / rate(helix_shard_discrimination_count[5m])", + "legendFormat": "fraction", + "refId": "A" + } + ] + }, + { + "datasource": "Prometheus", + "description": "helix_know_decision_total — decide_know_or_miss outcomes per hour. Labels: outcome (know | miss | abstain) and reason (none for know; the MissBlock reason otherwise). 
Calibrates [know] floors/margins; input for miss-reason-driven escalation (SNOW-2 arm E).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Know / miss / abstain decisions", + "type": "timeseries", + "targets": [ + { + "datasource": "Prometheus", + "expr": "sum by (outcome) (increase(helix_know_decision_total[1h]))", + "legendFormat": "{{outcome}}", + "refId": "A" + }, + { + "datasource": "Prometheus", + "expr": "sum by (reason) (increase(helix_know_decision_total{outcome!=\"know\"}[1h]))", + "legendFormat": "miss: {{reason}}", + "refId": "B" + } + ] + }, + { + "datasource": "Prometheus", + "description": "helix_session_tokens_saved_total — estimated tokens saved when the session working-set register elides an already-delivered document with a stub (~4 chars/token). 
Prices the elision arm and the \"~40% tokens on multi-turn\" claim.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Session tokens saved by elision (per hour)", + "type": "timeseries", + "targets": [ + { + "datasource": "Prometheus", + "expr": "increase(helix_session_tokens_saved_total[1h])", + "legendFormat": "tokens saved / h", + "refId": "A" + } + ] + }, + { + "datasource": "Prometheus", + "description": "helix_splice_ratio — raw_chars / compressed_chars per assembled context window (the value shipped in ContextWindow.compression_ratio). 
Balancing signal for splice_aggressiveness: rising ratio with rising abstain rate is a net loss.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "title": "Splice compression ratio (mean by caller class)", + "type": "timeseries", + "targets": [ + { + "datasource": "Prometheus", + "expr": "sum by (caller_model_class) (rate(helix_splice_ratio_sum[5m])) / sum by (caller_model_class) (rate(helix_splice_ratio_count[5m]))", + "legendFormat": "{{caller_model_class}}", + "refId": "A" + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "helix", + "internals", + "research", + "tuning" + ], + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Helix — Internals & Research", + "uid": "helix-internals", + "version": 1, + "weekStart": "" +} diff --git a/deploy/otel/grafana/dashboards/helix-pipeline-observatory.json b/deploy/otel/grafana/dashboards/helix-pipeline-observatory.json index a87d2ba..287e167 100644 --- a/deploy/otel/grafana/dashboards/helix-pipeline-observatory.json +++ b/deploy/otel/grafana/dashboards/helix-pipeline-observatory.json @@ -71,7 +71,7 
@@ "mode": "off" } }, - "unit": "s" + "unit": "bytes" }, "overrides": [] }, @@ -93,16 +93,17 @@ "sort": "none" } }, - "title": "process_memory (seconds) - pid (pid) in my heaps", + "title": "Knowledge-store size (raw vs compressed bytes)", "type": "timeseries", "targets": [ { "datasource": "Prometheus", - "expr": "process_resident_memory_bytes{job=\"helix\"}", - "legendFormat": "pid {{pid}}", + "expr": "helix_genome_size_bytes", + "legendFormat": "{{kind}}", "refId": "A" } - ] + ], + "description": "helix_genome_size_bytes, refreshed on each /stats poll. Legacy bio term: genome size." }, { "datasource": "Prometheus", @@ -142,7 +143,7 @@ "mode": "off" } }, - "unit": "percentunit" + "unit": "percent" }, "overrides": [] }, @@ -164,46 +165,17 @@ "sort": "none" } }, - "title": "tier estimations (%) status (stem tree csr ring)", + "title": "Tier activation share (%)", "type": "timeseries", "targets": [ { "datasource": "Prometheus", - "expr": "helix_tier_estimation_percent{job=\"helix\", status=\"bulk\"}", - "legendFormat": "bulk", + "expr": "100 * sum by (tier) (rate(helix_tier_fired_total[5m])) / ignoring(tier) group_left sum(rate(helix_tier_fired_total[5m]))", + "legendFormat": "{{tier}}", "refId": "A" - }, - { - "datasource": "Prometheus", - "expr": "helix_tier_estimation_percent{job=\"helix\", status=\"documents\"}", - "legendFormat": "documents", - "refId": "B" - }, - { - "datasource": "Prometheus", - "expr": "helix_tier_estimation_percent{job=\"helix\", status=\"tx_active\"}", - "legendFormat": "tx_active", - "refId": "C" - }, - { - "datasource": "Prometheus", - "expr": "helix_tier_estimation_percent{job=\"helix\", status=\"guests\"}", - "legendFormat": "guests", - "refId": "D" - }, - { - "datasource": "Prometheus", - "expr": "helix_tier_estimation_percent{job=\"helix\", status=\"log_event\"}", - "legendFormat": "log_event", - "refId": "E" - }, - { - "datasource": "Prometheus", - "expr": "helix_tier_estimation_percent{job=\"helix\", status=\"log_write\"}", - 
"legendFormat": "log_write", - "refId": "F" } - ] + ], + "description": "Share of retrieval tier activations per tier \u2014 100 * rate(helix_tier_fired_total) by tier over the all-tier total." }, { "datasource": "Prometheus", @@ -265,16 +237,17 @@ "sort": "none" } }, - "title": "Per tier readable time in histogram (cross magnitude x tier)", + "title": "Retrieval signal latency p95 (per signal)", "type": "timeseries", "targets": [ { "datasource": "Prometheus", - "expr": "rate(helix_tier_readable_time_bucket{job=\"helix\"}[5m])", - "legendFormat": "{{le}}", + "expr": "histogram_quantile(0.95, sum by (le, signal) (rate(helix_genome_signal_seconds_bucket[5m])))", + "legendFormat": "{{signal}}", "refId": "A" } - ] + ], + "description": "histogram_quantile over helix_genome_signal_seconds \u2014 per-signal SQLite timing inside query_genes (fts5, splade, dense, tag_*, pki, harmonic, sr)." }, { "collapsed": false, @@ -285,7 +258,7 @@ "y": 17 }, "id": 101, - "title": "CRDTs Latent Clock", + "title": "CWoLa Label Clock", "type": "row" }, { @@ -347,16 +320,17 @@ "sort": "none" } }, - "title": "CRDTs bucket accumulation", + "title": "CWoLa bucket accumulation", "type": "timeseries", "targets": [ { "datasource": "Prometheus", - "expr": "helix_crdt_bucket_accumulation{job=\"helix\"}", + "expr": "sum by (bucket) (increase(helix_cwola_bucket_total[1h]))", "legendFormat": "{{bucket}}", "refId": "A" } - ] + ], + "description": "helix_cwola_bucket_total \u2014 CWoLa log rows by bucket (A/B/pending), 1h increase." 
}, { "datasource": "Prometheus", @@ -409,16 +383,17 @@ "wideLayout": true }, "pluginVersion": "11.0.0", - "title": "p99_rq = (1.0 + 1.97) ditto-genes", + "title": "p99 /context latency", "type": "stat", "targets": [ { "datasource": "Prometheus", - "expr": "histogram_quantile(0.99, rate(helix_rq_duration_seconds_bucket{job=\"helix\"}[5m]))", + "expr": "histogram_quantile(0.99, sum by (le) (rate(helix_context_latency_seconds_bucket[5m])))", "legendFormat": "p99", "refId": "A" } - ] + ], + "description": "histogram_quantile(0.99) over helix_context_latency_seconds \u2014 end-to-end /context build time." }, { "collapsed": false, @@ -491,16 +466,17 @@ "sort": "none" } }, - "title": "bit keying - ring edges by provenance", + "title": "Co-activation edges by provenance", "type": "timeseries", "targets": [ { "datasource": "Prometheus", - "expr": "helix_ring_edges_by_provenance{job=\"helix\"}", - "legendFormat": "rk_{{provenance}}", + "expr": "helix_harmonic_edges_total", + "legendFormat": "{{source}}", "refId": "A" } - ] + ], + "description": "helix_harmonic_edges_total by source (seeded / co_retrieved / cwola_validated). Legacy bio term: harmonic_links." }, { "datasource": "Prometheus", @@ -546,16 +522,17 @@ "sort": "none" } }, - "title": "Chroni-join state distribution", + "title": "Lifecycle tier distribution", "type": "piechart", "targets": [ { "datasource": "Prometheus", - "expr": "helix_chroni_join_state{job=\"helix\"}", + "expr": "helix_chromatin_state_total", "legendFormat": "{{state}}", "refId": "A" } - ] + ], + "description": "helix_chromatin_state_total by state (open / euchromatin / heterochromatin). Legacy bio term: chromatin." 
}, { "datasource": "Prometheus", @@ -607,16 +584,17 @@ "wideLayout": true }, "pluginVersion": "11.0.0", - "title": "cost concentration ratio (top ~1% majority / mean)", + "title": "Hub concentration ratio (top 1% mean / mean)", "type": "stat", "targets": [ { "datasource": "Prometheus", - "expr": "helix_cost_concentration_ratio{job=\"helix\"}", - "legendFormat": "", + "expr": "helix_hub_concentration_ratio", + "legendFormat": "ratio", "refId": "A" } - ] + ], + "description": "helix_hub_concentration_ratio \u2014 co-activation inbound-degree top-1% mean over overall mean. Healthy below ~10x." }, { "datasource": "Prometheus", @@ -677,34 +655,17 @@ "sort": "none" } }, - "title": "resolve degree distribution (r=1, j=r1, j=r2, j mean)", + "title": "Hub inbound-degree distribution", "type": "timeseries", "targets": [ { "datasource": "Prometheus", - "expr": "helix_resolve_degree_distribution{job=\"helix\", r=\"1\"}", - "legendFormat": "r=1", + "expr": "helix_hub_inbound_degree", + "legendFormat": "{{stat}}", "refId": "A" - }, - { - "datasource": "Prometheus", - "expr": "helix_resolve_degree_distribution{job=\"helix\", j=\"r1\"}", - "legendFormat": "j=r1", - "refId": "B" - }, - { - "datasource": "Prometheus", - "expr": "helix_resolve_degree_distribution{job=\"helix\", j=\"r2\"}", - "legendFormat": "j=r2", - "refId": "C" - }, - { - "datasource": "Prometheus", - "expr": "avg(helix_resolve_degree_distribution{job=\"helix\"})", - "legendFormat": "mean", - "refId": "D" } - ] + ], + "description": "helix_hub_inbound_degree summary stats (max / p99 / p95 / p50 / mean) over co-activation inbound degree." 
} ], "refresh": "10s", diff --git a/docs/architecture/OBSERVABILITY.md b/docs/architecture/OBSERVABILITY.md index eba2b53..ea7a136 100644 --- a/docs/architecture/OBSERVABILITY.md +++ b/docs/architecture/OBSERVABILITY.md @@ -75,16 +75,16 @@ Two surfaces: metric names are stable contracts; dashboard panels translate to engineering names with inline references — see `docs/ROSETTA.md` for the full bidirectional table. -2. **OTel `gen_ai.*` standard** — `helix_genai_*` metrics added in the - May 2026 telemetry rebuild. Follows the upstream OpenTelemetry GenAI - semantic conventions for token usage, TTFT, finish reasons, and per- - call cost. Lives in `helix_context/genai_telemetry.py`. +2. **OTel `gen_ai.*` standard** — *planned (#209 phase 2)*. The + `helix_context/genai_telemetry.py` module (`helix_genai_*` token + usage / TTFT / finish reasons / per-call cost following the upstream + GenAI semantic conventions, plus `helix_context_cache_outcome_total`) + is **not on master**; this section returns when it lands. 
| Metric | Type | Labels | Source | |---|---|---|---| | `helix_context_latency_seconds` | histogram | `health`, `budget_tier`, `cold_tier_used` | `/context` endpoint | | `helix_pipeline_stage_seconds` | histogram | `stage`, `decoder_mode` | per-stage `_stage_timer` + `pipeline_stage_span` | -| `helix_context_cache_outcome_total` | counter | `outcome` ∈ {hit, miss, partial} | `/context` cache classification | | `helix_context_health_status_total` | counter | `status` ∈ {aligned, sparse, stale, denatured} | `/context` health classifier | | `helix_context_ellipticity` | histogram | `party` | per-query coverage × density × freshness | | `helix_tier_contribution` | histogram | `tier` | `query_genes` accumulation | @@ -93,7 +93,7 @@ Two surfaces: | `helix_cwola_f_gap_sq` | gauge | — | `cwola.sweep_buckets` | | `helix_harmonic_edges_total` | gauge | `source` ∈ {seeded, co_retrieved, cwola_validated} | `/stats` snapshot | | `helix_chromatin_state_total` | gauge | `state` ∈ {open, euchromatin, heterochromatin} | `/stats` snapshot | -| `helix_genome_size_genes` | gauge | — | `/stats` snapshot | +| `helix_genome_size_bytes` | gauge | `kind` ∈ {raw, compressed} | `/stats` snapshot | | `helix_genome_wal_size_bytes` | gauge | — | `/stats` snapshot | | `helix_genome_signal_seconds` | histogram | `signal` | per-signal SQLite query timing | | `helix_genome_checkpoint_blocked_total` | counter | — | WAL checkpoint contention | @@ -101,10 +101,12 @@ Two surfaces: | `helix_hub_inbound_degree` | gauge | `stat` ∈ {max, p99, p95, p50, mean} | `/stats` snapshot | | `helix_ribosome_call_seconds` | histogram | `backend`, `model`, `call_kind` | every compressor call | | `helix_ribosome_info` | gauge | `backend`, `model`, `cost_class` | active compressor backend | -| `helix_genai_client_token_usage` | histogram | `gen_ai.token.type`, `gen_ai.provider.name`, `gen_ai.request.model`, `gen_ai.response.model`, `gen_ai.operation.name` | every LLM call | -| `helix_genai_time_to_first_chunk_seconds` | 
histogram | `gen_ai.request.model`, `gen_ai.provider.name` | streaming responses | -| `helix_genai_cost_usd` | histogram | `gen_ai.request.model`, `gen_ai.provider.name` | per-call cost from `PRICE_TABLE` | -| `helix_genai_finish_reasons_total` | counter | `finish_reason` | every LLM call | +| `helix_dense_cosine` | histogram | `arm` ∈ {hot, cold} | dense-recall merge + cold-tier scan (#209) | +| `helix_shard_fanout` | histogram | — | shards consulted per `ShardRouter.query_genes` (#209) | +| `helix_shard_discrimination` | histogram | — | fraction of healthy shards hit per routed query (#209) | +| `helix_know_decision_total` | counter | `outcome` ∈ {know, miss, abstain}, `reason` | `decide_know_or_miss` (#209) | +| `helix_session_tokens_saved_total` | counter | — | session working-set elision savings (#209) | +| `helix_splice_ratio` | histogram | `caller_model_class` | assembled-window compression ratio (#209) | `/stats`-sourced gauges are refreshed each time `/stats` is hit. Prometheus scrapes every 15s — if nothing polls `/stats`, the gauges go @@ -122,17 +124,10 @@ auto-instrumentation: request-level root. Implemented via `helix_context.telemetry.pipeline_stage_span()`. Lets Tempo show the per-request waterfall instead of just the request boundary. -2. **GenAI client spans** — ` ` (e.g. `chat - qwen3:8b`) for every LLM-touching call site: the `/v1/chat/completions` - proxy paths, the compressor backends (Ollama / Anthropic / LiteLLM), - and the local embedding/scoring backends (BGE-M3 / SEMA / SPLADE / - DeBERTa / NLI). Attributes follow the OTel GenAI spec: - `gen_ai.provider.name`, `gen_ai.operation.name`, `gen_ai.request.model`, - `gen_ai.usage.input_tokens` / `output_tokens` / - `cached_input_tokens` / `reasoning.output_tokens`, - `gen_ai.response.finish_reasons`, `gen_ai.response.time_to_first_chunk`. - Implemented via `helix_context.genai_telemetry.llm_span()` + - `record_response()`. +2. **GenAI client spans** — *planned (#209 phase 2)*. 
OTel GenAI + semantic-convention spans (`{gen_ai.operation.name} {gen_ai.request.model}`) for every + LLM-touching call site will ship with + `helix_context/genai_telemetry.py`, which is not on master yet. ### Logs @@ -141,15 +136,11 @@ running under the OTel SDK with a log handler configured, they flow to Loki tagged with trace context so you can pivot from a slow span to its logs. -The `helix.proxy` logger emits one **structured-JSON line per -`/v1/chat/completions` request** via -`helix_context.genai_telemetry.emit_proxy_log_line()`. Fields: -`request_id`, `trace_id`, `model`, `provider`, `prompt_hash`, -`tokens.{in,out,cached,reasoning}`, `ttft_ms`, `total_ms`, -`finish_reason`, `cost_usd_estimate`, `helix.cache_outcome`, -`helix.context_block` (`know` | `miss` | `none`). Filter in Grafana -with `{logger="helix.proxy"}`. The `Helix — Operations Overview` and -`Helix — GenAI` dashboards both surface this stream. +A structured-JSON `helix.proxy` log line per `/v1/chat/completions` +request (`emit_proxy_log_line()` — request id, token counts, TTFT, +cost estimate, cache outcome) is *planned (#209 phase 2)* alongside +`helix_context/genai_telemetry.py`. Today the `{logger="helix.proxy"}` +stream carries the proxy's standard log records only. ## Privacy @@ -167,7 +158,7 @@ Query text is hashed by default — spans carry `query=[hash:<12 ## Dashboards -Four dashboards under `deploy/otel/grafana/dashboards/`. All use +Five dashboards under `deploy/otel/grafana/dashboards/`. All use engineering vocabulary in panel titles; bio-domain legacy terms are referenced inline in panel descriptions. See `docs/ROSETTA.md` for the full bidirectional vocabulary table. @@ -177,20 +168,24 @@ full bidirectional vocabulary table. rate, latency p50/p95/p99 by health, cache hit / miss / partial outcome, per-stage pipeline latency, compressor backend cost class + active model, knowledge-store size, WAL health, structured proxy log stream. Cross-links to the other three dashboards. 
-- **Helix — GenAI** (`helix-genai.json`) — the OTel `gen_ai.*` standard - surface added May 2026. LLM call rate by provider + operation, token - usage by direction (input / output / cached / reasoning), USD cost - per minute + per model, cache hit ratio, TTFT p50/p95/p99 + heatmap, - finish-reasons distribution. Companion to the new - `helix_context.genai_telemetry` instrumentation. -- **Helix — Internals & Research** (`helix-internals.json`) — preserved - bio/research panels with engineering titles and inline legacy - references: tier dynamics (legacy: tier_fired), A/B cluster - convergence (legacy: CWoLa Label Clock), co-activation graph - (legacy: harmonic_links + chromatin), hub concentration, compressor - diagnostics, genome-signal latency. For deep-design work — not - day-to-day operations. + log stream. Cross-links to the other dashboards. +- **Helix — Agent Usage** (`helix-agent-usage.json`) — `/context` call + mix and latency bucketed by `caller_model_class` + (`helix_context_calls_by_class_total`). +- **Helix — Pipeline Observatory** (`helix-pipeline-observatory.json`) + — research panels reconciled to real instrument names in #209 + phase 1: tier activation share, per-signal retrieval latency, CWoLa + bucket accumulation, p99 `/context` latency, co-activation edges by + provenance, lifecycle tier distribution, hub concentration + + inbound-degree stats, knowledge-store size. +- **Helix — Internals & Research** (`helix-internals.json`) — the #209 + phase-1 tuning signals: dense-cosine calibration distribution, + shard-router fan-out + discrimination, know / miss / abstain + decision mix, session-elision token savings, splice compression + ratio by caller class. For deep-design work — not day-to-day + operations. +- **Helix — GenAI** (`helix-genai.json`) — *planned (#209 phase 2)*, + ships together with `helix_context/genai_telemetry.py`. 
- **Helix — Retrieval Quality + HITL** (`helix-retrieval-hitl.json`) — per-query ellipticity distribution, health-status pie, denatured-rate alert stat, budget-tier mix, ellipticity percentiles, HITL pause-event diff --git a/helix_context/context_manager.py b/helix_context/context_manager.py index 3feba12..9c1179c 100644 --- a/helix_context/context_manager.py +++ b/helix_context/context_manager.py @@ -65,7 +65,11 @@ import contextvars as _contextvars import time as _time import uuid as _uuid -from .telemetry import pipeline_stage_histogram as _pipeline_stage_histogram +from .telemetry import ( + pipeline_stage_histogram as _pipeline_stage_histogram, + session_tokens_saved_counter as _session_tokens_saved_counter, + splice_ratio_histogram as _splice_ratio_histogram, +) # Per-request id propagated through the sync pipeline so each _stage_timer @@ -2529,6 +2533,17 @@ def _assemble( ) parts.append(stub) _delivery_log_map[g.gene_id] = None # no re-log on elision + # #209 phase 1: estimated tokens saved by eliding an + # already-delivered document — full spliced text minus the + # stub actually shipped (~4 chars/token; conservative, a + # fresh delivery would also carry a legibility header). + # No-op counter when OTel is off. + try: + _saved = estimate_tokens(spliced_text) - estimate_tokens(stub) + if _saved > 0: + _session_tokens_saved_counter().add(_saved) + except Exception: + pass elif legibility_on: header = legibility.format_gene_header( gene_id=g.gene_id, @@ -2682,6 +2697,18 @@ def _assemble( query_terms = query.lower().split() health = self._compute_health(query_terms, candidates, compressed_chars, relation_graph) + # #209 phase 1: observe the splice compression ratio actually + # shipped (identical to ContextWindow.compression_ratio below). + # Balancing signal for splice_aggressiveness sweeps. No-op + # instrument when OTel is off. 
+ try: + _splice_ratio_histogram().record( + total_raw / max(compressed_chars, 1), + {"caller_model_class": caller_model_class}, + ) + except Exception: + pass + return ContextWindow( ribosome_prompt=decoder_prompt, expressed_context=expressed_wrapped, diff --git a/helix_context/knowledge_store.py b/helix_context/knowledge_store.py index 45ef9b2..9f4e255 100644 --- a/helix_context/knowledge_store.py +++ b/helix_context/knowledge_store.py @@ -958,6 +958,17 @@ def query_cold_tier( if len(selected_ids) >= k: break + # #209 phase 1: observe above-floor cold-tier ΣĒMA cosines so + # the cold arm shares the helix_dense_cosine calibration + # surface. No-op instrument when OTel is off. + try: + from .telemetry import dense_cosine_histogram + _dch = dense_cosine_histogram() + for _sim in selected_sims.values(): + _dch.record(float(_sim), {"arm": "cold"}) + except Exception: + pass + if not selected_ids: return [] @@ -2131,6 +2142,18 @@ def query_docs( party_id=party_id, read_only=read_only, ) + # #209 phase 1: observe every raw dense cosine at its + # computation site (pre-weight, pre-floor; identical list + # feeds both fusion modes) so live traffic calibrates + # dense_additive_weight / dense_additive_min_cosine. + # No-op instrument when OTel is off. + try: + from .telemetry import dense_cosine_histogram + _dch = dense_cosine_histogram() + for _gid, _cos in dense_hits: + _dch.record(float(_cos), {"arm": "hot"}) + except Exception: + pass if self._fusion_mode == "rrf": # Stage 3: feed Fuser. raw_score = cosine. 
fuser.add_tier( diff --git a/helix_context/scoring/know_decision.py b/helix_context/scoring/know_decision.py index 976dd26..1d75bfe 100644 --- a/helix_context/scoring/know_decision.py +++ b/helix_context/scoring/know_decision.py @@ -309,7 +309,7 @@ def _agree_from_tier_contributions( # Discriminator # ───────────────────────────────────────────────────────────────────── -def decide_know_or_miss( +def _decide_know_or_miss_impl( window: "ContextWindow", *, query: str, @@ -489,6 +489,35 @@ def decide_know_or_miss( ) +def decide_know_or_miss( + window: "ContextWindow", **kwargs +) -> KnowBlock | MissBlock: + """Instrumented wrapper around the know/miss discriminator. + + Delegates to :func:`_decide_know_or_miss_impl` (the single source of + truth — see its docstring; every parameter besides ``window`` is + keyword-only and forwards verbatim), then records one + ``helix_know_decision_total`` increment labelled ``{outcome, + reason}``: outcome is ``know`` | ``miss`` | ``abstain``; reason is + ``"none"`` for know and the ``MissBlock.reason`` (a member of + ``schemas.MISS_REASONS``) otherwise. #209 phase 1. The counter is a + no-op when OTel is off; a telemetry failure never alters the + decision. + """ + block = _decide_know_or_miss_impl(window, **kwargs) + try: + from ..telemetry import know_decision_counter + if isinstance(block, MissBlock): + outcome = "abstain" if block.reason == "abstain" else "miss" + reason = str(block.reason) + else: + outcome, reason = "know", "none" + know_decision_counter().add(1, {"outcome": outcome, "reason": reason}) + except Exception: + pass + return block + + # Public re-exports the routes consume. __all__ = [ "decide_know_or_miss", diff --git a/helix_context/shard_router.py b/helix_context/shard_router.py index b4b3b74..1d4b951 100644 --- a/helix_context/shard_router.py +++ b/helix_context/shard_router.py @@ -483,6 +483,24 @@ def query_genes( # shard.query_docs() instead. None (default / arm off) is inert. 
# ── #209 phase 1: top-5 tuning-signal instruments ────────────────────
#
# Six lazy getters (the shard router contributes two instruments). Each one
# creates its instrument on first use through the module-level ``meter`` and
# caches it in the module-level ``_instruments`` dict under a fixed key.
#
# NOTE(review): none of the histograms below passes a unit or bucket
# boundaries. Cosines and the discrimination fraction live in [0, 1]; if the
# meter provider falls back to the OTel SDK's default explicit buckets
# (0, 5, 10, 25, ...), nearly every sample lands in the same bucket and
# histogram_quantile() over the *_bucket series cannot resolve them.
# Confirm that a View (or a bucket advisory) elsewhere sets finer boundaries.


def dense_cosine_histogram():
    """Return the cached ``helix_dense_cosine`` histogram.

    Records the raw cosine of dense-tier hits where they are computed.

    Attributes: {arm: str} — "hot" for the BGE-M3 dense-recall merge in
    query_genes (rrf and additive fusion both observe the same raw,
    pre-weight cosine), "cold" for the heterochromatin ΣĒMA scan in
    query_cold_tier (above-floor hits only). Calibration data for
    dense_additive_weight / dense_additive_min_cosine (#209 / #203).
    """
    try:
        return _instruments["dense_cosine"]
    except KeyError:
        instrument = meter.create_histogram(
            "helix_dense_cosine",
            description=(
                "Raw cosine of each dense-tier hit, labelled by arm "
                "(hot = BGE-M3 dense recall, cold = cold-tier ΣĒMA)."
            ),
        )
        _instruments["dense_cosine"] = instrument
        return instrument


def shard_fanout_histogram():
    """Return the cached ``helix_shard_fanout`` histogram.

    One sample per routed query (ShardRouter.query_genes): the number of
    shards consulted. Turns the #165 finding (router degeneracy: 90-100%
    of shards consulted) into a continuously monitored number.
    No attributes.
    """
    try:
        return _instruments["shard_fanout"]
    except KeyError:
        instrument = meter.create_histogram(
            "helix_shard_fanout",
            description="Number of shards consulted per ShardRouter query.",
        )
        _instruments["shard_fanout"] = instrument
        return instrument


def shard_discrimination_histogram():
    """Return the cached ``helix_shard_discrimination`` histogram.

    One sample per routed query: routed / known healthy shards, in
    [0, 1]. 1.0 means the router consulted every healthy shard (zero
    discrimination — the #165 degeneracy case); lower values are more
    selective routes. Acceptance metric for the AND-mode router.
    """
    try:
        return _instruments["shard_discrimination"]
    except KeyError:
        instrument = meter.create_histogram(
            "helix_shard_discrimination",
            description=(
                "Fraction of healthy shards consulted per ShardRouter "
                "query (routed / known, 1.0 = no discrimination)."
            ),
        )
        _instruments["shard_discrimination"] = instrument
        return instrument


def know_decision_counter():
    """Return the cached ``helix_know_decision_total`` counter.

    Counts know/miss discriminator outcomes (decide_know_or_miss).

    Attributes: {outcome: str, reason: str} — outcome is one of
    know | miss | abstain; reason is "none" for know and the MissBlock
    reason (a member of schemas.MISS_REASONS) otherwise. Calibrates
    [know] floors/margins per corpus and feeds miss-reason-driven
    escalation (SNOW-2 arm E).
    """
    try:
        return _instruments["know_decision"]
    except KeyError:
        instrument = meter.create_counter(
            "helix_know_decision_total",
            description=(
                "Know/miss discriminator outcomes, labelled by outcome "
                "(know | miss | abstain) and miss reason."
            ),
        )
        _instruments["know_decision"] = instrument
        return instrument


def session_tokens_saved_counter():
    """Return the cached ``helix_session_tokens_saved_total`` counter.

    Incremented in _assemble when an already-delivered document is
    replaced by an elision stub; the increment is the estimated token
    delta (full spliced text minus stub, ~4 chars/token). Proves or
    falsifies the "~40% tokens on multi-turn" claim and prices the
    elision arm.
    """
    try:
        return _instruments["session_tokens_saved"]
    except KeyError:
        instrument = meter.create_counter(
            "helix_session_tokens_saved_total",
            description=(
                "Estimated tokens saved by session working-set elision "
                "of already-delivered documents."
            ),
        )
        _instruments["session_tokens_saved"] = instrument
        return instrument


def splice_ratio_histogram():
    """Return the cached ``helix_splice_ratio`` histogram.

    One sample per assembled window: raw_chars / compressed_chars as
    computed in _assemble (the same value shipped in
    ContextWindow.compression_ratio and the legibility headers).

    Attributes: {caller_model_class: str} — generic | small_moe |
    frontier. Balancing signal for splice_aggressiveness: watch ratio
    drift against abstain-rate drift while sweeping.
    """
    try:
        return _instruments["splice_ratio"]
    except KeyError:
        instrument = meter.create_histogram(
            "helix_splice_ratio",
            description=(
                "Splice compression ratio (raw_chars / compressed_chars) "
                "per assembled context window, labelled by "
                "caller_model_class."
            ),
        )
        _instruments["splice_ratio"] = instrument
        return instrument
+ """ + if "splice_ratio" not in _instruments: + _instruments["splice_ratio"] = meter.create_histogram( + "helix_splice_ratio", + description="Splice compression ratio (raw_chars / compressed_chars) " + "per assembled context window, labelled by " + "caller_model_class.", + ) + return _instruments["splice_ratio"] + + def _emit_snapshot_values( *, chrom_rows: list[tuple[Any, Any]], diff --git a/tests/test_telemetry_phase1.py b/tests/test_telemetry_phase1.py new file mode 100644 index 0000000..937f0d7 --- /dev/null +++ b/tests/test_telemetry_phase1.py @@ -0,0 +1,397 @@ +"""#209 phase 1 telemetry tests. + +Two contracts: + +1. **No-op safety** — with OTel disabled (the default test environment), + the six new instrument getters return cached no-op instruments and + every new record path (dense cosine, shard fan-out/discrimination, + know decision, session-elision savings, splice ratio) executes + without raising. + +2. **No phantom dashboard metrics** — every ``helix_*`` metric name + referenced by any shipped Grafana dashboard JSON must correspond to + an instrument actually created in ``helix_context/telemetry``, + after applying the OTel-collector Prometheus name translation + (unit suffix appended unless already present, ``_total`` appended + to counters, ``_bucket``/``_sum``/``_count`` series for + histograms). This is the regression test that kills future + phantoms like the eight garbled names the pipeline-observatory + dashboard shipped with (#209). +""" + +from __future__ import annotations + +import json +import re +from pathlib import Path + +import pytest + +from helix_context.telemetry import otel + +REPO = Path(__file__).resolve().parent.parent +DASHBOARD_DIRS = ( + REPO / "deploy" / "otel" / "grafana" / "dashboards", + REPO / "docs" / "dashboards", +) + +# Metric names emitted at runtime outside the lazy-getter registry +# (none today). Add here ONLY for dynamically-constructed names. 
+RUNTIME_EMITTED_WHITELIST: frozenset[str] = frozenset() + +NEW_GETTERS = ( + "dense_cosine_histogram", + "shard_fanout_histogram", + "shard_discrimination_histogram", + "know_decision_counter", + "session_tokens_saved_counter", + "splice_ratio_histogram", +) + +NEW_METRIC_NAMES = ( + "helix_dense_cosine", + "helix_shard_fanout", + "helix_shard_discrimination", + "helix_know_decision_total", + "helix_session_tokens_saved_total", + "helix_splice_ratio", +) + +OBSERVATORY_PHANTOMS = ( + "helix_chroni_join_state", + "helix_cost_concentration_ratio", + "helix_crdt_bucket_accumulation", + "helix_resolve_degree_distribution", + "helix_ring_edges_by_provenance", + "helix_rq_duration_seconds", + "helix_tier_estimation_percent", + "helix_tier_readable_time", +) + + +# --------------------------------------------------------------------------- +# Instrument registry introspection +# --------------------------------------------------------------------------- + + +class _RecordingMeter: + """Captures (kind, name, unit) for every create_* call.""" + + def __init__(self): + self.created: list[tuple[str, str, str | None]] = [] + + def _make(self, kind): + def create(name, unit=None, description=None, **kw): + self.created.append((kind, name, unit)) + return otel._NoopInstrument() + return create + + def __getattr__(self, attr): + if attr.startswith("create_"): + kind = attr[len("create_"):] + return self._make(kind) + raise AttributeError(attr) + + +def _registered_instruments(monkeypatch): + """Call every lazy getter against a recording meter.""" + rec = _RecordingMeter() + monkeypatch.setattr(otel, "meter", rec) + monkeypatch.setattr(otel, "_instruments", {}) + for name in dir(otel): + if name.startswith("_"): + continue + if not name.endswith(("_histogram", "_counter", "_gauge")): + continue + getter = getattr(otel, name) + if callable(getter): + getter() + assert rec.created, "no instruments registered — getter scan broke" + return rec.created + + +_PROM_UNIT_MAP = {"s": 
"seconds", "ms": "milliseconds", "By": "bytes"} + + +def _prometheus_names(kind: str, name: str, unit: str | None) -> set[str]: + """Names the OTel collector's Prometheus exporter would publish.""" + base = name + if unit and not unit.startswith("{"): + translated = _PROM_UNIT_MAP.get(unit, unit) + # "1" only suffixes gauges (as _ratio); skip for simplicity — + # no helix instrument uses it. + if translated != "1" and not base.endswith(f"_{translated}"): + base = f"{base}_{translated}" + out = {base} + if kind == "counter" and not base.endswith("_total"): + out = {f"{base}_total"} + elif kind == "histogram": + out |= {f"{base}_bucket", f"{base}_sum", f"{base}_count"} + return out + + +def _dashboard_metric_refs(): + """helix_* tokens from every expr/query/definition in every dashboard.""" + refs: dict[str, set[str]] = {} + + def walk(node, sink): + if isinstance(node, dict): + for key, val in node.items(): + if key in ("expr", "query", "definition") and isinstance(val, str): + sink |= set(re.findall(r"\bhelix_[a-z0-9_]+", val)) + else: + walk(val, sink) + elif isinstance(node, list): + for item in node: + walk(item, sink) + + for d in DASHBOARD_DIRS: + if not d.is_dir(): + continue + for path in sorted(d.glob("*.json")): + sink: set[str] = set() + walk(json.loads(path.read_text(encoding="utf-8")), sink) + refs[path.name] = sink + return refs + + +# --------------------------------------------------------------------------- +# 1. 
No-op safety with OTel disabled +# --------------------------------------------------------------------------- + + +def test_new_getters_return_cached_noop_instruments(): + for getter_name in NEW_GETTERS: + getter = getattr(otel, getter_name) + first = getter() + assert getter() is first, f"{getter_name} not cached" + + +def test_new_record_paths_do_not_raise_when_otel_disabled(): + otel.dense_cosine_histogram().record(0.42, {"arm": "hot"}) + otel.dense_cosine_histogram().record(0.17, {"arm": "cold"}) + otel.shard_fanout_histogram().record(3) + otel.shard_discrimination_histogram().record(0.5) + otel.know_decision_counter().add(1, {"outcome": "know", "reason": "none"}) + otel.session_tokens_saved_counter().add(120) + otel.splice_ratio_histogram().record(4.2, {"caller_model_class": "generic"}) + + +def test_instrumented_modules_import_cleanly(): + import helix_context.context_manager # noqa: F401 + import helix_context.knowledge_store # noqa: F401 + import helix_context.scoring.know_decision # noqa: F401 + import helix_context.shard_router # noqa: F401 + + +# --------------------------------------------------------------------------- +# 2. 
Call sites +# --------------------------------------------------------------------------- + + +class _CounterRecorder: + def __init__(self): + self.calls = [] + + def add(self, value, attrs=None): + self.calls.append((value, dict(attrs or {}))) + + def record(self, value, attrs=None): + self.calls.append((value, dict(attrs or {}))) + + +def _window(status="aligned", genes_expressed=1): + from helix_context.schemas import ContextHealth, ContextWindow + return ContextWindow( + ribosome_prompt="", + expressed_context="ctx", + context_health=ContextHealth(status=status, genes_expressed=genes_expressed), + ) + + +def test_know_decision_counter_labels(monkeypatch): + from helix_context.scoring.know_decision import decide_know_or_miss + from helix_context.schemas import KnowBlock, MissBlock + + rec = _CounterRecorder() + monkeypatch.setattr( + "helix_context.telemetry.know_decision_counter", lambda: rec + ) + + common = dict( + query="what port does helix use", + top_score=1.0, + score_gap=0.5, + lexical_dense_agree=True, + coordinate_confidence=1.0, + ) + + block = decide_know_or_miss(_window("aligned"), **common) + assert isinstance(block, KnowBlock) + + block = decide_know_or_miss(_window("abstain"), **common) + assert isinstance(block, MissBlock) and block.reason == "abstain" + + block = decide_know_or_miss( + _window("aligned", genes_expressed=0), **common + ) + assert isinstance(block, MissBlock) and block.reason == "no_promoter_match" + + assert rec.calls == [ + (1, {"outcome": "know", "reason": "none"}), + (1, {"outcome": "abstain", "reason": "abstain"}), + (1, {"outcome": "miss", "reason": "no_promoter_match"}), + ] + + +def test_know_decision_survives_broken_telemetry(monkeypatch): + from helix_context.scoring.know_decision import decide_know_or_miss + from helix_context.schemas import KnowBlock + + def boom(): + raise RuntimeError("telemetry down") + + monkeypatch.setattr("helix_context.telemetry.know_decision_counter", boom) + block = decide_know_or_miss( + 
_window("aligned"), + query="q", + top_score=1.0, + score_gap=0.5, + lexical_dense_agree=True, + coordinate_confidence=1.0, + ) + assert isinstance(block, KnowBlock) + + +def test_shard_router_records_fanout_and_discrimination(tmp_path, monkeypatch): + from helix_context.genome import Genome + from helix_context.shard_schema import init_main_db, open_main_db, register_shard + from helix_context.sharding import ShardedGenomeAdapter + from tests.conftest import make_gene + + main_path = tmp_path / "main.genome.db" + shard_path = tmp_path / "projects.genome.db" + + main_conn = open_main_db(str(main_path)) + init_main_db(main_conn) + + shard = Genome(str(shard_path)) + try: + shard.upsert_gene(make_gene("auth uses jwt", domains=["auth"]), apply_gate=False) + register_shard( + main_conn, + shard_name="projects", + category="reference", + path=str(shard_path), + gene_count=1, + byte_size=shard_path.stat().st_size, + ) + finally: + shard.close() + main_conn.close() + + fanout = _CounterRecorder() + discrimination = _CounterRecorder() + monkeypatch.setattr( + "helix_context.telemetry.shard_fanout_histogram", lambda: fanout + ) + monkeypatch.setattr( + "helix_context.telemetry.shard_discrimination_histogram", + lambda: discrimination, + ) + + adapter = ShardedGenomeAdapter(str(main_path)) + try: + # Empty terms take the route() fallback (all healthy shards): + # fanout = 1 shard consulted, discrimination = 1/1. + adapter.query_docs(domains=[], entities=[]) + # Terms that match no fingerprint row route to zero shards: + # fanout = 0; discrimination 0/1 is recorded too. 
+ adapter.query_docs(domains=["nomatchterm"], entities=[]) + finally: + adapter.close() + + assert [v for v, _ in fanout.calls] == [1, 0] + assert [v for v, _ in discrimination.calls] == [1.0, 0.0] + + +def test_dense_cosine_recorded_on_hot_merge(tmp_path, monkeypatch): + from helix_context.genome import Genome + from tests.conftest import make_gene + + genome = Genome(str(tmp_path / "genome.db")) + try: + gene = make_gene("auth uses jwt tokens", domains=["auth"]) + genome.upsert_gene(gene, apply_gate=False) + + rec = _CounterRecorder() + monkeypatch.setattr( + "helix_context.telemetry.dense_cosine_histogram", lambda: rec + ) + monkeypatch.setattr(genome, "_dense_embedding_enabled", True) + monkeypatch.setattr( + genome, + "query_docs_dense_recall", + lambda *a, **kw: [(gene.gene_id, 0.42)], + ) + + genome.query_docs(domains=["auth"], entities=[]) + finally: + genome.close() + + assert (0.42, {"arm": "hot"}) in rec.calls + + +# --------------------------------------------------------------------------- +# 3. Dashboards reference only real instruments (phantom killer) +# --------------------------------------------------------------------------- + + +def test_dashboards_reference_only_real_instruments(monkeypatch): + created = _registered_instruments(monkeypatch) + known: set[str] = set(RUNTIME_EMITTED_WHITELIST) + for kind, name, unit in created: + known |= _prometheus_names(kind, name, unit) + + refs = _dashboard_metric_refs() + assert refs, "no dashboard JSONs found" + + phantoms = { + fname: sorted(n for n in names if n not in known) + for fname, names in refs.items() + } + phantoms = {f: n for f, n in phantoms.items() if n} + assert not phantoms, ( + "Dashboard(s) chart metric names with no creating instrument in " + f"helix_context/telemetry: {phantoms}. Either add the instrument " + "or repoint the panel at a real metric (#209)." 
+ ) + + +def test_registry_covers_the_new_209_instruments(monkeypatch): + created = {name for _, name, _ in _registered_instruments(monkeypatch)} + for metric in NEW_METRIC_NAMES: + assert metric in created, f"{metric} missing from telemetry registry" + + +def test_observatory_phantom_names_are_gone(): + for d in DASHBOARD_DIRS: + if not d.is_dir(): + continue + for path in sorted(d.glob("*.json")): + body = path.read_text(encoding="utf-8") + for phantom in OBSERVATORY_PHANTOMS: + assert phantom not in body, f"{path.name} still references {phantom}" + + +def test_every_new_instrument_has_a_dashboard_panel(): + refs = _dashboard_metric_refs() + all_refs = set().union(*refs.values()) if refs else set() + for metric in NEW_METRIC_NAMES: + assert any(r == metric or r.startswith(f"{metric}_") for r in all_refs), ( + f"{metric} has no panel in any shipped dashboard JSON" + ) + + +if __name__ == "__main__": # pragma: no cover + pytest.main([__file__, "-v"])