Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions diagnostic/build-109814a2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"generated_at": "2026-06-21T13:22:50.894001+00:00",
"commit": "109814a2",
"diagnostic_logd": "diagnostic/build-109814a2.logd",
"diagnostic_logd_error": null,
"message_blocker": null,
"chunked": false,
"chunk_size_bytes": null,
"password": "872a9fb44bddac895e8d",
"decrypt_command": "encryptly unpack diagnostic/build-109814a2.logd <outdir> --password 872a9fb44bddac895e8d",
"total_modules": 10,
"passed": 2,
"failed": 8,
"modules": [
{
"name": "backend",
"status": "FAIL",
"elapsed_seconds": 0.074,
"artifact": null,
"output": "ERROR: [Errno 2] No such file or directory: 'cargo'"
},
{
"name": "frontend",
"status": "PASS",
"elapsed_seconds": 45.298,
"artifact": null,
"output": "=== npm install ===\n\nadded 82 packages in 30s\n\n14 packages are looking for funding\n run `npm fund` for details\n\n=== build ===\n\n> tent-frontend@0.0.0 build\n> tsc -b && vite build\n\nvite v6.4.3 building for production...\ntransforming...\n\u2713 100 modules transformed.\nrendering chunks...\ncomputing gzip size...\ndist/index.html 0.63 kB \u2502 gzip: 0.35 kB\ndist/assets/state-BkjSKDbY.js 8.91 kB \u2502 gzip: 3.54 kB \u2502 map: 57.15 kB\ndist/assets/vendor-CREcWLHI.js 48.93 kB \u2502 gzip: 17.25 kB \u2502 map: 481.27 kB\ndist/assets/index-CyxcoTyU.js 231.32 kB \u2502 gzip: 72.16 kB \u2502 map: 1,045.57 kB\n\u2713 built in 3.92s\n"
},
{
"name": "market",
"status": "FAIL",
"elapsed_seconds": 0.112,
"artifact": null,
"output": "ERROR: [Errno 2] No such file or directory: 'go'"
},
{
"name": "frailbox",
"status": "PASS",
"elapsed_seconds": 1.947,
"artifact": null,
"output": "=== build ===\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/arena.c -o build/src/arena.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/logger.c -o build/src/logger.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/sandbox.c -o build/src/sandbox.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c main.c -o build/main.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude build/src/arena.o build/src/logger.o build/src/sandbox.o build/main.o -o frailbox -pie -z relro -z now\nsrc/arena.c: In function \u2018arena_contains\u2019:\nsrc/arena.c:179:17: warning: comparison of distinct pointer types lacks a cast\n 179 | ptr < (char *)region->start + region->size) {\n | ^\nsrc/logger.c: In function \u2018log_message\u2019:\nsrc/logger.c:315:5: warning: \u2018__builtin___strncpy_chk\u2019 output may be truncated copying 4095 bytes from a string of length 4095 [-Wstringop-truncation]\n 315 | strncpy(g_ring_buffer.entries[g_ring_buffer.head], message, MAX_LOG_LINE - 1);\n | ^\n"
},
{
"name": "engine",
"status": "FAIL",
"elapsed_seconds": 0.116,
"artifact": null,
"output": "=== build ===\nCMake Error: The current CMakeCache.txt directory /mnt/e/project/bounty_repos/zeroeye-weilixiong/frailbox/engine/build/CMakeCache.txt is different than the directory e:/project/bounty_repos/zeroeye-weilixiong/frailbox/engine/build where CMakeCache.txt was created. This may result in binaries being created in the wrong place. If you are not sure, reedit the CMakeCache.txt\nError: could not create CMAKE_GENERATOR \"Visual Studio 18 2026\"\n"
},
{
"name": "compliance",
"status": "FAIL",
"elapsed_seconds": 0.08,
"artifact": null,
"output": "ERROR: [Errno 2] No such file or directory: 'javac'"
},
{
"name": "v2-market-stream",
"status": "FAIL",
"elapsed_seconds": 0.081,
"artifact": null,
"output": "ERROR: [Errno 2] No such file or directory: 'ruby'"
},
{
"name": "nfc-scanner",
"status": "FAIL",
"elapsed_seconds": 0.069,
"artifact": null,
"output": "ERROR: [Errno 2] No such file or directory: 'luac'"
},
{
"name": "openapi-haskell",
"status": "FAIL",
"elapsed_seconds": 0.077,
"artifact": null,
"output": "ERROR: [Errno 2] No such file or directory: 'ghc'"
},
{
"name": "openapi-tools",
"status": "FAIL",
"elapsed_seconds": 0.067,
"artifact": null,
"output": "ERROR: [Errno 2] No such file or directory: 'luac'"
}
],
"pr_note": "Include the encrypted diagnostic logd artifact(s): diagnostic/build-109814a2.logd. The encrypted .logd is the required diagnostic content for PR review; this JSON file is metadata. Maintainers may ask you to remove these diagnostic artifacts before merging."
}
Binary file added diagnostic/build-109814a2.logd
Binary file not shown.
22 changes: 22 additions & 0 deletions docs/OPERATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,28 @@ Key metrics to monitor:
| `queue_depth` | Gauge | Message queue depth | > 1000 | > 10000 |
| `goroutine_count` | Gauge | Goroutine count | > 5000 | > 10000 |
| `gc_pause_time_ms` | Histogram | GC pause time | > 100ms | > 500ms |
| `tot_health_check_status` | Gauge | Health check status (0=OK,1=WARNING,2=CRITICAL) | WARNING | CRITICAL |
| `tot_health_check_metric_stale` | Gauge | 1 if metric timestamp is stale | > 0 stale metrics | > 0 stale metrics |
| `tot_health_check_metric_age_seconds` | Gauge | Age of health metric in seconds | > 300s | > 600s |

### Stale-Metric Guard

The `tools/health_check.py` tool exports Prometheus metrics with a
stale-metric guard. Before export, every metric is annotated with its age
(`tot_health_check_metric_age_seconds`) and a stale flag
(`tot_health_check_metric_stale`). A metric is considered stale when its
timestamp is older than `STALE_METRIC_THRESHOLD_SECONDS` (default 300s,
overridable via `--stale-threshold` or the
`STALE_METRIC_THRESHOLD_SECONDS` environment variable). Metrics without a
usable timestamp are reported as stale so outdated data is never silently
exported.

Use `python3 tools/health_check.py --prometheus` to emit the exposition
format. The JSON output (`--json`) includes a `stale_metrics` array whose
entries carry `service`, `environment`, `metric_name`, `timestamp`,
`age_seconds`, and `stale` for each metric. Secret-looking values
(passwords, tokens, API keys) are redacted from the exported diagnostic
output.

### Grafana Dashboards

Expand Down
146 changes: 146 additions & 0 deletions tests/test_health_check_stale_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""
Tests for the health check Prometheus stale-metric guard.

Covers fresh and stale metric detection, the required JSON fields,
Prometheus exposition formatting, secret redaction, and default
output compatibility.
"""

import sys
import unittest
from datetime import datetime, timedelta
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / "tools"))

from health_check import (
collect_health_metrics,
flag_stale_metrics,
format_prometheus,
redact_secrets,
)


def _sample_results(timestamp=None):
    """Build a canned health-check results payload for the tests.

    Args:
        timestamp: Value stored under the ``"timestamp"`` key. When it is
            falsy (``None`` or an empty string) the current local time in
            ISO-8601 form is used instead.

    Returns:
        A dict with the keys ``timestamp``, ``hostname``, ``services``,
        ``infrastructure``, ``system`` and ``overall_status``. Every
        component reports ``OK`` except memory, which reports ``WARNING``;
        the overall status is ``DEGRADED``.
    """
    if not timestamp:
        timestamp = datetime.now().isoformat()

    backend = {
        "status": "OK",
        "detail": "HTTP 200",
        "code": 200,
        "endpoint": "http://localhost:8080/health",
    }
    redis = {
        "status": "OK",
        "detail": "Connected",
        "endpoint": "localhost:6379",
    }
    disk = {"status": "OK", "detail": "40.0% used"}
    memory = {"status": "WARNING", "detail": "82.0% used"}

    payload = {}
    payload["timestamp"] = timestamp
    payload["hostname"] = "test-host"
    payload["services"] = {"backend": backend}
    payload["infrastructure"] = {"redis": redis}
    payload["system"] = {"disk": disk, "memory": memory}
    payload["overall_status"] = "DEGRADED"
    return payload


class TestCollectHealthMetrics(unittest.TestCase):
    """Checks on the metric records built by ``collect_health_metrics``."""

    def test_includes_required_fields(self):
        """Every record carries the required keys and the given environment."""
        records = collect_health_metrics(_sample_results(), environment="staging")
        self.assertGreater(len(records), 0)

        required_keys = ("service", "environment", "metric_name", "timestamp", "status")
        for record in records:
            for key in required_keys:
                self.assertIn(key, record)
            self.assertEqual(record["environment"], "staging")

    def test_covers_all_categories(self):
        """Service, infrastructure and system entries all yield a metric."""
        records = collect_health_metrics(_sample_results())
        seen_names = set()
        for record in records:
            seen_names.add(record["metric_name"])

        expected_names = (
            "service.backend.status",
            "infrastructure.redis.status",
            "system.disk.status",
            "system.memory.status",
        )
        for expected in expected_names:
            self.assertIn(expected, seen_names)


class TestStaleMetricGuard(unittest.TestCase):
    """Checks on the ``stale`` / ``age_seconds`` annotations from ``flag_stale_metrics``.

    Every test that inspects a flagged list first asserts that the list is
    non-empty. Without that guard a ``for`` loop, ``any(...)`` or
    ``all(...)`` over an empty result would pass without checking anything.
    """

    @staticmethod
    def _flag_at_age(age_seconds, now, threshold=300):
        """Flag the sample metrics as if they were taken ``age_seconds`` before ``now``."""
        stamp = (now - timedelta(seconds=age_seconds)).isoformat()
        metrics = collect_health_metrics(_sample_results(timestamp=stamp))
        return flag_stale_metrics(metrics, now=now, threshold=threshold)

    def test_fresh_metrics_not_stale(self):
        """Metrics 10 seconds old are not stale under a 300 second threshold."""
        now = datetime.now()
        flagged = self._flag_at_age(10, now)
        self.assertTrue(flagged)
        for m in flagged:
            self.assertFalse(m["stale"], f"{m['metric_name']} should be fresh")
            self.assertIsNotNone(m["age_seconds"])
            self.assertLessEqual(m["age_seconds"], 11)

    def test_stale_metrics_flagged_with_age(self):
        """Metrics one hour old are stale and report an age above the threshold."""
        now = datetime.now()
        flagged = self._flag_at_age(3600, now)
        # Guard against a vacuous pass: the loop below checks nothing if empty.
        self.assertTrue(flagged)
        for m in flagged:
            self.assertTrue(m["stale"], f"{m['metric_name']} should be stale")
            self.assertIsNotNone(m["age_seconds"])
            self.assertGreater(m["age_seconds"], 300)

    def test_threshold_boundary(self):
        """Ages of 299s and 301s fall on opposite sides of a 300s threshold."""
        now = datetime.now()
        under = self._flag_at_age(299, now)
        over = self._flag_at_age(301, now)
        # any([]) is False and all([]) is True, so empty lists would pass silently.
        self.assertTrue(under)
        self.assertTrue(over)
        self.assertFalse(any(m["stale"] for m in under))
        self.assertTrue(all(m["stale"] for m in over))

    def test_missing_timestamp_is_stale(self):
        """A metric whose timestamp is ``None`` is stale with no computed age."""
        metrics = [
            {
                "service": "backend",
                "environment": "prod",
                "metric_name": "service.backend.status",
                "timestamp": None,
                "status": "OK",
            }
        ]
        flagged = flag_stale_metrics(metrics, now=datetime.now(), threshold=300)
        self.assertTrue(flagged[0]["stale"])
        self.assertIsNone(flagged[0]["age_seconds"])


class TestPrometheusFormat(unittest.TestCase):
    """Checks on the text returned by ``format_prometheus``."""

    @staticmethod
    def _render_with_age(age_seconds):
        """Render the sample results as if taken ``age_seconds`` ago."""
        reference = datetime.now()
        stamp = (reference - timedelta(seconds=age_seconds)).isoformat()
        return format_prometheus(_sample_results(timestamp=stamp), now=reference, threshold=300)

    def test_emits_help_type_and_metrics(self):
        """HELP/TYPE headers and labelled samples are all present."""
        exposition = format_prometheus(_sample_results(), threshold=300)
        expected_fragments = (
            "# HELP tot_health_check_status",
            "# TYPE tot_health_check_status gauge",
            "# HELP tot_health_check_metric_stale",
            "tot_health_check_status{",
            "tot_health_check_metric_stale{",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, exposition)

    def test_stale_flag_exported(self):
        """An hour-old timestamp produces a stale sample with value 1."""
        exposition = self._render_with_age(3600)
        self.assertRegex(exposition, r'tot_health_check_metric_stale\{[^}]*\} 1')

    def test_fresh_flag_zero(self):
        """A five-second-old timestamp produces a stale sample with value 0."""
        exposition = self._render_with_age(5)
        self.assertRegex(exposition, r'tot_health_check_metric_stale\{[^}]*\} 0')


class TestRedaction(unittest.TestCase):
    """Checks on ``redact_secrets``."""

    def test_redacts_passwords_and_tokens(self):
        """The password value and the ``ghp_``-prefixed token are removed."""
        prefixed_token = "ghp_" + "x" * 30
        raw = "password=hunter2 token=abc123 " + prefixed_token
        cleaned = redact_secrets(raw)
        self.assertNotIn("hunter2", cleaned)
        self.assertIn("REDACTED", cleaned)
        self.assertNotIn(prefixed_token, cleaned)

    def test_redaction_preserves_keys(self):
        """The key name stays in place; only the value is replaced."""
        cleaned = redact_secrets("api_key=supersecret123")
        self.assertIn("api_key=REDACTED", cleaned)


class TestDefaultCompatibility(unittest.TestCase):
    """Checks that adding ``stale_metrics`` leaves the original keys in place."""

    def test_results_structure_preserved(self):
        """All original top-level keys are still present after the addition."""
        payload = _sample_results()
        flagged = flag_stale_metrics(collect_health_metrics(payload))
        payload["stale_metrics"] = flagged

        original_keys = (
            "timestamp",
            "hostname",
            "services",
            "infrastructure",
            "system",
            "overall_status",
        )
        missing = [key for key in original_keys if key not in payload]
        self.assertEqual(missing, [])


# Run the suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Loading