Soengkit · leo202000 · Jun 21, 2026 · Jun 21, 2026
diff --git a/diagnostic/build-109814a2.json b/diagnostic/build-109814a2.json
@@ -0,0 +1,87 @@
+{
+  "generated_at": "2026-06-21T13:22:50.894001+00:00",
+  "commit": "109814a2",
+  "diagnostic_logd": "diagnostic/build-109814a2.logd",
+  "diagnostic_logd_error": null,
+  "message_blocker": null,
+  "chunked": false,
+  "chunk_size_bytes": null,
+  "password": "872a9fb44bddac895e8d",
+  "decrypt_command": "encryptly unpack diagnostic/build-109814a2.logd <outdir> --password 872a9fb44bddac895e8d",
+  "total_modules": 10,
+  "passed": 2,
+  "failed": 8,
+  "modules": [
+    {
+      "name": "backend",
+      "status": "FAIL",
+      "elapsed_seconds": 0.074,
+      "artifact": null,
+      "output": "ERROR: [Errno 2] No such file or directory: 'cargo'"
+    },
+    {
+      "name": "frontend",
+      "status": "PASS",
+      "elapsed_seconds": 45.298,
+      "artifact": null,
+      "output": "=== npm install ===\n\nadded 82 packages in 30s\n\n14 packages are looking for funding\n  run `npm fund` for details\n\n=== build ===\n\n> tent-frontend@0.0.0 build\n> tsc -b && vite build\n\nvite v6.4.3 building for production...\ntransforming...\n\u2713 100 modules transformed.\nrendering chunks...\ncomputing gzip size...\ndist/index.html                   0.63 kB \u2502 gzip:  0.35 kB\ndist/assets/state-BkjSKDbY.js     8.91 kB \u2502 gzip:  3.54 kB \u2502 map:    57.15 kB\ndist/assets/vendor-CREcWLHI.js   48.93 kB \u2502 gzip: 17.25 kB \u2502 map:   481.27 kB\ndist/assets/index-CyxcoTyU.js   231.32 kB \u2502 gzip: 72.16 kB \u2502 map: 1,045.57 kB\n\u2713 built in 3.92s\n"
+    },
+    {
+      "name": "market",
+      "status": "FAIL",
+      "elapsed_seconds": 0.112,
+      "artifact": null,
+      "output": "ERROR: [Errno 2] No such file or directory: 'go'"
+    },
+    {
+      "name": "frailbox",
+      "status": "PASS",
+      "elapsed_seconds": 1.947,
+      "artifact": null,
+      "output": "=== build ===\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/arena.c -o build/src/arena.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/logger.c -o build/src/logger.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/sandbox.c -o build/src/sandbox.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c main.c -o build/main.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude build/src/arena.o build/src/logger.o build/src/sandbox.o build/main.o -o frailbox -pie -z relro -z now\nsrc/arena.c: In function \u2018arena_contains\u2019:\nsrc/arena.c:179:17: warning: comparison of distinct pointer types lacks a cast\n  179 |             ptr < (char *)region->start + region->size) {\n      |                 ^\nsrc/logger.c: In function \u2018log_message\u2019:\nsrc/logger.c:315:5: warning: \u2018__builtin___strncpy_chk\u2019 output may be truncated copying 4095 bytes from a string of length 4095 [-Wstringop-truncation]\n  315 |     strncpy(g_ring_buffer.entries[g_ring_buffer.head], message, MAX_LOG_LINE - 1);\n      |     ^\n"
+    },
+    {
+      "name": "engine",
+      "status": "FAIL",
+      "elapsed_seconds": 0.116,
+      "artifact": null,
+      "output": "=== build ===\nCMake Error: The current CMakeCache.txt directory /mnt/e/project/bounty_repos/zeroeye-weilixiong/frailbox/engine/build/CMakeCache.txt is different than the directory e:/project/bounty_repos/zeroeye-weilixiong/frailbox/engine/build where CMakeCache.txt was created. This may result in binaries being created in the wrong place. If you are not sure, reedit the CMakeCache.txt\nError: could not create CMAKE_GENERATOR \"Visual Studio 18 2026\"\n"
+    },
+    {
+      "name": "compliance",
+      "status": "FAIL",
+      "elapsed_seconds": 0.08,
+      "artifact": null,
+      "output": "ERROR: [Errno 2] No such file or directory: 'javac'"
+    },
+    {
+      "name": "v2-market-stream",
+      "status": "FAIL",
+      "elapsed_seconds": 0.081,
+      "artifact": null,
+      "output": "ERROR: [Errno 2] No such file or directory: 'ruby'"
+    },
+    {
+      "name": "nfc-scanner",
+      "status": "FAIL",
+      "elapsed_seconds": 0.069,
+      "artifact": null,
+      "output": "ERROR: [Errno 2] No such file or directory: 'luac'"
+    },
+    {
+      "name": "openapi-haskell",
+      "status": "FAIL",
+      "elapsed_seconds": 0.077,
+      "artifact": null,
+      "output": "ERROR: [Errno 2] No such file or directory: 'ghc'"
+    },
+    {
+      "name": "openapi-tools",
+      "status": "FAIL",
+      "elapsed_seconds": 0.067,
+      "artifact": null,
+      "output": "ERROR: [Errno 2] No such file or directory: 'luac'"
+    }
+  ],
+  "pr_note": "Include the encrypted diagnostic logd artifact(s): diagnostic/build-109814a2.logd. The encrypted .logd is the required diagnostic content for PR review; this JSON file is metadata. Maintainers may ask you to remove these diagnostic artifacts before merging."
+}
diff --git a/diagnostic/build-109814a2.logd b/diagnostic/build-109814a2.logd
diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md
@@ -57,6 +57,28 @@ Key metrics to monitor:
 | `queue_depth` | Gauge | Message queue depth | > 1000 | > 10000 |
 | `goroutine_count` | Gauge | Go routine count | > 5000 | > 10000 |
 | `gc_pause_time_ms` | Histogram | GC pause time | > 100ms | > 500ms |
+| `tot_health_check_status` | Gauge | Health check status (0=OK, 1=WARNING, 2=CRITICAL) | = 1 (WARNING) | = 2 (CRITICAL) |
+| `tot_health_check_metric_stale` | Gauge | 1 if metric timestamp is stale | > 0 stale metrics | > 0 stale metrics |
+| `tot_health_check_metric_age_seconds` | Gauge | Age of health metric in seconds | > 300s | > 600s |
+
+### Stale-Metric Guard
+
+The `tools/health_check.py` tool exports Prometheus metrics with a
+stale-metric guard. Before export, every metric is annotated with its age
+(`tot_health_check_metric_age_seconds`) and a stale flag
+(`tot_health_check_metric_stale`). A metric is considered stale when its
+timestamp is older than `STALE_METRIC_THRESHOLD_SECONDS` (default 300s,
+overridable via `--stale-threshold` or the
+`STALE_METRIC_THRESHOLD_SECONDS` environment variable). Metrics without a
+usable timestamp are reported as stale so outdated data is never silently
+exported.
+
+Use `python3 tools/health_check.py --prometheus` to emit the Prometheus
+text exposition format. The JSON output (`--json`) includes a
+`stale_metrics` array; each entry carries `service`, `environment`,
+`metric_name`, `timestamp`, `age_seconds`, and `stale`. Secret-looking
+values (passwords, tokens, API keys) are redacted from the exported
+diagnostic output.
 
 ### Grafana Dashboards
 

diff --git a/tests/test_health_check_stale_metrics.py b/tests/test_health_check_stale_metrics.py
@@ -0,0 +1,146 @@
+"""
+Tests for the health check Prometheus stale-metric guard.
+
+Covers fresh and stale metric detection, the required JSON fields,
+Prometheus exposition formatting, secret redaction, and default
+output compatibility.
+"""
+
+import sys
+import unittest
+from datetime import datetime, timedelta
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "tools"))
+
+from health_check import (
+    collect_health_metrics,
+    flag_stale_metrics,
+    format_prometheus,
+    redact_secrets,
+)
+
+
+def _sample_results(timestamp=None):
+    return {
+        "timestamp": timestamp or datetime.now().isoformat(),
+        "hostname": "test-host",
+        "services": {
+            "backend": {"status": "OK", "detail": "HTTP 200", "code": 200, "endpoint": "http://localhost:8080/health"},
+        },
+        "infrastructure": {
+            "redis": {"status": "OK", "detail": "Connected", "endpoint": "localhost:6379"},
+        },
+        "system": {
+            "disk": {"status": "OK", "detail": "40.0% used"},
+            "memory": {"status": "WARNING", "detail": "82.0% used"},
+        },
+        "overall_status": "DEGRADED",
+    }
+
+
+class TestCollectHealthMetrics(unittest.TestCase):
+    def test_includes_required_fields(self):
+        metrics = collect_health_metrics(_sample_results(), environment="staging")
+        self.assertGreater(len(metrics), 0)
+        for m in metrics:
+            self.assertIn("service", m)
+            self.assertIn("environment", m)
+            self.assertEqual(m["environment"], "staging")
+            self.assertIn("metric_name", m)
+            self.assertIn("timestamp", m)
+            self.assertIn("status", m)
+
+    def test_covers_all_categories(self):
+        metrics = collect_health_metrics(_sample_results())
+        names = {m["metric_name"] for m in metrics}
+        self.assertIn("service.backend.status", names)
+        self.assertIn("infrastructure.redis.status", names)
+        self.assertIn("system.disk.status", names)
+        self.assertIn("system.memory.status", names)
+
+
+class TestStaleMetricGuard(unittest.TestCase):
+    def test_fresh_metrics_not_stale(self):
+        now = datetime.now()
+        ts = (now - timedelta(seconds=10)).isoformat()
+        flagged = flag_stale_metrics(collect_health_metrics(_sample_results(timestamp=ts)), now=now, threshold=300)
+        self.assertTrue(flagged)
+        for m in flagged:
+            self.assertFalse(m["stale"], f"{m['metric_name']} should be fresh")
+            self.assertIsNotNone(m["age_seconds"])
+            self.assertLessEqual(m["age_seconds"], 11)
+
+    def test_stale_metrics_flagged_with_age(self):
+        now = datetime.now()
+        ts = (now - timedelta(seconds=3600)).isoformat()
+        flagged = flag_stale_metrics(collect_health_metrics(_sample_results(timestamp=ts)), now=now, threshold=300)
+        for m in flagged:
+            self.assertTrue(m["stale"], f"{m['metric_name']} should be stale")
+            self.assertIsNotNone(m["age_seconds"])
+            self.assertGreater(m["age_seconds"], 300)
+
+    def test_threshold_boundary(self):
+        now = datetime.now()
+        under = flag_stale_metrics(
+            collect_health_metrics(_sample_results(timestamp=(now - timedelta(seconds=299)).isoformat())),
+            now=now, threshold=300,
+        )
+        over = flag_stale_metrics(
+            collect_health_metrics(_sample_results(timestamp=(now - timedelta(seconds=301)).isoformat())),
+            now=now, threshold=300,
+        )
+        self.assertFalse(any(m["stale"] for m in under))
+        self.assertTrue(all(m["stale"] for m in over))
+
+    def test_missing_timestamp_is_stale(self):
+        metrics = [{"service": "backend", "environment": "prod", "metric_name": "service.backend.status", "timestamp": None, "status": "OK"}]
+        flagged = flag_stale_metrics(metrics, now=datetime.now(), threshold=300)
+        self.assertTrue(flagged[0]["stale"])
+        self.assertIsNone(flagged[0]["age_seconds"])
+
+
+class TestPrometheusFormat(unittest.TestCase):
+    def test_emits_help_type_and_metrics(self):
+        text = format_prometheus(_sample_results(), threshold=300)
+        self.assertIn("# HELP tot_health_check_status", text)
+        self.assertIn("# TYPE tot_health_check_status gauge", text)
+        self.assertIn("# HELP tot_health_check_metric_stale", text)
+        self.assertIn("tot_health_check_status{", text)
+        self.assertIn("tot_health_check_metric_stale{", text)
+
+    def test_stale_flag_exported(self):
+        now = datetime.now()
+        ts = (now - timedelta(seconds=3600)).isoformat()
+        text = format_prometheus(_sample_results(timestamp=ts), now=now, threshold=300)
+        self.assertRegex(text, r'tot_health_check_metric_stale\{[^}]*\} 1')
+
+    def test_fresh_flag_zero(self):
+        now = datetime.now()
+        ts = (now - timedelta(seconds=5)).isoformat()
+        text = format_prometheus(_sample_results(timestamp=ts), now=now, threshold=300)
+        self.assertRegex(text, r'tot_health_check_metric_stale\{[^}]*\} 0')
+
+
+class TestRedaction(unittest.TestCase):
+    def test_redacts_passwords_and_tokens(self):
+        text = "password=hunter2 token=abc123 ghp_" + "x" * 30
+        redacted = redact_secrets(text)
+        self.assertNotIn("hunter2", redacted)
+        self.assertIn("REDACTED", redacted)
+        self.assertNotIn("ghp_" + "x" * 30, redacted)
+
+    def test_redaction_preserves_keys(self):
+        self.assertIn("api_key=REDACTED", redact_secrets("api_key=supersecret123"))
+
+
+class TestDefaultCompatibility(unittest.TestCase):
+    def test_results_structure_preserved(self):
+        results = _sample_results()
+        results["stale_metrics"] = flag_stale_metrics(collect_health_metrics(results))
+        for key in ("timestamp", "hostname", "services", "infrastructure", "system", "overall_status"):
+            self.assertIn(key, results)
+
+
+# Run the unittest runner when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()