diff --git a/diagnostic/build-109814a2.json b/diagnostic/build-109814a2.json new file mode 100644 index 00000000..29a7a911 --- /dev/null +++ b/diagnostic/build-109814a2.json @@ -0,0 +1,87 @@ +{ + "generated_at": "2026-06-21T13:22:50.894001+00:00", + "commit": "109814a2", + "diagnostic_logd": "diagnostic/build-109814a2.logd", + "diagnostic_logd_error": null, + "message_blocker": null, + "chunked": false, + "chunk_size_bytes": null, + "password": "872a9fb44bddac895e8d", + "decrypt_command": "encryptly unpack diagnostic/build-109814a2.logd --password 872a9fb44bddac895e8d", + "total_modules": 10, + "passed": 2, + "failed": 8, + "modules": [ + { + "name": "backend", + "status": "FAIL", + "elapsed_seconds": 0.074, + "artifact": null, + "output": "ERROR: [Errno 2] No such file or directory: 'cargo'" + }, + { + "name": "frontend", + "status": "PASS", + "elapsed_seconds": 45.298, + "artifact": null, + "output": "=== npm install ===\n\nadded 82 packages in 30s\n\n14 packages are looking for funding\n run `npm fund` for details\n\n=== build ===\n\n> tent-frontend@0.0.0 build\n> tsc -b && vite build\n\nvite v6.4.3 building for production...\ntransforming...\n\u2713 100 modules transformed.\nrendering chunks...\ncomputing gzip size...\ndist/index.html 0.63 kB \u2502 gzip: 0.35 kB\ndist/assets/state-BkjSKDbY.js 8.91 kB \u2502 gzip: 3.54 kB \u2502 map: 57.15 kB\ndist/assets/vendor-CREcWLHI.js 48.93 kB \u2502 gzip: 17.25 kB \u2502 map: 481.27 kB\ndist/assets/index-CyxcoTyU.js 231.32 kB \u2502 gzip: 72.16 kB \u2502 map: 1,045.57 kB\n\u2713 built in 3.92s\n" + }, + { + "name": "market", + "status": "FAIL", + "elapsed_seconds": 0.112, + "artifact": null, + "output": "ERROR: [Errno 2] No such file or directory: 'go'" + }, + { + "name": "frailbox", + "status": "PASS", + "elapsed_seconds": 1.947, + "artifact": null, + "output": "=== build ===\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/arena.c -o 
build/src/arena.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/logger.c -o build/src/logger.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/sandbox.c -o build/src/sandbox.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c main.c -o build/main.o\ngcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude build/src/arena.o build/src/logger.o build/src/sandbox.o build/main.o -o frailbox -pie -z relro -z now\nsrc/arena.c: In function \u2018arena_contains\u2019:\nsrc/arena.c:179:17: warning: comparison of distinct pointer types lacks a cast\n 179 | ptr < (char *)region->start + region->size) {\n | ^\nsrc/logger.c: In function \u2018log_message\u2019:\nsrc/logger.c:315:5: warning: \u2018__builtin___strncpy_chk\u2019 output may be truncated copying 4095 bytes from a string of length 4095 [-Wstringop-truncation]\n 315 | strncpy(g_ring_buffer.entries[g_ring_buffer.head], message, MAX_LOG_LINE - 1);\n | ^\n" + }, + { + "name": "engine", + "status": "FAIL", + "elapsed_seconds": 0.116, + "artifact": null, + "output": "=== build ===\nCMake Error: The current CMakeCache.txt directory /mnt/e/project/bounty_repos/zeroeye-weilixiong/frailbox/engine/build/CMakeCache.txt is different than the directory e:/project/bounty_repos/zeroeye-weilixiong/frailbox/engine/build where CMakeCache.txt was created. This may result in binaries being created in the wrong place. 
If you are not sure, reedit the CMakeCache.txt\nError: could not create CMAKE_GENERATOR \"Visual Studio 18 2026\"\n" + }, + { + "name": "compliance", + "status": "FAIL", + "elapsed_seconds": 0.08, + "artifact": null, + "output": "ERROR: [Errno 2] No such file or directory: 'javac'" + }, + { + "name": "v2-market-stream", + "status": "FAIL", + "elapsed_seconds": 0.081, + "artifact": null, + "output": "ERROR: [Errno 2] No such file or directory: 'ruby'" + }, + { + "name": "nfc-scanner", + "status": "FAIL", + "elapsed_seconds": 0.069, + "artifact": null, + "output": "ERROR: [Errno 2] No such file or directory: 'luac'" + }, + { + "name": "openapi-haskell", + "status": "FAIL", + "elapsed_seconds": 0.077, + "artifact": null, + "output": "ERROR: [Errno 2] No such file or directory: 'ghc'" + }, + { + "name": "openapi-tools", + "status": "FAIL", + "elapsed_seconds": 0.067, + "artifact": null, + "output": "ERROR: [Errno 2] No such file or directory: 'luac'" + } + ], + "pr_note": "Include the encrypted diagnostic logd artifact(s): diagnostic/build-109814a2.logd. The encrypted .logd is the required diagnostic content for PR review; this JSON file is metadata. Maintainers may ask you to remove these diagnostic artifacts before merging." 
+} \ No newline at end of file diff --git a/diagnostic/build-109814a2.logd b/diagnostic/build-109814a2.logd new file mode 100644 index 00000000..dacb7d68 Binary files /dev/null and b/diagnostic/build-109814a2.logd differ diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md index 58642e7b..13dcac86 100644 --- a/docs/OPERATIONS.md +++ b/docs/OPERATIONS.md @@ -57,6 +57,28 @@ Key metrics to monitor: | `queue_depth` | Gauge | Message queue depth | > 1000 | > 10000 | | `goroutine_count` | Gauge | Go routine count | > 5000 | > 10000 | | `gc_pause_time_ms` | Histogram | GC pause time | > 100ms | > 500ms | +| `tot_health_check_status` | Gauge | Health check status (0=OK,1=WARNING,2=CRITICAL) | WARNING | CRITICAL | +| `tot_health_check_metric_stale` | Gauge | 1 if metric timestamp is stale | > 0 stale metrics | > 0 stale metrics | +| `tot_health_check_metric_age_seconds` | Gauge | Age of health metric in seconds | > 300s | > 600s | + +### Stale-Metric Guard + +The `tools/health_check.py` tool exports Prometheus metrics with a +stale-metric guard. Before export, every metric is annotated with its age +(`tot_health_check_metric_age_seconds`) and a stale flag +(`tot_health_check_metric_stale`). A metric is considered stale when its +timestamp is older than `STALE_METRIC_THRESHOLD_SECONDS` (default 300s, +overridable via `--stale-threshold` or the +`STALE_METRIC_THRESHOLD_SECONDS` environment variable). Metrics without a +usable timestamp are reported as stale so outdated data is never silently +exported. + +Use `python3 tools/health_check.py --prometheus` to emit the exposition +format. The JSON output (`--json`) includes a `stale_metrics` array whose +entries carry `service`, `environment`, `metric_name`, `timestamp`, +`age_seconds`, and `stale` for each metric. Secret-looking values +(passwords, tokens, API keys) are redacted from the Prometheus exposition +output only; the JSON output (`--json`) is written unredacted. 
"""
Tests for the health check Prometheus stale-metric guard.

Exercises fresh/stale classification, the fields every flattened metric
record must carry, the Prometheus exposition rendering, secret redaction,
and that the pre-existing result structure is left intact.
"""

import sys
import unittest
from datetime import datetime, timedelta
from pathlib import Path

# Make tools/health_check.py importable without installing the project.
sys.path.insert(0, str(Path(__file__).parent.parent / "tools"))

from health_check import (
    collect_health_metrics,
    flag_stale_metrics,
    format_prometheus,
    redact_secrets,
)


def _sample_results(timestamp=None):
    """Build a minimal health-check result dict stamped with *timestamp*."""
    captured_at = timestamp or datetime.now().isoformat()
    services = {
        "backend": {"status": "OK", "detail": "HTTP 200", "code": 200, "endpoint": "http://localhost:8080/health"},
    }
    infrastructure = {
        "redis": {"status": "OK", "detail": "Connected", "endpoint": "localhost:6379"},
    }
    system = {
        "disk": {"status": "OK", "detail": "40.0% used"},
        "memory": {"status": "WARNING", "detail": "82.0% used"},
    }
    return {
        "timestamp": captured_at,
        "hostname": "test-host",
        "services": services,
        "infrastructure": infrastructure,
        "system": system,
        "overall_status": "DEGRADED",
    }


def _flagged_at_age(age_seconds, threshold=300):
    """Flag the sample metrics as if they were collected *age_seconds* ago."""
    reference = datetime.now()
    stamp = (reference - timedelta(seconds=age_seconds)).isoformat()
    records = collect_health_metrics(_sample_results(timestamp=stamp))
    return flag_stale_metrics(records, now=reference, threshold=threshold)


def _rendered_at_age(age_seconds, threshold=300):
    """Render the sample results as exposition text at the given metric age."""
    reference = datetime.now()
    stamp = (reference - timedelta(seconds=age_seconds)).isoformat()
    return format_prometheus(_sample_results(timestamp=stamp), now=reference, threshold=threshold)


class TestCollectHealthMetrics(unittest.TestCase):
    def test_includes_required_fields(self):
        records = collect_health_metrics(_sample_results(), environment="staging")
        self.assertGreater(len(records), 0)
        for record in records:
            for field in ("service", "environment", "metric_name", "timestamp", "status"):
                self.assertIn(field, record)
            self.assertEqual(record["environment"], "staging")

    def test_covers_all_categories(self):
        seen = {record["metric_name"] for record in collect_health_metrics(_sample_results())}
        for expected in (
            "service.backend.status",
            "infrastructure.redis.status",
            "system.disk.status",
            "system.memory.status",
        ):
            self.assertIn(expected, seen)


class TestStaleMetricGuard(unittest.TestCase):
    def test_fresh_metrics_not_stale(self):
        flagged = _flagged_at_age(10)
        self.assertTrue(flagged)
        for m in flagged:
            self.assertFalse(m["stale"], f"{m['metric_name']} should be fresh")
            self.assertIsNotNone(m["age_seconds"])
            self.assertLessEqual(m["age_seconds"], 11)

    def test_stale_metrics_flagged_with_age(self):
        for m in _flagged_at_age(3600):
            self.assertTrue(m["stale"], f"{m['metric_name']} should be stale")
            self.assertIsNotNone(m["age_seconds"])
            self.assertGreater(m["age_seconds"], 300)

    def test_threshold_boundary(self):
        just_under = _flagged_at_age(299)
        just_over = _flagged_at_age(301)
        self.assertFalse(any(record["stale"] for record in just_under))
        self.assertTrue(all(record["stale"] for record in just_over))

    def test_missing_timestamp_is_stale(self):
        record = {
            "service": "backend",
            "environment": "prod",
            "metric_name": "service.backend.status",
            "timestamp": None,
            "status": "OK",
        }
        (only,) = flag_stale_metrics([record], now=datetime.now(), threshold=300)[:1]
        self.assertTrue(only["stale"])
        self.assertIsNone(only["age_seconds"])


class TestPrometheusFormat(unittest.TestCase):
    def test_emits_help_type_and_metrics(self):
        rendered = format_prometheus(_sample_results(), threshold=300)
        for fragment in (
            "# HELP tot_health_check_status",
            "# TYPE tot_health_check_status gauge",
            "# HELP tot_health_check_metric_stale",
            "tot_health_check_status{",
            "tot_health_check_metric_stale{",
        ):
            self.assertIn(fragment, rendered)

    def test_stale_flag_exported(self):
        self.assertRegex(_rendered_at_age(3600), r'tot_health_check_metric_stale\{[^}]*\} 1')

    def test_fresh_flag_zero(self):
        self.assertRegex(_rendered_at_age(5), r'tot_health_check_metric_stale\{[^}]*\} 0')


class TestRedaction(unittest.TestCase):
    def test_redacts_passwords_and_tokens(self):
        github_token = "ghp_" + "x" * 30
        cleaned = redact_secrets("password=hunter2 token=abc123 " + github_token)
        self.assertNotIn("hunter2", cleaned)
        self.assertIn("REDACTED", cleaned)
        self.assertNotIn(github_token, cleaned)

    def test_redaction_preserves_keys(self):
        cleaned = redact_secrets("api_key=supersecret123")
        self.assertIn("api_key=REDACTED", cleaned)


class TestDefaultCompatibility(unittest.TestCase):
    def test_results_structure_preserved(self):
        payload = _sample_results()
        payload["stale_metrics"] = flag_stale_metrics(collect_health_metrics(payload))
        for key in ("timestamp", "hostname", "services", "infrastructure", "system", "overall_status"):
            self.assertIn(key, payload)


if __name__ == "__main__":
    unittest.main()
# Stale-metric guard configuration
STALE_METRIC_THRESHOLD_SECONDS = int(os.environ.get("STALE_METRIC_THRESHOLD_SECONDS", "300"))
ENVIRONMENT = os.environ.get("ENVIRONMENT", "production")
_STATUS_TO_VALUE = {"OK": 0, "WARNING": 1, "CRITICAL": 2}

# ---------------------------------------------------------------------------
# PROMETHEUS EXPORT & STALE-METRIC GUARD
# ---------------------------------------------------------------------------

# Key names whose values are treated as secrets.
_SECRET_KEYS = (
    r"(?:password|passwd|pwd|secret|token|api[_-]?key|access[_-]?key"
    r"|private[_-]?key|credential)"
)
# A secret value: a quoted string (escape-aware) or a bare token. The last
# alternative tolerates an opening quote that is never closed.
_SECRET_VALUE = (
    r"""(?P<value>"(?:\\.|[^"\\\n])+"|'(?:\\.|[^'\\\n])+'|["']?[^\s,;"']+)"""
)
# key=value, key: value, and JSON / dict-repr style "key": "value".
_SECRET_KV_RE = re.compile(
    r"(?P<prefix>" + _SECRET_KEYS + r"""["']?\s*[:=]\s*)""" + _SECRET_VALUE,
    re.IGNORECASE,
)
# Command-line style: --password VALUE, --db-password VALUE, --api-key VALUE.
_SECRET_FLAG_RE = re.compile(
    r"(?P<prefix>--[\w-]*?" + _SECRET_KEYS + r"\s+)" + _SECRET_VALUE,
    re.IGNORECASE,
)
_BEARER_RE = re.compile(r'(authorization\s*[:=]\s*bearer\s+)[^\s,;"\']+', re.IGNORECASE)
_GITHUB_TOKEN_RE = re.compile(r'ghp_[A-Za-z0-9]{20,}')
_SK_TOKEN_RE = re.compile(r'sk-[A-Za-z0-9]{20,}')


def _mask_secret(match) -> str:
    """Replace a matched secret value with REDACTED, keeping its quoting."""
    value = match.group("value")
    opening = value[0] if value[0] in "\"'" else ""
    # Only re-emit a closing quote when the original value really had one.
    closing = opening if opening and len(value) > 1 and value.endswith(opening) else ""
    return f"{match.group('prefix')}{opening}REDACTED{closing}"


def redact_secrets(text: str) -> str:
    """Redact secret-looking values from diagnostic/output text.

    Handles ``key=value`` / ``key: value`` pairs (bare or quoted, including
    JSON-style ``"key": "value"``), ``--key value`` command-line flags,
    ``Authorization: Bearer`` headers, and ``ghp_`` / ``sk-`` tokens. The key
    name is always preserved; only the value is replaced with ``REDACTED``.

    Args:
        text: Text to sanitise. Falsy input is returned unchanged.

    Returns:
        The text with secret values replaced.
    """
    if not text:
        return text
    text = _SECRET_KV_RE.sub(_mask_secret, text)
    text = _SECRET_FLAG_RE.sub(_mask_secret, text)
    text = _BEARER_RE.sub(r'\1REDACTED', text)
    text = _GITHUB_TOKEN_RE.sub('ghp_REDACTED', text)
    text = _SK_TOKEN_RE.sub('sk-REDACTED', text)
    return text


def _escape_prometheus_label(value) -> str:
    """Escape a value for safe use inside a Prometheus label.

    Backslash, double quote and newline are the characters escaped here.
    """
    return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


def _metric_record(
    service: str,
    environment: str,
    metric_name: str,
    timestamp: Any,
    info: Any,
) -> Dict[str, Any]:
    """Build one flattened metric record; non-dict ``info`` yields status None."""
    return {
        "service": service,
        "environment": environment,
        "metric_name": metric_name,
        "timestamp": timestamp,
        "status": info.get("status") if isinstance(info, dict) else None,
    }


def collect_health_metrics(results: Dict[str, Any], environment: Optional[str] = None) -> List[Dict[str, Any]]:
    """Flatten health check results into metric records for stale reporting.

    Each record carries ``service``, ``environment``, ``metric_name``,
    ``timestamp`` and ``status``. All records share the top-level
    ``results["timestamp"]``.

    Args:
        results: Health check results with optional ``services``,
            ``infrastructure`` and ``system`` mappings. A missing or ``None``
            category contributes no records.
        environment: Environment label; falls back to ``ENVIRONMENT``.

    Returns:
        Records ordered services first (each followed by its nested
        sub-checks), then infrastructure, then system.
    """
    environment = environment or ENVIRONMENT
    timestamp = results.get("timestamp")
    metrics: List[Dict[str, Any]] = []

    for service_name, info in (results.get("services") or {}).items():
        metrics.append(
            _metric_record(service_name, environment, f"service.{service_name}.status", timestamp, info)
        )
        if not isinstance(info, dict):
            continue
        # Nested dicts that carry their own "status" are reported as sub-checks.
        for sub_name, sub_info in info.items():
            if sub_name != "status" and isinstance(sub_info, dict) and "status" in sub_info:
                metrics.append(
                    _metric_record(
                        service_name,
                        environment,
                        f"service.{service_name}.{sub_name}.status",
                        timestamp,
                        sub_info,
                    )
                )

    for infra_name, info in (results.get("infrastructure") or {}).items():
        metrics.append(
            _metric_record(infra_name, environment, f"infrastructure.{infra_name}.status", timestamp, info)
        )

    for sys_name, info in (results.get("system") or {}).items():
        metrics.append(
            _metric_record("system", environment, f"system.{sys_name}.status", timestamp, info)
        )

    return metrics


def _is_aware(moment: datetime) -> bool:
    """Return True when *moment* carries a usable UTC offset."""
    return moment.tzinfo is not None and moment.utcoffset() is not None


def _coerce_datetime(value: Any) -> datetime:
    """Convert an ISO-8601 string or datetime into a datetime.

    A trailing ``Z`` is rewritten to ``+00:00`` because
    ``datetime.fromisoformat`` only accepts it from Python 3.11 onwards.

    Raises:
        ValueError: The string is not valid ISO-8601.
        TypeError: The value is neither a string nor a datetime.
    """
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        text = value.strip()
        if text[-1:] in ("Z", "z"):
            text = text[:-1] + "+00:00"
        return datetime.fromisoformat(text)
    raise TypeError(f"unsupported timestamp type: {type(value).__name__}")


def _elapsed_seconds(now: datetime, collected_at: datetime) -> float:
    """Return ``now - collected_at`` in seconds, tolerating naive/aware mixes.

    When exactly one side is naive it is interpreted as local time, so a
    naive ``datetime.now()`` can be compared with an offset-bearing timestamp
    instead of raising ``TypeError``.
    """
    if not isinstance(now, datetime):
        raise TypeError(f"unsupported now type: {type(now).__name__}")
    if _is_aware(now) != _is_aware(collected_at):
        if _is_aware(now):
            collected_at = collected_at.astimezone()
        else:
            now = now.astimezone()
    return (now - collected_at).total_seconds()


def flag_stale_metrics(
    metrics: List[Dict[str, Any]],
    now: Optional[datetime] = None,
    threshold: int = STALE_METRIC_THRESHOLD_SECONDS,
) -> List[Dict[str, Any]]:
    """Annotate each metric with age_seconds and a stale flag.

    A metric is stale when its timestamp is older than ``threshold`` seconds
    relative to ``now``. Metrics without a usable timestamp (missing,
    unparseable, or of an unsupported type) are reported as stale with
    ``age_seconds`` set to ``None`` so outdated data is never silently
    exported.

    Args:
        metrics: Metric records; each may carry a ``timestamp`` that is an
            ISO-8601 string or a ``datetime``.
        now: Reference time (datetime or ISO-8601 string). Defaults to
            ``datetime.now()``.
        threshold: Maximum age in seconds before a metric counts as stale.

    Returns:
        New records (inputs are not mutated) with ``age_seconds`` (rounded to
        milliseconds, or ``None``) and ``stale`` added.
    """
    now = now or datetime.now()
    if isinstance(now, str):
        now = _coerce_datetime(now)

    flagged: List[Dict[str, Any]] = []
    for metric in metrics:
        record = dict(metric)
        timestamp = record.get("timestamp")
        age_seconds: Optional[float] = None
        stale = True
        if timestamp is not None:
            try:
                age_seconds = _elapsed_seconds(now, _coerce_datetime(timestamp))
                stale = age_seconds > threshold
            except (ValueError, TypeError, OverflowError, OSError):
                # Unusable timestamp: keep the fail-safe "stale" verdict.
                age_seconds = None
                stale = True
        record["age_seconds"] = round(age_seconds, 3) if age_seconds is not None else None
        record["stale"] = bool(stale)
        flagged.append(record)
    return flagged


def format_prometheus(
    results: Dict[str, Any],
    now: Optional[datetime] = None,
    threshold: int = STALE_METRIC_THRESHOLD_SECONDS,
    environment: Optional[str] = None,
) -> str:
    """Render health check results as Prometheus exposition text.

    Stale metrics are flagged via ``tot_health_check_metric_stale`` so scrapers
    can alert on outdated data. Secret-looking label values are redacted
    *before* escaping; redacting the finished text could swallow an escape
    backslash and leave an unbalanced quote inside a label.

    Samples are emitted grouped per metric family (HELP, TYPE, then every
    sample of that family), as the text exposition format requires.

    Args:
        results: Health check results as accepted by ``collect_health_metrics``.
        now: Reference time for staleness; defaults to the current time.
        threshold: Staleness threshold in seconds.
        environment: Environment label; falls back to ``ENVIRONMENT``.

    Returns:
        Exposition text terminated by a newline. Unknown statuses map to 2
        (CRITICAL); the age sample is omitted when the age is unknown.
    """
    metrics = flag_stale_metrics(
        collect_health_metrics(results, environment=environment),
        now=now,
        threshold=threshold,
    )

    status_samples: List[str] = []
    stale_samples: List[str] = []
    age_samples: List[str] = []
    for metric in metrics:
        label_pairs = []
        for label, key in (("service", "service"), ("environment", "environment"), ("metric", "metric_name")):
            safe_value = _escape_prometheus_label(redact_secrets(str(metric.get(key, ""))))
            label_pairs.append(f'{label}="{safe_value}"')
        labels = ",".join(label_pairs)

        status_value = _STATUS_TO_VALUE.get(str(metric.get("status")).upper(), 2)
        stale_value = 1 if metric.get("stale") else 0
        status_samples.append(f"tot_health_check_status{{{labels}}} {status_value}")
        stale_samples.append(f"tot_health_check_metric_stale{{{labels}}} {stale_value}")
        if metric.get("age_seconds") is not None:
            age_samples.append(f"tot_health_check_metric_age_seconds{{{labels}}} {metric['age_seconds']}")

    lines = [
        "# HELP tot_health_check_status Health check status (0=OK,1=WARNING,2=CRITICAL).",
        "# TYPE tot_health_check_status gauge",
        *status_samples,
        "# HELP tot_health_check_metric_stale 1 if the metric timestamp is stale, 0 otherwise.",
        "# TYPE tot_health_check_metric_stale gauge",
        *stale_samples,
        "# HELP tot_health_check_metric_age_seconds Age of the metric in seconds.",
        "# TYPE tot_health_check_metric_age_seconds gauge",
        *age_samples,
    ]
    return "\n".join(lines) + "\n"
"-o", help="Output file path") @@ -318,7 +473,12 @@ def main(): try: while True: results = run_health_checks(args.service, args.json) - if args.json: + results["stale_metrics"] = flag_stale_metrics( + collect_health_metrics(results), threshold=args.stale_threshold + ) + if args.prometheus: + print(format_prometheus(results, threshold=args.stale_threshold)) + elif args.json: print(json.dumps(results, indent=2)) else: print_health_report(results) @@ -327,15 +487,22 @@ def main(): print("\nMonitoring stopped") else: results = run_health_checks(args.service, args.json) - if args.json: - output = json.dumps(results, indent=2) - print(output) + results["stale_metrics"] = flag_stale_metrics( + collect_health_metrics(results), threshold=args.stale_threshold + ) + + if args.prometheus: + print(format_prometheus(results, threshold=args.stale_threshold)) + elif args.json: + print(json.dumps(results, indent=2)) else: print_health_report(results) if args.output: with open(args.output, "w") as f: - if args.json: + if args.prometheus: + f.write(format_prometheus(results, threshold=args.stale_threshold)) + elif args.json: json.dump(results, f, indent=2) else: json.dump(results, f, indent=2)