diff --git a/diagnostic/build-3774a0c8-part001.logd b/diagnostic/build-3774a0c8-part001.logd new file mode 100644 index 00000000..0003b73e Binary files /dev/null and b/diagnostic/build-3774a0c8-part001.logd differ diff --git a/diagnostic/build-3774a0c8-part002.logd b/diagnostic/build-3774a0c8-part002.logd new file mode 100644 index 00000000..8d947267 Binary files /dev/null and b/diagnostic/build-3774a0c8-part002.logd differ diff --git a/diagnostic/build-3774a0c8.json b/diagnostic/build-3774a0c8.json new file mode 100644 index 00000000..796a4bf3 --- /dev/null +++ b/diagnostic/build-3774a0c8.json @@ -0,0 +1,90 @@ +{ + "generated_at": "2026-06-21T20:24:32.837844+00:00", + "commit": "3774a0c8", + "diagnostic_logd": [ + "diagnostic/build-3774a0c8-part001.logd", + "diagnostic/build-3774a0c8-part002.logd" + ], + "diagnostic_logd_error": null, + "message_blocker": null, + "chunked": true, + "chunk_size_bytes": 41943040, + "password": "f1880266ec27193ad5e5", + "decrypt_command": "encryptly unpack diagnostic/build-3774a0c8.logd --password f1880266ec27193ad5e5", + "total_modules": 10, + "passed": 1, + "failed": 9, + "modules": [ + { + "name": "backend", + "status": "FAIL", + "elapsed_seconds": 0, + "artifact": null, + "output": "Command not found: [Errno 2] No such file or directory: 'cargo'" + }, + { + "name": "frontend", + "status": "PASS", + "elapsed_seconds": 9.151, + "artifact": "/tmp/zeroeye-work/frontend/dist", + "output": "> tent-frontend@0.0.0 build\n> tsc -b && vite build\n\nvite v6.4.3 building for production...\ntransforming...\n\u2713 100 modules transformed.\nrendering chunks...\ncomputing gzip size...\ndist/index.html 0.62 kB \u2502 gzip: 0.34 kB\ndist/assets/state-BkjSKDbY.js 8.91 kB \u2502 gzip: 3.54 kB \u2502 map: 57.15 kB\ndist/assets/vendor-CREcWLHI.js 48.93 kB \u2502 gzip: 17.25 kB \u2502 map: 481.27 kB\ndist/assets/index-CyxcoTyU.js 231.32 kB \u2502 gzip: 72.16 kB \u2502 map: 1,044.42 kB\n\u2713 built in 1.82s" + }, + { + "name": "market", + "status": 
"FAIL", + "elapsed_seconds": 0, + "artifact": null, + "output": "Command not found: [Errno 2] No such file or directory: 'go'" + }, + { + "name": "frailbox", + "status": "FAIL", + "elapsed_seconds": 0.071, + "artifact": null, + "output": "gcc -Wall -Wextra -Wpedantic -std=c2x -O2 -g -D_FORTIFY_SOURCE=3 -fstack-protector-strong -fPIE -Iinclude -MMD -MP -c src/arena.c -o build/src/arena.o\n: warning: \"_FORTIFY_SOURCE\" redefined\n: note: this is the location of the previous definition\nsrc/arena.c: In function \u2018region_alloc\u2019:\nsrc/arena.c:13:36: error: \u2018MAP_ANONYMOUS\u2019 undeclared (first use in this function)\n 13 | int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS;\n | ^~~~~~~~~~~~~\nsrc/arena.c:13:36: note: each undeclared identifier is reported only once for each function it appears in\nsrc/arena.c:17:23: error: \u2018MAP_HUGETLB\u2019 undeclared (first use in this function)\n 17 | mmap_flags |= MAP_HUGETLB;\n | ^~~~~~~~~~~\nsrc/arena.c: In function \u2018arena_contains\u2019:\nsrc/arena.c:179:17: warning: comparison of distinct pointer types lacks a cast\n 179 | ptr < (char *)region->start + region->size) {\n | ^\nmake: *** [Makefile:27: build/src/arena.o] Error 1" + }, + { + "name": "engine", + "status": "FAIL", + "elapsed_seconds": 0.065, + "artifact": null, + "output": "CMake configure failed:\n-- Configuring incomplete, errors occurred!\nCMake Error at CMakeLists.txt:1 (cmake_minimum_required):\n CMake 3.28 or higher is required. 
You are running version 3.22.1" + }, + { + "name": "compliance", + "status": "FAIL", + "elapsed_seconds": 0, + "artifact": null, + "output": "Command not found: [Errno 2] No such file or directory: 'javac'" + }, + { + "name": "v2-market-stream", + "status": "FAIL", + "elapsed_seconds": 0, + "artifact": null, + "output": "Command not found: [Errno 2] No such file or directory: 'ruby'" + }, + { + "name": "nfc-scanner", + "status": "FAIL", + "elapsed_seconds": 0, + "artifact": null, + "output": "Command not found: [Errno 2] No such file or directory: 'luac'" + }, + { + "name": "openapi-haskell", + "status": "FAIL", + "elapsed_seconds": 0, + "artifact": null, + "output": "Command not found: [Errno 2] No such file or directory: 'ghc'" + }, + { + "name": "openapi-tools", + "status": "FAIL", + "elapsed_seconds": 0, + "artifact": null, + "output": "Command not found: [Errno 2] No such file or directory: 'luac'" + } + ], + "pr_note": "Include the encrypted diagnostic logd artifact(s): diagnostic/build-3774a0c8-part001.logd, diagnostic/build-3774a0c8-part002.logd. The encrypted .logd is the required diagnostic content for PR review; this JSON file is metadata. Maintainers may ask you to remove these diagnostic artifacts before merging." +} diff --git a/tools/health_check.py b/tools/health_check.py index 5cd0a613..f3674069 100644 --- a/tools/health_check.py +++ b/tools/health_check.py @@ -3,48 +3,20 @@ Health check tool for the Tent of Trials platform. Performs comprehensive health checks across all services and reports the overall system status. - -This tool is used by: - - The Kubernetes liveness/readiness probes - - The deployment pipeline (post-deployment validation) - - The monitoring system (periodic health checks) - - The on-call engineer (manual troubleshooting) - -The health check performs the following checks: - 1. Service availability (HTTP health endpoints) - 2. Database connectivity (connection test) - 3. Redis connectivity (ping test) - 4. 
# Placeholder substituted for every credential that is detected.
_REDACTED = "***REDACTED***"

# Order matters: the whole-token patterns (Bearer credentials, JWTs) run
# first. If the key/value pattern ran first, "token: Bearer <secret>" would
# have only the word "Bearer" consumed as the value and the real secret would
# be left behind in the output.
_SENSITIVE_PATTERNS = [
    re.compile(r'Bearer [A-Za-z0-9_\-\.]{10,}'),
    re.compile(r'eyJ[A-Za-z0-9_\-]+\.eyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+'),
    re.compile(r'(password|secret|token|api[_-]?key|auth)[=:\s"]+([^"&\s]{3,})', re.IGNORECASE),
]


def _redact_match(match: "re.Match[str]") -> str:
    """Return the replacement text for one sensitive match.

    Patterns that capture a key name in group 1 keep that name so the reader
    can still tell what was removed; patterns without any capture group are
    replaced wholesale. ``lastindex`` is ``None`` when the pattern defines no
    groups, which avoids the ``IndexError`` that ``match.group(1)`` raises on
    such patterns.
    """
    key = match.group(1) if match.lastindex else None
    return f"{key}{_REDACTED}" if key else _REDACTED


def redact_secrets(value: str) -> str:
    """Return ``value`` with anything that looks like a credential masked.

    Args:
        value: Text to scrub. Non-string input is converted with ``str()``.

    Returns:
        The text with each match of ``_SENSITIVE_PATTERNS`` replaced by
        ``***REDACTED***`` (prefixed with the key name for key/value matches,
        e.g. ``password***REDACTED***``).
    """
    result = str(value)
    for pattern in _SENSITIVE_PATTERNS:
        result = pattern.sub(_redact_match, result)
    return result
# The Prometheus text exposition format defines sample timestamps as integer
# milliseconds since the Unix epoch. 1e11 interpreted as seconds is roughly the
# year 5138, while every millisecond timestamp later than March 1973 exceeds
# it, so this cutoff separates the two units without ambiguity and keeps
# second-resolution input working as before.
_MS_TIMESTAMP_CUTOFF = 1e11


def _format_age(seconds: float) -> str:
    """Render a duration in seconds using the largest unit that fits.

    Below one minute the value is shown as whole seconds ("42s"); above that
    it is shown with one decimal in minutes, hours or days ("6.7m", "2.0h",
    "1.5d").
    """
    if seconds < 60:
        return f"{seconds:.0f}s"
    if seconds < 3600:
        return f"{seconds/60:.1f}m"
    if seconds < 86400:
        return f"{seconds/3600:.1f}h"
    return f"{seconds/86400:.1f}d"


def check_prometheus_staleness(prom_url: str, threshold: Optional[int] = None) -> Tuple[str, str, List[Dict[str, Any]]]:
    """Scan a Prometheus text-format endpoint for samples with old timestamps.

    Only samples that carry an explicit timestamp are considered; lines of the
    form ``name value`` (no timestamp) are skipped because their age cannot be
    determined from the payload.

    Args:
        prom_url: URL serving metrics in the Prometheus text exposition format.
        threshold: Maximum accepted sample age in seconds. ``None`` selects
            ``PROMETHEUS_STALENESS_SECONDS``.

    Returns:
        ``(status, detail, stale)`` where ``status`` is ``"OK"`` when nothing
        is stale, ``"WARNING"`` when something is stale, and ``"CRITICAL"``
        when the oldest sample is more than twice the threshold old. ``stale``
        holds one entry per metric name (the oldest series for that name);
        the count in ``detail`` refers to all stale series. Any failure to
        fetch or parse yields ``"WARNING"`` with an explanatory detail and an
        empty list.
    """
    if threshold is None:
        threshold = PROMETHEUS_STALENESS_SECONDS
    try:
        import urllib.request
        req = urllib.request.Request(prom_url, headers={"Accept": "text/plain"})
        with urllib.request.urlopen(req, timeout=10) as resp:
            content = resp.read().decode("utf-8", errors="replace")
        now = time.time()
        stale_metrics: List[Dict[str, Any]] = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            # Split from the right so whitespace inside label values
            # (e.g. path="/a b") stays attached to the series name.
            parts = line.rsplit(None, 2)
            if len(parts) < 3:
                continue
            metric_full, raw_value, raw_ts = parts
            # An unterminated label set means the right-split cut through a
            # label value, i.e. the line had no real timestamp column.
            if "{" in metric_full and not metric_full.endswith("}"):
                continue
            try:
                value = float(raw_value)
                ts = float(raw_ts)
            except ValueError:
                continue
            if ts > _MS_TIMESTAMP_CUTOFF:
                # Spec-compliant millisecond timestamp: normalise to seconds
                # so it is comparable with time.time().
                ts /= 1000.0
            age = now - ts
            is_stale = age > threshold
            if not is_stale:
                continue
            metric_name, brace, label_body = metric_full.partition("{")
            labels = redact_secrets(label_body.rstrip("}")) if brace else ""
            stale_metrics.append({
                "metric": metric_name,
                "labels": labels,
                "value": value,
                "timestamp": ts,
                "age_seconds": round(age, 1),
                "age_readable": _format_age(age),
            })
        if not stale_metrics:
            return "OK", "No stale Prometheus metrics detected", []
        # Report a single representative (the oldest series) per metric name.
        by_name: Dict[str, Dict[str, Any]] = {}
        for entry in stale_metrics:
            key = entry["metric"]
            if key not in by_name or entry["age_seconds"] > by_name[key]["age_seconds"]:
                by_name[key] = entry
        oldest = max(entry["age_seconds"] for entry in stale_metrics)
        detail = f"{len(stale_metrics)} stale metric(s) detected, oldest {threshold/60:.0f}m+ ({_format_age(oldest)})"
        status = "CRITICAL" if oldest > threshold * 2 else "WARNING"
        return status, detail, list(by_name.values())
    except Exception as e:
        # Deliberately broad: this is a best-effort probe and must never take
        # down the rest of the health check.
        return "WARNING", f"Cannot check Prometheus staleness: {e}", []
Dict[str, Any]: +def run_health_checks( + service: Optional[str] = None, + json_output: bool = False, + prom_url: Optional[str] = None, +) -> Dict[str, Any]: results: Dict[str, Any] = { - "timestamp": datetime.now().isoformat(), + "timestamp": datetime.now(timezone.utc).isoformat(), "hostname": socket.gethostname(), "services": {}, "infrastructure": {}, "system": {}, "overall_status": "OK", } - all_ok = True - # Check services for name, config in SERVICES.items(): if service and name != service: continue - status, detail, code = check_http_service( - config["host"], config["port"], config["path"], config["timeout"] - ) - results["services"][name] = { - "status": status, - "detail": detail, - "code": code, - "endpoint": f"http://{config['host']}:{config['port']}{config['path']}", - } + status, detail, code = check_http_service(config["host"], config["port"], config["path"], config["timeout"]) + results["services"][name] = {"status": status, "detail": detail, "code": code, + "endpoint": f"http://{config['host']}:{config['port']}{config['path']}"} if status == "CRITICAL": all_ok = False - # Check infrastructure for name, config in INFRASTRUCTURE.items(): if service and name != service: continue status, detail, latency = check_tcp_port(config["host"], config["port"], config["timeout"]) - results["infrastructure"][name] = { - "status": status, - "detail": detail, - "endpoint": f"{config['host']}:{config['port']}", - } + results["infrastructure"][name] = {"status": status, "detail": detail, + "endpoint": f"{config['host']}:{config['port']}"} if status == "CRITICAL": all_ok = False - # Check system resources disk_status, disk_detail, disk_pct = check_disk_usage() results["system"]["disk"] = {"status": disk_status, "detail": disk_detail} if disk_status == "CRITICAL": @@ -255,25 +269,28 @@ def run_health_checks(service: Optional[str] = None, json_output: bool = False) load_status, load_detail, load_val = check_load_average() results["system"]["load"] = {"status": 
load_status, "detail": load_detail} - # Check certificate expiry (web services) for name, config in SERVICES.items(): if service and name != service: continue if config["port"] == 443: cert_status, cert_detail, days_left = check_certificate_expiry(config["host"]) - results["services"][name]["certificate"] = { - "status": cert_status, - "detail": cert_detail, - "days_remaining": days_left, - } + results["services"][name]["certificate"] = {"status": cert_status, "detail": cert_detail, "days_remaining": days_left} if cert_status == "CRITICAL": all_ok = False - results["overall_status"] = "OK" if all_ok else "DEGRADED" + if prom_url: + prom_status, prom_detail, stale_list = check_prometheus_staleness(prom_url) + results["system"]["prometheus_stale_metrics"] = { + "status": prom_status, "detail": prom_detail, + "staleness_threshold_seconds": PROMETHEUS_STALENESS_SECONDS, + "stale_metrics": stale_list, + } + if prom_status in ("CRITICAL", "WARNING"): + all_ok = False + results["overall_status"] = "OK" if all_ok else "DEGRADED" return results - def print_health_report(results: Dict[str, Any]): print(f"\n{'='*60}") print(f" HEALTH CHECK REPORT") @@ -281,25 +298,21 @@ def print_health_report(results: Dict[str, Any]): print(f" Time: {results['timestamp']}") print(f" Overall: {results['overall_status']}") print(f"{'='*60}") - - for category, items in [("Services", results["services"]), - ("Infrastructure", results["infrastructure"]), - ("System", results["system"])]: + for category, items in [("Services", results["services"]), ("Infrastructure", results["infrastructure"]), ("System", results["system"])]: if items: print(f"\n {category}:") for name, check in items.items(): if isinstance(check, dict) and "status" in check: status_icon = {"OK": "✓", "WARNING": "⚠", "CRITICAL": "✗"}.get(check["status"], "?") print(f" {status_icon} {name}: {check['detail']}") - else: - print(f" {name}:") - for sub_name, sub_check in check.items(): - if isinstance(sub_check, dict) and "status" in 
sub_check: - sub_icon = {"OK": "✓", "WARNING": "⚠", "CRITICAL": "✗"}.get(sub_check["status"], "?") - print(f" {sub_icon} {sub_name}: {sub_check['detail']}") + if name == "prometheus_stale_metrics" and check.get("stale_metrics"): + for m in check["stale_metrics"][:5]: + labels = f" ({m['labels']})" if m.get("labels") else "" + print(f" └─ STALE: {m['metric']}{labels} age={m['age_readable']}") + if len(check["stale_metrics"]) > 5: + print(f" └─ ... and {len(check['stale_metrics'])-5} more") print() - def parse_args(): parser = argparse.ArgumentParser(description="Health check tool") parser.add_argument("--service", "-s", help="Check specific service only") @@ -307,17 +320,16 @@ def parse_args(): parser.add_argument("--watch", "-w", action="store_true", help="Continuous monitoring") parser.add_argument("--interval", "-i", type=int, default=30, help="Check interval in seconds") parser.add_argument("--output", "-o", help="Output file path") + parser.add_argument("--prom-url", "-p", help="Prometheus metrics URL for staleness check") return parser.parse_args() - def main(): args = parse_args() - if args.watch: print(f"Continuous monitoring (interval: {args.interval}s). 
Press Ctrl+C to stop.") try: while True: - results = run_health_checks(args.service, args.json) + results = run_health_checks(args.service, args.json, args.prom_url) if args.json: print(json.dumps(results, indent=2)) else: @@ -326,26 +338,18 @@ def main(): except KeyboardInterrupt: print("\nMonitoring stopped") else: - results = run_health_checks(args.service, args.json) + results = run_health_checks(args.service, args.json, args.prom_url) if args.json: - output = json.dumps(results, indent=2) - print(output) + print(json.dumps(results, indent=2)) else: print_health_report(results) - if args.output: with open(args.output, "w") as f: - if args.json: - json.dump(results, f, indent=2) - else: - json.dump(results, f, indent=2) + json.dump(results, f, indent=2) print(f"Report saved to {args.output}") - if results["overall_status"] == "DEGRADED": return 1 - return 0 - if __name__ == "__main__": - main() + sys.exit(main())