From 0b93a7efb9a523b387985b8b6c4e248d54a4053a Mon Sep 17 00:00:00 2001
From: h1065153539-create <h1065153539@gmail.com>
Date: Sun, 21 Jun 2026 01:36:29 +0800
Subject: [PATCH] Add log aggregator redaction summary report

---
 diagnostic/build-00000000.json                |  47 +++---
 docs/OPERATIONS.md                            |  16 ++
 .../test_log_aggregator_redaction_summary.py  |  79 ++++++++++
 tools/log_aggregator.py                       | 142 ++++++++++++++++--
 4 files changed, 251 insertions(+), 33 deletions(-)
 create mode 100644 tests/test_log_aggregator_redaction_summary.py
diff --git a/diagnostic/build-00000000.json b/diagnostic/build-00000000.json
index 33e2ca62..71058f50 100644
--- a/diagnostic/build-00000000.json
+++ b/diagnostic/build-00000000.json
@@ -1,23 +1,24 @@
-{
-  "generated_at": "2026-06-16T15:23:47.496569+00:00",
-  "commit": "00000000",
-  "diagnostic_logd": "diagnostic/build-00000000.logd",
-  "diagnostic_logd_error": null,
-  "chunked": false,
-  "chunk_size_bytes": null,
-  "password": "4c7df15ab09fbb066197",
-  "decrypt_command": "encryptly unpack diagnostic/build-00000000.logd <outdir> --password 4c7df15ab09fbb066197",
-  "total_modules": 1,
-  "passed": 0,
-  "failed": 1,
-  "modules": [
-    {
-      "name": "frailbox",
-      "status": "FAIL",
-      "elapsed_seconds": 0,
-      "artifact": null,
-      "output": "Command not found: [Errno 2] No such file or directory: 'make'"
-    }
-  ],
-  "pr_note": "Include this JSON diagnostic report and diagnostic/build-00000000.logd in your PR. Maintainers may ask you to remove these diagnostic artifacts before merging."
-}
+{
+  "generated_at": "2026-06-20T17:35:52.560716+00:00",
+  "commit": "00000000",
+  "diagnostic_logd": null,
+  "diagnostic_logd_error": null,
+  "message_blocker": null,
+  "chunked": false,
+  "chunk_size_bytes": null,
+  "password": null,
+  "decrypt_command": null,
+  "total_modules": 1,
+  "passed": 0,
+  "failed": 1,
+  "modules": [
+    {
+      "name": "encryptly-preflight",
+      "status": "FAIL",
+      "elapsed_seconds": 80.625,
+      "artifact": null,
+      "output": "encryptly could not create an archive. You may have timed out; try launching it in the background and waiting for it to finish with no timeout due to a bug in encryptly. 2abc495b35f893a0c2e4"
+    }
+  ],
+  "pr_note": "Encrypted diagnostic logd artifact was not created; include this JSON report showing why. The encrypted .logd is the required diagnostic content for PR review; this JSON file is metadata. Maintainers may ask you to remove these diagnostic artifacts before merging."
+}
diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md
index 58642e7b..06a81f8b 100644
--- a/docs/OPERATIONS.md
+++ b/docs/OPERATIONS.md
@@ -87,6 +87,22 @@ Alerts are sent to PagerDuty and Slack (#ops-alerts channel).
 
 ## Incident Response
 
+### Offline Log Redaction Summary
+
+When reviewing archived logs outside the ELK stack, generate an audit summary
+alongside the normal aggregation report:
+
+```bash
+python3 tools/log_aggregator.py \
+  --input "/var/log/tent/*.log" \
+  --output log_report.json \
+  --redaction-summary redaction-summary.json
+```
+
+For a human-readable report, use `--redaction-summary-format text`. The summary
+includes processed, malformed, and redacted-field counts per source file without
+exposing raw secret-looking values.
+
 ### Severity Levels
 
 | Level | Description | Examples | Response Time |
diff --git a/tests/test_log_aggregator_redaction_summary.py b/tests/test_log_aggregator_redaction_summary.py
new file mode 100644
index 00000000..26a024da
--- /dev/null
+++ b/tests/test_log_aggregator_redaction_summary.py
@@ -0,0 +1,79 @@
+import json
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT / "tools"))
+
+from log_aggregator import LogAggregator, redact_secret_value  # noqa: E402
+
+
+class LogAggregatorRedactionSummaryTests(unittest.TestCase):
+    def test_redaction_summary_counts_valid_malformed_and_secret_fields(self):
+        with tempfile.TemporaryDirectory() as raw_dir:
+            log_path = Path(raw_dir) / "app.log"
+            log_path.write_text(
+                "\n".join(
+                    [
+                        json.dumps(
+                            {
+                                "timestamp": "2026-06-20T12:00:00",
+                                "level": "INFO",
+                                "service": "api",
+                                "message": "started",
+                                "token": "secret-token",
+                            }
+                        ),
+                        "",
+                        "2026-06-20 12:01:00 ERROR [worker] password=hunter2 failed",
+                    ]
+                )
+                + "\n",
+                encoding="utf-8",
+            )
+
+            aggregator = LogAggregator()
+            self.assertEqual(aggregator.process_file(str(log_path)), 2)
+            summary = aggregator.get_redaction_summary()
+
+            self.assertEqual(summary["processed_lines"], 3)
+            self.assertEqual(summary["parsed_entries"], 2)
+            self.assertEqual(summary["malformed_lines"], 1)
+            self.assertGreaterEqual(summary["redacted_fields"], 2)
+            self.assertEqual(summary["by_file"][0]["source_file"], "app.log")
+
+            rendered = json.dumps(aggregator.entries)
+            self.assertNotIn("secret-token", rendered)
+            self.assertNotIn("hunter2", rendered)
+
+    def test_text_and_json_redaction_summary_exports(self):
+        with tempfile.TemporaryDirectory() as raw_dir:
+            raw_path = Path(raw_dir)
+            log_path = raw_path / "service.log"
+            json_summary = raw_path / "summary.json"
+            text_summary = raw_path / "summary.txt"
+            log_path.write_text(
+                '{"level":"WARN","service":"svc","message":"authorization: Bearer abc123"}\n',
+                encoding="utf-8",
+            )
+
+            aggregator = LogAggregator()
+            aggregator.process_file(str(log_path))
+            aggregator.export_redaction_summary(str(json_summary), "json")
+            aggregator.export_redaction_summary(str(text_summary), "text")
+
+            data = json.loads(json_summary.read_text(encoding="utf-8"))
+            self.assertEqual(data["redacted_fields"], 2)
+            self.assertIn("Log Redaction Summary", text_summary.read_text(encoding="utf-8"))
+
+    def test_redact_secret_value_preserves_safe_fields(self):
+        redacted, count = redact_secret_value({"service": "api", "message": "ok"})
+        self.assertEqual(count, 0)
+        self.assertEqual(redacted, {"service": "api", "message": "ok"})
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tools/log_aggregator.py b/tools/log_aggregator.py
index c9527d30..90d1f443 100644
--- a/tools/log_aggregator.py
+++ b/tools/log_aggregator.py
@@ -46,12 +46,60 @@
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
-from typing import Any, Counter, Dict, List, Optional, Tuple
+from typing import Any, Counter, Dict, Iterable, List, Optional, Tuple
 from collections import defaultdict, Counter
 
 logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
 logger = logging.getLogger("log_aggregator")
 
+SECRET_FIELD_RE = re.compile(
+    r"(api[_-]?key|authorization|bearer|cookie|credential|password|secret|token)",
+    re.IGNORECASE,
+)
+SECRET_VALUE_RE = re.compile(
+    r"(?i)\b(api[_-]?key|authorization|bearer|password|secret|token)\b\s*[:=]?\s*(?:bearer\s+)?\S+"
+)
+
+
+def redact_secret_value(value: Any) -> Tuple[Any, int]:
+    """Redact secret-looking values while returning the number of redactions."""
+
+    if isinstance(value, dict):
+        redacted: Dict[str, Any] = {}
+        count = 0
+        for key, item in value.items():
+            if SECRET_FIELD_RE.search(str(key)):
+                redacted[str(key)] = "[REDACTED]"
+                count += 1
+            else:
+                cleaned, child_count = redact_secret_value(item)
+                redacted[str(key)] = cleaned
+                count += child_count
+        return redacted, count
+
+    if isinstance(value, list):
+        redacted_items = []
+        count = 0
+        for item in value:
+            cleaned, child_count = redact_secret_value(item)
+            redacted_items.append(cleaned)
+            count += child_count
+        return redacted_items, count
+
+    if isinstance(value, str):
+        matches = list(SECRET_VALUE_RE.finditer(value))
+        if not matches:
+            return value, 0
+        return (
+            SECRET_VALUE_RE.sub(
+                lambda match: f"{match.group(1)}=[REDACTED]",
+                value,
+            ),
+            len(matches),
+        )
+
+    return value, 0
+
 # ---------------------------------------------------------------------------
 # LOG PARSERS
 # ---------------------------------------------------------------------------
@@ -206,6 +254,12 @@ class LogAggregator:
     def __init__(self):
         self.parsers = [JSONLogParser(), TextLogParser(), NginxLogParser()]
         self.entries: List[Dict[str, Any]] = []
+        self.total_lines = 0
+        self.malformed_lines = 0
+        self.redacted_fields = 0
+        self.malformed_by_file: Counter = Counter()
+        self.redactions_by_file: Counter = Counter()
+        self.processed_by_file: Counter = Counter()
         self.level_counts: Counter = Counter()
         self.service_counts: Counter = Counter()
         self.hourly_counts: Counter = Counter()
@@ -219,12 +273,12 @@ def process_file(self, filepath: str) -> int:
             if filepath.endswith('.gz'):
                 with gzip.open(filepath, 'rt', errors='replace') as f:
                     for line in f:
-                        if self._parse_line(line):
+                        if self._parse_line(line, filepath):
                             parsed_count += 1
             else:
                 with open(filepath, 'r', errors='replace') as f:
                     for line in f:
-                        if self._parse_line(line):
+                        if self._parse_line(line, filepath):
                             parsed_count += 1
         except Exception as e:
             logger.error(f"Error processing {filepath}: {e}")
@@ -240,15 +294,31 @@ def process_directory(self, dirpath: str, pattern: str = "*.log") -> int:
             logger.debug(f"  {filepath.name}: {count} entries")
         return total
 
-    def _parse_line(self, line: str) -> bool:
+    def _parse_line(self, line: str, source_file: Optional[str] = None) -> bool:
+        self.total_lines += 1
+        source_name = Path(source_file).name if source_file else "<stream>"
+        self.processed_by_file[source_name] += 1
+
         for parser in self.parsers:
             entry = parser.parse(line)
             if entry:
+                redacted_fields, redaction_count = redact_secret_value(entry.get('fields', {}))
+                redacted_message, message_redactions = redact_secret_value(entry.get('message', ''))
+                redaction_count += message_redactions
+                entry['fields'] = redacted_fields
+                entry['message'] = redacted_message
+                entry['source_file'] = source_name
+                if redaction_count:
+                    self.redacted_fields += redaction_count
+                    self.redactions_by_file[source_name] += redaction_count
                 self.entries.append(entry)
                 ts = entry.get('timestamp')
                 if ts:
-                    hour = datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%dT%H:00')
-                    self.hourly_counts[hour] += 1
+                    try:
+                        hour = datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime('%Y-%m-%dT%H:00')
+                        self.hourly_counts[hour] += 1
+                    except (TypeError, ValueError):
+                        pass
                 level = entry.get('level', 'unknown').lower()
                 self.level_counts[level] += 1
                 service = entry.get('service', 'unknown')
@@ -260,6 +330,8 @@ def _parse_line(self, line: str) -> bool:
                     self.errors_by_service[service].append(msg)
                     self.error_patterns[msg] += 1
                 return True
+        self.malformed_lines += 1
+        self.malformed_by_file[source_name] += 1
         return False
 
     def get_summary(self) -> Dict[str, Any]:
@@ -279,8 +351,8 @@ def get_summary(self) -> Dict[str, Any]:
 
     def _get_time_range(self) -> Optional[Dict[str, str]]:
         timestamps = [
-            e['timestamp'] for e in self.entries
-            if e.get('timestamp')
+            float(e['timestamp']) for e in self.entries
+            if isinstance(e.get('timestamp'), (int, float))
         ]
         if not timestamps:
             return None
@@ -303,7 +375,7 @@ def get_error_timeline(self) -> List[Dict[str, Any]]:
             level = entry.get('level', '').lower()
             if level in ('error', 'critical'):
                 ts = entry.get('timestamp')
-                if ts:
+                if isinstance(ts, (int, float)):
                     hour = datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%dT%H:00')
                     errors_by_hour[hour] += 1
         return [
@@ -359,6 +431,50 @@ def export_json(self, output_path: str):
             }, f, indent=2, default=str)
         logger.info(f"Report exported to {output_path}")
 
+    def get_redaction_summary(self) -> Dict[str, Any]:
+        return {
+            'processed_lines': self.total_lines,
+            'parsed_entries': len(self.entries),
+            'malformed_lines': self.malformed_lines,
+            'redacted_fields': self.redacted_fields,
+            'source_files': sorted(self.processed_by_file),
+            'by_file': [
+                {
+                    'source_file': source,
+                    'processed_lines': self.processed_by_file[source],
+                    'malformed_lines': self.malformed_by_file.get(source, 0),
+                    'redacted_fields': self.redactions_by_file.get(source, 0),
+                }
+                for source in sorted(self.processed_by_file)
+            ],
+        }
+
+    def export_redaction_summary(self, output_path: str, output_format: str = "json"):
+        summary = self.get_redaction_summary()
+        if output_format == "json":
+            with open(output_path, 'w') as f:
+                json.dump(summary, f, indent=2)
+                f.write("\n")
+        else:
+            lines = [
+                "Log Redaction Summary",
+                "=====================",
+                f"processed_lines: {summary['processed_lines']}",
+                f"parsed_entries: {summary['parsed_entries']}",
+                f"malformed_lines: {summary['malformed_lines']}",
+                f"redacted_fields: {summary['redacted_fields']}",
+                "",
+                "by_file:",
+            ]
+            for item in summary['by_file']:
+                lines.append(
+                    f"  - {item['source_file']}: processed={item['processed_lines']} "
+                    f"malformed={item['malformed_lines']} redacted={item['redacted_fields']}"
+                )
+            with open(output_path, 'w') as f:
+                f.write("\n".join(lines) + "\n")
+        logger.info(f"Redaction summary exported to {output_path}")
+
     def generate_html_report(self, output_path: str):
         summary = self.get_summary()
         html = f"""<!DOCTYPE html>
@@ -410,6 +526,8 @@ def parse_args():
     parser.add_argument("--dir", help="Directory containing log files")
     parser.add_argument("--output", "-o", default="log_report.json", help="Output file path")
     parser.add_argument("--format", choices=["json", "csv", "html"], default="json", help="Output format")
+    parser.add_argument("--redaction-summary", help="Optional path for redaction and malformed-record summary")
+    parser.add_argument("--redaction-summary-format", choices=["json", "text"], default="json", help="Redaction summary output format")
     parser.add_argument("--search", help="Search for a string in logs")
     parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
     return parser.parse_args()
@@ -445,9 +563,10 @@ def main():
             print(f"  ... and {len(results) - 20} more")
 
     summary = aggregator.get_summary()
+    time_range = summary.get('time_range') or {}
     print(f"\nSummary:")
     print(f"  Total entries: {summary['total_entries']:,}")
-    print(f"  Time range: {summary.get('time_range', {}).get('start', 'N/A')} to {summary.get('time_range', {}).get('end', 'N/A')}")
+    print(f"  Time range: {time_range.get('start', 'N/A')} to {time_range.get('end', 'N/A')}")
     print(f"  Error rate: {summary.get('error_rate', 0)}%")
     print(f"  By level: {', '.join(f'{k}={v}' for k, v in summary.get('by_level', {}).items())}")
     print(f"  By service: {', '.join(f'{k}={v}' for k, v in summary.get('by_service', {}).items())}")
@@ -459,6 +578,9 @@ def main():
     else:
         aggregator.export_json(args.output)
 
+    if args.redaction_summary:
+        aggregator.export_redaction_summary(args.redaction_summary, args.redaction_summary_format)
+
     return 0