From fb9871b6607ab8fd4ce18eecbaf87e0864206aca Mon Sep 17 00:00:00 2001
From: "Marcus W." <marcus.westermark@altal.fi>
Date: Sun, 12 Oct 2025 12:27:23 +0200
Subject: [PATCH] Add optional crawl mode to SRI parser

---
 README.md             | 10 ++++++----
 docs/sri-reference.md | 12 +++++++-----
 scripts/sri_parser.py | 22 ++++++++++++++++------
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 6b277aa..7e0a9fc 100644
--- a/README.md
+++ b/README.md
@@ -155,8 +155,10 @@ Each domain receives the following DNS entries:
 Use `scripts/sri_parser.py` when you need a focused crawl that inventories
 "unsafe" Subresource Integrity implementations called out by
 [SecurityScorecard's guidance](https://support.securityscorecard.com/hc/en-us/articles/41067186972827-Unsafe-Implementation-of-Subresource-Integrity-SRI).
-The script walks same-origin links, inspects third-party JavaScript and CSS
-includes, and reports every resource that:
+By default the scanner inspects only the requested page so the results mirror
+SecurityScorecard's behaviour; add the `--crawl` flag to also follow
+same-origin links. In either mode it inspects the third-party JavaScript and
+CSS includes on every page it visits and reports every resource that:
 
 - Omits an `integrity` attribute entirely
 - Supplies hashes that do not start with `sha256-`, `sha384-`, or `sha512-`
@@ -168,11 +170,11 @@ The crawler also records any restrictive `Content-Security-Policy` headers so
 you can tell whether a compensating control is in place.
 
 ```bash
-# Human-readable output
+# Human-readable output for the landing page only
 python scripts/sri_parser.py https://example.com
 
 # JSON report with a deeper crawl (depth 2, up to 50 pages)
-python scripts/sri_parser.py https://example.com --max-depth 2 --max-pages 50 --json
+python scripts/sri_parser.py https://example.com --crawl --max-depth 2 --max-pages 50 --json
 ```
 
 The report lists the affected page, resource URL, integrity/crossorigin values,
diff --git a/docs/sri-reference.md b/docs/sri-reference.md
index bcfc23d..b4f8deb 100644
--- a/docs/sri-reference.md
+++ b/docs/sri-reference.md
@@ -144,11 +144,13 @@ following are true:
 - Third-party resources load over insecure HTTP
 - Cross-origin resources omit the required `crossorigin` attribute
 
-The `scripts/sri_parser.py` helper crawls same-origin pages, inventories each
-external JavaScript and CSS include, and produces a report listing every
-resource that violates one or more of the checks above. The output includes the
-page URL, resource URL, recorded integrity/crossorigin values, and concise
-reason codes so the risky includes can be triaged quickly.
+The `scripts/sri_parser.py` helper inspects only the requested page by default
+so the results align with SecurityScorecard's single-page scanner; enable the
+`--crawl` flag to also follow same-origin links. In either mode it inventories
+each external JavaScript and CSS include on the pages it visits and produces a
+report listing every resource that violates one or more of the checks above.
+The output includes the page URL, resource URL, recorded integrity/crossorigin
+values, and concise reason codes so the risky includes can be triaged quickly.
 
 ### Compensating Controls
 
diff --git a/scripts/sri_parser.py b/scripts/sri_parser.py
index 3284a54..788e200 100644
--- a/scripts/sri_parser.py
+++ b/scripts/sri_parser.py
@@ -51,8 +51,8 @@ class SRIParser:
     def __init__(
         self,
         base_url: str,
-        max_depth: int = 1,
-        max_pages: int = 25,
+        max_depth: int = 0,
+        max_pages: int = 1,
         timeout: int = 10,
         user_agent: str = "SRI-Parser/1.0 (+https://github.com/security-domain/domain-security-analyzer)",
     ) -> None:
@@ -271,12 +271,22 @@ def build_arg_parser() -> argparse.ArgumentParser:
         description="Crawl a site and report unsafe Subresource Integrity implementations.",
     )
     parser.add_argument("url", help="Base URL or domain to crawl")
-    parser.add_argument("--max-depth", type=int, default=1, help="Maximum crawl depth (default: 1)")
+    parser.add_argument(
+        "--crawl",
+        action="store_true",
+        help="Follow same-origin links up to --max-depth/--max-pages",
+    )
+    parser.add_argument(
+        "--max-depth",
+        type=int,
+        default=1,
+        help="Maximum crawl depth when --crawl is set (default: 1)",
+    )
     parser.add_argument(
         "--max-pages",
         type=int,
         default=25,
-        help="Maximum number of same-origin pages to visit (default: 25)",
+        help="Maximum number of same-origin pages to visit when --crawl is set (default: 25)",
     )
     parser.add_argument(
         "--timeout",
@@ -333,8 +343,8 @@ def main() -> None:
 
     sri_parser = SRIParser(
         base_url=args.url,
-        max_depth=args.max_depth,
-        max_pages=args.max_pages,
+        max_depth=args.max_depth if args.crawl else 0,
+        max_pages=args.max_pages if args.crawl else 1,
         timeout=args.timeout,
     )
     report = sri_parser.crawl()