From fb9871b6607ab8fd4ce18eecbaf87e0864206aca Mon Sep 17 00:00:00 2001 From: "Marcus W." Date: Sun, 12 Oct 2025 12:27:23 +0200 Subject: [PATCH] Add optional crawl mode to SRI parser --- README.md | 10 ++++++---- docs/sri-reference.md | 12 +++++++----- scripts/sri_parser.py | 22 ++++++++++++++++------ 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6b277aa..7e0a9fc 100644 --- a/README.md +++ b/README.md @@ -155,8 +155,10 @@ Each domain receives the following DNS entries: Use `scripts/sri_parser.py` when you need a focused crawl that inventories "unsafe" Subresource Integrity implementations called out by [SecurityScorecard's guidance](https://support.securityscorecard.com/hc/en-us/articles/41067186972827-Unsafe-Implementation-of-Subresource-Integrity-SRI). -The script walks same-origin links, inspects third-party JavaScript and CSS -includes, and reports every resource that: +By default the scanner inspects only the requested page so the results mirror +SecurityScorecard's behaviour. Add the `--crawl` flag to also follow same-origin +links and inspect includes across multiple pages. In both modes it checks +third-party JavaScript and CSS includes and reports every resource that: - Omits an `integrity` attribute entirely - Supplies hashes that do not start with `sha256-`, `sha384-`, or `sha512-` @@ -168,11 +170,11 @@ The crawler also records any restrictive `Content-Security-Policy` headers so you can tell whether a compensating control is in place. 
```bash -# Human-readable output +# Human-readable output for the landing page only python scripts/sri_parser.py https://example.com # JSON report with a deeper crawl (depth 2, up to 50 pages) -python scripts/sri_parser.py https://example.com --max-depth 2 --max-pages 50 --json +python scripts/sri_parser.py https://example.com --crawl --max-depth 2 --max-pages 50 --json ``` The report lists the affected page, resource URL, integrity/crossorigin values, diff --git a/docs/sri-reference.md b/docs/sri-reference.md index bcfc23d..b4f8deb 100644 --- a/docs/sri-reference.md +++ b/docs/sri-reference.md @@ -144,11 +144,13 @@ following are true: - Third-party resources load over insecure HTTP - Cross-origin resources omit the required `crossorigin` attribute -The `scripts/sri_parser.py` helper crawls same-origin pages, inventories each -external JavaScript and CSS include, and produces a report listing every -resource that violates one or more of the checks above. The output includes the -page URL, resource URL, recorded integrity/crossorigin values, and concise -reason codes so the risky includes can be triaged quickly. +The `scripts/sri_parser.py` helper inspects the requested page by default so the +results align with SecurityScorecard's single-page scanner. Enable the +`--crawl` flag to also follow same-origin pages. In either mode the helper +inventories each external JavaScript and CSS include and produces a report +listing every resource that violates one or more of the checks above. The +output includes the page URL, resource URL, recorded integrity/crossorigin +values, and concise reason codes so the risky includes can be triaged quickly. 
### Compensating Controls diff --git a/scripts/sri_parser.py b/scripts/sri_parser.py index 3284a54..788e200 100644 --- a/scripts/sri_parser.py +++ b/scripts/sri_parser.py @@ -51,8 +51,8 @@ class SRIParser: def __init__( self, base_url: str, - max_depth: int = 1, - max_pages: int = 25, + max_depth: int = 0, + max_pages: int = 1, timeout: int = 10, user_agent: str = "SRI-Parser/1.0 (+https://github.com/security-domain/domain-security-analyzer)", ) -> None: @@ -271,12 +271,22 @@ def build_arg_parser() -> argparse.ArgumentParser: description="Crawl a site and report unsafe Subresource Integrity implementations.", ) parser.add_argument("url", help="Base URL or domain to crawl") - parser.add_argument("--max-depth", type=int, default=1, help="Maximum crawl depth (default: 1)") + parser.add_argument( + "--crawl", + action="store_true", + help="Follow same-origin links up to --max-depth/--max-pages", + ) + parser.add_argument( + "--max-depth", + type=int, + default=1, + help="Maximum crawl depth when --crawl is set (default: 1)", + ) parser.add_argument( "--max-pages", type=int, default=25, - help="Maximum number of same-origin pages to visit (default: 25)", + help="Maximum number of same-origin pages to visit when --crawl is set (default: 25)", ) parser.add_argument( "--timeout", @@ -333,8 +343,8 @@ def main() -> None: sri_parser = SRIParser( base_url=args.url, - max_depth=args.max_depth, - max_pages=args.max_pages, + max_depth=args.max_depth if args.crawl else 0, + max_pages=args.max_pages if args.crawl else 1, timeout=args.timeout, ) report = sri_parser.crawl()