From 1cec7dd4a8f21673dce9ada28ac0d5736c4f2d04 Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Thu, 23 Apr 2026 00:38:21 +0200 Subject: [PATCH 1/3] fix(copyright): capture comment attribution authors Improve shared author extraction for comment-style attribution lines so compare-outputs can keep Bower verification repos aligned without target-specific handling. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus Signed-off-by: Maxim Stykow --- src/scanner/process/copyright.rs | 70 ++++++++++++++++++++++++++- src/scanner/process/copyright_test.rs | 21 ++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/src/scanner/process/copyright.rs b/src/scanner/process/copyright.rs index 74ce6b083..6d447734e 100644 --- a/src/scanner/process/copyright.rs +++ b/src/scanner/process/copyright.rs @@ -228,6 +228,12 @@ fn extract_comment_author_supplements(text_content: &str) -> Vec = LazyLock::new(|| { + Regex::new( + r"(?i)\b(?:written|edited|modified|updated|originally)\s+by\s+(?P[^()\n]+?)\s*\(\s*(?P(?:[^)\s]+@[^)\s]+|https?://[^)\s]+))\s*\)\s*\.?$|^(?:[#;/*!\-\s]+)?(?:[^()\n]*?\bby\s+(?P[^()\n]+?)\s*\(\s*(?P(?:[^)\s]+@[^)\s]+|https?://[^)\s]+))\s*\))\s*\.?$", + ) + .expect("valid parenthesized contact author regex") + }); static DOCKER_MAINTAINER_LABEL_RE: LazyLock = LazyLock::new(|| { Regex::new(r#"(?i)^label\s+maintainer\s*=\s*[\"']?(?P[^\"'\n]+<[^>]+>)[\"']?\s*$"#) .expect("valid docker maintainer label regex") @@ -241,21 +247,41 @@ fn extract_comment_author_supplements(text_content: &str) -> Vec Vec String { + line.trim() + .trim_end_matches("*/") + .trim_end_matches("-->") + .trim() + .to_string() +} + +fn normalize_comment_author_candidate(author: &str) -> String { + static ANGLE_URL_AUTHOR_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"^(?P[^<>]+?)\s*<\s*(?Phttps?://[^>\s]+)\s*>\s*$") + .expect("valid angle url author regex") + }); + + let trimmed = author.trim().trim_end_matches('.').trim(); + if let Some(captures) = ANGLE_URL_AUTHOR_RE.captures(trimmed) { + let name = captures + .name("name") + .map(|m| m.as_str().trim()) + .unwrap_or(trimmed); + let url = captures + .name("url") + .map(|m| m.as_str().trim_end_matches('/')) + .unwrap_or(trimmed); + return format!("{name} {url}"); + } + + trimmed.to_string() +} + +fn normalize_parenthesized_contact_author(name: &str, contact: &str) -> String { + let normalized_name = name.trim().trim_end_matches('.').trim(); + let normalized_contact = if contact.starts_with("http://") || contact.starts_with("https://") { + contact.trim_end_matches('/') + } else { + contact.trim() + }; + format!("{normalized_name} ({normalized_contact})") +} + fn has_explicit_copyright_marker(text: &str) -> bool { let lower = text.to_ascii_lowercase(); lower.contains("(c)") || lower.contains('©') || lower.contains("copr") diff --git a/src/scanner/process/copyright_test.rs b/src/scanner/process/copyright_test.rs index 47e1a1434..72858dd26 100644 --- a/src/scanner/process/copyright_test.rs +++ b/src/scanner/process/copyright_test.rs @@ -79,3 +79,24 @@ LABEL maintainer=\"Progress Chef \"\n"; ] ); } + +#[test] +fn test_extract_comment_author_supplements_handles_c_style_translator_headers() { + let text = "/* Translated by Jorge Barreiro . */\n\ +/* Written by Mathias Bynens */\n\ +/* Written by Cloudream (cloudream@gmail.com). */\n\ +/* Written by S A Sureshkumar (saskumar@live.com). */\n"; + + let authors = extract_comment_author_supplements(text); + let values: Vec<_> = authors.into_iter().map(|author| author.author).collect(); + + assert_eq!( + values, + vec![ + "Jorge Barreiro ", + "Mathias Bynens https://mathiasbynens.be", + "Cloudream (cloudream@gmail.com)", + "S A Sureshkumar (saskumar@live.com)", + ] + ); +} From 0c3088ebdd3e8a17a91e994513c75936d0e39f3e Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Thu, 23 Apr 2026 00:39:03 +0200 Subject: [PATCH 2/3] docs(scorecard): mark Bower verification complete Record the fully reviewed Bower compare-target set as verified while keeping the row notes stable and the verification narrative in benchmark artifacts instead. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus Signed-off-by: Maxim Stykow --- .../package-detection/PARSER_VERIFICATION_SCORECARD.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md b/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md index 017b76070..d393114e1 100644 --- a/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md +++ b/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md @@ -57,8 +57,6 @@ Method rules: The ranking below is ordered by **practical verification value first**: broad ecosystem prevalence, likelihood of exposing real parser-plus-license/copyright interactions under `--profile common`, and coverage breadth within the implemented family. -<<<<<<< HEAD - | Priority | Ecosystem | Status | Candidate targets | Priority and scope notes | | -------- | ------------------------------------------------------------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | 0a | Cross-cutting broad `C++` repository scans (non-parser reference) | 🟢 Verified | `boostorg/boost` (236 files)
`boostorg/json` (701 files)
`mongodb/mongo` (11k files) | There is no generic `C++` parser row. These repositories are still valuable reference targets because they exercise multiple implemented `C++`-adjacent families and package-adjacent detection in realistic trees. They complement, but do not replace, family-specific verification for Autotools, Conan, vcpkg, Bazel, and Buck. | @@ -112,7 +110,7 @@ The ranking below is ordered by **practical verification value first**: broad ec | 36 | Buck | 🟢 Verified | `facebook/buck2` (2k–10k files)
`facebook/watchman` (500–2k files)
`facebook/react-native` (10k–50k files) | Real Buck lane, even if narrower than Bazel in practice. `facebook/buck2` is the canonical direct reference, `facebook/watchman` is a smaller focused contrast, and `facebook/react-native` adds a large mixed-language consumer tree. Watch Buck metadata separately from the rest of the monorepo so unrelated JS/native/common-profile noise does not hide actual build-metadata gaps. | | 37 | FreeBSD | ⚪ Planned | FreeBSD `pkg` package archive sample
FreeBSD `bash` package archive sample
FreeBSD `curl` package archive sample | Important artifact-family support, but narrower day-to-day scan prevalence than the higher-priority distro lanes. | | 38 | Chef | 🟢 Verified | `sous-chefs/apache2` (<500 files)
`sous-chefs/mysql` (<500 files)
`chef/chef` (2k–10k files) | Worth covering, but lower priority than the mainstream language and distro families. | -| 39 | Bower | ⚪ Planned | `jquery/jquery-ui` (500–2k files)
`select2/select2` (<500 files)
`jashkenas/backbone` (<500 files) | Legacy ecosystem with ongoing value mostly for backward compatibility. | +| 39 | Bower | 🟢 Verified | `jquery/jquery-ui` (500–2k files)
`select2/select2` (<500 files)
`jashkenas/backbone` (<500 files) | Legacy ecosystem with ongoing value mostly for backward compatibility. | | 40 | Haxe | ⚪ Planned | `openfl/openfl` (500–2k files)
`HaxeFlixel/flixel` (500–2k files)
`HeapsIO/heaps` (500–2k files) | Smaller ecosystem; still useful, but lower-value than the broader mainstream families above. | | 41 | Windows Update | ⚪ Planned | `wsusscn2.cab` extracted tree
Windows cumulative update `.msu` extracted tree
Windows servicing stack update extracted tree | Artifact-oriented family with real value, but specialized and best handled after the higher-signal source/package ecosystems. | | 42 | `misc.py` recognizers | ⚪ Planned | Apache Tomcat binary release artifacts
Firefox add-on / language-pack artifacts
NSIS official installer artifacts | Broad recognizer family, but not a normal package-manager lane; treat as specialized follow-up verification. | From 18d2b452b5c47120b6197bbee96006543e80acdf Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Thu, 23 Apr 2026 00:39:31 +0200 Subject: [PATCH 3/3] docs(benchmarks): record Bower verification runs Add the reviewed jquery-ui, select2, and backbone compare snapshots to the benchmark table and regenerate the chart so the aggregate stats stay in sync with the new recorded runs. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus Signed-off-by: Maxim Stykow --- docs/BENCHMARKS.md | 5 ++++- docs/benchmarks/scan-duration-vs-files.svg | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md index 12f8d4de1..306104eca 100644 --- a/docs/BENCHMARKS.md +++ b/docs/BENCHMARKS.md @@ -11,7 +11,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc ![Scan duration vs. file count for Provenant and ScanCode](benchmarks/scan-duration-vs-files.svg) -> Provenant is faster on 142 of 144 recorded runs, with a **11.6× median speedup** and **10.1× geometric-mean speedup** overall; the median gap grows from **6.4×** on sub-100-file targets to **20.1×** on 10k+ file targets. +> Provenant is faster on 145 of 147 recorded runs, with a **11.6× median speedup** and **10.2× geometric-mean speedup** overall; the median gap grows from **6.4×** on sub-100-file targets to **20.1×** on 10k+ file targets. > Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`. ## Current benchmark examples @@ -82,12 +82,15 @@ The tables below provide the per-target detail behind the chart. Each row is one | [denoland/std @ a864f62](https://github.com/denoland/std/tree/a864f62bcc8a5f20716d2becab3cfe224a2ad810)
2,812 files | 2026-04-22 · std-31214 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 10 proc | Provenant: 16.30s
ScanCode: 394.76s
**24.22× faster (-95.9%)** | Broader Deno package visibility (`45` vs `3` packages) from the root and leaf `*/deno.json` manifests across the standard-library tree, plus concrete Cargo lock package identities on embedded Rust fixtures instead of anonymous `cargo_lock` rows, with zero top-level license-expression deltas under the shared profile | | [getsentry/self-hosted @ 8728919](https://github.com/getsentry/self-hosted/tree/8728919e080836c53724f277d4d36cc310fc5011)
129 files | 2026-04-15 · self-hosted-22209 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 12.14s
ScanCode: 78.89s
**6.50× faster (-84.6%)** | Broader mixed Docker/npm/Python package extraction (`2` vs `1` packages, `111` vs `0` dependencies) from the integration-test `package-lock.json`, `uv.lock`, and committed service Dockerfiles, plus the more specific `Apache-2.0 AND FSL-1.1-ALv2` license classification on `LICENSE.md` where ScanCode reports only `FSL-1.1-ALv2` | | [iTowns/itowns @ 08e08f5](https://github.com/iTowns/itowns/tree/08e08f512983b6f3d60d04d431b67b3c5e2e1584)
616 files | 2026-04-19 · itowns-87752 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 10 proc | Provenant: 12.53s
ScanCode: 170.19s
**13.58× faster (-92.6%)** | Direct `publiccode.yml` package visibility on the root metadata file (`1` vs `0` on that file), with matched top-level package and dependency counts elsewhere plus Unicode-preserving Potree copyright normalization and cleaner URL shaping across README and docs material | +| [jashkenas/backbone @ da75718](https://github.com/jashkenas/backbone/tree/da75718e896e52e84aa1f0411ba67fafcdcf6af3)
122 files | 2026-04-22 · backbone-8407 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc | Provenant: 11.27s
ScanCode: 104.56s
**9.28× faster (-89.2%)** | Matched Bower package and dependency coverage on the repo-root `bower.json`, with datasource-tagged Bower package identity instead of a bare purl-only row, package-level party metadata from `package.json`, and much faster same-host runtime | +| [jquery/jquery-ui @ eda7aa3](https://github.com/jquery/jquery-ui/tree/eda7aa34fa59d8f764b2164be3e3b7f14639b0db)
1,083 files | 2026-04-22 · jquery-ui-93350 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc | Provenant: 15.56s
ScanCode: 303.29s
**19.49× faster (-94.9%)** | Matched Bower package and dependency coverage on the repo-root `bower.json`, with datasource-tagged Bower package identity instead of a bare purl-only row, cleaner Unicode-preserving author normalization across locale files and vendored docs, and much faster same-host runtime | | [metabase/metabase @ 10997b1](https://github.com/metabase/metabase/tree/10997b10908414ab05773b085a56a37fcdebcd1a)
18,030 files | 2026-04-13 · metabase-21346 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 51.84s
ScanCode: 1330.92s
**25.67× faster (-96.1%)** | Broader package and dependency extraction (`8` vs `1` packages, `1436` vs `423` dependencies) from the root and driver `deps.edn` manifests plus committed `bun.lock` and `uv.lock`, with cleaner OFL font URL normalization where ScanCode preserves broken concatenated links | | [microsoft/vscode @ 0c1e100](https://github.com/microsoft/vscode/tree/0c1e100626c19724d1222c2bc4b63ba3556858a7)
14,398 files | 2026-04-12 · vscode-89240 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 58.96s
ScanCode: 1410.57s
**23.92× faster (-95.8%)** | Broader monorepo package and dependency extraction (`138` vs `1` packages, `7718` vs `1815` dependencies) from the root `package-lock.json`, many extension fixture manifests and lockfiles, and embedded Cargo/Docker metadata, plus richer named package identities where ScanCode emits generic lockfile and archive rows | | [npm/cli @ 05dbba5](https://github.com/npm/cli/tree/05dbba5b8d727ddb2c098ce0553714eae791c5f2)
6,698 files | 2026-04-09 · cli-89026 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc | Provenant: 295.10s
ScanCode: 3376.85s
**11.44× faster (-91.3%)** | Clean root npm workspace manifest coverage without ScanCode's workspace-assembly scan errors, fewer large registry-fixture JSON timeouts, and cleaner handling of duplicated private-workspace dependency exports and repeated MIT-style registry-fixture metadata noise | | [oakserver/oak @ 185baef](https://github.com/oakserver/oak/tree/185baef02551a84798000f25d3bd01c2fdfcb1ce)
103 files | 2026-04-22 · oak-39847 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 10 proc | Provenant: 12.95s
ScanCode: 115.73s
**8.94× faster (-88.8%)** | Direct Deno package visibility on the root `deno.json` (`1` vs `0` packages), plus Dockerfile package visibility on `.devcontainer/Dockerfile`, with cleaner trailing-slash URL normalization across README and docs material | | [oven-sh/bun @ 700fc11](https://github.com/oven-sh/bun/tree/700fc117a2fd01ac0201deaa6fa69c5557acb04f)
12,551 files | 2026-04-09 · bun-18972 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 43.05s
ScanCode: 849.10s
**19.72× faster (-94.9%)** | Far broader Bun/npm-family package extraction (`382` vs `29` packages, `5773` vs `323` dependencies) from the repo's 52 committed `bun.lock` / `bun.lockb` inputs that ScanCode leaves at zero, plus legacy `bun.lockb` coverage on `bench/bundle` and plainer `BSD-2-Clause` rebucketing where ScanCode uses the over-specific `BSD-2-Clause-Views` label | | [renovatebot/renovate @ 91a7213](https://github.com/renovatebot/renovate/tree/91a72131e8aefcda8f0dab7499f378f7eb41300f)
3,663 files | 2026-04-13 · renovate-30308 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 23.74s
ScanCode: 446.79s
**18.82× faster (-94.7%)** | Broader fixture-heavy package and dependency extraction (`52` vs `1` packages, `1778` vs `1485` dependencies) from committed `project.clj`, `deps.edn`, and cross-ecosystem manager fixtures, plus Leiningen package identity on `lib/modules/manager/leiningen/__fixtures__/project.clj` where ScanCode stays manifest-blind | +| [select2/select2 @ 595494a](https://github.com/select2/select2/tree/595494a72fee67b0a61c64701cbb72e3121f97b9)
704 files | 2026-04-22 · select2-925 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 4 proc | Provenant: 12.57s
ScanCode: 146.24s
**11.63× faster (-91.4%)** | Matched Bower package and dependency coverage on the repo-root `bower.json`, with datasource-tagged Bower package identity instead of a bare purl-only row, cleaner package-author normalization in `package.json`, and much faster same-host runtime | | [vercel/next.js @ 8e5a36f](https://github.com/vercel/next.js/tree/8e5a36f6347528d8968da97262f372f908897bac)
28,044 files | 2026-04-11 · next.js-35897 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 41.11s
ScanCode: 850.20s
**20.68× faster (-95.2%)** | Broader monorepo package and dependency extraction (`464` vs `249` packages, `13787` vs `12017` dependencies) from the root `pnpm-lock.yaml`, many workspace fixture subtrees, and embedded Cargo/npm metadata, plus zero scan errors where ScanCode crashes on workspace `package.json` and `pnpm-lock.yaml` inputs | | [yarnpkg/berry @ c0274d6](https://github.com/yarnpkg/berry/tree/c0274d6d7ba5939f447e78aaf16e456a00cf0bd1)
3,552 files | 2026-04-12 · berry-43600 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 23.75s
ScanCode: 194.82s
**8.20× faster (-87.8%)** | Broader dependency extraction (`2835` vs `1301`) from Berry `yarn.lock`, workspace manifests, and `.pnp.cjs`, plus cleaner workspace package assembly that avoids ScanCode's duplicated npm package rows (`204` vs `395`) and `package.json` / `yarn.lock` assembly crashes while still surfacing extra Docker and Windows package inputs committed in the tree | diff --git a/docs/benchmarks/scan-duration-vs-files.svg b/docs/benchmarks/scan-duration-vs-files.svg index 4afc8354d..ae3e81b15 100644 --- a/docs/benchmarks/scan-duration-vs-files.svg +++ b/docs/benchmarks/scan-duration-vs-files.svg @@ -134,6 +134,9 @@ ScanCode: 92.08s SwiftFiddle/swiftfiddle-web @ df09b80 Files: 109 ScanCode: 84.73s + jashkenas/backbone @ da75718 +Files: 122 +ScanCode: 104.56s getsentry/self-hosted @ 8728919 Files: 129 ScanCode: 78.89s @@ -233,6 +236,9 @@ ScanCode: 203.47s boostorg/json @ 70efd4b Files: 701 ScanCode: 150.19s + select2/select2 @ 595494a +Files: 704 +ScanCode: 146.24s tokio-rs/tokio @ 5db10f5 Files: 833 ScanCode: 62.23s @@ -254,6 +260,9 @@ ScanCode: 84.36s composer/composer @ a2bf8cb Files: 1030 ScanCode: 84.94s + jquery/jquery-ui @ eda7aa3 +Files: 1083 +ScanCode: 303.29s pointfreeco/swift-composable-architecture @ 7517cc3 Files: 1098 ScanCode: 127.50s @@ -568,6 +577,9 @@ Provenant: 10.77s SwiftFiddle/swiftfiddle-web @ df09b80 Files: 109 Provenant: 10.21s + jashkenas/backbone @ da75718 +Files: 122 +Provenant: 11.27s getsentry/self-hosted @ 8728919 Files: 129 Provenant: 12.14s @@ -667,6 +679,9 @@ Provenant: 14.37s boostorg/json @ 70efd4b Files: 701 Provenant: 32.30s + select2/select2 @ 595494a +Files: 704 +Provenant: 12.57s tokio-rs/tokio @ 5db10f5 Files: 833 Provenant: 18.81s @@ -688,6 +703,9 @@ Provenant: 20.09s composer/composer @ a2bf8cb Files: 1030 Provenant: 21.23s + jquery/jquery-ui @ eda7aa3 +Files: 1083 +Provenant: 15.56s pointfreeco/swift-composable-architecture @ 7517cc3 Files: 1098 Provenant: 10.40s