From 59567cf7af8e586cb67ea91887bfce19c10b7c23 Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Wed, 22 Apr 2026 14:31:08 +0200 Subject: [PATCH 1/2] fix(parser): handle inline opam descriptions Preserve later opam fields when descriptions are inline or move to the next quoted line, while keeping existing triple-quoted fixture behavior stable across the parser and golden tests. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus Signed-off-by: Maxim Stykow --- src/parsers/opam.rs | 173 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 164 insertions(+), 9 deletions(-) diff --git a/src/parsers/opam.rs b/src/parsers/opam.rs index 70488e50f..a12283cc6 100644 --- a/src/parsers/opam.rs +++ b/src/parsers/opam.rs @@ -233,7 +233,7 @@ fn parse_opam_data(text: &str) -> OpamData { "version" => data.version = clean_value(&value), "synopsis" => data.synopsis = clean_value(&value), "description" => { - data.description = parse_multiline_string(&lines, &mut i); + data.description = parse_description_field(&lines, &mut i, &value); } "homepage" => data.homepage = clean_value(&value), "dev-repo" => data.dev_repo = clean_value(&value), @@ -301,13 +301,49 @@ fn clean_value(value: &str) -> Option { } } -/// Parse a multiline string enclosed in triple quotes -fn parse_multiline_string(lines: &[&str], i: &mut usize) -> Option { +/// Parse an OPAM description field. +/// +/// OPAM descriptions can be encoded as an inline quoted string, a quoted string +/// on the following line, or a triple-quoted multiline string. 
+fn parse_description_field(lines: &[&str], i: &mut usize, first_value: &str) -> Option { + let trimmed = first_value.trim(); + + if trimmed.is_empty() { + let next_trimmed = lines.get(*i + 1)?.trim(); + + if next_trimmed.starts_with("\"\"\"") { + *i += 1; + return parse_triple_quoted_string(lines, i, next_trimmed); + } + + if next_trimmed.starts_with('"') { + *i += 1; + return clean_value(next_trimmed); + } + + return None; + } + + if trimmed.starts_with("\"\"\"") { + return parse_triple_quoted_string(lines, i, trimmed); + } + + clean_value(trimmed) +} + +/// Parse a multiline string enclosed in triple quotes. +fn parse_triple_quoted_string(lines: &[&str], i: &mut usize, first_value: &str) -> Option { let mut result = String::new(); let mut iteration_count: usize = 0; - if let Some((_, value)) = parse_key_value(lines[*i]) { - result.push_str(value.trim_matches('"').trim()); + let first_content = first_value.trim().trim_start_matches("\"\"\""); + if let Some(end_index) = first_content.find("\"\"\"") { + let cleaned = first_content[..end_index].trim(); + return (!cleaned.is_empty()).then(|| truncate_field(cleaned.to_string())); + } + + if !first_content.trim().is_empty() { + result.push_str(first_content.trim()); } *i += 1; @@ -317,13 +353,24 @@ fn parse_multiline_string(lines: &[&str], i: &mut usize) -> Option { warn!("parse_multiline_string: exceeded MAX_ITERATION_COUNT, breaking"); break; } - let line = lines[*i]; - result.push(' '); - result.push_str(line.trim_matches('"').trim()); + let line = lines[*i].trim(); - if line.contains("\"\"\"") { + if let Some(end_index) = line.find("\"\"\"") { + let before_end = line[..end_index].trim(); + if !before_end.is_empty() { + if !result.is_empty() { + result.push(' '); + } + result.push_str(before_end); + } break; } + + let content = line.trim_matches('"').trim(); + if !result.is_empty() { + result.push(' '); + } + result.push_str(content); *i += 1; } @@ -676,6 +723,114 @@ mod tests { ); } + #[test] + fn 
test_parse_opam_keeps_fields_after_single_line_description() { + let package = parse_opam( + r#"opam-version: "2.0" +name: "dune-rpc" +version: "3.23.0" +description: "Library to connect and control a running dune instance" +maintainer: ["Jane Street Group, LLC "] +authors: ["Jane Street Group, LLC "] +license: "MIT" +homepage: "https://github.com/ocaml/dune" +bug-reports: "https://github.com/ocaml/dune/issues" +depends: [ + "dune" {>= "3.23"} + "ocamlc-loc" + "stdune" {= version} + "odoc" {with-doc} +] +dev-repo: "git+https://github.com/ocaml/dune.git" +"#, + ); + + assert_eq!(package.name.as_deref(), Some("dune-rpc")); + assert_eq!(package.version.as_deref(), Some("3.23.0")); + assert_eq!( + package.description.as_deref(), + Some("Library to connect and control a running dune instance") + ); + assert_eq!( + package.homepage_url.as_deref(), + Some("https://github.com/ocaml/dune") + ); + assert_eq!( + package.bug_tracking_url.as_deref(), + Some("https://github.com/ocaml/dune/issues") + ); + assert_eq!( + package.vcs_url.as_deref(), + Some("git+https://github.com/ocaml/dune.git") + ); + assert_eq!( + package.declared_license_expression_spdx.as_deref(), + Some("MIT") + ); + assert_eq!(package.dependencies.len(), 4); + assert_eq!( + package.dependencies[0].purl.as_deref(), + Some("pkg:opam/dune") + ); + assert_eq!( + package.dependencies[0].extracted_requirement.as_deref(), + Some(">= 3.23") + ); + assert_eq!( + package.dependencies[2].extracted_requirement.as_deref(), + Some("= version") + ); + assert_eq!( + package.dependencies[3].extracted_requirement.as_deref(), + Some("with-doc") + ); + } + + #[test] + fn test_parse_opam_keeps_fields_after_next_line_description() { + let package = parse_opam( + r#"opam-version: "2.0" +name: "chrome-trace" +version: "3.23.0" +description: + "This library offers no backwards compatibility guarantees. Use at your own risk." 
+maintainer: ["Jane Street Group, LLC "] +license: "MIT" +depends: [ + "dune" {>= "3.23"} + "ocaml" {>= "4.14"} + "odoc" {with-doc} +] +dev-repo: "git+https://github.com/ocaml/dune.git" +"#, + ); + + assert_eq!(package.name.as_deref(), Some("chrome-trace")); + assert_eq!( + package.description.as_deref(), + Some( + "This library offers no backwards compatibility guarantees. Use at your own risk." + ) + ); + assert_eq!( + package.vcs_url.as_deref(), + Some("git+https://github.com/ocaml/dune.git") + ); + assert_eq!(package.dependencies.len(), 3); + assert_eq!( + package.dependencies[1].purl.as_deref(), + Some("pkg:opam/ocaml") + ); + assert_eq!( + package.dependencies[1].extracted_requirement.as_deref(), + Some(">= 4.14") + ); + assert_eq!( + package.dependencies[2].extracted_requirement.as_deref(), + Some("with-doc") + ); + } + #[test] fn test_extract_parties() { let authors = vec!["Author One".to_string()]; From baf5a8a7b26cafc00dc2dfac1c14d75335f6e19e Mon Sep 17 00:00:00 2001 From: Maxim Stykow Date: Wed, 22 Apr 2026 14:31:57 +0200 Subject: [PATCH 2/2] docs(benchmarks): record opam verification runs Record the dune, ocaml-lsp, and merlin compare-output results, regenerate the benchmark chart summary, and mark the OCaml/opam scorecard row verified. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus Signed-off-by: Maxim Stykow --- docs/BENCHMARKS.md | 5 ++++- docs/benchmarks/scan-duration-vs-files.svg | 18 ++++++++++++++++++ .../PARSER_VERIFICATION_SCORECARD.md | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md index 0990d0923..601613c9e 100644 --- a/docs/BENCHMARKS.md +++ b/docs/BENCHMARKS.md @@ -11,7 +11,7 @@ The chart below uses a log-log scatter plot: file count on the x-axis, wall-cloc ![Scan duration vs. 
file count for Provenant and ScanCode](benchmarks/scan-duration-vs-files.svg) -> Provenant is faster on 130 of 132 recorded runs, with a **11.5× median speedup** and **9.9× geometric-mean speedup** overall; the median gap grows from **6.4×** on sub-100-file targets to **19.7×** on 10k+ file targets. +> Provenant is faster on 133 of 135 recorded runs, with a **11.7× median speedup** and **10.1× geometric-mean speedup** overall; the median gap grows from **6.4×** on sub-100-file targets to **19.7×** on 10k+ file targets. > Generated from the benchmark timing rows in this document via `cargo run --manifest-path xtask/Cargo.toml --bin generate-benchmark-chart`. ## Current benchmark examples @@ -184,6 +184,9 @@ The tables below provide the per-target detail behind the chart. Each row is one | [nix-community/dream2nix @ 69eb01f](https://github.com/nix-community/dream2nix/tree/69eb01fa0995e1e90add49d8ca5bcba213b0416f)
515 files | 2026-04-12 · dream2nix-60485 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 19.91s
ScanCode: 33.50s
**1.68× faster (-40.6%)** | Broader Nix package and dependency extraction (`53` vs `22` packages, `887` vs `843` dependencies) from committed `flake.lock` inputs and flake-compat-backed `default.nix` wrapper surfaces across the tree, with cleaner root-package visibility on repository entrypoints that ScanCode leaves unassembled | | [NixOS/nix @ 262e98f](https://github.com/NixOS/nix/tree/262e98f67e09f83393dc84c2629df84cce2fe299)
2,889 files | 2026-04-11 · nix-94957 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 21.86s
ScanCode: 104.41s
**4.78× faster (-79.1%)** | Broader Nix package and dependency extraction (`2` vs `0` packages, `67` vs `0` dependencies) from committed `flake.lock` inputs and Nix manifest surfaces across the tree, plus safer URL credential stripping and Unicode-preserving author normalization across release-note metadata | | [numtide/devshell @ 255a2b1](https://github.com/numtide/devshell/tree/255a2b1725a20d060f566e4755dbf571bbbb5f76)
84 files | 2026-04-12 · devshell-83906 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 10.57s
ScanCode: 37.57s
**3.55× faster (-71.9%)** | Broader Nix package and dependency extraction (`5` vs `0` packages, `17` vs `0` dependencies) from committed `flake.lock` inputs, root `default.nix`, and template flake surfaces, with cleaner root-package visibility on flake-compat-backed entrypoints that ScanCode leaves unassembled | +| [ocaml/dune @ b13ab94](https://github.com/ocaml/dune/tree/b13ab949e185a205a39eb6163eea050b7d60a047)
7,751 files | 2026-04-22 · dune-32635 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 20.74s
ScanCode: 519.01s
**25.02× faster (-96.0%)** | Broader opam and Nix package visibility (`4` vs `2` packages, `130` vs `116` dependencies) from the generated `opam/*.opam` manifests and `flake.lock`, with structured opam description, maintainer, and dependency recovery instead of ScanCode's field-bleeding author text on those manifests | +| [ocaml/merlin @ 30b4f24](https://github.com/ocaml/merlin/tree/30b4f24fdd76fdbf32685aac73de7fd4a6ff7470)
2,120 files | 2026-04-22 · merlin-47624 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 31.93s
ScanCode: 656.13s
**20.55× faster (-95.1%)** | Direct opam package visibility (`1` vs `0` packages) with broader dependency extraction (`27` vs `24`) from the repo-root `merlin*.opam`, `dot-merlin-reader.opam`, `ocaml-index.opam`, and `flake.lock` surfaces, plus Unicode-preserving copyright normalization across the Merlin source tree | +| [ocaml/ocaml-lsp @ 788ff73](https://github.com/ocaml/ocaml-lsp/tree/788ff738991189537141776bfa07652547bff9c4)
546 files | 2026-04-22 · ocaml-lsp-41966 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 13.83s
ScanCode: 185.33s
**13.40× faster (-92.5%)** | Broader opam package visibility (`3` vs `1` packages) with slightly richer dependency extraction (`380` vs `376`) from the root and submodule `.opam` manifests plus `flake.lock`, with cleaner maintainer and email recovery on opam metadata and Unicode-preserving copyright normalization | | [univention/Nubus @ fef2258](https://github.com/univention/Nubus/tree/fef2258483c56cce0e1f14e4c8d8fce24d26b891)
16 files | 2026-04-19 · Nubus-321 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 10 proc | Provenant: 10.53s
ScanCode: 72.03s
**6.84× faster (-85.4%)** | Direct `publiccode.yml` package visibility on the root metadata file (`1` vs `0` on that file), with cleaner SPDX copyright placeholder normalization for `Univention GmbH` and the same zero-scan-error behavior under the shared profile | | [yesodweb/yesod @ 1b033c7](https://github.com/yesodweb/yesod/tree/1b033c741ce81d01070de993b285a17e71178156)
324 files | 2026-04-17 · yesod-71400 · macOS 26.3.1 · Apple M1 Max · 32 GB · arm64 · 9 proc | Provenant: 10.62s
ScanCode: 99.03s
**9.32× faster (-89.3%)** | Broader multi-package Hackage extraction (`16` vs `0` packages, `391` vs `0` dependencies) from the repo's many sibling `yesod-*/*.cabal` manifests, with explicit package identities across the Yesod family where ScanCode stays manifest-blind | diff --git a/docs/benchmarks/scan-duration-vs-files.svg b/docs/benchmarks/scan-duration-vs-files.svg index 0c1aba840..38d4df095 100644 --- a/docs/benchmarks/scan-duration-vs-files.svg +++ b/docs/benchmarks/scan-duration-vs-files.svg @@ -191,6 +191,9 @@ ScanCode: 96.27s nix-community/dream2nix @ 69eb01f Files: 515 ScanCode: 33.50s + ocaml/ocaml-lsp @ 788ff73 +Files: 546 +ScanCode: 185.33s Alamofire/Alamofire @ ac01666 Files: 567 ScanCode: 175.16s @@ -272,6 +275,9 @@ ScanCode: 534.66s rubocop/rubocop @ 4e0d642 Files: 2081 ScanCode: 182.30s + ocaml/merlin @ 30b4f24 +Files: 2120 +ScanCode: 656.13s prefix-dev/pixi @ 6458b15 Files: 2372 ScanCode: 503.12s @@ -365,6 +371,9 @@ ScanCode: 1201.90s apache/kafka @ 0d9fe51 Files: 7179 ScanCode: 751.77s + ocaml/dune @ b13ab94 +Files: 7751 +ScanCode: 519.01s facebook/react-native @ 179e0cd Files: 7765 ScanCode: 527.81s @@ -589,6 +598,9 @@ Provenant: 13.20s nix-community/dream2nix @ 69eb01f Files: 515 Provenant: 19.91s + ocaml/ocaml-lsp @ 788ff73 +Files: 546 +Provenant: 13.83s Alamofire/Alamofire @ ac01666 Files: 567 Provenant: 14.71s @@ -670,6 +682,9 @@ Provenant: 24.26s rubocop/rubocop @ 4e0d642 Files: 2081 Provenant: 24.15s + ocaml/merlin @ 30b4f24 +Files: 2120 +Provenant: 31.93s prefix-dev/pixi @ 6458b15 Files: 2372 Provenant: 62.21s @@ -763,6 +778,9 @@ Provenant: 51.67s apache/kafka @ 0d9fe51 Files: 7179 Provenant: 53.61s + ocaml/dune @ b13ab94 +Files: 7751 +Provenant: 20.74s facebook/react-native @ 179e0cd Files: 7765 Provenant: 34.99s diff --git a/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md b/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md index ec72acd6f..42553d9e9 100644 --- 
a/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md +++ b/docs/implementation-plans/package-detection/PARSER_VERIFICATION_SCORECARD.md @@ -106,7 +106,7 @@ The ranking below is ordered by **practical verification value first**: broad ec | 32 | Linux Distro (`os-release`) | ⚪ Planned | Debian base-image rootfs snapshot
Fedora base-image rootfs snapshot
Distroless `base-debian12` rootfs snapshot | This row is rootfs-only on purpose. Debian and Fedora give conventional distro metadata layouts, while Distroless shows the minimal-image case where `os-release` may be one of the few package-identity signals present. Watch path/layout differences and do not treat intentionally sparse distroless metadata as a parser regression by itself. | | 33 | AboutCode | ⚪ Planned | `aboutcode-org/scancode-toolkit` (10k–50k files)
`aboutcode-org/scancode.io` (500–2k files)
`aboutcode-org/dejacode` (500–2k files) | Niche but very high-fit `.ABOUT` lane. `aboutcode-org/scancode-toolkit` is the broadest real-world `.ABOUT` reference, while `aboutcode-org/scancode.io` and `aboutcode-org/dejacode` provide smaller product-style contrasts. Watch `.ABOUT` extraction staying visible beside denser package, README, and license output in these application trees. | | 34 | Hex / Elixir | ⚪ Planned | `phoenixframework/phoenix` (500–2k files)
`elixir-ecto/ecto` (500–2k files)
`elixir-plug/plug` (<500 files) | Useful ecosystem, but current Rust scope is still the lockfile/static subset, so this ranks below the broader mainstream families. | -| 35 | OCaml / opam | ⚪ Planned | `ocaml/dune` (500–2k files)
`ocaml/ocaml-lsp` (500–2k files)
`ocaml/merlin` (500–2k files) | Good `opam` coverage, but lower practical verification priority than the broader ecosystems above. | +| 35 | OCaml / opam | 🟢 Verified | `ocaml/dune` (2k–10k files)
`ocaml/ocaml-lsp` (500–2k files)
`ocaml/merlin` (2k–10k files) | Good `opam` coverage, but lower practical verification priority than the broader ecosystems above. | | 36 | Buck | 🟢 Verified | `facebook/buck2` (2k–10k files)
`facebook/watchman` (500–2k files)
`facebook/react-native` (10k–50k files) | Real Buck lane, even if narrower than Bazel in practice. `facebook/buck2` is the canonical direct reference, `facebook/watchman` is a smaller focused contrast, and `facebook/react-native` adds a large mixed-language consumer tree. Watch Buck metadata separately from the rest of the monorepo so unrelated JS/native/common-profile noise does not hide actual build-metadata gaps. | | 37 | FreeBSD | ⚪ Planned | FreeBSD `pkg` package archive sample
FreeBSD `bash` package archive sample
FreeBSD `curl` package archive sample | Important artifact-family support, but narrower day-to-day scan prevalence than the higher-priority distro lanes. | | 38 | Chef | ⚪ Planned | `sous-chefs/apache2` (<500 files)
`sous-chefs/mysql` (<500 files)
`chef/chef` (2k–10k files) | Worth covering, but lower priority than the mainstream language and distro families. |