diff --git a/README.md b/README.md index 36f1a8fec..fd90127b7 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,7 @@ provenant --json-pp scan-results.json --license --package ~/projects/my-codebase Use `-` as `FILE` to write an output stream to stdout, for example `--json-pp -`. Multiple output flags can be used in a single run, matching ScanCode CLI behavior. When using `--from-json`, you can pass multiple JSON inputs. Native directory scans also support multiple input paths, matching ScanCode's common-prefix behavior. +When you need to scan an explicit allowlist of files or directories under one root (for example PR-changed files from CI), use `--paths-file <FILE>` with one explicit scan root instead of expanding the list into positional args. Use `--incremental` for repeated scans of the same tree. After a completed scan, Provenant keeps an incremental manifest and uses it on the next run to skip unchanged files. That is useful for local iteration, CI-style reruns, and retrying after a later failed or interrupted scan. The diff --git a/docs/CLI_GUIDE.md b/docs/CLI_GUIDE.md index 76c190765..e1e5bc197 100644 --- a/docs/CLI_GUIDE.md +++ b/docs/CLI_GUIDE.md @@ -481,8 +481,55 @@ This is useful for: - scanning split source trees in one run - collecting one combined report for several directories +These native multi-input paths still follow the current common-prefix behavior. They work best when you can invoke Provenant from a cwd where the relative input paths share a usable common ancestor. + You can also pass multiple JSON inputs with `--from-json`. +### 20. "I want to scan only files matching certain patterns" + +```sh +provenant --json-pp scan.json --license /path/to/repo --include "*.rs" --include "src/**/*.toml" +``` + +Use `--include` when you want glob-style path filtering inside one scan root. 
+ +Current behavior: + +- `--include` matches file/path patterns; repeated flags are additive +- use `**` when you want recursion across directory boundaries +- plain directory-looking tokens such as `src/foo` are treated as literal path patterns, not as an implicit “scan this whole subtree” shortcut +- if you already know the exact files or directories you want, prefer `--paths-file` instead of encoding that selection indirectly through globs + +### 21. "I have an explicit list of files or directories to scan" + +```sh +provenant --json-pp scan.json --license /path/to/repo --paths-file changed-files.txt +``` + +Use this when you already have a selected path list under one known root, especially for CI and pull-request workflows where cwd cannot be the repo root. + +`--paths-file` is the preferred workflow when: + +- `git diff --name-only` or another tool already produced the changed-file list +- Provenant must run from a fixed mount location or other non-repo cwd +- you want Provenant itself, not shell `xargs`, to own the selection semantics + +Current behavior: + +- pass exactly one native scan root as the positional input +- entries in the paths file are interpreted relative to that root +- one path per line, with blank lines ignored and CRLF tolerated +- directory entries select that subtree +- missing entries are skipped with a warning +- `--paths-file -` reads the list from stdin +- `--paths-file` cannot currently be combined with `--from-json` + +Example with stdin: + +```sh +git diff --name-only --diff-filter=d origin/main...HEAD | provenant --json-pp - --license /path/to/repo --paths-file - +``` + ## Important Flag Combinations These are worth learning early because they change what the output means: @@ -497,6 +544,7 @@ These are worth learning early because they change what the output means: - `--tallies-key-files` requires `--tallies` and `--classify` - `--tallies-by-facet` requires `--facet` and `--tallies` - `--debian ` requires `--license`, 
`--copyright`, and `--license-text` +- `--paths-file <FILE>` requires exactly one native scan root and is currently native-scan only (no `--from-json`) - `--reindex` only matters when the license engine is initialized (`--license` and some `--from-json` reference-recompute flows) - `--no-license-index-cache` only matters when the license engine is initialized @@ -512,6 +560,8 @@ If you are not sure where to start, use this rule of thumb: - Want browser-friendly review? → `--html` - Want policy-aware license review? → add `--license-references`, `--filter-clues`, and optionally `--license-policy` - Want summary/tally/facet review? → add `--classify`, `--summary`, and optionally `--tallies*` / `--facet` +- Want glob-style file filtering inside one scan root? → add one or more `--include` patterns +- Want an explicit rooted list of files/directories? → use `--paths-file` - Already have JSON and only want to filter or reshape it? → `--from-json` ## Where to Go Next diff --git a/docs/MIGRATING_FROM_SCANCODE.md b/docs/MIGRATING_FROM_SCANCODE.md index 49f48b344..465f9383a 100644 --- a/docs/MIGRATING_FROM_SCANCODE.md +++ b/docs/MIGRATING_FROM_SCANCODE.md @@ -113,6 +113,26 @@ These are not random incompatibilities; they are documented behavior improvement See [Beyond-Parity Improvements](improvements/README.md) for the full index. +### 6. Path selection is split more explicitly between patterns and exact rooted paths + +If you previously relied on `--include` as a rough way to express “scan this subtree”, pay close attention to Provenant's newer split here. 
+ +- `--include` is for glob-style path filtering +- recursion should be explicit in the pattern (for example `src/**`) +- `--paths-file` is the explicit rooted workflow for “scan exactly these files or directories under this root” + +That means Provenant now prefers: + +- `--include '*.rs' --include 'src/**/*.toml'` when you mean pattern filtering +- `--paths-file changed-files.txt /path/to/repo` when you already know the exact rooted file or directory list + +This is a workflow-level difference worth knowing when you migrate existing ScanCode habits or shell wrappers. + +See also: + +- [CLI Guide](CLI_GUIDE.md) +- [CLI Workflows](improvements/cli-workflows.md) + ## Practical migration advice If you are moving an existing ScanCode workflow to Provenant: @@ -121,6 +141,7 @@ If you are moving an existing ScanCode workflow to Provenant: 2. compare outputs on one representative codebase 3. check this guide if you see a meaningful delta 4. use the exported dataset workflow if you previously customized license/rule data in a ScanCode checkout +5. 
if your old workflow used `--include` to approximate explicit path lists, consider switching that part to `--paths-file` ## Other differences worth knowing diff --git a/docs/implementation-plans/infrastructure/CLI_PLAN.md b/docs/implementation-plans/infrastructure/CLI_PLAN.md index 187a117a0..7ec13eafc 100644 --- a/docs/implementation-plans/infrastructure/CLI_PLAN.md +++ b/docs/implementation-plans/infrastructure/CLI_PLAN.md @@ -44,21 +44,22 @@ Treat this file as a maintained compatibility ledger rather than the primary use ### Invocation & Input Handling -| Flag | What it does | Status | Notes | -| ---------------------- | ------------------------------------------------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `...` | Supplies the path or paths to scan | `Done` | Native scans now support the upstream-style relative multi-input common-prefix flow, and `--from-json` still supports multiple scan files. | -| `-h, --help` | Prints CLI help | `Done` | Provided by `clap`. | -| `-V, --version` | Prints CLI version | `Done` | Provided by `clap`. | -| `-q, --quiet` | Reduces runtime output | `Done` | Matches the current quiet-mode surface. | -| `-v, --verbose` | Increases runtime path reporting | `Done` | Matches the current verbose-path surface: per-file paths on TTY, bounded progress plus per-file warning/error context on non-TTY stderr. | -| `-m, --max-depth` | Limits recursive scan depth | `Done` | `0` means no depth limit. | -| `-n, --processes` | Controls worker count | `Done` | Positive values set the worker count; `0` disables parallel file scanning; `-1` also disables timeout-backed interruption checks. | -| `--timeout` | Sets per-file processing timeout | `Done` | Wired through the scanner runtime. 
| -| `--exclude / --ignore` | Excludes files by glob pattern | `Done` | `--ignore` is the ScanCode-facing alias. | -| `--include` | Re-includes matching paths after filtering | `Done` | Native scans now apply ScanCode-style combined include/ignore path filtering before file scanning; `--from-json` applies the same path selection as a shaping step over the loaded result tree. | -| `--strip-root` | Rewrites paths relative to the scan root | `Done` | Root-resource, single-file, native multi-input, nested reference, and top-level package/dependency path projection are now handled in the final shaping pass. | -| `--full-root` | Preserves absolute/rooted output paths | `Done` | Full-root display paths now follow the ScanCode-style formatting pass, including path cleanup and field-specific projection rules. | -| `--from-json` | Loads prior scan JSON instead of rescanning input files | `Done` | Supports multiple input scans, shaping-time include/ignore filtering, root-flag reshaping per loaded scan before merge, and recomputation of followed top-level license outputs after load. | +| Flag | What it does | Status | Notes | +| ---------------------- | ------------------------------------------------------- | --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `...` | Supplies the path or paths to scan | `Done` | Native scans now support the upstream-style relative multi-input common-prefix flow, and `--from-json` still supports multiple scan files. | +| `--paths-file ` | Loads selected native scan paths from a file | `Rust-specific` | Explicit-path convenience for one rooted native scan. v1 uses one explicit scan root and root-relative entries instead of extending the common-prefix argv flow. | +| `-h, --help` | Prints CLI help | `Done` | Provided by `clap`. 
| +| `-V, --version` | Prints CLI version | `Done` | Provided by `clap`. | +| `-q, --quiet` | Reduces runtime output | `Done` | Matches the current quiet-mode surface. | +| `-v, --verbose` | Increases runtime path reporting | `Done` | Matches the current verbose-path surface: per-file paths on TTY, bounded progress plus per-file warning/error context on non-TTY stderr. | +| `-m, --max-depth` | Limits recursive scan depth | `Done` | `0` means no depth limit. | +| `-n, --processes` | Controls worker count | `Done` | Positive values set the worker count; `0` disables parallel file scanning; `-1` also disables timeout-backed interruption checks. | +| `--timeout` | Sets per-file processing timeout | `Done` | Wired through the scanner runtime. | +| `--exclude / --ignore` | Excludes files by glob pattern | `Done` | `--ignore` is the ScanCode-facing alias. | +| `--include` | Re-includes matching paths after filtering | `Done` | Native scans now apply ScanCode-style combined include/ignore path filtering before file scanning; `--from-json` applies the same path selection as a shaping step over the loaded result tree. | +| `--strip-root` | Rewrites paths relative to the scan root | `Done` | Root-resource, single-file, native multi-input, nested reference, and top-level package/dependency path projection are now handled in the final shaping pass. | +| `--full-root` | Preserves absolute/rooted output paths | `Done` | Full-root display paths now follow the ScanCode-style formatting pass, including path cleanup and field-specific projection rules. | +| `--from-json` | Loads prior scan JSON instead of rescanning input files | `Done` | Supports multiple input scans, shaping-time include/ignore filtering, root-flag reshaping per loaded scan before merge, and recomputation of followed top-level license outputs after load. 
| ### Output Formats & Result Shaping diff --git a/docs/improvements/README.md b/docs/improvements/README.md index a1337b728..2a1dc49f7 100644 --- a/docs/improvements/README.md +++ b/docs/improvements/README.md @@ -87,6 +87,7 @@ Python has unsafe patterns (code execution, DoS vulnerabilities), we use safe al | [Copyright Detection](copyright-detection.md) | 🐛 Bug Fix + 🔍 Enhanced + 🛡️ Security | Year range stops at 2039, short-year typo, French/Spanish case bugs, string-based POS tags, global mutable singleton | Year range 2099, all regex bugs fixed, type-safe enum POS tags, thread-safe `LazyLock`, and shared media metadata clues from supported images and fonts | Correct year detection, reliable i18n, compile-time safety, parallel scanning | | [Email/URL Detection](email-url-detection.md) | 🐛 Bug Fix + 🔍 Enhanced + 🛡️ Security | TLD length too strict, IPv6/private-IP issues, less explicit URL handling | Extended TLD support, robust host/IP filtering, credential stripping, and shared metadata clues from supported images and fonts | Better extraction correctness and safer metadata handling | | [License Detection](license-detection.md) | 🐛 Bug Fix + 🔍 Enhanced + 🛡️ Security + ⚡ Performance | SPDX-LID fan-out duplicates, stale whole-query cache after SPDX subtraction, bounds-only `qcontains`/`qoverlap`, zero-overlap surround merges, per-candidate rule cloning, and global mutable state | SPDX-LID deduplication, immutable whole-query snapshot for AHO, position-set-aware containment/overlap, explicit AHO extra-matchables tracking, unified `PositionSpan` enum, borrowed rule references in candidates, and thread-safe `Arc` | Correct SPDX match counts, clean architectural separation from stale-cache coupling, correct positional overlap semantics, reduced allocation overhead, and safe parallel scanning | +| [CLI Workflows](cli-workflows.md) | ✨ New Feature + 🐛 Bug Fix | Selected-file and PR-changed-file scans still depend on argv expansion, include-filter approximations, or 
cwd-sensitive multi-input behavior, and repeated full rescans still pay the full cost on unchanged trees | Added rooted `--paths-file` native-scan selection with stdin support, root-relative entries, recoverable missing-entry warnings, cwd-independent explicit-root workflows, and opt-in `--incremental` unchanged-file reuse under a shared cache root | Better PR/changed-file scanning, cleaner CI/container ergonomics, safer selected-file scope control, and faster repeated native rescans | | Cross-cutting (All Parsers) | 🛡️ Security | No DoS limits | File size + iteration limits | Protection against resource exhaustion | ## Per-Improvement Documentation @@ -148,6 +149,7 @@ Most areas with improvements have a dedicated document. Cross-cutting security h - **[copyright-detection.md](copyright-detection.md)** — 🐛 Bug Fix + 🔍 Enhanced + 🛡️ Security: Year range fix, regex typo fixes, type-safe POS tags, thread-safe design - **[email-url-detection.md](email-url-detection.md)** — 🐛 Bug Fix + 🔍 Enhanced + 🛡️ Security: Email/URL extraction hardening, stronger filtering, and EXIF/XMP-backed metadata detection - **[license-detection.md](license-detection.md)** — 🐛 Bug Fix + 🔍 Enhanced + 🛡️ Security + ⚡ Performance: SPDX-LID deduplication, position-set-aware containment/overlap, immutable whole-query snapshot for AHO, explicit extra-matchables tracking, unified PositionSpan, and borrowed rule references in candidates +- **[cli-workflows.md](cli-workflows.md)** — ✨ New Feature + 🐛 Bug Fix: rooted `--paths-file` selected-file scanning plus opt-in `--incremental` rescans with cache-root controls for repeated native workflows Related broader architecture/security docs: diff --git a/docs/improvements/cli-workflows.md b/docs/improvements/cli-workflows.md new file mode 100644 index 000000000..9657948cb --- /dev/null +++ b/docs/improvements/cli-workflows.md @@ -0,0 +1,51 @@ +# CLI Workflows + +## Type + +- ✨ New Feature + 🐛 Bug Fix + +## Python Reference Status + +- Explicit selected-file 
scans still lean on argv expansion, include filters, or cwd-sensitive multi-input behavior. +- The upstream issue history shows the same pain point recurring for pull-request and changed-file workflows, especially when tooling must run from a fixed location instead of the repository root. +- There is still no settled first-class rooted path-list input that lets users say “scan this one tree, but only these listed files/directories” without shell glue. + +This document is the landing zone for stable, user-visible CLI workflow improvements that go beyond the Python reference implementation. + +## Improvement 1: Rooted selected-path scanning + +### Rust Improvements + +- Added `--paths-file <FILE>` for native scans under one explicit root. +- Entries are interpreted relative to that root instead of relative to the process cwd. +- `--paths-file -` reads the selected path list from stdin, so `git diff --name-only ... | provenant ... --paths-file -` works directly. +- Blank lines are ignored, CRLF line endings are tolerated, and directory entries keep selecting descendant files through the existing rooted include/filter pipeline. +- Missing entries are skipped with warnings instead of silently widening the scan scope. +- The resolved warnings flow into both terminal output and structured header warnings so automated consumers can see the same recoverable issue summary. + +### Impact + +- Pull-request and changed-file scans no longer depend on `xargs` or cwd-sensitive positional-argument workarounds. +- CI/container workflows can run Provenant from a fixed mount location while still scanning a repository elsewhere through one explicit root. +- Selected-file workflows stay within the existing single-root scan and shaping model, so output remains one coherent rooted tree instead of an ad hoc multi-root result. 
+ +## Improvement 2: Incremental rescans + +### Python Reference Status + +- The Python reference does not offer native unchanged-file reuse across repeated scans of the same tree. +- Re-running a large scan means paying the full collection and file-processing cost again, even when only a small subset changed. + +### Rust Improvements + +- Added opt-in `--incremental` reuse for repeated native scans of the same rooted tree. +- Provenant persists an incremental manifest under the shared cache root and reuses unchanged file results after validating stored metadata and SHA-256 against the previous completed scan. +- `--cache-dir` and `PROVENANT_CACHE` let users choose that shared cache root explicitly. +- `--cache-clear` clears the shared incremental and license-index cache state before a run without changing the scan contract. +- Incremental reuse stays separate from `--from-json`: replay reshapes an existing result, while `--incremental` accelerates fresh native rescans. + +### Impact + +- Repeated local and CI reruns can skip unchanged work instead of rescanning everything from scratch. +- Cache-root controls make the workflow usable in containerized and shared-runner environments. +- Users get a beyond-parity repeated-scan workflow without changing the output model or scan semantics. diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 885862d0c..e787a73d3 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -246,9 +246,14 @@ pub struct Cli { #[arg(long = "exclude", visible_alias = "ignore", value_delimiter = ',')] pub exclude: Vec, + /// Include files matching PATTERN. Use `**` when you want recursion across directories. #[arg(long, value_delimiter = ',')] pub include: Vec, + /// Read selected scan paths from FILE (or '-' for stdin), relative to the explicit scan root. 
+ #[arg(long = "paths-file", value_name = "FILE", allow_hyphen_values = true)] + pub paths_file: Vec, + #[arg(long = "cache-dir", value_name = "PATH")] pub cache_dir: Option, @@ -711,6 +716,7 @@ impl Cli { self.package_in_compiled, ); push_bool_option(&mut flags, "--package-only", self.package_only); + push_array_option(&mut flags, "--paths-file", &self.paths_file); push_non_default_process_mode_option( &mut flags, "--processes", @@ -921,6 +927,8 @@ mod tests { "--license", "--package", "--strip-root", + "--paths-file", + "changed-files.txt", "--ignore", "*.git*", "--ignore", @@ -943,6 +951,12 @@ mod tests { ); assert_eq!(options.get("--license"), Some(&JsonValue::Bool(true))); assert_eq!(options.get("--package"), Some(&JsonValue::Bool(true))); + assert_eq!( + options.get("--paths-file"), + Some(&JsonValue::Array(vec![JsonValue::String( + "changed-files.txt".to_string() + )])) + ); assert_eq!(options.get("--strip-root"), Some(&JsonValue::Bool(true))); assert_eq!( options.get("--ignore"), @@ -1168,6 +1182,23 @@ mod tests { assert!(parsed.filter_clues); } + #[test] + fn test_parses_repeated_paths_file_flags_including_stdin_dash() { + let parsed = Cli::try_parse_from([ + "provenant", + "--json-pp", + "scan.json", + "--paths-file", + "changed-files.txt", + "--paths-file", + "-", + "samples", + ]) + .expect("cli parse should accept repeated --paths-file flags"); + + assert_eq!(parsed.paths_file, vec!["changed-files.txt", "-"]); + } + #[test] fn test_parses_ignore_author_and_holder_filters() { let parsed = Cli::try_parse_from([ diff --git a/src/cli/run/mod.rs b/src/cli/run/mod.rs index 239588240..f73101ac1 100644 --- a/src/cli/run/mod.rs +++ b/src/cli/run/mod.rs @@ -20,11 +20,12 @@ use crate::post_processing::{ }; use crate::progress::{ProgressMode, ScanProgress, format_default_scan_error}; use crate::scan_result_shaping::{ - apply_cli_path_selection_filter, apply_ignore_resource_filter, apply_mark_source, + SelectedPath, apply_cli_path_selection_filter, 
apply_ignore_resource_filter, apply_mark_source, apply_only_findings_filter, apply_user_path_filters_to_collected, filter_redundant_clues, filter_redundant_clues_with_rules, load_and_merge_json_inputs, normalize_paths, normalize_top_level_output_paths, populate_info_resource_counts, - prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, trim_preloaded_assembly_to_files, + prepare_filter_clue_rule_lookup, resolve_native_scan_inputs, resolve_paths_file_entries, + trim_preloaded_assembly_to_files, }; use crate::scanner::{ LicenseScanOptions, TextDetectionOptions, collect_paths, process_collected_with_memory_limit, @@ -39,6 +40,7 @@ use regex::Regex; use std::collections::{BTreeMap, HashMap}; use std::env; use std::fs; +use std::io::Read; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Instant; @@ -94,6 +96,7 @@ pub fn run() -> Result<()> { preloaded_license_references, preloaded_license_rule_references, preloaded_extra_errors, + extra_warnings, imported_spdx_license_list_version, imported_license_index_provenance, mut active_license_engine, @@ -126,14 +129,18 @@ pub fn run() -> Result<()> { license_references, license_rule_references, extra_errors, + Vec::new(), imported_spdx_license_list_version, imported_license_index_provenance, None, ) } else { - let (scan_path, native_input_includes) = resolve_native_scan_inputs(&cli.dir_path)?; - let mut native_include_patterns = cli.include.clone(); - native_include_patterns.extend(native_input_includes); + let (scan_path, selected_paths, missing_paths_file_entries) = + resolve_native_scan_selection(&cli)?; + let paths_file_warnings = build_paths_file_warning_messages(&missing_paths_file_entries); + for warning in &paths_file_warnings { + progress.output_written(warning); + } let cache_config = prepare_cache_config(Some(Path::new(&scan_path)), &cli)?; shared_license_cache_config = Some(build_license_cache_config(&cache_config, &cli)); @@ -145,7 +152,8 @@ pub fn run() -> Result<()> { let 
user_excluded_count = apply_user_path_filters_to_collected( &mut collected, Path::new(&scan_path), - &native_include_patterns, + &selected_paths, + &cli.include, &cli.exclude, ); let total_files = collected.file_count(); @@ -309,6 +317,7 @@ pub fn run() -> Result<()> { Vec::new(), Vec::new(), runtime_errors, + paths_file_warnings, None, None, license_engine, @@ -557,7 +566,7 @@ pub fn run() -> Result<()> { spdx_license_list_version, license_index_provenance, extra_errors, - extra_warnings: Vec::new(), + extra_warnings, header_options: cli.output_header_options(), options: CreateOutputOptions { facet_rules: &facet_rules, @@ -642,15 +651,67 @@ fn touch_license_golden_symbols() { let _ = crate::license_detection::LicenseDetectionEngine::detect_matches_with_kind; } +fn resolve_native_scan_selection(cli: &Cli) -> Result<(String, Vec, Vec)> { + if cli.paths_file.is_empty() { + let (scan_path, selected_paths) = resolve_native_scan_inputs(&cli.dir_path)?; + return Ok((scan_path, selected_paths, Vec::new())); + } + + let scan_path = cli + .dir_path + .first() + .cloned() + .ok_or_else(|| anyhow!("--paths-file requires one positional scan root"))?; + let path_file_entries = load_paths_file_entries(&cli.paths_file)?; + let resolved = resolve_paths_file_entries(Path::new(&scan_path), &path_file_entries)?; + if resolved.selections.is_empty() { + return Err(anyhow!( + "--paths-file did not resolve to any existing files or directories under {:?}", + Path::new(&scan_path) + )); + } + + Ok((scan_path, resolved.selections, resolved.missing_entries)) +} + +fn load_paths_file_entries(paths_files: &[String]) -> Result> { + let mut entries = Vec::new(); + for paths_file in paths_files { + let content = read_paths_file_content(paths_file)?; + entries.extend(content.lines().map(ToOwned::to_owned)); + } + Ok(entries) +} + +fn read_paths_file_content(paths_file: &str) -> Result { + if paths_file == "-" { + let mut content = String::new(); + std::io::stdin() + .read_to_string(&mut content) + 
.map_err(|err| anyhow!("Failed to read --paths-file from stdin: {err}"))?; + return Ok(content); + } + + fs::read_to_string(paths_file) + .map_err(|err| anyhow!("Failed to read --paths-file {:?}: {err}", paths_file)) +} + +fn build_paths_file_warning_messages(missing_entries: &[String]) -> Vec { + missing_entries + .iter() + .map(|entry| format!("Skipping missing --paths-file entry: {entry}")) + .collect() +} + fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> { if cli.show_attribution { return Ok(()); } if cli.export_license_dataset.is_some() { - if !cli.dir_path.is_empty() { + if !cli.dir_path.is_empty() || !cli.paths_file.is_empty() { return Err(anyhow!( - "--export-license-dataset does not accept scan input paths" + "--export-license-dataset does not accept scan input paths or --paths-file" )); } @@ -693,12 +754,24 @@ fn validate_scan_option_compatibility(cli: &Cli) -> Result<()> { )); } + if cli.from_json && !cli.paths_file.is_empty() { + return Err(anyhow!( + "--paths-file is only supported for native scan mode, not --from-json" + )); + } + if cli.from_json && cli.incremental { return Err(anyhow!( "--incremental is only supported for directory scan mode, not --from-json" )); } + if !cli.paths_file.is_empty() && cli.dir_path.len() != 1 { + return Err(anyhow!( + "--paths-file requires exactly one positional scan root" + )); + } + if !cli.from_json && cli.dir_path.is_empty() { return Err(anyhow!("Directory path is required for scan operations")); } diff --git a/src/cli/run/tests.rs b/src/cli/run/tests.rs index 74b6da431..0f3a4a2d8 100644 --- a/src/cli/run/tests.rs +++ b/src/cli/run/tests.rs @@ -330,6 +330,46 @@ fn validate_scan_option_compatibility_allows_multiple_paths_without_from_json() assert!(validate_scan_option_compatibility(&cli).is_ok()); } +#[test] +fn validate_scan_option_compatibility_rejects_paths_file_with_from_json() { + let cli = crate::cli::Cli::try_parse_from([ + "provenant", + "--json-pp", + "scan.json", + "--from-json", + 
"--paths-file", + "changed-files.txt", + "sample-scan.json", + ]) + .unwrap(); + + let error = validate_scan_option_compatibility(&cli).unwrap_err(); + assert!( + error + .to_string() + .contains("--paths-file is only supported for native scan mode") + ); +} + +#[test] +fn validate_scan_option_compatibility_rejects_paths_file_without_single_root() { + let cli = crate::cli::Cli::try_parse_from([ + "provenant", + "--json-pp", + "scan.json", + "--paths-file", + "changed-files.txt", + ]) + .unwrap(); + + let error = validate_scan_option_compatibility(&cli).unwrap_err(); + assert!( + error + .to_string() + .contains("--paths-file requires exactly one positional scan root") + ); +} + #[test] fn validate_scan_option_compatibility_rejects_mark_source_without_info() { let mut cli = @@ -1125,3 +1165,90 @@ fn build_collection_exclude_patterns_skips_vcs_metadata_directories() { assert_eq!(collected.file_count(), 1); assert!(collected.excluded_count >= 3); } + +#[test] +fn resolve_native_scan_selection_uses_paths_file_under_explicit_root() { + let temp_dir = tempfile::TempDir::new().expect("create temp dir"); + let scan_root = temp_dir.path().join("repo"); + fs::create_dir_all(scan_root.join("src")).expect("create src dir"); + fs::create_dir_all(scan_root.join("docs")).expect("create docs dir"); + fs::write(scan_root.join("src/lib.rs"), "pub fn demo() {}\n").expect("write lib"); + fs::write(scan_root.join("docs/guide.md"), "# guide\n").expect("write guide"); + + let paths_file_a = temp_dir.path().join("changed-a.txt"); + let paths_file_b = temp_dir.path().join("changed-b.txt"); + fs::write(&paths_file_a, "src/lib.rs\r\nmissing.rs\n").expect("write first paths file"); + fs::write(&paths_file_b, "docs\nsrc/lib.rs\n").expect("write second paths file"); + + let other_cwd = tempfile::TempDir::new().expect("create alternate cwd"); + let old_cwd = std::env::current_dir().expect("current dir"); + std::env::set_current_dir(other_cwd.path()).expect("set cwd"); + + let cli = 
crate::cli::Cli::try_parse_from([ + "provenant", + "--json-pp", + "scan.json", + "--paths-file", + paths_file_a.to_str().expect("utf-8 path"), + "--paths-file", + paths_file_b.to_str().expect("utf-8 path"), + scan_root.to_str().expect("utf-8 path"), + ]) + .expect("cli parse should succeed"); + + let result = resolve_native_scan_selection(&cli); + + std::env::set_current_dir(old_cwd).expect("restore cwd"); + + let (resolved_root, includes, missing_entries) = + result.expect("paths file selection should resolve"); + assert_eq!(resolved_root, scan_root.to_str().expect("utf-8 path")); + assert_eq!( + includes, + vec![ + crate::scan_result_shaping::SelectedPath::Exact("src/lib.rs".to_string()), + crate::scan_result_shaping::SelectedPath::Subtree("docs".to_string()) + ] + ); + assert_eq!(missing_entries, vec!["missing.rs"]); +} + +#[test] +fn resolve_native_scan_selection_errors_when_paths_file_keeps_no_existing_entries() { + let temp_dir = tempfile::TempDir::new().expect("create temp dir"); + let scan_root = temp_dir.path().join("repo"); + fs::create_dir_all(&scan_root).expect("create scan root"); + let paths_file = temp_dir.path().join("changed.txt"); + fs::write(&paths_file, "missing.rs\n").expect("write paths file"); + + let cli = crate::cli::Cli::try_parse_from([ + "provenant", + "--json-pp", + "scan.json", + "--paths-file", + paths_file.to_str().expect("utf-8 path"), + scan_root.to_str().expect("utf-8 path"), + ]) + .expect("cli parse should succeed"); + + let error = resolve_native_scan_selection(&cli).expect_err("selection should fail"); + assert!( + error + .to_string() + .contains("did not resolve to any existing files or directories") + ); +} + +#[test] +fn build_paths_file_warning_messages_formats_missing_entries_for_headers() { + let warnings = + build_paths_file_warning_messages(&["missing.rs".to_string(), "docs/guide.md".to_string()]); + + assert_eq!( + warnings, + vec![ + "Skipping missing --paths-file entry: missing.rs".to_string(), + "Skipping missing 
--paths-file entry: docs/guide.md".to_string(), + ] + ); +} diff --git a/src/scan_result_shaping/mod.rs b/src/scan_result_shaping/mod.rs index 8377910e4..9667d7af2 100644 --- a/src/scan_result_shaping/mod.rs +++ b/src/scan_result_shaping/mod.rs @@ -22,8 +22,8 @@ use anyhow::Result; pub(crate) use json_input::load_and_merge_json_inputs; pub(crate) use selection::{ - apply_cli_path_selection_filter, apply_user_path_filters_to_collected, - resolve_native_scan_inputs, + SelectedPath, apply_cli_path_selection_filter, apply_user_path_filters_to_collected, + resolve_native_scan_inputs, resolve_paths_file_entries, }; fn retain_matching_files_with_ancestor_dirs(files: &mut Vec, mut keep_file: F) diff --git a/src/scan_result_shaping/selection.rs b/src/scan_result_shaping/selection.rs index 12bda6db6..bc143df8f 100644 --- a/src/scan_result_shaping/selection.rs +++ b/src/scan_result_shaping/selection.rs @@ -4,6 +4,8 @@ use anyhow::{Result, anyhow}; use glob::Pattern; use std::collections::HashSet; +use std::ffi::OsString; +use std::fs; use std::path::{Path, PathBuf}; use crate::models::FileInfo; @@ -15,7 +17,13 @@ use super::apply_path_selection_filter; #[path = "selection_test.rs"] mod selection_test; -pub(crate) fn resolve_native_scan_inputs(inputs: &[String]) -> Result<(String, Vec)> { +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum SelectedPath { + Exact(String), + Subtree(String), +} + +pub(crate) fn resolve_native_scan_inputs(inputs: &[String]) -> Result<(String, Vec)> { if inputs.is_empty() { return Err(anyhow!("No directory input path provided")); } @@ -42,12 +50,126 @@ pub(crate) fn resolve_native_scan_inputs(inputs: &[String]) -> Result<(String, V let synthetic_includes = inputs .iter() - .map(|path| path.replace('\\', "/").trim_end_matches('/').to_string()) + .map(|path| build_selected_path(path, Path::new(path).is_dir())) .collect(); Ok((common_prefix, synthetic_includes)) } +#[derive(Debug)] +pub(crate) struct ResolvedPathsFileEntries { + pub selections: 
Vec<SelectedPath>, + pub missing_entries: Vec<String>, +} + +pub(crate) fn resolve_paths_file_entries( + scan_root: &Path, + entries: &[String], +) -> Result<ResolvedPathsFileEntries> { + let root_metadata = fs::metadata(scan_root).map_err(|err| { + anyhow!( + "Failed to access scan root {:?} for --paths-file: {err}", + scan_root + ) + })?; + if !root_metadata.is_dir() { + return Err(anyhow!( + "--paths-file requires the positional scan root to be a directory: {:?}", + scan_root + )); + } + + let mut selections = Vec::new(); + let mut missing_entries = Vec::new(); + let mut seen = HashSet::new(); + + for entry in entries { + let Some(normalized) = normalize_paths_file_entry(entry)? else { + continue; + }; + + let absolute = scan_root.join(&normalized); + if absolute.exists() { + let selection = build_selected_path(&normalized, absolute.is_dir()); + if seen.insert(selection_cache_key(&selection)) { + selections.push(selection); + } + } else if seen.insert(format!("missing:{normalized}")) { + missing_entries.push(normalized); + } + } + + Ok(ResolvedPathsFileEntries { + selections, + missing_entries, + }) +} + +fn build_selected_path(path: &str, is_directory: bool) -> SelectedPath { + let normalized = normalize_match_input(path); + if is_directory { + SelectedPath::Subtree(normalized) + } else { + SelectedPath::Exact(normalized) + } +} + +fn selection_cache_key(selection: &SelectedPath) -> String { + match selection { + SelectedPath::Exact(path) => format!("exact:{path}"), + SelectedPath::Subtree(path) => format!("subtree:{path}"), + } +} + +fn normalize_paths_file_entry(entry: &str) -> Result<Option<String>> { + let entry = entry.trim_end_matches('\r'); + if entry.trim().is_empty() { + return Ok(None); + } + + let path = Path::new(entry); + if path.is_absolute() { + return Err(anyhow!( + "--paths-file entries must be relative to the declared scan root: {entry:?}" + )); + } + + let mut normalized = PathBuf::new(); + for component in path.components() { + match component { + std::path::Component::CurDir => {} +
std::path::Component::Normal(segment) => normalized.push(segment), + std::path::Component::ParentDir => { + if !normalized.pop() { + return Err(anyhow!( + "--paths-file entry escapes the declared scan root: {entry:?}" + )); + } + } + std::path::Component::RootDir | std::path::Component::Prefix(_) => { + return Err(anyhow!( + "--paths-file entries must be relative to the declared scan root: {entry:?}" + )); + } + } + } + + if normalized.as_os_str().is_empty() { + return Err(anyhow!( + "--paths-file entries must name a file or directory under the declared scan root: {entry:?}" + )); + } + + let normalized = normalized + .components() + .map(|component| OsString::from(component.as_os_str())) + .collect::<PathBuf>() + .to_string_lossy() + .replace('\\', "/"); + + Ok(Some(normalized)) +} + pub(crate) fn common_path_prefix(inputs: &[String]) -> Option { let first = inputs.first()?; let mut shared_components: Vec<_> = Path::new(first).components().collect(); @@ -79,6 +201,7 @@ pub(crate) fn common_path_prefix(inputs: &[String]) -> Option { pub(crate) fn apply_user_path_filters_to_collected( collected: &mut CollectedPaths, scan_root: &Path, + selected_paths: &[SelectedPath], include_patterns: &[String], exclude_patterns: &[String], ) -> usize { @@ -86,7 +209,8 @@ pub(crate) fn apply_user_path_filters_to_collected( let before_dirs = collected.directories.len(); collected.files.retain(|(path, _)| { let relative_path = normalize_scan_relative_path(path, scan_root); - is_included_path(&relative_path, include_patterns, exclude_patterns) + matches_selected_path(&relative_path, selected_paths) + && is_included_path(&relative_path, include_patterns, exclude_patterns) }); let kept_file_paths: HashSet<_> = collected @@ -96,7 +220,8 @@ pub(crate) fn apply_user_path_filters_to_collected( .collect(); collected.directories.retain(|(path, _)| { let relative_path = normalize_scan_relative_path(path, scan_root); - is_included_path(&relative_path, include_patterns, exclude_patterns) +
(matches_selected_path(&relative_path, selected_paths) + && is_included_path(&relative_path, include_patterns, exclude_patterns)) || kept_file_paths .iter() .any(|file_path| file_path.starts_with(path)) @@ -145,8 +270,8 @@ pub(crate) fn is_included_path( return false; } - let normalized_path = path.replace('\\', "/").to_ascii_lowercase(); - let stripped_path = normalized_path.trim_start_matches(['/', '0']).to_string(); + let normalized_path = normalize_match_input(path); + let stripped_path = normalized_path.trim_start_matches('/').to_string(); if !include_patterns.is_empty() && !include_patterns @@ -174,35 +299,40 @@ fn path_matches_scancode_pattern( }; if !normalized_pattern.contains('/') { - stripped_path - .split('/') + let basename = stripped_path + .rsplit('/') + .next() .filter(|segment| !segment.is_empty()) - .any(|segment| compiled.matches(segment)) + .unwrap_or(stripped_path); + compiled.matches(basename) } else { - matching_path_candidates(normalized_path, stripped_path) - .iter() + [normalized_path, stripped_path] + .into_iter() + .filter(|candidate| !candidate.is_empty()) .any(|candidate| compiled.matches(candidate)) } } -fn matching_path_candidates<'a>(normalized_path: &'a str, stripped_path: &'a str) -> Vec<&'a str> { - let mut candidates = Vec::new(); - - for path in [normalized_path, stripped_path] { - if path.is_empty() { - continue; - } +fn matches_selected_path(path: &str, selected_paths: &[SelectedPath]) -> bool { + if selected_paths.is_empty() { + return true; + } - candidates.push(path); - let mut current = path; - while let Some((parent, _)) = current.rsplit_once('/') { - if parent.is_empty() { - break; - } - candidates.push(parent); - current = parent; + let normalized_path = normalize_match_input(path); + selected_paths.iter().any(|selection| match selection { + SelectedPath::Exact(exact) => normalized_path == *exact, + SelectedPath::Subtree(root) => { + normalized_path == *root + || normalized_path + .strip_prefix(root) + 
.is_some_and(|suffix| suffix.starts_with('/')) } - } + }) +} - candidates +fn normalize_match_input(path: &str) -> String { + path.replace('\\', "/") + .trim_start_matches('/') + .trim_end_matches('/') + .to_ascii_lowercase() } diff --git a/src/scan_result_shaping/selection_test.rs b/src/scan_result_shaping/selection_test.rs index d1af2f345..1754a40c3 100644 --- a/src/scan_result_shaping/selection_test.rs +++ b/src/scan_result_shaping/selection_test.rs @@ -62,6 +62,7 @@ fn apply_user_path_filters_to_collected_filters_files_without_pruning_directorie let removed = apply_user_path_filters_to_collected( &mut collected, &scan_root, + &[] as &[SelectedPath], &["*.doc".to_string()], &[], ); @@ -105,7 +106,13 @@ fn apply_user_path_filters_to_collected_keeps_single_file_root_input() { collection_errors: Vec::new(), }; - let removed = apply_user_path_filters_to_collected(&mut collected, &scan_root, &[], &[]); + let removed = apply_user_path_filters_to_collected( + &mut collected, + &scan_root, + &[] as &[SelectedPath], + &[], + &[], + ); assert_eq!(removed, 0); assert_eq!(collected.files.len(), 1); @@ -116,8 +123,8 @@ fn apply_user_path_filters_to_collected_keeps_single_file_root_input() { } #[test] -fn is_included_path_treats_directory_include_patterns_recursively() { - assert!(is_included_path( +fn is_included_path_does_not_recurse_on_bare_directory_patterns() { + assert!(!is_included_path( "src/foo/bar/baz.txt", &["src/foo".to_string()], &[] @@ -129,14 +136,50 @@ fn is_included_path_treats_directory_include_patterns_recursively() { )); } +#[test] +fn is_included_path_requires_explicit_recursive_wildcard_for_subtrees() { + assert!(is_included_path( + "src/foo/bar/baz.txt", + &["src/foo/**".to_string()], + &[] + )); + assert!(is_included_path( + "src/foo/file.txt", + &["src/foo/**".to_string()], + &[] + )); + assert!(!is_included_path( + "src/other/file.txt", + &["src/foo/**".to_string()], + &[] + )); +} + #[test] fn 
resolve_native_scan_inputs_builds_common_prefix_and_synthetic_includes() { - let (scan_root, includes) = - resolve_native_scan_inputs(&["src/foo".to_string(), "src/bar/baz".to_string()]) - .expect("multiple relative inputs should resolve"); + let temp_dir = tempfile::tempdir().expect("tempdir"); + let parent = temp_dir.path().join("src"); + fs::create_dir_all(parent.join("foo")).expect("create foo dir"); + fs::create_dir_all(parent.join("bar")).expect("create bar dir"); + fs::write(parent.join("bar/baz"), "data\n").expect("write baz file"); + + let old_cwd = std::env::current_dir().expect("current dir"); + std::env::set_current_dir(temp_dir.path()).expect("set cwd"); + + let result = resolve_native_scan_inputs(&["src/foo".to_string(), "src/bar/baz".to_string()]); + + std::env::set_current_dir(old_cwd).expect("restore cwd"); + + let (scan_root, includes) = result.expect("multiple relative inputs should resolve"); assert_eq!(scan_root, "src"); - assert_eq!(includes, vec!["src/foo", "src/bar/baz"]); + assert_eq!( + includes, + vec![ + SelectedPath::Subtree("src/foo".to_string()), + SelectedPath::Exact("src/bar/baz".to_string()) + ] + ); } #[test] @@ -155,5 +198,88 @@ fn resolve_native_scan_inputs_uses_component_aware_prefix_for_siblings() { let (scan_root, includes) = result.expect("sibling inputs should resolve"); assert_eq!(scan_root, "src"); - assert_eq!(includes, vec!["src/bar", "src/baz"]); + assert_eq!( + includes, + vec![ + SelectedPath::Subtree("src/bar".to_string()), + SelectedPath::Subtree("src/baz".to_string()) + ] + ); +} + +#[test] +fn resolve_paths_file_entries_normalizes_existing_entries_and_tracks_missing() { + let temp_dir = tempfile::tempdir().expect("tempdir"); + let scan_root = temp_dir.path().join("repo"); + fs::create_dir_all(scan_root.join("src/nested")).expect("create nested source dir"); + fs::create_dir_all(scan_root.join("docs")).expect("create docs dir"); + fs::write(scan_root.join("src/nested/main.rs"), "fn main() {}\n").expect("write 
source"); + + let resolved = resolve_paths_file_entries( + &scan_root, + &[ + "./src/nested/../nested/main.rs".to_string(), + "docs\r".to_string(), + "src/nested/main.rs".to_string(), + "missing/file.rs".to_string(), + " ".to_string(), + ], + ) + .expect("paths file entries should resolve"); + + assert_eq!( + resolved.selections, + vec![ + SelectedPath::Exact("src/nested/main.rs".to_string()), + SelectedPath::Subtree("docs".to_string()) + ] + ); + assert_eq!(resolved.missing_entries, vec!["missing/file.rs"]); +} + +#[test] +fn resolve_paths_file_entries_rejects_entries_that_escape_root() { + let temp_dir = tempfile::tempdir().expect("tempdir"); + let scan_root = temp_dir.path().join("repo"); + fs::create_dir_all(&scan_root).expect("create scan root"); + + let error = resolve_paths_file_entries(&scan_root, &["../secret.txt".to_string()]) + .expect_err("escaping entry should be rejected"); + + assert!(error.to_string().contains("escapes the declared scan root")); +} + +#[test] +fn resolve_paths_file_entries_uses_explicit_root_not_current_working_directory() { + let scan_root_parent = tempfile::tempdir().expect("scan root parent"); + let other_cwd = tempfile::tempdir().expect("alternate cwd"); + let scan_root = scan_root_parent.path().join("repo"); + fs::create_dir_all(scan_root.join("src")).expect("create src dir"); + fs::write(scan_root.join("src/lib.rs"), "pub fn demo() {}\n").expect("write lib"); + + let old_cwd = std::env::current_dir().expect("current dir"); + std::env::set_current_dir(other_cwd.path()).expect("set cwd"); + + let result = resolve_paths_file_entries(&scan_root, &["src/lib.rs".to_string()]); + + std::env::set_current_dir(old_cwd).expect("restore cwd"); + + let resolved = result.expect("absolute scan root should make cwd irrelevant"); + assert_eq!( + resolved.selections, + vec![SelectedPath::Exact("src/lib.rs".to_string())] + ); + assert!(resolved.missing_entries.is_empty()); +} + +#[test] +fn 
matches_selected_path_keeps_exact_file_selection_narrow() { + assert!(matches_selected_path( + "README.md", + &[SelectedPath::Exact("readme.md".to_string())] + )); + assert!(!matches_selected_path( + "docs/README.md", + &[SelectedPath::Exact("readme.md".to_string())] + )); }