From 46158c47d9d35e907c66484d88574c7364095e24 Mon Sep 17 00:00:00 2001
From: Agustin Groh <agusgroh@gmail.com>
Date: Tue, 24 Mar 2026 12:26:18 -0300
Subject: [PATCH] feat(hfh): SP-4181 implement raw output format for folder
 hashing

---
 CHANGELOG.md                        |   4 +
 src/scanoss/cli.py                  |   2 +-
 src/scanoss/scanners/scanner_hfh.py | 192 +++++++++++++++++++++++++++-
 3 files changed, 194 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1739c118..d53be9c7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Added
+- Added `--format raw` option to `folder-scan` command to export HFH results in snippet-scanner JSON format
+  - Expands directory-level HFH results into per-file entries keyed by relative file path
+  - Assigns each file to the most specific matching `path_id` (deepest directory match wins)
 
 ## [1.50.0] - 2026-03-17
 ### Fixed
diff --git a/src/scanoss/cli.py b/src/scanoss/cli.py
index 03055ac2..29282f8d 100644
--- a/src/scanoss/cli.py
+++ b/src/scanoss/cli.py
@@ -988,7 +988,7 @@ def setup_args() -> None:  # noqa: PLR0912, PLR0915
         '--format',
         '-f',
         type=str,
-        choices=['json', 'cyclonedx'],
+        choices=['json', 'cyclonedx', 'raw'],
         default='json',
         help='Result output format (optional - default: json)',
     )
diff --git a/src/scanoss/scanners/scanner_hfh.py b/src/scanoss/scanners/scanner_hfh.py
index 739a8921..62923bec 100644
--- a/src/scanoss/scanners/scanner_hfh.py
+++ b/src/scanoss/scanners/scanner_hfh.py
@@ -22,11 +22,15 @@
   THE SOFTWARE.
 """
 
+import hashlib
 import json
+import os
 import threading
 import time
-from typing import Dict, Optional
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
 
+from packageurl.contrib import purl2url
 from progress.spinner import Spinner
 
 from scanoss.constants import (
@@ -88,6 +92,7 @@ def __init__(  # noqa: PLR0913
             debug=config.debug,
             trace=config.trace,
             quiet=config.quiet,
+            config=config,
         )
         self.file_filters = FileFilters(
             debug=config.debug,
@@ -162,7 +167,16 @@ class ScannerHFHPresenter(AbstractPresenter):
     Handles the presentation of the folder hashing scan results
     """
 
-    def __init__(self, scanner: ScannerHFH, **kwargs):
+    def __init__(self, scanner: ScannerHFH, config: ScannerConfig = None, **kwargs):
+        """
+        Initialize the presenter.
+
+        Args:
+            scanner (ScannerHFH): The HFH scanner instance containing scan results and file filters.
+            config (ScannerConfig, optional): Scanner configuration, used to access the API base URL
+                for constructing file_url in raw output format.
+            **kwargs: Additional arguments passed to AbstractPresenter (debug, trace, quiet, etc.).
+        """
         super().__init__(**kwargs)
         self.scanner = scanner
 
@@ -249,4 +263,176 @@ def _format_csv_output(self) -> str:
         raise NotImplementedError('CSV output is not implemented')
 
     def _format_raw_output(self) -> str:
-        raise NotImplementedError('Raw output is not implemented')
+        """
+        Convert HFH scan results into snippet-scanner JSON format.
+
+        Expands directory-level HFH results into per-file entries keyed by
+        relative file path, matching the structure returned by the snippet scanner.
+        For each file, computes the MD5 hash and constructs the file_url from the
+        scanner client's ``orig_url`` (an empty string is used when it is unset).
+
+        Returns:
+            str: A JSON string with the snippet-scanner format, or '{}' if no results.
+        """
+        if not self.scanner.scan_results or 'results' not in self.scanner.scan_results:
+            return '{}'
+
+        hfh_results = self.scanner.scan_results.get('results', [])
+        if not hfh_results:
+            return '{}'
+
+        # Collect best-match component info per path_id
+        path_components = self._extract_best_components(hfh_results)
+        if not path_components:
+            return '{}'
+
+        # Get all filtered files once (relative paths to scan_dir)
+        all_files = self.scanner.file_filters.get_filtered_files_from_folder(self.scanner.scan_dir)
+
+        # Sort path_ids by depth (deepest first) so most-specific match wins.
+        # Root path '.' is always last (-1), others sort by separator count then path length.
+        # Example with path_ids: ['.', 'external', 'project-1.0', 'project-1.0/src/lib']
+        #   Sorted result: ['project-1.0/src/lib', 'project-1.0', 'external', '.']
+        #   - 'project-1.0/src/lib' (depth 2) claims its files first
+        #   - 'project-1.0' (depth 0, len 11) claims remaining files under it
+        #   - 'external' (depth 0, len 8) claims external/ files
+        #   - '.' (root, always last) picks up everything else
+        sorted_path_ids = sorted(
+            path_components.keys(),
+            key=lambda p: (-1, 0) if p == '.' else (p.count(os.sep), len(p)),
+            reverse=True,
+        )
+
+        output = {}
+        claimed_files = set()
+        scan_dir = Path(self.scanner.scan_dir).resolve()
+        # The base URL does not depend on the file, so resolve it once instead of per file
+        api_url = self.scanner.client.orig_url or ''
+
+        for path_id in sorted_path_ids:
+            component, best_version = path_components[path_id]
+            for file_path in all_files:
+                # Skip files already claimed by a deeper path_id, or not located under this one
+                if file_path in claimed_files or not self._file_matches_path_id(file_path, path_id):
+                    continue
+
+                claimed_files.add(file_path)
+                # Path.__truediv__ (/) joins paths using the correct OS separator
+                file_hash = self._compute_file_md5(scan_dir / file_path)
+                entry = self._build_snippet_entry(component, best_version, file_path, file_hash, api_url)
+                output[file_path] = [entry]
+
+        return json.dumps(output, indent=2)
+
+    @staticmethod
+    def _extract_best_components(hfh_results: List[Dict]) -> Dict[str, Tuple[Dict, Dict]]:
+        """
+        Extract the best-match component and version for each path_id from HFH results.
+
+        Picks the first component with order == 1 (best match) and its first version.
+        Results whose components/versions are missing, null or empty are skipped.
+
+        Args:
+            hfh_results (List[Dict]): The 'results' list from the HFH API response.
+
+        Returns:
+            Dict[str, Tuple[Dict, Dict]]: A dict mapping path_id to (component, best_version).
+        """
+        path_components = {}
+        for result in hfh_results:
+            # `or []` also covers an explicit null value, which .get()'s default would not
+            components = result.get('components') or []
+            component = next((c for c in components if c.get('order') == 1), None)
+            if component is None:
+                continue
+            versions = component.get('versions') or []
+            if not versions:
+                continue
+            # Results that carry no path_id are attributed to the scan root ('.')
+            path_components[result.get('path_id', '.')] = (component, versions[0])
+        return path_components
+
+    @staticmethod
+    def _file_matches_path_id(file_path: str, path_id: str) -> bool:
+        """
+        Tell whether a scanned file is covered by the directory named by path_id.
+
+        The two arguments are compared as plain strings, both relative to the scan root.
+        The special path_id '.' stands for the scan root and covers every file.
+
+        Args:
+            file_path (str): Relative file path from the scan root.
+            path_id (str): Relative directory path from the HFH result.
+
+        Returns:
+            bool: True if the file equals path_id or is located below it.
+        """
+        # '.' is the scan root and therefore contains every file; otherwise the file must be
+        # the path_id itself or sit below it. Appending os.sep keeps 'src' from matching
+        # sibling names that merely share the prefix, such as 'src-old/file.c'.
+        return path_id == '.' or file_path == path_id or file_path.startswith(path_id + os.sep)
+
+    @staticmethod
+    def _compute_file_md5(file_path: Path) -> str:
+        """
+        Return the MD5 hex digest of the raw bytes stored at ``file_path``.
+
+        The whole file is read in one call and hashed as-is (no text decoding or normalisation).
+
+        Args:
+            file_path (Path): Absolute path to the file.
+
+        Returns:
+            str: The MD5 hex digest, or an empty string if the file cannot be read.
+        """
+        try:
+            payload = file_path.read_bytes()
+        except OSError:
+            return ''
+        return hashlib.md5(payload).hexdigest()
+
+    @staticmethod
+    def _build_snippet_entry(
+        component: Dict, best_version: Dict, file_path: str, file_hash: str, base_url: str,
+    ) -> Dict:
+        """
+        Build a snippet-scanner-compatible result entry from an HFH component.
+
+        Fields HFH cannot supply (url_hash, release_date, licenses) are emitted as empty values.
+
+        Args:
+            component (Dict): The HFH component with purl, name, vendor fields.
+            best_version (Dict): The top version entry with version and score fields.
+            file_path (str): Relative file path from the scan root directory.
+            file_hash (str): Pre-computed MD5 hash of the local file ('' if it was unreadable).
+            base_url (str): API base URL used to construct the file_url field.
+
+        Returns:
+            Dict: A result entry compatible with the snippet-scanner JSON format.
+        """
+        purl = component.get('purl', '')
+        version = best_version.get('version', '')
+        try:
+            url = purl2url.get_repo_url(purl) if purl else ''
+        except ValueError:  # malformed purl: keep the entry, just without a repository URL
+            url = ''
+        return {
+            'id': 'file',
+            'matched': '100%',
+            'purl': [purl],
+            'component': component.get('name', ''),
+            'vendor': component.get('vendor', ''),
+            'version': version,
+            'latest': version,
+            'url': url or '',  # get_repo_url returns None when no route matches the purl
+            'file': file_path,
+            'file_hash': file_hash,
+            'file_url': f'{base_url}/file_contents/{file_hash}' if file_hash else '',  # no hash, no URL
+            'source_hash': file_hash,
+            'url_hash': '',  # not available from HFH; kept because downstream validators require it
+            'release_date': '',  # not available from HFH; kept for downstream validators
+            'licenses': [],  # not available from HFH; kept for downstream validators
+            'lines': 'all',
+            'oss_lines': 'all',
+            'status': 'pending',
+        }