From c2c5dcc32e10e4da90945a963764346f0981180d Mon Sep 17 00:00:00 2001 From: Praneeth Perumalla Date: Mon, 15 Jun 2026 16:10:41 +0530 Subject: [PATCH 1/5] feat: integrate deduplicator into scan response - apply deduplication after scan aggregation - add raw_finding_count and finding_count - support DISABLE_DEDUP - support configurable DEDUP_EPSILON - gracefully skip deduplication when sentence-transformers is unavailable --- backend/app/main.py | 40 +++++--- backend/app/models.py | 7 ++ backend/app/utils/deduplicator.py | 73 ++++++++++++++ backend/tests/test_scan_dedup.py | 160 ++++++++++++++++++++++++++++++ 4 files changed, 268 insertions(+), 12 deletions(-) create mode 100644 backend/app/utils/deduplicator.py create mode 100644 backend/tests/test_scan_dedup.py diff --git a/backend/app/main.py b/backend/app/main.py index 81e8359..3f16545 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -41,6 +41,7 @@ upsert_contributor_stat, ) from .models import ( + DeduplicatedScanResponse, Finding, FixRequest, FixResponse, @@ -59,6 +60,7 @@ from .scanners.gitleaks import run_gitleaks from .scanners.osv import run_osv_scanner from .scanners.semgrep import run_semgrep +from .utils.deduplicator import deduplicate from .utils.fs import ensure_dir, safe_rmtree, unzip_to_dir _MAX_UPLOAD_MB_RAW = os.environ.get("MAX_UPLOAD_MB") @@ -320,7 +322,7 @@ def _maybe_use_single_top_folder(repo_dir: Path) -> Path: return repo_dir -@app.post("/scan", response_model=ScanResponse) +@app.post("/scan", response_model=DeduplicatedScanResponse) async def scan( request: Request, project: UploadFile = File(...), @@ -360,6 +362,26 @@ async def scan( semgrep, osv, gitleaks, entropy, findings = _scan_repo_dir(scan_root) + raw_finding_count = len(findings) + + disable_dedup = os.environ.get("DISABLE_DEDUP", "false").lower() in { + "1", + "true", + "yes", + } + + try: + epsilon = float(os.environ.get("DEDUP_EPSILON", "0.15")) + except ValueError: + epsilon = 0.15 + + if disable_dedup: + dedup_findings = findings + else: + dedup_findings = deduplicate(findings, epsilon=epsilon) + + finding_count = len(dedup_findings) + try: async with await get_db() as db: await db.execute( @@ -367,7 +389,7 @@ async def scan( (job_id, project_name, "zip"), ) rows = [] - for f in findings: + for f in dedup_findings: engine = (f.metadata or {}).get("engine") scanner = {"osv-scanner": "osv"}.get(engine, engine) rule_id = ( @@ -407,17 +429,11 @@ async def scan( await db.commit() except Exception: logger.exception("DB write failed for job %s", job_id) - return ScanResponse( + return DeduplicatedScanResponse( job_id=job_id, - project_name=project_name, - repo_path=str(scan_root), - findings=findings, - scanners={ - "semgrep": {"ok": True, "count": len(semgrep)}, - "osv": {"ok": True, "count": len(osv)}, - "gitleaks": {"ok": True, "count": len(gitleaks)}, - "entropy": {"ok": True, "count": len(entropy)}, - }, + raw_finding_count=raw_finding_count, + finding_count=finding_count, + findings=dedup_findings, ) diff --git a/backend/app/models.py b/backend/app/models.py index 6aca77b..bd9da4d 100644 --- a/backend/app/models.py +++ b/backend/app/models.py @@ -75,3 +75,10 @@ class OrgJobStatusResponse(BaseModel): org_job_id: str status: str repos: List[RepoStatus] + + +class DeduplicatedScanResponse(BaseModel): + job_id: str + raw_finding_count: int + finding_count: int + findings: List[Finding] diff --git a/backend/app/utils/deduplicator.py b/backend/app/utils/deduplicator.py new file mode 100644 index 0000000..2f0dc59 --- /dev/null +++ b/backend/app/utils/deduplicator.py @@ -0,0 +1,73 @@ +import logging +from typing import List + +from app.models import Finding + +logger = logging.getLogger(__name__) + +_MODEL = None + + +def get_model(): + """Lazily load and cache the SentenceTransformer model.""" + global _MODEL + if _MODEL is None: + from sentence_transformers import SentenceTransformer + + _MODEL = SentenceTransformer("all-MiniLM-L6-v2") + return _MODEL + + +def deduplicate(findings: List[Finding], epsilon: float = 0.15) -> List[Finding]: + """ + Deduplicates finding descriptions/messages using SentenceTransformer embeddings. + Returns the original findings list if sentence-transformers is unavailable or if loading/encoding fails. + """ + if not findings: + return findings + + # Check for sentence_transformers availability + import importlib.util + + if importlib.util.find_spec("sentence_transformers") is None: + logger.warning( + "sentence-transformers is not available. Skipping deduplication." + ) + return findings + + try: + import numpy as np + except ImportError: + logger.warning("numpy is not available. Skipping deduplication.") + return findings + + try: + model = get_model() + texts = [f.description if f.description else f.title for f in findings] + embeddings = model.encode(texts, convert_to_numpy=True) + + if len(embeddings.shape) == 1: + embeddings = np.expand_dims(embeddings, axis=0) + + # Normalize embeddings to compute cosine similarity using dot product + norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + normalized_embeddings = embeddings / norms + + keep = [] + for i in range(len(findings)): + is_dup = False + for j in keep: + sim = np.dot(normalized_embeddings[i], normalized_embeddings[j]) + dist = 1.0 - sim + if dist <= epsilon: + is_dup = True + break + if not is_dup: + keep.append(i) + + return [findings[idx] for idx in keep] + + except Exception as e: + logger.error(f"Error during deduplication: {e}. Skipping deduplication.") + return findings diff --git a/backend/tests/test_scan_dedup.py b/backend/tests/test_scan_dedup.py new file mode 100644 index 0000000..9fc008a --- /dev/null +++ b/backend/tests/test_scan_dedup.py @@ -0,0 +1,160 @@ +import io +import zipfile +from unittest.mock import patch + +import numpy as np +import pytest +from fastapi.testclient import TestClient + +import app.utils.deduplicator as dedup_mod +from app.main import app as fastapi_app +from app.models import Finding, Location + +client = TestClient(fastapi_app) + + +def make_dummy_zip(): + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED, False) as zip_file: + zip_file.writestr("dummy.py", "print('hello')") + zip_buffer.seek(0) + return zip_buffer + + +class MockSentenceTransformer: + def __init__(self, *args, **kwargs): + """Initialize mock.""" + pass + + def encode(self, texts, **kwargs): + embs = [] + for text in texts: + if "SQL Injection" in text: + embs.append([1.0, 0.0]) + else: + embs.append([0.0, 1.0]) + return np.array(embs) + + +findings_input = [ + Finding( + id="1", + category="sast", + severity="HIGH", + title="SQL Injection", + description="SQL Injection in auth.py", + location=Location(path="auth.py", start_line=10), + ), + Finding( + id="2", + category="sast", + severity="HIGH", + title="SQL Injection", + description="SQL Injection in auth.py", + location=Location(path="auth.py", start_line=15), + ), + Finding( + id="3", + category="secret", + severity="CRITICAL", + title="Hardcoded Password", + description="Hardcoded password in config.py", + location=Location(path="config.py", start_line=5), + ), +] + + +@pytest.fixture(autouse=True) +def reset_dedup_cache(): + dedup_mod._MODEL = None + yield + dedup_mod._MODEL = None + + +# Case 1: Dedup enabled with duplicate findings +@patch("app.main.unzip_to_dir") +@patch("app.main._scan_repo_dir") +@patch("sentence_transformers.SentenceTransformer", new=MockSentenceTransformer) +def test_scan_dedup_enabled(mock_scan, mock_unzip, monkeypatch): + monkeypatch.delenv("DISABLE_DEDUP", raising=False) + monkeypatch.setenv("DEDUP_EPSILON", "0.15") + mock_scan.return_value = ([], [], [], [], findings_input) + + zip_file = make_dummy_zip() + res = client.post( + "/scan", + files={"project": ("project.zip", zip_file, "application/zip")}, + data={"project_name": "test_project"}, + ) + assert res.status_code == 200 + data = res.json() + assert data["raw_finding_count"] == 3 + assert data["finding_count"] == 2 + assert len(data["findings"]) == 2 + assert {f["id"] for f in data["findings"]} == {"1", "3"} + + +# Case 2: DISABLE_DEDUP=true +@patch("app.main.unzip_to_dir") +@patch("app.main._scan_repo_dir") +@patch("sentence_transformers.SentenceTransformer", new=MockSentenceTransformer) +def test_scan_dedup_disabled(mock_scan, mock_unzip, monkeypatch): + monkeypatch.setenv("DISABLE_DEDUP", "true") + mock_scan.return_value = ([], [], [], [], findings_input) + + zip_file = make_dummy_zip() + res = client.post( + "/scan", + files={"project": ("project.zip", zip_file, "application/zip")}, + data={"project_name": "test_project"}, + ) + assert res.status_code == 200 + data = res.json() + assert data["raw_finding_count"] == 3 + assert data["finding_count"] == 3 + assert len(data["findings"]) == 3 + + +# Case 3: sentence-transformers unavailable +@patch("app.main.unzip_to_dir") +@patch("app.main._scan_repo_dir") +def test_scan_dedup_sentence_transformers_unavailable( + mock_scan, mock_unzip, monkeypatch +): + monkeypatch.delenv("DISABLE_DEDUP", raising=False) + mock_scan.return_value = ([], [], [], [], findings_input) + + with patch.dict("sys.modules", {"sentence_transformers": None}): + zip_file = make_dummy_zip() + res = client.post( + "/scan", + files={"project": ("project.zip", zip_file, "application/zip")}, + data={"project_name": "test_project"}, + ) + assert res.status_code == 200 + data = res.json() + assert data["raw_finding_count"] == 3 + assert data["finding_count"] == 3 + assert len(data["findings"]) == 3 + + +# Case 4: Invalid DEDUP_EPSILON value (fallback to 0.15) +@patch("app.main.unzip_to_dir") +@patch("app.main._scan_repo_dir") +@patch("sentence_transformers.SentenceTransformer", new=MockSentenceTransformer) +def test_scan_dedup_invalid_epsilon(mock_scan, mock_unzip, monkeypatch): + monkeypatch.delenv("DISABLE_DEDUP", raising=False) + monkeypatch.setenv("DEDUP_EPSILON", "abc") + mock_scan.return_value = ([], [], [], [], findings_input) + + zip_file = make_dummy_zip() + res = client.post( + "/scan", + files={"project": ("project.zip", zip_file, "application/zip")}, + data={"project_name": "test_project"}, + ) + assert res.status_code == 200 + data = res.json() + assert data["raw_finding_count"] == 3 + assert data["finding_count"] == 2 + assert len(data["findings"]) == 2 From 178a359920ca8a5442f7dea3c9086cee6f9d42d3 Mon Sep 17 00:00:00 2001 From: Praneeth Perumalla Date: Mon, 15 Jun 2026 16:59:04 +0530 Subject: [PATCH 2/5] test: fix dedup tests when sentence-transformers is unavailable --- backend/tests/test_scan_dedup.py | 41 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/backend/tests/test_scan_dedup.py b/backend/tests/test_scan_dedup.py index 9fc008a..94288ef 100644 --- a/backend/tests/test_scan_dedup.py +++ b/backend/tests/test_scan_dedup.py @@ -74,10 +74,12 @@ def reset_dedup_cache(): # Case 1: Dedup enabled with duplicate findings @patch("app.main.unzip_to_dir") @patch("app.main._scan_repo_dir") -@patch("sentence_transformers.SentenceTransformer", new=MockSentenceTransformer) -def test_scan_dedup_enabled(mock_scan, mock_unzip, monkeypatch): +@patch("app.utils.deduplicator.get_model") +def test_scan_dedup_enabled(mock_get_model, mock_scan, mock_unzip, monkeypatch): monkeypatch.delenv("DISABLE_DEDUP", raising=False) monkeypatch.setenv("DEDUP_EPSILON", "0.15") + mock_get_model.return_value = MockSentenceTransformer() + monkeypatch.setattr("importlib.util.find_spec", lambda name: object()) mock_scan.return_value = ([], [], [], [], findings_input) zip_file = make_dummy_zip() @@ -97,9 +99,10 @@ def test_scan_dedup_enabled(mock_scan, mock_unzip, monkeypatch): # Case 2: DISABLE_DEDUP=true @patch("app.main.unzip_to_dir") @patch("app.main._scan_repo_dir") -@patch("sentence_transformers.SentenceTransformer", new=MockSentenceTransformer) -def test_scan_dedup_disabled(mock_scan, mock_unzip, monkeypatch): +@patch("app.utils.deduplicator.get_model") +def test_scan_dedup_disabled(mock_get_model, mock_scan, mock_unzip, monkeypatch): monkeypatch.setenv("DISABLE_DEDUP", "true") + mock_get_model.return_value = MockSentenceTransformer() mock_scan.return_value = ([], [], [], [], findings_input) zip_file = make_dummy_zip() @@ -122,29 +125,31 @@ def test_scan_dedup_sentence_transformers_unavailable( mock_scan, mock_unzip, monkeypatch ): monkeypatch.delenv("DISABLE_DEDUP", raising=False) + monkeypatch.setattr("importlib.util.find_spec", lambda name: None) mock_scan.return_value = ([], [], [], [], findings_input) - with patch.dict("sys.modules", {"sentence_transformers": None}): - zip_file = make_dummy_zip() - res = client.post( - "/scan", - files={"project": ("project.zip", zip_file, "application/zip")}, - data={"project_name": "test_project"}, - ) - assert res.status_code == 200 - data = res.json() - assert data["raw_finding_count"] == 3 - assert data["finding_count"] == 3 - assert len(data["findings"]) == 3 + zip_file = make_dummy_zip() + res = client.post( + "/scan", + files={"project": ("project.zip", zip_file, "application/zip")}, + data={"project_name": "test_project"}, + ) + assert res.status_code == 200 + data = res.json() + assert data["raw_finding_count"] == 3 + assert data["finding_count"] == 3 + assert len(data["findings"]) == 3 # Case 4: Invalid DEDUP_EPSILON value (fallback to 0.15) @patch("app.main.unzip_to_dir") @patch("app.main._scan_repo_dir") -@patch("sentence_transformers.SentenceTransformer", new=MockSentenceTransformer) -def test_scan_dedup_invalid_epsilon(mock_scan, mock_unzip, monkeypatch): +@patch("app.utils.deduplicator.get_model") +def test_scan_dedup_invalid_epsilon(mock_get_model, mock_scan, mock_unzip, monkeypatch): monkeypatch.delenv("DISABLE_DEDUP", raising=False) monkeypatch.setenv("DEDUP_EPSILON", "abc") + mock_get_model.return_value = MockSentenceTransformer() + monkeypatch.setattr("importlib.util.find_spec", lambda name: object()) mock_scan.return_value = ([], [], [], [], findings_input) zip_file = make_dummy_zip() From b687ee67483e5a7cc17d10cbca1dc66ca6cd2760 Mon Sep 17 00:00:00 2001 From: Praneeth Perumalla Date: Mon, 15 Jun 2026 17:00:33 +0530 Subject: [PATCH 3/5] chore: retrigger CI From 679ccadc50f53f0d79756cc183693807c02dcf22 Mon Sep 17 00:00:00 2001 From: Praneeth Perumalla Date: Tue, 16 Jun 2026 11:09:08 +0530 Subject: [PATCH 4/5] chore: retrigger guardrails From d5573c136db213fae31151df3365e85dc41890a3 Mon Sep 17 00:00:00 2001 From: Praneeth Perumalla Date: Sun, 21 Jun 2026 12:48:14 +0530 Subject: [PATCH 5/5] chore: retrigger guardrails