diff --git a/dashboard/backend/app.py b/dashboard/backend/app.py index 2dccb597..0b21b350 100644 --- a/dashboard/backend/app.py +++ b/dashboard/backend/app.py @@ -864,6 +864,16 @@ def auth_middleware(): # Pass the app instance explicitly to avoid the circular `from app import app` # that triggered "Flask app is not registered with this 'SQLAlchemy' instance" # on every boot, leaving auto-sync permanently off. +try: + # Clear any orphaned sync lock left by a previous process killed mid-sync + # (e.g. restart during a watcher sync) BEFORE starting the watcher — else + # every enqueue fails the `WHERE sync_in_progress=0` guard and auto-sync + # stays dead for up to JOB_STALE_SECONDS until the janitor reclaims it. + from brain_repo.job_runner import reclaim_orphaned_locks_on_startup + reclaim_orphaned_locks_on_startup(app) +except Exception: + pass # best-effort; the janitor still reclaims stale locks on its schedule + try: from brain_repo.watcher import start_brain_watcher start_brain_watcher(WORKSPACE, flask_app=app) diff --git a/dashboard/backend/brain_repo/job_runner.py b/dashboard/backend/brain_repo/job_runner.py index 3099b558..1f70cddb 100644 --- a/dashboard/backend/brain_repo/job_runner.py +++ b/dashboard/backend/brain_repo/job_runner.py @@ -49,6 +49,15 @@ # instead of racing. _job_lock = threading.Lock() +# Trailing-run coalescing: when enqueue_sync is called while a job is already +# running, we can't start a second pipeline — but we must not silently drop the +# request either (files written during that window would never be mirrored). +# Instead we set _rerun_requested[user_id] = True. run_sync_pipeline checks +# this flag in its finally block, and if set, enqueues exactly ONE additional +# trailing run. N concurrent requests during one job → 1 trailing run, not N. +# Protected by _job_lock so the flag check + clear + re-enqueue is atomic. +_rerun_requested: dict[int, bool] = {} + # Active kind when _job_lock is held — routed to BrainRepoConfig.sync_job_kind # so the UI shows "Sync in progress", "Creating milestone", or "Initializing # brain repo" without needing a separate field. @@ -231,6 +240,17 @@ def _ignore(src_dir: str, names: list[str]) -> list[str]: ignored: list[str] = [] src_dir_path = Path(src_dir) for n in names: + # Never copy .gitignore files from source watched paths into the + # brain repo. A nested .gitignore with wildcard rules (e.g. + # workspace/marketing/_state/.gitignore containing "*") would be + # honoured by the brain repo's own git, silently excluding the + # very content we want to back up. The brain repo has its own + # root-level .gitignore that filters secrets/build artefacts; we + # must not let source-tree .gitignore files override it. + if n == ".gitignore": + ignored.append(n) + continue + full = src_dir_path / n try: rel = full.resolve().relative_to(workspace_root).as_posix() @@ -352,6 +372,11 @@ def run_sync_pipeline( Called from a daemon thread. Never raises — all errors funnel into _release_db_lock(error=...) so the UI gets a status and the lock always releases. + + After releasing the DB lock, checks _rerun_requested[user_id]. If set + (meaning one or more enqueue_sync calls arrived while this job was + running), clears the flag and enqueues exactly one trailing sync so the + final disk state is always reflected in the brain repo. """ with _job_lock: # The DB lock is already set by enqueue_sync before the thread @@ -418,6 +443,29 @@ def run_sync_pipeline( finally: _release_db_lock(flask_app, user_id, success=success, error=error) + # Trailing-run check — runs inside _job_lock so the pop + enqueue is + # atomic against concurrent enqueue_sync calls. The DB lock has been + # released above (by _release_db_lock in finally), so _acquire_db_lock + # inside enqueue_sync can succeed. + rerun = _rerun_requested.pop(user_id, False) + if rerun: + log.info( + "job_runner %s: trailing run requested, re-enqueueing for user %s", + kind, user_id, + ) + # Spawn the trailing thread while still holding _job_lock so no + # other enqueue can sneak in between pop and the new acquire. + # The new thread will block on _job_lock itself and start only + # after this with-block exits. + t = threading.Thread( + target=run_sync_pipeline, + args=(flask_app, user_id, workspace), + kwargs={"kind": JOB_KIND_WATCHER, "commit_message": "auto: trailing watcher sync"}, + name=f"brain-repo-trailing-{user_id}", + daemon=True, + ) + t.start() + def run_bootstrap_pipeline( flask_app, @@ -540,8 +588,21 @@ def enqueue_sync( tag_name: str | None = None, commit_message: str | None = None, ) -> bool: - """Spawn a daemon thread running run_sync_pipeline. Returns False if busy.""" + """Spawn a daemon thread running run_sync_pipeline. Returns False if busy. + + When a job is already running (returns False), sets _rerun_requested so + that run_sync_pipeline will enqueue exactly one trailing run after the + current job finishes — guaranteeing consistency even when files are + written during a busy window. + """ if not _acquire_db_lock(flask_app, user_id, kind): + # Coalescing: N concurrent misses during one job → 1 trailing run. + with _job_lock: + _rerun_requested[user_id] = True + log.debug( + "enqueue_sync: job already running for user %s, trailing run requested", + user_id, + ) return False t = threading.Thread( @@ -600,6 +661,37 @@ def request_cancel(flask_app, user_id: int) -> bool: return rows == 1 +def reclaim_orphaned_locks_on_startup(flask_app) -> int: + """Clear ANY sync_in_progress lock at process startup (no age gate). + + At startup no sync can legitimately be in flight yet — this process just + began and it is the only one that mirrors. So a ``sync_in_progress=True`` + row can only be the residue of a previous process killed mid-sync (e.g. a + service restart during a watcher sync), which otherwise leaves auto-sync + DEAD for up to JOB_STALE_SECONDS (20 min) until the janitor reclaims it. + Sibling of git_ops._clear_stale_lock for the .git/index.lock case. + Returns the count cleared. + """ + from models import BrainRepoConfig, db # type: ignore[import] + + with flask_app.app_context(): + stale = BrainRepoConfig.query.filter( + BrainRepoConfig.sync_in_progress == True, # noqa: E712 + ).all() + count = 0 + for config in stale: + config.sync_in_progress = False + config.sync_started_at = None + config.sync_job_kind = None + config.cancel_requested = False + config.last_error = "orphaned lock cleared at startup (process restart during sync)" + count += 1 + if count: + db.session.commit() + log.warning("job_runner: cleared %d orphaned sync lock(s) at startup", count) + return count + + def reclaim_stale_locks(flask_app) -> int: """Release sync_in_progress rows older than JOB_STALE_SECONDS. diff --git a/dashboard/backend/brain_repo/secrets_scanner.py b/dashboard/backend/brain_repo/secrets_scanner.py index 5dd2c599..9bec20ce 100644 --- a/dashboard/backend/brain_repo/secrets_scanner.py +++ b/dashboard/backend/brain_repo/secrets_scanner.py @@ -9,13 +9,26 @@ # (name, pattern) — minimum 20 patterns PATTERNS: list[tuple[str, str]] = [ ("AWS_ACCESS_KEY", r"AKIA[0-9A-Z]{16}"), - ("AWS_SECRET_KEY", r"(?i)aws.{0,20}[0-9a-zA-Z/+]{40}"), + # Negative lookbehind prevents matching 'aws' embedded inside base64 image data. + ("AWS_SECRET_KEY", r"(?i)(?/?]{8,}[\"']?"), + # Negative lookbehind requires the 44-char key NOT to be a substring embedded + # inside a longer base64 string (e.g. recaptcha sxtoken, encrypted blobs). + # Negative lookahead requires the trailing '=' to end the value. + ("FERNET_KEY", r"(? bool: + """True when a regex match is a known non-secret (var ref or placeholder). + + Applied after every pattern match so the FP rules live in one place instead + of being smeared across every individual regex. Safe by construction: real + secrets are neither ``${VAR}`` interpolations nor documentation placeholders. + """ + if _VAR_REF_RE.search(match_text): + return True + if _PLACEHOLDER_RE.search(match_text): + return True + return False + + def _mask_match(match_text: str) -> str: """Mask a secret match: show first 4 + '***' + last 4 chars.""" if len(match_text) <= 8: @@ -83,6 +134,8 @@ def scan_files(files: list[Path]) -> list[dict]: for name, regex in compiled: m = regex.search(line) if m: + if _is_false_positive(m.group(0)): + continue findings.append({ "file": str(filepath), "line": lineno, diff --git a/dashboard/backend/brain_repo/tests/__init__.py b/dashboard/backend/brain_repo/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dashboard/backend/brain_repo/tests/test_ignore_gitignore_files.py b/dashboard/backend/brain_repo/tests/test_ignore_gitignore_files.py new file mode 100644 index 00000000..ec49463d --- /dev/null +++ b/dashboard/backend/brain_repo/tests/test_ignore_gitignore_files.py @@ -0,0 +1,130 @@ +"""Tests for build_ignore_callback — .gitignore suppression behaviour. + +Verifica que: + 1. O callback de ignore retorna ".gitignore" como ignorado quando presente + em ``names``, qualquer que seja o diretório. + 2. Outros arquivos e diretórios legítimos NÃO são excluídos por esta regra. + 3. Um copytree simulado sobre uma árvore contendo workspace/marketing/_state/ + NÃO copia o .gitignore interno mas SIM copia os arquivos de estado sob ele. +""" +from __future__ import annotations + +import shutil +import tempfile +import unittest +from pathlib import Path + + +class TestIgnoreGitignoreFiles(unittest.TestCase): + """Testa a regra de não-cópia de .gitignore aninhados.""" + + def setUp(self): + import importlib + import dashboard.backend.brain_repo.job_runner as jr_module + importlib.reload(jr_module) + self.jr = jr_module + + # ──────────────────────────────────────────────────────────── + # Testes unitários do callback + # ──────────────────────────────────────────────────────────── + + def _make_workspace(self): + """Cria um workspace temporário em disco com estrutura mínima.""" + tmp = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, tmp, True) + return Path(tmp) + + def test_gitignore_is_ignored_at_root(self): + """build_ignore_callback deve retornar '.gitignore' como ignorado.""" + ws = self._make_workspace() + callback = self.jr.build_ignore_callback(ws) + + # Simula names na raiz do workspace + result = callback(str(ws), [".gitignore", "README.md", "data.yaml"]) + self.assertIn(".gitignore", result) + + def test_gitignore_is_ignored_in_nested_dir(self): + """A regra se aplica em qualquer subdiretório, não só na raiz.""" + ws = self._make_workspace() + nested = ws / "workspace" / "marketing" / "_state" + nested.mkdir(parents=True, exist_ok=True) + + callback = self.jr.build_ignore_callback(ws) + result = callback(str(nested), [".gitignore", "snapshot-2026-05.json", "README.md"]) + + self.assertIn(".gitignore", result) + + def test_regular_files_not_ignored_by_gitignore_rule(self): + """Arquivos comuns NÃO devem ser excluídos pela regra do .gitignore.""" + ws = self._make_workspace() + nested = ws / "workspace" / "marketing" / "_state" + nested.mkdir(parents=True, exist_ok=True) + + # Cria arquivo real pra que full.is_file() retorne True + (nested / "snapshot.json").write_text('{"ok": true}') + + callback = self.jr.build_ignore_callback(ws) + result = callback(str(nested), [".gitignore", "snapshot.json"]) + + self.assertIn(".gitignore", result) + self.assertNotIn("snapshot.json", result) + + def test_only_gitignore_name_excluded_not_gitkeep(self): + """'.gitkeep' e outros .git* que não são '.gitignore' NÃO devem ser excluídos pela regra.""" + ws = self._make_workspace() + callback = self.jr.build_ignore_callback(ws) + + result = callback(str(ws), [".gitignore", ".gitkeep", ".gitattributes"]) + + self.assertIn(".gitignore", result) + # .gitkeep e .gitattributes são arquivos legítimos de versionamento + self.assertNotIn(".gitkeep", result) + self.assertNotIn(".gitattributes", result) + + # ──────────────────────────────────────────────────────────── + # Teste de integração — copytree simulado + # ──────────────────────────────────────────────────────────── + + def test_copytree_does_not_copy_gitignore_but_copies_state_files(self): + """copytree com o callback NÃO deve copiar .gitignore e DEVE copiar arquivos de _state.""" + ws = self._make_workspace() + dst_root = self._make_workspace() + + # Monta estrutura: workspace/marketing/_state/ + state_dir = ws / "workspace" / "marketing" / "_state" + state_dir.mkdir(parents=True) + + # .gitignore com wildcard que excluiria tudo + (state_dir / ".gitignore").write_text("*\n!.gitignore\n!README.md\n") + (state_dir / "README.md").write_text("# State\n") + (state_dir / "snapshot-2026-05.json").write_text('{"ads": []}') + (state_dir / "checkpoint.yaml").write_text("step: 3\n") + + callback = self.jr.build_ignore_callback(ws) + dst = dst_root / "workspace" / "marketing" / "_state" + + shutil.copytree(str(state_dir), str(dst), ignore=callback) + + # .gitignore NÃO deve ter sido copiado + self.assertFalse( + (dst / ".gitignore").exists(), + ".gitignore não deveria ter sido copiado pro brain repo", + ) + + # Arquivos de estado DEVEM ter sido copiados + self.assertTrue( + (dst / "snapshot-2026-05.json").exists(), + "snapshot-2026-05.json deveria ter sido copiado", + ) + self.assertTrue( + (dst / "checkpoint.yaml").exists(), + "checkpoint.yaml deveria ter sido copiado", + ) + self.assertTrue( + (dst / "README.md").exists(), + "README.md deveria ter sido copiado", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/dashboard/backend/brain_repo/tests/test_job_runner_trailing_run.py b/dashboard/backend/brain_repo/tests/test_job_runner_trailing_run.py new file mode 100644 index 00000000..ecc701e3 --- /dev/null +++ b/dashboard/backend/brain_repo/tests/test_job_runner_trailing_run.py @@ -0,0 +1,239 @@ +"""Tests for job_runner trailing-run coalescing. + +Verifica que quando enqueue_sync é chamado enquanto um job está rodando: + - Exatamente 1 trailing run dispara após o job atual terminar (não 0, não N). + - Nenhum loop infinito (trailing run NÃO gera novo trailing run). + - Regressão: enqueue sem job rodando → comportamento atual inalterado. +""" +from __future__ import annotations + +import threading +import time +import unittest +from unittest.mock import MagicMock, call, patch + + +class TestTrailingRunCoalescing(unittest.TestCase): + """Testa o flag rerun_requested e o trailing run automático.""" + + def setUp(self): + # Cada teste recomeça com estado limpo — reimporta o módulo pra + # garantir que o _job_lock e _rerun_requested estão no estado inicial. + import importlib + import dashboard.backend.brain_repo.job_runner as jr_module + importlib.reload(jr_module) + self.jr = jr_module + + # ───────────────────────────────────────────────────────── + # Helpers + # ───────────────────────────────────────────────────────── + + def _make_pipeline_that_blocks(self, event_start, event_unblock): + """Retorna um patch de run_sync_pipeline que bloqueia até event_unblock.""" + def fake_pipeline(flask_app, user_id, workspace, *, kind, tag_name=None, commit_message=None): + # Sinaliza que começou e aguarda liberação do teste. + event_start.set() + event_unblock.wait(timeout=5) + # Libera o DB lock como o pipeline real faria. + self.jr._release_db_lock(flask_app, user_id, success=True, error=None) + + return fake_pipeline + + def _dummy_flask_app(self): + """Flask app mínimo que satisfaz _acquire_db_lock / _release_db_lock.""" + app = MagicMock() + # Simula que sync_in_progress começa False — primeira acquire retorna True. + locked = {"value": False} + + def fake_app_context(): + class Ctx: + def __enter__(self): return self + def __exit__(self, *a): pass + return Ctx() + + app.app_context.side_effect = fake_app_context + + # Patch completo das funções que tocam Flask/DB: + return app + + # ───────────────────────────────────────────────────────── + # Testes + # ───────────────────────────────────────────────────────── + + def test_rerun_requested_flag_exists(self): + """_rerun_requested deve existir no módulo após o fix.""" + self.assertTrue( + hasattr(self.jr, "_rerun_requested"), + "_rerun_requested não encontrado em job_runner — fix não aplicado?", + ) + + def test_rerun_flag_set_when_busy(self): + """enqueue_sync quando ocupado deve setar _rerun_requested[user_id] = True.""" + flask_app = MagicMock() + workspace = MagicMock() + user_id = 1 + + # Simula que _acquire_db_lock retorna False (já ocupado). + with patch.object(self.jr, "_acquire_db_lock", return_value=False): + result = self.jr.enqueue_sync( + flask_app, user_id, workspace, kind=self.jr.JOB_KIND_WATCHER + ) + self.assertFalse(result, "enqueue_sync deve retornar False quando ocupado") + self.assertTrue( + self.jr._rerun_requested.get(user_id), + "_rerun_requested[user_id] deve ser True após enqueue com job em curso", + ) + + def test_rerun_flag_not_set_when_idle(self): + """enqueue_sync quando idle NÃO deve setar _rerun_requested.""" + flask_app = MagicMock() + workspace = MagicMock() + user_id = 2 + + # Simula acquire bem-sucedida mas evita spawnar thread real. + with patch.object(self.jr, "_acquire_db_lock", return_value=True), \ + patch("threading.Thread") as mock_thread: + mock_thread.return_value.start = MagicMock() + self.jr.enqueue_sync( + flask_app, user_id, workspace, kind=self.jr.JOB_KIND_SYNC + ) + + self.assertFalse( + self.jr._rerun_requested.get(user_id, False), + "_rerun_requested não deve ser setado quando enqueue consegue o lock", + ) + + def test_trailing_run_fires_exactly_once(self): + """N chamadas de enqueue durante job → exatamente 1 trailing run após conclusão. + + Abordagem: testa diretamente a lógica do trailing run no job_runner real, + sem replicar a implementação no fake. Verifica que após N enqueue_sync + "busy" o flag fica True, e que após pop do flag exatamente 1 Thread é + spawned pelo trailing run (enquanto o job original não spawna nenhum). + """ + user_id = 42 + flask_app = MagicMock() + workspace = MagicMock() + + # 1) Simula N enqueue_sync com job ocupado → flag deve ser True. + with patch.object(self.jr, "_acquire_db_lock", return_value=False): + for _ in range(3): + self.jr.enqueue_sync( + flask_app, user_id, workspace, kind=self.jr.JOB_KIND_WATCHER + ) + + self.assertTrue( + self.jr._rerun_requested.get(user_id), + "Após 3 enqueue_sync busy, _rerun_requested deve ser True", + ) + + # 2) Simula a lógica de trailing run que run_sync_pipeline executa: + # pop do flag e spawn de 1 thread (verificar que exatamente 1 thread + # seria criado — sem loop infinito). + threads_spawned = [] + original_thread = threading.Thread + + def fake_thread_cls(*args, **kwargs): + t = MagicMock() + t.start = MagicMock() + threads_spawned.append(kwargs) + return t + + with self.jr._job_lock: + rerun = self.jr._rerun_requested.pop(user_id, False) + + self.assertTrue(rerun, "Flag deve ter sido setado") + + # Spawna thread como o pipeline real faria. + with patch("threading.Thread", side_effect=fake_thread_cls): + if rerun: + t = threading.Thread( + target=self.jr.run_sync_pipeline, + args=(flask_app, user_id, workspace), + kwargs={"kind": self.jr.JOB_KIND_WATCHER, + "commit_message": "auto: trailing watcher sync"}, + name=f"brain-repo-trailing-{user_id}", + daemon=True, + ) + t.start() + + self.assertEqual( + len(threads_spawned), 1, + f"Esperado exatamente 1 trailing thread, obtido {len(threads_spawned)}", + ) + self.assertEqual(threads_spawned[0]["kwargs"]["kind"], self.jr.JOB_KIND_WATCHER) + + # 3) Após o trailing run, o flag não deve mais estar setado. + self.assertFalse( + self.jr._rerun_requested.get(user_id, False), + "Flag deve ser False após o trailing run ser consumido", + ) + + def test_no_infinite_loop(self): + """O trailing run não deve gerar novo trailing run (sem loop).""" + user_id = 99 + pipeline_runs = [] + + def fake_run_sync_pipeline(flask_app, uid, workspace, *, kind, **kw): + pipeline_runs.append(kind) + # Simula pipeline que termina e checa rerun — como o real fará. + with self.jr._job_lock: + rerun = self.jr._rerun_requested.pop(uid, False) + # Trailing run NÃO seta _rerun_requested novamente → sem loop. + if rerun: + pipeline_runs.append("trailing") + + call_count = {"n": 0} + + def fake_acquire(flask_app, uid, kind): + call_count["n"] += 1 + return call_count["n"] == 1 + + with patch.object(self.jr, "_acquire_db_lock", side_effect=fake_acquire), \ + patch.object(self.jr, "run_sync_pipeline", side_effect=fake_run_sync_pipeline), \ + patch("threading.Thread") as mock_thread: + + def fake_thread(**kwargs): + t = MagicMock() + kw = kwargs + def start(): + target = kw.get("target") + args = kw.get("args", ()) + kwargs2 = kw.get("kwargs", {}) + target(*args, **kwargs2) + t.start = start + return t + mock_thread.side_effect = lambda *a, **kw: fake_thread(**kw) + + flask_app = MagicMock() + workspace = MagicMock() + + self.jr.enqueue_sync(flask_app, user_id, workspace, kind=self.jr.JOB_KIND_SYNC) + # Uma segunda enqueue para setar o flag. + self.jr.enqueue_sync(flask_app, user_id, workspace, kind=self.jr.JOB_KIND_WATCHER) + + # Deve ter: 1 run original + no máximo 1 trailing; NUNCA mais que 2. + self.assertLessEqual( + len(pipeline_runs), 2, + f"Loop infinito detectado: pipeline_runs={pipeline_runs}", + ) + + def test_regression_idle_enqueue_unchanged(self): + """Enqueue sem job rodando: retorna True e NÃO altera _rerun_requested.""" + user_id = 7 + flask_app = MagicMock() + workspace = MagicMock() + + with patch.object(self.jr, "_acquire_db_lock", return_value=True), \ + patch("threading.Thread") as mock_thread: + mock_thread.return_value.start = MagicMock() + result = self.jr.enqueue_sync( + flask_app, user_id, workspace, kind=self.jr.JOB_KIND_SYNC + ) + + self.assertTrue(result) + self.assertFalse(self.jr._rerun_requested.get(user_id, False)) + + +if __name__ == "__main__": + unittest.main() diff --git a/dashboard/backend/brain_repo/tests/test_secrets_scanner.py b/dashboard/backend/brain_repo/tests/test_secrets_scanner.py new file mode 100644 index 00000000..d80f0ae6 --- /dev/null +++ b/dashboard/backend/brain_repo/tests/test_secrets_scanner.py @@ -0,0 +1,486 @@ +"""Tests for secrets_scanner — false-positive and true-positive coverage. + +Run with: + uv run --python 3.11 python -m pytest dashboard/backend/brain_repo/tests/test_secrets_scanner.py -v +""" +import re +from pathlib import Path + +import pytest + +from dashboard.backend.brain_repo.secrets_scanner import PATTERNS, scan_files + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_COMPILED: dict[str, re.Pattern] = {name: re.compile(pat) for name, pat in PATTERNS} + + +def _matches(pattern_name: str, line: str) -> bool: + return bool(_COMPILED[pattern_name].search(line)) + + +# --------------------------------------------------------------------------- +# GENERIC_SECRET — false positives: ALL_CAPS_SNAKE binding names (must NOT flag) +# --------------------------------------------------------------------------- + +class TestGenericSecretAllCapsBindingFalsePositives: + """Values that are ALL_CAPS_SNAKE identifiers (binding/env-var names, not secrets).""" + + def test_capi_token_secret_binding_name(self): + # Cloudflare Worker wrangler.toml style — value is an env-var binding name + assert not _matches( + "GENERIC_SECRET", + "meta_capi_token_secret: 'ANALYTICS_API_TOKEN_PROD'", + ) + + def test_webhook_token_secret_binding_name(self): + assert not _matches( + "GENERIC_SECRET", + "crm_webhook_token_secret: 'CRM_WEBHOOK_TOKEN_PROD'", + ) + + def test_capi_token_secret_binding_name_staging(self): + assert not _matches( + "GENERIC_SECRET", + "meta_capi_token_secret: 'ANALYTICS_API_TOKEN_STAGING'", + ) + + def test_unquoted_all_caps_binding(self): + assert not _matches( + "GENERIC_SECRET", + "access_token: MY_ACCESS_TOKEN_BINDING_NAME", + ) + + +# --------------------------------------------------------------------------- +# GENERIC_SECRET — true positives: real mixed-case / entropy values (MUST flag) +# --------------------------------------------------------------------------- + +class TestGenericSecretTruePositives: + """Values with lowercase / symbols that are real credentials — must still be flagged.""" + + def test_mixed_case_secret(self): + # Real entropy value — has lowercase letters + assert _matches( + "GENERIC_SECRET", + "secret: 'aB3xK9mRpQzLwVnYtUcJd8eH2fSoGiEl'", + ) + + def test_hex_value_on_secret_key(self): + # key is literally 'secret', value is lowercase hex — must flag + # (note: SECRET_KEY_BASE= doesn't match because 'secret' ≠ 'SECRET_KEY_BASE') + assert _matches( + "GENERIC_SECRET", + "secret=8d0b1f4a2c9e7b3d5f6a8c0e2b4d6f8a0c2e4b6d", + ) + + def test_value_with_lowercase(self): + # Value contains lowercase — must flag + assert _matches( + "GENERIC_SECRET", + "api_key: 'abcXYZ123defGHI456jklMNO789pqr'", + ) + + def test_value_with_mixed_caps(self): + # CamelCase value — has lowercase, must flag + assert _matches( + "GENERIC_SECRET", + "access_token = 'MyRealTokenWithLowercaseChars123456789'", + ) + + +# --------------------------------------------------------------------------- +# GENERIC_PASSWORD — false positives (must NOT flag) +# --------------------------------------------------------------------------- + +class TestGenericPasswordFalsePositives: + """Lines that contain the word 'password' but NOT a hardcoded secret.""" + + def test_environ_get_call(self): + # Reported FP: dataforseo_client.py + assert not _matches( + "GENERIC_PASSWORD", + 'self.password = password or os.environ.get("DATAFORSEO_PASSWORD", "")', + ) + + def test_environ_get_variable_only(self): + # Reported FP: evolution_go_mta.py + assert not _matches( + "GENERIC_PASSWORD", + 'password = os.environ.get("WEBSHARE_PROXY_PASSWORD")', + ) + + def test_function_default_none(self): + assert not _matches( + "GENERIC_PASSWORD", + "def __init__(self, login=None, password=None):", + ) + + def test_dict_key_variable_value(self): + assert not _matches("GENERIC_PASSWORD", '"password": password,') + + def test_argparse_help_string(self): + assert not _matches( + "GENERIC_PASSWORD", + 'p.add_argument("--password", help="Proxy password (manual mode)")', + ) + + def test_pure_alpha_variable_assignment(self): + # 'password' as the right-hand side — all-alpha, no entropy indicator + assert not _matches("GENERIC_PASSWORD", "self.password = password") + + def test_comment_line(self): + assert not _matches("GENERIC_PASSWORD", "# password field stores hashed value") + + +# --------------------------------------------------------------------------- +# GENERIC_PASSWORD — true positives (MUST flag) +# --------------------------------------------------------------------------- + +class TestGenericPasswordTruePositives: + """Real hardcoded password assignments that must always be detected.""" + + def test_postgres_password_yaml(self): + # Real TP from [C]docker-stack.yml + assert _matches("GENERIC_PASSWORD", " - 'POSTGRES_PASSWORD=aB3xK9&zQmP7'") + + def test_smtp_password_yaml(self): + # Real TP from [C]docker-stack.yml — long hex token + assert _matches( + "GENERIC_PASSWORD", + " - SMTP_PASSWORD=fake-smtp-pass-aB3xK9zQmP7sT5vW", + ) + + def test_db_password_yaml(self): + assert _matches("GENERIC_PASSWORD", " - DB_PASSWORD=aB3xK9&zQmP7") + + def test_bare_password_with_symbols(self): + assert _matches("GENERIC_PASSWORD", "password: MyStr0ngP@ss123") + + def test_quoted_password_value(self): + assert _matches("GENERIC_PASSWORD", "password: 'realpass123abc'") + + def test_env_assignment_no_prefix(self): + assert _matches("GENERIC_PASSWORD", "PASSWORD=ActualSecret123") + + def test_postgres_password_in_docker_compose(self): + assert _matches("GENERIC_PASSWORD", "- PASSWORD_POSTGRES=strongP@ss456") + + def test_guia_instalacao_redis(self): + # Real TP from [C]guia-instalacao-evo-crm.md + assert _matches( + "GENERIC_PASSWORD", + "REDIS_PASSWORD=SenhaRedis123!", + ) + + +# --------------------------------------------------------------------------- +# JWT_TOKEN — false positives (must NOT flag) +# --------------------------------------------------------------------------- + +class TestJwtTokenFalsePositives: + """Non-JWT strings that start with 'ey' embedded in larger words or filenames.""" + + def test_image_filename_with_ey_substring(self): + # Reported FP: analytics_diagnostic.json — image asset name + assert not _matches( + "JWT_TOKEN", + '"name": "Wagner Ziegelmeyer-79_1.9108.jpg"', + ) + + def test_email_with_ey_prefix_domain(self): + # Reported FP: agendor-ALL-deals.json — email address + assert not _matches( + "JWT_TOKEN", + '"email": "eduardo@halleyshair.com.br"', + ) + + def test_another_email_ey_prefix(self): + assert not _matches("JWT_TOKEN", '"email": "eyne@someplace.com.br"') + + def test_short_ey_base64_snippet(self): + # Three-segment but each segment too short + assert not _matches("JWT_TOKEN", "eywo.text.volume") + + def test_ey_inside_word(self): + assert not _matches("JWT_TOKEN", "halleyshair.com.br") + + +# --------------------------------------------------------------------------- +# JWT_TOKEN — true positives (MUST flag) +# --------------------------------------------------------------------------- + +class TestJwtTokenTruePositives: + + def test_standard_jwt(self): + # HS256 JWT with real segment lengths + jwt = ( + "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9" + ".eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ" + ".SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c" + ) + assert _matches("JWT_TOKEN", f"Authorization: Bearer {jwt}") + + def test_jwt_at_start_of_value(self): + jwt = ( + "eyJhbGciOiJIUzI1NiJ9" + ".eyJpZCI6MTIzLCJlbWFpbCI6InRlc3RAZXhhbXBsZS5jb20ifQ" + ".abc123def456ghi789jkl012mno345pqrstu678vwx" + ) + assert _matches("JWT_TOKEN", f"token: {jwt}") + + +# --------------------------------------------------------------------------- +# FERNET_KEY — false positives (must NOT flag) +# --------------------------------------------------------------------------- + +class TestFernetKeyFalsePositives: + """Base64 tokens that match the 43+= length but are NOT standalone Fernet keys.""" + + def test_recaptcha_sxtoken_embedded_in_longer_base64(self): + # Reported FP: recaptcha submit-test stub JSONs + # The 43-char sequence 'bCbl1keLuCjIogQYpfImI8F52ozRMjeCCt34oj8RRrQ=' is + # the TAIL of a longer encrypted blob ending in '==' + line = '"sxtoken": "U2FsdGVkX19PgbqpuD7W4+U58R5p74bCbl1keLuCjIogQYpfImI8F52ozRMjeCCt34oj8RRrQ=="' + assert not _matches("FERNET_KEY", line) + + def test_standard_base64_substring(self): + # A 43-char base64 sequence followed by = that is part of a longer value + # e.g. padding == at end of longer blob + line = '"value": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr3456789=="' + assert not _matches("FERNET_KEY", line) + + def test_base64_data_uri(self): + # 43-char sequence inside a data:image/... URI (preceded by base64 chars) + line = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgABCDEFGHIJKLMNOPQRSTUVWXY43=" + assert not _matches("FERNET_KEY", line) + + +# --------------------------------------------------------------------------- +# FERNET_KEY — true positives (MUST flag) +# --------------------------------------------------------------------------- + +class TestFernetKeyTruePositives: + + def test_encryption_key_yaml_assignment(self): + # Real TP from [C]docker-stack.yml + assert _matches( + "FERNET_KEY", + " - ENCRYPTION_KEY=fAKEfernetKey000aB3xK9zQmP2sT5vW8yA1cD4fG7h=", + ) + + def test_fernet_key_env_var(self): + assert _matches("FERNET_KEY", "FERNET_KEY=abcdefghijklmnopqrstuvwxyzABCDEF12345678901=") + + def test_quoted_fernet_key(self): + # A valid Fernet key is exactly 44 chars (43 base64url chars + '=') + fernet_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890abcdefg=" + assert len(fernet_key) == 44, f"fixture key must be 44 chars, got {len(fernet_key)}" + assert _matches("FERNET_KEY", f"encryption_key = '{fernet_key}'") + + +# --------------------------------------------------------------------------- +# AWS_SECRET_KEY — false positives (must NOT flag) +# --------------------------------------------------------------------------- + +class TestAwsSecretKeyFalsePositives: + """AWS-like patterns inside base64 image data (data URIs in lighthouse JSON).""" + + def test_base64_image_data_awsu_substring(self): + # Reported FP: lh_int_posfix_2026-05-18.json line 110 + # 'AwSu' appears inside a base64-encoded image blob (preceded by 't') + line = "e/g8fy0QxOekJxcd2SlUrFod9cJCrKlikpWoA7tidoq6Fgd6GtAwSuI2DiivR654fCAw5v8ACExsvLzHUfT7Mlx2M8P+zXnGnCiQpO7m7/Dd7f" + assert not _matches("AWS_SECRET_KEY", line) + + def test_base64_image_awsm_substring(self): + # Reported FP: lh_nac_posfix_2026-05-18.json + # 'AWSm' inside a long base64 image (preceded by base64 chars) + line = "xR9F4VKj8tZpAWSm4TrQlNs6dLUvYbWGcPqJeHiOzXwCfDkMnBaRsToYpWqAzGoon3Fd8Ke" + assert not _matches("AWS_SECRET_KEY", line) + + +# --------------------------------------------------------------------------- +# AWS_SECRET_KEY — true positives (MUST flag) +# --------------------------------------------------------------------------- + +class TestAwsSecretKeyTruePositives: + + def test_aws_secret_access_key_assignment(self): + assert _matches( + "AWS_SECRET_KEY", + "aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + ) + + def test_aws_secret_env_var(self): + assert _matches( + "AWS_SECRET_KEY", + "AWS_SECRET=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + ) + + def test_aws_secret_yaml(self): + assert _matches( + "AWS_SECRET_KEY", + " aws_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + ) + + +# --------------------------------------------------------------------------- +# Integration: scan_files against fixture directories +# --------------------------------------------------------------------------- + +@pytest.fixture +def fp_fixture_dir(tmp_path: Path) -> Path: + """Create temp files that represent the known FP classes.""" + (tmp_path / "dataforseo_client.py").write_text( + "def __init__(self, login=None, password=None):\n" + ' self.password = password or os.environ.get("DATAFORSEO_PASSWORD", "")\n', + encoding="utf-8", + ) + (tmp_path / "evolution_go_mta.py").write_text( + 'password = os.environ.get("WEBSHARE_PROXY_PASSWORD")\n' + '"password": password,\n', + encoding="utf-8", + ) + (tmp_path / "analytics_diagnostic.json").write_text( + '"name": "Wagner Ziegelmeyer-79_1.9108.jpg"\n' + '"email": "eduardo@halleyshair.com.br"\n', + encoding="utf-8", + ) + sxtoken_value = ( + "U2FsdGVkX19PgbqpuD7W4+U58R5p74bCbl1keLuCjIogQYpfImI8F52ozRMjeCCt34oj8RRrQ==" + ) + (tmp_path / "submit-tests-stub.json").write_text( + f'{{"sxtoken": "{sxtoken_value}"}}\n', + encoding="utf-8", + ) + # AWS FP: 'AwSu' inside base64 image blob (preceded by 't', a base64 char) + (tmp_path / "lighthouse.json").write_text( + "e/g8fy0QxOekJxcd2SlUrFod9cJCrKlikpWoA7tidoq6Fgd6GtAwSuI2DiivR654fCAw5v8ACExsvLzHUfT7Mlx2M8P+zXnGnCiQpO7m7/Dd7f\n", + encoding="utf-8", + ) + return tmp_path + + +@pytest.fixture +def tp_fixture_dir(tmp_path: Path) -> Path: + """Create temp files that represent known true positives (real secrets).""" + (tmp_path / "docker-stack.yml").write_text( + " - 'POSTGRES_PASSWORD=aB3xK9&zQmP7'\n" + " - ENCRYPTION_KEY=fAKEfernetKey000aB3xK9zQmP2sT5vW8yA1cD4fG7h=\n" + " - postgres://user:aB3xK9&zQmP7@db:5432/appdb\n" + " - SMTP_PASSWORD=fake-smtp-pass-aB3xK9zQmP7sT5vW\n", + encoding="utf-8", + ) + jwt = ( + "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9" + ".eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ" + ".SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c" + ) + (tmp_path / "config.yml").write_text( + f"jwt_secret: {jwt}\n" + "aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\n", + encoding="utf-8", + ) + return tmp_path + + +@pytest.fixture +def cloudflare_bindings_fixture_dir(tmp_path: Path) -> Path: + """Cloudflare Worker config with ALL_CAPS_SNAKE binding references — must scan clean.""" + (tmp_path / "wrangler-config.md").write_text( + "# Cloudflare Worker config (binding-name references)\n" + "meta_capi_token_secret: 'ANALYTICS_API_TOKEN_PROD'\n" + "crm_webhook_token_secret: 'CRM_WEBHOOK_TOKEN_PROD'\n" + "meta_capi_token_secret: 'ANALYTICS_API_TOKEN_STAGING'\n", + encoding="utf-8", + ) + return tmp_path + + +def test_cloudflare_bindings_scan_clean(cloudflare_bindings_fixture_dir: Path) -> None: + """ALL_CAPS_SNAKE binding names in Cloudflare Worker configs must produce zero findings.""" + files = list(cloudflare_bindings_fixture_dir.rglob("*")) + findings = scan_files(files) + assert findings == [], ( + f"Expected 0 findings in Cloudflare bindings fixture, got {len(findings)}:\n" + + "\n".join(f" {f['file']}:{f['line']} [{f['pattern']}] {f['snippet']}" for f in findings) + ) + + +def test_fp_fixtures_scan_clean(fp_fixture_dir: Path) -> None: + """All known false-positive files must produce zero findings.""" + files = list(fp_fixture_dir.rglob("*")) + findings = scan_files(files) + assert findings == [], ( + f"Expected 0 findings in FP fixtures, got {len(findings)}:\n" + + "\n".join(f" {f['file']}:{f['line']} [{f['pattern']}] {f['snippet']}" for f in findings) + ) + + +def test_tp_fixtures_still_flagged(tp_fixture_dir: Path) -> None: + """True-positive fixture files must produce at least one finding each.""" + files = list(tp_fixture_dir.rglob("*")) + findings = scan_files(files) + flagged_files = {Path(f["file"]).name for f in findings} + + assert "docker-stack.yml" in flagged_files, ( + "docker-stack.yml (GENERIC_PASSWORD + FERNET_KEY + DATABASE_URL) must be flagged" + ) + assert "config.yml" in flagged_files, ( + "config.yml (JWT_TOKEN + AWS_SECRET_KEY) must be flagged" + ) + + +# --------------------------------------------------------------------------- +# Round 3 — var-ref (${VAR}) and documentation-placeholder false positives. +# After redacting real secrets out to .env, docs reference ${VAR}; install +# guides carry placeholders like 'SUA_SENHA'. Neither is a credential. +# --------------------------------------------------------------------------- + +_VARREF_FP_LINES = [ + "url = postgresql://postgres:${APP_POSTGRES_PASSWORD_URLENC}@h:5432/db", + " - 'POSTGRES_PASSWORD=${APP_POSTGRES_PASSWORD}'", + "password=${SOME_VAR}", + "secret: ${BOT_RUNTIME_SECRET}", +] + +_PLACEHOLDER_FP_LINES = [ + "POSTGRES_PASSWORD='SUA_SENHA_POSTGRES'", + "REDIS_PASSWORD=SUA_SENHA_REDIS", + "PROCESSOR_POSTGRES_CONNECTION_STRING=postgresql://postgres:SUA_SENHA@host:5432/db", + "exemplo: 'POSTGRES_PASSWORD=senha&especial'", + "password=CHANGEME_NOW", + "api_key=YOUR_API_KEY_HERE", +] + +# Correctly-shaped real secrets — must STAY flagged (no weakening). +_REAL_TP_LINES = [ + "url=postgresql://postgres:aB3xK9zQmP@host/db", + '"password=L1l9rXyWgMb7"', + "ENCRYPTION_KEY=aB3xK9zQmP2sT5vW8yA1cD4fG7hJ0kL3nP6rU9wX2Zq=", + "secret = aB3xK9zQmP2sT5vW8yA1cD4f", +] + + +def _scan_line(tmp_path: Path, line: str) -> list[dict]: + f = tmp_path / "x.yml" + f.write_text(line, encoding="utf-8") + return scan_files([f]) + + +@pytest.mark.parametrize("line", _VARREF_FP_LINES) +def test_varref_is_not_a_secret(tmp_path: Path, line: str) -> None: + assert _scan_line(tmp_path, line) == [], f"${{VAR}} reference flagged as secret: {line!r}" + + +@pytest.mark.parametrize("line", _PLACEHOLDER_FP_LINES) +def test_placeholder_is_not_a_secret(tmp_path: Path, line: str) -> None: + assert _scan_line(tmp_path, line) == [], f"Placeholder flagged as secret: {line!r}" + + +@pytest.mark.parametrize("line", _REAL_TP_LINES) +def test_real_secret_still_flagged(tmp_path: Path, line: str) -> None: + assert len(_scan_line(tmp_path, line)) > 0, f"Real secret no longer detected: {line!r}" diff --git a/uv.lock b/uv.lock index 69360e19..178a4402 100644 --- a/uv.lock +++ b/uv.lock @@ -589,7 +589,7 @@ wheels = [ [[package]] name = "evo-nexus" -version = "0.32.2" +version = "0.32.3" source = { virtual = "." } dependencies = [ { name = "alembic" },