Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions dashboard/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,16 @@ def auth_middleware():
# Pass the app instance explicitly to avoid the circular `from app import app`
# that triggered "Flask app is not registered with this 'SQLAlchemy' instance"
# on every boot, leaving auto-sync permanently off.
try:
# Clear any orphaned sync lock left by a previous process killed mid-sync
# (e.g. restart during a watcher sync) BEFORE starting the watcher — else
# every enqueue fails the `WHERE sync_in_progress=0` guard and auto-sync
# stays dead for up to JOB_STALE_SECONDS until the janitor reclaims it.
from brain_repo.job_runner import reclaim_orphaned_locks_on_startup
reclaim_orphaned_locks_on_startup(app)
except Exception:
pass # best-effort; the janitor still reclaims stale locks on its schedule

try:
from brain_repo.watcher import start_brain_watcher
start_brain_watcher(WORKSPACE, flask_app=app)
Expand Down
94 changes: 93 additions & 1 deletion dashboard/backend/brain_repo/job_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@
# instead of racing.
_job_lock = threading.Lock()

# Trailing-run coalescing: when enqueue_sync is called while a job is already
# running, we can't start a second pipeline — but we must not silently drop the
# request either (files written during that window would never be mirrored).
# Instead we set _rerun_requested[user_id] = True. run_sync_pipeline checks
# this flag in its finally block, and if set, enqueues exactly ONE additional
# trailing run. N concurrent requests during one job → 1 trailing run, not N.
# Protected by _job_lock so the flag check + clear + re-enqueue is atomic.
_rerun_requested: dict[int, bool] = {}

# Active kind when _job_lock is held — routed to BrainRepoConfig.sync_job_kind
# so the UI shows "Sync in progress", "Creating milestone", or "Initializing
# brain repo" without needing a separate field.
Expand Down Expand Up @@ -231,6 +240,17 @@ def _ignore(src_dir: str, names: list[str]) -> list[str]:
ignored: list[str] = []
src_dir_path = Path(src_dir)
for n in names:
# Never copy .gitignore files from source watched paths into the
# brain repo. A nested .gitignore with wildcard rules (e.g.
# workspace/marketing/_state/.gitignore containing "*") would be
# honoured by the brain repo's own git, silently excluding the
# very content we want to back up. The brain repo has its own
# root-level .gitignore that filters secrets/build artefacts; we
# must not let source-tree .gitignore files override it.
if n == ".gitignore":
ignored.append(n)
continue

full = src_dir_path / n
try:
rel = full.resolve().relative_to(workspace_root).as_posix()
Expand Down Expand Up @@ -352,6 +372,11 @@ def run_sync_pipeline(
Called from a daemon thread. Never raises — all errors funnel into
_release_db_lock(error=...) so the UI gets a status and the lock
always releases.

After releasing the DB lock, checks _rerun_requested[user_id]. If set
(meaning one or more enqueue_sync calls arrived while this job was
running), clears the flag and enqueues exactly one trailing sync so the
final disk state is always reflected in the brain repo.
"""
with _job_lock:
# The DB lock is already set by enqueue_sync before the thread
Expand Down Expand Up @@ -418,6 +443,29 @@ def run_sync_pipeline(
finally:
_release_db_lock(flask_app, user_id, success=success, error=error)

# Trailing-run check — runs inside _job_lock so the pop + enqueue is
# atomic against concurrent enqueue_sync calls. The DB lock has been
# released above (by _release_db_lock in finally), so _acquire_db_lock
# inside enqueue_sync can succeed.
rerun = _rerun_requested.pop(user_id, False)
if rerun:
log.info(
"job_runner %s: trailing run requested, re-enqueueing for user %s",
kind, user_id,
)
# Spawn the trailing thread while still holding _job_lock so no
# other enqueue can sneak in between pop and the new acquire.
# The new thread will block on _job_lock itself and start only
# after this with-block exits.
t = threading.Thread(
target=run_sync_pipeline,
args=(flask_app, user_id, workspace),
kwargs={"kind": JOB_KIND_WATCHER, "commit_message": "auto: trailing watcher sync"},
name=f"brain-repo-trailing-{user_id}",
daemon=True,
)
t.start()


def run_bootstrap_pipeline(
flask_app,
Expand Down Expand Up @@ -540,8 +588,21 @@ def enqueue_sync(
tag_name: str | None = None,
commit_message: str | None = None,
) -> bool:
"""Spawn a daemon thread running run_sync_pipeline. Returns False if busy."""
"""Spawn a daemon thread running run_sync_pipeline. Returns False if busy.

When a job is already running (returns False), sets _rerun_requested so
that run_sync_pipeline will enqueue exactly one trailing run after the
current job finishes — guaranteeing consistency even when files are
written during a busy window.
"""
if not _acquire_db_lock(flask_app, user_id, kind):
# Coalescing: N concurrent misses during one job → 1 trailing run.
with _job_lock:
_rerun_requested[user_id] = True
log.debug(
"enqueue_sync: job already running for user %s, trailing run requested",
user_id,
)
return False

t = threading.Thread(
Expand Down Expand Up @@ -600,6 +661,37 @@ def request_cancel(flask_app, user_id: int) -> bool:
return rows == 1


def reclaim_orphaned_locks_on_startup(flask_app) -> int:
    """Release every held sync lock at process startup, regardless of its age.

    Intended to run once when the process boots, before any sync has been
    started by this process. Under that premise a row still flagged
    ``sync_in_progress=True`` is residue from a previous process that was
    killed mid-sync (e.g. a service restart during a watcher sync); left
    alone it keeps auto-sync dead for up to JOB_STALE_SECONDS (20 min) until
    the janitor reclaims it. Counterpart of git_ops._clear_stale_lock, which
    covers the .git/index.lock case.

    Args:
        flask_app: Flask application whose app context wraps the DB access.

    Returns:
        The number of lock rows that were cleared (0 when none were held).
    """
    from models import BrainRepoConfig, db  # type: ignore[import]

    with flask_app.app_context():
        # No age filter on purpose: every held lock is considered orphaned.
        orphaned = BrainRepoConfig.query.filter(
            BrainRepoConfig.sync_in_progress == True,  # noqa: E712
        ).all()

        for row in orphaned:
            row.sync_in_progress = False
            row.sync_started_at = None
            row.sync_job_kind = None
            row.cancel_requested = False
            row.last_error = "orphaned lock cleared at startup (process restart during sync)"

        cleared = len(orphaned)
        if not cleared:
            # Nothing was modified, so skip the commit and the warning.
            return 0

        db.session.commit()
        log.warning("job_runner: cleared %d orphaned sync lock(s) at startup", cleared)
        return cleared


def reclaim_stale_locks(flask_app) -> int:
"""Release sync_in_progress rows older than JOB_STALE_SECONDS.

Expand Down
63 changes: 58 additions & 5 deletions dashboard/backend/brain_repo/secrets_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,26 @@
# (name, pattern) — minimum 20 patterns
PATTERNS: list[tuple[str, str]] = [
("AWS_ACCESS_KEY", r"AKIA[0-9A-Z]{16}"),
("AWS_SECRET_KEY", r"(?i)aws.{0,20}[0-9a-zA-Z/+]{40}"),
# Negative lookbehind prevents matching 'aws' embedded inside base64 image data.
("AWS_SECRET_KEY", r"(?i)(?<![A-Za-z0-9+/])aws.{0,25}[0-9a-zA-Z/+]{40}"),
("GITHUB_TOKEN", r"gh[pousr]_[A-Za-z0-9_]{36,255}"),
("ANTHROPIC_API_KEY", r"sk-ant-api[0-9]{2}-[A-Za-z0-9_\-]{93,}AA"),
("OPENAI_API_KEY", r"sk-[a-zA-Z0-9]{20,}T3BlbkFJ[a-zA-Z0-9]{20,}"),
("OPENAI_PROJECT_KEY", r"sk-proj-[A-Za-z0-9_\-]{40,}"),
("GENERIC_SECRET", r"(?i)(secret|api_key|private_key|access_token|auth_token)\s*[=:]\s*[\"']?[A-Za-z0-9_\-]{20,}[\"']?"),
("JWT_TOKEN", r"ey[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+"),
# Positive lookahead on the value (case-sensitive via (?-i:...)):
# Only match when the value contains at least one lowercase letter or entropy
# character (+, /, =). This excludes pure ALL_CAPS_UNDERSCORE identifiers such as
# 'SERVICE_API_TOKEN_PROD' or 'WEBHOOK_TOKEN_PROD' — those are
# binding/env-var NAMES used in Cloudflare Worker wrangler config and similar
# tooling, not credential values. Real secrets (base64/hex/Fernet/JWT) always
# contain at least one lowercase letter or one of the base64 symbols '+', '/', '='.
# (?-i:...) disables the outer case-insensitive flag for this lookahead only so
# that [a-z] matches literal lowercase characters, not A-Z.
("GENERIC_SECRET", r"(?i)(secret|api_key|private_key|access_token|auth_token)\s*[=:]\s*[\"']?(?=(?-i:[A-Za-z0-9_\-]*[a-z+/=]))[A-Za-z0-9_\-]{20,}[\"']?"),
# Negative lookbehind prevents matching 'ey' embedded inside a longer word
# (e.g. 'halleyshair', 'Ziegelmeyer-79'). Minimum segment lengths exclude
# image filenames (eyer…1.jpg) and short domain labels.
("JWT_TOKEN", r"(?<![A-Za-z0-9_\-])ey[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{20,}"),
("SSH_PRIVATE_KEY", r"-----BEGIN (?:RSA|EC|OPENSSH) PRIVATE KEY-----"),
("STRIPE_KEY", r"(?:sk|pk)_(?:live|test)_[0-9a-zA-Z]{24,}"),
("SENDGRID_KEY", r"SG\.[A-Za-z0-9_\-]{22}\.[A-Za-z0-9_\-]{43}"),
Expand All @@ -27,8 +40,17 @@
("DIGITALOCEAN_TOKEN", r"dop_v1_[a-f0-9]{64}"),
("HEROKU_KEY", r"(?i)heroku.{0,20}[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"),
("DATABASE_URL_WITH_PASSWORD", r"(?:postgres|mysql|mongodb)(?:ql)?://[^:]+:[^@]{6,}@"),
("FERNET_KEY", r"[A-Za-z0-9_\-]{43}="),
("GENERIC_PASSWORD", r"(?i)password\s*[=:]\s*[\"']?[A-Za-z0-9!@#$%^&*()_+\-=\[\]{};':\"\\|,.<>/?]{8,}[\"']?"),
# Negative lookbehind requires the 44-char key NOT to be a substring embedded
# inside a longer base64 string (e.g. recaptcha sxtoken, encrypted blobs).
# Negative lookahead requires the trailing '=' to end the value.
("FERNET_KEY", r"(?<![A-Za-z0-9+/])[A-Za-z0-9_\-]{43}=(?![A-Za-z0-9_\-=])"),
# Require the value to be either a quoted string literal OR a bare value
# that contains at least one digit/symbol (entropy indicator). This excludes:
# - 'password = os.environ.get(...)' (function call, no digit/symbol before paren)
# - 'password=None' / 'password=password' (pure-alpha variable names)
# - argument definitions: '--password' help strings
# password[_suffix] form matches env-var names like POSTGRES_PASSWORD, SMTP_PASSWORD.
("GENERIC_PASSWORD", r"(?i)password[a-z_0-9]*\s*[=:]\s*(?:[\"'][A-Za-z0-9!@#$%^&*_+\-]{8,}[\"']|(?=[^\s\"']*[0-9!@#$%^&*_+\-][^\s\"']*)[A-Za-z0-9!@#$%^&*_+\-]{8,})"),
]

_CHECKED_EXTENSIONS = {
Expand All @@ -40,6 +62,35 @@
_AUTO_EXCLUDE_SUFFIXES = {".pyc"}


# A value that is purely an env-var interpolation (e.g. ${DB_PASSWORD}) is a
# REFERENCE, never a credential — common after redacting secrets out to .env.
_VAR_REF_RE = re.compile(r"\$\{[A-Za-z_][A-Za-z0-9_]*\}")

# Documentation placeholders that look password-shaped but carry no secret.
# Each token is specific enough (8+ chars / unambiguous marker) that a real
# base64/hex/Fernet secret won't contain it by chance. Keep this list tight —
# every entry must be a phrase no genuine credential would embed.
_PLACEHOLDER_RE = re.compile(
r"(?:SUA_SENHA|SUA_SEGREDO|SEU_SENHA|SEU_SEGREDO|SEU_TOKEN|"
r"YOUR_|YOUR-|CHANGE[_-]?ME|senha&especial)",
re.IGNORECASE,
)


def _is_false_positive(match_text: str) -> bool:
"""True when a regex match is a known non-secret (var ref or placeholder).

Applied after every pattern match so the FP rules live in one place instead
of being smeared across every individual regex. Safe by construction: real
secrets are neither ``${VAR}`` interpolations nor documentation placeholders.
"""
if _VAR_REF_RE.search(match_text):
return True
if _PLACEHOLDER_RE.search(match_text):
return True
return False


def _mask_match(match_text: str) -> str:
"""Mask a secret match: show first 4 + '***' + last 4 chars."""
if len(match_text) <= 8:
Expand Down Expand Up @@ -83,6 +134,8 @@ def scan_files(files: list[Path]) -> list[dict]:
for name, regex in compiled:
m = regex.search(line)
Comment on lines 134 to 135

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚨 issue (security): Single search per line can drop real secrets when the first match is a false positive.

Because we only do a single regex.search(line) per pattern, if that first match is treated as a false positive we skip the rest of the line and may miss a real secret later in the same line.

Consider iterating all matches with for m in regex.finditer(line): and running _is_false_positive inside that loop, adding findings for each non–false-positive match. This ensures multiple secrets on one line (or secrets after placeholders) are all detected.

if m:
if _is_false_positive(m.group(0)):
continue
findings.append({
"file": str(filepath),
"line": lineno,
Expand Down
Empty file.
130 changes: 130 additions & 0 deletions dashboard/backend/brain_repo/tests/test_ignore_gitignore_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""Tests for build_ignore_callback — .gitignore suppression behaviour.

Verifies that:
1. The ignore callback reports ".gitignore" as ignored whenever it appears
in ``names``, whatever the directory.
2. Other legitimate files and directories are NOT excluded by this rule.
3. A simulated copytree over a tree containing workspace/marketing/_state/
does NOT copy the inner .gitignore but DOES copy the state files under it.
"""
from __future__ import annotations

import shutil
import tempfile
import unittest
from pathlib import Path


class TestIgnoreGitignoreFiles(unittest.TestCase):
    """Exercise the rule that nested .gitignore files are never copied."""

    def setUp(self):
        import importlib
        import dashboard.backend.brain_repo.job_runner as jr_module

        # Keep a handle on the module and reload it before every test.
        self.jr = jr_module
        importlib.reload(jr_module)

    # ────────────────────────────────────────────────────────────
    # Fixtures
    # ────────────────────────────────────────────────────────────

    def _make_workspace(self):
        """Create a temporary on-disk workspace and schedule its removal."""
        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        return Path(tmp_dir)

    def _make_state_dir(self, root):
        """Create and return ``<root>/workspace/marketing/_state``."""
        state = root.joinpath("workspace", "marketing", "_state")
        state.mkdir(parents=True, exist_ok=True)
        return state

    # ────────────────────────────────────────────────────────────
    # Unit tests for the callback
    # ────────────────────────────────────────────────────────────

    def test_gitignore_is_ignored_at_root(self):
        """build_ignore_callback must report '.gitignore' as ignored."""
        root = self._make_workspace()
        ignore = self.jr.build_ignore_callback(root)

        # Simulate the names listed at the workspace root.
        ignored = ignore(str(root), [".gitignore", "README.md", "data.yaml"])

        self.assertIn(".gitignore", ignored)

    def test_gitignore_is_ignored_in_nested_dir(self):
        """The rule applies in any subdirectory, not only at the root."""
        root = self._make_workspace()
        state = self._make_state_dir(root)
        ignore = self.jr.build_ignore_callback(root)

        ignored = ignore(
            str(state),
            [".gitignore", "snapshot-2026-05.json", "README.md"],
        )

        self.assertIn(".gitignore", ignored)

    def test_regular_files_not_ignored_by_gitignore_rule(self):
        """Ordinary files must NOT be excluded by the .gitignore rule."""
        root = self._make_workspace()
        state = self._make_state_dir(root)

        # The file must really exist — per the original note, so that the
        # callback's full.is_file() check returns True.
        state.joinpath("snapshot.json").write_text('{"ok": true}')

        ignore = self.jr.build_ignore_callback(root)
        ignored = ignore(str(state), [".gitignore", "snapshot.json"])

        self.assertIn(".gitignore", ignored)
        self.assertNotIn("snapshot.json", ignored)

    def test_only_gitignore_name_excluded_not_gitkeep(self):
        """'.gitkeep' and other .git* names besides '.gitignore' must NOT be excluded by the rule."""
        root = self._make_workspace()
        ignore = self.jr.build_ignore_callback(root)

        ignored = ignore(str(root), [".gitignore", ".gitkeep", ".gitattributes"])

        self.assertIn(".gitignore", ignored)
        # .gitkeep and .gitattributes are legitimate versioning files.
        for kept in (".gitkeep", ".gitattributes"):
            self.assertNotIn(kept, ignored)

    # ────────────────────────────────────────────────────────────
    # Integration test — simulated copytree
    # ────────────────────────────────────────────────────────────

    def test_copytree_does_not_copy_gitignore_but_copies_state_files(self):
        """copytree with the callback must NOT copy .gitignore and MUST copy the _state files."""
        src_root = self._make_workspace()
        dst_root = self._make_workspace()

        # Build the structure: workspace/marketing/_state/
        src_state = self._make_state_dir(src_root)
        fixtures = {
            # .gitignore with a wildcard that would exclude everything
            ".gitignore": "*\n!.gitignore\n!README.md\n",
            "README.md": "# State\n",
            "snapshot-2026-05.json": '{"ads": []}',
            "checkpoint.yaml": "step: 3\n",
        }
        for filename, content in fixtures.items():
            (src_state / filename).write_text(content)

        ignore = self.jr.build_ignore_callback(src_root)
        dst_state = dst_root / "workspace" / "marketing" / "_state"

        shutil.copytree(str(src_state), str(dst_state), ignore=ignore)

        # .gitignore must NOT have been copied.
        self.assertFalse(
            (dst_state / ".gitignore").exists(),
            ".gitignore não deveria ter sido copiado pro brain repo",
        )

        # The state files MUST have been copied.
        for filename in ("snapshot-2026-05.json", "checkpoint.yaml", "README.md"):
            self.assertTrue(
                (dst_state / filename).exists(),
                f"{filename} deveria ter sido copiado",
            )


if __name__ == "__main__":
unittest.main()
Loading