diff --git a/src/adapters/sherlock_runner.py b/src/adapters/sherlock_runner.py
index cb2206f..b3cd9c5 100644
--- a/src/adapters/sherlock_runner.py
+++ b/src/adapters/sherlock_runner.py
@@ -14,6 +14,7 @@
from __future__ import annotations
import asyncio
+import logging
from typing import Any
from collections.abc import Callable
@@ -25,6 +26,8 @@
from core.config import AppSettings
from core.domain.models import SocialProfile
+logger = logging.getLogger(__name__)
+
def _slug(name: str) -> str:
out = []
@@ -71,7 +74,8 @@ async def run_sherlock_username(
max_concurrency: int,
no_nsfw: bool,
progress_callback: Callable[[int, int, str], None] | None = None,
-) -> list[SocialProfile]:
+) -> tuple[list[SocialProfile], int]:
+ """Run Sherlock checks. Returns (found_profiles, error_count)."""
sem = asyncio.Semaphore(max(1, max_concurrency))
# Rate limiter por dominio
@@ -187,8 +191,9 @@ async def check(site_name: str, info: dict[str, Any], username: str) -> SocialPr
bio=html_meta.get("meta_description"),
image_url=html_meta.get("og_image"),
)
- except Exception:
- return None
+ except Exception as exc:
+ logger.debug("Sherlock check failed for %s on %s: %s", username, site_name, exc)
+ return exc # Return exception to count it
tasks: list[asyncio.Future[SocialProfile | None]] = []
task_labels: dict[asyncio.Future[SocialProfile | None], str] = {}
@@ -201,6 +206,7 @@ async def check(site_name: str, info: dict[str, Any], username: str) -> SocialPr
completed = 0
found: list[SocialProfile] = []
+ error_count = 0
for t in asyncio.as_completed(tasks):
r = await t
completed += 1
@@ -210,7 +216,12 @@ async def check(site_name: str, info: dict[str, Any], username: str) -> SocialPr
except Exception:
# Nunca dejar que la UI rompa el scanning.
pass
- if r is not None:
+ if isinstance(r, Exception):
+ error_count += 1
+ elif r is not None:
found.append(r)
- return found
+ if error_count:
+ logger.info("Sherlock scan completed: %d found, %d errors out of %d checks.", len(found), error_count, total)
+
+ return found, error_count
diff --git a/src/adapters/site_lists/runner.py b/src/adapters/site_lists/runner.py
index 3ec6941..0f6ea7e 100644
--- a/src/adapters/site_lists/runner.py
+++ b/src/adapters/site_lists/runner.py
@@ -13,6 +13,7 @@
from __future__ import annotations
import asyncio
+import logging
from typing import Any
from adapters.http_client import build_async_client
@@ -26,6 +27,8 @@
from core.config import AppSettings
from core.domain.models import SocialProfile
+logger = logging.getLogger(__name__)
+
def _slug(name: str) -> str:
out = []
@@ -66,7 +69,8 @@ async def run_username_sites(
max_concurrency: int,
categories: set[str] | None,
no_nsfw: bool,
-) -> list[SocialProfile]:
+) -> tuple[list[SocialProfile], int]:
+ """Run username site checks. Returns (found_profiles, error_count)."""
semaphore = asyncio.Semaphore(max(1, max_concurrency))
# Rate limiter por dominio
@@ -127,13 +131,17 @@ async def check(site: UsernameSite, username: str) -> SocialProfile | None:
bio=html_meta.get("meta_description"),
image_url=html_meta.get("og_image"),
)
- except Exception:
- # Errores: para masivo preferimos no contaminar con cientos de errores.
- return None
+ except Exception as exc:
+ logger.debug("Site-list check failed for %s on %s: %s", username, site.name, exc)
+ return exc
results = await asyncio.gather(*(check(s, username) for s in filtered for username in usernames), return_exceptions=False)
- return [r for r in results if r is not None]
+ error_count = sum(1 for r in results if isinstance(r, Exception))
+ found = [r for r in results if isinstance(r, SocialProfile)]
+ if error_count:
+ logger.info("Username site-list scan: %d found, %d errors.", len(found), error_count)
+ return found, error_count
async def run_email_sites(
@@ -144,7 +152,8 @@ async def run_email_sites(
max_concurrency: int,
categories: set[str] | None,
no_nsfw: bool,
-) -> list[SocialProfile]:
+) -> tuple[list[SocialProfile], int]:
+ """Run email site checks. Returns (found_profiles, error_count)."""
semaphore = asyncio.Semaphore(max(1, max_concurrency))
# Rate limiter por dominio
@@ -213,9 +222,14 @@ async def check(site: EmailSite, email: str) -> SocialProfile | None:
bio=html_meta.get("meta_description"),
image_url=html_meta.get("og_image"),
)
- except Exception:
- return None
+ except Exception as exc:
+ logger.debug("Email site-list check failed for %s on %s: %s", email, site.name, exc)
+ return exc
results = await asyncio.gather(*(check(s, email) for s in filtered for email in emails), return_exceptions=False)
- return [r for r in results if r is not None]
+ error_count = sum(1 for r in results if isinstance(r, Exception))
+ found = [r for r in results if isinstance(r, SocialProfile)]
+ if error_count:
+ logger.info("Email site-list scan: %d found, %d errors.", len(found), error_count)
+ return found, error_count
diff --git a/src/core/services/identity_pipeline.py b/src/core/services/identity_pipeline.py
index cf83a79..fcafc81 100644
--- a/src/core/services/identity_pipeline.py
+++ b/src/core/services/identity_pipeline.py
@@ -10,6 +10,7 @@
from __future__ import annotations
import asyncio
+import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Iterable, Sequence
@@ -54,6 +55,8 @@
from core.domain.models import PersonEntity, SocialProfile
from core.resources_loader import get_default_list_path, load_sherlock_data
+logger = logging.getLogger(__name__)
+
@dataclass
class SiteListOptions:
@@ -98,6 +101,7 @@ class PipelineResult:
usernames: list[str]
emails: list[str]
warnings: list[str] = field(default_factory=list)
+ scan_errors: int = 0
_USERNAME_SCANNERS = (
@@ -226,6 +230,7 @@ async def hunt(
email_scanners = [scanner() for scanner in _EMAIL_SCANNERS]
profiles: list[SocialProfile] = []
+ total_scan_errors: int = 0
all_usernames = set(usernames)
all_emails = set(emails)
scanned_usernames: set[str] = set()
@@ -250,7 +255,8 @@ async def safe_scan(
if derived_from and isinstance(profile.metadata, dict):
profile.metadata = {**profile.metadata, "derived_from": derived_from}
return collected
- except Exception as exc: # pragma: no cover - defensive fallback
+ except Exception as exc:
+ logger.debug("Scanner %s failed for %s: %s", name, value, exc)
fallback_url = f"https://{network}.com/{value}"
if network == "x":
fallback_url = f"https://x.com/{value}"
@@ -364,16 +370,16 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str
hooks.warning(message)
else:
sites_file = load_username_sites(username_path)
- profiles.extend(
- await run_username_sites(
- usernames=usernames,
- sites=sites_file.sites,
- settings=settings,
- max_concurrency=max_concurrency,
- categories=request.site_lists.categories,
- no_nsfw=no_nsfw_effective,
- )
+ site_profiles, site_errors = await run_username_sites(
+ usernames=usernames,
+ sites=sites_file.sites,
+ settings=settings,
+ max_concurrency=max_concurrency,
+ categories=request.site_lists.categories,
+ no_nsfw=no_nsfw_effective,
)
+ profiles.extend(site_profiles)
+ total_scan_errors += site_errors
if emails:
email_path = request.site_lists.email_path
@@ -388,16 +394,16 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str
hooks.warning(message)
else:
sites_file = load_email_sites(email_path)
- profiles.extend(
- await run_email_sites(
- emails=emails,
- sites=sites_file.sites,
- settings=settings,
- max_concurrency=max_concurrency,
- categories=request.site_lists.categories,
- no_nsfw=no_nsfw_effective,
- )
+ email_site_profiles, email_site_errors = await run_email_sites(
+ emails=emails,
+ sites=sites_file.sites,
+ settings=settings,
+ max_concurrency=max_concurrency,
+ categories=request.site_lists.categories,
+ no_nsfw=no_nsfw_effective,
)
+ profiles.extend(email_site_profiles)
+ total_scan_errors += email_site_errors
if request.use_sherlock and usernames:
manifest = request.sherlock_manifest or load_sherlock_data(refresh=False)
@@ -415,16 +421,16 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str
hooks.sherlock_start(total)
progress_cb = hooks.sherlock_progress if total else None
- profiles.extend(
- await run_sherlock_username(
- usernames=usernames,
- manifest=manifest,
- settings=settings,
- max_concurrency=max_concurrency,
- no_nsfw=no_nsfw_effective,
- progress_callback=progress_cb,
- )
+ sherlock_profiles, sherlock_errors = await run_sherlock_username(
+ usernames=usernames,
+ manifest=manifest,
+ settings=settings,
+ max_concurrency=max_concurrency,
+ no_nsfw=no_nsfw_effective,
+ progress_callback=progress_cb,
)
+ profiles.extend(sherlock_profiles)
+ total_scan_errors += sherlock_errors
profiles = dedupe_profiles(profiles)
@@ -461,11 +467,27 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str
person = PersonEntity(target=target_label, profiles=profiles)
+ # Count errors from safe_scan fallback profiles
+ for p in profiles:
+ if isinstance(p.metadata, dict) and p.metadata.get("error"):
+ total_scan_errors += 1
+
+ if total_scan_errors:
+ msg = (
+ f"{total_scan_errors} scanner(s) returned errors "
+ f"(timeouts, SSL, 5xx, etc.). Results may be incomplete."
+ )
+ warnings.append(msg)
+ if hooks.warning:
+ hooks.warning(msg)
+ logger.info("Scan completed with %d errors.", total_scan_errors)
+
return PipelineResult(
person=person,
usernames=usernames,
emails=emails,
warnings=warnings,
+ scan_errors=total_scan_errors,
)
diff --git a/tests/test_agent_engine_loop.py b/tests/test_agent_engine_loop.py
new file mode 100644
index 0000000..cc1ad44
--- /dev/null
+++ b/tests/test_agent_engine_loop.py
@@ -0,0 +1,265 @@
+"""Tests for AgentEngine.run() with fully mocked LLM client (issue #32).
+
+Covers:
+- Single tool call → report flow
+- Max steps respected
+- Forced report generation when steps exhausted
+- LLM error handling
+- on_step callback invocations
+"""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from core.config import AppSettings
+from core.services.agent_engine import AgentEngine, AgentStep
+
+
+def _make_settings() -> AppSettings:  # AppSettings populated with placeholder AI credentials for the tests
+    return AppSettings(
+        ai_api_key="test-key-123",  # dummy key
+        ai_base_url="https://fake.api.local",  # dummy base URL
+        ai_model="test-model",
+    )
+
+
+def _tool_call(*, name: str, arguments: dict, call_id: str = "call_1"):
+    """Return a MagicMock tool call exposing ``function.name``, ``function.arguments`` (JSON text) and ``id``."""
+    tc = MagicMock()
+    tc.function.name = name
+    tc.function.arguments = json.dumps(arguments)  # stored as a JSON string, not as the dict itself
+    tc.id = call_id
+    return tc
+
+
+def _assistant_message(*, tool_calls=None, content=None):
+    """Return a MagicMock assistant message whose ``model_dump()`` yields the equivalent plain dict."""
+    msg = MagicMock()
+    msg.tool_calls = tool_calls
+    msg.content = content
+    msg.model_dump.return_value = {
+        "role": "assistant",
+        "content": content,
+        "tool_calls": [
+            {
+                "id": tc.id,
+                "type": "function",
+                "function": {"name": tc.function.name, "arguments": tc.function.arguments},
+            }
+            for tc in (tool_calls or [])
+        ] or None,  # no tool calls (None or empty) is dumped as None rather than []
+    }
+    return msg
+
+
+def _chat_response(*, message):
+    """Return a MagicMock completion response whose ``choices`` holds one choice carrying ``message``."""
+    choice = MagicMock()
+    choice.message = message
+    resp = MagicMock()
+    resp.choices = [choice]  # exactly one choice
+    return resp
+
+
+# ---------------------------------------------------------------------------
+# Single tool call → report
+# ---------------------------------------------------------------------------
+
+class TestSingleToolCallToReport:
+ @pytest.mark.asyncio
+ async def test_scan_then_report(self):
+ """LLM calls scan_username, then generate_report → finished_naturally=True."""
+
+ # Step 1: LLM wants to call scan_username
+ scan_call = _tool_call(
+ name="scan_username",
+ arguments={"username": "testuser"},
+ call_id="call_scan",
+ )
+ scan_msg = _assistant_message(tool_calls=[scan_call])
+ scan_response = _chat_response(message=scan_msg)
+
+ # Step 2: LLM calls generate_report
+ report_call = _tool_call(
+ name="generate_report",
+ arguments={
+ "summary": "## 1. Identity\nTest analysis",
+ "highlights": ["Found on GitHub"],
+ "confidence": 0.8,
+ },
+ call_id="call_report",
+ )
+ report_msg = _assistant_message(tool_calls=[report_call])
+ report_response = _chat_response(message=report_msg)
+
+ mock_client = AsyncMock()
+ mock_client.chat.completions.create = AsyncMock(
+ side_effect=[scan_response, report_response]
+ )
+
+ # Mock execute_tool to return scan results
+ scan_result = json.dumps({
+ "target": "testuser",
+ "total_scanned": 1,
+ "confirmed": 1,
+ "profiles": [{"network": "github", "username": "testuser", "exists": True, "url": "https://github.com/testuser"}],
+ })
+
+ with patch("core.services.agent_engine.AsyncOpenAI", return_value=mock_client), \
+ patch("core.services.agent_engine.execute_tool", AsyncMock(return_value=scan_result)):
+ engine = AgentEngine(settings=_make_settings())
+ result = await engine.run("investigate testuser", max_steps=5)
+
+ assert result.finished_naturally is True
+ assert result.total_steps >= 2
+
+
+# ---------------------------------------------------------------------------
+# Max steps respected
+# ---------------------------------------------------------------------------
+
+class TestMaxStepsRespected:
+ @pytest.mark.asyncio
+ async def test_stops_after_max_steps(self):
+ """Engine should stop after max_steps even if LLM keeps calling tools."""
+
+ # LLM always wants to call scan_username (never calls generate_report)
+ scan_call = _tool_call(
+ name="scan_username",
+ arguments={"username": "user"},
+ call_id="call_1",
+ )
+ scan_msg = _assistant_message(tool_calls=[scan_call])
+ scan_response = _chat_response(message=scan_msg)
+
+ # For forced report: LLM returns text instead of tool call
+ text_msg = _assistant_message(content="Final analysis summary.")
+ text_response = _chat_response(message=text_msg)
+
+ mock_client = AsyncMock()
+ # 3 scan responses + 1 forced report attempt
+ mock_client.chat.completions.create = AsyncMock(
+ side_effect=[scan_response, scan_response, scan_response, text_response]
+ )
+
+ scan_result = json.dumps({
+ "target": "user",
+ "profiles": [{"network": "github", "username": "user", "exists": True, "url": "https://github.com/user"}],
+ })
+
+ with patch("core.services.agent_engine.AsyncOpenAI", return_value=mock_client), \
+ patch("core.services.agent_engine.execute_tool", AsyncMock(return_value=scan_result)):
+ engine = AgentEngine(settings=_make_settings())
+ result = await engine.run("investigate user", max_steps=3)
+
+ assert result.total_steps <= 4 # 3 steps + possible forced report
+
+
+# ---------------------------------------------------------------------------
+# LLM error handling
+# ---------------------------------------------------------------------------
+
+class TestLLMErrorHandling:
+ @pytest.mark.asyncio
+ async def test_llm_error_breaks_loop(self):
+ """If the LLM call raises, the loop should break with error recorded."""
+
+ mock_client = AsyncMock()
+ mock_client.chat.completions.create = AsyncMock(
+ side_effect=Exception("API connection failed")
+ )
+
+ with patch("core.services.agent_engine.AsyncOpenAI", return_value=mock_client):
+ engine = AgentEngine(settings=_make_settings())
+ result = await engine.run("investigate user", max_steps=5)
+
+ assert result.total_steps >= 1
+ # The first step should have recorded the error
+ error_step = result.steps[0]
+ assert error_step.reasoning is not None
+ assert "LLM error" in error_step.reasoning
+ assert result.finished_naturally is False
+
+
+# ---------------------------------------------------------------------------
+# on_step callback
+# ---------------------------------------------------------------------------
+
+class TestOnStepCallbackInLoop:
+    @pytest.mark.asyncio
+    async def test_callback_called_for_each_step(self):
+        """on_step should fire while the loop runs; only a lower bound of one call is asserted."""
+        captured_steps: list[AgentStep] = []
+
+        # First LLM call returns plain text, the second raises (to end the run quickly)
+        text_msg = _assistant_message(content="Thinking...")
+        text_response = _chat_response(message=text_msg)
+
+        mock_client = AsyncMock()
+        mock_client.chat.completions.create = AsyncMock(
+            side_effect=[text_response, Exception("done")]
+        )
+
+        with patch("core.services.agent_engine.AsyncOpenAI", return_value=mock_client):
+            engine = AgentEngine(
+                settings=_make_settings(),
+                on_step=lambda s: captured_steps.append(s),
+            )
+            await engine.run("investigate user", max_steps=3)
+
+        # At least 1 step should have triggered the callback (exact count is not checked)
+        assert len(captured_steps) >= 1
+
+
+# ---------------------------------------------------------------------------
+# Forced report generation
+# ---------------------------------------------------------------------------
+
+class TestForcedReport:
+ @pytest.mark.asyncio
+ async def test_forced_report_when_profiles_collected(self):
+ """When max_steps exhausted with collected profiles, engine forces report."""
+
+ # Step 1: LLM calls scan_username
+ scan_call = _tool_call(
+ name="scan_username",
+ arguments={"username": "user"},
+ call_id="call_1",
+ )
+ scan_msg = _assistant_message(tool_calls=[scan_call])
+ scan_response = _chat_response(message=scan_msg)
+
+ # Forced report: LLM calls generate_report
+ report_call = _tool_call(
+ name="generate_report",
+ arguments={
+ "summary": "Forced analysis",
+ "highlights": ["Found"],
+ "confidence": 0.5,
+ },
+ call_id="call_forced",
+ )
+ report_msg = _assistant_message(tool_calls=[report_call])
+ forced_response = _chat_response(message=report_msg)
+
+ mock_client = AsyncMock()
+ mock_client.chat.completions.create = AsyncMock(
+ side_effect=[scan_response, forced_response]
+ )
+
+ scan_result = json.dumps({
+ "target": "user",
+ "profiles": [{"network": "github", "username": "user", "exists": True, "url": "https://github.com/user"}],
+ })
+
+ with patch("core.services.agent_engine.AsyncOpenAI", return_value=mock_client), \
+ patch("core.services.agent_engine.execute_tool", AsyncMock(return_value=scan_result)):
+ engine = AgentEngine(settings=_make_settings())
+ result = await engine.run("investigate user", max_steps=1)
+
+ # Should have generated a report even though max_steps was 1
+ assert result.person is not None
diff --git a/tests/test_agent_tools_execution.py b/tests/test_agent_tools_execution.py
new file mode 100644
index 0000000..fa3142c
--- /dev/null
+++ b/tests/test_agent_tools_execution.py
@@ -0,0 +1,289 @@
+"""Tests for execute_tool() dispatch logic (issue #32).
+
+Covers:
+- scan_username dispatch
+- scan_email dispatch
+- breach_check disabled/enabled
+- fetch_url with mocked HTTP
+- fetch_url invalid scheme
+- generate_report echo
+- Unknown tool error
+"""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from core.config import AppSettings
+from core.domain.models import PersonEntity, SocialProfile
+from core.services.agent_tools import execute_tool
+from core.services.identity_pipeline import PipelineResult
+
+
+def _settings() -> AppSettings:  # AppSettings built with no overrides
+    return AppSettings()
+
+
+def _pipeline_result(profiles: list[SocialProfile] | None = None) -> PipelineResult:
+    profs = profiles or [  # None and an empty list both fall back to this single GitHub profile
+        SocialProfile(
+            url="https://github.com/testuser",
+            username="testuser",
+            network_name="github",
+            exists=True,
+            metadata={"source": "test"},
+        ),
+    ]
+    return PipelineResult(
+        person=PersonEntity(target="test", profiles=profs),
+        usernames=["testuser"],
+        emails=[],
+    )  # remaining PipelineResult fields are left at their defaults
+
+
+# ---------------------------------------------------------------------------
+# scan_username
+# ---------------------------------------------------------------------------
+
+class TestExecuteToolScanUsername:
+ @pytest.mark.asyncio
+ async def test_returns_json_with_profiles(self):
+ mock_scan = AsyncMock(return_value=_pipeline_result())
+
+ with patch("core.services.agent_tools.scan_username", mock_scan):
+ result = await execute_tool(
+ "scan_username",
+ {"username": "testuser"},
+ settings=_settings(),
+ )
+
+ data = json.loads(result)
+ assert data["target"] == "testuser"
+ assert "profiles" in data
+ assert data["confirmed"] >= 1
+
+ @pytest.mark.asyncio
+ async def test_empty_username_returns_error(self):
+ result = await execute_tool(
+ "scan_username",
+ {"username": ""},
+ settings=_settings(),
+ )
+ data = json.loads(result)
+ assert "error" in data
+
+
+# ---------------------------------------------------------------------------
+# scan_email
+# ---------------------------------------------------------------------------
+
+class TestExecuteToolScanEmail:
+ @pytest.mark.asyncio
+ async def test_returns_json_with_profiles(self):
+ mock_scan = AsyncMock(return_value=_pipeline_result())
+
+ with patch("core.services.agent_tools.scan_email", mock_scan):
+ result = await execute_tool(
+ "scan_email",
+ {"email": "test@test.com"},
+ settings=_settings(),
+ )
+
+ data = json.loads(result)
+ assert data["target"] == "test@test.com"
+
+ @pytest.mark.asyncio
+ async def test_empty_email_returns_error(self):
+ result = await execute_tool(
+ "scan_email",
+ {"email": ""},
+ settings=_settings(),
+ )
+ data = json.loads(result)
+ assert "error" in data
+
+
+# ---------------------------------------------------------------------------
+# breach_check
+# ---------------------------------------------------------------------------
+
+class TestExecuteToolBreachCheck:
+ @pytest.mark.asyncio
+ async def test_disabled_returns_error(self):
+ result = await execute_tool(
+ "breach_check",
+ {"email": "test@test.com"},
+ settings=_settings(),
+ enable_breach_check=False,
+ )
+ data = json.loads(result)
+ assert "error" in data
+ assert "disabled" in data["error"].lower()
+
+ @pytest.mark.asyncio
+ async def test_enabled_returns_results(self):
+ breach_profiles = [
+ SocialProfile(
+ url="https://haveibeenpwned.com/test@test.com",
+ username="test@test.com",
+ network_name="hibp",
+ exists=True,
+ metadata={"breaches": {"breach1": {"date": "2020-01-01"}}},
+ )
+ ]
+ mock_breach = MagicMock(return_value=breach_profiles)
+
+ with patch("core.services.agent_tools.enrich_profiles_with_breach_data", mock_breach):
+ result = await execute_tool(
+ "breach_check",
+ {"email": "test@test.com"},
+ settings=_settings(),
+ enable_breach_check=True,
+ )
+
+ data = json.loads(result)
+ assert data["target"] == "test@test.com"
+ assert "results" in data
+
+ @pytest.mark.asyncio
+ async def test_empty_email_returns_error(self):
+ result = await execute_tool(
+ "breach_check",
+ {"email": ""},
+ settings=_settings(),
+ enable_breach_check=True,
+ )
+ data = json.loads(result)
+ assert "error" in data
+
+
+# ---------------------------------------------------------------------------
+# fetch_url
+# ---------------------------------------------------------------------------
+
+class TestExecuteToolFetchUrl:  # fetch_url dispatch with the HTTP client factory patched out
+    @pytest.mark.asyncio
+    async def test_successful_fetch(self):  # a 200 response is surfaced through "status_code"
+        import httpx
+        from contextlib import asynccontextmanager
+
+        resp = MagicMock(spec=httpx.Response)
+        resp.status_code = 200
+        resp.text = "<html><head><title>Test Page</title></head></html>"  # NOTE(review): markup reconstructed after extraction damage - confirm
+        resp.url = httpx.URL("https://example.com")
+
+        @asynccontextmanager
+        async def mock_client_cm(*args, **kwargs):
+            client = AsyncMock()
+            client.get = AsyncMock(return_value=resp)
+            yield client
+
+        with patch("adapters.http_client.build_async_client", mock_client_cm):  # the factory itself is replaced
+            result = await execute_tool(
+                "fetch_url",
+                {"url": "https://example.com"},
+                settings=_settings(),
+            )
+
+        data = json.loads(result)
+        assert data["status_code"] == 200
+        assert "title" in data or "error" not in data
+
+    @pytest.mark.asyncio
+    async def test_prepends_https(self):
+        """URLs without scheme get https:// prepended."""
+        import httpx
+        from contextlib import asynccontextmanager
+
+        resp = MagicMock(spec=httpx.Response)
+        resp.status_code = 200
+        resp.text = "Test"
+        resp.url = httpx.URL("https://example.com")
+
+        @asynccontextmanager
+        async def mock_client_cm(*args, **kwargs):
+            client = AsyncMock()
+            client.get = AsyncMock(return_value=resp)
+            yield client
+
+        with patch("adapters.http_client.build_async_client", mock_client_cm):
+            result = await execute_tool(
+                "fetch_url",
+                {"url": "example.com"},
+                settings=_settings(),
+            )
+
+        data = json.loads(result)
+        assert "error" not in data  # only the absence of an error is checked, not the final URL
+
+    @pytest.mark.asyncio
+    async def test_empty_url_returns_error(self):  # no HTTP patching here: the empty URL must be rejected
+        result = await execute_tool(
+            "fetch_url",
+            {"url": ""},
+            settings=_settings(),
+        )
+        data = json.loads(result)
+        assert "error" in data
+
+    @pytest.mark.asyncio
+    async def test_http_error_status(self):  # a 500 response must yield an error that mentions the status
+        import httpx
+        from contextlib import asynccontextmanager
+
+        resp = MagicMock(spec=httpx.Response)
+        resp.status_code = 500
+        resp.url = httpx.URL("https://example.com")
+
+        @asynccontextmanager
+        async def mock_client_cm(*args, **kwargs):
+            client = AsyncMock()
+            client.get = AsyncMock(return_value=resp)
+            yield client
+
+        with patch("adapters.http_client.build_async_client", mock_client_cm):
+            result = await execute_tool(
+                "fetch_url",
+                {"url": "https://example.com"},
+                settings=_settings(),
+            )
+
+        data = json.loads(result)
+        assert "error" in data
+        assert "500" in data["error"]
+
+
+# ---------------------------------------------------------------------------
+# generate_report
+# ---------------------------------------------------------------------------
+
+class TestExecuteToolGenerateReport:
+ @pytest.mark.asyncio
+ async def test_echo_response(self):
+ result = await execute_tool(
+ "generate_report",
+ {"summary": "test", "highlights": ["a"], "confidence": 0.9},
+ settings=_settings(),
+ )
+ data = json.loads(result)
+ assert data["status"] == "report_generated"
+
+
+# ---------------------------------------------------------------------------
+# Unknown tool
+# ---------------------------------------------------------------------------
+
+class TestExecuteToolUnknown:
+ @pytest.mark.asyncio
+ async def test_unknown_tool_returns_error(self):
+ result = await execute_tool(
+ "nonexistent_tool",
+ {},
+ settings=_settings(),
+ )
+ data = json.loads(result)
+ assert "error" in data
+ assert "Unknown tool" in data["error"]
diff --git a/tests/test_hunt_pipeline.py b/tests/test_hunt_pipeline.py
new file mode 100644
index 0000000..66e44f0
--- /dev/null
+++ b/tests/test_hunt_pipeline.py
@@ -0,0 +1,312 @@
+"""Tests for the hunt() pipeline orchestration (issue #32).
+
+Covers:
+- Expansion loop: discovers new emails/usernames from scan results
+- Loop termination when nothing new is found
+- Sherlock integration path
+- Site-list integration path
+- Deduplication
+- Breach check integration
+- Hooks (warning callbacks)
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from core.config import AppSettings
+from core.domain.models import SocialProfile
+from core.services.identity_pipeline import (
+ HuntRequest,
+ PipelineHooks,
+ SiteListOptions,
+ hunt,
+)
+
+
+def _profile(*, network: str, username: str, exists: bool = True, **extra_meta) -> SocialProfile:
+    return SocialProfile(  # URL is derived as https://<network>.com/<username>
+        url=f"https://{network}.com/{username}",
+        username=username,
+        network_name=network,
+        exists=exists,
+        metadata={"source": "test", **extra_meta},  # extra_meta is merged last, so it can override "source"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Expansion loop
+# ---------------------------------------------------------------------------
+
+class TestExpansionLoop:
+    """Verify that hunt() discovers new emails/usernames from scan results
+    and re-scans them in subsequent rounds."""
+
+    @pytest.mark.asyncio
+    async def test_expansion_discovers_new_usernames(self):
+        """When a scanner result contains other_users, those are scanned
+        in the next round."""
+        round_counter = {"count": 0}
+
+        class FakeScanner:
+            """Adds other_users when scanning "primary" during the first 20 scan calls."""
+            async def scan(self, value: str):
+                round_counter["count"] += 1  # counts every scan() call, across all values
+                meta = {"source": "test"}
+                if value == "primary" and round_counter["count"] <= 20:
+                    meta["other_users"] = ["discovered_user"]
+                return SocialProfile(
+                    url=f"https://fake.com/{value}",
+                    username=value,
+                    network_name="fake",
+                    exists=True,
+                    metadata=meta,
+                )
+
+        with patch(
+            "core.services.identity_pipeline._USERNAME_SCANNERS",
+            (type(FakeScanner()),),  # a tuple of scanner classes, not instances
+        ), patch(
+            "core.services.identity_pipeline._EMAIL_SCANNERS",
+            (),
+        ):
+            settings = AppSettings()
+            request = HuntRequest(
+                usernames=["primary"],
+                emails=[],
+                scan_localpart=False,
+                use_sherlock=False,
+            )
+            result = await hunt(settings=settings, request=request)
+
+        # Should have scanned both "primary" and "discovered_user"
+        scanned_users = {p.username for p in result.person.profiles}
+        assert "primary" in scanned_users
+        assert "discovered_user" in scanned_users
+
+    @pytest.mark.asyncio
+    async def test_expansion_terminates_when_nothing_new(self):
+        """The loop should terminate when no new usernames/emails are found."""
+
+        class StableScanner:
+            async def scan(self, value: str):
+                return SocialProfile(
+                    url=f"https://stable.com/{value}",
+                    username=value,
+                    network_name="stable",
+                    exists=True,
+                    metadata={"source": "test"},  # no other_users key, so nothing new is advertised
+                )
+
+        with patch(
+            "core.services.identity_pipeline._USERNAME_SCANNERS",
+            (type(StableScanner()),),
+        ), patch(
+            "core.services.identity_pipeline._EMAIL_SCANNERS",
+            (),
+        ):
+            settings = AppSettings()
+            request = HuntRequest(
+                usernames=["user1"],
+                emails=[],
+                scan_localpart=False,
+                use_sherlock=False,
+            )
+            result = await hunt(settings=settings, request=request)
+
+        # Should have exactly 1 profile — no expansion happened
+        assert len(result.person.profiles) == 1
+
+
+# ---------------------------------------------------------------------------
+# Sherlock integration
+# ---------------------------------------------------------------------------
+
+class TestSherlockIntegration:
+ @pytest.mark.asyncio
+ async def test_sherlock_called_when_enabled(self):
+ # Detect calling convention: development does tuple unpacking
+ # (sherlock_profiles, sherlock_errors = ...), main does
+ # profiles.extend(await run_sherlock_username(...)).
+ import inspect
+ _hunt_src = inspect.getsource(hunt)
+ _uses_tuple = "sherlock_profiles, sherlock_errors" in _hunt_src
+
+ sherlock_profiles = [_profile(network="reddit", username="testuser")]
+ mock_sherlock = AsyncMock(
+ return_value=(sherlock_profiles, 0) if _uses_tuple else sherlock_profiles,
+ )
+
+ class EmptyScanner:
+ async def scan(self, value: str):
+ return SocialProfile(
+ url=f"https://empty.com/{value}",
+ username=value,
+ network_name="empty",
+ exists=False,
+ metadata={},
+ )
+
+ with patch(
+ "core.services.identity_pipeline._USERNAME_SCANNERS",
+ (type(EmptyScanner()),),
+ ), patch(
+ "core.services.identity_pipeline._EMAIL_SCANNERS",
+ (),
+ ), patch(
+ "core.services.identity_pipeline.run_sherlock_username",
+ mock_sherlock,
+ ), patch(
+ "core.services.identity_pipeline.load_sherlock_data",
+ return_value={"TestSite": {"url": "http://test/{}", "errorType": "status_code"}},
+ ):
+ settings = AppSettings()
+ request = HuntRequest(
+ usernames=["testuser"],
+ emails=[],
+ scan_localpart=False,
+ use_sherlock=True,
+ )
+ result = await hunt(settings=settings, request=request)
+
+ mock_sherlock.assert_called_once()
+ # Sherlock profile should be in results
+ networks = {p.network_name for p in result.person.profiles}
+ assert "reddit" in networks
+
+
+# ---------------------------------------------------------------------------
+# Site-list integration
+# ---------------------------------------------------------------------------
+
+class TestSiteListIntegration:
+ @pytest.mark.asyncio
+ async def test_warning_when_path_missing(self):
+ """When site-list path doesn't exist, a warning is emitted."""
+ warnings_received = []
+
+ class EmptyScanner:
+ async def scan(self, value: str):
+ return SocialProfile(
+ url=f"https://e.com/{value}",
+ username=value,
+ network_name="e",
+ exists=False,
+ metadata={},
+ )
+
+ with patch(
+ "core.services.identity_pipeline._USERNAME_SCANNERS",
+ (type(EmptyScanner()),),
+ ), patch(
+ "core.services.identity_pipeline._EMAIL_SCANNERS",
+ (),
+ ), patch(
+ "core.services.identity_pipeline.get_default_list_path",
+ return_value=None,
+ ):
+ settings = AppSettings()
+ hooks = PipelineHooks(
+ warning=lambda msg: warnings_received.append(msg),
+ )
+ request = HuntRequest(
+ usernames=["user"],
+ emails=[],
+ scan_localpart=False,
+ use_sherlock=False,
+ site_lists=SiteListOptions(
+ enabled=True,
+ username_path=Path("/nonexistent/path.json"),
+ ),
+ )
+ await hunt(settings=settings, request=request, hooks=hooks)
+
+ assert len(warnings_received) >= 1
+ assert "not configured" in warnings_received[0].lower() or "missing" in warnings_received[0].lower()
+
+
+# ---------------------------------------------------------------------------
+# Breach check integration
+# ---------------------------------------------------------------------------
+
+class TestBreachCheckIntegration:
+ @pytest.mark.asyncio
+ async def test_breach_check_called_when_enabled(self):
+ """When use_breach_check=True, enrich_profiles_with_breach_data is called."""
+
+ breach_profile = _profile(network="hibp", username="test@test.com")
+ mock_breach = MagicMock(return_value=[breach_profile])
+
+ class EmptyScanner:
+ async def scan(self, value: str):
+ return SocialProfile(
+ url=f"https://e.com/{value}",
+ username=value,
+ network_name="e",
+ exists=False,
+ metadata={},
+ )
+
+ with patch(
+ "core.services.identity_pipeline._USERNAME_SCANNERS",
+ (),
+ ), patch(
+ "core.services.identity_pipeline._EMAIL_SCANNERS",
+ (type(EmptyScanner()),),
+ ), patch(
+ "adapters.breach_check.enrich_profiles_with_breach_data",
+ mock_breach,
+ ):
+ settings = AppSettings()
+ request = HuntRequest(
+ usernames=[],
+ emails=["test@test.com"],
+ scan_localpart=False,
+ use_sherlock=False,
+ use_breach_check=True,
+ )
+ await hunt(settings=settings, request=request)
+
+ mock_breach.assert_called_once_with(emails=["test@test.com"])
+
+
+# ---------------------------------------------------------------------------
+# Deduplication in pipeline
+# ---------------------------------------------------------------------------
+
+class TestPipelineDeduplication:
+ @pytest.mark.asyncio
+ async def test_duplicate_profiles_are_removed(self):
+ """If two scanners return the same profile, hunt() deduplicates."""
+
+ class DuplicateScanner:
+ async def scan(self, value: str):
+ return SocialProfile(
+ url="https://github.com/user",
+ username="user",
+ network_name="github",
+ exists=True,
+ metadata={},
+ )
+
+ with patch(
+ "core.services.identity_pipeline._USERNAME_SCANNERS",
+ (type(DuplicateScanner()), type(DuplicateScanner())),
+ ), patch(
+ "core.services.identity_pipeline._EMAIL_SCANNERS",
+ (),
+ ):
+ settings = AppSettings()
+ request = HuntRequest(
+ usernames=["user"],
+ emails=[],
+ scan_localpart=False,
+ use_sherlock=False,
+ )
+ result = await hunt(settings=settings, request=request)
+
+ github_profiles = [p for p in result.person.profiles if p.network_name == "github"]
+ assert len(github_profiles) == 1
diff --git a/tests/test_osint_scanners.py b/tests/test_osint_scanners.py
new file mode 100644
index 0000000..d20ccc8
--- /dev/null
+++ b/tests/test_osint_scanners.py
@@ -0,0 +1,253 @@
+"""Tests for OSINT scanners with mocked HTTP (issue #32).
+
+Covers positive/negative match detection and metadata extraction for
+representative scanners: X, GitLab, Keybase, Telegram.
+
+GitHub and Reddit rely on specific_scrapers, so they are tested by mocking
+their deep fetch functions.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+from contextlib import asynccontextmanager
+
+import pytest
+import httpx
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _mock_response(*, status_code: int = 200, text: str = "", url: str = "https://example.com", headers: dict | None = None) -> MagicMock:
+    """Build an ``httpx.Response``-shaped mock whose ``json()`` returns ``{}``."""
+    fake = MagicMock(spec=httpx.Response)
+    fake.status_code, fake.text = status_code, text
+    fake.url = httpx.URL(url)
+    fake.headers = headers or {}
+    fake.json.return_value = {}
+    return fake
+
+
+@asynccontextmanager
+async def _mock_client(response: MagicMock):
+    """Yield a stand-in AsyncClient whose ``get``/``post`` return *response*."""
+    fake_client = AsyncMock()
+    for verb in ("get", "post"):
+        setattr(fake_client, verb, AsyncMock(return_value=response))
+    yield fake_client
+
+
+# ---------------------------------------------------------------------------
+# X (Twitter) Scanner
+# ---------------------------------------------------------------------------
+
+class TestXScanner:
+    @pytest.mark.asyncio
+    async def test_exists_on_200(self):
+        """An HTTP 200 reply is reported as an existing ``x`` profile."""
+        from adapters.osint_sources.x import XScanner
+
+        ok_reply = _mock_response(status_code=200, url="https://x.com/testuser")
+        with patch("adapters.osint_sources.x.build_async_client", return_value=_mock_client(ok_reply)):
+            found = await XScanner().scan("testuser")
+
+        assert found.exists is True
+        assert found.network_name == "x"
+        assert found.username == "testuser"
+
+    @pytest.mark.asyncio
+    async def test_not_exists_on_404(self):
+        """An HTTP 404 reply is reported as a missing profile."""
+        from adapters.osint_sources.x import XScanner
+
+        missing_reply = _mock_response(status_code=404, url="https://x.com/nonexistent")
+        with patch("adapters.osint_sources.x.build_async_client", return_value=_mock_client(missing_reply)):
+            found = await XScanner().scan("nonexistent")
+
+        assert found.exists is False
+
+
+# ---------------------------------------------------------------------------
+# GitLab Scanner
+# ---------------------------------------------------------------------------
+
+class TestGitLabScanner:
+    @pytest.mark.asyncio
+    async def test_exists_on_200_extracts_name(self):
+        """HTTP 200 marks the profile as existing and fills ``metadata['name']``."""
+        from adapters.osint_sources.gitlab import GitLabScanner
+
+        # Assumes the scanner takes the display name from <title> — TODO confirm.
+        html = "<html><head><title>John Doe · GitLab</title></head></html>"
+        resp = _mock_response(status_code=200, text=html, url="https://gitlab.com/johndoe")
+        with patch("adapters.osint_sources.gitlab.build_async_client", return_value=_mock_client(resp)):
+            profile = await GitLabScanner().scan("johndoe")
+
+        assert profile.exists is True
+        assert profile.network_name == "gitlab"
+        assert profile.metadata.get("name") == "John Doe"
+
+    @pytest.mark.asyncio
+    async def test_not_exists_on_404(self):
+        from adapters.osint_sources.gitlab import GitLabScanner
+
+        resp = _mock_response(status_code=404, url="https://gitlab.com/nobody")
+        with patch("adapters.osint_sources.gitlab.build_async_client", return_value=_mock_client(resp)):
+            profile = await GitLabScanner().scan("nobody")
+
+        assert profile.exists is False
+
+
+# ---------------------------------------------------------------------------
+# GitHub Scanner (mocks fetch_github_deep)
+# ---------------------------------------------------------------------------
+
+class TestGitHubScanner:
+    @pytest.mark.asyncio
+    async def test_exists_with_api_data(self):
+        """Deep-fetch data is mapped onto the profile's bio, avatar and metadata."""
+        from adapters.osint_sources.github import GitHubScanner
+
+        github_payload = {
+            "login": "octocat",
+            "name": "The Octocat",
+            "bio": "A GitHub mascot",
+            "avatar_url": "https://avatars.githubusercontent.com/u/1",
+            "email": "octocat@github.com",
+            "blog": "https://octocat.dev",
+            "twitter_username": "octocat_tw",
+            "company": "GitHub",
+            "location": "San Francisco",
+        }
+
+        with patch("adapters.osint_sources.github.fetch_github_deep", AsyncMock(return_value=github_payload)):
+            scanner = GitHubScanner()
+            outcome = await scanner.scan("octocat")
+
+        # scan() may hand back one profile or a list of them; in the list
+        # case the first element is the one inspected here.
+        primary = outcome[0] if isinstance(outcome, list) else outcome
+
+        assert primary.exists is True
+        assert primary.network_name == "github"
+        assert primary.bio == "A GitHub mascot"
+        assert primary.image_url == "https://avatars.githubusercontent.com/u/1"
+        # Email and Twitter handle should surface as other_emails / other_users.
+        assert "octocat@github.com" in primary.metadata.get("other_emails", [])
+        assert "octocat_tw" in primary.metadata.get("other_users", [])
+
+    @pytest.mark.asyncio
+    async def test_not_exists(self):
+        """A deep fetch returning None is reported as a missing profile."""
+        from adapters.osint_sources.github import GitHubScanner
+
+        with patch("adapters.osint_sources.github.fetch_github_deep", AsyncMock(return_value=None)):
+            outcome = await GitHubScanner().scan("nonexistent_user_xyz")
+
+        primary = outcome[0] if isinstance(outcome, list) else outcome
+        assert primary.exists is False
+
+
+# ---------------------------------------------------------------------------
+# Reddit Scanner (mocks fetch_reddit_deep)
+# ---------------------------------------------------------------------------
+
+class TestRedditScanner:
+    @pytest.mark.asyncio
+    async def test_exists_with_data(self):
+        """Deep-fetch data marks the account as existing and supplies the bio."""
+        from adapters.osint_sources.reddit import RedditScanner
+
+        reddit_payload = {
+            "public_description": "A redditor",
+            "icon_img": "https://styles.redditmedia.com/icon.png",
+        }
+
+        with patch("adapters.osint_sources.reddit.fetch_reddit_deep", AsyncMock(return_value=reddit_payload)):
+            found = await RedditScanner().scan("testuser")
+
+        assert found.exists is True
+        assert found.network_name == "reddit"
+        assert found.bio == "A redditor"
+
+    @pytest.mark.asyncio
+    async def test_not_exists(self):
+        """A deep fetch returning None is reported as a missing account."""
+        from adapters.osint_sources.reddit import RedditScanner
+
+        with patch("adapters.osint_sources.reddit.fetch_reddit_deep", AsyncMock(return_value=None)):
+            found = await RedditScanner().scan("nobody")
+
+        assert found.exists is False
+
+
+# ---------------------------------------------------------------------------
+# Keybase Scanner
+# ---------------------------------------------------------------------------
+
+class TestKeybaseScanner:
+    @pytest.mark.asyncio
+    async def test_exists_on_200(self):
+        """An HTTP 200 reply is reported as an existing ``keybase`` profile."""
+        from adapters.osint_sources.keybase import KeybaseScanner
+
+        ok_reply = _mock_response(status_code=200, url="https://keybase.io/user1")
+        with patch("adapters.osint_sources.keybase.build_async_client", return_value=_mock_client(ok_reply)):
+            found = await KeybaseScanner().scan("user1")
+
+        assert found.exists is True
+        assert found.network_name == "keybase"
+
+    @pytest.mark.asyncio
+    async def test_not_exists_on_404(self):
+        """An HTTP 404 reply is reported as a missing profile."""
+        from adapters.osint_sources.keybase import KeybaseScanner
+
+        missing_reply = _mock_response(status_code=404, url="https://keybase.io/nobody")
+        with patch("adapters.osint_sources.keybase.build_async_client", return_value=_mock_client(missing_reply)):
+            found = await KeybaseScanner().scan("nobody")
+
+        assert found.exists is False
+
+
+# ---------------------------------------------------------------------------
+# Telegram Scanner
+# ---------------------------------------------------------------------------
+
+class TestTelegramScanner:
+    @pytest.mark.asyncio
+    async def test_exists_when_not_contact_page(self):
+        from adapters.osint_sources.telegram import TelegramScanner
+        # NOTE(review): this fixture appears tag-stripped (only a text node is left) — confirm the intended markup.
+        html = """
+
+
+ Chad Fowler
+
+ """
+
+        resp = _mock_response(status_code=200, text=html, url="https://t.me/chadfowler")
+        with patch("adapters.osint_sources.telegram.build_async_client", return_value=_mock_client(resp)):
+            scanner = TelegramScanner()
+            profile = await scanner.scan("chadfowler")
+
+        assert profile.exists is True
+        assert profile.network_name == "telegram"
+        assert profile.metadata.get("name") == "Chad Fowler"
+
+    @pytest.mark.asyncio
+    async def test_not_exists_when_contact_page(self):
+        from adapters.osint_sources.telegram import TelegramScanner
+        # NOTE(review): this fixture is blank, presumably after tag stripping — confirm the "contact page" markup.
+        html = """
+
+ """
+
+        resp = _mock_response(status_code=200, text=html, url="https://t.me/nobody")
+        with patch("adapters.osint_sources.telegram.build_async_client", return_value=_mock_client(resp)):
+            scanner = TelegramScanner()
+            profile = await scanner.scan("nobody")
+
+        assert profile.exists is False
diff --git a/tests/test_profile_enricher.py b/tests/test_profile_enricher.py
new file mode 100644
index 0000000..c2f80c7
--- /dev/null
+++ b/tests/test_profile_enricher.py
@@ -0,0 +1,158 @@
+"""Tests for profile_enricher with mocked HTTP (issue #32).
+
+Covers:
+- Enriches profile without bio/avatar from HTML metadata
+- Skips non-existing profiles
+- Skips profiles with existing bio
+- Handles HTTP errors gracefully
+"""
+
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+import httpx
+
+from core.config import AppSettings
+from core.domain.models import SocialProfile
+from adapters.profile_enricher import enrich_profiles_from_html
+
+
+def _mock_response(*, status_code: int = 200, text: str = "", url: str = "https://example.com") -> MagicMock:
+    """Return a mock shaped like ``httpx.Response`` with status, text and URL set."""
+    fake = MagicMock(spec=httpx.Response)
+    fake.status_code, fake.text = status_code, text
+    fake.url = httpx.URL(url)
+    return fake
+
+
+@asynccontextmanager
+async def _mock_client_cm(response: MagicMock):
+    """Yield a fake async client whose ``get`` resolves to *response*."""
+    fake_client = AsyncMock(get=AsyncMock(return_value=response))
+    yield fake_client
+
+
+# ---------------------------------------------------------------------------
+# Enriches profiles
+# ---------------------------------------------------------------------------
+
+class TestEnrichProfilesFromHTML:
+    @pytest.mark.asyncio
+    async def test_enriches_profile_without_bio(self):
+        """Profile without bio gets bio from HTML meta description."""
+        html = (  # assumes description / og:image <meta> tags feed the enricher — TODO confirm
+            '<meta name="description" content="A developer">'
+            '<meta property="og:image" content="https://img.com/avatar.jpg">'
+        )
+        resp = _mock_response(status_code=200, text=html, url="https://github.com/user")
+
+        profile = SocialProfile(
+            url="https://github.com/user",
+            username="user",
+            network_name="github",
+            exists=True,
+            metadata={},
+        )
+
+        with patch("adapters.profile_enricher.build_async_client", return_value=_mock_client_cm(resp)):
+            await enrich_profiles_from_html(profiles=[profile], settings=AppSettings())
+
+        assert profile.bio == "A developer"
+        assert profile.image_url == "https://img.com/avatar.jpg"
+
+    @pytest.mark.asyncio
+    async def test_skips_non_existing_profiles(self):
+        """Profiles with exists=False are not fetched."""
+        profile = SocialProfile(
+            url="https://github.com/nobody",
+            username="nobody",
+            network_name="github",
+            exists=False,
+            metadata={},
+        )
+
+        resp = _mock_response()
+
+        with patch("adapters.profile_enricher.build_async_client", return_value=_mock_client_cm(resp)):
+            await enrich_profiles_from_html(
+                profiles=[profile],
+                settings=AppSettings(),
+            )
+
+        # Bio should remain None — the enricher should have skipped it
+        assert profile.bio is None
+
+    @pytest.mark.asyncio
+    async def test_skips_profiles_with_existing_bio(self):
+        """Profiles that already have bio are not re-fetched."""
+        profile = SocialProfile(
+            url="https://github.com/user",
+            username="user",
+            network_name="github",
+            exists=True,
+            metadata={},
+            bio="Already has a bio",
+        )
+
+        resp = _mock_response(
+            text='<meta name="description" content="Replacement bio">',
+        )
+
+        with patch("adapters.profile_enricher.build_async_client", return_value=_mock_client_cm(resp)):
+            await enrich_profiles_from_html(
+                profiles=[profile],
+                settings=AppSettings(),
+            )
+
+        # The page offers a different description; the existing bio must win.
+        assert profile.bio == "Already has a bio"
+
+    @pytest.mark.asyncio
+    async def test_handles_http_error_gracefully(self):
+        """HTTP 500 should not crash the enricher."""
+        profile = SocialProfile(
+            url="https://github.com/user",
+            username="user",
+            network_name="github",
+            exists=True,
+            metadata={},
+        )
+
+        resp = _mock_response(status_code=500)
+
+        with patch("adapters.profile_enricher.build_async_client", return_value=_mock_client_cm(resp)):
+            # Should not raise
+            await enrich_profiles_from_html(
+                profiles=[profile],
+                settings=AppSettings(),
+            )
+
+        assert profile.bio is None
+
+    @pytest.mark.asyncio
+    async def test_handles_exception_gracefully(self):
+        """Network exception should not crash the enricher."""
+        profile = SocialProfile(
+            url="https://github.com/user",
+            username="user",
+            network_name="github",
+            exists=True,
+            metadata={},
+        )
+
+        @asynccontextmanager
+        async def failing_client(*args, **kwargs):
+            client = AsyncMock()
+            client.get = AsyncMock(side_effect=ConnectionError("simulated"))
+            yield client
+
+        with patch("adapters.profile_enricher.build_async_client", return_value=failing_client()):
+            await enrich_profiles_from_html(
+                profiles=[profile],
+                settings=AppSettings(),
+            )
+
+        assert profile.bio is None
diff --git a/tests/test_resources_loader.py b/tests/test_resources_loader.py
new file mode 100644
index 0000000..cbca920
--- /dev/null
+++ b/tests/test_resources_loader.py
@@ -0,0 +1,149 @@
+"""Tests for resources_loader (issue #32).
+
+Covers:
+- load_sherlock_data() cached path (no network)
+- load_sherlock_data() download path (mocked httpx)
+- load_sherlock_data() download failure propagates
+- get_default_list_path() returns existing file
+- get_default_list_path() returns None when missing
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from core.resources_loader import load_sherlock_data, get_default_list_path
+
+
+# ---------------------------------------------------------------------------
+# load_sherlock_data — cached path
+# ---------------------------------------------------------------------------
+
+class TestLoadSherlockCached:
+    def test_loads_from_cache(self, tmp_path: Path):
+        """An existing sherlock.json is returned as-is when refresh=False."""
+        cached_payload = {"TestSite": {"url": "http://test/{}", "errorType": "status_code"}}
+        cache_root = tmp_path / "data"
+        cache_root.mkdir()
+        # Seed the cache so the loader has something to read.
+        (cache_root / "sherlock.json").write_text(json.dumps(cached_payload), encoding="utf-8")
+
+        with patch("core.resources_loader._data_dir", return_value=cache_root):
+            loaded = load_sherlock_data(refresh=False)
+
+        assert loaded == cached_payload
+
+    def test_does_not_call_network_when_cached(self, tmp_path: Path):
+        """Reading from the cache must not trigger ``httpx.get``."""
+        cache_root = tmp_path / "data"
+        cache_root.mkdir()
+        # Minimal valid JSON stands in for a populated cache file.
+        (cache_root / "sherlock.json").write_text("{}", encoding="utf-8")
+
+        http_get = MagicMock()
+        with patch("core.resources_loader._data_dir", return_value=cache_root), \
+                patch("core.resources_loader.httpx.get", http_get):
+            load_sherlock_data(refresh=False)
+
+        http_get.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# load_sherlock_data — download path
+# ---------------------------------------------------------------------------
+
+class TestLoadSherlockDownload:
+    def test_downloads_when_no_cache(self, tmp_path: Path):
+        """With no cache file present the manifest is downloaded and saved."""
+        cache_root = tmp_path / "data"
+        cache_root.mkdir()
+
+        remote_payload = {"DownloadedSite": {"url": "http://dl/{}", "errorType": "status_code"}}
+        # Stand-in for the HTTP reply: status 200, JSON body, no-op raise_for_status.
+        fake_reply = MagicMock(status_code=200)
+        fake_reply.json.return_value = remote_payload
+        fake_reply.raise_for_status = MagicMock()
+
+        with patch("core.resources_loader._data_dir", return_value=cache_root), \
+                patch("core.resources_loader.httpx.get", return_value=fake_reply):
+            loaded = load_sherlock_data(refresh=False)
+
+        assert loaded == remote_payload
+        # The downloaded manifest should now be on disk as the cache file.
+        saved_copy = cache_root / "sherlock.json"
+        assert saved_copy.exists()
+
+    def test_downloads_when_refresh(self, tmp_path: Path):
+        """refresh=True downloads again even though a cache file exists."""
+        cache_root = tmp_path / "data"
+        cache_root.mkdir()
+        # Pre-existing cache content that the refreshed result must not echo.
+        (cache_root / "sherlock.json").write_text('{"old": true}', encoding="utf-8")
+
+        fresh_payload = {"NewSite": {"url": "http://new/{}"}}
+        # Stand-in for the HTTP reply: status 200, JSON body, no-op raise_for_status.
+        fake_reply = MagicMock(status_code=200)
+        fake_reply.json.return_value = fresh_payload
+        fake_reply.raise_for_status = MagicMock()
+
+        with patch("core.resources_loader._data_dir", return_value=cache_root), \
+                patch("core.resources_loader.httpx.get", return_value=fake_reply):
+            loaded = load_sherlock_data(refresh=True)
+
+        assert loaded == fresh_payload
+
+
+# ---------------------------------------------------------------------------
+# load_sherlock_data — download failure
+# ---------------------------------------------------------------------------
+
+class TestLoadSherlockDownloadFailure:
+    def test_download_failure_raises(self, tmp_path: Path):
+        """A failed download surfaces as HTTPStatusError rather than empty data."""
+        import httpx
+        cache_root = tmp_path / "data"
+        cache_root.mkdir()
+
+        server_error = httpx.HTTPStatusError(
+            "Server Error",
+            request=MagicMock(),
+            response=MagicMock(status_code=500),
+        )
+        failing_reply = MagicMock(**{"raise_for_status.side_effect": server_error})
+
+        with patch("core.resources_loader._data_dir", return_value=cache_root), \
+                patch("core.resources_loader.httpx.get", return_value=failing_reply), \
+                pytest.raises(httpx.HTTPStatusError):
+            load_sherlock_data(refresh=False)
+
+
+# ---------------------------------------------------------------------------
+# get_default_list_path
+# ---------------------------------------------------------------------------
+
+class TestGetDefaultListPath:
+    def test_returns_existing_file(self, tmp_path: Path):
+        """A list file that exists on disk is located and returned."""
+        lists_dir = tmp_path / "data"
+        lists_dir.mkdir()
+        # Only the returned path and its existence are asserted below.
+        (lists_dir / "username_sites.json").write_text("[]", encoding="utf-8")
+
+        with patch("core.resources_loader._project_root", return_value=tmp_path), \
+                patch("core.resources_loader.get_user_config_dir", return_value=tmp_path / "config"):
+            located = get_default_list_path("username_sites.json")
+
+        assert located is not None
+        assert located.exists()
+
+    def test_returns_none_when_missing(self, tmp_path: Path):
+        """With no matching file on disk the lookup yields ``None``."""
+        with patch("core.resources_loader._project_root", return_value=tmp_path), \
+                patch("core.resources_loader.get_user_config_dir", return_value=tmp_path / "config"):
+            located = get_default_list_path("nonexistent_file.json")
+
+        assert located is None
diff --git a/tests/test_scanner_error_handling.py b/tests/test_scanner_error_handling.py
new file mode 100644
index 0000000..0b1360f
--- /dev/null
+++ b/tests/test_scanner_error_handling.py
@@ -0,0 +1,185 @@
+"""Tests for scanner error handling and observability (issue #34).
+
+Covers:
+- safe_scan error fallback path (previously # pragma: no cover)
+- Sherlock runner error counting
+- Site-list runner error counting
+- PipelineResult.scan_errors tracking
+"""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+from core.config import AppSettings
+from core.services.identity_pipeline import PipelineResult, hunt, HuntRequest
+from core.domain.models import PersonEntity
+
+
+# ---------------------------------------------------------------------------
+# PipelineResult.scan_errors field
+# ---------------------------------------------------------------------------
+
+class TestPipelineResultScanErrors:
+    def test_default_zero(self):
+        """scan_errors falls back to 0 when it is not passed in."""
+        entity = PersonEntity(target="test")
+        fresh = PipelineResult(person=entity, usernames=["test"], emails=[])
+
+        # Nothing was supplied for scan_errors, so the default applies.
+        assert fresh.scan_errors == 0
+
+    def test_can_set_errors(self):
+        """An explicit scan_errors value is kept on the result."""
+        entity = PersonEntity(target="test")
+        counted = PipelineResult(
+            person=entity, usernames=["test"], emails=[], scan_errors=5,
+        )
+
+        assert counted.scan_errors == 5
+
+
+# ---------------------------------------------------------------------------
+# safe_scan error fallback (previously # pragma: no cover)
+# ---------------------------------------------------------------------------
+
+class TestSafeScanErrorFallback:
+    """Exercise safe_scan's exception path: a raising scanner must yield a
+    fallback profile with exists=False and the error recorded in metadata."""
+
+    @pytest.mark.asyncio
+    async def test_safe_scan_catches_scanner_error(self):
+        """A scanner that raises is turned into an exists=False profile whose
+        metadata carries the error, and the failure is counted."""
+
+        class FailingScanner:
+            """Scanner stub whose scan() always raises."""
+
+            async def scan(self, value: str):
+                raise ConnectionError("simulated network failure")
+
+        # safe_scan is not called directly: hunt() is run with
+        # _USERNAME_SCANNERS patched to contain only the failing stub,
+        # and _EMAIL_SCANNERS patched to an empty tuple.
+        with patch(
+            "core.services.identity_pipeline._USERNAME_SCANNERS",
+            (FailingScanner,),
+        ), patch(
+            "core.services.identity_pipeline._EMAIL_SCANNERS",
+            (),
+        ):
+            hunt_request = HuntRequest(
+                usernames=["testuser"],
+                emails=[],
+                scan_localpart=False,
+                use_sherlock=False,
+            )
+            outcome = await hunt(settings=AppSettings(), request=hunt_request)
+
+        # Keep only the profiles that recorded an error in their metadata.
+        failed = [
+            profile
+            for profile in outcome.person.profiles
+            if isinstance(profile.metadata, dict) and profile.metadata.get("error")
+        ]
+
+        assert len(failed) >= 1
+        first_failure = failed[0]
+        assert first_failure.exists is False
+        assert "simulated network failure" in str(first_failure.metadata["error"])
+        assert outcome.scan_errors >= 1
+
+
+# ---------------------------------------------------------------------------
+# Sherlock runner error counting
+# ---------------------------------------------------------------------------
+
+class TestSherlockErrorCounting:
+    @pytest.mark.asyncio
+    async def test_returns_error_count(self):
+        from adapters.sherlock_runner import run_sherlock_username
+
+        # One site pointing at localhost port 1, where nothing is expected to listen.
+        manifest = {
+            "TestSite": {
+                "url": "http://localhost:1/__NONEXISTENT__/{}",
+                "errorType": "status_code",
+                "urlMain": "http://localhost:1",
+            },
+        }
+        settings = AppSettings()
+        found, error_count = await run_sherlock_username(
+            usernames=["testuser"],
+            manifest=manifest,
+            settings=settings,
+            max_concurrency=5,
+            no_nsfw=False,
+        )
+        # The connection attempt should fail, so the single check is counted
+        # as an error and contributes no profile.
+        assert error_count >= 1
+        assert found == []
+
+
+# ---------------------------------------------------------------------------
+# Site-list runner error counting
+# ---------------------------------------------------------------------------
+
+class TestSiteListErrorCounting:
+    @pytest.mark.asyncio
+    async def test_username_sites_returns_error_count(self):
+        """run_username_sites reports at least one error for the failing site."""
+        from adapters.site_lists.runner import run_username_sites
+        from adapters.site_lists.models import UsernameSite
+
+        # The check URL targets localhost port 1.
+        failing_site = UsernameSite(
+            name="FailSite",
+            uri_check="http://localhost:1/__NONEXISTENT__/{account}",
+            e_code=404,
+            e_string="not found",
+            m_code=200,
+            m_string=None,
+            cat="test",
+        )
+        found, error_count = await run_username_sites(
+            usernames=["testuser"],
+            sites=[failing_site],
+            settings=AppSettings(),
+            max_concurrency=5,
+            categories=None,
+            no_nsfw=False,
+        )
+
+        assert error_count >= 1
+        assert isinstance(found, list)
+
+    @pytest.mark.asyncio
+    async def test_email_sites_returns_error_count(self):
+        """run_email_sites reports at least one error for the failing site."""
+        from adapters.site_lists.runner import run_email_sites
+        from adapters.site_lists.models import EmailSite
+
+        # The check URL targets localhost port 1.
+        failing_site = EmailSite(
+            name="FailSite",
+            uri_check="http://localhost:1/__NONEXISTENT__/{account}",
+            e_code=404,
+            e_string="not found",
+            m_code=200,
+            m_string=None,
+            cat="test",
+        )
+        found, error_count = await run_email_sites(
+            emails=["test@test.com"],
+            sites=[failing_site],
+            settings=AppSettings(),
+            max_concurrency=5,
+            categories=None,
+            no_nsfw=False,
+        )
+
+        assert error_count >= 1
+        assert isinstance(found, list)
diff --git a/tests/test_sherlock_runner_integration.py b/tests/test_sherlock_runner_integration.py
new file mode 100644
index 0000000..46a0d20
--- /dev/null
+++ b/tests/test_sherlock_runner_integration.py
@@ -0,0 +1,307 @@
+"""Tests for Sherlock runner with mocked HTTP (issue #32).
+
+Covers:
+- Positive match (status_code errorType): 200 → exists=True
+- Negative match (status_code errorType): 404 → filtered out
+- Message errorType: response contains errorMsg → filtered out
+- NSFW filtering
+- Progress callback
+- Error counting (from #34 fix)
+"""
+
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+import httpx
+
+from core.config import AppSettings
+from adapters.sherlock_runner import run_sherlock_username
+
+
+def _mock_response(*, status_code: int = 200, text: str = "", url: str = "https://example.com") -> MagicMock:
+    """Return a mock shaped like ``httpx.Response`` with status, text and URL set."""
+    fake = MagicMock(spec=httpx.Response)
+    fake.status_code, fake.text = status_code, text
+    fake.url = httpx.URL(url)
+    return fake
+
+
+@asynccontextmanager
+async def _mock_client(responses: dict[str, MagicMock] | MagicMock):
+    """Yield a fake AsyncClient with stubbed ``get`` and ``request``.
+
+    Args:
+        responses: A single response returned for every URL, or a dict keyed
+            by URL substring; URLs matching no key get a 404 mock response.
+    """
+    fake_client = AsyncMock()
+
+    if not isinstance(responses, dict):
+        fake_client.get = AsyncMock(return_value=responses)
+        fake_client.request = AsyncMock(return_value=responses)
+    else:
+        async def lookup_get(url, **kwargs):
+            for fragment in responses:
+                if fragment in str(url):
+                    return responses[fragment]
+            return _mock_response(status_code=404)
+
+        async def lookup_request(method, url, **kwargs):
+            return await lookup_get(url)
+
+        fake_client.get = AsyncMock(side_effect=lookup_get)
+        fake_client.request = AsyncMock(side_effect=lookup_request)
+
+    yield fake_client
+
+
+# ---------------------------------------------------------------------------
+# Positive match (status_code errorType)
+# ---------------------------------------------------------------------------
+
+class TestSherlockPositiveMatch:
+    @pytest.mark.asyncio
+    async def test_status_code_200_is_found(self):
+        """A site with errorType=status_code and HTTP 200 → profile exists."""
+        manifest = {
+            "GitHub": {
+                "url": "https://github.com/{}",
+                "errorType": "status_code",
+                "urlMain": "https://github.com",
+            },
+        }
+
+        resp = _mock_response(status_code=200, text="profile", url="https://github.com/testuser")
+
+        with patch("adapters.sherlock_runner.build_async_client", return_value=_mock_client(resp)), \
+                patch("adapters.sherlock_runner.request_with_retry", AsyncMock(return_value=resp)):
+            result = await run_sherlock_username(
+                usernames=["testuser"],
+                manifest=manifest,
+                settings=AppSettings(),
+                max_concurrency=5,
+                no_nsfw=False,
+            )
+
+        # run_sherlock_username returns (found_profiles, error_count). Check
+        # that shape explicitly: silently accepting a bare list would hide a
+        # regression of the return contract.
+        assert isinstance(result, tuple) and len(result) == 2
+        found, errors = result
+
+        assert len(found) == 1
+        assert found[0].exists is True
+        assert found[0].network_name == "github"
+        assert found[0].username == "testuser"
+        assert errors == 0
+
+
+# ---------------------------------------------------------------------------
+# Negative match (status_code errorType)
+# ---------------------------------------------------------------------------
+
+class TestSherlockNegativeMatch:
+    @pytest.mark.asyncio
+    async def test_status_code_404_not_found(self):
+        """A site with errorType=status_code and HTTP 404 → profile NOT found."""
+        manifest = {
+            "GitHub": {
+                "url": "https://github.com/{}",
+                "errorType": "status_code",
+                "urlMain": "https://github.com",
+            },
+        }
+
+        resp = _mock_response(status_code=404, url="https://github.com/nobody")
+
+        with patch("adapters.sherlock_runner.build_async_client", return_value=_mock_client(resp)), \
+                patch("adapters.sherlock_runner.request_with_retry", AsyncMock(return_value=resp)):
+            result = await run_sherlock_username(
+                usernames=["nobody"],
+                manifest=manifest,
+                settings=AppSettings(),
+                max_concurrency=5,
+                no_nsfw=False,
+            )
+
+        # The runner's contract is (found_profiles, error_count); fail here
+        # if that shape changes instead of adapting to it.
+        assert isinstance(result, tuple) and len(result) == 2
+        found, _ = result
+
+        assert len(found) == 0
+
+
+# ---------------------------------------------------------------------------
+# Message errorType
+# ---------------------------------------------------------------------------
+
+class TestSherlockMessageErrorType:
+    @pytest.mark.asyncio
+    async def test_error_message_in_response_means_not_found(self):
+        """A site with errorType=message and errorMsg in response → not found."""
+        manifest = {
+            "TestSite": {
+                "url": "https://testsite.com/users/{}",
+                "errorType": "message",
+                "errorMsg": "User not found",
+                "urlMain": "https://testsite.com",
+            },
+        }
+
+        resp = _mock_response(
+            status_code=200,
+            text="User not found",
+            url="https://testsite.com/users/nobody",
+        )
+
+        with patch("adapters.sherlock_runner.build_async_client", return_value=_mock_client(resp)), \
+                patch("adapters.sherlock_runner.request_with_retry", AsyncMock(return_value=resp)):
+            result = await run_sherlock_username(
+                usernames=["nobody"],
+                manifest=manifest,
+                settings=AppSettings(),
+                max_concurrency=5,
+                no_nsfw=False,
+            )
+
+        # The runner's contract is (found_profiles, error_count); fail here
+        # if that shape changes instead of adapting to it.
+        assert isinstance(result, tuple) and len(result) == 2
+        found, _ = result
+
+        assert len(found) == 0
+
+    @pytest.mark.asyncio
+    async def test_no_error_message_means_found(self):
+        """A site with errorType=message where errorMsg is absent → found."""
+        manifest = {
+            "TestSite": {
+                "url": "https://testsite.com/users/{}",
+                "errorType": "message",
+                "errorMsg": "User not found",
+                "urlMain": "https://testsite.com",
+            },
+        }
+
+        resp = _mock_response(
+            status_code=200,
+            text="John's Profile",
+            url="https://testsite.com/users/john",
+        )
+
+        with patch("adapters.sherlock_runner.build_async_client", return_value=_mock_client(resp)), \
+                patch("adapters.sherlock_runner.request_with_retry", AsyncMock(return_value=resp)):
+            result = await run_sherlock_username(
+                usernames=["john"],
+                manifest=manifest,
+                settings=AppSettings(),
+                max_concurrency=5,
+                no_nsfw=False,
+            )
+
+        # The runner's contract is (found_profiles, error_count); fail here
+        # if that shape changes instead of adapting to it.
+        assert isinstance(result, tuple) and len(result) == 2
+        found, _ = result
+
+        assert len(found) == 1
+        assert found[0].exists is True
+
+
+# ---------------------------------------------------------------------------
+# NSFW filtering
+# ---------------------------------------------------------------------------
+
+class TestSherlockNSFWFiltering:
+    @pytest.mark.asyncio
+    async def test_nsfw_sites_filtered_when_no_nsfw(self):
+        """NSFW sites should be skipped when no_nsfw=True."""
+        manifest = {
+            "SafeSite": {
+                "url": "https://safe.com/{}",
+                "errorType": "status_code",
+                "urlMain": "https://safe.com",
+            },
+            "NSFWSite": {
+                "url": "https://nsfw.com/{}",
+                "errorType": "status_code",
+                "urlMain": "https://nsfw.com",
+                "isNSFW": True,
+            },
+        }
+
+        resp = _mock_response(status_code=200, text="profile", url="https://safe.com/user")
+
+        with patch("adapters.sherlock_runner.build_async_client", return_value=_mock_client(resp)), \
+                patch("adapters.sherlock_runner.request_with_retry", AsyncMock(return_value=resp)):
+            result = await run_sherlock_username(
+                usernames=["user"],
+                manifest=manifest,
+                settings=AppSettings(),
+                max_concurrency=5,
+                no_nsfw=True,
+            )
+
+        # The runner's contract is (found_profiles, error_count); fail here
+        # if that shape changes instead of adapting to it.
+        assert isinstance(result, tuple) and len(result) == 2
+        found, _ = result
+
+        site_names = {p.metadata.get("site_name") for p in found}
+        assert "NSFWSite" not in site_names
+        assert "SafeSite" in site_names
+
+
+# ---------------------------------------------------------------------------
+# Progress callback
+# ---------------------------------------------------------------------------
+
+class TestSherlockProgressCallback:
+    @pytest.mark.asyncio
+    async def test_callback_called_with_correct_counts(self):
+        """Progress callback should be called with correct total and progress."""
+        manifest = {
+            "Site1": {
+                "url": "https://site1.com/{}",
+                "errorType": "status_code",
+                "urlMain": "https://site1.com",
+            },
+            "Site2": {
+                "url": "https://site2.com/{}",
+                "errorType": "status_code",
+                "urlMain": "https://site2.com",
+            },
+        }
+
+        resp = _mock_response(status_code=200, text="profile")
+        progress_calls: list[tuple[int, int, str]] = []
+
+        def progress_cb(completed: int, total: int, label: str) -> None:
+            progress_calls.append((completed, total, label))
+
+        with patch("adapters.sherlock_runner.build_async_client", return_value=_mock_client(resp)), \
+                patch("adapters.sherlock_runner.request_with_retry", AsyncMock(return_value=resp)):
+            result = await run_sherlock_username(
+                usernames=["user"],
+                manifest=manifest,
+                settings=AppSettings(),
+                max_concurrency=5,
+                no_nsfw=False,
+                progress_callback=progress_cb,
+            )
+
+        # The runner's contract is (found_profiles, error_count); fail here
+        # if that shape changes instead of adapting to it.
+        assert isinstance(result, tuple) and len(result) == 2
+        found, _ = result
+
+        # Two sites are checked, so at least two progress reports are expected.
+        assert len(progress_calls) >= 2
+        # At least one report must carry the full total.
+        totals = {c[1] for c in progress_calls}
+        assert 2 in totals  # 2 sites × 1 username = 2 total
+        assert isinstance(found, list)  # Verify return type