Skip to content
21 changes: 16 additions & 5 deletions src/adapters/sherlock_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from __future__ import annotations

import asyncio
import logging
from typing import Any
from collections.abc import Callable

Expand All @@ -25,6 +26,8 @@
from core.config import AppSettings
from core.domain.models import SocialProfile

logger = logging.getLogger(__name__)


def _slug(name: str) -> str:
out = []
Expand Down Expand Up @@ -71,7 +74,8 @@ async def run_sherlock_username(
max_concurrency: int,
no_nsfw: bool,
progress_callback: Callable[[int, int, str], None] | None = None,
) -> list[SocialProfile]:
) -> tuple[list[SocialProfile], int]:
"""Run Sherlock checks. Returns (found_profiles, error_count)."""
sem = asyncio.Semaphore(max(1, max_concurrency))

# Per-domain rate limiter
Expand Down Expand Up @@ -187,8 +191,9 @@ async def check(site_name: str, info: dict[str, Any], username: str) -> SocialPr
bio=html_meta.get("meta_description"),
image_url=html_meta.get("og_image"),
)
except Exception:
return None
except Exception as exc:
logger.debug("Sherlock check failed for %s on %s: %s", username, site_name, exc)
return exc # Return exception to count it

tasks: list[asyncio.Future[SocialProfile | None]] = []
task_labels: dict[asyncio.Future[SocialProfile | None], str] = {}
Expand All @@ -201,6 +206,7 @@ async def check(site_name: str, info: dict[str, Any], username: str) -> SocialPr

completed = 0
found: list[SocialProfile] = []
error_count = 0
for t in asyncio.as_completed(tasks):
r = await t
completed += 1
Expand All @@ -210,7 +216,12 @@ async def check(site_name: str, info: dict[str, Any], username: str) -> SocialPr
except Exception:
# Never let the UI break the scan.
pass
if r is not None:
if isinstance(r, Exception):
error_count += 1
elif r is not None:
found.append(r)

return found
if error_count:
logger.info("Sherlock scan completed: %d found, %d errors out of %d checks.", len(found), error_count, total)

return found, error_count
32 changes: 23 additions & 9 deletions src/adapters/site_lists/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from __future__ import annotations

import asyncio
import logging
from typing import Any

from adapters.http_client import build_async_client
Expand All @@ -26,6 +27,8 @@
from core.config import AppSettings
from core.domain.models import SocialProfile

logger = logging.getLogger(__name__)


def _slug(name: str) -> str:
out = []
Expand Down Expand Up @@ -66,7 +69,8 @@ async def run_username_sites(
max_concurrency: int,
categories: set[str] | None,
no_nsfw: bool,
) -> list[SocialProfile]:
) -> tuple[list[SocialProfile], int]:
"""Run username site checks. Returns (found_profiles, error_count)."""
semaphore = asyncio.Semaphore(max(1, max_concurrency))

# Per-domain rate limiter
Expand Down Expand Up @@ -127,13 +131,17 @@ async def check(site: UsernameSite, username: str) -> SocialProfile | None:
bio=html_meta.get("meta_description"),
image_url=html_meta.get("og_image"),
)
except Exception:
# Errors: for bulk runs we prefer not to pollute the results with hundreds of errors.
return None
except Exception as exc:
logger.debug("Site-list check failed for %s on %s: %s", username, site.name, exc)
return exc

results = await asyncio.gather(*(check(s, username) for s in filtered for username in usernames), return_exceptions=False)

return [r for r in results if r is not None]
error_count = sum(1 for r in results if isinstance(r, Exception))
found = [r for r in results if isinstance(r, SocialProfile)]
if error_count:
logger.info("Username site-list scan: %d found, %d errors.", len(found), error_count)
return found, error_count


async def run_email_sites(
Expand All @@ -144,7 +152,8 @@ async def run_email_sites(
max_concurrency: int,
categories: set[str] | None,
no_nsfw: bool,
) -> list[SocialProfile]:
) -> tuple[list[SocialProfile], int]:
"""Run email site checks. Returns (found_profiles, error_count)."""
semaphore = asyncio.Semaphore(max(1, max_concurrency))

# Per-domain rate limiter
Expand Down Expand Up @@ -213,9 +222,14 @@ async def check(site: EmailSite, email: str) -> SocialProfile | None:
bio=html_meta.get("meta_description"),
image_url=html_meta.get("og_image"),
)
except Exception:
return None
except Exception as exc:
logger.debug("Email site-list check failed for %s on %s: %s", email, site.name, exc)
return exc

results = await asyncio.gather(*(check(s, email) for s in filtered for email in emails), return_exceptions=False)

return [r for r in results if r is not None]
error_count = sum(1 for r in results if isinstance(r, Exception))
found = [r for r in results if isinstance(r, SocialProfile)]
if error_count:
logger.info("Email site-list scan: %d found, %d errors.", len(found), error_count)
return found, error_count
78 changes: 50 additions & 28 deletions src/core/services/identity_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from __future__ import annotations

import asyncio
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Iterable, Sequence
Expand Down Expand Up @@ -54,6 +55,8 @@
from core.domain.models import PersonEntity, SocialProfile
from core.resources_loader import get_default_list_path, load_sherlock_data

logger = logging.getLogger(__name__)


@dataclass
class SiteListOptions:
Expand Down Expand Up @@ -98,6 +101,7 @@ class PipelineResult:
usernames: list[str]
emails: list[str]
warnings: list[str] = field(default_factory=list)
scan_errors: int = 0


_USERNAME_SCANNERS = (
Expand Down Expand Up @@ -226,6 +230,7 @@ async def hunt(
email_scanners = [scanner() for scanner in _EMAIL_SCANNERS]

profiles: list[SocialProfile] = []
total_scan_errors: int = 0
all_usernames = set(usernames)
all_emails = set(emails)
scanned_usernames: set[str] = set()
Expand All @@ -250,7 +255,8 @@ async def safe_scan(
if derived_from and isinstance(profile.metadata, dict):
profile.metadata = {**profile.metadata, "derived_from": derived_from}
return collected
except Exception as exc: # pragma: no cover - defensive fallback
except Exception as exc:
logger.debug("Scanner %s failed for %s: %s", name, value, exc)
fallback_url = f"https://{network}.com/{value}"
if network == "x":
fallback_url = f"https://x.com/{value}"
Expand Down Expand Up @@ -364,16 +370,16 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str
hooks.warning(message)
else:
sites_file = load_username_sites(username_path)
profiles.extend(
await run_username_sites(
usernames=usernames,
sites=sites_file.sites,
settings=settings,
max_concurrency=max_concurrency,
categories=request.site_lists.categories,
no_nsfw=no_nsfw_effective,
)
site_profiles, site_errors = await run_username_sites(
usernames=usernames,
sites=sites_file.sites,
settings=settings,
max_concurrency=max_concurrency,
categories=request.site_lists.categories,
no_nsfw=no_nsfw_effective,
)
profiles.extend(site_profiles)
total_scan_errors += site_errors

if emails:
email_path = request.site_lists.email_path
Expand All @@ -388,16 +394,16 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str
hooks.warning(message)
else:
sites_file = load_email_sites(email_path)
profiles.extend(
await run_email_sites(
emails=emails,
sites=sites_file.sites,
settings=settings,
max_concurrency=max_concurrency,
categories=request.site_lists.categories,
no_nsfw=no_nsfw_effective,
)
email_site_profiles, email_site_errors = await run_email_sites(
emails=emails,
sites=sites_file.sites,
settings=settings,
max_concurrency=max_concurrency,
categories=request.site_lists.categories,
no_nsfw=no_nsfw_effective,
)
profiles.extend(email_site_profiles)
total_scan_errors += email_site_errors

if request.use_sherlock and usernames:
manifest = request.sherlock_manifest or load_sherlock_data(refresh=False)
Expand All @@ -415,16 +421,16 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str
hooks.sherlock_start(total)

progress_cb = hooks.sherlock_progress if total else None
profiles.extend(
await run_sherlock_username(
usernames=usernames,
manifest=manifest,
settings=settings,
max_concurrency=max_concurrency,
no_nsfw=no_nsfw_effective,
progress_callback=progress_cb,
)
sherlock_profiles, sherlock_errors = await run_sherlock_username(
usernames=usernames,
manifest=manifest,
settings=settings,
max_concurrency=max_concurrency,
no_nsfw=no_nsfw_effective,
progress_callback=progress_cb,
)
profiles.extend(sherlock_profiles)
total_scan_errors += sherlock_errors

profiles = dedupe_profiles(profiles)

Expand Down Expand Up @@ -461,11 +467,27 @@ def extract_extras(perfiles: Iterable[SocialProfile]) -> tuple[set[str], set[str

person = PersonEntity(target=target_label, profiles=profiles)

# Count errors from safe_scan fallback profiles
for p in profiles:
if isinstance(p.metadata, dict) and p.metadata.get("error"):
total_scan_errors += 1

if total_scan_errors:
msg = (
f"{total_scan_errors} scanner(s) returned errors "
f"(timeouts, SSL, 5xx, etc.). Results may be incomplete."
)
warnings.append(msg)
if hooks.warning:
hooks.warning(msg)
logger.info("Scan completed with %d errors.", total_scan_errors)

return PipelineResult(
person=person,
usernames=usernames,
emails=emails,
warnings=warnings,
scan_errors=total_scan_errors,
)


Expand Down
Loading
Loading