diff --git a/.env.example b/.env.example index 9f88f23..b4c02df 100644 --- a/.env.example +++ b/.env.example @@ -46,9 +46,24 @@ OSINT_D2_USERNAME_SITES_PATH= OSINT_D2_EMAIL_SITES_PATH= # ── Concurrency & Filters ────────────────────────────────────── +# ⚠️ High concurrency without rate limiting constitutes DoS-adjacent +# behaviour against scanned platforms. Keep these at responsible values. OSINT_D2_SITES_MAX_CONCURRENCY=30 OSINT_D2_SITES_NO_NSFW=true +# ── Rate Limiting (responsible scanning) ──────────────────────── +# Per-domain throttling to prevent IP bans and false negatives. +# These defaults are tuned for accuracy without being hostile. +OSINT_D2_REQUEST_DELAY_MS=200 +# Minimum delay (ms) between requests to the same domain. +OSINT_D2_REQUEST_JITTER_MS=100 +# Random jitter ± (ms) added to the delay for naturalness. +OSINT_D2_PER_DOMAIN_CONCURRENCY=3 +# Max simultaneous requests to the same domain (CDN/origin). +OSINT_D2_RETRY_MAX_ATTEMPTS=3 +# Retries on 429/503 responses with exponential backoff. + # ── Agent Mode ────────────────────────────────────────────────── # Default max reasoning steps for autonomous agent investigations. OSINT_D2_AGENT_MAX_STEPS=10 + diff --git a/src/adapters/rate_limiter.py b/src/adapters/rate_limiter.py new file mode 100644 index 0000000..be8038b --- /dev/null +++ b/src/adapters/rate_limiter.py @@ -0,0 +1,232 @@ +"""Per-domain rate limiter con jitter y backoff exponencial. + +Objetivo: +- Prevenir comportamiento DoS-adyacente al escanear cientos de sitios. +- Respetar la infraestructura de las plataformas escaneadas. +- Mejorar la precisión del escaneo evitando 429/503 por abuso. + +Diseño: +- Un semáforo *por dominio* (hostname) limita requests concurrentes al mismo origen. +- Un delay mínimo + jitter temporal entre requests al mismo dominio. +- Retry con backoff exponencial y parsing de Retry-After en 429/503. 
# Upper bound (seconds) honoured for any server-supplied Retry-After value,
# so a hostile or misconfigured server cannot stall a scan indefinitely.
_MAX_RETRY_AFTER_S = 120.0


# ---------------------------------------------------------------------------
# Domain extraction
# ---------------------------------------------------------------------------

def extract_domain(url: str) -> str:
    """Return the lower-cased hostname of *url*, used as the throttling key.

    The plain hostname is used (not the registrable eTLD+1) to avoid an
    external dependency. Returns ``"unknown"`` when the URL has no hostname
    or cannot be parsed, so such URLs all share one throttling bucket.
    """
    try:
        hostname = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError on e.g. a malformed IPv6 literal;
        # TypeError/AttributeError cover non-string input.
        return "unknown"
    return (hostname or "unknown").lower()


# ---------------------------------------------------------------------------
# Retry-After parsing
# ---------------------------------------------------------------------------

def parse_retry_after(header_value: str | None) -> float | None:
    """Parse a ``Retry-After`` header (delta-seconds or HTTP-date).

    Returns:
        The number of seconds to wait, capped at 120 seconds; ``0.0`` for an
        HTTP-date that is already in the past; ``None`` when the header is
        missing, negative, or not parseable.
    """
    if not header_value:
        return None

    # First form: a number of seconds.
    try:
        seconds = float(header_value)
    except ValueError:
        pass
    else:
        # ``nan >= 0`` is False, so NaN is rejected together with negatives.
        if seconds >= 0:
            return min(seconds, _MAX_RETRY_AFTER_S)
        return None

    # Second form: an HTTP-date.
    try:
        retry_date = parsedate_to_datetime(header_value)
    except (TypeError, ValueError):
        return None
    if retry_date.tzinfo is None:
        # A date parsed without a usable offset comes back naive, and
        # ``timestamp()`` would then interpret it as local time. HTTP-dates
        # are expressed in GMT, so pin it to UTC.
        from datetime import timezone

        retry_date = retry_date.replace(tzinfo=timezone.utc)

    delta = retry_date.timestamp() - time.time()
    if delta > 0:
        return min(delta, _MAX_RETRY_AFTER_S)
    return 0.0


# ---------------------------------------------------------------------------
# DomainRateLimiter
# ---------------------------------------------------------------------------

class DomainRateLimiter:
    """Per-domain rate limiter with jitter and retry backoff helpers.

    Each hostname gets its own semaphore (concurrency cap) and its own pacing
    lock (minimum spacing between request starts), so throttling one domain
    never delays requests to another.

    Args:
        per_domain_concurrency: Max simultaneous requests to one domain
            (values below 1 are raised to 1).
        delay_ms: Minimum delay in milliseconds between requests to the same
            domain (negative values are treated as 0).
        jitter_ms: Random jitter, +/- milliseconds, added to the delay
            (negative values are treated as 0).
        retry_max_attempts: Max retries on 429/503 (negative values are
            treated as 0).
    """

    def __init__(
        self,
        *,
        per_domain_concurrency: int = 3,
        delay_ms: int = 200,
        jitter_ms: int = 100,
        retry_max_attempts: int = 3,
    ) -> None:
        self._per_domain_concurrency = max(1, per_domain_concurrency)
        self._delay_s = max(0, delay_ms) / 1000.0
        self._jitter_s = max(0, jitter_ms) / 1000.0
        self._retry_max = max(0, retry_max_attempts)

        # Per-domain state, created lazily on first use of each hostname.
        self._domain_sems: dict[str, asyncio.Semaphore] = defaultdict(
            lambda: asyncio.Semaphore(self._per_domain_concurrency)
        )
        self._domain_locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
        # Monotonic timestamp of the last request start per domain; a missing
        # key means "no request sent yet", so the first request never waits.
        self._domain_last_request: dict[str, float] = {}

    def _compute_delay(self) -> float:
        """Return the spacing (seconds) for the next request: delay +/- jitter, never negative."""
        if self._delay_s <= 0 and self._jitter_s <= 0:
            return 0.0
        jitter = random.uniform(-self._jitter_s, self._jitter_s)
        return max(0.0, self._delay_s + jitter)

    async def _pace(self, domain: str) -> None:
        """Sleep until the minimum spacing since the domain's last request has elapsed.

        The lock is per domain: waiters for the same domain queue up behind
        each other, while other domains proceed independently.
        """
        delay = self._compute_delay()
        async with self._domain_locks[domain]:
            if delay > 0:
                last = self._domain_last_request.get(domain)
                if last is not None:
                    wait = delay - (time.monotonic() - last)
                    if wait > 0:
                        await asyncio.sleep(wait)
            self._domain_last_request[domain] = time.monotonic()

    async def _wait_for_slot(self, domain: str) -> None:
        """Acquire a concurrency slot for *domain*, then apply the pacing delay.

        If the pacing wait is interrupted (e.g. the task is cancelled), the
        slot is released again before the exception propagates.
        """
        sem = self._domain_sems[domain]
        await sem.acquire()
        try:
            await self._pace(domain)
        except BaseException:
            sem.release()
            raise

    def _release_slot(self, domain: str) -> None:
        """Release the concurrency slot previously acquired for *domain*."""
        self._domain_sems[domain].release()

    @asynccontextmanager
    async def throttle(self, url: str) -> AsyncIterator[None]:
        """Async context manager that throttles by the hostname of *url*.

        Usage:
            async with rate_limiter.throttle(url):
                resp = await client.get(url)
        """
        domain = extract_domain(url)
        await self._wait_for_slot(domain)
        try:
            yield
        finally:
            self._release_slot(domain)

    @property
    def retry_max_attempts(self) -> int:
        """Maximum number of retries performed after the initial request."""
        return self._retry_max

    @staticmethod
    def should_retry(status_code: int) -> bool:
        """Return True when *status_code* (429 or 503) warrants a retry."""
        return status_code in (429, 503)

    @staticmethod
    def backoff_delay(attempt: int, retry_after: float | None = None) -> float:
        """Return the seconds to wait before retry number *attempt* (0-based).

        A positive *retry_after* is used as the base. Otherwise (``None`` or
        ``0``) the base is exponential, ``2 ** attempt`` seconds, capped at
        30 seconds. A random 0.1-0.5 s jitter is always added.
        """
        if retry_after is not None and retry_after > 0:
            base = retry_after
        else:
            base = min(2 ** attempt, 30)
        return base + random.uniform(0.1, 0.5)


# ---------------------------------------------------------------------------
# Helper: request with built-in retry
# ---------------------------------------------------------------------------

async def request_with_retry(
    client: httpx.AsyncClient,
    method: str,
    url: str,
    rate_limiter: DomainRateLimiter,
    *,
    headers: dict[str, str] | None = None,
    content: str | None = None,
) -> httpx.Response:
    """Send an HTTP request with per-domain throttling and retry on 429/503.

    Flow:
        1. Acquire the domain slot (throttle) and send the request.
        2. On 429/503, wait for the backoff delay and try again, up to
           ``rate_limiter.retry_max_attempts`` retries.
        3. Return the first non-retryable response, or the last response
           received once the retries are exhausted.

    Args:
        client: Async HTTP client used to send the request.
        method: ``"HEAD"`` or ``"POST"`` (case-insensitive); any other value
            is sent as GET.
        url: Target URL; its hostname selects the throttling bucket.
        rate_limiter: Limiter providing throttling and backoff policy.
        headers: Optional request headers.
        content: Optional request body, only sent for POST.
    """
    verb = method.upper()
    max_retries = rate_limiter.retry_max_attempts
    attempt = 0

    while True:
        # The slot is held only for the request itself, not while backing off.
        async with rate_limiter.throttle(url):
            if verb == "HEAD":
                resp = await client.head(url, headers=headers)
            elif verb == "POST":
                resp = await client.post(url, content=content, headers=headers)
            else:
                resp = await client.get(url, headers=headers)

        if attempt >= max_retries or not rate_limiter.should_retry(resp.status_code):
            return resp

        # httpx header lookup is case-insensitive, so one lookup suffices.
        retry_after = parse_retry_after(resp.headers.get("Retry-After"))
        await asyncio.sleep(rate_limiter.backoff_delay(attempt, retry_after))
        attempt += 1
resp = await client.head(url, headers=headers) - text = "" - else: - resp = await client.get(url, headers=headers) - text = resp.text or "" + resp = await request_with_retry( + client, + request_method, + url, + rate_limiter, + headers=headers, + ) + text = "" if request_method == "HEAD" else (resp.text or "") status = resp.status_code final_url = str(resp.url) diff --git a/src/adapters/site_lists/runner.py b/src/adapters/site_lists/runner.py index 543b652..3ec6941 100644 --- a/src/adapters/site_lists/runner.py +++ b/src/adapters/site_lists/runner.py @@ -1,7 +1,8 @@ """Runner data-driven para listas de sitios (username/email). Diseño: -- Ejecuta checks concurrentes con un semáforo. +- Ejecuta checks concurrentes con un semáforo global + rate limiting por dominio. +- Retry con backoff exponencial en 429/503. - Devuelve solo hallazgos (FOUND) como `SocialProfile` para evitar inflar el output. Limitaciones (MVP): @@ -16,6 +17,10 @@ from adapters.http_client import build_async_client from adapters.http_client import extract_html_metadata +from adapters.rate_limiter import ( + DomainRateLimiter, + request_with_retry, +) from adapters.site_lists.models import EmailSite, UsernameSite from adapters.site_lists.operations import apply_input_operation from core.config import AppSettings @@ -64,6 +69,14 @@ async def run_username_sites( ) -> list[SocialProfile]: semaphore = asyncio.Semaphore(max(1, max_concurrency)) + # Rate limiter por dominio + rate_limiter = DomainRateLimiter( + per_domain_concurrency=settings.per_domain_concurrency, + delay_ms=settings.request_delay_ms, + jitter_ms=settings.request_jitter_ms, + retry_max_attempts=settings.retry_max_attempts, + ) + filtered: list[UsernameSite] = [] for s in sites: if no_nsfw and _is_nsfw(s.cat): @@ -78,7 +91,9 @@ async def check(site: UsernameSite, username: str) -> SocialProfile | None: url = site.uri_check.replace("{account}", username) async with semaphore: try: - resp = await client.get(url) + resp = await 
request_with_retry( + client, "GET", url, rate_limiter, + ) text = resp.text or "" found = _match_found( @@ -132,6 +147,14 @@ async def run_email_sites( ) -> list[SocialProfile]: semaphore = asyncio.Semaphore(max(1, max_concurrency)) + # Rate limiter por dominio + rate_limiter = DomainRateLimiter( + per_domain_concurrency=settings.per_domain_concurrency, + delay_ms=settings.request_delay_ms, + jitter_ms=settings.request_jitter_ms, + retry_max_attempts=settings.retry_max_attempts, + ) + filtered: list[EmailSite] = [] for s in sites: if no_nsfw and _is_nsfw(s.cat): @@ -151,10 +174,11 @@ async def check(site: EmailSite, email: str) -> SocialProfile | None: async with semaphore: try: - if method == "POST": - resp = await client.post(url, content=data, headers=headers) - else: - resp = await client.get(url, headers=headers) + resp = await request_with_retry( + client, method, url, rate_limiter, + headers=headers, + content=data, + ) text = resp.text or "" found = _match_found( diff --git a/src/core/config.py b/src/core/config.py index 26c4ef3..840f364 100644 --- a/src/core/config.py +++ b/src/core/config.py @@ -138,8 +138,37 @@ class AppSettings(BaseSettings): sites_max_concurrency: int = Field( default=30, ge=1, - le=500, - description="Concurrencia máxima para el motor data-driven de listas de sitios.", + le=50, + description=( + "Concurrencia máxima global para el motor data-driven de listas de sitios. " + "⚠️ Valores altos sin rate limiting pueden causar bans de IP y falsos negativos." 
# ---------------------------------------------------------------------------
# Shared test helpers
# ---------------------------------------------------------------------------

_TEST_URL = "https://example.com"


def _response(status_code: int, **kwargs) -> httpx.Response:
    """Build an ``httpx.Response`` bound to a GET request for the test URL."""
    return httpx.Response(
        status_code,
        request=httpx.Request("GET", _TEST_URL),
        **kwargs,
    )


class _ScriptedClient:
    """Minimal stand-in for the HTTP client used by ``request_with_retry``.

    GET replays the scripted responses in order and keeps returning the last
    one once the script is exhausted; HEAD and POST always return the last
    scripted response. Only GET calls are counted.
    """

    def __init__(self, *responses: httpx.Response) -> None:
        self._responses = responses
        self.get_calls = 0

    async def get(self, url, headers=None):
        resp = self._responses[min(self.get_calls, len(self._responses) - 1)]
        self.get_calls += 1
        return resp

    async def head(self, url, headers=None):
        return self._responses[-1]

    async def post(self, url, content=None, headers=None):
        return self._responses[-1]


# ---------------------------------------------------------------------------
# extract_domain
# ---------------------------------------------------------------------------

class TestExtractDomain:
    def test_simple_url(self):
        assert extract_domain("https://github.com/user123") == "github.com"

    def test_with_port(self):
        assert extract_domain("http://localhost:8080/path") == "localhost"

    def test_subdomain(self):
        assert extract_domain("https://api.github.com/users/test") == "api.github.com"

    def test_no_scheme(self):
        # Without a scheme urlparse yields no hostname, so the fallback applies.
        assert extract_domain("github.com/user") == "unknown"

    def test_empty_string(self):
        assert extract_domain("") == "unknown"

    def test_invalid_url(self):
        assert extract_domain("not-a-url") == "unknown"


# ---------------------------------------------------------------------------
# parse_retry_after
# ---------------------------------------------------------------------------

class TestParseRetryAfter:
    def test_none_header(self):
        assert parse_retry_after(None) is None

    def test_empty_header(self):
        assert parse_retry_after("") is None

    def test_seconds_integer(self):
        assert parse_retry_after("30") == 30.0

    def test_seconds_float(self):
        assert parse_retry_after("1.5") == 1.5

    def test_seconds_zero(self):
        assert parse_retry_after("0") == 0.0

    def test_negative_seconds(self):
        assert parse_retry_after("-5") is None

    def test_cap_at_120_seconds(self):
        assert parse_retry_after("300") == 120.0

    def test_http_date_format(self):
        # A far-future HTTP-date is clamped to the 120 s safety cap.
        assert parse_retry_after("Thu, 01 Jan 2099 00:00:00 GMT") == 120.0

    def test_unparseable(self):
        assert parse_retry_after("not-a-date-or-number") is None


# ---------------------------------------------------------------------------
# DomainRateLimiter — basic
# ---------------------------------------------------------------------------

class TestDomainRateLimiter:
    def test_default_construction(self):
        rl = DomainRateLimiter()
        assert rl.retry_max_attempts == 3

    def test_custom_params(self):
        rl = DomainRateLimiter(
            per_domain_concurrency=5,
            delay_ms=500,
            jitter_ms=50,
            retry_max_attempts=2,
        )
        assert rl.retry_max_attempts == 2
        assert rl._per_domain_concurrency == 5

    @pytest.mark.parametrize(
        ("status_code", "expected"),
        [(429, True), (503, True), (200, False), (404, False), (500, False)],
    )
    def test_should_retry(self, status_code, expected):
        assert DomainRateLimiter.should_retry(status_code) is expected

    def test_backoff_delay_without_retry_after(self):
        d0 = DomainRateLimiter.backoff_delay(0)
        d1 = DomainRateLimiter.backoff_delay(1)
        d2 = DomainRateLimiter.backoff_delay(2)
        # Exponential: 1s, 2s, 4s (+ jitter)
        assert 0.5 < d0 < 2.0
        assert 1.5 < d1 < 3.0
        assert 3.5 < d2 < 5.0

    def test_backoff_delay_with_retry_after(self):
        d = DomainRateLimiter.backoff_delay(0, retry_after=10.0)
        # Retry-After is the base, plus a small jitter.
        assert 10.0 < d < 11.0


# ---------------------------------------------------------------------------
# DomainRateLimiter — throttle context manager
# ---------------------------------------------------------------------------

class TestDomainRateLimiterThrottle:
    @pytest.mark.asyncio
    async def test_throttle_acquires_and_releases(self):
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, per_domain_concurrency=2)
        async with rl.throttle("https://example.com/page1"):
            # Inside: one slot taken.
            pass
        # After: slot released.

    @pytest.mark.asyncio
    async def test_per_domain_concurrency_limits(self):
        """per_domain_concurrency=1 serializes requests to the same domain."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, per_domain_concurrency=1)
        order: list[str] = []

        async def task(label: str, url: str):
            async with rl.throttle(url):
                order.append(f"{label}_start")
                await asyncio.sleep(0.05)
                order.append(f"{label}_end")

        # Same domain -> serialized.
        await asyncio.gather(
            task("a", "https://example.com/1"),
            task("b", "https://example.com/2"),
        )
        a_start, a_end = order.index("a_start"), order.index("a_end")
        b_start, b_end = order.index("b_start"), order.index("b_end")
        # Either a finishes before b starts, or the other way round.
        assert (a_end < b_start) or (b_end < a_start)

    @pytest.mark.asyncio
    async def test_different_domains_not_blocked(self):
        """Requests to different domains do not block each other."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, per_domain_concurrency=1)
        results: list[float] = []

        async def task(url: str):
            async with rl.throttle(url):
                results.append(time.monotonic())
                await asyncio.sleep(0.05)

        await asyncio.gather(
            task("https://example.com/page"),
            task("https://other.com/page"),
        )
        # Both should start almost simultaneously (< 30ms gap).
        assert len(results) == 2
        assert abs(results[0] - results[1]) < 0.03

    @pytest.mark.asyncio
    async def test_delay_enforced_between_requests(self):
        """The configured delay separates requests to the same domain."""
        rl = DomainRateLimiter(
            delay_ms=100, jitter_ms=0,
            per_domain_concurrency=10,  # high, so the semaphore never blocks
        )
        times: list[float] = []

        async def task(url: str):
            async with rl.throttle(url):
                times.append(time.monotonic())

        # Sequential requests to the same domain.
        await task("https://example.com/1")
        await task("https://example.com/2")

        assert len(times) == 2
        gap = times[1] - times[0]
        # Expect roughly 100ms; allow a small margin for OS timing.
        assert gap >= 0.08


# ---------------------------------------------------------------------------
# request_with_retry
# ---------------------------------------------------------------------------

class TestRequestWithRetry:
    @pytest.fixture(autouse=True)
    def _no_backoff(self, monkeypatch):
        """Zero the backoff so retry tests do not really sleep for seconds.

        Backoff timing itself is covered by ``TestDomainRateLimiter``.
        """
        monkeypatch.setattr(
            DomainRateLimiter,
            "backoff_delay",
            staticmethod(lambda attempt, retry_after=None: 0.0),
        )

    @pytest.mark.asyncio
    async def test_success_no_retry(self):
        """A successful request is not retried."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, retry_max_attempts=3)
        client = _ScriptedClient(_response(200, text="OK"))

        resp = await request_with_retry(client, "GET", _TEST_URL, rl)

        assert resp.status_code == 200
        assert client.get_calls == 1

    @pytest.mark.asyncio
    async def test_retry_on_429(self):
        """A 429 is retried until a success arrives."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, retry_max_attempts=3)
        client = _ScriptedClient(
            _response(429, headers={"Retry-After": "0"}),
            _response(200, text="OK"),
        )

        resp = await request_with_retry(client, "GET", _TEST_URL, rl)

        assert resp.status_code == 200
        assert client.get_calls == 2

    @pytest.mark.asyncio
    async def test_retry_exhausted_returns_last(self):
        """Once retries are exhausted the last 429 response is returned."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, retry_max_attempts=2)
        client = _ScriptedClient(_response(429, headers={"Retry-After": "0"}))

        resp = await request_with_retry(client, "GET", _TEST_URL, rl)

        assert resp.status_code == 429
        # 1 initial + 2 retries = 3
        assert client.get_calls == 3

    @pytest.mark.asyncio
    async def test_no_retry_on_404(self):
        """A 404 is not retried."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, retry_max_attempts=3)
        client = _ScriptedClient(_response(404))

        resp = await request_with_retry(client, "GET", _TEST_URL, rl)

        assert resp.status_code == 404
        assert client.get_calls == 1

    @pytest.mark.asyncio
    async def test_retry_on_503(self):
        """A 503 is retried."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, retry_max_attempts=1)
        client = _ScriptedClient(_response(503), _response(200, text="OK"))

        resp = await request_with_retry(client, "GET", _TEST_URL, rl)

        assert resp.status_code == 200
        assert client.get_calls == 2

    @pytest.mark.asyncio
    async def test_zero_retries_no_retry(self):
        """With retry_max_attempts=0 nothing is retried."""
        rl = DomainRateLimiter(delay_ms=0, jitter_ms=0, retry_max_attempts=0)
        client = _ScriptedClient(_response(429))

        resp = await request_with_retry(client, "GET", _TEST_URL, rl)

        assert resp.status_code == 429
        assert client.get_calls == 1


# ---------------------------------------------------------------------------
# Config integration
# ---------------------------------------------------------------------------

class TestConfigIntegration:
    def test_new_config_fields_defaults(self):
        from core.config import AppSettings
        s = AppSettings()
        assert s.request_delay_ms == 200
        assert s.request_jitter_ms == 100
        assert s.per_domain_concurrency == 3
        assert s.retry_max_attempts == 3

    def test_max_concurrency_cap(self):
        from core.config import AppSettings
        # 50 is the maximum accepted value.
        s = AppSettings(sites_max_concurrency=50)
        assert s.sites_max_concurrency == 50

    def test_max_concurrency_over_cap_fails(self):
        from core.config import AppSettings
        from pydantic import ValidationError
        with pytest.raises(ValidationError):
            AppSettings(sites_max_concurrency=500)