diff --git a/.env.example b/.env.example index 440b765..eee9425 100644 --- a/.env.example +++ b/.env.example @@ -1,14 +1,23 @@ DATABASE_URL=postgresql+psycopg://sentinel:sentinel@localhost:5432/sentinel ANTHROPIC_API_KEY= -LLM_PROVIDER=anthropic # one of: anthropic, fake (tests/CI use 'fake') +LLM_PROVIDER=anthropic # one of: anthropic, gemini, fake (tests/CI use 'fake') CLAUDE_MODEL=claude-sonnet-4-6 LLM_TEMPERATURE=0.0 # pin to 0.0 for determinism in eval (M9) LLM_MAX_TOKENS=1024 -EMBEDDINGS_PROVIDER=openai # one of: openai, voyage, fake (tests/CI use 'fake') -EMBEDDING_DIM=1536 # 1536 = text-embedding-3-small; 1024 = voyage-3-lite +EMBEDDINGS_PROVIDER=openai # one of: openai, voyage, gemini, fake (tests/CI use 'fake') +EMBEDDING_DIM=1536 # 1536 = text-embedding-3-small / gemini-embedding-2; 1024 = voyage-3-lite OPENAI_API_KEY= OPENAI_EMBEDDING_MODEL=text-embedding-3-small VOYAGE_API_KEY= + +# Google AI Studio / Gemini (free key path). Set LLM_PROVIDER=gemini and/or +# EMBEDDINGS_PROVIDER=gemini above to run the stack on a single free Google key. 
+GEMINI_API_KEY= +# GOOGLE_API_KEY= # fallback if GEMINI_API_KEY is unset +GEMINI_MODEL=gemini-3.5-flash # fallback: gemini-2.5-flash if 3.5 is unavailable to your account/region +GEMINI_EMBEDDING_MODEL=gemini-embedding-2 # fallback: gemini-embedding-001 if -2 is unavailable +GEMINI_BASE_URL=https://generativelanguage.googleapis.com/v1beta + CHUNK_SIZE_TOKENS=512 CHUNK_OVERLAP_TOKENS=64 RETRIEVAL_TOP_K=5 diff --git a/CLAUDE.md b/CLAUDE.md index 55e5abc..373d63c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -53,9 +53,11 @@ This is a portfolio project demonstrating enterprise-grade, auditable AI for reg - **Backend:** Python 3.12, FastAPI, Pydantic v2, SQLAlchemy 2.x, Alembic - **DB:** PostgreSQL 16 + `pgvector` -- **AI:** Anthropic Claude API for generation/extraction; embeddings via a hosted provider - (`text-embedding-3-small` or `voyage-3-lite`) **behind an interface** in `backend/app/llm/` and - `backend/app/embeddings/` so both are swappable and **mocked in tests** (no live API calls in CI). +- **AI:** hosted LLM for generation/extraction (Anthropic Claude **or** Google Gemini, via + `LLM_PROVIDER`); embeddings via a hosted provider (`text-embedding-3-small`, `gemini-embedding-2`, + or `voyage-3-lite`, via `EMBEDDINGS_PROVIDER`) — all **behind an interface** in `backend/app/llm/` + and `backend/app/embeddings/` so both are swappable and **mocked in tests** (no live API calls in + CI). A single free Google AI Studio key can drive both LLM and embeddings. 
- **Frontend:** React + TypeScript (Vite), Recharts - **Infra:** Docker + docker-compose (dev); Terraform → AWS ECS Fargate + RDS (M10) - **CI/CD:** GitHub Actions diff --git a/HANDOFF.md b/HANDOFF.md index 8666477..a2a38fc 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -41,8 +41,10 @@ draft the case-study writeup or polish the résumé — but the engineering is C -F enforce_admins=false -F required_status_checks=null -F restrictions=null ``` (Or do it in GitHub → Settings → Branches → add rule on `main`: "Require a pull request before merging".) -6. **API keys:** have an Anthropic API key (the app's LLM) and an embeddings key (OpenAI or Voyage). You'll - put them in `.env` (gitignored) during M2/M3. **CI needs none** — tests mock both providers. +6. **API keys:** have an Anthropic API key (the app's LLM) and an embeddings key (OpenAI or Voyage) — + **or** a single free Google AI Studio key, which drives both the LLM and embeddings when you set + `LLM_PROVIDER=gemini` + `EMBEDDINGS_PROVIDER=gemini` (see the README "Google-only quickstart"). You'll + put them in `.env` (gitignored) during M2/M3. **CI needs none** — tests mock every provider. --- @@ -339,14 +341,19 @@ node_modules/ dist/ ``` DATABASE_URL=postgresql+psycopg://sentinel:sentinel@localhost:5432/sentinel ANTHROPIC_API_KEY= -EMBEDDINGS_PROVIDER=openai # or: voyage +LLM_PROVIDER=anthropic # or: gemini, fake +EMBEDDINGS_PROVIDER=openai # or: voyage, gemini, fake OPENAI_API_KEY= VOYAGE_API_KEY= +GEMINI_API_KEY= # free Google AI Studio key drives both LLM + embeddings RETRIEVAL_TOP_K=5 RETRIEVAL_MIN_SCORE=0.30 CONFIDENCE_REVIEW_THRESHOLD=0.75 ``` +> The committed `.env.example` is the source of truth and has the full set +> (Gemini model/base-url options included); this is an abridged illustration. + ### `docker-compose.yml` ```yaml services: diff --git a/MILESTONES.md b/MILESTONES.md index b309604..a3f67dc 100644 --- a/MILESTONES.md +++ b/MILESTONES.md @@ -160,5 +160,17 @@ and presentable. 
Do not skip ahead — later milestones assume earlier ones exis - Eval expansion (larger labeled set, per-category breakdown). - Observability: OpenTelemetry traces, dashboards. - Reranking stage before generation. +- Shared provider HTTP base. `ClaudeClient`, `GeminiClient`, `OpenAIEmbedder`, and + `GeminiEmbedder` each repeat api-key validation, base-URL normalization, the `httpx` + POST + headers + error handling, and the timeout knob. Extract a small shared base (or + transport helper) to DRY all four and shrink each constructor's argument surface — done + across all providers together so they stay consistent (deliberately out of scope for the + Gemini-provider PR, which keeps the new classes parallel to the existing ones). +- Role-aware embedding API + Gemini retrieval prefixes. `gemini-embedding-2` recommends + instruction-prefixing queries vs. documents (no `task_type` for text retrieval), but the + `EmbeddingProvider.embed(texts)` interface is role-agnostic and shared by ingest and query + paths. Adding prefixes cleanly needs a query/document role threaded through the Protocol and + all providers — deferred from the Gemini-provider PR to avoid changing retrieval semantics + for OpenAI/fake. Requires tests before any behaviour change. > Do not pull backlog items into earlier PRs. Park ideas here. diff --git a/PROGRESS.md b/PROGRESS.md index 3340442..6b0ef49 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -28,6 +28,7 @@ - **#13** — record real-provider eval numbers (M9 follow-up). Stays open until keys are wired and `make eval` is run for real. - **Backlog (MILESTONES.md):** multi-tenant + RBAC, eval set expansion, OTel traces, Multi-AZ + private subnets + ACM TLS + S3/DynamoDB Terraform backend. - **Design system** — dual-theme (dark default + light) audit-grade visual layer for the frontend + a real `GET /dashboard/kpis` endpoint, on branch `claude/serene-maxwell-54yMC` (draft PR). 
Net-new work beyond the M0–M11 roadmap; `make check` green (201 backend pytest, 7 frontend Vitest, ruff/mypy/tsc/build clean). +- **Gemini provider** — first-class Google AI Studio / Gemini support for **both** LLM (`GeminiClient`, `:generateContent`) and embeddings (`GeminiEmbedder`, `:batchEmbedContents`, requesting 1536 dims), so the whole stack runs on a single free Google key; wired through config, both factories, eval run-metadata (active-model labels), `.env.example`/README/eval/architecture/demo docs, and Terraform/ECS (optional `gemini_api_key` SSM param + provider env vars). On branch `feat/gemini-provider`. New offline tests mock `httpx`; `fake` stays the CI default (no live calls). `make check` green (222 backend pytest [+21 Gemini], 7 frontend Vitest, ruff/mypy/terraform clean). No fabricated eval numbers — real-provider eval not run. --- diff --git a/README.md b/README.md index 650c3f2..362c815 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ The full step-by-step is in [`docs/demo.md`](docs/demo.md). Short version # 1. clone git clone https://github.com/div0rce/sentinel.git cd sentinel -cp .env.example .env # set ANTHROPIC_API_KEY and OPENAI_API_KEY +cp .env.example .env # set ANTHROPIC_API_KEY + OPENAI_API_KEY (or use the Google-only path below) # 2. start Postgres + the API docker compose up -d db @@ -143,6 +143,28 @@ curl -s http://localhost:8000/query \ Open for the SPA: **Query**, **Review**, and **Dashboard** views. +### Google-only quickstart (one free Google AI Studio key) + +Sentinel speaks Gemini for both the LLM and embeddings, so you can run the whole +stack on a single free [Google AI Studio](https://aistudio.google.com/apikey) key +— no Anthropic or OpenAI key required. After `cp .env.example .env`, set: + +```bash +GEMINI_API_KEY=... 
# GOOGLE_API_KEY also works +LLM_PROVIDER=gemini +GEMINI_MODEL=gemini-3.5-flash # fallback: gemini-2.5-flash if 3.5 isn't available to your account +EMBEDDINGS_PROVIDER=gemini +GEMINI_EMBEDDING_MODEL=gemini-embedding-2 +EMBEDDING_DIM=1536 +``` + +Then continue with `docker compose up -d db && make dev && make migrate && make seed`. + +> **Switching embedding providers?** Embeddings from different providers/models are +> **not comparable** — never mix them in one seeded DB. After changing +> `EMBEDDINGS_PROVIDER`/`GEMINI_EMBEDDING_MODEL`/`EMBEDDING_DIM`, reset and reseed: +> `docker compose down -v && docker compose up -d db && make migrate && make seed`. + ### Run the test suite ```bash @@ -184,6 +206,19 @@ export EMBEDDINGS_PROVIDER=openai make migrate && make seed && make eval ``` +…or on a single free Google AI Studio key (re-seed first, since Gemini embeddings +are not comparable to OpenAI's): + +```bash +export GEMINI_API_KEY=... +export LLM_PROVIDER=gemini +export EMBEDDINGS_PROVIDER=gemini +export GEMINI_MODEL=gemini-3.5-flash +export GEMINI_EMBEDDING_MODEL=gemini-embedding-2 +export EMBEDDING_DIM=1536 +make migrate && make seed && make eval +``` + ## Governance & guardrails Three pillars, all deterministic and tested: diff --git a/backend/app/config.py b/backend/app/config.py index d1c954b..4b45941 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -43,11 +43,20 @@ class Settings(BaseSettings): "against the canonical database schema dimension before storing vectors." ), ) - embeddings_provider: Literal["openai", "voyage", "fake"] = "openai" + embeddings_provider: Literal["openai", "voyage", "gemini", "fake"] = "openai" openai_embedding_model: str = Field( default="text-embedding-3-small", description="OpenAI embedding model id used when embeddings_provider='openai'.", ) + gemini_embedding_model: str = Field( + default="gemini-embedding-2", + description=( + "Gemini embedding model id used when embeddings_provider='gemini'. 
Supports " + "flexible output dimensions (128–3072); EMBEDDING_DIM must still equal the " + "database schema dimension (1536). If 'gemini-embedding-2' is unavailable to " + "your account/region, 'gemini-embedding-001' is a compatible alternative." + ), + ) # --- Chunking (consumed from M2 onward) --------------------------------------- @@ -67,7 +76,7 @@ class Settings(BaseSettings): # --- LLM (consumed from M3 onward) -------------------------------------------- - llm_provider: Literal["anthropic", "fake"] = "anthropic" + llm_provider: Literal["anthropic", "gemini", "fake"] = "anthropic" claude_model: str = Field( default="claude-sonnet-4-6", description=( @@ -76,6 +85,18 @@ class Settings(BaseSettings): "model-versioning docs); bumping this default is intentional." ), ) + gemini_model: str = Field( + default="gemini-3.5-flash", + description=( + "Gemini model id used when llm_provider='gemini'. If 'gemini-3.5-flash' is " + "not available to your account/region, 'gemini-2.5-flash' is a stable " + "fallback." + ), + ) + gemini_base_url: str = Field( + default="https://generativelanguage.googleapis.com/v1beta", + description="Base URL for the Gemini (Google AI Studio) REST API.", + ) llm_temperature: float = Field( default=0.0, ge=0.0, @@ -97,6 +118,14 @@ class Settings(BaseSettings): anthropic_api_key: str = "" openai_api_key: str = "" voyage_api_key: str = "" + gemini_api_key: str = "" + google_api_key: str = Field( + default="", + description=( + "Fallback for GEMINI_API_KEY. Google AI Studio keys work under either name; " + "GEMINI_API_KEY is the documented one and takes precedence." + ), + ) # --- Retrieval and review thresholds (consumed from M3/M5 onward) ------------- @@ -116,6 +145,32 @@ class Settings(BaseSettings): ), ) + # --- Resolved-by-provider model labels (consumed by the eval harness) --------- + + @property + def active_llm_model(self) -> str: + """Model id of the *currently selected* LLM provider. 
+ + Used by the eval harness so RESULTS.md reports the model that actually ran + rather than always labelling it with ``claude_model`` (Golden Rule #5). + """ + if self.llm_provider == "anthropic": + return self.claude_model + if self.llm_provider == "gemini": + return self.gemini_model + return "fake-llm" + + @property + def active_embedding_model(self) -> str: + """Embedding model id of the *currently selected* embeddings provider.""" + if self.embeddings_provider == "openai": + return self.openai_embedding_model + if self.embeddings_provider == "gemini": + return self.gemini_embedding_model + # 'voyage' has no model field yet (provider unimplemented) and 'fake' is + # non-semantic; fall back to the provider name so the label is never wrong. + return self.embeddings_provider + @lru_cache(maxsize=1) def get_settings() -> Settings: diff --git a/backend/app/embeddings/__init__.py b/backend/app/embeddings/__init__.py index 562fb9f..231e66d 100644 --- a/backend/app/embeddings/__init__.py +++ b/backend/app/embeddings/__init__.py @@ -5,6 +5,7 @@ * :class:`EmbeddingProvider` — the protocol all providers implement. * :class:`FakeEmbedder` — deterministic, no-API embedder for tests/CI. * :class:`OpenAIEmbedder` — hosted ``text-embedding-3-*`` via OpenAI's REST API. +* :class:`GeminiEmbedder` — hosted ``gemini-embedding-*`` via Google's REST API. * :func:`get_embedder` — factory that maps :class:`backend.app.config.Settings` to the right provider, validating that the runtime ``embedding_dim`` matches the canonical database schema dimension before any vector is generated. 
@@ -15,12 +16,14 @@ from backend.app.config import Settings, get_settings from backend.app.embeddings.base import EmbeddingProvider from backend.app.embeddings.fake import FakeEmbedder +from backend.app.embeddings.gemini_provider import GeminiEmbedder from backend.app.embeddings.openai_provider import OpenAIEmbedder from backend.app.models import SCHEMA_EMBEDDING_DIM __all__ = [ "EmbeddingProvider", "FakeEmbedder", + "GeminiEmbedder", "OpenAIEmbedder", "get_embedder", ] @@ -52,6 +55,13 @@ def get_embedder(settings: Settings | None = None) -> EmbeddingProvider: model=settings.openai_embedding_model, dim=SCHEMA_EMBEDDING_DIM, ) + if provider == "gemini": + return GeminiEmbedder( + api_key=settings.gemini_api_key or settings.google_api_key, + model=settings.gemini_embedding_model, + dim=SCHEMA_EMBEDDING_DIM, + base_url=settings.gemini_base_url, + ) if provider == "voyage": # Voyage support arrives in a later milestone; fail loudly so misconfiguration # in CI or production is surfaced before any ingest work runs. diff --git a/backend/app/embeddings/gemini_provider.py b/backend/app/embeddings/gemini_provider.py new file mode 100644 index 0000000..a97e92f --- /dev/null +++ b/backend/app/embeddings/gemini_provider.py @@ -0,0 +1,112 @@ +"""Google AI Studio / Gemini embeddings provider. + +Wraps a single POST against ``:batchEmbedContents`` so the rest of the pipeline can swap +providers behind :class:`backend.app.embeddings.base.EmbeddingProvider`. Like the OpenAI +embedder, the Google SDK is intentionally not a dependency — the embeddings endpoint is +small and stable. + +The REST field that controls vector size is the snake_case ``output_dimensionality`` (the +JS SDK uses camelCase ``outputDimensionality``; the REST body does not). ``gemini-embedding-2`` +supports flexible dimensions, so we request exactly the schema dimension and validate the +returned length to fail loudly on any mismatch. + +CI never exercises this provider (``EMBEDDINGS_PROVIDER=fake``). 
class GeminiEmbedder:
    """Gemini ``models/{model}:batchEmbedContents`` embeddings provider.

    Issues one ``httpx`` POST per :meth:`embed` call and validates the response
    shape before returning, so a malformed or wrongly-sized vector is rejected
    here instead of being handed on to the caller.
    """

    DEFAULT_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
    DEFAULT_TIMEOUT_SECONDS = 30.0

    # Constructor is kept parallel to OpenAIEmbedder (api_key, model, dim, base_url,
    # timeout): keyword-only and self-documenting at every call site. Consistency
    # across the provider classes is deliberate; see the shared-provider-base backlog
    # item in MILESTONES.md for the cross-cutting DRY pass.
    def __init__(
        self,
        *,
        api_key: str,
        model: str,
        dim: int,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT_SECONDS,
    ) -> None:
        """Configure the embedder.

        Args:
            api_key: Key sent in the ``x-goog-api-key`` header. Must be non-empty.
            model: Bare model id (without the ``models/`` prefix).
            dim: Output vector size requested via ``output_dimensionality`` and
                enforced on every response. Must be >= 1.
            base_url: API root; a trailing slash is stripped so the request URL
                is well-formed however the setting was written.
            timeout: Per-request timeout in seconds, forwarded to ``httpx.post``.

        Raises:
            ValueError: If ``api_key`` is empty or ``dim < 1`` — fail fast rather
                than at the first request.
        """
        if not api_key:
            raise ValueError("GEMINI_API_KEY is required to use GeminiEmbedder")
        if dim < 1:
            raise ValueError(f"dim must be >= 1, got {dim}")
        self._api_key = api_key
        self._model = model
        self._dim = dim
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout

    @property
    def dim(self) -> int:
        """Vector length this embedder requests and validates."""
        return self._dim

    def embed(self, texts: Sequence[str]) -> list[list[float]]:
        """Embed ``texts`` into one ``dim``-length vector each, in input order.

        Returns ``[]`` for empty input without issuing a request.

        Raises:
            RuntimeError: On a non-2xx response, or when the response body is
                malformed, has the wrong number of embeddings, or contains a
                vector whose length differs from ``dim``.
        """
        if not texts:
            return []
        # One request object per input text. The model must be the fully-qualified
        # ``models/{id}`` form inside each request object.
        requests: list[dict[str, Any]] = [
            {
                "model": f"models/{self._model}",
                "content": {"parts": [{"text": text}]},
                "output_dimensionality": self._dim,
            }
            for text in texts
        ]
        response = httpx.post(
            f"{self._base_url}/models/{self._model}:batchEmbedContents",
            headers={
                "x-goog-api-key": self._api_key,
                "content-type": "application/json",
            },
            json={"requests": requests},
            timeout=self._timeout,
        )
        raise_for_gemini_error(response, model=self._model)
        return _parse_batch_embeddings(response.json(), expected_count=len(texts), dim=self._dim)


def _parse_batch_embeddings(
    body: dict[str, Any], *, expected_count: int, dim: int
) -> list[list[float]]:
    """Extract and validate vectors from a ``batchEmbedContents`` response body.

    Args:
        body: Decoded JSON response; expected to be an object with an
            ``embeddings`` list of ``{"values": [...]}`` items.
        expected_count: Number of texts that were sent.
        dim: Required length of every returned vector.

    Returns:
        One ``dim``-length vector per request, in response order.

    Raises:
        RuntimeError: If the body is not an object, ``embeddings`` is not a list,
            the count differs from ``expected_count``, an item has no ``values``
            list, or a vector's length differs from ``dim``. Every shape problem
            is reported as ``RuntimeError`` so callers see a single failure type
            instead of an incidental ``KeyError``/``TypeError``/``AttributeError``.
    """
    if not isinstance(body, dict):
        raise RuntimeError(
            f"Gemini returned a malformed response body of type {type(body).__name__}"
        )
    # A missing or null ``embeddings`` field is treated as "no embeddings".
    items = body.get("embeddings") or []
    if not isinstance(items, list):
        raise RuntimeError(
            f"Gemini returned a malformed 'embeddings' field of type {type(items).__name__}"
        )
    if len(items) != expected_count:
        raise RuntimeError(f"Gemini returned {len(items)} embeddings, expected {expected_count}")

    vectors: list[list[float]] = []
    for index, item in enumerate(items):
        values = item.get("values") if isinstance(item, dict) else None
        if not isinstance(values, (list, tuple)):
            # Deliberately does not echo the item itself: error text ends up in logs.
            raise RuntimeError(
                f"Gemini returned a malformed embedding at index {index}: missing 'values' list"
            )
        if len(values) != dim:
            raise RuntimeError(f"Gemini returned vector of length {len(values)}, expected {dim}")
        vectors.append(list(values))
    return vectors
def raise_for_gemini_error(response: httpx.Response, *, model: str) -> None:
    """Turn a non-2xx Gemini response into a ``RuntimeError``; no-op on success.

    The raised message carries the model id, the HTTP status code and, when the
    error body provides one, Gemini's own ``error.message``. Neither the API key
    nor the request body is included, because these messages tend to be logged.

    Args:
        response: The response returned by the Gemini REST call.
        model: Model id the request targeted, echoed in the error message.

    Raises:
        RuntimeError: When ``response.is_success`` is false.
    """
    if not response.is_success:
        suffix = _gemini_error_suffix(response)
        raise RuntimeError(
            f"Gemini request for model {model!r} failed with status {response.status_code}{suffix}"
        )


def _gemini_error_suffix(response: httpx.Response) -> str:
    """Return ``": <message>"`` from a Gemini error body, or ``""`` if unavailable."""
    try:
        payload = response.json()
    except (ValueError, httpx.DecodingError):
        # Body is not decodable JSON: report the status code alone.
        return ""
    error = payload.get("error") if isinstance(payload, dict) else None
    message = error.get("message") if isinstance(error, dict) else None
    return f": {message}" if message else ""
class GeminiClient:
    """LLM client for Gemini's ``models/{model}:generateContent`` endpoint."""

    DEFAULT_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
    DEFAULT_TIMEOUT_SECONDS = 60.0

    def __init__(
        self,
        *,
        api_key: str,
        model: str,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT_SECONDS,
    ) -> None:
        """Store connection settings.

        A trailing slash on ``base_url`` is stripped so the request URL is
        well-formed however ``GEMINI_BASE_URL`` was written.

        Raises:
            ValueError: If ``api_key`` is empty — fail at construction rather
                than on the first request.
        """
        if not api_key:
            raise ValueError("GEMINI_API_KEY is required to use GeminiClient")
        self._api_key = api_key
        self._model = model
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout

    @property
    def model_name(self) -> str:
        """Model id this client was configured with."""
        return self._model

    def complete(
        self,
        *,
        system: str,
        user: str,
        max_tokens: int,
        temperature: float,
    ) -> LLMResponse:
        """Request one completion for the ``(system, user)`` prompt pair.

        ``user`` is sent as a single user turn. ``system`` becomes a
        ``systemInstruction`` only when it is non-empty. The text parts of the
        first candidate are concatenated; a response with no candidates yields
        empty text instead of an exception.

        Raises:
            RuntimeError: On a non-2xx response (via ``raise_for_gemini_error``).
        """
        generation_config = {
            "temperature": temperature,
            "maxOutputTokens": max_tokens,
        }
        request_body: dict[str, Any] = {
            "contents": [{"role": "user", "parts": [{"text": user}]}],
            "generationConfig": generation_config,
        }
        # Attach the system instruction last and only when supplied; an empty
        # systemInstruction can be rejected by the API.
        if system:
            request_body["systemInstruction"] = {"parts": [{"text": system}]}

        endpoint = f"{self._base_url}/models/{self._model}:generateContent"
        request_headers = {
            "x-goog-api-key": self._api_key,
            "content-type": "application/json",
        }
        response = httpx.post(
            endpoint,
            headers=request_headers,
            json=request_body,
            timeout=self._timeout,
        )
        raise_for_gemini_error(response, model=self._model)
        return _parse_generate_content(response.json(), default_model=self._model)


def _parse_generate_content(body: dict[str, Any], *, default_model: str) -> LLMResponse:
    """Convert a ``generateContent`` response body into an :class:`LLMResponse`.

    The model label is ``modelVersion``, else ``model``, else ``default_model``.
    Only the first candidate is read: parts that carry a ``text`` key are joined
    in order, and its ``finishReason`` becomes ``stop_reason``. A missing or
    empty candidate list produces empty text with ``stop_reason=None``.
    """
    resolved_model = str(body.get("modelVersion") or body.get("model") or default_model)
    candidate_list = body.get("candidates") or []
    if len(candidate_list) == 0:
        return LLMResponse(text="", model=resolved_model, stop_reason=None)

    candidate = candidate_list[0]
    content = candidate.get("content") or {}
    fragments: list[str] = []
    for part in content.get("parts") or []:
        # Non-text parts are skipped rather than stringified.
        if "text" in part:
            fragments.append(str(part.get("text", "")))
    return LLMResponse(
        text="".join(fragments),
        model=resolved_model,
        stop_reason=candidate.get("finishReason"),
    )
+""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from backend.app.config import Settings +from backend.app.embeddings import FakeEmbedder, GeminiEmbedder, OpenAIEmbedder, get_embedder +from backend.app.embeddings.gemini_provider import GeminiEmbedder as GeminiEmbedderDirect +from backend.app.models import SCHEMA_EMBEDDING_DIM + + +class _FakeResponse: + def __init__(self, *, status_code: int, body: dict[str, Any]) -> None: + self.status_code = status_code + self._body = body + + @property + def is_success(self) -> bool: + return self.status_code < 400 + + def json(self) -> Any: + return self._body + + +def _patch_post( + monkeypatch: pytest.MonkeyPatch, + *, + status_code: int = 200, + body: dict[str, Any] | None = None, +) -> dict[str, Any]: + captured: dict[str, Any] = {} + + def fake_post( + url: str, + *, + headers: dict[str, str], + json: dict[str, Any], + timeout: float, + ) -> _FakeResponse: + captured["url"] = url + captured["headers"] = headers + captured["json"] = json + captured["timeout"] = timeout + return _FakeResponse(status_code=status_code, body=body or {}) + + monkeypatch.setattr("backend.app.embeddings.gemini_provider.httpx.post", fake_post) + return captured + + +# --- factory ------------------------------------------------------------------ + + +def test_factory_returns_gemini_when_provider_is_gemini() -> None: + settings = Settings(embeddings_provider="gemini", gemini_api_key="test-key") + embedder = get_embedder(settings) + assert isinstance(embedder, GeminiEmbedder) + assert embedder.dim == SCHEMA_EMBEDDING_DIM + + +def test_factory_raises_when_gemini_key_missing() -> None: + settings = Settings(embeddings_provider="gemini", gemini_api_key="", google_api_key="") + with pytest.raises(ValueError, match="GEMINI_API_KEY is required"): + get_embedder(settings) + + +def test_factory_falls_back_to_google_api_key(monkeypatch: pytest.MonkeyPatch) -> None: + settings = Settings(embeddings_provider="gemini", 
gemini_api_key="", google_api_key="goog") + embedder = get_embedder(settings) + assert isinstance(embedder, GeminiEmbedder) + + captured = _patch_post( + monkeypatch, + body={"embeddings": [{"values": [0.0] * SCHEMA_EMBEDDING_DIM}]}, + ) + embedder.embed(["hello"]) + assert captured["headers"]["x-goog-api-key"] == "goog" + + +def test_factory_fake_and_openai_behaviour_unchanged() -> None: + assert isinstance(get_embedder(Settings(embeddings_provider="fake")), FakeEmbedder) + openai = get_embedder(Settings(embeddings_provider="openai", openai_api_key="sk-x")) + assert isinstance(openai, OpenAIEmbedder) + + +# --- behaviour ---------------------------------------------------------------- + + +def test_embed_empty_returns_empty() -> None: + embedder = GeminiEmbedderDirect(api_key="k", model="gemini-embedding-2", dim=4) + assert embedder.embed([]) == [] + + +def test_multiple_inputs_return_vectors_in_order(monkeypatch: pytest.MonkeyPatch) -> None: + captured = _patch_post( + monkeypatch, + body={ + "embeddings": [ + {"values": [0.1, 0.2, 0.3, 0.4]}, + {"values": [0.5, 0.6, 0.7, 0.8]}, + ] + }, + ) + embedder = GeminiEmbedderDirect(api_key="ek", model="gemini-embedding-2", dim=4) + vectors = embedder.embed(["first", "second"]) + + assert vectors == [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]] + assert captured["headers"]["x-goog-api-key"] == "ek" + requests = captured["json"]["requests"] + assert len(requests) == 2 + assert requests[0]["content"]["parts"][0]["text"] == "first" + assert requests[0]["model"] == "models/gemini-embedding-2" + assert all(r["output_dimensionality"] == 4 for r in requests) + + +def test_timeout_forwarded_and_url_normalized(monkeypatch: pytest.MonkeyPatch) -> None: + captured = _patch_post(monkeypatch, body={"embeddings": [{"values": [1.0, 2.0, 3.0, 4.0]}]}) + embedder = GeminiEmbedderDirect( + api_key="k", + model="gemini-embedding-2", + dim=4, + base_url="https://example.test/v1beta/", # trailing slash on purpose + timeout=7.0, + ) + 
embedder.embed(["x"]) + assert captured["timeout"] == 7.0 + assert ( + captured["url"] + == "https://example.test/v1beta/models/gemini-embedding-2:batchEmbedContents" + ) + + +def test_wrong_vector_dimension_raises(monkeypatch: pytest.MonkeyPatch) -> None: + _patch_post(monkeypatch, body={"embeddings": [{"values": [0.1, 0.2, 0.3]}]}) + embedder = GeminiEmbedderDirect(api_key="k", model="gemini-embedding-2", dim=4) + with pytest.raises(RuntimeError, match="length 3, expected 4"): + embedder.embed(["x"]) + + +def test_wrong_count_raises(monkeypatch: pytest.MonkeyPatch) -> None: + _patch_post(monkeypatch, body={"embeddings": [{"values": [0.1, 0.2, 0.3, 0.4]}]}) + embedder = GeminiEmbedderDirect(api_key="k", model="gemini-embedding-2", dim=4) + with pytest.raises(RuntimeError, match="1 embeddings, expected 2"): + embedder.embed(["x", "y"]) + + +def test_missing_key_raises() -> None: + with pytest.raises(ValueError, match="GEMINI_API_KEY is required"): + GeminiEmbedderDirect(api_key="", model="gemini-embedding-2", dim=4) + + +def test_non_2xx_raises_runtime_error_without_leaking_key( + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_post( + monkeypatch, + status_code=429, + body={"error": {"message": "quota exceeded"}}, + ) + embedder = GeminiEmbedderDirect(api_key="secret-key-123", model="gemini-embedding-2", dim=4) + with pytest.raises(RuntimeError) as excinfo: + embedder.embed(["x"]) + + message = str(excinfo.value) + assert "429" in message + assert "quota exceeded" in message + assert "gemini-embedding-2" in message + assert "secret-key-123" not in message diff --git a/backend/tests/test_gemini_llm.py b/backend/tests/test_gemini_llm.py new file mode 100644 index 0000000..8e2bf85 --- /dev/null +++ b/backend/tests/test_gemini_llm.py @@ -0,0 +1,182 @@ +"""Tests for the Gemini LLM client and its factory wiring. 
+ +No network: ``httpx.post`` is monkeypatched with a typed fake so the request shape and +response parsing are exercised offline (matching the repo's no-live-API-in-CI rule). +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from backend.app.config import Settings +from backend.app.llm import FakeLLM, GeminiClient, get_llm +from backend.app.llm.gemini import GeminiClient as GeminiClientDirect + + +class _FakeResponse: + """Minimal stand-in for ``httpx.Response`` used by the Gemini providers.""" + + def __init__(self, *, status_code: int, body: dict[str, Any]) -> None: + self.status_code = status_code + self._body = body + + @property + def is_success(self) -> bool: + return self.status_code < 400 + + def json(self) -> Any: + return self._body + + +def _patch_post( + monkeypatch: pytest.MonkeyPatch, + *, + status_code: int = 200, + body: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Patch ``gemini.httpx.post`` and return a dict that captures the call kwargs.""" + captured: dict[str, Any] = {} + + def fake_post( + url: str, + *, + headers: dict[str, str], + json: dict[str, Any], + timeout: float, + ) -> _FakeResponse: + captured["url"] = url + captured["headers"] = headers + captured["json"] = json + captured["timeout"] = timeout + return _FakeResponse(status_code=status_code, body=body or {}) + + monkeypatch.setattr("backend.app.llm.gemini.httpx.post", fake_post) + return captured + + +# --- factory ------------------------------------------------------------------ + + +def test_factory_returns_gemini_when_provider_is_gemini() -> None: + settings = Settings(llm_provider="gemini", gemini_api_key="test-key") + assert isinstance(get_llm(settings), GeminiClient) + + +def test_factory_raises_when_gemini_key_missing() -> None: + settings = Settings(llm_provider="gemini", gemini_api_key="", google_api_key="") + with pytest.raises(ValueError, match="GEMINI_API_KEY is required"): + get_llm(settings) + + +def 
test_factory_falls_back_to_google_api_key(monkeypatch: pytest.MonkeyPatch) -> None: + settings = Settings(llm_provider="gemini", gemini_api_key="", google_api_key="goog-key") + client = get_llm(settings) + assert isinstance(client, GeminiClient) + + captured = _patch_post(monkeypatch, body={"candidates": []}) + client.complete(system="s", user="u", max_tokens=8, temperature=0.0) + assert captured["headers"]["x-goog-api-key"] == "goog-key" + + +def test_fake_provider_still_works_without_keys() -> None: + settings = Settings(llm_provider="fake") + assert isinstance(get_llm(settings), FakeLLM) + + +# --- request shape ------------------------------------------------------------ + + +def test_request_uses_api_key_header_and_expected_body( + monkeypatch: pytest.MonkeyPatch, +) -> None: + captured = _patch_post( + monkeypatch, + body={"candidates": [{"content": {"parts": [{"text": "ok"}]}}]}, + ) + client = GeminiClientDirect(api_key="header-key", model="gemini-3.5-flash") + client.complete(system="be terse", user="hello", max_tokens=64, temperature=0.0) + + assert captured["headers"]["x-goog-api-key"] == "header-key" + payload = captured["json"] + assert payload["systemInstruction"] == {"parts": [{"text": "be terse"}]} + assert payload["contents"] == [{"role": "user", "parts": [{"text": "hello"}]}] + assert payload["generationConfig"]["temperature"] == 0.0 + assert payload["generationConfig"]["maxOutputTokens"] == 64 + + +def test_empty_system_omits_system_instruction(monkeypatch: pytest.MonkeyPatch) -> None: + captured = _patch_post(monkeypatch, body={"candidates": []}) + client = GeminiClientDirect(api_key="k", model="gemini-3.5-flash") + client.complete(system="", user="hi", max_tokens=8, temperature=0.0) + assert "systemInstruction" not in captured["json"] + + +def test_timeout_forwarded_and_url_normalized(monkeypatch: pytest.MonkeyPatch) -> None: + captured = _patch_post(monkeypatch, body={"candidates": []}) + client = GeminiClientDirect( + api_key="k", + 
model="gemini-3.5-flash", + base_url="https://example.test/v1beta/", # trailing slash on purpose + timeout=12.5, + ) + client.complete(system="s", user="u", max_tokens=8, temperature=0.0) + assert captured["timeout"] == 12.5 + assert captured["url"] == "https://example.test/v1beta/models/gemini-3.5-flash:generateContent" + + +# --- response parsing --------------------------------------------------------- + + +def test_concatenates_text_parts_and_maps_finish_reason( + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_post( + monkeypatch, + body={ + "candidates": [ + { + "content": {"parts": [{"text": "Hello "}, {"text": "world"}]}, + "finishReason": "STOP", + } + ], + "modelVersion": "gemini-3.5-flash-001", + }, + ) + client = GeminiClientDirect(api_key="k", model="gemini-3.5-flash") + result = client.complete(system="s", user="u", max_tokens=8, temperature=0.0) + assert result.text == "Hello world" + assert result.stop_reason == "STOP" + assert result.model == "gemini-3.5-flash-001" + + +def test_empty_candidates_returns_empty_text(monkeypatch: pytest.MonkeyPatch) -> None: + _patch_post(monkeypatch, body={"candidates": []}) + client = GeminiClientDirect(api_key="k", model="gemini-3.5-flash") + result = client.complete(system="s", user="u", max_tokens=8, temperature=0.0) + assert result.text == "" + assert result.model == "gemini-3.5-flash" + assert result.stop_reason is None + + +# --- error handling ----------------------------------------------------------- + + +def test_non_2xx_raises_runtime_error_without_leaking_key( + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_post( + monkeypatch, + status_code=400, + body={"error": {"message": "model not found"}}, + ) + client = GeminiClientDirect(api_key="secret-key-123", model="gemini-3.5-flash") + with pytest.raises(RuntimeError) as excinfo: + client.complete(system="s", user="u", max_tokens=8, temperature=0.0) + + message = str(excinfo.value) + assert "400" in message + assert "model not found" in message 
+ assert "gemini-3.5-flash" in message + assert "secret-key-123" not in message diff --git a/docs/architecture.md b/docs/architecture.md index c3c2d4b..76c4e12 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -123,10 +123,21 @@ the database never sees raw emails / SSNs / phone numbers / IPs. ### Embeddings (`backend/app/embeddings/`) -Provider behind an interface. Two implementations: `OpenAIEmbedder` -(`text-embedding-3-small`, 1536 dims) and `FakeEmbedder` (deterministic SHA-256 -projection used in CI and unit tests). Provider is selected by -`EMBEDDINGS_PROVIDER`. CI runs offline with `EMBEDDINGS_PROVIDER=fake`. +Provider behind an interface. Implementations: `OpenAIEmbedder` +(`text-embedding-3-small`, 1536 dims), `GeminiEmbedder` (`gemini-embedding-2`, +requesting 1536 dims via the REST `output_dimensionality` field), and +`FakeEmbedder` (deterministic SHA-256 projection used in CI and unit tests). +Provider is selected by `EMBEDDINGS_PROVIDER`. CI runs offline with +`EMBEDDINGS_PROVIDER=fake`. Vectors from different providers are not comparable, +so a provider/model/dimension change requires a re-seed. + +### LLM (`backend/app/llm/`) + +Single-turn chat behind an interface. Implementations: `ClaudeClient` +(Anthropic `/v1/messages`), `GeminiClient` (Google `:generateContent`), and +`FakeLLM` (canned, deterministic). Provider is selected by `LLM_PROVIDER`; both +hosted clients talk to their REST API directly via `httpx` (no vendor SDK). A +single free Google AI Studio key can therefore drive both the LLM and embeddings. ### Retrieval (`backend/app/retrieval.py`) @@ -407,7 +418,7 @@ flowchart TB ecr_be[(ECR sentinel-backend)] ecr_fe[(ECR sentinel-frontend)] - ssm[(SSM SecureString
/sentinel/anthropic_api_key
/sentinel/openai_api_key
/sentinel/database_url)] + ssm[(SSM SecureString
/sentinel/anthropic_api_key
/sentinel/openai_api_key
/sentinel/gemini_api_key
/sentinel/database_url)] cwlogs[CloudWatch Logs
retention 7d] gha[GitHub Actions OIDC role
scoped: ECR push + ECS update-service] @@ -461,8 +472,9 @@ subnets in the no-NAT design), the SG bars internet reach. | **Total idle floor** | **~$45/mo** | Plus per-second Fargate + traffic charges. | The estimate excludes a NAT Gateway (~$32/mo idle) by design: ECS tasks live -in public subnets with `assign_public_ip = true` so they can reach ECR, -Anthropic, OpenAI, and CloudWatch without one. This is acceptable **only** +in public subnets with `assign_public_ip = true` so they can reach ECR, the +external model APIs (Anthropic / OpenAI / Gemini), and CloudWatch without one. +This is acceptable **only** because the security groups are tight (above) and the deployment is ephemeral. Run `terraform destroy` immediately after demo screenshots — the operator recipe lives in `infra/README.md`. diff --git a/docs/demo.md b/docs/demo.md index ed4a696..348d4bc 100644 --- a/docs/demo.md +++ b/docs/demo.md @@ -19,10 +19,14 @@ final section repeats the demo on AWS using the M10 Terraform stack. | Node | 20 LTS | Vite dev server for the frontend | | Anthropic API key | claude-sonnet-4-6 access | for `/query` and `/extract` | | OpenAI API key | text-embedding-3-small access | for embeddings at ingest time | +| _or_ Google AI Studio key | gemini-3.5-flash + gemini-embedding-2 | drives **both** LLM and embeddings on one free key | Without API keys you can still run the test suite (it uses the deterministic fake LLM and embedder) but `/query` and `/extract` against the real -synthetic corpus need real keys. +synthetic corpus need real keys. The lowest-friction path is a single free +[Google AI Studio](https://aistudio.google.com/apikey) key with +`LLM_PROVIDER=gemini` and `EMBEDDINGS_PROVIDER=gemini` (see the README's +"Google-only quickstart"). 
--- @@ -315,6 +319,9 @@ aws ssm put-parameter --name /sentinel/anthropic_api_key \ --type SecureString --value "$ANTHROPIC_API_KEY" --overwrite aws ssm put-parameter --name /sentinel/openai_api_key \ --type SecureString --value "$OPENAI_API_KEY" --overwrite +# Only if deploying with -var='llm_provider=gemini' / 'embeddings_provider=gemini': +aws ssm put-parameter --name /sentinel/gemini_api_key \ + --type SecureString --value "$GEMINI_API_KEY" --overwrite # Force the backend to pick up the new secret values aws ecs update-service --cluster sentinel-cluster \ diff --git a/docs/evaluation.md b/docs/evaluation.md index 62b4bd2..0733979 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -43,6 +43,13 @@ the run in `eval/RESULTS.md`): `backend.app.models.SCHEMA_EMBEDDING_DIM`. - **Temperature:** `0.0` per CLAUDE.md house style ("pin temperatures for LLM calls used in eval"). + +A run may also use the **Gemini** provider (`gemini-3.5-flash` + +`gemini-embedding-2`, 1536 dimensions) — see "Reproducing the numbers" below. +`eval/RESULTS.md` records the *active* provider/model for the run (the +`llm_model` / `embedding_model` run-metadata labels resolve from the selected +provider, so a Gemini run never mislabels itself as Claude/OpenAI). Numbers from +different providers are not comparable and should not be mixed in one results file. - **k:** `5` for retrieval and as the default top-k passed to RAG. `Settings` defaults reflect these pins; `.env.example` documents them. @@ -160,12 +167,21 @@ ever shipping a number that could be misread as a quality claim. ## Reproducing the numbers ```bash -# 1. Wire keys (real run) +# 1. Wire keys (real run) — Anthropic + OpenAI… export ANTHROPIC_API_KEY=... export OPENAI_API_KEY=... export LLM_PROVIDER=anthropic export EMBEDDINGS_PROVIDER=openai +# …or a single free Google AI Studio key: +# export GEMINI_API_KEY=... 
+# export LLM_PROVIDER=gemini +# export EMBEDDINGS_PROVIDER=gemini +# export GEMINI_MODEL=gemini-3.5-flash +# export GEMINI_EMBEDDING_MODEL=gemini-embedding-2 +# export EMBEDDING_DIM=1536 +# (Re-seed when switching embedding providers — vectors are not comparable.) + # 2. Apply migrations to a fresh DB make migrate diff --git a/eval/harness.py b/eval/harness.py index fa32c92..453be3a 100644 --- a/eval/harness.py +++ b/eval/harness.py @@ -493,11 +493,13 @@ def evaluate_rag( def _settings_summary(settings: Settings) -> dict[str, Any]: + # Report the *active* provider's model, not a hardcoded Anthropic/OpenAI label, so + # a Gemini (or fake) run does not mislabel itself in RESULTS.md (Golden Rule #5). return { "llm_provider": settings.llm_provider, - "claude_model": settings.claude_model, + "llm_model": settings.active_llm_model, "embeddings_provider": settings.embeddings_provider, - "openai_embedding_model": settings.openai_embedding_model, + "embedding_model": settings.active_embedding_model, "embedding_dim": settings.embedding_dim, "llm_temperature": settings.llm_temperature, "retrieval_top_k": settings.retrieval_top_k, diff --git a/eval/results.py b/eval/results.py index 3fe034c..7ae68d6 100644 --- a/eval/results.py +++ b/eval/results.py @@ -33,9 +33,9 @@ def render(report: HarnessReport, *, run_at: datetime | None = None) -> str: lines.append("") lines.append(f"- **Run at (UTC):** {run_at.isoformat()}") lines.append(f"- **LLM provider:** `{report.settings_summary['llm_provider']}`") - lines.append(f"- **LLM model:** `{report.settings_summary['claude_model']}`") + lines.append(f"- **LLM model:** `{report.settings_summary['llm_model']}`") lines.append(f"- **Embedding provider:** `{report.settings_summary['embeddings_provider']}`") - lines.append(f"- **Embedding model:** `{report.settings_summary['openai_embedding_model']}`") + lines.append(f"- **Embedding model:** `{report.settings_summary['embedding_model']}`") lines.append(f"- **Embedding dim:** 
`{report.settings_summary['embedding_dim']}`") lines.append(f"- **LLM temperature:** `{report.settings_summary['llm_temperature']}`") lines.append(f"- **Retrieval k:** `{report.settings_summary['retrieval_top_k']}`") diff --git a/eval/run.py b/eval/run.py index e858740..b3cf073 100644 --- a/eval/run.py +++ b/eval/run.py @@ -40,8 +40,8 @@ def _build_parser() -> argparse.ArgumentParser: def _print_summary(report: HarnessReport) -> None: s = report.settings_summary print( - f"eval: llm={s['llm_provider']}/{s['claude_model']} " - f"emb={s['embeddings_provider']}/{s['openai_embedding_model']} " + f"eval: llm={s['llm_provider']}/{s['llm_model']} " + f"emb={s['embeddings_provider']}/{s['embedding_model']} " f"temp={s['llm_temperature']} k={s['retrieval_top_k']}" ) er = report.extraction diff --git a/infra/README.md b/infra/README.md index dd6df60..e56b20b 100644 --- a/infra/README.md +++ b/infra/README.md @@ -20,8 +20,8 @@ scoped GitHub Actions OIDC role that the manual-dispatch CD workflow assumes. The VPC has two `/24` public subnets and **no NAT Gateway**. ECS tasks live in those public subnets and get assigned public IPs (`assign_public_ip = true`) -so they can reach ECR for image pulls and Anthropic / OpenAI for outbound API -calls. +so they can reach ECR for image pulls and the external model APIs +(Anthropic / OpenAI / Gemini) for outbound calls. This is chosen because a NAT Gateway is the largest avoidable line item in any small AWS deployment (≈$32/month idle, plus ~$0.045/GB processed). For a demo @@ -50,7 +50,7 @@ backend_sg ──→ rds_sg (5432) FastAPI → Postgres ``` Egress is open on the task SGs (so containers can reach ECR / Anthropic / -OpenAI / CloudWatch). RDS has no egress. +OpenAI / Gemini / CloudWatch). RDS has no egress. ### Public routing @@ -93,7 +93,7 @@ binding numbers.** | ECR storage | <$1/mo | 20-image cap on each repo. | | Secrets / SSM | $0 | Standard parameters, not Advanced. 
| | CloudWatch Logs | <$1/mo | 7-day retention; demo log volume is tiny. | -| Data transfer | variable | Outbound from ECS tasks → Anthropic/OpenAI. | +| Data transfer | variable | Outbound from ECS tasks → model APIs. | | **Total idle floor** | **~$45/mo** | Plus per-second Fargate charges + traffic. | `terraform destroy` removes all of the above. Run it the moment screenshots @@ -153,10 +153,16 @@ aws ssm put-parameter --name /sentinel/anthropic_api_key \ aws ssm put-parameter --name /sentinel/openai_api_key \ --type SecureString --value "$OPENAI_API_KEY" --overwrite + +# Only needed if you deploy with the Gemini provider (see below): +aws ssm put-parameter --name /sentinel/gemini_api_key \ + --type SecureString --value "$GEMINI_API_KEY" --overwrite ``` (`/sentinel/database_url` is composed by Terraform from the RDS outputs and -already populated.) +already populated. The Gemini parameter is **provisioned by default** but only +consumed when `llm_provider` or `embeddings_provider` is set to `gemini`, so you +can leave it at its `REPLACE_ME` placeholder unless you select Gemini.) Then bounce the backend service so the new secret values are picked up: @@ -166,6 +172,24 @@ aws ecs update-service \ --force-new-deployment --no-cli-pager ``` +### Deploy with the Gemini provider (one free Google AI Studio key) + +The backend provider/model env vars are Terraform variables (defaults preserve +the Anthropic + OpenAI stack). To run the deployed demo entirely on Gemini: + +```bash +terraform apply \ + -var='llm_provider=gemini' \ + -var='embeddings_provider=gemini' \ + -var='gemini_model=gemini-3.5-flash' \ + -var='gemini_embedding_model=gemini-embedding-2' \ + -var='embedding_dim=1536' +``` + +Write the key (above) before bouncing the service. Embeddings from different +providers are not comparable — switching providers on an already-seeded RDS means +re-ingesting the corpus. + ### Run migrations + seed The backend image runs migrations at task start? **No** — by design. 
Run them diff --git a/infra/main.tf b/infra/main.tf index b25380b..b6a7363 100644 --- a/infra/main.tf +++ b/infra/main.tf @@ -76,6 +76,14 @@ module "ecs" { database_url_secret_arn = module.secrets.database_url_arn anthropic_key_secret_arn = module.secrets.anthropic_key_arn openai_key_secret_arn = module.secrets.openai_key_arn + gemini_key_secret_arn = module.secrets.gemini_key_arn + + # Provider selection (defaults preserve the Anthropic + OpenAI stack). + llm_provider = var.llm_provider + embeddings_provider = var.embeddings_provider + embedding_dim = var.embedding_dim + gemini_model = var.gemini_model + gemini_embedding_model = var.gemini_embedding_model } # OIDC role for the GitHub Actions CD workflow. Created only when a repo is supplied. diff --git a/infra/modules/ecs/main.tf b/infra/modules/ecs/main.tf index 172c1cd..e180284 100644 --- a/infra/modules/ecs/main.tf +++ b/infra/modules/ecs/main.tf @@ -58,6 +58,7 @@ data "aws_iam_policy_document" "task_execution_secrets" { var.database_url_secret_arn, var.anthropic_key_secret_arn, var.openai_key_secret_arn, + var.gemini_key_secret_arn, ] } statement { @@ -210,11 +211,13 @@ locals { ] environment = [ { name = "PORT", value = "8000" }, - { name = "EMBEDDINGS_PROVIDER", value = "openai" }, - { name = "LLM_PROVIDER", value = "anthropic" }, - { name = "EMBEDDING_DIM", value = "1536" }, - { name = "OPENAI_EMBEDDING_MODEL", value = "text-embedding-3-small" }, - { name = "CLAUDE_MODEL", value = "claude-sonnet-4-6" }, + { name = "EMBEDDINGS_PROVIDER", value = var.embeddings_provider }, + { name = "LLM_PROVIDER", value = var.llm_provider }, + { name = "EMBEDDING_DIM", value = var.embedding_dim }, + { name = "OPENAI_EMBEDDING_MODEL", value = var.openai_embedding_model }, + { name = "CLAUDE_MODEL", value = var.claude_model }, + { name = "GEMINI_MODEL", value = var.gemini_model }, + { name = "GEMINI_EMBEDDING_MODEL", value = var.gemini_embedding_model }, { name = "LLM_TEMPERATURE", value = "0.0" }, { name = 
"PII_REDACTION_ENABLED", value = "true" }, { name = "SENTINEL_LOG_FORMAT", value = "json" }, @@ -223,6 +226,7 @@ locals { { name = "DATABASE_URL", valueFrom = var.database_url_secret_arn }, { name = "ANTHROPIC_API_KEY", valueFrom = var.anthropic_key_secret_arn }, { name = "OPENAI_API_KEY", valueFrom = var.openai_key_secret_arn }, + { name = "GEMINI_API_KEY", valueFrom = var.gemini_key_secret_arn }, ] logConfiguration = { logDriver = "awslogs" @@ -294,7 +298,7 @@ resource "aws_ecs_service" "backend" { network_configuration { subnets = var.public_subnet_ids security_groups = [var.backend_sg_id] - assign_public_ip = true # Required in no-NAT topology so tasks can reach ECR/Anthropic/OpenAI. + assign_public_ip = true # Required in no-NAT topology so tasks can reach ECR + external model APIs (Anthropic/OpenAI/Gemini). } load_balancer { diff --git a/infra/modules/ecs/variables.tf b/infra/modules/ecs/variables.tf index 5ee0487..fbd579e 100644 --- a/infra/modules/ecs/variables.tf +++ b/infra/modules/ecs/variables.tf @@ -62,3 +62,62 @@ variable "anthropic_key_secret_arn" { variable "openai_key_secret_arn" { type = string } + +variable "gemini_key_secret_arn" { + description = "ARN of the SSM SecureString containing the Gemini API key." + type = string +} + +# --- backend provider selection (env-driven; defaults preserve today's behaviour) --- + +variable "llm_provider" { + description = "Backend LLM provider." + type = string + default = "anthropic" + + validation { + condition = contains(["anthropic", "gemini", "fake"], var.llm_provider) + error_message = "llm_provider must be one of: anthropic, gemini, fake." + } +} + +variable "embeddings_provider" { + description = "Backend embeddings provider." + type = string + default = "openai" + + validation { + condition = contains(["openai", "voyage", "gemini", "fake"], var.embeddings_provider) + error_message = "embeddings_provider must be one of: openai, voyage, gemini, fake." 
+ } +} + +variable "embedding_dim" { + description = "Embedding vector dimensionality. Must match the pgvector schema (1536)." + type = string + default = "1536" +} + +variable "claude_model" { + description = "Anthropic model id (used when llm_provider = anthropic)." + type = string + default = "claude-sonnet-4-6" +} + +variable "openai_embedding_model" { + description = "OpenAI embedding model id (used when embeddings_provider = openai)." + type = string + default = "text-embedding-3-small" +} + +variable "gemini_model" { + description = "Gemini chat model id (used when llm_provider = gemini)." + type = string + default = "gemini-3.5-flash" +} + +variable "gemini_embedding_model" { + description = "Gemini embedding model id (used when embeddings_provider = gemini)." + type = string + default = "gemini-embedding-2" +} diff --git a/infra/modules/network/main.tf b/infra/modules/network/main.tf index d84d51a..4ff0512 100644 --- a/infra/modules/network/main.tf +++ b/infra/modules/network/main.tf @@ -57,8 +57,8 @@ resource "aws_route_table_association" "public" { # frontend_sg ─→ backend_sg (8000) (nginx /api proxy to FastAPI) # backend_sg ──→ rds_sg (5432) (FastAPI to Postgres) # -# Egress is intentionally open: tasks need to reach ECR, Anthropic, OpenAI, and -# CloudWatch Logs. RDS does not need egress. +# Egress is intentionally open: tasks need to reach ECR, the external model APIs +# (Anthropic / OpenAI / Gemini), and CloudWatch Logs. RDS does not need egress. resource "aws_security_group" "alb" { name = "${var.project_name}-alb" diff --git a/infra/modules/secrets/main.tf b/infra/modules/secrets/main.tf index b63674f..60b06b5 100644 --- a/infra/modules/secrets/main.tf +++ b/infra/modules/secrets/main.tf @@ -1,11 +1,12 @@ # SSM Parameter Store entries for runtime secrets the ECS task pulls in via the # task execution role. # -# - anthropic/openai keys are placeholders. Overwrite out-of-band: +# - anthropic/openai/gemini keys are placeholders. 
Overwrite out-of-band: # aws ssm put-parameter --name /sentinel/anthropic_api_key \ # --type SecureString --value "$ANTHROPIC_API_KEY" --overwrite # `lifecycle.ignore_changes = [value]` keeps Terraform from clobbering the -# real value on subsequent applies. +# real value on subsequent applies. The gemini key is provisioned by default +# but only consumed when a provider is set to "gemini". # # - DATABASE_URL is composed from RDS outputs supplied by the caller. It is # sensitive (carries the master password) but Terraform-owned, so its @@ -44,6 +45,19 @@ resource "aws_ssm_parameter" "openai_api_key" { } } +# Provisioned by default but only consumed when llm_provider or embeddings_provider +# is set to "gemini". Overwrite out-of-band like the other keys. +resource "aws_ssm_parameter" "gemini_api_key" { + name = "${local.prefix}/gemini_api_key" + description = "Gemini API key consumed by the backend at task start. Overwrite out-of-band." + type = "SecureString" + value = "REPLACE_ME" + + lifecycle { + ignore_changes = [value] + } +} + resource "aws_ssm_parameter" "database_url" { name = "${local.prefix}/database_url" description = "psycopg URL for the RDS instance. Composed from rds outputs." diff --git a/infra/modules/secrets/outputs.tf b/infra/modules/secrets/outputs.tf index 43ed825..981ade3 100644 --- a/infra/modules/secrets/outputs.tf +++ b/infra/modules/secrets/outputs.tf @@ -6,6 +6,10 @@ output "openai_key_arn" { value = aws_ssm_parameter.openai_api_key.arn } +output "gemini_key_arn" { + value = aws_ssm_parameter.gemini_api_key.arn +} + output "database_url_arn" { value = aws_ssm_parameter.database_url.arn } diff --git a/infra/variables.tf b/infra/variables.tf index eece20b..3f51b61 100644 --- a/infra/variables.tf +++ b/infra/variables.tf @@ -111,3 +111,47 @@ variable "log_retention_days" { type = number default = 7 } + +# --- Backend provider selection (passed through to the ECS task; defaults +# --- preserve the Anthropic + OpenAI stack). 
Override at apply time to deploy +# --- on Gemini, e.g. -var='llm_provider=gemini' -var='embeddings_provider=gemini'. + +variable "llm_provider" { + description = "Backend LLM provider: anthropic, gemini, or fake." + type = string + default = "anthropic" + + validation { + condition = contains(["anthropic", "gemini", "fake"], var.llm_provider) + error_message = "llm_provider must be one of: anthropic, gemini, fake." + } +} + +variable "embeddings_provider" { + description = "Backend embeddings provider: openai, voyage, gemini, or fake." + type = string + default = "openai" + + validation { + condition = contains(["openai", "voyage", "gemini", "fake"], var.embeddings_provider) + error_message = "embeddings_provider must be one of: openai, voyage, gemini, fake." + } +} + +variable "embedding_dim" { + description = "Embedding vector dimensionality. Must match the pgvector schema (1536)." + type = string + default = "1536" +} + +variable "gemini_model" { + description = "Gemini chat model id (used when llm_provider = gemini)." + type = string + default = "gemini-3.5-flash" +} + +variable "gemini_embedding_model" { + description = "Gemini embedding model id (used when embeddings_provider = gemini)." + type = string + default = "gemini-embedding-2" +}