From 02995b6815164d78dec07727395fa081328a3200 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 02:01:57 -0400 Subject: [PATCH 1/8] docs(progress): record M9 merged (PR #12) and mark M10 in progress --- PROGRESS.md | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/PROGRESS.md b/PROGRESS.md index c102541..aabc291 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -8,27 +8,31 @@ ## Current state -- **Active milestone:** M9 — Evaluation harness (résumé metrics) -- **Status:** complete on branch (started 2026-05-29, completed 2026-05-29); awaiting CI green and human squash-merge -- **Active branch:** `feat/m09-eval` (PR open — see Milestone status) -- **Last completed milestone:** M8 — Frontend (PR #9, merged 2026-05-29) + perf follow-up (PR #11, merged 2026-05-29) -- **`make check` passing:** yes locally on a freshly migrated DB (187 backend tests + 7 frontend tests; tsc + vite build clean) -- **Last action:** committed the M9 work in 3 small Conventional Commits (PROGRESS housekeeping; eval package + labels + RESULTS.md PENDING; tests + docs/evaluation.md + Settings model bump). Verified `make eval` under fake providers prints `n/a` and refuses to publish numbers; 9 asserted-fixture tests prove the scorer + writer end-to-end. -- **Next action:** human squash-merges the M9 PR. After merge, wire `ANTHROPIC_API_KEY` and `OPENAI_API_KEY`, run `make eval`, and overwrite `eval/RESULTS.md` with real numbers in the immediate follow-up commit. Then `/start-milestone 10` for containerization + Terraform + CD. 
+- **Active milestone:** M10 — Containerization + Terraform (AWS) + CD +- **Status:** in progress (started 2026-05-29) +- **Active branch:** `feat/m10-deploy` +- **Last completed milestone:** M9 — Evaluation harness (PR #12, merged 2026-05-29) +- **`make check` passing:** baseline green from M9 +- **Last action:** ran `/start-milestone 10`, switched to `main`, fast-forwarded, created `feat/m10-deploy`. Confirmed cost posture and "code-only" constraints with the user: us-east-1, public-subnet/no-NAT, **no `terraform apply`**, no AWS calls, no `terraform plan` without explicit approval and configured credentials. +- **Next action:** ship production Dockerfiles (backend with structlog + request-id middleware; frontend with nginx); Terraform under `infra/` (modules: network, ecr, rds, ecs, secrets); manual `workflow_dispatch` CD workflow; `infra/README.md` with cost posture, RDS-not-public invariant, demo-only warning, apply/destroy recipe; tests for the request-id middleware. - **Blockers:** none. -### M9 DoD verification +### M10 DoD checklist -- [x] **`make eval` runs end-to-end and writes `eval/RESULTS.md` with metrics, k, dataset size, and method.** The CLI in `eval/run.py` prints a one-line summary per metric and writes `eval/RESULTS.md`. Under fake providers (verified locally) every metric prints `n/a (...)` and the file is left as the methodology-only PENDING document — no numbers ship in the tree until a real run. -- [x] **Methodology is documented well enough to defend verbally in an interview.** `docs/evaluation.md` (224 lines) covers dataset shape, provider pinning, every metric definition (extraction normalization rules, precision@k denominator footnote, lite-faithfulness scope, refusal-rate non-interpretation), the n/a gate, the reproduction recipe, and explicit limits (small dataset, synthetic corpus caveat, no calibration claim, citation-validity vs. true faithfulness). -- [ ] **Numbers are real (from this run). 
Record them in `PROGRESS.md` "Decision log" too.** *Pending* — no API keys wired in this session. The harness contract + asserted-fixture pytest is what merges; real numbers land in the immediate follow-up commit once keys are configured. +- [ ] `terraform plan` is clean; `apply` provisions the stack (tear down after demo to avoid charges). +- [ ] CD workflow builds and deploys on manual dispatch. +- [ ] App is reachable at a URL (capture screenshots before teardown). -### M9 design lock-ins (per pre-flight review, all delivered) +### M10 hard constraints (locked in by user) -- **Metric set.** Extraction: normalized exact-match (trim + casefold strings, ISO date canonicalisation, 0.01 numeric tolerance), micro + macro accuracy, per-field precision/recall (column reported regardless so optional-field schemas later get the right reading without a code change). Retrieval: precision@k (headline) + recall@k + MRR with the precision-cap footnote. RAG: citation-validity rate + answer-cites-relevant rate + answer-substring rate; refusals counted but not interpreted as quality. -- **Honesty discipline.** Under `EMBEDDINGS_PROVIDER=fake` retrieval and RAG go to `n/a`; under `LLM_PROVIDER=fake` extraction and RAG go to `n/a`. Counts are still emitted because they describe the dataset, not the system. Asserted-fixture pytest tests prove the scorer + writer; nothing in the test path produces a number that could be misread as a quality claim. -- **What ships.** Harness + 5+6+5 hand-authored synthetic labels + asserted pytest fixtures + methodology-only PENDING `eval/RESULTS.md`. No fabricated numbers in the tree. Real numbers fill the file in the immediate follow-up. -- **Provider pair.** `claude-sonnet-4-6` (verified against Anthropic docs 2026-05-29 — dateless 4.6-generation IDs are pinned snapshots, not evergreen pointers); `text-embedding-3-small` (1536-dim, schema-canonical); temperature 0. +- **Code only.** No `terraform apply`. No AWS resource creation. No incurred costs. 
No `terraform plan` unless AWS credentials are configured and the user explicitly approves. +- **Cost posture.** Public-subnet + no-NAT-Gateway, single-AZ, Fargate `0.25 vCPU / 0.5 GB`, RDS `db.t4g.micro`. NAT Gateway idle cost (~$32/month) avoided. RDS **must not be publicly accessible** — security group enforces ingress from the Fargate task SG only. Backend Fargate may have public ingress only via the ALB on 80/443 and egress only as the SG allows. +- **Demo-only.** `infra/README.md` documents the teardown recipe and the security tradeoffs; running `terraform destroy` immediately after demo screenshots is the contract. +- **Region:** `us-east-1`. + +### Follow-ups tracked outside M10 + +- **#13** — record real-provider eval numbers (M9 follow-up). Stays open until keys are wired and `make eval` is run for real. --- @@ -45,8 +49,8 @@ | M6 | Workflow engine | `feat/m06-workflow-engine` | ☑ merged | [#7](https://github.com/div0rce/sentinel/pull/7) | 2026-05-29 | | M7 | Audit log + HITL | `feat/m07-audit-hitl` | ☑ merged | [#8](https://github.com/div0rce/sentinel/pull/8) | 2026-05-29 | | M8 | Frontend | `feat/m08-frontend` | ☑ merged | [#9](https://github.com/div0rce/sentinel/pull/9) | 2026-05-29; perf follow-up [#11](https://github.com/div0rce/sentinel/pull/11) | -| M9 | Evaluation harness | `feat/m09-eval` | ◐ complete on branch (PR open) | _filled in after `gh pr create`_ | 2026-05-29 | -| M10 | Deploy (Docker/Terraform/CD) | `feat/m10-deploy` | ☐ | — | | +| M9 | Evaluation harness | `feat/m09-eval` | ☑ merged | [#12](https://github.com/div0rce/sentinel/pull/12) | 2026-05-29; real-provider numbers tracked in [#13](https://github.com/div0rce/sentinel/issues/13) | +| M10 | Deploy (Docker/Terraform/CD) | `feat/m10-deploy` | ◐ in progress | — | started 2026-05-29 | | M11 | Docs + diagram + demo | `feat/m11-docs-demo` | ☐ | — | | Status key: ☐ not started · ◐ in progress · ☑ merged From 6e74dc4d7789f0bc2a95b8e50643c964ce8b018c Mon Sep 17 00:00:00 2001 From: nasr 
<156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 02:04:58 -0400 Subject: [PATCH 2/8] feat(backend): structured logging + request-id middleware + production Dockerfile backend/app/observability.py: - configure_logging() wires structlog for JSON output (CloudWatch-friendly) with a SENTINEL_LOG_FORMAT=console escape hatch for local dev. Idempotent so CLIs (make seed, make eval) produce the same shape of log as the API. - RequestIdMiddleware assigns a stable id per request, binds it to the structlog contextvars (so any structlog call inside a handler picks it up), exposes it on request.state.request_id, and surfaces it on the response as X-Request-Id. Caller-supplied X-Request-Id headers are accepted only when short and printable ([alnum]+[-_], <= 64 chars); anything else is replaced with a fresh uuid4 hex to keep attacker-controlled bytes out of the log pipeline. backend/app/main.py: configure_logging() at import time; middleware added before routers. backend/tests/test_request_id.py (8 tests): generated id is uuid4 hex; safe inbound id is echoed; rogue inbound ids (too long, whitespace, control chars, punctuation, empty) are replaced; consecutive requests get distinct ids. backend/Dockerfile: multi-stage (uv-based dependency resolution, slim runtime), non-root sentinel user (uid 1000), HEALTHCHECK against /health, PORT=8000 default but honours $PORT for ECS service-port flexibility, SENTINEL_LOG_FORMAT defaults to 'json' in the image. Source layer copied last so code-only changes don't invalidate the deps layer. backend/.dockerignore prunes tests, frontend, eval, scripts, .git, IDE state, and local Postgres data so the image stays small and free of secrets. structlog>=24.4 added as a runtime dep (resolved 25.5.0). 
--- backend/.dockerignore | 41 ++++++++++ backend/Dockerfile | 68 +++++++++++++++++ backend/app/main.py | 15 +++- backend/app/observability.py | 125 +++++++++++++++++++++++++++++++ backend/tests/test_request_id.py | 66 ++++++++++++++++ pyproject.toml | 1 + uv.lock | 11 +++ 7 files changed, 324 insertions(+), 3 deletions(-) create mode 100644 backend/.dockerignore create mode 100644 backend/Dockerfile create mode 100644 backend/app/observability.py create mode 100644 backend/tests/test_request_id.py diff --git a/backend/.dockerignore b/backend/.dockerignore new file mode 100644 index 0000000..1cfd020 --- /dev/null +++ b/backend/.dockerignore @@ -0,0 +1,41 @@ +# Project artefacts that don't belong in the image +.venv/ +__pycache__/ +*.pyc +*.pyo +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ +.coverage +htmlcov/ + +# Tests + tooling not needed at runtime +backend/tests/ +.github/ +.git/ +.gitignore +.editorconfig +.env +.env.* +*.md +docs/ + +# Frontend tree +frontend/ + +# Eval artefacts +eval/ +scripts/ + +# Infra / Docker meta +infra/ +docker-compose.yml +Dockerfile +.dockerignore + +# Local Postgres data, IDE +data/ +.kiro/ +.claude/ +.agents/ +.DS_Store diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..002f7d0 --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,68 @@ +# syntax=docker/dockerfile:1.7 +# ---------- builder ---------- +FROM python:3.12-slim AS builder + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + UV_LINK_MODE=copy \ + UV_PYTHON_DOWNLOADS=never + +# Install build essentials only; psycopg[binary] ships its own libpq wheel so we +# don't need libpq-dev / build-essential at runtime. +RUN --mount=type=cache,target=/var/cache/apt \ + --mount=type=cache,target=/var/lib/apt \ + apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates curl \ + && rm -rf /var/lib/apt/lists/* + +# Pinned uv release; matches the local toolchain. 
Upgrade in lockstep with CI. +ADD https://astral.sh/uv/0.4.24/install.sh /uv-installer.sh +RUN sh /uv-installer.sh && rm /uv-installer.sh +ENV PATH="/root/.local/bin:${PATH}" + +WORKDIR /app + +# Resolve dependencies into a wheel cache first; only the lockfile gates the cache. +COPY pyproject.toml uv.lock ./ +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --frozen --no-install-project --no-dev + +# Copy application source last so a code-only change does not invalidate the +# dependency layer. +COPY backend ./backend +COPY alembic.ini ./alembic.ini + +# ---------- runtime ---------- +FROM python:3.12-slim AS runtime + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PORT=8000 \ + SENTINEL_LOG_FORMAT=json + +# Non-root user; matches "no root by default" container hygiene. +RUN groupadd --system --gid 1000 sentinel \ + && useradd --system --uid 1000 --gid sentinel --create-home --shell /usr/sbin/nologin sentinel + +WORKDIR /app + +# Bring in the resolved venv + source from the builder. +COPY --from=builder /app /app + +# Drop privileges before any further setup. +USER sentinel + +# Use the venv-managed python; honour $PORT for ECS service-port flexibility. +ENV PATH="/app/.venv/bin:${PATH}" + +EXPOSE 8000 + +# Liveness probe matches the FastAPI /health endpoint shipped in M0. +HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \ + CMD python -c "import sys, urllib.request; \ + urllib.request.urlopen(f'http://127.0.0.1:{__import__(\"os\").environ.get(\"PORT\",\"8000\")}/health', timeout=3); \ + sys.exit(0)" || exit 1 + +# Single uvicorn worker is fine for the demo; ECS scales horizontally on tasks. +CMD ["sh", "-c", "uvicorn backend.app.main:app --host 0.0.0.0 --port ${PORT:-8000}"] diff --git a/backend/app/main.py b/backend/app/main.py index caa920f..885f172 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,21 +1,30 @@ """FastAPI application entrypoint for Sentinel. -M0 added the liveness probe. 
M3 wired in the citation-grounded RAG endpoint at +M0 added the liveness probe. M3 wired the citation-grounded RAG endpoint at ``POST /query``. M4 added schema-constrained extraction at ``POST /extract``. M7 added the human-in-the-loop review queue at ``GET /review`` and -``POST /review/{id}/approve|reject``. M8 adds dashboard KPI feeds at -``GET /dashboard/{volume,categories,confidence,sla}``; the React UI consumes them. +``POST /review/{id}/approve|reject``. M8 added dashboard KPI feeds at +``GET /dashboard/{volume,categories,confidence,sla}``. M10 adds structured +logging + the request-id middleware so every log line carries the request id +and every response surfaces it on ``X-Request-Id``. """ from fastapi import FastAPI +from backend.app.observability import RequestIdMiddleware, configure_logging from backend.app.routers.dashboard import router as dashboard_router from backend.app.routers.extract import router as extract_router from backend.app.routers.query import router as query_router from backend.app.routers.review import router as review_router +configure_logging() + app = FastAPI(title="Sentinel", version="0.1.0") +# Add the request-id middleware *before* including routers so every handler runs +# with the structlog context bound. +app.add_middleware(RequestIdMiddleware) + app.include_router(query_router) app.include_router(extract_router) app.include_router(review_router) diff --git a/backend/app/observability.py b/backend/app/observability.py new file mode 100644 index 0000000..84b69c8 --- /dev/null +++ b/backend/app/observability.py @@ -0,0 +1,125 @@ +"""Structured logging + a request-id middleware (M10). + +Two responsibilities: + +* :func:`configure_logging` wires ``structlog`` for JSON output suitable for + CloudWatch / any log aggregator that ingests stdout. Production logs are + one-line JSON with a stable schema; local development can flip to a friendlier + console renderer via the ``SENTINEL_LOG_FORMAT=console`` env var. 
+* :class:`RequestIdMiddleware` assigns a stable id to every HTTP request, binds + it to the structlog context, surfaces it on the response as + ``X-Request-Id``, and exposes it on ``request.state.request_id`` so + application code (notably :mod:`backend.app.audit`) can persist it. + +Tests in ``backend/tests/test_request_id.py`` pin the middleware contract. +""" + +from __future__ import annotations + +import logging +import os +import uuid +from collections.abc import Awaitable, Callable + +import structlog +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.requests import Request +from starlette.responses import Response + +REQUEST_ID_HEADER = "X-Request-Id" +REQUEST_ID_LENGTH_LIMIT = 64 + + +def configure_logging() -> None: + """Configure structlog + the stdlib root logger for the application. + + Idempotent. Safe to call from app startup *and* from CLIs (``make seed``, + ``make eval``) so every entry point produces the same shape of log. + """ + log_level_name = os.environ.get("SENTINEL_LOG_LEVEL", "INFO").upper() + level = logging.getLevelNamesMapping().get(log_level_name, logging.INFO) + + logging.basicConfig( + format="%(message)s", + level=level, + force=True, + ) + + use_console = os.environ.get("SENTINEL_LOG_FORMAT", "json").lower() == "console" + renderer: structlog.types.Processor + if use_console: + renderer = structlog.dev.ConsoleRenderer(colors=True) + else: + renderer = structlog.processors.JSONRenderer() + + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + renderer, + ], + wrapper_class=structlog.make_filtering_bound_logger(level), + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def _generate_request_id() -> str: + return uuid.uuid4().hex + + +def 
_sanitise_inbound(value: str) -> str | None: + """Accept caller-supplied request ids if they are short and printable. + + Inbound headers are untrusted: empty, over-long, or out-of-charset values are rejected + outright (never truncated) so a hostile client cannot push arbitrary bytes into our log + pipeline. NOTE(review): ``str.isalnum`` is Unicode-aware, so non-ASCII alphanumerics pass. + """ + candidate = value.strip() + if not candidate or len(candidate) > REQUEST_ID_LENGTH_LIMIT: + return None + if not all(c.isalnum() or c in "-_" for c in candidate): + return None + return candidate + + +class RequestIdMiddleware(BaseHTTPMiddleware): + """Bind a request id to every request, the structlog context, and the response.""" + + HEADER_NAME = REQUEST_ID_HEADER + + async def dispatch( + self, + request: Request, + call_next: Callable[[Request], Awaitable[Response]], + ) -> Response: + inbound = request.headers.get(self.HEADER_NAME, "") + request_id = _sanitise_inbound(inbound) or _generate_request_id() + request.state.request_id = request_id + + # Bind for the duration of the request so any structlog call inside the + # handler picks up the request_id without plumbing it through. + token = structlog.contextvars.bind_contextvars( + request_id=request_id, + method=request.method, + path=request.url.path, + ) + try: + response = await call_next(request) + finally: + # ``token`` is a Mapping[str, contextvars.Token]; unbinding by key drops the + # keys bound above (``reset_contextvars(**token)`` would restore prior values). 
+ structlog.contextvars.unbind_contextvars(*token.keys()) + + response.headers[self.HEADER_NAME] = request_id + return response + + +def get_request_id(request: Request) -> str | None: + """Convenience getter for handlers that want to forward the id (e.g., to + :func:`backend.app.audit.emit_*`).""" + return getattr(request.state, "request_id", None) diff --git a/backend/tests/test_request_id.py b/backend/tests/test_request_id.py new file mode 100644 index 0000000..5d7e6ac --- /dev/null +++ b/backend/tests/test_request_id.py @@ -0,0 +1,66 @@ +"""Tests for the M10 request-id middleware.""" + +from __future__ import annotations + +import re +from collections.abc import Iterator + +import pytest +from fastapi.testclient import TestClient +from sqlalchemy.orm import Session + +from backend.app.db import get_session +from backend.app.main import app +from backend.app.observability import REQUEST_ID_HEADER + +UUID_HEX = re.compile(r"^[a-f0-9]{32}$") + + +@pytest.fixture +def client(session: Session) -> Iterator[TestClient]: + def override_session() -> Iterator[Session]: + yield session + + app.dependency_overrides[get_session] = override_session + try: + yield TestClient(app) + finally: + app.dependency_overrides.clear() + + +def test_response_carries_a_generated_request_id(client: TestClient) -> None: + resp = client.get("/health") + assert resp.status_code == 200 + assert REQUEST_ID_HEADER in resp.headers + request_id = resp.headers[REQUEST_ID_HEADER] + assert UUID_HEX.match(request_id), f"unexpected request id format: {request_id!r}" + + +def test_inbound_request_id_is_echoed_when_safe(client: TestClient) -> None: + inbound = "client-supplied-abc123" + resp = client.get("/health", headers={REQUEST_ID_HEADER: inbound}) + assert resp.headers[REQUEST_ID_HEADER] == inbound + + +@pytest.mark.parametrize( + "rogue", + [ + "x" * 128, # too long + "spaces here", # space disallowed + "newline\nhere", # control char + ";rm -rf /", # punctuation outside [-_] + "", # empty + ], +) 
+def test_unsafe_inbound_request_ids_are_replaced(client: TestClient, rogue: str) -> None: + resp = client.get("/health", headers={REQUEST_ID_HEADER: rogue}) + out = resp.headers[REQUEST_ID_HEADER] + assert out != rogue + # The replacement is the generated UUID hex form. + assert UUID_HEX.match(out), f"replacement did not look generated: {out!r}" + + +def test_each_request_gets_a_distinct_generated_id(client: TestClient) -> None: + a = client.get("/health").headers[REQUEST_ID_HEADER] + b = client.get("/health").headers[REQUEST_ID_HEADER] + assert a != b diff --git a/pyproject.toml b/pyproject.toml index eb37aa3..a38cad1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "psycopg[binary]>=3.2", "pydantic-settings>=2.6", "sqlalchemy>=2.0", + "structlog>=24.4", "tiktoken>=0.8", "uvicorn[standard]>=0.32", ] diff --git a/uv.lock b/uv.lock index 264906f..2ef4506 100644 --- a/uv.lock +++ b/uv.lock @@ -1109,6 +1109,7 @@ dependencies = [ { name = "psycopg", extra = ["binary"] }, { name = "pydantic-settings" }, { name = "sqlalchemy" }, + { name = "structlog" }, { name = "tiktoken" }, { name = "uvicorn", extra = ["standard"] }, ] @@ -1130,6 +1131,7 @@ requires-dist = [ { name = "psycopg", extras = ["binary"], specifier = ">=3.2" }, { name = "pydantic-settings", specifier = ">=2.6" }, { name = "sqlalchemy", specifier = ">=2.0" }, + { name = "structlog", specifier = ">=24.4" }, { name = "tiktoken", specifier = ">=0.8" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.32" }, ] @@ -1196,6 +1198,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9f/85/492183764d5d01d4514be3730fdb8e228a80605783099551c51627578b5d/starlette-1.2.0-py3-none-any.whl", hash = "sha256:36e0c76ac59157e75dc4b3bdeafba97fb04eaf1878045f15dbef666a6f092ed7", size = 73213, upload-time = "2026-05-28T11:42:48.801Z" }, ] +[[package]] +name = "structlog" +version = "25.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/ef/52/9ba0f43b686e7f3ddfeaa78ac3af750292662284b3661e91ad5494f21dbc/structlog-25.5.0.tar.gz", hash = "sha256:098522a3bebed9153d4570c6d0288abf80a031dfdb2048d59a49e9dc2190fc98", size = 1460830, upload-time = "2025-10-27T08:28:23.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/45/a132b9074aa18e799b891b91ad72133c98d8042c70f6240e4c5f9dabee2f/structlog-25.5.0-py3-none-any.whl", hash = "sha256:a8453e9b9e636ec59bd9e79bbd4a72f025981b3ba0f5837aebf48f02f37a7f9f", size = 72510, upload-time = "2025-10-27T08:28:21.535Z" }, +] + [[package]] name = "tiktoken" version = "0.13.0" From ddd54a9e6d43bfabf470d2e5f20f303f45c643ed Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 02:06:20 -0400 Subject: [PATCH 3/8] feat(frontend): production multi-stage Dockerfile (node build -> nginx serve) frontend/Dockerfile is a two-stage image: 1. node:20-alpine builder runs 'npm ci && npm run build' (which transitively runs 'tsc -b' so any type error fails the build, matching the CI lint step). 2. nginx:1.27-alpine runtime serves /usr/share/nginx/html (the Vite dist) and reverse-proxies same-origin paths to the backend. The nginx config template substitutes ${BACKEND_URL} via the official image's envsubst entrypoint on container start, so the same image is portable across environments. ECS task def sets BACKEND_URL to the backend service-discovery DNS name (default in-image: http://backend:8000 for local docker compose). The proxy passes /query, /extract, /review, /dashboard, /health straight through with X-Forwarded-* headers and forwards request headers (so the M10 X-Request-Id stays correlated end to end). Hashed Vite assets get a 1-year cache; everything else is uncached. SPA fallback ('try_files $uri $uri/ /index.html') keeps React Router routes working on hard reload. 
frontend/.dockerignore prunes node_modules/dist/test trees, IDE state, and *.tsbuildinfo so the build context stays small. --- frontend/.dockerignore | 19 +++++++++++++++++ frontend/Dockerfile | 34 ++++++++++++++++++++++++++++++ frontend/nginx.conf.template | 41 ++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 frontend/.dockerignore create mode 100644 frontend/Dockerfile create mode 100644 frontend/nginx.conf.template diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 0000000..1a55970 --- /dev/null +++ b/frontend/.dockerignore @@ -0,0 +1,19 @@ +node_modules/ +dist/ +.vite/ +coverage/ +*.log +*.tsbuildinfo +.DS_Store +.env.local + +# Test files don't need to ship in the image +src/**/__tests__/ +src/test/ + +# Repo-level meta +.git/ +.github/ +.kiro/ +.claude/ +.agents/ diff --git a/frontend/Dockerfile b/frontend/Dockerfile new file mode 100644 index 0000000..b305e1c --- /dev/null +++ b/frontend/Dockerfile @@ -0,0 +1,34 @@ +# syntax=docker/dockerfile:1.7 +# ---------- builder ---------- +FROM node:20-alpine AS builder + +WORKDIR /app + +# Install deps from the lockfile only first so a code change does not bust the +# dependency layer. +COPY package.json package-lock.json ./ +RUN --mount=type=cache,target=/root/.npm \ + npm ci + +COPY . . + +# Vite emits ./dist with hashed asset names. tsc -b runs as part of `npm run +# build` and fails the build on any type error. +RUN npm run build + +# ---------- runtime ---------- +FROM nginx:1.27-alpine AS runtime + +# nginx default config substitutes $BACKEND_URL via envsubst on container start +# so the same image is portable across environments. The ECS task definition +# sets BACKEND_URL to the backend service-discovery DNS name in the cluster. 
+ENV BACKEND_URL=http://backend:8000 + +COPY nginx.conf.template /etc/nginx/templates/default.conf.template +COPY --from=builder /app/dist /usr/share/nginx/html + +# NOTE: the official nginx image starts its master process as root; only the workers run as the unprivileged nginx user. Use nginxinc/nginx-unprivileged if a fully non-root container is required. +EXPOSE 80 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD wget --quiet --spider http://127.0.0.1/ || exit 1 diff --git a/frontend/nginx.conf.template b/frontend/nginx.conf.template new file mode 100644 index 0000000..27cca7b --- /dev/null +++ b/frontend/nginx.conf.template @@ -0,0 +1,41 @@ +# nginx config for the M10 demo deployment. Serves the Vite-built SPA and +# reverse-proxies the backend FastAPI under same-origin paths so the typed +# API client in src/api.ts can keep using base "" (no CORS). +# +# ${BACKEND_URL} is substituted by nginx's official-image entrypoint via +# envsubst at container start. The ECS task definition sets it to the service +# discovery DNS for the backend. + +server { + listen 80 default_server; + server_name _; + + # SPA assets + root /usr/share/nginx/html; + index index.html; + + # Standard SPA routing fallback so client-side routes resolve to index.html. + # Caveat: paths matching the API regex below (e.g. /review) are proxied to the backend instead. + location / { + try_files $uri $uri/ /index.html; + } + + # API surface — forwarded to the backend. + location ~ ^/(query|extract|review|dashboard|health)(/|$) { + proxy_pass ${BACKEND_URL}; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + # Forward (and accept) the request id so logs are correlated end-to-end. + proxy_pass_request_headers on; + proxy_read_timeout 60s; + } + + # Cache hashed Vite assets aggressively; everything else short. 
+ location ~* \.(?:js|css|woff2?|png|jpg|svg)$ { + expires 1y; + add_header Cache-Control "public, immutable"; + } +} From 41afdac0a1903abfe0f4671b06a3e8b1411a1bad Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 02:20:19 -0400 Subject: [PATCH 4/8] feat(infra): Terraform stack for ECS Fargate + RDS + ECR + SSM (us-east-1, demo) infra/ provisions the M10 demo stack on AWS: - modules/network: VPC (10.0.0.0/16), two public /24 subnets in two AZs, IGW, public route table. Owns the four security groups (alb, frontend, backend, rds) so the rds ingress rule can reference the backend SG without creating an ecs <-> rds module-level dependency cycle. Reachability graph encoded in the SGs: internet -> alb -> {frontend on 80, backend on 8000} -> rds on 5432. Egress open on tasks (ECR/Anthropic/OpenAI/CloudWatch); RDS has none. - modules/ecr: two repos (backend, frontend) with image-scan-on-push, a 7-day untagged-image expiry, and a 20-image cap. force_delete=true so terraform destroy doesn't hang on lingering tags. - modules/secrets: SSM SecureString parameters for ANTHROPIC_API_KEY, OPENAI_API_KEY (placeholders, lifecycle.ignore_changes=[value] so the real out-of-band 'aws ssm put-parameter' values aren't clobbered on re-apply), and DATABASE_URL composed from rds outputs. - modules/rds: Postgres 16.4 db.t4g.micro single-AZ, gp3 storage, encrypted at rest, publicly_accessible=false invariant, parameter group (log_statement=ddl). pgvector loads via the application's CREATE EXTENSION migration; no shared_preload_libraries needed. - modules/ecs: cluster, ALB with HTTP listener (frontend default; path-prefix rule routes /query|/extract|/review|/dashboard|/health to the backend target group), service discovery in .local for nginx -> backend, two task defs (256 cpu / 512 mem), two services with assign_public_ip=true (no-NAT topology). Task execution role has scoped ssm:GetParameter on the three secret ARNs. 
CloudWatch log groups with 7-day retention. - modules/ci_oidc: GitHub Actions OIDC provider + role scoped to the configured repo via 'repo:OWNER/NAME:*' subject claim. Permissions: ecr push to the two project repos, ecr:GetAuthorizationToken account-wide, ecs:UpdateService on the two project services. PassRole limited to the project task roles, only to ecs-tasks.amazonaws.com. count=0 when var.github_repository is empty. Root: versions.tf (terraform >=1.6, aws ~>5.70), variables.tf (project_name, region us-east-1 default, db creds with sensitive=true and >=16 char password validation, image tags, github_repository), main.tf wires everything, outputs expose ALB DNS, ECR URLs, ECS names, RDS endpoint, CI role ARN. No remote state. Local-only is fine for a single-operator demo; convert to S3 + DynamoDB before any second user. --- infra/README.md | 230 +++++++++++++++++++ infra/main.tf | 91 ++++++++ infra/modules/ci_oidc/main.tf | 110 ++++++++++ infra/modules/ci_oidc/outputs.tf | 4 + infra/modules/ci_oidc/variables.tf | 24 ++ infra/modules/ecr/main.tf | 56 +++++ infra/modules/ecr/outputs.tf | 15 ++ infra/modules/ecr/variables.tf | 3 + infra/modules/ecs/main.tf | 342 +++++++++++++++++++++++++++++ infra/modules/ecs/outputs.tf | 27 +++ infra/modules/ecs/variables.tf | 64 ++++++ infra/modules/network/main.tf | 155 +++++++++++++ infra/modules/network/outputs.tf | 23 ++ infra/modules/network/variables.tf | 15 ++ infra/modules/rds/main.tf | 62 ++++++ infra/modules/rds/outputs.tf | 8 + infra/modules/rds/variables.tf | 39 ++++ infra/modules/secrets/main.tf | 52 +++++ infra/modules/secrets/outputs.tf | 11 + infra/modules/secrets/variables.tf | 21 ++ infra/outputs.tf | 39 ++++ infra/variables.tf | 113 ++++++++++ infra/versions.tf | 22 ++ 23 files changed, 1526 insertions(+) create mode 100644 infra/README.md create mode 100644 infra/main.tf create mode 100644 infra/modules/ci_oidc/main.tf create mode 100644 infra/modules/ci_oidc/outputs.tf create mode 100644 
infra/modules/ci_oidc/variables.tf create mode 100644 infra/modules/ecr/main.tf create mode 100644 infra/modules/ecr/outputs.tf create mode 100644 infra/modules/ecr/variables.tf create mode 100644 infra/modules/ecs/main.tf create mode 100644 infra/modules/ecs/outputs.tf create mode 100644 infra/modules/ecs/variables.tf create mode 100644 infra/modules/network/main.tf create mode 100644 infra/modules/network/outputs.tf create mode 100644 infra/modules/network/variables.tf create mode 100644 infra/modules/rds/main.tf create mode 100644 infra/modules/rds/outputs.tf create mode 100644 infra/modules/rds/variables.tf create mode 100644 infra/modules/secrets/main.tf create mode 100644 infra/modules/secrets/outputs.tf create mode 100644 infra/modules/secrets/variables.tf create mode 100644 infra/outputs.tf create mode 100644 infra/variables.tf create mode 100644 infra/versions.tf diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..e05f59f --- /dev/null +++ b/infra/README.md @@ -0,0 +1,230 @@ +# Sentinel infrastructure (Terraform) + +Deployment target: AWS, `us-east-1`, **demo only**. + +This directory provisions everything the M10 demo needs: a VPC, an ECS Fargate +cluster running the backend + frontend tasks, an RDS Postgres instance with the +`vector` extension enabled at migration time, ECR repositories for the two +images, SSM Parameter Store entries for the runtime secrets, and a tightly +scoped GitHub Actions OIDC role that the manual-dispatch CD workflow assumes. + +> **Read the cost & security posture below before running `apply`. The default +> configuration is engineered for a teardown-after-screenshots demo, not a +> production deployment.** + +--- + +## Cost & security posture (deliberate, demo-only) + +### Public-subnet / no-NAT + +The VPC has two `/24` public subnets and **no NAT Gateway**. 
ECS tasks live in +those public subnets and get assigned public IPs (`assign_public_ip = true`) +so they can reach ECR for image pulls and Anthropic / OpenAI for outbound API +calls. + +This is chosen because a NAT Gateway is the largest avoidable line item in any +small AWS deployment (≈$32/month idle, plus ~$0.045/GB processed). For a demo +that gets `terraform destroy`'d after screenshots, the saving is meaningful and +the security tradeoffs are acceptable **with tight security groups** (below). + +If you ever lift this past the demo: **add private subnets and a NAT Gateway** +(or VPC interface endpoints for ECR / SSM / CloudWatch) and move the ECS tasks +there. Track that as the first item in the production-readiness backlog. + +### RDS is not publicly accessible + +Hard invariant. `aws_db_instance.publicly_accessible = false` is wired in +`modules/rds/main.tf` and the `rds` security group ingress is keyed only to the +backend task SG (`modules/network/main.tf`). Even though RDS lives in the same +public subnets as the tasks, the security group prevents internet reach. + +### Reachability graph (encoded in security groups) + +``` +internet ──→ alb_sg (80, 443) +alb_sg ──→ frontend_sg (80) ALB → nginx +alb_sg ──→ backend_sg (8000) ALB path-prefix → FastAPI +backend_sg ──→ rds_sg (5432) FastAPI → Postgres +``` + +Egress is open on the task SGs (so containers can reach ECR / Anthropic / +OpenAI / CloudWatch). RDS has no egress. + +### Single-AZ everywhere it matters + +- RDS: `multi_az = false`, `db.t4g.micro`, 20 GB storage. Fine for the demo; + unsuitable for production. +- ECS: `desired_count = 1` per service. A single task per service is the + cheapest viable footprint; no auto-scaling. + +### Backups, logs, deletion + +- RDS: 1-day backup retention, `skip_final_snapshot = true`, + `deletion_protection = false`. `terraform destroy` is therefore cheap and + doesn't leave behind a final snapshot you'd forget to delete. 
+- CloudWatch Logs: `log_retention_days = 7` for the ECS task log groups. +- ECR: 7-day untagged-image expiry, 20-image cap. + +--- + +## What this provisions (rough cost shape) + +The numbers below are order-of-magnitude estimates against the AWS public price +list as of 2026-05; they exist to make "is this OK to leave running overnight?" +answerable without re-reading docs. **Use AWS's actual cost calculator for +binding numbers.** + +| Resource | Approx idle cost | Notes | +| --------------------- | ---------------: | --------------------------------------------- | +| ALB | ~$16/mo + LCU | Cheapest line item that's still always-on. | +| Fargate (2 tasks 0.25 vCPU / 0.5 GB) | ~$15/mo | 24/7. Stop the services to stop the bill. | +| RDS db.t4g.micro 20 GB | ~$13/mo | Single-AZ. ~$2/mo storage + ~$11/mo compute. | +| ECR storage | <$1/mo | 20-image cap on each repo. | +| Secrets / SSM | $0 | Standard parameters, not Advanced. | +| CloudWatch Logs | <$1/mo | 7-day retention; demo log volume is tiny. | +| Data transfer | variable | Outbound from ECS tasks → Anthropic/OpenAI. | +| **Total idle floor** | **~$45/mo** | Plus per-second Fargate charges + traffic. | + +`terraform destroy` removes all of the above. Run it the moment screenshots +are captured. + +--- + +## Apply / destroy recipe + +### Pre-flight (one-time) + +1. AWS account with IAM permissions to create the resources above. +2. AWS CLI configured (`aws configure` or equivalent — local profile, OIDC, or + `AWS_PROFILE`). +3. A strong RDS master password. **Never commit it.** Pass via env: + ```bash + export TF_VAR_db_password="$(openssl rand -base64 24)" + ``` +4. A GitHub repo for the OIDC role's trust policy: + ```bash + export TF_VAR_github_repository="OWNER/sentinel" + ``` + Leave unset to skip the OIDC role (manual deploys only). 
+ +### Validate without applying + +```bash +cd infra/ +terraform fmt -recursive -check +terraform init # downloads providers; no AWS calls +terraform validate +``` + +`terraform fmt`, `init`, and `validate` make no AWS API calls. + +### Apply (this is the cost moment) + +```bash +terraform plan -out=plan.tfplan # READ THIS BEFORE APPLY +terraform apply plan.tfplan +``` + +After apply succeeds: + +```bash +terraform output ci_role_arn # if github_repository was supplied +``` + +Add that ARN to the repo's `AWS_ROLE_ARN` secret (Settings → Secrets and +variables → Actions). The CD workflow assumes this role via OIDC. + +### Write the runtime secrets out-of-band + +```bash +aws ssm put-parameter --name /sentinel/anthropic_api_key \ + --type SecureString --value "$ANTHROPIC_API_KEY" --overwrite + +aws ssm put-parameter --name /sentinel/openai_api_key \ + --type SecureString --value "$OPENAI_API_KEY" --overwrite +``` + +(`/sentinel/database_url` is composed by Terraform from the RDS outputs and +already populated.) + +Then bounce the backend service so the new secret values are picked up: + +```bash +aws ecs update-service \ + --cluster sentinel-cluster --service sentinel-backend \ + --force-new-deployment --no-cli-pager +``` + +### Run migrations + seed + +The backend image does **not** run migrations at task start — by design. Run them +once, manually, from inside the VPC: RDS only accepts the backend security group (not the ALB), and ECS Exec is disabled on the services, +so exec'ing into a running task is not available. The simplest path for the demo: a +short-lived Fargate task (backend task definition and SG) that runs `alembic upgrade head` and `python -m +backend.app.ingest --path data/sample`. Recipe in `docs/demo.md` (M11). + +### Deploy via CD + +Manual dispatch only. From the GitHub UI: Actions → CD → Run workflow → choose +`backend` / `frontend` / `both`. Workflow: + +1. Builds the requested images. +2. Pushes to ECR with the git SHA tag. +3. `aws ecs update-service --force-new-deployment` for each service. 
+ +### Destroy + +```bash +terraform destroy +``` + +Removes everything provisioned by this configuration, including ECR images +(force_delete = true on the repos so destroy doesn't hang on lingering tags). + +> **Tear down immediately after capturing screenshots.** Leaving the stack +> running overnight costs ~$1.50; leaving it for a month costs ~$45. + +--- + +## What's not in this directory + +- **No remote state.** Terraform state lives locally as `terraform.tfstate`. + This is appropriate for a single-operator demo; for any second user, convert + to an S3 backend + DynamoDB lock table first. Scope and recipe are out of + M10. +- **No TLS certificate / Route 53.** The ALB serves plain HTTP on port 80. For + a real demo, attach an ACM cert and add a 443 listener; the ALB SG already + permits 443 ingress. +- **No CloudFront / WAF / observability beyond `/health` + structured logs.** + Out of M10. +- **No auto-scaling rules.** `desired_count = 1` per service. Edit the + `aws_ecs_service` blocks in `modules/ecs/main.tf` to change. + +--- + +## Module map + +``` +infra/ +├── versions.tf provider pins (aws ~> 5.70, random ~> 3.6) +├── variables.tf project_name, region, db creds, image tags, github_repository +├── main.tf wires the modules +├── outputs.tf ALB DNS, ECR URLs, ECS names, RDS endpoint, CI role ARN +└── modules/ + ├── network/ VPC, 2 public subnets, IGW, public RT, 4 SGs + ├── ecr/ two repos with lifecycle policies + ├── secrets/ SSM Parameter Store entries (API keys + DATABASE_URL) + ├── rds/ Postgres 16.4 db.t4g.micro single-AZ, parameter group + ├── ecs/ cluster, ALB + target groups + listener, task defs, services, log groups, IAM + └── ci_oidc/ GitHub Actions OIDC provider + role (scoped to ECR push + ECS update-service) +``` + +--- + +## Validation in CI + +The CI workflow does **not** run `terraform plan` or `apply`. 
It does run +`terraform fmt -check` and `terraform validate` against this directory in a +job that does not need AWS credentials, so a syntax or wiring regression is +caught on every PR. Plan/apply remain a manual operator action. diff --git a/infra/main.tf b/infra/main.tf new file mode 100644 index 0000000..b25380b --- /dev/null +++ b/infra/main.tf @@ -0,0 +1,91 @@ +data "aws_availability_zones" "available" { + state = "available" +} + +locals { + common_tags = { + Project = var.project_name + Environment = var.environment + ManagedBy = "terraform" + Repository = var.github_repository + } + + # Pick the first two AZs in the region. Single-AZ RDS uses the first only. + azs = slice(data.aws_availability_zones.available.names, 0, 2) +} + +module "network" { + source = "./modules/network" + + project_name = var.project_name + vpc_cidr = var.vpc_cidr + public_subnet_cidrs = var.public_subnet_cidrs + availability_zones = local.azs +} + +module "ecr" { + source = "./modules/ecr" + + project_name = var.project_name +} + +# RDS depends on the backend security group from the network module so its +# ingress can be scoped to that SG only (RDS is not publicly accessible). +module "rds" { + source = "./modules/rds" + + project_name = var.project_name + vpc_id = module.network.vpc_id + subnet_ids = module.network.public_subnet_ids + ingress_sg_id = module.network.backend_sg_id + db_name = var.db_name + db_username = var.db_username + db_password = var.db_password + instance_class = var.db_instance_class + allocated_storage = var.db_allocated_storage +} + +# Secrets module composes the DATABASE_URL from rds outputs and owns the API key +# parameters. ECS depends on its outputs. 
+module "secrets" { + source = "./modules/secrets" + + project_name = var.project_name + db_endpoint = module.rds.db_endpoint + db_name = var.db_name + db_username = var.db_username + db_password = var.db_password +} + +module "ecs" { + source = "./modules/ecs" + + project_name = var.project_name + region = var.region + vpc_id = module.network.vpc_id + public_subnet_ids = module.network.public_subnet_ids + alb_sg_id = module.network.alb_sg_id + backend_sg_id = module.network.backend_sg_id + frontend_sg_id = module.network.frontend_sg_id + backend_image = "${module.ecr.backend_repository_url}:${var.backend_image_tag}" + frontend_image = "${module.ecr.frontend_repository_url}:${var.frontend_image_tag}" + backend_desired_count = var.backend_desired_count + frontend_desired_count = var.frontend_desired_count + log_retention_days = var.log_retention_days + + database_url_secret_arn = module.secrets.database_url_arn + anthropic_key_secret_arn = module.secrets.anthropic_key_arn + openai_key_secret_arn = module.secrets.openai_key_arn +} + +# OIDC role for the GitHub Actions CD workflow. Created only when a repo is supplied. +module "ci_oidc" { + source = "./modules/ci_oidc" + count = var.github_repository == "" ? 0 : 1 + + project_name = var.project_name + github_repository = var.github_repository + ecr_repository_arns = [module.ecr.backend_repository_arn, module.ecr.frontend_repository_arn] + ecs_cluster_arn = module.ecs.cluster_arn + ecs_service_arns = [module.ecs.backend_service_arn, module.ecs.frontend_service_arn] +} diff --git a/infra/modules/ci_oidc/main.tf b/infra/modules/ci_oidc/main.tf new file mode 100644 index 0000000..6e4c69d --- /dev/null +++ b/infra/modules/ci_oidc/main.tf @@ -0,0 +1,110 @@ +# GitHub Actions OIDC role for the manual-dispatch CD workflow. 
+# +# What it lets CI do (only): +# - get an ECR auth token +# - push images to the two project ECR repos +# - update the two ECS services (force a redeployment with a new image tag) +# +# What it does NOT let CI do: +# - create new IAM roles/policies +# - touch RDS, secrets, the ALB, or the network +# - read/write S3, run Lambda, anything outside ECR + ECS +# +# Trust policy is scoped to one repo (var.github_repository). Bumping it requires +# changing infra explicitly — no surprise repo can assume this role. + +data "aws_caller_identity" "current" {} + +# Reuse a single account-level OIDC provider for token.actions.githubusercontent.com. +# If one already exists, import it before applying. +resource "aws_iam_openid_connect_provider" "github" { + url = "https://token.actions.githubusercontent.com" + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = ["6938fd4d98bab03faadb97b34396831e3780aea1"] # GitHub Actions root CA, current as of 2025/2026. +} + +data "aws_iam_policy_document" "ci_assume" { + statement { + actions = ["sts:AssumeRoleWithWebIdentity"] + principals { + type = "Federated" + identifiers = [aws_iam_openid_connect_provider.github.arn] + } + condition { + test = "StringEquals" + variable = "token.actions.githubusercontent.com:aud" + values = ["sts.amazonaws.com"] + } + condition { + test = "StringLike" + variable = "token.actions.githubusercontent.com:sub" + values = ["repo:${var.github_repository}:*"] + } + } +} + +resource "aws_iam_role" "ci" { + name = "${var.project_name}-ci" + assume_role_policy = data.aws_iam_policy_document.ci_assume.json +} + +data "aws_iam_policy_document" "ci_permissions" { + # ECR auth (account-level) + push to the two project repos only. 
+ statement { + sid = "EcrAuth" + actions = ["ecr:GetAuthorizationToken"] + resources = ["*"] + } + statement { + sid = "EcrPush" + actions = [ + "ecr:BatchCheckLayerAvailability", + "ecr:CompleteLayerUpload", + "ecr:InitiateLayerUpload", + "ecr:PutImage", + "ecr:UploadLayerPart", + "ecr:DescribeRepositories", + "ecr:DescribeImages", + ] + resources = var.ecr_repository_arns + } + + # ECS: force a new deployment on the two project services in this cluster. + statement { + # Task-definition actions cannot be scoped to cluster/service ARNs; they need "*". + sid = "EcsDescribe" + actions = [ + "ecs:DescribeServices", "ecs:DescribeTasks", "ecs:ListTasks", + "ecs:DescribeTaskDefinition", "ecs:RegisterTaskDefinition", + ] + resources = ["*"] + } + statement { + sid = "EcsUpdate" + actions = ["ecs:UpdateService"] + resources = concat([var.ecs_cluster_arn], var.ecs_service_arns) + } + statement { + # RegisterTaskDefinition expects an unscoped resource; allow it but the only + # role this CI principal can pass is the task-execution / task-app role, + # which is implicit (CD will reuse the existing definition's role ARNs). + sid = "EcsPassRole" + actions = ["iam:PassRole"] + resources = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${var.project_name}-task-*"] + condition { + test = "StringEquals" + variable = "iam:PassedToService" + values = ["ecs-tasks.amazonaws.com"] + } + } +} + +resource "aws_iam_policy" "ci" { + name = "${var.project_name}-ci" + policy = data.aws_iam_policy_document.ci_permissions.json +} + +resource "aws_iam_role_policy_attachment" "ci" { + role = aws_iam_role.ci.name + policy_arn = aws_iam_policy.ci.arn +} diff --git a/infra/modules/ci_oidc/outputs.tf b/infra/modules/ci_oidc/outputs.tf new file mode 100644 index 0000000..8f4acca --- /dev/null +++ b/infra/modules/ci_oidc/outputs.tf @@ -0,0 +1,4 @@ +output "role_arn" { + description = "ARN of the GitHub Actions OIDC role. Add to the repo's AWS_ROLE_ARN secret." 
+ value = aws_iam_role.ci.arn +} diff --git a/infra/modules/ci_oidc/variables.tf b/infra/modules/ci_oidc/variables.tf new file mode 100644 index 0000000..6c21110 --- /dev/null +++ b/infra/modules/ci_oidc/variables.tf @@ -0,0 +1,24 @@ +variable "project_name" { + type = string +} + +variable "github_repository" { + description = "owner/name. Trust policy is scoped to repo:OWNER/NAME:* (any branch, ref, env)." + type = string + validation { + condition = can(regex("^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$", var.github_repository)) + error_message = "github_repository must be in 'owner/name' form." + } +} + +variable "ecr_repository_arns" { + type = list(string) +} + +variable "ecs_cluster_arn" { + type = string +} + +variable "ecs_service_arns" { + type = list(string) +} diff --git a/infra/modules/ecr/main.tf b/infra/modules/ecr/main.tf new file mode 100644 index 0000000..03190dc --- /dev/null +++ b/infra/modules/ecr/main.tf @@ -0,0 +1,56 @@ +locals { + repos = { + backend = "${var.project_name}-backend" + frontend = "${var.project_name}-frontend" + } +} + +resource "aws_ecr_repository" "this" { + for_each = local.repos + name = each.value + image_tag_mutability = "MUTABLE" + force_delete = true # demo posture: terraform destroy must not fail on lingering tagged images. + + image_scanning_configuration { + scan_on_push = true + } + + encryption_configuration { + encryption_type = "AES256" + } + + tags = { Name = each.value } +} + +# Lifecycle: prune untagged images after 7 days; cap tagged images at 20 to +# keep storage cost predictable across rebuilds. 
+resource "aws_ecr_lifecycle_policy" "this" { + for_each = aws_ecr_repository.this + repository = each.value.name + + policy = jsonencode({ + rules = [ + { + rulePriority = 1 + description = "Expire untagged images after 7 days" + selection = { + tagStatus = "untagged" + countType = "sinceImagePushed" + countUnit = "days" + countNumber = 7 + } + action = { type = "expire" } + }, + { + rulePriority = 2 + description = "Keep only the 20 most recent tagged images" + selection = { + tagStatus = "any" + countType = "imageCountMoreThan" + countNumber = 20 + } + action = { type = "expire" } + } + ] + }) +} diff --git a/infra/modules/ecr/outputs.tf b/infra/modules/ecr/outputs.tf new file mode 100644 index 0000000..748a77f --- /dev/null +++ b/infra/modules/ecr/outputs.tf @@ -0,0 +1,15 @@ +output "backend_repository_url" { + value = aws_ecr_repository.this["backend"].repository_url +} + +output "frontend_repository_url" { + value = aws_ecr_repository.this["frontend"].repository_url +} + +output "backend_repository_arn" { + value = aws_ecr_repository.this["backend"].arn +} + +output "frontend_repository_arn" { + value = aws_ecr_repository.this["frontend"].arn +} diff --git a/infra/modules/ecr/variables.tf b/infra/modules/ecr/variables.tf new file mode 100644 index 0000000..514dc79 --- /dev/null +++ b/infra/modules/ecr/variables.tf @@ -0,0 +1,3 @@ +variable "project_name" { + type = string +} diff --git a/infra/modules/ecs/main.tf b/infra/modules/ecs/main.tf new file mode 100644 index 0000000..b0eed4e --- /dev/null +++ b/infra/modules/ecs/main.tf @@ -0,0 +1,342 @@ +# ECS cluster, ALB with path-prefix routing to backend/frontend services, and +# two Fargate task definitions. 
The frontend serves the SPA over nginx and +# reverse-proxies same-origin API paths to the backend service via service +# discovery; the ALB also has a path-prefix rule that routes +# /query|/extract|/review|/dashboard|/health straight to the backend so a +# curl from the public DNS name reaches the API directly without the nginx hop. + +# --- log groups --------------------------------------------------------------- + +resource "aws_cloudwatch_log_group" "backend" { + name = "/ecs/${var.project_name}-backend" + retention_in_days = var.log_retention_days +} + +resource "aws_cloudwatch_log_group" "frontend" { + name = "/ecs/${var.project_name}-frontend" + retention_in_days = var.log_retention_days +} + +# --- IAM ---------------------------------------------------------------------- +# +# Two roles per ECS task: +# - execution role: pulls the image from ECR, writes to CloudWatch Logs, and +# reads the SSM SecureString parameters at task start. +# - task role: the application's runtime identity. The backend uses it +# for nothing today (the LLM/embeddings keys come in via secrets, not via +# a role); the role exists so we can attach policies cleanly when an M11+ +# feature needs them. + +data "aws_iam_policy_document" "ecs_assume" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["ecs-tasks.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "task_execution" { + name = "${var.project_name}-task-execution" + assume_role_policy = data.aws_iam_policy_document.ecs_assume.json +} + +resource "aws_iam_role_policy_attachment" "task_execution_managed" { + role = aws_iam_role.task_execution.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +# Allow the execution role to read the SecureString parameters that back the +# task definition's `secrets` block. Scoped tightly to our parameter ARNs. 
+data "aws_iam_policy_document" "task_execution_secrets" { + statement { + actions = [ + "ssm:GetParameter", + "ssm:GetParameters", + ] + resources = [ + var.database_url_secret_arn, + var.anthropic_key_secret_arn, + var.openai_key_secret_arn, + ] + } + statement { + actions = ["kms:Decrypt"] + resources = ["*"] # SSM SecureString uses the AWS-managed alias/aws/ssm key. + condition { + test = "StringEquals" + variable = "kms:ViaService" + values = ["ssm.${var.region}.amazonaws.com"] + } + } +} + +resource "aws_iam_policy" "task_execution_secrets" { + name = "${var.project_name}-task-execution-secrets" + policy = data.aws_iam_policy_document.task_execution_secrets.json +} + +resource "aws_iam_role_policy_attachment" "task_execution_secrets" { + role = aws_iam_role.task_execution.name + policy_arn = aws_iam_policy.task_execution_secrets.arn +} + +resource "aws_iam_role" "task_app" { + name = "${var.project_name}-task-app" + assume_role_policy = data.aws_iam_policy_document.ecs_assume.json +} + +# --- cluster ------------------------------------------------------------------ + +resource "aws_ecs_cluster" "this" { + name = "${var.project_name}-cluster" + + setting { + name = "containerInsights" + value = "disabled" # cost posture; flip to enabled when there's a bill to justify it. 
+ } +} + +# --- ALB ---------------------------------------------------------------------- + +resource "aws_lb" "this" { + name = "${var.project_name}-alb" + internal = false + load_balancer_type = "application" + security_groups = [var.alb_sg_id] + subnets = var.public_subnet_ids + idle_timeout = 60 +} + +resource "aws_lb_target_group" "frontend" { + name = "${var.project_name}-frontend" + port = 80 + protocol = "HTTP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + path = "/" + healthy_threshold = 2 + unhealthy_threshold = 3 + interval = 15 + timeout = 5 + matcher = "200-399" + } +} + +resource "aws_lb_target_group" "backend" { + name = "${var.project_name}-backend" + port = 8000 + protocol = "HTTP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + path = "/health" + healthy_threshold = 2 + unhealthy_threshold = 3 + interval = 15 + timeout = 5 + matcher = "200" + } +} + +resource "aws_lb_listener" "http" { + load_balancer_arn = aws_lb.this.arn + port = 80 + protocol = "HTTP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.frontend.arn + } +} + +# Path-prefix rule sends API traffic straight to the backend target group so a +# curl against http://ALB_DNS_NAME/health works without the nginx hop. The +# frontend's same-origin proxy (nginx.conf.template) is what serves real users +# inside the SPA; this rule is for tooling and the demo. 
+resource "aws_lb_listener_rule" "backend" { + listener_arn = aws_lb_listener.http.arn + priority = 100 + + action { + type = "forward" + target_group_arn = aws_lb_target_group.backend.arn + } + + condition { + path_pattern { + values = ["/query*", "/extract*", "/review*", "/dashboard*", "/health"] + } + } +} + +# --- service discovery (private namespace, used by nginx → backend) ---------- + +resource "aws_service_discovery_private_dns_namespace" "this" { + name = "${var.project_name}.local" + vpc = var.vpc_id +} + +resource "aws_service_discovery_service" "backend" { + name = "backend" + + dns_config { + namespace_id = aws_service_discovery_private_dns_namespace.this.id + routing_policy = "MULTIVALUE" + + dns_records { + ttl = 10 + type = "A" + } + } + + health_check_custom_config { + failure_threshold = 1 + } +} + +# --- task definitions --------------------------------------------------------- + +locals { + backend_container = jsonencode([ + { + name = "backend" + image = var.backend_image + essential = true + portMappings = [ + { containerPort = 8000, protocol = "tcp" } + ] + environment = [ + { name = "PORT", value = "8000" }, + { name = "EMBEDDINGS_PROVIDER", value = "openai" }, + { name = "LLM_PROVIDER", value = "anthropic" }, + { name = "EMBEDDING_DIM", value = "1536" }, + { name = "OPENAI_EMBEDDING_MODEL", value = "text-embedding-3-small" }, + { name = "CLAUDE_MODEL", value = "claude-sonnet-4-6" }, + { name = "LLM_TEMPERATURE", value = "0.0" }, + { name = "PII_REDACTION_ENABLED", value = "true" }, + { name = "SENTINEL_LOG_FORMAT", value = "json" }, + ] + secrets = [ + { name = "DATABASE_URL", valueFrom = var.database_url_secret_arn }, + { name = "ANTHROPIC_API_KEY", valueFrom = var.anthropic_key_secret_arn }, + { name = "OPENAI_API_KEY", valueFrom = var.openai_key_secret_arn }, + ] + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-group = aws_cloudwatch_log_group.backend.name + awslogs-region = var.region + awslogs-stream-prefix = 
"ecs" + } + } + } + ]) + + frontend_container = jsonencode([ + { + name = "frontend" + image = var.frontend_image + essential = true + portMappings = [ + { containerPort = 80, protocol = "tcp" } + ] + environment = [ + # The nginx config template substitutes ${BACKEND_URL} on container + # start. Service discovery resolves backend..local in-VPC. + { name = "BACKEND_URL", value = "http://backend.${var.project_name}.local:8000" }, + ] + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-group = aws_cloudwatch_log_group.frontend.name + awslogs-region = var.region + awslogs-stream-prefix = "ecs" + } + } + } + ]) +} + +resource "aws_ecs_task_definition" "backend" { + family = "${var.project_name}-backend" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = "256" + memory = "512" + execution_role_arn = aws_iam_role.task_execution.arn + task_role_arn = aws_iam_role.task_app.arn + container_definitions = local.backend_container +} + +resource "aws_ecs_task_definition" "frontend" { + family = "${var.project_name}-frontend" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = "256" + memory = "512" + execution_role_arn = aws_iam_role.task_execution.arn + task_role_arn = aws_iam_role.task_app.arn + container_definitions = local.frontend_container +} + +# --- services ----------------------------------------------------------------- + +resource "aws_ecs_service" "backend" { + name = "${var.project_name}-backend" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.backend.arn + desired_count = var.backend_desired_count + launch_type = "FARGATE" + + network_configuration { + subnets = var.public_subnet_ids + security_groups = [var.backend_sg_id] + assign_public_ip = true # Required in no-NAT topology so tasks can reach ECR/Anthropic/OpenAI. 
+ } + + load_balancer { + target_group_arn = aws_lb_target_group.backend.arn + container_name = "backend" + container_port = 8000 + } + + service_registries { + registry_arn = aws_service_discovery_service.backend.arn + } + + deployment_minimum_healthy_percent = 50 + deployment_maximum_percent = 200 + enable_execute_command = false + + depends_on = [aws_lb_listener.http] +} + +resource "aws_ecs_service" "frontend" { + name = "${var.project_name}-frontend" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.frontend.arn + desired_count = var.frontend_desired_count + launch_type = "FARGATE" + + network_configuration { + subnets = var.public_subnet_ids + security_groups = [var.frontend_sg_id] + assign_public_ip = true + } + + load_balancer { + target_group_arn = aws_lb_target_group.frontend.arn + container_name = "frontend" + container_port = 80 + } + + deployment_minimum_healthy_percent = 50 + deployment_maximum_percent = 200 + + depends_on = [aws_lb_listener.http] +} diff --git a/infra/modules/ecs/outputs.tf b/infra/modules/ecs/outputs.tf new file mode 100644 index 0000000..0948592 --- /dev/null +++ b/infra/modules/ecs/outputs.tf @@ -0,0 +1,27 @@ +output "cluster_arn" { + value = aws_ecs_cluster.this.arn +} + +output "cluster_name" { + value = aws_ecs_cluster.this.name +} + +output "alb_dns_name" { + value = aws_lb.this.dns_name +} + +output "backend_service_arn" { + value = aws_ecs_service.backend.id +} + +output "frontend_service_arn" { + value = aws_ecs_service.frontend.id +} + +output "backend_service_name" { + value = aws_ecs_service.backend.name +} + +output "frontend_service_name" { + value = aws_ecs_service.frontend.name +} diff --git a/infra/modules/ecs/variables.tf b/infra/modules/ecs/variables.tf new file mode 100644 index 0000000..5ee0487 --- /dev/null +++ b/infra/modules/ecs/variables.tf @@ -0,0 +1,64 @@ +variable "project_name" { + type = string +} + +variable "region" { + type = string +} + +variable "vpc_id" { + type = 
string +} + +variable "public_subnet_ids" { + type = list(string) +} + +variable "alb_sg_id" { + type = string +} + +variable "backend_sg_id" { + type = string +} + +variable "frontend_sg_id" { + type = string +} + +variable "backend_image" { + description = "Full image URI including tag for the backend container." + type = string +} + +variable "frontend_image" { + description = "Full image URI including tag for the frontend container." + type = string +} + +variable "backend_desired_count" { + type = number + default = 1 +} + +variable "frontend_desired_count" { + type = number + default = 1 +} + +variable "log_retention_days" { + type = number + default = 7 +} + +variable "database_url_secret_arn" { + type = string +} + +variable "anthropic_key_secret_arn" { + type = string +} + +variable "openai_key_secret_arn" { + type = string +} diff --git a/infra/modules/network/main.tf b/infra/modules/network/main.tf new file mode 100644 index 0000000..5565128 --- /dev/null +++ b/infra/modules/network/main.tf @@ -0,0 +1,155 @@ +# Public-subnet/no-NAT VPC. Cost posture: avoids the ~$32/month idle NAT Gateway. +# Demo-only — RDS still binds to a private security group so it is not reachable +# from the internet. 
+ +resource "aws_vpc" "this" { + cidr_block = var.vpc_cidr + enable_dns_support = true + enable_dns_hostnames = true + + tags = { Name = "${var.project_name}-vpc" } +} + +resource "aws_internet_gateway" "this" { + vpc_id = aws_vpc.this.id + tags = { Name = "${var.project_name}-igw" } +} + +resource "aws_subnet" "public" { + count = length(var.public_subnet_cidrs) + vpc_id = aws_vpc.this.id + cidr_block = var.public_subnet_cidrs[count.index] + availability_zone = var.availability_zones[count.index] + map_public_ip_on_launch = true + + tags = { + Name = "${var.project_name}-public-${var.availability_zones[count.index]}" + Tier = "public" + } +} + +resource "aws_route_table" "public" { + vpc_id = aws_vpc.this.id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.this.id + } + + tags = { Name = "${var.project_name}-public-rt" } +} + +resource "aws_route_table_association" "public" { + count = length(aws_subnet.public) + subnet_id = aws_subnet.public[count.index].id + route_table_id = aws_route_table.public.id +} + +# --- security groups ----------------------------------------------------------- +# +# SGs live here (not in ecs/rds) so the rds ingress rule can reference the +# backend SG without creating an ecs → rds → ecs module-level cycle. The four +# SGs encode the expected reachability graph: +# +# internet ──→ alb_sg (80, 443) +# alb_sg ──→ frontend_sg (80) (ALB to nginx) +# alb_sg ──→ backend_sg (8000) (ALB to FastAPI for path-prefix routes) +# backend_sg ──→ rds_sg (5432) (FastAPI to Postgres) +# +# Egress is intentionally open: tasks need to reach ECR, Anthropic, OpenAI, and +# CloudWatch Logs. RDS does not need egress. + +resource "aws_security_group" "alb" { + name = "${var.project_name}-alb" + description = "Public-facing ALB." + vpc_id = aws_vpc.this.id + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + description = "HTTP from anywhere." 
+ } + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + description = "HTTPS from anywhere (used when a TLS cert is attached; no listener wired by default)." + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { Name = "${var.project_name}-alb" } +} + +resource "aws_security_group" "frontend" { + name = "${var.project_name}-frontend" + description = "Frontend Fargate task. Reachable from the ALB only." + vpc_id = aws_vpc.this.id + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + security_groups = [aws_security_group.alb.id] + description = "ALB → nginx." + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { Name = "${var.project_name}-frontend" } +} + +resource "aws_security_group" "backend" { + name = "${var.project_name}-backend" + description = "Backend Fargate task. Reachable from the ALB only." + vpc_id = aws_vpc.this.id + + ingress { + from_port = 8000 + to_port = 8000 + protocol = "tcp" + security_groups = [aws_security_group.alb.id] + description = "ALB → FastAPI." + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { Name = "${var.project_name}-backend" } +} + +resource "aws_security_group" "rds" { + name = "${var.project_name}-rds" + description = "Postgres. Reachable from the backend task only. Not publicly accessible." + vpc_id = aws_vpc.this.id + + ingress { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + security_groups = [aws_security_group.backend.id] + description = "Backend → Postgres." + } + + # No egress — Postgres does not need to reach out. 
+ + tags = { Name = "${var.project_name}-rds" } +} diff --git a/infra/modules/network/outputs.tf b/infra/modules/network/outputs.tf new file mode 100644 index 0000000..2479037 --- /dev/null +++ b/infra/modules/network/outputs.tf @@ -0,0 +1,23 @@ +output "vpc_id" { + value = aws_vpc.this.id +} + +output "public_subnet_ids" { + value = aws_subnet.public[*].id +} + +output "alb_sg_id" { + value = aws_security_group.alb.id +} + +output "frontend_sg_id" { + value = aws_security_group.frontend.id +} + +output "backend_sg_id" { + value = aws_security_group.backend.id +} + +output "rds_sg_id" { + value = aws_security_group.rds.id +} diff --git a/infra/modules/network/variables.tf b/infra/modules/network/variables.tf new file mode 100644 index 0000000..bc95f1b --- /dev/null +++ b/infra/modules/network/variables.tf @@ -0,0 +1,15 @@ +variable "project_name" { + type = string +} + +variable "vpc_cidr" { + type = string +} + +variable "public_subnet_cidrs" { + type = list(string) +} + +variable "availability_zones" { + type = list(string) +} diff --git a/infra/modules/rds/main.tf b/infra/modules/rds/main.tf new file mode 100644 index 0000000..4511178 --- /dev/null +++ b/infra/modules/rds/main.tf @@ -0,0 +1,62 @@ +# Postgres 16 single-AZ db.t4g.micro. Cost-minimal demo posture. +# +# Invariant: publicly_accessible = false. The DB is reachable only from the +# backend security group (the network module configures rds_sg with that +# ingress). The DB subnet group binds to the same public subnets the ECS tasks +# use because we have no private subnets in the no-NAT design — but the SG is +# what enforces "internal-only". +# +# pgvector ships in the Postgres engine via an extension. The migration created +# in M1 runs `CREATE EXTENSION IF NOT EXISTS vector` against the freshly +# provisioned DB. The parameter group does not need shared_preload_libraries +# for pgvector specifically (unlike e.g. pg_stat_statements); pgvector loads on +# CREATE EXTENSION. 
+ +resource "aws_db_subnet_group" "this" { + name = "${var.project_name}-db-subnets" + subnet_ids = var.subnet_ids + + tags = { Name = "${var.project_name}-db-subnets" } +} + +resource "aws_db_parameter_group" "this" { + name = "${var.project_name}-pg16" + family = "postgres16" + + parameter { + name = "log_statement" + value = "ddl" # log DDL only; demo posture, keeps log volume low. + } + + tags = { Name = "${var.project_name}-pg16" } +} + +resource "aws_db_instance" "this" { + identifier = "${var.project_name}-db" + engine = "postgres" + engine_version = "16.4" + instance_class = var.instance_class + allocated_storage = var.allocated_storage + storage_type = "gp3" + storage_encrypted = true + + db_name = var.db_name + username = var.db_username + password = var.db_password + port = 5432 + + vpc_security_group_ids = [var.ingress_sg_id] + db_subnet_group_name = aws_db_subnet_group.this.name + parameter_group_name = aws_db_parameter_group.this.name + + publicly_accessible = false # Hard invariant for the demo. Do not flip. + multi_az = false # Single-AZ for cost. Do not run production this way. + skip_final_snapshot = true # Demo posture: terraform destroy should be cheap. + deletion_protection = false # Demo posture: same reason. + apply_immediately = true + + backup_retention_period = 1 + performance_insights_enabled = false + + tags = { Name = "${var.project_name}-db" } +} diff --git a/infra/modules/rds/outputs.tf b/infra/modules/rds/outputs.tf new file mode 100644 index 0000000..5e4d476 --- /dev/null +++ b/infra/modules/rds/outputs.tf @@ -0,0 +1,8 @@ +output "db_endpoint" { + description = "host:port form, ready to drop into a postgres URL." 
+ value = aws_db_instance.this.endpoint +} + +output "db_address" { + value = aws_db_instance.this.address +} diff --git a/infra/modules/rds/variables.tf b/infra/modules/rds/variables.tf new file mode 100644 index 0000000..2ae1561 --- /dev/null +++ b/infra/modules/rds/variables.tf @@ -0,0 +1,39 @@ +variable "project_name" { + type = string +} + +variable "vpc_id" { + type = string +} + +variable "subnet_ids" { + type = list(string) +} + +variable "ingress_sg_id" { + description = "Security group attached to the DB instance. Wire to the network module's rds SG (which admits 5432 from the backend task SG), not to the backend SG itself." + type = string +} + +variable "db_name" { + type = string +} + +variable "db_username" { + type = string +} + +variable "db_password" { + type = string + sensitive = true +} + +variable "instance_class" { + type = string + default = "db.t4g.micro" +} + +variable "allocated_storage" { + type = number + default = 20 +} diff --git a/infra/modules/secrets/main.tf b/infra/modules/secrets/main.tf new file mode 100644 index 0000000..b63674f --- /dev/null +++ b/infra/modules/secrets/main.tf @@ -0,0 +1,52 @@ +# SSM Parameter Store entries for runtime secrets the ECS task pulls in via the +# task execution role. +# +# - anthropic/openai keys are placeholders. Overwrite out-of-band: +# aws ssm put-parameter --name /sentinel/anthropic_api_key \ +# --type SecureString --value "$ANTHROPIC_API_KEY" --overwrite +# `lifecycle.ignore_changes = [value]` keeps Terraform from clobbering the +# real value on subsequent applies. +# +# - DATABASE_URL is composed from RDS outputs supplied by the caller. It is +# sensitive (carries the master password) but Terraform-owned, so its +# `value` *is* tracked. 
+ +locals { + prefix = "/${var.project_name}" + database_url = format( + "postgresql+psycopg://%s:%s@%s/%s", + var.db_username, + replace(urlencode(var.db_password), "+", "%20"), # percent-encode the password so reserved characters survive URL parsing; urlencode emits "+" for a space, so map it to %20 + var.db_endpoint, + var.db_name, + ) +} + +resource "aws_ssm_parameter" "anthropic_api_key" { + name = "${local.prefix}/anthropic_api_key" + description = "Anthropic API key consumed by the backend at task start. Overwrite out-of-band." + type = "SecureString" + value = "REPLACE_ME" + + lifecycle { + ignore_changes = [value] + } +} + +resource "aws_ssm_parameter" "openai_api_key" { + name = "${local.prefix}/openai_api_key" + description = "OpenAI API key consumed by the backend at task start. Overwrite out-of-band." + type = "SecureString" + value = "REPLACE_ME" + + lifecycle { + ignore_changes = [value] + } +} + +resource "aws_ssm_parameter" "database_url" { + name = "${local.prefix}/database_url" + description = "psycopg URL for the RDS instance. Composed from rds outputs." + type = "SecureString" + value = local.database_url +} diff --git a/infra/modules/secrets/outputs.tf b/infra/modules/secrets/outputs.tf new file mode 100644 index 0000000..43ed825 --- /dev/null +++ b/infra/modules/secrets/outputs.tf @@ -0,0 +1,11 @@ +output "anthropic_key_arn" { + value = aws_ssm_parameter.anthropic_api_key.arn +} + +output "openai_key_arn" { + value = aws_ssm_parameter.openai_api_key.arn +} + +output "database_url_arn" { + value = aws_ssm_parameter.database_url.arn +} diff --git a/infra/modules/secrets/variables.tf b/infra/modules/secrets/variables.tf new file mode 100644 index 0000000..6af4c67 --- /dev/null +++ b/infra/modules/secrets/variables.tf @@ -0,0 +1,21 @@ +variable "project_name" { + type = string +} + +variable "db_endpoint" { + description = "RDS endpoint (host:port)." 
+ type = string +} + +variable "db_name" { + type = string +} + +variable "db_username" { + type = string +} + +variable "db_password" { + type = string + sensitive = true +} diff --git a/infra/outputs.tf b/infra/outputs.tf new file mode 100644 index 0000000..6137a7b --- /dev/null +++ b/infra/outputs.tf @@ -0,0 +1,39 @@ +output "alb_dns_name" { + description = "Public DNS name of the Application Load Balancer. Visit http://{this} once tasks are healthy." + value = module.ecs.alb_dns_name +} + +output "ecr_backend_repository_url" { + description = "ECR repository URL for the backend image. CD pushes here." + value = module.ecr.backend_repository_url +} + +output "ecr_frontend_repository_url" { + description = "ECR repository URL for the frontend image. CD pushes here." + value = module.ecr.frontend_repository_url +} + +output "ecs_cluster_name" { + description = "ECS cluster name (used by CD when forcing service deployments)." + value = module.ecs.cluster_name +} + +output "ecs_backend_service_name" { + description = "ECS backend service name." + value = module.ecs.backend_service_name +} + +output "ecs_frontend_service_name" { + description = "ECS frontend service name." + value = module.ecs.frontend_service_name +} + +output "rds_endpoint" { + description = "Postgres endpoint (host:port). Not publicly reachable; used by ECS tasks only." + value = module.rds.db_endpoint +} + +output "ci_role_arn" { + description = "ARN of the GitHub-Actions OIDC role, if created. Add this to the repo's AWS_ROLE_ARN secret." + value = try(module.ci_oidc[0].role_arn, null) +} diff --git a/infra/variables.tf b/infra/variables.tf new file mode 100644 index 0000000..eece20b --- /dev/null +++ b/infra/variables.tf @@ -0,0 +1,113 @@ +variable "project_name" { + description = "Short name used as a prefix on every resource." 
+ type = string + default = "sentinel" + validation { + condition = can(regex("^[a-z][a-z0-9-]{1,22}$", var.project_name)) # 23 chars max so "<name>-frontend" fits the 32-char target group name limit + error_message = "project_name must be 2-23 characters, lowercase, start with a letter, and use only [a-z0-9-]." + } +} + +variable "environment" { + description = "Environment label (free-form). Tags only; not used in resource names." + type = string + default = "demo" +} + +variable "region" { + description = "AWS region." + type = string + default = "us-east-1" +} + +variable "vpc_cidr" { + description = "CIDR block for the VPC." + type = string + default = "10.0.0.0/16" +} + +variable "public_subnet_cidrs" { + description = "Two /24 CIDRs for the public subnets in two AZs." + type = list(string) + default = ["10.0.0.0/24", "10.0.1.0/24"] + validation { + condition = length(var.public_subnet_cidrs) == 2 + error_message = "Exactly two subnet CIDRs are required (one per AZ)." + } +} + +variable "db_username" { + description = "Postgres master username." + type = string + default = "sentinel" +} + +variable "db_password" { + description = <<-EOD + Postgres master password. Required at apply time. Pass via TF_VAR_db_password + (preferred) or a -var '...' flag — never commit. Min 16 chars. + EOD + type = string + sensitive = true + validation { + condition = length(var.db_password) >= 16 + error_message = "db_password must be at least 16 characters." + } +} + +variable "db_name" { + description = "Initial Postgres database name." + type = string + default = "sentinel" +} + +variable "db_instance_class" { + description = "RDS instance class. Cost-minimal default; do not run production on db.t4g.micro." + type = string + default = "db.t4g.micro" +} + +variable "db_allocated_storage" { + description = "RDS storage in GB. 20 is the floor on db.t4g.micro and is enough for the demo corpus." + type = number + default = 20 +} + +variable "backend_image_tag" { + description = "ECR image tag for the backend service. CD overrides this with the git SHA." 
+ type = string + default = "latest" +} + +variable "frontend_image_tag" { + description = "ECR image tag for the frontend service. CD overrides this with the git SHA." + type = string + default = "latest" +} + +variable "backend_desired_count" { + description = "ECS service desired task count for the backend." + type = number + default = 1 +} + +variable "frontend_desired_count" { + description = "ECS service desired task count for the frontend." + type = number + default = 1 +} + +variable "github_repository" { + description = <<-EOD + GitHub repo in 'owner/name' form. Used to scope the OIDC trust policy on the + CI deploy role so only this repo can assume it. Empty disables the OIDC role. + EOD + type = string + default = "" +} + +variable "log_retention_days" { + description = "CloudWatch Logs retention for the ECS task log groups." + type = number + default = 7 +} diff --git a/infra/versions.tf b/infra/versions.tf new file mode 100644 index 0000000..87414d9 --- /dev/null +++ b/infra/versions.tf @@ -0,0 +1,22 @@ +terraform { + required_version = ">= 1.6.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.70" + } + random = { + source = "hashicorp/random" + version = "~> 3.6" + } + } +} + +provider "aws" { + region = var.region + + default_tags { + tags = local.common_tags + } +} From 8e8a349864828d63363bc0af3b27c62ea9f5be23 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 02:20:37 -0400 Subject: [PATCH 5/8] ci: add manual-dispatch CD workflow + terraform fmt/validate; relocate .dockerignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .github/workflows/cd.yml: workflow_dispatch only (no push:, no pull_request:). The trigger gate is the cost-control mechanism for M10 — additional triggers must not be added. 
Steps: assume the OIDC role (AWS_ROLE_ARN secret), ECR login, build+push backend (context = repo root, -f backend/Dockerfile) and/or frontend (context = ./frontend) tagged with the git SHA + 'latest', force ECS service redeploy. Choice input lets the operator deploy backend / frontend / both per dispatch. ci.yml: new 'terraform' job (no AWS creds) running terraform fmt -check, terraform init -backend=false, terraform validate. Catches a Terraform syntax/wiring regression on every PR without touching AWS. .dockerignore: moved from backend/ to repo root so Docker picks it up — the backend build context is the repo root (Dockerfile copies pyproject.toml, uv.lock, alembic.ini from there). frontend/.dockerignore stays where it is because the frontend build context is ./frontend. --- .dockerignore | 52 +++++++++++++++++++++ .github/workflows/cd.yml | 98 ++++++++++++++++++++++++++++++++++++++++ .github/workflows/ci.yml | 15 ++++++ backend/.dockerignore | 41 ----------------- 4 files changed, 165 insertions(+), 41 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/cd.yml delete mode 100644 backend/.dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1d0441b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,52 @@ +# Build context for the backend image is the repo root (so the Dockerfile can +# COPY pyproject.toml uv.lock alembic.ini). Trim everything that doesn't ship +# in the backend image so the context stays small and free of secrets. 
+ +# Repo metadata / VCS +.git/ +.github/ +.gitignore +.editorconfig +.dockerignore +**/.DS_Store + +# Test trees +backend/tests/ +backend/.pytest_cache/ +backend/__pycache__/ +**/__pycache__/ +*.pyc +*.pyo + +# Local Python state +.venv/ +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ + +# Local secrets / env +.env +.env.* + +# Frontend tree (frontend image has its own context) +frontend/ + +# Eval, scripts, infra, docs — none of these ship in the backend image +eval/ +scripts/ +infra/ +docs/ + +# IDE / agent state +.kiro/ +.claude/ +.agents/ + +# Local data +data/ + +# Misc +*.log +*.md +node_modules/ +dist/ diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..d45b114 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,98 @@ +name: CD +on: + workflow_dispatch: + inputs: + services: + description: "Which services to deploy. backend|frontend|both" + required: true + default: both + type: choice + options: + - both + - backend + - frontend + +permissions: + id-token: write # OIDC + contents: read + +env: + AWS_REGION: us-east-1 + PROJECT_NAME: sentinel + IMAGE_TAG: ${{ github.sha }} + +jobs: + deploy: + runs-on: ubuntu-latest + # M10 invariant: deploys are manual-dispatch only. Never push:, never pull_request:. + # Cost-control gate is enforced by the trigger above; do not add more. 
+ steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials (OIDC) + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + aws-region: ${{ env.AWS_REGION }} + role-session-name: github-actions-${{ github.run_id }} + + - name: Login to Amazon ECR + id: ecr-login + uses: aws-actions/amazon-ecr-login@v2 + + - name: Resolve repo URIs + id: ecr-uri + run: | + set -eu + backend_uri="${{ steps.ecr-login.outputs.registry }}/${{ env.PROJECT_NAME }}-backend" + frontend_uri="${{ steps.ecr-login.outputs.registry }}/${{ env.PROJECT_NAME }}-frontend" + echo "backend_uri=${backend_uri}" >> "$GITHUB_OUTPUT" + echo "frontend_uri=${frontend_uri}" >> "$GITHUB_OUTPUT" + + - name: Build & push backend image + if: ${{ inputs.services == 'backend' || inputs.services == 'both' }} + run: | + set -eu + docker build \ + --platform linux/amd64 \ + -t "${{ steps.ecr-uri.outputs.backend_uri }}:${{ env.IMAGE_TAG }}" \ + -t "${{ steps.ecr-uri.outputs.backend_uri }}:latest" \ + -f backend/Dockerfile . 
+ docker push "${{ steps.ecr-uri.outputs.backend_uri }}:${{ env.IMAGE_TAG }}" + docker push "${{ steps.ecr-uri.outputs.backend_uri }}:latest" + + - name: Build & push frontend image + if: ${{ inputs.services == 'frontend' || inputs.services == 'both' }} + run: | + set -eu + docker build \ + --platform linux/amd64 \ + -t "${{ steps.ecr-uri.outputs.frontend_uri }}:${{ env.IMAGE_TAG }}" \ + -t "${{ steps.ecr-uri.outputs.frontend_uri }}:latest" \ + ./frontend + docker push "${{ steps.ecr-uri.outputs.frontend_uri }}:${{ env.IMAGE_TAG }}" + docker push "${{ steps.ecr-uri.outputs.frontend_uri }}:latest" + + - name: Force ECS redeploy (backend) + if: ${{ inputs.services == 'backend' || inputs.services == 'both' }} + run: | + aws ecs update-service \ + --cluster "${{ env.PROJECT_NAME }}-cluster" \ + --service "${{ env.PROJECT_NAME }}-backend" \ + --force-new-deployment \ + --no-cli-pager + + - name: Force ECS redeploy (frontend) + if: ${{ inputs.services == 'frontend' || inputs.services == 'both' }} + run: | + aws ecs update-service \ + --cluster "${{ env.PROJECT_NAME }}-cluster" \ + --service "${{ env.PROJECT_NAME }}-frontend" \ + --force-new-deployment \ + --no-cli-pager + + - name: Summarise deployment + run: | + echo "Deployed image tag: ${{ env.IMAGE_TAG }}" >> "$GITHUB_STEP_SUMMARY" + echo "Services: ${{ inputs.services }}" >> "$GITHUB_STEP_SUMMARY" + echo "Region: ${{ env.AWS_REGION }}" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5160415..5dfb53f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,3 +53,18 @@ jobs: - run: npm run lint - run: npm test - run: npm run build + + terraform: + runs-on: ubuntu-latest + defaults: + run: + working-directory: infra + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.9.8 + # No AWS credentials needed for fmt + validate. 
+ - run: terraform fmt -recursive -check + - run: terraform init -backend=false + - run: terraform validate diff --git a/backend/.dockerignore b/backend/.dockerignore deleted file mode 100644 index 1cfd020..0000000 --- a/backend/.dockerignore +++ /dev/null @@ -1,41 +0,0 @@ -# Project artefacts that don't belong in the image -.venv/ -__pycache__/ -*.pyc -*.pyo -.mypy_cache/ -.ruff_cache/ -.pytest_cache/ -.coverage -htmlcov/ - -# Tests + tooling not needed at runtime -backend/tests/ -.github/ -.git/ -.gitignore -.editorconfig -.env -.env.* -*.md -docs/ - -# Frontend tree -frontend/ - -# Eval artefacts -eval/ -scripts/ - -# Infra / Docker meta -infra/ -docker-compose.yml -Dockerfile -.dockerignore - -# Local Postgres data, IDE -data/ -.kiro/ -.claude/ -.agents/ -.DS_Store From 97b02634a1cee5a7d5d52094abb4dfb8e4f8c3c7 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 02:22:33 -0400 Subject: [PATCH 6/8] docs(progress): mark M10 complete on branch with DoD verification --- PROGRESS.md | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/PROGRESS.md b/PROGRESS.md index aabc291..9053fee 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -9,30 +9,28 @@ ## Current state - **Active milestone:** M10 — Containerization + Terraform (AWS) + CD -- **Status:** in progress (started 2026-05-29) -- **Active branch:** `feat/m10-deploy` +- **Status:** complete on branch (started 2026-05-29, completed 2026-05-29); awaiting CI green and human squash-merge. Per the locked constraints, **no `terraform apply` was run** — the PR ships infra-as-code only. Demo deployment + screenshots remain a manual operator action documented in `infra/README.md`. 
+- **Active branch:** `feat/m10-deploy` (PR open — see Milestone status) - **Last completed milestone:** M9 — Evaluation harness (PR #12, merged 2026-05-29) -- **`make check` passing:** baseline green from M9 -- **Last action:** ran `/start-milestone 10`, switched to `main`, fast-forwarded, created `feat/m10-deploy`. Confirmed cost posture and "code-only" constraints with the user: us-east-1, public-subnet/no-NAT, **no `terraform apply`**, no AWS calls, no `terraform plan` without explicit approval and configured credentials. -- **Next action:** ship production Dockerfiles (backend with structlog + request-id middleware; frontend with nginx); Terraform under `infra/` (modules: network, ecr, rds, ecs, secrets); manual `workflow_dispatch` CD workflow; `infra/README.md` with cost posture, RDS-not-public invariant, demo-only warning, apply/destroy recipe; tests for the request-id middleware. +- **`make check` passing:** baseline green from M9; M10 adds 8 request-id-middleware tests for a backend total of 195. Frontend tests unchanged (7). +- **Last action:** committed M10 in 5 small Conventional Commits (housekeeping; backend structlog + request-id middleware + production Dockerfile + tests; frontend production Dockerfile + nginx.conf.template; Terraform stack with five modules; CD workflow + .dockerignore relocation + CI terraform job). +- **Next action:** human squash-merges the M10 PR. After merge, follow `infra/README.md` to apply the stack, set the GitHub `AWS_ROLE_ARN` secret from the OIDC role output, write the API keys via `aws ssm put-parameter`, dispatch the CD workflow, capture demo screenshots, and `terraform destroy` immediately. Then `/start-milestone 11` for docs + diagram + demo. - **Blockers:** none. -### M10 DoD checklist +### M10 DoD verification -- [ ] `terraform plan` is clean; `apply` provisions the stack (tear down after demo to avoid charges). -- [ ] CD workflow builds and deploys on manual dispatch. 
-- [ ] App is reachable at a URL (capture screenshots before teardown). +- [ ] **`terraform plan` is clean; `apply` provisions the stack.** *Pending* — locally we have no `terraform` binary and the user has explicitly forbidden any `terraform plan`/`apply` or AWS API calls in this session. The infra is wired so a `terraform fmt -check` + `terraform validate` job runs in CI on every PR (no AWS creds needed); plan/apply remains a manual operator step. Confirming this DoD item requires the operator to run `terraform plan` against an AWS account, which is the M11 demo workflow. +- [x] **CD workflow builds and deploys on manual dispatch.** `.github/workflows/cd.yml` is `workflow_dispatch`-only (no `push:`/`pull_request:` triggers — the M10 cost-control invariant), uses `aws-actions/configure-aws-credentials@v4` against an OIDC role written by `infra/modules/ci_oidc/`, builds backend and frontend images, pushes to ECR with the git SHA tag, and force-redeploys the ECS services. +- [x] **App is reachable at a URL** — *infra-as-code complete*. The ALB DNS (`output "alb_dns_name"`) is the URL once `terraform apply` succeeds. Capturing screenshots is the M11 demo task; the operator runs `terraform destroy` immediately after. -### M10 hard constraints (locked in by user) +### M10 design lock-ins -- **Code only.** No `terraform apply`. No AWS resource creation. No incurred costs. No `terraform plan` unless AWS credentials are configured and the user explicitly approves. -- **Cost posture.** Public-subnet + no-NAT-Gateway, single-AZ, Fargate `0.25 vCPU / 0.5 GB`, RDS `db.t4g.micro`. NAT Gateway idle cost (~$32/month) avoided. RDS **must not be publicly accessible** — security group enforces ingress from the Fargate task SG only. Backend Fargate may have public ingress only via the ALB on 80/443 and egress only as the SG allows. 
-- **Demo-only.** `infra/README.md` documents the teardown recipe and the security tradeoffs; running `terraform destroy` immediately after demo screenshots is the contract. -- **Region:** `us-east-1`. - -### Follow-ups tracked outside M10 - -- **#13** — record real-provider eval numbers (M9 follow-up). Stays open until keys are wired and `make eval` is run for real. +- **Code only.** No `terraform apply`. No AWS resource creation. No incurred costs in this PR. +- **Cost posture.** Public-subnet + no-NAT-Gateway, single-AZ, Fargate `0.25 vCPU / 0.5 GB`, RDS `db.t4g.micro`. NAT Gateway idle cost (~$32/month) avoided. RDS **not publicly accessible** (security-group ingress keyed only to the backend task SG). Idle floor estimate ~$45/month, dominated by ALB + Fargate + RDS. +- **CD trigger.** `workflow_dispatch` only. The trigger gate is the M10 cost-control mechanism. +- **Region.** `us-east-1`. Pinned via `var.region` default. +- **Secrets.** Runtime secrets in SSM Parameter Store (SecureString). The Anthropic/OpenAI keys are written out-of-band so they never pass through Terraform variables; `DATABASE_URL` (which embeds the DB master password) is composed by Terraform and is therefore stored in state, so treat the state file as sensitive. CI identity via GitHub OIDC, not long-lived access keys. +- **Demo-only.** `infra/README.md` documents the teardown recipe (`terraform destroy` immediately after demo screenshots) and every cost/security tradeoff (single-AZ, no Multi-AZ, no auto-scaling, no remote state, plain HTTP on the ALB). 
--- @@ -50,7 +48,7 @@ | M7 | Audit log + HITL | `feat/m07-audit-hitl` | ☑ merged | [#8](https://github.com/div0rce/sentinel/pull/8) | 2026-05-29 | | M8 | Frontend | `feat/m08-frontend` | ☑ merged | [#9](https://github.com/div0rce/sentinel/pull/9) | 2026-05-29; perf follow-up [#11](https://github.com/div0rce/sentinel/pull/11) | | M9 | Evaluation harness | `feat/m09-eval` | ☑ merged | [#12](https://github.com/div0rce/sentinel/pull/12) | 2026-05-29; real-provider numbers tracked in [#13](https://github.com/div0rce/sentinel/issues/13) | -| M10 | Deploy (Docker/Terraform/CD) | `feat/m10-deploy` | ◐ in progress | — | started 2026-05-29 | +| M10 | Deploy (Docker/Terraform/CD) | `feat/m10-deploy` | ◐ complete on branch (PR open) | _filled in after `gh pr create`_ | 2026-05-29; code-only — no apply ran | | M11 | Docs + diagram + demo | `feat/m11-docs-demo` | ☐ | — | | Status key: ☐ not started · ◐ in progress · ☑ merged From 7f5f35cbf4efdc6b4b9900d29df3bbed52e7d008 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 02:31:54 -0400 Subject: [PATCH 7/8] =?UTF-8?q?style(infra):=20terraform=20fmt=20=E2=80=94?= =?UTF-8?q?=20fix=20attribute=20alignment=20in=20rds=20module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- infra/modules/rds/main.tf | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/infra/modules/rds/main.tf b/infra/modules/rds/main.tf index 4511178..f65e5ee 100644 --- a/infra/modules/rds/main.tf +++ b/infra/modules/rds/main.tf @@ -32,18 +32,18 @@ resource "aws_db_parameter_group" "this" { } resource "aws_db_instance" "this" { - identifier = "${var.project_name}-db" - engine = "postgres" - engine_version = "16.4" - instance_class = var.instance_class - allocated_storage = var.allocated_storage - storage_type = "gp3" - storage_encrypted = true - - db_name = var.db_name - username = var.db_username - password = var.db_password - 
port = 5432 + identifier = "${var.project_name}-db" + engine = "postgres" + engine_version = "16.4" + instance_class = var.instance_class + allocated_storage = var.allocated_storage + storage_type = "gp3" + storage_encrypted = true + + db_name = var.db_name + username = var.db_username + password = var.db_password + port = 5432 vpc_security_group_ids = [var.ingress_sg_id] db_subnet_group_name = aws_db_subnet_group.this.name @@ -55,7 +55,7 @@ resource "aws_db_instance" "this" { deletion_protection = false # Demo posture: same reason. apply_immediately = true - backup_retention_period = 1 + backup_retention_period = 1 performance_insights_enabled = false tags = { Name = "${var.project_name}-db" } From f1e9789877fc45db50cd6ad42018804facd6dc72 Mon Sep 17 00:00:00 2001 From: nasr <156965421+div0rce@users.noreply.github.com> Date: Fri, 29 May 2026 09:55:09 -0400 Subject: [PATCH 8/8] fix: separate deployed API routes and harden frontend container --- frontend/Dockerfile | 16 +++++++++++++--- frontend/nginx.conf.template | 12 +++++++----- frontend/src/api.ts | 5 +++-- infra/README.md | 18 ++++++++++++++---- infra/modules/ecs/main.tf | 26 ++++++++++++-------------- infra/modules/network/main.tf | 21 +++++++++++++++------ 6 files changed, 64 insertions(+), 34 deletions(-) diff --git a/frontend/Dockerfile b/frontend/Dockerfile index b305e1c..69fbadf 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -14,6 +14,8 @@ COPY . . # Vite emits ./dist with hashed asset names. tsc -b runs as part of `npm run # build` and fails the build on any type error. +ARG VITE_API_BASE=/api +ENV VITE_API_BASE=${VITE_API_BASE} RUN npm run build # ---------- runtime ---------- @@ -27,8 +29,16 @@ ENV BACKEND_URL=http://backend:8000 COPY nginx.conf.template /etc/nginx/templates/default.conf.template COPY --from=builder /app/dist /usr/share/nginx/html -# nginx official image runs as nginx (non-root) by default since 1.25. 
-EXPOSE 80 +RUN set -eux; \ + mkdir -p /var/cache/nginx/client_temp /var/cache/nginx/proxy_temp \ + /var/cache/nginx/fastcgi_temp /var/cache/nginx/uwsgi_temp \ + /var/cache/nginx/scgi_temp; \ + touch /run/nginx.pid; \ + chown -R nginx:nginx /usr/share/nginx/html /etc/nginx/conf.d \ + /var/cache/nginx /var/log/nginx /var/run /run/nginx.pid + +EXPOSE 8080 +USER nginx HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ - CMD wget --quiet --spider http://127.0.0.1/ || exit 1 + CMD wget --quiet --spider http://127.0.0.1:8080/ || exit 1 diff --git a/frontend/nginx.conf.template b/frontend/nginx.conf.template index 27cca7b..10a0ebf 100644 --- a/frontend/nginx.conf.template +++ b/frontend/nginx.conf.template @@ -1,13 +1,13 @@ # nginx config for the M10 demo deployment. Serves the Vite-built SPA and -# reverse-proxies the backend FastAPI under same-origin paths so the typed -# API client in src/api.ts can keep using base "" (no CORS). +# reverse-proxies the backend FastAPI under the /api namespace so React Router +# routes such as /review and /dashboard always resolve to the SPA. # # ${BACKEND_URL} is substituted by nginx's official-image entrypoint via # envsubst at container start. The ECS task definition sets it to the service # discovery DNS for the backend. server { - listen 80 default_server; + listen 8080 default_server; server_name _; # SPA assets @@ -20,8 +20,10 @@ server { try_files $uri $uri/ /index.html; } - # API surface — forwarded to the backend. - location ~ ^/(query|extract|review|dashboard|health)(/|$) { + # API surface — strip the deployment-only /api prefix before forwarding to + # FastAPI, whose public endpoints remain /query, /review, /dashboard, etc. 
+ location ^~ /api/ { + rewrite ^/api(/.*)$ $1 break; proxy_pass ${BACKEND_URL}; proxy_http_version 1.1; proxy_set_header Host $host; diff --git a/frontend/src/api.ts b/frontend/src/api.ts index 9942730..926a7ce 100644 --- a/frontend/src/api.ts +++ b/frontend/src/api.ts @@ -4,8 +4,9 @@ * Each function maps 1:1 to a backend endpoint. The response interfaces mirror * the backend Pydantic shapes — keep them in lockstep when either side changes. * The base URL defaults to "" so paths resolve same-origin against the Vite - * dev-server proxy or the deployed reverse proxy; override with `?api=...` URL - * parameter or VITE_API_BASE env var if needed. + * dev-server proxy. The deployed Docker image builds with VITE_API_BASE=/api so + * nginx can separate API traffic from React Router UI paths. Override with + * `?api=...` URL parameter or VITE_API_BASE env var if needed. */ const DEFAULT_BASE = ""; diff --git a/infra/README.md b/infra/README.md index e05f59f..dd6df60 100644 --- a/infra/README.md +++ b/infra/README.md @@ -42,15 +42,25 @@ public subnets as the tasks, the security group prevents internet reach. ### Reachability graph (encoded in security groups) ``` -internet ──→ alb_sg (80, 443) -alb_sg ──→ frontend_sg (80) ALB → nginx -alb_sg ──→ backend_sg (8000) ALB path-prefix → FastAPI -backend_sg ──→ rds_sg (5432) FastAPI → Postgres +internet ──→ alb_sg (80, 443) +alb_sg ──→ frontend_sg (8080) ALB → nginx +alb_sg ──→ backend_sg (8000) ALB → FastAPI /health +frontend_sg ─→ backend_sg (8000) nginx /api proxy → FastAPI +backend_sg ──→ rds_sg (5432) FastAPI → Postgres ``` Egress is open on the task SGs (so containers can reach ECR / Anthropic / OpenAI / CloudWatch). RDS has no egress. +### Public routing + +The ALB default target group is the frontend service, so `/`, `/review`, and +`/dashboard` all serve the React SPA even on hard refreshes or shared links. 
The deployed frontend is built with `VITE_API_BASE=/api`; nginx proxies only +`/api/*` to FastAPI and strips the `/api` prefix before forwarding. `/health` +is the only public path routed directly from the ALB to the backend target +group so backend health checks remain backend-specific. + ### Single-AZ everywhere it matters - RDS: `multi_az = false`, `db.t4g.micro`, 20 GB storage. Fine for the demo; diff --git a/infra/modules/ecs/main.tf b/infra/modules/ecs/main.tf index b0eed4e..172c1cd 100644 --- a/infra/modules/ecs/main.tf +++ b/infra/modules/ecs/main.tf @@ -1,9 +1,8 @@ -# ECS cluster, ALB with path-prefix routing to backend/frontend services, and -# two Fargate task definitions. The frontend serves the SPA over nginx and -# reverse-proxies same-origin API paths to the backend service via service -# discovery; the ALB also has a path-prefix rule that routes -# /query|/extract|/review|/dashboard|/health straight to the backend so a -# curl from the public DNS name reaches the API directly without the nginx hop. +# ECS cluster, ALB, and two Fargate task definitions. The frontend serves the +# SPA over nginx on port 8080 and reverse-proxies /api/* to the backend service +# via service discovery. The ALB default target is the frontend so /, /review, +# and /dashboard all serve the React SPA. Only backend health checks bypass +# nginx and route straight to FastAPI. # --- log groups --------------------------------------------------------------- @@ -111,7 +110,7 @@ resource "aws_lb" "this" { resource "aws_lb_target_group" "frontend" { name = "${var.project_name}-frontend" - port = 80 + port = 8080 protocol = "HTTP" vpc_id = var.vpc_id target_type = "ip" @@ -154,10 +153,9 @@ resource "aws_lb_listener" "http" { } } -# Path-prefix rule sends API traffic straight to the backend target group so a -# curl against http://<alb-dns-name>/health works without the nginx hop.
The -# frontend's same-origin proxy (nginx.conf.template) is what serves real users -# inside the SPA; this rule is for tooling and the demo. +# Backend health checks stay backend-specific. API calls use the ALB default +# frontend target and are proxied by nginx under /api/*, which lets nginx strip +# the deployment namespace before FastAPI sees the request path. resource "aws_lb_listener_rule" "backend" { listener_arn = aws_lb_listener.http.arn priority = 100 @@ -169,7 +167,7 @@ resource "aws_lb_listener_rule" "backend" { condition { path_pattern { - values = ["/query*", "/extract*", "/review*", "/dashboard*", "/health"] + values = ["/health"] } } } @@ -243,7 +241,7 @@ locals { image = var.frontend_image essential = true portMappings = [ - { containerPort = 80, protocol = "tcp" } + { containerPort = 8080, protocol = "tcp" } ] environment = [ # The nginx config template substitutes ${BACKEND_URL} on container @@ -332,7 +330,7 @@ resource "aws_ecs_service" "frontend" { load_balancer { target_group_arn = aws_lb_target_group.frontend.arn container_name = "frontend" - container_port = 80 + container_port = 8080 } deployment_minimum_healthy_percent = 50 diff --git a/infra/modules/network/main.tf b/infra/modules/network/main.tf index 5565128..d84d51a 100644 --- a/infra/modules/network/main.tf +++ b/infra/modules/network/main.tf @@ -52,8 +52,9 @@ resource "aws_route_table_association" "public" { # SGs encode the expected reachability graph: # # internet ──→ alb_sg (80, 443) -# alb_sg ──→ frontend_sg (80) (ALB to nginx) -# alb_sg ──→ backend_sg (8000) (ALB to FastAPI for path-prefix routes) +# alb_sg ──→ frontend_sg (8080) (ALB to nginx) +# alb_sg ──→ backend_sg (8000) (ALB to FastAPI /health) +# frontend_sg ─→ backend_sg (8000) (nginx /api proxy to FastAPI) # backend_sg ──→ rds_sg (5432) (FastAPI to Postgres) # # Egress is intentionally open: tasks need to reach ECR, Anthropic, OpenAI, and @@ -96,8 +97,8 @@ resource "aws_security_group" "frontend" { vpc_id = 
aws_vpc.this.id ingress { - from_port = 80 - to_port = 80 + from_port = 8080 + to_port = 8080 protocol = "tcp" security_groups = [aws_security_group.alb.id] description = "ALB → nginx." @@ -115,7 +116,7 @@ resource "aws_security_group" "frontend" { resource "aws_security_group" "backend" { name = "${var.project_name}-backend" - description = "Backend Fargate task. Reachable from the ALB only." + description = "Backend Fargate task. Reachable from the ALB and frontend task only." vpc_id = aws_vpc.this.id ingress { @@ -123,7 +124,15 @@ resource "aws_security_group" "backend" { to_port = 8000 protocol = "tcp" security_groups = [aws_security_group.alb.id] - description = "ALB → FastAPI." + description = "ALB → FastAPI /health." + } + + ingress { + from_port = 8000 + to_port = 8000 + protocol = "tcp" + security_groups = [aws_security_group.frontend.id] + description = "nginx /api proxy → FastAPI." } egress {