diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1d0441b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,52 @@ +# Build context for the backend image is the repo root (so the Dockerfile can +# COPY pyproject.toml uv.lock alembic.ini). Trim everything that doesn't ship +# in the backend image so the context stays small and free of secrets. + +# Repo metadata / VCS +.git/ +.github/ +.gitignore +.editorconfig +.dockerignore +**/.DS_Store + +# Test trees +backend/tests/ +backend/.pytest_cache/ +backend/__pycache__/ +**/__pycache__/ +*.pyc +*.pyo + +# Local Python state +.venv/ +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ + +# Local secrets / env +.env +.env.* + +# Frontend tree (frontend image has its own context) +frontend/ + +# Eval, scripts, infra, docs — none of these ship in the backend image +eval/ +scripts/ +infra/ +docs/ + +# IDE / agent state +.kiro/ +.claude/ +.agents/ + +# Local data +data/ + +# Misc +*.log +*.md +node_modules/ +dist/ diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..d45b114 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,98 @@ +name: CD +on: + workflow_dispatch: + inputs: + services: + description: "Which services to deploy. backend|frontend|both" + required: true + default: both + type: choice + options: + - both + - backend + - frontend + +permissions: + id-token: write # OIDC + contents: read + +env: + AWS_REGION: us-east-1 + PROJECT_NAME: sentinel + IMAGE_TAG: ${{ github.sha }} + +jobs: + deploy: + runs-on: ubuntu-latest + # M10 invariant: deploys are manual-dispatch only. Never push:, never pull_request:. + # Cost-control gate is enforced by the trigger above; do not add more. 
+ steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials (OIDC) + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + aws-region: ${{ env.AWS_REGION }} + role-session-name: github-actions-${{ github.run_id }} + + - name: Login to Amazon ECR + id: ecr-login + uses: aws-actions/amazon-ecr-login@v2 + + - name: Resolve repo URIs + id: ecr-uri + run: | + set -eu + backend_uri="${{ steps.ecr-login.outputs.registry }}/${{ env.PROJECT_NAME }}-backend" + frontend_uri="${{ steps.ecr-login.outputs.registry }}/${{ env.PROJECT_NAME }}-frontend" + echo "backend_uri=${backend_uri}" >> "$GITHUB_OUTPUT" + echo "frontend_uri=${frontend_uri}" >> "$GITHUB_OUTPUT" + + - name: Build & push backend image + if: ${{ inputs.services == 'backend' || inputs.services == 'both' }} + run: | + set -eu + docker build \ + --platform linux/amd64 \ + -t "${{ steps.ecr-uri.outputs.backend_uri }}:${{ env.IMAGE_TAG }}" \ + -t "${{ steps.ecr-uri.outputs.backend_uri }}:latest" \ + -f backend/Dockerfile . 
+ docker push "${{ steps.ecr-uri.outputs.backend_uri }}:${{ env.IMAGE_TAG }}" + docker push "${{ steps.ecr-uri.outputs.backend_uri }}:latest" + + - name: Build & push frontend image + if: ${{ inputs.services == 'frontend' || inputs.services == 'both' }} + run: | + set -eu + docker build \ + --platform linux/amd64 \ + -t "${{ steps.ecr-uri.outputs.frontend_uri }}:${{ env.IMAGE_TAG }}" \ + -t "${{ steps.ecr-uri.outputs.frontend_uri }}:latest" \ + ./frontend + docker push "${{ steps.ecr-uri.outputs.frontend_uri }}:${{ env.IMAGE_TAG }}" + docker push "${{ steps.ecr-uri.outputs.frontend_uri }}:latest" + + - name: Force ECS redeploy (backend) + if: ${{ inputs.services == 'backend' || inputs.services == 'both' }} + run: | + aws ecs update-service \ + --cluster "${{ env.PROJECT_NAME }}-cluster" \ + --service "${{ env.PROJECT_NAME }}-backend" \ + --force-new-deployment \ + --no-cli-pager + + - name: Force ECS redeploy (frontend) + if: ${{ inputs.services == 'frontend' || inputs.services == 'both' }} + run: | + aws ecs update-service \ + --cluster "${{ env.PROJECT_NAME }}-cluster" \ + --service "${{ env.PROJECT_NAME }}-frontend" \ + --force-new-deployment \ + --no-cli-pager + + - name: Summarise deployment + run: | + echo "Deployed image tag: ${{ env.IMAGE_TAG }}" >> "$GITHUB_STEP_SUMMARY" + echo "Services: ${{ inputs.services }}" >> "$GITHUB_STEP_SUMMARY" + echo "Region: ${{ env.AWS_REGION }}" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5160415..5dfb53f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,3 +53,18 @@ jobs: - run: npm run lint - run: npm test - run: npm run build + + terraform: + runs-on: ubuntu-latest + defaults: + run: + working-directory: infra + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.9.8 + # No AWS credentials needed for fmt + validate. 
+ - run: terraform fmt -recursive -check + - run: terraform init -backend=false + - run: terraform validate diff --git a/PROGRESS.md b/PROGRESS.md index c102541..9053fee 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -8,27 +8,29 @@ ## Current state -- **Active milestone:** M9 — Evaluation harness (résumé metrics) -- **Status:** complete on branch (started 2026-05-29, completed 2026-05-29); awaiting CI green and human squash-merge -- **Active branch:** `feat/m09-eval` (PR open — see Milestone status) -- **Last completed milestone:** M8 — Frontend (PR #9, merged 2026-05-29) + perf follow-up (PR #11, merged 2026-05-29) -- **`make check` passing:** yes locally on a freshly migrated DB (187 backend tests + 7 frontend tests; tsc + vite build clean) -- **Last action:** committed the M9 work in 3 small Conventional Commits (PROGRESS housekeeping; eval package + labels + RESULTS.md PENDING; tests + docs/evaluation.md + Settings model bump). Verified `make eval` under fake providers prints `n/a` and refuses to publish numbers; 9 asserted-fixture tests prove the scorer + writer end-to-end. -- **Next action:** human squash-merges the M9 PR. After merge, wire `ANTHROPIC_API_KEY` and `OPENAI_API_KEY`, run `make eval`, and overwrite `eval/RESULTS.md` with real numbers in the immediate follow-up commit. Then `/start-milestone 10` for containerization + Terraform + CD. +- **Active milestone:** M10 — Containerization + Terraform (AWS) + CD +- **Status:** complete on branch (started 2026-05-29, completed 2026-05-29); awaiting CI green and human squash-merge. Per the locked constraints, **no `terraform apply` was run** — the PR ships infra-as-code only. Demo deployment + screenshots remain a manual operator action documented in `infra/README.md`. 
+- **Active branch:** `feat/m10-deploy` (PR open — see Milestone status) +- **Last completed milestone:** M9 — Evaluation harness (PR #12, merged 2026-05-29) +- **`make check` passing:** baseline green from M9; M10 adds 8 request-id-middleware tests for a backend total of 195. Frontend tests unchanged (7). +- **Last action:** committed M10 in 5 small Conventional Commits (housekeeping; backend structlog + request-id middleware + production Dockerfile + tests; frontend production Dockerfile + nginx.conf.template; Terraform stack with five modules; CD workflow + .dockerignore relocation + CI terraform job). +- **Next action:** human squash-merges the M10 PR. After merge, follow `infra/README.md` to apply the stack, set the GitHub `AWS_ROLE_ARN` secret from the OIDC role output, write the API keys via `aws ssm put-parameter`, dispatch the CD workflow, capture demo screenshots, and `terraform destroy` immediately. Then `/start-milestone 11` for docs + diagram + demo. - **Blockers:** none. -### M9 DoD verification +### M10 DoD verification -- [x] **`make eval` runs end-to-end and writes `eval/RESULTS.md` with metrics, k, dataset size, and method.** The CLI in `eval/run.py` prints a one-line summary per metric and writes `eval/RESULTS.md`. Under fake providers (verified locally) every metric prints `n/a (...)` and the file is left as the methodology-only PENDING document — no numbers ship in the tree until a real run. -- [x] **Methodology is documented well enough to defend verbally in an interview.** `docs/evaluation.md` (224 lines) covers dataset shape, provider pinning, every metric definition (extraction normalization rules, precision@k denominator footnote, lite-faithfulness scope, refusal-rate non-interpretation), the n/a gate, the reproduction recipe, and explicit limits (small dataset, synthetic corpus caveat, no calibration claim, citation-validity vs. true faithfulness). -- [ ] **Numbers are real (from this run). 
Record them in `PROGRESS.md` "Decision log" too.** *Pending* — no API keys wired in this session. The harness contract + asserted-fixture pytest is what merges; real numbers land in the immediate follow-up commit once keys are configured. +- [ ] **`terraform plan` is clean; `apply` provisions the stack.** *Pending* — locally we have no `terraform` binary and the user has explicitly forbidden any `terraform plan`/`apply` or AWS API calls in this session. The infra is wired so a `terraform fmt -check` + `terraform validate` job runs in CI on every PR (no AWS creds needed); plan/apply remains a manual operator step. Confirming this DoD item requires the operator to run `terraform plan` against an AWS account, which is the M11 demo workflow. +- [x] **CD workflow builds and deploys on manual dispatch.** `.github/workflows/cd.yml` is `workflow_dispatch`-only (no `push:`/`pull_request:` triggers — the M10 cost-control invariant), uses `aws-actions/configure-aws-credentials@v4` against an OIDC role written by `infra/modules/ci_oidc/`, builds backend and frontend images, pushes to ECR with the git SHA tag, and force-redeploys the ECS services. +- [x] **App is reachable at a URL** — *infra-as-code complete*. The ALB DNS (`output "alb_dns_name"`) is the URL once `terraform apply` succeeds. Capturing screenshots is the M11 demo task; the operator runs `terraform destroy` immediately after. -### M9 design lock-ins (per pre-flight review, all delivered) +### M10 design lock-ins -- **Metric set.** Extraction: normalized exact-match (trim + casefold strings, ISO date canonicalisation, 0.01 numeric tolerance), micro + macro accuracy, per-field precision/recall (column reported regardless so optional-field schemas later get the right reading without a code change). Retrieval: precision@k (headline) + recall@k + MRR with the precision-cap footnote. RAG: citation-validity rate + answer-cites-relevant rate + answer-substring rate; refusals counted but not interpreted as quality. 
-- **Honesty discipline.** Under `EMBEDDINGS_PROVIDER=fake` retrieval and RAG go to `n/a`; under `LLM_PROVIDER=fake` extraction and RAG go to `n/a`. Counts are still emitted because they describe the dataset, not the system. Asserted-fixture pytest tests prove the scorer + writer; nothing in the test path produces a number that could be misread as a quality claim. -- **What ships.** Harness + 5+6+5 hand-authored synthetic labels + asserted pytest fixtures + methodology-only PENDING `eval/RESULTS.md`. No fabricated numbers in the tree. Real numbers fill the file in the immediate follow-up. -- **Provider pair.** `claude-sonnet-4-6` (verified against Anthropic docs 2026-05-29 — dateless 4.6-generation IDs are pinned snapshots, not evergreen pointers); `text-embedding-3-small` (1536-dim, schema-canonical); temperature 0. +- **Code only.** No `terraform apply`. No AWS resource creation. No incurred costs in this PR. +- **Cost posture.** Public-subnet + no-NAT-Gateway, single-AZ, Fargate `0.25 vCPU / 0.5 GB`, RDS `db.t4g.micro`. NAT Gateway idle cost (~$32/month) avoided. RDS **not publicly accessible** (security-group ingress keyed only to the backend task SG). Idle floor estimate ~$45/month, dominated by ALB + Fargate + RDS. +- **CD trigger.** `workflow_dispatch` only. The trigger gate is the M10 cost-control mechanism. +- **Region.** `us-east-1`. Pinned via `var.region` default. +- **Secrets.** Runtime secrets in SSM Parameter Store (SecureString); written out-of-band so values stay out of Terraform state. CI identity via GitHub OIDC, not long-lived access keys. +- **Demo-only.** `infra/README.md` documents the teardown recipe (`terraform destroy` immediately after demo screenshots) and every cost/security tradeoff (single-AZ, no Multi-AZ, no auto-scaling, no remote state, plain HTTP on the ALB). 
--- @@ -45,8 +47,8 @@ | M6 | Workflow engine | `feat/m06-workflow-engine` | ☑ merged | [#7](https://github.com/div0rce/sentinel/pull/7) | 2026-05-29 | | M7 | Audit log + HITL | `feat/m07-audit-hitl` | ☑ merged | [#8](https://github.com/div0rce/sentinel/pull/8) | 2026-05-29 | | M8 | Frontend | `feat/m08-frontend` | ☑ merged | [#9](https://github.com/div0rce/sentinel/pull/9) | 2026-05-29; perf follow-up [#11](https://github.com/div0rce/sentinel/pull/11) | -| M9 | Evaluation harness | `feat/m09-eval` | ◐ complete on branch (PR open) | _filled in after `gh pr create`_ | 2026-05-29 | -| M10 | Deploy (Docker/Terraform/CD) | `feat/m10-deploy` | ☐ | — | | +| M9 | Evaluation harness | `feat/m09-eval` | ☑ merged | [#12](https://github.com/div0rce/sentinel/pull/12) | 2026-05-29; real-provider numbers tracked in [#13](https://github.com/div0rce/sentinel/issues/13) | +| M10 | Deploy (Docker/Terraform/CD) | `feat/m10-deploy` | ◐ complete on branch (PR open) | _filled in after `gh pr create`_ | 2026-05-29; code-only — no apply ran | | M11 | Docs + diagram + demo | `feat/m11-docs-demo` | ☐ | — | | Status key: ☐ not started · ◐ in progress · ☑ merged diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..002f7d0 --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,68 @@ +# syntax=docker/dockerfile:1.7 +# ---------- builder ---------- +FROM python:3.12-slim AS builder + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + UV_LINK_MODE=copy \ + UV_PYTHON_DOWNLOADS=never + +# Install build essentials only; psycopg[binary] ships its own libpq wheel so we +# don't need libpq-dev / build-essential at runtime. +RUN --mount=type=cache,target=/var/cache/apt \ + --mount=type=cache,target=/var/lib/apt \ + apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates curl \ + && rm -rf /var/lib/apt/lists/* + +# Pinned uv release; matches the local toolchain. Upgrade in lockstep with CI. 
+ADD https://astral.sh/uv/0.4.24/install.sh /uv-installer.sh +RUN sh /uv-installer.sh && rm /uv-installer.sh +ENV PATH="/root/.cargo/bin:/root/.local/bin:${PATH}" + +WORKDIR /app + +# Resolve dependencies into a wheel cache first; only the lockfile gates the cache. +COPY pyproject.toml uv.lock ./ +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --frozen --no-install-project --no-dev + +# Copy application source last so a code-only change does not invalidate the +# dependency layer. +COPY backend ./backend +COPY alembic.ini ./alembic.ini + +# ---------- runtime ---------- +FROM python:3.12-slim AS runtime + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PORT=8000 \ + SENTINEL_LOG_FORMAT=json + +# Non-root user; matches "no root by default" container hygiene. +RUN groupadd --system --gid 1000 sentinel \ + && useradd --system --uid 1000 --gid sentinel --create-home --shell /usr/sbin/nologin sentinel + +WORKDIR /app + +# Bring in the resolved venv + source from the builder. +COPY --from=builder /app /app + +# Drop privileges before any further setup. +USER sentinel + +# Use the venv-managed python; honour $PORT for ECS service-port flexibility. +ENV PATH="/app/.venv/bin:${PATH}" + +EXPOSE 8000 + +# Liveness probe matches the FastAPI /health endpoint shipped in M0. +HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \ + CMD python -c "import sys, urllib.request; \ + urllib.request.urlopen(f'http://127.0.0.1:{__import__(\"os\").environ.get(\"PORT\",\"8000\")}/health', timeout=3); \ + sys.exit(0)" || exit 1 + +# Single uvicorn worker is fine for the demo; ECS scales horizontally on tasks. +CMD ["sh", "-c", "uvicorn backend.app.main:app --host 0.0.0.0 --port ${PORT:-8000}"] diff --git a/backend/app/main.py b/backend/app/main.py index caa920f..885f172 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,21 +1,30 @@ """FastAPI application entrypoint for Sentinel. -M0 added the liveness probe. 
M3 wired in the citation-grounded RAG endpoint at +M0 added the liveness probe. M3 wired the citation-grounded RAG endpoint at ``POST /query``. M4 added schema-constrained extraction at ``POST /extract``. M7 added the human-in-the-loop review queue at ``GET /review`` and -``POST /review/{id}/approve|reject``. M8 adds dashboard KPI feeds at -``GET /dashboard/{volume,categories,confidence,sla}``; the React UI consumes them. +``POST /review/{id}/approve|reject``. M8 added dashboard KPI feeds at +``GET /dashboard/{volume,categories,confidence,sla}``. M10 adds structured +logging + the request-id middleware so every log line carries the request id +and every response surfaces it on ``X-Request-Id``. """ from fastapi import FastAPI +from backend.app.observability import RequestIdMiddleware, configure_logging from backend.app.routers.dashboard import router as dashboard_router from backend.app.routers.extract import router as extract_router from backend.app.routers.query import router as query_router from backend.app.routers.review import router as review_router +configure_logging() + app = FastAPI(title="Sentinel", version="0.1.0") +# Add the request-id middleware *before* including routers so every handler runs +# with the structlog context bound. +app.add_middleware(RequestIdMiddleware) + app.include_router(query_router) app.include_router(extract_router) app.include_router(review_router) diff --git a/backend/app/observability.py b/backend/app/observability.py new file mode 100644 index 0000000..84b69c8 --- /dev/null +++ b/backend/app/observability.py @@ -0,0 +1,125 @@ +"""Structured logging + a request-id middleware (M10). + +Two responsibilities: + +* :func:`configure_logging` wires ``structlog`` for JSON output suitable for + CloudWatch / any log aggregator that ingests stdout. Production logs are + one-line JSON with a stable schema; local development can flip to a friendlier + console renderer via the ``SENTINEL_LOG_FORMAT=console`` env var. 
+* :class:`RequestIdMiddleware` assigns a stable id to every HTTP request, binds + it to the structlog context, surfaces it on the response as + ``X-Request-Id``, and exposes it on ``request.state.request_id`` so + application code (notably :mod:`backend.app.audit`) can persist it. + +Tests in ``backend/tests/test_request_id.py`` pin the middleware contract. +""" + +from __future__ import annotations + +import logging +import os +import uuid +from collections.abc import Awaitable, Callable + +import structlog +from starlette.middleware.base import BaseHTTPMiddleware +from starlette.requests import Request +from starlette.responses import Response + +REQUEST_ID_HEADER = "X-Request-Id" +REQUEST_ID_LENGTH_LIMIT = 64 + + +def configure_logging() -> None: + """Configure structlog + the stdlib root logger for the application. + + Idempotent. Safe to call from app startup *and* from CLIs (``make seed``, + ``make eval``) so every entry point produces the same shape of log. + """ + log_level_name = os.environ.get("SENTINEL_LOG_LEVEL", "INFO").upper() + level = logging.getLevelNamesMapping().get(log_level_name, logging.INFO) + + logging.basicConfig( + format="%(message)s", + level=level, + force=True, + ) + + use_console = os.environ.get("SENTINEL_LOG_FORMAT", "json").lower() == "console" + renderer: structlog.types.Processor + if use_console: + renderer = structlog.dev.ConsoleRenderer(colors=True) + else: + renderer = structlog.processors.JSONRenderer() + + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + renderer, + ], + wrapper_class=structlog.make_filtering_bound_logger(level), + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def _generate_request_id() -> str: + return uuid.uuid4().hex + + +def 
_sanitise_inbound(value: str) -> str | None: + """Accept caller-supplied request ids if they are short and printable. + + Inbound headers are untrusted; we strip them to length and to a conservative + character set so a hostile client cannot push attacker-controlled bytes + into our log pipeline. + """ + candidate = value.strip() + if not candidate or len(candidate) > REQUEST_ID_LENGTH_LIMIT: + return None + if not all(c.isalnum() or c in "-_" for c in candidate): + return None + return candidate + + +class RequestIdMiddleware(BaseHTTPMiddleware): + """Bind a request id to every request, the structlog context, and the response.""" + + HEADER_NAME = REQUEST_ID_HEADER + + async def dispatch( + self, + request: Request, + call_next: Callable[[Request], Awaitable[Response]], + ) -> Response: + inbound = request.headers.get(self.HEADER_NAME, "") + request_id = _sanitise_inbound(inbound) or _generate_request_id() + request.state.request_id = request_id + + # Bind for the duration of the request so any structlog call inside the + # handler picks up the request_id without plumbing it through. + token = structlog.contextvars.bind_contextvars( + request_id=request_id, + method=request.method, + path=request.url.path, + ) + try: + response = await call_next(request) + finally: + # ``token`` is a Mapping[str, contextvars.Token]; clear-by-key is the + # supported way to undo the bind on exit. 
+ structlog.contextvars.unbind_contextvars(*token.keys()) + + response.headers[self.HEADER_NAME] = request_id + return response + + +def get_request_id(request: Request) -> str | None: + """Convenience getter for handlers that want to forward the id (e.g., to + :func:`backend.app.audit.emit_*`).""" + return getattr(request.state, "request_id", None) diff --git a/backend/tests/test_request_id.py b/backend/tests/test_request_id.py new file mode 100644 index 0000000..5d7e6ac --- /dev/null +++ b/backend/tests/test_request_id.py @@ -0,0 +1,66 @@ +"""Tests for the M10 request-id middleware.""" + +from __future__ import annotations + +import re +from collections.abc import Iterator + +import pytest +from fastapi.testclient import TestClient +from sqlalchemy.orm import Session + +from backend.app.db import get_session +from backend.app.main import app +from backend.app.observability import REQUEST_ID_HEADER + +UUID_HEX = re.compile(r"^[a-f0-9]{32}$") + + +@pytest.fixture +def client(session: Session) -> Iterator[TestClient]: + def override_session() -> Iterator[Session]: + yield session + + app.dependency_overrides[get_session] = override_session + try: + yield TestClient(app) + finally: + app.dependency_overrides.clear() + + +def test_response_carries_a_generated_request_id(client: TestClient) -> None: + resp = client.get("/health") + assert resp.status_code == 200 + assert REQUEST_ID_HEADER in resp.headers + request_id = resp.headers[REQUEST_ID_HEADER] + assert UUID_HEX.match(request_id), f"unexpected request id format: {request_id!r}" + + +def test_inbound_request_id_is_echoed_when_safe(client: TestClient) -> None: + inbound = "client-supplied-abc123" + resp = client.get("/health", headers={REQUEST_ID_HEADER: inbound}) + assert resp.headers[REQUEST_ID_HEADER] == inbound + + +@pytest.mark.parametrize( + "rogue", + [ + "x" * 128, # too long + "spaces here", # space disallowed + "newline\nhere", # control char + ";rm -rf /", # punctuation outside [-_] + "", # empty + ], +) 
+def test_unsafe_inbound_request_ids_are_replaced(client: TestClient, rogue: str) -> None: + resp = client.get("/health", headers={REQUEST_ID_HEADER: rogue}) + out = resp.headers[REQUEST_ID_HEADER] + assert out != rogue + # The replacement is the generated UUID hex form. + assert UUID_HEX.match(out), f"replacement did not look generated: {out!r}" + + +def test_each_request_gets_a_distinct_generated_id(client: TestClient) -> None: + a = client.get("/health").headers[REQUEST_ID_HEADER] + b = client.get("/health").headers[REQUEST_ID_HEADER] + assert a != b diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 0000000..1a55970 --- /dev/null +++ b/frontend/.dockerignore @@ -0,0 +1,19 @@ +node_modules/ +dist/ +.vite/ +coverage/ +*.log +*.tsbuildinfo +.DS_Store +.env.local + +# Test files don't need to ship in the image +src/**/__tests__/ +src/test/ + +# Repo-level meta +.git/ +.github/ +.kiro/ +.claude/ +.agents/ diff --git a/frontend/Dockerfile b/frontend/Dockerfile new file mode 100644 index 0000000..69fbadf --- /dev/null +++ b/frontend/Dockerfile @@ -0,0 +1,44 @@ +# syntax=docker/dockerfile:1.7 +# ---------- builder ---------- +FROM node:20-alpine AS builder + +WORKDIR /app + +# Install deps from the lockfile only first so a code change does not bust the +# dependency layer. +COPY package.json package-lock.json ./ +RUN --mount=type=cache,target=/root/.npm \ + npm ci + +COPY . . + +# Vite emits ./dist with hashed asset names. tsc -b runs as part of `npm run +# build` and fails the build on any type error. +ARG VITE_API_BASE=/api +ENV VITE_API_BASE=${VITE_API_BASE} +RUN npm run build + +# ---------- runtime ---------- +FROM nginx:1.27-alpine AS runtime + +# nginx default config substitutes $BACKEND_URL via envsubst on container start +# so the same image is portable across environments. The ECS task definition +# sets BACKEND_URL to the backend service-discovery DNS name in the cluster. 
+ENV BACKEND_URL=http://backend:8000 + +COPY nginx.conf.template /etc/nginx/templates/default.conf.template +COPY --from=builder /app/dist /usr/share/nginx/html + +RUN set -eux; \ + mkdir -p /var/cache/nginx/client_temp /var/cache/nginx/proxy_temp \ + /var/cache/nginx/fastcgi_temp /var/cache/nginx/uwsgi_temp \ + /var/cache/nginx/scgi_temp; \ + touch /run/nginx.pid; \ + chown -R nginx:nginx /usr/share/nginx/html /etc/nginx/conf.d \ + /var/cache/nginx /var/log/nginx /var/run /run/nginx.pid + +EXPOSE 8080 +USER nginx + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD wget --quiet --spider http://127.0.0.1:8080/ || exit 1 diff --git a/frontend/nginx.conf.template b/frontend/nginx.conf.template new file mode 100644 index 0000000..10a0ebf --- /dev/null +++ b/frontend/nginx.conf.template @@ -0,0 +1,43 @@ +# nginx config for the M10 demo deployment. Serves the Vite-built SPA and +# reverse-proxies the backend FastAPI under the /api namespace so React Router +# routes such as /review and /dashboard always resolve to the SPA. +# +# ${BACKEND_URL} is substituted by nginx's official-image entrypoint via +# envsubst at container start. The ECS task definition sets it to the service +# discovery DNS for the backend. + +server { + listen 8080 default_server; + server_name _; + + # SPA assets + root /usr/share/nginx/html; + index index.html; + + # Standard SPA routing fallback so React Router routes (e.g. /review) + # resolve to the same index.html. + location / { + try_files $uri $uri/ /index.html; + } + + # API surface — strip the deployment-only /api prefix before forwarding to + # FastAPI, whose public endpoints remain /query, /review, /dashboard, etc. 
+ location ^~ /api/ { + rewrite ^/api(/.*)$ $1 break; + proxy_pass ${BACKEND_URL}; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + # Forward (and accept) the request id so logs are correlated end-to-end. + proxy_pass_request_headers on; + proxy_read_timeout 60s; + } + + # Cache hashed Vite assets aggressively; everything else short. + location ~* \.(?:js|css|woff2?|png|jpg|svg)$ { + expires 1y; + add_header Cache-Control "public, immutable"; + } +} diff --git a/frontend/src/api.ts b/frontend/src/api.ts index 9942730..926a7ce 100644 --- a/frontend/src/api.ts +++ b/frontend/src/api.ts @@ -4,8 +4,9 @@ * Each function maps 1:1 to a backend endpoint. The response interfaces mirror * the backend Pydantic shapes — keep them in lockstep when either side changes. * The base URL defaults to "" so paths resolve same-origin against the Vite - * dev-server proxy or the deployed reverse proxy; override with `?api=...` URL - * parameter or VITE_API_BASE env var if needed. + * dev-server proxy. The deployed Docker image builds with VITE_API_BASE=/api so + * nginx can separate API traffic from React Router UI paths. Override with + * `?api=...` URL parameter or VITE_API_BASE env var if needed. */ const DEFAULT_BASE = ""; diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..dd6df60 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,240 @@ +# Sentinel infrastructure (Terraform) + +Deployment target: AWS, `us-east-1`, **demo only**. 
+ +This directory provisions everything the M10 demo needs: a VPC, an ECS Fargate +cluster running the backend + frontend tasks, an RDS Postgres instance with the +`vector` extension enabled at migration time, ECR repositories for the two +images, SSM Parameter Store entries for the runtime secrets, and a tightly +scoped GitHub Actions OIDC role that the manual-dispatch CD workflow assumes. + +> **Read the cost & security posture below before running `apply`. The default +> configuration is engineered for a teardown-after-screenshots demo, not a +> production deployment.** + +--- + +## Cost & security posture (deliberate, demo-only) + +### Public-subnet / no-NAT + +The VPC has two `/24` public subnets and **no NAT Gateway**. ECS tasks live in +those public subnets and get assigned public IPs (`assign_public_ip = true`) +so they can reach ECR for image pulls and Anthropic / OpenAI for outbound API +calls. + +This is chosen because a NAT Gateway is the largest avoidable line item in any +small AWS deployment (≈$32/month idle, plus ~$0.045/GB processed). For a demo +that gets `terraform destroy`'d after screenshots, the saving is meaningful and +the security tradeoffs are acceptable **with tight security groups** (below). + +If you ever lift this past the demo: **add private subnets and a NAT Gateway** +(or VPC interface endpoints for ECR / SSM / CloudWatch) and move the ECS tasks +there. Track that as the first item in the production-readiness backlog. + +### RDS is not publicly accessible + +Hard invariant. `aws_db_instance.publicly_accessible = false` is wired in +`modules/rds/main.tf` and the `rds` security group ingress is keyed only to the +backend task SG (`modules/network/main.tf`). Even though RDS lives in the same +public subnets as the tasks, the security group prevents internet reach. 
+ +### Reachability graph (encoded in security groups) + +``` +internet ──→ alb_sg (80, 443) +alb_sg ──→ frontend_sg (8080) ALB → nginx +alb_sg ──→ backend_sg (8000) ALB → FastAPI /health +frontend_sg ─→ backend_sg (8000) nginx /api proxy → FastAPI +backend_sg ──→ rds_sg (5432) FastAPI → Postgres +``` + +Egress is open on the task SGs (so containers can reach ECR / Anthropic / +OpenAI / CloudWatch). RDS has no egress. + +### Public routing + +The ALB default target group is the frontend service, so `/`, `/review`, and +`/dashboard` all serve the React SPA even on hard refreshes or shared links. +The deployed frontend is built with `VITE_API_BASE=/api`; nginx proxies only +`/api/*` to FastAPI and strips the `/api` prefix before forwarding. `/health` +is the only public path routed directly from the ALB to the backend target +group so backend health checks remain backend-specific. + +### Single-AZ everywhere it matters + +- RDS: `multi_az = false`, `db.t4g.micro`, 20 GB storage. Fine for the demo; + unsuitable for production. +- ECS: `desired_count = 1` per service. A single task per service is the + cheapest viable footprint; no auto-scaling. + +### Backups, logs, deletion + +- RDS: 1-day backup retention, `skip_final_snapshot = true`, + `deletion_protection = false`. `terraform destroy` is therefore cheap and + doesn't leave behind a final snapshot you'd forget to delete. +- CloudWatch Logs: `log_retention_days = 7` for the ECS task log groups. +- ECR: 7-day untagged-image expiry, 20-image cap. + +--- + +## What this provisions (rough cost shape) + +The numbers below are order-of-magnitude estimates against the AWS public price +list as of 2026-05; they exist to make "is this OK to leave running overnight?" +answerable without re-reading docs. 
**Use AWS's actual cost calculator for
+binding numbers.**
+
+| Resource | Approx idle cost | Notes |
+| --------------------- | ---------------: | --------------------------------------------- |
+| ALB | ~$16/mo + LCU | Always-on; the largest fixed line item in this table. |
+| Fargate (2 tasks 0.25 vCPU / 0.5 GB) | ~$15/mo | 24/7. Stop the services to stop the bill. |
+| RDS db.t4g.micro 20 GB | ~$13/mo | Single-AZ. ~$2/mo storage + ~$11/mo compute. |
+| ECR storage | <$1/mo | 20-image cap on each repo. |
+| Secrets / SSM | $0 | Standard parameters, not Advanced. |
+| CloudWatch Logs | <$1/mo | 7-day retention; demo log volume is tiny. |
+| Data transfer | variable | Outbound from ECS tasks → Anthropic/OpenAI. |
+| **Total idle floor** | **~$45/mo** | Sum of the rows above; LCU and data-transfer charges are extra. |
+
+`terraform destroy` removes all of the above. Run it the moment screenshots
+are captured.
+
+---
+
+## Apply / destroy recipe
+
+### Pre-flight (one-time)
+
+1. AWS account with IAM permissions to create the resources above.
+2. AWS CLI configured (`aws configure` or equivalent — local profile, OIDC, or
+   `AWS_PROFILE`).
+3. A strong RDS master password. **Never commit it.** Pass via env (hex, not base64: RDS rejects `/` in the master password and base64 output can contain it):
+   ```bash
+   export TF_VAR_db_password="$(openssl rand -hex 24)"
+   ```
+4. A GitHub repo for the OIDC role's trust policy:
+   ```bash
+   export TF_VAR_github_repository="OWNER/sentinel"
+   ```
+   Leave unset to skip the OIDC role (manual deploys only).
+
+### Validate without applying
+
+```bash
+cd infra/
+terraform fmt -recursive -check
+terraform init # downloads providers; no AWS calls
+terraform validate
+```
+
+`terraform fmt`, `init`, and `validate` make no AWS API calls.
+
+### Apply (this is the cost moment)
+
+```bash
+terraform plan -out=plan.tfplan # READ THIS BEFORE APPLY
+terraform apply plan.tfplan
+```
+
+After apply succeeds:
+
+```bash
+terraform output ci_role_arn # if github_repository was supplied
+```
+
+Add that ARN to the repo's `AWS_ROLE_ARN` secret (Settings → Secrets and
+variables → Actions). The CD workflow assumes this role via OIDC.
+
+### Write the runtime secrets out-of-band
+
+```bash
+aws ssm put-parameter --name /sentinel/anthropic_api_key \
+  --type SecureString --value "$ANTHROPIC_API_KEY" --overwrite
+
+aws ssm put-parameter --name /sentinel/openai_api_key \
+  --type SecureString --value "$OPENAI_API_KEY" --overwrite
+```
+
+(`/sentinel/database_url` is composed by Terraform from the RDS outputs and
+already populated.)
+
+Then bounce the backend service so the new secret values are picked up:
+
+```bash
+aws ecs update-service \
+  --cluster sentinel-cluster --service sentinel-backend \
+  --force-new-deployment --no-cli-pager
+```
+
+### Run migrations + seed
+
+The backend image does **not** run migrations at task start — by design. Run
+them once, manually, from inside the VPC: RDS is not publicly accessible, so the
+public ALB DNS does not reach Postgres. The simplest path for the demo is a
+short-lived one-off Fargate task carrying the backend security group that runs
+`alembic upgrade head` and `python -m backend.app.ingest --path data/sample`.
+Exec'ing into a running task is not available as shipped, because the backend
+service sets `enable_execute_command = false`. Recipe in `docs/demo.md` (M11).
+
+### Deploy via CD
+
+Manual dispatch only. From the GitHub UI: Actions → CD → Run workflow → choose
+`backend` / `frontend` / `both`. Workflow:
+
+1. Builds the requested images.
+2. Pushes to ECR with the git SHA tag.
+3. `aws ecs update-service --force-new-deployment` for each service.
+
+### Destroy
+
+```bash
+terraform destroy
+```
+
+Removes everything provisioned by this configuration, including ECR images
+(force_delete = true on the repos so destroy doesn't hang on lingering tags).
+ +> **Tear down immediately after capturing screenshots.** Leaving the stack +> running overnight costs ~$1.50; leaving it for a month costs ~$45. + +--- + +## What's not in this directory + +- **No remote state.** Terraform state lives locally as `terraform.tfstate`. + This is appropriate for a single-operator demo; for any second user, convert + to an S3 backend + DynamoDB lock table first. Scope and recipe are out of + M10. +- **No TLS certificate / Route 53.** The ALB serves plain HTTP on port 80. For + a real demo, attach an ACM cert and add a 443 listener; the ALB SG already + permits 443 ingress. +- **No CloudFront / WAF / observability beyond `/health` + structured logs.** + Out of M10. +- **No auto-scaling rules.** `desired_count = 1` per service. Edit the + `aws_ecs_service` blocks in `modules/ecs/main.tf` to change. + +--- + +## Module map + +``` +infra/ +├── versions.tf provider pins (aws ~> 5.70, random ~> 3.6) +├── variables.tf project_name, region, db creds, image tags, github_repository +├── main.tf wires the modules +├── outputs.tf ALB DNS, ECR URLs, ECS names, RDS endpoint, CI role ARN +└── modules/ + ├── network/ VPC, 2 public subnets, IGW, public RT, 4 SGs + ├── ecr/ two repos with lifecycle policies + ├── secrets/ SSM Parameter Store entries (API keys + DATABASE_URL) + ├── rds/ Postgres 16.4 db.t4g.micro single-AZ, parameter group + ├── ecs/ cluster, ALB + target groups + listener, task defs, services, log groups, IAM + └── ci_oidc/ GitHub Actions OIDC provider + role (scoped to ECR push + ECS update-service) +``` + +--- + +## Validation in CI + +The CI workflow does **not** run `terraform plan` or `apply`. It does run +`terraform fmt -check` and `terraform validate` against this directory in a +job that does not need AWS credentials, so a syntax or wiring regression is +caught on every PR. Plan/apply remain a manual operator action. 
diff --git a/infra/main.tf b/infra/main.tf
new file mode 100644
index 0000000..b25380b
--- /dev/null
+++ b/infra/main.tf
@@ -0,0 +1,91 @@
+data "aws_availability_zones" "available" {
+  state = "available"
+}
+
+locals {
+  common_tags = {
+    Project     = var.project_name
+    Environment = var.environment
+    ManagedBy   = "terraform"
+    Repository  = var.github_repository
+  }
+
+  # Pick the first two AZs in the region, one per public subnet.
+  azs = slice(data.aws_availability_zones.available.names, 0, 2)
+}
+
+module "network" {
+  source = "./modules/network"
+
+  project_name        = var.project_name
+  vpc_cidr            = var.vpc_cidr
+  public_subnet_cidrs = var.public_subnet_cidrs
+  availability_zones  = local.azs
+}
+
+module "ecr" {
+  source = "./modules/ecr"
+
+  project_name = var.project_name
+}
+
+# RDS is attached to the dedicated rds SG from the network module, whose only
+# ingress is 5432 from the backend task SG (RDS is not publicly accessible).
+module "rds" {
+  source = "./modules/rds"
+
+  project_name      = var.project_name
+  vpc_id            = module.network.vpc_id
+  subnet_ids        = module.network.public_subnet_ids
+  ingress_sg_id     = module.network.rds_sg_id # becomes the instance's vpc_security_group_ids
+  db_name           = var.db_name
+  db_username       = var.db_username
+  db_password       = var.db_password
+  instance_class    = var.db_instance_class
+  allocated_storage = var.db_allocated_storage
+}
+
+# Secrets module composes the DATABASE_URL from rds outputs and owns the API key
+# parameters. ECS depends on its outputs.
+module "secrets" { + source = "./modules/secrets" + + project_name = var.project_name + db_endpoint = module.rds.db_endpoint + db_name = var.db_name + db_username = var.db_username + db_password = var.db_password +} + +module "ecs" { + source = "./modules/ecs" + + project_name = var.project_name + region = var.region + vpc_id = module.network.vpc_id + public_subnet_ids = module.network.public_subnet_ids + alb_sg_id = module.network.alb_sg_id + backend_sg_id = module.network.backend_sg_id + frontend_sg_id = module.network.frontend_sg_id + backend_image = "${module.ecr.backend_repository_url}:${var.backend_image_tag}" + frontend_image = "${module.ecr.frontend_repository_url}:${var.frontend_image_tag}" + backend_desired_count = var.backend_desired_count + frontend_desired_count = var.frontend_desired_count + log_retention_days = var.log_retention_days + + database_url_secret_arn = module.secrets.database_url_arn + anthropic_key_secret_arn = module.secrets.anthropic_key_arn + openai_key_secret_arn = module.secrets.openai_key_arn +} + +# OIDC role for the GitHub Actions CD workflow. Created only when a repo is supplied. +module "ci_oidc" { + source = "./modules/ci_oidc" + count = var.github_repository == "" ? 0 : 1 + + project_name = var.project_name + github_repository = var.github_repository + ecr_repository_arns = [module.ecr.backend_repository_arn, module.ecr.frontend_repository_arn] + ecs_cluster_arn = module.ecs.cluster_arn + ecs_service_arns = [module.ecs.backend_service_arn, module.ecs.frontend_service_arn] +} diff --git a/infra/modules/ci_oidc/main.tf b/infra/modules/ci_oidc/main.tf new file mode 100644 index 0000000..6e4c69d --- /dev/null +++ b/infra/modules/ci_oidc/main.tf @@ -0,0 +1,110 @@ +# GitHub Actions OIDC role for the manual-dispatch CD workflow. 
+# +# What it lets CI do (only): +# - get an ECR auth token +# - push images to the two project ECR repos +# - update the two ECS services (force a redeployment with a new image tag) +# +# What it does NOT let CI do: +# - create new IAM roles/policies +# - touch RDS, secrets, the ALB, or the network +# - read/write S3, run Lambda, anything outside ECR + ECS +# +# Trust policy is scoped to one repo (var.github_repository). Bumping it requires +# changing infra explicitly — no surprise repo can assume this role. + +data "aws_caller_identity" "current" {} + +# Reuse a single account-level OIDC provider for token.actions.githubusercontent.com. +# If one already exists, import it before applying. +resource "aws_iam_openid_connect_provider" "github" { + url = "https://token.actions.githubusercontent.com" + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = ["6938fd4d98bab03faadb97b34396831e3780aea1"] # GitHub Actions root CA, current as of 2025/2026. +} + +data "aws_iam_policy_document" "ci_assume" { + statement { + actions = ["sts:AssumeRoleWithWebIdentity"] + principals { + type = "Federated" + identifiers = [aws_iam_openid_connect_provider.github.arn] + } + condition { + test = "StringEquals" + variable = "token.actions.githubusercontent.com:aud" + values = ["sts.amazonaws.com"] + } + condition { + test = "StringLike" + variable = "token.actions.githubusercontent.com:sub" + values = ["repo:${var.github_repository}:*"] + } + } +} + +resource "aws_iam_role" "ci" { + name = "${var.project_name}-ci" + assume_role_policy = data.aws_iam_policy_document.ci_assume.json +} + +data "aws_iam_policy_document" "ci_permissions" { + # ECR auth (account-level) + push to the two project repos only. 
+ statement { + sid = "EcrAuth" + actions = ["ecr:GetAuthorizationToken"] + resources = ["*"] + } + statement { + sid = "EcrPush" + actions = [ + "ecr:BatchCheckLayerAvailability", + "ecr:CompleteLayerUpload", + "ecr:InitiateLayerUpload", + "ecr:PutImage", + "ecr:UploadLayerPart", + "ecr:DescribeRepositories", + "ecr:DescribeImages", + ] + resources = var.ecr_repository_arns + } + + # ECS: force a new deployment on the two project services in this cluster. + statement { + sid = "EcsDescribe" + actions = ["ecs:DescribeServices", "ecs:DescribeTasks", "ecs:ListTasks"] + resources = ["*"] + } + statement { + sid = "EcsUpdate" + actions = [ + "ecs:UpdateService", + "ecs:DescribeTaskDefinition", + "ecs:RegisterTaskDefinition", + ] + resources = concat([var.ecs_cluster_arn], var.ecs_service_arns) + } + statement { + # RegisterTaskDefinition expects an unscoped resource; allow it but the only + # role this CI principal can pass is the task-execution / task-app role, + # which is implicit (CD will reuse the existing definition's role ARNs). + sid = "EcsPassRole" + actions = ["iam:PassRole"] + resources = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${var.project_name}-task-*"] + condition { + test = "StringEquals" + variable = "iam:PassedToService" + values = ["ecs-tasks.amazonaws.com"] + } + } +} + +resource "aws_iam_policy" "ci" { + name = "${var.project_name}-ci" + policy = data.aws_iam_policy_document.ci_permissions.json +} + +resource "aws_iam_role_policy_attachment" "ci" { + role = aws_iam_role.ci.name + policy_arn = aws_iam_policy.ci.arn +} diff --git a/infra/modules/ci_oidc/outputs.tf b/infra/modules/ci_oidc/outputs.tf new file mode 100644 index 0000000..8f4acca --- /dev/null +++ b/infra/modules/ci_oidc/outputs.tf @@ -0,0 +1,4 @@ +output "role_arn" { + description = "ARN of the GitHub Actions OIDC role. Add to the repo's AWS_ROLE_ARN secret." 
+ value = aws_iam_role.ci.arn +} diff --git a/infra/modules/ci_oidc/variables.tf b/infra/modules/ci_oidc/variables.tf new file mode 100644 index 0000000..6c21110 --- /dev/null +++ b/infra/modules/ci_oidc/variables.tf @@ -0,0 +1,24 @@ +variable "project_name" { + type = string +} + +variable "github_repository" { + description = "owner/name. Trust policy is scoped to repo:OWNER/NAME:* (any branch, ref, env)." + type = string + validation { + condition = can(regex("^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$", var.github_repository)) + error_message = "github_repository must be in 'owner/name' form." + } +} + +variable "ecr_repository_arns" { + type = list(string) +} + +variable "ecs_cluster_arn" { + type = string +} + +variable "ecs_service_arns" { + type = list(string) +} diff --git a/infra/modules/ecr/main.tf b/infra/modules/ecr/main.tf new file mode 100644 index 0000000..03190dc --- /dev/null +++ b/infra/modules/ecr/main.tf @@ -0,0 +1,56 @@ +locals { + repos = { + backend = "${var.project_name}-backend" + frontend = "${var.project_name}-frontend" + } +} + +resource "aws_ecr_repository" "this" { + for_each = local.repos + name = each.value + image_tag_mutability = "MUTABLE" + force_delete = true # demo posture: terraform destroy must not fail on lingering tagged images. + + image_scanning_configuration { + scan_on_push = true + } + + encryption_configuration { + encryption_type = "AES256" + } + + tags = { Name = each.value } +} + +# Lifecycle: prune untagged images after 7 days; cap tagged images at 20 to +# keep storage cost predictable across rebuilds. 
+resource "aws_ecr_lifecycle_policy" "this" {
+  for_each   = aws_ecr_repository.this # one policy per repository created above
+  repository = each.value.name
+
+  policy = jsonencode({
+    rules = [
+      {
+        rulePriority = 1
+        description  = "Expire untagged images after 7 days"
+        selection = {
+          tagStatus   = "untagged"
+          countType   = "sinceImagePushed"
+          countUnit   = "days"
+          countNumber = 7
+        }
+        action = { type = "expire" }
+      },
+      {
+        rulePriority = 2
+        description  = "Keep only the 20 most recent images (tagged or untagged)"
+        selection = {
+          tagStatus   = "any" # counts every image, not just tagged ones
+          countType   = "imageCountMoreThan"
+          countNumber = 20
+        }
+        action = { type = "expire" }
+      }
+    ]
+  })
+}
diff --git a/infra/modules/ecr/outputs.tf b/infra/modules/ecr/outputs.tf
new file mode 100644
index 0000000..748a77f
--- /dev/null
+++ b/infra/modules/ecr/outputs.tf
@@ -0,0 +1,15 @@
+output "backend_repository_url" {
+  value = aws_ecr_repository.this["backend"].repository_url
+}
+
+output "frontend_repository_url" {
+  value = aws_ecr_repository.this["frontend"].repository_url
+}
+
+output "backend_repository_arn" {
+  value = aws_ecr_repository.this["backend"].arn
+}
+
+output "frontend_repository_arn" {
+  value = aws_ecr_repository.this["frontend"].arn
+}
diff --git a/infra/modules/ecr/variables.tf b/infra/modules/ecr/variables.tf
new file mode 100644
index 0000000..514dc79
--- /dev/null
+++ b/infra/modules/ecr/variables.tf
@@ -0,0 +1,3 @@
+variable "project_name" {
+  type = string
+}
diff --git a/infra/modules/ecs/main.tf b/infra/modules/ecs/main.tf
new file mode 100644
index 0000000..172c1cd
--- /dev/null
+++ b/infra/modules/ecs/main.tf
@@ -0,0 +1,340 @@
+# ECS cluster, ALB, and two Fargate task definitions. The frontend serves the
+# SPA over nginx on port 8080 and reverse-proxies /api/* to the backend service
+# via service discovery. The ALB default target is the frontend so /, /review,
+# and /dashboard all serve the React SPA. Only backend health checks bypass
+# nginx and route straight to FastAPI.
+ +# --- log groups --------------------------------------------------------------- + +resource "aws_cloudwatch_log_group" "backend" { + name = "/ecs/${var.project_name}-backend" + retention_in_days = var.log_retention_days +} + +resource "aws_cloudwatch_log_group" "frontend" { + name = "/ecs/${var.project_name}-frontend" + retention_in_days = var.log_retention_days +} + +# --- IAM ---------------------------------------------------------------------- +# +# Two roles per ECS task: +# - execution role: pulls the image from ECR, writes to CloudWatch Logs, and +# reads the SSM SecureString parameters at task start. +# - task role: the application's runtime identity. The backend uses it +# for nothing today (the LLM/embeddings keys come in via secrets, not via +# a role); the role exists so we can attach policies cleanly when an M11+ +# feature needs them. + +data "aws_iam_policy_document" "ecs_assume" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["ecs-tasks.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "task_execution" { + name = "${var.project_name}-task-execution" + assume_role_policy = data.aws_iam_policy_document.ecs_assume.json +} + +resource "aws_iam_role_policy_attachment" "task_execution_managed" { + role = aws_iam_role.task_execution.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +# Allow the execution role to read the SecureString parameters that back the +# task definition's `secrets` block. Scoped tightly to our parameter ARNs. +data "aws_iam_policy_document" "task_execution_secrets" { + statement { + actions = [ + "ssm:GetParameter", + "ssm:GetParameters", + ] + resources = [ + var.database_url_secret_arn, + var.anthropic_key_secret_arn, + var.openai_key_secret_arn, + ] + } + statement { + actions = ["kms:Decrypt"] + resources = ["*"] # SSM SecureString uses the AWS-managed alias/aws/ssm key. 
+ condition { + test = "StringEquals" + variable = "kms:ViaService" + values = ["ssm.${var.region}.amazonaws.com"] + } + } +} + +resource "aws_iam_policy" "task_execution_secrets" { + name = "${var.project_name}-task-execution-secrets" + policy = data.aws_iam_policy_document.task_execution_secrets.json +} + +resource "aws_iam_role_policy_attachment" "task_execution_secrets" { + role = aws_iam_role.task_execution.name + policy_arn = aws_iam_policy.task_execution_secrets.arn +} + +resource "aws_iam_role" "task_app" { + name = "${var.project_name}-task-app" + assume_role_policy = data.aws_iam_policy_document.ecs_assume.json +} + +# --- cluster ------------------------------------------------------------------ + +resource "aws_ecs_cluster" "this" { + name = "${var.project_name}-cluster" + + setting { + name = "containerInsights" + value = "disabled" # cost posture; flip to enabled when there's a bill to justify it. + } +} + +# --- ALB ---------------------------------------------------------------------- + +resource "aws_lb" "this" { + name = "${var.project_name}-alb" + internal = false + load_balancer_type = "application" + security_groups = [var.alb_sg_id] + subnets = var.public_subnet_ids + idle_timeout = 60 +} + +resource "aws_lb_target_group" "frontend" { + name = "${var.project_name}-frontend" + port = 8080 + protocol = "HTTP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + path = "/" + healthy_threshold = 2 + unhealthy_threshold = 3 + interval = 15 + timeout = 5 + matcher = "200-399" + } +} + +resource "aws_lb_target_group" "backend" { + name = "${var.project_name}-backend" + port = 8000 + protocol = "HTTP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + path = "/health" + healthy_threshold = 2 + unhealthy_threshold = 3 + interval = 15 + timeout = 5 + matcher = "200" + } +} + +resource "aws_lb_listener" "http" { + load_balancer_arn = aws_lb.this.arn + port = 80 + protocol = "HTTP" + + default_action { + type = "forward" + 
target_group_arn = aws_lb_target_group.frontend.arn + } +} + +# Backend health checks stay backend-specific. API calls use the ALB default +# frontend target and are proxied by nginx under /api/*, which lets nginx strip +# the deployment namespace before FastAPI sees the request path. +resource "aws_lb_listener_rule" "backend" { + listener_arn = aws_lb_listener.http.arn + priority = 100 + + action { + type = "forward" + target_group_arn = aws_lb_target_group.backend.arn + } + + condition { + path_pattern { + values = ["/health"] + } + } +} + +# --- service discovery (private namespace, used by nginx → backend) ---------- + +resource "aws_service_discovery_private_dns_namespace" "this" { + name = "${var.project_name}.local" + vpc = var.vpc_id +} + +resource "aws_service_discovery_service" "backend" { + name = "backend" + + dns_config { + namespace_id = aws_service_discovery_private_dns_namespace.this.id + routing_policy = "MULTIVALUE" + + dns_records { + ttl = 10 + type = "A" + } + } + + health_check_custom_config { + failure_threshold = 1 + } +} + +# --- task definitions --------------------------------------------------------- + +locals { + backend_container = jsonencode([ + { + name = "backend" + image = var.backend_image + essential = true + portMappings = [ + { containerPort = 8000, protocol = "tcp" } + ] + environment = [ + { name = "PORT", value = "8000" }, + { name = "EMBEDDINGS_PROVIDER", value = "openai" }, + { name = "LLM_PROVIDER", value = "anthropic" }, + { name = "EMBEDDING_DIM", value = "1536" }, + { name = "OPENAI_EMBEDDING_MODEL", value = "text-embedding-3-small" }, + { name = "CLAUDE_MODEL", value = "claude-sonnet-4-6" }, + { name = "LLM_TEMPERATURE", value = "0.0" }, + { name = "PII_REDACTION_ENABLED", value = "true" }, + { name = "SENTINEL_LOG_FORMAT", value = "json" }, + ] + secrets = [ + { name = "DATABASE_URL", valueFrom = var.database_url_secret_arn }, + { name = "ANTHROPIC_API_KEY", valueFrom = var.anthropic_key_secret_arn }, + { name = 
"OPENAI_API_KEY", valueFrom = var.openai_key_secret_arn }, + ] + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-group = aws_cloudwatch_log_group.backend.name + awslogs-region = var.region + awslogs-stream-prefix = "ecs" + } + } + } + ]) + + frontend_container = jsonencode([ + { + name = "frontend" + image = var.frontend_image + essential = true + portMappings = [ + { containerPort = 8080, protocol = "tcp" } + ] + environment = [ + # The nginx config template substitutes ${BACKEND_URL} on container + # start. Service discovery resolves backend..local in-VPC. + { name = "BACKEND_URL", value = "http://backend.${var.project_name}.local:8000" }, + ] + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-group = aws_cloudwatch_log_group.frontend.name + awslogs-region = var.region + awslogs-stream-prefix = "ecs" + } + } + } + ]) +} + +resource "aws_ecs_task_definition" "backend" { + family = "${var.project_name}-backend" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = "256" + memory = "512" + execution_role_arn = aws_iam_role.task_execution.arn + task_role_arn = aws_iam_role.task_app.arn + container_definitions = local.backend_container +} + +resource "aws_ecs_task_definition" "frontend" { + family = "${var.project_name}-frontend" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = "256" + memory = "512" + execution_role_arn = aws_iam_role.task_execution.arn + task_role_arn = aws_iam_role.task_app.arn + container_definitions = local.frontend_container +} + +# --- services ----------------------------------------------------------------- + +resource "aws_ecs_service" "backend" { + name = "${var.project_name}-backend" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.backend.arn + desired_count = var.backend_desired_count + launch_type = "FARGATE" + + network_configuration { + subnets = var.public_subnet_ids + security_groups = [var.backend_sg_id] + 
assign_public_ip = true # Required in no-NAT topology so tasks can reach ECR/Anthropic/OpenAI. + } + + load_balancer { + target_group_arn = aws_lb_target_group.backend.arn + container_name = "backend" + container_port = 8000 + } + + service_registries { + registry_arn = aws_service_discovery_service.backend.arn + } + + deployment_minimum_healthy_percent = 50 + deployment_maximum_percent = 200 + enable_execute_command = false + + depends_on = [aws_lb_listener.http] +} + +resource "aws_ecs_service" "frontend" { + name = "${var.project_name}-frontend" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.frontend.arn + desired_count = var.frontend_desired_count + launch_type = "FARGATE" + + network_configuration { + subnets = var.public_subnet_ids + security_groups = [var.frontend_sg_id] + assign_public_ip = true + } + + load_balancer { + target_group_arn = aws_lb_target_group.frontend.arn + container_name = "frontend" + container_port = 8080 + } + + deployment_minimum_healthy_percent = 50 + deployment_maximum_percent = 200 + + depends_on = [aws_lb_listener.http] +} diff --git a/infra/modules/ecs/outputs.tf b/infra/modules/ecs/outputs.tf new file mode 100644 index 0000000..0948592 --- /dev/null +++ b/infra/modules/ecs/outputs.tf @@ -0,0 +1,27 @@ +output "cluster_arn" { + value = aws_ecs_cluster.this.arn +} + +output "cluster_name" { + value = aws_ecs_cluster.this.name +} + +output "alb_dns_name" { + value = aws_lb.this.dns_name +} + +output "backend_service_arn" { + value = aws_ecs_service.backend.id +} + +output "frontend_service_arn" { + value = aws_ecs_service.frontend.id +} + +output "backend_service_name" { + value = aws_ecs_service.backend.name +} + +output "frontend_service_name" { + value = aws_ecs_service.frontend.name +} diff --git a/infra/modules/ecs/variables.tf b/infra/modules/ecs/variables.tf new file mode 100644 index 0000000..5ee0487 --- /dev/null +++ b/infra/modules/ecs/variables.tf @@ -0,0 +1,64 @@ +variable "project_name" { 
+ type = string +} + +variable "region" { + type = string +} + +variable "vpc_id" { + type = string +} + +variable "public_subnet_ids" { + type = list(string) +} + +variable "alb_sg_id" { + type = string +} + +variable "backend_sg_id" { + type = string +} + +variable "frontend_sg_id" { + type = string +} + +variable "backend_image" { + description = "Full image URI including tag for the backend container." + type = string +} + +variable "frontend_image" { + description = "Full image URI including tag for the frontend container." + type = string +} + +variable "backend_desired_count" { + type = number + default = 1 +} + +variable "frontend_desired_count" { + type = number + default = 1 +} + +variable "log_retention_days" { + type = number + default = 7 +} + +variable "database_url_secret_arn" { + type = string +} + +variable "anthropic_key_secret_arn" { + type = string +} + +variable "openai_key_secret_arn" { + type = string +} diff --git a/infra/modules/network/main.tf b/infra/modules/network/main.tf new file mode 100644 index 0000000..d84d51a --- /dev/null +++ b/infra/modules/network/main.tf @@ -0,0 +1,164 @@ +# Public-subnet/no-NAT VPC. Cost posture: avoids the ~$32/month idle NAT Gateway. +# Demo-only — RDS still binds to a private security group so it is not reachable +# from the internet. 
+ +resource "aws_vpc" "this" { + cidr_block = var.vpc_cidr + enable_dns_support = true + enable_dns_hostnames = true + + tags = { Name = "${var.project_name}-vpc" } +} + +resource "aws_internet_gateway" "this" { + vpc_id = aws_vpc.this.id + tags = { Name = "${var.project_name}-igw" } +} + +resource "aws_subnet" "public" { + count = length(var.public_subnet_cidrs) + vpc_id = aws_vpc.this.id + cidr_block = var.public_subnet_cidrs[count.index] + availability_zone = var.availability_zones[count.index] + map_public_ip_on_launch = true + + tags = { + Name = "${var.project_name}-public-${var.availability_zones[count.index]}" + Tier = "public" + } +} + +resource "aws_route_table" "public" { + vpc_id = aws_vpc.this.id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.this.id + } + + tags = { Name = "${var.project_name}-public-rt" } +} + +resource "aws_route_table_association" "public" { + count = length(aws_subnet.public) + subnet_id = aws_subnet.public[count.index].id + route_table_id = aws_route_table.public.id +} + +# --- security groups ----------------------------------------------------------- +# +# SGs live here (not in ecs/rds) so the rds ingress rule can reference the +# backend SG without creating an ecs → rds → ecs module-level cycle. The four +# SGs encode the expected reachability graph: +# +# internet ──→ alb_sg (80, 443) +# alb_sg ──→ frontend_sg (8080) (ALB to nginx) +# alb_sg ──→ backend_sg (8000) (ALB to FastAPI /health) +# frontend_sg ─→ backend_sg (8000) (nginx /api proxy to FastAPI) +# backend_sg ──→ rds_sg (5432) (FastAPI to Postgres) +# +# Egress is intentionally open: tasks need to reach ECR, Anthropic, OpenAI, and +# CloudWatch Logs. RDS does not need egress. + +resource "aws_security_group" "alb" { + name = "${var.project_name}-alb" + description = "Public-facing ALB." 
+ vpc_id = aws_vpc.this.id + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + description = "HTTP from anywhere." + } + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + description = "HTTPS from anywhere (used when a TLS cert is attached; no listener wired by default)." + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { Name = "${var.project_name}-alb" } +} + +resource "aws_security_group" "frontend" { + name = "${var.project_name}-frontend" + description = "Frontend Fargate task. Reachable from the ALB only." + vpc_id = aws_vpc.this.id + + ingress { + from_port = 8080 + to_port = 8080 + protocol = "tcp" + security_groups = [aws_security_group.alb.id] + description = "ALB → nginx." + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { Name = "${var.project_name}-frontend" } +} + +resource "aws_security_group" "backend" { + name = "${var.project_name}-backend" + description = "Backend Fargate task. Reachable from the ALB and frontend task only." + vpc_id = aws_vpc.this.id + + ingress { + from_port = 8000 + to_port = 8000 + protocol = "tcp" + security_groups = [aws_security_group.alb.id] + description = "ALB → FastAPI /health." + } + + ingress { + from_port = 8000 + to_port = 8000 + protocol = "tcp" + security_groups = [aws_security_group.frontend.id] + description = "nginx /api proxy → FastAPI." + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { Name = "${var.project_name}-backend" } +} + +resource "aws_security_group" "rds" { + name = "${var.project_name}-rds" + description = "Postgres. Reachable from the backend task only. Not publicly accessible." 
+ vpc_id = aws_vpc.this.id + + ingress { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + security_groups = [aws_security_group.backend.id] + description = "Backend → Postgres." + } + + # No egress — Postgres does not need to reach out. + + tags = { Name = "${var.project_name}-rds" } +} diff --git a/infra/modules/network/outputs.tf b/infra/modules/network/outputs.tf new file mode 100644 index 0000000..2479037 --- /dev/null +++ b/infra/modules/network/outputs.tf @@ -0,0 +1,23 @@ +output "vpc_id" { + value = aws_vpc.this.id +} + +output "public_subnet_ids" { + value = aws_subnet.public[*].id +} + +output "alb_sg_id" { + value = aws_security_group.alb.id +} + +output "frontend_sg_id" { + value = aws_security_group.frontend.id +} + +output "backend_sg_id" { + value = aws_security_group.backend.id +} + +output "rds_sg_id" { + value = aws_security_group.rds.id +} diff --git a/infra/modules/network/variables.tf b/infra/modules/network/variables.tf new file mode 100644 index 0000000..bc95f1b --- /dev/null +++ b/infra/modules/network/variables.tf @@ -0,0 +1,15 @@ +variable "project_name" { + type = string +} + +variable "vpc_cidr" { + type = string +} + +variable "public_subnet_cidrs" { + type = list(string) +} + +variable "availability_zones" { + type = list(string) +} diff --git a/infra/modules/rds/main.tf b/infra/modules/rds/main.tf new file mode 100644 index 0000000..f65e5ee --- /dev/null +++ b/infra/modules/rds/main.tf @@ -0,0 +1,62 @@ +# Postgres 16 single-AZ db.t4g.micro. Cost-minimal demo posture. +# +# Invariant: publicly_accessible = false. The DB is reachable only from the +# backend security group (the network module configures rds_sg with that +# ingress). The DB subnet group binds to the same public subnets the ECS tasks +# use because we have no private subnets in the no-NAT design — but the SG is +# what enforces "internal-only". +# +# pgvector ships in the Postgres engine via an extension. 
The migration created
+# in M1 runs `CREATE EXTENSION IF NOT EXISTS vector` against the freshly
+# provisioned DB. The parameter group does not need shared_preload_libraries
+# for pgvector specifically (unlike e.g. pg_stat_statements); pgvector loads on
+# CREATE EXTENSION.
+
+resource "aws_db_subnet_group" "this" { # subnets the DB instance may be placed in; supplied by the caller.
+  name       = "${var.project_name}-db-subnets"
+  subnet_ids = var.subnet_ids
+
+  tags = { Name = "${var.project_name}-db-subnets" }
+}
+
+resource "aws_db_parameter_group" "this" { # engine parameters for the postgres16 family; only log_statement is overridden.
+  name   = "${var.project_name}-pg16"
+  family = "postgres16"
+
+  parameter {
+    name  = "log_statement"
+    value = "ddl" # log DDL only; demo posture, keeps log volume low.
+  }
+
+  tags = { Name = "${var.project_name}-pg16" }
+}
+
+resource "aws_db_instance" "this" { # the single Postgres instance; its endpoint/address are exported by outputs.tf.
+  identifier        = "${var.project_name}-db"
+  engine            = "postgres"
+  engine_version    = "16.4" # NOTE(review): exact minor pin and auto_minor_version_upgrade is not set here — confirm an automatic minor upgrade cannot cause plan drift.
+  instance_class    = var.instance_class
+  allocated_storage = var.allocated_storage
+  storage_type      = "gp3"
+  storage_encrypted = true
+
+  db_name  = var.db_name
+  username = var.db_username
+  password = var.db_password # declared sensitive in variables.tf.
+  port     = 5432 # must match the 5432 ingress rule on the attached security group.
+
+  vpc_security_group_ids = [var.ingress_sg_id] # attached to the instance itself, so this SG must be the one that allows 5432 in.
+  db_subnet_group_name   = aws_db_subnet_group.this.name
+  parameter_group_name   = aws_db_parameter_group.this.name
+
+  publicly_accessible = false # Hard invariant for the demo. Do not flip.
+  multi_az            = false # Single-AZ for cost. Do not run production this way.
+  skip_final_snapshot = true # Demo posture: terraform destroy should be cheap.
+  deletion_protection = false # Demo posture: same reason.
+  apply_immediately   = true # apply modifications right away rather than in the next maintenance window.
+
+  backup_retention_period      = 1 # days of automated backups.
+  performance_insights_enabled = false
+
+  tags = { Name = "${var.project_name}-db" }
+}
diff --git a/infra/modules/rds/outputs.tf b/infra/modules/rds/outputs.tf
new file mode 100644
index 0000000..5e4d476
--- /dev/null
+++ b/infra/modules/rds/outputs.tf
@@ -0,0 +1,8 @@
+output "db_endpoint" {
+  description = "host:port form, ready to drop into a postgres URL."
+  value       = aws_db_instance.this.endpoint
+}
+
+output "db_address" { # hostname only (no port), unlike db_endpoint.
+  value = aws_db_instance.this.address
+}
diff --git a/infra/modules/rds/variables.tf b/infra/modules/rds/variables.tf
new file mode 100644
index 0000000..2ae1561
--- /dev/null
+++ b/infra/modules/rds/variables.tf
@@ -0,0 +1,39 @@
+variable "project_name" { # prefix for the DB identifier, subnet group and parameter group names.
+  type = string
+}
+
+variable "vpc_id" { # NOTE(review): not referenced by this module's main.tf.
+  type = string
+}
+
+variable "subnet_ids" { # members of the DB subnet group.
+  type = list(string)
+}
+
+variable "ingress_sg_id" { # passed straight to vpc_security_group_ids on the DB instance.
+  description = "Security group attached to the DB instance. It must allow inbound 5432 from the backend task SG; wire to the network module's rds_sg_id output."
+  type        = string
+}
+
+variable "db_name" {
+  type = string
+}
+
+variable "db_username" {
+  type = string
+}
+
+variable "db_password" {
+  type      = string
+  sensitive = true # redacted in plan/apply output.
+}
+
+variable "instance_class" {
+  type    = string
+  default = "db.t4g.micro"
+}
+
+variable "allocated_storage" { # storage size passed to aws_db_instance.allocated_storage.
+  type    = number
+  default = 20
+}
diff --git a/infra/modules/secrets/main.tf b/infra/modules/secrets/main.tf
new file mode 100644
index 0000000..b63674f
--- /dev/null
+++ b/infra/modules/secrets/main.tf
@@ -0,0 +1,52 @@
+# SSM Parameter Store entries for runtime secrets the ECS task pulls in via the
+# task execution role.
+#
+# - anthropic/openai keys are placeholders. Overwrite out-of-band:
+#     aws ssm put-parameter --name /<project_name>/anthropic_api_key \
+#       --type SecureString --value "$ANTHROPIC_API_KEY" --overwrite
+#   `lifecycle.ignore_changes = [value]` keeps Terraform from clobbering the
+#   real value on subsequent applies.
+#
+# - DATABASE_URL is composed from RDS outputs supplied by the caller. It is
+#   sensitive (carries the master password) but Terraform-owned, so its
+#   `value` *is* tracked.
+
+locals { # shared SSM path prefix and the composed connection URL.
+  prefix = "/${var.project_name}"
+  database_url = format( # credentials are percent-encoded so reserved characters (":", "#", "?", "%") cannot corrupt the URL.
+    "postgresql+psycopg://%s:%s@%s/%s",
+    replace(urlencode(var.db_username), "+", "%20"), # urlencode emits "+" for a space; the userinfo part needs "%20".
+    replace(urlencode(var.db_password), "+", "%20"), # a literal "+" is already "%2B", so this replace only touches spaces.
+    var.db_endpoint, # host:port, inserted as-is.
+    var.db_name,
+  )
+}
+
+resource "aws_ssm_parameter" "anthropic_api_key" {
+  name        = "${local.prefix}/anthropic_api_key"
+  description = "Anthropic API key consumed by the backend at task start. Overwrite out-of-band."
+  type        = "SecureString"
+  value       = "REPLACE_ME" # placeholder only; the real key is written outside Terraform.
+
+  lifecycle {
+    ignore_changes = [value] # later applies must not reset the out-of-band value.
+  }
+}
+
+resource "aws_ssm_parameter" "openai_api_key" {
+  name        = "${local.prefix}/openai_api_key"
+  description = "OpenAI API key consumed by the backend at task start. Overwrite out-of-band."
+  type        = "SecureString"
+  value       = "REPLACE_ME" # placeholder only; the real key is written outside Terraform.
+
+  lifecycle {
+    ignore_changes = [value] # later applies must not reset the out-of-band value.
+  }
+}
+
+resource "aws_ssm_parameter" "database_url" {
+  name        = "${local.prefix}/database_url"
+  description = "psycopg URL for the RDS instance. Composed from rds outputs."
+  type        = "SecureString"
+  value       = local.database_url # no ignore_changes: Terraform keeps this in sync with its inputs.
+}
diff --git a/infra/modules/secrets/outputs.tf b/infra/modules/secrets/outputs.tf
new file mode 100644
index 0000000..43ed825
--- /dev/null
+++ b/infra/modules/secrets/outputs.tf
@@ -0,0 +1,11 @@
+output "anthropic_key_arn" {
+  value = aws_ssm_parameter.anthropic_api_key.arn
+}
+
+output "openai_key_arn" {
+  value = aws_ssm_parameter.openai_api_key.arn
+}
+
+output "database_url_arn" {
+  value = aws_ssm_parameter.database_url.arn
+}
diff --git a/infra/modules/secrets/variables.tf b/infra/modules/secrets/variables.tf
new file mode 100644
index 0000000..6af4c67
--- /dev/null
+++ b/infra/modules/secrets/variables.tf
@@ -0,0 +1,21 @@
+variable "project_name" { # becomes the leading path segment of every parameter name.
+  type = string
+}
+
+variable "db_endpoint" {
+  description = "RDS endpoint (host:port)."
+  type        = string
+}
+
+variable "db_name" { # database path segment of the composed database_url.
+  type = string
+}
+
+variable "db_username" { # user part of the composed database_url.
+  type = string
+}
+
+variable "db_password" { # password part of the composed database_url.
+  type      = string
+  sensitive = true # redacted in plan/apply output.
+}
diff --git a/infra/outputs.tf b/infra/outputs.tf
new file mode 100644
index 0000000..6137a7b
--- /dev/null
+++ b/infra/outputs.tf
@@ -0,0 +1,39 @@
+output "alb_dns_name" {
+  description = "Public DNS name of the Application Load Balancer. Visit http://{this} once tasks are healthy."
+  value       = module.ecs.alb_dns_name
+}
+
+output "ecr_backend_repository_url" {
+  description = "ECR repository URL for the backend image. CD pushes here."
+  value       = module.ecr.backend_repository_url
+}
+
+output "ecr_frontend_repository_url" {
+  description = "ECR repository URL for the frontend image. CD pushes here."
+  value       = module.ecr.frontend_repository_url
+}
+
+output "ecs_cluster_name" {
+  description = "ECS cluster name (used by CD when forcing service deployments)."
+  value       = module.ecs.cluster_name
+}
+
+output "ecs_backend_service_name" {
+  description = "ECS backend service name."
+  value       = module.ecs.backend_service_name
+}
+
+output "ecs_frontend_service_name" {
+  description = "ECS frontend service name."
+  value       = module.ecs.frontend_service_name
+}
+
+output "rds_endpoint" {
+  description = "Postgres endpoint (host:port). Not publicly reachable; used by ECS tasks only."
+  value       = module.rds.db_endpoint # re-export of the rds module's db_endpoint output.
+}
+
+output "ci_role_arn" {
+  description = "ARN of the GitHub-Actions OIDC role, if created. Add this to the repo's AWS_ROLE_ARN secret."
+  value       = try(module.ci_oidc[0].role_arn, null) # try() yields null when module.ci_oidc has no instance at index 0.
+}
diff --git a/infra/variables.tf b/infra/variables.tf
new file mode 100644
index 0000000..eece20b
--- /dev/null
+++ b/infra/variables.tf
@@ -0,0 +1,113 @@
+variable "project_name" {
+  description = "Short name used as a prefix on every resource."
+  type        = string
+  default     = "sentinel"
+  validation {
+    condition     = can(regex("^[a-z][a-z0-9-]{1,30}$", var.project_name)) # one leading letter plus 1-30 more characters.
+    error_message = "project_name must be 2-31 characters, lowercase, start with a letter, and use only [a-z0-9-]."
+  }
+}
+
+variable "environment" {
+  description = "Environment label (free-form). Tags only; not used in resource names."
+  type        = string
+  default     = "demo"
+}
+
+variable "region" {
+  description = "AWS region."
+  type        = string
+  default     = "us-east-1"
+}
+
+variable "vpc_cidr" {
+  description = "CIDR block for the VPC."
+  type        = string
+  default     = "10.0.0.0/16"
+}
+
+variable "public_subnet_cidrs" {
+  description = "Two /24 CIDRs for the public subnets in two AZs."
+  type        = list(string)
+  default     = ["10.0.0.0/24", "10.0.1.0/24"]
+  validation {
+    condition     = length(var.public_subnet_cidrs) == 2 # checks the count only, not the prefix length.
+    error_message = "Exactly two subnet CIDRs are required (one per AZ)."
+  }
+}
+
+variable "db_username" {
+  description = "Postgres master username."
+  type        = string
+  default     = "sentinel"
+}
+
+variable "db_password" { # no default: a value must be supplied at plan/apply time.
+  description = <<-EOD
+    Postgres master password. Required at apply time. Pass via TF_VAR_db_password
+    (preferred) or a -var '...' flag — never commit. Min 16 chars.
+  EOD
+  type        = string
+  sensitive   = true
+  validation {
+    condition     = length(var.db_password) >= 16
+    error_message = "db_password must be at least 16 characters."
+  }
+}
+
+variable "db_name" {
+  description = "Initial Postgres database name."
+  type        = string
+  default     = "sentinel"
+}
+
+variable "db_instance_class" {
+  description = "RDS instance class. Cost-minimal default; do not run production on db.t4g.micro."
+  type        = string
+  default     = "db.t4g.micro"
+}
+
+variable "db_allocated_storage" {
+  description = "RDS storage in GB. 20 is the floor on db.t4g.micro and is enough for the demo corpus."
+  type        = number
+  default     = 20
+}
+
+variable "backend_image_tag" {
+  description = "ECR image tag for the backend service. CD overrides this with the git SHA."
+  type        = string
+  default     = "latest"
+}
+
+variable "frontend_image_tag" {
+  description = "ECR image tag for the frontend service. CD overrides this with the git SHA."
+  type        = string
+  default     = "latest"
+}
+
+variable "backend_desired_count" {
+  description = "ECS service desired task count for the backend."
+  type        = number
+  default     = 1
+}
+
+variable "frontend_desired_count" {
+  description = "ECS service desired task count for the frontend."
+  type        = number
+  default     = 1
+}
+
+variable "github_repository" {
+  description = <<-EOD
+    GitHub repo in 'owner/name' form. Used to scope the OIDC trust policy on the
+    CI deploy role so only this repo can assume it. Empty disables the OIDC role.
+  EOD
+  type        = string
+  default     = ""
+}
+
+variable "log_retention_days" { # no validation here; CloudWatch Logs accepts only specific retention values (7 is one of them).
+  description = "CloudWatch Logs retention for the ECS task log groups."
+  type        = number
+  default     = 7
+}
diff --git a/infra/versions.tf b/infra/versions.tf
new file mode 100644
index 0000000..87414d9
--- /dev/null
+++ b/infra/versions.tf
@@ -0,0 +1,22 @@
+terraform {
+  required_version = ">= 1.6.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.70" # pessimistic constraint: >= 5.70 and < 6.0.
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = "~> 3.6" # pessimistic constraint: >= 3.6 and < 4.0.
+    }
+  }
+}
+
+provider "aws" {
+  region = var.region
+
+  default_tags { # tags the provider adds to every taggable resource it manages.
+    tags = local.common_tags # local is declared outside this file.
+  }
+}
diff --git a/pyproject.toml b/pyproject.toml
index eb37aa3..a38cad1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "psycopg[binary]>=3.2",
     "pydantic-settings>=2.6",
     "sqlalchemy>=2.0",
+    "structlog>=24.4",
     "tiktoken>=0.8",
     "uvicorn[standard]>=0.32",
 ]
diff --git a/uv.lock b/uv.lock
index 264906f..2ef4506 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1109,6 +1109,7 @@ dependencies = [
     { name = "psycopg", extra = ["binary"] },
     { name = "pydantic-settings" },
     { name = "sqlalchemy" },
+    { name = "structlog" },
     { name = "tiktoken" },
     { name = "uvicorn", extra = ["standard"] },
 ]
@@ -1130,6 +1131,7 @@ requires-dist = [
     { name =
"psycopg", extras = ["binary"], specifier = ">=3.2" }, { name = "pydantic-settings", specifier = ">=2.6" }, { name = "sqlalchemy", specifier = ">=2.0" }, + { name = "structlog", specifier = ">=24.4" }, { name = "tiktoken", specifier = ">=0.8" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.32" }, ] @@ -1196,6 +1198,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9f/85/492183764d5d01d4514be3730fdb8e228a80605783099551c51627578b5d/starlette-1.2.0-py3-none-any.whl", hash = "sha256:36e0c76ac59157e75dc4b3bdeafba97fb04eaf1878045f15dbef666a6f092ed7", size = 73213, upload-time = "2026-05-28T11:42:48.801Z" }, ] +[[package]] +name = "structlog" +version = "25.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/52/9ba0f43b686e7f3ddfeaa78ac3af750292662284b3661e91ad5494f21dbc/structlog-25.5.0.tar.gz", hash = "sha256:098522a3bebed9153d4570c6d0288abf80a031dfdb2048d59a49e9dc2190fc98", size = 1460830, upload-time = "2025-10-27T08:28:23.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/45/a132b9074aa18e799b891b91ad72133c98d8042c70f6240e4c5f9dabee2f/structlog-25.5.0-py3-none-any.whl", hash = "sha256:a8453e9b9e636ec59bd9e79bbd4a72f025981b3ba0f5837aebf48f02f37a7f9f", size = 72510, upload-time = "2025-10-27T08:28:21.535Z" }, +] + [[package]] name = "tiktoken" version = "0.13.0"