From 892dfbbdbfee13c1d95ba97a3a749501030ba5dd Mon Sep 17 00:00:00 2001 From: Jeff Yaw Date: Sat, 25 Apr 2026 08:26:24 -0700 Subject: [PATCH] fix: helm chart fail-closed gates, latent DATABASE_URL bug, deploy-path docs helm: - postgres.enabled=true now uses pgvector/pgvector:pg18 so the dev-mode toggle no longer crash-loops on the CREATE EXTENSION vector migration - redis.authToken is required when redis.enabled=true (mirrors the Compose --requirepass guard so Valkey can't boot unauthenticated) - _helpers databaseUrl: capture `required` returns into $_, otherwise externalDatabase deploys rendered "pg.example.comtestpostgresql://..." -- the assertion's return value was prepending into the URL string - redis probes use valkey-cli with --no-auth-warning to match Compose - NOTES.txt: drop redundant `or` wrap; app-configmap: drop redundant default since values.yaml already defaults sslMode cloud platforms: - cloudrun README step 7: REDIS_TLS=false / DATABASE_SSL=disable to match basic-tier Memorystore + Cloud SQL Auth Proxy unix socket - fly.toml: replace ghcr.io/yawlabs/mcp-hosting:latest (unfetchable by Fly) with a registry.fly.io placeholder + comment - fly bootstrap: stop swallowing real `fly postgres attach` failures (capture exit, only forgive "already attached"); slugify whoami; run `fly certs add` for non-default domains docs: - README: bump 0.1.0 -> 0.2.0 to match Chart.yaml - upgrade.md: replace MCP_HOSTING_IMAGE_TAG rollback recipe with docker-compose.override.yml block (the env var did nothing) - backup-restore.md: filenames mcp-hosting-backup-* -> mcp-hosting-* to match what scripts/backup.sh actually writes ci: - deploy-test: skip Python setup on the helm matrix entry - test.sh: pass --set redis.authToken=... to helm lint/template so the new required gate doesn't break CI Co-Authored-By: Claude Opus 4.7 --- .github/workflows/deploy-test.yml | 3 ++ README.md | 4 +- cloudrun/README.md | 4 +- docs/backup-restore.md | 10 ++--- docs/upgrade.md | 15 +++++++- fly/bootstrap.sh | 38 +++++++++++++++++-- fly/fly.toml | 6 ++- helm/mcp-hosting/templates/NOTES.txt | 2 +- helm/mcp-hosting/templates/_helpers.tpl | 19 ++++++++-- helm/mcp-hosting/templates/app-configmap.yaml | 2 +- helm/mcp-hosting/templates/app-secret.yaml | 7 ++-- .../templates/redis-statefulset.yaml | 23 +++++------ helm/mcp-hosting/values.yaml | 12 +++++- test.sh | 2 + 14 files changed, 110 insertions(+), 37 deletions(-) diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml index 67b6302..d4f4580 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -38,6 +38,9 @@ jobs: - uses: actions/checkout@v6 - name: Set up Python + # helm target uses kubeconform (Go binary), not python. + # fly + cloudrun parse YAML/TOML in test.sh via python3. + if: matrix.target != 'helm' uses: actions/setup-python@v6 with: python-version: "3.12" diff --git a/README.md b/README.md index 99a30c2..36e69e2 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ echo $GITHUB_TOKEN | helm registry login ghcr.io -u --password-st # Install at a pinned chart version (recommended for production) helm install mcp-hosting oci://ghcr.io/yawlabs/charts/mcp-hosting \ - --version 0.1.0 \ + --version 0.2.0 \ --namespace mcp-hosting --create-namespace \ --set domain=mcp.example.com \ --set licenseKey=mcph_sh_... \ @@ -83,7 +83,7 @@ helm install mcp-hosting oci://ghcr.io/yawlabs/charts/mcp-hosting \ Two chart channels are published: - **Versioned** (`oci://ghcr.io/yawlabs/charts/mcp-hosting --version vX.Y.Z`) — tagged from `Chart.yaml`, immutable, what you should run in production. Tags ship via [`release.yml`](./.github/workflows/release.yml) when `vX.Y.Z` is pushed. -- **Preview** (`oci://ghcr.io/yawlabs/charts/mcp-hosting --version 0.1.0+`) — every push to `master` that touches `helm/`. Useful for canary'ing chart changes before they cut a tag. Don't pin production to one of these. +- **Preview** (`oci://ghcr.io/yawlabs/charts/mcp-hosting --version 0.2.0+`) — every push to `master` that touches `helm/`. Useful for canary'ing chart changes before they cut a tag. Don't pin production to one of these. Create the GHCR image-pull secret before installing — the chart references it but doesn't create it (the token shouldn't live in your values file): diff --git a/cloudrun/README.md b/cloudrun/README.md index 8977675..7011f7e 100644 --- a/cloudrun/README.md +++ b/cloudrun/README.md @@ -166,7 +166,7 @@ gcloud run deploy mcp-hosting \ --add-cloudsql-instances="$SQL_CONNECTION_NAME" \ --vpc-connector=mcp-hosting-vpc \ --vpc-egress=private-ranges-only \ - --set-env-vars="SELF_HOSTED=true,NODE_ENV=production,BASE_DOMAIN=mcp.example.com,REDIS_HOST=${REDIS_HOST},REDIS_PORT=${REDIS_PORT},REDIS_AUTH_TOKEN=,REDIS_TLS=true,DATABASE_SSL=require,AWS_REGION=us-east-1,EMAIL_FROM=noreply@mcp.example.com" \ + --set-env-vars="SELF_HOSTED=true,NODE_ENV=production,BASE_DOMAIN=mcp.example.com,REDIS_HOST=${REDIS_HOST},REDIS_PORT=${REDIS_PORT},REDIS_AUTH_TOKEN=,REDIS_TLS=false,DATABASE_SSL=disable,AWS_REGION=us-east-1,EMAIL_FROM=noreply@mcp.example.com" \ --set-secrets="DATABASE_URL=mcp-hosting-db-url:latest,COOKIE_SECRET=mcp-hosting-cookie-secret:latest,MCP_HOSTING_LICENSE_KEY=mcp-hosting-license-key:latest,GITHUB_CLIENT_ID=mcp-hosting-gh-client-id:latest,GITHUB_CLIENT_SECRET=mcp-hosting-gh-client-secret:latest,AWS_ACCESS_KEY_ID=mcp-hosting-aws-key:latest,AWS_SECRET_ACCESS_KEY=mcp-hosting-aws-secret:latest" ``` @@ -176,6 +176,8 @@ Notes on the flags above: - `--add-cloudsql-instances` injects the Cloud SQL Auth Proxy sidecar that backs the unix socket in `DATABASE_URL`. - `--vpc-connector` routes Memorystore traffic through the connector built in step 4. Without it, the service can't reach Redis. - `--timeout=3600` matches the SSE / Streamable HTTP long-lived connection requirement. +- `DATABASE_SSL=disable` matches the Cloud SQL Auth Proxy unix socket — TLS terminates inside the proxy sidecar, the app's own connection is over the local socket. If you switch to a TCP `DATABASE_URL` (private IP, no proxy), set `DATABASE_SSL=require`. +- `REDIS_TLS=false` matches the basic-tier Memorystore provisioned in step 3 (no AUTH, no TLS). If you upgrade to a standard / enterprise tier with `--enable-auth` and in-transit encryption, set `REDIS_TLS=true` and add `REDIS_AUTH_TOKEN=...` to the env-var list. `MCP_HOSTING_LICENSE_KEY` is required — the app refuses to boot without a valid Team license. diff --git a/docs/backup-restore.md b/docs/backup-restore.md index 3ce06cd..e123575 100644 --- a/docs/backup-restore.md +++ b/docs/backup-restore.md @@ -24,7 +24,7 @@ All user data lives in Postgres: 0 2 * * * /path/to/mcp-hosting-deploy/scripts/backup.sh ``` -The script writes timestamped `mcp-hosting-backup-YYYYMMDD-HHMMSS.sql.gz` files under `./backups/` by default. +The script writes timestamped `mcp-hosting-YYYYMMDD-HHMMSS.sql.gz` files under `./backups/` by default. ### Backups uploaded to S3 (recommended for production) @@ -46,7 +46,7 @@ The script uses your AWS CLI default credentials. Use an IAM user scoped to a si ```bash # Docker Compose -gunzip -c backups/mcp-hosting-backup-20260414-020001.sql.gz \ +gunzip -c backups/mcp-hosting-20260414-020001.sql.gz \ | docker compose exec -T postgres psql -U mcphosting mcphosting # Pull the Postgres container's state fresh first if restoring from scratch @@ -54,7 +54,7 @@ docker compose down docker volume rm docker-compose_postgres_data docker compose up -d postgres # wait ~5s for Postgres to accept connections -gunzip -c backups/mcp-hosting-backup-20260414-020001.sql.gz \ +gunzip -c backups/mcp-hosting-20260414-020001.sql.gz \ | docker compose exec -T postgres psql -U mcphosting mcphosting docker compose up -d ``` @@ -62,8 +62,8 @@ docker compose up -d ### From S3 ```bash -aws s3 cp s3://my-bucket/mcp-backups/mcp-hosting-backup-20260414-020001.sql.gz . -gunzip -c mcp-hosting-backup-20260414-020001.sql.gz \ +aws s3 cp s3://my-bucket/mcp-backups/mcp-hosting-20260414-020001.sql.gz . +gunzip -c mcp-hosting-20260414-020001.sql.gz \ | docker compose exec -T postgres psql -U mcphosting mcphosting ``` diff --git a/docs/upgrade.md b/docs/upgrade.md index d8d4011..021fe70 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -55,12 +55,23 @@ kubectl -n mcp-hosting rollout status deployment/mcp-hosting-app ### Docker Compose +`docker-compose.yml` pins `image: ghcr.io/yawlabs/mcp-hosting:latest`, +so a rollback can't be done with a plain env var -- there's no +interpolation on that line. Pin the previous tag in a gitignored +override and re-up: + ```bash -# Pin the previous tag -export MCP_HOSTING_IMAGE_TAG=v0.8.0 # whatever the prior version was +cat > docker-compose.override.yml <<'YAML' +services: + mcp-hosting-app: + image: ghcr.io/yawlabs/mcp-hosting:v0.8.0 # whatever the prior version was +YAML + docker compose up -d mcp-hosting-app ``` +To return to `:latest`, delete the override file and re-up. + ### Helm ```bash diff --git a/fly/bootstrap.sh b/fly/bootstrap.sh index 3c27d24..d03329e 100644 --- a/fly/bootstrap.sh +++ b/fly/bootstrap.sh @@ -29,7 +29,11 @@ set -euo pipefail # All values needed beyond APP_NAME/REGION are prompted interactively. # ============================================================================= -APP_NAME="${APP_NAME:-mcp-hosting-$(whoami)}" +# Fly app names must match [a-z0-9-]+ -- slugify whoami because some +# users have uppercase, dots, or other illegal chars in their login +# (corp SSO usernames like "John.Doe" are common). +_user_slug="$(whoami | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g; s/-\{2,\}/-/g; s/^-//; s/-$//')" +APP_NAME="${APP_NAME:-mcp-hosting-${_user_slug}}" REGION="${REGION:-iad}" log() { printf '\n\033[1;36m[bootstrap]\033[0m %s\n' "$*"; } @@ -129,10 +133,24 @@ else fi # Attach is idempotent-ish: it sets DATABASE_URL on the app. If already -# attached, it 422s with "database already attached" — tolerate that. +# attached, it 422s with "database already attached" -- tolerate that +# specifically. The naive `... | grep -v "already attached" || true` +# under `set -euo pipefail` swallows EVERY failure (auth, network, +# 5xx), leaving DATABASE_URL unset and crash-looping the app at boot. +# Capture output, check exit, and only forgive the known-benign case. log "Attaching Postgres to $APP_NAME (creates DATABASE_URL secret)" -fly postgres attach "$PG_APP" --app "$APP_NAME" --database-name mcphosting 2>&1 | \ - grep -v "already attached" || true +set +e +attach_output="$(fly postgres attach "$PG_APP" --app "$APP_NAME" --database-name mcphosting 2>&1)" +attach_exit=$? +set -e +if [[ $attach_exit -ne 0 ]]; then + if grep -q "already attached" <<<"$attach_output"; then + log "Postgres already attached to $APP_NAME" + else + printf '%s\n' "$attach_output" >&2 + die "fly postgres attach failed (see error above)" + fi +fi # ----------------------------------------------------------------------------- # 3. Ensure Upstash Redis; split the emitted REDIS_URL into HOST/PORT/AUTH @@ -205,6 +223,18 @@ fly deploy \ --image "$FLY_TAG" \ --config "$(dirname "$0")/fly.toml" +# Issue a managed TLS cert for any DOMAIN that isn't the default +# .fly.dev (which Fly's edge already covers). `fly certs add` is +# idempotent -- re-running for an existing cert prints status and exits 0. +if [[ "$DOMAIN" != "$APP_NAME.fly.dev" ]]; then + log "Issuing TLS cert for $DOMAIN" + if ! fly certs add "$DOMAIN" --app "$APP_NAME"; then + warn "fly certs add did not succeed -- add it manually:" + warn " fly certs add $DOMAIN --app $APP_NAME" + warn "Then add the DNS records the command prints." + fi +fi + log "Done. App URL: https://$DOMAIN" log "Health: curl -sf https://$DOMAIN/health" log "" diff --git a/fly/fly.toml b/fly/fly.toml index ac3dd10..04284f6 100644 --- a/fly/fly.toml +++ b/fly/fly.toml @@ -13,7 +13,11 @@ app = "mcp-hosting" primary_region = "iad" [build] - image = "ghcr.io/yawlabs/mcp-hosting:latest" + # Fly cannot auth to private GHCR. Mirror the image into registry.fly.io + # first (see ../docs/self-host-token.md, "Fly.io"), then either replace + # this value with the registry.fly.io tag you pushed, or pass --image + # at deploy time. bootstrap.sh in this directory does the latter. + image = "registry.fly.io/REPLACE_WITH_YOUR_APP:replace-with-deployment-tag" [env] SELF_HOSTED = "true" diff --git a/helm/mcp-hosting/templates/NOTES.txt b/helm/mcp-hosting/templates/NOTES.txt index f8cbbd4..d7fd08f 100644 --- a/helm/mcp-hosting/templates/NOTES.txt +++ b/helm/mcp-hosting/templates/NOTES.txt @@ -29,7 +29,7 @@ ERROR: No database configured. {{- end }} {{- end }} -{{- if not (or .Values.email.ses.accessKeyId) }} +{{- if not .Values.email.ses.accessKeyId }} NOTE: No email credentials configured. Magic link login will not work. Configure AWS SES credentials: diff --git a/helm/mcp-hosting/templates/_helpers.tpl b/helm/mcp-hosting/templates/_helpers.tpl index 25b6b8d..a82637c 100644 --- a/helm/mcp-hosting/templates/_helpers.tpl +++ b/helm/mcp-hosting/templates/_helpers.tpl @@ -83,13 +83,26 @@ DATABASE_URL connection string Uses in-cluster postgres when postgres.enabled=true, otherwise externalDatabase. Fails with a clear error if external DB is required but not configured. */}} +{{/* + WARNING: the password is embedded verbatim into DATABASE_URL. Helm has + no built-in URL-encoder, so passwords containing @, :, /, ?, #, %, +, + or whitespace will produce a malformed URL the driver rejects. Auto- + generated managed-DB passwords (RDS, Cloud SQL, AlloyDB) sometimes + include these. If you can't rotate to a URL-safe value, build + DATABASE_URL yourself in a Secret and override the chart's app Secret. +*/}} {{- define "mcp-hosting.databaseUrl" -}} {{- if .Values.postgres.enabled -}} postgresql://{{ .Values.postgres.username }}:{{ .Values.postgres.password }}@{{ include "mcp-hosting.postgresHost" . }}:5432/{{ .Values.postgres.database }} {{- else -}} -{{- required "externalDatabase.host is required when postgres.enabled=false. Set it to your RDS/Cloud SQL endpoint." .Values.externalDatabase.host -}} -{{- required "externalDatabase.password is required when postgres.enabled=false." .Values.externalDatabase.password -}} -postgresql://{{ .Values.externalDatabase.username }}:{{ .Values.externalDatabase.password }}@{{ .Values.externalDatabase.host }}:{{ .Values.externalDatabase.port }}{{ printf "/" }}{{ .Values.externalDatabase.database }}?sslmode={{ .Values.externalDatabase.sslMode }} +{{/* + `required` returns the value it asserts on, which would render BEFORE + the URL ("pg.example.comtest postgresql://...") if used inline. Capture + the return into a discard variable to use it as a pure assertion. +*/}} +{{- $_ := required "externalDatabase.host is required when postgres.enabled=false. Set it to your RDS/Cloud SQL endpoint." .Values.externalDatabase.host -}} +{{- $_ := required "externalDatabase.password is required when postgres.enabled=false." .Values.externalDatabase.password -}} +postgresql://{{ .Values.externalDatabase.username }}:{{ .Values.externalDatabase.password }}@{{ .Values.externalDatabase.host }}:{{ .Values.externalDatabase.port }}/{{ .Values.externalDatabase.database }}?sslmode={{ .Values.externalDatabase.sslMode }} {{- end -}} {{- end }} diff --git a/helm/mcp-hosting/templates/app-configmap.yaml b/helm/mcp-hosting/templates/app-configmap.yaml index bed24e4..783630f 100644 --- a/helm/mcp-hosting/templates/app-configmap.yaml +++ b/helm/mcp-hosting/templates/app-configmap.yaml @@ -20,7 +20,7 @@ data: {{- if .Values.postgres.enabled }} DATABASE_SSL: "disable" {{- else }} - DATABASE_SSL: {{ .Values.externalDatabase.sslMode | default "require" | quote }} + DATABASE_SSL: {{ .Values.externalDatabase.sslMode | quote }} {{- end }} {{- if .Values.redis.enabled }} REDIS_TLS: "false" diff --git a/helm/mcp-hosting/templates/app-secret.yaml b/helm/mcp-hosting/templates/app-secret.yaml index f9c8327..4be2aba 100644 --- a/helm/mcp-hosting/templates/app-secret.yaml +++ b/helm/mcp-hosting/templates/app-secret.yaml @@ -15,10 +15,9 @@ stringData: COOKIE_SECRET: {{ required "app.cookieSecret is required (32+ chars). Generate one with: openssl rand -hex 32" .Values.app.cookieSecret | quote }} GITHUB_CLIENT_ID: {{ required "app.githubClientId is required. Register an OAuth app at https://github.com/settings/developers" .Values.app.githubClientId | quote }} GITHUB_CLIENT_SECRET: {{ required "app.githubClientSecret is required (paired with app.githubClientId)" .Values.app.githubClientSecret | quote }} - {{- if and .Values.redis.enabled .Values.redis.authToken }} - REDIS_AUTH_TOKEN: {{ .Values.redis.authToken | quote }} - {{- end }} - {{- if and (not .Values.redis.enabled) .Values.externalRedis.password }} + {{- if .Values.redis.enabled }} + REDIS_AUTH_TOKEN: {{ required "redis.authToken is required when redis.enabled=true (mirrors the Compose --requirepass guard so Valkey can't boot unauthenticated). Generate one with: openssl rand -hex 24" .Values.redis.authToken | quote }} + {{- else if .Values.externalRedis.password }} REDIS_AUTH_TOKEN: {{ .Values.externalRedis.password | quote }} {{- end }} MCP_HOSTING_LICENSE_KEY: {{ required "licenseKey is required. Get yours at https://mcp.hosting/#pricing → Settings → Self-host (format: mcph_sh_)" .Values.licenseKey | quote }} diff --git a/helm/mcp-hosting/templates/redis-statefulset.yaml b/helm/mcp-hosting/templates/redis-statefulset.yaml index 2a8e816..ffcb3a6 100644 --- a/helm/mcp-hosting/templates/redis-statefulset.yaml +++ b/helm/mcp-hosting/templates/redis-statefulset.yaml @@ -33,7 +33,9 @@ spec: capabilities: drop: ["ALL"] readOnlyRootFilesystem: false - {{- if .Values.redis.authToken }} + # --requirepass is unconditional: the chart fails at render + # time if redis.authToken is unset (see the app Secret), so we + # never reach here without a token. Mirrors the Compose stance. args: - --requirepass - "$(REDIS_PASSWORD)" @@ -43,7 +45,6 @@ spec: secretKeyRef: name: {{ include "mcp-hosting.fullname" . }}-redis key: auth-token - {{- end }} ports: - name: redis containerPort: 6379 @@ -51,14 +52,17 @@ spec: volumeMounts: - name: data mountPath: /data + # valkey-cli is the primary CLI in valkey/valkey images. + # Matches the Compose path; redis-cli still works as a compat + # symlink but using the native name avoids surprises if upstream + # ever drops the symlink. livenessProbe: exec: command: - - redis-cli - {{- if .Values.redis.authToken }} + - valkey-cli - -a - "$(REDIS_PASSWORD)" - {{- end }} + - --no-auth-warning - ping initialDelaySeconds: 10 periodSeconds: 15 @@ -66,11 +70,10 @@ spec: readinessProbe: exec: command: - - redis-cli - {{- if .Values.redis.authToken }} + - valkey-cli - -a - "$(REDIS_PASSWORD)" - {{- end }} + - --no-auth-warning - ping initialDelaySeconds: 5 periodSeconds: 10 @@ -93,7 +96,6 @@ spec: resources: requests: storage: {{ .Values.redis.storage }} -{{- if .Values.redis.authToken }} --- apiVersion: v1 kind: Secret @@ -105,6 +107,5 @@ metadata: app.kubernetes.io/component: redis type: Opaque stringData: - auth-token: {{ .Values.redis.authToken | quote }} -{{- end }} + auth-token: {{ required "redis.authToken is required when redis.enabled=true. Generate one with: openssl rand -hex 24" .Values.redis.authToken | quote }} {{- end }} diff --git a/helm/mcp-hosting/values.yaml b/helm/mcp-hosting/values.yaml index 29d61be..e83691c 100644 --- a/helm/mcp-hosting/values.yaml +++ b/helm/mcp-hosting/values.yaml @@ -76,7 +76,11 @@ app: # Set postgres.enabled=true only for development/testing postgres: enabled: false # DO NOT run Postgres in Kubernetes for production - image: postgres:18-alpine + # pgvector is required: the app's startup migrations run + # `CREATE EXTENSION IF NOT EXISTS vector`. Plain postgres:18 doesn't + # ship pgvector, so the pod crash-loops before binding /health. + # Mirrors the bundled Compose image. + image: pgvector/pgvector:pg18 storage: 10Gi storageClass: "" database: mcphosting @@ -110,7 +114,11 @@ redis: image: valkey/valkey:8.0-alpine storage: 1Gi storageClass: "" - # -- Auth token for Redis (optional, recommended in production) + # -- Auth token for Redis (REQUIRED when redis.enabled=true). + # The chart fails at render time if this is empty -- mirrors the + # Compose path's --requirepass guard so a misconfigured deploy can't + # silently boot Valkey unauthenticated. Generate with: + # openssl rand -hex 24 authToken: "" # ============================================================================= diff --git a/test.sh b/test.sh index 3d69b59..6112bf8 100644 --- a/test.sh +++ b/test.sh @@ -199,6 +199,7 @@ test_helm() { --set app.githubClientSecret=y \ --set app.cookieSecret=z \ --set licenseKey=mcph_sh_citest \ + --set redis.authToken=ci-redis-token \ 2>&1 | tee "$RESULTS_DIR/helm-lint.log"; then fail "helm: lint failed" record helm fail "$(( $(date +%s) - start ))" "helm lint failed" @@ -216,6 +217,7 @@ test_helm() { --set app.githubClientSecret=y \ --set app.cookieSecret=z \ --set licenseKey=mcph_sh_citest \ + --set redis.authToken=ci-redis-token \ > "$out" 2> "$RESULTS_DIR/helm-template.err"; then fail "helm: helm template failed" cat "$RESULTS_DIR/helm-template.err"