diff --git a/.github/kind/e2e-calico.yaml b/.github/kind/e2e-calico.yaml new file mode 100644 index 0000000..fd236dc --- /dev/null +++ b/.github/kind/e2e-calico.yaml @@ -0,0 +1,22 @@ +# kind cluster configuration for Bloodraven E2E tests. +# Uses Calico CNI so NetworkPolicy resources are enforced (the default +# kindnet CNI does not implement NetworkPolicy, which means partition / +# self-fencing scenarios would silently pass without actually testing +# policy behaviour). +# +# Usage: +# kind create cluster --config=.github/kind/e2e-calico.yaml +# # then install Calico: +# kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.28.0/manifests/calico.yaml +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: bloodraven-e2e +nodes: + - role: control-plane + - role: worker + - role: worker +networking: + # Disable kindnet so Calico can manage CNI instead. + disableDefaultCNI: true + # Match the stock Calico manifest's default IPv4 pool. + podSubnet: "192.168.0.0/16" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index ab93840..06fb86f 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -105,6 +105,28 @@ Then enable GitHub Pages for the repository pointing at the `gh-pages` branch. T --- +### `e2e.yml` / `_e2e.yml` — Real-Cluster E2E + +**Triggers:** +- Nightly schedule (release profile) +- Manual dispatch with profile selection (smoke / release / full) +- Pull requests with the `e2e` label (smoke profile) + +The reusable workflow (`_e2e.yml`) creates a kind cluster with Calico CNI, deploys the playground, and runs `playground-chaos run-all` with the selected profile. It uploads JUnit results, chaos forensics, setup logs, and kind logs as artifacts. 
+ +Profiles: +| Profile | Scenarios | Use case | +|---|---|---| +| `smoke` | 3 (~3-5 min) | PR label gate, fast feedback | +| `release` | 10 (~20-30 min) | Release and nightly gate | +| `full` | All registered | Full regression (manual only) | + +The release workflow (`.github/workflows/release.yml`) blocks Docker image builds and Helm chart publishing on the E2E release-profile gate. This ensures every tagged release is validated against real MySQL failover scenarios (WISHLIST #32). + +**Permissions:** `contents: read` (default) + +--- + ### `scan.yml` — Trivy Security Scan **Triggers:** Pull requests targeting `main` diff --git a/.github/workflows/_e2e.yml b/.github/workflows/_e2e.yml new file mode 100644 index 0000000..b493be5 --- /dev/null +++ b/.github/workflows/_e2e.yml @@ -0,0 +1,125 @@ +# Reusable E2E workflow — creates a kind cluster, deploys the playground, +# and runs playground-chaos with the selected profile. +# +# Called by: +# .github/workflows/e2e.yml (nightly, manual, PR label) +# .github/workflows/release.yml (release gate) +name: E2E (reusable) + +on: + workflow_call: + inputs: + profile: + description: "Chaos profile (smoke|release|full)" + required: false + default: "release" + type: string + timeout-minutes: + description: "Job timeout in minutes" + required: false + default: 90 + type: number + +permissions: + contents: read + +env: + BLOODRAVEN_SETUP_HELM_INSTALL_CRDS: "1" + SKIP_IMAGE_BUILD: "1" + +concurrency: + group: e2e-${{ github.workflow }}-${{ github.ref }}-${{ inputs.profile }} + cancel-in-progress: true + +jobs: + e2e: + name: Real-cluster E2E (${{ inputs.profile }}) + runs-on: ubuntu-latest + timeout-minutes: ${{ inputs.timeout-minutes }} + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-go@v6 + with: + go-version-file: go.mod + cache-dependency-path: go.sum + + - name: Build playground-chaos + run: make build-playground-chaos + + - name: Build Docker images + run: | + docker build --target bloodraven -t 
bloodraven:playground . + docker build --target sidecar -t bloodraven-sidecar:playground . + docker build -t bloodraven-counter:playground playground/counter-app + docker build -t bloodraven-dashboard:playground playground/dashboard + docker build -t bloodraven-dns-webhook:playground playground/dns-webhook + + - name: Create kind cluster + uses: helm/kind-action@v1.12.0 + with: + cluster_name: bloodraven-e2e + config: .github/kind/e2e-calico.yaml + # CNI is disabled in this kind config, so nodes cannot become + # Ready until Calico is installed in the next step. + wait: 0s + + - name: Install Calico CNI + run: | + kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.28.0/manifests/calico.yaml + kubectl -n kube-system rollout status daemonset/calico-node --timeout=180s + kubectl wait nodes --all --for=condition=Ready --timeout=180s + + - name: Load images into kind + run: | + kind load docker-image bloodraven:playground bloodraven-sidecar:playground bloodraven-counter:playground bloodraven-dashboard:playground bloodraven-dns-webhook:playground --name bloodraven-e2e + + - name: Deploy playground + run: | + set -o pipefail + ./playground/setup.sh 2>&1 | tee playground/setup.log + timeout-minutes: 10 + + - name: Run E2E (${{ inputs.profile }} profile) + run: make test-e2e E2E_PROFILE=${{ inputs.profile }} E2E_JUNIT_OUT=playground/chaos-results/e2e-${{ inputs.profile }}-junit.xml + timeout-minutes: ${{ inputs.timeout-minutes }} + + - name: Upload JUnit results + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-${{ inputs.profile }}-junit + path: playground/chaos-results/e2e-${{ inputs.profile }}-junit.xml + retention-days: 30 + + - name: Upload chaos forensics + if: failure() || cancelled() # a job timeout ends as cancelled, not failed + uses: actions/upload-artifact@v4 + with: + name: e2e-${{ inputs.profile }}-forensics + path: playground/chaos-results/ + retention-days: 30 + + - name: Export kind logs + if: failure() || cancelled() + run: | + mkdir -p /tmp/kind-logs + kind export logs 
--name=bloodraven-e2e /tmp/kind-logs || true + continue-on-error: true + + - name: Upload kind logs artifact + if: failure() || cancelled() # also runs after a job timeout + uses: actions/upload-artifact@v4 + with: + name: e2e-${{ inputs.profile }}-kind-logs + path: /tmp/kind-logs/ + retention-days: 14 + continue-on-error: true + + - name: Upload setup logs + if: failure() || cancelled() + uses: actions/upload-artifact@v4 + with: + name: e2e-${{ inputs.profile }}-setup-logs + path: playground/setup.log + retention-days: 14 diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 0000000..d6777de --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,39 @@ +# E2E trigger workflow — nightly, manual, and PR-label-gated. +# The reusable workflow is in .github/workflows/_e2e.yml. +name: E2E + +on: + # Nightly release-profile run + schedule: + - cron: "0 5 * * *" # 05:00 UTC daily + + # Manual dispatch with profile selection + workflow_dispatch: + inputs: + profile: + description: "Chaos profile (smoke|release|full)" + required: false + default: "release" + type: choice + options: + - smoke + - release + - full + + # PR label gate: run smoke while the "e2e" label is present. + pull_request: + types: [opened, reopened, synchronize, labeled] + +permissions: + contents: read + +jobs: + # Skip PR-triggered runs unless the "e2e" label is present. 
+ e2e: + if: >- + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'e2e')) + uses: ./.github/workflows/_e2e.yml + with: + profile: ${{ github.event_name == 'pull_request' && 'smoke' || (github.event.inputs.profile || 'release') }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 05efe59..031eea1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -73,10 +73,21 @@ jobs: working-directory: docs run: npm run verify:llms + # E2E release gate — runs the release-profile real-cluster E2E before + # any publishing jobs. This ensures that every tagged release has been + # validated against real MySQL pods, PVCs, DNS, taints, failover, and + # network partition scenarios (WISHLIST #32). + e2e-gate: + name: E2E gate (release profile) + needs: ci-gate + uses: ./.github/workflows/_e2e.yml + with: + profile: release + draft-release: name: Create Draft Release runs-on: ubuntu-latest - needs: ci-gate + needs: [ci-gate, e2e-gate] steps: - uses: actions/checkout@v6 with: @@ -116,7 +127,7 @@ jobs: docker: name: Build and Push Docker Images runs-on: ubuntu-latest - needs: [ci-gate, draft-release] + needs: [ci-gate, e2e-gate, draft-release] strategy: matrix: include: diff --git a/AGENTS.md b/AGENTS.md index 10f9c54..c54c211 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,7 +1,7 @@ # Repository Guidelines ## Project Structure & Module Organization -Primary code lives in the root Go module. `cmd/bloodraven` is the Kubernetes operator entrypoint; `cmd/sidecar` is the per-MySQL sidecar; `cmd/kubectl-bloodraven` is the day-2 `kubectl` plugin (status / promote / reclone / backup / verify-backup, built via `make build-kubectl-plugin`). 
API types live in `api/v1alpha1`, controller logic in `internal/controller`, and supporting packages in `internal/mysql`, `internal/platform`, `internal/sidecar`, `internal/state`, and `internal/metrics`. End-to-end and scenario-style tests live in `test/e2e`. Treat `bitpoke/` and `orchestrator/` as bundled upstream references, not the default place for new feature work. +Primary code lives in the root Go module. `cmd/bloodraven` is the Kubernetes operator entrypoint; `cmd/sidecar` is the per-MySQL sidecar; `cmd/kubectl-bloodraven` is the day-2 `kubectl` plugin (status / promote / reclone / backup / verify-backup, built via `make build-kubectl-plugin`). API types live in `api/v1alpha1`, controller logic in `internal/controller`, and supporting packages in `internal/mysql`, `internal/platform`, `internal/sidecar`, `internal/state`, and `internal/metrics`. Real-cluster scenario tests live under `internal/playground/scenarios` and run through `cmd/playground-chaos`; faster cross-component tests live under `test/component`, with API-server/envtest coverage under `test/envtest`. Treat `bitpoke/` and `orchestrator/` as bundled upstream references, not the default place for new feature work. ## Build, Test, and Development Commands Run commands from the repository root: @@ -10,6 +10,8 @@ Run commands from the repository root: - `go build ./cmd/sidecar` builds the sidecar binary. - `make build-kubectl-plugin` builds `bin/kubectl-bloodraven` (the day-2 `kubectl` plugin). Override `KUBECTL_PLUGIN_VERSION=` to stamp a release; `make install-kubectl-plugin` drops the binary onto `$PATH`. - `make test` runs `go test ./...` across unit and e2e-style packages. +- `make test-e2e` runs the release profile of real-cluster E2E tests against the current playground cluster (requires kind/k3d/minikube context prepared with `./playground/setup.sh`; CI creates kind and runs setup first). +- `make test-e2e-smoke` runs the smoke profile (~3 scenarios, fast feedback). 
- `make vet` runs `go vet ./...`. - `make lint` runs `golangci-lint run ./...`. `golangci-lint` is not vendored; install it with `go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest` (it lands in `$(go env GOPATH)/bin`). CI installs the same tool with the same command in `.github/workflows/ci.yml`, so local and CI output match when you run this. - `make generate` refreshes API deep-copy code in `api/v1alpha1`. @@ -26,7 +28,7 @@ Use standard Go formatting: run `gofmt` on changed files and keep imports organi Structured-log `msg` strings and field names listed in `docs/docs/log-schema.mdx` are a public stability contract — downstream log pipelines filter on them. When you touch a log call site whose `msg` appears in that doc's Event reference, either preserve the `msg` string and the documented field set exactly, or update `docs/docs/log-schema.mdx` in the same PR and call out the break in the PR description. The same applies to field naming: log keys are `camelCase` (per the contract), not `snake_case`. ## Testing Guidelines -Add table-driven unit tests beside the code they cover, using the existing `*_test.go` layout under `internal/`. Put cross-component behavior tests in `test/e2e`. Some tests create local HTTP listeners with `httptest`, so restricted sandboxes may fail even when local developer runs pass. +Add table-driven unit tests beside the code they cover, using the existing `*_test.go` layout under `internal/`. Put cross-component behavior tests in `test/component`, API-server/controller-runtime tests in `test/envtest`, and real-cluster playground scenarios in `internal/playground/scenarios` through `cmd/playground-chaos`. Some tests create local HTTP listeners with `httptest`, so restricted sandboxes may fail even when local developer runs pass. ### Pre-PR gate (required, do not skip) Before pushing a branch that opens or updates a PR, run all of the following from the repo root and fix anything they report. 
Do **not** push expecting CI to find problems you could have caught locally — CI failures on lint or generate drift are round-trip latency and reviewer noise. @@ -89,7 +91,7 @@ Lessons from running chaos scenarios against a live k3d cluster: `./playground/rebuild.sh operator` builds, imports to k3d, and restarts the operator deployment. For sidecar changes, use `./playground/rebuild.sh sidecar` (restarts MySQL pods). Both can be combined: `./playground/rebuild.sh operator sidecar`. ### Automated chaos runner -A subset of `playground/chaos-scenarios.md` is automated by `cmd/playground-chaos` and exposed as Make targets: `make chaos-list`, `make chaos-check`, `make chaos-run SCENARIO=`, `make chaos-run-all`. The runner refuses to mutate any kubectl context outside the `_guard.sh` allowlist; on assertion failure it captures cluster YAML + pods + events + operator/sidecar logs + raw `/metrics` under `playground/chaos-results///` for triage. Use `--no-cleanup` to keep injected state in place for forensics. +A subset of `playground/chaos-scenarios.md` is automated by `cmd/playground-chaos` and exposed as Make targets: `make chaos-list`, `make chaos-check`, `make chaos-run SCENARIO=`, `make chaos-run-all`, `make chaos-run-all-profile PROFILE=smoke|release|full`. The runner supports three E2E profiles (`--profile=smoke|release|full`) that filter which scenarios run. The runner refuses to mutate any kubectl context outside the `_guard.sh` allowlist; on assertion failure it captures cluster YAML + pods + events + operator/sidecar logs + raw `/metrics` under `playground/chaos-results///` for triage. Use `--no-cleanup` to keep injected state in place for forensics. The runner stamps an in-progress marker on the MFG (`chaos.playground.bloodraven.io/in-progress`) after Precheck and clears it on cleanup. A subsequent run that finds a leftover marker refuses to start with a specific reason (live owner / abandoned / different host). 
Override with `--force` (delete the marker before preflight) or `--auto-reset` (on Precheck failure, shell out to `reset-mysql.sh + setup.sh` and retry once; 3s pause unless `CI=1`). `chaos-check` runs the same structural baseline scenarios use — stuck scale-to-0 deployments, bogus `lastFailoverTarget`, anti-flap cooldown still ticking, `NoPrimary` (both-sites-read-only), replication off on a non-active candidate — each with the exact remediation command in the error. diff --git a/Makefile b/Makefile index eca7ca6..631897a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ CONTROLLER_GEN ?= go run sigs.k8s.io/controller-tools/cmd/controller-gen -.PHONY: help generate manifests build build-bloodraven build-sidecar build-playground-chaos build-kubectl-plugin install-kubectl-plugin test test-unit test-component test-envtest test-e2e test-integration fmt vet lint docker-build chaos-list chaos-check chaos-run chaos-run-all +.PHONY: help generate manifests build build-bloodraven build-sidecar build-playground-chaos build-kubectl-plugin install-kubectl-plugin test test-unit test-component test-envtest test-e2e test-e2e-smoke test-integration fmt vet lint docker-build chaos-list chaos-check chaos-run chaos-run-all chaos-run-all-profile ##@ General @@ -90,10 +90,15 @@ test-component: ## Run component tests (cross-package with fakes, no real cluste test-envtest: ## Run envtest controller tests (real API server, no cluster) go test -race -tags envtest ./test/envtest/ -test-e2e: ## Run real cluster end-to-end tests (requires kind/k3d — Phase 4, not yet implemented) - @echo "Real cluster e2e tests are not yet implemented (Testing 2.0 Phase 4)." - @echo "See TESTING_2.0.md for the planned scenarios." 
- @exit 1 +E2E_PROFILE ?= release +E2E_JUNIT_OUT ?= playground/chaos-results/e2e-$(E2E_PROFILE)-junit.xml +E2E_ARGS ?= + +test-e2e: build-playground-chaos ## Run real-cluster E2E tests (E2E_PROFILE=release|smoke|full; requires kind/k3d) + ./bin/playground-chaos run-all --profile=$(E2E_PROFILE) --auto-reset --continue-on-failure --junit-out=$(E2E_JUNIT_OUT) $(E2E_ARGS) + +test-e2e-smoke: build-playground-chaos ## Run real-cluster E2E smoke (smoke profile — requires kind/k3d) + $(MAKE) test-e2e E2E_PROFILE=smoke E2E_JUNIT_OUT=playground/chaos-results/e2e-smoke-junit.xml test-integration: ## Run integration tests (network listener tests) go test -tags integration -race ./internal/platform/ ./test/component/ @@ -123,3 +128,7 @@ chaos-run: build-playground-chaos ## Run a single scenario (SCENARIO=) chaos-run-all: build-playground-chaos ## Run every registered chaos scenario in order ./bin/playground-chaos run-all + +chaos-run-all-profile: build-playground-chaos ## Run chaos scenarios filtered by profile (PROFILE=smoke|release|full) + @if [ -z "$(PROFILE)" ]; then echo "usage: make chaos-run-all-profile PROFILE=smoke"; exit 2; fi + ./bin/playground-chaos run-all --profile=$(PROFILE) diff --git a/WISHLIST.md b/WISHLIST.md index 6f83c10..e6ddf4d 100644 --- a/WISHLIST.md +++ b/WISHLIST.md @@ -5,13 +5,16 @@ - [ ] 7. Cross-region/cross-cluster DR as a first-class feature - [ ] 27. Backup/restore performance guide - [ ] 30. Public repo, license, release cadence -- [ ] 32. Real-cluster E2E CI gate +- [x] 32. Real-cluster E2E CI gate - [ ] 41. Safe Secret watch narrowing design - [ ] 42. Namespace-scoped watch/cache mode evaluation +- [ ] 43. Dedicated backup/PITR real-cluster E2E scenarios ## P0 — Production adoption blockers -**32. Real-cluster E2E CI gate.** Unit/component/envtest coverage is not enough for a MySQL failover operator. 
Add an optional-but-required-before-release k3d/kind CI job that installs the chart and exercises real MySQL pods, PVCs, Services, DNS/DNSEndpoint behavior, taints, planned failover, emergency failover, operator restart, PVC loss, NetworkPolicy partition, backup restore, and PITR verification. This should run at least on release tags and nightly; if cost is acceptable, run a reduced smoke subset on PRs. +**32. Real-cluster E2E CI gate.** Done: `make test-e2e` runs the release profile of `playground-chaos run-all` against a real cluster instead of the former placeholder. `make test-e2e-smoke` runs a fast smoke subset (3 scenarios). Three profiles (`smoke`/`release`/`full`) filter scenarios via `--profile` on `playground-chaos run-all` and `make chaos-run-all-profile PROFILE=`. CI uses a reusable workflow (`_e2e.yml`) that creates a kind cluster with Calico CNI, deploys the playground, and runs the selected profile. Nightly and manual runs use the release profile; PRs with the `e2e` label trigger a smoke run. Release publishing blocks on the E2E release-profile gate. JUnit, forensics, setup logs, and kind logs are uploaded as artifacts. Dedicated MySQL backup restore and PITR verification scenarios are split out as follow-up #43 so the gate can start enforcing the existing real-cluster chaos suite now without misrepresenting that coverage. + +**43. Dedicated backup/PITR real-cluster E2E scenarios.** Follow-up to #32: add release-profile playground-chaos scenarios that configure the playground backup profile against RustFS, trigger a real `MysqlBackup`, verify restore via `MysqlBackupVerification`, then enable PITR/binlog archival and verify a point-in-time replay with deterministic marker rows. The #32 gate now exists and is release-blocking, but this backup/PITR coverage should be added before claiming the E2E release profile exercises every backup/restore path. 
## P1 — DR and operational completeness diff --git a/charts/bloodraven/values.yaml b/charts/bloodraven/values.yaml index 9ecdbb4..34ddc72 100644 --- a/charts/bloodraven/values.yaml +++ b/charts/bloodraven/values.yaml @@ -161,6 +161,3 @@ auxiliary: # Beyond this limit, additional upgrades are rejected with 429. # Defends against an attacker pinning the hub's clients map. wsMaxClients: 100 - -# -- Install CRDs. Set to false if CRDs are managed externally. -installCRDs: true diff --git a/cmd/playground-chaos/main.go b/cmd/playground-chaos/main.go index bd70e3f..0f2b870 100644 --- a/cmd/playground-chaos/main.go +++ b/cmd/playground-chaos/main.go @@ -43,6 +43,7 @@ func main() { autoReset := rootFlags.Bool("auto-reset", false, "on precheck failure: run playground-chaos reset, then retry once (3s pause unless CI=1)") continueOnFailure := rootFlags.Bool("continue-on-failure", false, "run-all only: keep going past the first failure") junitOut := rootFlags.String("junit-out", "", "run-all only: write JUnit XML report to this path") + profile := rootFlags.String("profile", string(runner.DefaultProfile), "run-all only: scenario subset (smoke|release|full)") verbose := rootFlags.Bool("verbose", false, "verbose logging") kubeconfig := rootFlags.String("kubeconfig", "", "kubeconfig path (default: KUBECONFIG / ~/.kube/config)") kctx := rootFlags.String("context", "", "kubectl context to use (default: current-context)") @@ -97,7 +98,12 @@ func main() { } os.Exit(runOne(*kubeconfig, *kctx, *namespace, *fg, *resultsDir, *timeout, *noCleanup, *force, *autoReset, subArgs[0], logger)) case "run-all": - os.Exit(runAll(*kubeconfig, *kctx, *namespace, *fg, *resultsDir, *timeout, *noCleanup, *force, *autoReset, *continueOnFailure, *junitOut, logger)) + p := runner.Profile(*profile) + if !p.IsValid() { + fmt.Fprintf(os.Stderr, "invalid profile %q; valid: smoke, release, full\n", p) + os.Exit(exitFlagParse) + } + os.Exit(runAll(*kubeconfig, *kctx, *namespace, *fg, *resultsDir, *timeout, 
*noCleanup, *force, *autoReset, *continueOnFailure, *junitOut, p, logger)) default: fmt.Fprintf(os.Stderr, "unknown subcommand: %s\n", subcmd) usage() @@ -123,6 +129,7 @@ Flags: --auto-reset on precheck failure: playground-chaos reset, retry once --continue-on-failure run-all only: keep going past first failure --junit-out run-all only: write JUnit XML to path + --profile run-all only: scenario subset (smoke|release|full) --verbose verbose logging --kubeconfig kubeconfig path --context kubectl context @@ -316,7 +323,7 @@ func runReset(ctx context.Context, kubeconfig, kctx, currentCtx, namespace, fg, return nil } -func runAll(kubeconfig, kctx, namespace, fg, resultsDir string, timeout time.Duration, noCleanup, force, autoReset, continueOnFailure bool, junitOut string, logger *slog.Logger) int { +func runAll(kubeconfig, kctx, namespace, fg, resultsDir string, timeout time.Duration, noCleanup, force, autoReset, continueOnFailure bool, junitOut string, profile runner.Profile, logger *slog.Logger) int { k, err := loadKube(kubeconfig, kctx, false) if err != nil { fmt.Fprintln(os.Stderr, err) @@ -325,11 +332,14 @@ func runAll(kubeconfig, kctx, namespace, fg, resultsDir string, timeout time.Dur } return exitEnvironment } - scens := runner.DefaultRegistry.List() + scens := runner.SelectForProfile(runner.DefaultRegistry.List(), profile) if len(scens) == 0 { - fmt.Fprintln(os.Stderr, "no scenarios registered") + fmt.Fprintf(os.Stderr, "no scenarios selected for profile %q\n", profile) return exitFailure } + if profile != runner.ProfileFull && profile != "" { + fmt.Fprintf(os.Stderr, "Running profile %q: %d of %d scenarios\n", profile, len(scens), len(runner.DefaultRegistry.List())) + } if force { fmt.Fprintln(os.Stderr, "!! 
--force: will delete any prior chaos in-progress marker before each scenario's preflight") } diff --git a/docs/docs/gitops.mdx b/docs/docs/gitops.mdx index af0a865..af80845 100644 --- a/docs/docs/gitops.mdx +++ b/docs/docs/gitops.mdx @@ -13,10 +13,10 @@ Use this page for Argo CD or Flux-managed Bloodraven installs. It covers resourc Pick one CRD owner: -| Owner | Helm value | Notes | -|---|---|---| -| Bloodraven Helm release | `installCRDs=true` | Simple, but Helm CRD upgrades need care | -| Platform CRD app | `installCRDs=false` | Better for centralized CRD review and ordering | +| Owner | Notes | +|---|---| +| Bloodraven Helm release | Simple first install because Helm applies files in `charts/bloodraven/crds/`; CRD upgrades still need explicit review/application because Helm does not upgrade CRDs automatically. | +| Platform CRD app | Better for centralized CRD review and ordering; install the operator chart after the CRD app syncs. | Do not manage the same CRDs in both places. diff --git a/docs/docs/install-production.mdx b/docs/docs/install-production.mdx index 7b8ee7c..73db623 100644 --- a/docs/docs/install-production.mdx +++ b/docs/docs/install-production.mdx @@ -36,11 +36,10 @@ GitOps users should choose one owner for CRDs. Do not let both Helm and a separa ```bash helm upgrade --install bloodraven bloodraven/bloodraven \ --namespace bloodraven \ - --create-namespace \ - --set installCRDs=true + --create-namespace ``` -If CRDs are managed separately: +Helm installs CRDs from the chart's `crds/` directory on first install. 
If CRDs are managed separately: ```bash kubectl apply -f https://raw.githubusercontent.com/ShipStream/bloodraven/main/config/crd/bases/shipstream.io_mysqlfailovergroups.yaml @@ -48,7 +47,7 @@ kubectl apply -f https://raw.githubusercontent.com/ShipStream/bloodraven/main/co kubectl apply -f https://raw.githubusercontent.com/ShipStream/bloodraven/main/config/crd/bases/shipstream.io_mysqlbackupverifications.yaml ``` -Then install the operator with `--set installCRDs=false`. +Then install the operator chart after the platform CRD app has applied the CRDs. Helm does not upgrade CRDs in `crds/`; apply CRD updates explicitly during upgrades. ## Helm values diff --git a/docs/docs/production-install-examples.mdx b/docs/docs/production-install-examples.mdx index 44317f9..03c13f9 100644 --- a/docs/docs/production-install-examples.mdx +++ b/docs/docs/production-install-examples.mdx @@ -68,8 +68,6 @@ auxiliary: type: ClusterIP wsAllowedOrigins: https://dashboard.example.com wsMaxClients: 100 - -installCRDs: true ``` Install with: @@ -80,8 +78,7 @@ helm upgrade --install bloodraven bloodraven/bloodraven \ -f values-production.yaml ``` -If Argo CD owns CRDs separately, set `installCRDs: false` and commit the -CRDs under `charts/bloodraven/crds/` or your platform CRD app. +If Argo CD owns CRDs separately, commit the CRDs under `charts/bloodraven/crds/` or your platform CRD app and install the operator chart after that CRD app syncs. Helm installs chart CRDs on first install but does not upgrade them automatically. 
## NetworkPolicy diff --git a/examples/argocd-application.yaml b/examples/argocd-application.yaml index 8948a2a..4c99515 100644 --- a/examples/argocd-application.yaml +++ b/examples/argocd-application.yaml @@ -11,7 +11,6 @@ spec: targetRevision: 0.1.6 helm: values: | - installCRDs: false metrics: service: enabled: true diff --git a/examples/production-values.yaml b/examples/production-values.yaml index 83c4be9..bacc9bf 100644 --- a/examples/production-values.yaml +++ b/examples/production-values.yaml @@ -27,5 +27,3 @@ auxiliary: service: enabled: false wsAllowedOrigins: "https://dashboard.example.com" - -installCRDs: true diff --git a/internal/controller/bootstrap.go b/internal/controller/bootstrap.go index 10183d2..aff3498 100644 --- a/internal/controller/bootstrap.go +++ b/internal/controller/bootstrap.go @@ -40,7 +40,17 @@ func (b *BootstrapController) BootstrapReplica(ctx context.Context, opts Bootstr return fmt.Errorf("primary is read-only, cannot bootstrap from it") } - // Step 2: CLONE INSTANCE is a destructive administrative operation, but + // Step 2: Ensure the clone plugin is loaded on both sides. MySQL executes + // CLONE INSTANCE on the recipient, but the donor also needs the plugin or + // the recipient returns Error 3862 with donor Error 1524. + if err := opts.Primary.EnsureClonePlugin(ctx); err != nil { + return fmt.Errorf("ensure primary clone plugin: %w", err) + } + if err := opts.Replica.EnsureClonePlugin(ctx); err != nil { + return fmt.Errorf("ensure replica clone plugin: %w", err) + } + + // Step 3: CLONE INSTANCE is a destructive administrative operation, but // MySQL rejects it while the recipient has super_read_only enabled. 
if err := opts.Replica.SetSuperReadOnly(ctx, false); err != nil { return fmt.Errorf("disable replica super_read_only for clone: %w", err) diff --git a/internal/controller/bootstrap_test.go b/internal/controller/bootstrap_test.go index 07374b4..332b03e 100644 --- a/internal/controller/bootstrap_test.go +++ b/internal/controller/bootstrap_test.go @@ -116,6 +116,11 @@ func (b *bootstrapMock) WaitForRelayLogDrain(_ context.Context, _ time.Duration) return nil } +func (b *bootstrapMock) EnsureClonePlugin(_ context.Context) error { + b.record("EnsureClonePlugin") + return nil +} + func (b *bootstrapMock) SetCloneDonorList(_ context.Context, donor string) error { b.record("SetCloneDonorList") b.mu.Lock() @@ -159,13 +164,13 @@ func TestBootstrapReplica_HappyPath(t *testing.T) { // Primary should have CheckReadOnly called pCalls := primary.getCalls() - if len(pCalls) != 1 || pCalls[0] != "CheckReadOnly" { - t.Errorf("primary calls: got %v, want [CheckReadOnly]", pCalls) + if len(pCalls) != 2 || pCalls[0] != "CheckReadOnly" || pCalls[1] != "EnsureClonePlugin" { + t.Errorf("primary calls: got %v, want [CheckReadOnly EnsureClonePlugin]", pCalls) } // Replica should be thawed before clone, then cloned. 
rCalls := replica.getCalls() - expected := []string{"SetSuperReadOnly(OFF)", "SetReadOnly", "SetCloneDonorList", "KillAppConnections", "CloneInstance"} + expected := []string{"EnsureClonePlugin", "SetSuperReadOnly(OFF)", "SetReadOnly", "SetCloneDonorList", "KillAppConnections", "CloneInstance"} if len(rCalls) != len(expected) { t.Fatalf("replica calls: got %v, want %v", rCalls, expected) } diff --git a/internal/controller/failover_test.go b/internal/controller/failover_test.go index cebc1c2..82bba12 100644 --- a/internal/controller/failover_test.go +++ b/internal/controller/failover_test.go @@ -115,6 +115,11 @@ func (t *trackingMock) WaitForRelayLogDrain(_ context.Context, _ time.Duration) return err } +func (t *trackingMock) EnsureClonePlugin(_ context.Context) error { + t.record("EnsureClonePlugin") + return nil +} + func (t *trackingMock) SetCloneDonorList(_ context.Context, donor string) error { t.record("SetCloneDonorList") return nil diff --git a/internal/controller/init_users.go b/internal/controller/init_users.go index 9b3400f..241f84e 100644 --- a/internal/controller/init_users.go +++ b/internal/controller/init_users.go @@ -66,6 +66,17 @@ if [ -z "${MYSQL_REPLICATION_USER:-}" ] || [ -z "${MYSQL_REPLICATION_PASSWORD:-} exit 0 fi +install_clone_plugin() { + local installed + installed=$(MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -Nse "SELECT COUNT(*) FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME='clone'" 2>/dev/null || echo 0) + if [ "$installed" = "0" ]; then + echo "bloodraven-init: installing MySQL clone plugin" + MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -e "INSTALL PLUGIN clone SONAME 'mysql_clone.so';" + fi +} + +install_clone_plugin + REPL_USER=$(escape_sql "$MYSQL_REPLICATION_USER") REPL_PASS=$(escape_sql "$MYSQL_REPLICATION_PASSWORD") @@ -93,6 +104,15 @@ escape_sql() { printf '%s' "$val" } +install_clone_plugin() { + local installed + installed=$(MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -Nse "SELECT COUNT(*) FROM 
INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME='clone'" 2>/dev/null || echo 0) + if [ "$installed" = "0" ]; then + echo "bloodraven-init: installing MySQL clone plugin" + MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -e "INSTALL PLUGIN clone SONAME 'mysql_clone.so';" + fi +} + create_user_with_grants() { local user pass grants user=$(escape_sql "$(read_cred "$1" username)") @@ -113,7 +133,9 @@ EOSQL ` // Operator user — full admin for topology management, replication, cloning. - script += `create_user_with_grants operator "GRANT ALL PRIVILEGES ON *.* TO '__USER__'@'%' WITH GRANT OPTION;" + script += `install_clone_plugin + +create_user_with_grants operator "GRANT ALL PRIVILEGES ON *.* TO '__USER__'@'%' WITH GRANT OPTION;" ` if fg.Spec.Credentials.AppSecret != "" { diff --git a/internal/controller/reconciler.go b/internal/controller/reconciler.go index d6bdac5..20dab58 100644 --- a/internal/controller/reconciler.go +++ b/internal/controller/reconciler.go @@ -586,7 +586,7 @@ func (r *MysqlFailoverGroupReconciler) reconcileConfigMap(ctx context.Context, f labelManagedBy: managerName, } cm.Data = map[string]string{ - "my.cnf": generateMyCnf(fg), + "bloodraven.cnf": generateMyCnf(fg), } return nil }) @@ -826,9 +826,20 @@ func (r *MysqlFailoverGroupReconciler) reconcileDeployment(ctx context.Context, }) } + mysqlArgs := []string{ + fmt.Sprintf("--server-id=%d", serverID), + "--gtid-mode=ON", + "--enforce-gtid-consistency=ON", + "--log-bin=/var/lib/mysql/mysql-bin", + "--log-replica-updates=ON", + "--skip-replica-start=ON", + "--plugin-load-add=mysql_clone.so", + } + mysqlContainer := corev1.Container{ Name: "mysql", Image: image, + Args: mysqlArgs, Ports: []corev1.ContainerPort{ { Name: "mysql", @@ -1143,7 +1154,7 @@ func (r *MysqlFailoverGroupReconciler) reconcileDeployment(ctx context.Context, Image: image, Command: []string{ "sh", "-c", - fmt.Sprintf("cp /etc/mysql/config-map/* /etc/mysql/conf.d/ && printf '[mysqld]\\nserver-id=%d\\n' > 
/etc/mysql/conf.d/server-id.cnf", serverID), + fmt.Sprintf("cp /etc/mysql/config-map/bloodraven.cnf /etc/mysql/conf.d/bloodraven.cnf && printf '[mysqld]\\nserver-id=%d\\n' > /etc/mysql/conf.d/server-id.cnf", serverID), }, VolumeMounts: []corev1.VolumeMount{ {Name: "config", MountPath: "/etc/mysql/config-map"}, diff --git a/internal/controller/reconciler_test.go b/internal/controller/reconciler_test.go index fca7b0c..5352814 100644 --- a/internal/controller/reconciler_test.go +++ b/internal/controller/reconciler_test.go @@ -134,9 +134,9 @@ func TestReconcile_CreatesConfigMap(t *testing.T) { t.Fatalf("configmap not created: %v", err) } - myCnf, ok := cm.Data["my.cnf"] + myCnf, ok := cm.Data["bloodraven.cnf"] if !ok { - t.Fatal("my.cnf not found in configmap data") + t.Fatal("bloodraven.cnf not found in configmap data") } // Check for key config values @@ -363,7 +363,7 @@ func TestReconcile_TLSConfig(t *testing.T) { t.Fatalf("configmap not found: %v", err) } - myCnf := cm.Data["my.cnf"] + myCnf := cm.Data["bloodraven.cnf"] if !strings.Contains(myCnf, "require-secure-transport=ON") { t.Error("TLS-enabled config should contain require-secure-transport=ON") } @@ -394,7 +394,7 @@ func TestReconcile_MysqlConfOverrides(t *testing.T) { t.Fatalf("configmap not found: %v", err) } - myCnf := cm.Data["my.cnf"] + myCnf := cm.Data["bloodraven.cnf"] if !strings.Contains(myCnf, "max-connections=1000") { t.Error("override max-connections=1000 should be present") } diff --git a/internal/controller/topology_test.go b/internal/controller/topology_test.go index 1554c6d..096f3a9 100644 --- a/internal/controller/topology_test.go +++ b/internal/controller/topology_test.go @@ -108,6 +108,7 @@ func (m *mockMySQL) HasUserSchemas(_ context.Context) (bool, error) { } return m.gtidExecuted != "", m.userSchemasErr } +func (m *mockMySQL) EnsureClonePlugin(_ context.Context) error { return nil } func (m *mockMySQL) SetCloneDonorList(_ context.Context, _ string) error { return nil } func (m 
*mockMySQL) CloneInstance(_ context.Context, _, _, _ string, _ bool, _ int) error { return nil diff --git a/internal/controller/updater_test.go b/internal/controller/updater_test.go index 21b13da..11202ee 100644 --- a/internal/controller/updater_test.go +++ b/internal/controller/updater_test.go @@ -586,6 +586,7 @@ func (f *flappingChecker) StartReplicaSQLThread(_ context.Context) error { retur func (f *flappingChecker) WaitForRelayLogDrain(_ context.Context, _ time.Duration) error { return nil } +func (f *flappingChecker) EnsureClonePlugin(_ context.Context) error { return nil } func (f *flappingChecker) SetCloneDonorList(_ context.Context, _ string) error { return nil } func (f *flappingChecker) GetGtidExecuted(_ context.Context) (string, error) { return "", nil } func (f *flappingChecker) KillAppConnections(_ context.Context) (int, error) { return 0, nil } @@ -622,9 +623,12 @@ func (r *replicaStatusErrorChecker) StartReplicaSQLThread(_ context.Context) err func (r *replicaStatusErrorChecker) WaitForRelayLogDrain(_ context.Context, _ time.Duration) error { return nil } +func (r *replicaStatusErrorChecker) EnsureClonePlugin(_ context.Context) error { return nil } func (r *replicaStatusErrorChecker) SetCloneDonorList(_ context.Context, _ string) error { return nil } -func (r *replicaStatusErrorChecker) GetGtidExecuted(_ context.Context) (string, error) { return "", nil } -func (r *replicaStatusErrorChecker) KillAppConnections(_ context.Context) (int, error) { return 0, nil } +func (r *replicaStatusErrorChecker) GetGtidExecuted(_ context.Context) (string, error) { + return "", nil +} +func (r *replicaStatusErrorChecker) KillAppConnections(_ context.Context) (int, error) { return 0, nil } func (r *replicaStatusErrorChecker) CloneInstance(_ context.Context, _, _, _ string, _ bool, _ int) error { return nil } diff --git a/internal/mysql/checker.go b/internal/mysql/checker.go index 51f1e6b..dc48b57 100644 --- a/internal/mysql/checker.go +++ b/internal/mysql/checker.go @@ 
-32,6 +32,7 @@ type Checker interface { GetGtidExecuted(ctx context.Context) (string, error) // Clone plugin methods: + EnsureClonePlugin(ctx context.Context) error SetCloneDonorList(ctx context.Context, donor string) error CloneInstance(ctx context.Context, user, host, password string, useSSL bool, cloneTimeoutSec int) error } diff --git a/internal/mysql/clone.go b/internal/mysql/clone.go index e692a4d..1034aa1 100644 --- a/internal/mysql/clone.go +++ b/internal/mysql/clone.go @@ -2,12 +2,22 @@ package mysql import ( "context" + "errors" "fmt" + + mysqldriver "github.com/go-sql-driver/mysql" ) func (m *checker) SetCloneDonorList(ctx context.Context, donor string) error { _, err := m.db.ExecContext(ctx, "SET GLOBAL clone_valid_donor_list = ?", donor) if err != nil { + var mysqlErr *mysqldriver.MySQLError + if errors.As(err, &mysqlErr) && mysqlErr.Number == 1193 { + // MySQL 8.4+ no longer exposes clone_valid_donor_list even when + // the clone plugin is available. Older versions require this allowlist; + // newer versions can proceed directly to CLONE INSTANCE. + return nil + } return fmt.Errorf("set clone donor list: %w", err) } return nil @@ -18,6 +28,10 @@ func (m *checker) CloneInstance(ctx context.Context, user, host, password string cloneTimeoutSec = 3600 } + if err := m.EnsureClonePlugin(ctx); err != nil { + return fmt.Errorf("ensure clone plugin: %w", err) + } + // Set connection-level and global timeouts before cloning. // net_read_timeout and net_write_timeout are session-scoped and prevent the // server from dropping the connection during a long clone transfer. 
@@ -30,6 +44,12 @@ func (m *checker) CloneInstance(ctx context.Context, user, host, password string } for _, s := range timeoutStmts { if _, err := m.db.ExecContext(ctx, s); err != nil { + var mysqlErr *mysqldriver.MySQLError + if errors.As(err, &mysqlErr) && mysqlErr.Number == 1193 && s == fmt.Sprintf("SET GLOBAL clone_ddl_timeout = %d", cloneTimeoutSec) { + // clone_ddl_timeout was removed in newer MySQL releases; the + // connection-level timeouts above still apply, so continue. + continue + } return fmt.Errorf("set clone timeout (%s): %w", s, err) } } @@ -46,3 +66,25 @@ func (m *checker) CloneInstance(ctx context.Context, user, host, password string } return nil } + +func (m *checker) EnsureClonePlugin(ctx context.Context) error { + var installed int + if err := m.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME = 'clone'").Scan(&installed); err != nil { + return fmt.Errorf("check clone plugin: %w", err) + } + if installed > 0 { + return nil + } + + _, err := m.db.ExecContext(ctx, "INSTALL PLUGIN clone SONAME 'mysql_clone.so'") + if err != nil { + var mysqlErr *mysqldriver.MySQLError + if errors.As(err, &mysqlErr) && mysqlErr.Number == 1125 { + // Another bootstrap/setup path may have installed the plugin between + // the INFORMATION_SCHEMA check and this statement. 
+ return nil + } + return fmt.Errorf("install clone plugin: %w", err) + } + return nil +} diff --git a/internal/mysql/replication.go b/internal/mysql/replication.go index 754c9de..5ec9b33 100644 --- a/internal/mysql/replication.go +++ b/internal/mysql/replication.go @@ -157,6 +157,12 @@ func (m *checker) ShowReplicaStatus(ctx context.Context) (*ReplicaStatus, error) if v, ok := colMap["Last_Error"]; ok && v.Valid { rs.LastError = v.String } + if v, ok := colMap["Last_IO_Error"]; ok && v.Valid && rs.LastError == "" { + rs.LastError = v.String + } + if v, ok := colMap["Last_SQL_Error"]; ok && v.Valid && rs.LastError == "" { + rs.LastError = v.String + } if v, ok := colMap["Last_Errno"]; ok && v.Valid && rs.LastError == "" { // fallback } @@ -211,6 +217,12 @@ func (m *checker) ChangeReplicationSource(ctx context.Context, opts ReplicationS ) if opts.UseSSL { q += ", SOURCE_SSL=1" + } else { + // MySQL 8's default caching_sha2_password authentication needs + // the source's RSA public key for non-TLS replication channels. + // Without this, START REPLICA succeeds but the IO thread exits + // asynchronously, leaving the site permanently not-replicating. + q += ", GET_SOURCE_PUBLIC_KEY=1" } if _, err := m.db.ExecContext(ctx, q); err != nil { return fmt.Errorf("change replication source: %w", err) diff --git a/internal/playground/runner/junit.go b/internal/playground/runner/junit.go index 2d357e7..edde531 100644 --- a/internal/playground/runner/junit.go +++ b/internal/playground/runner/junit.go @@ -4,6 +4,7 @@ import ( "encoding/xml" "fmt" "os" + "path/filepath" ) // JUnitTestSuite is the surefire-flavor JUnit XML structure. @@ -19,10 +20,10 @@ type JUnitTestSuite struct { // JUnitTestCase is one scenario. 
type JUnitTestCase struct { - XMLName xml.Name `xml:"testcase"` - Name string `xml:"name,attr"` - Classname string `xml:"classname,attr"` - Time float64 `xml:"time,attr"` + XMLName xml.Name `xml:"testcase"` + Name string `xml:"name,attr"` + Classname string `xml:"classname,attr"` + Time float64 `xml:"time,attr"` Failure *junitFailure `xml:"failure,omitempty"` } @@ -49,7 +50,7 @@ func WriteJUnit(path string, results []Result) error { if !r.Passed { suite.Failures++ tc.Failure = &junitFailure{ - Type: fmt.Sprintf("%s/%s", r.Phase, r.StepName), + Type: fmt.Sprintf("%s/%s", r.Phase, r.StepName), Message: r.Failure, Body: fmt.Sprintf("phase=%s step=%q\n%s\n\nForensics: %s", r.Phase, r.StepName, r.Failure, r.CapturePath), @@ -65,6 +66,11 @@ func WriteJUnit(path string, results []Result) error { body = append([]byte(xml.Header), body...) body = append(body, '\n') + if dir := filepath.Dir(path); dir != "." && dir != "" { + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + } if err := os.WriteFile(path, body, 0o644); err != nil { return err } diff --git a/internal/playground/runner/profile.go b/internal/playground/runner/profile.go new file mode 100644 index 0000000..231cc40 --- /dev/null +++ b/internal/playground/runner/profile.go @@ -0,0 +1,98 @@ +package runner + +// Profile selects which scenarios run-all executes. The three profiles +// form a strict superset chain: smoke ⊂ release ⊂ full. +// +// - smoke: short PR-label/manual subset covering emergency failover, +// planned switchover, and operator restart durability (~3 scenarios). +// - release: curated release/nightly subset covering the WISHLIST #32 +// behaviours (emergency failover, planned switchover, operator restart, +// data integrity, operator kill during failover, self-fencing, +// network partition, PVC loss/re-bootstrap, old-primary recovery, +// failover state durability). +// - full: every registered scenario (existing run-all behaviour). 
+type Profile string + +const ( + ProfileSmoke Profile = "smoke" + ProfileRelease Profile = "release" + ProfileFull Profile = "full" +) + +// DefaultProfile is used when --profile is not supplied. +const DefaultProfile Profile = ProfileFull + +// smokeScenarios is the hard-coded smoke subset. These three scenarios +// exercise the critical path — emergency failover, planned switchover, +// operator restart — and complete in roughly 3-5 minutes on a warm +// playground cluster. +var smokeScenarios = map[string]bool{ + "01-clean-primary-kill": true, // emergency failover + "02-planned-switchover": true, // planned switchover + "02-operator-kill-restart": true, // operator restart durability +} + +// releaseScenarios is the hard-coded release/nightly subset. In addition +// to the smoke scenarios, this covers the behaviours called out in +// WISHLIST #32: real MySQL pods/PVCs/Services, DNS/DNSEndpoint, taints, +// planned failover, emergency failover, operator restart, PVC loss and +// re-bootstrap, network partition / self-fencing, old-primary recovery, +// and failover state durability across operator restarts. +var releaseScenarios = map[string]bool{ + // smoke scenarios (superset) + "01-clean-primary-kill": true, + "02-planned-switchover": true, + "02-operator-kill-restart": true, + // additional release scenarios + "04-data-integrity-on-failover": true, // data plane correctness + "05-operator-kill-during-failover": true, // operator resilience mid-failover + "06-self-fence-isolated-primary": true, // taint/DNS self-fencing + "09-network-partition-self-fence": true, // NetworkPolicy/partition + "10-full-bootstrap-after-data-wipe": true, // PVC loss → re-bootstrap + "12-old-primary-recovery-no-divergence": true, // old-primary recovery + "23-failover-state-durability": true, // state survives operator restart +} + +// Profiles returns the list of valid profile names for CLI help and +// validation. 
+func Profiles() []Profile { + return []Profile{ProfileSmoke, ProfileRelease, ProfileFull} +} + +// IsValid reports whether p is a recognised profile name. +func (p Profile) IsValid() bool { + switch p { + case ProfileSmoke, ProfileRelease, ProfileFull: + return true + default: + return false + } + +} + +// SelectForProfile filters the given scenario list to the subset that +// belongs to the requested profile. For ProfileFull (the default) all +// scenarios are returned unfiltered. Unknown scenario IDs in the profile +// allowlist are silently ignored so that adding new scenarios does not +// break existing profiles. +func SelectForProfile(all []Scenario, p Profile) []Scenario { + if p == ProfileFull || p == "" { + return all + } + var allowlist map[string]bool + switch p { + case ProfileSmoke: + allowlist = smokeScenarios + case ProfileRelease: + allowlist = releaseScenarios + default: + return all + } + var out []Scenario + for _, s := range all { + if allowlist[s.ID] { + out = append(out, s) + } + } + return out +} diff --git a/internal/playground/runner/profile_registry_test.go b/internal/playground/runner/profile_registry_test.go new file mode 100644 index 0000000..b7e8119 --- /dev/null +++ b/internal/playground/runner/profile_registry_test.go @@ -0,0 +1,30 @@ +package runner_test + +import ( + "testing" + + "github.com/shipstream/bloodraven/internal/playground/runner" + _ "github.com/shipstream/bloodraven/internal/playground/scenarios" +) + +func TestProfilesSelectRegisteredScenarios(t *testing.T) { + all := runner.DefaultRegistry.List() + if len(all) == 0 { + t.Fatal("no scenarios registered") + } + + smoke := runner.SelectForProfile(all, runner.ProfileSmoke) + if len(smoke) != 3 { + t.Fatalf("smoke profile selected %d scenarios, want 3", len(smoke)) + } + + release := runner.SelectForProfile(all, runner.ProfileRelease) + if len(release) != 10 { + t.Fatalf("release profile selected %d scenarios, want 10", len(release)) + } + + full := 
runner.SelectForProfile(all, runner.ProfileFull) + if len(full) != len(all) { + t.Fatalf("full profile selected %d scenarios, want all %d", len(full), len(all)) + } +} diff --git a/internal/playground/runner/profile_test.go b/internal/playground/runner/profile_test.go new file mode 100644 index 0000000..159d594 --- /dev/null +++ b/internal/playground/runner/profile_test.go @@ -0,0 +1,137 @@ +package runner + +import ( + "testing" +) + +func TestProfileIsValid(t *testing.T) { + for _, p := range []Profile{ProfileSmoke, ProfileRelease, ProfileFull} { + if !p.IsValid() { + t.Errorf("expected %q to be valid", p) + } + } + for _, p := range []Profile{"unknown", "", "partial"} { + if p.IsValid() { + t.Errorf("expected %q to be invalid", p) + } + } +} + +func TestProfilesReturnsAll(t *testing.T) { + got := Profiles() + if len(got) != 3 { + t.Fatalf("Profiles() returned %d entries, want 3", len(got)) + } + want := map[Profile]bool{ProfileSmoke: true, ProfileRelease: true, ProfileFull: true} + for _, p := range got { + if !want[p] { + t.Errorf("unexpected profile %q", p) + } + } +} + +func TestSelectForProfileFull(t *testing.T) { + all := []Scenario{ + {ID: "01-clean-primary-kill"}, + {ID: "02-planned-switchover"}, + {ID: "06-self-fence-isolated-primary"}, + } + got := SelectForProfile(all, ProfileFull) + if len(got) != 3 { + t.Fatalf("ProfileFull: got %d scenarios, want 3", len(got)) + } + // Empty string defaults to full + got2 := SelectForProfile(all, "") + if len(got2) != 3 { + t.Fatalf("empty profile: got %d scenarios, want 3", len(got2)) + } +} + +func TestSelectForProfileSmoke(t *testing.T) { + all := []Scenario{ + {ID: "01-clean-primary-kill"}, + {ID: "02-planned-switchover"}, + {ID: "02-operator-kill-restart"}, + {ID: "06-self-fence-isolated-primary"}, + {ID: "10-full-bootstrap-after-data-wipe"}, + } + got := SelectForProfile(all, ProfileSmoke) + if len(got) != 3 { + t.Fatalf("ProfileSmoke: got %d scenarios, want 3; got %v", len(got), ids(got)) + } + for _, s := 
range got { + if !smokeScenarios[s.ID] { + t.Errorf("ProfileSmoke returned unexpected scenario %q", s.ID) + } + } +} + +func TestSelectForProfileRelease(t *testing.T) { + all := []Scenario{ + {ID: "01-clean-primary-kill"}, + {ID: "02-planned-switchover"}, + {ID: "02-operator-kill-restart"}, + {ID: "04-data-integrity-on-failover"}, + {ID: "05-operator-kill-during-failover"}, + {ID: "06-self-fence-isolated-primary"}, + {ID: "09-network-partition-self-fence"}, + {ID: "10-full-bootstrap-after-data-wipe"}, + {ID: "12-old-primary-recovery-no-divergence"}, + {ID: "23-failover-state-durability"}, + {ID: "05-split-brain-auto-resolve"}, // not in release + } + got := SelectForProfile(all, ProfileRelease) + if len(got) != 10 { + t.Fatalf("ProfileRelease: got %d scenarios, want 10; got %v", len(got), ids(got)) + } + for _, s := range got { + if !releaseScenarios[s.ID] { + t.Errorf("ProfileRelease returned unexpected scenario %q", s.ID) + } + } +} + +func TestSelectForProfileReleaseIncludesSmoke(t *testing.T) { + // Every smoke scenario must also be in the release profile. + for id := range smokeScenarios { + if !releaseScenarios[id] { + t.Errorf("smoke scenario %q is not in release profile", id) + } + } +} + +func TestSelectForProfileUnknownAllowlistIgnoresMissing(t *testing.T) { + // If the allowlist references an ID that doesn't exist in the + // scenario list, SelectForProfile silently skips it (no panic, + // no error). This makes adding new scenarios to a profile safe + // even before the scenario is registered. 
+ all := []Scenario{{ID: "01-clean-primary-kill"}} + got := SelectForProfile(all, ProfileSmoke) + if len(got) != 1 || got[0].ID != "01-clean-primary-kill" { + t.Fatalf("ProfileSmoke with missing IDs: got %v", ids(got)) + } +} + +func TestSelectForProfileUnknownProfileReturnsAll(t *testing.T) { + all := []Scenario{{ID: "a"}, {ID: "b"}} + got := SelectForProfile(all, Profile("unknown")) + if len(got) != 2 { + t.Fatalf("unknown profile: got %d scenarios, want 2", len(got)) + } +} + +func TestSelectForProfileEmptyInput(t *testing.T) { + got := SelectForProfile(nil, ProfileSmoke) + if len(got) != 0 { + t.Fatalf("empty input: got %d scenarios, want 0", len(got)) + } +} + +// ids is a test helper that extracts scenario IDs. +func ids(scens []Scenario) []string { + out := make([]string, len(scens)) + for i, s := range scens { + out[i] = s.ID + } + return out +} diff --git a/internal/testutil/fakes.go b/internal/testutil/fakes.go index 21b9a95..fa37ee3 100644 --- a/internal/testutil/fakes.go +++ b/internal/testutil/fakes.go @@ -190,6 +190,13 @@ func (m *FakeMySQL) WaitForRelayLogDrain(_ context.Context, _ time.Duration) err return m.DrainErr } +func (m *FakeMySQL) EnsureClonePlugin(_ context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + m.record("EnsureClonePlugin") + return nil +} + func (m *FakeMySQL) SetCloneDonorList(_ context.Context, donor string) error { m.mu.Lock() defer m.mu.Unlock() diff --git a/playground/manifests/rustfs.yaml b/playground/manifests/rustfs.yaml index dac999a..f708b67 100644 --- a/playground/manifests/rustfs.yaml +++ b/playground/manifests/rustfs.yaml @@ -6,8 +6,8 @@ metadata: type: Opaque stringData: # AWS_* envs are what Dragonfly's S3 client reads. - AWS_ACCESS_KEY_ID: rustfsadmin - AWS_SECRET_ACCESS_KEY: rustfsadmin + AWS_ACCESS_KEY_ID: bloodraven-rustfs-access + AWS_SECRET_ACCESS_KEY: bloodraven-rustfs-secret AWS_REGION: us-east-1 # RUSTFS_* envs are what the rustfs server binary reads. 
Holding both # naming conventions in the same secret lets the rustfs Deployment @@ -17,8 +17,8 @@ stringData: # CreateContainerConfigError indefinitely (the secret exists in the # API server but the pod-scoped kubelet cache returns not-found and # never invalidates). - RUSTFS_ACCESS_KEY: rustfsadmin - RUSTFS_SECRET_KEY: rustfsadmin + RUSTFS_ACCESS_KEY: bloodraven-rustfs-access + RUSTFS_SECRET_KEY: bloodraven-rustfs-secret --- apiVersion: v1 kind: PersistentVolumeClaim diff --git a/playground/setup.sh b/playground/setup.sh index baa9a27..3836466 100755 --- a/playground/setup.sh +++ b/playground/setup.sh @@ -61,6 +61,14 @@ done source "$SCRIPT_DIR/_guard.sh" require_playground_context +HELM_INSTALL_CRDS=false +case "${BLOODRAVEN_SETUP_HELM_INSTALL_CRDS:-}" in + 1|true|TRUE|yes|YES) HELM_INSTALL_CRDS=true ;; +esac +if [[ "$HELM_INSTALL_CRDS" == "true" ]] && helm status bloodraven -n "$NAMESPACE" >/dev/null 2>&1; then + fail "BLOODRAVEN_SETUP_HELM_INSTALL_CRDS=1 requires a fresh Helm release. Helm installs CRDs from charts/bloodraven/crds only on first install and will not upgrade or repair them on helm upgrade; unset BLOODRAVEN_SETUP_HELM_INSTALL_CRDS to apply CRDs explicitly before upgrading." +fi + # Prefer docker over podman. k3d's podman support is experimental and the # tar-archive image-load path is slower than docker's native import. # Override with BLOODRAVEN_CONTAINER_RUNTIME=podman if you actually want @@ -98,8 +106,8 @@ info "Labeling nodes as site zones..." 
# Pick first two worker nodes (skip control-plane-only if possible) WORKERS=() for n in "${NODES[@]}"; do - role=$(kubectl get node "$n" -o jsonpath='{.metadata.labels.node-role\.kubernetes\.io/control-plane}' 2>/dev/null || true) - if [[ -z "$role" ]]; then + labels=$(kubectl get node "$n" --show-labels --no-headers 2>/dev/null || true) + if [[ "$labels" != *"node-role.kubernetes.io/control-plane"* && "$labels" != *"node-role.kubernetes.io/master"* ]]; then WORKERS+=("$n") fi done @@ -118,19 +126,23 @@ kubectl label node "${WORKERS[1]}" shipstream.io/site.playground=pdx --overwrite ok "Nodes labeled: ${WORKERS[0]}=iad, ${WORKERS[1]}=pdx" # ── 3. Build images ────────────────────────────────────────────────────── -info "Building operator and sidecar images..." -$RUNTIME build --target bloodraven -t bloodraven:playground "$PROJECT_ROOT" -$RUNTIME build --target sidecar -t bloodraven-sidecar:playground "$PROJECT_ROOT" +if [[ -n "${SKIP_IMAGE_BUILD:-}" ]]; then + info "SKIP_IMAGE_BUILD is set — skipping image builds (CI mode: images pre-built or pre-loaded)" +else + info "Building operator and sidecar images..." + $RUNTIME build --target bloodraven -t bloodraven:playground "$PROJECT_ROOT" + $RUNTIME build --target sidecar -t bloodraven-sidecar:playground "$PROJECT_ROOT" -info "Building counter-app image..." -$RUNTIME build -t bloodraven-counter:playground "$SCRIPT_DIR/counter-app" + info "Building counter-app image..." + $RUNTIME build -t bloodraven-counter:playground "$SCRIPT_DIR/counter-app" -info "Building dashboard image..." -$RUNTIME build -t bloodraven-dashboard:playground "$SCRIPT_DIR/dashboard" + info "Building dashboard image..." + $RUNTIME build -t bloodraven-dashboard:playground "$SCRIPT_DIR/dashboard" -info "Building dns-webhook image..." -$RUNTIME build -t bloodraven-dns-webhook:playground "$SCRIPT_DIR/dns-webhook" -ok "All images built" + info "Building dns-webhook image..." 
+ $RUNTIME build -t bloodraven-dns-webhook:playground "$SCRIPT_DIR/dns-webhook" + ok "All images built" +fi # ── 4. Auto-detect cluster tool and load images ────────────────────────── IMAGES=(bloodraven:playground bloodraven-sidecar:playground bloodraven-counter:playground bloodraven-dashboard:playground bloodraven-dns-webhook:playground) @@ -239,9 +251,13 @@ EOF ok "DNSEndpoint CRD installed" # ── 6. Install Bloodraven CRDs ─────────────────────────────────────────── -info "Installing Bloodraven CRDs..." -kubectl apply -f "$PROJECT_ROOT/charts/bloodraven/crds/" -ok "Bloodraven CRDs installed" +if [[ "$HELM_INSTALL_CRDS" == "true" ]]; then + info "Skipping manual Bloodraven CRD install; fresh Helm install will install chart CRDs from charts/bloodraven/crds" +else + info "Installing Bloodraven CRDs..." + kubectl apply -f "$PROJECT_ROOT/charts/bloodraven/crds/" + ok "Bloodraven CRDs installed" +fi # ── 7. Create namespace and deploy manifests ───────────────────────────── info "Creating namespace and deploying manifests..." 
@@ -274,12 +290,23 @@ helm upgrade --install bloodraven "$PROJECT_ROOT/charts/bloodraven" \ --set image.repository="${IMG_PREFIX}bloodraven" \ --set image.tag=playground \ --set image.pullPolicy=Never \ - --set installCRDs=false \ --set auxiliary.service.enabled=true \ --set 'nodeSelector=null' \ --set 'tolerations[0].key=node.kubernetes.io/disk-pressure' \ --set 'tolerations[0].operator=Exists' \ --set 'tolerations[0].effect=NoSchedule' \ + --set 'tolerations[1].key=shipstream.io/db-readonly-playground' \ + --set 'tolerations[1].operator=Exists' \ + --set 'tolerations[1].effect=NoSchedule' \ + --set 'tolerations[2].key=shipstream.io/db-readonly' \ + --set 'tolerations[2].operator=Exists' \ + --set 'tolerations[2].effect=NoSchedule' \ + --set 'tolerations[3].key=shipstream.io/db-readonly-playground' \ + --set 'tolerations[3].operator=Exists' \ + --set 'tolerations[3].effect=NoExecute' \ + --set 'tolerations[4].key=shipstream.io/db-readonly' \ + --set 'tolerations[4].operator=Exists' \ + --set 'tolerations[4].effect=NoExecute' \ --set leaderElection.enabled=false \ --timeout=180s # Don't use --wait; the operator may take a moment to pass readiness after @@ -358,21 +385,50 @@ REPL_USER=$(kubectl -n "$NAMESPACE" get secret mysql-credentials -o jsonpath='{. 
REPL_PASS=$(kubectl -n "$NAMESPACE" get secret mysql-credentials -o jsonpath='{.data.MYSQL_REPLICATION_PASSWORD}' | base64 -d) ROOT_PASS=$(kubectl -n "$NAMESPACE" get secret mysql-credentials -o jsonpath='{.data.MYSQL_ROOT_PASSWORD}' | base64 -d) for site in iad pdx; do - READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ - mysql "-uroot" "-p${ROOT_PASS}" -Nse "SELECT @@global.read_only" 2>/dev/null || echo 0) - SUPER_READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ - mysql "-uroot" "-p${ROOT_PASS}" -Nse "SELECT @@global.super_read_only" 2>/dev/null || echo 0) - if kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ - mysql "-uroot" "-p${ROOT_PASS}" -e \ - "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; \ - CREATE USER IF NOT EXISTS '${REPL_USER}'@'%' IDENTIFIED BY '${REPL_PASS}'; \ - GRANT REPLICATION SLAVE, REPLICATION CLIENT, BACKUP_ADMIN, CLONE_ADMIN ON *.* TO '${REPL_USER}'@'%'; \ - FLUSH PRIVILEGES;" 2>/dev/null; then - kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ - mysql "-uroot" "-p${ROOT_PASS}" -e "SET GLOBAL read_only=${READ_ONLY}; SET GLOBAL super_read_only=${SUPER_READ_ONLY};" 2>/dev/null || true - ok "Replication user created on $site" - else + CREATED=false + LAST_ERR="" + for attempt in $(seq 1 12); do + READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ + env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -Nse "SELECT @@global.read_only" 2>/dev/null || echo 0) + SUPER_READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ + env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -Nse "SELECT @@global.super_read_only" 2>/dev/null || echo 0) + LAST_ERR=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ + env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -e \ + "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; \ + 
INSTALL PLUGIN clone SONAME 'mysql_clone.so'; \ + CREATE USER IF NOT EXISTS '${REPL_USER}'@'%' IDENTIFIED BY '${REPL_PASS}'; \ + GRANT REPLICATION SLAVE, REPLICATION CLIENT, BACKUP_ADMIN, CLONE_ADMIN ON *.* TO '${REPL_USER}'@'%'; \ + FLUSH PRIVILEGES;" 2>&1) && CREATED=true || CREATED=false + if [[ "$CREATED" != "true" ]] && grep -Eq "(Function|Plugin) 'clone' already exists|already installed" <<<"$LAST_ERR"; then + LAST_ERR=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ + env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -e \ + "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; \ + CREATE USER IF NOT EXISTS '${REPL_USER}'@'%' IDENTIFIED BY '${REPL_PASS}'; \ + GRANT REPLICATION SLAVE, REPLICATION CLIENT, BACKUP_ADMIN, CLONE_ADMIN ON *.* TO '${REPL_USER}'@'%'; \ + FLUSH PRIVILEGES;" 2>&1) && CREATED=true || CREATED=false + fi + if [[ "$CREATED" == "true" ]]; then + kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ + env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -e "SET GLOBAL read_only=${READ_ONLY}; SET GLOBAL super_read_only=${SUPER_READ_ONLY};" 2>/dev/null || true + ok "Replication user created on $site" + break + fi + + # The operator init script also creates this user. If our explicit setup + # races with early bootstrap but the user is already present, keep going. + if kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \ + env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -Nse "SELECT CONCAT(user_exists, ':', clone_loaded) FROM (SELECT COUNT(*) AS user_exists FROM mysql.user WHERE user='${REPL_USER}' AND host='%') u CROSS JOIN (SELECT COUNT(*) AS clone_loaded FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME='clone') p" 2>/dev/null | grep -q '^1:1$'; then + ok "Replication user and clone plugin already exist on $site" + CREATED=true + break + fi + + warn "Replication user setup on $site failed (attempt $attempt/12); retrying..." 
+ sleep 5 + done + if [[ "$CREATED" != "true" ]]; then warn "Failed to create replication user on $site" + echo "$LAST_ERR" >&2 exit 1 fi done diff --git a/test/component/helpers_test.go b/test/component/helpers_test.go index 69cceba..b877789 100644 --- a/test/component/helpers_test.go +++ b/test/component/helpers_test.go @@ -123,6 +123,10 @@ func (m *mockMySQL) WaitForRelayLogDrain(_ context.Context, _ time.Duration) err return nil } +func (m *mockMySQL) EnsureClonePlugin(_ context.Context) error { + return nil +} + func (m *mockMySQL) SetCloneDonorList(_ context.Context, donor string) error { m.mu.Lock() defer m.mu.Unlock()