diff --git a/.github/kind/e2e-calico.yaml b/.github/kind/e2e-calico.yaml
new file mode 100644
index 0000000..fd236dc
--- /dev/null
+++ b/.github/kind/e2e-calico.yaml
@@ -0,0 +1,22 @@
+# kind cluster configuration for Bloodraven E2E tests.
+# Uses Calico CNI so NetworkPolicy resources are enforced (the default
+# kindnet CNI does not implement NetworkPolicy, which means partition /
+# self-fencing scenarios would silently pass without actually testing
+# policy behaviour).
+#
+# Usage:
+#   kind create cluster --config=.github/kind/e2e-calico.yaml
+#   # then install Calico:
+#   kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.28.0/manifests/calico.yaml
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: bloodraven-e2e
+nodes:
+  - role: control-plane
+  - role: worker
+  - role: worker
+networking:
+  # Disable kindnet so Calico can manage CNI instead.
+  disableDefaultCNI: true
+  # Match the stock Calico manifest's default IPv4 pool.
+  podSubnet: "192.168.0.0/16"
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index ab93840..06fb86f 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -105,6 +105,28 @@ Then enable GitHub Pages for the repository pointing at the `gh-pages` branch. T
 
 ---
 
+### `e2e.yml` / `_e2e.yml` — Real-Cluster E2E
+
+**Triggers:**
+- Nightly schedule (release profile)
+- Manual dispatch with profile selection (smoke / release / full)
+- Pull requests with the `e2e` label (smoke profile)
+
+The reusable workflow (`_e2e.yml`) creates a kind cluster with Calico CNI, deploys the playground, and runs `playground-chaos run-all` with the selected profile. It uploads JUnit results, chaos forensics, setup logs, and kind logs as artifacts.
+
+Profiles:
+| Profile | Scenarios | Use case |
+|---|---|---|
+| `smoke` | 3 (~3-5 min) | PR label gate, fast feedback |
+| `release` | 10 (~20-30 min) | Release and nightly gate |
+| `full` | All registered | Full regression (manual only) |
+
+The release workflow (`.github/workflows/release.yml`) blocks Docker image builds and Helm chart publishing on the E2E release-profile gate. This ensures every tagged release is validated against real MySQL failover scenarios (WISHLIST #32).
+
+**Permissions:** `contents: read` (default)
+
+---
+
 ### `scan.yml` — Trivy Security Scan
 
 **Triggers:** Pull requests targeting `main`
diff --git a/.github/workflows/_e2e.yml b/.github/workflows/_e2e.yml
new file mode 100644
index 0000000..b493be5
--- /dev/null
+++ b/.github/workflows/_e2e.yml
@@ -0,0 +1,125 @@
+# Reusable E2E workflow — creates a kind cluster, deploys the playground,
+# and runs playground-chaos with the selected profile.
+#
+# Called by:
+#   .github/workflows/e2e.yml (nightly, manual, PR label)
+#   .github/workflows/release.yml (release gate)
+name: E2E (reusable)
+
+on:
+  workflow_call:
+    inputs:
+      profile:
+        description: "Chaos profile (smoke|release|full)"
+        required: false
+        default: "release"
+        type: string
+      timeout-minutes:
+        description: "Job timeout in minutes"
+        required: false
+        default: 90
+        type: number
+
+permissions:
+  contents: read
+
+env:
+  BLOODRAVEN_SETUP_HELM_INSTALL_CRDS: "1"
+  SKIP_IMAGE_BUILD: "1"
+
+concurrency:
+  group: e2e-${{ github.workflow }}-${{ github.ref }}-${{ inputs.profile }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    name: Real-cluster E2E (${{ inputs.profile }})
+    runs-on: ubuntu-latest
+    timeout-minutes: ${{ inputs.timeout-minutes }}
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions/setup-go@v6
+        with:
+          go-version-file: go.mod
+          cache-dependency-path: go.sum
+
+      - name: Build playground-chaos
+        run: make build-playground-chaos
+
+      - name: Build Docker images
+        run: |
+          docker build --target bloodraven -t bloodraven:playground .
+          docker build --target sidecar -t bloodraven-sidecar:playground .
+          docker build -t bloodraven-counter:playground playground/counter-app
+          docker build -t bloodraven-dashboard:playground playground/dashboard
+          docker build -t bloodraven-dns-webhook:playground playground/dns-webhook
+
+      - name: Create kind cluster
+        uses: helm/kind-action@v1.12.0
+        with:
+          cluster_name: bloodraven-e2e
+          config: .github/kind/e2e-calico.yaml
+          # CNI is disabled in this kind config, so nodes cannot become
+          # Ready until Calico is installed in the next step.
+          wait: 0s
+
+      - name: Install Calico CNI
+        run: |
+          kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.28.0/manifests/calico.yaml
+          kubectl -n kube-system rollout status daemonset/calico-node --timeout=180s
+          kubectl wait nodes --all --for=condition=Ready --timeout=180s
+
+      - name: Load images into kind
+        run: |
+          kind load docker-image bloodraven:playground bloodraven-sidecar:playground bloodraven-counter:playground bloodraven-dashboard:playground bloodraven-dns-webhook:playground --name bloodraven-e2e
+
+      - name: Deploy playground
+        run: |
+          set -o pipefail
+          ./playground/setup.sh 2>&1 | tee playground/setup.log
+        timeout-minutes: 10
+
+      - name: Run E2E (${{ inputs.profile }} profile)
+        run: make test-e2e E2E_PROFILE=${{ inputs.profile }} E2E_JUNIT_OUT=playground/chaos-results/e2e-${{ inputs.profile }}-junit.xml
+        timeout-minutes: ${{ inputs.timeout-minutes }}
+
+      - name: Upload JUnit results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-${{ inputs.profile }}-junit
+          path: playground/chaos-results/e2e-${{ inputs.profile }}-junit.xml
+          retention-days: 30
+
+      - name: Upload chaos forensics
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-${{ inputs.profile }}-forensics
+          path: playground/chaos-results/
+          retention-days: 30
+
+      - name: Export kind logs
+        if: failure()
+        run: |
+          mkdir -p /tmp/kind-logs
+          kind export logs --name=bloodraven-e2e /tmp/kind-logs || true
+        continue-on-error: true  # best-effort diagnostics; the artifact upload is the next step
+
+      - name: Upload kind logs artifact
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-${{ inputs.profile }}-kind-logs
+          path: /tmp/kind-logs/
+          retention-days: 14
+        continue-on-error: true
+
+      - name: Upload setup logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-${{ inputs.profile }}-setup-logs
+          path: playground/setup.log
+          retention-days: 14
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
new file mode 100644
index 0000000..d6777de
--- /dev/null
+++ b/.github/workflows/e2e.yml
@@ -0,0 +1,39 @@
+# E2E trigger workflow — nightly, manual, and PR-label-gated.
+# The reusable workflow is in .github/workflows/_e2e.yml.
+name: E2E
+
+on:
+  # Nightly release-profile run
+  schedule:
+    - cron: "0 5 * * *" # 05:00 UTC daily
+
+  # Manual dispatch with profile selection
+  workflow_dispatch:
+    inputs:
+      profile:
+        description: "Chaos profile (smoke|release|full)"
+        required: false
+        default: "release"
+        type: choice
+        options:
+          - smoke
+          - release
+          - full
+
+  # PR label gate: run smoke while the "e2e" label is present.
+  pull_request:
+    types: [opened, reopened, synchronize, labeled]
+
+permissions:
+  contents: read
+
+jobs:
+  # Skip PR-triggered runs unless the "e2e" label is present.
+  e2e:
+    if: >-
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'e2e'))
+    uses: ./.github/workflows/_e2e.yml
+    with:
+      profile: ${{ github.event_name == 'pull_request' && 'smoke' || (github.event.inputs.profile || 'release') }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 05efe59..031eea1 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -73,10 +73,21 @@ jobs:
         working-directory: docs
         run: npm run verify:llms
 
+  # E2E release gate — runs the release-profile real-cluster E2E before
+  # any publishing jobs. This ensures that every tagged release has been
+  # validated against real MySQL pods, PVCs, DNS, taints, failover, and
+  # network partition scenarios (WISHLIST #32).
+  e2e-gate:
+    name: E2E gate (release profile)
+    needs: ci-gate
+    uses: ./.github/workflows/_e2e.yml
+    with:
+      profile: release
+
   draft-release:
     name: Create Draft Release
     runs-on: ubuntu-latest
-    needs: ci-gate
+    needs: [ci-gate, e2e-gate]
     steps:
       - uses: actions/checkout@v6
         with:
@@ -116,7 +127,7 @@ jobs:
   docker:
     name: Build and Push Docker Images
     runs-on: ubuntu-latest
-    needs: [ci-gate, draft-release]
+    needs: [ci-gate, e2e-gate, draft-release]
     strategy:
       matrix:
         include:
diff --git a/AGENTS.md b/AGENTS.md
index 10f9c54..c54c211 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,7 +1,7 @@
 # Repository Guidelines
 
 ## Project Structure & Module Organization
-Primary code lives in the root Go module. `cmd/bloodraven` is the Kubernetes operator entrypoint; `cmd/sidecar` is the per-MySQL sidecar; `cmd/kubectl-bloodraven` is the day-2 `kubectl` plugin (status / promote / reclone / backup / verify-backup, built via `make build-kubectl-plugin`). API types live in `api/v1alpha1`, controller logic in `internal/controller`, and supporting packages in `internal/mysql`, `internal/platform`, `internal/sidecar`, `internal/state`, and `internal/metrics`. End-to-end and scenario-style tests live in `test/e2e`. Treat `bitpoke/` and `orchestrator/` as bundled upstream references, not the default place for new feature work.
+Primary code lives in the root Go module. `cmd/bloodraven` is the Kubernetes operator entrypoint; `cmd/sidecar` is the per-MySQL sidecar; `cmd/kubectl-bloodraven` is the day-2 `kubectl` plugin (status / promote / reclone / backup / verify-backup, built via `make build-kubectl-plugin`). API types live in `api/v1alpha1`, controller logic in `internal/controller`, and supporting packages in `internal/mysql`, `internal/platform`, `internal/sidecar`, `internal/state`, and `internal/metrics`. Real-cluster scenario tests live under `internal/playground/scenarios` and run through `cmd/playground-chaos`; faster cross-component tests live under `test/component`, with API-server/envtest coverage under `test/envtest`. Treat `bitpoke/` and `orchestrator/` as bundled upstream references, not the default place for new feature work.
 
 ## Build, Test, and Development Commands
 Run commands from the repository root:
@@ -10,6 +10,8 @@ Run commands from the repository root:
 - `go build ./cmd/sidecar` builds the sidecar binary.
 - `make build-kubectl-plugin` builds `bin/kubectl-bloodraven` (the day-2 `kubectl` plugin). Override `KUBECTL_PLUGIN_VERSION=<tag>` to stamp a release; `make install-kubectl-plugin` drops the binary onto `$PATH`.
 - `make test` runs `go test ./...` across unit and e2e-style packages.
+- `make test-e2e` runs the release profile of the real-cluster E2E tests against the current playground cluster (requires a kind/k3d/minikube context prepared with `./playground/setup.sh`; CI creates a kind cluster and runs setup first).
+- `make test-e2e-smoke` runs the smoke profile (~3 scenarios, fast feedback).
 - `make vet` runs `go vet ./...`.
 - `make lint` runs `golangci-lint run ./...`. `golangci-lint` is not vendored; install it with `go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest` (it lands in `$(go env GOPATH)/bin`). CI installs the same tool with the same command in `.github/workflows/ci.yml`, so local and CI output match when you run this.
 - `make generate` refreshes API deep-copy code in `api/v1alpha1`.
@@ -26,7 +28,7 @@ Use standard Go formatting: run `gofmt` on changed files and keep imports organi
 Structured-log `msg` strings and field names listed in `docs/docs/log-schema.mdx` are a public stability contract — downstream log pipelines filter on them. When you touch a log call site whose `msg` appears in that doc's Event reference, either preserve the `msg` string and the documented field set exactly, or update `docs/docs/log-schema.mdx` in the same PR and call out the break in the PR description. The same applies to field naming: log keys are `camelCase` (per the contract), not `snake_case`.
 
 ## Testing Guidelines
-Add table-driven unit tests beside the code they cover, using the existing `*_test.go` layout under `internal/`. Put cross-component behavior tests in `test/e2e`. Some tests create local HTTP listeners with `httptest`, so restricted sandboxes may fail even when local developer runs pass.
+Add table-driven unit tests beside the code they cover, using the existing `*_test.go` layout under `internal/`. Put cross-component behavior tests in `test/component`, API-server/controller-runtime tests in `test/envtest`, and real-cluster playground scenarios in `internal/playground/scenarios` through `cmd/playground-chaos`. Some tests create local HTTP listeners with `httptest`, so restricted sandboxes may fail even when local developer runs pass.
 
 ### Pre-PR gate (required, do not skip)
 Before pushing a branch that opens or updates a PR, run all of the following from the repo root and fix anything they report. Do **not** push expecting CI to find problems you could have caught locally — CI failures on lint or generate drift are round-trip latency and reviewer noise.
@@ -89,7 +91,7 @@ Lessons from running chaos scenarios against a live k3d cluster:
 `./playground/rebuild.sh operator` builds, imports to k3d, and restarts the operator deployment. For sidecar changes, use `./playground/rebuild.sh sidecar` (restarts MySQL pods). Both can be combined: `./playground/rebuild.sh operator sidecar`.
 
 ### Automated chaos runner
-A subset of `playground/chaos-scenarios.md` is automated by `cmd/playground-chaos` and exposed as Make targets: `make chaos-list`, `make chaos-check`, `make chaos-run SCENARIO=<id>`, `make chaos-run-all`. The runner refuses to mutate any kubectl context outside the `_guard.sh` allowlist; on assertion failure it captures cluster YAML + pods + events + operator/sidecar logs + raw `/metrics` under `playground/chaos-results/<timestamp>/<scenario-id>/` for triage. Use `--no-cleanup` to keep injected state in place for forensics.
+A subset of `playground/chaos-scenarios.md` is automated by `cmd/playground-chaos` and exposed as Make targets: `make chaos-list`, `make chaos-check`, `make chaos-run SCENARIO=<id>`, `make chaos-run-all`, `make chaos-run-all-profile PROFILE=smoke|release|full`. The runner supports three E2E profiles (`--profile=smoke|release|full`) that filter which scenarios run. The runner refuses to mutate any kubectl context outside the `_guard.sh` allowlist; on assertion failure it captures cluster YAML + pods + events + operator/sidecar logs + raw `/metrics` under `playground/chaos-results/<timestamp>/<scenario-id>/` for triage. Use `--no-cleanup` to keep injected state in place for forensics.
 
 The runner stamps an in-progress marker on the MFG (`chaos.playground.bloodraven.io/in-progress`) after Precheck and clears it on cleanup. A subsequent run that finds a leftover marker refuses to start with a specific reason (live owner / abandoned / different host). Override with `--force` (delete the marker before preflight) or `--auto-reset` (on Precheck failure, shell out to `reset-mysql.sh + setup.sh` and retry once; 3s pause unless `CI=1`). `chaos-check` runs the same structural baseline scenarios use — stuck scale-to-0 deployments, bogus `lastFailoverTarget`, anti-flap cooldown still ticking, `NoPrimary` (both-sites-read-only), replication off on a non-active candidate — each with the exact remediation command in the error.
 
diff --git a/Makefile b/Makefile
index eca7ca6..631897a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 CONTROLLER_GEN ?= go run sigs.k8s.io/controller-tools/cmd/controller-gen
 
-.PHONY: help generate manifests build build-bloodraven build-sidecar build-playground-chaos build-kubectl-plugin install-kubectl-plugin test test-unit test-component test-envtest test-e2e test-integration fmt vet lint docker-build chaos-list chaos-check chaos-run chaos-run-all
+.PHONY: help generate manifests build build-bloodraven build-sidecar build-playground-chaos build-kubectl-plugin install-kubectl-plugin test test-unit test-component test-envtest test-e2e test-e2e-smoke test-integration fmt vet lint docker-build chaos-list chaos-check chaos-run chaos-run-all chaos-run-all-profile
 
 ##@ General
 
@@ -90,10 +90,15 @@ test-component: ## Run component tests (cross-package with fakes, no real cluste
 test-envtest: ## Run envtest controller tests (real API server, no cluster)
 	go test -race -tags envtest ./test/envtest/
 
-test-e2e: ## Run real cluster end-to-end tests (requires kind/k3d — Phase 4, not yet implemented)
-	@echo "Real cluster e2e tests are not yet implemented (Testing 2.0 Phase 4)."
-	@echo "See TESTING_2.0.md for the planned scenarios."
-	@exit 1
+E2E_PROFILE ?= release
+E2E_JUNIT_OUT ?= playground/chaos-results/e2e-$(E2E_PROFILE)-junit.xml
+E2E_ARGS ?=
+
+test-e2e: build-playground-chaos ## Run real-cluster E2E tests (E2E_PROFILE=release|smoke|full; requires kind/k3d)
+	./bin/playground-chaos run-all --profile=$(E2E_PROFILE) --auto-reset --continue-on-failure --junit-out=$(E2E_JUNIT_OUT) $(E2E_ARGS)
+
+test-e2e-smoke: build-playground-chaos ## Run real-cluster E2E smoke (smoke profile — requires kind/k3d)
+	$(MAKE) test-e2e E2E_PROFILE=smoke E2E_JUNIT_OUT=playground/chaos-results/e2e-smoke-junit.xml
 
 test-integration: ## Run integration tests (network listener tests)
 	go test -tags integration -race ./internal/platform/ ./test/component/
@@ -123,3 +128,7 @@ chaos-run: build-playground-chaos ## Run a single scenario (SCENARIO=<id>)
 
 chaos-run-all: build-playground-chaos ## Run every registered chaos scenario in order
 	./bin/playground-chaos run-all
+
+chaos-run-all-profile: build-playground-chaos ## Run chaos scenarios filtered by profile (PROFILE=smoke|release|full)
+	@if [ -z "$(PROFILE)" ]; then echo "usage: make chaos-run-all-profile PROFILE=smoke"; exit 2; fi
+	./bin/playground-chaos run-all --profile=$(PROFILE)
diff --git a/WISHLIST.md b/WISHLIST.md
index 6f83c10..e6ddf4d 100644
--- a/WISHLIST.md
+++ b/WISHLIST.md
@@ -5,13 +5,16 @@
 - [ ] 7. Cross-region/cross-cluster DR as a first-class feature
 - [ ] 27. Backup/restore performance guide
 - [ ] 30. Public repo, license, release cadence
-- [ ] 32. Real-cluster E2E CI gate
+- [x] 32. Real-cluster E2E CI gate
 - [ ] 41. Safe Secret watch narrowing design
 - [ ] 42. Namespace-scoped watch/cache mode evaluation
+- [ ] 43. Dedicated backup/PITR real-cluster E2E scenarios
 
 ## P0 — Production adoption blockers
 
-**32. Real-cluster E2E CI gate.** Unit/component/envtest coverage is not enough for a MySQL failover operator. Add an optional-but-required-before-release k3d/kind CI job that installs the chart and exercises real MySQL pods, PVCs, Services, DNS/DNSEndpoint behavior, taints, planned failover, emergency failover, operator restart, PVC loss, NetworkPolicy partition, backup restore, and PITR verification. This should run at least on release tags and nightly; if cost is acceptable, run a reduced smoke subset on PRs.
+**32. Real-cluster E2E CI gate.** Done: `make test-e2e` runs the release profile of `playground-chaos run-all` against a real cluster instead of the former placeholder. `make test-e2e-smoke` runs a fast smoke subset (3 scenarios). Three profiles (`smoke`/`release`/`full`) filter scenarios via `--profile` on `playground-chaos run-all` and `make chaos-run-all-profile PROFILE=`. CI uses a reusable workflow (`_e2e.yml`) that creates a kind cluster with Calico CNI, deploys the playground, and runs the selected profile. Nightly and manual runs use the release profile; PRs with the `e2e` label trigger a smoke run. Release publishing blocks on the E2E release-profile gate. JUnit, forensics, setup logs, and kind logs are uploaded as artifacts. Dedicated MySQL backup restore and PITR verification scenarios are split out as follow-up #43 so the gate can start enforcing the existing real-cluster chaos suite now without misrepresenting that coverage.
+
+**43. Dedicated backup/PITR real-cluster E2E scenarios.** Follow-up to #32: add release-profile playground-chaos scenarios that configure the playground backup profile against RustFS, trigger a real `MysqlBackup`, verify restore via `MysqlBackupVerification`, then enable PITR/binlog archival and verify a point-in-time replay with deterministic marker rows. The #32 gate now exists and is release-blocking, but this backup/PITR coverage should be added before claiming the E2E release profile exercises every backup/restore path.
 
 ## P1 — DR and operational completeness
 
diff --git a/charts/bloodraven/values.yaml b/charts/bloodraven/values.yaml
index 9ecdbb4..34ddc72 100644
--- a/charts/bloodraven/values.yaml
+++ b/charts/bloodraven/values.yaml
@@ -161,6 +161,3 @@ auxiliary:
     # Beyond this limit, additional upgrades are rejected with 429.
     # Defends against an attacker pinning the hub's clients map.
     wsMaxClients: 100
-
-# -- Install CRDs. Set to false if CRDs are managed externally.
-installCRDs: true
diff --git a/cmd/playground-chaos/main.go b/cmd/playground-chaos/main.go
index bd70e3f..0f2b870 100644
--- a/cmd/playground-chaos/main.go
+++ b/cmd/playground-chaos/main.go
@@ -43,6 +43,7 @@ func main() {
 	autoReset := rootFlags.Bool("auto-reset", false, "on precheck failure: run playground-chaos reset, then retry once (3s pause unless CI=1)")
 	continueOnFailure := rootFlags.Bool("continue-on-failure", false, "run-all only: keep going past the first failure")
 	junitOut := rootFlags.String("junit-out", "", "run-all only: write JUnit XML report to this path")
+	profile := rootFlags.String("profile", string(runner.DefaultProfile), "run-all only: scenario subset (smoke|release|full)")
 	verbose := rootFlags.Bool("verbose", false, "verbose logging")
 	kubeconfig := rootFlags.String("kubeconfig", "", "kubeconfig path (default: KUBECONFIG / ~/.kube/config)")
 	kctx := rootFlags.String("context", "", "kubectl context to use (default: current-context)")
@@ -97,7 +98,12 @@ func main() {
 		}
 		os.Exit(runOne(*kubeconfig, *kctx, *namespace, *fg, *resultsDir, *timeout, *noCleanup, *force, *autoReset, subArgs[0], logger))
 	case "run-all":
-		os.Exit(runAll(*kubeconfig, *kctx, *namespace, *fg, *resultsDir, *timeout, *noCleanup, *force, *autoReset, *continueOnFailure, *junitOut, logger))
+		p := runner.Profile(*profile)
+		if !p.IsValid() {
+			fmt.Fprintf(os.Stderr, "invalid profile %q; valid: smoke, release, full\n", p)
+			os.Exit(exitFlagParse)
+		}
+		os.Exit(runAll(*kubeconfig, *kctx, *namespace, *fg, *resultsDir, *timeout, *noCleanup, *force, *autoReset, *continueOnFailure, *junitOut, p, logger))
 	default:
 		fmt.Fprintf(os.Stderr, "unknown subcommand: %s\n", subcmd)
 		usage()
@@ -123,6 +129,7 @@ Flags:
   --auto-reset           on precheck failure: playground-chaos reset, retry once
   --continue-on-failure  run-all only: keep going past first failure
   --junit-out            run-all only: write JUnit XML to path
+  --profile              run-all only: scenario subset (smoke|release|full)
   --verbose              verbose logging
   --kubeconfig           kubeconfig path
   --context              kubectl context
@@ -316,7 +323,7 @@ func runReset(ctx context.Context, kubeconfig, kctx, currentCtx, namespace, fg,
 	return nil
 }
 
-func runAll(kubeconfig, kctx, namespace, fg, resultsDir string, timeout time.Duration, noCleanup, force, autoReset, continueOnFailure bool, junitOut string, logger *slog.Logger) int {
+func runAll(kubeconfig, kctx, namespace, fg, resultsDir string, timeout time.Duration, noCleanup, force, autoReset, continueOnFailure bool, junitOut string, profile runner.Profile, logger *slog.Logger) int {
 	k, err := loadKube(kubeconfig, kctx, false)
 	if err != nil {
 		fmt.Fprintln(os.Stderr, err)
@@ -325,11 +332,14 @@ func runAll(kubeconfig, kctx, namespace, fg, resultsDir string, timeout time.Dur
 		}
 		return exitEnvironment
 	}
-	scens := runner.DefaultRegistry.List()
+	scens := runner.SelectForProfile(runner.DefaultRegistry.List(), profile)
 	if len(scens) == 0 {
-		fmt.Fprintln(os.Stderr, "no scenarios registered")
+		fmt.Fprintf(os.Stderr, "no scenarios selected for profile %q\n", profile)
 		return exitFailure
 	}
+	if profile != runner.ProfileFull && profile != "" {
+		fmt.Fprintf(os.Stderr, "Running profile %q: %d of %d scenarios\n", profile, len(scens), len(runner.DefaultRegistry.List()))
+	}
 	if force {
 		fmt.Fprintln(os.Stderr, "!! --force: will delete any prior chaos in-progress marker before each scenario's preflight")
 	}
diff --git a/docs/docs/gitops.mdx b/docs/docs/gitops.mdx
index af0a865..af80845 100644
--- a/docs/docs/gitops.mdx
+++ b/docs/docs/gitops.mdx
@@ -13,10 +13,10 @@ Use this page for Argo CD or Flux-managed Bloodraven installs. It covers resourc
 
 Pick one CRD owner:
 
-| Owner | Helm value | Notes |
-|---|---|---|
-| Bloodraven Helm release | `installCRDs=true` | Simple, but Helm CRD upgrades need care |
-| Platform CRD app | `installCRDs=false` | Better for centralized CRD review and ordering |
+| Owner | Notes |
+|---|---|
+| Bloodraven Helm release | Simple first install because Helm applies files in `charts/bloodraven/crds/`; CRD upgrades still need explicit review/application because Helm does not upgrade CRDs automatically. |
+| Platform CRD app | Better for centralized CRD review and ordering; install the operator chart after the CRD app syncs. |
 
 Do not manage the same CRDs in both places.
 
diff --git a/docs/docs/install-production.mdx b/docs/docs/install-production.mdx
index 7b8ee7c..73db623 100644
--- a/docs/docs/install-production.mdx
+++ b/docs/docs/install-production.mdx
@@ -36,11 +36,10 @@ GitOps users should choose one owner for CRDs. Do not let both Helm and a separa
 ```bash
 helm upgrade --install bloodraven bloodraven/bloodraven \
   --namespace bloodraven \
-  --create-namespace \
-  --set installCRDs=true
+  --create-namespace
 ```
 
-If CRDs are managed separately:
+Helm installs CRDs from the chart's `crds/` directory on first install. If CRDs are managed separately:
 
 ```bash
 kubectl apply -f https://raw.githubusercontent.com/ShipStream/bloodraven/main/config/crd/bases/shipstream.io_mysqlfailovergroups.yaml
@@ -48,7 +47,7 @@ kubectl apply -f https://raw.githubusercontent.com/ShipStream/bloodraven/main/co
 kubectl apply -f https://raw.githubusercontent.com/ShipStream/bloodraven/main/config/crd/bases/shipstream.io_mysqlbackupverifications.yaml
 ```
 
-Then install the operator with `--set installCRDs=false`.
+Then install the operator chart once the CRDs have been applied (by the commands above or by your platform CRD app). Helm does not upgrade CRDs in `crds/`; apply CRD updates explicitly during upgrades.
 
 ## Helm values
 
diff --git a/docs/docs/production-install-examples.mdx b/docs/docs/production-install-examples.mdx
index 44317f9..03c13f9 100644
--- a/docs/docs/production-install-examples.mdx
+++ b/docs/docs/production-install-examples.mdx
@@ -68,8 +68,6 @@ auxiliary:
     type: ClusterIP
     wsAllowedOrigins: https://dashboard.example.com
     wsMaxClients: 100
-
-installCRDs: true
 ```
 
 Install with:
@@ -80,8 +78,7 @@ helm upgrade --install bloodraven bloodraven/bloodraven \
   -f values-production.yaml
 ```
 
-If Argo CD owns CRDs separately, set `installCRDs: false` and commit the
-CRDs under `charts/bloodraven/crds/` or your platform CRD app.
+If Argo CD owns CRDs separately, commit the CRDs under `charts/bloodraven/crds/` or your platform CRD app and install the operator chart after that CRD app syncs. Helm installs chart CRDs on first install but does not upgrade them automatically.
 
 ## NetworkPolicy
 
diff --git a/examples/argocd-application.yaml b/examples/argocd-application.yaml
index 8948a2a..4c99515 100644
--- a/examples/argocd-application.yaml
+++ b/examples/argocd-application.yaml
@@ -11,7 +11,8 @@ spec:
     targetRevision: 0.1.6
     helm:
+      # Assumes CRDs are applied by a separate CRD app (the intent of the former installCRDs: false).
+      skipCrds: true
       values: |
-        installCRDs: false
         metrics:
           service:
             enabled: true
diff --git a/examples/production-values.yaml b/examples/production-values.yaml
index 83c4be9..bacc9bf 100644
--- a/examples/production-values.yaml
+++ b/examples/production-values.yaml
@@ -27,5 +27,3 @@ auxiliary:
   service:
     enabled: false
     wsAllowedOrigins: "https://dashboard.example.com"
-
-installCRDs: true
diff --git a/internal/controller/bootstrap.go b/internal/controller/bootstrap.go
index 10183d2..aff3498 100644
--- a/internal/controller/bootstrap.go
+++ b/internal/controller/bootstrap.go
@@ -40,7 +40,17 @@ func (b *BootstrapController) BootstrapReplica(ctx context.Context, opts Bootstr
 		return fmt.Errorf("primary is read-only, cannot bootstrap from it")
 	}
 
-	// Step 2: CLONE INSTANCE is a destructive administrative operation, but
+	// Step 2: Ensure the clone plugin is loaded on both sides. MySQL executes
+	// CLONE INSTANCE on the recipient, but the donor also needs the plugin or
+	// the recipient returns Error 3862 with donor Error 1524.
+	if err := opts.Primary.EnsureClonePlugin(ctx); err != nil {
+		return fmt.Errorf("ensure primary clone plugin: %w", err)
+	}
+	if err := opts.Replica.EnsureClonePlugin(ctx); err != nil {
+		return fmt.Errorf("ensure replica clone plugin: %w", err)
+	}
+
+	// Step 3: CLONE INSTANCE is a destructive administrative operation, but
 	// MySQL rejects it while the recipient has super_read_only enabled.
 	if err := opts.Replica.SetSuperReadOnly(ctx, false); err != nil {
 		return fmt.Errorf("disable replica super_read_only for clone: %w", err)
diff --git a/internal/controller/bootstrap_test.go b/internal/controller/bootstrap_test.go
index 07374b4..332b03e 100644
--- a/internal/controller/bootstrap_test.go
+++ b/internal/controller/bootstrap_test.go
@@ -116,6 +116,11 @@ func (b *bootstrapMock) WaitForRelayLogDrain(_ context.Context, _ time.Duration)
 	return nil
 }
 
+func (b *bootstrapMock) EnsureClonePlugin(_ context.Context) error {
+	b.record("EnsureClonePlugin")
+	return nil
+}
+
 func (b *bootstrapMock) SetCloneDonorList(_ context.Context, donor string) error {
 	b.record("SetCloneDonorList")
 	b.mu.Lock()
@@ -159,13 +164,13 @@ func TestBootstrapReplica_HappyPath(t *testing.T) {
 
 	// Primary should have CheckReadOnly called
 	pCalls := primary.getCalls()
-	if len(pCalls) != 1 || pCalls[0] != "CheckReadOnly" {
-		t.Errorf("primary calls: got %v, want [CheckReadOnly]", pCalls)
+	if len(pCalls) != 2 || pCalls[0] != "CheckReadOnly" || pCalls[1] != "EnsureClonePlugin" {
+		t.Errorf("primary calls: got %v, want [CheckReadOnly EnsureClonePlugin]", pCalls)
 	}
 
 	// Replica should be thawed before clone, then cloned.
 	rCalls := replica.getCalls()
-	expected := []string{"SetSuperReadOnly(OFF)", "SetReadOnly", "SetCloneDonorList", "KillAppConnections", "CloneInstance"}
+	expected := []string{"EnsureClonePlugin", "SetSuperReadOnly(OFF)", "SetReadOnly", "SetCloneDonorList", "KillAppConnections", "CloneInstance"}
 	if len(rCalls) != len(expected) {
 		t.Fatalf("replica calls: got %v, want %v", rCalls, expected)
 	}
diff --git a/internal/controller/failover_test.go b/internal/controller/failover_test.go
index cebc1c2..82bba12 100644
--- a/internal/controller/failover_test.go
+++ b/internal/controller/failover_test.go
@@ -115,6 +115,11 @@ func (t *trackingMock) WaitForRelayLogDrain(_ context.Context, _ time.Duration)
 	return err
 }
 
+func (t *trackingMock) EnsureClonePlugin(_ context.Context) error {
+	t.record("EnsureClonePlugin") // records the call name only; this mock never fails the plugin check
+	return nil
+}
+
 func (t *trackingMock) SetCloneDonorList(_ context.Context, donor string) error {
 	t.record("SetCloneDonorList")
 	return nil
diff --git a/internal/controller/init_users.go b/internal/controller/init_users.go
index 9b3400f..241f84e 100644
--- a/internal/controller/init_users.go
+++ b/internal/controller/init_users.go
@@ -66,6 +66,17 @@ if [ -z "${MYSQL_REPLICATION_USER:-}" ] || [ -z "${MYSQL_REPLICATION_PASSWORD:-}
     exit 0
 fi
 
+install_clone_plugin() {
+    local installed
+    installed=$(MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -Nse "SELECT COUNT(*) FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME='clone'" 2>/dev/null || echo 0)
+    if [ "$installed" = "0" ]; then
+        echo "bloodraven-init: installing MySQL clone plugin"
+        MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -e "INSTALL PLUGIN clone SONAME 'mysql_clone.so';"
+    fi
+}
+
+install_clone_plugin
+
 REPL_USER=$(escape_sql "$MYSQL_REPLICATION_USER")
 REPL_PASS=$(escape_sql "$MYSQL_REPLICATION_PASSWORD")
 
@@ -93,6 +104,15 @@ escape_sql() {
     printf '%s' "$val"
 }
 
+install_clone_plugin() {
+    local installed
+    installed=$(MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -Nse "SELECT COUNT(*) FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME='clone'" 2>/dev/null || echo 0)
+    if [ "$installed" = "0" ]; then
+        echo "bloodraven-init: installing MySQL clone plugin"
+        MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" mysql -u root -e "INSTALL PLUGIN clone SONAME 'mysql_clone.so';"
+    fi
+}
+
 create_user_with_grants() {
     local user pass grants
     user=$(escape_sql "$(read_cred "$1" username)")
@@ -113,7 +133,9 @@ EOSQL
 
 `
 	// Operator user — full admin for topology management, replication, cloning.
-	script += `create_user_with_grants operator "GRANT ALL PRIVILEGES ON *.* TO '__USER__'@'%' WITH GRANT OPTION;"
+	script += `install_clone_plugin
+
+create_user_with_grants operator "GRANT ALL PRIVILEGES ON *.* TO '__USER__'@'%' WITH GRANT OPTION;"
 `
 
 	if fg.Spec.Credentials.AppSecret != "" {
diff --git a/internal/controller/reconciler.go b/internal/controller/reconciler.go
index d6bdac5..20dab58 100644
--- a/internal/controller/reconciler.go
+++ b/internal/controller/reconciler.go
@@ -586,7 +586,7 @@ func (r *MysqlFailoverGroupReconciler) reconcileConfigMap(ctx context.Context, f
 			labelManagedBy:     managerName,
 		}
 		cm.Data = map[string]string{
-			"my.cnf": generateMyCnf(fg),
+			"bloodraven.cnf": generateMyCnf(fg),
 		}
 		return nil
 	})
@@ -826,9 +826,20 @@ func (r *MysqlFailoverGroupReconciler) reconcileDeployment(ctx context.Context,
 			})
 		}
 
+		mysqlArgs := []string{
+			fmt.Sprintf("--server-id=%d", serverID),
+			"--gtid-mode=ON",
+			"--enforce-gtid-consistency=ON",
+			"--log-bin=/var/lib/mysql/mysql-bin",
+			"--log-replica-updates=ON",
+			"--skip-replica-start=ON",
+			"--plugin-load-add=mysql_clone.so",
+		}
+
 		mysqlContainer := corev1.Container{
 			Name:  "mysql",
 			Image: image,
+			Args:  mysqlArgs,
 			Ports: []corev1.ContainerPort{
 				{
 					Name:          "mysql",
@@ -1143,7 +1154,7 @@ func (r *MysqlFailoverGroupReconciler) reconcileDeployment(ctx context.Context,
 						Image: image,
 						Command: []string{
 							"sh", "-c",
-							fmt.Sprintf("cp /etc/mysql/config-map/* /etc/mysql/conf.d/ && printf '[mysqld]\\nserver-id=%d\\n' > /etc/mysql/conf.d/server-id.cnf", serverID),
+							fmt.Sprintf("cp /etc/mysql/config-map/bloodraven.cnf /etc/mysql/conf.d/bloodraven.cnf && printf '[mysqld]\\nserver-id=%d\\n' > /etc/mysql/conf.d/server-id.cnf", serverID),
 						},
 						VolumeMounts: []corev1.VolumeMount{
 							{Name: "config", MountPath: "/etc/mysql/config-map"},
diff --git a/internal/controller/reconciler_test.go b/internal/controller/reconciler_test.go
index fca7b0c..5352814 100644
--- a/internal/controller/reconciler_test.go
+++ b/internal/controller/reconciler_test.go
@@ -134,9 +134,9 @@ func TestReconcile_CreatesConfigMap(t *testing.T) {
 		t.Fatalf("configmap not created: %v", err)
 	}
 
-	myCnf, ok := cm.Data["my.cnf"]
+	myCnf, ok := cm.Data["bloodraven.cnf"]
 	if !ok {
-		t.Fatal("my.cnf not found in configmap data")
+		t.Fatal("bloodraven.cnf not found in configmap data")
 	}
 
 	// Check for key config values
@@ -363,7 +363,7 @@ func TestReconcile_TLSConfig(t *testing.T) {
 		t.Fatalf("configmap not found: %v", err)
 	}
 
-	myCnf := cm.Data["my.cnf"]
+	myCnf := cm.Data["bloodraven.cnf"]
 	if !strings.Contains(myCnf, "require-secure-transport=ON") {
 		t.Error("TLS-enabled config should contain require-secure-transport=ON")
 	}
@@ -394,7 +394,7 @@ func TestReconcile_MysqlConfOverrides(t *testing.T) {
 		t.Fatalf("configmap not found: %v", err)
 	}
 
-	myCnf := cm.Data["my.cnf"]
+	myCnf := cm.Data["bloodraven.cnf"]
 	if !strings.Contains(myCnf, "max-connections=1000") {
 		t.Error("override max-connections=1000 should be present")
 	}
diff --git a/internal/controller/topology_test.go b/internal/controller/topology_test.go
index 1554c6d..096f3a9 100644
--- a/internal/controller/topology_test.go
+++ b/internal/controller/topology_test.go
@@ -108,6 +108,7 @@ func (m *mockMySQL) HasUserSchemas(_ context.Context) (bool, error) {
 	}
 	return m.gtidExecuted != "", m.userSchemasErr
 }
+func (m *mockMySQL) EnsureClonePlugin(_ context.Context) error           { return nil }
 func (m *mockMySQL) SetCloneDonorList(_ context.Context, _ string) error { return nil }
 func (m *mockMySQL) CloneInstance(_ context.Context, _, _, _ string, _ bool, _ int) error {
 	return nil
diff --git a/internal/controller/updater_test.go b/internal/controller/updater_test.go
index 21b13da..11202ee 100644
--- a/internal/controller/updater_test.go
+++ b/internal/controller/updater_test.go
@@ -586,6 +586,7 @@ func (f *flappingChecker) StartReplicaSQLThread(_ context.Context) error { retur
 func (f *flappingChecker) WaitForRelayLogDrain(_ context.Context, _ time.Duration) error {
 	return nil
 }
+func (f *flappingChecker) EnsureClonePlugin(_ context.Context) error           { return nil }
 func (f *flappingChecker) SetCloneDonorList(_ context.Context, _ string) error { return nil }
 func (f *flappingChecker) GetGtidExecuted(_ context.Context) (string, error)   { return "", nil }
 func (f *flappingChecker) KillAppConnections(_ context.Context) (int, error)   { return 0, nil }
@@ -622,9 +623,12 @@ func (r *replicaStatusErrorChecker) StartReplicaSQLThread(_ context.Context) err
 func (r *replicaStatusErrorChecker) WaitForRelayLogDrain(_ context.Context, _ time.Duration) error {
 	return nil
 }
+func (r *replicaStatusErrorChecker) EnsureClonePlugin(_ context.Context) error           { return nil }
 func (r *replicaStatusErrorChecker) SetCloneDonorList(_ context.Context, _ string) error { return nil }
-func (r *replicaStatusErrorChecker) GetGtidExecuted(_ context.Context) (string, error)   { return "", nil }
-func (r *replicaStatusErrorChecker) KillAppConnections(_ context.Context) (int, error)   { return 0, nil }
+func (r *replicaStatusErrorChecker) GetGtidExecuted(_ context.Context) (string, error) {
+	return "", nil
+}
+func (r *replicaStatusErrorChecker) KillAppConnections(_ context.Context) (int, error) { return 0, nil }
 func (r *replicaStatusErrorChecker) CloneInstance(_ context.Context, _, _, _ string, _ bool, _ int) error {
 	return nil
 }
diff --git a/internal/mysql/checker.go b/internal/mysql/checker.go
index 51f1e6b..dc48b57 100644
--- a/internal/mysql/checker.go
+++ b/internal/mysql/checker.go
@@ -32,6 +32,7 @@ type Checker interface {
 	GetGtidExecuted(ctx context.Context) (string, error)
 
 	// Clone plugin methods:
+	EnsureClonePlugin(ctx context.Context) error
 	SetCloneDonorList(ctx context.Context, donor string) error
 	CloneInstance(ctx context.Context, user, host, password string, useSSL bool, cloneTimeoutSec int) error
 }
diff --git a/internal/mysql/clone.go b/internal/mysql/clone.go
index e692a4d..1034aa1 100644
--- a/internal/mysql/clone.go
+++ b/internal/mysql/clone.go
@@ -2,12 +2,22 @@ package mysql
 
 import (
 	"context"
+	"errors"
 	"fmt"
+
+	mysqldriver "github.com/go-sql-driver/mysql"
 )
 
 func (m *checker) SetCloneDonorList(ctx context.Context, donor string) error {
 	_, err := m.db.ExecContext(ctx, "SET GLOBAL clone_valid_donor_list = ?", donor)
 	if err != nil {
+		var mysqlErr *mysqldriver.MySQLError
+		if errors.As(err, &mysqlErr) && mysqlErr.Number == 1193 {
+			// MySQL 8.4+ reportedly drops clone_valid_donor_list (older versions need
+			// this allowlist), so proceed straight to CLONE INSTANCE. NOTE(review):
+			// 1193 is also raised when the clone plugin is not loaded; confirm.
+			return nil
+		}
 		return fmt.Errorf("set clone donor list: %w", err)
 	}
 	return nil
@@ -18,6 +28,10 @@ func (m *checker) CloneInstance(ctx context.Context, user, host, password string
 		cloneTimeoutSec = 3600
 	}
 
+	if err := m.EnsureClonePlugin(ctx); err != nil {
+		return fmt.Errorf("ensure clone plugin: %w", err)
+	}
+
 	// Set connection-level and global timeouts before cloning.
 	// net_read_timeout and net_write_timeout are session-scoped and prevent the
 	// server from dropping the connection during a long clone transfer.
@@ -30,6 +44,12 @@ func (m *checker) CloneInstance(ctx context.Context, user, host, password string
 	}
 	for _, s := range timeoutStmts {
 		if _, err := m.db.ExecContext(ctx, s); err != nil {
+			var mysqlErr *mysqldriver.MySQLError
+			if errors.As(err, &mysqlErr) && mysqlErr.Number == 1193 && s == fmt.Sprintf("SET GLOBAL clone_ddl_timeout = %d", cloneTimeoutSec) {
+				// clone_ddl_timeout was removed in newer MySQL releases; the
+				// connection-level timeouts above still apply, so continue.
+				continue
+			}
 			return fmt.Errorf("set clone timeout (%s): %w", s, err)
 		}
 	}
@@ -46,3 +66,25 @@ func (m *checker) CloneInstance(ctx context.Context, user, host, password string
 	}
 	return nil
 }
+
+// EnsureClonePlugin installs the clone plugin unless INFORMATION_SCHEMA.PLUGINS
+// already lists it; a concurrent install (MySQL error 1125) counts as success.
+func (m *checker) EnsureClonePlugin(ctx context.Context) error {
+	const probe = "SELECT COUNT(*) FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME = 'clone'"
+	var rows int
+	if err := m.db.QueryRowContext(ctx, probe).Scan(&rows); err != nil {
+		return fmt.Errorf("check clone plugin: %w", err)
+	}
+	if rows > 0 {
+		return nil
+	}
+	_, err := m.db.ExecContext(ctx, "INSTALL PLUGIN clone SONAME 'mysql_clone.so'")
+	if err == nil {
+		return nil
+	}
+	var mysqlErr *mysqldriver.MySQLError
+	if errors.As(err, &mysqlErr) && mysqlErr.Number == 1125 {
+		return nil // another bootstrap/setup path installed it between the probe and this statement
+	}
+	return fmt.Errorf("install clone plugin: %w", err)
+}
diff --git a/internal/mysql/replication.go b/internal/mysql/replication.go
index 754c9de..5ec9b33 100644
--- a/internal/mysql/replication.go
+++ b/internal/mysql/replication.go
@@ -157,6 +157,12 @@ func (m *checker) ShowReplicaStatus(ctx context.Context) (*ReplicaStatus, error)
 	if v, ok := colMap["Last_Error"]; ok && v.Valid {
 		rs.LastError = v.String
 	}
+	if v, ok := colMap["Last_IO_Error"]; ok && v.Valid && rs.LastError == "" {
+		rs.LastError = v.String
+	}
+	if v, ok := colMap["Last_SQL_Error"]; ok && v.Valid && rs.LastError == "" {
+		rs.LastError = v.String
+	}
 	if v, ok := colMap["Last_Errno"]; ok && v.Valid && rs.LastError == "" {
 		// fallback
 	}
@@ -211,6 +217,12 @@ func (m *checker) ChangeReplicationSource(ctx context.Context, opts ReplicationS
 	)
 	if opts.UseSSL {
 		q += ", SOURCE_SSL=1"
+	} else {
+		// MySQL 8's default caching_sha2_password authentication needs
+		// the source's RSA public key for non-TLS replication channels.
+		// Without this, START REPLICA succeeds but the IO thread exits
+		// asynchronously, leaving the site permanently not-replicating.
+		q += ", GET_SOURCE_PUBLIC_KEY=1"
 	}
 	if _, err := m.db.ExecContext(ctx, q); err != nil {
 		return fmt.Errorf("change replication source: %w", err)
diff --git a/internal/playground/runner/junit.go b/internal/playground/runner/junit.go
index 2d357e7..edde531 100644
--- a/internal/playground/runner/junit.go
+++ b/internal/playground/runner/junit.go
@@ -4,6 +4,7 @@ import (
 	"encoding/xml"
 	"fmt"
 	"os"
+	"path/filepath"
 )
 
 // JUnitTestSuite is the surefire-flavor JUnit XML structure.
@@ -19,10 +20,10 @@ type JUnitTestSuite struct {
 
 // JUnitTestCase is one scenario.
 type JUnitTestCase struct {
-	XMLName   xml.Name `xml:"testcase"`
-	Name      string   `xml:"name,attr"`
-	Classname string   `xml:"classname,attr"`
-	Time      float64  `xml:"time,attr"`
+	XMLName   xml.Name      `xml:"testcase"`
+	Name      string        `xml:"name,attr"`
+	Classname string        `xml:"classname,attr"`
+	Time      float64       `xml:"time,attr"`
 	Failure   *junitFailure `xml:"failure,omitempty"`
 }
 
@@ -49,7 +50,7 @@ func WriteJUnit(path string, results []Result) error {
 		if !r.Passed {
 			suite.Failures++
 			tc.Failure = &junitFailure{
-				Type: fmt.Sprintf("%s/%s", r.Phase, r.StepName),
+				Type:    fmt.Sprintf("%s/%s", r.Phase, r.StepName),
 				Message: r.Failure,
 				Body: fmt.Sprintf("phase=%s step=%q\n%s\n\nForensics: %s",
 					r.Phase, r.StepName, r.Failure, r.CapturePath),
@@ -65,6 +66,11 @@ func WriteJUnit(path string, results []Result) error {
 	body = append([]byte(xml.Header), body...)
 	body = append(body, '\n')
 
+	if dir := filepath.Dir(path); dir != "." && dir != "" {
+		if err := os.MkdirAll(dir, 0o755); err != nil {
+			return err
+		}
+	}
 	if err := os.WriteFile(path, body, 0o644); err != nil {
 		return err
 	}
diff --git a/internal/playground/runner/profile.go b/internal/playground/runner/profile.go
new file mode 100644
index 0000000..231cc40
--- /dev/null
+++ b/internal/playground/runner/profile.go
@@ -0,0 +1,98 @@
+package runner
+
+// Profile selects which scenarios run-all executes. The three profiles
+// are nested, each one contained in the next: smoke ⊂ release ⊂ full.
+//
+//   - smoke:  short PR-label/manual subset covering emergency failover,
+//     planned switchover, and operator restart durability (3 scenarios).
+//   - release: curated release/nightly subset covering the WISHLIST #32
+//     behaviours (emergency failover, planned switchover, operator restart,
+//     data integrity, operator kill during failover, self-fencing,
+//     network partition, PVC loss/re-bootstrap, old-primary recovery,
+//     failover state durability); 10 scenarios including the smoke set.
+//   - full:   every registered scenario (existing run-all behaviour).
+type Profile string
+
+const (
+	ProfileSmoke   Profile = "smoke"   // short PR-label/manual subset
+	ProfileRelease Profile = "release" // curated release/nightly subset
+	ProfileFull    Profile = "full"    // every registered scenario
+)
+
+// DefaultProfile is used when --profile is not supplied.
+const DefaultProfile Profile = ProfileFull
+
+// smokeScenarios is the hard-coded smoke allowlist: emergency failover,
+// planned switchover, and operator restart, taking roughly 3-5 minutes
+// on a warm playground cluster. Every ID here must also be listed in
+// releaseScenarios (see TestSelectForProfileReleaseIncludesSmoke).
+var smokeScenarios = map[string]bool{
+	"01-clean-primary-kill":    true, // emergency failover
+	"02-planned-switchover":    true, // planned switchover
+	"02-operator-kill-restart": true, // operator restart durability
+}
+
+// releaseScenarios is the hard-coded release/nightly subset. In addition
+// to the smoke scenarios, this covers the behaviours called out in
+// WISHLIST #32: real MySQL pods/PVCs/Services, DNS/DNSEndpoint, taints,
+// planned failover, emergency failover, operator restart, PVC loss and
+// re-bootstrap, network partition / self-fencing, old-primary recovery,
+// and failover state durability across operator restarts.
+var releaseScenarios = map[string]bool{
+	// smoke scenarios, repeated so release stays a superset of smokeScenarios
+	"01-clean-primary-kill":    true,
+	"02-planned-switchover":    true,
+	"02-operator-kill-restart": true,
+	// additional release scenarios
+	"04-data-integrity-on-failover":         true, // data plane correctness
+	"05-operator-kill-during-failover":      true, // operator resilience mid-failover
+	"06-self-fence-isolated-primary":        true, // taint/DNS self-fencing
+	"09-network-partition-self-fence":       true, // NetworkPolicy/partition
+	"10-full-bootstrap-after-data-wipe":     true, // PVC loss → re-bootstrap
+	"12-old-primary-recovery-no-divergence": true, // old-primary recovery
+	"23-failover-state-durability":          true, // state survives operator restart
+}
+
+// Profiles returns every valid profile name, ordered smoke, release,
+// full, for use in CLI help and validation.
+func Profiles() []Profile {
+	return []Profile{ProfileSmoke, ProfileRelease, ProfileFull}
+}
+
+// IsValid reports whether p is a recognised profile name, i.e. exactly
+// one of ProfileSmoke, ProfileRelease, or ProfileFull. Any other value,
+// including the empty string, is reported as invalid.
+func (p Profile) IsValid() bool {
+	if p == ProfileSmoke || p == ProfileRelease {
+		return true
+	}
+	// ProfileFull is the only remaining valid name.
+	return p == ProfileFull
+}
+
+// SelectForProfile returns the scenarios from all that belong to profile
+// p, preserving their input order. ProfileFull, the empty profile, and
+// any unrecognised profile return all unchanged. Allowlist IDs with no
+// matching entry in all are skipped, and the result is nil when nothing
+// matches.
+func SelectForProfile(all []Scenario, p Profile) []Scenario {
+	var members map[string]bool
+	switch p {
+	case ProfileSmoke:
+		members = smokeScenarios
+	case ProfileRelease:
+		members = releaseScenarios
+	default:
+		// ProfileFull, "" and unknown names all mean "no filtering".
+		return all
+	}
+
+	var selected []Scenario
+	for i := range all {
+		if !members[all[i].ID] {
+			continue
+		}
+		selected = append(selected, all[i])
+	}
+	return selected
+}
diff --git a/internal/playground/runner/profile_registry_test.go b/internal/playground/runner/profile_registry_test.go
new file mode 100644
index 0000000..b7e8119
--- /dev/null
+++ b/internal/playground/runner/profile_registry_test.go
@@ -0,0 +1,30 @@
+package runner_test
+
+import (
+	"testing"
+
+	"github.com/shipstream/bloodraven/internal/playground/runner"
+	_ "github.com/shipstream/bloodraven/internal/playground/scenarios"
+)
+
+func TestProfilesSelectRegisteredScenarios(t *testing.T) {
+	all := runner.DefaultRegistry.List()
+	if len(all) == 0 {
+		t.Fatal("no scenarios registered")
+	}
+	// An allowlisted ID missing from the registry shrinks the count below 3.
+	smoke := runner.SelectForProfile(all, runner.ProfileSmoke)
+	if len(smoke) != 3 {
+		t.Fatalf("smoke profile selected %d scenarios, want 3", len(smoke))
+	}
+	// The release allowlist holds 10 IDs: the 3 smoke scenarios plus 7 more.
+	release := runner.SelectForProfile(all, runner.ProfileRelease)
+	if len(release) != 10 {
+		t.Fatalf("release profile selected %d scenarios, want 10", len(release))
+	}
+	// The full profile applies no filtering at all.
+	full := runner.SelectForProfile(all, runner.ProfileFull)
+	if len(full) != len(all) {
+		t.Fatalf("full profile selected %d scenarios, want all %d", len(full), len(all))
+	}
+}
diff --git a/internal/playground/runner/profile_test.go b/internal/playground/runner/profile_test.go
new file mode 100644
index 0000000..159d594
--- /dev/null
+++ b/internal/playground/runner/profile_test.go
@@ -0,0 +1,137 @@
+package runner
+
+import (
+	"testing"
+)
+
+func TestProfileIsValid(t *testing.T) {
+	for _, p := range []Profile{ProfileSmoke, ProfileRelease, ProfileFull} {
+		if !p.IsValid() {
+			t.Errorf("expected %q to be valid", p)
+		}
+	}
+	for _, p := range []Profile{"unknown", "", "partial"} { // "" is rejected here although SelectForProfile accepts it
+		if p.IsValid() {
+			t.Errorf("expected %q to be invalid", p)
+		}
+	}
+}
+
+func TestProfilesReturnsAll(t *testing.T) { // each profile must be listed exactly once
+	missing := map[Profile]bool{ProfileSmoke: true, ProfileRelease: true, ProfileFull: true}
+	for _, p := range Profiles() {
+		if !missing[p] {
+			t.Errorf("unexpected or duplicate profile %q", p)
+		}
+		delete(missing, p) // consume the name so a repeated entry is reported
+	}
+	if len(missing) != 0 { // anything left was never returned
+		t.Fatalf("Profiles() omitted %d of 3 expected profiles", len(missing))
+	}
+}
+
+func TestSelectForProfileFull(t *testing.T) {
+	all := []Scenario{
+		{ID: "01-clean-primary-kill"},
+		{ID: "02-planned-switchover"},
+		{ID: "06-self-fence-isolated-primary"},
+	}
+	got := SelectForProfile(all, ProfileFull)
+	if len(got) != 3 {
+		t.Fatalf("ProfileFull: got %d scenarios, want 3", len(got))
+	}
+	// The empty profile takes the same unfiltered path as ProfileFull.
+	got2 := SelectForProfile(all, "")
+	if len(got2) != 3 {
+		t.Fatalf("empty profile: got %d scenarios, want 3", len(got2))
+	}
+}
+
+func TestSelectForProfileSmoke(t *testing.T) {
+	all := []Scenario{
+		{ID: "01-clean-primary-kill"},
+		{ID: "02-planned-switchover"},
+		{ID: "02-operator-kill-restart"},
+		{ID: "06-self-fence-isolated-primary"},    // release-only; must be filtered out
+		{ID: "10-full-bootstrap-after-data-wipe"}, // release-only; must be filtered out
+	}
+	got := SelectForProfile(all, ProfileSmoke)
+	if len(got) != 3 {
+		t.Fatalf("ProfileSmoke: got %d scenarios, want 3; got %v", len(got), ids(got))
+	}
+	for _, s := range got {
+		if !smokeScenarios[s.ID] {
+			t.Errorf("ProfileSmoke returned unexpected scenario %q", s.ID)
+		}
+	}
+}
+
+func TestSelectForProfileRelease(t *testing.T) {
+	all := []Scenario{
+		{ID: "01-clean-primary-kill"},
+		{ID: "02-planned-switchover"},
+		{ID: "02-operator-kill-restart"},
+		{ID: "04-data-integrity-on-failover"},
+		{ID: "05-operator-kill-during-failover"},
+		{ID: "06-self-fence-isolated-primary"},
+		{ID: "09-network-partition-self-fence"},
+		{ID: "10-full-bootstrap-after-data-wipe"},
+		{ID: "12-old-primary-recovery-no-divergence"},
+		{ID: "23-failover-state-durability"},
+		{ID: "05-split-brain-auto-resolve"}, // not in the release allowlist; must be filtered out
+	}
+	got := SelectForProfile(all, ProfileRelease)
+	if len(got) != 10 {
+		t.Fatalf("ProfileRelease: got %d scenarios, want 10; got %v", len(got), ids(got))
+	}
+	for _, s := range got {
+		if !releaseScenarios[s.ID] {
+			t.Errorf("ProfileRelease returned unexpected scenario %q", s.ID)
+		}
+	}
+}
+
+func TestSelectForProfileReleaseIncludesSmoke(t *testing.T) {
+	// Compares the allowlist maps directly: every smoke ID must also be in release.
+	for id := range smokeScenarios {
+		if !releaseScenarios[id] {
+			t.Errorf("smoke scenario %q is not in release profile", id)
+		}
+	}
+}
+
+func TestSelectForProfileUnknownAllowlistIgnoresMissing(t *testing.T) {
+	// Allowlist IDs absent from the scenario list are skipped without a
+	// panic or error: only one of the three smoke IDs is supplied here,
+	// so exactly that one must come back. (The registry-backed test in
+	// package runner_test still fails if an allowlisted ID is unregistered.)
+	all := []Scenario{{ID: "01-clean-primary-kill"}}
+	got := SelectForProfile(all, ProfileSmoke)
+	if len(got) != 1 || got[0].ID != "01-clean-primary-kill" {
+		t.Fatalf("ProfileSmoke with missing IDs: got %v", ids(got))
+	}
+}
+
+func TestSelectForProfileUnknownProfileReturnsAll(t *testing.T) {
+	all := []Scenario{{ID: "a"}, {ID: "b"}}
+	got := SelectForProfile(all, Profile("unknown")) // unrecognised names fall back to the unfiltered list
+	if len(got) != 2 {
+		t.Fatalf("unknown profile: got %d scenarios, want 2", len(got))
+	}
+}
+
+func TestSelectForProfileEmptyInput(t *testing.T) {
+	got := SelectForProfile(nil, ProfileSmoke) // a filtering profile over nil input must select nothing
+	if len(got) != 0 {
+		t.Fatalf("empty input: got %d scenarios, want 0", len(got))
+	}
+}
+
+// ids is a test helper that collects each scenario's ID, in input order.
+func ids(scens []Scenario) []string {
+	names := make([]string, 0, len(scens))
+	for i := range scens {
+		names = append(names, scens[i].ID)
+	}
+	return names
+}
diff --git a/internal/testutil/fakes.go b/internal/testutil/fakes.go
index 21b9a95..fa37ee3 100644
--- a/internal/testutil/fakes.go
+++ b/internal/testutil/fakes.go
@@ -190,6 +190,13 @@ func (m *FakeMySQL) WaitForRelayLogDrain(_ context.Context, _ time.Duration) err
 	return m.DrainErr
 }
 
+func (m *FakeMySQL) EnsureClonePlugin(_ context.Context) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.record("EnsureClonePlugin") // recorded while m.mu is held
+	return nil                    // always succeeds; this fake has no error to inject here
+}
+
 func (m *FakeMySQL) SetCloneDonorList(_ context.Context, donor string) error {
 	m.mu.Lock()
 	defer m.mu.Unlock()
diff --git a/playground/manifests/rustfs.yaml b/playground/manifests/rustfs.yaml
index dac999a..f708b67 100644
--- a/playground/manifests/rustfs.yaml
+++ b/playground/manifests/rustfs.yaml
@@ -6,8 +6,8 @@ metadata:
 type: Opaque
 stringData:
   # AWS_* envs are what Dragonfly's S3 client reads.
-  AWS_ACCESS_KEY_ID: rustfsadmin
-  AWS_SECRET_ACCESS_KEY: rustfsadmin
+  AWS_ACCESS_KEY_ID: bloodraven-rustfs-access
+  AWS_SECRET_ACCESS_KEY: bloodraven-rustfs-secret
   AWS_REGION: us-east-1
   # RUSTFS_* envs are what the rustfs server binary reads. Holding both
   # naming conventions in the same secret lets the rustfs Deployment
@@ -17,8 +17,8 @@ stringData:
   # CreateContainerConfigError indefinitely (the secret exists in the
   # API server but the pod-scoped kubelet cache returns not-found and
   # never invalidates).
-  RUSTFS_ACCESS_KEY: rustfsadmin
-  RUSTFS_SECRET_KEY: rustfsadmin
+  RUSTFS_ACCESS_KEY: bloodraven-rustfs-access
+  RUSTFS_SECRET_KEY: bloodraven-rustfs-secret
 ---
 apiVersion: v1
 kind: PersistentVolumeClaim
diff --git a/playground/setup.sh b/playground/setup.sh
index baa9a27..3836466 100755
--- a/playground/setup.sh
+++ b/playground/setup.sh
@@ -61,6 +61,14 @@ done
 source "$SCRIPT_DIR/_guard.sh"
 require_playground_context
 
+HELM_INSTALL_CRDS=false
+case "${BLOODRAVEN_SETUP_HELM_INSTALL_CRDS:-}" in
+  1|true|TRUE|yes|YES) HELM_INSTALL_CRDS=true ;;
+esac
+if [[ "$HELM_INSTALL_CRDS" == "true" ]] && helm status bloodraven -n "$NAMESPACE" >/dev/null 2>&1; then
+  fail "BLOODRAVEN_SETUP_HELM_INSTALL_CRDS=1 requires a fresh Helm release. Helm installs CRDs from charts/bloodraven/crds only on first install and will not upgrade or repair them on helm upgrade; unset BLOODRAVEN_SETUP_HELM_INSTALL_CRDS to apply CRDs explicitly before upgrading."
+fi
+
 # Prefer docker over podman. k3d's podman support is experimental and the
 # tar-archive image-load path is slower than docker's native import.
 # Override with BLOODRAVEN_CONTAINER_RUNTIME=podman if you actually want
@@ -98,8 +106,8 @@ info "Labeling nodes as site zones..."
 # Pick first two worker nodes (skip control-plane-only if possible)
 WORKERS=()
 for n in "${NODES[@]}"; do
-  role=$(kubectl get node "$n" -o jsonpath='{.metadata.labels.node-role\.kubernetes\.io/control-plane}' 2>/dev/null || true)
-  if [[ -z "$role" ]]; then
+  labels=$(kubectl get node "$n" --show-labels --no-headers 2>/dev/null || true)
+  if [[ "$labels" != *"node-role.kubernetes.io/control-plane"* && "$labels" != *"node-role.kubernetes.io/master"* ]]; then
     WORKERS+=("$n")
   fi
 done
@@ -118,19 +126,23 @@ kubectl label node "${WORKERS[1]}" shipstream.io/site.playground=pdx --overwrite
 ok "Nodes labeled: ${WORKERS[0]}=iad, ${WORKERS[1]}=pdx"
 
 # ── 3. Build images ──────────────────────────────────────────────────────
-info "Building operator and sidecar images..."
-$RUNTIME build --target bloodraven -t bloodraven:playground "$PROJECT_ROOT"
-$RUNTIME build --target sidecar -t bloodraven-sidecar:playground "$PROJECT_ROOT"
+if [[ -n "${SKIP_IMAGE_BUILD:-}" ]]; then
+  info "SKIP_IMAGE_BUILD is set — skipping image builds (CI mode: images pre-built or pre-loaded)"
+else
+  info "Building operator and sidecar images..."
+  $RUNTIME build --target bloodraven -t bloodraven:playground "$PROJECT_ROOT"
+  $RUNTIME build --target sidecar -t bloodraven-sidecar:playground "$PROJECT_ROOT"
 
-info "Building counter-app image..."
-$RUNTIME build -t bloodraven-counter:playground "$SCRIPT_DIR/counter-app"
+  info "Building counter-app image..."
+  $RUNTIME build -t bloodraven-counter:playground "$SCRIPT_DIR/counter-app"
 
-info "Building dashboard image..."
-$RUNTIME build -t bloodraven-dashboard:playground "$SCRIPT_DIR/dashboard"
+  info "Building dashboard image..."
+  $RUNTIME build -t bloodraven-dashboard:playground "$SCRIPT_DIR/dashboard"
 
-info "Building dns-webhook image..."
-$RUNTIME build -t bloodraven-dns-webhook:playground "$SCRIPT_DIR/dns-webhook"
-ok "All images built"
+  info "Building dns-webhook image..."
+  $RUNTIME build -t bloodraven-dns-webhook:playground "$SCRIPT_DIR/dns-webhook"
+  ok "All images built"
+fi
 
 # ── 4. Auto-detect cluster tool and load images ──────────────────────────
 IMAGES=(bloodraven:playground bloodraven-sidecar:playground bloodraven-counter:playground bloodraven-dashboard:playground bloodraven-dns-webhook:playground)
@@ -239,9 +251,13 @@ EOF
 ok "DNSEndpoint CRD installed"
 
 # ── 6. Install Bloodraven CRDs ───────────────────────────────────────────
-info "Installing Bloodraven CRDs..."
-kubectl apply -f "$PROJECT_ROOT/charts/bloodraven/crds/"
-ok "Bloodraven CRDs installed"
+if [[ "$HELM_INSTALL_CRDS" == "true" ]]; then
+  info "Skipping manual Bloodraven CRD install; fresh Helm install will install chart CRDs from charts/bloodraven/crds"
+else
+  info "Installing Bloodraven CRDs..."
+  kubectl apply -f "$PROJECT_ROOT/charts/bloodraven/crds/"
+  ok "Bloodraven CRDs installed"
+fi
 
 # ── 7. Create namespace and deploy manifests ─────────────────────────────
 info "Creating namespace and deploying manifests..."
@@ -274,12 +290,23 @@ helm upgrade --install bloodraven "$PROJECT_ROOT/charts/bloodraven" \
   --set image.repository="${IMG_PREFIX}bloodraven" \
   --set image.tag=playground \
   --set image.pullPolicy=Never \
-  --set installCRDs=false \
   --set auxiliary.service.enabled=true \
   --set 'nodeSelector=null' \
   --set 'tolerations[0].key=node.kubernetes.io/disk-pressure' \
   --set 'tolerations[0].operator=Exists' \
   --set 'tolerations[0].effect=NoSchedule' \
+  --set 'tolerations[1].key=shipstream.io/db-readonly-playground' \
+  --set 'tolerations[1].operator=Exists' \
+  --set 'tolerations[1].effect=NoSchedule' \
+  --set 'tolerations[2].key=shipstream.io/db-readonly' \
+  --set 'tolerations[2].operator=Exists' \
+  --set 'tolerations[2].effect=NoSchedule' \
+  --set 'tolerations[3].key=shipstream.io/db-readonly-playground' \
+  --set 'tolerations[3].operator=Exists' \
+  --set 'tolerations[3].effect=NoExecute' \
+  --set 'tolerations[4].key=shipstream.io/db-readonly' \
+  --set 'tolerations[4].operator=Exists' \
+  --set 'tolerations[4].effect=NoExecute' \
   --set leaderElection.enabled=false \
   --timeout=180s
 # Don't use --wait; the operator may take a moment to pass readiness after
@@ -358,21 +385,64 @@ REPL_USER=$(kubectl -n "$NAMESPACE" get secret mysql-credentials -o jsonpath='{.
 REPL_PASS=$(kubectl -n "$NAMESPACE" get secret mysql-credentials -o jsonpath='{.data.MYSQL_REPLICATION_PASSWORD}' | base64 -d)
 ROOT_PASS=$(kubectl -n "$NAMESPACE" get secret mysql-credentials -o jsonpath='{.data.MYSQL_ROOT_PASSWORD}' | base64 -d)
 for site in iad pdx; do
-  READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
-    mysql "-uroot" "-p${ROOT_PASS}" -Nse "SELECT @@global.read_only" 2>/dev/null || echo 0)
-  SUPER_READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
-    mysql "-uroot" "-p${ROOT_PASS}" -Nse "SELECT @@global.super_read_only" 2>/dev/null || echo 0)
-  if kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
-    mysql "-uroot" "-p${ROOT_PASS}" -e \
-    "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; \
-     CREATE USER IF NOT EXISTS '${REPL_USER}'@'%' IDENTIFIED BY '${REPL_PASS}'; \
-     GRANT REPLICATION SLAVE, REPLICATION CLIENT, BACKUP_ADMIN, CLONE_ADMIN ON *.* TO '${REPL_USER}'@'%'; \
-     FLUSH PRIVILEGES;" 2>/dev/null; then
-    kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
-      mysql "-uroot" "-p${ROOT_PASS}" -e "SET GLOBAL read_only=${READ_ONLY}; SET GLOBAL super_read_only=${SUPER_READ_ONLY};" 2>/dev/null || true
-    ok "Replication user created on $site"
-  else
+  CREATED=false
+  LAST_ERR=""
+  # Remember the site's original read-only flags once, across all attempts.
+  # A failed attempt can leave both flags OFF, so re-reading them on the next
+  # attempt would record 0 and the final restore would drop the original state.
+  READ_ONLY=""
+  SUPER_READ_ONLY=""
+  for attempt in $(seq 1 12); do
+    if [[ -z "$READ_ONLY" ]]; then
+      READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
+        env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -Nse "SELECT @@global.read_only" 2>/dev/null || true)
+    fi
+    if [[ -z "$SUPER_READ_ONLY" ]]; then
+      SUPER_READ_ONLY=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
+        env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -Nse "SELECT @@global.super_read_only" 2>/dev/null || true)
+    fi
+    LAST_ERR=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
+      env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -e \
+      "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; \
+       INSTALL PLUGIN clone SONAME 'mysql_clone.so'; \
+       CREATE USER IF NOT EXISTS '${REPL_USER}'@'%' IDENTIFIED BY '${REPL_PASS}'; \
+       GRANT REPLICATION SLAVE, REPLICATION CLIENT, BACKUP_ADMIN, CLONE_ADMIN ON *.* TO '${REPL_USER}'@'%'; \
+       FLUSH PRIVILEGES;" 2>&1) && CREATED=true || CREATED=false
+    # INSTALL PLUGIN fails when the clone plugin is already present; rerun the batch without it.
+    if [[ "$CREATED" != "true" ]] && grep -Eq "(Function|Plugin) 'clone' already exists|already installed" <<<"$LAST_ERR"; then
+      LAST_ERR=$(kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
+        env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -e \
+        "SET GLOBAL super_read_only=OFF; SET GLOBAL read_only=OFF; \
+         CREATE USER IF NOT EXISTS '${REPL_USER}'@'%' IDENTIFIED BY '${REPL_PASS}'; \
+         GRANT REPLICATION SLAVE, REPLICATION CLIENT, BACKUP_ADMIN, CLONE_ADMIN ON *.* TO '${REPL_USER}'@'%'; \
+         FLUSH PRIVILEGES;" 2>&1) && CREATED=true || CREATED=false
+    fi
+    if [[ "$CREATED" == "true" ]]; then
+      ok "Replication user created on $site"
+      break
+    fi
+
+    # The operator init script also creates this user. If our explicit setup
+    # races with early bootstrap but the user and clone plugin both exist, keep going.
+    if kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
+      env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -Nse "SELECT CONCAT(user_exists, ':', clone_loaded) FROM (SELECT COUNT(*) AS user_exists FROM mysql.user WHERE user='${REPL_USER}' AND host='%') u CROSS JOIN (SELECT COUNT(*) AS clone_loaded FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME='clone') p" 2>/dev/null | grep -q '^1:1$'; then
+      ok "Replication user and clone plugin already exist on $site"
+      CREATED=true
+      break
+    fi
+
+    warn "Replication user setup on $site failed (attempt $attempt/12); retrying..."
+    sleep 5
+  done
+  # Best-effort restore of the flags captured above, on every exit path of the
+  # retry loop. Skipped when neither flag could ever be read.
+  if [[ -n "${READ_ONLY}${SUPER_READ_ONLY}" ]]; then
+    kubectl -n "$NAMESPACE" exec "deploy/mysql-playground-$site" -c mysql -- \
+      env MYSQL_PWD="$ROOT_PASS" mysql -h127.0.0.1 -uroot -e "SET GLOBAL read_only=${READ_ONLY:-0}; SET GLOBAL super_read_only=${SUPER_READ_ONLY:-0};" 2>/dev/null || true
+  fi
+  if [[ "$CREATED" != "true" ]]; then
     warn "Failed to create replication user on $site"
+    echo "$LAST_ERR" >&2
     exit 1
   fi
 done
diff --git a/test/component/helpers_test.go b/test/component/helpers_test.go
index 69cceba..b877789 100644
--- a/test/component/helpers_test.go
+++ b/test/component/helpers_test.go
@@ -123,6 +123,10 @@ func (m *mockMySQL) WaitForRelayLogDrain(_ context.Context, _ time.Duration) err
 	return nil
 }
 
+func (m *mockMySQL) EnsureClonePlugin(_ context.Context) error {
+	return nil // mock: ignores the context and always reports success
+}
+
 func (m *mockMySQL) SetCloneDonorList(_ context.Context, donor string) error {
 	m.mu.Lock()
 	defer m.mu.Unlock()