From b27ea85fe07ea84d96fd5dbb9b3244206aa5946c Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:07:38 -0400 Subject: [PATCH 01/15] docs: add enterprise disaster recovery guidance [EDU-789] --- .../version-25.3/enterprise-sidebar.json | 1 + .../enterprise/disaster-recovery.md | 128 ++++++++++++++++++ .../getting-started/production-checklist.md | 13 +- 3 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json b/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json index bde267108..a29c40531 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json @@ -88,6 +88,7 @@ }, "enterprise/testing", "enterprise/upgrade", + "enterprise/disaster-recovery", { "type": "category", "label": "Advanced", diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md new file mode 100644 index 000000000..841d53863 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md @@ -0,0 +1,128 @@ +--- +title: "Disaster recovery" +description: "Plan disaster recovery for self-hosted Seqera Platform deployments" +date created: "2026-04-07" +last updated: "2026-04-07" +tags: [enterprise, disaster recovery, backup, restore, operations] +--- + +This guide outlines a practical disaster recovery (DR) approach for self-hosted Seqera Platform deployments. Use it to define what must be restored, which parts of the environment should be rebuilt from infrastructure-as-code, and how to validate that the restored platform is ready for users. 
+ +The exact recovery procedure depends on your deployment model, your cloud architecture, and your recovery time objective (RTO) and recovery point objective (RPO). Seqera does not provide a single turnkey DR template for every environment, but the guidance on this page covers the minimum state and validation steps that most teams need. + +## Define your recovery target + +Before you write a DR runbook, decide what a successful recovery means for your deployment: + +- Whether you are recovering the existing environment or recreating the account, subscription, or project that hosts Platform. +- How much data loss is acceptable between the last good backup and the restored system. +- How long Platform can remain unavailable before users must switch to another workflow. +- Which services must return first: Platform login, pipeline launches, Studios, or pipeline optimization. + +Your deployment model affects the recovery shape: + +- [Docker Compose](./platform-docker-compose) is best treated as a single-instance recovery path with a longer outage window. +- [Kubernetes](./platform-kubernetes) is better suited to production environments that need higher availability and faster infrastructure replacement. + +:::note +Running multiple backend replicas improves availability during normal operations, but it is not a substitute for database backups, configuration backups, or a tested restore procedure. +::: + +## Back up the required state + +At minimum, your DR plan should cover the following components. + +### SQL database + +The SQL database is the primary persistent state for Seqera Platform. Back up the database before upgrades and on a schedule that matches your RPO. If you use a managed service such as Amazon RDS, document both the snapshot schedule and the exact restore procedure. + +If you use the pipeline optimization service and its `groundswell` database is hosted separately, include that database in the same backup and restore plan. 
+ +### Configuration and secrets + +Back up the configuration that is required to recreate the deployment: + +- `tower.env` +- `tower.yml` +- Helm values files, Kubernetes manifests, or Docker Compose files +- Kubernetes ConfigMaps and Secrets, if used +- Reverse proxy, ingress, DNS, and TLS certificate configuration +- Registry credentials, license configuration, and any custom image mirror settings + +For AWS deployments that use [AWS Parameter Store](./configuration/aws_parameter_store), include the parameter hierarchy and IAM policies required for Seqera to read those values. + +### Redis + +Redis is used for caching and coordination, not as the system of record. In most environments, the priority is to recreate a working Redis service before Platform starts rather than to restore Redis from backup. Document which Redis service you use and how to rebuild it. + +### External dependencies + +Platform recovery also depends on services outside the application itself. Document how to recreate or reconnect: + +- Container registries and image mirrors +- Object storage buckets used by pipelines +- Compute environment credentials and IAM roles +- SMTP configuration +- Identity provider integration +- Studios prerequisites and custom container images + +If your recovery scenario includes recreating the full cloud account, verify that these external dependencies are either reproducible or owned by another team with a compatible DR plan. + +## Choose a deployment-specific DR posture + +### Docker Compose deployments + +Docker Compose deployments are suitable for evaluation, development, and smaller production environments. For DR, plan around full environment replacement rather than in-place failover: + +- Keep infrastructure definitions for the VM, storage volumes, DNS, and network rules outside the instance itself. +- Take scheduled database snapshots and, if you host supporting services locally, snapshot the attached volumes as well. 
+- Expect service downtime during recovery and during many maintenance operations. +- Validate whether your acceptable outage window matches this model before using Docker Compose for production. + +### Kubernetes deployments + +Kubernetes deployments are the preferred option when you need a more repeatable production recovery path: + +- Store manifests, Helm values, ingress settings, and secret material in a controlled source of truth. +- Use managed database and Redis services where possible so the cluster can be rebuilt independently of the data layer. +- Document the order for restoring cluster resources, shared secrets, and external DNS or load balancer configuration. +- If you rely on multiple backend replicas for availability, make sure the cron service remains a single instance after restoration. + +## Restore in a controlled order + +The restore order matters more than the individual commands. A typical recovery sequence is: + +1. Recreate the base infrastructure: network, DNS, load balancer, Kubernetes cluster or VM, database service, and Redis service. +2. Restore the Seqera SQL database from the most recent acceptable snapshot. +3. Recreate or restore Platform configuration, secrets, TLS certificates, registry access, and any parameter-store entries. +4. Deploy the Platform services. Ensure the cron service completes its startup tasks before relying on the backend. +5. Restore optional components such as Studios or pipeline optimization if your environment uses them. +6. Reconnect external integrations such as SMTP, identity providers, and container registries. + +:::warning +Do not treat application deployment alone as a DR test. A successful recovery must prove that the restored deployment can authenticate users, launch workflows, and access its required external services. +::: + +## Validate the recovered platform + +After restoration, run a short validation sequence before declaring the environment ready: + +1. Confirm that users can log in. +2. 
Confirm that your organization and workspace configuration is present. +3. Confirm that credentials, compute environments, and secrets are available where expected. +4. Launch a small validation workflow, such as the [deployment test workflow](./testing), and verify that logs and outputs are produced normally. +5. If you use Studios, launch a test Studio session. +6. If you use pipeline optimization, verify that the service starts and can read its database. + +Record the actual recovery duration and any manual fixes required so you can refine the runbook after each exercise. + +## Practice the plan + +A DR plan is only useful if it is exercised. As part of production readiness: + +- Run a scheduled recovery drill in a non-production environment. +- Verify that your backups can actually be restored. +- Measure the real RTO and RPO you achieved. +- Update the runbook when your deployment topology, secrets, integrations, or ownership changes. + +For broader production readiness checks, see the [production checklist](../getting-started/production-checklist). diff --git a/platform-enterprise_versioned_docs/version-25.3/getting-started/production-checklist.md b/platform-enterprise_versioned_docs/version-25.3/getting-started/production-checklist.md index 5c25f700c..cfe7b1fc3 100644 --- a/platform-enterprise_versioned_docs/version-25.3/getting-started/production-checklist.md +++ b/platform-enterprise_versioned_docs/version-25.3/getting-started/production-checklist.md @@ -2,7 +2,7 @@ title: "Production checklist" description: "A pre-production checklist for Seqera Platform." date created: "2025-07-03" -last updated: "2026-03-25" +last updated: "2026-04-07" tags: [production, checklist, deployment, limitations, retry] --- @@ -83,6 +83,17 @@ Do not rotate credentials during active pipeline runs. Schedule rotations during Use [Pipeline Secrets](../secrets/overview) to manage sensitive values such as API keys for third-party services. 
Secrets are injected at runtime and are not exposed in pipeline logs or configuration files. +## Disaster recovery planning + +Teams often discover gaps in disaster recovery planning only when they are asked to prepare for an audit or simulation exercise. Before go-live: + +- Define your recovery time objective (RTO) and recovery point objective (RPO). +- Decide whether your DR scenario assumes in-place recovery or full account recreation. +- Verify that you back up the Seqera database, deployment configuration, secrets, TLS assets, and external dependency configuration on a schedule that matches your RPO. +- Run at least one recovery drill in a non-production environment and record the real recovery time and manual steps required. + +See [Disaster recovery](../enterprise/disaster-recovery) for a deployment-focused recovery planning guide. + ## Compute environment permissions Permissions within shared compute environments are a frequent source of unexpected behavior, particularly when multiple teams use the same workspace. 
From cb4a8ed8582d8ff25c9d049418d1fbf53331d469 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:12:23 -0400 Subject: [PATCH 02/15] EDU-789: add platform disaster recovery docs --- .../enterprise/disaster-recovery.md | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 platform-enterprise_docs/enterprise/disaster-recovery.md diff --git a/platform-enterprise_docs/enterprise/disaster-recovery.md b/platform-enterprise_docs/enterprise/disaster-recovery.md new file mode 100644 index 000000000..e63781baf --- /dev/null +++ b/platform-enterprise_docs/enterprise/disaster-recovery.md @@ -0,0 +1,91 @@ +--- +title: "Platform disaster recovery" +description: Plan backup, restore, and recovery steps for Seqera Platform Enterprise deployments +date created: "2026-04-07" +tags: [installation, deployment, disaster recovery, backup, restore] +--- + +Use this guide to define a disaster recovery (DR) plan for Seqera Platform Enterprise before you need to restore service after an infrastructure loss or a region-level incident. + +Seqera Platform does not create a DR plan for you. Your recovery procedure depends on the infrastructure that hosts Platform, your database and Redis services, your container registry access, and the backup capabilities offered by your cloud provider or platform team. + +## What to protect + +Back up and document the parts of your deployment that you will need to rebuild Platform: + +- The Platform SQL database and its restore procedure. +- Your Platform configuration, including `tower.env`, `tower.yml`, Helm values, Kubernetes manifests, or `docker-compose.yml`. +- Your `TOWER_CRYPTO_SECRETKEY` value and any rotation-related keys. Existing encrypted secrets in the Platform database cannot be decrypted without the correct key material. +- TLS certificates, identity provider settings, registry credentials, and any other secrets required to start Platform. 
+- The storage locations and infrastructure dependencies referenced by your Platform deployment, such as load balancers, DNS records, persistent volumes, and mirrored container images. + +:::warning +Back up your Platform database before changing the crypto secret key or running key rotation. For more information, see [Configuration overview](./configuration/overview#secret-key-rotation). +::: + +## Define recovery targets + +Document the following targets with your operations team: + +- Recovery point objective (RPO): how much recent Platform state you can afford to lose. +- Recovery time objective (RTO): how long Platform can remain unavailable. +- Recovery owner: who can restore the database, recreate infrastructure, and validate the application. + +Your deployment model directly affects these targets: + +- Kubernetes and Helm deployments can be rebuilt on new infrastructure more easily, especially when Platform runs with external managed database and Redis services. +- Docker Compose deployments are single-instance by design. Restoring them normally requires application downtime while the host, configuration, and backing services are rebuilt. + +## Recommended backup strategy + +At minimum, maintain: + +1. Regular database backups or snapshots for the SQL database used by Platform. +2. Version-controlled copies of your deployment manifests and configuration overrides. +3. A secure copy of the active crypto secret key and any required supporting secrets. +4. A written restore runbook that includes DNS, ingress, load balancer, and certificate steps. + +For production environments, use the backup and replication features provided by your infrastructure: + +- Managed SQL backups, snapshots, and cross-region replicas where required by your RPO and RTO. +- Backups for any persistent volumes or host-attached storage used by your deployment. +- Registry mirroring for Platform images if your environment cannot rely on direct access to `cr.seqera.io` during recovery. 
+ +## Recovery workflow + +### Kubernetes or Helm deployments + +1. Recreate or fail over the Kubernetes cluster and its supporting infrastructure. +2. Restore access to the SQL database, Redis service, secrets, ingress, and DNS records. +3. Restore the SQL database from the selected backup or snapshot. +4. Reapply your Helm values or Kubernetes manifests. +5. Confirm that Platform starts with the same crypto secret key used to encrypt the existing database contents. +6. Validate login, workspace access, and workflow launch behavior. + +### Docker Compose deployments + +1. Provision a replacement host or recover the existing host. +2. Restore `tower.env`, `tower.yml`, `docker-compose.yml`, certificates, and secret material. +3. Restore or recreate the external SQL database and Redis service used by Platform. +4. Start Platform with `docker compose up` and allow migrations and startup checks to finish. +5. Validate login, workspace access, and workflow launch behavior before switching traffic back. + +## Validation checklist + +Test your DR plan on a schedule that matches your organization's risk requirements. During each exercise, confirm that you can: + +- Restore the database from a recent backup. +- Start Platform with the correct crypto secret key and configuration. +- Reach the frontend through the expected DNS and TLS path. +- Log in and access organizations, workspaces, and compute environments. +- Launch a small workflow to verify end-to-end operation. + +The [Test deployment](./testing) guide provides a simple post-recovery smoke test you can adapt for DR exercises. 
+ +## Related guides + +- [Platform installation overview](./install-platform) +- [Platform: Helm](./platform-helm) +- [Platform: Kubernetes](./platform-kubernetes) +- [Platform: Docker Compose](./platform-docker-compose) +- [Test deployment](./testing) From 9545e39159663323324144bb12b0da44f805a6b2 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:19 -0400 Subject: [PATCH 03/15] EDU-789: add platform disaster recovery docs --- platform-enterprise_docs/enterprise/install-platform.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/platform-enterprise_docs/enterprise/install-platform.md b/platform-enterprise_docs/enterprise/install-platform.md index d91df41b3..d73d628f4 100644 --- a/platform-enterprise_docs/enterprise/install-platform.md +++ b/platform-enterprise_docs/enterprise/install-platform.md @@ -18,6 +18,8 @@ Seqera Platform Enterprise can be deployed using Docker Compose, Kubernetes, or See each deployment guide for detailed requirements. +For backup, restore, and recovery planning, see [Platform disaster recovery](./disaster-recovery). 
+ ## Prerequisites :::info From ed19fe9993bda3680639eac31bd8ec43c4d1de15 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:45 -0400 Subject: [PATCH 04/15] EDU-789: add platform disaster recovery docs From b1df8b7e91a284a19fc431a1b70005c18ec37b0d Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:46 -0400 Subject: [PATCH 05/15] EDU-789: add platform disaster recovery docs --- .../enterprise/disaster-recovery.md | 155 +++++++----------- 1 file changed, 59 insertions(+), 96 deletions(-) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md index 841d53863..e63781baf 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/disaster-recovery.md @@ -1,128 +1,91 @@ --- -title: "Disaster recovery" -description: "Plan disaster recovery for self-hosted Seqera Platform deployments" +title: "Platform disaster recovery" +description: Plan backup, restore, and recovery steps for Seqera Platform Enterprise deployments date created: "2026-04-07" -last updated: "2026-04-07" -tags: [enterprise, disaster recovery, backup, restore, operations] +tags: [installation, deployment, disaster recovery, backup, restore] --- -This guide outlines a practical disaster recovery (DR) approach for self-hosted Seqera Platform deployments. Use it to define what must be restored, which parts of the environment should be rebuilt from infrastructure-as-code, and how to validate that the restored platform is ready for users. +Use this guide to define a disaster recovery (DR) plan for Seqera Platform Enterprise before you need to restore service after an infrastructure loss or a region-level incident. 
-The exact recovery procedure depends on your deployment model, your cloud architecture, and your recovery time objective (RTO) and recovery point objective (RPO). Seqera does not provide a single turnkey DR template for every environment, but the guidance on this page covers the minimum state and validation steps that most teams need. +Seqera Platform does not create a DR plan for you. Your recovery procedure depends on the infrastructure that hosts Platform, your database and Redis services, your container registry access, and the backup capabilities offered by your cloud provider or platform team. -## Define your recovery target +## What to protect -Before you write a DR runbook, decide what a successful recovery means for your deployment: +Back up and document the parts of your deployment that you will need to rebuild Platform: -- Whether you are recovering the existing environment or recreating the account, subscription, or project that hosts Platform. -- How much data loss is acceptable between the last good backup and the restored system. -- How long Platform can remain unavailable before users must switch to another workflow. -- Which services must return first: Platform login, pipeline launches, Studios, or pipeline optimization. +- The Platform SQL database and its restore procedure. +- Your Platform configuration, including `tower.env`, `tower.yml`, Helm values, Kubernetes manifests, or `docker-compose.yml`. +- Your `TOWER_CRYPTO_SECRETKEY` value and any rotation-related keys. Existing encrypted secrets in the Platform database cannot be decrypted without the correct key material. +- TLS certificates, identity provider settings, registry credentials, and any other secrets required to start Platform. +- The storage locations and infrastructure dependencies referenced by your Platform deployment, such as load balancers, DNS records, persistent volumes, and mirrored container images. 
-Your deployment model affects the recovery shape: - -- [Docker Compose](./platform-docker-compose) is best treated as a single-instance recovery path with a longer outage window. -- [Kubernetes](./platform-kubernetes) is better suited to production environments that need higher availability and faster infrastructure replacement. - -:::note -Running multiple backend replicas improves availability during normal operations, but it is not a substitute for database backups, configuration backups, or a tested restore procedure. +:::warning +Back up your Platform database before changing the crypto secret key or running key rotation. For more information, see [Configuration overview](./configuration/overview#secret-key-rotation). ::: -## Back up the required state - -At minimum, your DR plan should cover the following components. - -### SQL database +## Define recovery targets -The SQL database is the primary persistent state for Seqera Platform. Back up the database before upgrades and on a schedule that matches your RPO. If you use a managed service such as Amazon RDS, document both the snapshot schedule and the exact restore procedure. +Document the following targets with your operations team: -If you use the pipeline optimization service and its `groundswell` database is hosted separately, include that database in the same backup and restore plan. +- Recovery point objective (RPO): how much recent Platform state you can afford to lose. +- Recovery time objective (RTO): how long Platform can remain unavailable. +- Recovery owner: who can restore the database, recreate infrastructure, and validate the application. -### Configuration and secrets +Your deployment model directly affects these targets: -Back up the configuration that is required to recreate the deployment: +- Kubernetes and Helm deployments can be rebuilt on new infrastructure more easily, especially when Platform runs with external managed database and Redis services. 
+- Docker Compose deployments are single-instance by design. Restoring them normally requires application downtime while the host, configuration, and backing services are rebuilt. -- `tower.env` -- `tower.yml` -- Helm values files, Kubernetes manifests, or Docker Compose files -- Kubernetes ConfigMaps and Secrets, if used -- Reverse proxy, ingress, DNS, and TLS certificate configuration -- Registry credentials, license configuration, and any custom image mirror settings +## Recommended backup strategy -For AWS deployments that use [AWS Parameter Store](./configuration/aws_parameter_store), include the parameter hierarchy and IAM policies required for Seqera to read those values. +At minimum, maintain: -### Redis +1. Regular database backups or snapshots for the SQL database used by Platform. +2. Version-controlled copies of your deployment manifests and configuration overrides. +3. A secure copy of the active crypto secret key and any required supporting secrets. +4. A written restore runbook that includes DNS, ingress, load balancer, and certificate steps. -Redis is used for caching and coordination, not as the system of record. In most environments, the priority is to recreate a working Redis service before Platform starts rather than to restore Redis from backup. Document which Redis service you use and how to rebuild it. +For production environments, use the backup and replication features provided by your infrastructure: -### External dependencies +- Managed SQL backups, snapshots, and cross-region replicas where required by your RPO and RTO. +- Backups for any persistent volumes or host-attached storage used by your deployment. +- Registry mirroring for Platform images if your environment cannot rely on direct access to `cr.seqera.io` during recovery. -Platform recovery also depends on services outside the application itself. 
Document how to recreate or reconnect: +## Recovery workflow -- Container registries and image mirrors -- Object storage buckets used by pipelines -- Compute environment credentials and IAM roles -- SMTP configuration -- Identity provider integration -- Studios prerequisites and custom container images +### Kubernetes or Helm deployments -If your recovery scenario includes recreating the full cloud account, verify that these external dependencies are either reproducible or owned by another team with a compatible DR plan. - -## Choose a deployment-specific DR posture +1. Recreate or fail over the Kubernetes cluster and its supporting infrastructure. +2. Restore access to the SQL database, Redis service, secrets, ingress, and DNS records. +3. Restore the SQL database from the selected backup or snapshot. +4. Reapply your Helm values or Kubernetes manifests. +5. Confirm that Platform starts with the same crypto secret key used to encrypt the existing database contents. +6. Validate login, workspace access, and workflow launch behavior. ### Docker Compose deployments -Docker Compose deployments are suitable for evaluation, development, and smaller production environments. For DR, plan around full environment replacement rather than in-place failover: - -- Keep infrastructure definitions for the VM, storage volumes, DNS, and network rules outside the instance itself. -- Take scheduled database snapshots and, if you host supporting services locally, snapshot the attached volumes as well. -- Expect service downtime during recovery and during many maintenance operations. -- Validate whether your acceptable outage window matches this model before using Docker Compose for production. - -### Kubernetes deployments - -Kubernetes deployments are the preferred option when you need a more repeatable production recovery path: - -- Store manifests, Helm values, ingress settings, and secret material in a controlled source of truth. 
-- Use managed database and Redis services where possible so the cluster can be rebuilt independently of the data layer. -- Document the order for restoring cluster resources, shared secrets, and external DNS or load balancer configuration. -- If you rely on multiple backend replicas for availability, make sure the cron service remains a single instance after restoration. - -## Restore in a controlled order - -The restore order matters more than the individual commands. A typical recovery sequence is: - -1. Recreate the base infrastructure: network, DNS, load balancer, Kubernetes cluster or VM, database service, and Redis service. -2. Restore the Seqera SQL database from the most recent acceptable snapshot. -3. Recreate or restore Platform configuration, secrets, TLS certificates, registry access, and any parameter-store entries. -4. Deploy the Platform services. Ensure the cron service completes its startup tasks before relying on the backend. -5. Restore optional components such as Studios or pipeline optimization if your environment uses them. -6. Reconnect external integrations such as SMTP, identity providers, and container registries. - -:::warning -Do not treat application deployment alone as a DR test. A successful recovery must prove that the restored deployment can authenticate users, launch workflows, and access its required external services. -::: - -## Validate the recovered platform - -After restoration, run a short validation sequence before declaring the environment ready: +1. Provision a replacement host or recover the existing host. +2. Restore `tower.env`, `tower.yml`, `docker-compose.yml`, certificates, and secret material. +3. Restore or recreate the external SQL database and Redis service used by Platform. +4. Start Platform with `docker compose up` and allow migrations and startup checks to finish. +5. Validate login, workspace access, and workflow launch behavior before switching traffic back. -1. Confirm that users can log in. -2. 
Confirm that your organization and workspace configuration is present. -3. Confirm that credentials, compute environments, and secrets are available where expected. -4. Launch a small validation workflow, such as the [deployment test workflow](./testing), and verify that logs and outputs are produced normally. -5. If you use Studios, launch a test Studio session. -6. If you use pipeline optimization, verify that the service starts and can read its database. +## Validation checklist -Record the actual recovery duration and any manual fixes required so you can refine the runbook after each exercise. +Test your DR plan on a schedule that matches your organization's risk requirements. During each exercise, confirm that you can: -## Practice the plan +- Restore the database from a recent backup. +- Start Platform with the correct crypto secret key and configuration. +- Reach the frontend through the expected DNS and TLS path. +- Log in and access organizations, workspaces, and compute environments. +- Launch a small workflow to verify end-to-end operation. -A DR plan is only useful if it is exercised. As part of production readiness: +The [Test deployment](./testing) guide provides a simple post-recovery smoke test you can adapt for DR exercises. -- Run a scheduled recovery drill in a non-production environment. -- Verify that your backups can actually be restored. -- Measure the real RTO and RPO you achieved. -- Update the runbook when your deployment topology, secrets, integrations, or ownership changes. +## Related guides -For broader production readiness checks, see the [production checklist](../getting-started/production-checklist). 
+- [Platform installation overview](./install-platform) +- [Platform: Helm](./platform-helm) +- [Platform: Kubernetes](./platform-kubernetes) +- [Platform: Docker Compose](./platform-docker-compose) +- [Test deployment](./testing) From 7ce472b9398f21e79d000d95a0a492ab915abe77 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:47 -0400 Subject: [PATCH 06/15] EDU-789: add platform disaster recovery docs From 655f3ad465dfa21c01ee17ef0478df60168d66ef Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:48 -0400 Subject: [PATCH 07/15] EDU-789: add platform disaster recovery docs --- .../version-25.3/enterprise/install-platform.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/install-platform.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/install-platform.md index 4f7b4d09d..a5311a588 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/install-platform.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/install-platform.md @@ -18,6 +18,8 @@ Seqera Platform Enterprise can be deployed using Docker Compose, Kubernetes, or See each deployment guide for detailed requirements. +For backup, restore, and recovery planning, see [Platform disaster recovery](./disaster-recovery). 
+ ## Prerequisites :::info From 9f6fd77b923e7cac54af0859bc76221d6f37dcd2 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:49 -0400 Subject: [PATCH 08/15] EDU-789: add platform disaster recovery docs --- platform-enterprise_docs/enterprise/platform-helm.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/platform-enterprise_docs/enterprise/platform-helm.md b/platform-enterprise_docs/enterprise/platform-helm.md index ca68df1d0..b36e5ecba 100644 --- a/platform-enterprise_docs/enterprise/platform-helm.md +++ b/platform-enterprise_docs/enterprise/platform-helm.md @@ -58,6 +58,10 @@ helm upgrade my-release oci://public.cr.seqera.io/charts/platform \ --values my-values.yaml ``` +## Disaster recovery planning + +Define your backup, restore, and validation procedure before promoting a Helm deployment to production. For disaster recovery guidance, including database backups, crypto key handling, and post-restore checks, see [Platform disaster recovery](./disaster-recovery). 
+ ## Uninstalling the Helm chart To uninstall the Seqera Platform Enterprise Helm chart, run the following command, replacing `my-release` and `my-namespace` with your release name and namespace: From 90805ac07ae5386b68933d957e95d6b3cbcaf6d2 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:50 -0400 Subject: [PATCH 09/15] EDU-789: add platform disaster recovery docs --- .../version-25.3/enterprise/platform-helm.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-helm.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-helm.md index ca68df1d0..b36e5ecba 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-helm.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-helm.md @@ -58,6 +58,10 @@ helm upgrade my-release oci://public.cr.seqera.io/charts/platform \ --values my-values.yaml ``` +## Disaster recovery planning + +Define your backup, restore, and validation procedure before promoting a Helm deployment to production. For disaster recovery guidance, including database backups, crypto key handling, and post-restore checks, see [Platform disaster recovery](./disaster-recovery). 
+ ## Uninstalling the Helm chart To uninstall the Seqera Platform Enterprise Helm chart, run the following command, replacing `my-release` and `my-namespace` with your release name and namespace: From 3b2ccdd780482b49b2407746b5c7e822cef7390e Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:52 -0400 Subject: [PATCH 10/15] EDU-789: add platform disaster recovery docs --- platform-enterprise_docs/enterprise/platform-kubernetes.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/platform-enterprise_docs/enterprise/platform-kubernetes.md b/platform-enterprise_docs/enterprise/platform-kubernetes.md index 1b17566b9..235e07ac8 100644 --- a/platform-enterprise_docs/enterprise/platform-kubernetes.md +++ b/platform-enterprise_docs/enterprise/platform-kubernetes.md @@ -204,6 +204,8 @@ To configure Seqera Enterprise for high availability, note that: - The `cron` service may only have a single instance - The `groundswell` service may only have a single instance +For backup, restore, and validation planning, see [Platform disaster recovery](./disaster-recovery). 
+ [aws-configure-ingress]: https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/guide/ingress/annotations/ [azure-configure-ingress]: https://docs.microsoft.com/en-us/azure/application-gateway/ingress-controller-annotations [google-configure-ingress]: https://cloud.google.com/kubernetes-engine/docs/concepts/ingress From 2d6065a7188b48c6869588debdfaaaf0bb6d5f56 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:53 -0400 Subject: [PATCH 11/15] EDU-789: add platform disaster recovery docs --- .../version-25.3/enterprise/platform-kubernetes.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-kubernetes.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-kubernetes.md index 905bb9585..8d127a168 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-kubernetes.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-kubernetes.md @@ -204,6 +204,8 @@ To configure Seqera Enterprise for high availability, note that: - The `cron` service may only have a single instance - The `groundswell` service may only have a single instance +For backup, restore, and validation planning, see [Platform disaster recovery](./disaster-recovery). 
+ [aws-configure-ingress]: https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/guide/ingress/annotations/ [azure-configure-ingress]: https://docs.microsoft.com/en-us/azure/application-gateway/ingress-controller-annotations [google-configure-ingress]: https://cloud.google.com/kubernetes-engine/docs/concepts/ingress From f6d9f92a7fa1960ebd223bc121e69353b5e7f9b9 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:54 -0400 Subject: [PATCH 12/15] EDU-789: add platform disaster recovery docs --- .../enterprise/platform-docker-compose.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/platform-enterprise_docs/enterprise/platform-docker-compose.md b/platform-enterprise_docs/enterprise/platform-docker-compose.md index d45091502..a1d66f98e 100644 --- a/platform-enterprise_docs/enterprise/platform-docker-compose.md +++ b/platform-enterprise_docs/enterprise/platform-docker-compose.md @@ -115,3 +115,7 @@ Seqera Platform offers a service that optimizes pipeline resource requests. Refe :::note Studios is available from Seqera Platform v24.1. If you experience any problems during the deployment process please contact your account executive. Studios in Enterprise is not installed by default. ::: + +## Disaster recovery planning + +Docker Compose deployments are single-instance by design, so recovery normally requires service downtime while you restore the host, configuration, and backing services. For backup, restore, and validation guidance, see [Platform disaster recovery](./disaster-recovery). 
From 54c65817da5a446ae055ea4e9e69db94005fa555 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:55 -0400 Subject: [PATCH 13/15] EDU-789: add platform disaster recovery docs --- .../version-25.3/enterprise/platform-docker-compose.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-docker-compose.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-docker-compose.md index d45091502..a1d66f98e 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-docker-compose.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/platform-docker-compose.md @@ -115,3 +115,7 @@ Seqera Platform offers a service that optimizes pipeline resource requests. Refe :::note Studios is available from Seqera Platform v24.1. If you experience any problems during the deployment process please contact your account executive. Studios in Enterprise is not installed by default. ::: + +## Disaster recovery planning + +Docker Compose deployments are single-instance by design, so recovery normally requires service downtime while you restore the host, configuration, and backing services. For backup, restore, and validation guidance, see [Platform disaster recovery](./disaster-recovery). 
From 102eb43194ee0afff762db6c57e4053accb195fc Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:57 -0400 Subject: [PATCH 14/15] EDU-789: add platform disaster recovery docs --- platform-enterprise_docs/enterprise-sidebar.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/platform-enterprise_docs/enterprise-sidebar.json b/platform-enterprise_docs/enterprise-sidebar.json index a22110dc6..4449e3d81 100644 --- a/platform-enterprise_docs/enterprise-sidebar.json +++ b/platform-enterprise_docs/enterprise-sidebar.json @@ -21,7 +21,8 @@ "items": [ "enterprise/platform-helm", "enterprise/platform-kubernetes", - "enterprise/platform-docker-compose" + "enterprise/platform-docker-compose", + "enterprise/disaster-recovery" ] }, { From 8859eb45a7c7a3cb8ab2c4bb7cd93d2577cee473 Mon Sep 17 00:00:00 2001 From: Llewellyn vd Berg <113503285+llewellyn-sl@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:13:58 -0400 Subject: [PATCH 15/15] EDU-789: add platform disaster recovery docs --- .../version-25.3/enterprise-sidebar.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json b/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json index a29c40531..0f2869bd2 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise-sidebar.json @@ -21,7 +21,8 @@ "items": [ "enterprise/platform-helm", "enterprise/platform-kubernetes", - "enterprise/platform-docker-compose" + "enterprise/platform-docker-compose", + "enterprise/disaster-recovery" ] }, { @@ -88,7 +89,6 @@ }, "enterprise/testing", "enterprise/upgrade", - "enterprise/disaster-recovery", { "type": "category", "label": "Advanced",