From f280fbc460d5fa442f1f9008edf4030aee40aac2 Mon Sep 17 00:00:00 2001 From: Alex Lutay <1928266+taurus-forever@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:46:23 +0000 Subject: [PATCH 1/2] [DPE-9964] Pre-upgrade switchover to minimize client write downtime during refresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the Patroni primary is also the Juju leader (upgraded last), clients lose write access for the entire upgrade cycle because no unit can update the relation endpoints. Perform a graceful Patroni switchover before the snap refresh so the endpoint is updated while the unit is still responsive. Falls back to the current automatic failover behavior if the switchover fails. Also, to commit the endpoint change to the client, the charm has to defer upgrade_granted, as Juju batches relation data changes and only commits them when the hook exits. The previous approach updated endpoints inside _on_upgrade_granted, but the client wouldn't see the change until after the snap refresh completed — defeating the purpose. Now the switchover + endpoint update happens in the first invocation, which defers the event and returns. Juju commits the endpoint change, the client sees the new primary immediately, and the deferred event fires a second time to proceed with the snap refresh. Skip the pre-upgrade switchover for single-unit applications or when the snap revision is unchanged. 
Assisted-by: Claude:claude-4.6-opus --- src/upgrade.py | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/upgrade.py b/src/upgrade.py index 795aacf060b..66784bf9104 100644 --- a/src/upgrade.py +++ b/src/upgrade.py @@ -6,6 +6,7 @@ import json import logging +from charmlibs import snap from charms.data_platform_libs.v0.upgrade import ( ClusterNotReadyError, DataUpgrade, @@ -18,15 +19,17 @@ from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed from typing_extensions import override +from cluster import SwitchoverFailedError from constants import ( APP_SCOPE, MONITORING_PASSWORD_KEY, MONITORING_USER, PATRONI_PASSWORD_KEY, + POSTGRESQL_SNAP_NAME, RAFT_PASSWORD_KEY, SNAP_PACKAGES, ) -from utils import new_password +from utils import new_password, snap_refreshed logger = logging.getLogger(__name__) @@ -141,8 +144,48 @@ def _on_upgrade_charm_check_legacy(self) -> None: self._prepare_upgrade_from_legacy() self.on.upgrade_charm.emit() + def _pre_upgrade_switchover(self, event: UpgradeGrantedEvent) -> bool: + """Switchover primary before upgrading, to minimize client write downtime. + + Returns True if the event was deferred after the switchover (caller should return). 
+ """ + if len(self.peer_relation.units) == 0: + return False + + if snap_refreshed(snap.SnapCache()[POSTGRESQL_SNAP_NAME].revision): + logger.info("Snap is already at the target revision, skipping pre-upgrade switchover") + return False + + if self.unit_upgrade_data.get("pre-upgrade-switchover-done"): + self.unit_upgrade_data.update({"pre-upgrade-switchover-done": ""}) + return False + + old_primary = self.charm._patroni.get_primary() + if old_primary is None or old_primary != self.charm.unit.name.replace("/", "-"): + return False + + logger.info("Switching over primary before upgrading") + self.charm.unit.status = MaintenanceStatus("switching over primary") + self.charm._patroni.switchover() + self.charm._patroni.primary_changed(old_primary) + logger.info("Primary switchover completed") + + if self.charm.unit.is_leader(): + self.charm._update_relation_endpoints() + + self.unit_upgrade_data.update({"pre-upgrade-switchover-done": "true"}) + logger.info("Deferring upgrade to let Juju commit endpoint changes to client relations") + event.defer() + return True + @override def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: + try: + if self._pre_upgrade_switchover(event): + return + except (RetryError, SwitchoverFailedError) as e: + logger.warning("Pre-upgrade switchover failed: %s. Proceeding with upgrade.", e) + # Refresh the charmed PostgreSQL snap and restart the database. # Update the configuration. 
self.charm.unit.status = MaintenanceStatus("updating configuration") From e58ff172ec309f0a059b0601d43c6bbdc6de767f Mon Sep 17 00:00:00 2001 From: Alex Lutay <1928266+taurus-forever@users.noreply.github.com> Date: Mon, 27 Apr 2026 13:57:23 +0200 Subject: [PATCH 2/2] Bump snap revision to trigger upgrade tests --- src/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/constants.py b/src/constants.py index 0ceb6dc6cae..6f6be8d474a 100644 --- a/src/constants.py +++ b/src/constants.py @@ -40,7 +40,7 @@ SNAP_PACKAGES = [ ( POSTGRESQL_SNAP_NAME, - {"revision": {"aarch64": "280", "x86_64": "281"}}, + {"revision": {"aarch64": "291", "x86_64": "290"}}, ) ]