From c0a5761769b45f7c8377f399597c792636564065 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 26 May 2026 17:09:01 +0300 Subject: [PATCH 01/11] Try to be less restrictive when cleaning up raft --- src/relations/watcher_requirer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index add8c56..117ef81 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -232,6 +232,9 @@ def _get_patroni_cas(self, relation: Relation) -> str | None: return name return f"relation-{relation.id}" + def _get_related_ips(self, relation: Relation) -> list[str]: + return [data["unit-address"] for data in relation.data.values() if "unit-address" in data] + def _get_standby_clusters(self, relation: Relation) -> list[str]: """Get related standby clusters from the relation app data. @@ -327,7 +330,7 @@ def _update_unit_address_if_changed(self) -> None: new_address, raft_password, partner_addrs, port ) raft_controller.cleanup_raft_cluster( - new_address, raft_password, partner_addrs, port + new_address, raft_password, self._get_related_ips(relation), port ) def _on_update_status(self, event: UpdateStatusEvent) -> None: From a7fe4a4d8e94b21053ee277281482cf6c5524515 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 27 May 2026 12:00:25 +0300 Subject: [PATCH 02/11] Check correct address --- src/charm.py | 8 +------- src/raft_controller.py | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/charm.py b/src/charm.py index 0f7c7a5..b94bef8 100755 --- a/src/charm.py +++ b/src/charm.py @@ -98,7 +98,7 @@ def __init__(self, *args): handler.setFormatter(logging.Formatter("{name}:{message}", style="{")) # Watcher mode: lightweight Raft witness, no PostgreSQL - self._init_watcher_mode() + self.watcher_requirer = WatcherRequirerHandler(self) # Set tracing_endpoint for @trace_charm decorator compatibility self.tracing_endpoint = None @@ -119,12 +119,6 @@ def __init__(self, *args): else: self.refresh.next_unit_allowed_to_refresh = True - def _init_watcher_mode(self): - """Initialize the charm in watcher mode (lightweight Raft witness).""" - self.watcher_requirer = WatcherRequirerHandler(self) - # Watcher mode delegates all event handling to WatcherRequirerHandler. - # We still observe leader_elected to persist the role in peer data. - def _post_snap_refresh(self, refresh: charm_refresh.Machines): """Start PostgreSQL, check if this app and unit are healthy, and allow next unit to refresh. diff --git a/src/raft_controller.py b/src/raft_controller.py index aae3c06..d431882 100644 --- a/src/raft_controller.py +++ b/src/raft_controller.py @@ -302,7 +302,7 @@ def cleanup_raft_cluster( member_ip = member_addr.split(":")[0] # Check if this is a stale watcher (not a PostgreSQL node and not current watcher) - if member_ip not in partner_addrs and member_addr != member_address: + if member_ip not in partner_addrs and member_ip != member_address: logger.info(f"Removing stale Raft member: {member_addr}") self.remove_raft_member(member_addr, raft_password, []) return True From 84fbd4646b5bca61d21f05eac0555b0cfc90afe9 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 27 May 2026 14:38:48 +0300 Subject: [PATCH 03/11] Clean up on relation changed --- src/relations/watcher_requirer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index 117ef81..e7a4d86 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -479,6 +479,9 @@ def _on_watcher_relation_changed( port = self._get_port_for_relation(relation.id) raft_controller = RaftController(self.charm, f"rel{relation.id}") + raft_controller.cleanup_raft_cluster( + self.unit_ip, raft_password, self._get_related_ips(relation), port + ) if self._is_disabled(relation) or not self._should_watcher_vote(partner_addrs): logger.debug("Disabling the watcher") raft_controller.remove_service() From f5099e3fe52cc909360f5a903042e9b446119f9c Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 27 May 2026 15:46:36 +0300 Subject: [PATCH 04/11] Back to peers --- src/relations/watcher_requirer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index e7a4d86..6b69d5f 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -330,7 +330,7 @@ def _update_unit_address_if_changed(self) -> None: new_address, raft_password, partner_addrs, port ) raft_controller.cleanup_raft_cluster( - new_address, raft_password, self._get_related_ips(relation), port + new_address, raft_password, partner_addrs, port ) def _on_update_status(self, event: UpdateStatusEvent) -> None: @@ -479,9 +479,7 @@ def _on_watcher_relation_changed( port = self._get_port_for_relation(relation.id) raft_controller = RaftController(self.charm, f"rel{relation.id}") - raft_controller.cleanup_raft_cluster( - self.unit_ip, raft_password, self._get_related_ips(relation), port - ) + raft_controller.cleanup_raft_cluster(self.unit_ip, raft_password, partner_addrs, port) if self._is_disabled(relation) or not self._should_watcher_vote(partner_addrs): logger.debug("Disabling the watcher") raft_controller.remove_service() From c0e5fbfdca3d0597eeec891a0bd4368285d969ab Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 27 May 2026 22:58:22 +0300 Subject: [PATCH 05/11] Revert peers --- src/relations/watcher_requirer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index 6b69d5f..e7a4d86 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -330,7 +330,7 @@ def _update_unit_address_if_changed(self) -> None: new_address, raft_password, partner_addrs, port ) raft_controller.cleanup_raft_cluster( - new_address, raft_password, partner_addrs, port + new_address, raft_password, self._get_related_ips(relation), port ) def _on_update_status(self, event: UpdateStatusEvent) -> None: @@ -479,7 +479,9 @@ def _on_watcher_relation_changed( port = self._get_port_for_relation(relation.id) raft_controller = RaftController(self.charm, f"rel{relation.id}") - raft_controller.cleanup_raft_cluster(self.unit_ip, raft_password, partner_addrs, port) + raft_controller.cleanup_raft_cluster( + self.unit_ip, raft_password, self._get_related_ips(relation), port + ) if self._is_disabled(relation) or not self._should_watcher_vote(partner_addrs): logger.debug("Disabling the watcher") raft_controller.remove_service() From e56db98c267a883ebda04dc30314067be6926e72 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 28 May 2026 14:13:40 +0300 Subject: [PATCH 06/11] Use addresses consistently --- src/raft_controller.py | 16 +++++++--------- src/relations/watcher_requirer.py | 10 ++++++---- tests/unit/test_raft_controller.py | 10 +++++----- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/raft_controller.py b/src/raft_controller.py index d431882..2048ed8 100644 --- a/src/raft_controller.py +++ b/src/raft_controller.py @@ -187,15 +187,13 @@ def configure( return True def check_watcher_connection( - self, member_address: str, raft_password: str, partner_addrs: list[str], port: int + self, member_address: str, raft_password: str, partner_addrs: list[str] ) -> None: """Verify that the watcher has joined the Raft cluster.""" if not partner_addrs: logger.debug("Check connection early exit: No partners provided") return - watcher_addr = f"{member_address}:{port}" - # Get the status of the raft cluster. syncobj_util = TcpUtility(password=raft_password, timeout=3) @@ -203,7 +201,9 @@ def check_watcher_connection( try: for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(2)): with attempt: - if not (raft_status := syncobj_util.executeCommand(watcher_addr, ["status"])): + if not ( + raft_status := syncobj_util.executeCommand(member_address, ["status"]) + ): raise Exception("Raft watcher no status") logger.debug(f"Observer raft: {raft_status}") for key in raft_status: @@ -282,17 +282,15 @@ def restart(self) -> bool: return False def cleanup_raft_cluster( - self, member_address: str, raft_password: str, partner_addrs: list[str], port: int + self, member_address: str, raft_password: str, partner_addrs: list[str] ) -> bool: """Cleanup RAFT members not belonging to the current cluster or not a related watcher.""" # Get Raft cluster status to find all members try: - watcher_addr = f"{member_address}:{port}" - # Get the status of the raft cluster. syncobj_util = TcpUtility(password=raft_password, timeout=3) - for raft_host in [watcher_addr, *[f"{addr}:{RAFT_PORT}" for addr in partner_addrs]]: + for raft_host in [member_address, *[f"{addr}:{RAFT_PORT}" for addr in partner_addrs]]: if raft_status := syncobj_util.executeCommand(raft_host, ["status"]): # Find all partner nodes in the Raft cluster # Keys look like: partner_node_status_server_10.131.50.142:2222 @@ -302,7 +300,7 @@ def cleanup_raft_cluster( member_ip = member_addr.split(":")[0] # Check if this is a stale watcher (not a PostgreSQL node and not current watcher) - if member_ip not in partner_addrs and member_ip != member_address: + if member_ip not in partner_addrs and member_addr != member_address: logger.info(f"Removing stale Raft member: {member_addr}") self.remove_raft_member(member_addr, raft_password, []) return True diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index e7a4d86..bee802a 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -313,6 +313,7 @@ def _update_unit_address_if_changed(self) -> None: and (partner_addrs := self._get_raft_partner_addrs(relation)) ): port = self._get_port_for_relation(relation.id) + watcher_addr = f"{new_address}:{port}" raft_controller = RaftController(self.charm, f"rel{relation.id}") changed = raft_controller.configure( port, @@ -327,10 +328,10 @@ def _update_unit_address_if_changed(self) -> None: ) raft_controller.restart() raft_controller.check_watcher_connection( - new_address, raft_password, partner_addrs, port + watcher_addr, raft_password, partner_addrs ) raft_controller.cleanup_raft_cluster( - new_address, raft_password, self._get_related_ips(relation), port + watcher_addr, raft_password, self._get_related_ips(relation) ) def _on_update_status(self, event: UpdateStatusEvent) -> None: @@ -477,10 +478,11 @@ def _on_watcher_relation_changed( # Get or assign a port for this relation port = self._get_port_for_relation(relation.id) + watcher_addr = f"{self.unit_ip}:{port}" raft_controller = RaftController(self.charm, f"rel{relation.id}") raft_controller.cleanup_raft_cluster( - self.unit_ip, raft_password, self._get_related_ips(relation), port + watcher_addr, raft_password, self._get_related_ips(relation) ) if self._is_disabled(relation) or not self._should_watcher_vote(partner_addrs): logger.debug("Disabling the watcher") @@ -499,7 +501,7 @@ def _on_watcher_relation_changed( ) raft_controller.restart() raft_controller.check_watcher_connection( - unit_ip, raft_password, partner_addrs, port + watcher_addr, raft_password, partner_addrs ) relation.data[self.charm.unit]["unit-address"] = unit_ip diff --git a/tests/unit/test_raft_controller.py b/tests/unit/test_raft_controller.py index e2245e5..543c7a2 100644 --- a/tests/unit/test_raft_controller.py +++ b/tests/unit/test_raft_controller.py @@ -102,14 +102,14 @@ def test_check_watcher_connection(controller: RaftController): patch("raft_controller.stop_after_attempt", return_value=stop_after_delay(0)), ): # No partners - controller.check_watcher_connection("1.1.1.1", "testpass", [], 2223) + controller.check_watcher_connection("1.1.1.1:2223", "testpass", []) assert not _tcputility.called # Can't get watcher status _tcputility.return_value.executeCommand.side_effect = [{}] - controller.check_watcher_connection("1.1.1.1", "testpass", ["2.2.2.2", "3.3.3.3"], 2223) + controller.check_watcher_connection("1.1.1.1:2223", "testpass", ["2.2.2.2", "3.3.3.3"]) _tcputility.assert_called_once_with(password="testpass", timeout=3) _tcputility.return_value.executeCommand.assert_called_once_with("1.1.1.1:2223", ["status"]) @@ -124,7 +124,7 @@ def test_check_watcher_connection(controller: RaftController): } _tcputility.return_value.executeCommand.side_effect = [raft_status] - controller.check_watcher_connection("1.1.1.1", "testpass", ["2.2.2.2", "3.3.3.3"], 2223) + controller.check_watcher_connection("1.1.1.1:2223", "testpass", ["2.2.2.2", "3.3.3.3"]) _tcputility.assert_called_once_with(password="testpass", timeout=3) _tcputility.return_value.executeCommand.assert_called_once_with("1.1.1.1:2223", ["status"]) @@ -139,7 +139,7 @@ def test_check_watcher_connection(controller: RaftController): } _tcputility.return_value.executeCommand.side_effect = [raft_status, Exception, Exception] - controller.check_watcher_connection("1.1.1.1", "testpass", ["2.2.2.2", "3.3.3.3"], 2223) + controller.check_watcher_connection("1.1.1.1:2223", "testpass", ["2.2.2.2", "3.3.3.3"]) _tcputility.assert_called_once_with(password="testpass", timeout=3) assert _tcputility.return_value.executeCommand.call_count == 3 @@ -157,7 +157,7 @@ def test_check_watcher_connection(controller: RaftController): } _tcputility.return_value.executeCommand.side_effect = [raft_status, Exception, {1: 2}] - controller.check_watcher_connection("1.1.1.1", "testpass", ["2.2.2.2", "3.3.3.3"], 2223) + controller.check_watcher_connection("1.1.1.1:2223", "testpass", ["2.2.2.2", "3.3.3.3"]) _tcputility.assert_called_once_with(password="testpass", timeout=3) assert _tcputility.return_value.executeCommand.call_count == 3 From 9f8d3f4776560533520e4d8230ced3a49281fddb Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 28 May 2026 15:14:28 +0300 Subject: [PATCH 07/11] Use partner list --- src/relations/watcher_requirer.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index bee802a..3de9ac6 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -232,9 +232,6 @@ def _get_patroni_cas(self, relation: Relation) -> str | None: return name return f"relation-{relation.id}" - def _get_related_ips(self, relation: Relation) -> list[str]: - return [data["unit-address"] for data in relation.data.values() if "unit-address" in data] - def _get_standby_clusters(self, relation: Relation) -> list[str]: """Get related standby clusters from the relation app data. @@ -330,9 +327,7 @@ def _update_unit_address_if_changed(self) -> None: raft_controller.check_watcher_connection( watcher_addr, raft_password, partner_addrs ) - raft_controller.cleanup_raft_cluster( - watcher_addr, raft_password, self._get_related_ips(relation) - ) + raft_controller.cleanup_raft_cluster(watcher_addr, raft_password, partner_addrs) def _on_update_status(self, event: UpdateStatusEvent) -> None: """Handle update status event in watcher mode.""" @@ -481,9 +476,7 @@ def _on_watcher_relation_changed( watcher_addr = f"{self.unit_ip}:{port}" raft_controller = RaftController(self.charm, f"rel{relation.id}") - raft_controller.cleanup_raft_cluster( - watcher_addr, raft_password, self._get_related_ips(relation) - ) + raft_controller.cleanup_raft_cluster(watcher_addr, raft_password, partner_addrs) if self._is_disabled(relation) or not self._should_watcher_vote(partner_addrs): logger.debug("Disabling the watcher") raft_controller.remove_service() From e48296daa65b7158f50b68e6dc19ac0653a3355e Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 May 2026 00:14:35 +0300 Subject: [PATCH 08/11] Store IP addr in peer data --- src/relations/watcher_requirer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index 3de9ac6..c587e9f 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -281,14 +281,15 @@ def _on_leader_elected(self, _) -> None: def _update_unit_address_if_changed(self) -> None: """Update unit-address in relation data if IP has changed, for ALL relations.""" - if not (new_address := self.unit_ip): + if not (new_address := self.unit_ip) or not self.charm.unit.is_leader(): return + current_address = self.charm.app_peer_data.get("unit-address") + address_changed = current_address != new_address + unit_az = os.environ.get("JUJU_AVAILABILITY_ZONE") for relation in self.model.relations.get(WATCHER_RELATION, []): - current_address = relation.data[self.charm.unit].get("unit-address") current_az = relation.data[self.charm.app].get("unit-az") - address_changed = current_address != new_address az_changed = bool(unit_az and current_az != unit_az) if not address_changed and not az_changed: @@ -328,6 +329,7 @@ def _update_unit_address_if_changed(self) -> None: watcher_addr, raft_password, partner_addrs ) raft_controller.cleanup_raft_cluster(watcher_addr, raft_password, partner_addrs) + self.charm.app_peer_data["ip-address"] = new_address def _on_update_status(self, event: UpdateStatusEvent) -> None: """Handle update status event in watcher mode.""" From 435bbfc1ea1850b11cf7e9ede76458986c43494a Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 May 2026 04:06:36 +0300 Subject: [PATCH 09/11] Ffwd raft check --- .../integration/ha_tests/test_stereo_mode.py | 83 ++++++++++--------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/tests/integration/ha_tests/test_stereo_mode.py b/tests/integration/ha_tests/test_stereo_mode.py index d2f1120..0d4d79d 100644 --- a/tests/integration/ha_tests/test_stereo_mode.py +++ b/tests/integration/ha_tests/test_stereo_mode.py @@ -92,46 +92,49 @@ async def verify_raft_cluster_health( assert return_code == 0, f"Failed to get watcher address from {watcher_unit.name}" watcher_ip = watcher_ip.strip() - for attempt in Retrying(stop=stop_after_delay(180), wait=wait_fixed(5), reraise=True): - with attempt: - for unit in ops_test.model.applications[db_app_name].units: - # Get the Raft password from Patroni config using juju exec directly - # We need to avoid shell interpretation issues with run_command_on_unit - complete_command = [ - "exec", - "--unit", - unit.name, - "--", - "cat", - "/var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml", - ] - return_code, stdout, _ = await ops_test.juju(*complete_command) - assert return_code == 0, f"Failed to read patroni.yaml on {unit.name}" - - conf = safe_load(stdout) - password = conf.get("raft", {}).get("password") - self_addr = conf.get("raft", {}).get("self_addr") - assert password, f"Could not find Raft password in patroni.yaml on {unit.name}" - - # Check Raft status using the password - syncobj_util = TcpUtility(password=password, timeout=3) - status = syncobj_util.executeCommand(self_addr, ["status"]) - logger.info(f"Raft status on {unit.name}: {status}...") - - # Verify quorum - assert status["has_quorum"] is True, f"Unit {unit.name} does not have Raft quorum" - - assert status["partner_nodes_count"] + 1 == expected_members - - # Verify watcher is in the cluster (if requested) - # After network isolation tests, the watcher may have been redeployed - # with a new IP that isn't yet updated in the Raft configuration - if check_watcher_ip: - assert watcher_ip in [ - key.split(":")[0].split(RAFT_PARTNER_PREFIX)[-1] - for key in status - if key.startswith(RAFT_PARTNER_PREFIX) - ], f"Watcher {watcher_ip} not found in Raft cluster on {unit.name}" + async with ops_test.fast_forward(): + for attempt in Retrying(stop=stop_after_delay(180), wait=wait_fixed(5), reraise=True): + with attempt: + for unit in ops_test.model.applications[db_app_name].units: + # Get the Raft password from Patroni config using juju exec directly + # We need to avoid shell interpretation issues with run_command_on_unit + complete_command = [ + "exec", + "--unit", + unit.name, + "--", + "cat", + "/var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml", + ] + return_code, stdout, _ = await ops_test.juju(*complete_command) + assert return_code == 0, f"Failed to read patroni.yaml on {unit.name}" + + conf = safe_load(stdout) + password = conf.get("raft", {}).get("password") + self_addr = conf.get("raft", {}).get("self_addr") + assert password, f"Could not find Raft password in patroni.yaml on {unit.name}" + + # Check Raft status using the password + syncobj_util = TcpUtility(password=password, timeout=3) + status = syncobj_util.executeCommand(self_addr, ["status"]) + logger.info(f"Raft status on {unit.name}: {status}") + + # Verify quorum + assert status["has_quorum"] is True, ( + f"Unit {unit.name} does not have Raft quorum" + ) + + assert status["partner_nodes_count"] + 1 == expected_members + + # Verify watcher is in the cluster (if requested) + # After network isolation tests, the watcher may have been redeployed + # with a new IP that isn't yet updated in the Raft configuration + if check_watcher_ip: + assert watcher_ip in [ + key.split(":")[0].split(RAFT_PARTNER_PREFIX)[-1] + for key in status + if key.startswith(RAFT_PARTNER_PREFIX) + ], f"Watcher {watcher_ip} not found in Raft cluster on {unit.name}" logger.info("Raft cluster health verified successfully") From bc6e3befdea2b55f4b48a1774db9b7a434dc4849 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 May 2026 05:32:17 +0300 Subject: [PATCH 10/11] Wrong key --- src/relations/watcher_requirer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relations/watcher_requirer.py b/src/relations/watcher_requirer.py index c587e9f..104a55b 100644 --- a/src/relations/watcher_requirer.py +++ b/src/relations/watcher_requirer.py @@ -329,7 +329,7 @@ def _update_unit_address_if_changed(self) -> None: watcher_addr, raft_password, partner_addrs ) raft_controller.cleanup_raft_cluster(watcher_addr, raft_password, partner_addrs) - self.charm.app_peer_data["ip-address"] = new_address + self.charm.app_peer_data["unit-address"] = new_address def _on_update_status(self, event: UpdateStatusEvent) -> None: """Handle update status event in watcher mode.""" From bbb2b832bdb3512ceb541f47694e915dcee7c28f Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 May 2026 06:09:07 +0300 Subject: [PATCH 11/11] Revert ffwd --- .../integration/ha_tests/test_stereo_mode.py | 83 +++++++++---------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/tests/integration/ha_tests/test_stereo_mode.py b/tests/integration/ha_tests/test_stereo_mode.py index 0d4d79d..f24f986 100644 --- a/tests/integration/ha_tests/test_stereo_mode.py +++ b/tests/integration/ha_tests/test_stereo_mode.py @@ -92,49 +92,46 @@ async def verify_raft_cluster_health( assert return_code == 0, f"Failed to get watcher address from {watcher_unit.name}" watcher_ip = watcher_ip.strip() - async with ops_test.fast_forward(): - for attempt in Retrying(stop=stop_after_delay(180), wait=wait_fixed(5), reraise=True): - with attempt: - for unit in ops_test.model.applications[db_app_name].units: - # Get the Raft password from Patroni config using juju exec directly - # We need to avoid shell interpretation issues with run_command_on_unit - complete_command = [ - "exec", - "--unit", - unit.name, - "--", - "cat", - "/var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml", - ] - return_code, stdout, _ = await ops_test.juju(*complete_command) - assert return_code == 0, f"Failed to read patroni.yaml on {unit.name}" - - conf = safe_load(stdout) - password = conf.get("raft", {}).get("password") - self_addr = conf.get("raft", {}).get("self_addr") - assert password, f"Could not find Raft password in patroni.yaml on {unit.name}" - - # Check Raft status using the password - syncobj_util = TcpUtility(password=password, timeout=3) - status = syncobj_util.executeCommand(self_addr, ["status"]) - logger.info(f"Raft status on {unit.name}: {status}") - - # Verify quorum - assert status["has_quorum"] is True, ( - f"Unit {unit.name} does not have Raft quorum" - ) - - assert status["partner_nodes_count"] + 1 == expected_members - - # Verify watcher is in the cluster (if requested) - # After network isolation tests, the watcher may have been redeployed - # with a new IP that isn't yet updated in the Raft configuration - if check_watcher_ip: - assert watcher_ip in [ - key.split(":")[0].split(RAFT_PARTNER_PREFIX)[-1] - for key in status - if key.startswith(RAFT_PARTNER_PREFIX) - ], f"Watcher {watcher_ip} not found in Raft cluster on {unit.name}" + for attempt in Retrying(stop=stop_after_delay(180), wait=wait_fixed(5), reraise=True): + with attempt: + for unit in ops_test.model.applications[db_app_name].units: + # Get the Raft password from Patroni config using juju exec directly + # We need to avoid shell interpretation issues with run_command_on_unit + complete_command = [ + "exec", + "--unit", + unit.name, + "--", + "cat", + "/var/snap/charmed-postgresql/current/etc/patroni/patroni.yaml", + ] + return_code, stdout, _ = await ops_test.juju(*complete_command) + assert return_code == 0, f"Failed to read patroni.yaml on {unit.name}" + + conf = safe_load(stdout) + password = conf.get("raft", {}).get("password") + self_addr = conf.get("raft", {}).get("self_addr") + assert password, f"Could not find Raft password in patroni.yaml on {unit.name}" + + # Check Raft status using the password + syncobj_util = TcpUtility(password=password, timeout=3) + status = syncobj_util.executeCommand(self_addr, ["status"]) + logger.info(f"Raft status on {unit.name}: {status}") + + # Verify quorum + assert status["has_quorum"] is True, f"Unit {unit.name} does not have Raft quorum" + + assert status["partner_nodes_count"] + 1 == expected_members + + # Verify watcher is in the cluster (if requested) + # After network isolation tests, the watcher may have been redeployed + # with a new IP that isn't yet updated in the Raft configuration + if check_watcher_ip: + assert watcher_ip in [ + key.split(":")[0].split(RAFT_PARTNER_PREFIX)[-1] + for key in status + if key.startswith(RAFT_PARTNER_PREFIX) + ], f"Watcher {watcher_ip} not found in Raft cluster on {unit.name}" logger.info("Raft cluster health verified successfully")