Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
120 commits
Select commit Hold shift + click to select a range
a59a3a9
feat(watcher): add postgresql-watcher charm for stereo mode
marceloneppel Jan 27, 2026
9d847c3
feat(postgresql-watcher): replace charmed-postgresql snap with native…
marceloneppel Jan 28, 2026
cb8276c
fix(ha): use cut_network_from_unit for faster failover detection in s…
marceloneppel Jan 28, 2026
cb9cf56
fix(raft): improve IP change handling and watcher lifecycle management
marceloneppel Jan 29, 2026
1a0de53
fix(tests): auto-build watcher charm and deploy sequentially in stere…
marceloneppel Jan 29, 2026
41bae8f
fix(tests): improve stereo mode test stability and resilience
marceloneppel Jan 30, 2026
129041c
fix(tests): add use_ip_from_inside for stale IP handling and fix veri…
marceloneppel Jan 30, 2026
5010b04
fix(watcher): implement _onTick with TTL expiry logic for failover
marceloneppel Jan 30, 2026
e8ab990
fix(watcher): use hyphenated keys in health check action results
marceloneppel Jan 30, 2026
2c2e8d3
fix(watcher): add PostgreSQL user authentication and fix lint/test is…
marceloneppel Feb 2, 2026
5207e2b
test(stereo-mode): deploy 2 PostgreSQL units from start
marceloneppel Feb 2, 2026
d0def16
Update password management docs with Juju secrets (16) (#1379)
a-velasco Jan 28, 2026
27ad3d1
Update charmcraft.yaml build tools (#1399)
renovate[bot] Jan 28, 2026
1adbcbe
Lock file maintenance Python dependencies (#1400)
renovate[bot] Jan 28, 2026
dca0729
Update documentation home page (#1402)
a-velasco Jan 29, 2026
c31c613
Add new stable releases to releases.md (16) (#1405)
a-velasco Jan 30, 2026
05998f2
feat(stereo-mode): unify watcher into PostgreSQL charm with role config
marceloneppel Mar 5, 2026
1b6bde9
fix(watcher): set tracing_endpoint=None for @trace_charm compatibility
marceloneppel Mar 5, 2026
04ed58b
[DPE-9158] Limit repo listing to find the timelines (#1403)
dragomirp Feb 3, 2026
f86f7be
[DPE-8932] Strict mode configuration (#1389)
dragomirp Feb 3, 2026
a48b62c
Update dependency pip to v26 (#1416)
renovate[bot] Feb 3, 2026
ddddb82
Update charmcraft.yaml build tools (#1415)
renovate[bot] Feb 3, 2026
b41c940
Update canonical/data-platform-workflows action to v41.1.1 (#1414)
renovate[bot] Feb 3, 2026
ee92abd
Lock file maintenance Python dependencies (#1417)
renovate[bot] Feb 3, 2026
3cf43e0
Remove arm jammy image pinning (#1391)
dragomirp Feb 3, 2026
5fdded7
Switch to ty for typechecking (#1422)
dragomirp Feb 5, 2026
07a4a8c
[MISC] Move TLS transfer to single kernel (#1410)
dragomirp Feb 5, 2026
7e3dfd4
Add information about custom usernames (#1409)
a-velasco Feb 6, 2026
1efa3b5
DPE-8980 Support Juju 4: use 'ip' instead of 'private-address' (if av…
taurus-forever Feb 6, 2026
5179d17
DPE-8900 Fix CIDR mask for self_ip (peer_ip in pg_hba) (#1424)
taurus-forever Feb 9, 2026
814042f
[DPE-9370] Handle retry error in bulk update (#1427)
dragomirp Feb 9, 2026
fe227c8
Update canonical/data-platform-workflows action to v41.1.2 (#1430)
renovate[bot] Feb 10, 2026
ecd3dc0
Lock file maintenance Python dependencies (#1432)
renovate[bot] Feb 10, 2026
5c350ab
Update charmcraft.yaml build tools (#1431)
renovate[bot] Feb 10, 2026
4246a1f
[MISC] Bump charm libs (#1434)
dragomirp Feb 11, 2026
5beda93
Add optional flags to provides endpoints for PG16 VM (#1435)
taurus-forever Feb 12, 2026
12f8039
Sync renovate config changes (#1447)
dragomirp Feb 13, 2026
783ebf0
Update canonical/data-platform-workflows action to v42 (#1456)
renovate[bot] Feb 17, 2026
f0a3000
Lock file maintenance (#1457)
renovate[bot] Feb 17, 2026
df73f1d
Update charmcraft.yaml build tools (#1454)
renovate[bot] Feb 17, 2026
8edda3b
Add CODEOWNERS (#1464)
carlcsaposs-canonical Feb 18, 2026
7cc34b6
Check if bucket key is present (#1462)
dragomirp Feb 18, 2026
7c20dc8
Update Python dependencies (16/edge) (#1455)
renovate[bot] Feb 19, 2026
0b98afd
feat: multiple Patroni primaries alert (#1460)
Deezzir Feb 23, 2026
55cc98a
Lock file maintenance (#1480)
renovate[bot] Feb 24, 2026
b590e22
Revert COS agent lib (#1472)
dragomirp Feb 24, 2026
3a4e8ff
Update Python dependencies (#1479)
renovate[bot] Feb 24, 2026
0e40e4e
Update dependency uv to v0.10.5 (#1478)
renovate[bot] Feb 24, 2026
bacf191
fix(monitoring): add _total suffix to PostgreSQL counter metrics in G…
marceloneppel Feb 26, 2026
946c56c
[DPE-9443] Switch to ops tracing (16/edge) (#1466)
dragomirp Feb 26, 2026
95de21b
Update COS agent lib (#1486)
dragomirp Feb 27, 2026
67c41f4
Remove bind mounts (#1488)
dragomirp Mar 2, 2026
ee30e2f
Update canonical/data-platform-workflows action to v42.0.1 (#1495)
renovate[bot] Mar 3, 2026
8bb657f
Update GitHub actions (#1498)
renovate[bot] Mar 3, 2026
784628f
Lock file maintenance (#1499)
renovate[bot] Mar 3, 2026
f61170f
Update Python dependencies (#1497)
renovate[bot] Mar 3, 2026
66a1366
Update dependency uv to v0.10.7 (#1496)
renovate[bot] Mar 3, 2026
56e99bd
Add v16/1.206.0 to refresh docs (#1500)
carlcsaposs-canonical Mar 3, 2026
0e47bb4
Add new 16/stable revisions to releases.md (#1503)
a-velasco Mar 4, 2026
5df6d89
[DPE-9479] Test app channel and base/series (16/edge) (#1505)
dragomirp Mar 5, 2026
258f6b9
[DPE-9455] Bump PostgreSQL to 16.13 (#1509)
taurus-forever Mar 5, 2026
9b6293e
fix(tests): add idempotency to stereo mode test relations
marceloneppel Mar 6, 2026
0a43d15
feat(watcher): add multi-cluster support with per-relation Raft insta…
marceloneppel Mar 10, 2026
dc05fb5
Merge remote-tracking branch 'origin/16/edge' into stereo-mode-unifie…
marceloneppel Mar 10, 2026
df0246d
Merge remote-tracking branch 'origin/16/edge' into stereo-mode-unifie…
marceloneppel Mar 16, 2026
68ab8ef
feat: replace custom Raft with Patroni's raft_controller
marceloneppel Mar 16, 2026
2a30d01
refactor(watcher): install snap from store instead of bundling
marceloneppel Mar 16, 2026
8118c85
test(async-replication): add stereo mode tests for async replication …
marceloneppel Mar 17, 2026
184bc68
Merge remote-tracking branch 'origin/16/edge' into stereo-mode-unifie…
marceloneppel Mar 23, 2026
fcc50af
Downgrade amd runners to jammy
dragomirp Apr 1, 2026
31cdd4d
fix(watcher): harden stereo-mode watcher implementation
marceloneppel Apr 7, 2026
e3b3d65
Merge branch '16/edge' into stereo-mode-unified-charm
marceloneppel Apr 7, 2026
cfe8613
style: format files for lint
marceloneppel Apr 7, 2026
af54ea4
fix: satisfy ty in lint job
marceloneppel Apr 7, 2026
79f4d67
fix(watcher): fall back to loopback raft status probe
marceloneppel Apr 8, 2026
2a88644
Run watcher raft controller under patroni profile
marceloneppel Apr 8, 2026
1745750
Merge remote-tracking branch 'origin/16/edge' into stereo-mode-unifie…
marceloneppel Apr 9, 2026
32333fd
feat(watcher): improve Raft quorum management, enrich cluster status …
marceloneppel Apr 14, 2026
187f342
refactor: decompose _format_cluster_status into focused helper methods
marceloneppel Apr 14, 2026
fc3b806
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 16, 2026
4dc69f5
Rendering WIP
dragomirp Apr 17, 2026
d40037e
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 17, 2026
cfbb387
Dir permissions
dragomirp Apr 18, 2026
56bc4ce
Switch to systemd charmlib
dragomirp Apr 18, 2026
79ba856
Try to fall into existing refresh logic
dragomirp Apr 20, 2026
1c2c0e6
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 20, 2026
439e345
Persist port mapping in peer data and use systemd template
dragomirp Apr 20, 2026
80d1906
Remove data dir on rel removal
dragomirp Apr 20, 2026
cfd9cd8
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 21, 2026
a68ec9b
Parse yaml
dragomirp Apr 21, 2026
366403a
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 21, 2026
7f24ec5
Cleanup sycobj imports
dragomirp Apr 21, 2026
0b3b5f4
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 21, 2026
b4f5c14
Deep import and cleanup
dragomirp Apr 21, 2026
0510e5e
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 22, 2026
b89b13e
Revert iptables ip code
dragomirp Apr 22, 2026
24051ba
Call patroni for additional details
dragomirp Apr 23, 2026
9aaafc7
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 23, 2026
e559bd9
Factor out parallel calls
dragomirp Apr 24, 2026
67d81e2
Merge RAFT controller and Health checker
dragomirp Apr 24, 2026
c1ccf83
Additive changes for stereo mode
dragomirp Apr 24, 2026
9454839
Merge branch 'stereo-mode-additive-code' into dragop/stereo-mode-unif…
dragomirp Apr 24, 2026
be949bf
Try to clean up role changes code
dragomirp Apr 25, 2026
2e033b2
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 27, 2026
6559d40
PR 1648 tweaks
dragomirp Apr 27, 2026
6c2afa2
Readd watcher
dragomirp Apr 27, 2026
2e3c154
Add back raft add
dragomirp Apr 27, 2026
862ce99
Disable watcher during raft reinit and async replication
dragomirp Apr 28, 2026
d0d0852
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 28, 2026
4772828
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 28, 2026
d274f5f
Handle exception
dragomirp Apr 28, 2026
2c94002
Clean up add code
dragomirp Apr 29, 2026
5e72c81
Merge branch '16/edge' into dragop/stereo-mode-unified-charm
dragomirp Apr 29, 2026
f6d4b94
Watcher removal
dragomirp Apr 29, 2026
cc22ccc
Block on multiple watcher
dragomirp Apr 29, 2026
f931081
Secret update for raft password
dragomirp Apr 29, 2026
e82ce09
Fix invalid roles test
dragomirp Apr 29, 2026
0ba9817
Merge branch 'stereo-mode-watcher' into dragop/stereo-mode-unified-charm
dragomirp Apr 30, 2026
d905702
Merge branch 'stereo-mode-watcher' into dragop/stereo-mode-unified-charm
dragomirp Apr 30, 2026
0a95bab
Merge branch 'stereo-mode-watcher' into dragop/stereo-mode-unified-charm
dragomirp Apr 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,20 @@ restore:
restore-to-time:
type: string
description: Point-in-time-recovery target in PSQL format.
get-cluster-status:
description: Display cluster topology, PostgreSQL units health status, and Raft cluster state.
Only available when role=watcher.
params:
cluster-name:
type: string
description: |
The name of the cluster to filter the output by.
Useful in async-replication (Disaster Recovery) setups where multiple clusters are related.
standby-clusters:
type: boolean
default: false
description: |
Show status information including linked standby clusters (async replication).
trigger-health-check:
description: Manually trigger health checks on PostgreSQL endpoints and return results.
Only available when role=watcher.
293 changes: 292 additions & 1 deletion src/relations/watcher_requirer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
import os
import typing
from datetime import datetime
from typing import Any, Literal

from charmlibs.systemd import service_running
from ops import (
ActionEvent,
ActiveStatus,
BlockedStatus,
InstallEvent,
Expand All @@ -40,7 +42,7 @@
)

from constants import RAFT_PORT, WATCHER_RELATION
from raft_controller import RaftController, install_service
from raft_controller import ClusterStatus, RaftController, install_service

if typing.TYPE_CHECKING:
from charm import PostgresqlOperatorCharm
Expand Down Expand Up @@ -82,6 +84,14 @@ def __init__(self, charm: "PostgresqlOperatorCharm"):
self._on_watcher_relation_broken,
)

# Actions
self.framework.observe(
self.charm.on.get_cluster_status_action, self._on_get_cluster_status
)
self.framework.observe(
self.charm.on.trigger_health_check_action, self._on_trigger_health_check
)

@property
def unit_ip(self) -> str | None:
"""Return this unit's IP address."""
Expand Down Expand Up @@ -522,3 +532,284 @@ def _on_watcher_relation_broken(self, event: RelationBrokenEvent) -> None:
]
if not remaining:
self.charm.unit.status = WaitingStatus("Waiting for relation to PostgreSQL")

# -- Actions --

def _build_ip_maps(self, relation: Relation) -> tuple[dict[str, str], dict[str, str]]:
"""Build IP-to-AZ and IP-to-unit-name maps from relation data.

Returns:
Tuple of (ip_to_az, ip_to_unit) dictionaries.
"""
ip_to_az: dict[str, str] = {}
ip_to_unit: dict[str, str] = {}
for unit in relation.units:
if unit_ip := relation.data[unit].get("unit-address"):
ip_to_unit[unit_ip] = unit.name
if unit_az := relation.data[unit].get("unit-az"):
ip_to_az[unit_ip] = unit_az
if watcher_ip := self.unit_ip:
ip_to_unit[watcher_ip] = self.charm.unit.name
return ip_to_az, ip_to_unit

def _resolve_raft_members(
self, raft_status: ClusterStatus, ip_to_unit: dict[str, str]
) -> None:
"""Resolve Raft member IPs to unit names in-place."""
resolved = []
for member_addr in raft_status.get("members", []):
member_ip = member_addr.split(":")[0]
resolved.append(ip_to_unit.get(member_ip, member_addr))
raft_status["members"] = sorted(resolved)

def _on_get_cluster_status(self, event: ActionEvent) -> None:
    """Handle the get-cluster-status action.

    Collects a status document for every watcher relation (optionally limited
    to the cluster named by the ``cluster-name`` parameter) and returns it as
    JSON under the ``status`` result key. The action fails when a requested
    cluster name matches none of the related clusters.
    """
    wanted_cluster = event.params.get("cluster-name")
    include_standby_view = event.params.get("standby-clusters", False)

    status_by_cluster: dict[str, dict[str, Any]] = {}
    standbys_by_cluster: dict[str, list[str]] = {}
    for relation in self.model.relations.get(WATCHER_RELATION, []):
        name = self._get_cluster_name(relation)
        if not wanted_cluster or name == wanted_cluster:
            status_by_cluster[name] = self._format_cluster_status(relation)
            standbys_by_cluster[name] = self._get_standby_clusters(relation)

    if not status_by_cluster:
        if wanted_cluster:
            event.fail(f"Cluster '{wanted_cluster}' not found among related clusters.")
        else:
            event.set_results({"success": "True", "status": json.dumps({})})
        return

    if include_standby_view:
        payload = self._format_cluster_set_status(status_by_cluster, standbys_by_cluster)
    elif len(status_by_cluster) == 1:
        # Exactly one cluster: hand back its status document unwrapped.
        (payload,) = status_by_cluster.values()
    else:
        # Several clusters: list them alongside a summary of this watcher.
        payload = {
            "clusters": list(status_by_cluster.values()),
            "watcher": {
                "unit": self.charm.unit.name,
                "address": self.unit_ip,
                "clusters_monitored": len(status_by_cluster),
            },
        }

    event.set_results({"success": "True", "status": json.dumps(payload)})

def _get_pg_version(self, relation: Relation) -> str:
"""Return Postgresql version of the cluster."""
if not relation.app:
return "unknown"

return relation.data[relation.app].get("version", "unknown")

def _build_postgresql_topology(
self,
relation: Relation,
pg_endpoints: list[str],
ip_to_unit: dict[str, str],
) -> tuple[
dict[str, Any],
str | None,
Literal["primary", "standby", "unknown"],
int | Literal["unknown"],
]:
"""Build PostgreSQL topology entries and infer the cluster role."""
topology: dict[str, Any] = {}
primary_endpoint = None
cluster_role = "unknown"
version = self._get_pg_version(relation)
timeline = "unknown"

if not pg_endpoints:
return topology, primary_endpoint, cluster_role, timeline

raft_controller = RaftController(self.charm, f"rel{relation.id}")
# TODO figure out how to share the password for async clusters
health_results = (
raft_controller.check_all_endpoints(pg_endpoints, password)
if (password := self.get_watcher_password(relation))
else dict.fromkeys(pg_endpoints, False)
)
cluster_status = raft_controller.cluster_status(pg_endpoints)
patroni_members = {}
for member in cluster_status:
patroni_members[member["host"]] = member

for endpoint in pg_endpoints:
unit_name = ip_to_unit.get(endpoint, endpoint)
patroni_member = patroni_members.get(endpoint, {})
is_healthy = health_results.get(endpoint, False)

if is_primary := patroni_member.get("role") == "leader":
primary_endpoint = f"{endpoint}:5432"

role = patroni_member.get("role", "unknown")
lag = patroni_member.get("lag", "unknown")
if role == "leader":
role = "primary"
timeline = patroni_member.get("timeline", "unknown")
cluster_role = "primary"
lag = 0
elif role == "standby_leader":
role = "standby"
cluster_role = "standby"
timeline = patroni_member.get("timeline", "unknown")
lag = 0

topology[unit_name] = {
"address": f"{endpoint}:5432",
"memberrole": role,
"mode": "r/w" if is_primary else "r/o",
"status": "online" if is_healthy else "offline",
"version": version,
"lag": lag,
}
return topology, primary_endpoint, cluster_role, timeline

def _is_tls_enabled(self, relation: Relation) -> bool:
"""Return whether TLS is enabled for the related PostgreSQL cluster."""
if not relation.app:
return False
return relation.data[relation.app].get("tls-enabled", "false") == "true"

def _format_cluster_status(self, relation: Relation) -> dict[str, Any]:
    """Format cluster status for a single cluster relation.

    Combines the Raft status reported by this relation's RaftController, the
    PostgreSQL topology from ``_build_postgresql_topology`` and an entry for
    the watcher unit itself.

    Args:
        relation: Watcher relation of the cluster to describe.

    Returns:
        Status document with the cluster name and role, primary endpoint, TLS
        mode, quorum-derived status and text, timeline, per-unit topology and
        a ``raft`` summary (quorum flag, leader, members).
    """
    cluster_name = self._get_cluster_name(relation)
    pg_endpoints = self._get_raft_partner_addrs(relation)
    _ip_to_az, ip_to_unit = self._build_ip_maps(relation)

    # Look the port up once: the same value is used to probe Raft and to
    # build the watcher address reported in the topology.
    port = self._get_port_for_relation(relation.id)
    password = self._get_raft_password(relation)
    raft_controller = RaftController(self.charm, instance_id=f"rel{relation.id}")
    raft_status = raft_controller.get_status(port, password)
    self._resolve_raft_members(raft_status, ip_to_unit)
    has_quorum = raft_status.get("has_quorum", False)
    watcher_voting = self._should_watcher_vote(pg_endpoints) and not self._is_disabled(
        relation
    )
    topology, primary_endpoint, cluster_role, timeline = self._build_postgresql_topology(
        relation, pg_endpoints, ip_to_unit
    )

    # Add watcher entry to topology. Fall back to the address stored in this
    # unit's relation databag when the unit IP is not available.
    watcher_ip = self.unit_ip or relation.data[self.charm.unit].get("unit-address")
    watcher_address = f"{watcher_ip}:{port}" if watcher_ip else None
    topology[self.charm.unit.name] = {
        "address": watcher_address,
        "memberrole": "watcher",
        "mode": "n/a",
        "status": "online" if raft_status.get("running", False) else "offline",
        "version": "n/a",
        "voting": watcher_voting,
    }

    status_text = (
        "cluster is tolerant to failures."
        if has_quorum
        else "cluster is not tolerant to any failures."
    )

    return {
        "clustername": cluster_name,
        "clusterrole": cluster_role,
        "primary": primary_endpoint,
        "ssl": "required" if self._is_tls_enabled(relation) else "disabled",
        "status": "ok" if has_quorum else "ok_no_tolerance",
        "statustext": status_text,
        "timeline": timeline,
        "topology": topology,
        "raft": {
            "has_quorum": has_quorum,
            "leader": raft_status.get("leader"),
            "members": raft_status.get("members", []),
        },
    }

def _format_cluster_set_status(
self,
clusters_data: dict[str, dict[str, Any]],
standby_clusters_map: dict[str, list[str]],
) -> dict[str, Any]:
"""Format cluster-set status for async replication view."""
clusters_summary: dict[str, Any] = {}
# TODO No way to have multiple primaries
primary_cluster_name = None

for name, data in clusters_data.items():
cluster_role = data.get("clusterrole", "unknown")
is_primary = cluster_role == "primary"
summary: dict[str, Any] = {
"clusterrole": cluster_role,
"status": data.get("status", "unknown"),
"primary": data.get("primary"),
"linked_standby_clusters": standby_clusters_map.get(name, []),
}
if is_primary and primary_cluster_name is None:
primary_cluster_name = name
elif cluster_role == "standby":
summary["replication_status"] = "streaming"
summary["replication_lag"] = 0
summary["timeline"] = data.get("timeline", 0)
clusters_summary[name] = summary

all_healthy = all(c.get("status") == "ok" for c in clusters_data.values())

return {
"clusters": clusters_summary,
"primary_cluster": primary_cluster_name,
"status": "healthy" if all_healthy else "degraded",
"statustext": ("all clusters available." if all_healthy else "some clusters at risk."),
}

def _on_trigger_health_check(self, event: ActionEvent) -> None:
    """Handle the trigger-health-check action.

    Runs the endpoint health check for every watcher relation that has both
    PostgreSQL endpoints and a watcher password, then returns the per-cluster
    results and the healthy/total counts as JSON under ``health-check``. The
    action fails when no endpoint was checked at all.
    """
    per_cluster: list[dict[str, Any]] = []
    healthy_seen = 0
    endpoints_seen = 0

    for relation in self.model.relations.get(WATCHER_RELATION, []):
        endpoints = self._get_raft_partner_addrs(relation)
        if not endpoints:
            continue
        password = self.get_watcher_password(relation)
        if not password:
            # Relations without a password are skipped.
            continue

        controller = RaftController(self.charm, f"rel{relation.id}")
        results = controller.check_all_endpoints(endpoints, password)

        _ip_to_az, unit_by_ip = self._build_ip_maps(relation)
        name = self._get_cluster_name(relation)

        statuses: dict[str, str] = {}
        for endpoint, healthy in results.items():
            # Prefer the unit name; fall back to "<cluster>/<endpoint>".
            label = unit_by_ip.get(endpoint) or f"{name}/{endpoint}"
            statuses[label] = "healthy" if healthy else "unhealthy"
            if healthy:
                healthy_seen += 1
            endpoints_seen += 1

        per_cluster.append({
            "cluster_name": name,
            "endpoints": statuses,
        })

    if endpoints_seen == 0:
        event.fail("No PostgreSQL endpoints available")
        return

    report: dict[str, Any] = {
        "clusters": per_cluster,
        "healthy-count": healthy_seen,
        "total-count": endpoints_seen,
    }

    event.set_results({"health-check": json.dumps(report)})
Loading
Loading