diff --git a/poetry.lock b/poetry.lock index b0e78ba4ff3..caab7240423 100644 --- a/poetry.lock +++ b/poetry.lock @@ -73,6 +73,7 @@ files = [ ] [package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} idna = ">=2.8" typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} @@ -914,7 +915,7 @@ version = "1.3.1" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["integration", "unit"] +groups = ["main", "integration", "unit"] markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"}, @@ -1791,24 +1792,26 @@ testing = ["coverage", "pytest", "pytest-benchmark"] [[package]] name = "postgresql-charms-single-kernel" -version = "16.2.1" +version = "16.2.3" description = "Shared and reusable code for PostgreSQL-related charms" optional = false -python-versions = "<4.0,>=3.8" +python-versions = ">=3.8,<4.0" groups = ["main"] files = [ - {file = "postgresql_charms_single_kernel-16.2.1-py3-none-any.whl", hash = "sha256:6936d06e225a9b8f1bc6da1b22b9cdc6f96fa5c4db7c47a3dbdc5ba5f956abad"}, - {file = "postgresql_charms_single_kernel-16.2.1.tar.gz", hash = "sha256:5551abb62e7248218a009a0cb5da171de6d0a2919d349c5133bf4ac26cb6fe3c"}, + {file = "e763386979329d313cf303d03d504848e0066ff6.zip", hash = "sha256:0dcbf3deaa87a4886eade048bb39c835b39d750cee54ef33ffb3634a012a182f"}, ] [package.dependencies] -httpx = {version = "*", optional = true, markers = "python_full_version >= \"3.12.0\" and extra == \"postgresql\""} ops = ">=2.0.0" psycopg2 = ">=2.9.10" tenacity = ">=9.0.0" [package.extras] -postgresql = ["httpx ; python_full_version >= \"3.12.0\""] +postgresql = ["httpx ; python_version >= \"3.12\""] + +[package.source] +type = "url" +url = "https://github.com/canonical/postgresql-single-kernel-library/archive/e763386979329d313cf303d03d504848e0066ff6.zip" [[package]] name = "prompt-toolkit" @@ -3062,4 +3065,4 @@ h11 = ">=0.16.0,<1" [metadata] lock-version = "2.1" python-versions = ">=3.10,<4.0" -content-hash = "5ed81c48200bb2d799f2955daa5a483ffc2d82e9edf0cdfa37bc079b2ee8077e" +content-hash = "bea73e3310a43744e597b14ece013efb1ee82b884cf0faee6a0567c36f1bd8f5" diff --git a/pyproject.toml b/pyproject.toml index c7c206607d4..b5dee08d1b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ psutil = "^7.2.2" charm-refresh = "^3.1.0.2" charmlibs-snap = "^1.0.1" charmlibs-interfaces-tls-certificates = "^1.8.1" -postgresql-charms-single-kernel = {extras = ["postgresql"], version="16.2.1"} +postgresql-charms-single-kernel = {url = "https://github.com/canonical/postgresql-single-kernel-library/archive/e763386979329d313cf303d03d504848e0066ff6.zip"} [tool.poetry.group.charm-libs.dependencies] # data_platform_libs/v0/data_interfaces.py diff --git a/scripts/cluster_topology_observer.py b/scripts/cluster_topology_observer.py index f0f16cec711..e174429cc0e 100644 --- a/scripts/cluster_topology_observer.py +++ b/scripts/cluster_topology_observer.py @@ -35,7 +35,7 @@ async def _httpx_get_request(url: str): ssl_ctx = create_default_context() with suppress(FileNotFoundError): ssl_ctx.load_verify_locations(cafile=f"{PATRONI_CONF_PATH}/{TLS_CA_BUNDLE_FILE}") - async with AsyncClient(timeout=API_REQUEST_TIMEOUT, verify=ssl_ctx) as client: + async with AsyncClient(timeout=API_REQUEST_TIMEOUT, verify=ssl_ctx, trust_env=False) as client: try: return (await client.get(url)).json() except Exception as e: diff --git a/src/cluster.py b/src/cluster.py index 956139886dd..5d9b57933b8 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -208,6 +208,13 @@ def _patroni_url(self) -> str: """Patroni REST API URL.""" return f"https://{self.unit_ip}:8008" + @cached_property + def _session(self) -> requests.Session: + # Patroni API calls are always intra-cluster and must not go through HTTP proxies. + s = requests.Session() + s.trust_env = False + return s + @staticmethod def _dict_to_hba_string(_dict: dict[str, Any]) -> str: """Transform a dictionary into a Host Based Authentication valid string.""" @@ -411,7 +418,7 @@ def get_patroni_health(self) -> dict[str, str]: """Gets, retires and parses the Patroni health endpoint.""" for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(7)): with attempt: - r = requests.get( + r = self._session.get( f"{self._patroni_url}/health", verify=self.verify, timeout=API_REQUEST_TIMEOUT, @@ -454,7 +461,7 @@ def is_replication_healthy(self) -> bool: for members_ip in members_ips: endpoint = "leader" if members_ip == primary_ip else "replica?lag=16kB" url = self._patroni_url.replace(self.unit_ip, members_ip) - r = requests.get( + r = self._session.get( f"{url}/{endpoint}", verify=self.verify, auth=self._patroni_auth, @@ -520,7 +527,7 @@ def is_member_isolated(self) -> bool: try: for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)): with attempt: - r = requests.get( + r = self._session.get( f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}", verify=self.verify, timeout=API_REQUEST_TIMEOUT, @@ -582,7 +589,7 @@ def are_replicas_up(self) -> dict[str, bool] | None: def promote_standby_cluster(self) -> None: """Promote a standby cluster to be a regular cluster.""" - config_response = requests.get( + config_response = self._session.get( f"{self._patroni_url}/config", verify=self.verify, auth=self._patroni_auth, @@ -595,7 +602,7 @@ def promote_standby_cluster(self) -> None: ) if "standby_cluster" not in config_response.json(): raise StandbyClusterAlreadyPromotedError("standby cluster is already promoted") - r = requests.patch( + r = self._session.patch( f"{self._patroni_url}/config", verify=self.verify, json={"standby_cluster": None}, @@ -610,7 +617,7 @@ def promote_standby_cluster(self) -> None: def set_max_timelines_history(self) -> None: """Patch the DCS with max_timelines_history limit.""" - requests.patch( + self._session.patch( f"{self._patroni_url}/config", verify=self.verify, json={"max_timelines_history": 50}, @@ -806,7 +813,7 @@ def switchover(self, candidate: str | None = None, async_cluster: bool = False) body = {"leader": current_primary} if candidate: body["candidate"] = candidate - r = requests.post( + r = self._session.post( f"{self._patroni_url}/switchover", json=body, verify=self.verify, @@ -1018,7 +1025,7 @@ def remove_raft_member(self, member_address: str | None) -> None: def reload_patroni_configuration(self): """Reload Patroni configuration after it was changed.""" logger.debug("Reloading Patroni configuration...") - r = requests.post( + r = self._session.post( f"{self._patroni_url}/reload", verify=self.verify, auth=self._patroni_auth, @@ -1057,7 +1064,7 @@ def restart_patroni(self) -> bool: def restart_postgresql(self) -> None: """Restart PostgreSQL.""" logger.debug("Restarting PostgreSQL...") - r = requests.post( + r = self._session.post( f"{self._patroni_url}/restart", verify=self.verify, auth=self._patroni_auth, @@ -1069,7 +1076,7 @@ def restart_postgresql(self) -> None: def reinitialize_postgresql(self) -> None: """Reinitialize PostgreSQL.""" logger.debug("Reinitializing PostgreSQL...") - r = requests.post( + r = self._session.post( f"{self._patroni_url}/reinitialize", verify=self.verify, auth=self._patroni_auth, @@ -1087,7 +1094,7 @@ def bulk_update_parameters_controller_by_patroni( """ if not base_parameters: base_parameters = {} - r = requests.patch( + r = self._session.patch( f"{self._patroni_url}/config", verify=self.verify, json={ @@ -1116,7 +1123,7 @@ def ensure_slots_controller_by_patroni(self, slots: dict[str, str]) -> None: """ for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True): with attempt: - current_config = requests.get( + current_config = self._session.get( f"{self._patroni_url}/config", verify=self.verify, timeout=PATRONI_TIMEOUT, @@ -1140,7 +1147,7 @@ def ensure_slots_controller_by_patroni(self, slots: dict[str, str]) -> None: "plugin": "pgoutput", "type": "logical", } - r = requests.patch( + r = self._session.patch( f"{self._patroni_url}/config", verify=self.verify, json={"slots": slots_patch}, @@ -1183,7 +1190,7 @@ def update_synchronous_node_count(self) -> None: """Update synchronous_node_count to the minority of the planned cluster.""" for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: - r = requests.patch( + r = self._session.patch( f"{self._patroni_url}/config", json=self.synchronous_configuration, verify=self.verify, diff --git a/tests/integration/test_proxy.py b/tests/integration/test_proxy.py new file mode 100644 index 00000000000..6c4736d20a5 --- /dev/null +++ b/tests/integration/test_proxy.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Integration test: charm deploys and operates correctly behind an HTTP proxy. + +Regression test for https://github.com/canonical/postgresql-operator/issues/1714. +When Juju model-config sets an HTTP proxy, proxy environment variables leak into +all unit processes. Patroni REST API calls (intra-cluster, on private IPs) must +bypass the proxy — otherwise the charm gets stuck in "awaiting start of the +primary". + +Reproduces the exact scenario from the issue: both Juju model-config proxy +settings AND cloudinit-userdata writing proxy vars to /etc/environment, with +a real Squid proxy running on the LXD host. +""" + +import logging +import subprocess +import textwrap + +import pytest +import requests + +from .adapters import JujuFixture +from .jubilant_helpers import ( + DATABASE_APP_NAME, + get_primary, + get_unit_address, +) + +logger = logging.getLogger(__name__) + + +def _get_lxd_bridge_ip() -> str: + """Return the IP of the lxdbr0 bridge (proxy host reachable by containers).""" + output = subprocess.run( + ["ip", "-4", "-o", "addr", "show", "lxdbr0"], + check=True, + capture_output=True, + text=True, + ).stdout + return output.split("inet ", 1)[1].split("/")[0] + + +PROXY_HOST = _get_lxd_bridge_ip() +PROXY_URL = f"http://{PROXY_HOST}:3128" + +CLOUDINIT_USERDATA = textwrap.dedent("""\ + #cloud-config + write_files: + - path: /etc/environment + permissions: '0644' + owner: root:root + content: | + http_proxy={proxy} + https_proxy={proxy} + HTTP_PROXY={proxy} + HTTPS_PROXY={proxy} + no_proxy=localhost,127.0.0.1,10.0.0.0/8 + NO_PROXY=localhost,127.0.0.1,10.0.0.0/8 +""").format(proxy=PROXY_URL) + +PROXY_CONFIG = { + "http-proxy": PROXY_URL, + "https-proxy": PROXY_URL, + "no-proxy": "127.0.0.1,localhost,::1", + "cloudinit-userdata": CLOUDINIT_USERDATA, +} + + +@pytest.mark.abort_on_fail +def test_deploy_with_proxy(juju: JujuFixture, charm: str): + """Deploy PostgreSQL in a model with HTTP proxy configured.""" + # Apply the proxy config before deploying so the units' machines are provisioned + # with it (cloudinit-userdata only affects machines created after it is set). + juju.ext.model.set_config(PROXY_CONFIG) + juju.ext.model.deploy( + charm, + application_name=DATABASE_APP_NAME, + num_units=3, + config={"profile": "testing"}, + ) + juju.ext.model.set_config({"update-status-hook-interval": "10s"}) + juju.ext.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=1500) + + +def test_proxy_env_vars_present_on_units(juju: JujuFixture): + """Verify the proxy env vars are set in /etc/environment (test precondition).""" + unit_name = next(iter(juju.status().get_units(DATABASE_APP_NAME))) + env_output = juju.ssh(unit_name, "cat /etc/environment") + assert "HTTPS_PROXY" in env_output, ( + "Proxy env vars not found in /etc/environment — cloudinit-userdata not applied" + ) + + +def test_patroni_api_reachable(juju: JujuFixture): + """Patroni REST API responds on every unit despite proxy env vars.""" + units = juju.status().get_units(DATABASE_APP_NAME) + for unit_name in units: + host = get_unit_address(juju, unit_name) + result = requests.get(f"https://{host}:8008/health", verify=False) + assert result.status_code == 200, f"Patroni API unreachable on {unit_name}" + + +def test_get_primary_works(juju: JujuFixture): + """The get-primary action succeeds (exercises the charm's internal Patroni client).""" + unit_name = next(iter(juju.status().get_units(DATABASE_APP_NAME))) + primary = get_primary(juju, unit_name) + assert primary, "get-primary returned empty result" diff --git a/tests/spread/test_proxy.py/task.yaml b/tests/spread/test_proxy.py/task.yaml new file mode 100644 index 00000000000..ed1ed011ed8 --- /dev/null +++ b/tests/spread/test_proxy.py/task.yaml @@ -0,0 +1,17 @@ +summary: test_proxy.py +environment: + TEST_MODULE: test_proxy.py +prepare: | + apt-get update -qq + apt-get install -y -qq squid + BRIDGE_IP=$(ip -4 -o addr show lxdbr0 | awk '{print $4}' | cut -d/ -f1) + printf 'http_port %s:3128\nacl localnet src 10.0.0.0/8\nhttp_access allow localnet\nhttp_access allow localhost\nhttp_access deny all\n' "$BRIDGE_IP" > /etc/squid/squid.conf + systemctl restart squid + systemctl is-active --quiet squid +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +restore: | + systemctl stop squid + apt-get remove -y -qq squid +artifacts: + - allure-results diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index 8909f82cc52..de336aefce0 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -120,8 +120,10 @@ def test_get_patroni_health(peers_ips, patroni): patch("cluster.stop_after_delay", new_callable=PropertyMock) as _stop_after_delay, patch("cluster.wait_fixed", new_callable=PropertyMock) as _wait_fixed, patch("charm.Patroni._patroni_url", new_callable=PropertyMock) as _patroni_url, - patch("requests.get", side_effect=mocked_requests_get) as _get, + patch.object(patroni, "_session") as _session, ): + _session.get.side_effect = mocked_requests_get + _get = _session.get # Test when the Patroni API is reachable. _patroni_url.return_value = "http://server1" health = patroni.get_patroni_health() @@ -206,12 +208,13 @@ def test_is_creating_backup(peers_ips, patroni): def test_is_replication_healthy(peers_ips, patroni): with ( - patch("requests.get") as _get, + patch.object(patroni, "_session") as _session, patch("charm.Patroni.get_primary") as _get_primary, patch("charm.Patroni.get_standby_leader") as _get_standby_leader, patch("charm.Patroni.get_member_ip"), patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), ): + _get = _session.get # Test when replication is healthy. _get.return_value.status_code = 200 assert patroni.is_replication_healthy() @@ -240,9 +243,11 @@ def test_is_member_isolated(peers_ips, patroni): with ( patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), patch("cluster.wait_fixed", return_value=wait_fixed(0)), - patch("requests.get", side_effect=mocked_requests_get) as _get, + patch.object(patroni, "_session") as _session, patch("charm.Patroni._patroni_url", new_callable=PropertyMock) as _patroni_url, ): + _session.get.side_effect = mocked_requests_get + _get = _session.get # Test when it wasn't possible to connect to the Patroni API. _patroni_url.return_value = "http://server3" assert not patroni.is_member_isolated @@ -365,7 +370,8 @@ def test_stop_patroni(peers_ips, patroni): def test_reinitialize_postgresql(peers_ips, patroni): - with patch("requests.post") as _post: + with patch.object(patroni, "_session") as _session: + _post = _session.post patroni.reinitialize_postgresql() _post.assert_called_once_with( f"https://{patroni.unit_ip}:8008/reinitialize", @@ -377,9 +383,10 @@ def test_reinitialize_postgresql(peers_ips, patroni): def test_switchover(peers_ips, patroni): with ( - patch("requests.post") as _post, + patch.object(patroni, "_session") as _session, patch("cluster.Patroni.get_primary", return_value="primary"), ): + _post = _session.post response = _post.return_value response.status_code = 200 @@ -426,8 +433,9 @@ def test_update_synchronous_node_count(peers_ips, patroni): with ( patch("cluster.stop_after_delay", return_value=stop_after_delay(0)) as _wait_fixed, patch("cluster.wait_fixed", return_value=wait_fixed(0)) as _wait_fixed, - patch("requests.patch") as _patch, + patch.object(patroni, "_session") as _session, ): + _patch = _session.patch response = _patch.return_value response.status_code = 200 @@ -450,8 +458,9 @@ def test_update_synchronous_node_count(peers_ips, patroni): def test_set_max_timelines_history(peers_ips, patroni): with ( - patch("requests.patch") as _patch, + patch.object(patroni, "_session") as _session, ): + _patch = _session.patch patroni.set_max_timelines_history() _patch.assert_called_once_with( @@ -491,11 +500,12 @@ def test_configure_patroni_on_unit(peers_ips, patroni): def test_member_started_true(peers_ips, patroni): with ( - patch("cluster.requests.get") as _get, + patch.object(patroni, "_session") as _session, patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), patch("cluster.wait_fixed", return_value=wait_fixed(0)), patch("charm.Patroni.is_patroni_running", return_value=True), ): + _get = _session.get _get.return_value.json.return_value = {"state": "running"} assert patroni.member_started @@ -510,11 +520,12 @@ def test_member_started_true(peers_ips, patroni): def test_member_started_false(peers_ips, patroni): with ( - patch("cluster.requests.get") as _get, + patch.object(patroni, "_session") as _session, patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), patch("cluster.wait_fixed", return_value=wait_fixed(0)), patch("charm.Patroni.is_patroni_running", return_value=True), ): + _get = _session.get _get.return_value.json.return_value = {"state": "stopped"} assert not patroni.member_started @@ -529,11 +540,12 @@ def test_member_started_false(peers_ips, patroni): def test_member_started_error(peers_ips, patroni): with ( - patch("cluster.requests.get") as _get, + patch.object(patroni, "_session") as _session, patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), patch("cluster.wait_fixed", return_value=wait_fixed(0)), patch("charm.Patroni.is_patroni_running", return_value=True), ): + _get = _session.get _get.side_effect = Exception assert not patroni.member_started @@ -548,10 +560,11 @@ def test_member_started_error(peers_ips, patroni): def test_member_inactive_true(peers_ips, patroni): with ( - patch("cluster.requests.get") as _get, + patch.object(patroni, "_session") as _session, patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), patch("cluster.wait_fixed", return_value=wait_fixed(0)), ): + _get = _session.get _get.return_value.json.return_value = {"state": "stopped"} assert patroni.member_inactive @@ -566,10 +579,11 @@ def test_member_inactive_true(peers_ips, patroni): def test_member_inactive_false(peers_ips, patroni): with ( - patch("cluster.requests.get") as _get, + patch.object(patroni, "_session") as _session, patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), patch("cluster.wait_fixed", return_value=wait_fixed(0)), ): + _get = _session.get _get.return_value.json.return_value = {"state": "starting"} assert not patroni.member_inactive @@ -584,10 +598,11 @@ def test_member_inactive_false(peers_ips, patroni): def test_member_inactive_error(peers_ips, patroni): with ( - patch("cluster.requests.get") as _get, + patch.object(patroni, "_session") as _session, patch("cluster.stop_after_delay", return_value=stop_after_delay(0)), patch("cluster.wait_fixed", return_value=wait_fixed(0)), ): + _get = _session.get _get.side_effect = Exception assert patroni.member_inactive @@ -798,11 +813,12 @@ def test_remove_raft_member(patroni): def test_remove_raft_member_no_quorum(patroni, harness): with ( patch("cluster.TcpUtility") as _tcp_utility, - patch("cluster.requests.get") as _get, + patch.object(patroni, "_session") as _session, patch( "charm.PostgresqlOperatorCharm.unit_peer_data", new_callable=PropertyMock ) as _unit_peer_data, ): + _get = _session.get # Async replica _unit_peer_data.return_value = {} _tcp_utility.return_value.executeCommand.return_value = { diff --git a/tests/unit/test_cluster_topology_observer.py b/tests/unit/test_cluster_topology_observer.py index 99e31d4f007..4d9ff8e171f 100644 --- a/tests/unit/test_cluster_topology_observer.py +++ b/tests/unit/test_cluster_topology_observer.py @@ -178,7 +178,7 @@ async def test_main(): ] with pytest.raises(UnreachableUnitsError): await main() - _async_client.assert_any_call(timeout=5, verify=_context.return_value) + _async_client.assert_any_call(timeout=5, verify=_context.return_value, trust_env=False) _get.assert_any_call("http://server1:8008/cluster") _get.assert_any_call("http://server3:8008/cluster")