From 78ad5805f8b02e3ead3d9362bdd8f169a72d62d5 Mon Sep 17 00:00:00 2001 From: Miguel Angel Ajo Pelayo Date: Mon, 1 Dec 2025 15:20:53 +0100 Subject: [PATCH] fix: lease-acquisition loop can fail with temporary grpc issues A lease-acquisition loop can easily fail in the following way if a router in OpenShift/k8s is restarted during the lease get operation. The retry mechanism added here avoids this issue. ``` 2025-11-28T15:22:39.557321889Z for lease: There are 9 approved 2025-11-28T15:22:39.557321889Z exporters, (i.e. ) but all of them are 2025-11-28T15:22:39.557321889Z already leased (7:30:58) 2025-11-28T15:22:44.577566967Z [11/28/25 15:22:44] INFO INFO:jumpstarter.client.lease:Waiting lease.py:355 2025-11-28T15:22:44.577566967Z for lease: There are 9 approved 2025-11-28T15:22:44.577566967Z exporters, (i.e. ) but all of them are 2025-11-28T15:22:44.577566967Z already leased (7:31:03) 2025-11-28T15:22:56.436248186Z Error: grpc error: Socket closed ``` --- .../jumpstarter/jumpstarter/client/lease.py | 22 +++++++++++++++++-- packages/jumpstarter/pyproject.toml | 1 + 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/packages/jumpstarter/jumpstarter/client/lease.py b/packages/jumpstarter/jumpstarter/client/lease.py index 5bf087ade..289f5efce 100644 --- a/packages/jumpstarter/jumpstarter/client/lease.py +++ b/packages/jumpstarter/jumpstarter/client/lease.py @@ -23,6 +23,7 @@ from grpc.aio import AioRpcError, Channel from jumpstarter_protocol import jumpstarter_pb2, jumpstarter_pb2_grpc from rich.console import Console +from tenacity import retry, retry_if_exception_type, wait_exponential_jitter from .exceptions import LeaseError from jumpstarter.client import client_from_path @@ -77,6 +78,24 @@ async def get(self): svc = ClientService(channel=self.channel, namespace=self.namespace) return await svc.GetLease(name=self.name) + @retry( + wait=wait_exponential_jitter(initial=1, max=120, jitter=1), + retry=retry_if_exception_type(ConnectionError), + reraise=True, + ) + 
async def _get_with_retry(self): + """Get lease with exponential backoff retry on ConnectionError. + + Retries with exponential backoff and jitter indefinitely when ConnectionError occurs. + The wait time between retries is capped at 2 minutes (120 seconds). + Jitter helps prevent thundering herd problems when multiple clients retry simultaneously. + """ + try: + return await self.get() + except ConnectionError as e: + logger.error("Error while getting lease %s: %s", self.name, e) + raise + def request(self): """Request a lease, or verifies a lease which was already created. @@ -136,8 +155,7 @@ async def _acquire(self): with LeaseAcquisitionSpinner(self.name) as spinner: while True: logger.debug("Polling Lease %s", self.name) - result = await self.get() - + result = await self._get_with_retry() # lease ready if condition_true(result.conditions, "Ready"): logger.debug("Lease %s acquired", self.name) diff --git a/packages/jumpstarter/pyproject.toml b/packages/jumpstarter/pyproject.toml index cb6a0f3cd..fbd573cd5 100644 --- a/packages/jumpstarter/pyproject.toml +++ b/packages/jumpstarter/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "xdg-base-dirs>=6.0.2", "pydantic-settings>=2.9.1", "rich>=14.0.0", + "tenacity>=8.2.0", ] [dependency-groups]