Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,11 @@ describe("RunStatusBadge", () => {
render(<RunStatusBadge status={AutomationRunStatus.RUNNING} />);
expect(screen.getByText("AUTOMATIONS$DETAIL$RUNNING")).toBeInTheDocument();
});

it("renders skipped/limit-reached label for skipped status", () => {
render(<RunStatusBadge status={AutomationRunStatus.SKIPPED} />);
expect(
screen.getByText("AUTOMATIONS$DETAIL$SKIPPED_LIMIT_REACHED"),
).toBeInTheDocument();
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ const statusConfig: Record<
label: I18nKey.AUTOMATIONS$DETAIL$RUNNING,
style: "border-border bg-surface-elevated text-content-muted",
},
[AutomationRunStatus.SKIPPED]: {
label: I18nKey.AUTOMATIONS$DETAIL$SKIPPED_LIMIT_REACHED,
style: "border-border bg-surface-elevated text-content-muted",
},
};

function StatusIcon({ status }: { status: AutomationRunStatus }) {
Expand Down
17 changes: 17 additions & 0 deletions frontend/src/i18n/translation.json
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,23 @@
"tr": "Zamanlama",
"uk": "Розклад"
},
"AUTOMATIONS$DETAIL$SKIPPED_LIMIT_REACHED": {
"en": "Skipped – Limit Reached",
"ja": "スキップ – 上限に到達",
"zh-CN": "已跳过 – 已达上限",
"zh-TW": "已略過 – 已達上限",
"ko-KR": "건너뜀 – 한도 도달",
"no": "Hoppet over – grense nådd",
"ar": "تم التخطّي – تم بلوغ الحد",
"de": "Übersprungen – Limit erreicht",
"fr": "Ignoré – limite atteinte",
"it": "Saltato – limite raggiunto",
"pt": "Ignorado – limite atingido",
"es": "Omitido – límite alcanzado",
"ca": "Omès – límit assolit",
"tr": "Atlandı – sınıra ulaşıldı",
"uk": "Пропущено – ліміт досягнуто"
},
"AUTOMATIONS$DETAIL$SUCCESSFUL": {
"en": "Successful",
"ja": "成功",
Expand Down
1 change: 1 addition & 0 deletions frontend/src/types/automation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ export enum AutomationRunStatus {
RUNNING = "RUNNING",
COMPLETED = "COMPLETED",
FAILED = "FAILED",
SKIPPED = "SKIPPED",
}

export interface AutomationRun {
Expand Down
30 changes: 30 additions & 0 deletions openhands/automation/backends/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from openhands.automation.backends.base import ExecutionBackend, ExecutionContext
from openhands.automation.config import get_config
from openhands.automation.exceptions import ConcurrencyLimitReachedError
from openhands.automation.models import AutomationRun
from openhands.automation.utils.api_key import get_api_key_for_automation_run
from openhands.automation.utils.sandbox import (
Expand Down Expand Up @@ -52,6 +53,27 @@ def _is_auth_error(exc: BaseException) -> bool:
return False


def _concurrency_limit_detail(resp: httpx.Response) -> dict | None:
"""Return the CONCURRENCY_LIMIT_REACHED detail dict if the response is the
organization concurrent-sandbox limit 429, else None.

The OpenHands API raises this as a FastAPI HTTPException, so the body is
``{"detail": {"error": "CONCURRENCY_LIMIT_REACHED", ...}}``; we also tolerate
a flat ``{"error": ...}`` shape. This is distinct from a transient
rate-limit 429, which should still be retried (see ``_is_rate_limit_error``).
"""
if resp.status_code != 429:
return None
try:
body = resp.json()
except Exception:
return None
detail = body.get("detail", body) if isinstance(body, dict) else None
if isinstance(detail, dict) and detail.get("error") == "CONCURRENCY_LIMIT_REACHED":
return detail
return None


class CloudSandboxBackend(ExecutionBackend):
"""Execution backend that creates Cloud sandboxes per run.

Expand Down Expand Up @@ -269,6 +291,14 @@ async def _do_create():
resp = await client.post(
f"{self.api_url}/api/v1/sandboxes", headers=headers
)
# A concurrency-limit 429 is not transient: retrying won't free a
# slot. Raise a non-HTTPStatusError so the retry predicate skips it
# and the dispatcher can mark the run SKIPPED instead of FAILED.
detail = _concurrency_limit_detail(resp)
if detail is not None:
raise ConcurrencyLimitReachedError(
detail.get("message") or "Concurrency limit reached"
)
resp.raise_for_status()
return resp.json()["id"]

Expand Down
14 changes: 13 additions & 1 deletion openhands/automation/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@
from openhands.automation.backends import get_backend
from openhands.automation.config import ServiceSettings, get_config
from openhands.automation.db import using_sqlite
from openhands.automation.exceptions import PermanentDispatchError, TarballNotFoundError
from openhands.automation.exceptions import (
ConcurrencyLimitReachedError,
PermanentDispatchError,
TarballNotFoundError,
)
from openhands.automation.execution import execute_in_context
from openhands.automation.models import (
Automation,
Expand Down Expand Up @@ -196,6 +200,14 @@ async def _fail(error: str, disable: bool = False) -> None:
# Note: This also initializes backend state (e.g., API key for cloud mode)
try:
ctx = await backend.get_execution_context(client)
except ConcurrencyLimitReachedError as exc:
logger.warning(
"Run skipped — organization concurrency limit reached: %s",
exc,
extra=_log_ctx(),
)
await mark_run_terminal(session_factory, run, AutomationRunStatus.SKIPPED)
return
except Exception:
logger.exception("Failed to get execution context", extra=_log_ctx())
await _fail("Failed to get execution context")
Expand Down
11 changes: 11 additions & 0 deletions openhands/automation/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,14 @@ class TarballNotFoundError(PermanentDispatchError):
"""

pass


class ConcurrencyLimitReachedError(Exception):
"""The organization/workspace has reached its concurrent-sandbox limit.

Unlike PermanentDispatchError this is a *transient*, org-level condition: a
later run may succeed once a concurrent slot frees up. The run is marked
SKIPPED (not FAILED) and the automation is left enabled.
"""

pass
1 change: 1 addition & 0 deletions openhands/automation/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class AutomationRunStatus(enum.Enum):
COMPLETED = "COMPLETED"
FAILED = "FAILED"
CANCELLED = "CANCELLED"
SKIPPED = "SKIPPED"


class Automation(Base):
Expand Down
1 change: 1 addition & 0 deletions openhands/automation/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ class RunStatus(StrEnum):
COMPLETED = "COMPLETED"
FAILED = "FAILED"
CANCELLED = "CANCELLED"
SKIPPED = "SKIPPED"


def _validate_command_string(
Expand Down
1 change: 1 addition & 0 deletions openhands/automation/utils/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ async def mark_run_status(
AutomationRunStatus.COMPLETED,
AutomationRunStatus.FAILED,
AutomationRunStatus.CANCELLED,
AutomationRunStatus.SKIPPED,
):
values["completed_at"] = now
run.completed_at = now
Expand Down
101 changes: 101 additions & 0 deletions tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from unittest.mock import AsyncMock, MagicMock, patch

import httpx
import pytest

from openhands.automation.backends import (
Expand All @@ -10,6 +11,8 @@
LocalAgentServerBackend,
get_backend,
)
from openhands.automation.backends.cloud import _concurrency_limit_detail
from openhands.automation.exceptions import ConcurrencyLimitReachedError


class TestExecutionContext:
Expand Down Expand Up @@ -469,3 +472,101 @@ def test_cloud_mode(self, monkeypatch, mock_run):
backend = get_backend(mock_run)
assert isinstance(backend, CloudSandboxBackend)
assert backend.api_url == "https://app.all-hands.dev"


class TestConcurrencyLimitDetection:
"""Tests for `_concurrency_limit_detail`, the discriminator that tells an
organization concurrency-limit 429 (→ mark run SKIPPED) apart from a
transient rate-limit 429 (→ retry as before)."""

@staticmethod
def _resp(status: int, *, json=None, raw: bytes | None = None) -> httpx.Response:
req = httpx.Request("POST", "https://app.all-hands.dev/api/v1/sandboxes")
if raw is not None:
return httpx.Response(status, request=req, content=raw)
return httpx.Response(status, request=req, json=json)

def test_detects_nested_fastapi_detail(self):
"""The real shape: FastAPI nests the HTTPException detail under "detail"."""
resp = self._resp(
429,
json={
"detail": {
"error": "CONCURRENCY_LIMIT_REACHED",
"message": "You have reached your limit of 3 ...",
"limit": 3,
"current": 3,
}
},
)
detail = _concurrency_limit_detail(resp)
assert detail is not None
assert detail["limit"] == 3

def test_detects_flat_detail(self):
"""A non-nested {"error": ...} body is also tolerated."""
resp = self._resp(429, json={"error": "CONCURRENCY_LIMIT_REACHED"})
assert _concurrency_limit_detail(resp) is not None

def test_ignores_transient_rate_limit_429(self):
"""A generic 429 with a string detail is a transient rate limit."""
resp = self._resp(429, json={"detail": "Rate limited, slow down"})
assert _concurrency_limit_detail(resp) is None

def test_ignores_429_without_marker(self):
"""A 429 whose detail lacks the marker is not a concurrency limit."""
resp = self._resp(429, json={"detail": {"error": "SOMETHING_ELSE"}})
assert _concurrency_limit_detail(resp) is None

def test_ignores_non_json_429(self):
"""A non-JSON 429 body never matches (and does not raise)."""
resp = self._resp(429, raw=b"<html>too many requests</html>")
assert _concurrency_limit_detail(resp) is None

def test_ignores_non_429(self):
"""Only 429 responses can be a concurrency limit."""
resp = self._resp(200, json={"id": "sandbox-abc"})
assert _concurrency_limit_detail(resp) is None


class TestCloudSandboxConcurrencyLimit:
"""Tests that `_create_sandbox` surfaces the org concurrency limit as
`ConcurrencyLimitReachedError` without retrying it."""

@pytest.fixture
def mock_run(self):
run = MagicMock()
run.sandbox_id = None
run.keep_alive = False
run.bash_command_id = None
return run

@pytest.mark.asyncio
async def test_create_sandbox_raises_and_does_not_retry(self, mock_run):
"""A concurrency-limit 429 raises ConcurrencyLimitReachedError on the
first attempt — retrying cannot free a slot, so it must not be retried."""
backend = CloudSandboxBackend(api_url="https://app.all-hands.dev", run=mock_run)

req = httpx.Request("POST", "https://app.all-hands.dev/api/v1/sandboxes")
resp = httpx.Response(
429,
request=req,
json={
"detail": {
"error": "CONCURRENCY_LIMIT_REACHED",
"message": "Reached limit of 3 concurrent conversations.",
"limit": 3,
"current": 3,
}
},
)
client = MagicMock()
client.post = AsyncMock(return_value=resp)

with pytest.raises(
ConcurrencyLimitReachedError, match="concurrent conversations"
):
await backend._create_sandbox(client, {"Authorization": "Bearer x"})

# No retry: the sandbox API was hit exactly once.
assert client.post.await_count == 1
Loading
Loading