From 56588a8f672328d658bf7114b1b510169a7b8d6e Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 9 Jun 2026 03:20:47 +0000 Subject: [PATCH] feat(terminal): reject Python/JSON literals passed as bash commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the model emits a tool call where `command` is a Python/JSON literal — e.g. `[{'default': {...}}, ['apps'], ['code...']]` — bash returns a useless "command not found" and the model rarely self-corrects. Trajectory analysis of a SWE-Bench Nemotron run showed ~12% of terminal calls (18 of 148) burning context this way, with cascading cost growth because failed turns stay in history. This PR adds a pre-execution guard in `TerminalExecutor` that: * Detects the literal head pattern (`[{`, `[[`, `["`, `['`, `{"`, `{'`) while carefully exempting real bash uses of `[`, `[[`, and `{` (which are always followed by whitespace). * Returns a `TerminalObservation(is_error=True)` whose text names the literal kind, shows the offending head, and points the model at the two recovery patterns: `file_editor create` + `python /tmp/x.py`, or an inline heredoc. * Emits one structured WARN log per rejection so eval pipelines can grep for the pattern. * Skips the check when `is_input=True` (raw bytes forwarded to a running process, e.g. `C-c` keystrokes). The detection lives in `definition.py` next to `TerminalAction` as `looks_like_python_literal_argument(command) -> str | None`, exported for reuse and individually unit-tested. Tests: `tests/tools/terminal/test_literal_arg_guard.py` — 24 new tests covering the heuristic matrix (real malformed samples, synthetic variants, bash false-positives like `[ -f x ]` and `{ ls; }`), the executor returning the structured error without hitting the shell, `is_input=True` bypass, and WARN log emission. Existing terminal parsing/observation suite (35 tests) passes unchanged. Lint+pyright clean. Co-authored-by: openhands --- .../openhands/tools/terminal/definition.py | 49 ++++++ .../openhands/tools/terminal/impl.py | 29 ++++ .../tools/terminal/test_literal_arg_guard.py | 155 ++++++++++++++++++ 3 files changed, 233 insertions(+) create mode 100644 tests/tools/terminal/test_literal_arg_guard.py diff --git a/openhands-tools/openhands/tools/terminal/definition.py b/openhands-tools/openhands/tools/terminal/definition.py index f3068f2994..69df4d4e94 100644 --- a/openhands-tools/openhands/tools/terminal/definition.py +++ b/openhands-tools/openhands/tools/terminal/definition.py @@ -34,6 +34,55 @@ from openhands.tools.terminal.metadata import CmdOutputMetadata +_LITERAL_ARG_HINT_TEMPLATE = ( + "[Tool-argument error] The `command` argument looks like a Python/JSON " + "{literal_kind}, not a shell command. It starts with: {head!r}\n\n" + "The `terminal` tool runs exactly ONE shell command at a time. To pass " + "structured data or multi-line code:\n" + " - Write a script first with `file_editor` " + '(command="create", path="/tmp/run.py", ...), then invoke it: ' + "`python /tmp/run.py`.\n" + " - Or use a heredoc inline, e.g.:\n" + " python - <<'EOF'\n" + " DATABASES = {{'default': {{...}}}}\n" + " # your code here\n" + " EOF\n\n" + "Do not put a Python list/dict literal into the `command` field; the shell " + "cannot interpret it." +) + + +def looks_like_python_literal_argument(command: str) -> str | None: + """Detect when a tool call has packed structured data into `command`. + + Returns a short reason string (``"list literal"``, ``"nested list literal"`` + or ``"dict literal"``) if ``command`` appears to be a Python/JSON literal + rather than a shell command, otherwise ``None``. + + Carefully distinguishes legitimate bash uses of ``[`` and ``[[`` (which + are always followed by whitespace) from Python-style literals. See the + accompanying tests for the full matrix. + """ + stripped = command.lstrip() + if len(stripped) < 2: + return None + a, b = stripped[0], stripped[1] + # Top-level list literals: [{...}], ["..."], ['...'] + if a == "[" and b in ("{", '"', "'"): + return "list literal" + # Nested list literals: [[...]] — but bash `[[ -f x ]]` is followed by a + # whitespace char, so we only flag `[[` followed by non-whitespace. + if a == "[" and b == "[": + if len(stripped) >= 3 and stripped[2] not in (" ", "\t"): + return "nested list literal" + return None + # Top-level dict literals: {"key": ...} or {'key': ...}. + # Bash group commands `{ ls; }` always have a space after `{`. + if a == "{" and b in ('"', "'"): + return "dict literal" + return None + + class TerminalAction(Action): """Schema for terminal command execution.""" diff --git a/openhands-tools/openhands/tools/terminal/impl.py b/openhands-tools/openhands/tools/terminal/impl.py index 1e0a3bd950..32ba7ea210 100644 --- a/openhands-tools/openhands/tools/terminal/impl.py +++ b/openhands-tools/openhands/tools/terminal/impl.py @@ -15,8 +15,10 @@ from openhands.sdk.conversation import LocalConversation from openhands.tools.terminal.constants import CMD_OUTPUT_PS1_END from openhands.tools.terminal.definition import ( + _LITERAL_ARG_HINT_TEMPLATE, TerminalAction, TerminalObservation, + looks_like_python_literal_argument, ) from openhands.tools.terminal.terminal.factory import ( _is_tmux_available, @@ -538,6 +540,33 @@ def __call__( if action.reset and action.is_input: raise ValueError("Cannot use reset=True with is_input=True") + # Short-circuit obvious tool-call malformation: Python/JSON literals + # passed where the model should have sent a shell command. The shell + # would otherwise echo a confusing `command not found` and the model + # rarely self-corrects without a structured hint. Skip the check when + # `is_input=True` because that path forwards raw bytes (e.g. keystrokes + # like `C-c`) to a running process and is not a fresh shell command. + if not action.is_input: + literal_kind = looks_like_python_literal_argument(action.command) + if literal_kind is not None: + head = action.command.lstrip()[:60] + logger.warning( + "Rejected terminal call: command argument looks like a " + "Python/JSON %s (head=%r). Returning structured hint to " + "the model instead of executing.", + literal_kind, + head, + ) + return TerminalObservation.from_text( + _LITERAL_ARG_HINT_TEMPLATE.format( + literal_kind=literal_kind, + head=head, + ), + is_error=True, + command=action.command, + exit_code=None, + ) + if self._pool is not None: return self._execute_pooled(action, conversation) else: diff --git a/tests/tools/terminal/test_literal_arg_guard.py b/tests/tools/terminal/test_literal_arg_guard.py new file mode 100644 index 0000000000..93387f96ce --- /dev/null +++ b/tests/tools/terminal/test_literal_arg_guard.py @@ -0,0 +1,155 @@ +"""SDK-5: guard against Python/JSON literals passed to the terminal tool. + +Motivation: in real eval runs (e.g. Nemotron 550B on SWE-Bench), the model +sometimes packs structured arguments — a settings dict, a list of installed +apps, a code blob — into the single ``command`` field. The shell then echoes +``bash: [{default:: command not found`` and the model burns turns retrying +similar garbage. This guard catches the literal pre-execution and replies +with an actionable hint instead. +""" + +import pytest + +from openhands.tools.terminal.definition import ( + TerminalAction, + TerminalObservation, + looks_like_python_literal_argument, +) +from openhands.tools.terminal.impl import TerminalExecutor + + +# -------------------------------------------------------------------------- +# Unit tests for the heuristic. +# -------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "command,expected", + [ + # Real malformed examples extracted from a SWE-Bench Nemotron run. + ("[['col1', 'col2'], ['col1', 'col2']]", "nested list literal"), + ( + "[{'default': {'ENGINE': 'django.db.backends.sqlite3'}}, " + "['django.contrib.auth']]", + "list literal", + ), + ('["field.column]\\n\\nwith connection.schema_editor() ..."]', "list literal"), + # Synthetic variants. + ('{"key": "value"}', "dict literal"), + ("{'key': 'value'}", "dict literal"), + # Leading whitespace must be stripped before classification. + (" [{'a': 1}]", "list literal"), + ('\t["x"]', "list literal"), + ], +) +def test_python_literals_are_detected(command: str, expected: str) -> None: + assert looks_like_python_literal_argument(command) == expected + + +@pytest.mark.parametrize( + "command", + [ + # Legitimate bash tests must NOT trip the guard. + "[ -f /tmp/foo ]", + "[ -d /workspace ]", + '[[ "$x" = "y" ]]', + "[[ -z $VAR ]]", + # Bash group commands. + "{ ls; echo done; }", + # Normal commands. + "ls -la", + "python -c 'print(1)'", + # Even commands whose arguments happen to be JSON-shaped are fine, + # because the COMMAND name is plain text. + "echo '[1, 2, 3]'", + "curl -d '{\"a\": 1}' http://x", + # Edge: empty / one-char strings should never trip. + "", + "[", + "{", + " ", + ], +) +def test_legitimate_commands_are_not_flagged(command: str) -> None: + assert looks_like_python_literal_argument(command) is None + + +# -------------------------------------------------------------------------- +# Executor-level integration: literal => structured error, no shell. +# -------------------------------------------------------------------------- + + +_SHELL_SENTINEL = "Executor should not reach the shell when the command is rejected" + + +@pytest.fixture +def executor_without_shell() -> TerminalExecutor: + """Build a TerminalExecutor without touching the real shell. + + We bypass ``__init__`` (which spins up a real tmux/subprocess session) and + stub both shell-execution paths to raise. The literal guard runs *before* + those paths, so an unrelated command must escape the guard and trigger the + sentinel — that's how we prove the guard didn't fire. + """ + exe = TerminalExecutor.__new__(TerminalExecutor) + exe._pool = None + + def _reach_shell(*_args: object, **_kwargs: object) -> TerminalObservation: + raise AssertionError(_SHELL_SENTINEL) + + exe._execute_pooled = _reach_shell # type: ignore[method-assign] + exe._execute_single_session = _reach_shell # type: ignore[method-assign] + return exe + + +def test_literal_command_returns_structured_error_without_shell( + executor_without_shell: TerminalExecutor, +) -> None: + action = TerminalAction(command="[{'default': {'ENGINE': 'sqlite3'}}]") + obs = executor_without_shell(action) + + assert isinstance(obs, TerminalObservation) + assert obs.is_error is True + assert obs.exit_code is None + assert obs.command == action.command + text = obs.text + assert "list literal" in text + # The hint must teach a concrete recovery path. + assert "file_editor" in text + assert "heredoc" in text or "<<'EOF'" in text + + +def test_bash_test_expression_not_blocked( + executor_without_shell: TerminalExecutor, +) -> None: + """A real bash `[ -f ... ]` must reach the shell, not the literal guard.""" + action = TerminalAction(command="[ -f /tmp/foo ]") + with pytest.raises(AssertionError, match=_SHELL_SENTINEL): + executor_without_shell(action) + + +def test_is_input_path_skips_guard( + executor_without_shell: TerminalExecutor, +) -> None: + """``is_input=True`` forwards raw bytes to a running process. Keystrokes + like ``C-c`` or arbitrary literals are valid in that context and must + bypass the guard.""" + action = TerminalAction(command="[{'sent_as_keystrokes': True}]", is_input=True) + with pytest.raises(AssertionError, match=_SHELL_SENTINEL): + executor_without_shell(action) + + +def test_guard_warning_is_logged( + executor_without_shell: TerminalExecutor, + caplog: pytest.LogCaptureFixture, +) -> None: + """Operators should be able to grep for the guard firing in eval logs.""" + import logging + + caplog.set_level(logging.WARNING, logger="openhands.tools.terminal.impl") + action = TerminalAction(command='["some code that should have been a script"]') + executor_without_shell(action) + assert any( + "Rejected terminal call" in rec.message and "list literal" in rec.message + for rec in caplog.records + )