From 56588a8f672328d658bf7114b1b510169a7b8d6e Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 9 Jun 2026 03:20:47 +0000
Subject: [PATCH] feat(terminal): reject Python/JSON literals passed as bash
 commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the model emits a tool call where `command` is a Python/JSON
literal — e.g. `[{'default': {...}}, ['apps'], ['code...']]` — bash
returns a useless "command not found" and the model rarely
self-corrects. Trajectory analysis of a SWE-Bench Nemotron run showed
~12% of terminal calls (18 of 148) burning context this way, with
cascading cost growth because failed turns stay in history.

This PR adds a pre-execution guard in `TerminalExecutor` that:

* Detects the literal head pattern (`[{`, `[[`, `["`, `['`,
  `{"`, `{'`) while carefully exempting real bash uses of `[`,
  `[[`, and `{` (which are always followed by whitespace).
* Returns a `TerminalObservation(is_error=True)` whose text names the
  literal kind, shows the offending head, and points the model at the
  two recovery patterns: `file_editor create` + `python /tmp/x.py`,
  or an inline heredoc.
* Emits one structured WARN log per rejection so eval pipelines can
  grep for the pattern.
* Skips the check when `is_input=True` (raw bytes forwarded to a
  running process, e.g. `C-c` keystrokes).

The detection lives in `definition.py` next to `TerminalAction` as
`looks_like_python_literal_argument(command) -> str | None`, exported
for reuse and individually unit-tested.

Tests: `tests/tools/terminal/test_literal_arg_guard.py` — 24 new
tests covering the heuristic matrix (real malformed samples, synthetic
variants, bash false-positives like `[ -f x ]` and `{ ls; }`), the
executor returning the structured error without hitting the shell,
`is_input=True` bypass, and WARN log emission. Existing terminal
parsing/observation suite (35 tests) passes unchanged.

Lint+pyright clean.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../openhands/tools/terminal/definition.py    |  49 ++++++
 .../openhands/tools/terminal/impl.py          |  29 ++++
 .../tools/terminal/test_literal_arg_guard.py  | 155 ++++++++++++++++++
 3 files changed, 233 insertions(+)
 create mode 100644 tests/tools/terminal/test_literal_arg_guard.py

diff --git a/openhands-tools/openhands/tools/terminal/definition.py b/openhands-tools/openhands/tools/terminal/definition.py
index f3068f2994..69df4d4e94 100644
--- a/openhands-tools/openhands/tools/terminal/definition.py
+++ b/openhands-tools/openhands/tools/terminal/definition.py
@@ -34,6 +34,55 @@
 from openhands.tools.terminal.metadata import CmdOutputMetadata
 
 
+_LITERAL_ARG_HINT_TEMPLATE = (
+    "[Tool-argument error] The `command` argument looks like a Python/JSON "
+    "{literal_kind}, not a shell command. It starts with: {head!r}\n\n"
+    "The `terminal` tool runs exactly ONE shell command at a time. To pass "
+    "structured data or multi-line code:\n"
+    "  - Write a script first with `file_editor` "
+    '(command="create", path="/tmp/run.py", ...), then invoke it: '
+    "`python /tmp/run.py`.\n"
+    "  - Or use a heredoc inline, e.g.:\n"
+    "        python - <<'EOF'\n"
+    "        DATABASES = {{'default': {{...}}}}\n"
+    "        # your code here\n"
+    "        EOF\n\n"
+    "Do not put a Python list/dict literal into the `command` field; the shell "
+    "cannot interpret it."
+)
+
+
+def looks_like_python_literal_argument(command: str) -> str | None:
+    """Detect when a tool call has packed structured data into `command`.
+
+    Returns a short reason string (``"list literal"``, ``"nested list literal"``
+    or ``"dict literal"``) if ``command`` appears to be a Python/JSON literal
+    rather than a shell command, otherwise ``None``.
+
+    Carefully distinguishes legitimate bash uses of ``[`` and ``[[`` (which
+    are always followed by whitespace) from Python-style literals. See the
+    accompanying tests for the full matrix.
+    """
+    stripped = command.lstrip()
+    if len(stripped) < 2:
+        return None
+    a, b = stripped[0], stripped[1]
+    # Top-level list literals: [{...}], ["..."], ['...']
+    if a == "[" and b in ("{", '"', "'"):
+        return "list literal"
+    # Nested list literals: [[...]] — but bash `[[ -f x ]]` is followed by a
+    # whitespace char, so we only flag `[[` followed by non-whitespace.
+    if a == "[" and b == "[":
+        if len(stripped) >= 3 and stripped[2] not in (" ", "\t"):
+            return "nested list literal"
+        return None
+    # Top-level dict literals: {"key": ...} or {'key': ...}.
+    # Bash group commands `{ ls; }` always have a space after `{`.
+    if a == "{" and b in ('"', "'"):
+        return "dict literal"
+    return None
+
+
 class TerminalAction(Action):
     """Schema for terminal command execution."""
 
diff --git a/openhands-tools/openhands/tools/terminal/impl.py b/openhands-tools/openhands/tools/terminal/impl.py
index 1e0a3bd950..32ba7ea210 100644
--- a/openhands-tools/openhands/tools/terminal/impl.py
+++ b/openhands-tools/openhands/tools/terminal/impl.py
@@ -15,8 +15,10 @@
     from openhands.sdk.conversation import LocalConversation
 from openhands.tools.terminal.constants import CMD_OUTPUT_PS1_END
 from openhands.tools.terminal.definition import (
+    _LITERAL_ARG_HINT_TEMPLATE,
     TerminalAction,
     TerminalObservation,
+    looks_like_python_literal_argument,
 )
 from openhands.tools.terminal.terminal.factory import (
     _is_tmux_available,
@@ -538,6 +540,33 @@ def __call__(
         if action.reset and action.is_input:
             raise ValueError("Cannot use reset=True with is_input=True")
 
+        # Short-circuit obvious tool-call malformation: Python/JSON literals
+        # passed where the model should have sent a shell command. The shell
+        # would otherwise echo a confusing `command not found` and the model
+        # rarely self-corrects without a structured hint. Skip the check when
+        # `is_input=True` because that path forwards raw bytes (e.g. keystrokes
+        # like `C-c`) to a running process and is not a fresh shell command.
+        if not action.is_input:
+            literal_kind = looks_like_python_literal_argument(action.command)
+            if literal_kind is not None:
+                head = action.command.lstrip()[:60]
+                logger.warning(
+                    "Rejected terminal call: command argument looks like a "
+                    "Python/JSON %s (head=%r). Returning structured hint to "
+                    "the model instead of executing.",
+                    literal_kind,
+                    head,
+                )
+                return TerminalObservation.from_text(
+                    _LITERAL_ARG_HINT_TEMPLATE.format(
+                        literal_kind=literal_kind,
+                        head=head,
+                    ),
+                    is_error=True,
+                    command=action.command,
+                    exit_code=None,
+                )
+
         if self._pool is not None:
             return self._execute_pooled(action, conversation)
         else:
diff --git a/tests/tools/terminal/test_literal_arg_guard.py b/tests/tools/terminal/test_literal_arg_guard.py
new file mode 100644
index 0000000000..93387f96ce
--- /dev/null
+++ b/tests/tools/terminal/test_literal_arg_guard.py
@@ -0,0 +1,155 @@
+"""SDK-5: guard against Python/JSON literals passed to the terminal tool.
+
+Motivation: in real eval runs (e.g. Nemotron 550B on SWE-Bench), the model
+sometimes packs structured arguments — a settings dict, a list of installed
+apps, a code blob — into the single ``command`` field. The shell then echoes
+``bash: [{default:: command not found`` and the model burns turns retrying
+similar garbage. This guard catches the literal pre-execution and replies
+with an actionable hint instead.
+"""
+
+import pytest
+
+from openhands.tools.terminal.definition import (
+    TerminalAction,
+    TerminalObservation,
+    looks_like_python_literal_argument,
+)
+from openhands.tools.terminal.impl import TerminalExecutor
+
+
+# --------------------------------------------------------------------------
+# Unit tests for the heuristic.
+# --------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "command,expected",
+    [
+        # Real malformed examples extracted from a SWE-Bench Nemotron run.
+        ("[['col1', 'col2'], ['col1', 'col2']]", "nested list literal"),
+        (
+            "[{'default': {'ENGINE': 'django.db.backends.sqlite3'}}, "
+            "['django.contrib.auth']]",
+            "list literal",
+        ),
+        ('["field.column]\\n\\nwith connection.schema_editor() ..."]', "list literal"),
+        # Synthetic variants.
+        ('{"key": "value"}', "dict literal"),
+        ("{'key': 'value'}", "dict literal"),
+        # Leading whitespace must be stripped before classification.
+        ("   [{'a': 1}]", "list literal"),
+        ('\t["x"]', "list literal"),
+    ],
+)
+def test_python_literals_are_detected(command: str, expected: str) -> None:
+    assert looks_like_python_literal_argument(command) == expected
+
+
+@pytest.mark.parametrize(
+    "command",
+    [
+        # Legitimate bash tests must NOT trip the guard.
+        "[ -f /tmp/foo ]",
+        "[ -d /workspace ]",
+        '[[ "$x" = "y" ]]',
+        "[[ -z $VAR ]]",
+        # Bash group commands.
+        "{ ls; echo done; }",
+        # Normal commands.
+        "ls -la",
+        "python -c 'print(1)'",
+        # Even commands whose arguments happen to be JSON-shaped are fine,
+        # because the COMMAND name is plain text.
+        "echo '[1, 2, 3]'",
+        "curl -d '{\"a\": 1}' http://x",
+        # Edge: empty / one-char strings should never trip.
+        "",
+        "[",
+        "{",
+        " ",
+    ],
+)
+def test_legitimate_commands_are_not_flagged(command: str) -> None:
+    assert looks_like_python_literal_argument(command) is None
+
+
+# --------------------------------------------------------------------------
+# Executor-level integration: literal => structured error, no shell.
+# --------------------------------------------------------------------------
+
+
+_SHELL_SENTINEL = "Executor should not reach the shell when the command is rejected"
+
+
+@pytest.fixture
+def executor_without_shell() -> TerminalExecutor:
+    """Build a TerminalExecutor without touching the real shell.
+
+    We bypass ``__init__`` (which spins up a real tmux/subprocess session) and
+    stub both shell-execution paths to raise. The literal guard runs *before*
+    those paths, so an unrelated command must escape the guard and trigger the
+    sentinel — that's how we prove the guard didn't fire.
+    """
+    exe = TerminalExecutor.__new__(TerminalExecutor)
+    exe._pool = None
+
+    def _reach_shell(*_args: object, **_kwargs: object) -> TerminalObservation:
+        raise AssertionError(_SHELL_SENTINEL)
+
+    exe._execute_pooled = _reach_shell  # type: ignore[method-assign]
+    exe._execute_single_session = _reach_shell  # type: ignore[method-assign]
+    return exe
+
+
+def test_literal_command_returns_structured_error_without_shell(
+    executor_without_shell: TerminalExecutor,
+) -> None:
+    action = TerminalAction(command="[{'default': {'ENGINE': 'sqlite3'}}]")
+    obs = executor_without_shell(action)
+
+    assert isinstance(obs, TerminalObservation)
+    assert obs.is_error is True
+    assert obs.exit_code is None
+    assert obs.command == action.command
+    text = obs.text
+    assert "list literal" in text
+    # The hint must teach a concrete recovery path.
+    assert "file_editor" in text
+    assert "heredoc" in text or "<<'EOF'" in text
+
+
+def test_bash_test_expression_not_blocked(
+    executor_without_shell: TerminalExecutor,
+) -> None:
+    """A real bash `[ -f ... ]` must reach the shell, not the literal guard."""
+    action = TerminalAction(command="[ -f /tmp/foo ]")
+    with pytest.raises(AssertionError, match=_SHELL_SENTINEL):
+        executor_without_shell(action)
+
+
+def test_is_input_path_skips_guard(
+    executor_without_shell: TerminalExecutor,
+) -> None:
+    """``is_input=True`` forwards raw bytes to a running process. Keystrokes
+    like ``C-c`` or arbitrary literals are valid in that context and must
+    bypass the guard."""
+    action = TerminalAction(command="[{'sent_as_keystrokes': True}]", is_input=True)
+    with pytest.raises(AssertionError, match=_SHELL_SENTINEL):
+        executor_without_shell(action)
+
+
+def test_guard_warning_is_logged(
+    executor_without_shell: TerminalExecutor,
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    """Operators should be able to grep for the guard firing in eval logs."""
+    import logging
+
+    caplog.set_level(logging.WARNING, logger="openhands.tools.terminal.impl")
+    action = TerminalAction(command='["some code that should have been a script"]')
+    executor_without_shell(action)
+    assert any(
+        "Rejected terminal call" in rec.message and "list literal" in rec.message
+        for rec in caplog.records
+    )