diff --git a/openhands-tools/openhands/tools/terminal/definition.py b/openhands-tools/openhands/tools/terminal/definition.py index f3068f2994..69df4d4e94 100644 --- a/openhands-tools/openhands/tools/terminal/definition.py +++ b/openhands-tools/openhands/tools/terminal/definition.py @@ -34,6 +34,55 @@ from openhands.tools.terminal.metadata import CmdOutputMetadata +_LITERAL_ARG_HINT_TEMPLATE = ( + "[Tool-argument error] The `command` argument looks like a Python/JSON " + "{literal_kind}, not a shell command. It starts with: {head!r}\n\n" + "The `terminal` tool runs exactly ONE shell command at a time. To pass " + "structured data or multi-line code:\n" + " - Write a script first with `file_editor` " + '(command="create", path="/tmp/run.py", ...), then invoke it: ' + "`python /tmp/run.py`.\n" + " - Or use a heredoc inline, e.g.:\n" + " python - <<'EOF'\n" + " DATABASES = {{'default': {{...}}}}\n" + " # your code here\n" + " EOF\n\n" + "Do not put a Python list/dict literal into the `command` field; the shell " + "cannot interpret it." +) + + +def looks_like_python_literal_argument(command: str) -> str | None: + """Detect when a tool call has packed structured data into `command`. + + Returns a short reason string (``"list literal"``, ``"nested list literal"`` + or ``"dict literal"``) if ``command`` appears to be a Python/JSON literal + rather than a shell command, otherwise ``None``. + + Carefully distinguishes legitimate bash uses of ``[`` and ``[[`` (which + are always followed by whitespace) from Python-style literals. See the + accompanying tests for the full matrix. + """ + stripped = command.lstrip() + if len(stripped) < 2: + return None + a, b = stripped[0], stripped[1] + # Top-level list literals: [{...}], ["..."], ['...'] + if a == "[" and b in ("{", '"', "'"): + return "list literal" + # Nested list literals: [[...]] — but bash `[[ -f x ]]` is followed by a + # whitespace char, so we only flag `[[` followed by non-whitespace. + if a == "[" and b == "[": + if len(stripped) >= 3 and stripped[2] not in (" ", "\t"): + return "nested list literal" + return None + # Top-level dict literals: {"key": ...} or {'key': ...}. + # Bash group commands `{ ls; }` always have a space after `{`. + if a == "{" and b in ('"', "'"): + return "dict literal" + return None + + class TerminalAction(Action): """Schema for terminal command execution.""" diff --git a/openhands-tools/openhands/tools/terminal/impl.py b/openhands-tools/openhands/tools/terminal/impl.py index 1e0a3bd950..32ba7ea210 100644 --- a/openhands-tools/openhands/tools/terminal/impl.py +++ b/openhands-tools/openhands/tools/terminal/impl.py @@ -15,8 +15,10 @@ from openhands.sdk.conversation import LocalConversation from openhands.tools.terminal.constants import CMD_OUTPUT_PS1_END from openhands.tools.terminal.definition import ( + _LITERAL_ARG_HINT_TEMPLATE, TerminalAction, TerminalObservation, + looks_like_python_literal_argument, ) from openhands.tools.terminal.terminal.factory import ( _is_tmux_available, @@ -538,6 +540,33 @@ def __call__( if action.reset and action.is_input: raise ValueError("Cannot use reset=True with is_input=True") + # Short-circuit obvious tool-call malformation: Python/JSON literals + # passed where the model should have sent a shell command. The shell + # would otherwise echo a confusing `command not found` and the model + # rarely self-corrects without a structured hint. Skip the check when + # `is_input=True` because that path forwards raw bytes (e.g. keystrokes + # like `C-c`) to a running process and is not a fresh shell command. + if not action.is_input: + literal_kind = looks_like_python_literal_argument(action.command) + if literal_kind is not None: + head = action.command.lstrip()[:60] + logger.warning( + "Rejected terminal call: command argument looks like a " + "Python/JSON %s (head=%r). Returning structured hint to " + "the model instead of executing.", + literal_kind, + head, + ) + return TerminalObservation.from_text( + _LITERAL_ARG_HINT_TEMPLATE.format( + literal_kind=literal_kind, + head=head, + ), + is_error=True, + command=action.command, + exit_code=None, + ) + if self._pool is not None: return self._execute_pooled(action, conversation) else: diff --git a/tests/tools/terminal/test_literal_arg_guard.py b/tests/tools/terminal/test_literal_arg_guard.py new file mode 100644 index 0000000000..93387f96ce --- /dev/null +++ b/tests/tools/terminal/test_literal_arg_guard.py @@ -0,0 +1,155 @@ +"""SDK-5: guard against Python/JSON literals passed to the terminal tool. + +Motivation: in real eval runs (e.g. Nemotron 550B on SWE-Bench), the model +sometimes packs structured arguments — a settings dict, a list of installed +apps, a code blob — into the single ``command`` field. The shell then echoes +``bash: [{default:: command not found`` and the model burns turns retrying +similar garbage. This guard catches the literal pre-execution and replies +with an actionable hint instead. +""" + +import pytest + +from openhands.tools.terminal.definition import ( + TerminalAction, + TerminalObservation, + looks_like_python_literal_argument, +) +from openhands.tools.terminal.impl import TerminalExecutor + + +# -------------------------------------------------------------------------- +# Unit tests for the heuristic. +# -------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "command,expected", + [ + # Real malformed examples extracted from a SWE-Bench Nemotron run. + ("[['col1', 'col2'], ['col1', 'col2']]", "nested list literal"), + ( + "[{'default': {'ENGINE': 'django.db.backends.sqlite3'}}, " + "['django.contrib.auth']]", + "list literal", + ), + ('["field.column]\\n\\nwith connection.schema_editor() ..."]', "list literal"), + # Synthetic variants. + ('{"key": "value"}', "dict literal"), + ("{'key': 'value'}", "dict literal"), + # Leading whitespace must be stripped before classification. + (" [{'a': 1}]", "list literal"), + ('\t["x"]', "list literal"), + ], +) +def test_python_literals_are_detected(command: str, expected: str) -> None: + assert looks_like_python_literal_argument(command) == expected + + +@pytest.mark.parametrize( + "command", + [ + # Legitimate bash tests must NOT trip the guard. + "[ -f /tmp/foo ]", + "[ -d /workspace ]", + '[[ "$x" = "y" ]]', + "[[ -z $VAR ]]", + # Bash group commands. + "{ ls; echo done; }", + # Normal commands. + "ls -la", + "python -c 'print(1)'", + # Even commands whose arguments happen to be JSON-shaped are fine, + # because the COMMAND name is plain text. + "echo '[1, 2, 3]'", + "curl -d '{\"a\": 1}' http://x", + # Edge: empty / one-char strings should never trip. + "", + "[", + "{", + " ", + ], +) +def test_legitimate_commands_are_not_flagged(command: str) -> None: + assert looks_like_python_literal_argument(command) is None + + +# -------------------------------------------------------------------------- +# Executor-level integration: literal => structured error, no shell. +# -------------------------------------------------------------------------- + + +_SHELL_SENTINEL = "Executor should not reach the shell when the command is rejected" + + +@pytest.fixture +def executor_without_shell() -> TerminalExecutor: + """Build a TerminalExecutor without touching the real shell. + + We bypass ``__init__`` (which spins up a real tmux/subprocess session) and + stub both shell-execution paths to raise. The literal guard runs *before* + those paths, so an unrelated command must escape the guard and trigger the + sentinel — that's how we prove the guard didn't fire. + """ + exe = TerminalExecutor.__new__(TerminalExecutor) + exe._pool = None + + def _reach_shell(*_args: object, **_kwargs: object) -> TerminalObservation: + raise AssertionError(_SHELL_SENTINEL) + + exe._execute_pooled = _reach_shell # type: ignore[method-assign] + exe._execute_single_session = _reach_shell # type: ignore[method-assign] + return exe + + +def test_literal_command_returns_structured_error_without_shell( + executor_without_shell: TerminalExecutor, +) -> None: + action = TerminalAction(command="[{'default': {'ENGINE': 'sqlite3'}}]") + obs = executor_without_shell(action) + + assert isinstance(obs, TerminalObservation) + assert obs.is_error is True + assert obs.exit_code is None + assert obs.command == action.command + text = obs.text + assert "list literal" in text + # The hint must teach a concrete recovery path. + assert "file_editor" in text + assert "heredoc" in text or "<<'EOF'" in text + + +def test_bash_test_expression_not_blocked( + executor_without_shell: TerminalExecutor, +) -> None: + """A real bash `[ -f ... ]` must reach the shell, not the literal guard.""" + action = TerminalAction(command="[ -f /tmp/foo ]") + with pytest.raises(AssertionError, match=_SHELL_SENTINEL): + executor_without_shell(action) + + +def test_is_input_path_skips_guard( + executor_without_shell: TerminalExecutor, +) -> None: + """``is_input=True`` forwards raw bytes to a running process. Keystrokes + like ``C-c`` or arbitrary literals are valid in that context and must + bypass the guard.""" + action = TerminalAction(command="[{'sent_as_keystrokes': True}]", is_input=True) + with pytest.raises(AssertionError, match=_SHELL_SENTINEL): + executor_without_shell(action) + + +def test_guard_warning_is_logged( + executor_without_shell: TerminalExecutor, + caplog: pytest.LogCaptureFixture, +) -> None: + """Operators should be able to grep for the guard firing in eval logs.""" + import logging + + caplog.set_level(logging.WARNING, logger="openhands.tools.terminal.impl") + action = TerminalAction(command='["some code that should have been a script"]') + executor_without_shell(action) + assert any( + "Rejected terminal call" in rec.message and "list literal" in rec.message + for rec in caplog.records + )