From f0bab19722b538c080d2edfbd9e75ddf7c03f5ec Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 8 Jun 2026 07:02:57 -0400 Subject: [PATCH] agents(hermes): wire reasoning config for chat slot (#661, closes #635) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire `display.show_reasoning: true`, `display.streaming: true`, and `model.max_tokens: 8192` into the Hermes config template so the chat slot's Qwen3-class thinking model's reasoning is visible in the TUI. Three Hermes thinking-model gotchas applied (#635 memory note): - streaming: true is the transport prerequisite; without it the TUI hangs silently on the <think> block. - show_reasoning: true is the UI gate; without it the reasoning output is suppressed even when streaming is enabled. - max_tokens: 8192 caps the budget so Qwen3 cannot exhaust it entirely in <think> and return an empty content field (silent TUI). Delegation → `agent` slot (ace-saber MoE, thinking-off) is unchanged; _DELEGATION_SLOT_NAME was already renamed "agent" in #654/#665. Reconciliation (#635 vs #661): #635 asked for "advanced-reasoning subagents → chat-27b" but Hermes has a single delegation config — no per-subagent-type routing exists upstream. Reasoning visibility therefore lives on the top-level chat conversation (show_reasoning + streaming on the main model); the agent MoE handles delegation and is intentionally thinking-off for fast agentic subtasks. A note is added to the delegation comment in the template. Also updated the template docstring to fix the stale `agent-hermes` reference (post-rename: slot is now `agent`). Live container e2e deferred: hal0-dev has no GPU; CT105 runtime verification is out of scope for this PR (noted in acceptance criteria). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agents/hermes_templates/config.yaml.j2 | 36 +++-- tests/agents/test_hermes_provision.py | 132 ++++++++++++++++++ 2 files changed, 159 insertions(+), 9 deletions(-) diff --git a/src/hal0/agents/hermes_templates/config.yaml.j2 b/src/hal0/agents/hermes_templates/config.yaml.j2 index 77b7f714..50403f9f 100644 --- a/src/hal0/agents/hermes_templates/config.yaml.j2 +++ b/src/hal0/agents/hermes_templates/config.yaml.j2 @@ -17,8 +17,8 @@ system_prompt — str (persona-rendered prelude; PR-3 Phase 7) personality_name — str (display name of active persona; PR-3 Phase 8) delegation — dict or None ({model, base_url, provider}) — subagent - role model from the `agent-hermes` slot. None → block - omitted → subagents inherit the chat model. + role model from the `agent` slot (post-rename #654). + None → block omitted → subagents inherit the chat model. auxiliary_tasks — dict task → {provider, model, base_url}. Drives the auxiliary: block. utility-slot tasks render provider:"custom"+base_url; vision/web_extract stay @@ -62,6 +62,13 @@ model: provider: "custom" base_url: "http://127.0.0.1:8080/v1" {%- endif %} + # max_tokens guards against thinking-model silent-TUI: Qwen3 chat (the + # `chat` slot) can burn the entire budget on <think> and return + # an empty content field if max_tokens is unset. 8192 gives the model + # room to think + reply while keeping responses bounded. Without this, + # `display.show_reasoning: true` (below) still shows nothing because the + # content field is empty. Gotcha documented in #635 + hermes memory note. + max_tokens: 8192 providers: # `custom` is upstream Hermes's built-in profile for OpenAI-compatible @@ -117,12 +124,18 @@ custom_providers: {%- endif %} {%- if delegation %} -{# feat/hermes-role-slots: subagents run on a dedicated slot - (`agent-hermes`). 
delegate_tool.py `_resolve_delegation_credentials` - reads delegation.{model,provider,base_url}; setting base_url forces - provider → "custom" so subagents hit hal0's OpenAI surface. Omitted - entirely when the slot isn't live, in which case subagents inherit the - parent (chat) model. #} +{# feat/hermes-role-slots (#661): subagents run on the `agent` slot (ace-saber + MoE, thinking-off). This is intentionally different from the main `chat` + slot (thinking-on, Qwen3 class). The chat slot is used for the top-level + reasoning-visible conversation; the agent MoE is the fast non-think path + for agentic subtasks (tool use, decomposition). Reconciliation: #635 asked + for "advanced-reasoning subagents → chat-27b" but Hermes has one delegation + config (no per-subagent-type routing), so reasoning lives on the chat model + (show_reasoning + streaming below) and the agent slot handles delegation. + delegate_tool.py `_resolve_delegation_credentials` reads + delegation.{model,provider,base_url}; base_url forces provider → "custom" + so subagents hit hal0's OpenAI surface. Omitted when the slot is not live, + in which case subagents inherit the parent (chat) model. #} delegation: model: {{ delegation.model | tojson }} provider: {{ delegation.provider | default("custom") | tojson }} @@ -185,7 +198,12 @@ agent: display: bell_on_complete: false - show_reasoning: false + # Reasoning visibility: `chat` slot runs Qwen3-class thinking models. + # Both flags are required together — streaming: true is the transport + # prerequisite; show_reasoning: true is the UI gate. Without streaming, + # the TUI hangs silently on the <think> block. Gotcha from #635. + streaming: true + show_reasoning: true {%- if personality_name %} # Phase 8 (PR-3): UI-facing label for the active persona; surfaces in # `hermes` itself when the operator runs the CLI directly. 
The diff --git a/tests/agents/test_hermes_provision.py b/tests/agents/test_hermes_provision.py index d9596108..66540937 100644 --- a/tests/agents/test_hermes_provision.py +++ b/tests/agents/test_hermes_provision.py @@ -869,6 +869,138 @@ def test_config_write_renders_role_slots_from_live_state( ] +# ── #661/#635: reasoning wiring — chat slot + display flags ────────────────── +# +# #635 wires the `chat` slot (Qwen3-class thinking-on model) as the top-level +# model with reasoning visible in the TUI. Three Hermes gotchas apply: +# 1. model.base_url MUST be set (provider:custom requires it) ← already done. +# 2. model.max_tokens MUST be set — else Qwen3 spends the full budget on +# <think> and the content field comes back empty (silent TUI). +# 3. display.streaming AND display.show_reasoning BOTH required; without +# streaming the TUI hangs on the reasoning block. +# +# Reconciliation with #661: delegation→`agent` (ace-saber MoE, thinking-off) +# is already wired via _DELEGATION_SLOT_NAME="agent". Hermes has one delegation +# config (no per-subagent-type routing), so the reasoning-ON path is the top- +# level chat conversation, not a separate subagent slot. The agent MoE stays +# thinking-off by design. 
+ + +def test_rendered_config_has_show_reasoning_true() -> None: + """display.show_reasoning: true is required for thinking-model TUI visibility.""" + yaml = pytest.importorskip("yaml") + rendered = hp._render_config_yaml( + primary={ + "model_id": "qwen3-coder-next-reap-40b-a3b-q4kxl", + "backend_url": _HAL0_V1, + "context_length": 32768, + }, + agent_id="hermes-agent", + ) + cfg = yaml.safe_load(rendered) + display = cfg.get("display") or {} + assert display.get("show_reasoning") is True, ( + "display.show_reasoning must be true for Qwen3 thinking visibility " + "(#635 gotcha: without it, reasoning output is silently suppressed)" + ) + + +def test_rendered_config_has_streaming_true() -> None: + """display.streaming: true is required alongside show_reasoning for thinking models. + + Without streaming, the TUI hangs on the <think> block while waiting for + the full response before displaying anything (#635 + memory note). + """ + yaml = pytest.importorskip("yaml") + rendered = hp._render_config_yaml( + primary={ + "model_id": "qwen3-coder-next-reap-40b-a3b-q4kxl", + "backend_url": _HAL0_V1, + "context_length": 32768, + }, + agent_id="hermes-agent", + ) + cfg = yaml.safe_load(rendered) + display = cfg.get("display") or {} + assert display.get("streaming") is True, ( + "display.streaming must be true for thinking models (#635 gotcha: " + "without streaming, the TUI hangs silently on the block)" + ) + + +def test_rendered_config_has_model_max_tokens() -> None: + """model.max_tokens is required to prevent silent TUI on Qwen3 thinking models. + + Without a max_tokens cap, Qwen3 can exhaust the budget in <think> and + return an empty content field — which looks like a broken wiring even + when everything else is correct (#635 + smoke-test timeout comment in + _smoke_chat_roundtrip). 
+ """ + yaml = pytest.importorskip("yaml") + rendered = hp._render_config_yaml( + primary={ + "model_id": "qwen3-coder-next-reap-40b-a3b-q4kxl", + "backend_url": _HAL0_V1, + "context_length": 32768, + }, + agent_id="hermes-agent", + ) + cfg = yaml.safe_load(rendered) + model_cfg = cfg.get("model") or {} + max_tokens = model_cfg.get("max_tokens") + assert isinstance(max_tokens, int) and max_tokens > 0, ( + "model.max_tokens must be a positive integer — " + "Qwen3 thinking models silently drain the budget otherwise (#635)" + ) + + +def test_rendered_config_model_base_url_set() -> None: + """model.base_url is always present in the rendered config. + + Hermes's bare ``provider: custom`` requires model.base_url to be set + or it falls back to OpenRouter and emits '... is not a valid model ID' 400 + (#635 gotcha, also tracked in memory hermes_bare_custom_needs_model_base_url). + """ + yaml = pytest.importorskip("yaml") + # Verify both the primary-slot branch and the no-slot fallback branch. + for primary in [ + {"model_id": "qwen3-27b", "backend_url": _HAL0_V1, "context_length": 32768}, + None, + ]: + rendered = hp._render_config_yaml(primary=primary, agent_id="hermes-agent") + cfg = yaml.safe_load(rendered) + model_cfg = cfg.get("model") or {} + assert model_cfg.get("base_url"), ( + f"model.base_url must be set (primary={primary!r}) — " + "bare provider:custom without base_url routes to OpenRouter" + ) + + +def test_delegation_targets_agent_slot_not_chat() -> None: + """Delegation → `agent` MoE slot (thinking-off); chat stays on main model. + + This validates the #661/#635 reconciliation: #635 asked for + 'advanced-reasoning subagents → chat-27b' but Hermes has a single + delegation config. Reasoning lives on the top-level chat conversation + (show_reasoning + streaming); the agent MoE handles delegation. + The chat slot model must NOT appear as the delegation model. 
+ """ + deleg = hp._resolve_delegation(_ROLE_SLOTS, hal0_base_url=_HAL0_V1) + assert deleg is not None, "delegation must be set when the agent slot is live" + # The delegation model is the agent MoE, not the chat-27b model. + assert deleg["model"] == "hermes-4-14b-q5km", ( + "delegation.model must be the agent slot model " + "(ace-saber MoE, thinking-off) — not the chat slot" + ) + chat_model = "qwen3-coder-next-reap-40b-a3b-q4kxl" + assert deleg["model"] != chat_model, ( + "delegation must NOT be the chat-slot model; " + "reasoning runs on the top-level chat, not in subagents" + ) + assert deleg["base_url"] == _HAL0_V1 + assert deleg["provider"] == "custom" + + def test_config_write_phase_writes_yaml_idempotently( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: