From f0bab19722b538c080d2edfbd9e75ddf7c03f5ec Mon Sep 17 00:00:00 2001
From: Alexander <alexander@awideweb.com>
Date: Mon, 8 Jun 2026 07:02:57 -0400
Subject: [PATCH] agents(hermes): wire reasoning config for chat slot (#661,
 closes #635)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wire `display.show_reasoning: true`, `display.streaming: true`, and
`model.max_tokens: 8192` into the Hermes config template so the chat
slot's Qwen3-class thinking model is visible in the TUI.

Three Hermes thinking-model gotchas applied (#635 memory note):
- streaming: true is the transport prerequisite; without it the TUI
  hangs silently on the <think> block.
- show_reasoning: true is the UI gate; without it the reasoning output
  is suppressed even when streaming is enabled.
- max_tokens: 8192 sets an explicit budget with room for <think> plus the
  reply; left unset, Qwen3 can spend the whole budget in <think> and
  return an empty content field (silent TUI).

Delegation → `agent` slot (ace-saber MoE, thinking-off) is unchanged;
_DELEGATION_SLOT_NAME was already renamed "agent" in #654/#665.

Reconciliation (#635 vs #661): #635 asked for "advanced-reasoning
subagents → chat-27b" but Hermes has a single delegation config — no
per-subagent-type routing exists upstream. Reasoning visibility
therefore lives on the top-level chat conversation (show_reasoning +
streaming on the main model); the agent MoE handles delegation and is
intentionally thinking-off for fast agentic subtasks. A note is added
to the delegation comment in the template.

Also updated the template docstring to fix the stale `agent-hermes`
reference (post-rename: slot is now `agent`).

Live container e2e deferred: hal0-dev has no GPU; CT105 runtime
verification is out of scope for this PR (noted in acceptance criteria).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agents/hermes_templates/config.yaml.j2    |  36 +++--
 tests/agents/test_hermes_provision.py         | 132 ++++++++++++++++++
 2 files changed, 159 insertions(+), 9 deletions(-)
diff --git a/src/hal0/agents/hermes_templates/config.yaml.j2 b/src/hal0/agents/hermes_templates/config.yaml.j2
index 77b7f714..50403f9f 100644
--- a/src/hal0/agents/hermes_templates/config.yaml.j2
+++ b/src/hal0/agents/hermes_templates/config.yaml.j2
@@ -17,8 +17,8 @@
      system_prompt    — str          (persona-rendered prelude; PR-3 Phase 7)
      personality_name — str          (display name of active persona; PR-3 Phase 8)
      delegation       — dict or None ({model, base_url, provider}) — subagent
-                        role model from the `agent-hermes` slot. None → block
-                        omitted → subagents inherit the chat model.
+                        role model from the `agent` slot (post-rename #654).
+                        None → block omitted → subagents inherit the chat model.
      auxiliary_tasks  — dict          task → {provider, model, base_url}.
                         Drives the auxiliary: block. utility-slot tasks render
                         provider:"custom"+base_url; vision/web_extract stay
@@ -62,6 +62,13 @@ model:
   provider: "custom"
   base_url: "http://127.0.0.1:8080/v1"
 {%- endif %}
+  # max_tokens guards against thinking-model silent-TUI: Qwen3 chat (the
+  # `chat` slot) can burn the entire budget on <think>…</think> and return
+  # an empty content field if max_tokens is unset. 8192 gives the model
+  # room to think + reply while keeping responses bounded. Without this,
+  # `display.show_reasoning: true` (below) still shows nothing because the
+  # content field is empty. Gotcha documented in #635 + hermes memory note.
+  max_tokens: 8192
 
 providers:
   # `custom` is upstream Hermes's built-in profile for OpenAI-compatible
@@ -117,12 +124,18 @@ custom_providers:
 {%- endif %}
 {%- if delegation %}
 
-{# feat/hermes-role-slots: subagents run on a dedicated slot
-   (`agent-hermes`). delegate_tool.py `_resolve_delegation_credentials`
-   reads delegation.{model,provider,base_url}; setting base_url forces
-   provider → "custom" so subagents hit hal0's OpenAI surface. Omitted
-   entirely when the slot isn't live, in which case subagents inherit the
-   parent (chat) model. #}
+{# feat/hermes-role-slots (#661): subagents run on the `agent` slot (ace-saber
+   MoE, thinking-off). This is intentionally different from the main `chat`
+   slot (thinking-on, Qwen3 class). The chat slot is used for the top-level
+   reasoning-visible conversation; the agent MoE is the fast non-think path
+   for agentic subtasks (tool use, decomposition). Reconciliation: #635 asked
+   for "advanced-reasoning subagents → chat-27b" but Hermes has one delegation
+   config (no per-subagent-type routing), so reasoning lives on the chat model
+   (show_reasoning + streaming below) and the agent slot handles delegation.
+   delegate_tool.py `_resolve_delegation_credentials` reads
+   delegation.{model,provider,base_url}; base_url forces provider → "custom"
+   so subagents hit hal0's OpenAI surface. Omitted when the slot is not live,
+   in which case subagents inherit the parent (chat) model. #}
 delegation:
   model: {{ delegation.model | tojson }}
   provider: {{ delegation.provider | default("custom") | tojson }}
@@ -185,7 +198,12 @@ agent:
 
 display:
   bell_on_complete: false
-  show_reasoning: false
+  # Reasoning visibility: `chat` slot runs Qwen3-class thinking models.
+  # Both flags are required together — streaming: true is the transport
+  # prerequisite; show_reasoning: true is the UI gate. Without streaming,
+  # the TUI hangs silently on the <think> block. Gotcha from #635.
+  streaming: true
+  show_reasoning: true
 {%- if personality_name %}
   # Phase 8 (PR-3): UI-facing label for the active persona; surfaces in
   # `hermes` itself when the operator runs the CLI directly. The
diff --git a/tests/agents/test_hermes_provision.py b/tests/agents/test_hermes_provision.py
index d9596108..66540937 100644
--- a/tests/agents/test_hermes_provision.py
+++ b/tests/agents/test_hermes_provision.py
@@ -869,6 +869,138 @@ def test_config_write_renders_role_slots_from_live_state(
     ]
 
 
+# ── #661/#635: reasoning wiring — chat slot + display flags ──────────────────
+#
+# #635 wires the `chat` slot (Qwen3-class thinking-on model) as the top-level
+# model with reasoning visible in the TUI. Three Hermes gotchas apply:
+#   1. model.base_url MUST be set (provider:custom requires it) ← already done.
+#   2. model.max_tokens MUST be set — else Qwen3 spends the full budget on
+#      <think> and the content field comes back empty (silent TUI).
+#   3. display.streaming AND display.show_reasoning BOTH required; without
+#      streaming the TUI hangs on the reasoning block.
+#
+# Reconciliation with #661: delegation→`agent` (ace-saber MoE, thinking-off)
+# is already wired via _DELEGATION_SLOT_NAME="agent". Hermes has one delegation
+# config (no per-subagent-type routing), so the reasoning-ON path is the top-
+# level chat conversation, not a separate subagent slot. The agent MoE stays
+# thinking-off by design.
+
+
+def test_rendered_config_has_show_reasoning_true() -> None:
+    """display.show_reasoning: true is required for thinking-model TUI visibility."""
+    yaml = pytest.importorskip("yaml")
+    rendered = hp._render_config_yaml(
+        primary={
+            "model_id": "qwen3-coder-next-reap-40b-a3b-q4kxl",
+            "backend_url": _HAL0_V1,
+            "context_length": 32768,
+        },
+        agent_id="hermes-agent",
+    )
+    cfg = yaml.safe_load(rendered)
+    display = cfg.get("display") or {}
+    assert display.get("show_reasoning") is True, (
+        "display.show_reasoning must be true for Qwen3 thinking visibility "
+        "(#635 gotcha: without it, reasoning output is silently suppressed)"
+    )
+
+
+def test_rendered_config_has_streaming_true() -> None:
+    """display.streaming: true is required alongside show_reasoning for thinking models.
+
+    Without streaming, the TUI hangs on the <think> block while waiting for
+    the full response before displaying anything (#635 + memory note).
+    """
+    yaml = pytest.importorskip("yaml")
+    rendered = hp._render_config_yaml(
+        primary={
+            "model_id": "qwen3-coder-next-reap-40b-a3b-q4kxl",
+            "backend_url": _HAL0_V1,
+            "context_length": 32768,
+        },
+        agent_id="hermes-agent",
+    )
+    cfg = yaml.safe_load(rendered)
+    display = cfg.get("display") or {}
+    assert display.get("streaming") is True, (
+        "display.streaming must be true for thinking models (#635 gotcha: "
+        "without streaming, the TUI hangs silently on the <think> block)"
+    )
+
+
+def test_rendered_config_has_model_max_tokens() -> None:
+    """model.max_tokens is required to prevent silent TUI on Qwen3 thinking models.
+
+    Without a max_tokens cap, Qwen3 can exhaust the budget in <think> and
+    return an empty content field — which looks like broken wiring even
+    when everything else is correct (#635 + smoke-test timeout comment in
+    _smoke_chat_roundtrip).
+    """
+    yaml = pytest.importorskip("yaml")
+    rendered = hp._render_config_yaml(
+        primary={
+            "model_id": "qwen3-coder-next-reap-40b-a3b-q4kxl",
+            "backend_url": _HAL0_V1,
+            "context_length": 32768,
+        },
+        agent_id="hermes-agent",
+    )
+    cfg = yaml.safe_load(rendered)
+    model_cfg = cfg.get("model") or {}
+    max_tokens = model_cfg.get("max_tokens")
+    assert isinstance(max_tokens, int) and max_tokens > 0, (
+        "model.max_tokens must be a positive integer — "
+        "Qwen3 thinking models silently drain the budget otherwise (#635)"
+    )
+
+
+def test_rendered_config_model_base_url_set() -> None:
+    """model.base_url is always present in the rendered config.
+
+    Hermes's bare ``provider: custom`` requires model.base_url to be set
+    or it falls back to OpenRouter and emits '... is not a valid model ID' 400
+    (#635 gotcha, also tracked in memory hermes_bare_custom_needs_model_base_url).
+    """
+    yaml = pytest.importorskip("yaml")
+    # Verify both the primary-slot branch and the no-slot fallback branch.
+    for primary in [
+        {"model_id": "qwen3-27b", "backend_url": _HAL0_V1, "context_length": 32768},
+        None,
+    ]:
+        rendered = hp._render_config_yaml(primary=primary, agent_id="hermes-agent")
+        cfg = yaml.safe_load(rendered)
+        model_cfg = cfg.get("model") or {}
+        assert model_cfg.get("base_url"), (
+            f"model.base_url must be set (primary={primary!r}) — "
+            "bare provider:custom without base_url routes to OpenRouter"
+        )
+
+
+def test_delegation_targets_agent_slot_not_chat() -> None:
+    """Delegation → `agent` MoE slot (thinking-off); chat stays on main model.
+
+    This validates the #661/#635 reconciliation: #635 asked for
+    'advanced-reasoning subagents → chat-27b' but Hermes has a single
+    delegation config. Reasoning lives on the top-level chat conversation
+    (show_reasoning + streaming); the agent MoE handles delegation.
+    The chat slot model must NOT appear as the delegation model.
+    """
+    deleg = hp._resolve_delegation(_ROLE_SLOTS, hal0_base_url=_HAL0_V1)
+    assert deleg is not None, "delegation must be set when the agent slot is live"
+    # The delegation model is the agent MoE, not the chat-27b model.
+    assert deleg["model"] == "hermes-4-14b-q5km", (
+        "delegation.model must be the agent slot model "
+        "(ace-saber MoE, thinking-off) — not the chat slot"
+    )
+    chat_model = "qwen3-coder-next-reap-40b-a3b-q4kxl"
+    assert deleg["model"] != chat_model, (
+        "delegation must NOT be the chat-slot model; "
+        "reasoning runs on the top-level chat, not in subagents"
+    )
+    assert deleg["base_url"] == _HAL0_V1
+    assert deleg["provider"] == "custom"
+
+
 def test_config_write_phase_writes_yaml_idempotently(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None: