From dbd796c892eb9965ddbd908389073f50649d10fa Mon Sep 17 00:00:00 2001
From: hallerite <git@hallerite.com>
Date: Wed, 10 Jun 2026 21:45:36 +0000
Subject: [PATCH 1/2] fix(nemotron3): faithful Nano/Super/Ultra rendering +
 per-variant config split
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the renderer byte-for-byte match apply_chat_template on branches the
shared barrage didn't cover, verified against the real cached templates.

Faithfulness (assistant body now mirrors the template's string algebra —
assemble <think>…</think>{content}, trim, append one separator — and is
tokenized in one pass):
- reasoning followed by a tool call with empty content no longer emits a stray
  blank line (</think>\n<tool_call>, not </think>\n\n<tool_call>); same for the
  no-tool empty-content case.
- history-truncation boundary is last_user_idx (was last_plain_assistant_idx)
  for every variant, so in-flight tool-cycle reasoning is kept.
- inline <think>…</think> in content renders verbatim (no reformat).
- user / system / tool / reasoning_content emitted unstripped.

Variant split (low_effort / medium_effort are real per-variant Jinja kwargs):
- nemotron-3 (Nano/Super): enable_thinking, truncate_history_thinking, low_effort.
- nemotron-3-ultra (new discriminator): + medium_effort.
- one shared Nemotron3Renderer selects the variant from config.name; drops the
  ultra flag, _default_ultra, and _ULTRA_DEFAULTS. _is_super kept to no-op
  low_effort on Nano. Bad combos now fail at config-load.

BREAKING: Nemotron3RendererConfig(ultra=True) → Nemotron3UltraRendererConfig().
"auto" resolution is unaffected.

Tests: new tests/test_nemotron3_parity.py (exhaustive Nano/Super/Ultra parity);
effort kwargs wired into the config-parity matrix; test_nemotron3_ultra.py
rewritten for the two-config wiring.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                            |   2 +-
 renderers/__init__.py                |   2 +
 renderers/base.py                    |  12 +-
 renderers/configs.py                 |  71 +--
 renderers/nemotron3.py               | 373 +++++++--------
 tests/conftest.py                    |   3 +-
 tests/test_nemotron3_parity.py       | 676 +++++++++++++++++++++++++++
 tests/test_nemotron3_ultra.py        | 118 +++--
 tests/test_renderer_config_parity.py |  14 +-
 9 files changed, 1003 insertions(+), 268 deletions(-)
 create mode 100644 tests/test_nemotron3_parity.py
diff --git a/README.md b/README.md
index b2c3f2f..d153163 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ next_prompt_ids = r.bridge_to_next_turn(
 )
 ```
 
-Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
+Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `nemotron-3-ultra`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
 
 ## API
 
diff --git a/renderers/__init__.py b/renderers/__init__.py
index baa25db..7570f31 100644
--- a/renderers/__init__.py
+++ b/renderers/__init__.py
@@ -56,6 +56,7 @@
     Llama3RendererConfig,
     MiniMaxM2RendererConfig,
     Nemotron3RendererConfig,
+    Nemotron3UltraRendererConfig,
     Qwen35RendererConfig,
     Qwen36RendererConfig,
     Qwen3RendererConfig,
@@ -146,6 +147,7 @@ def __dir__() -> list[str]:
     "MultimodalRenderer",
     "Nemotron3Renderer",
     "Nemotron3RendererConfig",
+    "Nemotron3UltraRendererConfig",
     "OverlongPromptError",
     "ParsedResponse",
     "ParsedToolCall",
diff --git a/renderers/base.py b/renderers/base.py
index 0397b85..8fd9870 100644
--- a/renderers/base.py
+++ b/renderers/base.py
@@ -1040,14 +1040,15 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
     "moonshotai/Kimi-K2-Instruct": "kimi-k2",
     "moonshotai/Kimi-K2.5": "kimi-k2.5",
     "moonshotai/Kimi-K2.6": "kimi-k2.5",
-    # Nemotron 3. Nano / Super share one chat-template variant; the Ultra
-    # checkpoints use the Ultra variant — the renderer auto-selects it from
-    # the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the
+    # Nemotron 3. Nano / Super share one chat-template variant (``nemotron-3``);
+    # the Ultra checkpoints use the Ultra variant (``nemotron-3-ultra``, distinct
+    # ``</think>`` glue). Both route to the same Nemotron3Renderer, which selects
+    # the variant from the resolved config's ``name``. BF16 and FP8 share the
     # same tokenizer and template.
     "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3",
     "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3",
-    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3",
-    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3",
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3-ultra",
+    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3-ultra",
     # Llama 3.2 (Instruct). Tested against the gated meta-llama repos and
     # the unrestricted unsloth/... mirror, which ships a byte-identical
     # chat template. ``Llama3Renderer`` defaults ``date_string`` to
@@ -1374,6 +1375,7 @@ def _populate_registry():
             "laguna-xs.2": LagunaXS2Renderer,
             "llama-3": Llama3Renderer,
             "nemotron-3": Nemotron3Renderer,
+            "nemotron-3-ultra": Nemotron3Renderer,
             "gpt-oss": GptOssRenderer,
         }
     )
diff --git a/renderers/configs.py b/renderers/configs.py
index ca16e46..d500f8e 100644
--- a/renderers/configs.py
+++ b/renderers/configs.py
@@ -354,7 +354,14 @@ class MiniMaxM2RendererConfig(BaseRendererConfig):
 
 
 class Nemotron3RendererConfig(BaseRendererConfig):
-    """Nemotron 3 renderer config."""
+    """Nemotron-3 **Nano / Super** renderer config.
+
+    Nano and Super share one chat-template variant; the renderer routes both
+    through :class:`renderers.nemotron3.Nemotron3Renderer`. The Ultra variant
+    has its own template (different reasoning-block glue) and config —
+    :class:`Nemotron3UltraRendererConfig` — and is reached via the
+    ``nemotron-3-ultra`` discriminator.
+    """
 
     name: Literal["nemotron-3"] = "nemotron-3"
 
@@ -362,26 +369,6 @@ class Nemotron3RendererConfig(BaseRendererConfig):
     """When ``True``, the generation prompt includes ``<think>``. Mirrors
     the chat template's ``enable_thinking`` kwarg."""
 
-    ultra: bool | None = None
-    """Select the Nemotron-3 **Ultra** chat-template variant.
-
-    ``None`` (default) auto-detects from the model name (see
-    ``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve
-    to ``True``; Nano / Super and unknown checkpoints to ``False``. Set
-    explicitly to force a variant — e.g. an Ultra fine-tune or a
-    locally-pathed checkpoint whose ``name_or_path`` isn't in the table.
-
-    Ultra's template differs from Nano/Super: the reasoning block is glued
-    as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around
-    ``</think>``), truncated historical turns collapse to
-    ``<think></think>{content}`` (no ``\\n``), and the thinking-truncation
-    boundary follows the template's ``loop.index0 < last_user_idx`` rule
-    (drop thinking on every assistant turn before the last user message).
-
-    Not a chat-template kwarg — it picks which template the renderer
-    mirrors, not a variable passed into one — so it's listed in
-    ``_internal_fields`` and excluded from ``template_field_names()``."""
-
     truncate_history_thinking: bool = True
     """When ``False``, keep ``<think>{reasoning}</think>`` on past-cycle
     assistant turns instead of dropping them. Mirrors the chat
@@ -389,14 +376,37 @@ class Nemotron3RendererConfig(BaseRendererConfig):
     ``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls``
     — see :class:`BaseRendererConfig` for the contract."""
 
-    # ``ultra`` is a template-variant SELECTOR — it picks which template the
-    # renderer mirrors (Ultra vs Nano/Super), not a variable passed into one;
-    # there is no ``ultra`` Jinja variable. Marked internal so the parity
-    # matrix doesn't cross it as a template field. Same ``_internal_fields``
-    # mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a
-    # different underlying reason (theirs is an ignored kwarg, this is a
-    # variant switch).
-    _internal_fields = frozenset({"ultra"})
+    low_effort: bool = False
+    """When ``True``, append ``\\n\\n{reasoning effort: low}`` to the last user
+    message, nudging the model toward shorter reasoning. Mirrors the **Super**
+    chat template's ``low_effort`` kwarg. A no-op on **Nano** (its template
+    doesn't define it) — exactly as ``apply_chat_template`` ignores an undefined
+    template variable; the renderer distinguishes the two by model name (see
+    ``renderers.nemotron3._is_super``)."""
+
+
+class Nemotron3UltraRendererConfig(BaseRendererConfig):
+    """Nemotron-3 **Ultra** renderer config — distinct discriminator so the
+    registry routes Ultra checkpoints to the Ultra template variant.
+
+    Ultra's template differs from Nano/Super: the reasoning block is glued as
+    ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around ``</think>``)
+    and truncated historical turns collapse to ``<think></think>{content}``
+    (no ``\\n``). It shares the :class:`renderers.nemotron3.Nemotron3Renderer`
+    implementation, which selects the variant from ``config.name``.
+    """
+
+    name: Literal["nemotron-3-ultra"] = "nemotron-3-ultra"
+
+    enable_thinking: bool = True
+    """See :attr:`Nemotron3RendererConfig.enable_thinking`."""
+
+    truncate_history_thinking: bool = True
+    """See :attr:`Nemotron3RendererConfig.truncate_history_thinking`."""
+
+    medium_effort: bool = False
+    """When ``True``, append ``\\n\\n{reasoning effort: efficient}`` to the last
+    user message. Mirrors the Ultra chat template's ``medium_effort`` kwarg."""
 
 
 class DeepSeekV3RendererConfig(BaseRendererConfig):
@@ -444,6 +454,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig):
         Llama3RendererConfig,
         MiniMaxM2RendererConfig,
         Nemotron3RendererConfig,
+        Nemotron3UltraRendererConfig,
         DeepSeekV3RendererConfig,
         DeepSeekR1RendererConfig,
     ],
@@ -480,6 +491,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig):
     "llama-3": Llama3RendererConfig,
     "minimax-m2": MiniMaxM2RendererConfig,
     "nemotron-3": Nemotron3RendererConfig,
+    "nemotron-3-ultra": Nemotron3UltraRendererConfig,
     "deepseek-v3": DeepSeekV3RendererConfig,
     "deepseek-r1": DeepSeekR1RendererConfig,
 }
@@ -525,6 +537,7 @@ def config_from_name(name: str) -> BaseRendererConfig | None:
     "Llama3RendererConfig",
     "MiniMaxM2RendererConfig",
     "Nemotron3RendererConfig",
+    "Nemotron3UltraRendererConfig",
     "Qwen35RendererConfig",
     "Qwen36RendererConfig",
     "Qwen3RendererConfig",
diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py
index b735cde..8716145 100644
--- a/renderers/nemotron3.py
+++ b/renderers/nemotron3.py
@@ -30,7 +30,7 @@
     should_preserve_past_thinking,
     trim_to_turn_close,
 )
-from renderers.configs import Nemotron3RendererConfig
+from renderers.configs import Nemotron3RendererConfig, Nemotron3UltraRendererConfig
 from renderers.parsing import parse_qwen35
 
 # ---------------------------------------------------------------------------
@@ -75,33 +75,25 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str]
     return lines
 
 
-# Per-model ``ultra`` default, applied when the renderer config leaves it
-# ``None``. The Nemotron-3 family ships two chat-template variants: Nano /
-# Super share one; Ultra differs in the reasoning-block glue (no ``\n`` around
-# ``</think>``) and the thinking-truncation boundary (drop thinking on every
-# assistant turn before the last user message). BF16 and FP8 share the same
-# tokenizer and template. Hard-coded keyed by
-# ``tokenizer.name_or_path`` rather than probed from the live template — the
-# same convention as Qwen3.5's ``_ENABLE_THINKING_DEFAULTS`` (avoids pulling
-# ``apply_chat_template`` onto the construction hot path and keeps
-# bring-your-own-tokenizer use working).
-_ULTRA_DEFAULTS: dict[str, bool] = {
-    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": False,
-    "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": False,
-    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": True,
-    "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": True,
-}
-
-
-def _default_ultra(tokenizer) -> bool:
-    """Hard-coded ``ultra`` default for ``tokenizer``'s model.
-
-    Falls back to ``False`` (the Nano / Super template, and the majority of
-    the family) for unknown / fine-tuned checkpoints whose ``name_or_path``
-    isn't in ``_ULTRA_DEFAULTS`` — pass an explicit ``ultra=True`` for an
-    Ultra fine-tune or a locally-pathed Ultra checkpoint.
+# The Nemotron-3 family ships two chat-template variants. Nano / Super share
+# one (config ``name="nemotron-3"``); Ultra differs in the reasoning-block glue
+# — no ``\n`` around ``</think>`` — and gets its own discriminator
+# (``name="nemotron-3-ultra"``). Which variant a checkpoint uses is carried by
+# ``MODEL_RENDERER_MAP`` (and hence the resolved config's ``name``), so the
+# renderer reads it off ``config.name`` rather than probing the live template.
+_ULTRA_CONFIG_NAME = "nemotron-3-ultra"
+
+
+def _is_super(tokenizer) -> bool:
+    """Does this checkpoint use the **Super** flavour of the shared Nano/Super
+    template — i.e. the one whose Jinja defines the ``low_effort`` kwarg?
+
+    Nano and Super share one config (``nemotron-3``), so the model name is the
+    only signal that separates them. Detected by substring; unknown / fine-tuned
+    checkpoints default to ``False`` so ``low_effort`` is a no-op there —
+    matching how the Nano template silently ignores it.
     """
-    return _ULTRA_DEFAULTS.get(getattr(tokenizer, "name_or_path", ""), False)
+    return "super" in (getattr(tokenizer, "name_or_path", "") or "").lower()
 
 
 class Nemotron3Renderer:
@@ -110,17 +102,31 @@ class Nemotron3Renderer:
     def __init__(
         self,
         tokenizer: PreTrainedTokenizer,
-        config: Nemotron3RendererConfig | None = None,
+        config: Nemotron3RendererConfig | Nemotron3UltraRendererConfig | None = None,
     ):
         self._tokenizer = tokenizer
         cfg = config or Nemotron3RendererConfig()
-        # ``ultra=None`` defers to the model's known default (see
-        # ``_ULTRA_DEFAULTS``). Materialise here so downstream reads see a
-        # concrete bool; rebind the frozen config with the resolved value so
-        # introspection sees the same.
-        if cfg.ultra is None:
-            cfg = cfg.model_copy(update={"ultra": _default_ultra(tokenizer)})
         self.config = cfg
+        # The Ultra variant is selected by the config discriminator
+        # (``name="nemotron-3-ultra"``), not a flag — one renderer class serves
+        # both, switching glue off ``self._ultra``.
+        self._ultra = cfg.name == _ULTRA_CONFIG_NAME
+
+        # Resolve the per-variant reasoning-effort hint appended to the last
+        # user message. Ultra honours ``medium_effort``; Super honours
+        # ``low_effort``; Nano honours neither. The non-matching kwarg is
+        # silently ignored (empty hint), exactly as ``apply_chat_template``
+        # ignores a template variable the variant's Jinja never defines.
+        if self._ultra:
+            self._effort_hint = (
+                "\n\n{reasoning effort: efficient}"
+                if getattr(cfg, "medium_effort", False)
+                else ""
+            )
+        elif getattr(cfg, "low_effort", False) and _is_super(tokenizer):
+            self._effort_hint = "\n\n{reasoning effort: low}"
+        else:
+            self._effort_hint = ""
 
         # Look up special token IDs from the tokenizer (not hardcoded).
         # <|endoftext|> is optional: Nemotron-3 Nano / Super tokenizers ship
@@ -321,9 +327,12 @@ def emit_text_segments(
 
             emit_special(self._im_start, sys_idx, is_sampled=False, is_content=False)
 
-            # Build system content: user's system text first, then tools
+            # Build system content: user's system text first, then tools.
+            # The template emits ``system_message`` verbatim (no trim) and
+            # gates the ``\n\n`` separator on its raw length, so keep the
+            # caller's content unstripped.
             if first_is_system:
-                sys_content = self._render_content(messages[0].get("content")).strip()
+                sys_content = self._render_content(messages[0].get("content"))
             else:
                 sys_content = ""
 
@@ -351,7 +360,7 @@ def emit_text_segments(
 
         elif first_is_system:
             sys_idx = orig_idx(0)
-            sys_content = self._render_content(messages[0].get("content")).strip()
+            sys_content = self._render_content(messages[0].get("content"))
             emit_special(self._im_start, sys_idx, is_sampled=False, is_content=False)
             sys_segments2: list[tuple[str, bool]] = [("system\n", False)]
             if sys_content:
@@ -360,22 +369,13 @@ def emit_text_segments(
             emit_special(self._im_end, sys_idx, is_sampled=False, is_content=False)
             emit_text("\n", sys_idx, is_sampled=False, is_content=False)
 
-        # Track the most-recent plain (non-tool-call) assistant so we can
-        # preserve its reasoning while stripping reasoning from earlier
-        # assistants — the Nemotron-3 template matches this pattern.
-        last_plain_assistant_idx = -1
-        for j in range(len(messages) - 1, -1, -1):
-            if messages[j].get("role") == "assistant" and not messages[j].get(
-                "tool_calls"
-            ):
-                last_plain_assistant_idx = j
-                break
-
-        # Ultra truncates thinking on every assistant turn *before the last
-        # user message* (template rule ``loop.index0 < last_user_idx``),
-        # whereas Nano/Super preserve only the last plain assistant. Compute
-        # the last-user index over the normalized ``messages`` list (a leading
-        # system never holds a user, so the relative comparison is unaffected).
+        # All Nemotron-3 variants (Nano / Super / Ultra) truncate historical
+        # thinking on every assistant turn *before the last user message* —
+        # the template rule ``truncate_history_thinking and loop.index0 <
+        # last_user_idx`` is byte-identical across the three chat templates.
+        # Compute the last-user index over the normalized ``messages`` list (a
+        # leading system never holds a user, so the relative comparison is
+        # unaffected).
         last_user_idx_norm = -1
         for j in range(len(messages) - 1, -1, -1):
             if messages[j].get("role") == "user":
@@ -385,7 +385,10 @@ def emit_text_segments(
         # ── 2. Iterate messages ─────────────────────────────────────
         for i, msg in enumerate(messages):
             role = msg["role"]
-            content = self._render_content(msg.get("content")).strip()
+            # Keep content unstripped: the template emits user / system / tool
+            # content verbatim, and assistant trimming happens inside
+            # ``_assistant_body`` exactly where the template applies it.
+            content = self._render_content(msg.get("content"))
             msg_orig_idx = orig_idx(i)
 
             if role == "system":
@@ -400,6 +403,12 @@ def emit_text_segments(
                 user_segments: list[tuple[str, bool]] = [("user\n", False)]
                 if content:
                     user_segments.append((content, True))
+                # Reasoning-effort hint rides on the LAST user message only,
+                # glued to the content so BPE sees them as one chunk (matching
+                # the template's ``content + '\n\n{reasoning effort: …}'``). It
+                # is template scaffold, not caller content → is_content=False.
+                if self._effort_hint and i == last_user_idx_norm:
+                    user_segments.append((self._effort_hint, False))
                 emit_text_segments(user_segments, msg_orig_idx, is_sampled=False)
                 emit_special(
                     self._im_end, msg_orig_idx, is_sampled=False, is_content=False
@@ -407,26 +416,29 @@ def emit_text_segments(
                 emit_text("\n", msg_orig_idx, is_sampled=False, is_content=False)
 
             elif role == "assistant":
-                if self.config.ultra:
-                    is_last_turn = i >= last_user_idx_norm
-                else:
-                    is_last_turn = i >= last_plain_assistant_idx
+                # Template: ``include_content = not (truncate_history_thinking
+                # and loop.index0 < last_user_idx)``. The renderer-internal
+                # preserve_* overrides only ever *extend* retention, so OR them
+                # in (a preserved turn keeps its thinking even when the
+                # template default would drop it).
                 preserve_thinking = msg_orig_idx >= 0 and should_preserve_past_thinking(
                     original_messages,
                     msg_orig_idx,
                     preserve_all_thinking=self.config.preserve_all_thinking,
                     preserve_thinking_between_tool_calls=self.config.preserve_thinking_between_tool_calls,
                 )
+                include_content = (
+                    not self.config.truncate_history_thinking
+                    or i >= last_user_idx_norm
+                    or preserve_thinking
+                )
                 self._render_assistant(
                     msg,
                     msg_orig_idx,
                     content,
-                    is_last_turn=is_last_turn,
-                    preserve_thinking=preserve_thinking,
+                    include_content=include_content,
                     emit_special=emit_special,
                     emit_text=emit_text,
-                    emit_ids=emit_ids,
-                    emit_text_segments=emit_text_segments,
                 )
 
             elif role == "tool":
@@ -516,6 +528,11 @@ def bridge_to_next_turn(
             not previous_prompt_ids
             or not new_messages
             or reject_assistant_in_extension(new_messages)
+            # An active effort hint rides on the *last* user message. Appending
+            # a new turn can move which user is last, which would strand the
+            # hint on the frozen previous prompt — the append-only bridge can't
+            # rewrite it. Bail so the caller does a full, correct re-render.
+            or self._effort_hint
         ):
             return None
 
@@ -585,7 +602,9 @@ def emit_text_segments(
 
         for i, msg in enumerate(new_messages):
             role = msg.get("role")
-            content = self._render_content(msg.get("content")).strip()
+            # Unstripped — the template emits user / system / tool content
+            # verbatim (see :meth:`render`).
+            content = self._render_content(msg.get("content"))
             if role == "user":
                 emit_special(self._im_start, i)
                 user_segments: list[tuple[str, bool]] = [("user\n", False)]
@@ -646,29 +665,10 @@ def _render_assistant(
         msg_idx: int,
         content: str,
         *,
-        is_last_turn: bool,
-        preserve_thinking: bool = False,
+        include_content: bool,
         emit_special,
         emit_text,
-        emit_ids,
-        emit_text_segments,
     ) -> None:
-        # Extract reasoning_content
-        reasoning_content = ""
-        if isinstance(msg.get("reasoning_content"), str):
-            reasoning_content = msg["reasoning_content"]
-        elif "</think>" in content:
-            before_think_end, after_think_end = content.split("</think>", 1)
-            if "<think>" in before_think_end:
-                reasoning_content = before_think_end.split("<think>")[-1].lstrip("\n")
-            else:
-                reasoning_content = before_think_end.lstrip("\n")
-            reasoning_content = reasoning_content.rstrip("\n")
-            content = after_think_end.lstrip("\n")
-
-        reasoning_content = reasoning_content.strip()
-        ultra = self.config.ultra
-
         # ``<|im_start|>assistant\n`` is template-injected scaffolding —
         # at inference the chat template emits these as the generation
         # prompt and the model never samples them. Marking the role tag
@@ -678,123 +678,108 @@ def _render_assistant(
         emit_special(self._im_start, msg_idx, is_sampled=False, is_content=False)
         emit_text("assistant\n", msg_idx, is_sampled=False, is_content=False)
 
-        # Nemotron 3 keeps reasoning on the most-recent plain assistant but
-        # strips it from historical turns, which collapse to an empty
-        # <think></think> block. Empty <think></think> is also emitted when
-        # the turn has no reasoning at all. The trailing ``\n`` (when
-        # tool_calls follow) is glued to ``content`` in a single emit_text
-        # so BPE sees ``content\n`` as one chunk, matching how
-        # apply_chat_template tokenises the concatenated template string.
-        tool_calls = msg.get("tool_calls") or []
-        # A \n is always required between the text/think block and the first
-        # <tool_call>, whether the content is empty or not.
-        content_suffix = "\n" if tool_calls else ""
-
-        if reasoning_content and (
-            is_last_turn
-            or preserve_thinking
-            or not self.config.truncate_history_thinking
-        ):
-            emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
-            # Ultra: <think>\n{reasoning}</think>{content} (no \n around </think>).
-            # Nano/Super: <think>\n{reasoning}\n</think>\n{content}.
-            emit_text(
-                ("\n" + reasoning_content)
-                if ultra
-                else ("\n" + reasoning_content + "\n"),
-                msg_idx,
-                is_sampled=True,
-                is_content=True,
-            )
-            emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
-            # Single \n separator (not \n\n like Qwen3.5); Ultra glues directly.
-            emit_text(
-                (content + content_suffix)
-                if ultra
-                else ("\n" + content + content_suffix),
-                msg_idx,
-                is_sampled=True,
-                is_content=True,
-            )
-        elif reasoning_content:
-            # Historical assistant whose reasoning got stripped. Nano/Super keep
-            # a single \n between the collapsed <think></think> and the content
-            # as a marker that reasoning existed; Ultra glues content directly.
-            emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
-            emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
-            emit_text(
-                (content + content_suffix)
-                if ultra
-                else ("\n" + content + content_suffix),
-                msg_idx,
-                is_sampled=True,
-                is_content=True,
-            )
+        # Build the body (everything between ``assistant\n`` and ``<|im_end|>``)
+        # as a single string mirroring the chat template's own string algebra,
+        # then tokenise it in one pass. The ``<think>`` / ``</think>`` /
+        # ``<tool_call>`` / ``</tool_call>`` markers are added tokens, so the
+        # tokenizer isolates them — encoding the assembled body yields the same
+        # ids as ``apply_chat_template`` (which likewise encodes a rendered
+        # string). The whole body is sampled content; ``<|im_end|>`` is the
+        # model's stop signal (sampled), and the inter-turn ``\n`` is not.
+        body = self._assistant_body(msg, content, include_content=include_content)
+        if body:
+            emit_text(body, msg_idx, is_sampled=True, is_content=True)
+        emit_special(self._im_end, msg_idx, is_sampled=True, is_content=True)
+        emit_text("\n", msg_idx, is_sampled=False, is_content=False)
+
+    def _assistant_body(
+        self, msg: Message, raw_content: str, *, include_content: bool
+    ) -> str:
+        """Assemble the assistant body string exactly as the chat template does.
+
+        ``include_content`` is the template's ``not (truncate_history_thinking
+        and loop.index0 < last_user_idx)`` (already OR-ed with the preserve_*
+        overrides by the caller): ``True`` keeps the full think+content block,
+        ``False`` collapses historical thinking to an empty ``<think></think>``.
+        """
+        ultra = self._ultra
+
+        # 1. Assemble ``content`` — wrap a ``reasoning_content`` field in
+        #    <think> tags (raw, not stripped: interior whitespace is part of
+        #    the reasoning), else prepend an empty <think></think> only when
+        #    the content carries no inline think tags of its own (which are
+        #    passed through verbatim, like the template).
+        reasoning = msg.get("reasoning_content")
+        if isinstance(reasoning, str) and reasoning.strip():
+            if ultra:
+                content = "<think>\n" + reasoning + "</think>" + raw_content
+            else:
+                content = "<think>\n" + reasoning + "\n</think>\n" + raw_content
         else:
-            # No reasoning ever — <think></think> glued directly to content.
-            emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
-            emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
-            emit_text(
-                content + content_suffix,
-                msg_idx,
-                is_sampled=True,
-                is_content=True,
-            )
+            content = raw_content
+            if "<think>" not in content and "</think>" not in content:
+                content = "<think></think>" + content
+
+        tool_calls = msg.get("tool_calls") or []
 
-        # Tool calls (leading \n was glued to the content above; each
-        # iteration's trailing \n after </tool_call> handles the
-        # separator to the next block).
         if tool_calls:
+            parts: list[str] = []
+            if content.strip():
+                if include_content:
+                    parts.append(content.strip() + "\n")
+                else:
+                    # Drop historical thinking: keep only what follows the last
+                    # </think> (or precedes a dangling <think>), then re-stamp
+                    # an empty block. Nano/Super trim the remainder; Ultra glues
+                    # it raw (its template omits the trailing ``| trim``).
+                    c = content
+                    if "</think>" in c:
+                        c = c.split("</think>")[-1]
+                    elif "<think>" in c:
+                        c = c.split("<think>")[0]
+                    c = "<think></think>" + (c if ultra else c.strip())
+                    if c:
+                        parts.append(c + "\n")
+            else:
+                # Non-string / empty content: bare collapsed think block, no \n.
+                parts.append("<think></think>")
             for tc in tool_calls:
-                func = tc.get("function") or tc
-                name = func.get("name", "")
-                arguments = func.get("arguments", {})
-
-                emit_special(self._tool_call, msg_idx, is_sampled=True, is_content=True)
-                emit_text(
-                    "\n<function=" + name + ">\n",
-                    msg_idx,
-                    is_sampled=True,
-                    is_content=True,
-                )
+                parts.append(self._format_tool_call(tc))
+            return "".join(parts)
 
-                # Render arguments
-                # OpenAI canonical form: arguments is a JSON string. Parse it so the
-                # per-argument rendering below still works.
-                if isinstance(arguments, str):
-                    try:
-                        arguments = json.loads(arguments)
-                    except json.JSONDecodeError:
-                        arguments = {}
-                if isinstance(arguments, dict):
-                    for arg_name, arg_value in arguments.items():
-                        if isinstance(arg_value, (dict, list)):
-                            value_str = json.dumps(arg_value, ensure_ascii=False)
-                        else:
-                            value_str = str(arg_value)
-                        emit_text(
-                            "<parameter="
-                            + arg_name
-                            + ">\n"
-                            + value_str
-                            + "\n</parameter>\n",
-                            msg_idx,
-                            is_sampled=True,
-                            is_content=True,
-                        )
-
-                emit_text("</function>\n", msg_idx, is_sampled=True, is_content=True)
-                emit_special(
-                    self._tool_call_end, msg_idx, is_sampled=True, is_content=True
-                )
-                # Trailing \n after </tool_call> (Nemotron 3 specific)
-                emit_text("\n", msg_idx, is_sampled=True, is_content=True)
+        # No tool calls.
+        if include_content:
+            return content.strip()
+        c = content
+        if "<think>" in c and "</think>" in c:
+            c = "<think></think>" + c.split("</think>")[-1]
+        return c.strip()
 
-        # ``<|im_end|>`` is the model's stop signal — it samples this to
-        # end its turn, so it is part of the sampled stream. The trailing
-        # ``\n`` is template-appended between turns and never sampled.
-        emit_special(self._im_end, msg_idx, is_sampled=True, is_content=True)
-        emit_text("\n", msg_idx, is_sampled=False, is_content=False)
+    @staticmethod
+    def _format_tool_call(tc: dict[str, Any]) -> str:
+        """Render one tool call as ``<tool_call>…</tool_call>\\n`` XML."""
+        func = tc.get("function") or tc
+        name = func.get("name", "")
+        arguments = func.get("arguments", {})
+        # OpenAI canonical form: arguments is a JSON string. Parse it so the
+        # per-argument rendering below still works.
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                arguments = {}
+        parts = ["<tool_call>\n<function=" + name + ">\n"]
+        if isinstance(arguments, dict):
+            for arg_name, arg_value in arguments.items():
+                if isinstance(arg_value, (dict, list)):
+                    value_str = json.dumps(arg_value, ensure_ascii=False)
+                else:
+                    value_str = str(arg_value)
+                parts.append(
+                    "<parameter=" + arg_name + ">\n" + value_str + "\n</parameter>\n"
+                )
+        parts.append("</function>\n</tool_call>\n")
+        return "".join(parts)
 
     # ------------------------------------------------------------------
     # Tool message rendering
diff --git a/tests/conftest.py b/tests/conftest.py
index d62d600..c3bfeed 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -33,7 +33,8 @@
     ("moonshotai/Kimi-K2.6", "auto"),
     ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
     ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
-    # Ultra resolves the Ultra template variant via name (auto → ultra=True).
+    # Ultra resolves to the `nemotron-3-ultra` config variant via the model
+    # name (auto → MODEL_RENDERER_MAP → nemotron-3-ultra).
     ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
     ("poolside/Laguna-XS.2", "auto"),
     # DeepSeek-V3/R1 are intentionally NOT in this shared barrage: their
diff --git a/tests/test_nemotron3_parity.py b/tests/test_nemotron3_parity.py
new file mode 100644
index 0000000..01c521b
--- /dev/null
+++ b/tests/test_nemotron3_parity.py
@@ -0,0 +1,676 @@
+"""Exhaustive token-for-token parity for the Nemotron-3 renderer.
+
+The shared barrage in ``test_render_ids.py`` covers the common message
+shapes against every model. This file pins the Nemotron-3-specific template
+branches that the shared matrix can't reach — they'd fail on other models or
+exercise behaviour unique to the Nemotron-3 chat template:
+
+* reasoning + empty / ``None`` content with and without tool calls (the
+  template trims the assembled ``<think>…</think>{content}`` block and appends
+  exactly one separator — a stray ``\\n`` here is the most common agentic
+  regression);
+* the historical-thinking truncation boundary, which is ``loop.index0 <
+  last_user_idx`` in **all three** variants (Nano / Super / Ultra) — so an
+  in-flight tool cycle (assistant turns after the last user message) keeps its
+  reasoning by default;
+* inline ``<think>…</think>`` tags carried in ``content`` rendering verbatim
+  (the template only reformats reasoning supplied via ``reasoning_content``);
+* verbatim (unstripped) user / system / tool content and ``reasoning_content``;
+* the ``enable_thinking`` / ``truncate_history_thinking`` template kwargs;
+* the per-variant reasoning-effort kwargs: ``low_effort`` (Super; a no-op on
+  Nano) and ``medium_effort`` (Ultra) — each rejected by the other config.
+
+Every assertion compares ``renderer.render_ids(...)`` to
+``tokenizer.apply_chat_template(..., tokenize=True)`` — a pass means the
+renderer is byte-for-byte faithful for that case. Tokenizers are loaded from
+the local HF cache (offline); no network.
+
+The variants split across two configs: ``nemotron-3`` (Nano / Super, with
+``low_effort``) and ``nemotron-3-ultra`` (Ultra, with ``medium_effort``). The
+helper resolves the right config class per model from ``MODEL_RENDERER_MAP``.
+"""
+
+from __future__ import annotations
+
+from functools import lru_cache
+
+import pytest
+
+from renderers import create_renderer
+from renderers.base import MODEL_RENDERER_MAP, load_tokenizer
+from renderers.configs import _config_class_for
+
+# BF16 / FP8 share a tokenizer; only the BF16 checkpoints are cached for tests.
+NANO = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+SUPER = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"
+ULTRA = "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16"
+MODELS = [NANO, SUPER, ULTRA]
+
+
+@lru_cache
+def _tok(model: str):
+    return load_tokenizer(model)
+
+
+def _config_cls(model: str):
+    """The typed-config class the model resolves to (``nemotron-3`` for
+    Nano/Super, ``nemotron-3-ultra`` for Ultra)."""
+    return _config_class_for(MODEL_RENDERER_MAP[model])
+
+
+def _renderer(model: str, **flags):
+    # Build with the model's own variant config so the renderer picks the right
+    # ``</think>`` glue (and only valid kwargs are accepted).
+    return create_renderer(_tok(model), _config_cls(model)(**flags))
+
+
+def _expected(
+    model: str, messages, *, tools=None, add_generation_prompt=False, **kwargs
+):
+    out = _tok(model).apply_chat_template(
+        messages,
+        tools=tools,
+        tokenize=True,
+        return_dict=False,
+        add_generation_prompt=add_generation_prompt,
+        **kwargs,
+    )
+    if isinstance(out, str):  # some tokenizers return str even with tokenize=True
+        return list(_tok(model).encode(out, add_special_tokens=False))
+    return list(out)
+
+
+def _assert_parity(
+    model, messages, *, tools=None, add_generation_prompt=False, **template_kwargs
+):
+    """Renderer ids == apply_chat_template ids for ``model``.
+
+    ``template_kwargs`` (e.g. ``enable_thinking``, ``truncate_history_thinking``)
+    are forwarded to both the renderer config and ``apply_chat_template`` so the
+    two sides stay aligned.
+    """
+    renderer = _renderer(model, **template_kwargs)
+    got = renderer.render_ids(
+        messages, tools=tools, add_generation_prompt=add_generation_prompt
+    )
+    exp = _expected(
+        model,
+        messages,
+        tools=tools,
+        add_generation_prompt=add_generation_prompt,
+        **template_kwargs,
+    )
+    assert got == exp, (
+        f"{model}: render_ids diverged from apply_chat_template\n"
+        f"  exp …{_tok(model).decode(exp[-40:])!r}\n"
+        f"  got …{_tok(model).decode(got[-40:])!r}"
+    )
+
+
+pytestmark = pytest.mark.parametrize("model", MODELS, ids=["nano", "super", "ultra"])
+
+
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a city",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {"type": "string", "description": "The city name"}
+                },
+                "required": ["city"],
+            },
+        },
+    }
+]
+
+
+# ── Reasoning + tool calls: the trim / separator boundary ─────────────
+
+
+def test_reasoning_empty_content_tool_call(model):
+    """reason → tool call, no prose. Must be ``</think>\\n<tool_call>`` (one
+    newline), not ``</think>\\n\\n<tool_call>``."""
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Weather in Paris?"},
+            {
+                "role": "assistant",
+                "reasoning_content": "I should call the weather tool.",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_reasoning_none_content_tool_call(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Weather in Paris?"},
+            {
+                "role": "assistant",
+                "reasoning_content": "Call the tool.",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_reasoning_content_tool_call(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Weather in Paris?"},
+            {
+                "role": "assistant",
+                "reasoning_content": "Think first.",
+                "content": "Let me check.",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_reasoning_empty_content_no_tool_call(model):
+    """reason → empty answer, no tool call: ``</think>`` glued to ``<|im_end|>``."""
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "reasoning_content": "thinking", "content": ""},
+        ],
+    )
+
+
+def test_multiple_tool_calls_with_reasoning(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Weather in Paris and London?"},
+            {
+                "role": "assistant",
+                "reasoning_content": "Two cities — two calls.",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    },
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "London"},
+                        }
+                    },
+                ],
+            },
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_tool_call_with_nested_object_args(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "go"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {
+                                "city": "Paris",
+                                "opts": {"unit": "c", "days": [1, 2]},
+                            },
+                        }
+                    }
+                ],
+            },
+        ],
+        tools=TOOLS,
+    )
+
+
+# ── Historical-thinking truncation boundary (last_user_idx) ───────────
+
+
+def test_inflight_tool_cycle_keeps_reasoning(model):
+    """Assistant turns after the last user message (the in-flight tool cycle)
+    keep their reasoning by default — boundary is ``loop.index0 <
+    last_user_idx`` in every variant."""
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Weather in Paris?"},
+            {
+                "role": "assistant",
+                "reasoning_content": "Call the tool first.",
+                "content": "calling",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+            {"role": "tool", "content": '{"temp": 20}'},
+            {
+                "role": "assistant",
+                "reasoning_content": "Now I can answer.",
+                "content": "It is 20 degrees.",
+            },
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_history_truncation_drops_older_reasoning(model):
+    """A reasoning turn before the last user message is collapsed to an empty
+    think block (tool-call branch trims the remainder on Nano/Super)."""
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Q1"},
+            {
+                "role": "assistant",
+                "reasoning_content": "reasoning before tool",
+                "content": "calling",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+            {"role": "tool", "content": '{"temp": 20}'},
+            {"role": "assistant", "reasoning_content": "after", "content": "Done."},
+            {"role": "user", "content": "Q2"},
+            {"role": "assistant", "reasoning_content": "final", "content": "A2"},
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_two_block_tool_conversation(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "system", "content": "be brief"},
+            {"role": "user", "content": "first"},
+            {
+                "role": "assistant",
+                "reasoning_content": "R2",
+                "content": "calling.",
+                "tool_calls": [
+                    {"function": {"name": "get_weather", "arguments": {"city": "a"}}}
+                ],
+            },
+            {"role": "tool", "content": "result-a"},
+            {"role": "assistant", "reasoning_content": "R4", "content": "answer-1"},
+            {"role": "user", "content": "second"},
+            {
+                "role": "assistant",
+                "reasoning_content": "R6",
+                "content": "calling.",
+                "tool_calls": [
+                    {"function": {"name": "get_weather", "arguments": {"city": "b"}}}
+                ],
+            },
+            {"role": "tool", "content": "result-b"},
+            {"role": "assistant", "reasoning_content": "R8", "content": "answer-2"},
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_plain_multi_turn_reasoning_truncation(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Q1"},
+            {
+                "role": "assistant",
+                "reasoning_content": "long reasoning one",
+                "content": "A1",
+            },
+            {"role": "user", "content": "Q2"},
+            {
+                "role": "assistant",
+                "reasoning_content": "long reasoning two",
+                "content": "A2",
+            },
+        ],
+    )
+
+
+@pytest.mark.parametrize("truncate", [True, False])
+def test_truncate_history_thinking_kwarg(model, truncate):
+    """``truncate_history_thinking=False`` keeps reasoning on every past turn."""
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Q1"},
+            {
+                "role": "assistant",
+                "reasoning_content": "first reasoning",
+                "content": "A1",
+            },
+            {"role": "user", "content": "Q2"},
+            {
+                "role": "assistant",
+                "reasoning_content": "second reasoning",
+                "content": "A2",
+            },
+        ],
+        truncate_history_thinking=truncate,
+    )
+
+
+# ── Inline <think> tags carried in content (no reasoning_content field) ─
+
+
+def test_inline_think_tags_final_turn_verbatim(model):
+    """Inline ``<think>…</think>`` in the final assistant ``content`` renders
+    verbatim — the renderer must not parse + reformat it."""
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": "<think>secret</think>visible"},
+        ],
+    )
+
+
+def test_inline_think_tags_history_turn(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "hi"},
+            {
+                "role": "assistant",
+                "content": "<think>secret reasoning</think>visible answer",
+            },
+            {"role": "user", "content": "again"},
+            {"role": "assistant", "content": "second"},
+        ],
+    )
+
+
+# ── Verbatim (unstripped) content ─────────────────────────────────────
+
+
+def test_system_content_whitespace_verbatim(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "system", "content": "  padded system  "},
+            {"role": "user", "content": "hi"},
+        ],
+    )
+
+
+def test_user_content_whitespace_verbatim(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "  padded user  "},
+            {"role": "assistant", "content": "ok"},
+        ],
+    )
+
+
+def test_assistant_content_whitespace(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": "  spaced answer  "},
+        ],
+    )
+
+
+def test_reasoning_content_whitespace_verbatim(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "hi"},
+            {
+                "role": "assistant",
+                "reasoning_content": "  padded reason  ",
+                "content": "answer",
+            },
+        ],
+    )
+
+
+def test_tool_content_whitespace_verbatim(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "go"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+            {"role": "tool", "content": "  spaced tool result  "},
+            {"role": "assistant", "content": "done"},
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_system_whitespace_with_tools(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "system", "content": "  weather bot  "},
+            {"role": "user", "content": "Weather?"},
+        ],
+        tools=TOOLS,
+    )
+
+
+# ── Generation prompt / thinking toggle ───────────────────────────────
+
+
+@pytest.mark.parametrize("enable_thinking", [True, False])
+def test_generation_prompt_thinking_toggle(model, enable_thinking):
+    _assert_parity(
+        model,
+        [{"role": "user", "content": "hi"}],
+        add_generation_prompt=True,
+        enable_thinking=enable_thinking,
+    )
+
+
+def test_generation_prompt_after_tool_response(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Weather in Paris?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+            {"role": "tool", "content": '{"temp": 20}'},
+        ],
+        tools=TOOLS,
+        add_generation_prompt=True,
+    )
+
+
+# ── Whole-conversation cycles, no reasoning ───────────────────────────
+
+
+def test_full_tool_cycle_no_reasoning(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "Weather in Paris?"},
+            {
+                "role": "assistant",
+                "content": "Let me check.",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    }
+                ],
+            },
+            {"role": "tool", "content": '{"temp": 20, "condition": "sunny"}'},
+            {"role": "assistant", "content": "It is 20 degrees and sunny."},
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_consecutive_tool_responses(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Weather in Paris and London?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "Paris"},
+                        }
+                    },
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "London"},
+                        }
+                    },
+                ],
+            },
+            {"role": "tool", "content": '{"temp": 20}'},
+            {"role": "tool", "content": '{"temp": 15}'},
+            {"role": "assistant", "content": "Paris: 20, London: 15."},
+        ],
+        tools=TOOLS,
+    )
+
+
+def test_no_system_no_tools_injects_empty_system(model):
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "Hello!"},
+            {"role": "assistant", "content": "Hi there!"},
+        ],
+    )
+
+
+# ── Reasoning-effort kwargs (variant-specific) ────────────────────────
+
+_EFFORT_SHAPES = [
+    # gen-prompt shape: hint rides on the (only) user message.
+    ([{"role": "user", "content": "solve it"}], {"add_generation_prompt": True}),
+    # multi-turn: hint must land on the LAST user message, not the first.
+    (
+        [
+            {"role": "user", "content": "first"},
+            {"role": "assistant", "content": "ok"},
+            {"role": "user", "content": "second"},
+        ],
+        {"add_generation_prompt": True},
+    ),
+]
+
+
+@pytest.mark.parametrize("flag", [True, False])
+@pytest.mark.parametrize(
+    "shape,extra", _EFFORT_SHAPES, ids=["gen_prompt", "multi_turn"]
+)
+def test_low_effort_kwarg(model, flag, shape, extra):
+    """``low_effort`` appends ``\\n\\n{reasoning effort: low}`` to the last user
+    message on **Super**; it's a no-op on **Nano** (its template never defines
+    it). Ultra's config has no such field, so it's skipped."""
+    if model == ULTRA:
+        pytest.skip("low_effort is a nemotron-3 (Nano/Super) kwarg")
+    _assert_parity(model, shape, low_effort=flag, **extra)
+
+
+@pytest.mark.parametrize("flag", [True, False])
+@pytest.mark.parametrize(
+    "shape,extra", _EFFORT_SHAPES, ids=["gen_prompt", "multi_turn"]
+)
+def test_medium_effort_kwarg(model, flag, shape, extra):
+    """``medium_effort`` appends ``\\n\\n{reasoning effort: efficient}`` on
+    **Ultra**. Nano/Super configs have no such field, so they're skipped."""
+    if model != ULTRA:
+        pytest.skip("medium_effort is a nemotron-3-ultra kwarg")
+    _assert_parity(model, shape, medium_effort=flag, **extra)
+
+
+def test_effort_kwarg_lives_on_the_right_variant(model):
+    """Each effort kwarg is declared only on the variant whose template defines
+    it — the discriminated union rejects the wrong combination at config load."""
+    fields = _config_cls(model).template_field_names()
+    if model == ULTRA:
+        assert "medium_effort" in fields and "low_effort" not in fields
+    else:
+        assert "low_effort" in fields and "medium_effort" not in fields
diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py
index 7716d15..64c3cdb 100644
--- a/tests/test_nemotron3_ultra.py
+++ b/tests/test_nemotron3_ultra.py
@@ -1,22 +1,37 @@
-"""Offline wiring tests for the Nemotron-3 Ultra template variant.
+"""Offline wiring tests for the Nemotron-3 variant split.
 
-Assert the name-based ``ultra`` auto-selection, the model→renderer mapping,
-and the typed-config surface WITHOUT loading any tokenizer (no network). This
-pins the wiring the parity matrix can't reach — in particular the FP8 entry,
-which no test loads a tokenizer for — so it can't silently rot.
+Assert the model→renderer mapping, the per-variant typed-config surface, and
+the name-based ``low_effort`` gating WITHOUT loading any tokenizer (no
+network). This pins the wiring the parity matrix can't reach — in particular
+the FP8 Ultra entry, which no test loads a tokenizer for — so it can't
+silently rot.
+
+The two variants:
+
+* ``nemotron-3`` — Nano / Super, shared template. Config exposes ``low_effort``
+  (honoured on Super, a no-op on Nano).
+* ``nemotron-3-ultra`` — Ultra, distinct ``</think>`` glue. Config exposes
+  ``medium_effort``.
+
+Both route to the one ``Nemotron3Renderer`` class, which selects the variant
+from ``config.name``.
 """
 
 from types import SimpleNamespace
 
 from renderers.base import MODEL_RENDERER_MAP
-from renderers.configs import Nemotron3RendererConfig
-from renderers.nemotron3 import _ULTRA_DEFAULTS, _default_ultra
+from renderers.configs import (
+    Nemotron3RendererConfig,
+    Nemotron3UltraRendererConfig,
+    _config_class_for,
+)
+from renderers.nemotron3 import Nemotron3Renderer, _is_super
 
 _ULTRA_REPOS = [
     "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
     "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8",
 ]
-_NON_ULTRA_REPOS = [
+_NANO_SUPER_REPOS = [
     "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
     "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
 ]
@@ -26,34 +41,65 @@ def _fake_tok(name):
     return SimpleNamespace(name_or_path=name)
 
 
-def test_ultra_and_non_ultra_models_map_to_nemotron3():
-    for repo in _ULTRA_REPOS + _NON_ULTRA_REPOS:
+def test_models_map_to_their_variant():
+    for repo in _ULTRA_REPOS:
+        assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3-ultra", repo
+    for repo in _NANO_SUPER_REPOS:
         assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3", repo
 
 
-def test_default_ultra_resolves_by_name():
-    # Ultra checkpoints (incl. the gated FP8 repo) resolve True.
-    for repo in _ULTRA_REPOS:
-        assert _ULTRA_DEFAULTS[repo] is True
-        assert _default_ultra(_fake_tok(repo)) is True
-    # Nano / Super resolve False (the shared Nano/Super template).
-    for repo in _NON_ULTRA_REPOS:
-        assert _default_ultra(_fake_tok(repo)) is False
-    # Unknown / fine-tuned / local-path checkpoints fall back to False;
-    # those must pass an explicit ultra= if they need the Ultra template.
-    assert _default_ultra(_fake_tok("acme/my-nemotron-ultra-ft")) is False
-    assert _default_ultra(_fake_tok("/home/user/local-ckpt")) is False
-    assert _default_ultra(SimpleNamespace()) is False  # no name_or_path attr
-
-
-def test_ultra_is_not_a_template_kwarg():
-    fields = Nemotron3RendererConfig.template_field_names()
-    assert "ultra" not in fields
-    assert fields == frozenset({"enable_thinking", "truncate_history_thinking"})
-    assert "ultra" in Nemotron3RendererConfig._internal_fields
-
-
-def test_ultra_config_default_is_none_and_overridable():
-    assert Nemotron3RendererConfig().ultra is None  # None => auto-detect by name
-    assert Nemotron3RendererConfig(ultra=True).ultra is True
-    assert Nemotron3RendererConfig(ultra=False).ultra is False
+def test_both_variants_resolve_to_one_renderer_class():
+    # The registry routes both discriminators to the shared renderer class.
+    assert _config_class_for("nemotron-3") is Nemotron3RendererConfig
+    assert _config_class_for("nemotron-3-ultra") is Nemotron3UltraRendererConfig
+
+
+def test_renderer_reads_variant_from_config_name():
+    # No tokenizer needed for the ``_ultra`` flag — it comes off config.name.
+    # Build with a fake tokenizer that has the special tokens stubbed out.
+    class _Tok:
+        name_or_path = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+        unk_token_id = -1
+
+        def convert_tokens_to_ids(self, tok):
+            # Deterministic non-unk ids so construction succeeds offline.
+            return abs(hash(tok)) % 100_000 + 1
+
+    nano = Nemotron3Renderer(_Tok(), Nemotron3RendererConfig())
+    ultra = Nemotron3Renderer(_Tok(), Nemotron3UltraRendererConfig())
+    assert nano._ultra is False
+    assert ultra._ultra is True
+
+
+def test_template_fields_per_variant():
+    # ``low_effort`` lives only on the Nano/Super config; ``medium_effort``
+    # only on Ultra. Both ARE chat-template kwargs (unlike the removed ``ultra``
+    # selector), so they appear in the template-field surface.
+    assert Nemotron3RendererConfig.template_field_names() == frozenset(
+        {"enable_thinking", "truncate_history_thinking", "low_effort"}
+    )
+    assert Nemotron3UltraRendererConfig.template_field_names() == frozenset(
+        {"enable_thinking", "truncate_history_thinking", "medium_effort"}
+    )
+
+
+def test_configs_reject_the_other_variants_effort_kwarg():
+    # Discriminated-union honesty: a bad combination fails at config-load.
+    import pytest
+    from pydantic import ValidationError
+
+    with pytest.raises(ValidationError):
+        Nemotron3RendererConfig(medium_effort=True)  # type: ignore[call-arg]
+    with pytest.raises(ValidationError):
+        Nemotron3UltraRendererConfig(low_effort=True)  # type: ignore[call-arg]
+    # And the removed ``ultra`` selector is gone entirely.
+    with pytest.raises(ValidationError):
+        Nemotron3RendererConfig(ultra=True)  # type: ignore[call-arg]
+
+
+def test_is_super_name_detection():
+    assert _is_super(_fake_tok("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"))
+    assert not _is_super(_fake_tok("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"))
+    # Unknown / local-path checkpoints default to False → low_effort no-op.
+    assert not _is_super(_fake_tok("/home/user/local-ckpt"))
+    assert not _is_super(SimpleNamespace())  # no name_or_path attr
diff --git a/tests/test_renderer_config_parity.py b/tests/test_renderer_config_parity.py
index abe47a6..d8b19a3 100644
--- a/tests/test_renderer_config_parity.py
+++ b/tests/test_renderer_config_parity.py
@@ -55,9 +55,13 @@
     ("moonshotai/Kimi-K2.6", "auto"),
     ("deepseek-ai/DeepSeek-V3", "auto"),
     ("deepseek-ai/DeepSeek-R1", "auto"),
+    # Nano + Super share the ``nemotron-3`` config (incl. ``low_effort``, which
+    # fires only on Super); both are exercised so the kwarg is checked where it
+    # no-ops (Nano) AND where it appends (Super).
     ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
-    # Ultra: auto-resolves to the Ultra template variant (ultra=True) via the
-    # model name; parity asserted against the Ultra apply_chat_template.
+    ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
+    # Ultra: auto-resolves to the ``nemotron-3-ultra`` config via the model
+    # name; parity asserted against the Ultra apply_chat_template (``medium_effort``).
     ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
     ("poolside/Laguna-XS.2", "auto"),
     ("openai/gpt-oss-20b", "gpt-oss"),
@@ -85,6 +89,12 @@
     # ``truncate_history_thinking=False`` keeps reasoning on historical
     # assistants instead of collapsing to ``<think></think>``.
     "truncate_history_thinking": [True, False],
+    # Nemotron-3 reasoning-effort hints appended to the last user message.
+    # ``low_effort`` is a Super (``nemotron-3``) kwarg; ``medium_effort`` an
+    # Ultra (``nemotron-3-ultra``) kwarg. On the variant that doesn't define
+    # the kwarg the template — and the renderer — no-op it.
+    "low_effort": [True, False],
+    "medium_effort": [True, False],
     # MiniMax-M2 — fallback persona string when no system message is
     # supplied. Two arbitrary values to verify the renderer threads the
     # exact bytes through (whitespace included).

From f70ddb78fa841849fb341ce918e0f8f921958d0a Mon Sep 17 00:00:00 2001
From: hallerite <git@hallerite.com>
Date: Wed, 10 Jun 2026 22:55:54 +0000
Subject: [PATCH 2/2] refactor(nemotron3): Nemotron3UltraRenderer subclass
 instead of one class under two names

Match the house style (GLM5Renderer/GLM51Renderer, Qwen35Renderer/Qwen36Renderer):
each registered renderer name gets its own class. nemotron-3-ultra now maps to a
Nemotron3UltraRenderer(Nemotron3Renderer) subclass that flips the _ultra / _config_cls
class hooks, rather than registering one class under two names and branching on
config.name. No behavior change; full suite green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 renderers/__init__.py         |  2 ++
 renderers/base.py             |  4 ++--
 renderers/nemotron3.py        | 44 +++++++++++++++++++++++++----------
 tests/test_nemotron3_ultra.py | 41 ++++++++++++++++----------------
 4 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/renderers/__init__.py b/renderers/__init__.py
index 7570f31..9fd385e 100644
--- a/renderers/__init__.py
+++ b/renderers/__init__.py
@@ -89,6 +89,7 @@
     "Llama3Renderer": "renderers.llama_3",
     "MiniMaxM2Renderer": "renderers.minimax_m2",
     "Nemotron3Renderer": "renderers.nemotron3",
+    "Nemotron3UltraRenderer": "renderers.nemotron3",
     "Qwen35Renderer": "renderers.qwen35",
     "Qwen36Renderer": "renderers.qwen36",
     "Qwen3Renderer": "renderers.qwen3",
@@ -147,6 +148,7 @@ def __dir__() -> list[str]:
     "MultimodalRenderer",
     "Nemotron3Renderer",
     "Nemotron3RendererConfig",
+    "Nemotron3UltraRenderer",
     "Nemotron3UltraRendererConfig",
     "OverlongPromptError",
     "ParsedResponse",
diff --git a/renderers/base.py b/renderers/base.py
index 8fd9870..f141594 100644
--- a/renderers/base.py
+++ b/renderers/base.py
@@ -1351,7 +1351,7 @@ def _populate_registry():
     from renderers.laguna_xs2 import LagunaXS2Renderer
     from renderers.llama_3 import Llama3Renderer
     from renderers.minimax_m2 import MiniMaxM2Renderer
-    from renderers.nemotron3 import Nemotron3Renderer
+    from renderers.nemotron3 import Nemotron3Renderer, Nemotron3UltraRenderer
     from renderers.qwen3 import Qwen3Renderer
     from renderers.qwen3_vl import Qwen3VLRenderer
     from renderers.qwen35 import Qwen35Renderer
@@ -1375,7 +1375,7 @@ def _populate_registry():
             "laguna-xs.2": LagunaXS2Renderer,
             "llama-3": Llama3Renderer,
             "nemotron-3": Nemotron3Renderer,
-            "nemotron-3-ultra": Nemotron3Renderer,
+            "nemotron-3-ultra": Nemotron3UltraRenderer,
             "gpt-oss": GptOssRenderer,
         }
     )
diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py
index 8716145..c29129c 100644
--- a/renderers/nemotron3.py
+++ b/renderers/nemotron3.py
@@ -76,12 +76,11 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str]
 
 
 # The Nemotron-3 family ships two chat-template variants. Nano / Super share
-# one (config ``name="nemotron-3"``); Ultra differs in the reasoning-block glue
-# — no ``\n`` around ``</think>`` — and gets its own discriminator
-# (``name="nemotron-3-ultra"``). Which variant a checkpoint uses is carried by
-# ``MODEL_RENDERER_MAP`` (and hence the resolved config's ``name``), so the
-# renderer reads it off ``config.name`` rather than probing the live template.
-_ULTRA_CONFIG_NAME = "nemotron-3-ultra"
+# one (renderer ``Nemotron3Renderer`` / config ``name="nemotron-3"``); Ultra
+# differs in the reasoning-block glue — no ``\n`` around ``</think>`` — and is
+# the ``Nemotron3UltraRenderer`` subclass (``name="nemotron-3-ultra"``). Which
+# variant a checkpoint uses is carried by ``MODEL_RENDERER_MAP``, so the right
+# renderer class is constructed and the variant is encoded by the class itself.
 
 
 def _is_super(tokenizer) -> bool:
@@ -97,7 +96,18 @@ def _is_super(tokenizer) -> bool:
 
 
 class Nemotron3Renderer:
-    """Deterministic message → token renderer for Nemotron 3 models."""
+    """Deterministic message → token renderer for Nemotron-3 Nano / Super.
+
+    The Ultra variant (distinct ``</think>`` glue) is the
+    :class:`Nemotron3UltraRenderer` subclass below; each is registered under
+    its own discriminator, and they differ only by the class-level hooks here.
+    """
+
+    # Variant hooks (overridden by ``Nemotron3UltraRenderer``): the default
+    # config to build when none is passed, and whether to use Ultra's
+    # reasoning-block glue.
+    _config_cls: type = Nemotron3RendererConfig
+    _ultra: bool = False
 
     def __init__(
         self,
@@ -105,12 +115,8 @@ def __init__(
         config: Nemotron3RendererConfig | Nemotron3UltraRendererConfig | None = None,
     ):
         self._tokenizer = tokenizer
-        cfg = config or Nemotron3RendererConfig()
+        cfg = config or type(self)._config_cls()
         self.config = cfg
-        # The Ultra variant is selected by the config discriminator
-        # (``name="nemotron-3-ultra"``), not a flag — one renderer class serves
-        # both, switching glue off ``self._ultra``.
-        self._ultra = cfg.name == _ULTRA_CONFIG_NAME
 
         # Resolve the per-variant reasoning-effort hint appended to the last
         # user message. Ultra honours ``medium_effort``; Super honours
@@ -825,3 +831,17 @@ def _render_tool(
         if not next_is_tool:
             emit_special(self._im_end, oi, is_sampled=False, is_content=False)
             emit_text("\n", oi, is_sampled=False, is_content=False)
+
+
+class Nemotron3UltraRenderer(Nemotron3Renderer):
+    """Renderer for Nemotron-3 **Ultra**.
+
+    Identical to :class:`Nemotron3Renderer` except the reasoning block is glued
+    as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around
+    ``</think>``) and truncated historical turns collapse to
+    ``<think></think>{content}`` (no ``\\n``) — the difference is carried by the
+    ``_ultra`` class hook. Honours the Ultra-only ``medium_effort`` kwarg.
+    """
+
+    _config_cls = Nemotron3UltraRendererConfig
+    _ultra = True
diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py
index 64c3cdb..a6832d7 100644
--- a/tests/test_nemotron3_ultra.py
+++ b/tests/test_nemotron3_ultra.py
@@ -19,13 +19,13 @@
 
 from types import SimpleNamespace
 
-from renderers.base import MODEL_RENDERER_MAP
+from renderers.base import MODEL_RENDERER_MAP, RENDERER_REGISTRY, _populate_registry
 from renderers.configs import (
     Nemotron3RendererConfig,
     Nemotron3UltraRendererConfig,
     _config_class_for,
 )
-from renderers.nemotron3 import Nemotron3Renderer, _is_super
+from renderers.nemotron3 import Nemotron3Renderer, Nemotron3UltraRenderer, _is_super
 
 _ULTRA_REPOS = [
     "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
@@ -48,27 +48,26 @@ def test_models_map_to_their_variant():
         assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3", repo
 
 
-def test_both_variants_resolve_to_one_renderer_class():
-    # The registry routes both discriminators to the shared renderer class.
+def test_each_discriminator_maps_to_its_config_and_renderer_class():
+    # Config discriminator → config class.
     assert _config_class_for("nemotron-3") is Nemotron3RendererConfig
     assert _config_class_for("nemotron-3-ultra") is Nemotron3UltraRendererConfig
-
-
-def test_renderer_reads_variant_from_config_name():
-    # No tokenizer needed for the ``_ultra`` flag — it comes off config.name.
-    # Build with a fake tokenizer that has the special tokens stubbed out.
-    class _Tok:
-        name_or_path = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-        unk_token_id = -1
-
-        def convert_tokens_to_ids(self, tok):
-            # Deterministic non-unk ids so construction succeeds offline.
-            return abs(hash(tok)) % 100_000 + 1
-
-    nano = Nemotron3Renderer(_Tok(), Nemotron3RendererConfig())
-    ultra = Nemotron3Renderer(_Tok(), Nemotron3UltraRendererConfig())
-    assert nano._ultra is False
-    assert ultra._ultra is True
+    # Registry → renderer class (Ultra is a subclass, matching the
+    # GLM-5/5.1 and Qwen3.5/3.6 house style — not one class under two names).
+    _populate_registry()
+    assert RENDERER_REGISTRY["nemotron-3"] is Nemotron3Renderer
+    assert RENDERER_REGISTRY["nemotron-3-ultra"] is Nemotron3UltraRenderer
+    assert issubclass(Nemotron3UltraRenderer, Nemotron3Renderer)
+
+
+def test_variant_is_encoded_by_the_class():
+    # The ``</think>`` glue is selected by the class hook, not config.name —
+    # so the right renderer class must be constructed (create_renderer routes
+    # config.name → class). Default config also follows the class.
+    assert Nemotron3Renderer._ultra is False
+    assert Nemotron3UltraRenderer._ultra is True
+    assert Nemotron3Renderer._config_cls is Nemotron3RendererConfig
+    assert Nemotron3UltraRenderer._config_cls is Nemotron3UltraRendererConfig
 
 
 def test_template_fields_per_variant():