diff --git a/README.md b/README.md
index b2c3f2f..d153163 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ next_prompt_ids = r.bridge_to_next_turn(
)
```
-Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
+Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `nemotron-3-ultra`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
## API
diff --git a/renderers/__init__.py b/renderers/__init__.py
index baa25db..9fd385e 100644
--- a/renderers/__init__.py
+++ b/renderers/__init__.py
@@ -56,6 +56,7 @@
Llama3RendererConfig,
MiniMaxM2RendererConfig,
Nemotron3RendererConfig,
+ Nemotron3UltraRendererConfig,
Qwen35RendererConfig,
Qwen36RendererConfig,
Qwen3RendererConfig,
@@ -88,6 +89,7 @@
"Llama3Renderer": "renderers.llama_3",
"MiniMaxM2Renderer": "renderers.minimax_m2",
"Nemotron3Renderer": "renderers.nemotron3",
+ "Nemotron3UltraRenderer": "renderers.nemotron3",
"Qwen35Renderer": "renderers.qwen35",
"Qwen36Renderer": "renderers.qwen36",
"Qwen3Renderer": "renderers.qwen3",
@@ -146,6 +148,8 @@ def __dir__() -> list[str]:
"MultimodalRenderer",
"Nemotron3Renderer",
"Nemotron3RendererConfig",
+ "Nemotron3UltraRenderer",
+ "Nemotron3UltraRendererConfig",
"OverlongPromptError",
"ParsedResponse",
"ParsedToolCall",
diff --git a/renderers/base.py b/renderers/base.py
index 0397b85..f141594 100644
--- a/renderers/base.py
+++ b/renderers/base.py
@@ -1040,14 +1040,15 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
"moonshotai/Kimi-K2-Instruct": "kimi-k2",
"moonshotai/Kimi-K2.5": "kimi-k2.5",
"moonshotai/Kimi-K2.6": "kimi-k2.5",
- # Nemotron 3. Nano / Super share one chat-template variant; the Ultra
- # checkpoints use the Ultra variant — the renderer auto-selects it from
- # the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the
+ # Nemotron 3. Nano / Super share one chat-template variant (``nemotron-3``);
+ # the Ultra checkpoints use the Ultra variant (``nemotron-3-ultra``, distinct
+ # ``</think>`` glue), which the registry routes to the Nemotron3UltraRenderer
+ # subclass (variant fixed by its class-level hook). BF16 and FP8 share the
# same tokenizer and template.
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3",
- "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3",
- "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3",
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3-ultra",
+ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3-ultra",
# Llama 3.2 (Instruct). Tested against the gated meta-llama repos and
# the unrestricted unsloth/... mirror, which ships a byte-identical
# chat template. ``Llama3Renderer`` defaults ``date_string`` to
@@ -1350,7 +1351,7 @@ def _populate_registry():
from renderers.laguna_xs2 import LagunaXS2Renderer
from renderers.llama_3 import Llama3Renderer
from renderers.minimax_m2 import MiniMaxM2Renderer
- from renderers.nemotron3 import Nemotron3Renderer
+ from renderers.nemotron3 import Nemotron3Renderer, Nemotron3UltraRenderer
from renderers.qwen3 import Qwen3Renderer
from renderers.qwen3_vl import Qwen3VLRenderer
from renderers.qwen35 import Qwen35Renderer
@@ -1374,6 +1375,7 @@ def _populate_registry():
"laguna-xs.2": LagunaXS2Renderer,
"llama-3": Llama3Renderer,
"nemotron-3": Nemotron3Renderer,
+ "nemotron-3-ultra": Nemotron3UltraRenderer,
"gpt-oss": GptOssRenderer,
}
)
diff --git a/renderers/configs.py b/renderers/configs.py
index ca16e46..d500f8e 100644
--- a/renderers/configs.py
+++ b/renderers/configs.py
@@ -354,7 +354,14 @@ class MiniMaxM2RendererConfig(BaseRendererConfig):
class Nemotron3RendererConfig(BaseRendererConfig):
- """Nemotron 3 renderer config."""
+ """Nemotron-3 **Nano / Super** renderer config.
+
+ Nano and Super share one chat-template variant; the renderer routes both
+ through :class:`renderers.nemotron3.Nemotron3Renderer`. The Ultra variant
+ has its own template (different reasoning-block glue) and config —
+ :class:`Nemotron3UltraRendererConfig` — and is reached via the
+ ``nemotron-3-ultra`` discriminator.
+ """
name: Literal["nemotron-3"] = "nemotron-3"
@@ -362,26 +369,6 @@ class Nemotron3RendererConfig(BaseRendererConfig):
"""When ``True``, the generation prompt includes ````. Mirrors
the chat template's ``enable_thinking`` kwarg."""
- ultra: bool | None = None
- """Select the Nemotron-3 **Ultra** chat-template variant.
-
- ``None`` (default) auto-detects from the model name (see
- ``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve
- to ``True``; Nano / Super and unknown checkpoints to ``False``. Set
- explicitly to force a variant — e.g. an Ultra fine-tune or a
- locally-pathed checkpoint whose ``name_or_path`` isn't in the table.
-
- Ultra's template differs from Nano/Super: the reasoning block is glued
- as ``\\n{reasoning}{content}`` (no ``\\n`` around
- ````), truncated historical turns collapse to
- ``{content}`` (no ``\\n``), and the thinking-truncation
- boundary follows the template's ``loop.index0 < last_user_idx`` rule
- (drop thinking on every assistant turn before the last user message).
-
- Not a chat-template kwarg — it picks which template the renderer
- mirrors, not a variable passed into one — so it's listed in
- ``_internal_fields`` and excluded from ``template_field_names()``."""
-
truncate_history_thinking: bool = True
"""When ``False``, keep ``{reasoning}`` on past-cycle
assistant turns instead of dropping them. Mirrors the chat
@@ -389,14 +376,37 @@ class Nemotron3RendererConfig(BaseRendererConfig):
``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls``
— see :class:`BaseRendererConfig` for the contract."""
- # ``ultra`` is a template-variant SELECTOR — it picks which template the
- # renderer mirrors (Ultra vs Nano/Super), not a variable passed into one;
- # there is no ``ultra`` Jinja variable. Marked internal so the parity
- # matrix doesn't cross it as a template field. Same ``_internal_fields``
- # mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a
- # different underlying reason (theirs is an ignored kwarg, this is a
- # variant switch).
- _internal_fields = frozenset({"ultra"})
+ low_effort: bool = False
+ """When ``True``, append ``\\n\\n{reasoning effort: low}`` to the last user
+ message, nudging the model toward shorter reasoning. Mirrors the **Super**
+ chat template's ``low_effort`` kwarg. A no-op on **Nano** (its template
+ doesn't define it) — exactly as ``apply_chat_template`` ignores an undefined
+ template variable; the renderer distinguishes the two by model name (see
+ ``renderers.nemotron3._is_super``)."""
+
+
+class Nemotron3UltraRendererConfig(BaseRendererConfig):
+ """Nemotron-3 **Ultra** renderer config — distinct discriminator so the
+ registry routes Ultra checkpoints to the Ultra template variant.
+
+ Ultra's template differs from Nano/Super: the reasoning block is glued as
+ ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around ``</think>``)
+ and truncated historical turns collapse to ``<think></think>{content}``
+ (no ``\\n``). It shares the :class:`renderers.nemotron3.Nemotron3Renderer`
+ implementation; the ``Nemotron3UltraRenderer`` subclass selects the variant.
+ """
+
+ name: Literal["nemotron-3-ultra"] = "nemotron-3-ultra"
+
+ enable_thinking: bool = True
+ """See :class:`Nemotron3RendererConfig.enable_thinking`."""
+
+ truncate_history_thinking: bool = True
+ """See :class:`Nemotron3RendererConfig.truncate_history_thinking`."""
+
+ medium_effort: bool = False
+ """When ``True``, append ``\\n\\n{reasoning effort: efficient}`` to the last
+ user message. Mirrors the Ultra chat template's ``medium_effort`` kwarg."""
class DeepSeekV3RendererConfig(BaseRendererConfig):
@@ -444,6 +454,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig):
Llama3RendererConfig,
MiniMaxM2RendererConfig,
Nemotron3RendererConfig,
+ Nemotron3UltraRendererConfig,
DeepSeekV3RendererConfig,
DeepSeekR1RendererConfig,
],
@@ -480,6 +491,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig):
"llama-3": Llama3RendererConfig,
"minimax-m2": MiniMaxM2RendererConfig,
"nemotron-3": Nemotron3RendererConfig,
+ "nemotron-3-ultra": Nemotron3UltraRendererConfig,
"deepseek-v3": DeepSeekV3RendererConfig,
"deepseek-r1": DeepSeekR1RendererConfig,
}
@@ -525,6 +537,7 @@ def config_from_name(name: str) -> BaseRendererConfig | None:
"Llama3RendererConfig",
"MiniMaxM2RendererConfig",
"Nemotron3RendererConfig",
+ "Nemotron3UltraRendererConfig",
"Qwen35RendererConfig",
"Qwen36RendererConfig",
"Qwen3RendererConfig",
diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py
index b735cde..c29129c 100644
--- a/renderers/nemotron3.py
+++ b/renderers/nemotron3.py
@@ -30,7 +30,7 @@
should_preserve_past_thinking,
trim_to_turn_close,
)
-from renderers.configs import Nemotron3RendererConfig
+from renderers.configs import Nemotron3RendererConfig, Nemotron3UltraRendererConfig
from renderers.parsing import parse_qwen35
# ---------------------------------------------------------------------------
@@ -75,53 +75,65 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str]
return lines
-# Per-model ``ultra`` default, applied when the renderer config leaves it
-# ``None``. The Nemotron-3 family ships two chat-template variants: Nano /
-# Super share one; Ultra differs in the reasoning-block glue (no ``\n`` around
-# ````) and the thinking-truncation boundary (drop thinking on every
-# assistant turn before the last user message). BF16 and FP8 share the same
-# tokenizer and template. Hard-coded keyed by
-# ``tokenizer.name_or_path`` rather than probed from the live template — the
-# same convention as Qwen3.5's ``_ENABLE_THINKING_DEFAULTS`` (avoids pulling
-# ``apply_chat_template`` onto the construction hot path and keeps
-# bring-your-own-tokenizer use working).
-_ULTRA_DEFAULTS: dict[str, bool] = {
- "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": False,
- "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": False,
- "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": True,
- "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": True,
-}
-
-
-def _default_ultra(tokenizer) -> bool:
- """Hard-coded ``ultra`` default for ``tokenizer``'s model.
-
- Falls back to ``False`` (the Nano / Super template, and the majority of
- the family) for unknown / fine-tuned checkpoints whose ``name_or_path``
- isn't in ``_ULTRA_DEFAULTS`` — pass an explicit ``ultra=True`` for an
- Ultra fine-tune or a locally-pathed Ultra checkpoint.
+# The Nemotron-3 family ships two chat-template variants. Nano / Super share
+# one (renderer ``Nemotron3Renderer`` / config ``name="nemotron-3"``); Ultra
+# differs in the reasoning-block glue — no ``\n`` around ``</think>`` — and is
+# the ``Nemotron3UltraRenderer`` subclass (``name="nemotron-3-ultra"``). Which
+# variant a checkpoint uses is carried by ``MODEL_RENDERER_MAP``, so the right
+# renderer class is constructed and the variant is encoded by the class itself.
+
+
+def _is_super(tokenizer) -> bool:
+ """Does this checkpoint use the **Super** flavour of the shared Nano/Super
+ template — i.e. the one whose Jinja defines the ``low_effort`` kwarg?
+
+ Nano and Super share one config (``nemotron-3``), so the model name is the
+ only signal that separates them. Detected by substring; unknown / fine-tuned
+ checkpoints default to ``False`` so ``low_effort`` is a no-op there —
+ matching how the Nano template silently ignores it.
+
+ The test is a case-insensitive ``"super"`` substring match over the whole
+ ``tokenizer.name_or_path``; a missing or falsy attribute is read as ``""``.
+ NOTE(review): a local path with ``super`` in any directory component would
+ also match — confirm that is acceptable for locally-pathed checkpoints.
"""
- return _ULTRA_DEFAULTS.get(getattr(tokenizer, "name_or_path", ""), False)
+ return "super" in (getattr(tokenizer, "name_or_path", "") or "").lower()
class Nemotron3Renderer:
- """Deterministic message → token renderer for Nemotron 3 models."""
+ """Deterministic message → token renderer for Nemotron-3 Nano / Super.
+
+ The Ultra variant (distinct ``</think>`` glue) is the
+ :class:`Nemotron3UltraRenderer` subclass below; both are registered under
+ their own discriminator and differ only by the class-level hooks here.
+ """
+
+ # Variant hooks (overridden by ``Nemotron3UltraRenderer``): the default
+ # config to build when none is passed, and whether to use Ultra's
+ # reasoning-block glue.
+ _config_cls: type = Nemotron3RendererConfig
+ _ultra: bool = False
def __init__(
self,
tokenizer: PreTrainedTokenizer,
- config: Nemotron3RendererConfig | None = None,
+ config: Nemotron3RendererConfig | Nemotron3UltraRendererConfig | None = None,
):
self._tokenizer = tokenizer
- cfg = config or Nemotron3RendererConfig()
- # ``ultra=None`` defers to the model's known default (see
- # ``_ULTRA_DEFAULTS``). Materialise here so downstream reads see a
- # concrete bool; rebind the frozen config with the resolved value so
- # introspection sees the same.
- if cfg.ultra is None:
- cfg = cfg.model_copy(update={"ultra": _default_ultra(tokenizer)})
+ cfg = config or type(self)._config_cls()
self.config = cfg
+ # Resolve the per-variant reasoning-effort hint appended to the last
+ # user message. Ultra honours ``medium_effort``; Super honours
+ # ``low_effort``; Nano honours neither. The non-matching kwarg is
+ # silently ignored (empty hint), exactly as ``apply_chat_template``
+ # ignores a template variable the variant's Jinja never defines.
+ if self._ultra:
+ self._effort_hint = (
+ "\n\n{reasoning effort: efficient}"
+ if getattr(cfg, "medium_effort", False)
+ else ""
+ )
+ elif getattr(cfg, "low_effort", False) and _is_super(tokenizer):
+ self._effort_hint = "\n\n{reasoning effort: low}"
+ else:
+ self._effort_hint = ""
+
# Look up special token IDs from the tokenizer (not hardcoded).
# <|endoftext|> is optional: Nemotron-3 Nano / Super tokenizers ship
# <|im_end|> as the sole EOS; older / larger variants additionally
@@ -321,9 +333,12 @@ def emit_text_segments(
emit_special(self._im_start, sys_idx, is_sampled=False, is_content=False)
- # Build system content: user's system text first, then tools
+ # Build system content: user's system text first, then tools.
+ # The template emits ``system_message`` verbatim (no trim) and
+ # gates the ``\n\n`` separator on its raw length, so keep the
+ # caller's content unstripped.
if first_is_system:
- sys_content = self._render_content(messages[0].get("content")).strip()
+ sys_content = self._render_content(messages[0].get("content"))
else:
sys_content = ""
@@ -351,7 +366,7 @@ def emit_text_segments(
elif first_is_system:
sys_idx = orig_idx(0)
- sys_content = self._render_content(messages[0].get("content")).strip()
+ sys_content = self._render_content(messages[0].get("content"))
emit_special(self._im_start, sys_idx, is_sampled=False, is_content=False)
sys_segments2: list[tuple[str, bool]] = [("system\n", False)]
if sys_content:
@@ -360,22 +375,13 @@ def emit_text_segments(
emit_special(self._im_end, sys_idx, is_sampled=False, is_content=False)
emit_text("\n", sys_idx, is_sampled=False, is_content=False)
- # Track the most-recent plain (non-tool-call) assistant so we can
- # preserve its reasoning while stripping reasoning from earlier
- # assistants — the Nemotron-3 template matches this pattern.
- last_plain_assistant_idx = -1
- for j in range(len(messages) - 1, -1, -1):
- if messages[j].get("role") == "assistant" and not messages[j].get(
- "tool_calls"
- ):
- last_plain_assistant_idx = j
- break
-
- # Ultra truncates thinking on every assistant turn *before the last
- # user message* (template rule ``loop.index0 < last_user_idx``),
- # whereas Nano/Super preserve only the last plain assistant. Compute
- # the last-user index over the normalized ``messages`` list (a leading
- # system never holds a user, so the relative comparison is unaffected).
+ # All Nemotron-3 variants (Nano / Super / Ultra) truncate historical
+ # thinking on every assistant turn *before the last user message* —
+ # the template rule ``truncate_history_thinking and loop.index0 <
+ # last_user_idx`` is byte-identical across the three chat templates.
+ # Compute the last-user index over the normalized ``messages`` list (a
+ # leading system never holds a user, so the relative comparison is
+ # unaffected).
last_user_idx_norm = -1
for j in range(len(messages) - 1, -1, -1):
if messages[j].get("role") == "user":
@@ -385,7 +391,10 @@ def emit_text_segments(
# ── 2. Iterate messages ─────────────────────────────────────
for i, msg in enumerate(messages):
role = msg["role"]
- content = self._render_content(msg.get("content")).strip()
+ # Keep content unstripped: the template emits user / system / tool
+ # content verbatim, and assistant trimming happens inside
+ # ``_assistant_body`` exactly where the template applies it.
+ content = self._render_content(msg.get("content"))
msg_orig_idx = orig_idx(i)
if role == "system":
@@ -400,6 +409,12 @@ def emit_text_segments(
user_segments: list[tuple[str, bool]] = [("user\n", False)]
if content:
user_segments.append((content, True))
+ # Reasoning-effort hint rides on the LAST user message only,
+ # glued to the content so BPE sees them as one chunk (matching
+ # the template's ``content + '\n\n{reasoning effort: …}'``). It
+ # is template scaffold, not caller content → is_content=False.
+ if self._effort_hint and i == last_user_idx_norm:
+ user_segments.append((self._effort_hint, False))
emit_text_segments(user_segments, msg_orig_idx, is_sampled=False)
emit_special(
self._im_end, msg_orig_idx, is_sampled=False, is_content=False
@@ -407,26 +422,29 @@ def emit_text_segments(
emit_text("\n", msg_orig_idx, is_sampled=False, is_content=False)
elif role == "assistant":
- if self.config.ultra:
- is_last_turn = i >= last_user_idx_norm
- else:
- is_last_turn = i >= last_plain_assistant_idx
+ # Template: ``include_content = not (truncate_history_thinking
+ # and loop.index0 < last_user_idx)``. The renderer-internal
+ # preserve_* overrides only ever *extend* retention, so OR them
+ # in (a preserved turn keeps its thinking even when the
+ # template default would drop it).
preserve_thinking = msg_orig_idx >= 0 and should_preserve_past_thinking(
original_messages,
msg_orig_idx,
preserve_all_thinking=self.config.preserve_all_thinking,
preserve_thinking_between_tool_calls=self.config.preserve_thinking_between_tool_calls,
)
+ include_content = (
+ not self.config.truncate_history_thinking
+ or i >= last_user_idx_norm
+ or preserve_thinking
+ )
self._render_assistant(
msg,
msg_orig_idx,
content,
- is_last_turn=is_last_turn,
- preserve_thinking=preserve_thinking,
+ include_content=include_content,
emit_special=emit_special,
emit_text=emit_text,
- emit_ids=emit_ids,
- emit_text_segments=emit_text_segments,
)
elif role == "tool":
@@ -516,6 +534,11 @@ def bridge_to_next_turn(
not previous_prompt_ids
or not new_messages
or reject_assistant_in_extension(new_messages)
+ # An active effort hint rides on the *last* user message. Appending
+ # a new turn can move which user is last, which would strand the
+ # hint on the frozen previous prompt — the append-only bridge can't
+ # rewrite it. Bail so the caller does a full, correct re-render.
+ or self._effort_hint
):
return None
@@ -585,7 +608,9 @@ def emit_text_segments(
for i, msg in enumerate(new_messages):
role = msg.get("role")
- content = self._render_content(msg.get("content")).strip()
+ # Unstripped — the template emits user / system / tool content
+ # verbatim (see :meth:`render`).
+ content = self._render_content(msg.get("content"))
if role == "user":
emit_special(self._im_start, i)
user_segments: list[tuple[str, bool]] = [("user\n", False)]
@@ -646,29 +671,10 @@ def _render_assistant(
msg_idx: int,
content: str,
*,
- is_last_turn: bool,
- preserve_thinking: bool = False,
+ include_content: bool,
emit_special,
emit_text,
- emit_ids,
- emit_text_segments,
) -> None:
- # Extract reasoning_content
- reasoning_content = ""
- if isinstance(msg.get("reasoning_content"), str):
- reasoning_content = msg["reasoning_content"]
- elif "" in content:
- before_think_end, after_think_end = content.split("", 1)
- if "" in before_think_end:
- reasoning_content = before_think_end.split("")[-1].lstrip("\n")
- else:
- reasoning_content = before_think_end.lstrip("\n")
- reasoning_content = reasoning_content.rstrip("\n")
- content = after_think_end.lstrip("\n")
-
- reasoning_content = reasoning_content.strip()
- ultra = self.config.ultra
-
# ``<|im_start|>assistant\n`` is template-injected scaffolding —
# at inference the chat template emits these as the generation
# prompt and the model never samples them. Marking the role tag
@@ -678,123 +684,108 @@ def _render_assistant(
emit_special(self._im_start, msg_idx, is_sampled=False, is_content=False)
emit_text("assistant\n", msg_idx, is_sampled=False, is_content=False)
- # Nemotron 3 keeps reasoning on the most-recent plain assistant but
- # strips it from historical turns, which collapse to an empty
- # block. Empty is also emitted when
- # the turn has no reasoning at all. The trailing ``\n`` (when
- # tool_calls follow) is glued to ``content`` in a single emit_text
- # so BPE sees ``content\n`` as one chunk, matching how
- # apply_chat_template tokenises the concatenated template string.
- tool_calls = msg.get("tool_calls") or []
- # A \n is always required between the text/think block and the first
- # , whether the content is empty or not.
- content_suffix = "\n" if tool_calls else ""
-
- if reasoning_content and (
- is_last_turn
- or preserve_thinking
- or not self.config.truncate_history_thinking
- ):
- emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
- # Ultra: \n{reasoning}{content} (no \n around ).
- # Nano/Super: \n{reasoning}\n\n{content}.
- emit_text(
- ("\n" + reasoning_content)
- if ultra
- else ("\n" + reasoning_content + "\n"),
- msg_idx,
- is_sampled=True,
- is_content=True,
- )
- emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
- # Single \n separator (not \n\n like Qwen3.5); Ultra glues directly.
- emit_text(
- (content + content_suffix)
- if ultra
- else ("\n" + content + content_suffix),
- msg_idx,
- is_sampled=True,
- is_content=True,
- )
- elif reasoning_content:
- # Historical assistant whose reasoning got stripped. Nano/Super keep
- # a single \n between the collapsed and the content
- # as a marker that reasoning existed; Ultra glues content directly.
- emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
- emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
- emit_text(
- (content + content_suffix)
- if ultra
- else ("\n" + content + content_suffix),
- msg_idx,
- is_sampled=True,
- is_content=True,
- )
+ # Build the body (everything between ``assistant\n`` and ``<|im_end|>``)
+ # as a single string mirroring the chat template's own string algebra,
+ # then tokenise it in one pass. The ``<think>`` / ``</think>`` /
+ # ``<tool_call>`` / ``</tool_call>`` markers are added tokens, so the
+ # tokenizer isolates them — encoding the assembled body yields the same
+ # ids as ``apply_chat_template`` (which likewise encodes a rendered
+ # string). The whole body is sampled content; ``<|im_end|>`` is the
+ # model's stop signal (sampled), and the inter-turn ``\n`` is not.
+ body = self._assistant_body(msg, content, include_content=include_content)
+ if body:
+ emit_text(body, msg_idx, is_sampled=True, is_content=True)
+ emit_special(self._im_end, msg_idx, is_sampled=True, is_content=True)
+ emit_text("\n", msg_idx, is_sampled=False, is_content=False)
+
+ def _assistant_body(
+ self, msg: Message, raw_content: str, *, include_content: bool
+ ) -> str:
+ """Assemble the assistant body string exactly as the chat template.
+
+ ``include_content`` is the template's ``not (truncate_history_thinking
+ and loop.index0 < last_user_idx)`` (already OR-ed with the preserve_*
+ overrides by the caller): ``True`` keeps the full think+content block,
+ ``False`` collapses historical thinking to an empty ``<think></think>``.
+
+ ``msg`` supplies ``reasoning_content`` and ``tool_calls``; ``raw_content``
+ is the already-rendered, unstripped message text. Returns the body string
+ placed between ``assistant\\n`` and ``<|im_end|>``.
+ """
+ ultra = self._ultra
+
+ # 1. Assemble ``content`` — wrap a ``reasoning_content`` field in
+ # <think> tags (raw, not stripped: interior whitespace is part of
+ # the reasoning), else prepend an empty <think></think> only when
+ # the content carries no inline think tags of its own (which are
+ # passed through verbatim, like the template).
+ reasoning = msg.get("reasoning_content")
+ if isinstance(reasoning, str) and reasoning.strip():
+ if ultra:
+ content = "<think>\n" + reasoning + "</think>" + raw_content
+ else:
+ content = "<think>\n" + reasoning + "\n</think>\n" + raw_content
else:
- # No reasoning ever — glued directly to content.
- emit_special(self._think, msg_idx, is_sampled=True, is_content=True)
- emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True)
- emit_text(
- content + content_suffix,
- msg_idx,
- is_sampled=True,
- is_content=True,
- )
+ content = raw_content
+ if "<think>" not in content and "</think>" not in content:
+ content = "<think></think>" + content
+
+ tool_calls = msg.get("tool_calls") or []
- # Tool calls (leading \n was glued to the content above; each
- # iteration's trailing \n after handles the
- # separator to the next block).
if tool_calls:
+ parts: list[str] = []
+ if content.strip():
+ if include_content:
+ parts.append(content.strip() + "\n")
+ else:
+ # Drop historical thinking: keep only what follows the last
+ # </think> (or precedes a dangling <think>), then re-stamp
+ # an empty block. Nano/Super trim the remainder; Ultra glues
+ # it raw (its template omits the trailing ``| trim``).
+ c = content
+ if "</think>" in c:
+ c = c.split("</think>")[-1]
+ elif "<think>" in c:
+ c = c.split("<think>")[0]
+ c = "<think></think>" + (c if ultra else c.strip())
+ if c:
+ parts.append(c + "\n")
+ else:
+ # Non-string / empty content: bare collapsed think block, no \n.
+ parts.append("<think></think>")
for tc in tool_calls:
- func = tc.get("function") or tc
- name = func.get("name", "")
- arguments = func.get("arguments", {})
-
- emit_special(self._tool_call, msg_idx, is_sampled=True, is_content=True)
- emit_text(
- "\n\n",
- msg_idx,
- is_sampled=True,
- is_content=True,
- )
+ parts.append(self._format_tool_call(tc))
+ return "".join(parts)
- # Render arguments
- # OpenAI canonical form: arguments is a JSON string. Parse it so the
- # per-argument rendering below still works.
- if isinstance(arguments, str):
- try:
- arguments = json.loads(arguments)
- except json.JSONDecodeError:
- arguments = {}
- if isinstance(arguments, dict):
- for arg_name, arg_value in arguments.items():
- if isinstance(arg_value, (dict, list)):
- value_str = json.dumps(arg_value, ensure_ascii=False)
- else:
- value_str = str(arg_value)
- emit_text(
- "\n"
- + value_str
- + "\n\n",
- msg_idx,
- is_sampled=True,
- is_content=True,
- )
-
- emit_text("\n", msg_idx, is_sampled=True, is_content=True)
- emit_special(
- self._tool_call_end, msg_idx, is_sampled=True, is_content=True
- )
- # Trailing \n after (Nemotron 3 specific)
- emit_text("\n", msg_idx, is_sampled=True, is_content=True)
+ # No tool calls.
+ if include_content:
+ return content.strip()
+ c = content
+ if "<think>" in c and "</think>" in c:
+ c = "<think></think>" + c.split("</think>")[-1]
+ return c.strip()
- # ``<|im_end|>`` is the model's stop signal — it samples this to
- # end its turn, so it is part of the sampled stream. The trailing
- # ``\n`` is template-appended between turns and never sampled.
- emit_special(self._im_end, msg_idx, is_sampled=True, is_content=True)
- emit_text("\n", msg_idx, is_sampled=False, is_content=False)
+ @staticmethod
+ def _format_tool_call(tc: dict[str, Any]) -> str:
+ """Render one tool call as ``<tool_call>…</tool_call>\\n`` XML.
+
+ ``tc`` is either an OpenAI-style ``{"function": {...}}`` wrapper or the
+ bare function dict; ``arguments`` may be a dict or a JSON string (an
+ unparseable string renders as a call with no parameters). Dict / list
+ argument values are JSON-encoded, everything else goes through ``str``.
+ """
+ func = tc.get("function") or tc
+ name = func.get("name", "")
+ arguments = func.get("arguments", {})
+ # OpenAI canonical form: arguments is a JSON string. Parse it so the
+ # per-argument rendering below still works.
+ if isinstance(arguments, str):
+ try:
+ arguments = json.loads(arguments)
+ except json.JSONDecodeError:
+ arguments = {}
+ parts = ["<tool_call>\n<function=" + name + ">\n"]
+ if isinstance(arguments, dict):
+ for arg_name, arg_value in arguments.items():
+ if isinstance(arg_value, (dict, list)):
+ value_str = json.dumps(arg_value, ensure_ascii=False)
+ else:
+ value_str = str(arg_value)
+ parts.append(
+ "<parameter=" + arg_name + ">\n" + value_str + "\n</parameter>\n"
+ )
+ parts.append("</function>\n</tool_call>\n")
+ return "".join(parts)
# ------------------------------------------------------------------
# Tool message rendering
@@ -840,3 +831,17 @@ def _render_tool(
if not next_is_tool:
emit_special(self._im_end, oi, is_sampled=False, is_content=False)
emit_text("\n", oi, is_sampled=False, is_content=False)
+
+
+class Nemotron3UltraRenderer(Nemotron3Renderer):
+ """Renderer for Nemotron-3 **Ultra**.
+
+ Identical to :class:`Nemotron3Renderer` except the reasoning block is glued
+ as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around
+ ``</think>``) and truncated historical turns collapse to
+ ``<think></think>{content}`` (no ``\\n``) — the difference is carried by the
+ ``_ultra`` class hook. Honours the Ultra-only ``medium_effort`` kwarg.
+ """
+
+ _config_cls = Nemotron3UltraRendererConfig
+ _ultra = True
diff --git a/tests/conftest.py b/tests/conftest.py
index d62d600..c3bfeed 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -33,7 +33,8 @@
("moonshotai/Kimi-K2.6", "auto"),
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
- # Ultra resolves the Ultra template variant via name (auto → ultra=True).
+ # Ultra resolves to the `nemotron-3-ultra` config variant via the model
+ # name (auto → MODEL_RENDERER_MAP → nemotron-3-ultra).
("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
# DeepSeek-V3/R1 are intentionally NOT in this shared barrage: their
diff --git a/tests/test_nemotron3_parity.py b/tests/test_nemotron3_parity.py
new file mode 100644
index 0000000..01c521b
--- /dev/null
+++ b/tests/test_nemotron3_parity.py
@@ -0,0 +1,676 @@
+"""Exhaustive token-for-token parity for the Nemotron-3 renderer.
+
+The shared barrage in ``test_render_ids.py`` covers the common message
+shapes against every model. This file pins the Nemotron-3-specific template
+branches that the shared matrix can't reach — they'd fail on other models or
+exercise behaviour unique to the Nemotron-3 chat template:
+
+* reasoning + empty / ``None`` content with and without tool calls (the
+ template trims the assembled ``<think>…</think>{content}`` block and appends
+ exactly one separator — a stray ``\\n`` here is the most common agentic
+ regression);
+* the historical-thinking truncation boundary, which is ``loop.index0 <
+ last_user_idx`` in **all three** variants (Nano / Super / Ultra) — so an
+ in-flight tool cycle (assistant turns after the last user message) keeps its
+ reasoning by default;
+* inline ``<think>…</think>`` tags carried in ``content`` rendering verbatim
+ (the template only reformats reasoning supplied via ``reasoning_content``);
+* verbatim (unstripped) user / system / tool content and ``reasoning_content``;
+* the ``enable_thinking`` / ``truncate_history_thinking`` template kwargs;
+* the per-variant reasoning-effort kwargs: ``low_effort`` (honoured on Super, a
+ no-op on Nano) and ``medium_effort`` (Ultra only; not a Nano/Super config field).
+
+Every assertion compares ``renderer.render_ids(...)`` to
+``tokenizer.apply_chat_template(..., tokenize=True)`` — a pass means the
+renderer is byte-for-byte faithful for that case. Tokenizers are loaded from
+the local HF cache (offline); no network.
+
+The variants split across two configs: ``nemotron-3`` (Nano / Super, with
+``low_effort``) and ``nemotron-3-ultra`` (Ultra, with ``medium_effort``). The
+helper resolves the right config class per model from ``MODEL_RENDERER_MAP``.
+"""
+
+from __future__ import annotations
+
+from functools import lru_cache
+
+import pytest
+
+from renderers import create_renderer
+from renderers.base import MODEL_RENDERER_MAP, load_tokenizer
+from renderers.configs import _config_class_for
+
+# BF16 / FP8 share a tokenizer; only the BF16 checkpoints are cached for tests.
+NANO = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+SUPER = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"
+ULTRA = "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16"
+MODELS = [NANO, SUPER, ULTRA]
+
+
+@lru_cache
+def _tok(model: str):
+ return load_tokenizer(model)
+
+
+def _config_cls(model: str):
+ """The typed-config class the model resolves to (``nemotron-3`` for
+ Nano/Super, ``nemotron-3-ultra`` for Ultra)."""
+ return _config_class_for(MODEL_RENDERER_MAP[model])
+
+
+def _renderer(model: str, **flags):
+ # Build with the model's own variant config so the renderer picks the right
+ # ``<think>`` glue (and only valid kwargs are accepted).
+ return create_renderer(_tok(model), _config_cls(model)(**flags))
+
+
+def _expected(
+ model: str, messages, *, tools=None, add_generation_prompt=False, **kwargs
+):
+ out = _tok(model).apply_chat_template(
+ messages,
+ tools=tools,
+ tokenize=True,
+ return_dict=False,
+ add_generation_prompt=add_generation_prompt,
+ **kwargs,
+ )
+ if isinstance(out, str): # some tokenizers return str even with tokenize=True
+ return list(_tok(model).encode(out, add_special_tokens=False))
+ return list(out)
+
+
+def _assert_parity(
+ model, messages, *, tools=None, add_generation_prompt=False, **template_kwargs
+):
+ """Renderer ids == apply_chat_template ids for ``model``.
+
+ ``template_kwargs`` (e.g. ``enable_thinking``, ``truncate_history_thinking``)
+ are forwarded to both the renderer config and ``apply_chat_template`` so the
+ two sides stay aligned.
+ """
+ renderer = _renderer(model, **template_kwargs)
+ got = renderer.render_ids(
+ messages, tools=tools, add_generation_prompt=add_generation_prompt
+ )
+ exp = _expected(
+ model,
+ messages,
+ tools=tools,
+ add_generation_prompt=add_generation_prompt,
+ **template_kwargs,
+ )
+ assert got == exp, (
+ f"{model}: render_ids diverged from apply_chat_template\n"
+ f" exp …{_tok(model).decode(exp[-40:])!r}\n"
+ f" got …{_tok(model).decode(got[-40:])!r}"
+ )
+
+
+pytestmark = pytest.mark.parametrize("model", MODELS, ids=["nano", "super", "ultra"])
+
+
+TOOLS = [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "Get the current weather for a city",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "city": {"type": "string", "description": "The city name"}
+ },
+ "required": ["city"],
+ },
+ },
+ }
+]
+
+
+# ── Reasoning + tool calls: the trim / separator boundary ─────────────
+
+
+def test_reasoning_empty_content_tool_call(model):
+ """reason → tool call, no prose. Must be ``\\n`` (one
+ newline), not ``\\n\\n``."""
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Weather in Paris?"},
+ {
+ "role": "assistant",
+ "reasoning_content": "I should call the weather tool.",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_reasoning_none_content_tool_call(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Weather in Paris?"},
+ {
+ "role": "assistant",
+ "reasoning_content": "Call the tool.",
+ "content": None,
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_reasoning_content_tool_call(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Weather in Paris?"},
+ {
+ "role": "assistant",
+ "reasoning_content": "Think first.",
+ "content": "Let me check.",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_reasoning_empty_content_no_tool_call(model):
+ """reason → empty answer, no tool call: ``</think>`` glued to ``<|im_end|>``."""
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "hi"},
+ {"role": "assistant", "reasoning_content": "thinking", "content": ""},
+ ],
+ )
+
+
+def test_multiple_tool_calls_with_reasoning(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Weather in Paris and London?"},
+ {
+ "role": "assistant",
+ "reasoning_content": "Two cities — two calls.",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ },
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "London"},
+ }
+ },
+ ],
+ },
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_tool_call_with_nested_object_args(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "go"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {
+ "city": "Paris",
+ "opts": {"unit": "c", "days": [1, 2]},
+ },
+ }
+ }
+ ],
+ },
+ ],
+ tools=TOOLS,
+ )
+
+
+# ── Historical-thinking truncation boundary (last_user_idx) ───────────
+
+
+def test_inflight_tool_cycle_keeps_reasoning(model):
+ """Assistant turns after the last user message (the in-flight tool cycle)
+ keep their reasoning by default — boundary is ``loop.index0 <
+ last_user_idx`` in every variant."""
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Weather in Paris?"},
+ {
+ "role": "assistant",
+ "reasoning_content": "Call the tool first.",
+ "content": "calling",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ {"role": "tool", "content": '{"temp": 20}'},
+ {
+ "role": "assistant",
+ "reasoning_content": "Now I can answer.",
+ "content": "It is 20 degrees.",
+ },
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_history_truncation_drops_older_reasoning(model):
+ """A reasoning turn before the last user message is collapsed to an empty
+ think block (tool-call branch trims the remainder on Nano/Super)."""
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Q1"},
+ {
+ "role": "assistant",
+ "reasoning_content": "reasoning before tool",
+ "content": "calling",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ {"role": "tool", "content": '{"temp": 20}'},
+ {"role": "assistant", "reasoning_content": "after", "content": "Done."},
+ {"role": "user", "content": "Q2"},
+ {"role": "assistant", "reasoning_content": "final", "content": "A2"},
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_two_block_tool_conversation(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "system", "content": "be brief"},
+ {"role": "user", "content": "first"},
+ {
+ "role": "assistant",
+ "reasoning_content": "R2",
+ "content": "calling.",
+ "tool_calls": [
+ {"function": {"name": "get_weather", "arguments": {"city": "a"}}}
+ ],
+ },
+ {"role": "tool", "content": "result-a"},
+ {"role": "assistant", "reasoning_content": "R4", "content": "answer-1"},
+ {"role": "user", "content": "second"},
+ {
+ "role": "assistant",
+ "reasoning_content": "R6",
+ "content": "calling.",
+ "tool_calls": [
+ {"function": {"name": "get_weather", "arguments": {"city": "b"}}}
+ ],
+ },
+ {"role": "tool", "content": "result-b"},
+ {"role": "assistant", "reasoning_content": "R8", "content": "answer-2"},
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_plain_multi_turn_reasoning_truncation(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Q1"},
+ {
+ "role": "assistant",
+ "reasoning_content": "long reasoning one",
+ "content": "A1",
+ },
+ {"role": "user", "content": "Q2"},
+ {
+ "role": "assistant",
+ "reasoning_content": "long reasoning two",
+ "content": "A2",
+ },
+ ],
+ )
+
+
+@pytest.mark.parametrize("truncate", [True, False])
+def test_truncate_history_thinking_kwarg(model, truncate):
+ """``truncate_history_thinking=False`` keeps reasoning on every past turn."""
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Q1"},
+ {
+ "role": "assistant",
+ "reasoning_content": "first reasoning",
+ "content": "A1",
+ },
+ {"role": "user", "content": "Q2"},
+ {
+ "role": "assistant",
+ "reasoning_content": "second reasoning",
+ "content": "A2",
+ },
+ ],
+ truncate_history_thinking=truncate,
+ )
+
+
+# ── Inline <think> tags carried in content (no reasoning_content field) ─
+
+
+def test_inline_think_tags_final_turn_verbatim(model):
+    """Inline ``<think>…</think>`` in the final assistant ``content`` renders
+    verbatim — the renderer must not parse + reformat it."""
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": "<think>secret</think>visible"},
+        ],
+    )
+
+
+def test_inline_think_tags_history_turn(model):
+    """Parity when inline ``<think>…</think>`` sits in a pre-last-user turn."""
+    # Tags are carried in ``content``; no ``reasoning_content`` field is set.
+    history_content = "<think>secret reasoning</think>visible answer"
+    _assert_parity(
+        model,
+        [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": history_content},
+            {"role": "user", "content": "again"},
+            {"role": "assistant", "content": "second"},
+        ],
+    )
+
+
+# ── Verbatim (unstripped) content ─────────────────────────────────────
+
+
+def test_system_content_whitespace_verbatim(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "system", "content": " padded system "},
+ {"role": "user", "content": "hi"},
+ ],
+ )
+
+
+def test_user_content_whitespace_verbatim(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": " padded user "},
+ {"role": "assistant", "content": "ok"},
+ ],
+ )
+
+
+def test_assistant_content_whitespace(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "hi"},
+ {"role": "assistant", "content": " spaced answer "},
+ ],
+ )
+
+
+def test_reasoning_content_whitespace_verbatim(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "hi"},
+ {
+ "role": "assistant",
+ "reasoning_content": " padded reason ",
+ "content": "answer",
+ },
+ ],
+ )
+
+
+def test_tool_content_whitespace_verbatim(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "go"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ {"role": "tool", "content": " spaced tool result "},
+ {"role": "assistant", "content": "done"},
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_system_whitespace_with_tools(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "system", "content": " weather bot "},
+ {"role": "user", "content": "Weather?"},
+ ],
+ tools=TOOLS,
+ )
+
+
+# ── Generation prompt / thinking toggle ───────────────────────────────
+
+
+@pytest.mark.parametrize("enable_thinking", [True, False])
+def test_generation_prompt_thinking_toggle(model, enable_thinking):
+ _assert_parity(
+ model,
+ [{"role": "user", "content": "hi"}],
+ add_generation_prompt=True,
+ enable_thinking=enable_thinking,
+ )
+
+
+def test_generation_prompt_after_tool_response(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Weather in Paris?"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ {"role": "tool", "content": '{"temp": 20}'},
+ ],
+ tools=TOOLS,
+ add_generation_prompt=True,
+ )
+
+
+# ── Whole-conversation cycles, no reasoning ───────────────────────────
+
+
+def test_full_tool_cycle_no_reasoning(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "system", "content": "You are helpful."},
+ {"role": "user", "content": "Weather in Paris?"},
+ {
+ "role": "assistant",
+ "content": "Let me check.",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ }
+ ],
+ },
+ {"role": "tool", "content": '{"temp": 20, "condition": "sunny"}'},
+ {"role": "assistant", "content": "It is 20 degrees and sunny."},
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_consecutive_tool_responses(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Weather in Paris and London?"},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "Paris"},
+ }
+ },
+ {
+ "function": {
+ "name": "get_weather",
+ "arguments": {"city": "London"},
+ }
+ },
+ ],
+ },
+ {"role": "tool", "content": '{"temp": 20}'},
+ {"role": "tool", "content": '{"temp": 15}'},
+ {"role": "assistant", "content": "Paris: 20, London: 15."},
+ ],
+ tools=TOOLS,
+ )
+
+
+def test_no_system_no_tools_injects_empty_system(model):
+ _assert_parity(
+ model,
+ [
+ {"role": "user", "content": "Hello!"},
+ {"role": "assistant", "content": "Hi there!"},
+ ],
+ )
+
+
+# ── Reasoning-effort kwargs (variant-specific) ────────────────────────
+
+_EFFORT_SHAPES = [
+ # gen-prompt shape: hint rides on the (only) user message.
+ ([{"role": "user", "content": "solve it"}], {"add_generation_prompt": True}),
+ # multi-turn: hint must land on the LAST user message, not the first.
+ (
+ [
+ {"role": "user", "content": "first"},
+ {"role": "assistant", "content": "ok"},
+ {"role": "user", "content": "second"},
+ ],
+ {"add_generation_prompt": True},
+ ),
+]
+
+
+@pytest.mark.parametrize("flag", [True, False])
+@pytest.mark.parametrize(
+ "shape,extra", _EFFORT_SHAPES, ids=["gen_prompt", "multi_turn"]
+)
+def test_low_effort_kwarg(model, flag, shape, extra):
+ """``low_effort`` appends ``\\n\\n{reasoning effort: low}`` to the last user
+ message on **Super**; it's a no-op on **Nano** (its template never defines
+ it). Ultra's config has no such field, so it's skipped."""
+ if model == ULTRA:
+ pytest.skip("low_effort is a nemotron-3 (Nano/Super) kwarg")
+ _assert_parity(model, shape, low_effort=flag, **extra)
+
+
+@pytest.mark.parametrize("flag", [True, False])
+@pytest.mark.parametrize(
+ "shape,extra", _EFFORT_SHAPES, ids=["gen_prompt", "multi_turn"]
+)
+def test_medium_effort_kwarg(model, flag, shape, extra):
+ """``medium_effort`` appends ``\\n\\n{reasoning effort: efficient}`` on
+ **Ultra**. Nano/Super configs have no such field, so they're skipped."""
+ if model != ULTRA:
+ pytest.skip("medium_effort is a nemotron-3-ultra kwarg")
+ _assert_parity(model, shape, medium_effort=flag, **extra)
+
+
+def test_effort_kwarg_lives_on_the_right_variant(model):
+ """Each effort kwarg is declared only on the variant whose template defines
+ it — the discriminated union rejects the wrong combination at config load."""
+ fields = _config_cls(model).template_field_names()
+ if model == ULTRA:
+ assert "medium_effort" in fields and "low_effort" not in fields
+ else:
+ assert "low_effort" in fields and "medium_effort" not in fields
diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py
index 7716d15..a6832d7 100644
--- a/tests/test_nemotron3_ultra.py
+++ b/tests/test_nemotron3_ultra.py
@@ -1,22 +1,37 @@
-"""Offline wiring tests for the Nemotron-3 Ultra template variant.
+"""Offline wiring tests for the Nemotron-3 variant split.
-Assert the name-based ``ultra`` auto-selection, the model→renderer mapping,
-and the typed-config surface WITHOUT loading any tokenizer (no network). This
-pins the wiring the parity matrix can't reach — in particular the FP8 entry,
-which no test loads a tokenizer for — so it can't silently rot.
+Assert the model→renderer mapping, the per-variant typed-config surface, and
+the name-based ``low_effort`` gating WITHOUT loading any tokenizer (no
+network). This pins the wiring the parity matrix can't reach — in particular
+the FP8 Ultra entry, which no test loads a tokenizer for — so it can't
+silently rot.
+
+The two variants:
+
+* ``nemotron-3`` — Nano / Super, shared template. Config exposes ``low_effort``
+ (honoured on Super, a no-op on Nano).
+* ``nemotron-3-ultra`` — Ultra, distinct ``<think>`` glue. Config exposes
+ ``medium_effort``.
+
+Each variant routes to its own renderer class (``Nemotron3UltraRenderer``
+subclasses ``Nemotron3Renderer``); the ``_ultra`` class hook selects the variant.
"""
from types import SimpleNamespace
-from renderers.base import MODEL_RENDERER_MAP
-from renderers.configs import Nemotron3RendererConfig
-from renderers.nemotron3 import _ULTRA_DEFAULTS, _default_ultra
+from renderers.base import MODEL_RENDERER_MAP, RENDERER_REGISTRY, _populate_registry
+from renderers.configs import (
+ Nemotron3RendererConfig,
+ Nemotron3UltraRendererConfig,
+ _config_class_for,
+)
+from renderers.nemotron3 import Nemotron3Renderer, Nemotron3UltraRenderer, _is_super
_ULTRA_REPOS = [
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8",
]
-_NON_ULTRA_REPOS = [
+_NANO_SUPER_REPOS = [
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
]
@@ -26,34 +41,64 @@ def _fake_tok(name):
return SimpleNamespace(name_or_path=name)
-def test_ultra_and_non_ultra_models_map_to_nemotron3():
- for repo in _ULTRA_REPOS + _NON_ULTRA_REPOS:
+def test_models_map_to_their_variant():
+ for repo in _ULTRA_REPOS:
+ assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3-ultra", repo
+ for repo in _NANO_SUPER_REPOS:
assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3", repo
-def test_default_ultra_resolves_by_name():
- # Ultra checkpoints (incl. the gated FP8 repo) resolve True.
- for repo in _ULTRA_REPOS:
- assert _ULTRA_DEFAULTS[repo] is True
- assert _default_ultra(_fake_tok(repo)) is True
- # Nano / Super resolve False (the shared Nano/Super template).
- for repo in _NON_ULTRA_REPOS:
- assert _default_ultra(_fake_tok(repo)) is False
- # Unknown / fine-tuned / local-path checkpoints fall back to False;
- # those must pass an explicit ultra= if they need the Ultra template.
- assert _default_ultra(_fake_tok("acme/my-nemotron-ultra-ft")) is False
- assert _default_ultra(_fake_tok("/home/user/local-ckpt")) is False
- assert _default_ultra(SimpleNamespace()) is False # no name_or_path attr
-
-
-def test_ultra_is_not_a_template_kwarg():
- fields = Nemotron3RendererConfig.template_field_names()
- assert "ultra" not in fields
- assert fields == frozenset({"enable_thinking", "truncate_history_thinking"})
- assert "ultra" in Nemotron3RendererConfig._internal_fields
-
-
-def test_ultra_config_default_is_none_and_overridable():
- assert Nemotron3RendererConfig().ultra is None # None => auto-detect by name
- assert Nemotron3RendererConfig(ultra=True).ultra is True
- assert Nemotron3RendererConfig(ultra=False).ultra is False
+def test_each_discriminator_maps_to_its_config_and_renderer_class():
+ # Config discriminator → config class.
+ assert _config_class_for("nemotron-3") is Nemotron3RendererConfig
+ assert _config_class_for("nemotron-3-ultra") is Nemotron3UltraRendererConfig
+ # Registry → renderer class (Ultra is a sibling subclass, matching the
+ # GLM-5/5.1 and Qwen3.5/3.6 house style — not one class under two names).
+ _populate_registry()
+ assert RENDERER_REGISTRY["nemotron-3"] is Nemotron3Renderer
+ assert RENDERER_REGISTRY["nemotron-3-ultra"] is Nemotron3UltraRenderer
+ assert issubclass(Nemotron3UltraRenderer, Nemotron3Renderer)
+
+
+def test_variant_is_encoded_by_the_class():
+ # The ``<think>`` glue is selected by the class hook, not config.name —
+ # so the right renderer class must be constructed (create_renderer routes
+ # config.name → class). Default config also follows the class.
+ assert Nemotron3Renderer._ultra is False
+ assert Nemotron3UltraRenderer._ultra is True
+ assert Nemotron3Renderer._config_cls is Nemotron3RendererConfig
+ assert Nemotron3UltraRenderer._config_cls is Nemotron3UltraRendererConfig
+
+
+def test_template_fields_per_variant():
+ # ``low_effort`` lives only on the Nano/Super config; ``medium_effort``
+ # only on Ultra. Both ARE chat-template kwargs (unlike the removed ``ultra``
+ # selector), so they appear in the template-field surface.
+ assert Nemotron3RendererConfig.template_field_names() == frozenset(
+ {"enable_thinking", "truncate_history_thinking", "low_effort"}
+ )
+ assert Nemotron3UltraRendererConfig.template_field_names() == frozenset(
+ {"enable_thinking", "truncate_history_thinking", "medium_effort"}
+ )
+
+
+def test_configs_reject_the_other_variants_effort_kwarg():
+ # Discriminated-union honesty: a bad combination fails at config-load.
+ import pytest
+ from pydantic import ValidationError
+
+ with pytest.raises(ValidationError):
+ Nemotron3RendererConfig(medium_effort=True) # type: ignore[call-arg]
+ with pytest.raises(ValidationError):
+ Nemotron3UltraRendererConfig(low_effort=True) # type: ignore[call-arg]
+ # And the removed ``ultra`` selector is gone entirely.
+ with pytest.raises(ValidationError):
+ Nemotron3RendererConfig(ultra=True) # type: ignore[call-arg]
+
+
+def test_is_super_name_detection():
+ assert _is_super(_fake_tok("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"))
+ assert not _is_super(_fake_tok("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"))
+ # Unknown / local-path checkpoints default to False → low_effort no-op.
+ assert not _is_super(_fake_tok("/home/user/local-ckpt"))
+ assert not _is_super(SimpleNamespace()) # no name_or_path attr
diff --git a/tests/test_renderer_config_parity.py b/tests/test_renderer_config_parity.py
index abe47a6..d8b19a3 100644
--- a/tests/test_renderer_config_parity.py
+++ b/tests/test_renderer_config_parity.py
@@ -55,9 +55,13 @@
("moonshotai/Kimi-K2.6", "auto"),
("deepseek-ai/DeepSeek-V3", "auto"),
("deepseek-ai/DeepSeek-R1", "auto"),
+ # Nano + Super share the ``nemotron-3`` config (incl. ``low_effort``, which
+ # fires only on Super); both are exercised so the kwarg is checked where it
+ # no-ops (Nano) AND where it appends (Super).
("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"),
- # Ultra: auto-resolves to the Ultra template variant (ultra=True) via the
- # model name; parity asserted against the Ultra apply_chat_template.
+ ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"),
+ # Ultra: auto-resolves to the ``nemotron-3-ultra`` config via the model
+ # name; parity asserted against the Ultra apply_chat_template (``medium_effort``).
("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"),
("poolside/Laguna-XS.2", "auto"),
("openai/gpt-oss-20b", "gpt-oss"),
@@ -85,6 +89,12 @@
# ``truncate_history_thinking=False`` keeps reasoning on historical
# assistants instead of collapsing to ``<think></think>``.
"truncate_history_thinking": [True, False],
+ # Nemotron-3 reasoning-effort hints appended to the last user message.
+ # ``low_effort`` is a Super (``nemotron-3``) kwarg; ``medium_effort`` an
+ # Ultra (``nemotron-3-ultra``) kwarg. On the variant that doesn't define
+ # the kwarg the template — and the renderer — no-op it.
+ "low_effort": [True, False],
+ "medium_effort": [True, False],
# MiniMax-M2 — fallback persona string when no system message is
# supplied. Two arbitrary values to verify the renderer threads the
# exact bytes through (whitespace included).