From dbd796c892eb9965ddbd908389073f50649d10fa Mon Sep 17 00:00:00 2001 From: hallerite Date: Wed, 10 Jun 2026 21:45:36 +0000 Subject: [PATCH 1/2] fix(nemotron3): faithful Nano/Super/Ultra rendering + per-variant config split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the renderer byte-for-byte match apply_chat_template on branches the shared barrage didn't cover, verified against the real cached templates. Faithfulness (assistant body now mirrors the template's string algebra — assemble {content}, trim, append one separator — and is tokenized in one pass): - reason → tool-call / empty content no longer emits a stray blank line (\n, not \n\n); same for the no-tool empty-content case. - history-truncation boundary is last_user_idx (was last_plain_assistant_idx) for every variant, so in-flight tool-cycle reasoning is kept. - inline think tags in content render verbatim (no reformat). - user / system / tool / reasoning_content emitted unstripped. Variant split (low_effort / medium_effort are real per-variant Jinja kwargs): - nemotron-3 (Nano/Super): enable_thinking, truncate_history_thinking, low_effort. - nemotron-3-ultra (new discriminator): + medium_effort. - one shared Nemotron3Renderer selects the variant from config.name; drops the ultra flag, _default_ultra, and _ULTRA_DEFAULTS. _is_super kept to no-op low_effort on Nano. Bad combos now fail at config-load. BREAKING: Nemotron3RendererConfig(ultra=True) → Nemotron3UltraRendererConfig(). "auto" resolution is unaffected. Tests: new tests/test_nemotron3_parity.py (exhaustive Nano/Super/Ultra parity); effort kwargs wired into the config-parity matrix; test_nemotron3_ultra.py rewritten for the two-config wiring. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 2 +- renderers/__init__.py | 2 + renderers/base.py | 12 +- renderers/configs.py | 71 +-- renderers/nemotron3.py | 373 +++++++-------- tests/conftest.py | 3 +- tests/test_nemotron3_parity.py | 676 +++++++++++++++++++++++++++ tests/test_nemotron3_ultra.py | 118 +++-- tests/test_renderer_config_parity.py | 14 +- 9 files changed, 1003 insertions(+), 268 deletions(-) create mode 100644 tests/test_nemotron3_parity.py diff --git a/README.md b/README.md index b2c3f2f..d153163 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ next_prompt_ids = r.bridge_to_next_turn( ) ``` -Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper. +Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `nemotron-3-ultra`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper. 
## API diff --git a/renderers/__init__.py b/renderers/__init__.py index baa25db..7570f31 100644 --- a/renderers/__init__.py +++ b/renderers/__init__.py @@ -56,6 +56,7 @@ Llama3RendererConfig, MiniMaxM2RendererConfig, Nemotron3RendererConfig, + Nemotron3UltraRendererConfig, Qwen35RendererConfig, Qwen36RendererConfig, Qwen3RendererConfig, @@ -146,6 +147,7 @@ def __dir__() -> list[str]: "MultimodalRenderer", "Nemotron3Renderer", "Nemotron3RendererConfig", + "Nemotron3UltraRendererConfig", "OverlongPromptError", "ParsedResponse", "ParsedToolCall", diff --git a/renderers/base.py b/renderers/base.py index 0397b85..8fd9870 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1040,14 +1040,15 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No "moonshotai/Kimi-K2-Instruct": "kimi-k2", "moonshotai/Kimi-K2.5": "kimi-k2.5", "moonshotai/Kimi-K2.6": "kimi-k2.5", - # Nemotron 3. Nano / Super share one chat-template variant; the Ultra - # checkpoints use the Ultra variant — the renderer auto-selects it from - # the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the + # Nemotron 3. Nano / Super share one chat-template variant (``nemotron-3``); + # the Ultra checkpoints use the Ultra variant (``nemotron-3-ultra``, distinct + # ```` glue). Both route to the same Nemotron3Renderer, which selects + # the variant from the resolved config's ``name``. BF16 and FP8 share the # same tokenizer and template. "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3", "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3", - "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3", - "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3", + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3-ultra", + "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3-ultra", # Llama 3.2 (Instruct). Tested against the gated meta-llama repos and # the unrestricted unsloth/... 
mirror, which ships a byte-identical # chat template. ``Llama3Renderer`` defaults ``date_string`` to @@ -1374,6 +1375,7 @@ def _populate_registry(): "laguna-xs.2": LagunaXS2Renderer, "llama-3": Llama3Renderer, "nemotron-3": Nemotron3Renderer, + "nemotron-3-ultra": Nemotron3Renderer, "gpt-oss": GptOssRenderer, } ) diff --git a/renderers/configs.py b/renderers/configs.py index ca16e46..d500f8e 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -354,7 +354,14 @@ class MiniMaxM2RendererConfig(BaseRendererConfig): class Nemotron3RendererConfig(BaseRendererConfig): - """Nemotron 3 renderer config.""" + """Nemotron-3 **Nano / Super** renderer config. + + Nano and Super share one chat-template variant; the renderer routes both + through :class:`renderers.nemotron3.Nemotron3Renderer`. The Ultra variant + has its own template (different reasoning-block glue) and config — + :class:`Nemotron3UltraRendererConfig` — and is reached via the + ``nemotron-3-ultra`` discriminator. + """ name: Literal["nemotron-3"] = "nemotron-3" @@ -362,26 +369,6 @@ class Nemotron3RendererConfig(BaseRendererConfig): """When ``True``, the generation prompt includes ````. Mirrors the chat template's ``enable_thinking`` kwarg.""" - ultra: bool | None = None - """Select the Nemotron-3 **Ultra** chat-template variant. - - ``None`` (default) auto-detects from the model name (see - ``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve - to ``True``; Nano / Super and unknown checkpoints to ``False``. Set - explicitly to force a variant — e.g. an Ultra fine-tune or a - locally-pathed checkpoint whose ``name_or_path`` isn't in the table. 
- - Ultra's template differs from Nano/Super: the reasoning block is glued - as ``\\n{reasoning}{content}`` (no ``\\n`` around - ````), truncated historical turns collapse to - ``{content}`` (no ``\\n``), and the thinking-truncation - boundary follows the template's ``loop.index0 < last_user_idx`` rule - (drop thinking on every assistant turn before the last user message). - - Not a chat-template kwarg — it picks which template the renderer - mirrors, not a variable passed into one — so it's listed in - ``_internal_fields`` and excluded from ``template_field_names()``.""" - truncate_history_thinking: bool = True """When ``False``, keep ``{reasoning}`` on past-cycle assistant turns instead of dropping them. Mirrors the chat @@ -389,14 +376,37 @@ class Nemotron3RendererConfig(BaseRendererConfig): ``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls`` — see :class:`BaseRendererConfig` for the contract.""" - # ``ultra`` is a template-variant SELECTOR — it picks which template the - # renderer mirrors (Ultra vs Nano/Super), not a variable passed into one; - # there is no ``ultra`` Jinja variable. Marked internal so the parity - # matrix doesn't cross it as a template field. Same ``_internal_fields`` - # mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a - # different underlying reason (theirs is an ignored kwarg, this is a - # variant switch). - _internal_fields = frozenset({"ultra"}) + low_effort: bool = False + """When ``True``, append ``\\n\\n{reasoning effort: low}`` to the last user + message, nudging the model toward shorter reasoning. Mirrors the **Super** + chat template's ``low_effort`` kwarg. 
A no-op on **Nano** (its template + doesn't define it) — exactly as ``apply_chat_template`` ignores an undefined + template variable; the renderer distinguishes the two by model name (see + ``renderers.nemotron3._is_super``).""" + + +class Nemotron3UltraRendererConfig(BaseRendererConfig): + """Nemotron-3 **Ultra** renderer config — distinct discriminator so the + registry routes Ultra checkpoints to the Ultra template variant. + + Ultra's template differs from Nano/Super: the reasoning block is glued as + ``\\n{reasoning}{content}`` (no ``\\n`` around ````) + and truncated historical turns collapse to ``{content}`` + (no ``\\n``). It shares the :class:`renderers.nemotron3.Nemotron3Renderer` + implementation, which selects the variant from ``config.name``. + """ + + name: Literal["nemotron-3-ultra"] = "nemotron-3-ultra" + + enable_thinking: bool = True + """See :class:`Nemotron3RendererConfig.enable_thinking`.""" + + truncate_history_thinking: bool = True + """See :class:`Nemotron3RendererConfig.truncate_history_thinking`.""" + + medium_effort: bool = False + """When ``True``, append ``\\n\\n{reasoning effort: efficient}`` to the last + user message. 
Mirrors the Ultra chat template's ``medium_effort`` kwarg.""" class DeepSeekV3RendererConfig(BaseRendererConfig): @@ -444,6 +454,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig): Llama3RendererConfig, MiniMaxM2RendererConfig, Nemotron3RendererConfig, + Nemotron3UltraRendererConfig, DeepSeekV3RendererConfig, DeepSeekR1RendererConfig, ], @@ -480,6 +491,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig): "llama-3": Llama3RendererConfig, "minimax-m2": MiniMaxM2RendererConfig, "nemotron-3": Nemotron3RendererConfig, + "nemotron-3-ultra": Nemotron3UltraRendererConfig, "deepseek-v3": DeepSeekV3RendererConfig, "deepseek-r1": DeepSeekR1RendererConfig, } @@ -525,6 +537,7 @@ def config_from_name(name: str) -> BaseRendererConfig | None: "Llama3RendererConfig", "MiniMaxM2RendererConfig", "Nemotron3RendererConfig", + "Nemotron3UltraRendererConfig", "Qwen35RendererConfig", "Qwen36RendererConfig", "Qwen3RendererConfig", diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index b735cde..8716145 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -30,7 +30,7 @@ should_preserve_past_thinking, trim_to_turn_close, ) -from renderers.configs import Nemotron3RendererConfig +from renderers.configs import Nemotron3RendererConfig, Nemotron3UltraRendererConfig from renderers.parsing import parse_qwen35 # --------------------------------------------------------------------------- @@ -75,33 +75,25 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str] return lines -# Per-model ``ultra`` default, applied when the renderer config leaves it -# ``None``. The Nemotron-3 family ships two chat-template variants: Nano / -# Super share one; Ultra differs in the reasoning-block glue (no ``\n`` around -# ````) and the thinking-truncation boundary (drop thinking on every -# assistant turn before the last user message). BF16 and FP8 share the same -# tokenizer and template. 
Hard-coded keyed by -# ``tokenizer.name_or_path`` rather than probed from the live template — the -# same convention as Qwen3.5's ``_ENABLE_THINKING_DEFAULTS`` (avoids pulling -# ``apply_chat_template`` onto the construction hot path and keeps -# bring-your-own-tokenizer use working). -_ULTRA_DEFAULTS: dict[str, bool] = { - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": False, - "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": False, - "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": True, - "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": True, -} - - -def _default_ultra(tokenizer) -> bool: - """Hard-coded ``ultra`` default for ``tokenizer``'s model. - - Falls back to ``False`` (the Nano / Super template, and the majority of - the family) for unknown / fine-tuned checkpoints whose ``name_or_path`` - isn't in ``_ULTRA_DEFAULTS`` — pass an explicit ``ultra=True`` for an - Ultra fine-tune or a locally-pathed Ultra checkpoint. +# The Nemotron-3 family ships two chat-template variants. Nano / Super share +# one (config ``name="nemotron-3"``); Ultra differs in the reasoning-block glue +# — no ``\n`` around ```` — and gets its own discriminator +# (``name="nemotron-3-ultra"``). Which variant a checkpoint uses is carried by +# ``MODEL_RENDERER_MAP`` (and hence the resolved config's ``name``), so the +# renderer reads it off ``config.name`` rather than probing the live template. +_ULTRA_CONFIG_NAME = "nemotron-3-ultra" + + +def _is_super(tokenizer) -> bool: + """Does this checkpoint use the **Super** flavour of the shared Nano/Super + template — i.e. the one whose Jinja defines the ``low_effort`` kwarg? + + Nano and Super share one config (``nemotron-3``), so the model name is the + only signal that separates them. Detected by substring; unknown / fine-tuned + checkpoints default to ``False`` so ``low_effort`` is a no-op there — + matching how the Nano template silently ignores it. 
""" - return _ULTRA_DEFAULTS.get(getattr(tokenizer, "name_or_path", ""), False) + return "super" in (getattr(tokenizer, "name_or_path", "") or "").lower() class Nemotron3Renderer: @@ -110,17 +102,31 @@ class Nemotron3Renderer: def __init__( self, tokenizer: PreTrainedTokenizer, - config: Nemotron3RendererConfig | None = None, + config: Nemotron3RendererConfig | Nemotron3UltraRendererConfig | None = None, ): self._tokenizer = tokenizer cfg = config or Nemotron3RendererConfig() - # ``ultra=None`` defers to the model's known default (see - # ``_ULTRA_DEFAULTS``). Materialise here so downstream reads see a - # concrete bool; rebind the frozen config with the resolved value so - # introspection sees the same. - if cfg.ultra is None: - cfg = cfg.model_copy(update={"ultra": _default_ultra(tokenizer)}) self.config = cfg + # The Ultra variant is selected by the config discriminator + # (``name="nemotron-3-ultra"``), not a flag — one renderer class serves + # both, switching glue off ``self._ultra``. + self._ultra = cfg.name == _ULTRA_CONFIG_NAME + + # Resolve the per-variant reasoning-effort hint appended to the last + # user message. Ultra honours ``medium_effort``; Super honours + # ``low_effort``; Nano honours neither. The non-matching kwarg is + # silently ignored (empty hint), exactly as ``apply_chat_template`` + # ignores a template variable the variant's Jinja never defines. + if self._ultra: + self._effort_hint = ( + "\n\n{reasoning effort: efficient}" + if getattr(cfg, "medium_effort", False) + else "" + ) + elif getattr(cfg, "low_effort", False) and _is_super(tokenizer): + self._effort_hint = "\n\n{reasoning effort: low}" + else: + self._effort_hint = "" # Look up special token IDs from the tokenizer (not hardcoded). 
# <|endoftext|> is optional: Nemotron-3 Nano / Super tokenizers ship @@ -321,9 +327,12 @@ def emit_text_segments( emit_special(self._im_start, sys_idx, is_sampled=False, is_content=False) - # Build system content: user's system text first, then tools + # Build system content: user's system text first, then tools. + # The template emits ``system_message`` verbatim (no trim) and + # gates the ``\n\n`` separator on its raw length, so keep the + # caller's content unstripped. if first_is_system: - sys_content = self._render_content(messages[0].get("content")).strip() + sys_content = self._render_content(messages[0].get("content")) else: sys_content = "" @@ -351,7 +360,7 @@ def emit_text_segments( elif first_is_system: sys_idx = orig_idx(0) - sys_content = self._render_content(messages[0].get("content")).strip() + sys_content = self._render_content(messages[0].get("content")) emit_special(self._im_start, sys_idx, is_sampled=False, is_content=False) sys_segments2: list[tuple[str, bool]] = [("system\n", False)] if sys_content: @@ -360,22 +369,13 @@ def emit_text_segments( emit_special(self._im_end, sys_idx, is_sampled=False, is_content=False) emit_text("\n", sys_idx, is_sampled=False, is_content=False) - # Track the most-recent plain (non-tool-call) assistant so we can - # preserve its reasoning while stripping reasoning from earlier - # assistants — the Nemotron-3 template matches this pattern. - last_plain_assistant_idx = -1 - for j in range(len(messages) - 1, -1, -1): - if messages[j].get("role") == "assistant" and not messages[j].get( - "tool_calls" - ): - last_plain_assistant_idx = j - break - - # Ultra truncates thinking on every assistant turn *before the last - # user message* (template rule ``loop.index0 < last_user_idx``), - # whereas Nano/Super preserve only the last plain assistant. Compute - # the last-user index over the normalized ``messages`` list (a leading - # system never holds a user, so the relative comparison is unaffected). 
+ # All Nemotron-3 variants (Nano / Super / Ultra) truncate historical + # thinking on every assistant turn *before the last user message* — + # the template rule ``truncate_history_thinking and loop.index0 < + # last_user_idx`` is byte-identical across the three chat templates. + # Compute the last-user index over the normalized ``messages`` list (a + # leading system never holds a user, so the relative comparison is + # unaffected). last_user_idx_norm = -1 for j in range(len(messages) - 1, -1, -1): if messages[j].get("role") == "user": @@ -385,7 +385,10 @@ def emit_text_segments( # ── 2. Iterate messages ───────────────────────────────────── for i, msg in enumerate(messages): role = msg["role"] - content = self._render_content(msg.get("content")).strip() + # Keep content unstripped: the template emits user / system / tool + # content verbatim, and assistant trimming happens inside + # ``_assistant_body`` exactly where the template applies it. + content = self._render_content(msg.get("content")) msg_orig_idx = orig_idx(i) if role == "system": @@ -400,6 +403,12 @@ def emit_text_segments( user_segments: list[tuple[str, bool]] = [("user\n", False)] if content: user_segments.append((content, True)) + # Reasoning-effort hint rides on the LAST user message only, + # glued to the content so BPE sees them as one chunk (matching + # the template's ``content + '\n\n{reasoning effort: …}'``). It + # is template scaffold, not caller content → is_content=False. 
+ if self._effort_hint and i == last_user_idx_norm: + user_segments.append((self._effort_hint, False)) emit_text_segments(user_segments, msg_orig_idx, is_sampled=False) emit_special( self._im_end, msg_orig_idx, is_sampled=False, is_content=False @@ -407,26 +416,29 @@ def emit_text_segments( emit_text("\n", msg_orig_idx, is_sampled=False, is_content=False) elif role == "assistant": - if self.config.ultra: - is_last_turn = i >= last_user_idx_norm - else: - is_last_turn = i >= last_plain_assistant_idx + # Template: ``include_content = not (truncate_history_thinking + # and loop.index0 < last_user_idx)``. The renderer-internal + # preserve_* overrides only ever *extend* retention, so OR them + # in (a preserved turn keeps its thinking even when the + # template default would drop it). preserve_thinking = msg_orig_idx >= 0 and should_preserve_past_thinking( original_messages, msg_orig_idx, preserve_all_thinking=self.config.preserve_all_thinking, preserve_thinking_between_tool_calls=self.config.preserve_thinking_between_tool_calls, ) + include_content = ( + not self.config.truncate_history_thinking + or i >= last_user_idx_norm + or preserve_thinking + ) self._render_assistant( msg, msg_orig_idx, content, - is_last_turn=is_last_turn, - preserve_thinking=preserve_thinking, + include_content=include_content, emit_special=emit_special, emit_text=emit_text, - emit_ids=emit_ids, - emit_text_segments=emit_text_segments, ) elif role == "tool": @@ -516,6 +528,11 @@ def bridge_to_next_turn( not previous_prompt_ids or not new_messages or reject_assistant_in_extension(new_messages) + # An active effort hint rides on the *last* user message. Appending + # a new turn can move which user is last, which would strand the + # hint on the frozen previous prompt — the append-only bridge can't + # rewrite it. Bail so the caller does a full, correct re-render. 
+ or self._effort_hint ): return None @@ -585,7 +602,9 @@ def emit_text_segments( for i, msg in enumerate(new_messages): role = msg.get("role") - content = self._render_content(msg.get("content")).strip() + # Unstripped — the template emits user / system / tool content + # verbatim (see :meth:`render`). + content = self._render_content(msg.get("content")) if role == "user": emit_special(self._im_start, i) user_segments: list[tuple[str, bool]] = [("user\n", False)] @@ -646,29 +665,10 @@ def _render_assistant( msg_idx: int, content: str, *, - is_last_turn: bool, - preserve_thinking: bool = False, + include_content: bool, emit_special, emit_text, - emit_ids, - emit_text_segments, ) -> None: - # Extract reasoning_content - reasoning_content = "" - if isinstance(msg.get("reasoning_content"), str): - reasoning_content = msg["reasoning_content"] - elif "" in content: - before_think_end, after_think_end = content.split("", 1) - if "" in before_think_end: - reasoning_content = before_think_end.split("")[-1].lstrip("\n") - else: - reasoning_content = before_think_end.lstrip("\n") - reasoning_content = reasoning_content.rstrip("\n") - content = after_think_end.lstrip("\n") - - reasoning_content = reasoning_content.strip() - ultra = self.config.ultra - # ``<|im_start|>assistant\n`` is template-injected scaffolding — # at inference the chat template emits these as the generation # prompt and the model never samples them. Marking the role tag @@ -678,123 +678,108 @@ def _render_assistant( emit_special(self._im_start, msg_idx, is_sampled=False, is_content=False) emit_text("assistant\n", msg_idx, is_sampled=False, is_content=False) - # Nemotron 3 keeps reasoning on the most-recent plain assistant but - # strips it from historical turns, which collapse to an empty - # block. Empty is also emitted when - # the turn has no reasoning at all. 
The trailing ``\n`` (when - # tool_calls follow) is glued to ``content`` in a single emit_text - # so BPE sees ``content\n`` as one chunk, matching how - # apply_chat_template tokenises the concatenated template string. - tool_calls = msg.get("tool_calls") or [] - # A \n is always required between the text/think block and the first - # , whether the content is empty or not. - content_suffix = "\n" if tool_calls else "" - - if reasoning_content and ( - is_last_turn - or preserve_thinking - or not self.config.truncate_history_thinking - ): - emit_special(self._think, msg_idx, is_sampled=True, is_content=True) - # Ultra: \n{reasoning}{content} (no \n around ). - # Nano/Super: \n{reasoning}\n\n{content}. - emit_text( - ("\n" + reasoning_content) - if ultra - else ("\n" + reasoning_content + "\n"), - msg_idx, - is_sampled=True, - is_content=True, - ) - emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True) - # Single \n separator (not \n\n like Qwen3.5); Ultra glues directly. - emit_text( - (content + content_suffix) - if ultra - else ("\n" + content + content_suffix), - msg_idx, - is_sampled=True, - is_content=True, - ) - elif reasoning_content: - # Historical assistant whose reasoning got stripped. Nano/Super keep - # a single \n between the collapsed and the content - # as a marker that reasoning existed; Ultra glues content directly. - emit_special(self._think, msg_idx, is_sampled=True, is_content=True) - emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True) - emit_text( - (content + content_suffix) - if ultra - else ("\n" + content + content_suffix), - msg_idx, - is_sampled=True, - is_content=True, - ) + # Build the body (everything between ``assistant\n`` and ``<|im_end|>``) + # as a single string mirroring the chat template's own string algebra, + # then tokenise it in one pass. 
The ```` / ```` / + # ```` / ```` markers are added tokens, so the + # tokenizer isolates them — encoding the assembled body yields the same + # ids as ``apply_chat_template`` (which likewise encodes a rendered + # string). The whole body is sampled content; ``<|im_end|>`` is the + # model's stop signal (sampled), and the inter-turn ``\n`` is not. + body = self._assistant_body(msg, content, include_content=include_content) + if body: + emit_text(body, msg_idx, is_sampled=True, is_content=True) + emit_special(self._im_end, msg_idx, is_sampled=True, is_content=True) + emit_text("\n", msg_idx, is_sampled=False, is_content=False) + + def _assistant_body( + self, msg: Message, raw_content: str, *, include_content: bool + ) -> str: + """Assemble the assistant body string exactly as the chat template. + + ``include_content`` is the template's ``not (truncate_history_thinking + and loop.index0 < last_user_idx)`` (already OR-ed with the preserve_* + overrides by the caller): ``True`` keeps the full think+content block, + ``False`` collapses historical thinking to an empty ````. + """ + ultra = self._ultra + + # 1. Assemble ``content`` — wrap a ``reasoning_content`` field in + # tags (raw, not stripped: interior whitespace is part of + # the reasoning), else prepend an empty only when + # the content carries no inline think tags of its own (which are + # passed through verbatim, like the template). + reasoning = msg.get("reasoning_content") + if isinstance(reasoning, str) and reasoning.strip(): + if ultra: + content = "\n" + reasoning + "" + raw_content + else: + content = "\n" + reasoning + "\n\n" + raw_content else: - # No reasoning ever — glued directly to content. 
- emit_special(self._think, msg_idx, is_sampled=True, is_content=True) - emit_special(self._think_end, msg_idx, is_sampled=True, is_content=True) - emit_text( - content + content_suffix, - msg_idx, - is_sampled=True, - is_content=True, - ) + content = raw_content + if "" not in content and "" not in content: + content = "" + content + + tool_calls = msg.get("tool_calls") or [] - # Tool calls (leading \n was glued to the content above; each - # iteration's trailing \n after handles the - # separator to the next block). if tool_calls: + parts: list[str] = [] + if content.strip(): + if include_content: + parts.append(content.strip() + "\n") + else: + # Drop historical thinking: keep only what follows the last + # (or precedes a dangling ), then re-stamp + # an empty block. Nano/Super trim the remainder; Ultra glues + # it raw (its template omits the trailing ``| trim``). + c = content + if "" in c: + c = c.split("")[-1] + elif "" in c: + c = c.split("")[0] + c = "" + (c if ultra else c.strip()) + if c: + parts.append(c + "\n") + else: + # Non-string / empty content: bare collapsed think block, no \n. + parts.append("") for tc in tool_calls: - func = tc.get("function") or tc - name = func.get("name", "") - arguments = func.get("arguments", {}) - - emit_special(self._tool_call, msg_idx, is_sampled=True, is_content=True) - emit_text( - "\n\n", - msg_idx, - is_sampled=True, - is_content=True, - ) + parts.append(self._format_tool_call(tc)) + return "".join(parts) - # Render arguments - # OpenAI canonical form: arguments is a JSON string. Parse it so the - # per-argument rendering below still works. 
- if isinstance(arguments, str): - try: - arguments = json.loads(arguments) - except json.JSONDecodeError: - arguments = {} - if isinstance(arguments, dict): - for arg_name, arg_value in arguments.items(): - if isinstance(arg_value, (dict, list)): - value_str = json.dumps(arg_value, ensure_ascii=False) - else: - value_str = str(arg_value) - emit_text( - "\n" - + value_str - + "\n\n", - msg_idx, - is_sampled=True, - is_content=True, - ) - - emit_text("\n", msg_idx, is_sampled=True, is_content=True) - emit_special( - self._tool_call_end, msg_idx, is_sampled=True, is_content=True - ) - # Trailing \n after (Nemotron 3 specific) - emit_text("\n", msg_idx, is_sampled=True, is_content=True) + # No tool calls. + if include_content: + return content.strip() + c = content + if "" in c and "" in c: + c = "" + c.split("")[-1] + return c.strip() - # ``<|im_end|>`` is the model's stop signal — it samples this to - # end its turn, so it is part of the sampled stream. The trailing - # ``\n`` is template-appended between turns and never sampled. - emit_special(self._im_end, msg_idx, is_sampled=True, is_content=True) - emit_text("\n", msg_idx, is_sampled=False, is_content=False) + @staticmethod + def _format_tool_call(tc: dict[str, Any]) -> str: + """Render one tool call as ``\\n`` XML.""" + func = tc.get("function") or tc + name = func.get("name", "") + arguments = func.get("arguments", {}) + # OpenAI canonical form: arguments is a JSON string. Parse it so the + # per-argument rendering below still works. 
+ if isinstance(arguments, str): + try: + arguments = json.loads(arguments) + except json.JSONDecodeError: + arguments = {} + parts = ["\n\n"] + if isinstance(arguments, dict): + for arg_name, arg_value in arguments.items(): + if isinstance(arg_value, (dict, list)): + value_str = json.dumps(arg_value, ensure_ascii=False) + else: + value_str = str(arg_value) + parts.append( + "\n" + value_str + "\n\n" + ) + parts.append("\n\n") + return "".join(parts) # ------------------------------------------------------------------ # Tool message rendering diff --git a/tests/conftest.py b/tests/conftest.py index d62d600..c3bfeed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,8 @@ ("moonshotai/Kimi-K2.6", "auto"), ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"), ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"), - # Ultra resolves the Ultra template variant via name (auto → ultra=True). + # Ultra resolves to the `nemotron-3-ultra` config variant via the model + # name (auto → MODEL_RENDERER_MAP → nemotron-3-ultra). ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"), ("poolside/Laguna-XS.2", "auto"), # DeepSeek-V3/R1 are intentionally NOT in this shared barrage: their diff --git a/tests/test_nemotron3_parity.py b/tests/test_nemotron3_parity.py new file mode 100644 index 0000000..01c521b --- /dev/null +++ b/tests/test_nemotron3_parity.py @@ -0,0 +1,676 @@ +"""Exhaustive token-for-token parity for the Nemotron-3 renderer. + +The shared barrage in ``test_render_ids.py`` covers the common message +shapes against every model. 
This file pins the Nemotron-3-specific template +branches that the shared matrix can't reach — they'd fail on other models or +exercise behaviour unique to the Nemotron-3 chat template: + +* reasoning + empty / ``None`` content with and without tool calls (the + template trims the assembled ``{content}`` block and appends + exactly one separator — a stray ``\\n`` here is the most common agentic + regression); +* the historical-thinking truncation boundary, which is ``loop.index0 < + last_user_idx`` in **all three** variants (Nano / Super / Ultra) — so an + in-flight tool cycle (assistant turns after the last user message) keeps its + reasoning by default; +* inline ```` tags carried in ``content`` rendering verbatim + (the template only reformats reasoning supplied via ``reasoning_content``); +* verbatim (unstripped) user / system / tool content and ``reasoning_content``; +* the ``enable_thinking`` / ``truncate_history_thinking`` template kwargs; +* the per-variant reasoning-effort kwargs: ``low_effort`` (Super) and + ``medium_effort`` (Ultra), each a no-op on the variants that don't define it. + +Every assertion compares ``renderer.render_ids(...)`` to +``tokenizer.apply_chat_template(..., tokenize=True)`` — a pass means the +renderer is byte-for-byte faithful for that case. Tokenizers are loaded from +the local HF cache (offline); no network. + +The variants split across two configs: ``nemotron-3`` (Nano / Super, with +``low_effort``) and ``nemotron-3-ultra`` (Ultra, with ``medium_effort``). The +helper resolves the right config class per model from ``MODEL_RENDERER_MAP``. +""" + +from __future__ import annotations + +from functools import lru_cache + +import pytest + +from renderers import create_renderer +from renderers.base import MODEL_RENDERER_MAP, load_tokenizer +from renderers.configs import _config_class_for + +# BF16 / FP8 share a tokenizer; only the BF16 checkpoints are cached for tests. 
+NANO = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" +SUPER = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +ULTRA = "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16" +MODELS = [NANO, SUPER, ULTRA] + + +@lru_cache +def _tok(model: str): + return load_tokenizer(model) + + +def _config_cls(model: str): + """The typed-config class the model resolves to (``nemotron-3`` for + Nano/Super, ``nemotron-3-ultra`` for Ultra).""" + return _config_class_for(MODEL_RENDERER_MAP[model]) + + +def _renderer(model: str, **flags): + # Build with the model's own variant config so the renderer picks the right + # ```` glue (and only valid kwargs are accepted). + return create_renderer(_tok(model), _config_cls(model)(**flags)) + + +def _expected( + model: str, messages, *, tools=None, add_generation_prompt=False, **kwargs +): + out = _tok(model).apply_chat_template( + messages, + tools=tools, + tokenize=True, + return_dict=False, + add_generation_prompt=add_generation_prompt, + **kwargs, + ) + if isinstance(out, str): # some tokenizers return str even with tokenize=True + return list(_tok(model).encode(out, add_special_tokens=False)) + return list(out) + + +def _assert_parity( + model, messages, *, tools=None, add_generation_prompt=False, **template_kwargs +): + """Renderer ids == apply_chat_template ids for ``model``. + + ``template_kwargs`` (e.g. ``enable_thinking``, ``truncate_history_thinking``) + are forwarded to both the renderer config and ``apply_chat_template`` so the + two sides stay aligned. 
+ """ + renderer = _renderer(model, **template_kwargs) + got = renderer.render_ids( + messages, tools=tools, add_generation_prompt=add_generation_prompt + ) + exp = _expected( + model, + messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + **template_kwargs, + ) + assert got == exp, ( + f"{model}: render_ids diverged from apply_chat_template\n" + f" exp …{_tok(model).decode(exp[-40:])!r}\n" + f" got …{_tok(model).decode(got[-40:])!r}" + ) + + +pytestmark = pytest.mark.parametrize("model", MODELS, ids=["nano", "super", "ultra"]) + + +TOOLS = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "The city name"} + }, + "required": ["city"], + }, + }, + } +] + + +# ── Reasoning + tool calls: the trim / separator boundary ───────────── + + +def test_reasoning_empty_content_tool_call(model): + """reason → tool call, no prose. 
Must be ``\\n`` (one + newline), not ``\\n\\n``.""" + _assert_parity( + model, + [ + {"role": "user", "content": "Weather in Paris?"}, + { + "role": "assistant", + "reasoning_content": "I should call the weather tool.", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + ], + tools=TOOLS, + ) + + +def test_reasoning_none_content_tool_call(model): + _assert_parity( + model, + [ + {"role": "user", "content": "Weather in Paris?"}, + { + "role": "assistant", + "reasoning_content": "Call the tool.", + "content": None, + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + ], + tools=TOOLS, + ) + + +def test_reasoning_content_tool_call(model): + _assert_parity( + model, + [ + {"role": "user", "content": "Weather in Paris?"}, + { + "role": "assistant", + "reasoning_content": "Think first.", + "content": "Let me check.", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + ], + tools=TOOLS, + ) + + +def test_reasoning_empty_content_no_tool_call(model): + """reason → empty answer, no tool call: ```` glued to ``<|im_end|>``.""" + _assert_parity( + model, + [ + {"role": "user", "content": "hi"}, + {"role": "assistant", "reasoning_content": "thinking", "content": ""}, + ], + ) + + +def test_multiple_tool_calls_with_reasoning(model): + _assert_parity( + model, + [ + {"role": "user", "content": "Weather in Paris and London?"}, + { + "role": "assistant", + "reasoning_content": "Two cities — two calls.", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + }, + { + "function": { + "name": "get_weather", + "arguments": {"city": "London"}, + } + }, + ], + }, + ], + tools=TOOLS, + ) + + +def test_tool_call_with_nested_object_args(model): + _assert_parity( + model, + [ + {"role": "user", "content": "go"}, + { + 
"role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": { + "city": "Paris", + "opts": {"unit": "c", "days": [1, 2]}, + }, + } + } + ], + }, + ], + tools=TOOLS, + ) + + +# ── Historical-thinking truncation boundary (last_user_idx) ─────────── + + +def test_inflight_tool_cycle_keeps_reasoning(model): + """Assistant turns after the last user message (the in-flight tool cycle) + keep their reasoning by default — boundary is ``loop.index0 < + last_user_idx`` in every variant.""" + _assert_parity( + model, + [ + {"role": "user", "content": "Weather in Paris?"}, + { + "role": "assistant", + "reasoning_content": "Call the tool first.", + "content": "calling", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + {"role": "tool", "content": '{"temp": 20}'}, + { + "role": "assistant", + "reasoning_content": "Now I can answer.", + "content": "It is 20 degrees.", + }, + ], + tools=TOOLS, + ) + + +def test_history_truncation_drops_older_reasoning(model): + """A reasoning turn before the last user message is collapsed to an empty + think block (tool-call branch trims the remainder on Nano/Super).""" + _assert_parity( + model, + [ + {"role": "user", "content": "Q1"}, + { + "role": "assistant", + "reasoning_content": "reasoning before tool", + "content": "calling", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + {"role": "tool", "content": '{"temp": 20}'}, + {"role": "assistant", "reasoning_content": "after", "content": "Done."}, + {"role": "user", "content": "Q2"}, + {"role": "assistant", "reasoning_content": "final", "content": "A2"}, + ], + tools=TOOLS, + ) + + +def test_two_block_tool_conversation(model): + _assert_parity( + model, + [ + {"role": "system", "content": "be brief"}, + {"role": "user", "content": "first"}, + { + "role": "assistant", + "reasoning_content": "R2", + "content": 
"calling.", + "tool_calls": [ + {"function": {"name": "get_weather", "arguments": {"city": "a"}}} + ], + }, + {"role": "tool", "content": "result-a"}, + {"role": "assistant", "reasoning_content": "R4", "content": "answer-1"}, + {"role": "user", "content": "second"}, + { + "role": "assistant", + "reasoning_content": "R6", + "content": "calling.", + "tool_calls": [ + {"function": {"name": "get_weather", "arguments": {"city": "b"}}} + ], + }, + {"role": "tool", "content": "result-b"}, + {"role": "assistant", "reasoning_content": "R8", "content": "answer-2"}, + ], + tools=TOOLS, + ) + + +def test_plain_multi_turn_reasoning_truncation(model): + _assert_parity( + model, + [ + {"role": "user", "content": "Q1"}, + { + "role": "assistant", + "reasoning_content": "long reasoning one", + "content": "A1", + }, + {"role": "user", "content": "Q2"}, + { + "role": "assistant", + "reasoning_content": "long reasoning two", + "content": "A2", + }, + ], + ) + + +@pytest.mark.parametrize("truncate", [True, False]) +def test_truncate_history_thinking_kwarg(model, truncate): + """``truncate_history_thinking=False`` keeps reasoning on every past turn.""" + _assert_parity( + model, + [ + {"role": "user", "content": "Q1"}, + { + "role": "assistant", + "reasoning_content": "first reasoning", + "content": "A1", + }, + {"role": "user", "content": "Q2"}, + { + "role": "assistant", + "reasoning_content": "second reasoning", + "content": "A2", + }, + ], + truncate_history_thinking=truncate, + ) + + +# ── Inline tags carried in content (no reasoning_content field) ─ + + +def test_inline_think_tags_final_turn_verbatim(model): + """Inline ```` in the final assistant ``content`` renders + verbatim — the renderer must not parse + reformat it.""" + _assert_parity( + model, + [ + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": "secretvisible"}, + ], + ) + + +def test_inline_think_tags_history_turn(model): + _assert_parity( + model, + [ + {"role": "user", "content": "hi"}, + { + 
"role": "assistant", + "content": "secret reasoningvisible answer", + }, + {"role": "user", "content": "again"}, + {"role": "assistant", "content": "second"}, + ], + ) + + +# ── Verbatim (unstripped) content ───────────────────────────────────── + + +def test_system_content_whitespace_verbatim(model): + _assert_parity( + model, + [ + {"role": "system", "content": " padded system "}, + {"role": "user", "content": "hi"}, + ], + ) + + +def test_user_content_whitespace_verbatim(model): + _assert_parity( + model, + [ + {"role": "user", "content": " padded user "}, + {"role": "assistant", "content": "ok"}, + ], + ) + + +def test_assistant_content_whitespace(model): + _assert_parity( + model, + [ + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": " spaced answer "}, + ], + ) + + +def test_reasoning_content_whitespace_verbatim(model): + _assert_parity( + model, + [ + {"role": "user", "content": "hi"}, + { + "role": "assistant", + "reasoning_content": " padded reason ", + "content": "answer", + }, + ], + ) + + +def test_tool_content_whitespace_verbatim(model): + _assert_parity( + model, + [ + {"role": "user", "content": "go"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + {"role": "tool", "content": " spaced tool result "}, + {"role": "assistant", "content": "done"}, + ], + tools=TOOLS, + ) + + +def test_system_whitespace_with_tools(model): + _assert_parity( + model, + [ + {"role": "system", "content": " weather bot "}, + {"role": "user", "content": "Weather?"}, + ], + tools=TOOLS, + ) + + +# ── Generation prompt / thinking toggle ─────────────────────────────── + + +@pytest.mark.parametrize("enable_thinking", [True, False]) +def test_generation_prompt_thinking_toggle(model, enable_thinking): + _assert_parity( + model, + [{"role": "user", "content": "hi"}], + add_generation_prompt=True, + enable_thinking=enable_thinking, + ) + + +def 
test_generation_prompt_after_tool_response(model): + _assert_parity( + model, + [ + {"role": "user", "content": "Weather in Paris?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + {"role": "tool", "content": '{"temp": 20}'}, + ], + tools=TOOLS, + add_generation_prompt=True, + ) + + +# ── Whole-conversation cycles, no reasoning ─────────────────────────── + + +def test_full_tool_cycle_no_reasoning(model): + _assert_parity( + model, + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Weather in Paris?"}, + { + "role": "assistant", + "content": "Let me check.", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + } + ], + }, + {"role": "tool", "content": '{"temp": 20, "condition": "sunny"}'}, + {"role": "assistant", "content": "It is 20 degrees and sunny."}, + ], + tools=TOOLS, + ) + + +def test_consecutive_tool_responses(model): + _assert_parity( + model, + [ + {"role": "user", "content": "Weather in Paris and London?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + } + }, + { + "function": { + "name": "get_weather", + "arguments": {"city": "London"}, + } + }, + ], + }, + {"role": "tool", "content": '{"temp": 20}'}, + {"role": "tool", "content": '{"temp": 15}'}, + {"role": "assistant", "content": "Paris: 20, London: 15."}, + ], + tools=TOOLS, + ) + + +def test_no_system_no_tools_injects_empty_system(model): + _assert_parity( + model, + [ + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Hi there!"}, + ], + ) + + +# ── Reasoning-effort kwargs (variant-specific) ──────────────────────── + +_EFFORT_SHAPES = [ + # gen-prompt shape: hint rides on the (only) user message. 
+ ([{"role": "user", "content": "solve it"}], {"add_generation_prompt": True}), + # multi-turn: hint must land on the LAST user message, not the first. + ( + [ + {"role": "user", "content": "first"}, + {"role": "assistant", "content": "ok"}, + {"role": "user", "content": "second"}, + ], + {"add_generation_prompt": True}, + ), +] + + +@pytest.mark.parametrize("flag", [True, False]) +@pytest.mark.parametrize( + "shape,extra", _EFFORT_SHAPES, ids=["gen_prompt", "multi_turn"] +) +def test_low_effort_kwarg(model, flag, shape, extra): + """``low_effort`` appends ``\\n\\n{reasoning effort: low}`` to the last user + message on **Super**; it's a no-op on **Nano** (its template never defines + it). Ultra's config has no such field, so it's skipped.""" + if model == ULTRA: + pytest.skip("low_effort is a nemotron-3 (Nano/Super) kwarg") + _assert_parity(model, shape, low_effort=flag, **extra) + + +@pytest.mark.parametrize("flag", [True, False]) +@pytest.mark.parametrize( + "shape,extra", _EFFORT_SHAPES, ids=["gen_prompt", "multi_turn"] +) +def test_medium_effort_kwarg(model, flag, shape, extra): + """``medium_effort`` appends ``\\n\\n{reasoning effort: efficient}`` on + **Ultra**. 
Nano/Super configs have no such field, so they're skipped.""" + if model != ULTRA: + pytest.skip("medium_effort is a nemotron-3-ultra kwarg") + _assert_parity(model, shape, medium_effort=flag, **extra) + + +def test_effort_kwarg_lives_on_the_right_variant(model): + """Each effort kwarg is declared only on the variant whose template defines + it — the discriminated union rejects the wrong combination at config load.""" + fields = _config_cls(model).template_field_names() + if model == ULTRA: + assert "medium_effort" in fields and "low_effort" not in fields + else: + assert "low_effort" in fields and "medium_effort" not in fields diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py index 7716d15..64c3cdb 100644 --- a/tests/test_nemotron3_ultra.py +++ b/tests/test_nemotron3_ultra.py @@ -1,22 +1,37 @@ -"""Offline wiring tests for the Nemotron-3 Ultra template variant. +"""Offline wiring tests for the Nemotron-3 variant split. -Assert the name-based ``ultra`` auto-selection, the model→renderer mapping, -and the typed-config surface WITHOUT loading any tokenizer (no network). This -pins the wiring the parity matrix can't reach — in particular the FP8 entry, -which no test loads a tokenizer for — so it can't silently rot. +Assert the model→renderer mapping, the per-variant typed-config surface, and +the name-based ``low_effort`` gating WITHOUT loading any tokenizer (no +network). This pins the wiring the parity matrix can't reach — in particular +the FP8 Ultra entry, which no test loads a tokenizer for — so it can't +silently rot. + +The two variants: + +* ``nemotron-3`` — Nano / Super, shared template. Config exposes ``low_effort`` + (honoured on Super, a no-op on Nano). +* ``nemotron-3-ultra`` — Ultra, distinct ```` glue. Config exposes + ``medium_effort``. + +Both route to the one ``Nemotron3Renderer`` class, which selects the variant +from ``config.name``. 
""" from types import SimpleNamespace from renderers.base import MODEL_RENDERER_MAP -from renderers.configs import Nemotron3RendererConfig -from renderers.nemotron3 import _ULTRA_DEFAULTS, _default_ultra +from renderers.configs import ( + Nemotron3RendererConfig, + Nemotron3UltraRendererConfig, + _config_class_for, +) +from renderers.nemotron3 import Nemotron3Renderer, _is_super _ULTRA_REPOS = [ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8", ] -_NON_ULTRA_REPOS = [ +_NANO_SUPER_REPOS = [ "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", ] @@ -26,34 +41,65 @@ def _fake_tok(name): return SimpleNamespace(name_or_path=name) -def test_ultra_and_non_ultra_models_map_to_nemotron3(): - for repo in _ULTRA_REPOS + _NON_ULTRA_REPOS: +def test_models_map_to_their_variant(): + for repo in _ULTRA_REPOS: + assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3-ultra", repo + for repo in _NANO_SUPER_REPOS: assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3", repo -def test_default_ultra_resolves_by_name(): - # Ultra checkpoints (incl. the gated FP8 repo) resolve True. - for repo in _ULTRA_REPOS: - assert _ULTRA_DEFAULTS[repo] is True - assert _default_ultra(_fake_tok(repo)) is True - # Nano / Super resolve False (the shared Nano/Super template). - for repo in _NON_ULTRA_REPOS: - assert _default_ultra(_fake_tok(repo)) is False - # Unknown / fine-tuned / local-path checkpoints fall back to False; - # those must pass an explicit ultra= if they need the Ultra template. 
- assert _default_ultra(_fake_tok("acme/my-nemotron-ultra-ft")) is False - assert _default_ultra(_fake_tok("/home/user/local-ckpt")) is False - assert _default_ultra(SimpleNamespace()) is False # no name_or_path attr - - -def test_ultra_is_not_a_template_kwarg(): - fields = Nemotron3RendererConfig.template_field_names() - assert "ultra" not in fields - assert fields == frozenset({"enable_thinking", "truncate_history_thinking"}) - assert "ultra" in Nemotron3RendererConfig._internal_fields - - -def test_ultra_config_default_is_none_and_overridable(): - assert Nemotron3RendererConfig().ultra is None # None => auto-detect by name - assert Nemotron3RendererConfig(ultra=True).ultra is True - assert Nemotron3RendererConfig(ultra=False).ultra is False +def test_both_variants_resolve_to_one_renderer_class(): + # The registry routes both discriminators to the shared renderer class. + assert _config_class_for("nemotron-3") is Nemotron3RendererConfig + assert _config_class_for("nemotron-3-ultra") is Nemotron3UltraRendererConfig + + +def test_renderer_reads_variant_from_config_name(): + # No tokenizer needed for the ``_ultra`` flag — it comes off config.name. + # Build with a fake tokenizer that has the special tokens stubbed out. + class _Tok: + name_or_path = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" + unk_token_id = -1 + + def convert_tokens_to_ids(self, tok): + # Deterministic non-unk ids so construction succeeds offline. + return abs(hash(tok)) % 100_000 + 1 + + nano = Nemotron3Renderer(_Tok(), Nemotron3RendererConfig()) + ultra = Nemotron3Renderer(_Tok(), Nemotron3UltraRendererConfig()) + assert nano._ultra is False + assert ultra._ultra is True + + +def test_template_fields_per_variant(): + # ``low_effort`` lives only on the Nano/Super config; ``medium_effort`` + # only on Ultra. Both ARE chat-template kwargs (unlike the removed ``ultra`` + # selector), so they appear in the template-field surface. 
+ assert Nemotron3RendererConfig.template_field_names() == frozenset( + {"enable_thinking", "truncate_history_thinking", "low_effort"} + ) + assert Nemotron3UltraRendererConfig.template_field_names() == frozenset( + {"enable_thinking", "truncate_history_thinking", "medium_effort"} + ) + + +def test_configs_reject_the_other_variants_effort_kwarg(): + # Discriminated-union honesty: a bad combination fails at config-load. + import pytest + from pydantic import ValidationError + + with pytest.raises(ValidationError): + Nemotron3RendererConfig(medium_effort=True) # type: ignore[call-arg] + with pytest.raises(ValidationError): + Nemotron3UltraRendererConfig(low_effort=True) # type: ignore[call-arg] + # And the removed ``ultra`` selector is gone entirely. + with pytest.raises(ValidationError): + Nemotron3RendererConfig(ultra=True) # type: ignore[call-arg] + + +def test_is_super_name_detection(): + assert _is_super(_fake_tok("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16")) + assert not _is_super(_fake_tok("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16")) + # Unknown / local-path checkpoints default to False → low_effort no-op. + assert not _is_super(_fake_tok("/home/user/local-ckpt")) + assert not _is_super(SimpleNamespace()) # no name_or_path attr diff --git a/tests/test_renderer_config_parity.py b/tests/test_renderer_config_parity.py index abe47a6..d8b19a3 100644 --- a/tests/test_renderer_config_parity.py +++ b/tests/test_renderer_config_parity.py @@ -55,9 +55,13 @@ ("moonshotai/Kimi-K2.6", "auto"), ("deepseek-ai/DeepSeek-V3", "auto"), ("deepseek-ai/DeepSeek-R1", "auto"), + # Nano + Super share the ``nemotron-3`` config (incl. ``low_effort``, which + # fires only on Super); both are exercised so the kwarg is checked where it + # no-ops (Nano) AND where it appends (Super). ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "auto"), - # Ultra: auto-resolves to the Ultra template variant (ultra=True) via the - # model name; parity asserted against the Ultra apply_chat_template. 
+ ("nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "auto"), + # Ultra: auto-resolves to the ``nemotron-3-ultra`` config via the model + # name; parity asserted against the Ultra apply_chat_template (``medium_effort``). ("nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", "auto"), ("poolside/Laguna-XS.2", "auto"), ("openai/gpt-oss-20b", "gpt-oss"), @@ -85,6 +89,12 @@ # ``truncate_history_thinking=False`` keeps reasoning on historical # assistants instead of collapsing to ````. "truncate_history_thinking": [True, False], + # Nemotron-3 reasoning-effort hints appended to the last user message. + # ``low_effort`` is a Super (``nemotron-3``) kwarg; ``medium_effort`` an + # Ultra (``nemotron-3-ultra``) kwarg. On the variant that doesn't define + # the kwarg the template — and the renderer — no-op it. + "low_effort": [True, False], + "medium_effort": [True, False], # MiniMax-M2 — fallback persona string when no system message is # supplied. Two arbitrary values to verify the renderer threads the # exact bytes through (whitespace included). From f70ddb78fa841849fb341ce918e0f8f921958d0a Mon Sep 17 00:00:00 2001 From: hallerite Date: Wed, 10 Jun 2026 22:55:54 +0000 Subject: [PATCH 2/2] refactor(nemotron3): Nemotron3UltraRenderer subclass instead of one class under two names Match the house style (GLM5Renderer/GLM51Renderer, Qwen35Renderer/Qwen36Renderer): each registered renderer name gets its own class. nemotron-3-ultra now maps to a Nemotron3UltraRenderer(Nemotron3Renderer) sibling that flips the _ultra / _config_cls class hooks, rather than registering one class under two names and branching on config.name. No behavior change; full suite green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- renderers/__init__.py | 2 ++ renderers/base.py | 4 ++-- renderers/nemotron3.py | 44 +++++++++++++++++++++++++---------- tests/test_nemotron3_ultra.py | 41 ++++++++++++++++---------------- 4 files changed, 56 insertions(+), 35 deletions(-) diff --git a/renderers/__init__.py b/renderers/__init__.py index 7570f31..9fd385e 100644 --- a/renderers/__init__.py +++ b/renderers/__init__.py @@ -89,6 +89,7 @@ "Llama3Renderer": "renderers.llama_3", "MiniMaxM2Renderer": "renderers.minimax_m2", "Nemotron3Renderer": "renderers.nemotron3", + "Nemotron3UltraRenderer": "renderers.nemotron3", "Qwen35Renderer": "renderers.qwen35", "Qwen36Renderer": "renderers.qwen36", "Qwen3Renderer": "renderers.qwen3", @@ -147,6 +148,7 @@ def __dir__() -> list[str]: "MultimodalRenderer", "Nemotron3Renderer", "Nemotron3RendererConfig", + "Nemotron3UltraRenderer", "Nemotron3UltraRendererConfig", "OverlongPromptError", "ParsedResponse", diff --git a/renderers/base.py b/renderers/base.py index 8fd9870..f141594 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1351,7 +1351,7 @@ def _populate_registry(): from renderers.laguna_xs2 import LagunaXS2Renderer from renderers.llama_3 import Llama3Renderer from renderers.minimax_m2 import MiniMaxM2Renderer - from renderers.nemotron3 import Nemotron3Renderer + from renderers.nemotron3 import Nemotron3Renderer, Nemotron3UltraRenderer from renderers.qwen3 import Qwen3Renderer from renderers.qwen3_vl import Qwen3VLRenderer from renderers.qwen35 import Qwen35Renderer @@ -1375,7 +1375,7 @@ def _populate_registry(): "laguna-xs.2": LagunaXS2Renderer, "llama-3": Llama3Renderer, "nemotron-3": Nemotron3Renderer, - "nemotron-3-ultra": Nemotron3Renderer, + "nemotron-3-ultra": Nemotron3UltraRenderer, "gpt-oss": GptOssRenderer, } ) diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index 8716145..c29129c 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -76,12 +76,11 @@ def 
_render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str] # The Nemotron-3 family ships two chat-template variants. Nano / Super share -# one (config ``name="nemotron-3"``); Ultra differs in the reasoning-block glue -# — no ``\n`` around ```` — and gets its own discriminator -# (``name="nemotron-3-ultra"``). Which variant a checkpoint uses is carried by -# ``MODEL_RENDERER_MAP`` (and hence the resolved config's ``name``), so the -# renderer reads it off ``config.name`` rather than probing the live template. -_ULTRA_CONFIG_NAME = "nemotron-3-ultra" +# one (renderer ``Nemotron3Renderer`` / config ``name="nemotron-3"``); Ultra +# differs in the reasoning-block glue — no ``\n`` around ```` — and is +# the ``Nemotron3UltraRenderer`` subclass (``name="nemotron-3-ultra"``). Which +# variant a checkpoint uses is carried by ``MODEL_RENDERER_MAP``, so the right +# renderer class is constructed and the variant is encoded by the class itself. def _is_super(tokenizer) -> bool: @@ -97,7 +96,18 @@ def _is_super(tokenizer) -> bool: class Nemotron3Renderer: - """Deterministic message → token renderer for Nemotron 3 models.""" + """Deterministic message → token renderer for Nemotron-3 Nano / Super. + + The Ultra variant (distinct ```` glue) is the + :class:`Nemotron3UltraRenderer` subclass below; both are registered under + their own discriminator and differ only by the class-level hooks here. + """ + + # Variant hooks (overridden by ``Nemotron3UltraRenderer``): the default + # config to build when none is passed, and whether to use Ultra's + # reasoning-block glue. 
+ _config_cls: type = Nemotron3RendererConfig + _ultra: bool = False def __init__( self, @@ -105,12 +115,8 @@ def __init__( config: Nemotron3RendererConfig | Nemotron3UltraRendererConfig | None = None, ): self._tokenizer = tokenizer - cfg = config or Nemotron3RendererConfig() + cfg = config or type(self)._config_cls() self.config = cfg - # The Ultra variant is selected by the config discriminator - # (``name="nemotron-3-ultra"``), not a flag — one renderer class serves - # both, switching glue off ``self._ultra``. - self._ultra = cfg.name == _ULTRA_CONFIG_NAME # Resolve the per-variant reasoning-effort hint appended to the last # user message. Ultra honours ``medium_effort``; Super honours @@ -825,3 +831,17 @@ def _render_tool( if not next_is_tool: emit_special(self._im_end, oi, is_sampled=False, is_content=False) emit_text("\n", oi, is_sampled=False, is_content=False) + + +class Nemotron3UltraRenderer(Nemotron3Renderer): + """Renderer for Nemotron-3 **Ultra**. + + Identical to :class:`Nemotron3Renderer` except the reasoning block is glued + as ``\\n{reasoning}{content}`` (no ``\\n`` around + ````) and truncated historical turns collapse to + ``{content}`` (no ``\\n``) — the difference is carried by the + ``_ultra`` class hook. Honours the Ultra-only ``medium_effort`` kwarg. 
+ """ + + _config_cls = Nemotron3UltraRendererConfig + _ultra = True diff --git a/tests/test_nemotron3_ultra.py b/tests/test_nemotron3_ultra.py index 64c3cdb..a6832d7 100644 --- a/tests/test_nemotron3_ultra.py +++ b/tests/test_nemotron3_ultra.py @@ -19,13 +19,13 @@ from types import SimpleNamespace -from renderers.base import MODEL_RENDERER_MAP +from renderers.base import MODEL_RENDERER_MAP, RENDERER_REGISTRY, _populate_registry from renderers.configs import ( Nemotron3RendererConfig, Nemotron3UltraRendererConfig, _config_class_for, ) -from renderers.nemotron3 import Nemotron3Renderer, _is_super +from renderers.nemotron3 import Nemotron3Renderer, Nemotron3UltraRenderer, _is_super _ULTRA_REPOS = [ "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16", @@ -48,27 +48,26 @@ def test_models_map_to_their_variant(): assert MODEL_RENDERER_MAP.get(repo) == "nemotron-3", repo -def test_both_variants_resolve_to_one_renderer_class(): - # The registry routes both discriminators to the shared renderer class. +def test_each_discriminator_maps_to_its_config_and_renderer_class(): + # Config discriminator → config class. assert _config_class_for("nemotron-3") is Nemotron3RendererConfig assert _config_class_for("nemotron-3-ultra") is Nemotron3UltraRendererConfig - - -def test_renderer_reads_variant_from_config_name(): - # No tokenizer needed for the ``_ultra`` flag — it comes off config.name. - # Build with a fake tokenizer that has the special tokens stubbed out. - class _Tok: - name_or_path = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" - unk_token_id = -1 - - def convert_tokens_to_ids(self, tok): - # Deterministic non-unk ids so construction succeeds offline. 
- return abs(hash(tok)) % 100_000 + 1 - - nano = Nemotron3Renderer(_Tok(), Nemotron3RendererConfig()) - ultra = Nemotron3Renderer(_Tok(), Nemotron3UltraRendererConfig()) - assert nano._ultra is False - assert ultra._ultra is True + # Registry → renderer class (Ultra is a sibling subclass, matching the + # GLM-5/5.1 and Qwen3.5/3.6 house style — not one class under two names). + _populate_registry() + assert RENDERER_REGISTRY["nemotron-3"] is Nemotron3Renderer + assert RENDERER_REGISTRY["nemotron-3-ultra"] is Nemotron3UltraRenderer + assert issubclass(Nemotron3UltraRenderer, Nemotron3Renderer) + + +def test_variant_is_encoded_by_the_class(): + # The ```` glue is selected by the class hook, not config.name — + # so the right renderer class must be constructed (create_renderer routes + # config.name → class). Default config also follows the class. + assert Nemotron3Renderer._ultra is False + assert Nemotron3UltraRenderer._ultra is True + assert Nemotron3Renderer._config_cls is Nemotron3RendererConfig + assert Nemotron3UltraRenderer._config_cls is Nemotron3UltraRendererConfig def test_template_fields_per_variant():