Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ next_prompt_ids = r.bridge_to_next_turn(
)
```

Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.
Hand-coded renderers ship for `qwen3`, `qwen3-vl`, `qwen3.5`, `qwen3.6`, `glm-5`, `glm-5.1`, `glm-4.5`, `minimax-m2`, `deepseek-v3`, `kimi-k2`, `kimi-k2.5`, `nemotron-3`, `nemotron-3-ultra`, `gpt-oss`. Anything else falls back to `DefaultRenderer`, a generic `apply_chat_template` wrapper.

## API

Expand Down
4 changes: 4 additions & 0 deletions renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
Llama3RendererConfig,
MiniMaxM2RendererConfig,
Nemotron3RendererConfig,
Nemotron3UltraRendererConfig,
Qwen35RendererConfig,
Qwen36RendererConfig,
Qwen3RendererConfig,
Expand Down Expand Up @@ -88,6 +89,7 @@
"Llama3Renderer": "renderers.llama_3",
"MiniMaxM2Renderer": "renderers.minimax_m2",
"Nemotron3Renderer": "renderers.nemotron3",
"Nemotron3UltraRenderer": "renderers.nemotron3",
"Qwen35Renderer": "renderers.qwen35",
"Qwen36Renderer": "renderers.qwen36",
"Qwen3Renderer": "renderers.qwen3",
Expand Down Expand Up @@ -146,6 +148,8 @@ def __dir__() -> list[str]:
"MultimodalRenderer",
"Nemotron3Renderer",
"Nemotron3RendererConfig",
"Nemotron3UltraRenderer",
"Nemotron3UltraRendererConfig",
"OverlongPromptError",
"ParsedResponse",
"ParsedToolCall",
Expand Down
14 changes: 8 additions & 6 deletions renderers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1040,14 +1040,15 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
"moonshotai/Kimi-K2-Instruct": "kimi-k2",
"moonshotai/Kimi-K2.5": "kimi-k2.5",
"moonshotai/Kimi-K2.6": "kimi-k2.5",
# Nemotron 3. Nano / Super share one chat-template variant; the Ultra
# checkpoints use the Ultra variant — the renderer auto-selects it from
# the model name (see ``nemotron3._ULTRA_DEFAULTS``). BF16 and FP8 share the
# Nemotron 3. Nano / Super share one chat-template variant (``nemotron-3``);
# the Ultra checkpoints use the Ultra variant (``nemotron-3-ultra``, distinct
# ``</think>`` glue). The renderer registry maps these names to
# ``Nemotron3Renderer`` and ``Nemotron3UltraRenderer`` respectively, both
# defined in ``renderers.nemotron3``. BF16 and FP8 share the
# same tokenizer and template.
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3",
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16": "nemotron-3-ultra",
"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-FP8": "nemotron-3-ultra",
# Llama 3.2 (Instruct). Tested against the gated meta-llama repos and
# the unrestricted unsloth/... mirror, which ships a byte-identical
# chat template. ``Llama3Renderer`` defaults ``date_string`` to
Expand Down Expand Up @@ -1350,7 +1351,7 @@ def _populate_registry():
from renderers.laguna_xs2 import LagunaXS2Renderer
from renderers.llama_3 import Llama3Renderer
from renderers.minimax_m2 import MiniMaxM2Renderer
from renderers.nemotron3 import Nemotron3Renderer
from renderers.nemotron3 import Nemotron3Renderer, Nemotron3UltraRenderer
from renderers.qwen3 import Qwen3Renderer
from renderers.qwen3_vl import Qwen3VLRenderer
from renderers.qwen35 import Qwen35Renderer
Expand All @@ -1374,6 +1375,7 @@ def _populate_registry():
"laguna-xs.2": LagunaXS2Renderer,
"llama-3": Llama3Renderer,
"nemotron-3": Nemotron3Renderer,
"nemotron-3-ultra": Nemotron3UltraRenderer,
"gpt-oss": GptOssRenderer,
}
)
Expand Down
71 changes: 42 additions & 29 deletions renderers/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,49 +354,59 @@ class MiniMaxM2RendererConfig(BaseRendererConfig):


class Nemotron3RendererConfig(BaseRendererConfig):
"""Nemotron 3 renderer config."""
"""Nemotron-3 **Nano / Super** renderer config.

Nano and Super share one chat-template variant; the renderer routes both
through :class:`renderers.nemotron3.Nemotron3Renderer`. The Ultra variant
has its own template (different reasoning-block glue) and config —
:class:`Nemotron3UltraRendererConfig` — and is reached via the
``nemotron-3-ultra`` discriminator.
"""

name: Literal["nemotron-3"] = "nemotron-3"

enable_thinking: bool = True
"""When ``True``, the generation prompt includes ``<think>``. Mirrors
the chat template's ``enable_thinking`` kwarg."""

ultra: bool | None = None
"""Select the Nemotron-3 **Ultra** chat-template variant.

``None`` (default) auto-detects from the model name (see
``renderers.nemotron3._ULTRA_DEFAULTS``): the Ultra checkpoints resolve
to ``True``; Nano / Super and unknown checkpoints to ``False``. Set
explicitly to force a variant — e.g. an Ultra fine-tune or a
locally-pathed checkpoint whose ``name_or_path`` isn't in the table.

Ultra's template differs from Nano/Super: the reasoning block is glued
as ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around
``</think>``), truncated historical turns collapse to
``<think></think>{content}`` (no ``\\n``), and the thinking-truncation
boundary follows the template's ``loop.index0 < last_user_idx`` rule
(drop thinking on every assistant turn before the last user message).

Not a chat-template kwarg — it picks which template the renderer
mirrors, not a variable passed into one — so it's listed in
``_internal_fields`` and excluded from ``template_field_names()``."""

truncate_history_thinking: bool = True
"""When ``False``, keep ``<think>{reasoning}</think>`` on past-cycle
assistant turns instead of dropping them. Mirrors the chat
template's ``truncate_history_thinking`` toggle. OR-composes with
``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls``
— see :class:`BaseRendererConfig` for the contract."""

# ``ultra`` is a template-variant SELECTOR — it picks which template the
# renderer mirrors (Ultra vs Nano/Super), not a variable passed into one;
# there is no ``ultra`` Jinja variable. Marked internal so the parity
# matrix doesn't cross it as a template field. Same ``_internal_fields``
# mechanism DeepSeek-V3 uses for its no-op ``enable_thinking``, for a
# different underlying reason (theirs is an ignored kwarg, this is a
# variant switch).
_internal_fields = frozenset({"ultra"})
low_effort: bool = False
"""When ``True``, append ``\\n\\n{reasoning effort: low}`` to the last user
message, nudging the model toward shorter reasoning. Mirrors the **Super**
chat template's ``low_effort`` kwarg. A no-op on **Nano** (its template
doesn't define it) — exactly as ``apply_chat_template`` ignores an undefined
template variable; the renderer distinguishes the two by model name (see
``renderers.nemotron3._is_super``)."""


class Nemotron3UltraRendererConfig(BaseRendererConfig):
    """Nemotron-3 **Ultra** renderer config.

    Carries its own ``name`` discriminator (``"nemotron-3-ultra"``) so that
    Ultra checkpoints resolve to the Ultra chat-template variant instead of
    the Nano / Super one described by :class:`Nemotron3RendererConfig`.

    Ultra's template differs from Nano/Super: the reasoning block is glued as
    ``<think>\\n{reasoning}</think>{content}`` (no ``\\n`` around ``</think>``)
    and truncated historical turns collapse to ``<think></think>{content}``
    (no ``\\n``).

    The effort knob differs as well: this config exposes ``medium_effort``,
    whereas :class:`Nemotron3RendererConfig` exposes ``low_effort``.

    NOTE(review): the renderer registry pairs ``nemotron-3-ultra`` with
    ``Nemotron3UltraRenderer`` (module ``renderers.nemotron3``); presumably
    that class shares the ``Nemotron3Renderer`` implementation and selects the
    variant from ``config.name`` — confirm against ``renderers/nemotron3.py``.
    """

    # Discriminator value; also the key this config is registered under in
    # the name -> config table.
    name: Literal["nemotron-3-ultra"] = "nemotron-3-ultra"

    enable_thinking: bool = True
    """See :class:`Nemotron3RendererConfig.enable_thinking`."""

    truncate_history_thinking: bool = True
    """See :class:`Nemotron3RendererConfig.truncate_history_thinking`."""

    medium_effort: bool = False
    """When ``True``, append ``\\n\\n{reasoning effort: efficient}`` to the last
    user message. Mirrors the Ultra chat template's ``medium_effort`` kwarg."""


class DeepSeekV3RendererConfig(BaseRendererConfig):
Expand Down Expand Up @@ -444,6 +454,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig):
Llama3RendererConfig,
MiniMaxM2RendererConfig,
Nemotron3RendererConfig,
Nemotron3UltraRendererConfig,
DeepSeekV3RendererConfig,
DeepSeekR1RendererConfig,
],
Expand Down Expand Up @@ -480,6 +491,7 @@ class DeepSeekR1RendererConfig(BaseRendererConfig):
"llama-3": Llama3RendererConfig,
"minimax-m2": MiniMaxM2RendererConfig,
"nemotron-3": Nemotron3RendererConfig,
"nemotron-3-ultra": Nemotron3UltraRendererConfig,
"deepseek-v3": DeepSeekV3RendererConfig,
"deepseek-r1": DeepSeekR1RendererConfig,
}
Expand Down Expand Up @@ -525,6 +537,7 @@ def config_from_name(name: str) -> BaseRendererConfig | None:
"Llama3RendererConfig",
"MiniMaxM2RendererConfig",
"Nemotron3RendererConfig",
"Nemotron3UltraRendererConfig",
"Qwen35RendererConfig",
"Qwen36RendererConfig",
"Qwen3RendererConfig",
Expand Down
Loading
Loading