From 4b6abc6e36ed61843a1c427c4e72e182a4928eec Mon Sep 17 00:00:00 2001
From: Dong Wang <dongw2019@gmail.com>
Date: Mon, 11 May 2026 00:03:05 +0000
Subject: [PATCH 1/4] tool_parsers: add Nemotron JSON tool parser

Register a built-in parser for Nemotron <TOOLCALL> JSON payloads,
document it in docs/features/tool_calling.md, add a matching chat
template example, and cover streaming extraction for content-plus-tool
and parallel-call chunk boundaries.

Signed-off-by: Dong Wang <dongw2019@gmail.com>
---
 docs/features/tool_calling.md                 |  14 +-
 .../tool_chat_template_nemotron_json.jinja    | 135 ++++++++
 .../test_nemotron_json_tool_parser.py         | 205 +++++++++++
 vllm/tool_parsers/__init__.py                 |   4 +
 .../tool_parsers/nemotron_json_tool_parser.py | 327 ++++++++++++++++++
 5 files changed, 683 insertions(+), 2 deletions(-)
 create mode 100644 examples/tool_chat_template_nemotron_json.jinja
 create mode 100644 tests/tool_parsers/test_nemotron_json_tool_parser.py
 create mode 100644 vllm/tool_parsers/nemotron_json_tool_parser.py
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 9c60255d6928..5eb11c876029 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -324,6 +324,16 @@ Supported models:
 
 Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`
 
+### Nemotron Models (`nemotron_json`)
+
+Supported models:
+
+* `nvidia/NVIDIA-Nemotron-Nano-9B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_json.jinja](../../examples/tool_chat_template_nemotron_json.jinja))
+
+The parser handles the `<TOOLCALL>[{"name": ..., "arguments": ...}, ...]</TOOLCALL>` envelope emitted by the Nemotron chat template, and works with the model's hybrid thinking mode: any `<think>...</think>` prefix is preserved as message content (or stripped by a reasoning parser if one is configured).
+
+Flags: `--tool-call-parser nemotron_json --chat-template examples/tool_chat_template_nemotron_json.jinja`
+
 ### DeepSeek-V3 Models (`deepseek_v3`)
 
 Supported models:
@@ -510,8 +520,8 @@ Here is a summary of a plugin file:
     # in --tool-call-parser. you can define as many
     # tool parsers as you want here.
     class ExampleToolParser(ToolParser):
-        def __init__(self, tokenizer: TokenizerLike):
-            super().__init__(tokenizer)
+        def __init__(self, tokenizer: TokenizerLike, tools=None):
+            super().__init__(tokenizer, tools)
 
         # adjust request. e.g.: set skip special tokens
         # to False for tool call output.
diff --git a/examples/tool_chat_template_nemotron_json.jinja b/examples/tool_chat_template_nemotron_json.jinja
new file mode 100644
index 000000000000..f6f0a380c107
--- /dev/null
+++ b/examples/tool_chat_template_nemotron_json.jinja
@@ -0,0 +1,135 @@
+{%- set ns = namespace(enable_thinking=true) -%}
+
+{%- for message in messages -%}
+    {%- set content = message['content'] -%}
+    {%- if message['role'] == 'user' or message['role'] == 'system' -%}
+        {%- if '/think' in content -%}
+            {%- set ns.enable_thinking = true -%}
+        {%- elif '/no_think' in content -%}
+            {%- set ns.enable_thinking = false -%}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if messages[0]['role'] != 'system' -%}
+    {%- set ns.non_tool_system_content = '' -%}
+    {{- '<SPECIAL_10>System\n' -}}
+{%- else -%}
+    {%- set ns.non_tool_system_content = messages[0]['content']
+        .replace('/think', '')
+        .replace('/no_think', '')
+        .strip()
+    -%}
+    {{- '<SPECIAL_10>System\n' + ns.non_tool_system_content }}
+{%- endif -%}
+
+{%- if tools -%}
+    {%- if ns.non_tool_system_content is defined
+        and ns.non_tool_system_content != '' -%}
+        {{- '\n\n' -}}
+    {%- endif -%}
+
+    {{- 'You can use the following tools to assist the user if required:' -}}
+    {{- '\n<AVAILABLE_TOOLS>[' -}}
+    {%- for tool in tools -%}
+        {{- (tool.function if tool.function is defined else tool) | tojson -}}
+        {{- ', ' if not loop.last else '' -}}
+    {%- endfor -%}
+    {{- ']</AVAILABLE_TOOLS>\n\n' -}}
+
+    {{- 'If you decide to call any tool(s), use the following format:\n' -}}
+    {{- '<TOOLCALL>[{{"name": "tool_name1", "arguments": "tool_args1"}}, ' -}}
+    {{- '{{"name": "tool_name2", "arguments": "tool_args2"}}]</TOOLCALL>\n\n' -}}
+
+    {{- 'The user will execute tool-calls and return responses from tool(s) in this format:\n' -}}
+    {{- '<TOOL_RESPONSE>[{{"tool_response1"}}, {{"tool_response2"}}]</TOOL_RESPONSE>\n\n' -}}
+
+    {{- 'Based on the tool responses, you can call additional tools if needed, correct tool calls if any errors are found, or just respond to the user.' -}}
+{%- endif -%}
+
+{{- '\n' -}}
+
+{%- set messages = messages[1:] if messages[0]['role'] == 'system' else messages -%}
+
+{%- if messages[-1]['role'] == 'assistant' -%}
+    {%- set ns.last_turn_assistant_content = messages[-1]['content'].strip() -%}
+    {%- set messages = messages[:-1] -%}
+{%- endif -%}
+
+{%- for message in messages -%}
+    {%- set content = message['content'] -%}
+
+    {%- if message['role'] == 'user' -%}
+        {{- '<SPECIAL_11>User\n' + content.replace('/think', '').replace('/no_think', '').strip() + '\n' }}
+
+    {%- elif message['role'] == 'tool' -%}
+        {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}
+            {{- '<SPECIAL_11>User\n' + '<TOOL_RESPONSE>[' }}
+        {%- endif -%}
+        {{- message['content'] -}}
+        {{- ', ' if not loop.last and (messages[loop.index0 + 1].role == 'tool') else '' -}}
+        {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}
+            {{- ']</TOOL_RESPONSE>\n' -}}
+        {%- endif -%}
+
+    {%- elif message['role'] == 'assistant' -%}
+        {%- if '</think>' in content -%}
+            {%- set content = content.split('</think>')[1].strip() -%}
+        {%- endif -%}
+
+        {{- '<SPECIAL_11>Assistant\n' + content.strip() }}
+
+        {%- if message.tool_calls -%}
+            {%- if content.strip() != '' -%}
+                {{- '\n\n' -}}
+            {%- endif -%}
+            {{- '<TOOLCALL>[' -}}
+            {%- for call in message.tool_calls -%}
+                {%- set fn = call.function if call.function is defined else call -%}
+                {{- '{"name": "' + fn.name + '", "arguments": ' -}}
+                {%- if fn.arguments is string -%}
+                    {{- fn.arguments -}}
+                {%- else -%}
+                    {{- fn.arguments | tojson -}}
+                {%- endif -%}
+                {{- '}' + (', ' if not loop.last else '') -}}
+            {%- endfor -%}
+            {{- ']</TOOLCALL>' -}}
+        {%- endif -%}
+
+        {{- '\n<SPECIAL_12>\n' -}}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {{- '<SPECIAL_11>Assistant\n' -}}
+    {%- if ns.enable_thinking is defined and ns.enable_thinking is false -%}
+        {{- '<think></think>' -}}
+    {%- else -%}
+        {{- '<think>\n' -}}
+    {%- endif -%}
+    {%- if ns.last_turn_assistant_content is defined
+        and ns.last_turn_assistant_content != '' -%}
+        {{- ns.last_turn_assistant_content -}}
+    {%- endif -%}
+
+{%- else -%}
+    {%- if ns.last_turn_assistant_content is defined
+        and ns.last_turn_assistant_content != '' -%}
+        {{- '<SPECIAL_11>Assistant\n' -}}
+        {%- if ns.enable_thinking is defined and ns.enable_thinking is false -%}
+            {{- '<think></think>' -}}
+        {%- else -%}
+            {{- '<think>\n' -}}
+        {%- endif -%}
+        {{- ns.last_turn_assistant_content -}}
+
+        {%- if continue_final_message is defined -%}
+            {%- if continue_final_message is false -%}
+                {{- '\n<SPECIAL_12>\n' -}}
+            {%- endif -%}
+        {%- else -%}
+            {{- '\n<SPECIAL_12>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endif -%}
diff --git a/tests/tool_parsers/test_nemotron_json_tool_parser.py b/tests/tool_parsers/test_nemotron_json_tool_parser.py
new file mode 100644
index 000000000000..3d885519efad
--- /dev/null
+++ b/tests/tool_parsers/test_nemotron_json_tool_parser.py
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from tests.tool_parsers.utils import run_tool_extraction
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import ExtractedToolCallInformation
+from vllm.tool_parsers import ToolParserManager
+
+
+@pytest.fixture
+def mock_tokenizer():
+    tokenizer = MagicMock()
+    tokenizer.get_vocab.return_value = {}
+    tokenizer.tokenize.side_effect = lambda text: list(text)
+    return tokenizer
+
+
+@pytest.fixture
+def mock_request():
+    request = MagicMock(spec=ChatCompletionRequest)
+    request.tools = []
+    request.tool_choice = "auto"
+    return request
+
+
+@pytest.fixture
+def parser(mock_tokenizer):
+    parser_cls = ToolParserManager.get_tool_parser("nemotron_json")
+    return parser_cls(mock_tokenizer, tools=[])
+
+
+def test_nemotron_json_registered_and_accepts_tools(mock_tokenizer):
+    parser_cls = ToolParserManager.get_tool_parser("nemotron_json")
+
+    parser = parser_cls(mock_tokenizer, tools=[])
+
+    assert parser.tool_call_start_token == "<TOOLCALL>"
+
+
+def test_extract_tool_calls_returns_content_without_tool_call(parser, mock_request):
+    model_output = "No tool call here."
+
+    result = parser.extract_tool_calls(model_output, mock_request)
+
+    assert isinstance(result, ExtractedToolCallInformation)
+    assert result.tools_called is False
+    assert result.tool_calls == []
+    assert result.content == model_output
+
+
+def test_extract_tool_calls_from_nemotron_array(parser, mock_request):
+    model_output = (
+        "Let me check that."
+        '<TOOLCALL>[{"name": "get_weather", '
+        '"arguments": {"city": "Tokyo", "unit": "celsius"}}]</TOOLCALL>'
+    )
+
+    result = parser.extract_tool_calls(model_output, mock_request)
+
+    assert result.tools_called is True
+    assert result.content == "Let me check that."
+    assert len(result.tool_calls) == 1
+    assert result.tool_calls[0].type == "function"
+    assert result.tool_calls[0].function.name == "get_weather"
+    assert result.tool_calls[0].function.arguments == (
+        '{"city": "Tokyo", "unit": "celsius"}'
+    )
+
+
+def test_extract_tool_calls_wraps_single_object(parser, mock_request):
+    model_output = (
+        '<TOOLCALL>{"name": "lookup", "arguments": {"query": "vllm"}}</TOOLCALL>'
+    )
+
+    result = parser.extract_tool_calls(model_output, mock_request)
+
+    assert result.tools_called is True
+    assert len(result.tool_calls) == 1
+    assert result.tool_calls[0].function.name == "lookup"
+    assert result.tool_calls[0].function.arguments == '{"query": "vllm"}'
+
+
+def test_extract_tool_calls_supports_string_arguments(parser, mock_request):
+    model_output = (
+        '<TOOLCALL>[{"name": "run_query", '
+        '"arguments": "{\\"sql\\": \\"select 1\\"}"}]</TOOLCALL>'
+    )
+
+    result = parser.extract_tool_calls(model_output, mock_request)
+
+    assert result.tools_called is True
+    assert result.tool_calls[0].function.name == "run_query"
+    assert result.tool_calls[0].function.arguments == '{"sql": "select 1"}'
+
+
+def test_extract_tool_calls_returns_original_for_malformed(parser, mock_request):
+    model_output = '<TOOLCALL>[{"name": "broken", "arguments": {}</TOOLCALL>'
+
+    result = parser.extract_tool_calls(model_output, mock_request)
+
+    assert result.tools_called is False
+    assert result.tool_calls == []
+    assert result.content == model_output
+
+
+def test_streaming_reconstructs_tool_call(parser, mock_request):
+    model_output = (
+        "Let me check."
+        '<TOOLCALL>[{"name": "get_weather", '
+        '"arguments": {"city": "Tokyo", "unit": "celsius"}}]</TOOLCALL>'
+    )
+
+    content, tool_calls = run_tool_extraction(
+        parser,
+        list(model_output),
+        request=mock_request,
+        streaming=True,
+    )
+
+    assert content == "Let me check."
+    assert len(tool_calls) == 1
+    assert tool_calls[0].function.name == "get_weather"
+    assert tool_calls[0].function.arguments == ('{"city": "Tokyo", "unit": "celsius"}')
+
+
+def test_streaming_handles_nested_json_arguments(parser, mock_request):
+    model_output = (
+        '<TOOLCALL>[{"name": "search", '
+        '"arguments": {"filters": {"city": "Tokyo"}, '
+        '"items": [{"name": "rain", "value": true}]}}]</TOOLCALL>'
+    )
+
+    content, tool_calls = run_tool_extraction(
+        parser,
+        list(model_output),
+        request=mock_request,
+        streaming=True,
+    )
+
+    assert content is None
+    assert len(tool_calls) == 1
+    assert tool_calls[0].function.name == "search"
+    assert tool_calls[0].function.arguments == (
+        '{"filters": {"city": "Tokyo"}, "items": [{"name": "rain", "value": true}]}'
+    )
+
+
+def test_extract_tool_calls_keeps_think_block_as_content(parser, mock_request):
+    model_output = (
+        "<think>\nI need the weather for Tokyo.\n</think>\n"
+        '<TOOLCALL>[{"name": "get_weather", "arguments": {"city": "Tokyo"}}]</TOOLCALL>'
+    )
+
+    result = parser.extract_tool_calls(model_output, mock_request)
+
+    assert result.tools_called is True
+    assert result.content == "<think>\nI need the weather for Tokyo.\n</think>\n"
+    assert result.tool_calls[0].function.name == "get_weather"
+    assert result.tool_calls[0].function.arguments == '{"city": "Tokyo"}'
+
+
+def test_streaming_keeps_think_block_as_content(parser, mock_request):
+    model_output = (
+        "<think>\nI need the weather for Tokyo.\n</think>\n"
+        '<TOOLCALL>[{"name": "get_weather", "arguments": {"city": "Tokyo"}}]</TOOLCALL>'
+    )
+
+    content, tool_calls = run_tool_extraction(
+        parser,
+        list(model_output),
+        request=mock_request,
+        streaming=True,
+    )
+
+    assert content == "<think>\nI need the weather for Tokyo.\n</think>\n"
+    assert len(tool_calls) == 1
+    assert tool_calls[0].function.name == "get_weather"
+    assert tool_calls[0].function.arguments == '{"city": "Tokyo"}'
+
+
+def test_streaming_handles_multiple_tool_calls(parser, mock_request):
+    model_output = (
+        '<TOOLCALL>[{"name": "get_weather", '
+        '"arguments": {"city": "Tokyo"}}, '
+        '{"name": "lookup_timezone", '
+        '"arguments": {"city": "Tokyo"}}]</TOOLCALL>'
+    )
+
+    content, tool_calls = run_tool_extraction(
+        parser,
+        list(model_output),
+        request=mock_request,
+        streaming=True,
+    )
+
+    assert content is None
+    assert len(tool_calls) == 2
+    assert tool_calls[0].function.name == "get_weather"
+    assert tool_calls[0].function.arguments == '{"city": "Tokyo"}'
+    assert tool_calls[1].function.name == "lookup_timezone"
+    assert tool_calls[1].function.arguments == '{"city": "Tokyo"}'
diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py
index 7c5f45d2022e..e26070cdecb4 100644
--- a/vllm/tool_parsers/__init__.py
+++ b/vllm/tool_parsers/__init__.py
@@ -130,6 +130,10 @@
         "minimax_tool_parser",
         "MinimaxToolParser",
     ),
+    "nemotron_json": (
+        "nemotron_json_tool_parser",
+        "NemotronJSONToolParser",
+    ),
     "mistral": (
         "mistral_tool_parser",
         "MistralToolParser",
diff --git a/vllm/tool_parsers/nemotron_json_tool_parser.py b/vllm/tool_parsers/nemotron_json_tool_parser.py
new file mode 100644
index 000000000000..e015b5cba3d4
--- /dev/null
+++ b/vllm/tool_parsers/nemotron_json_tool_parser.py
@@ -0,0 +1,327 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from typing import Any
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+from vllm.logger import init_logger
+from vllm.tokenizers import TokenizerLike
+from vllm.tool_parsers.abstract_tool_parser import Tool, ToolParser
+from vllm.tool_parsers.utils import partial_tag_overlap
+
+logger = init_logger(__name__)
+
+
+class NemotronJSONToolParser(ToolParser):
+    """Tool parser for Nemotron models that emit <TOOLCALL> JSON payloads."""
+
+    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
+        super().__init__(tokenizer, tools)
+
+        self.tool_call_start_token = "<TOOLCALL>"
+        self.tool_call_end_token = "</TOOLCALL>"
+        self.tool_call_regex = re.compile(
+            rf"{self.tool_call_start_token}(.*?){self.tool_call_end_token}",
+            re.DOTALL,
+        )
+        self._sent_content_idx = 0
+        self._tool_args_emitted: list[bool] = []
+
+    def adjust_request(
+        self, request: ChatCompletionRequest | ResponsesRequest
+    ) -> ChatCompletionRequest | ResponsesRequest:
+        request = super().adjust_request(request)
+        if request.tools and request.tool_choice != "none":
+            request.skip_special_tokens = False
+        return request
+
+    @staticmethod
+    def _normalize_tool_call_payload(payload: str) -> list[dict[str, Any]]:
+        payload = payload.strip()
+        if not payload.startswith("["):
+            payload = "[" + payload
+        if not payload.endswith("]"):
+            payload = payload + "]"
+
+        parsed = json.loads(payload)
+        if isinstance(parsed, dict):
+            return [parsed]
+        if isinstance(parsed, list):
+            return [item for item in parsed if isinstance(item, dict)]
+        return []
+
+    @staticmethod
+    def _serialize_arguments(arguments: Any) -> str:
+        if isinstance(arguments, str):
+            return arguments
+        return json.dumps(arguments, ensure_ascii=False)
+
+    @staticmethod
+    def _strip_trailing_auto_closers(chunk: str) -> str:
+        idx = len(chunk)
+        while idx > 0 and chunk[idx - 1] in " \t\r\n}]":
+            idx -= 1
+        while idx > 0 and chunk[idx - 1] == '"':
+            if idx - 2 >= 0 and chunk[idx - 2] == "\\":
+                break
+            idx -= 1
+        return chunk[:idx]
+
+    @staticmethod
+    def _common_prefix_len(left: str, right: str) -> int:
+        max_len = min(len(left), len(right))
+        idx = 0
+        while idx < max_len and left[idx] == right[idx]:
+            idx += 1
+        return idx
+
+    def _compute_arguments_delta(self, arguments: Any, end_of_call: bool) -> str:
+        if self.current_tool_id < 0:
+            return ""
+
+        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+            self.streamed_args_for_tool.append("")
+        while len(self._tool_args_emitted) <= self.current_tool_id:
+            self._tool_args_emitted.append(False)
+
+        cur_arguments = self._serialize_arguments(arguments)
+        streamed_prefix = self.streamed_args_for_tool[self.current_tool_id]
+        emitted_any = self._tool_args_emitted[self.current_tool_id]
+
+        lcp_len = self._common_prefix_len(cur_arguments, streamed_prefix)
+        if lcp_len != len(streamed_prefix):
+            streamed_prefix = streamed_prefix[:lcp_len]
+            self.streamed_args_for_tool[self.current_tool_id] = streamed_prefix
+
+        arguments_delta = cur_arguments[lcp_len:]
+        if not arguments_delta:
+            return ""
+
+        if not end_of_call:
+            arguments_delta = self._strip_trailing_auto_closers(arguments_delta)
+
+        if (
+            not emitted_any
+            and not end_of_call
+            and arguments_delta
+            and arguments_delta.endswith("}")
+        ):
+            arguments_delta = arguments_delta[:-1]
+            if arguments_delta.endswith('"'):
+                arguments_delta = arguments_delta[:-1]
+
+        return arguments_delta
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        if self.tool_call_start_token not in model_output:
+            return ExtractedToolCallInformation(
+                tools_called=False,
+                tool_calls=[],
+                content=model_output,
+            )
+
+        try:
+            payloads = self.tool_call_regex.findall(model_output)
+            tool_calls: list[ToolCall] = []
+            for payload in payloads:
+                for raw_tool_call in self._normalize_tool_call_payload(payload):
+                    try:
+                        tool_calls.append(
+                            ToolCall(
+                                type="function",
+                                function=FunctionCall(
+                                    name=raw_tool_call["name"],
+                                    arguments=self._serialize_arguments(
+                                        raw_tool_call["arguments"]
+                                    ),
+                                ),
+                            )
+                        )
+                    except Exception:
+                        continue
+
+            if not tool_calls:
+                return ExtractedToolCallInformation(
+                    tools_called=False,
+                    tool_calls=[],
+                    content=model_output,
+                )
+
+            content = model_output[: model_output.find(self.tool_call_start_token)]
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=content if content else None,
+            )
+        except Exception:
+            logger.exception("Error extracting tool call from response.")
+            return ExtractedToolCallInformation(
+                tools_called=False,
+                tool_calls=[],
+                content=model_output,
+            )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        if not previous_text:
+            self.current_tool_id = -1
+            self.current_tool_name_sent = False
+            self.streamed_args_for_tool = []
+            self._tool_args_emitted = []
+            self._sent_content_idx = 0
+
+        start_idx = current_text.find(self.tool_call_start_token)
+        if start_idx == -1:
+            overlap = partial_tag_overlap(current_text, self.tool_call_start_token)
+            sendable_idx = len(current_text) - overlap
+            if sendable_idx > self._sent_content_idx:
+                content = current_text[self._sent_content_idx : sendable_idx]
+                self._sent_content_idx = sendable_idx
+                return DeltaMessage(content=content)
+            return None
+
+        if self._sent_content_idx < start_idx:
+            content = current_text[self._sent_content_idx : start_idx]
+            self._sent_content_idx = start_idx
+            return DeltaMessage(content=content)
+
+        payload_start = start_idx + len(self.tool_call_start_token)
+        payload_end = current_text.find(self.tool_call_end_token, payload_start)
+        end_of_call = payload_end != -1
+        payload = current_text[
+            payload_start : payload_end if end_of_call else len(current_text)
+        ]
+        if not payload.strip():
+            return None
+
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
+        try:
+            parsed_tool_calls = partial_json_parser.loads(payload, flags)
+        except (
+            partial_json_parser.core.exceptions.MalformedJSON,
+            json.JSONDecodeError,
+            ValueError,
+        ):
+            return None
+
+        if isinstance(parsed_tool_calls, dict):
+            parsed_tool_calls = [parsed_tool_calls]
+        if not isinstance(parsed_tool_calls, list) or not parsed_tool_calls:
+            return None
+
+        if self.current_tool_id < 0:
+            self.current_tool_id = 0
+            self.current_tool_name_sent = False
+            self.streamed_args_for_tool.append("")
+            self._tool_args_emitted.append(False)
+
+        if self.current_tool_name_sent and self.current_tool_id + 1 < len(
+            parsed_tool_calls
+        ):
+            current_tool_call = parsed_tool_calls[self.current_tool_id]
+            if isinstance(current_tool_call, dict) and "arguments" in current_tool_call:
+                arguments_delta = self._compute_arguments_delta(
+                    current_tool_call["arguments"], True
+                )
+                if arguments_delta:
+                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
+                    self._tool_args_emitted[self.current_tool_id] = True
+                    return DeltaMessage(
+                        tool_calls=[
+                            DeltaToolCall(
+                                index=self.current_tool_id,
+                                function=DeltaFunctionCall(arguments=arguments_delta),
+                            )
+                        ]
+                    )
+
+            self.current_tool_id += 1
+            self.current_tool_name_sent = False
+            self.streamed_args_for_tool.append("")
+            self._tool_args_emitted.append(False)
+
+        if self.current_tool_id >= len(parsed_tool_calls):
+            return None
+
+        current_tool_call = parsed_tool_calls[self.current_tool_id]
+        if not isinstance(current_tool_call, dict):
+            return None
+
+        if not self.current_tool_name_sent:
+            function_name = current_tool_call.get("name")
+            if not function_name:
+                return None
+
+            arguments_delta = ""
+            if "arguments" in current_tool_call:
+                arguments_delta = self._compute_arguments_delta(
+                    current_tool_call["arguments"], end_of_call
+                )
+                if arguments_delta:
+                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
+                    self._tool_args_emitted[self.current_tool_id] = True
+
+            self.current_tool_name_sent = True
+            return DeltaMessage(
+                tool_calls=[
+                    DeltaToolCall(
+                        index=self.current_tool_id,
+                        id=make_tool_call_id(),
+                        type="function",
+                        function=DeltaFunctionCall(
+                            name=function_name,
+                            arguments=arguments_delta or None,
+                        ),
+                    )
+                ]
+            )
+
+        if "arguments" not in current_tool_call:
+            return None
+
+        arguments_delta = self._compute_arguments_delta(
+            current_tool_call["arguments"], end_of_call
+        )
+        if not arguments_delta:
+            return None
+
+        self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
+        self._tool_args_emitted[self.current_tool_id] = True
+        return DeltaMessage(
+            tool_calls=[
+                DeltaToolCall(
+                    index=self.current_tool_id,
+                    function=DeltaFunctionCall(arguments=arguments_delta),
+                )
+            ]
+        )

From 32367d45e3ae37d03e9c73598406bc4e6e62fba2 Mon Sep 17 00:00:00 2001
From: Dong Wang <dongw2019@gmail.com>
Date: Mon, 11 May 2026 00:07:49 +0000
Subject: [PATCH 2/4] tool_parsers: cover Nemotron streaming extraction for
 content-plus-tool and parallel-call chunk boundaries

Signed-off-by: Dong Wang <dongw2019@gmail.com>
---
 .../test_nemotron_json_tool_parser.py         |  45 +++++++
 .../tool_parsers/nemotron_json_tool_parser.py | 120 ++++++++----------
 2 files changed, 100 insertions(+), 65 deletions(-)

diff --git a/tests/tool_parsers/test_nemotron_json_tool_parser.py b/tests/tool_parsers/test_nemotron_json_tool_parser.py
index 3d885519efad..0106663f0407 100644
--- a/tests/tool_parsers/test_nemotron_json_tool_parser.py
+++ b/tests/tool_parsers/test_nemotron_json_tool_parser.py
@@ -195,6 +195,51 @@ def test_streaming_handles_multiple_tool_calls(parser, mock_request):
         list(model_output),
         request=mock_request,
         streaming=True,
+        assert_one_tool_per_delta=False,
+    )
+
+    assert content is None
+    assert len(tool_calls) == 2
+    assert tool_calls[0].function.name == "get_weather"
+    assert tool_calls[0].function.arguments == '{"city": "Tokyo"}'
+    assert tool_calls[1].function.name == "lookup_timezone"
+    assert tool_calls[1].function.arguments == '{"city": "Tokyo"}'
+
+
+def test_streaming_single_delta_handles_content_and_tool_call(parser, mock_request):
+    model_output = (
+        "Let me check."
+        '<TOOLCALL>[{"name": "get_weather", '
+        '"arguments": {"city": "Tokyo"}}]</TOOLCALL>'
+    )
+
+    content, tool_calls = run_tool_extraction(
+        parser,
+        [model_output],
+        request=mock_request,
+        streaming=True,
+    )
+
+    assert content == "Let me check."
+    assert len(tool_calls) == 1
+    assert tool_calls[0].function.name == "get_weather"
+    assert tool_calls[0].function.arguments == '{"city": "Tokyo"}'
+
+
+def test_streaming_single_delta_handles_multiple_tool_calls(parser, mock_request):
+    model_output = (
+        '<TOOLCALL>[{"name": "get_weather", '
+        '"arguments": {"city": "Tokyo"}}, '
+        '{"name": "lookup_timezone", '
+        '"arguments": {"city": "Tokyo"}}]</TOOLCALL>'
+    )
+
+    content, tool_calls = run_tool_extraction(
+        parser,
+        [model_output],
+        request=mock_request,
+        streaming=True,
+        assert_one_tool_per_delta=False,
     )
 
     assert content is None
diff --git a/vllm/tool_parsers/nemotron_json_tool_parser.py b/vllm/tool_parsers/nemotron_json_tool_parser.py
index e015b5cba3d4..10ca2c5c647a 100644
--- a/vllm/tool_parsers/nemotron_json_tool_parser.py
+++ b/vllm/tool_parsers/nemotron_json_tool_parser.py
@@ -210,10 +210,10 @@ def extract_tool_calls_streaming(
                 return DeltaMessage(content=content)
             return None
 
+        content_delta: str | None = None
         if self._sent_content_idx < start_idx:
-            content = current_text[self._sent_content_idx : start_idx]
+            content_delta = current_text[self._sent_content_idx : start_idx]
             self._sent_content_idx = start_idx
-            return DeltaMessage(content=content)
 
         payload_start = start_idx + len(self.tool_call_start_token)
         payload_end = current_text.find(self.tool_call_end_token, payload_start)
@@ -245,55 +245,34 @@ def extract_tool_calls_streaming(
             self.streamed_args_for_tool.append("")
             self._tool_args_emitted.append(False)
 
-        if self.current_tool_name_sent and self.current_tool_id + 1 < len(
-            parsed_tool_calls
-        ):
+        tool_call_deltas: list[DeltaToolCall] = []
+        while self.current_tool_id < len(parsed_tool_calls):
             current_tool_call = parsed_tool_calls[self.current_tool_id]
-            if isinstance(current_tool_call, dict) and "arguments" in current_tool_call:
-                arguments_delta = self._compute_arguments_delta(
-                    current_tool_call["arguments"], True
-                )
-                if arguments_delta:
-                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
-                    self._tool_args_emitted[self.current_tool_id] = True
-                    return DeltaMessage(
-                        tool_calls=[
-                            DeltaToolCall(
-                                index=self.current_tool_id,
-                                function=DeltaFunctionCall(arguments=arguments_delta),
-                            )
-                        ]
-                    )
-
-            self.current_tool_id += 1
-            self.current_tool_name_sent = False
-            self.streamed_args_for_tool.append("")
-            self._tool_args_emitted.append(False)
-
-        if self.current_tool_id >= len(parsed_tool_calls):
-            return None
+            if not isinstance(current_tool_call, dict):
+                break
 
-        current_tool_call = parsed_tool_calls[self.current_tool_id]
-        if not isinstance(current_tool_call, dict):
-            return None
+            call_complete = end_of_call or self.current_tool_id + 1 < len(
+                parsed_tool_calls
+            )
 
-        if not self.current_tool_name_sent:
-            function_name = current_tool_call.get("name")
-            if not function_name:
-                return None
+            if not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if not function_name:
+                    break
 
-            arguments_delta = ""
-            if "arguments" in current_tool_call:
-                arguments_delta = self._compute_arguments_delta(
-                    current_tool_call["arguments"], end_of_call
-                )
-                if arguments_delta:
-                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
-                    self._tool_args_emitted[self.current_tool_id] = True
+                arguments_delta = ""
+                if "arguments" in current_tool_call:
+                    arguments_delta = self._compute_arguments_delta(
+                        current_tool_call["arguments"], call_complete
+                    )
+                    if arguments_delta:
+                        self.streamed_args_for_tool[self.current_tool_id] += (
+                            arguments_delta
+                        )
+                        self._tool_args_emitted[self.current_tool_id] = True
 
-            self.current_tool_name_sent = True
-            return DeltaMessage(
-                tool_calls=[
+                self.current_tool_name_sent = True
+                tool_call_deltas.append(
                     DeltaToolCall(
                         index=self.current_tool_id,
                         id=make_tool_call_id(),
@@ -303,25 +282,36 @@ def extract_tool_calls_streaming(
                             arguments=arguments_delta or None,
                         ),
                     )
-                ]
-            )
+                )
+            elif "arguments" in current_tool_call:
+                arguments_delta = self._compute_arguments_delta(
+                    current_tool_call["arguments"], call_complete
+                )
+                if arguments_delta:
+                    self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
+                    self._tool_args_emitted[self.current_tool_id] = True
+                    tool_call_deltas.append(
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(arguments=arguments_delta),
+                        )
+                    )
+            elif not call_complete:
+                break
 
-        if "arguments" not in current_tool_call:
-            return None
+            if self.current_tool_id + 1 >= len(parsed_tool_calls):
+                break
 
-        arguments_delta = self._compute_arguments_delta(
-            current_tool_call["arguments"], end_of_call
-        )
-        if not arguments_delta:
-            return None
+            self.current_tool_id += 1
+            self.current_tool_name_sent = False
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+            while len(self._tool_args_emitted) <= self.current_tool_id:
+                self._tool_args_emitted.append(False)
 
-        self.streamed_args_for_tool[self.current_tool_id] += arguments_delta
-        self._tool_args_emitted[self.current_tool_id] = True
-        return DeltaMessage(
-            tool_calls=[
-                DeltaToolCall(
-                    index=self.current_tool_id,
-                    function=DeltaFunctionCall(arguments=arguments_delta),
-                )
-            ]
-        )
+        if content_delta is not None or tool_call_deltas:
+            return DeltaMessage(
+                content=content_delta,
+                tool_calls=tool_call_deltas or None,
+            )
+        return None

From ce7222b6972a3d1fdd9f017f010b7df377859cda Mon Sep 17 00:00:00 2001
From: Dong Wang <dongw2019@gmail.com>
Date: Mon, 11 May 2026 00:51:57 +0000
Subject: [PATCH 3/4] cover the case where the tool_name needs to be escaped

Signed-off-by: Dong Wang <dongw2019@gmail.com>
---
 .../tool_chat_template_nemotron_json.jinja    |  4 +-
 .../test_nemotron_json_chat_template.py       | 45 +++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 tests/renderers/test_nemotron_json_chat_template.py

diff --git a/examples/tool_chat_template_nemotron_json.jinja b/examples/tool_chat_template_nemotron_json.jinja
index f6f0a380c107..5bb9478d9974 100644
--- a/examples/tool_chat_template_nemotron_json.jinja
+++ b/examples/tool_chat_template_nemotron_json.jinja
@@ -86,7 +86,9 @@
             {{- '<TOOLCALL>[' -}}
             {%- for call in message.tool_calls -%}
                 {%- set fn = call.function if call.function is defined else call -%}
-                {{- '{"name": "' + fn.name + '", "arguments": ' -}}
+                {{- '{"name": ' -}}
+                {{- fn.name | tojson -}}
+                {{- ', "arguments": ' -}}
                 {%- if fn.arguments is string -%}
                     {{- fn.arguments -}}
                 {%- else -%}
diff --git a/tests/renderers/test_nemotron_json_chat_template.py b/tests/renderers/test_nemotron_json_chat_template.py
new file mode 100644
index 000000000000..93a641e1fca9
--- /dev/null
+++ b/tests/renderers/test_nemotron_json_chat_template.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from pathlib import Path
+
+import jinja2.sandbox
+
+TEMPLATE_PATH = (
+    Path(__file__).resolve().parent.parent.parent
+    / "examples"
+    / "tool_chat_template_nemotron_json.jinja"
+)
+
+
+def test_tool_call_name_is_json_escaped():
+    template = jinja2.sandbox.ImmutableSandboxedEnvironment().from_string(
+        TEMPLATE_PATH.read_text()
+    )
+    tool_name = 'search"quoted\\name'
+    rendered = template.render(
+        messages=[
+            {"role": "user", "content": "Search docs"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": tool_name,
+                            "arguments": {"query": "vllm"},
+                        },
+                    }
+                ],
+            },
+            {"role": "tool", "content": '{"result": "ok"}'},
+        ],
+        add_generation_prompt=False,
+    )
+
+    payload = rendered.split("<TOOLCALL>", 1)[1].split("</TOOLCALL>", 1)[0]
+    tool_calls = json.loads(payload)
+
+    assert tool_calls[0]["name"] == tool_name
+    assert tool_calls[0]["arguments"] == {"query": "vllm"}

From 644451fdcaa660e647fb44a8b6f412aff8dea408 Mon Sep 17 00:00:00 2001
From: Dong Wang <dongw2019@gmail.com>
Date: Mon, 11 May 2026 00:57:10 +0000
Subject: [PATCH 4/4] rename parser to be specific to Nemotron Nano v2

Signed-off-by: Dong Wang <dongw2019@gmail.com>
---
 docs/features/tool_calling.md                              | 7 ++++---
 ...son.jinja => tool_chat_template_nemotron_nano_v2.jinja} | 0
 ..._template.py => test_nemotron_nano_v2_chat_template.py} | 2 +-
 ...tool_parser.py => test_nemotron_nano_v2_tool_parser.py} | 6 +++---
 vllm/tool_parsers/__init__.py                              | 6 +++---
 ...json_tool_parser.py => nemotron_nano_v2_tool_parser.py} | 4 ++--
 6 files changed, 13 insertions(+), 12 deletions(-)
 rename examples/{tool_chat_template_nemotron_json.jinja => tool_chat_template_nemotron_nano_v2.jinja} (100%)
 rename tests/renderers/{test_nemotron_json_chat_template.py => test_nemotron_nano_v2_chat_template.py} (96%)
 rename tests/tool_parsers/{test_nemotron_json_tool_parser.py => test_nemotron_nano_v2_tool_parser.py} (97%)
 rename vllm/tool_parsers/{nemotron_json_tool_parser.py => nemotron_nano_v2_tool_parser.py} (98%)

diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 5eb11c876029..ee543c47d4af 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -324,15 +324,16 @@ Supported models:
 
 Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`
 
-### Nemotron Models (`nemotron_json`)
+### Nemotron Nano v2 Models (`nemotron_nano_v2`)
 
 Supported models:
 
-* `nvidia/NVIDIA-Nemotron-Nano-9B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_json.jinja](../../examples/tool_chat_template_nemotron_json.jinja))
+* `nvidia/NVIDIA-Nemotron-Nano-9B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_nano_v2.jinja](../../examples/tool_chat_template_nemotron_nano_v2.jinja))
+* `nvidia/NVIDIA-Nemotron-Nano-12B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_nano_v2.jinja](../../examples/tool_chat_template_nemotron_nano_v2.jinja))
 
 The parser handles the `<TOOLCALL>[{"name": ..., "arguments": ...}, ...]</TOOLCALL>` envelope emitted by the Nemotron chat template, and works with the model's hybrid thinking mode: any `<think>...</think>` prefix is preserved as message content (or stripped by a reasoning parser if one is configured).
 
-Flags: `--tool-call-parser nemotron_json --chat-template examples/tool_chat_template_nemotron_json.jinja`
+Flags: `--tool-call-parser nemotron_nano_v2 --chat-template examples/tool_chat_template_nemotron_nano_v2.jinja`
 
 ### DeepSeek-V3 Models (`deepseek_v3`)
 
diff --git a/examples/tool_chat_template_nemotron_json.jinja b/examples/tool_chat_template_nemotron_nano_v2.jinja
similarity index 100%
rename from examples/tool_chat_template_nemotron_json.jinja
rename to examples/tool_chat_template_nemotron_nano_v2.jinja
diff --git a/tests/renderers/test_nemotron_json_chat_template.py b/tests/renderers/test_nemotron_nano_v2_chat_template.py
similarity index 96%
rename from tests/renderers/test_nemotron_json_chat_template.py
rename to tests/renderers/test_nemotron_nano_v2_chat_template.py
index 93a641e1fca9..4b4ce37ca035 100644
--- a/tests/renderers/test_nemotron_json_chat_template.py
+++ b/tests/renderers/test_nemotron_nano_v2_chat_template.py
@@ -9,7 +9,7 @@
 TEMPLATE_PATH = (
     Path(__file__).resolve().parent.parent.parent
     / "examples"
-    / "tool_chat_template_nemotron_json.jinja"
+    / "tool_chat_template_nemotron_nano_v2.jinja"
 )
 
 
diff --git a/tests/tool_parsers/test_nemotron_json_tool_parser.py b/tests/tool_parsers/test_nemotron_nano_v2_tool_parser.py
similarity index 97%
rename from tests/tool_parsers/test_nemotron_json_tool_parser.py
rename to tests/tool_parsers/test_nemotron_nano_v2_tool_parser.py
index 0106663f0407..f693ef6aa941 100644
--- a/tests/tool_parsers/test_nemotron_json_tool_parser.py
+++ b/tests/tool_parsers/test_nemotron_nano_v2_tool_parser.py
@@ -29,12 +29,12 @@ def mock_request():
 
 @pytest.fixture
 def parser(mock_tokenizer):
-    parser_cls = ToolParserManager.get_tool_parser("nemotron_json")
+    parser_cls = ToolParserManager.get_tool_parser("nemotron_nano_v2")
     return parser_cls(mock_tokenizer, tools=[])
 
 
-def test_nemotron_json_registered_and_accepts_tools(mock_tokenizer):
-    parser_cls = ToolParserManager.get_tool_parser("nemotron_json")
+def test_nemotron_nano_v2_registered_and_accepts_tools(mock_tokenizer):
+    parser_cls = ToolParserManager.get_tool_parser("nemotron_nano_v2")
 
     parser = parser_cls(mock_tokenizer, tools=[])
 
diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py
index e26070cdecb4..45f026888894 100644
--- a/vllm/tool_parsers/__init__.py
+++ b/vllm/tool_parsers/__init__.py
@@ -130,9 +130,9 @@
         "minimax_tool_parser",
         "MinimaxToolParser",
     ),
-    "nemotron_json": (
-        "nemotron_json_tool_parser",
-        "NemotronJSONToolParser",
+    "nemotron_nano_v2": (
+        "nemotron_nano_v2_tool_parser",
+        "NemotronNanoV2ToolParser",
     ),
     "mistral": (
         "mistral_tool_parser",
diff --git a/vllm/tool_parsers/nemotron_json_tool_parser.py b/vllm/tool_parsers/nemotron_nano_v2_tool_parser.py
similarity index 98%
rename from vllm/tool_parsers/nemotron_json_tool_parser.py
rename to vllm/tool_parsers/nemotron_nano_v2_tool_parser.py
index 10ca2c5c647a..0e1221228f24 100644
--- a/vllm/tool_parsers/nemotron_json_tool_parser.py
+++ b/vllm/tool_parsers/nemotron_nano_v2_tool_parser.py
@@ -30,8 +30,8 @@
 logger = init_logger(__name__)
 
 
-class NemotronJSONToolParser(ToolParser):
-    """Tool parser for Nemotron models that emit <TOOLCALL> JSON payloads."""
+class NemotronNanoV2ToolParser(ToolParser):
+    """Tool parser for Nemotron Nano v2 models that emit <TOOLCALL> JSON."""
 
     def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
         super().__init__(tokenizer, tools)