From 4b6abc6e36ed61843a1c427c4e72e182a4928eec Mon Sep 17 00:00:00 2001 From: Dong Wang Date: Mon, 11 May 2026 00:03:05 +0000 Subject: [PATCH 1/4] tool_parsers: add Nemotron JSON tool parser Register a built-in parser for Nemotron JSON payloads, add a matching chat template example, and cover streaming extraction for content-plus-tool and parallel-call chunk boundaries. Signed-off-by: Dong Wang --- docs/features/tool_calling.md | 14 +- .../tool_chat_template_nemotron_json.jinja | 135 ++++++++ .../test_nemotron_json_tool_parser.py | 205 +++++++++++ vllm/tool_parsers/__init__.py | 4 + .../tool_parsers/nemotron_json_tool_parser.py | 327 ++++++++++++++++++ 5 files changed, 683 insertions(+), 2 deletions(-) create mode 100644 examples/tool_chat_template_nemotron_json.jinja create mode 100644 tests/tool_parsers/test_nemotron_json_tool_parser.py create mode 100644 vllm/tool_parsers/nemotron_json_tool_parser.py diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 9c60255d6928..5eb11c876029 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -324,6 +324,16 @@ Supported models: Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja` +### Nemotron Models (`nemotron_json`) + +Supported models: + +* `nvidia/NVIDIA-Nemotron-Nano-9B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_json.jinja](../../examples/tool_chat_template_nemotron_json.jinja)) + +The parser handles the `[{"name": ..., "arguments": ...}, ...]` envelope emitted by the Nemotron chat template, and works with the model's hybrid thinking mode: any `...` prefix is preserved as message content (or stripped by a reasoning parser if one is configured). + +Flags: `--tool-call-parser nemotron_json --chat-template examples/tool_chat_template_nemotron_json.jinja` + ### DeepSeek-V3 Models (`deepseek_v3`) Supported models: @@ -510,8 +520,8 @@ Here is a summary of a plugin file: # in --tool-call-parser. 
you can define as many # tool parsers as you want here. class ExampleToolParser(ToolParser): - def __init__(self, tokenizer: TokenizerLike): - super().__init__(tokenizer) + def __init__(self, tokenizer: TokenizerLike, tools=None): + super().__init__(tokenizer, tools) # adjust request. e.g.: set skip special tokens # to False for tool call output. diff --git a/examples/tool_chat_template_nemotron_json.jinja b/examples/tool_chat_template_nemotron_json.jinja new file mode 100644 index 000000000000..f6f0a380c107 --- /dev/null +++ b/examples/tool_chat_template_nemotron_json.jinja @@ -0,0 +1,135 @@ +{%- set ns = namespace(enable_thinking=true) -%} + +{%- for message in messages -%} + {%- set content = message['content'] -%} + {%- if message['role'] == 'user' or message['role'] == 'system' -%} + {%- if '/think' in content -%} + {%- set ns.enable_thinking = true -%} + {%- elif '/no_think' in content -%} + {%- set ns.enable_thinking = false -%} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if messages[0]['role'] != 'system' -%} + {%- set ns.non_tool_system_content = '' -%} + {{- 'System\n' -}} +{%- else -%} + {%- set ns.non_tool_system_content = messages[0]['content'] + .replace('/think', '') + .replace('/no_think', '') + .strip() + -%} + {{- 'System\n' + ns.non_tool_system_content }} +{%- endif -%} + +{%- if tools -%} + {%- if ns.non_tool_system_content is defined + and ns.non_tool_system_content != '' -%} + {{- '\n\n' -}} + {%- endif -%} + + {{- 'You can use the following tools to assist the user if required:' -}} + {{- '\n[' -}} + {%- for tool in tools -%} + {{- (tool.function if tool.function is defined else tool) | tojson -}} + {{- ', ' if not loop.last else '' -}} + {%- endfor -%} + {{- ']\n\n' -}} + + {{- 'If you decide to call any tool(s), use the following format:\n' -}} + {{- '[{{"name": "tool_name1", "arguments": "tool_args1"}}, ' -}} + {{- '{{"name": "tool_name2", "arguments": "tool_args2"}}]\n\n' -}} + + {{- 'The user will execute tool-calls and return 
responses from tool(s) in this format:\n' -}} + {{- '[{{"tool_response1"}}, {{"tool_response2"}}]\n\n' -}} + + {{- 'Based on the tool responses, you can call additional tools if needed, correct tool calls if any errors are found, or just respond to the user.' -}} +{%- endif -%} + +{{- '\n' -}} + +{%- set messages = messages[1:] if messages[0]['role'] == 'system' else messages -%} + +{%- if messages[-1]['role'] == 'assistant' -%} + {%- set ns.last_turn_assistant_content = messages[-1]['content'].strip() -%} + {%- set messages = messages[:-1] -%} +{%- endif -%} + +{%- for message in messages -%} + {%- set content = message['content'] -%} + + {%- if message['role'] == 'user' -%} + {{- 'User\n' + content.replace('/think', '').replace('/no_think', '').strip() + '\n' }} + + {%- elif message['role'] == 'tool' -%} + {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%} + {{- 'User\n' + '[' }} + {%- endif -%} + {{- message['content'] -}} + {{- ', ' if not loop.last and (messages[loop.index0 + 1].role == 'tool') else '' -}} + {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%} + {{- ']\n' -}} + {%- endif -%} + + {%- elif message['role'] == 'assistant' -%} + {%- if '' in content -%} + {%- set content = content.split('')[1].strip() -%} + {%- endif -%} + + {{- 'Assistant\n' + content.strip() }} + + {%- if message.tool_calls -%} + {%- if content.strip() != '' -%} + {{- '\n\n' -}} + {%- endif -%} + {{- '[' -}} + {%- for call in message.tool_calls -%} + {%- set fn = call.function if call.function is defined else call -%} + {{- '{"name": "' + fn.name + '", "arguments": ' -}} + {%- if fn.arguments is string -%} + {{- fn.arguments -}} + {%- else -%} + {{- fn.arguments | tojson -}} + {%- endif -%} + {{- '}' + (', ' if not loop.last else '') -}} + {%- endfor -%} + {{- ']' -}} + {%- endif -%} + + {{- '\n\n' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{- 'Assistant\n' -}} + {%- if ns.enable_thinking is defined and 
ns.enable_thinking is false -%} + {{- '' -}} + {%- else -%} + {{- '\n' -}} + {%- endif -%} + {%- if ns.last_turn_assistant_content is defined + and ns.last_turn_assistant_content != '' -%} + {{- ns.last_turn_assistant_content -}} + {%- endif -%} + +{%- else -%} + {%- if ns.last_turn_assistant_content is defined + and ns.last_turn_assistant_content != '' -%} + {{- 'Assistant\n' -}} + {%- if ns.enable_thinking is defined and ns.enable_thinking is false -%} + {{- '' -}} + {%- else -%} + {{- '\n' -}} + {%- endif -%} + {{- ns.last_turn_assistant_content -}} + + {%- if continue_final_message is defined -%} + {%- if continue_final_message is false -%} + {{- '\n\n' -}} + {%- endif -%} + {%- else -%} + {{- '\n\n' -}} + {%- endif -%} + {%- endif -%} +{%- endif -%} diff --git a/tests/tool_parsers/test_nemotron_json_tool_parser.py b/tests/tool_parsers/test_nemotron_json_tool_parser.py new file mode 100644 index 000000000000..3d885519efad --- /dev/null +++ b/tests/tool_parsers/test_nemotron_json_tool_parser.py @@ -0,0 +1,205 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock + +import pytest + +from tests.tool_parsers.utils import run_tool_extraction +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.protocol import ExtractedToolCallInformation +from vllm.tool_parsers import ToolParserManager + + +@pytest.fixture +def mock_tokenizer(): + tokenizer = MagicMock() + tokenizer.get_vocab.return_value = {} + tokenizer.tokenize.side_effect = lambda text: list(text) + return tokenizer + + +@pytest.fixture +def mock_request(): + request = MagicMock(spec=ChatCompletionRequest) + request.tools = [] + request.tool_choice = "auto" + return request + + +@pytest.fixture +def parser(mock_tokenizer): + parser_cls = ToolParserManager.get_tool_parser("nemotron_json") + return parser_cls(mock_tokenizer, tools=[]) + + +def 
test_nemotron_json_registered_and_accepts_tools(mock_tokenizer): + parser_cls = ToolParserManager.get_tool_parser("nemotron_json") + + parser = parser_cls(mock_tokenizer, tools=[]) + + assert parser.tool_call_start_token == "" + + +def test_extract_tool_calls_returns_content_without_tool_call(parser, mock_request): + model_output = "No tool call here." + + result = parser.extract_tool_calls(model_output, mock_request) + + assert isinstance(result, ExtractedToolCallInformation) + assert result.tools_called is False + assert result.tool_calls == [] + assert result.content == model_output + + +def test_extract_tool_calls_from_nemotron_array(parser, mock_request): + model_output = ( + "Let me check that." + '[{"name": "get_weather", ' + '"arguments": {"city": "Tokyo", "unit": "celsius"}}]' + ) + + result = parser.extract_tool_calls(model_output, mock_request) + + assert result.tools_called is True + assert result.content == "Let me check that." + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].type == "function" + assert result.tool_calls[0].function.name == "get_weather" + assert result.tool_calls[0].function.arguments == ( + '{"city": "Tokyo", "unit": "celsius"}' + ) + + +def test_extract_tool_calls_wraps_single_object(parser, mock_request): + model_output = ( + '{"name": "lookup", "arguments": {"query": "vllm"}}' + ) + + result = parser.extract_tool_calls(model_output, mock_request) + + assert result.tools_called is True + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].function.name == "lookup" + assert result.tool_calls[0].function.arguments == '{"query": "vllm"}' + + +def test_extract_tool_calls_supports_string_arguments(parser, mock_request): + model_output = ( + '[{"name": "run_query", ' + '"arguments": "{\\"sql\\": \\"select 1\\"}"}]' + ) + + result = parser.extract_tool_calls(model_output, mock_request) + + assert result.tools_called is True + assert result.tool_calls[0].function.name == "run_query" + assert 
result.tool_calls[0].function.arguments == '{"sql": "select 1"}' + + +def test_extract_tool_calls_returns_original_for_malformed(parser, mock_request): + model_output = '[{"name": "broken", "arguments": {}' + + result = parser.extract_tool_calls(model_output, mock_request) + + assert result.tools_called is False + assert result.tool_calls == [] + assert result.content == model_output + + +def test_streaming_reconstructs_tool_call(parser, mock_request): + model_output = ( + "Let me check." + '[{"name": "get_weather", ' + '"arguments": {"city": "Tokyo", "unit": "celsius"}}]' + ) + + content, tool_calls = run_tool_extraction( + parser, + list(model_output), + request=mock_request, + streaming=True, + ) + + assert content == "Let me check." + assert len(tool_calls) == 1 + assert tool_calls[0].function.name == "get_weather" + assert tool_calls[0].function.arguments == ('{"city": "Tokyo", "unit": "celsius"}') + + +def test_streaming_handles_nested_json_arguments(parser, mock_request): + model_output = ( + '[{"name": "search", ' + '"arguments": {"filters": {"city": "Tokyo"}, ' + '"items": [{"name": "rain", "value": true}]}}]' + ) + + content, tool_calls = run_tool_extraction( + parser, + list(model_output), + request=mock_request, + streaming=True, + ) + + assert content is None + assert len(tool_calls) == 1 + assert tool_calls[0].function.name == "search" + assert tool_calls[0].function.arguments == ( + '{"filters": {"city": "Tokyo"}, "items": [{"name": "rain", "value": true}]}' + ) + + +def test_extract_tool_calls_keeps_think_block_as_content(parser, mock_request): + model_output = ( + "\nI need the weather for Tokyo.\n\n" + '[{"name": "get_weather", "arguments": {"city": "Tokyo"}}]' + ) + + result = parser.extract_tool_calls(model_output, mock_request) + + assert result.tools_called is True + assert result.content == "\nI need the weather for Tokyo.\n\n" + assert result.tool_calls[0].function.name == "get_weather" + assert result.tool_calls[0].function.arguments == 
'{"city": "Tokyo"}' + + +def test_streaming_keeps_think_block_as_content(parser, mock_request): + model_output = ( + "\nI need the weather for Tokyo.\n\n" + '[{"name": "get_weather", "arguments": {"city": "Tokyo"}}]' + ) + + content, tool_calls = run_tool_extraction( + parser, + list(model_output), + request=mock_request, + streaming=True, + ) + + assert content == "\nI need the weather for Tokyo.\n\n" + assert len(tool_calls) == 1 + assert tool_calls[0].function.name == "get_weather" + assert tool_calls[0].function.arguments == '{"city": "Tokyo"}' + + +def test_streaming_handles_multiple_tool_calls(parser, mock_request): + model_output = ( + '[{"name": "get_weather", ' + '"arguments": {"city": "Tokyo"}}, ' + '{"name": "lookup_timezone", ' + '"arguments": {"city": "Tokyo"}}]' + ) + + content, tool_calls = run_tool_extraction( + parser, + list(model_output), + request=mock_request, + streaming=True, + ) + + assert content is None + assert len(tool_calls) == 2 + assert tool_calls[0].function.name == "get_weather" + assert tool_calls[0].function.arguments == '{"city": "Tokyo"}' + assert tool_calls[1].function.name == "lookup_timezone" + assert tool_calls[1].function.arguments == '{"city": "Tokyo"}' diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py index 7c5f45d2022e..e26070cdecb4 100644 --- a/vllm/tool_parsers/__init__.py +++ b/vllm/tool_parsers/__init__.py @@ -130,6 +130,10 @@ "minimax_tool_parser", "MinimaxToolParser", ), + "nemotron_json": ( + "nemotron_json_tool_parser", + "NemotronJSONToolParser", + ), "mistral": ( "mistral_tool_parser", "MistralToolParser", diff --git a/vllm/tool_parsers/nemotron_json_tool_parser.py b/vllm/tool_parsers/nemotron_json_tool_parser.py new file mode 100644 index 000000000000..e015b5cba3d4 --- /dev/null +++ b/vllm/tool_parsers/nemotron_json_tool_parser.py @@ -0,0 +1,327 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from 
collections.abc import Sequence +from typing import Any + +import partial_json_parser +import regex as re +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest +from vllm.logger import init_logger +from vllm.tokenizers import TokenizerLike +from vllm.tool_parsers.abstract_tool_parser import Tool, ToolParser +from vllm.tool_parsers.utils import partial_tag_overlap + +logger = init_logger(__name__) + + +class NemotronJSONToolParser(ToolParser): + """Tool parser for Nemotron models that emit JSON payloads.""" + + def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None): + super().__init__(tokenizer, tools) + + self.tool_call_start_token = "" + self.tool_call_end_token = "" + self.tool_call_regex = re.compile( + rf"{self.tool_call_start_token}(.*?){self.tool_call_end_token}", + re.DOTALL, + ) + self._sent_content_idx = 0 + self._tool_args_emitted: list[bool] = [] + + def adjust_request( + self, request: ChatCompletionRequest | ResponsesRequest + ) -> ChatCompletionRequest | ResponsesRequest: + request = super().adjust_request(request) + if request.tools and request.tool_choice != "none": + request.skip_special_tokens = False + return request + + @staticmethod + def _normalize_tool_call_payload(payload: str) -> list[dict[str, Any]]: + payload = payload.strip() + if not payload.startswith("["): + payload = "[" + payload + if not payload.endswith("]"): + payload = payload + "]" + + parsed = json.loads(payload) + if isinstance(parsed, dict): + return [parsed] + if isinstance(parsed, list): + return [item for item in parsed if isinstance(item, dict)] + return [] + 
+ @staticmethod + def _serialize_arguments(arguments: Any) -> str: + if isinstance(arguments, str): + return arguments + return json.dumps(arguments, ensure_ascii=False) + + @staticmethod + def _strip_trailing_auto_closers(chunk: str) -> str: + idx = len(chunk) + while idx > 0 and chunk[idx - 1] in " \t\r\n}]": + idx -= 1 + while idx > 0 and chunk[idx - 1] == '"': + if idx - 2 >= 0 and chunk[idx - 2] == "\\": + break + idx -= 1 + return chunk[:idx] + + @staticmethod + def _common_prefix_len(left: str, right: str) -> int: + max_len = min(len(left), len(right)) + idx = 0 + while idx < max_len and left[idx] == right[idx]: + idx += 1 + return idx + + def _compute_arguments_delta(self, arguments: Any, end_of_call: bool) -> str: + if self.current_tool_id < 0: + return "" + + while len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + while len(self._tool_args_emitted) <= self.current_tool_id: + self._tool_args_emitted.append(False) + + cur_arguments = self._serialize_arguments(arguments) + streamed_prefix = self.streamed_args_for_tool[self.current_tool_id] + emitted_any = self._tool_args_emitted[self.current_tool_id] + + lcp_len = self._common_prefix_len(cur_arguments, streamed_prefix) + if lcp_len != len(streamed_prefix): + streamed_prefix = streamed_prefix[:lcp_len] + self.streamed_args_for_tool[self.current_tool_id] = streamed_prefix + + arguments_delta = cur_arguments[lcp_len:] + if not arguments_delta: + return "" + + if not end_of_call: + arguments_delta = self._strip_trailing_auto_closers(arguments_delta) + + if ( + not emitted_any + and not end_of_call + and arguments_delta + and arguments_delta.endswith("}") + ): + arguments_delta = arguments_delta[:-1] + if arguments_delta.endswith('"'): + arguments_delta = arguments_delta[:-1] + + return arguments_delta + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + if self.tool_call_start_token not 
in model_output: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output, + ) + + try: + payloads = self.tool_call_regex.findall(model_output) + tool_calls: list[ToolCall] = [] + for payload in payloads: + for raw_tool_call in self._normalize_tool_call_payload(payload): + try: + tool_calls.append( + ToolCall( + type="function", + function=FunctionCall( + name=raw_tool_call["name"], + arguments=self._serialize_arguments( + raw_tool_call["arguments"] + ), + ), + ) + ) + except Exception: + continue + + if not tool_calls: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output, + ) + + content = model_output[: model_output.find(self.tool_call_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if content else None, + ) + except Exception: + logger.exception("Error extracting tool call from response.") + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output, + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + if not previous_text: + self.current_tool_id = -1 + self.current_tool_name_sent = False + self.streamed_args_for_tool = [] + self._tool_args_emitted = [] + self._sent_content_idx = 0 + + start_idx = current_text.find(self.tool_call_start_token) + if start_idx == -1: + overlap = partial_tag_overlap(current_text, self.tool_call_start_token) + sendable_idx = len(current_text) - overlap + if sendable_idx > self._sent_content_idx: + content = current_text[self._sent_content_idx : sendable_idx] + self._sent_content_idx = sendable_idx + return DeltaMessage(content=content) + return None + + if self._sent_content_idx < start_idx: + content = 
current_text[self._sent_content_idx : start_idx] + self._sent_content_idx = start_idx + return DeltaMessage(content=content) + + payload_start = start_idx + len(self.tool_call_start_token) + payload_end = current_text.find(self.tool_call_end_token, payload_start) + end_of_call = payload_end != -1 + payload = current_text[ + payload_start : payload_end if end_of_call else len(current_text) + ] + if not payload.strip(): + return None + + flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR + try: + parsed_tool_calls = partial_json_parser.loads(payload, flags) + except ( + partial_json_parser.core.exceptions.MalformedJSON, + json.JSONDecodeError, + ValueError, + ): + return None + + if isinstance(parsed_tool_calls, dict): + parsed_tool_calls = [parsed_tool_calls] + if not isinstance(parsed_tool_calls, list) or not parsed_tool_calls: + return None + + if self.current_tool_id < 0: + self.current_tool_id = 0 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + self._tool_args_emitted.append(False) + + if self.current_tool_name_sent and self.current_tool_id + 1 < len( + parsed_tool_calls + ): + current_tool_call = parsed_tool_calls[self.current_tool_id] + if isinstance(current_tool_call, dict) and "arguments" in current_tool_call: + arguments_delta = self._compute_arguments_delta( + current_tool_call["arguments"], True + ) + if arguments_delta: + self.streamed_args_for_tool[self.current_tool_id] += arguments_delta + self._tool_args_emitted[self.current_tool_id] = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall(arguments=arguments_delta), + ) + ] + ) + + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + self._tool_args_emitted.append(False) + + if self.current_tool_id >= len(parsed_tool_calls): + return None + + current_tool_call = parsed_tool_calls[self.current_tool_id] + if not 
isinstance(current_tool_call, dict): + return None + + if not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if not function_name: + return None + + arguments_delta = "" + if "arguments" in current_tool_call: + arguments_delta = self._compute_arguments_delta( + current_tool_call["arguments"], end_of_call + ) + if arguments_delta: + self.streamed_args_for_tool[self.current_tool_id] += arguments_delta + self._tool_args_emitted[self.current_tool_id] = True + + self.current_tool_name_sent = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + id=make_tool_call_id(), + type="function", + function=DeltaFunctionCall( + name=function_name, + arguments=arguments_delta or None, + ), + ) + ] + ) + + if "arguments" not in current_tool_call: + return None + + arguments_delta = self._compute_arguments_delta( + current_tool_call["arguments"], end_of_call + ) + if not arguments_delta: + return None + + self.streamed_args_for_tool[self.current_tool_id] += arguments_delta + self._tool_args_emitted[self.current_tool_id] = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall(arguments=arguments_delta), + ) + ] + ) From 32367d45e3ae37d03e9c73598406bc4e6e62fba2 Mon Sep 17 00:00:00 2001 From: Dong Wang Date: Mon, 11 May 2026 00:07:49 +0000 Subject: [PATCH 2/4] cover streaming extraction for content-plus-tool and parallel-call chunk boundaries Signed-off-by: Dong Wang --- .../test_nemotron_json_tool_parser.py | 45 +++++++ .../tool_parsers/nemotron_json_tool_parser.py | 120 ++++++++---------- 2 files changed, 100 insertions(+), 65 deletions(-) diff --git a/tests/tool_parsers/test_nemotron_json_tool_parser.py b/tests/tool_parsers/test_nemotron_json_tool_parser.py index 3d885519efad..0106663f0407 100644 --- a/tests/tool_parsers/test_nemotron_json_tool_parser.py +++ b/tests/tool_parsers/test_nemotron_json_tool_parser.py @@ -195,6 +195,51 @@ def 
test_streaming_handles_multiple_tool_calls(parser, mock_request): list(model_output), request=mock_request, streaming=True, + assert_one_tool_per_delta=False, + ) + + assert content is None + assert len(tool_calls) == 2 + assert tool_calls[0].function.name == "get_weather" + assert tool_calls[0].function.arguments == '{"city": "Tokyo"}' + assert tool_calls[1].function.name == "lookup_timezone" + assert tool_calls[1].function.arguments == '{"city": "Tokyo"}' + + +def test_streaming_single_delta_handles_content_and_tool_call(parser, mock_request): + model_output = ( + "Let me check." + '[{"name": "get_weather", ' + '"arguments": {"city": "Tokyo"}}]' + ) + + content, tool_calls = run_tool_extraction( + parser, + [model_output], + request=mock_request, + streaming=True, + ) + + assert content == "Let me check." + assert len(tool_calls) == 1 + assert tool_calls[0].function.name == "get_weather" + assert tool_calls[0].function.arguments == '{"city": "Tokyo"}' + + +def test_streaming_single_delta_handles_multiple_tool_calls(parser, mock_request): + model_output = ( + '[{"name": "get_weather", ' + '"arguments": {"city": "Tokyo"}}, ' + '{"name": "lookup_timezone", ' + '"arguments": {"city": "Tokyo"}}]' + ) + + content, tool_calls = run_tool_extraction( + parser, + [model_output], + request=mock_request, + streaming=True, + assert_one_tool_per_delta=False, ) assert content is None diff --git a/vllm/tool_parsers/nemotron_json_tool_parser.py b/vllm/tool_parsers/nemotron_json_tool_parser.py index e015b5cba3d4..10ca2c5c647a 100644 --- a/vllm/tool_parsers/nemotron_json_tool_parser.py +++ b/vllm/tool_parsers/nemotron_json_tool_parser.py @@ -210,10 +210,10 @@ def extract_tool_calls_streaming( return DeltaMessage(content=content) return None + content_delta: str | None = None if self._sent_content_idx < start_idx: - content = current_text[self._sent_content_idx : start_idx] + content_delta = current_text[self._sent_content_idx : start_idx] self._sent_content_idx = start_idx - return 
DeltaMessage(content=content) payload_start = start_idx + len(self.tool_call_start_token) payload_end = current_text.find(self.tool_call_end_token, payload_start) @@ -245,55 +245,34 @@ def extract_tool_calls_streaming( self.streamed_args_for_tool.append("") self._tool_args_emitted.append(False) - if self.current_tool_name_sent and self.current_tool_id + 1 < len( - parsed_tool_calls - ): + tool_call_deltas: list[DeltaToolCall] = [] + while self.current_tool_id < len(parsed_tool_calls): current_tool_call = parsed_tool_calls[self.current_tool_id] - if isinstance(current_tool_call, dict) and "arguments" in current_tool_call: - arguments_delta = self._compute_arguments_delta( - current_tool_call["arguments"], True - ) - if arguments_delta: - self.streamed_args_for_tool[self.current_tool_id] += arguments_delta - self._tool_args_emitted[self.current_tool_id] = True - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_id, - function=DeltaFunctionCall(arguments=arguments_delta), - ) - ] - ) - - self.current_tool_id += 1 - self.current_tool_name_sent = False - self.streamed_args_for_tool.append("") - self._tool_args_emitted.append(False) - - if self.current_tool_id >= len(parsed_tool_calls): - return None + if not isinstance(current_tool_call, dict): + break - current_tool_call = parsed_tool_calls[self.current_tool_id] - if not isinstance(current_tool_call, dict): - return None + call_complete = end_of_call or self.current_tool_id + 1 < len( + parsed_tool_calls + ) - if not self.current_tool_name_sent: - function_name = current_tool_call.get("name") - if not function_name: - return None + if not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if not function_name: + break - arguments_delta = "" - if "arguments" in current_tool_call: - arguments_delta = self._compute_arguments_delta( - current_tool_call["arguments"], end_of_call - ) - if arguments_delta: - self.streamed_args_for_tool[self.current_tool_id] += 
arguments_delta - self._tool_args_emitted[self.current_tool_id] = True + arguments_delta = "" + if "arguments" in current_tool_call: + arguments_delta = self._compute_arguments_delta( + current_tool_call["arguments"], call_complete + ) + if arguments_delta: + self.streamed_args_for_tool[self.current_tool_id] += ( + arguments_delta + ) + self._tool_args_emitted[self.current_tool_id] = True - self.current_tool_name_sent = True - return DeltaMessage( - tool_calls=[ + self.current_tool_name_sent = True + tool_call_deltas.append( DeltaToolCall( index=self.current_tool_id, id=make_tool_call_id(), @@ -303,25 +282,36 @@ def extract_tool_calls_streaming( arguments=arguments_delta or None, ), ) - ] - ) + ) + elif "arguments" in current_tool_call: + arguments_delta = self._compute_arguments_delta( + current_tool_call["arguments"], call_complete + ) + if arguments_delta: + self.streamed_args_for_tool[self.current_tool_id] += arguments_delta + self._tool_args_emitted[self.current_tool_id] = True + tool_call_deltas.append( + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall(arguments=arguments_delta), + ) + ) + elif not call_complete: + break - if "arguments" not in current_tool_call: - return None + if self.current_tool_id + 1 >= len(parsed_tool_calls): + break - arguments_delta = self._compute_arguments_delta( - current_tool_call["arguments"], end_of_call - ) - if not arguments_delta: - return None + self.current_tool_id += 1 + self.current_tool_name_sent = False + while len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + while len(self._tool_args_emitted) <= self.current_tool_id: + self._tool_args_emitted.append(False) - self.streamed_args_for_tool[self.current_tool_id] += arguments_delta - self._tool_args_emitted[self.current_tool_id] = True - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_id, - function=DeltaFunctionCall(arguments=arguments_delta), - ) - ] - ) + if 
content_delta is not None or tool_call_deltas: + return DeltaMessage( + content=content_delta, + tool_calls=tool_call_deltas or None, + ) + return None From ce7222b6972a3d1fdd9f017f010b7df377859cda Mon Sep 17 00:00:00 2001 From: Dong Wang Date: Mon, 11 May 2026 00:51:57 +0000 Subject: [PATCH 3/4] cover the case the too_name needs to be escaped Signed-off-by: Dong Wang --- .../tool_chat_template_nemotron_json.jinja | 4 +- .../test_nemotron_json_chat_template.py | 45 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 tests/renderers/test_nemotron_json_chat_template.py diff --git a/examples/tool_chat_template_nemotron_json.jinja b/examples/tool_chat_template_nemotron_json.jinja index f6f0a380c107..5bb9478d9974 100644 --- a/examples/tool_chat_template_nemotron_json.jinja +++ b/examples/tool_chat_template_nemotron_json.jinja @@ -86,7 +86,9 @@ {{- '[' -}} {%- for call in message.tool_calls -%} {%- set fn = call.function if call.function is defined else call -%} - {{- '{"name": "' + fn.name + '", "arguments": ' -}} + {{- '{"name": ' -}} + {{- fn.name | tojson -}} + {{- ', "arguments": ' -}} {%- if fn.arguments is string -%} {{- fn.arguments -}} {%- else -%} diff --git a/tests/renderers/test_nemotron_json_chat_template.py b/tests/renderers/test_nemotron_json_chat_template.py new file mode 100644 index 000000000000..93a641e1fca9 --- /dev/null +++ b/tests/renderers/test_nemotron_json_chat_template.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from pathlib import Path + +import jinja2.sandbox + +TEMPLATE_PATH = ( + Path(__file__).resolve().parent.parent.parent + / "examples" + / "tool_chat_template_nemotron_json.jinja" +) + + +def test_tool_call_name_is_json_escaped(): + template = jinja2.sandbox.ImmutableSandboxedEnvironment().from_string( + TEMPLATE_PATH.read_text() + ) + tool_name = 'search"quoted\\name' + rendered = template.render( + 
messages=[ + {"role": "user", "content": "Search docs"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": tool_name, + "arguments": {"query": "vllm"}, + }, + } + ], + }, + {"role": "tool", "content": '{"result": "ok"}'}, + ], + add_generation_prompt=False, + ) + + payload = rendered.split("", 1)[1].split("", 1)[0] + tool_calls = json.loads(payload) + + assert tool_calls[0]["name"] == tool_name + assert tool_calls[0]["arguments"] == {"query": "vllm"} From 644451fdcaa660e647fb44a8b6f412aff8dea408 Mon Sep 17 00:00:00 2001 From: Dong Wang Date: Mon, 11 May 2026 00:57:10 +0000 Subject: [PATCH 4/4] refactored parser name to be more specific toward Nano-V2 Signed-off-by: Dong Wang --- docs/features/tool_calling.md | 7 ++++--- ...son.jinja => tool_chat_template_nemotron_nano_v2.jinja} | 0 ..._template.py => test_nemotron_nano_v2_chat_template.py} | 2 +- ...tool_parser.py => test_nemotron_nano_v2_tool_parser.py} | 6 +++--- vllm/tool_parsers/__init__.py | 6 +++--- ...json_tool_parser.py => nemotron_nano_v2_tool_parser.py} | 4 ++-- 6 files changed, 13 insertions(+), 12 deletions(-) rename examples/{tool_chat_template_nemotron_json.jinja => tool_chat_template_nemotron_nano_v2.jinja} (100%) rename tests/renderers/{test_nemotron_json_chat_template.py => test_nemotron_nano_v2_chat_template.py} (96%) rename tests/tool_parsers/{test_nemotron_json_tool_parser.py => test_nemotron_nano_v2_tool_parser.py} (97%) rename vllm/tool_parsers/{nemotron_json_tool_parser.py => nemotron_nano_v2_tool_parser.py} (98%) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 5eb11c876029..ee543c47d4af 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -324,15 +324,16 @@ Supported models: Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja` -### Nemotron Models (`nemotron_json`) +### Nemotron Nano v2 Models (`nemotron_nano_v2`) Supported models: -* 
`nvidia/NVIDIA-Nemotron-Nano-9B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_json.jinja](../../examples/tool_chat_template_nemotron_json.jinja)) +* `nvidia/NVIDIA-Nemotron-Nano-9B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_nano_v2.jinja](../../examples/tool_chat_template_nemotron_nano_v2.jinja)) +* `nvidia/NVIDIA-Nemotron-Nano-12B-v2` (and FP8/NVFP4 variants; use with [examples/tool_chat_template_nemotron_nano_v2.jinja](../../examples/tool_chat_template_nemotron_nano_v2.jinja)) The parser handles the `<TOOLCALL>[{"name": ..., "arguments": ...}, ...]</TOOLCALL>` envelope emitted by the Nemotron chat template, and works with the model's hybrid thinking mode: any `<think>...</think>` prefix is preserved as message content (or stripped by a reasoning parser if one is configured). -Flags: `--tool-call-parser nemotron_json --chat-template examples/tool_chat_template_nemotron_json.jinja` +Flags: `--tool-call-parser nemotron_nano_v2 --chat-template examples/tool_chat_template_nemotron_nano_v2.jinja` ### DeepSeek-V3 Models (`deepseek_v3`) diff --git a/examples/tool_chat_template_nemotron_json.jinja b/examples/tool_chat_template_nemotron_nano_v2.jinja similarity index 100% rename from examples/tool_chat_template_nemotron_json.jinja rename to examples/tool_chat_template_nemotron_nano_v2.jinja diff --git a/tests/renderers/test_nemotron_json_chat_template.py b/tests/renderers/test_nemotron_nano_v2_chat_template.py similarity index 96% rename from tests/renderers/test_nemotron_json_chat_template.py rename to tests/renderers/test_nemotron_nano_v2_chat_template.py index 93a641e1fca9..4b4ce37ca035 100644 --- a/tests/renderers/test_nemotron_json_chat_template.py +++ b/tests/renderers/test_nemotron_nano_v2_chat_template.py @@ -9,7 +9,7 @@ TEMPLATE_PATH = ( Path(__file__).resolve().parent.parent.parent / "examples" - / "tool_chat_template_nemotron_json.jinja" + / "tool_chat_template_nemotron_nano_v2.jinja" ) diff --git
a/tests/tool_parsers/test_nemotron_json_tool_parser.py b/tests/tool_parsers/test_nemotron_nano_v2_tool_parser.py similarity index 97% rename from tests/tool_parsers/test_nemotron_json_tool_parser.py rename to tests/tool_parsers/test_nemotron_nano_v2_tool_parser.py index 0106663f0407..f693ef6aa941 100644 --- a/tests/tool_parsers/test_nemotron_json_tool_parser.py +++ b/tests/tool_parsers/test_nemotron_nano_v2_tool_parser.py @@ -29,12 +29,12 @@ def mock_request(): @pytest.fixture def parser(mock_tokenizer): - parser_cls = ToolParserManager.get_tool_parser("nemotron_json") + parser_cls = ToolParserManager.get_tool_parser("nemotron_nano_v2") return parser_cls(mock_tokenizer, tools=[]) -def test_nemotron_json_registered_and_accepts_tools(mock_tokenizer): - parser_cls = ToolParserManager.get_tool_parser("nemotron_json") +def test_nemotron_nano_v2_registered_and_accepts_tools(mock_tokenizer): + parser_cls = ToolParserManager.get_tool_parser("nemotron_nano_v2") parser = parser_cls(mock_tokenizer, tools=[]) diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py index e26070cdecb4..45f026888894 100644 --- a/vllm/tool_parsers/__init__.py +++ b/vllm/tool_parsers/__init__.py @@ -130,9 +130,9 @@ "minimax_tool_parser", "MinimaxToolParser", ), - "nemotron_json": ( - "nemotron_json_tool_parser", - "NemotronJSONToolParser", + "nemotron_nano_v2": ( + "nemotron_nano_v2_tool_parser", + "NemotronNanoV2ToolParser", ), "mistral": ( "mistral_tool_parser", diff --git a/vllm/tool_parsers/nemotron_json_tool_parser.py b/vllm/tool_parsers/nemotron_nano_v2_tool_parser.py similarity index 98% rename from vllm/tool_parsers/nemotron_json_tool_parser.py rename to vllm/tool_parsers/nemotron_nano_v2_tool_parser.py index 10ca2c5c647a..0e1221228f24 100644 --- a/vllm/tool_parsers/nemotron_json_tool_parser.py +++ b/vllm/tool_parsers/nemotron_nano_v2_tool_parser.py @@ -30,8 +30,8 @@ logger = init_logger(__name__) -class NemotronJSONToolParser(ToolParser): - """Tool parser for Nemotron 
models that emit JSON payloads.""" +class NemotronNanoV2ToolParser(ToolParser): + """Tool parser for Nemotron Nano v2 models that emit JSON.""" def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None): super().__init__(tokenizer, tools)