diff --git a/renderers/parsers.py b/renderers/parsers.py index 77fa850..a277a0e 100644 --- a/renderers/parsers.py +++ b/renderers/parsers.py @@ -20,6 +20,7 @@ from typing import Protocol, runtime_checkable from renderers.base import ParsedToolCall, ToolCallParseStatus +from renderers.parsing import _parse_xml_function_blocks # ── Shared helpers ─────────────────────────────────────────────────── @@ -156,7 +157,22 @@ def extract(self, ids: list[int]) -> tuple[list[int], list[ParsedToolCall]]: class Qwen35ToolParser: - """XML-style tool calls: ``V...``.""" + """XML-style tool calls: ``V...``. + + Mirrors vLLM's ``Qwen3CoderToolParser`` extraction logic — see + ``renderers.parsing`` for the full ladder and the rationale behind + picking ``qwen3_coder`` over ``qwen3_xml``. This parser drives + ``DefaultRenderer`` when callers wire it up with + ``tool_parser="qwen3.5"``; ``Qwen35Renderer.parse_response`` + routes through the same shared helpers via ``parse_qwen35``. + + Schema-aware coercion is intentionally inert here: ``ToolParser``'s + ``extract`` API does not receive the tool list (callers consume + the parsed values as opaque dicts), so ``param_index`` is empty + and every parameter goes through the no-schema branch of + ``_coerce_arg_value`` — vLLM-equivalent verbatim strings after the + case-insensitive ``null`` short-circuit. + """ def __init__(self, tokenizer): self._tokenizer = tokenizer @@ -169,69 +185,57 @@ def extract(self, ids: list[int]) -> tuple[list[int], list[ParsedToolCall]]: tc_start = _find(ids, self._tc_id) if tc_start == -1: return ids, [] + tool_calls: list[ParsedToolCall] = [] i = tc_start + param_index: dict = {} while i < len(ids): - if ids[i] == self._tc_id: - end = ( - _find(ids, self._tc_end_id, i + 1) - if self._tc_end_id is not None - else -1 - ) - if end == -1: - raw = _decode(self._tokenizer, ids[i + 1 :]) - tool_calls.append( - ParsedToolCall( - raw=raw, - token_span=(i, len(ids)), - status=ToolCallParseStatus.UNCLOSED_BLOCK, - ) - ) - break + if ids[i] != self._tc_id: + i += 1 + continue + + end = ( + _find(ids, self._tc_end_id, i + 1) + if self._tc_end_id is not None + else -1 + ) + if end == -1: + block_text = _decode(self._tokenizer, ids[i + 1 :]) + span = (i, len(ids)) + wrapper_unclosed = True + else: block_text = _decode(self._tokenizer, ids[i + 1 : end]) span = (i, end + 1) - name_match = re.search(r"]+)>", block_text) - if not name_match: - tool_calls.append( - ParsedToolCall( - raw=block_text, - token_span=span, - status=ToolCallParseStatus.MALFORMED_STRUCTURE, - ) - ) - i = end + 1 - continue - name = name_match.group(1) - arguments: dict = {} - any_json_fallback = False - for pm in re.finditer( - r"]+)>\n?(.*?)\n?", - block_text, - re.DOTALL, - ): - arg_name = pm.group(1) - arg_value = pm.group(2).strip() - try: - arguments[arg_name] = json.loads(arg_value) - except (json.JSONDecodeError, ValueError): - arguments[arg_name] = arg_value - any_json_fallback = True + wrapper_unclosed = False + + block_calls = _parse_xml_function_blocks( + block_text, + param_index=param_index, + token_span=span, + wrapper_unclosed=wrapper_unclosed, + ) + if block_calls: + tool_calls.extend(block_calls) + elif wrapper_unclosed: tool_calls.append( ParsedToolCall( raw=block_text, - name=name, - arguments=arguments, token_span=span, - status=( - ToolCallParseStatus.INVALID_JSON - if any_json_fallback - else ToolCallParseStatus.OK - ), + status=ToolCallParseStatus.UNCLOSED_BLOCK, ) ) - i = end + 1 else: - i += 1 + tool_calls.append( + ParsedToolCall( + raw=block_text, + token_span=span, + status=ToolCallParseStatus.MALFORMED_STRUCTURE, + ) + ) + + if wrapper_unclosed: + break + i = end + 1 return ids[:tc_start], tool_calls diff --git a/renderers/parsing.py b/renderers/parsing.py index ee26e89..0a8f25a 100644 --- a/renderers/parsing.py +++ b/renderers/parsing.py @@ -17,20 +17,122 @@ from __future__ import annotations import json +import re +import uuid from typing import Any from renderers.base import ParsedResponse, ParsedToolCall, ToolCallParseStatus, ToolSpec +# ── vLLM Qwen3CoderToolParser regex patterns ──────────────────────── +# +# Lifted verbatim from ``vllm/tool_parsers/qwen3coder_tool_parser.py`` +# (``self.tool_call_regex`` / ``tool_call_function_regex`` / +# ``tool_call_parameter_regex``). Each pattern has two branches: one +# for the closed form (with the matching end tag) and one anchored at +# end-of-string for unclosed output. The parameter regex uses +# positive lookaheads so a missing ```` is recovered from +# the next ```` boundary instead of +# silently dropping the value. + + +# Mask matching vLLM's ``vllm.utils.MASK_64_BITS`` so the random id we +# generate is byte-shape compatible with vLLM's ``random_uuid()``. +_MASK_64_BITS = (1 << 64) - 1 + + +def _make_tool_call_id() -> str: + """Mint a tool-call id in vLLM's default format. + + vLLM's ``ToolCall`` dataclass uses ``default_factory=make_tool_call_id`` + which returns ``f"chatcmpl-tool-{random_uuid()}"`` where + ``random_uuid`` is ``f"{uuid.uuid4().int & MASK_64_BITS:016x}"`` — + 16 hex characters drawn from the low 64 bits of a v4 UUID. We + reproduce the format so OpenAI-shaped clients get the same id + surface across renderers and vLLM serving paths. + """ + return f"chatcmpl-tool-{uuid.uuid4().int & _MASK_64_BITS:016x}" + + +_FUNCTION_BLOCK_RE = re.compile( + r"||(?=)|$)", + re.DOTALL, +) + + # ── Schema-aware argument coercion ────────────────────────────────── # # XML-style tool-call formats render argument values verbatim inside -# ```` tags with no quoting. ``true`` and the string +# ```` tags with no quoting. ``true`` and the string # ``"true"`` produce identical wire bytes; without the tool schema, the -# parser has no signal to distinguish them and defaults to -# ``json.loads`` (the historical behavior). When the caller passes -# ``tools=[...]``, parsers consult the per-parameter declared type to -# keep string args verbatim, matching vLLM / SGLang reference parsers. +# parser has no signal to distinguish them. We mirror vLLM's shared +# ``extract_types_from_schema`` + ``coerce_to_schema_type`` helpers in +# ``vllm/tool_parsers/utils.py``: +# +# 1. Flatten the JSON-Schema fragment to a set of types, recursing +# through ``anyOf`` / ``oneOf`` / ``allOf`` and inferring types from +# ``enum`` values. A schema with no extractable type information +# defaults to ``["string"]``. +# 2. Walk the priority-ordered ladder ``null > integer > number > +# boolean > object > array > string``, returning the first +# successful coercion. ``string`` is the always-succeeding terminal +# whenever it appears in the type set, which subsumes both pure +# string params and union types like ``Union[str, bool]``. +# +# Why ``qwen3_coder`` semantics and not ``qwen3_xml``: vLLM ships two +# parsers for the same wire format. ``qwen3_xml`` is the newer +# streaming-first parser (uses ``xml.parsers.expat`` and is the +# recommended choice for *live serving* because ``qwen3_coder`` has +# known infinite-loop bugs in vLLM's streaming path). For offline +# token-level extraction we follow ``qwen3_coder``, which itself routes +# through the shared ``coerce_to_schema_type`` helper above — so the +# two converge on scalar semantics in non-streaming use. +# TODO: when a streaming parse API lands, switch the streaming path to +# ``qwen3_xml``-style state-machine semantics to dodge vLLM's +# regex-streaming bugs (see HF Qwen3-Coder-Next discussion #17). + + +# Type aliases mirror ``vllm/tool_parsers/utils.py:_TYPE_ALIASES``. +_TYPE_ALIASES: dict[str, str] = { + "str": "string", + "text": "string", + "varchar": "string", + "char": "string", + "enum": "string", + "int": "integer", + "int32": "integer", + "int64": "integer", + "uint": "integer", + "uint32": "integer", + "uint64": "integer", + "long": "integer", + "short": "integer", + "unsigned": "integer", + "float": "number", + "float32": "number", + "float64": "number", + "double": "number", + "bool": "boolean", + "dict": "object", + "arr": "array", + "list": "array", + "sequence": "array", +} + + +# Priority order mirrors ``vllm/tool_parsers/utils.py:coerce_to_schema_type``. +_TYPE_PRIORITY = ( + "null", + "integer", + "number", + "boolean", + "object", + "array", + "string", +) def _build_param_type_index( @@ -59,42 +161,124 @@ def _build_param_type_index( return index -def _coerce_arg_value( - text: str, param_schema: dict[str, Any] | None -) -> tuple[Any, bool]: - """Coerce a raw ```` body to its declared type. +def _extract_types_from_schema(schema: Any) -> list[str]: + """Flatten a JSON-Schema fragment to a list of type strings. - Returns ``(value, used_json_fallback)``. The boolean is ``True`` only - when ``json.loads`` was attempted, raised, AND the schema doesn't - permit a string. Returning a string verbatim because the schema - permits strings is NOT a fallback. - - Rule (matches vLLM / SGLang reference parsers): + Byte-for-byte port of ``vllm/tool_parsers/utils.py:extract_types_from_schema``: + handles top-level ``type`` (string or list), infers types from + ``enum`` values, and recurses through ``anyOf`` / ``oneOf`` / + ``allOf``. Returns ``["string"]`` when no type information can be + determined — which makes the no-schema branch coerce to ``string`` + via the priority-ordered ladder in :func:`_coerce_arg_value`. + """ + if schema is None or not isinstance(schema, dict): + return ["string"] + + types: set[str] = set() + + if "type" in schema: + type_value = schema["type"] + if isinstance(type_value, str): + types.add(type_value) + elif isinstance(type_value, list): + for t in type_value: + if isinstance(t, str): + types.add(t) + + if "enum" in schema and isinstance(schema["enum"], list) and schema["enum"]: + for value in schema["enum"]: + if value is None: + types.add("null") + elif isinstance(value, bool): + types.add("boolean") + elif isinstance(value, int): + types.add("integer") + elif isinstance(value, float): + types.add("number") + elif isinstance(value, str): + types.add("string") + elif isinstance(value, list): + types.add("array") + elif isinstance(value, dict): + types.add("object") + + for choice_field in ("anyOf", "oneOf", "allOf"): + if choice_field in schema and isinstance(schema[choice_field], list): + for choice in schema[choice_field]: + types.update(_extract_types_from_schema(choice)) + + return list(types) if types else ["string"] - - If the param's declared ``type`` is ``"string"`` (or single-element - ``["string"]``), return ``text`` verbatim — never ``json.loads``. - - Otherwise try ``json.loads``. If that fails, return raw ``text``. - The ``used_json_fallback`` flag is ``True`` only when the schema - does NOT permit a string — i.e. the fallback is truly suspect. - Union types (``anyOf``/``oneOf``) that include ``"string"`` alongside - other types still attempt ``json.loads`` first so an explicit - integer / bool can parse; the string branch wins as fallback, and - landing there is expected — not a malformed-JSON signal. +def _coerce_arg_value( + text: str, param_schema: dict[str, Any] | None +) -> tuple[Any, bool]: + """Coerce a raw ```` body to its schema type. + + Mirrors vLLM ``coerce_to_schema_type`` (``vllm/tool_parsers/utils.py``): + flatten the schema to a set of JSON-Schema types (recursing through + ``anyOf``/``oneOf``/``allOf`` and ``enum``), then walk a priority- + ordered ladder ``null > integer > number > boolean > object > array + > string``, returning the first successful coercion. ``string`` is + the always-succeeding terminal whenever it is in the type set. + + Returns ``(value, used_fallback)``. ``used_fallback=True`` iff + every type in the schema declined the value AND the last-ditch + ``json.loads`` of the raw text also failed — i.e. the value + couldn't be expressed as anything the schema permits. The renderer + propagates that flag to ``ToolCallParseStatus.INVALID_JSON`` for the + verifier / RL-loss signal; vLLM has no such signal. """ - string_is_allowed = False - if param_schema is not None: - declared = param_schema.get("type") - if declared == "string" or declared == ["string"]: + schema_types = _extract_types_from_schema(param_schema) + normalized_types = { + _TYPE_ALIASES.get(key, key) + for t in schema_types + if isinstance(t, str) + for key in [t.strip().lower()] + } + + for candidate in _TYPE_PRIORITY: + if candidate not in normalized_types: + continue + if candidate == "null": + if text.lower() == "null": + return None, False + continue + if candidate == "string": return text, False - for branch in param_schema.get("anyOf") or param_schema.get("oneOf") or []: - if isinstance(branch, dict) and branch.get("type") == "string": - string_is_allowed = True - break + if candidate == "integer": + try: + return int(text), False + except (ValueError, TypeError): + continue + if candidate == "number": + # ``int(val)`` is inside the try block so ``nan`` (raises + # ``ValueError``) and ``inf`` (raises ``OverflowError``) skip + # the number branch instead of aborting the whole parse. + # vLLM's reference catches ``ValueError`` / ``TypeError`` only + # — we widen to ``OverflowError`` so ``"inf"`` doesn't crash. + try: + val = float(text) + return (val if val != int(val) else int(val)), False + except (ValueError, TypeError, OverflowError): + continue + if candidate == "boolean": + lowered = text.lower().strip() + if lowered in ("true", "1"): + return True, False + if lowered in ("false", "0"): + return False, False + continue + if candidate in ("object", "array"): + try: + return json.loads(text), False + except (json.JSONDecodeError, ValueError, TypeError): + continue + try: return json.loads(text), False except (json.JSONDecodeError, ValueError): - return text, not string_is_allowed + return text, True def _find(ids: list[int], target: int, start: int = 0) -> int: @@ -259,20 +443,49 @@ def parse_qwen35( content="", reasoning_content=reasoning or None, tool_calls=[] ) + full_text = _decode(tokenizer, ids) tc_start = _find(ids, tool_call_id) - tool_calls: list[ParsedToolCall] = [] - if tc_start != -1: - content_text = _decode(tokenizer, ids[:tc_start]).strip() - tool_calls = _parse_xml_tool_calls( - tokenizer, - ids[tc_start:], - tool_call_id, - tool_call_end_id, - section_offset=parse_offset + tc_start, - param_index=_build_param_type_index(tools), - ) - else: - content_text = _decode(tokenizer, ids).strip() + param_index = _build_param_type_index(tools) + try: + if tc_start != -1: + # vLLM ``qwen3coder_tool_parser.py:316-319``: content text + # is the raw slice up to the first ```` (or + # first ```` is present). + # No whitespace stripping — the model's prefix prose is + # surfaced verbatim. + content_text = _decode(tokenizer, ids[:tc_start]) + tool_calls = _parse_xml_tool_calls( + tokenizer, + ids[tc_start:], + tool_call_id, + tool_call_end_id, + section_offset=parse_offset + tc_start, + param_index=param_index, + ) + else: + # vLLM ``qwen3coder_tool_parser.py:269-271`` back-off: no + # ```` markers — scan whole output for raw + # ```` blocks. Content text is whatever sits + # before the first ``= 0 else full_text + else: + content_text = full_text + except Exception: + # vLLM ``qwen3coder_tool_parser.py:327-331`` catch-all: on any + # extraction error, drop every recovered call and surface the + # raw text as content so downstream consumers never see a + # half-parsed call. + content_text = full_text + tool_calls = [] return ParsedResponse( content=content_text, @@ -290,71 +503,149 @@ def _parse_xml_tool_calls( section_offset: int, param_index: dict[str, dict[str, dict[str, Any]]], ) -> list[ParsedToolCall]: - """Parse Qwen3.5-style XML tool calls from token IDs.""" - import re - + """Parse Qwen3.5-style XML tool calls from token IDs. + + Uses token IDs to demarcate ```` / ```` + boundaries (so ``token_span`` stays precise for trainer-side + masking) and vLLM-parity regex on the decoded block text to + extract the function/parameter content tolerantly. Mirrors + ``Qwen3CoderToolParser._parse_xml_function_call`` plus the + tag-tolerance branches of the three module-level patterns. + """ tool_calls: list[ParsedToolCall] = [] i = 0 while i < len(ids): - if ids[i] == tc_id: - end = _find(ids, tc_end_id, i + 1) - if end == -1: - raw = _decode(tokenizer, ids[i + 1 :]) - tool_calls.append( - ParsedToolCall( - raw=raw, - token_span=(section_offset + i, section_offset + len(ids)), - status=ToolCallParseStatus.UNCLOSED_BLOCK, - ) - ) - break + if ids[i] != tc_id: + i += 1 + continue + + end = _find(ids, tc_end_id, i + 1) + if end == -1: + block_ids = ids[i + 1 :] + block_text = _decode(tokenizer, block_ids) + span = (section_offset + i, section_offset + len(ids)) + wrapper_unclosed = True + else: block_text = _decode(tokenizer, ids[i + 1 : end]) span = (section_offset + i, section_offset + end + 1) - name_match = re.search(r"]+)>", block_text) - if not name_match: - tool_calls.append( - ParsedToolCall( - raw=block_text, - token_span=span, - status=ToolCallParseStatus.MALFORMED_STRUCTURE, - ) - ) - i = end + 1 - continue + wrapper_unclosed = False - name = name_match.group(1) - params = param_index.get(name, {}) - arguments: dict = {} - any_json_fallback = False - for pm in re.finditer( - r"]+)>\n?(.*?)\n?", block_text, re.DOTALL - ): - arg_name = pm.group(1) - arg_value = pm.group(2).strip() - value, used_fallback = _coerce_arg_value( - arg_value, params.get(arg_name) + block_calls = _parse_xml_function_blocks( + block_text, + param_index=param_index, + token_span=span, + wrapper_unclosed=wrapper_unclosed, + ) + if block_calls: + tool_calls.extend(block_calls) + elif wrapper_unclosed: + # vLLM would silently drop a content-less unclosed + # ````; we keep the diagnostic so verifier / + # RL-loss code can mask the malformed span. + tool_calls.append( + ParsedToolCall( + raw=block_text, + token_span=span, + status=ToolCallParseStatus.UNCLOSED_BLOCK, ) - arguments[arg_name] = value - any_json_fallback = any_json_fallback or used_fallback + ) + else: tool_calls.append( ParsedToolCall( raw=block_text, - name=name, - arguments=arguments, token_span=span, - status=( - ToolCallParseStatus.INVALID_JSON - if any_json_fallback - else ToolCallParseStatus.OK - ), + status=ToolCallParseStatus.MALFORMED_STRUCTURE, ) ) - i = end + 1 + + if wrapper_unclosed: + break + i = end + 1 + return tool_calls + + +def _parse_xml_function_blocks( + text: str, + *, + param_index: dict[str, dict[str, dict[str, Any]]], + token_span: tuple[int, int] | None, + wrapper_unclosed: bool, +) -> list[ParsedToolCall]: + """Apply vLLM's ```` regex over *text*. + + ``token_span`` is shared by every call recovered from this block; + the granularity is the surrounding ```` region (or the + whole completion in the no-marker back-off path), matching how + vLLM treats multiple ```` siblings — it does not try to + attribute character offsets back to token positions. + """ + tool_calls: list[ParsedToolCall] = [] + for match in _FUNCTION_BLOCK_RE.finditer(text): + closed = match.group(1) + body = closed if closed is not None else (match.group(2) or "") + function_unclosed = closed is None + end_index = body.find(">") + if end_index == -1: + tool_calls.append( + ParsedToolCall( + raw=body, + token_span=token_span, + status=ToolCallParseStatus.MALFORMED_STRUCTURE, + ) + ) + continue + + name = body[:end_index] + params_text = body[end_index + 1 :] + params = param_index.get(name, {}) + arguments: dict = {} + any_fallback = False + for capture in _PARAMETER_BLOCK_RE.findall(params_text): + idx = capture.find(">") + if idx == -1: + continue + arg_name = capture[:idx] + arg_value = _strip_one_newline(capture[idx + 1 :]) + value, used_fallback = _coerce_arg_value(arg_value, params.get(arg_name)) + arguments[arg_name] = value + any_fallback = any_fallback or used_fallback + + # Precedence: structural failure dominates coercion failure. + # When a block is both unclosed AND has args that couldn't be + # coerced, surface ``UNCLOSED_BLOCK`` only — the truncation is + # the root cause and the verifier already discounts unclosed + # blocks. ``INVALID_JSON`` is reserved for well-formed blocks + # whose recovered args couldn't satisfy the schema. + if wrapper_unclosed or function_unclosed: + status = ToolCallParseStatus.UNCLOSED_BLOCK + elif any_fallback: + status = ToolCallParseStatus.INVALID_JSON else: - i += 1 + status = ToolCallParseStatus.OK + tool_calls.append( + ParsedToolCall( + raw=body, + name=name, + arguments=arguments, + id=_make_tool_call_id(), + token_span=token_span, + status=status, + ) + ) return tool_calls +def _strip_one_newline(s: str) -> str: + """vLLM ``qwen3coder_tool_parser.py:246-250`` strips exactly one + leading and one trailing newline from a parameter value — no + further whitespace trimming.""" + if s.startswith("\n"): + s = s[1:] + if s.endswith("\n"): + s = s[:-1] + return s + + # ── GLM-5/4.7/4.5: name k v diff --git a/tests/test_arg_coercion.py b/tests/test_arg_coercion.py new file mode 100644 index 0000000..b966305 --- /dev/null +++ b/tests/test_arg_coercion.py @@ -0,0 +1,503 @@ +"""Schema-aware argument coercion parity with vLLM / SGLang. + +The XML-style tool-call wire format (Qwen3.5, GLM, MiniMax, Laguna) +renders parameter values verbatim with no quoting, so the parser must +consult the tool schema to know whether ``True`` is a bool or a string. +Renderers' previous coercion was strict ``json.loads``, which flagged +``True`` / ``False`` (capitalised Python literals) as ``INVALID_JSON`` +for boolean params — but both vLLM's ``Qwen3CoderToolParser`` and +SGLang's ``Qwen3CoderDetector`` accept them via case-folded comparison. + +These tests pin the new ladder to the vLLM / SGLang reference shape and +guard the bug from issue #47. +""" + +from __future__ import annotations + +import pytest + +from renderers.base import ToolCallParseStatus, load_tokenizer +from renderers.parsing import _coerce_arg_value, parse_qwen35 + + +# ── Direct coercion ───────────────────────────────────────────────── + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("true", True), + ("True", True), + ("TRUE", True), + ("false", False), + ("False", False), + ("FALSE", False), + ], +) +def test_boolean_accepts_case_insensitive(raw, expected): + value, used_fallback = _coerce_arg_value(raw, {"type": "boolean"}) + assert value is expected + assert used_fallback is False + + +@pytest.mark.parametrize("declared", ["boolean", "bool"]) +def test_boolean_type_aliases_all_accepted(declared): + value, used_fallback = _coerce_arg_value("True", {"type": declared}) + assert value is True + assert used_fallback is False + + +def test_boolean_garbage_keeps_raw_text_with_invalid_flag(): + """vLLM ``coerce_to_schema_type``: when the boolean branch declines + (value not in ``{true, false, 1, 0}``) and no other type in the + schema accepts the value, the terminal ``json.loads`` is attempted + and on failure the raw text is returned. We surface that path + via ``used_fallback=True`` for the verifier / RL-loss signal.""" + value, used_fallback = _coerce_arg_value("yes", {"type": "boolean"}) + assert value == "yes" + assert used_fallback is True + + +@pytest.mark.parametrize( + "declared", + [["boolean", "null"], ["integer", "null"], ["number", "null"], ["object", "null"]], +) +def test_null_coerces_when_null_is_in_type_set(declared): + """vLLM null-coerces only when ``"null"`` is one of the declared + schema types — typically via JSON-Schema array-form ``["X", "null"]`` + (the ``Optional[X]`` shape) or via ``anyOf: [..., {"type": "null"}]``. + Case-insensitive: matches ``null`` / ``Null`` / ``NULL``.""" + for raw in ("null", "Null", "NULL"): + value, used_fallback = _coerce_arg_value(raw, {"type": declared}) + assert value is None + assert used_fallback is False + + +def test_string_typed_null_returns_verbatim(): + """Pure string-typed param: ``"null"`` on the wire is just the + string ``"null"`` — no null coercion because ``"null"`` is not in + the schema's type set. Byte parity with vLLM + ``coerce_to_schema_type`` (string branch always succeeds).""" + for raw in ("null", "Null", "NULL"): + value, used_fallback = _coerce_arg_value(raw, {"type": "string"}) + assert value == raw + assert used_fallback is False + + +@pytest.mark.parametrize( + "declared,raw,expected", + [ + ("integer", "42", 42), + ("int", "-7", -7), + ("uint", "0", 0), + ("long", "9999999999", 9999999999), + ("short", "12", 12), + ("unsigned", "5", 5), + ], +) +def test_int_family_coerces(declared, raw, expected): + value, used_fallback = _coerce_arg_value(raw, {"type": declared}) + assert value == expected + assert isinstance(value, int) + assert used_fallback is False + + +def test_int_failure_keeps_raw_and_flags_fallback(): + value, used_fallback = _coerce_arg_value("abc", {"type": "integer"}) + assert value == "abc" + assert used_fallback is True + + +@pytest.mark.parametrize( + "raw,expected,exact_type", + [ + ("3.14", 3.14, float), + ("-2.5", -2.5, float), + # vLLM ``coerce_to_schema_type`` int-demotes whenever the parsed + # float is a whole number, regardless of source-string shape — + # so ``1e3``, ``1.0``, and ``7`` all become int. + ("1e3", 1000, int), + ("1.0", 1, int), + ("7", 7, int), + ], +) +def test_float_family_coerces(raw, expected, exact_type): + value, used_fallback = _coerce_arg_value(raw, {"type": "number"}) + assert value == expected + assert type(value) is exact_type + assert used_fallback is False + + +def test_float_failure_keeps_raw_and_flags_fallback(): + value, used_fallback = _coerce_arg_value("not-a-number", {"type": "float"}) + assert value == "not-a-number" + assert used_fallback is True + + +def test_object_via_json_loads(): + value, used_fallback = _coerce_arg_value('{"k": 1}', {"type": "object"}) + assert value == {"k": 1} + assert used_fallback is False + + +def test_object_python_literal_does_not_parse(): + """vLLM ``coerce_to_schema_type`` only uses ``json.loads`` for + object/array types — no ``ast.literal_eval`` fallback. Python-style + single-quoted dicts therefore land on the terminal raw-text path + and get flagged. (Renderers' previous ``ast.literal_eval`` fallback + was a vLLM deviation; we dropped it for parity.)""" + value, used_fallback = _coerce_arg_value("{'k': 1}", {"type": "object"}) + assert value == "{'k': 1}" + assert used_fallback is True + + +def test_array_via_json_loads(): + value, used_fallback = _coerce_arg_value("[1, 2, 3]", {"type": "array"}) + assert value == [1, 2, 3] + assert used_fallback is False + + +def test_object_total_failure_flags_invalid(): + value, used_fallback = _coerce_arg_value( + "this is not a json object", {"type": "object"} + ) + assert value == "this is not a json object" + assert used_fallback is True + + +def test_anyof_recursively_flattens_types(): + """vLLM ``extract_types_from_schema`` recurses through ``anyOf`` / + ``oneOf`` / ``allOf`` to build the type set. ``Union[int, str]`` ⇒ + ``{integer, string}``: integer parses first, falling back to string + for non-numeric values — and a bare string never gets flagged + because the string branch is the always-succeeding terminal.""" + schema = {"anyOf": [{"type": "integer"}, {"type": "string"}]} + + # Integer-shaped wire value rides the integer branch. + value, used_fallback = _coerce_arg_value("42", schema) + assert value == 42 + assert used_fallback is False + + # Non-numeric wire value declines integer, lands on string. The + # exact regression from PR #63: pre-fix, this was flagged + # ``INVALID_JSON`` because top-level ``type`` was absent and the + # parser tried ``json.loads("LIS")``. vLLM's recursive flatten + + # priority-ordered ladder fixes that as a side effect. + value, used_fallback = _coerce_arg_value("LIS", schema) + assert value == "LIS" + assert used_fallback is False + + +def test_oneof_with_string_branch_recovers_bare_strings(): + """Same recursive-flatten path as ``anyOf``: ``oneOf`` with a + string branch must accept bare-string wire values without flagging.""" + schema = {"oneOf": [{"type": "integer"}, {"type": "string"}]} + value, used_fallback = _coerce_arg_value("hello", schema) + assert value == "hello" + assert used_fallback is False + + +@pytest.mark.parametrize( + "declared", ["string", "str", "text", "varchar", "char", "enum"] +) +def test_string_family_returns_verbatim(declared): + value, used_fallback = _coerce_arg_value("True", {"type": declared}) + assert value == "True" + assert used_fallback is False + + +def test_string_with_list_form_type(): + value, used_fallback = _coerce_arg_value("True", {"type": ["string"]}) + assert value == "True" + assert used_fallback is False + + +def test_no_schema_returns_verbatim_string(): + """vLLM ``extract_types_from_schema(None)`` returns ``["string"]``, + which routes every value through the always-succeeding string + branch of the priority ladder. Untyped ``"null"`` therefore stays + a string — there is no top-level null short-circuit ahead of the + type check.""" + value, used_fallback = _coerce_arg_value("42", None) + assert value == "42" + assert used_fallback is False + + value, used_fallback = _coerce_arg_value("True", None) + assert value == "True" + assert used_fallback is False + + value, used_fallback = _coerce_arg_value("hello", None) + assert value == "hello" + assert used_fallback is False + + value, used_fallback = _coerce_arg_value("null", None) + assert value == "null" + assert used_fallback is False + + +def test_unknown_type_falls_through_to_terminal_json_loads(): + """A declared type the priority ladder doesn't recognise (e.g. + ``"date"``) skips every typed branch and lands on the terminal + ``json.loads``. JSON-shaped values still parse; everything else + keeps the raw text and gets flagged.""" + value, used_fallback = _coerce_arg_value("2024-01-01", {"type": "date"}) + assert value == "2024-01-01" + assert used_fallback is True + + # JSON-shaped value rides the terminal ``json.loads``. + value, used_fallback = _coerce_arg_value("[1, 2]", {"type": "date"}) + assert value == [1, 2] + assert used_fallback is False + + +# ── Integration: parse_qwen35 end-to-end (the bug from issue #47) ─── + + +_TOOLS_FROM_ISSUE = [ + { + "type": "function", + "function": { + "name": "filesystem_server_get_directory_tree", + "description": "List a directory tree.", + "parameters": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "max_depth": {"type": "integer"}, + "include_files": {"type": "boolean"}, + "show_size": {"type": "boolean"}, + }, + }, + }, + } +] + + +def _parse(tok, completion: str, *, tools=None): + ids = tok.encode(completion, add_special_tokens=False) + return parse_qwen35( + tok, + list(ids), + stop_ids={tok.eos_token_id} if tok.eos_token_id is not None else set(), + think_id=tok.convert_tokens_to_ids(""), + think_end_id=tok.convert_tokens_to_ids(""), + tool_call_id=tok.convert_tokens_to_ids(""), + tool_call_end_id=tok.convert_tokens_to_ids(""), + tools=tools, + ) + + +def test_qwen35_capital_true_for_boolean_is_ok_status(): + """Regression for issue #47: ``True`` + used to be flagged ``INVALID_JSON`` because ``json.loads("True")`` + fails. SGLang and vLLM both accept it via case-folded comparison; + renderers now does too.""" + tok = load_tokenizer("Qwen/Qwen3.5-9B") + parsed = _parse( + tok, + "\n" + "\n" + "/\n" + "2\n" + "True\n" + "True\n" + "\n" + "", + tools=_TOOLS_FROM_ISSUE, + ) + assert len(parsed.tool_calls) == 1 + tc = parsed.tool_calls[0] + assert tc.status == ToolCallParseStatus.OK + assert tc.name == "filesystem_server_get_directory_tree" + assert tc.arguments == { + "path": "/", + "max_depth": 2, + "include_files": True, + "show_size": True, + } + + +# ── Structural tolerance: parity with vLLM's three regex patterns ──── + + +def test_qwen35_tolerates_missing_closing_parameter(): + """vLLM ``tool_call_parameter_regex`` uses positive lookaheads so a + missing ```` is recovered from the next ```` boundary instead of dropping the value.""" + tok = load_tokenizer("Qwen/Qwen3.5-9B") + parsed = _parse( + tok, + "\n" + "\n" + "/\n" # missing + "2\n" + "\n" + "", + tools=_TOOLS_FROM_ISSUE, + ) + assert len(parsed.tool_calls) == 1 + tc = parsed.tool_calls[0] + assert tc.status == ToolCallParseStatus.OK + assert tc.name == "filesystem_server_get_directory_tree" + assert tc.arguments == {"path": "/", "max_depth": 2} + + +def test_qwen35_tolerates_unclosed_function(): + """vLLM ``tool_call_function_regex`` second alternation matches an + unclosed ``\n" + "\n" + "/\n", # no , no + tools=_TOOLS_FROM_ISSUE, + ) + assert len(parsed.tool_calls) == 1 + tc = parsed.tool_calls[0] + assert tc.status == ToolCallParseStatus.UNCLOSED_BLOCK + assert tc.name == "filesystem_server_get_directory_tree" + assert tc.arguments == {"path": "/"} + + +def test_qwen35_backoff_no_tool_call_markers(): + """vLLM ``qwen3coder_tool_parser.py:269-271``: when the output + contains no ```` markers, treat the whole completion as + a single tool-call region and scan for ```` directly. + Content text is the raw slice up to the first ``\n" + "/etc\n" + "1\n" + "", + tools=_TOOLS_FROM_ISSUE, + ) + assert len(parsed.tool_calls) == 1 + tc = parsed.tool_calls[0] + assert tc.status == ToolCallParseStatus.OK + assert tc.name == "filesystem_server_get_directory_tree" + assert tc.arguments == {"path": "/etc", "max_depth": 1} + # vLLM-parity: content = text up to first ```` and no ````; the commented-out ``.rstrip()`` confirms + intent. We mirror that — surrounding whitespace round-trips.""" + tok = load_tokenizer("Qwen/Qwen3.5-9B") + parsed = _parse( + tok, + "let me check\n\n" # trailing whitespace before + "\n" + "\n" + "/\n" + "\n" + "", + tools=_TOOLS_FROM_ISSUE, + ) + assert parsed.content == "let me check\n\n" + assert len(parsed.tool_calls) == 1 + assert parsed.tool_calls[0].status == ToolCallParseStatus.OK + + +def test_qwen35_tool_call_id_matches_vllm_format(): + """vLLM's ``ToolCall`` default factory mints ids of the form + ``chatcmpl-tool-<16 hex>``. Each surfaced call gets a fresh id.""" + import re as _re + + tok = load_tokenizer("Qwen/Qwen3.5-9B") + parsed = _parse( + tok, + "\n" + "\n" + "/\n" + "\n" + "\n" + "\n" + "\n" + "/etc\n" + "\n" + "", + tools=_TOOLS_FROM_ISSUE, + ) + assert len(parsed.tool_calls) == 2 + pattern = _re.compile(r"^chatcmpl-tool-[0-9a-f]{16}$") + ids = [tc.id for tc in parsed.tool_calls] + for tcid in ids: + assert tcid is not None and pattern.match(tcid), tcid + # Two surfaced calls must get distinct ids. + assert ids[0] != ids[1] + + +def test_qwen35_exception_fallback_returns_full_text(): + """vLLM ``qwen3coder_tool_parser.py:327-331`` catch-all returns + ``tools_called=False, tool_calls=[], content=model_output`` on any + unhandled extraction error. We mirror that by wrapping the + tool-call extraction; reasoning content extracted earlier is kept. + """ + from unittest.mock import patch + + tok = load_tokenizer("Qwen/Qwen3.5-9B") + completion = ( + "\n" + "\n" + "/\n" + "\n" + "" + ) + ids = tok.encode(completion, add_special_tokens=False) + + with patch( + "renderers.parsing._parse_xml_tool_calls", + side_effect=RuntimeError("boom"), + ): + parsed = parse_qwen35( + tok, + list(ids), + stop_ids={tok.eos_token_id} if tok.eos_token_id is not None else set(), + think_id=tok.convert_tokens_to_ids(""), + think_end_id=tok.convert_tokens_to_ids(""), + tool_call_id=tok.convert_tokens_to_ids(""), + tool_call_end_id=tok.convert_tokens_to_ids(""), + tools=_TOOLS_FROM_ISSUE, + ) + assert parsed.tool_calls == [] + # content is the full decoded post-think text, surfaced verbatim. + assert "" in parsed.content + assert "" in parsed.content + + +def test_qwen35_parameter_value_preserves_internal_whitespace(): + """vLLM strips exactly one leading and one trailing ``\\n`` from the + parameter body (``qwen3coder_tool_parser.py:246-250``); inner + indentation must round-trip verbatim.""" + tok = load_tokenizer("Qwen/Qwen3.5-9B") + parsed = _parse( + tok, + "\n" + "\n" + "\n /etc \n\n" + "\n" + "", + tools=_TOOLS_FROM_ISSUE, + ) + assert len(parsed.tool_calls) == 1 + tc = parsed.tool_calls[0] + assert tc.status == ToolCallParseStatus.OK + # One leading + one trailing \n stripped — interior spaces preserved. + assert tc.arguments == {"path": " /etc "} diff --git a/tests/test_tool_arg_type_preservation.py b/tests/test_tool_arg_type_preservation.py index cf2a179..4ca4688 100644 --- a/tests/test_tool_arg_type_preservation.py +++ b/tests/test_tool_arg_type_preservation.py @@ -70,7 +70,12 @@ def renderer(model, renderer_name): # Each case: a single ``x`` argument whose value is a STRING that # happens to be valid JSON of another type. With a schema declaring -# ``x: string``, the parser must return the string verbatim. +# ``x: string``, the parser must return the string verbatim across all +# renderers — including the ``string-null`` case, which now rides +# vLLM's priority-ordered ladder where ``string`` is the always- +# succeeding terminal and ``null`` is only coerced when ``"null"`` is +# in the type set (``Optional[str]`` declares it; pure ``str`` does +# not). See ``vllm/tool_parsers/utils.py:coerce_to_schema_type``. JSON_LOOKING_STRING_ARGS = [ pytest.param({"x": "true"}, id="string-bool"), pytest.param({"x": "42"}, id="string-int"), @@ -119,7 +124,16 @@ def _extract_assistant_tokens(renderer, prompt, assistant_msg, *, tools=None): @pytest.mark.parametrize("args", JSON_LOOKING_STRING_ARGS) def test_string_arg_preserves_type(model, renderer_name, renderer, args): """Tool-call args of declared type ``str`` must round-trip as ``str``, - not get re-parsed as bool/int/null/list/dict by the parser.""" + not get re-parsed as bool/int/null/list/dict by the parser. + + Under vLLM ``coerce_to_schema_type`` parity, a string-typed param + whose wire bytes happen to spell ``null`` stays a string — null + coercion only fires when ``"null"`` is declared in the schema + (typically via ``Optional[X]`` ⇒ ``anyOf [X, null]`` or + ``type: ["X", "null"]``). The string branch is the always- + succeeding terminal of the priority ladder, so it absorbs every + bare wire value without flagging. + """ msg = { "role": "assistant", "content": "", @@ -135,6 +149,7 @@ def test_string_arg_preserves_type(model, renderer_name, renderer, args): assert parsed.tool_calls, f"{model}: parser returned no tool_calls" got = _normalize_args(parsed.tool_calls[0].arguments) + assert got == args, ( f"{model}: tool-arg type drift — sent {args!r}, parser returned {got!r}" )