From 230384a588d6d456bc478432748eb6a3d5eeabc5 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 09:42:09 -0700 Subject: [PATCH 01/16] feat(types): add RendererTransport literal + ClientConfig.renderer_transport --- verifiers/types.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/verifiers/types.py b/verifiers/types.py index 4242f8a86f..8bbc6bd573 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -78,6 +78,23 @@ EndpointClient: TypeAlias = AsyncOpenAI | OpenAI | AsyncAnthropic | Anthropic MessageType = Literal["chat", "completion"] # deprecated +# Wire-shape selector shared between RendererClient and +# OpenAIChatCompletionsTokenClient. Picks which inference-server surface the +# client targets at request-build time. Same flag drives both clients so a +# single `ClientConfig.renderer_transport` setting routes consistently. +# +# - "prime_vllm_generate" (default): vLLM's TITO surface. For RendererClient +# that's POST /v1/chat/completions with a renderer-flavored request body. +# For OpenAIChatCompletionsTokenClient that's POST +# /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge +# tokenization via the server's /tokenize route. +# - "dynamo_chat_nvext": Dynamo's standard chat-completions route with +# pre-tokenized prompt carried in `nvext.token_data`. Server-side token +# IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119 +# canonical channel). Bridge tokenization runs locally via the +# transformers fast tokenizer; no /tokenize HTTP round-trip. +RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] + # Provider-agnostic message + response types class CustomBaseModel(BaseModel): @@ -1269,6 +1286,8 @@ class ClientConfig(BaseModel): Drives the renderer pool when ``client_type == "renderer"``. Defaults to ``None`` so non-renderer clients aren't forced to declare it; the renderer client treats ``None`` as ``AutoRendererConfig()``.""" + renderer: str = "auto" + renderer_transport: RendererTransport = "prime_vllm_generate" renderer_model_name: str | None = None """Override the tokenizer model name used to instantiate the renderer pool. Defaults to the model used in API requests.""" From 131109619bf39a26accfb33b3c6964af0896aad0 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 09:42:18 -0700 Subject: [PATCH 02/16] feat(clients): graft nvext.engine_data onto OpenAI response in parse_tokens Dynamo's vLLM and SGLang backends emit engine-emitted token IDs and per-token logprobs under `response.nvext.engine_data` when the client opts in via `nvext.extra_fields=["engine_data"]` (PR #8119). The vLLM-native path uses non-standard top-level fields (`choices[0].token_ids`, `response.prompt_token_ids`). Add a small graft inside `from_native_response.parse_tokens` that copies the engine_data fields onto the OpenAI-shaped response when present and the top-level fields are absent. The rest of parse_tokens then reads via the standard SDK attribute path regardless of backend. --- .../clients/openai_chat_completions_client.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index d7d262f4be..87a0564510 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -469,8 +469,54 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason: case _: return None + def _graft_engine_data(response: OpenAIChatResponse) -> None: + """Graft ``nvext.engine_data.*`` onto top-level response fields. + + Dynamo's vLLM/SGLang backends emit engine-side token IDs and + per-token logprobs under ``response.nvext.engine_data`` when the + client opts in via ``nvext.extra_fields=["engine_data"]`` (PR + #8119). Older vLLM-native paths set + ``response.choices[0].token_ids`` / ``response.prompt_token_ids`` + directly. This helper bridges the gap: if ``engine_data`` is + present and the top-level fields are missing, copy them across. + The rest of ``parse_tokens`` then reads via the standard openai + SDK attribute path regardless of backend. + """ + nvext = getattr(response, "nvext", None) + if nvext is None and hasattr(response, "model_dump"): + nvext = response.model_dump().get("nvext") + if not isinstance(nvext, dict): + return + engine_data = nvext.get("engine_data") + if not isinstance(engine_data, dict): + return + choice = response.choices[0] + if ( + getattr(choice, "token_ids", None) is None + and engine_data.get("completion_token_ids") is not None + ): + try: + choice.token_ids = list(engine_data["completion_token_ids"]) + except Exception: + object.__setattr__( + choice, "token_ids", list(engine_data["completion_token_ids"]) + ) + if ( + getattr(response, "prompt_token_ids", None) is None + and engine_data.get("prompt_token_ids") is not None + ): + try: + response.prompt_token_ids = list(engine_data["prompt_token_ids"]) + except Exception: + object.__setattr__( + response, + "prompt_token_ids", + list(engine_data["prompt_token_ids"]), + ) + def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" + _graft_engine_data(response) choice = response.choices[0] if not hasattr(choice, "token_ids"): return None From c766529f48d5ddaef9dde7f398fc5d8ee6414ad7 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 09:42:28 -0700 Subject: [PATCH 03/16] feat(tito): add dynamo_chat_nvext transport + local bridge tokenize The verifiers TITO client previously only spoke vLLM's TITO surface (POST /v1/chat/completions/tokens with tokens=prompt_ids; bridge tokens via /tokenize). Dynamo serves neither route, so multi-turn TITO against Dynamo silently degraded to MITO every turn-2+. This teaches OpenAIChatCompletionsTokenClient to read ClientConfig.renderer_transport and route accordingly: * prime_vllm_generate (default): unchanged. POST /v1/chat/completions/tokens with tokens=prompt_ids; bridge tokens via /tokenize HTTP. Requires vLLM >= 0.20. * dynamo_chat_nvext: POST /v1/chat/completions with placeholder messages + nvext.token_data=prompt_ids. Bridge tokens are computed locally via the model's HF fast tokenizer (no /tokenize HTTP round-trip). Server returns engine-side token IDs and logprobs under nvext.engine_data (PR #8119 channel), parsed by the OpenAIChatCompletionsClient.from_native_response graft so the rest of the pipeline is transport-agnostic. Also fix the normalize_for_comparison asymmetry that caused get_prompt_ids to never match for vf.Message-shaped input (the form MultiTurnEnv produces after maybe_normalize_messages). Drop None-valued keys so model_dump's exhaustive view is equivalent to to_native_prompt's slimmer view. --- .../openai_chat_completions_token_client.py | 266 +++++++++++++++++- 1 file changed, 258 insertions(+), 8 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 2d8cd701cc..e5ec9a4a6e 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -18,11 +18,15 @@ OpenAITool, handle_openai_overlong_prompt, ) -from verifiers.types import SamplingArgs, State +from verifiers.types import RendererTransport, SamplingArgs, State from verifiers.utils.client_utils import ( post_chat_completion_with_routed_experts_sidecar, ) +# Sentinel for the default (legacy vLLM) transport. Lets callers route +# around the legacy /tokenize body shape without changing the signature. +_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate" + def _has_multimodal_content(messages) -> bool: """Check if any message contains multimodal content (images, audio). @@ -51,7 +55,25 @@ class TokenizeResponse(BaseModel): class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): - """Wrapper for custom vLLM route /v1/chat/completions/tokens via AsyncOpenAI client.""" + """Token-in / token-out chat client. + + Two transports share this class, selected via + ``ClientConfig.renderer_transport``: + + * ``prime_vllm_generate`` (default): vLLM's TITO surface. + Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids`` + and uses the server's ``/tokenize`` endpoint for bridge tokens. + Requires vLLM ``>=0.20``. + + * ``dynamo_chat_nvext``: Dynamo's standard ``/v1/chat/completions`` + route with ``nvext.token_data=prompt_ids``. Server-side response + token IDs come back via ``response.nvext.engine_data.*`` + (`OpenAIChatCompletionsClient.from_native_response` grafts them + onto the OpenAI-shaped response). Bridge tokens are computed + locally via the model's HuggingFace fast tokenizer — no + ``/tokenize`` HTTP round-trip — since Dynamo doesn't expose vLLM's + token routes. + """ @property def token_client(self) -> AsyncOpenAI: @@ -61,6 +83,38 @@ def token_client(self) -> AsyncOpenAI: base_url = base_url[:-3] return self.client.with_options(base_url=base_url) + @property + def renderer_transport(self) -> RendererTransport: + """Wire-shape selector. ``ClientConfig.renderer_transport`` if set, + else the default vLLM TITO surface. Mirrors the same field used by + ``RendererClient`` so backend selection stays in one place.""" + return cast( + RendererTransport, + getattr(self._config, "renderer_transport", _DEFAULT_TRANSPORT) + if self._config is not None + else _DEFAULT_TRANSPORT, + ) + + def _get_local_tokenizer(self, model: str): + """Lazy, per-model HF fast tokenizer for the ``dynamo_chat_nvext`` + transport. Bridge tokens are stitched locally — no ``/tokenize`` + round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained`` + cost once. + """ + cache: dict[str, Any] = self.__dict__.setdefault("_tokenizer_cache", {}) + if model in cache: + return cache[model] + try: + from transformers import AutoTokenizer # type: ignore[import-not-found] + except ImportError as exc: # pragma: no cover - dependency surface + raise ImportError( + "OpenAIChatCompletionsTokenClient with " + "renderer_transport='dynamo_chat_nvext' requires " + "`transformers`. Install with `pip install transformers`." + ) from exc + cache[model] = AutoTokenizer.from_pretrained(model) + return cache[model] + @handle_openai_overlong_prompt async def get_native_response( self, @@ -75,12 +129,49 @@ def normalize_sampling_args(sampling_args: SamplingArgs): if "max_tokens" in sampling_args: sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens") sampling_args["logprobs"] = True - extra_body = dict(return_token_ids=True) - if "extra_body" in sampling_args: - sampling_args["extra_body"] = { - **sampling_args["extra_body"], - **extra_body, + + # Transport-specific opt-ins. Both transports get response-side + # token IDs, just via different fields: + # + # * prime_vllm_generate (vLLM): `extra_body.return_token_ids=True` + # tells vLLM to set the non-standard `choices[0].token_ids` and + # `response.prompt_token_ids` fields. `parse_tokens` reads them + # directly. + # + # * dynamo_chat_nvext: `nvext.extra_fields=["engine_data"]` + # tells Dynamo's response builder to emit `response.nvext` + # `engine_data.{completion_token_ids, completion_logprobs, + # prompt_token_ids}` (PR #8119 channel mirrored to vLLM in + # ai-dynamo/dynamo `rl-sdk-2`). `from_native_response` grafts + # this onto the OpenAI-shaped response so `parse_tokens` + # works unmodified. `return_token_ids` is dropped because + # Dynamo's strict validator rejects it. + if self.renderer_transport == "dynamo_chat_nvext": + extra_body: dict[str, Any] = { + "nvext": {"extra_fields": ["engine_data"]} } + else: + extra_body = {"return_token_ids": True} + + if "extra_body" in sampling_args: + merged = {**sampling_args["extra_body"]} + # Merge nvext.extra_fields cumulatively rather than overwriting, + # so caller-provided extra_fields (e.g. "timing", "worker_id") + # coexist with our "engine_data" opt-in. + if "nvext" in merged and "nvext" in extra_body: + base = dict(merged.get("nvext") or {}) + inc = dict(extra_body.get("nvext") or {}) + base_ef = list(base.get("extra_fields") or []) + inc_ef = list(inc.get("extra_fields") or []) + merged_ef = list(dict.fromkeys(base_ef + inc_ef)) + merged_nvext = {**base, **inc, "extra_fields": merged_ef} + merged["nvext"] = merged_nvext + sampling_args["extra_body"] = { + **{k: v for k, v in extra_body.items() if k != "nvext"}, + **merged, + } + else: + sampling_args["extra_body"] = {**merged, **extra_body} else: sampling_args["extra_body"] = extra_body return {k: v for k, v in sampling_args.items() if v is not None} @@ -126,6 +217,16 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) + if self.renderer_transport == "dynamo_chat_nvext": + return await self._post_dynamo_chat_nvext( + prompt=prompt, + prompt_ids=prompt_ids, + model=model, + tools=tools, + sampling_args=sampling_args, + extra_headers=extra_headers, + ) + extra_body = sampling_args.pop("extra_body", {}) body = { "model": model, @@ -143,6 +244,86 @@ def normalize_sampling_args(sampling_args: SamplingArgs): extra_headers=extra_headers, ) + async def _post_dynamo_chat_nvext( + self, + prompt: OpenAIChatMessages, + prompt_ids: list[int], + model: str, + tools: list[OpenAITool] | None, + sampling_args: dict, + extra_headers: Mapping[str, str] | None, + ) -> OpenAIChatResponse: + """Post stitched ``prompt_ids`` to Dynamo's chat-completions route. + + The engine sees ``nvext.token_data`` and skips its own tokenization, + so the placeholder ``messages`` value stays small regardless of + trajectory length. Response token IDs come back via + ``response.nvext.engine_data.completion_token_ids`` and are grafted + onto ``choices[0].token_ids`` by + ``OpenAIChatCompletionsClient.from_native_response`` so the rest of + the pipeline reads them via the standard openai SDK attribute path. + """ + extra_body = dict(sampling_args.pop("extra_body", {}) or {}) + + # nvext.token_data is the canonical pre-tokenized-prompt channel. + # Merge with caller-provided nvext (extra_fields etc.) rather than + # overwriting it. normalize_sampling_args already injected + # extra_fields=["engine_data"] into extra_body.nvext, so this just + # adds token_data to that same dict. + caller_nvext = dict(extra_body.pop("nvext", None) or {}) + caller_nvext["token_data"] = prompt_ids + nvext = caller_nvext + + body: dict[str, Any] = { + "model": model, + "messages": prompt, # placeholder; engine ignores when token_data present + "stream": False, + "nvext": nvext, + } + if tools: + body["tools"] = tools + + # Sampling params that Dynamo's chat-completions surface accepts + # directly. Anything else stays in extra_body and rides as an + # unrecognized passthrough field (validate.rs PASSTHROUGH_EXTRA_FIELDS). + promotable = ( + "max_completion_tokens", + "max_tokens", + "temperature", + "top_p", + "top_k", + "min_p", + "seed", + "n", + "repetition_penalty", + "min_tokens", + "logprobs", + "top_logprobs", + "stop", + ) + for key in promotable: + value = sampling_args.get(key, extra_body.get(key)) + if value is not None and key not in body: + body[key] = value + + # Remaining extra_body keys (cache_salt, stop_token_ids, + # bad_words_token_ids, ...) pass through unchanged. The dynamo + # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist accepts these + # without rejection; unknown keys are silently ignored. + passthrough = { + k: v + for k, v in extra_body.items() + if k not in promotable and v is not None and k not in body + } + body.update(passthrough) + + return await self.client.post( + "/chat/completions", + body=body, + cast_to=ChatCompletion, + options={"headers": extra_headers} if extra_headers else {}, + ) + async def get_prompt_ids( self, state: State, @@ -176,6 +357,15 @@ def normalize_for_comparison(value: Any) -> Any: # prefix-match equality is unaffected. if normalized.get("content") == "": normalized["content"] = None + # Drop None-valued keys so model_dump's exhaustive view (which + # carries e.g. thinking_blocks=None on AssistantMessage) is + # equivalent to to_native_prompt's slimmer view (which omits + # the field entirely). Without this, vf.Message-shaped input + # (what MultiTurnEnv produces after maybe_normalize_messages) + # never matches the to_native_prompt-normalized step messages, + # which breaks the prefix match and forces TITO to fall back + # to MITO every turn-2+. + normalized = {k: v for k, v in normalized.items() if v is not None} return normalized if isinstance(value, list): return [normalize_for_comparison(item) for item in value] @@ -369,9 +559,28 @@ async def tokenize( extra_kwargs: dict | None = None, **kwargs, ) -> list[int]: - """Tokenize messages using the vLLM /tokenize API.""" + """Tokenize messages for bridge-token computation. + + Dispatched by ``renderer_transport``: + + * ``prime_vllm_generate`` (default): POST to vLLM's ``/tokenize`` route. + * ``dynamo_chat_nvext``: local HF fast-tokenizer call. Dynamo doesn't + expose ``/tokenize``; running locally also saves two HTTP RTTs per + turn (the bridge computes both ``add_generation_prompt=True`` and + ``False`` views). The HF Rust encode releases the GIL so the + ``asyncio.to_thread`` wrap gives the event loop real parallelism. + """ if extra_kwargs is None: extra_kwargs = {} + + if self.renderer_transport == "dynamo_chat_nvext": + return await self._local_tokenize( + messages=messages, + tools=tools, + model=model, + extra_kwargs=extra_kwargs, + ) + if isinstance(messages, str): body = dict( model=model, @@ -392,3 +601,44 @@ async def tokenize( "/tokenize", body=body, cast_to=TokenizeResponse ) return tokenize_response.tokens + + async def _local_tokenize( + self, + messages: str | OpenAIChatMessages, + tools: list[OpenAITool] | None, + model: str, + extra_kwargs: dict, + ) -> list[int]: + """Local in-process tokenization for the ``dynamo_chat_nvext`` transport. + + Bridge tokenization under TITO calls this twice per turn (once for + ``add_generation_prompt=True`` and once for ``False``). Both runs + execute in a worker thread so the event loop stays free; HF fast + tokenizers release the GIL during the Rust encode pass. + """ + import asyncio + + tokenizer = self._get_local_tokenizer(model) + add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) + chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) + + if isinstance(messages, str): + def _encode_text() -> list[int]: + return list(tokenizer.encode(messages, add_special_tokens=False)) + return await asyncio.to_thread(_encode_text) + + def _encode_chat() -> list[int]: + ids = tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + tokenize=True, + **chat_template_kwargs, + ) + if hasattr(ids, "input_ids"): + ids = ids.input_ids + if ids and isinstance(ids[0], list): + ids = ids[0] + return [int(t) for t in ids] + + return await asyncio.to_thread(_encode_chat) From f12bf6346d8dc2ee3669d5e18265880b2d6bda00 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 20:43:58 -0700 Subject: [PATCH 04/16] feat(clients): graft top-level nvext.completion_token_ids + prompt_token_ids (plan B3) --- .../clients/openai_chat_completions_client.py | 70 ++++++++++++------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 87a0564510..c4e60f3926 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -470,49 +470,67 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason: return None def _graft_engine_data(response: OpenAIChatResponse) -> None: - """Graft ``nvext.engine_data.*`` onto top-level response fields. - - Dynamo's vLLM/SGLang backends emit engine-side token IDs and - per-token logprobs under ``response.nvext.engine_data`` when the - client opts in via ``nvext.extra_fields=["engine_data"]`` (PR - #8119). Older vLLM-native paths set - ``response.choices[0].token_ids`` / ``response.prompt_token_ids`` - directly. This helper bridges the gap: if ``engine_data`` is - present and the top-level fields are missing, copy them across. - The rest of ``parse_tokens`` then reads via the standard openai - SDK attribute path regardless of backend. + """Graft engine-side token IDs onto top-level response fields. + + Three coexisting wire shapes from dynamo's vLLM/SGLang backends: + + 1. ``response.nvext.engine_data.{completion_token_ids, + completion_logprobs, prompt_token_ids}`` — PR #8119 channel + (opt-in: ``nvext.extra_fields=["engine_data"]``). + 2. ``response.nvext.completion_token_ids`` — top-level shape + from rl-sdk-2 plan A4 (opt-in: + ``nvext.extra_fields=["completion_token_ids"]``). No + logprobs in this shape; logprobs ride the standard + ``choices[0].logprobs.content[*].logprob`` channel. + 3. Older vLLM-native paths set ``response.choices[0].token_ids`` + / ``response.prompt_token_ids`` directly (no grafting needed). + + This helper bridges (1) and (2) onto the top-level fields the + rest of ``parse_tokens`` reads via the standard openai SDK + attribute path. ``engine_data`` wins when both are present (it + carries more — including logprobs + prompt_token_ids). """ nvext = getattr(response, "nvext", None) if nvext is None and hasattr(response, "model_dump"): nvext = response.model_dump().get("nvext") if not isinstance(nvext, dict): return - engine_data = nvext.get("engine_data") - if not isinstance(engine_data, dict): - return choice = response.choices[0] + + engine_data = nvext.get("engine_data") + completion_token_ids_top = nvext.get("completion_token_ids") + prompt_token_ids_top = nvext.get("prompt_token_ids") + + # Prefer engine_data over top-level when both arrive: engine_data + # bundles logprobs + prompt_token_ids in one place. + completion_token_ids: list[int] | None = None + prompt_token_ids: list[int] | None = None + if isinstance(engine_data, dict): + if engine_data.get("completion_token_ids") is not None: + completion_token_ids = list(engine_data["completion_token_ids"]) + if engine_data.get("prompt_token_ids") is not None: + prompt_token_ids = list(engine_data["prompt_token_ids"]) + if completion_token_ids is None and completion_token_ids_top is not None: + completion_token_ids = list(completion_token_ids_top) + if prompt_token_ids is None and prompt_token_ids_top is not None: + prompt_token_ids = list(prompt_token_ids_top) + if ( getattr(choice, "token_ids", None) is None - and engine_data.get("completion_token_ids") is not None + and completion_token_ids is not None ): try: - choice.token_ids = list(engine_data["completion_token_ids"]) + choice.token_ids = completion_token_ids except Exception: - object.__setattr__( - choice, "token_ids", list(engine_data["completion_token_ids"]) - ) + object.__setattr__(choice, "token_ids", completion_token_ids) if ( getattr(response, "prompt_token_ids", None) is None - and engine_data.get("prompt_token_ids") is not None + and prompt_token_ids is not None ): try: - response.prompt_token_ids = list(engine_data["prompt_token_ids"]) + response.prompt_token_ids = prompt_token_ids except Exception: - object.__setattr__( - response, - "prompt_token_ids", - list(engine_data["prompt_token_ids"]), - ) + object.__setattr__(response, "prompt_token_ids", prompt_token_ids) def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" From ee3482aebfaf35e47ec73a55db9276364d63e1cd Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Thu, 14 May 2026 10:21:39 -0700 Subject: [PATCH 05/16] feat(clients): thread renderer_transport from ClientConfig to renderers.generate() --- verifiers/clients/renderer_client.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 64ca4ec89d..ba97e1800c 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -603,15 +603,17 @@ async def get_native_response( multi_modal_data = None prompt_attribution = None - # ``renderers.client.generate`` discovers the engine's context-length - # cap on its own (via ``GET /v1/models``, cached) and raises - # ``renderers.OverlongPromptError`` on pre-flight overflow. Rebadge - # that into the verifiers-native ``OverlongPromptError`` so the - # ``MultiTurnEnv.prompt_too_long`` stop condition picks it up via - # the ``vf.Error`` hierarchy. The ``@handle_openai_overlong_prompt`` - # decorator still handles the fallback case (cap unknown → engine - # 4xx → vf.OverlongPromptError) for engines whose ``/v1/models`` - # doesn't expose ``max_model_len``. + # Thread renderer_transport from ClientConfig into generate() so the + # renderer client works against Dynamo's /v1/chat/completions surface + # as well as vLLM's /inference/v1/generate. setup_clients auto-picks + # "dynamo_chat_nvext" when client_config.backend == "dynamo". + # ``renderers.client.generate`` raises ``renderers.OverlongPromptError`` + # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops. + transport = ( + self._config.renderer_transport + if self._config is not None + else "prime_vllm_generate" + ) try: return await generate( client=self.client, @@ -623,6 +625,7 @@ async def get_native_response( prompt_attribution=prompt_attribution, tools=tools, sampling_params=sampling_params, + transport=transport, cache_salt=args.get("cache_salt") or sampling_params.pop("cache_salt", None), priority=args.get("priority") or sampling_params.pop("priority", None), From 3b58bf98c0c8b4bec247de61ed5c0ee99860f352 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 00:31:57 -0700 Subject: [PATCH 06/16] fix(clients): address PR review R1-R5 (guard transport kwarg, import ChatCompletion, scrub return_token_ids, forward sampling args, graft engine_data logprobs) + rename to dynamo_chat --- ...st_openai_chat_completions_token_client.py | 33 ++++++++++++ .../clients/openai_chat_completions_client.py | 17 ++++++ .../openai_chat_completions_token_client.py | 52 ++++++++++++------- verifiers/clients/renderer_client.py | 11 ++-- verifiers/types.py | 8 +-- 5 files changed, 95 insertions(+), 26 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 923ff118e0..46b0016416 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -293,3 +293,36 @@ async def fake_get_prompt_ids( # noqa: ANN001 assert len(recording_client.calls) == 1 assert recording_client.calls[0]["path"] == "/chat/completions/tokens" assert recording_client.calls[0]["body"]["tokens"] == [10, 20] + + +@pytest.mark.asyncio +async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): + """dynamo_chat wire body: vLLM-only keys scrubbed (R3), standard sampling + args forwarded (R4), nvext token_data + passthrough preserved.""" + recording_client = _RecordingClient() + client = OpenAIChatCompletionsTokenClient(recording_client) + + await client._post_dynamo_chat( + prompt=cast(Any, [{"role": "user", "content": ""}]), + prompt_ids=[1, 2, 3], + model="test-model", + tools=None, + sampling_args={ + "temperature": 0.5, + "presence_penalty": 0.2, # standard arg outside the old allowlist + "extra_body": { + "return_token_ids": True, # vLLM-only — must be scrubbed + "nvext": {"extra_fields": ["engine_data"]}, + "cache_salt": "ckpt-1", # passthrough must survive + }, + }, + extra_headers=None, + ) + + body = recording_client.calls[0]["body"] + assert "return_token_ids" not in body # R3 + assert body["presence_penalty"] == 0.2 # R4 + assert body["temperature"] == 0.5 + assert body["nvext"]["token_data"] == [1, 2, 3] + assert body["nvext"]["extra_fields"] == ["engine_data"] + assert body["cache_salt"] == "ckpt-1" # passthrough preserved diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index c4e60f3926..b954dd4ce0 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -505,11 +505,16 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: # bundles logprobs + prompt_token_ids in one place. completion_token_ids: list[int] | None = None prompt_token_ids: list[int] | None = None + completion_logprobs: list[float] | None = None if isinstance(engine_data, dict): if engine_data.get("completion_token_ids") is not None: completion_token_ids = list(engine_data["completion_token_ids"]) if engine_data.get("prompt_token_ids") is not None: prompt_token_ids = list(engine_data["prompt_token_ids"]) + if engine_data.get("completion_logprobs") is not None: + completion_logprobs = [ + float(x) for x in engine_data["completion_logprobs"] + ] if completion_token_ids is None and completion_token_ids_top is not None: completion_token_ids = list(completion_token_ids_top) if prompt_token_ids is None and prompt_token_ids_top is not None: @@ -531,6 +536,18 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: response.prompt_token_ids = prompt_token_ids except Exception: object.__setattr__(response, "prompt_token_ids", prompt_token_ids) + # Dynamo returns logprobs only under engine_data, not + # choices[0].logprobs. Synthesize the standard shape so parse_tokens + # (which requires choices[0].logprobs.content) can read them. + if ( + getattr(choice, "logprobs", None) is None + and completion_logprobs is not None + ): + synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]} + try: + choice.logprobs = synthesized + except Exception: + object.__setattr__(choice, "logprobs", synthesized) def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index e5ec9a4a6e..4ddb17dab5 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -3,6 +3,7 @@ from openai import AsyncOpenAI, BaseModel from openai.types.chat import ( + ChatCompletion, ChatCompletionAssistantMessageParam, ) from openai.types.chat.chat_completion_message_function_tool_call_param import ( @@ -25,7 +26,7 @@ # Sentinel for the default (legacy vLLM) transport. Lets callers route # around the legacy /tokenize body shape without changing the signature. -_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate" +_DEFAULT_TRANSPORT: RendererTransport = "vllm_generate" def _has_multimodal_content(messages) -> bool: @@ -60,12 +61,12 @@ class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): Two transports share this class, selected via ``ClientConfig.renderer_transport``: - * ``prime_vllm_generate`` (default): vLLM's TITO surface. + * ``vllm_generate`` (default): vLLM's TITO surface. Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids`` and uses the server's ``/tokenize`` endpoint for bridge tokens. Requires vLLM ``>=0.20``. - * ``dynamo_chat_nvext``: Dynamo's standard ``/v1/chat/completions`` + * ``dynamo_chat``: Dynamo's standard ``/v1/chat/completions`` route with ``nvext.token_data=prompt_ids``. Server-side response token IDs come back via ``response.nvext.engine_data.*`` (`OpenAIChatCompletionsClient.from_native_response` grafts them @@ -96,7 +97,7 @@ def renderer_transport(self) -> RendererTransport: ) def _get_local_tokenizer(self, model: str): - """Lazy, per-model HF fast tokenizer for the ``dynamo_chat_nvext`` + """Lazy, per-model HF fast tokenizer for the ``dynamo_chat`` transport. Bridge tokens are stitched locally — no ``/tokenize`` round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained`` cost once. @@ -109,7 +110,7 @@ def _get_local_tokenizer(self, model: str): except ImportError as exc: # pragma: no cover - dependency surface raise ImportError( "OpenAIChatCompletionsTokenClient with " - "renderer_transport='dynamo_chat_nvext' requires " + "renderer_transport='dynamo_chat' requires " "`transformers`. Install with `pip install transformers`." ) from exc cache[model] = AutoTokenizer.from_pretrained(model) @@ -133,12 +134,12 @@ def normalize_sampling_args(sampling_args: SamplingArgs): # Transport-specific opt-ins. Both transports get response-side # token IDs, just via different fields: # - # * prime_vllm_generate (vLLM): `extra_body.return_token_ids=True` + # * vllm_generate (vLLM): `extra_body.return_token_ids=True` # tells vLLM to set the non-standard `choices[0].token_ids` and # `response.prompt_token_ids` fields. `parse_tokens` reads them # directly. # - # * dynamo_chat_nvext: `nvext.extra_fields=["engine_data"]` + # * dynamo_chat: `nvext.extra_fields=["engine_data"]` # tells Dynamo's response builder to emit `response.nvext` # `engine_data.{completion_token_ids, completion_logprobs, # prompt_token_ids}` (PR #8119 channel mirrored to vLLM in @@ -146,7 +147,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): # this onto the OpenAI-shaped response so `parse_tokens` # works unmodified. `return_token_ids` is dropped because # Dynamo's strict validator rejects it. - if self.renderer_transport == "dynamo_chat_nvext": + if self.renderer_transport == "dynamo_chat": extra_body: dict[str, Any] = { "nvext": {"extra_fields": ["engine_data"]} } @@ -217,8 +218,8 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) - if self.renderer_transport == "dynamo_chat_nvext": - return await self._post_dynamo_chat_nvext( + if self.renderer_transport == "dynamo_chat": + return await self._post_dynamo_chat( prompt=prompt, prompt_ids=prompt_ids, model=model, @@ -244,7 +245,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): extra_headers=extra_headers, ) - async def _post_dynamo_chat_nvext( + async def _post_dynamo_chat( self, prompt: OpenAIChatMessages, prompt_ids: list[int], @@ -300,20 +301,33 @@ async def _post_dynamo_chat_nvext( "logprobs", "top_logprobs", "stop", + # Standard chat-completions sampling args (parity with the vLLM path, + # which spreads the full normalized sampling_args). + "presence_penalty", + "frequency_penalty", + "logit_bias", + "response_format", + "parallel_tool_calls", ) for key in promotable: value = sampling_args.get(key, extra_body.get(key)) if value is not None and key not in body: body[key] = value + # vLLM-only extra_body keys Dynamo's strict validator rejects — never + # forward these on the dynamo_chat wire (e.g. return_token_ids, which + # the vLLM path uses for TITO but Dynamo 400s on). + vllm_only = {"return_token_ids"} # Remaining extra_body keys (cache_salt, stop_token_ids, - # bad_words_token_ids, ...) pass through unchanged. The dynamo - # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist accepts these - # without rejection; unknown keys are silently ignored. + # bad_words_token_ids, ...) pass through unchanged via the dynamo + # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist. passthrough = { k: v for k, v in extra_body.items() - if k not in promotable and v is not None and k not in body + if k not in promotable + and k not in vllm_only + and v is not None + and k not in body } body.update(passthrough) @@ -563,8 +577,8 @@ async def tokenize( Dispatched by ``renderer_transport``: - * ``prime_vllm_generate`` (default): POST to vLLM's ``/tokenize`` route. - * ``dynamo_chat_nvext``: local HF fast-tokenizer call. Dynamo doesn't + * ``vllm_generate`` (default): POST to vLLM's ``/tokenize`` route. + * ``dynamo_chat``: local HF fast-tokenizer call. Dynamo doesn't expose ``/tokenize``; running locally also saves two HTTP RTTs per turn (the bridge computes both ``add_generation_prompt=True`` and ``False`` views). The HF Rust encode releases the GIL so the @@ -573,7 +587,7 @@ async def tokenize( if extra_kwargs is None: extra_kwargs = {} - if self.renderer_transport == "dynamo_chat_nvext": + if self.renderer_transport == "dynamo_chat": return await self._local_tokenize( messages=messages, tools=tools, @@ -609,7 +623,7 @@ async def _local_tokenize( model: str, extra_kwargs: dict, ) -> list[int]: - """Local in-process tokenization for the ``dynamo_chat_nvext`` transport. + """Local in-process tokenization for the ``dynamo_chat`` transport. Bridge tokenization under TITO calls this twice per turn (once for ``add_generation_prompt=True`` and once for ``False``). Both runs diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index ba97e1800c..cc0acd3556 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -606,14 +606,19 @@ async def get_native_response( # Thread renderer_transport from ClientConfig into generate() so the # renderer client works against Dynamo's /v1/chat/completions surface # as well as vLLM's /inference/v1/generate. setup_clients auto-picks - # "dynamo_chat_nvext" when client_config.backend == "dynamo". + # "dynamo_chat" when client_config.backend == "dynamo". # ``renderers.client.generate`` raises ``renderers.OverlongPromptError`` # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops. transport = ( self._config.renderer_transport if self._config is not None - else "prime_vllm_generate" + else "vllm_generate" ) + # Only pass transport= when non-default: a pinned ``renderers`` may + # predate the kwarg, so the default path must use the upstream signature. + generate_kwargs: dict[str, Any] = {} + if transport != "vllm_generate": + generate_kwargs["transport"] = transport try: return await generate( client=self.client, @@ -625,11 +630,11 @@ async def get_native_response( prompt_attribution=prompt_attribution, tools=tools, sampling_params=sampling_params, - transport=transport, cache_salt=args.get("cache_salt") or sampling_params.pop("cache_salt", None), priority=args.get("priority") or sampling_params.pop("priority", None), extra_headers=extra_headers or None, + **generate_kwargs, ) except RendererOverlongPromptError as exc: raise OverlongPromptError(str(exc)) from exc diff --git a/verifiers/types.py b/verifiers/types.py index 8bbc6bd573..0d2dcb8abf 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -83,17 +83,17 @@ # client targets at request-build time. Same flag drives both clients so a # single `ClientConfig.renderer_transport` setting routes consistently. # -# - "prime_vllm_generate" (default): vLLM's TITO surface. For RendererClient +# - "vllm_generate" (default): vLLM's TITO surface. For RendererClient # that's POST /v1/chat/completions with a renderer-flavored request body. # For OpenAIChatCompletionsTokenClient that's POST # /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge # tokenization via the server's /tokenize route. -# - "dynamo_chat_nvext": Dynamo's standard chat-completions route with +# - "dynamo_chat": Dynamo's standard chat-completions route with # pre-tokenized prompt carried in `nvext.token_data`. Server-side token # IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119 # canonical channel). Bridge tokenization runs locally via the # transformers fast tokenizer; no /tokenize HTTP round-trip. -RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] +RendererTransport = Literal["vllm_generate", "dynamo_chat"] # Provider-agnostic message + response types @@ -1287,7 +1287,7 @@ class ClientConfig(BaseModel): to ``None`` so non-renderer clients aren't forced to declare it; the renderer client treats ``None`` as ``AutoRendererConfig()``.""" renderer: str = "auto" - renderer_transport: RendererTransport = "prime_vllm_generate" + renderer_transport: RendererTransport = "vllm_generate" renderer_model_name: str | None = None """Override the tokenizer model name used to instantiate the renderer pool. Defaults to the model used in API requests.""" From 7a85b8469a2599407fe726e57550a794a929e77a Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 00:41:48 -0700 Subject: [PATCH 07/16] fix(clients): graft engine_data logprobs even when choice logprobs is content-less; trim test comments --- ...st_openai_chat_completions_token_client.py | 49 +++++++++++++++++-- .../clients/openai_chat_completions_client.py | 16 ++++-- 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 46b0016416..5391aee273 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -297,8 +297,8 @@ async def fake_get_prompt_ids( # noqa: ANN001 @pytest.mark.asyncio async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): - """dynamo_chat wire body: vLLM-only keys scrubbed (R3), standard sampling - args forwarded (R4), nvext token_data + passthrough preserved.""" + """dynamo_chat wire body: vLLM-only keys scrubbed, standard sampling args + forwarded, nvext token_data + passthrough preserved.""" recording_client = _RecordingClient() client = OpenAIChatCompletionsTokenClient(recording_client) @@ -320,9 +320,48 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): ) body = recording_client.calls[0]["body"] - assert "return_token_ids" not in body # R3 - assert body["presence_penalty"] == 0.2 # R4 + assert "return_token_ids" not in body + assert body["presence_penalty"] == 0.2 assert body["temperature"] == 0.5 assert body["nvext"]["token_data"] == [1, 2, 3] assert body["nvext"]["extra_fields"] == ["engine_data"] - assert body["cache_salt"] == "ckpt-1" # passthrough preserved + assert body["cache_salt"] == "ckpt-1" + + +@pytest.mark.asyncio +async def test_graft_engine_data_synthesizes_logprobs_when_content_less(): + """engine_data.completion_logprobs must be grafted even when the choice + carries a content-less logprobs object (not only when absent).""" + from openai.types.chat import ChatCompletion + + client = OpenAIChatCompletionsClient(_NoopClient()) + native = ChatCompletion.model_validate( + { + "id": "x", + "object": "chat.completion", + "created": 1, + "model": "test-model", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "ok"}, + "finish_reason": "stop", + "logprobs": {"content": None}, # present but content-less + } + ], + "nvext": { + "engine_data": { + "completion_token_ids": [10, 11], + "prompt_token_ids": [1, 2, 3], + "completion_logprobs": [-0.1, -0.2], + } + }, + } + ) + + vf_response = await client.from_native_response(native) + tokens = vf_response.message.tokens + assert tokens is not None # would be None before the fix (TITO lost) + assert tokens.completion_ids == [10, 11] + assert tokens.prompt_ids == [1, 2, 3] + assert tokens.completion_logprobs == [-0.1, -0.2] diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index b954dd4ce0..0da8b410cd 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -538,11 +538,17 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: object.__setattr__(response, "prompt_token_ids", prompt_token_ids) # Dynamo returns logprobs only under engine_data, not # choices[0].logprobs. Synthesize the standard shape so parse_tokens - # (which requires choices[0].logprobs.content) can read them. - if ( - getattr(choice, "logprobs", None) is None - and completion_logprobs is not None - ): + # (which requires choices[0].logprobs.content) can read them. Graft + # whenever the choice has no usable logprobs content — i.e. logprobs + # is missing OR present-but-content-less (empty/None content) — not + # only when it is absent entirely. + existing_lp = getattr(choice, "logprobs", None) + existing_content = ( + existing_lp.get("content") + if isinstance(existing_lp, dict) + else getattr(existing_lp, "content", None) + ) + if completion_logprobs is not None and not existing_content: synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]} try: choice.logprobs = synthesized From 7cbb603ef6fead35e88d3d24c2a3638afa28c58a Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 00:45:28 -0700 Subject: [PATCH 08/16] fix(clients): dynamo_chat forwards full normalized sampling_args (drop fixed allowlist) for vLLM-path parity --- ...st_openai_chat_completions_token_client.py | 6 ++- .../openai_chat_completions_token_client.py | 54 ++++--------------- 2 files changed, 13 insertions(+), 47 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 5391aee273..b3e5a798f4 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -309,11 +309,12 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): tools=None, sampling_args={ "temperature": 0.5, - "presence_penalty": 0.2, # standard arg outside the old allowlist + "presence_penalty": 0.2, + "reasoning_effort": "high", # arbitrary key: full parity, not an allowlist "extra_body": { "return_token_ids": True, # vLLM-only — must be scrubbed "nvext": {"extra_fields": ["engine_data"]}, - "cache_salt": "ckpt-1", # passthrough must survive + "cache_salt": "ckpt-1", }, }, extra_headers=None, @@ -323,6 +324,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): assert "return_token_ids" not in body assert body["presence_penalty"] == 0.2 assert body["temperature"] == 0.5 + assert body["reasoning_effort"] == "high" assert body["nvext"]["token_data"] == [1, 2, 3] assert body["nvext"]["extra_fields"] == ["engine_data"] assert body["cache_salt"] == "ckpt-1" diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 4ddb17dab5..6a3ee0164a 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -284,52 +284,16 @@ async def _post_dynamo_chat( if tools: body["tools"] = tools - # Sampling params that Dynamo's chat-completions surface accepts - # directly. Anything else stays in extra_body and rides as an - # unrecognized passthrough field (validate.rs PASSTHROUGH_EXTRA_FIELDS). - promotable = ( - "max_completion_tokens", - "max_tokens", - "temperature", - "top_p", - "top_k", - "min_p", - "seed", - "n", - "repetition_penalty", - "min_tokens", - "logprobs", - "top_logprobs", - "stop", - # Standard chat-completions sampling args (parity with the vLLM path, - # which spreads the full normalized sampling_args). - "presence_penalty", - "frequency_penalty", - "logit_bias", - "response_format", - "parallel_tool_calls", - ) - for key in promotable: - value = sampling_args.get(key, extra_body.get(key)) - if value is not None and key not in body: - body[key] = value - - # vLLM-only extra_body keys Dynamo's strict validator rejects — never - # forward these on the dynamo_chat wire (e.g. return_token_ids, which - # the vLLM path uses for TITO but Dynamo 400s on). + # Forward the full normalized sampling_args (parity with the vLLM path, + # which spreads all of sampling_args), then remaining extra_body keys — + # minus vLLM-only keys Dynamo's strict validator rejects (return_token_ids). + # Unknown keys ride through the dynamo frontend's PASSTHROUGH_EXTRA_FIELDS. vllm_only = {"return_token_ids"} - # Remaining extra_body keys (cache_salt, stop_token_ids, - # bad_words_token_ids, ...) pass through unchanged via the dynamo - # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist. - passthrough = { - k: v - for k, v in extra_body.items() - if k not in promotable - and k not in vllm_only - and v is not None - and k not in body - } - body.update(passthrough) + for source in (sampling_args, extra_body): + for key, value in source.items(): + if value is None or key in vllm_only or key in body: + continue + body[key] = value return await self.client.post( "/chat/completions", From 6b2dfbbaa4272b131b620b63dadff8dd92eaf9ed Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 01:07:28 -0700 Subject: [PATCH 09/16] fix(clients): centralize Dynamo denylist scrub (MITO+TITO), guard logprob length, tokenizer override, drop dead renderer field --- ...st_openai_chat_completions_token_client.py | 2 ++ .../clients/openai_chat_completions_client.py | 7 ++++- .../openai_chat_completions_token_client.py | 26 +++++++++++++++++-- verifiers/types.py | 1 - 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index b3e5a798f4..ad962c5ba2 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -311,6 +311,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): "temperature": 0.5, "presence_penalty": 0.2, "reasoning_effort": "high", # arbitrary key: full parity, not an allowlist + "spaces_between_special_tokens": False, # vLLM-only — must be scrubbed "extra_body": { "return_token_ids": True, # vLLM-only — must be scrubbed "nvext": {"extra_fields": ["engine_data"]}, @@ -322,6 +323,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): body = recording_client.calls[0]["body"] assert "return_token_ids" not in body + assert "spaces_between_special_tokens" not in body assert body["presence_penalty"] == 0.2 assert body["temperature"] == 0.5 assert body["reasoning_effort"] == "high" diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 0da8b410cd..f8e7e80f4e 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -548,7 +548,12 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: if isinstance(existing_lp, dict) else getattr(existing_lp, "content", None) ) - if completion_logprobs is not None and not existing_content: + if ( + completion_logprobs is not None + and completion_token_ids is not None + and len(completion_logprobs) == len(completion_token_ids) + and not existing_content + ): synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]} try: choice.logprobs = synthesized diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 6a3ee0164a..4725a74612 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -28,6 +28,12 @@ # around the legacy /tokenize body shape without changing the signature. _DEFAULT_TRANSPORT: RendererTransport = "vllm_generate" +# vLLM/prime-only sampling keys Dynamo's strict validator rejects — scrubbed +# from every dynamo_chat request body (both MITO and TITO paths). +_DYNAMO_DROP_KEYS = frozenset( + {"return_token_ids", "spaces_between_special_tokens", "priority"} +) + def _has_multimodal_content(messages) -> bool: """Check if any message contains multimodal content (images, audio). @@ -175,6 +181,15 @@ def normalize_sampling_args(sampling_args: SamplingArgs): sampling_args["extra_body"] = {**merged, **extra_body} else: sampling_args["extra_body"] = extra_body + if self.renderer_transport == "dynamo_chat": + # Drop vLLM/prime-only keys Dynamo rejects from both top-level + # args and extra_body, so MITO + TITO paths send a clean body. + eb = sampling_args.get("extra_body") + if isinstance(eb, dict): + for k in _DYNAMO_DROP_KEYS: + eb.pop(k, None) + for k in _DYNAMO_DROP_KEYS: + sampling_args.pop(k, None) return {k: v for k, v in sampling_args.items() if v is not None} sampling_args = normalize_sampling_args(sampling_args) @@ -288,7 +303,7 @@ async def _post_dynamo_chat( # which spreads all of sampling_args), then remaining extra_body keys — # minus vLLM-only keys Dynamo's strict validator rejects (return_token_ids). # Unknown keys ride through the dynamo frontend's PASSTHROUGH_EXTRA_FIELDS. - vllm_only = {"return_token_ids"} + vllm_only = _DYNAMO_DROP_KEYS for source in (sampling_args, extra_body): for key, value in source.items(): if value is None or key in vllm_only or key in body: @@ -596,7 +611,14 @@ async def _local_tokenize( """ import asyncio - tokenizer = self._get_local_tokenizer(model) + # Prefer the explicit tokenizer override so model aliases don't silently + # disable turn-2+ TITO (fall back to the served model name). + tok_model = ( + getattr(self._config, "renderer_model_name", None) or model + if self._config is not None + else model + ) + tokenizer = self._get_local_tokenizer(tok_model) add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) diff --git a/verifiers/types.py b/verifiers/types.py index 0d2dcb8abf..ed1ffdb145 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -1286,7 +1286,6 @@ class ClientConfig(BaseModel): Drives the renderer pool when ``client_type == "renderer"``. Defaults to ``None`` so non-renderer clients aren't forced to declare it; the renderer client treats ``None`` as ``AutoRendererConfig()``.""" - renderer: str = "auto" renderer_transport: RendererTransport = "vllm_generate" renderer_model_name: str | None = None """Override the tokenizer model name used to instantiate the renderer From 9d260d3968c13201e8024ac628d3f8d024f4c329 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 01:31:14 -0700 Subject: [PATCH 10/16] fix(clients): enforce logprobs/ids length invariant in parse_tokens (all paths) --- verifiers/clients/openai_chat_completions_client.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index f8e7e80f4e..2084ea6f33 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -600,6 +600,11 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: logprobs_content = response.choices[0].logprobs["content"] completion_logprobs = [token["logprob"] for token in logprobs_content] + if len(completion_logprobs) != len(completion_ids): + # Engine returned mismatched logprobs/ids — drop rather than emit + # out-of-sync ResponseTokens. + return None + choice_extra = choice.model_extra or {} return ResponseTokens( prompt_ids=prompt_ids, From 4aa48a4dfcd8ae6907e28e9629b4ca17e5418105 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 03:04:29 -0700 Subject: [PATCH 11/16] fix(clients): centralize tokenizer override in _get_local_tokenizer; route dynamo TITO through routed-experts sidecar helper --- .../openai_chat_completions_token_client.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 4725a74612..8de5a10ca5 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -3,7 +3,6 @@ from openai import AsyncOpenAI, BaseModel from openai.types.chat import ( - ChatCompletion, ChatCompletionAssistantMessageParam, ) from openai.types.chat.chat_completion_message_function_tool_call_param import ( @@ -108,6 +107,14 @@ def _get_local_tokenizer(self, model: str): round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained`` cost once. """ + # Honor the explicit tokenizer override (renderer_model_name) so model + # aliases don't break bridge stitching; fall back to the served model. + override = ( + getattr(self._config, "renderer_model_name", None) + if self._config is not None + else None + ) + model = override or model cache: dict[str, Any] = self.__dict__.setdefault("_tokenizer_cache", {}) if model in cache: return cache[model] @@ -310,11 +317,14 @@ async def _post_dynamo_chat( continue body[key] = value - return await self.client.post( + # Use the sidecar-aware post (same as the vLLM TITO + MITO paths) so any + # routed_experts blob is streamed, not JSON-parsed. dynamo_chat opts into + # extra_fields=["engine_data"] only, so routed_experts is normally absent. + return await post_chat_completion_with_routed_experts_sidecar( + self.client, "/chat/completions", body=body, - cast_to=ChatCompletion, - options={"headers": extra_headers} if extra_headers else {}, + extra_headers=extra_headers, ) async def get_prompt_ids( @@ -611,14 +621,7 @@ async def _local_tokenize( """ import asyncio - # Prefer the explicit tokenizer override so model aliases don't silently - # disable turn-2+ TITO (fall back to the served model name). - tok_model = ( - getattr(self._config, "renderer_model_name", None) or model - if self._config is not None - else model - ) - tokenizer = self._get_local_tokenizer(tok_model) + tokenizer = self._get_local_tokenizer(model) add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) From d713edc7ab58b4f3f2ad79e410f09d2c4166042c Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 03:11:41 -0700 Subject: [PATCH 12/16] fix(clients): load HF tokenizer inside worker thread (cache-miss from_pretrained must not block the event loop) --- verifiers/clients/openai_chat_completions_token_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 8de5a10ca5..427ed4ee1e 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -621,16 +621,19 @@ async def _local_tokenize( """ import asyncio - tokenizer = self._get_local_tokenizer(model) add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) + # Load the tokenizer inside the worker thread: a cache miss runs the + # synchronous AutoTokenizer.from_pretrained, which must not block the loop. if isinstance(messages, str): def _encode_text() -> list[int]: + tokenizer = self._get_local_tokenizer(model) return list(tokenizer.encode(messages, add_special_tokens=False)) return await asyncio.to_thread(_encode_text) def _encode_chat() -> list[int]: + tokenizer = self._get_local_tokenizer(model) ids = tokenizer.apply_chat_template( messages, tools=tools, From 193c5491c8fd5a3168dfefb9f4d6fa2868fdeb50 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 02:33:59 -0700 Subject: [PATCH 13/16] feat(types): add dtype to RoutedExpertsPayload contract --- verifiers/types.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/verifiers/types.py b/verifiers/types.py index ed1ffdb145..aa408f3a7c 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -228,6 +228,10 @@ class RoutedExpertsPayload(TypedDict): data: Any shape: list[int] start: int + # Element dtype of the decoded expert-id buffer ("uint8" / "uint16" / + # "int32"). NotRequired so payloads serialized before this field still + # validate; consumers default to "uint8" (the historical encoding). + dtype: NotRequired[str] class ResponseTokens(CustomBaseModel): From c30dad26f8906f8bdbfa93806e9870c9af7bf96b Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 11:05:07 -0700 Subject: [PATCH 14/16] fix(routed_experts): tighten dtype to Literal and make sidecar stripper key-order robust --- tests/test_trajectory_processing.py | 23 +++++++++++++++++++++++ verifiers/types.py | 8 ++++---- verifiers/utils/response_utils.py | 17 +++++++++++++---- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py index 386e4fd947..4d4ee30111 100644 --- a/tests/test_trajectory_processing.py +++ b/tests/test_trajectory_processing.py @@ -483,3 +483,26 @@ def test_trajectory_step_mask_combining(): assert token_ids == [1, 2, 3, 4, 5] assert mask == [0, 0, 0, 1, 1] assert logprobs == [0.0, 0.0, 0.0, -0.1, -0.2] + + +def test_strip_routed_experts_data_key_order_robust(): + """The zero-copy stripper must find ``data`` regardless of key order + (``dtype``/``shape``/``start`` may precede it) and no-op when absent.""" + from verifiers.utils.response_utils import strip_routed_experts_data + + # data first (fast path) + raw = b'{"routed_experts":{"data":"QUJD","shape":[3],"start":0,"dtype":"uint8"}}' + stripped, blob = strip_routed_experts_data(raw) + assert blob is not None and blob.tobytes() == b"QUJD" + assert b'"data":""' in stripped + + # dtype/shape/start before data — must still strip the blob + raw2 = b'{"routed_experts":{"dtype":"uint16","shape":[3],"start":0,"data":"WFla"}}' + stripped2, blob2 = strip_routed_experts_data(raw2) + assert blob2 is not None and blob2.tobytes() == b"WFla" + assert b'"data":""' in stripped2 + + # absent — no-op passthrough + raw3 = b'{"choices":[{"token_ids":[1,2]}]}' + stripped3, blob3 = strip_routed_experts_data(raw3) + assert blob3 is None and stripped3 == raw3 diff --git a/verifiers/types.py b/verifiers/types.py index aa408f3a7c..62226d1f47 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -228,10 +228,10 @@ class RoutedExpertsPayload(TypedDict): data: Any shape: list[int] start: int - # Element dtype of the decoded expert-id buffer ("uint8" / "uint16" / - # "int32"). NotRequired so payloads serialized before this field still - # validate; consumers default to "uint8" (the historical encoding). - dtype: NotRequired[str] + # Element dtype of the decoded expert-id buffer. NotRequired so payloads + # serialized before this field still validate; a decoder that doesn't see + # it falls back to "uint8" (the historical encoding). + dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]] class ResponseTokens(CustomBaseModel): diff --git a/verifiers/utils/response_utils.py b/verifiers/utils/response_utils.py index 7bc13bc22d..336f59f17b 100644 --- a/verifiers/utils/response_utils.py +++ b/verifiers/utils/response_utils.py @@ -9,15 +9,24 @@ TrajectoryStepTokens, ) -ROUTED_EXPERTS_DATA_PREFIX = b'"routed_experts":{"data":"' +ROUTED_EXPERTS_OBJ_PREFIX = b'"routed_experts":{' +ROUTED_EXPERTS_DATA_KEY = b'"data":"' def strip_routed_experts_data(raw: bytes) -> tuple[bytes, memoryview | None]: - data_start = raw.find(ROUTED_EXPERTS_DATA_PREFIX) - if data_start < 0: + # Zero-copy fast path for the large base64 routed_experts blob: find the + # "data" value inside the routed_experts object regardless of key order + # (shape/start/dtype may precede it), slice it out before JSON parsing. + # No-op fallback (consumer b64-decodes the string) if the shape isn't found. + obj_start = raw.find(ROUTED_EXPERTS_OBJ_PREFIX) + if obj_start < 0: return raw, None - data_start += len(ROUTED_EXPERTS_DATA_PREFIX) + data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start) + if data_key < 0: + return raw, None + + data_start = data_key + len(ROUTED_EXPERTS_DATA_KEY) data_end = raw.index(b'"', data_start) routed_data = memoryview(raw)[data_start:data_end] stripped = raw[:data_start] + raw[data_end:] From ea53210208163cc115615d5468040fce26fc6701 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 11:18:37 -0700 Subject: [PATCH 15/16] fix(routed_experts): bound sidecar stripper to the routed_experts object; document dtype field --- docs/reference.md | 1 + tests/test_trajectory_processing.py | 6 ++++++ verifiers/utils/response_utils.py | 10 +++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index a50811f4aa..26b02f258d 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -215,6 +215,7 @@ class RoutedExpertsPayload(TypedDict): data: Any # actually memoryview; kept opaque so Pydantic skips schema validation shape: list[int] start: int + dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]] # optional; absent → uint8 ``` ### TrajectoryStepTokens diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py index 4d4ee30111..3ebe7cdbb2 100644 --- a/tests/test_trajectory_processing.py +++ b/tests/test_trajectory_processing.py @@ -502,6 +502,12 @@ def test_strip_routed_experts_data_key_order_robust(): assert blob2 is not None and blob2.tobytes() == b"WFla" assert b'"data":""' in stripped2 + # routed_experts object lacks data; an unrelated sibling has data — must + # NOT be mistaken for routed experts (search bounded to the object). + raw4 = b'{"routed_experts":{"shape":[3],"start":0},"other":{"data":"UNRELATED"}}' + stripped4, blob4 = strip_routed_experts_data(raw4) + assert blob4 is None and stripped4 == raw4 + # absent — no-op passthrough raw3 = b'{"choices":[{"token_ids":[1,2]}]}' stripped3, blob3 = strip_routed_experts_data(raw3) diff --git a/verifiers/utils/response_utils.py b/verifiers/utils/response_utils.py index 336f59f17b..64539bda2a 100644 --- a/verifiers/utils/response_utils.py +++ b/verifiers/utils/response_utils.py @@ -22,7 +22,15 @@ def strip_routed_experts_data(raw: bytes) -> tuple[bytes, memoryview | None]: if obj_start < 0: return raw, None - data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start) + # Bound the search to the routed_experts object so a missing `data` here + # can't match an unrelated sibling's `data` later in the response. The + # object's values (base64 string, int shape/start, dtype) contain no `}`, + # so the first `}` after the prefix closes it. + obj_end = raw.find(b"}", obj_start) + if obj_end < 0: + return raw, None + + data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start, obj_end) if data_key < 0: return raw, None From b31ff2d767f482178ecb68ba73ff44a67ec1a7eb Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 17:33:46 -0700 Subject: [PATCH 16/16] docs(clients): drop PR-number and branch/plan references from dynamo_chat comments --- verifiers/clients/openai_chat_completions_client.py | 4 ++-- verifiers/clients/openai_chat_completions_token_client.py | 3 +-- verifiers/types.py | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 2084ea6f33..0246b9f669 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -475,10 +475,10 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: Three coexisting wire shapes from dynamo's vLLM/SGLang backends: 1. ``response.nvext.engine_data.{completion_token_ids, - completion_logprobs, prompt_token_ids}`` — PR #8119 channel + completion_logprobs, prompt_token_ids}`` (opt-in: ``nvext.extra_fields=["engine_data"]``). 2. ``response.nvext.completion_token_ids`` — top-level shape - from rl-sdk-2 plan A4 (opt-in: + (opt-in: ``nvext.extra_fields=["completion_token_ids"]``). No logprobs in this shape; logprobs ride the standard ``choices[0].logprobs.content[*].logprob`` channel. diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 427ed4ee1e..36fd9f08cb 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -155,8 +155,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): # * dynamo_chat: `nvext.extra_fields=["engine_data"]` # tells Dynamo's response builder to emit `response.nvext` # `engine_data.{completion_token_ids, completion_logprobs, - # prompt_token_ids}` (PR #8119 channel mirrored to vLLM in - # ai-dynamo/dynamo `rl-sdk-2`). `from_native_response` grafts + # prompt_token_ids}`. `from_native_response` grafts # this onto the OpenAI-shaped response so `parse_tokens` # works unmodified. `return_token_ids` is dropped because # Dynamo's strict validator rejects it. diff --git a/verifiers/types.py b/verifiers/types.py index 62226d1f47..fd511603c3 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -90,8 +90,8 @@ # tokenization via the server's /tokenize route. # - "dynamo_chat": Dynamo's standard chat-completions route with # pre-tokenized prompt carried in `nvext.token_data`. Server-side token -# IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119 -# canonical channel). Bridge tokenization runs locally via the +# IDs come back via `nvext.engine_data.completion_token_ids` (the +# canonical Dynamo channel). Bridge tokenization runs locally via the # transformers fast tokenizer; no /tokenize HTTP round-trip. RendererTransport = Literal["vllm_generate", "dynamo_chat"]