From 230384a588d6d456bc478432748eb6a3d5eeabc5 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 13 May 2026 09:42:09 -0700
Subject: [PATCH 01/16] feat(types): add RendererTransport literal +
 ClientConfig.renderer_transport

---
 verifiers/types.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/verifiers/types.py b/verifiers/types.py
index 4242f8a86f..8bbc6bd573 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -78,6 +78,23 @@
 EndpointClient: TypeAlias = AsyncOpenAI | OpenAI | AsyncAnthropic | Anthropic
 MessageType = Literal["chat", "completion"]  # deprecated
 
+# Wire-shape selector shared between RendererClient and
+# OpenAIChatCompletionsTokenClient. Picks which inference-server surface the
+# client targets at request-build time. Same flag drives both clients so a
+# single `ClientConfig.renderer_transport` setting routes consistently.
+#
+# - "prime_vllm_generate" (default): vLLM's TITO surface. For RendererClient
+#   that's POST /v1/chat/completions with a renderer-flavored request body.
+#   For OpenAIChatCompletionsTokenClient that's POST
+#   /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge
+#   tokenization via the server's /tokenize route.
+# - "dynamo_chat_nvext": Dynamo's standard chat-completions route with
+#   pre-tokenized prompt carried in `nvext.token_data`. Server-side token
+#   IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119
+#   canonical channel). Bridge tokenization runs locally via the
+#   transformers fast tokenizer; no /tokenize HTTP round-trip.
+RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"]
+
 
 # Provider-agnostic message + response types
 class CustomBaseModel(BaseModel):
@@ -1269,6 +1286,8 @@ class ClientConfig(BaseModel):
     Drives the renderer pool when ``client_type == "renderer"``. Defaults
     to ``None`` so non-renderer clients aren't forced to declare it; the
     renderer client treats ``None`` as ``AutoRendererConfig()``."""
+    renderer: str = "auto"
+    renderer_transport: RendererTransport = "prime_vllm_generate"
     renderer_model_name: str | None = None
     """Override the tokenizer model name used to instantiate the renderer
     pool. Defaults to the model used in API requests."""

From 131109619bf39a26accfb33b3c6964af0896aad0 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 13 May 2026 09:42:18 -0700
Subject: [PATCH 02/16] feat(clients): graft nvext.engine_data onto OpenAI
 response in parse_tokens

Dynamo's vLLM and SGLang backends return engine-side token IDs and per-token
logprobs under `response.nvext.engine_data` when the client opts in via
`nvext.extra_fields=["engine_data"]` (PR #8119). The vLLM-native path uses
non-standard top-level fields (`choices[0].token_ids`, `response.prompt_token_ids`).

Add a small graft inside `from_native_response.parse_tokens` that copies the
engine_data fields onto the OpenAI-shaped response when present and the
top-level fields are absent. The rest of parse_tokens then reads via the
standard SDK attribute path regardless of backend.
---
 .../clients/openai_chat_completions_client.py | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index d7d262f4be..87a0564510 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -469,8 +469,54 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason:
                 case _:
                     return None
 
+        def _graft_engine_data(response: OpenAIChatResponse) -> None:
+            """Graft ``nvext.engine_data.*`` onto top-level response fields.
+
+            Dynamo's vLLM/SGLang backends emit engine-side token IDs and
+            per-token logprobs under ``response.nvext.engine_data`` when the
+            client opts in via ``nvext.extra_fields=["engine_data"]`` (PR
+            #8119). Older vLLM-native paths set
+            ``response.choices[0].token_ids`` / ``response.prompt_token_ids``
+            directly. This helper bridges the gap: if ``engine_data`` is
+            present and the top-level fields are missing, copy them across.
+            The rest of ``parse_tokens`` then reads via the standard openai
+            SDK attribute path regardless of backend.
+            """
+            nvext = getattr(response, "nvext", None)
+            if nvext is None and hasattr(response, "model_dump"):
+                nvext = response.model_dump().get("nvext")
+            if not isinstance(nvext, dict):
+                return
+            engine_data = nvext.get("engine_data")
+            if not isinstance(engine_data, dict):
+                return
+            choice = response.choices[0]
+            if (
+                getattr(choice, "token_ids", None) is None
+                and engine_data.get("completion_token_ids") is not None
+            ):
+                try:
+                    choice.token_ids = list(engine_data["completion_token_ids"])
+                except Exception:
+                    object.__setattr__(
+                        choice, "token_ids", list(engine_data["completion_token_ids"])
+                    )
+            if (
+                getattr(response, "prompt_token_ids", None) is None
+                and engine_data.get("prompt_token_ids") is not None
+            ):
+                try:
+                    response.prompt_token_ids = list(engine_data["prompt_token_ids"])
+                except Exception:
+                    object.__setattr__(
+                        response,
+                        "prompt_token_ids",
+                        list(engine_data["prompt_token_ids"]),
+                    )
+
         def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
             assert len(response.choices) == 1, "Response should always have one choice"
+            _graft_engine_data(response)
             choice = response.choices[0]
             if not hasattr(choice, "token_ids"):
                 return None

From c766529f48d5ddaef9dde7f398fc5d8ee6414ad7 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 13 May 2026 09:42:28 -0700
Subject: [PATCH 03/16] feat(tito): add dynamo_chat_nvext transport + local
 bridge tokenize

The verifiers TITO client previously only spoke vLLM's TITO surface
(POST /v1/chat/completions/tokens with tokens=prompt_ids; bridge tokens
via /tokenize). Dynamo serves neither route, so multi-turn TITO against
Dynamo silently degraded to MITO on every turn after the first.

This teaches OpenAIChatCompletionsTokenClient to read
ClientConfig.renderer_transport and route accordingly:

  * prime_vllm_generate (default): unchanged. POST /v1/chat/completions/tokens
    with tokens=prompt_ids; bridge tokens via /tokenize HTTP. Requires vLLM
    >= 0.20.

  * dynamo_chat_nvext: POST /v1/chat/completions with placeholder messages +
    nvext.token_data=prompt_ids. Bridge tokens are computed locally via the
    model's HF fast tokenizer (no /tokenize HTTP round-trip). Server returns
    engine-side token IDs and logprobs under nvext.engine_data (PR #8119
    channel), parsed by the OpenAIChatCompletionsClient.from_native_response
    graft so the rest of the pipeline is transport-agnostic.

Also fix the normalize_for_comparison asymmetry that caused get_prompt_ids
to never match for vf.Message-shaped input (the form MultiTurnEnv produces
after maybe_normalize_messages). Drop None-valued keys so model_dump's
exhaustive view is equivalent to to_native_prompt's slimmer view.
---
 .../openai_chat_completions_token_client.py   | 266 +++++++++++++++++-
 1 file changed, 258 insertions(+), 8 deletions(-)

diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index 2d8cd701cc..e5ec9a4a6e 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -18,11 +18,15 @@
     OpenAITool,
     handle_openai_overlong_prompt,
 )
-from verifiers.types import SamplingArgs, State
+from verifiers.types import RendererTransport, SamplingArgs, State
 from verifiers.utils.client_utils import (
     post_chat_completion_with_routed_experts_sidecar,
 )
 
+# Sentinel for the default (legacy vLLM) transport. Lets callers route
+# around the legacy /tokenize body shape without changing the signature.
+_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate"
+
 
 def _has_multimodal_content(messages) -> bool:
     """Check if any message contains multimodal content (images, audio).
@@ -51,7 +55,25 @@ class TokenizeResponse(BaseModel):
 
 
 class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient):
-    """Wrapper for custom vLLM route /v1/chat/completions/tokens via AsyncOpenAI client."""
+    """Token-in / token-out chat client.
+
+    Two transports share this class, selected via
+    ``ClientConfig.renderer_transport``:
+
+    * ``prime_vllm_generate`` (default): vLLM's TITO surface.
+      Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids``
+      and uses the server's ``/tokenize`` endpoint for bridge tokens.
+      Requires vLLM ``>=0.20``.
+
+    * ``dynamo_chat_nvext``: Dynamo's standard ``/v1/chat/completions``
+      route with ``nvext.token_data=prompt_ids``. Server-side response
+      token IDs come back via ``response.nvext.engine_data.*``
+      (`OpenAIChatCompletionsClient.from_native_response` grafts them
+      onto the OpenAI-shaped response). Bridge tokens are computed
+      locally via the model's HuggingFace fast tokenizer — no
+      ``/tokenize`` HTTP round-trip — since Dynamo doesn't expose vLLM's
+      token routes.
+    """
 
     @property
     def token_client(self) -> AsyncOpenAI:
@@ -61,6 +83,38 @@ def token_client(self) -> AsyncOpenAI:
             base_url = base_url[:-3]
         return self.client.with_options(base_url=base_url)
 
+    @property
+    def renderer_transport(self) -> RendererTransport:
+        """Wire-shape selector. ``ClientConfig.renderer_transport`` if set,
+        else the default vLLM TITO surface. Mirrors the same field used by
+        ``RendererClient`` so backend selection stays in one place."""
+        return cast(
+            RendererTransport,
+            getattr(self._config, "renderer_transport", _DEFAULT_TRANSPORT)
+            if self._config is not None
+            else _DEFAULT_TRANSPORT,
+        )
+
+    def _get_local_tokenizer(self, model: str):
+        """Lazy, per-model HF fast tokenizer for the ``dynamo_chat_nvext``
+        transport. Bridge tokens are stitched locally — no ``/tokenize``
+        round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained``
+        cost once.
+        """
+        cache: dict[str, Any] = self.__dict__.setdefault("_tokenizer_cache", {})
+        if model in cache:
+            return cache[model]
+        try:
+            from transformers import AutoTokenizer  # type: ignore[import-not-found]
+        except ImportError as exc:  # pragma: no cover - dependency surface
+            raise ImportError(
+                "OpenAIChatCompletionsTokenClient with "
+                "renderer_transport='dynamo_chat_nvext' requires "
+                "`transformers`. Install with `pip install transformers`."
+            ) from exc
+        cache[model] = AutoTokenizer.from_pretrained(model)
+        return cache[model]
+
     @handle_openai_overlong_prompt
     async def get_native_response(
         self,
@@ -75,12 +129,49 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             if "max_tokens" in sampling_args:
                 sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens")
             sampling_args["logprobs"] = True
-            extra_body = dict(return_token_ids=True)
-            if "extra_body" in sampling_args:
-                sampling_args["extra_body"] = {
-                    **sampling_args["extra_body"],
-                    **extra_body,
+
+            # Transport-specific opt-ins. Both transports get response-side
+            # token IDs, just via different fields:
+            #
+            #   * prime_vllm_generate (vLLM): `extra_body.return_token_ids=True`
+            #     tells vLLM to set the non-standard `choices[0].token_ids` and
+            #     `response.prompt_token_ids` fields. `parse_tokens` reads them
+            #     directly.
+            #
+            #   * dynamo_chat_nvext: `nvext.extra_fields=["engine_data"]`
+            #     tells Dynamo's response builder to emit `response.nvext`
+            #     `engine_data.{completion_token_ids, completion_logprobs,
+            #     prompt_token_ids}` (PR #8119 channel mirrored to vLLM in
+            #     ai-dynamo/dynamo `rl-sdk-2`). `from_native_response` grafts
+            #     this onto the OpenAI-shaped response so `parse_tokens`
+            #     works unmodified. `return_token_ids` is dropped because
+            #     Dynamo's strict validator rejects it.
+            if self.renderer_transport == "dynamo_chat_nvext":
+                extra_body: dict[str, Any] = {
+                    "nvext": {"extra_fields": ["engine_data"]}
                 }
+            else:
+                extra_body = {"return_token_ids": True}
+
+            if "extra_body" in sampling_args:
+                merged = {**sampling_args["extra_body"]}
+                # Merge nvext.extra_fields cumulatively rather than overwriting,
+                # so caller-provided extra_fields (e.g. "timing", "worker_id")
+                # coexist with our "engine_data" opt-in.
+                if "nvext" in merged and "nvext" in extra_body:
+                    base = dict(merged.get("nvext") or {})
+                    inc = dict(extra_body.get("nvext") or {})
+                    base_ef = list(base.get("extra_fields") or [])
+                    inc_ef = list(inc.get("extra_fields") or [])
+                    merged_ef = list(dict.fromkeys(base_ef + inc_ef))
+                    merged_nvext = {**base, **inc, "extra_fields": merged_ef}
+                    merged["nvext"] = merged_nvext
+                    sampling_args["extra_body"] = {
+                        **{k: v for k, v in extra_body.items() if k != "nvext"},
+                        **merged,
+                    }
+                else:
+                    sampling_args["extra_body"] = {**merged, **extra_body}
             else:
                 sampling_args["extra_body"] = extra_body
             return {k: v for k, v in sampling_args.items() if v is not None}
@@ -126,6 +217,16 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
                 prompt, model, sampling_args, tools, extra_headers=extra_headers
             )
 
+        if self.renderer_transport == "dynamo_chat_nvext":
+            return await self._post_dynamo_chat_nvext(
+                prompt=prompt,
+                prompt_ids=prompt_ids,
+                model=model,
+                tools=tools,
+                sampling_args=sampling_args,
+                extra_headers=extra_headers,
+            )
+
         extra_body = sampling_args.pop("extra_body", {})
         body = {
             "model": model,
@@ -143,6 +244,86 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             extra_headers=extra_headers,
         )
 
+    async def _post_dynamo_chat_nvext(
+        self,
+        prompt: OpenAIChatMessages,
+        prompt_ids: list[int],
+        model: str,
+        tools: list[OpenAITool] | None,
+        sampling_args: dict,
+        extra_headers: Mapping[str, str] | None,
+    ) -> OpenAIChatResponse:
+        """Post stitched ``prompt_ids`` to Dynamo's chat-completions route.
+
+        The engine sees ``nvext.token_data`` and skips its own tokenization,
+        so the placeholder ``messages`` value stays small regardless of
+        trajectory length. Response token IDs come back via
+        ``response.nvext.engine_data.completion_token_ids`` and are grafted
+        onto ``choices[0].token_ids`` by
+        ``OpenAIChatCompletionsClient.from_native_response`` so the rest of
+        the pipeline reads them via the standard openai SDK attribute path.
+        """
+        extra_body = dict(sampling_args.pop("extra_body", {}) or {})
+
+        # nvext.token_data is the canonical pre-tokenized-prompt channel.
+        # Merge with caller-provided nvext (extra_fields etc.) rather than
+        # overwriting it. normalize_sampling_args already injected
+        # extra_fields=["engine_data"] into extra_body.nvext, so this just
+        # adds token_data to that same dict.
+        caller_nvext = dict(extra_body.pop("nvext", None) or {})
+        caller_nvext["token_data"] = prompt_ids
+        nvext = caller_nvext
+
+        body: dict[str, Any] = {
+            "model": model,
+            "messages": prompt,  # placeholder; engine ignores when token_data present
+            "stream": False,
+            "nvext": nvext,
+        }
+        if tools:
+            body["tools"] = tools
+
+        # Sampling params that Dynamo's chat-completions surface accepts
+        # directly. Anything else stays in extra_body and rides as an
+        # unrecognized passthrough field (validate.rs PASSTHROUGH_EXTRA_FIELDS).
+        promotable = (
+            "max_completion_tokens",
+            "max_tokens",
+            "temperature",
+            "top_p",
+            "top_k",
+            "min_p",
+            "seed",
+            "n",
+            "repetition_penalty",
+            "min_tokens",
+            "logprobs",
+            "top_logprobs",
+            "stop",
+        )
+        for key in promotable:
+            value = sampling_args.get(key, extra_body.get(key))
+            if value is not None and key not in body:
+                body[key] = value
+
+        # Remaining extra_body keys (cache_salt, stop_token_ids,
+        # bad_words_token_ids, ...) pass through unchanged. The dynamo
+        # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist accepts these
+        # without rejection; unknown keys are silently ignored.
+        passthrough = {
+            k: v
+            for k, v in extra_body.items()
+            if k not in promotable and v is not None and k not in body
+        }
+        body.update(passthrough)
+
+        return await self.client.post(
+            "/chat/completions",
+            body=body,
+            cast_to=ChatCompletion,
+            options={"headers": extra_headers} if extra_headers else {},
+        )
+
     async def get_prompt_ids(
         self,
         state: State,
@@ -176,6 +357,15 @@ def normalize_for_comparison(value: Any) -> Any:
                 # prefix-match equality is unaffected.
                 if normalized.get("content") == "":
                     normalized["content"] = None
+                # Drop None-valued keys so model_dump's exhaustive view (which
+                # carries e.g. thinking_blocks=None on AssistantMessage) is
+                # equivalent to to_native_prompt's slimmer view (which omits
+                # the field entirely). Without this, vf.Message-shaped input
+                # (what MultiTurnEnv produces after maybe_normalize_messages)
+                # never matches the to_native_prompt-normalized step messages,
+                # which breaks the prefix match and forces TITO to fall back
+                # to MITO every turn-2+.
+                normalized = {k: v for k, v in normalized.items() if v is not None}
                 return normalized
             if isinstance(value, list):
                 return [normalize_for_comparison(item) for item in value]
@@ -369,9 +559,28 @@ async def tokenize(
         extra_kwargs: dict | None = None,
         **kwargs,
     ) -> list[int]:
-        """Tokenize messages using the vLLM /tokenize API."""
+        """Tokenize messages for bridge-token computation.
+
+        Dispatched by ``renderer_transport``:
+
+        * ``prime_vllm_generate`` (default): POST to vLLM's ``/tokenize`` route.
+        * ``dynamo_chat_nvext``: local HF fast-tokenizer call. Dynamo doesn't
+          expose ``/tokenize``; running locally also saves two HTTP RTTs per
+          turn (the bridge computes both ``add_generation_prompt=True`` and
+          ``False`` views). The HF Rust encode releases the GIL so the
+          ``asyncio.to_thread`` wrap gives the event loop real parallelism.
+        """
         if extra_kwargs is None:
             extra_kwargs = {}
+
+        if self.renderer_transport == "dynamo_chat_nvext":
+            return await self._local_tokenize(
+                messages=messages,
+                tools=tools,
+                model=model,
+                extra_kwargs=extra_kwargs,
+            )
+
         if isinstance(messages, str):
             body = dict(
                 model=model,
@@ -392,3 +601,44 @@ async def tokenize(
                 "/tokenize", body=body, cast_to=TokenizeResponse
             )
         return tokenize_response.tokens
+
+    async def _local_tokenize(
+        self,
+        messages: str | OpenAIChatMessages,
+        tools: list[OpenAITool] | None,
+        model: str,
+        extra_kwargs: dict,
+    ) -> list[int]:
+        """Local in-process tokenization for the ``dynamo_chat_nvext`` transport.
+
+        Bridge tokenization under TITO calls this twice per turn (once for
+        ``add_generation_prompt=True`` and once for ``False``). Both runs
+        execute in a worker thread so the event loop stays free; HF fast
+        tokenizers release the GIL during the Rust encode pass.
+        """
+        import asyncio
+
+        tokenizer = self._get_local_tokenizer(model)
+        add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True))
+        chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {})
+
+        if isinstance(messages, str):
+            def _encode_text() -> list[int]:
+                return list(tokenizer.encode(messages, add_special_tokens=False))
+            return await asyncio.to_thread(_encode_text)
+
+        def _encode_chat() -> list[int]:
+            ids = tokenizer.apply_chat_template(
+                messages,
+                tools=tools,
+                add_generation_prompt=add_generation_prompt,
+                tokenize=True,
+                **chat_template_kwargs,
+            )
+            if hasattr(ids, "input_ids"):
+                ids = ids.input_ids
+            if ids and isinstance(ids[0], list):
+                ids = ids[0]
+            return [int(t) for t in ids]
+
+        return await asyncio.to_thread(_encode_chat)

From f12bf6346d8dc2ee3669d5e18265880b2d6bda00 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 13 May 2026 20:43:58 -0700
Subject: [PATCH 04/16] feat(clients): graft top-level
 nvext.completion_token_ids + prompt_token_ids (plan B3)

---
 .../clients/openai_chat_completions_client.py | 70 ++++++++++++-------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index 87a0564510..c4e60f3926 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -470,49 +470,67 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason:
                     return None
 
         def _graft_engine_data(response: OpenAIChatResponse) -> None:
-            """Graft ``nvext.engine_data.*`` onto top-level response fields.
-
-            Dynamo's vLLM/SGLang backends emit engine-side token IDs and
-            per-token logprobs under ``response.nvext.engine_data`` when the
-            client opts in via ``nvext.extra_fields=["engine_data"]`` (PR
-            #8119). Older vLLM-native paths set
-            ``response.choices[0].token_ids`` / ``response.prompt_token_ids``
-            directly. This helper bridges the gap: if ``engine_data`` is
-            present and the top-level fields are missing, copy them across.
-            The rest of ``parse_tokens`` then reads via the standard openai
-            SDK attribute path regardless of backend.
+            """Graft engine-side token IDs onto top-level response fields.
+
+            Three coexisting wire shapes from dynamo's vLLM/SGLang backends:
+
+              1. ``response.nvext.engine_data.{completion_token_ids,
+                 completion_logprobs, prompt_token_ids}`` — PR #8119 channel
+                 (opt-in: ``nvext.extra_fields=["engine_data"]``).
+              2. ``response.nvext.completion_token_ids`` — top-level shape
+                 from rl-sdk-2 plan A4 (opt-in:
+                 ``nvext.extra_fields=["completion_token_ids"]``). No
+                 logprobs in this shape; logprobs ride the standard
+                 ``choices[0].logprobs.content[*].logprob`` channel.
+              3. Older vLLM-native paths set ``response.choices[0].token_ids``
+                 / ``response.prompt_token_ids`` directly (no grafting needed).
+
+            This helper bridges (1) and (2) onto the top-level fields the
+            rest of ``parse_tokens`` reads via the standard openai SDK
+            attribute path. ``engine_data`` wins when both are present (it
+            carries more — including logprobs + prompt_token_ids).
             """
             nvext = getattr(response, "nvext", None)
             if nvext is None and hasattr(response, "model_dump"):
                 nvext = response.model_dump().get("nvext")
             if not isinstance(nvext, dict):
                 return
-            engine_data = nvext.get("engine_data")
-            if not isinstance(engine_data, dict):
-                return
             choice = response.choices[0]
+
+            engine_data = nvext.get("engine_data")
+            completion_token_ids_top = nvext.get("completion_token_ids")
+            prompt_token_ids_top = nvext.get("prompt_token_ids")
+
+            # Prefer engine_data over top-level when both arrive: engine_data
+            # bundles logprobs + prompt_token_ids in one place.
+            completion_token_ids: list[int] | None = None
+            prompt_token_ids: list[int] | None = None
+            if isinstance(engine_data, dict):
+                if engine_data.get("completion_token_ids") is not None:
+                    completion_token_ids = list(engine_data["completion_token_ids"])
+                if engine_data.get("prompt_token_ids") is not None:
+                    prompt_token_ids = list(engine_data["prompt_token_ids"])
+            if completion_token_ids is None and completion_token_ids_top is not None:
+                completion_token_ids = list(completion_token_ids_top)
+            if prompt_token_ids is None and prompt_token_ids_top is not None:
+                prompt_token_ids = list(prompt_token_ids_top)
+
             if (
                 getattr(choice, "token_ids", None) is None
-                and engine_data.get("completion_token_ids") is not None
+                and completion_token_ids is not None
             ):
                 try:
-                    choice.token_ids = list(engine_data["completion_token_ids"])
+                    choice.token_ids = completion_token_ids
                 except Exception:
-                    object.__setattr__(
-                        choice, "token_ids", list(engine_data["completion_token_ids"])
-                    )
+                    object.__setattr__(choice, "token_ids", completion_token_ids)
             if (
                 getattr(response, "prompt_token_ids", None) is None
-                and engine_data.get("prompt_token_ids") is not None
+                and prompt_token_ids is not None
             ):
                 try:
-                    response.prompt_token_ids = list(engine_data["prompt_token_ids"])
+                    response.prompt_token_ids = prompt_token_ids
                 except Exception:
-                    object.__setattr__(
-                        response,
-                        "prompt_token_ids",
-                        list(engine_data["prompt_token_ids"]),
-                    )
+                    object.__setattr__(response, "prompt_token_ids", prompt_token_ids)
 
         def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
             assert len(response.choices) == 1, "Response should always have one choice"

From ee3482aebfaf35e47ec73a55db9276364d63e1cd Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Thu, 14 May 2026 10:21:39 -0700
Subject: [PATCH 05/16] feat(clients): thread renderer_transport from
 ClientConfig to renderers.generate()

---
 verifiers/clients/renderer_client.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py
index 64ca4ec89d..ba97e1800c 100644
--- a/verifiers/clients/renderer_client.py
+++ b/verifiers/clients/renderer_client.py
@@ -603,15 +603,17 @@ async def get_native_response(
             multi_modal_data = None
             prompt_attribution = None
 
-        # ``renderers.client.generate`` discovers the engine's context-length
-        # cap on its own (via ``GET /v1/models``, cached) and raises
-        # ``renderers.OverlongPromptError`` on pre-flight overflow. Rebadge
-        # that into the verifiers-native ``OverlongPromptError`` so the
-        # ``MultiTurnEnv.prompt_too_long`` stop condition picks it up via
-        # the ``vf.Error`` hierarchy. The ``@handle_openai_overlong_prompt``
-        # decorator still handles the fallback case (cap unknown → engine
-        # 4xx → vf.OverlongPromptError) for engines whose ``/v1/models``
-        # doesn't expose ``max_model_len``.
+        # Thread renderer_transport from ClientConfig into generate() so the
+        # renderer client works against Dynamo's /v1/chat/completions surface
+        # as well as vLLM's /inference/v1/generate. setup_clients auto-picks
+        # "dynamo_chat_nvext" when client_config.backend == "dynamo".
+        # ``renderers.client.generate`` raises ``renderers.OverlongPromptError``
+        # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops.
+        transport = (
+            self._config.renderer_transport
+            if self._config is not None
+            else "prime_vllm_generate"
+        )
         try:
             return await generate(
                 client=self.client,
@@ -623,6 +625,7 @@ async def get_native_response(
                 prompt_attribution=prompt_attribution,
                 tools=tools,
                 sampling_params=sampling_params,
+                transport=transport,
                 cache_salt=args.get("cache_salt")
                 or sampling_params.pop("cache_salt", None),
                 priority=args.get("priority") or sampling_params.pop("priority", None),

From 3b58bf98c0c8b4bec247de61ed5c0ee99860f352 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Tue, 9 Jun 2026 00:31:57 -0700
Subject: [PATCH 06/16] fix(clients): address PR review R1-R5 (guard transport
 kwarg, import ChatCompletion, scrub return_token_ids, forward sampling args,
 graft engine_data logprobs) + rename to dynamo_chat

---
 ...st_openai_chat_completions_token_client.py | 33 ++++++++++++
 .../clients/openai_chat_completions_client.py | 17 ++++++
 .../openai_chat_completions_token_client.py   | 52 ++++++++++++-------
 verifiers/clients/renderer_client.py          | 11 ++--
 verifiers/types.py                            |  8 +--
 5 files changed, 95 insertions(+), 26 deletions(-)

diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py
index 923ff118e0..46b0016416 100644
--- a/tests/test_openai_chat_completions_token_client.py
+++ b/tests/test_openai_chat_completions_token_client.py
@@ -293,3 +293,36 @@ async def fake_get_prompt_ids(  # noqa: ANN001
     assert len(recording_client.calls) == 1
     assert recording_client.calls[0]["path"] == "/chat/completions/tokens"
     assert recording_client.calls[0]["body"]["tokens"] == [10, 20]
+
+
+@pytest.mark.asyncio
+async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
+    """dynamo_chat wire body: vLLM-only keys scrubbed (R3), standard sampling
+    args forwarded (R4), nvext token_data + passthrough preserved."""
+    recording_client = _RecordingClient()
+    client = OpenAIChatCompletionsTokenClient(recording_client)
+
+    await client._post_dynamo_chat(
+        prompt=cast(Any, [{"role": "user", "content": ""}]),
+        prompt_ids=[1, 2, 3],
+        model="test-model",
+        tools=None,
+        sampling_args={
+            "temperature": 0.5,
+            "presence_penalty": 0.2,  # standard arg outside the old allowlist
+            "extra_body": {
+                "return_token_ids": True,  # vLLM-only — must be scrubbed
+                "nvext": {"extra_fields": ["engine_data"]},
+                "cache_salt": "ckpt-1",  # passthrough must survive
+            },
+        },
+        extra_headers=None,
+    )
+
+    body = recording_client.calls[0]["body"]
+    assert "return_token_ids" not in body  # R3
+    assert body["presence_penalty"] == 0.2  # R4
+    assert body["temperature"] == 0.5
+    assert body["nvext"]["token_data"] == [1, 2, 3]
+    assert body["nvext"]["extra_fields"] == ["engine_data"]
+    assert body["cache_salt"] == "ckpt-1"  # passthrough preserved
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index c4e60f3926..b954dd4ce0 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -505,11 +505,16 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None:
             # bundles logprobs + prompt_token_ids in one place.
             completion_token_ids: list[int] | None = None
             prompt_token_ids: list[int] | None = None
+            completion_logprobs: list[float] | None = None
             if isinstance(engine_data, dict):
                 if engine_data.get("completion_token_ids") is not None:
                     completion_token_ids = list(engine_data["completion_token_ids"])
                 if engine_data.get("prompt_token_ids") is not None:
                     prompt_token_ids = list(engine_data["prompt_token_ids"])
+                if engine_data.get("completion_logprobs") is not None:
+                    completion_logprobs = [
+                        float(x) for x in engine_data["completion_logprobs"]
+                    ]
             if completion_token_ids is None and completion_token_ids_top is not None:
                 completion_token_ids = list(completion_token_ids_top)
             if prompt_token_ids is None and prompt_token_ids_top is not None:
@@ -531,6 +536,18 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None:
                     response.prompt_token_ids = prompt_token_ids
                 except Exception:
                     object.__setattr__(response, "prompt_token_ids", prompt_token_ids)
+            # Dynamo returns logprobs only under engine_data, not
+            # choices[0].logprobs. Synthesize the standard shape so parse_tokens
+            # (which requires choices[0].logprobs.content) can read them.
+            if (
+                getattr(choice, "logprobs", None) is None
+                and completion_logprobs is not None
+            ):
+                synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]}
+                try:
+                    choice.logprobs = synthesized
+                except Exception:
+                    object.__setattr__(choice, "logprobs", synthesized)
 
         def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
             assert len(response.choices) == 1, "Response should always have one choice"
diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index e5ec9a4a6e..4ddb17dab5 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -3,6 +3,7 @@
 
 from openai import AsyncOpenAI, BaseModel
 from openai.types.chat import (
+    ChatCompletion,
     ChatCompletionAssistantMessageParam,
 )
 from openai.types.chat.chat_completion_message_function_tool_call_param import (
@@ -25,7 +26,7 @@
 
 # Sentinel for the default (legacy vLLM) transport. Lets callers route
 # around the legacy /tokenize body shape without changing the signature.
-_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate"
+_DEFAULT_TRANSPORT: RendererTransport = "vllm_generate"
 
 
 def _has_multimodal_content(messages) -> bool:
@@ -60,12 +61,12 @@ class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient):
     Two transports share this class, selected via
     ``ClientConfig.renderer_transport``:
 
-    * ``prime_vllm_generate`` (default): vLLM's TITO surface.
+    * ``vllm_generate`` (default): vLLM's TITO surface.
       Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids``
       and uses the server's ``/tokenize`` endpoint for bridge tokens.
       Requires vLLM ``>=0.20``.
 
-    * ``dynamo_chat_nvext``: Dynamo's standard ``/v1/chat/completions``
+    * ``dynamo_chat``: Dynamo's standard ``/v1/chat/completions``
       route with ``nvext.token_data=prompt_ids``. Server-side response
       token IDs come back via ``response.nvext.engine_data.*``
       (`OpenAIChatCompletionsClient.from_native_response` grafts them
@@ -96,7 +97,7 @@ def renderer_transport(self) -> RendererTransport:
         )
 
     def _get_local_tokenizer(self, model: str):
-        """Lazy, per-model HF fast tokenizer for the ``dynamo_chat_nvext``
+        """Lazy, per-model HF fast tokenizer for the ``dynamo_chat``
         transport. Bridge tokens are stitched locally — no ``/tokenize``
         round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained``
         cost once.
@@ -109,7 +110,7 @@ def _get_local_tokenizer(self, model: str):
         except ImportError as exc:  # pragma: no cover - dependency surface
             raise ImportError(
                 "OpenAIChatCompletionsTokenClient with "
-                "renderer_transport='dynamo_chat_nvext' requires "
+                "renderer_transport='dynamo_chat' requires "
                 "`transformers`. Install with `pip install transformers`."
             ) from exc
         cache[model] = AutoTokenizer.from_pretrained(model)
@@ -133,12 +134,12 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             # Transport-specific opt-ins. Both transports get response-side
             # token IDs, just via different fields:
             #
-            #   * prime_vllm_generate (vLLM): `extra_body.return_token_ids=True`
+            #   * vllm_generate (vLLM): `extra_body.return_token_ids=True`
             #     tells vLLM to set the non-standard `choices[0].token_ids` and
             #     `response.prompt_token_ids` fields. `parse_tokens` reads them
             #     directly.
             #
-            #   * dynamo_chat_nvext: `nvext.extra_fields=["engine_data"]`
+            #   * dynamo_chat: `nvext.extra_fields=["engine_data"]`
             #     tells Dynamo's response builder to emit `response.nvext`
             #     `engine_data.{completion_token_ids, completion_logprobs,
             #     prompt_token_ids}` (PR #8119 channel mirrored to vLLM in
@@ -146,7 +147,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             #     this onto the OpenAI-shaped response so `parse_tokens`
             #     works unmodified. `return_token_ids` is dropped because
             #     Dynamo's strict validator rejects it.
-            if self.renderer_transport == "dynamo_chat_nvext":
+            if self.renderer_transport == "dynamo_chat":
                 extra_body: dict[str, Any] = {
                     "nvext": {"extra_fields": ["engine_data"]}
                 }
@@ -217,8 +218,8 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
                 prompt, model, sampling_args, tools, extra_headers=extra_headers
             )
 
-        if self.renderer_transport == "dynamo_chat_nvext":
-            return await self._post_dynamo_chat_nvext(
+        if self.renderer_transport == "dynamo_chat":
+            return await self._post_dynamo_chat(
                 prompt=prompt,
                 prompt_ids=prompt_ids,
                 model=model,
@@ -244,7 +245,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             extra_headers=extra_headers,
         )
 
-    async def _post_dynamo_chat_nvext(
+    async def _post_dynamo_chat(
         self,
         prompt: OpenAIChatMessages,
         prompt_ids: list[int],
@@ -300,20 +301,33 @@ async def _post_dynamo_chat_nvext(
             "logprobs",
             "top_logprobs",
             "stop",
+            # Standard chat-completions sampling args (parity with the vLLM path,
+            # which spreads the full normalized sampling_args).
+            "presence_penalty",
+            "frequency_penalty",
+            "logit_bias",
+            "response_format",
+            "parallel_tool_calls",
         )
         for key in promotable:
             value = sampling_args.get(key, extra_body.get(key))
             if value is not None and key not in body:
                 body[key] = value
 
+        # vLLM-only extra_body keys Dynamo's strict validator rejects — never
+        # forward these on the dynamo_chat wire (e.g. return_token_ids, which
+        # the vLLM path uses for TITO but Dynamo 400s on).
+        vllm_only = {"return_token_ids"}
         # Remaining extra_body keys (cache_salt, stop_token_ids,
-        # bad_words_token_ids, ...) pass through unchanged. The dynamo
-        # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist accepts these
-        # without rejection; unknown keys are silently ignored.
+        # bad_words_token_ids, ...) pass through unchanged via the dynamo
+        # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist.
         passthrough = {
             k: v
             for k, v in extra_body.items()
-            if k not in promotable and v is not None and k not in body
+            if k not in promotable
+            and k not in vllm_only
+            and v is not None
+            and k not in body
         }
         body.update(passthrough)
 
@@ -563,8 +577,8 @@ async def tokenize(
 
         Dispatched by ``renderer_transport``:
 
-        * ``prime_vllm_generate`` (default): POST to vLLM's ``/tokenize`` route.
-        * ``dynamo_chat_nvext``: local HF fast-tokenizer call. Dynamo doesn't
+        * ``vllm_generate`` (default): POST to vLLM's ``/tokenize`` route.
+        * ``dynamo_chat``: local HF fast-tokenizer call. Dynamo doesn't
           expose ``/tokenize``; running locally also saves two HTTP RTTs per
           turn (the bridge computes both ``add_generation_prompt=True`` and
           ``False`` views). The HF Rust encode releases the GIL so the
@@ -573,7 +587,7 @@ async def tokenize(
         if extra_kwargs is None:
             extra_kwargs = {}
 
-        if self.renderer_transport == "dynamo_chat_nvext":
+        if self.renderer_transport == "dynamo_chat":
             return await self._local_tokenize(
                 messages=messages,
                 tools=tools,
@@ -609,7 +623,7 @@ async def _local_tokenize(
         model: str,
         extra_kwargs: dict,
     ) -> list[int]:
-        """Local in-process tokenization for the ``dynamo_chat_nvext`` transport.
+        """Local in-process tokenization for the ``dynamo_chat`` transport.
 
         Bridge tokenization under TITO calls this twice per turn (once for
         ``add_generation_prompt=True`` and once for ``False``). Both runs
diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py
index ba97e1800c..cc0acd3556 100644
--- a/verifiers/clients/renderer_client.py
+++ b/verifiers/clients/renderer_client.py
@@ -606,14 +606,19 @@ async def get_native_response(
         # Thread renderer_transport from ClientConfig into generate() so the
         # renderer client works against Dynamo's /v1/chat/completions surface
         # as well as vLLM's /inference/v1/generate. setup_clients auto-picks
-        # "dynamo_chat_nvext" when client_config.backend == "dynamo".
+        # "dynamo_chat" when client_config.backend == "dynamo".
         # ``renderers.client.generate`` raises ``renderers.OverlongPromptError``
         # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops.
         transport = (
             self._config.renderer_transport
             if self._config is not None
-            else "prime_vllm_generate"
+            else "vllm_generate"
         )
+        # Only pass transport= when non-default: a pinned ``renderers`` may
+        # predate the kwarg, so the default path must use the upstream signature.
+        generate_kwargs: dict[str, Any] = {}
+        if transport != "vllm_generate":
+            generate_kwargs["transport"] = transport
         try:
             return await generate(
                 client=self.client,
@@ -625,11 +630,11 @@ async def get_native_response(
                 prompt_attribution=prompt_attribution,
                 tools=tools,
                 sampling_params=sampling_params,
-                transport=transport,
                 cache_salt=args.get("cache_salt")
                 or sampling_params.pop("cache_salt", None),
                 priority=args.get("priority") or sampling_params.pop("priority", None),
                 extra_headers=extra_headers or None,
+                **generate_kwargs,
             )
         except RendererOverlongPromptError as exc:
             raise OverlongPromptError(str(exc)) from exc
diff --git a/verifiers/types.py b/verifiers/types.py
index 8bbc6bd573..0d2dcb8abf 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -83,17 +83,17 @@
 # client targets at request-build time. Same flag drives both clients so a
 # single `ClientConfig.renderer_transport` setting routes consistently.
 #
-# - "prime_vllm_generate" (default): vLLM's TITO surface. For RendererClient
+# - "vllm_generate" (default): vLLM's TITO surface. For RendererClient
 #   that's POST /v1/chat/completions with a renderer-flavored request body.
 #   For OpenAIChatCompletionsTokenClient that's POST
 #   /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge
 #   tokenization via the server's /tokenize route.
-# - "dynamo_chat_nvext": Dynamo's standard chat-completions route with
+# - "dynamo_chat": Dynamo's standard chat-completions route with
 #   pre-tokenized prompt carried in `nvext.token_data`. Server-side token
 #   IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119
 #   canonical channel). Bridge tokenization runs locally via the
 #   transformers fast tokenizer; no /tokenize HTTP round-trip.
-RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"]
+RendererTransport = Literal["vllm_generate", "dynamo_chat"]
 
 
 # Provider-agnostic message + response types
@@ -1287,7 +1287,7 @@ class ClientConfig(BaseModel):
     to ``None`` so non-renderer clients aren't forced to declare it; the
     renderer client treats ``None`` as ``AutoRendererConfig()``."""
     renderer: str = "auto"
-    renderer_transport: RendererTransport = "prime_vllm_generate"
+    renderer_transport: RendererTransport = "vllm_generate"
     renderer_model_name: str | None = None
     """Override the tokenizer model name used to instantiate the renderer
     pool. Defaults to the model used in API requests."""

From 7a85b8469a2599407fe726e57550a794a929e77a Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Tue, 9 Jun 2026 00:41:48 -0700
Subject: [PATCH 07/16] fix(clients): graft engine_data logprobs even when
 choice logprobs is content-less; trim test comments

---
 ...st_openai_chat_completions_token_client.py | 49 +++++++++++++++++--
 .../clients/openai_chat_completions_client.py | 16 ++++--
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py
index 46b0016416..5391aee273 100644
--- a/tests/test_openai_chat_completions_token_client.py
+++ b/tests/test_openai_chat_completions_token_client.py
@@ -297,8 +297,8 @@ async def fake_get_prompt_ids(  # noqa: ANN001
 
 @pytest.mark.asyncio
 async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
-    """dynamo_chat wire body: vLLM-only keys scrubbed (R3), standard sampling
-    args forwarded (R4), nvext token_data + passthrough preserved."""
+    """dynamo_chat wire body: vLLM-only keys scrubbed, standard sampling args
+    forwarded, nvext token_data + passthrough preserved."""
     recording_client = _RecordingClient()
     client = OpenAIChatCompletionsTokenClient(recording_client)
 
@@ -320,9 +320,48 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
     )
 
     body = recording_client.calls[0]["body"]
-    assert "return_token_ids" not in body  # R3
-    assert body["presence_penalty"] == 0.2  # R4
+    assert "return_token_ids" not in body
+    assert body["presence_penalty"] == 0.2
     assert body["temperature"] == 0.5
     assert body["nvext"]["token_data"] == [1, 2, 3]
     assert body["nvext"]["extra_fields"] == ["engine_data"]
-    assert body["cache_salt"] == "ckpt-1"  # passthrough preserved
+    assert body["cache_salt"] == "ckpt-1"
+
+
+@pytest.mark.asyncio
+async def test_graft_engine_data_synthesizes_logprobs_when_content_less():
+    """engine_data.completion_logprobs must be grafted even when the choice
+    carries a content-less logprobs object (not only when absent)."""
+    from openai.types.chat import ChatCompletion
+
+    client = OpenAIChatCompletionsClient(_NoopClient())
+    native = ChatCompletion.model_validate(
+        {
+            "id": "x",
+            "object": "chat.completion",
+            "created": 1,
+            "model": "test-model",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": "ok"},
+                    "finish_reason": "stop",
+                    "logprobs": {"content": None},  # present but content-less
+                }
+            ],
+            "nvext": {
+                "engine_data": {
+                    "completion_token_ids": [10, 11],
+                    "prompt_token_ids": [1, 2, 3],
+                    "completion_logprobs": [-0.1, -0.2],
+                }
+            },
+        }
+    )
+
+    vf_response = await client.from_native_response(native)
+    tokens = vf_response.message.tokens
+    assert tokens is not None  # None means the engine_data logprobs were not grafted
+    assert tokens.completion_ids == [10, 11]
+    assert tokens.prompt_ids == [1, 2, 3]
+    assert tokens.completion_logprobs == [-0.1, -0.2]
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index b954dd4ce0..0da8b410cd 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -538,11 +538,17 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None:
                     object.__setattr__(response, "prompt_token_ids", prompt_token_ids)
             # Dynamo returns logprobs only under engine_data, not
             # choices[0].logprobs. Synthesize the standard shape so parse_tokens
-            # (which requires choices[0].logprobs.content) can read them.
-            if (
-                getattr(choice, "logprobs", None) is None
-                and completion_logprobs is not None
-            ):
+            # (which requires choices[0].logprobs.content) can read them. Graft
+            # whenever the choice has no usable logprobs content — i.e. logprobs
+            # is missing OR present-but-content-less (empty/None content) — not
+            # only when it is absent entirely.
+            existing_lp = getattr(choice, "logprobs", None)
+            existing_content = (
+                existing_lp.get("content")
+                if isinstance(existing_lp, dict)
+                else getattr(existing_lp, "content", None)
+            )
+            if completion_logprobs is not None and not existing_content:
                 synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]}
                 try:
                     choice.logprobs = synthesized

From 7cbb603ef6fead35e88d3d24c2a3638afa28c58a Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Tue, 9 Jun 2026 00:45:28 -0700
Subject: [PATCH 08/16] fix(clients): dynamo_chat forwards full normalized
 sampling_args (drop fixed allowlist) for vLLM-path parity

---
 ...st_openai_chat_completions_token_client.py |  6 ++-
 .../openai_chat_completions_token_client.py   | 54 ++++---------------
 2 files changed, 13 insertions(+), 47 deletions(-)

diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py
index 5391aee273..b3e5a798f4 100644
--- a/tests/test_openai_chat_completions_token_client.py
+++ b/tests/test_openai_chat_completions_token_client.py
@@ -309,11 +309,12 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
         tools=None,
         sampling_args={
             "temperature": 0.5,
-            "presence_penalty": 0.2,  # standard arg outside the old allowlist
+            "presence_penalty": 0.2,
+            "reasoning_effort": "high",  # arbitrary key: full parity, not an allowlist
             "extra_body": {
                 "return_token_ids": True,  # vLLM-only — must be scrubbed
                 "nvext": {"extra_fields": ["engine_data"]},
-                "cache_salt": "ckpt-1",  # passthrough must survive
+                "cache_salt": "ckpt-1",
             },
         },
         extra_headers=None,
@@ -323,6 +324,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
     assert "return_token_ids" not in body
     assert body["presence_penalty"] == 0.2
     assert body["temperature"] == 0.5
+    assert body["reasoning_effort"] == "high"
     assert body["nvext"]["token_data"] == [1, 2, 3]
     assert body["nvext"]["extra_fields"] == ["engine_data"]
     assert body["cache_salt"] == "ckpt-1"
diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index 4ddb17dab5..6a3ee0164a 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -284,52 +284,16 @@ async def _post_dynamo_chat(
         if tools:
             body["tools"] = tools
 
-        # Sampling params that Dynamo's chat-completions surface accepts
-        # directly. Anything else stays in extra_body and rides as an
-        # unrecognized passthrough field (validate.rs PASSTHROUGH_EXTRA_FIELDS).
-        promotable = (
-            "max_completion_tokens",
-            "max_tokens",
-            "temperature",
-            "top_p",
-            "top_k",
-            "min_p",
-            "seed",
-            "n",
-            "repetition_penalty",
-            "min_tokens",
-            "logprobs",
-            "top_logprobs",
-            "stop",
-            # Standard chat-completions sampling args (parity with the vLLM path,
-            # which spreads the full normalized sampling_args).
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "response_format",
-            "parallel_tool_calls",
-        )
-        for key in promotable:
-            value = sampling_args.get(key, extra_body.get(key))
-            if value is not None and key not in body:
-                body[key] = value
-
-        # vLLM-only extra_body keys Dynamo's strict validator rejects — never
-        # forward these on the dynamo_chat wire (e.g. return_token_ids, which
-        # the vLLM path uses for TITO but Dynamo 400s on).
+        # Forward the full normalized sampling_args (parity with the vLLM path,
+        # which spreads all of sampling_args), then remaining extra_body keys —
+        # minus vLLM-only keys Dynamo's strict validator rejects (return_token_ids).
+        # Unknown keys ride through the dynamo frontend's PASSTHROUGH_EXTRA_FIELDS.
         vllm_only = {"return_token_ids"}
-        # Remaining extra_body keys (cache_salt, stop_token_ids,
-        # bad_words_token_ids, ...) pass through unchanged via the dynamo
-        # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist.
-        passthrough = {
-            k: v
-            for k, v in extra_body.items()
-            if k not in promotable
-            and k not in vllm_only
-            and v is not None
-            and k not in body
-        }
-        body.update(passthrough)
+        for source in (sampling_args, extra_body):
+            for key, value in source.items():
+                if value is None or key in vllm_only or key in body:
+                    continue
+                body[key] = value
 
         return await self.client.post(
             "/chat/completions",

From 6b2dfbbaa4272b131b620b63dadff8dd92eaf9ed Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Tue, 9 Jun 2026 01:07:28 -0700
Subject: [PATCH 09/16] fix(clients): centralize Dynamo denylist scrub
 (MITO+TITO), guard logprob length, tokenizer override, drop dead renderer
 field

---
 ...st_openai_chat_completions_token_client.py |  2 ++
 .../clients/openai_chat_completions_client.py |  7 ++++-
 .../openai_chat_completions_token_client.py   | 26 +++++++++++++++++--
 verifiers/types.py                            |  1 -
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py
index b3e5a798f4..ad962c5ba2 100644
--- a/tests/test_openai_chat_completions_token_client.py
+++ b/tests/test_openai_chat_completions_token_client.py
@@ -311,6 +311,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
             "temperature": 0.5,
             "presence_penalty": 0.2,
             "reasoning_effort": "high",  # arbitrary key: full parity, not an allowlist
+            "spaces_between_special_tokens": False,  # vLLM-only — must be scrubbed
             "extra_body": {
                 "return_token_ids": True,  # vLLM-only — must be scrubbed
                 "nvext": {"extra_fields": ["engine_data"]},
@@ -322,6 +323,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
 
     body = recording_client.calls[0]["body"]
     assert "return_token_ids" not in body
+    assert "spaces_between_special_tokens" not in body
     assert body["presence_penalty"] == 0.2
     assert body["temperature"] == 0.5
     assert body["reasoning_effort"] == "high"
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index 0da8b410cd..f8e7e80f4e 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -548,7 +548,12 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None:
                 if isinstance(existing_lp, dict)
                 else getattr(existing_lp, "content", None)
             )
-            if completion_logprobs is not None and not existing_content:
+            if (
+                completion_logprobs is not None
+                and completion_token_ids is not None
+                and len(completion_logprobs) == len(completion_token_ids)
+                and not existing_content
+            ):
                 synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]}
                 try:
                     choice.logprobs = synthesized
diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index 6a3ee0164a..4725a74612 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -28,6 +28,12 @@
 # around the legacy /tokenize body shape without changing the signature.
 _DEFAULT_TRANSPORT: RendererTransport = "vllm_generate"
 
+# vLLM/prime-only sampling keys Dynamo's strict validator rejects — scrubbed
+# from every dynamo_chat request body (both MITO and TITO paths).
+_DYNAMO_DROP_KEYS = frozenset(
+    {"return_token_ids", "spaces_between_special_tokens", "priority"}
+)
+
 
 def _has_multimodal_content(messages) -> bool:
     """Check if any message contains multimodal content (images, audio).
@@ -175,6 +181,15 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
                     sampling_args["extra_body"] = {**merged, **extra_body}
             else:
                 sampling_args["extra_body"] = extra_body
+            if self.renderer_transport == "dynamo_chat":
+                # Drop vLLM/prime-only keys Dynamo rejects from both top-level
+                # args and extra_body, so MITO + TITO paths send a clean body.
+                eb = sampling_args.get("extra_body")
+                if isinstance(eb, dict):
+                    for k in _DYNAMO_DROP_KEYS:
+                        eb.pop(k, None)
+                for k in _DYNAMO_DROP_KEYS:
+                    sampling_args.pop(k, None)
             return {k: v for k, v in sampling_args.items() if v is not None}
 
         sampling_args = normalize_sampling_args(sampling_args)
@@ -288,7 +303,7 @@ async def _post_dynamo_chat(
         # which spreads all of sampling_args), then remaining extra_body keys —
         # minus vLLM-only keys Dynamo's strict validator rejects (return_token_ids).
         # Unknown keys ride through the dynamo frontend's PASSTHROUGH_EXTRA_FIELDS.
-        vllm_only = {"return_token_ids"}
+        vllm_only = _DYNAMO_DROP_KEYS
         for source in (sampling_args, extra_body):
             for key, value in source.items():
                 if value is None or key in vllm_only or key in body:
@@ -596,7 +611,14 @@ async def _local_tokenize(
         """
         import asyncio
 
-        tokenizer = self._get_local_tokenizer(model)
+        # Prefer the explicit tokenizer override so model aliases don't silently
+        # disable turn-2+ TITO (fall back to the served model name).
+        tok_model = (
+            getattr(self._config, "renderer_model_name", None) or model
+            if self._config is not None
+            else model
+        )
+        tokenizer = self._get_local_tokenizer(tok_model)
         add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True))
         chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {})
 
diff --git a/verifiers/types.py b/verifiers/types.py
index 0d2dcb8abf..ed1ffdb145 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -1286,7 +1286,6 @@ class ClientConfig(BaseModel):
     Drives the renderer pool when ``client_type == "renderer"``. Defaults
     to ``None`` so non-renderer clients aren't forced to declare it; the
     renderer client treats ``None`` as ``AutoRendererConfig()``."""
-    renderer: str = "auto"
     renderer_transport: RendererTransport = "vllm_generate"
     renderer_model_name: str | None = None
     """Override the tokenizer model name used to instantiate the renderer

From 9d260d3968c13201e8024ac628d3f8d024f4c329 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Tue, 9 Jun 2026 01:31:14 -0700
Subject: [PATCH 10/16] fix(clients): enforce logprobs/ids length invariant in
 parse_tokens (all paths)

---
 verifiers/clients/openai_chat_completions_client.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index f8e7e80f4e..2084ea6f33 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -600,6 +600,11 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
                 logprobs_content = response.choices[0].logprobs["content"]
                 completion_logprobs = [token["logprob"] for token in logprobs_content]
 
+            if len(completion_logprobs) != len(completion_ids):
+                # Engine returned mismatched logprobs/ids — drop rather than emit
+                # out-of-sync ResponseTokens.
+                return None
+
             choice_extra = choice.model_extra or {}
             return ResponseTokens(
                 prompt_ids=prompt_ids,

From 4aa48a4dfcd8ae6907e28e9629b4ca17e5418105 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Tue, 9 Jun 2026 03:04:29 -0700
Subject: [PATCH 11/16] fix(clients): centralize tokenizer override in
 _get_local_tokenizer; route dynamo TITO through routed-experts sidecar helper

---
 .../openai_chat_completions_token_client.py   | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index 4725a74612..8de5a10ca5 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -3,7 +3,6 @@
 
 from openai import AsyncOpenAI, BaseModel
 from openai.types.chat import (
-    ChatCompletion,
     ChatCompletionAssistantMessageParam,
 )
 from openai.types.chat.chat_completion_message_function_tool_call_param import (
@@ -108,6 +107,14 @@ def _get_local_tokenizer(self, model: str):
         round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained``
         cost once.
         """
+        # Honor the explicit tokenizer override (renderer_model_name) so model
+        # aliases don't break bridge stitching; fall back to the served model.
+        override = (
+            getattr(self._config, "renderer_model_name", None)
+            if self._config is not None
+            else None
+        )
+        model = override or model
         cache: dict[str, Any] = self.__dict__.setdefault("_tokenizer_cache", {})
         if model in cache:
             return cache[model]
@@ -310,11 +317,14 @@ async def _post_dynamo_chat(
                     continue
                 body[key] = value
 
-        return await self.client.post(
+        # Use the sidecar-aware post (same as the vLLM TITO + MITO paths) so any
+        # routed_experts blob is streamed, not JSON-parsed. dynamo_chat opts into
+        # extra_fields=["engine_data"] only, so routed_experts is normally absent.
+        return await post_chat_completion_with_routed_experts_sidecar(
+            self.client,
             "/chat/completions",
             body=body,
-            cast_to=ChatCompletion,
-            options={"headers": extra_headers} if extra_headers else {},
+            extra_headers=extra_headers,
         )
 
     async def get_prompt_ids(
@@ -611,14 +621,7 @@ async def _local_tokenize(
         """
         import asyncio
 
-        # Prefer the explicit tokenizer override so model aliases don't silently
-        # disable turn-2+ TITO (fall back to the served model name).
-        tok_model = (
-            getattr(self._config, "renderer_model_name", None) or model
-            if self._config is not None
-            else model
-        )
-        tokenizer = self._get_local_tokenizer(tok_model)
+        tokenizer = self._get_local_tokenizer(model)
         add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True))
         chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {})
 

From d713edc7ab58b4f3f2ad79e410f09d2c4166042c Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Tue, 9 Jun 2026 03:11:41 -0700
Subject: [PATCH 12/16] fix(clients): load HF tokenizer inside worker thread
 (cache-miss from_pretrained must not block the event loop)

---
 verifiers/clients/openai_chat_completions_token_client.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index 8de5a10ca5..427ed4ee1e 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -621,16 +621,19 @@ async def _local_tokenize(
         """
         import asyncio
 
-        tokenizer = self._get_local_tokenizer(model)
         add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True))
         chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {})
 
+        # Load the tokenizer inside the worker thread: a cache miss runs the
+        # synchronous AutoTokenizer.from_pretrained, which must not block the loop.
         if isinstance(messages, str):
             def _encode_text() -> list[int]:
+                tokenizer = self._get_local_tokenizer(model)
                 return list(tokenizer.encode(messages, add_special_tokens=False))
             return await asyncio.to_thread(_encode_text)
 
         def _encode_chat() -> list[int]:
+            tokenizer = self._get_local_tokenizer(model)
             ids = tokenizer.apply_chat_template(
                 messages,
                 tools=tools,

From 193c5491c8fd5a3168dfefb9f4d6fa2868fdeb50 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 10 Jun 2026 02:33:59 -0700
Subject: [PATCH 13/16] feat(types): add dtype to RoutedExpertsPayload contract

---
 verifiers/types.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/verifiers/types.py b/verifiers/types.py
index ed1ffdb145..aa408f3a7c 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -228,6 +228,10 @@ class RoutedExpertsPayload(TypedDict):
     data: Any
     shape: list[int]
     start: int
+    # Element dtype of the decoded expert-id buffer ("uint8" / "uint16" /
+    # "int32"). NotRequired so payloads serialized before this field still
+    # validate; consumers default to "uint8" (the historical encoding).
+    dtype: NotRequired[str]
 
 
 class ResponseTokens(CustomBaseModel):

From c30dad26f8906f8bdbfa93806e9870c9af7bf96b Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 10 Jun 2026 11:05:07 -0700
Subject: [PATCH 14/16] fix(routed_experts): tighten dtype to Literal and make
 sidecar stripper key-order robust

---
 tests/test_trajectory_processing.py | 23 +++++++++++++++++++++++
 verifiers/types.py                  |  8 ++++----
 verifiers/utils/response_utils.py   | 17 +++++++++++++----
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py
index 386e4fd947..4d4ee30111 100644
--- a/tests/test_trajectory_processing.py
+++ b/tests/test_trajectory_processing.py
@@ -483,3 +483,26 @@ def test_trajectory_step_mask_combining():
     assert token_ids == [1, 2, 3, 4, 5]
     assert mask == [0, 0, 0, 1, 1]
     assert logprobs == [0.0, 0.0, 0.0, -0.1, -0.2]
+
+
+def test_strip_routed_experts_data_key_order_robust():
+    """The zero-copy stripper must find ``data`` regardless of key order
+    (``dtype``/``shape``/``start`` may precede it) and no-op when absent."""
+    from verifiers.utils.response_utils import strip_routed_experts_data
+
+    # data first (fast path)
+    raw = b'{"routed_experts":{"data":"QUJD","shape":[3],"start":0,"dtype":"uint8"}}'
+    stripped, blob = strip_routed_experts_data(raw)
+    assert blob is not None and blob.tobytes() == b"QUJD"
+    assert b'"data":""' in stripped
+
+    # dtype/shape/start before data — must still strip the blob
+    raw2 = b'{"routed_experts":{"dtype":"uint16","shape":[3],"start":0,"data":"WFla"}}'
+    stripped2, blob2 = strip_routed_experts_data(raw2)
+    assert blob2 is not None and blob2.tobytes() == b"WFla"
+    assert b'"data":""' in stripped2
+
+    # absent — no-op passthrough
+    raw3 = b'{"choices":[{"token_ids":[1,2]}]}'
+    stripped3, blob3 = strip_routed_experts_data(raw3)
+    assert blob3 is None and stripped3 == raw3
diff --git a/verifiers/types.py b/verifiers/types.py
index aa408f3a7c..62226d1f47 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -228,10 +228,10 @@ class RoutedExpertsPayload(TypedDict):
     data: Any
     shape: list[int]
     start: int
-    # Element dtype of the decoded expert-id buffer ("uint8" / "uint16" /
-    # "int32"). NotRequired so payloads serialized before this field still
-    # validate; consumers default to "uint8" (the historical encoding).
-    dtype: NotRequired[str]
+    # Element dtype of the decoded expert-id buffer. NotRequired so payloads
+    # serialized before this field still validate; a decoder that doesn't see
+    # it falls back to "uint8" (the historical encoding).
+    dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]]
 
 
 class ResponseTokens(CustomBaseModel):
diff --git a/verifiers/utils/response_utils.py b/verifiers/utils/response_utils.py
index 7bc13bc22d..336f59f17b 100644
--- a/verifiers/utils/response_utils.py
+++ b/verifiers/utils/response_utils.py
@@ -9,15 +9,24 @@
     TrajectoryStepTokens,
 )
 
-ROUTED_EXPERTS_DATA_PREFIX = b'"routed_experts":{"data":"'
+ROUTED_EXPERTS_OBJ_PREFIX = b'"routed_experts":{'
+ROUTED_EXPERTS_DATA_KEY = b'"data":"'
 
 
 def strip_routed_experts_data(raw: bytes) -> tuple[bytes, memoryview | None]:
-    data_start = raw.find(ROUTED_EXPERTS_DATA_PREFIX)
-    if data_start < 0:
+    # Zero-copy fast path for the large base64 routed_experts blob: find the
+    # "data" value inside the routed_experts object regardless of key order
+    # (shape/start/dtype may precede it), slice it out before JSON parsing.
+    # No-op fallback (consumer b64-decodes the string) if that layout isn't found.
+    obj_start = raw.find(ROUTED_EXPERTS_OBJ_PREFIX)
+    if obj_start < 0:
         return raw, None
 
-    data_start += len(ROUTED_EXPERTS_DATA_PREFIX)
+    data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start)
+    if data_key < 0:
+        return raw, None
+
+    data_start = data_key + len(ROUTED_EXPERTS_DATA_KEY)
     data_end = raw.index(b'"', data_start)
     routed_data = memoryview(raw)[data_start:data_end]
     stripped = raw[:data_start] + raw[data_end:]

From ea53210208163cc115615d5468040fce26fc6701 Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 10 Jun 2026 11:18:37 -0700
Subject: [PATCH 15/16] fix(routed_experts): bound sidecar stripper to the
 routed_experts object; document dtype field

---
 docs/reference.md                   |  1 +
 tests/test_trajectory_processing.py |  6 ++++++
 verifiers/utils/response_utils.py   | 10 +++++++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/docs/reference.md b/docs/reference.md
index a50811f4aa..26b02f258d 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -215,6 +215,7 @@ class RoutedExpertsPayload(TypedDict):
     data: Any  # actually memoryview; kept opaque so Pydantic skips schema validation
     shape: list[int]
     start: int
+    dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]]  # optional; absent → uint8
 ```
 
 ### TrajectoryStepTokens
diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py
index 4d4ee30111..3ebe7cdbb2 100644
--- a/tests/test_trajectory_processing.py
+++ b/tests/test_trajectory_processing.py
@@ -502,6 +502,12 @@ def test_strip_routed_experts_data_key_order_robust():
     assert blob2 is not None and blob2.tobytes() == b"WFla"
     assert b'"data":""' in stripped2
 
+    # routed_experts object lacks data; an unrelated sibling has data — must
+    # NOT be mistaken for routed experts (search bounded to the object).
+    raw4 = b'{"routed_experts":{"shape":[3],"start":0},"other":{"data":"UNRELATED"}}'
+    stripped4, blob4 = strip_routed_experts_data(raw4)
+    assert blob4 is None and stripped4 == raw4
+
     # absent — no-op passthrough
     raw3 = b'{"choices":[{"token_ids":[1,2]}]}'
     stripped3, blob3 = strip_routed_experts_data(raw3)
diff --git a/verifiers/utils/response_utils.py b/verifiers/utils/response_utils.py
index 336f59f17b..64539bda2a 100644
--- a/verifiers/utils/response_utils.py
+++ b/verifiers/utils/response_utils.py
@@ -22,7 +22,15 @@ def strip_routed_experts_data(raw: bytes) -> tuple[bytes, memoryview | None]:
     if obj_start < 0:
         return raw, None
 
-    data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start)
+    # Bound the search to the routed_experts object so a missing `data` here
+    # can't match an unrelated sibling's `data` later in the response. The
+    # object's values (base64 string, int-list shape, int start, dtype name)
+    # contain no `}`, so the first `}` after the prefix closes it.
+    obj_end = raw.find(b"}", obj_start)
+    if obj_end < 0:
+        return raw, None
+
+    data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start, obj_end)
     if data_key < 0:
         return raw, None
 

From b31ff2d767f482178ecb68ba73ff44a67ec1a7eb Mon Sep 17 00:00:00 2001
From: Biswa Panda <biswa.panda@gmail.com>
Date: Wed, 10 Jun 2026 17:33:46 -0700
Subject: [PATCH 16/16] docs(clients): drop PR-number and branch/plan
 references from dynamo_chat comments

---
 verifiers/clients/openai_chat_completions_client.py       | 4 ++--
 verifiers/clients/openai_chat_completions_token_client.py | 3 +--
 verifiers/types.py                                        | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index 2084ea6f33..0246b9f669 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -475,10 +475,10 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None:
             Three coexisting wire shapes from dynamo's vLLM/SGLang backends:
 
               1. ``response.nvext.engine_data.{completion_token_ids,
-                 completion_logprobs, prompt_token_ids}`` — PR #8119 channel
+                 completion_logprobs, prompt_token_ids}``
                  (opt-in: ``nvext.extra_fields=["engine_data"]``).
               2. ``response.nvext.completion_token_ids`` — top-level shape
-                 from rl-sdk-2 plan A4 (opt-in:
+                 (opt-in:
                  ``nvext.extra_fields=["completion_token_ids"]``). No
                  logprobs in this shape; logprobs ride the standard
                  ``choices[0].logprobs.content[*].logprob`` channel.
diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index 427ed4ee1e..36fd9f08cb 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -155,8 +155,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             #   * dynamo_chat: `nvext.extra_fields=["engine_data"]`
             #     tells Dynamo's response builder to emit `response.nvext`
             #     `engine_data.{completion_token_ids, completion_logprobs,
-            #     prompt_token_ids}` (PR #8119 channel mirrored to vLLM in
-            #     ai-dynamo/dynamo `rl-sdk-2`). `from_native_response` grafts
+            #     prompt_token_ids}`. `from_native_response` grafts
             #     this onto the OpenAI-shaped response so `parse_tokens`
             #     works unmodified. `return_token_ids` is dropped because
             #     Dynamo's strict validator rejects it.
diff --git a/verifiers/types.py b/verifiers/types.py
index 62226d1f47..fd511603c3 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -90,8 +90,8 @@
 #   tokenization via the server's /tokenize route.
 # - "dynamo_chat": Dynamo's standard chat-completions route with
 #   pre-tokenized prompt carried in `nvext.token_data`. Server-side token
-#   IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119
-#   canonical channel). Bridge tokenization runs locally via the
+#   IDs come back via `nvext.engine_data.completion_token_ids` (the
+#   canonical Dynamo channel). Bridge tokenization runs locally via the
 #   transformers fast tokenizer; no /tokenize HTTP round-trip.
 RendererTransport = Literal["vllm_generate", "dynamo_chat"]