diff --git a/docs/reference.md b/docs/reference.md
index a50811f4aa..26b02f258d 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -215,6 +215,7 @@ class RoutedExpertsPayload(TypedDict):
     data: Any  # actually memoryview; kept opaque so Pydantic skips schema validation
     shape: list[int]
     start: int
+    dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]]  # optional; absent → uint8
 ```
 
 ### TrajectoryStepTokens
diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py
index 923ff118e0..ad962c5ba2 100644
--- a/tests/test_openai_chat_completions_token_client.py
+++ b/tests/test_openai_chat_completions_token_client.py
@@ -293,3 +293,79 @@ async def fake_get_prompt_ids(  # noqa: ANN001
     assert len(recording_client.calls) == 1
     assert recording_client.calls[0]["path"] == "/chat/completions/tokens"
     assert recording_client.calls[0]["body"]["tokens"] == [10, 20]
+
+
+@pytest.mark.asyncio
+async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
+    """Wire body built by ``_post_dynamo_chat``: vLLM-only keys are removed,
+    other sampling args are forwarded, nvext token_data + passthrough survive."""
+    http = _RecordingClient()
+    token_client = OpenAIChatCompletionsTokenClient(http)
+    sampling_args = {
+        "temperature": 0.5,
+        "presence_penalty": 0.2,
+        "reasoning_effort": "high",  # arbitrary key: full parity, not an allowlist
+        "spaces_between_special_tokens": False,  # vLLM-only — must be scrubbed
+        "extra_body": {
+            "return_token_ids": True,  # vLLM-only — must be scrubbed
+            "nvext": {"extra_fields": ["engine_data"]},
+            "cache_salt": "ckpt-1",
+        },
+    }
+
+    await token_client._post_dynamo_chat(
+        prompt=cast(Any, [{"role": "user", "content": ""}]),
+        prompt_ids=[1, 2, 3],
+        model="test-model",
+        tools=None,
+        sampling_args=sampling_args,
+        extra_headers=None,
+    )
+
+    sent = http.calls[0]["body"]
+    for scrubbed in ("return_token_ids", "spaces_between_special_tokens"):
+        assert scrubbed not in sent
+    forwarded = {"presence_penalty": 0.2, "temperature": 0.5, "reasoning_effort": "high"}
+    assert {key: sent[key] for key in forwarded} == forwarded
+    assert sent["cache_salt"] == "ckpt-1"
+    assert sent["nvext"]["token_data"] == [1, 2, 3]
+    assert sent["nvext"]["extra_fields"] == ["engine_data"]
+
+
+@pytest.mark.asyncio
+async def test_graft_engine_data_synthesizes_logprobs_when_content_less():
+    """A choice whose ``logprobs`` object is present but content-less must
+    still get ``engine_data.completion_logprobs`` grafted (as when absent)."""
+    from openai.types.chat import ChatCompletion
+
+    content_less_choice = {
+        "index": 0,
+        "message": {"role": "assistant", "content": "ok"},
+        "finish_reason": "stop",
+        "logprobs": {"content": None},  # present but content-less
+    }
+    engine_data = {
+        "completion_token_ids": [10, 11],
+        "prompt_token_ids": [1, 2, 3],
+        "completion_logprobs": [-0.1, -0.2],
+    }
+    native = ChatCompletion.model_validate(
+        {
+            "id": "x",
+            "object": "chat.completion",
+            "created": 1,
+            "model": "test-model",
+            "choices": [content_less_choice],
+            "nvext": {"engine_data": engine_data},
+        }
+    )
+    client = OpenAIChatCompletionsClient(_NoopClient())
+
+    vf_response = await client.from_native_response(native)
+
+    grafted = vf_response.message.tokens
+    # Without the graft the tokens are None and TITO is lost for this turn.
+    assert grafted is not None
+    assert grafted.completion_ids == [10, 11]
+    assert grafted.prompt_ids == [1, 2, 3]
+    assert grafted.completion_logprobs == [-0.1, -0.2]
diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py
index 386e4fd947..3ebe7cdbb2 100644
--- a/tests/test_trajectory_processing.py
+++ b/tests/test_trajectory_processing.py
@@ -483,3 +483,32 @@ def test_trajectory_step_mask_combining():
     assert token_ids == [1, 2, 3, 4, 5]
     assert mask == [0, 0, 0, 1, 1]
     assert logprobs == [0.0, 0.0, 0.0, -0.1, -0.2]
+
+
+def test_strip_routed_experts_data_key_order_robust():
+    """``strip_routed_experts_data`` must find ``data`` wherever it sits in
+    the ``routed_experts`` object, and must return its input untouched when
+    there is nothing to strip."""
+    from verifiers.utils.response_utils import strip_routed_experts_data
+
+    strippable = {
+        # data first (fast path)
+        b'{"routed_experts":{"data":"QUJD","shape":[3],"start":0,"dtype":"uint8"}}': b"QUJD",
+        # dtype/shape/start before data — the blob must still be stripped
+        b'{"routed_experts":{"dtype":"uint16","shape":[3],"start":0,"data":"WFla"}}': b"WFla",
+    }
+    for payload, expected_blob in strippable.items():
+        remainder, blob = strip_routed_experts_data(payload)
+        assert blob is not None and blob.tobytes() == expected_blob
+        assert b'"data":""' in remainder
+
+    untouched = (
+        # routed_experts lacks data while an unrelated sibling has one — the
+        # sibling must NOT be mistaken for it (search bounded to the object)
+        b'{"routed_experts":{"shape":[3],"start":0},"other":{"data":"UNRELATED"}}',
+        # routed_experts absent — no-op passthrough
+        b'{"choices":[{"token_ids":[1,2]}]}',
+    )
+    for payload in untouched:
+        remainder, blob = strip_routed_experts_data(payload)
+        assert blob is None and remainder == payload
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
index d7d262f4be..0246b9f669 100644
--- a/verifiers/clients/openai_chat_completions_client.py
+++ b/verifiers/clients/openai_chat_completions_client.py
@@ -469,8 +469,100 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason:
                 case _:
                     return None
 
+        def _graft_engine_data(response: OpenAIChatResponse) -> None:
+            """Copy Dynamo's ``nvext`` token data onto the standard fields.
+
+            Dynamo's vLLM/SGLang backends report token IDs in three shapes:
+
+              1. ``response.nvext.engine_data.{completion_token_ids,
+                 completion_logprobs, prompt_token_ids}``
+                 (opt-in: ``nvext.extra_fields=["engine_data"]``).
+              2. ``response.nvext.completion_token_ids`` directly under
+                 ``nvext`` (opt-in:
+                 ``nvext.extra_fields=["completion_token_ids"]``). This shape
+                 carries no logprobs; those use the standard
+                 ``choices[0].logprobs.content[*].logprob`` channel.
+              3. Older vLLM-native paths already set
+                 ``response.choices[0].token_ids`` /
+                 ``response.prompt_token_ids``, so nothing is grafted.
+
+            Shapes (1) and (2) are mapped onto the attributes ``parse_tokens``
+            reads. ``engine_data`` takes precedence when both arrive because
+            it bundles logprobs and prompt token IDs as well. Values already
+            present on the response are left untouched.
+            """
+
+            def _as_list(value):
+                # Copy into a fresh list; ``None`` stays ``None``.
+                return None if value is None else list(value)
+
+            def _force_set(target, attr, value):
+                # Regular assignment first; fall back to bypassing the
+                # object's own __setattr__ if that raises.
+                try:
+                    setattr(target, attr, value)
+                except Exception:
+                    object.__setattr__(target, attr, value)
+
+            # Prefer the attribute; fall back to the dumped dict when the
+            # attribute is missing or None.
+            nvext = getattr(response, "nvext", None)
+            if nvext is None and hasattr(response, "model_dump"):
+                nvext = response.model_dump().get("nvext")
+            if not isinstance(nvext, dict):
+                return
+            choice = response.choices[0]
+
+            # A missing or non-dict engine_data behaves like an empty one.
+            engine_data = nvext.get("engine_data")
+            if not isinstance(engine_data, dict):
+                engine_data = {}
+
+            completion_token_ids = _as_list(engine_data.get("completion_token_ids"))
+            prompt_token_ids = _as_list(engine_data.get("prompt_token_ids"))
+            raw_logprobs = engine_data.get("completion_logprobs")
+            completion_logprobs = (
+                None if raw_logprobs is None else [float(lp) for lp in raw_logprobs]
+            )
+            # The nvext top level only fills what engine_data did not supply.
+            if completion_token_ids is None:
+                completion_token_ids = _as_list(nvext.get("completion_token_ids"))
+            if prompt_token_ids is None:
+                prompt_token_ids = _as_list(nvext.get("prompt_token_ids"))
+
+            if (
+                getattr(choice, "token_ids", None) is None
+                and completion_token_ids is not None
+            ):
+                _force_set(choice, "token_ids", completion_token_ids)
+            if (
+                getattr(response, "prompt_token_ids", None) is None
+                and prompt_token_ids is not None
+            ):
+                _force_set(response, "prompt_token_ids", prompt_token_ids)
+
+            # Dynamo reports logprobs under engine_data rather than on
+            # choices[0].logprobs, whose ``content`` parse_tokens requires.
+            # Synthesize that shape whenever the choice has no usable
+            # content: logprobs missing, or present with empty/None content.
+            existing_lp = getattr(choice, "logprobs", None)
+            if isinstance(existing_lp, dict):
+                existing_content = existing_lp.get("content")
+            else:
+                existing_content = getattr(existing_lp, "content", None)
+
+            if completion_logprobs is None or completion_token_ids is None:
+                return
+            if len(completion_logprobs) != len(completion_token_ids):
+                return  # lengths disagree — nothing trustworthy to graft
+            if existing_content:
+                return
+            synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]}
+            _force_set(choice, "logprobs", synthesized)
+
         def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
             assert len(response.choices) == 1, "Response should always have one choice"
+            _graft_engine_data(response)
             choice = response.choices[0]
             if not hasattr(choice, "token_ids"):
                 return None
@@ -508,6 +600,11 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
                 logprobs_content = response.choices[0].logprobs["content"]
                 completion_logprobs = [token["logprob"] for token in logprobs_content]
 
+            if len(completion_logprobs) != len(completion_ids):
+                # Engine returned mismatched logprobs/ids — drop rather than emit
+                # out-of-sync ResponseTokens.
+                return None
+
             choice_extra = choice.model_extra or {}
             return ResponseTokens(
                 prompt_ids=prompt_ids,
diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py
index 2d8cd701cc..36fd9f08cb 100644
--- a/verifiers/clients/openai_chat_completions_token_client.py
+++ b/verifiers/clients/openai_chat_completions_token_client.py
@@ -18,11 +18,21 @@
     OpenAITool,
     handle_openai_overlong_prompt,
 )
-from verifiers.types import SamplingArgs, State
+from verifiers.types import RendererTransport, SamplingArgs, State
 from verifiers.utils.client_utils import (
     post_chat_completion_with_routed_experts_sidecar,
 )
 
+# Transport assumed when no ClientConfig is attached or the config lacks a
+# ``renderer_transport`` field: the legacy vLLM TITO surface.
+_DEFAULT_TRANSPORT: RendererTransport = "vllm_generate"
+
+# vLLM/prime-only sampling keys Dynamo's strict validator rejects — scrubbed
+# from every dynamo_chat request body (both MITO and TITO paths).
+_DYNAMO_DROP_KEYS = frozenset(
+    {"return_token_ids", "spaces_between_special_tokens", "priority"}
+)
+
 
 def _has_multimodal_content(messages) -> bool:
     """Check if any message contains multimodal content (images, audio).
@@ -51,7 +61,25 @@ class TokenizeResponse(BaseModel):
 
 
 class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient):
-    """Wrapper for custom vLLM route /v1/chat/completions/tokens via AsyncOpenAI client."""
+    """Token-in / token-out chat client.
+
+    Two transports share this class, selected via
+    ``ClientConfig.renderer_transport``:
+
+    * ``vllm_generate`` (default): vLLM's TITO surface.
+      Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids``
+      and uses the server's ``/tokenize`` endpoint for bridge tokens.
+      Requires vLLM ``>=0.20``.
+
+    * ``dynamo_chat``: Dynamo's standard ``/v1/chat/completions``
+      route with ``nvext.token_data=prompt_ids``. Server-side response
+      token IDs come back via ``response.nvext.engine_data.*``
+      (`OpenAIChatCompletionsClient.from_native_response` grafts them
+      onto the OpenAI-shaped response). Bridge tokens are computed
+      locally via the model's HuggingFace fast tokenizer — no
+      ``/tokenize`` HTTP round-trip — since Dynamo doesn't expose vLLM's
+      token routes.
+    """
 
     @property
     def token_client(self) -> AsyncOpenAI:
@@ -61,6 +89,46 @@ def token_client(self) -> AsyncOpenAI:
             base_url = base_url[:-3]
         return self.client.with_options(base_url=base_url)
 
+    @property
+    def renderer_transport(self) -> RendererTransport:
+        """Transport that decides the wire shape of requests.
+
+        Reads ``renderer_transport`` from the attached config — the same field
+        ``RendererClient`` consults — and falls back to the default vLLM TITO
+        transport when the config is ``None`` or lacks the attribute.
+        """
+        # getattr on a ``None`` config simply yields the fallback.
+        configured = getattr(self._config, "renderer_transport", _DEFAULT_TRANSPORT)
+        return cast(RendererTransport, configured)
+
+    def _get_local_tokenizer(self, model: str):
+        """Return the HF tokenizer for the ``dynamo_chat`` transport, loaded at
+        most once per name. ``renderer_model_name`` on the config, when set,
+        replaces ``model`` so served-model aliases still resolve. Loading holds
+        a lock: callers run this in worker threads, and two concurrent cache
+        misses would otherwise both call ``from_pretrained``. Raises
+        ``ImportError`` when ``transformers`` is not installed.
+        """
+        import threading
+
+        # getattr on a ``None`` config yields the default — no None guard needed.
+        model = getattr(self._config, "renderer_model_name", None) or model
+        cache: dict[str, Any] = self.__dict__.setdefault("_tokenizer_cache", {})
+        if model in cache:
+            return cache[model]
+        try:
+            from transformers import AutoTokenizer  # type: ignore[import-not-found]
+        except ImportError as exc:  # pragma: no cover - dependency surface
+            raise ImportError(
+                "OpenAIChatCompletionsTokenClient with "
+                "renderer_transport='dynamo_chat' requires "
+                "`transformers`. Install with `pip install transformers`."
+            ) from exc
+        with self.__dict__.setdefault("_tokenizer_lock", threading.Lock()):
+            if model not in cache:  # re-check: another thread may have loaded it
+                cache[model] = AutoTokenizer.from_pretrained(model)
+        return cache[model]
+
     @handle_openai_overlong_prompt
     async def get_native_response(
         self,
@@ -75,14 +143,59 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             if "max_tokens" in sampling_args:
                 sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens")
             sampling_args["logprobs"] = True
-            extra_body = dict(return_token_ids=True)
-            if "extra_body" in sampling_args:
-                sampling_args["extra_body"] = {
-                    **sampling_args["extra_body"],
-                    **extra_body,
+
+            # Transport-specific opt-ins. Both transports get response-side
+            # token IDs, just via different fields:
+            #
+            #   * vllm_generate (vLLM): `extra_body.return_token_ids=True`
+            #     tells vLLM to set the non-standard `choices[0].token_ids` and
+            #     `response.prompt_token_ids` fields. `parse_tokens` reads them
+            #     directly.
+            #
+            #   * dynamo_chat: `nvext.extra_fields=["engine_data"]`
+            #     tells Dynamo's response builder to emit `response.nvext`
+            #     `engine_data.{completion_token_ids, completion_logprobs,
+            #     prompt_token_ids}`. `from_native_response` grafts
+            #     this onto the OpenAI-shaped response so `parse_tokens`
+            #     works unmodified. `return_token_ids` is dropped because
+            #     Dynamo's strict validator rejects it.
+            if self.renderer_transport == "dynamo_chat":
+                extra_body: dict[str, Any] = {
+                    "nvext": {"extra_fields": ["engine_data"]}
                 }
+            else:
+                extra_body = {"return_token_ids": True}
+
+            if "extra_body" in sampling_args:
+                merged = {**sampling_args["extra_body"]}
+                # Merge nvext.extra_fields cumulatively rather than overwriting,
+                # so caller-provided extra_fields (e.g. "timing", "worker_id")
+                # coexist with our "engine_data" opt-in.
+                if "nvext" in merged and "nvext" in extra_body:
+                    base = dict(merged.get("nvext") or {})
+                    inc = dict(extra_body.get("nvext") or {})
+                    base_ef = list(base.get("extra_fields") or [])
+                    inc_ef = list(inc.get("extra_fields") or [])
+                    merged_ef = list(dict.fromkeys(base_ef + inc_ef))
+                    merged_nvext = {**base, **inc, "extra_fields": merged_ef}
+                    merged["nvext"] = merged_nvext
+                    sampling_args["extra_body"] = {
+                        **{k: v for k, v in extra_body.items() if k != "nvext"},
+                        **merged,
+                    }
+                else:
+                    sampling_args["extra_body"] = {**merged, **extra_body}
             else:
                 sampling_args["extra_body"] = extra_body
+            if self.renderer_transport == "dynamo_chat":
+                # Drop vLLM/prime-only keys Dynamo rejects from both top-level
+                # args and extra_body, so MITO + TITO paths send a clean body.
+                eb = sampling_args.get("extra_body")
+                if isinstance(eb, dict):
+                    for k in _DYNAMO_DROP_KEYS:
+                        eb.pop(k, None)
+                for k in _DYNAMO_DROP_KEYS:
+                    sampling_args.pop(k, None)
             return {k: v for k, v in sampling_args.items() if v is not None}
 
         sampling_args = normalize_sampling_args(sampling_args)
@@ -126,6 +239,16 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
                 prompt, model, sampling_args, tools, extra_headers=extra_headers
             )
 
+        if self.renderer_transport == "dynamo_chat":
+            return await self._post_dynamo_chat(
+                prompt=prompt,
+                prompt_ids=prompt_ids,
+                model=model,
+                tools=tools,
+                sampling_args=sampling_args,
+                extra_headers=extra_headers,
+            )
+
         extra_body = sampling_args.pop("extra_body", {})
         body = {
             "model": model,
@@ -143,6 +266,66 @@ def normalize_sampling_args(sampling_args: SamplingArgs):
             extra_headers=extra_headers,
         )
 
+    async def _post_dynamo_chat(
+        self,
+        prompt: OpenAIChatMessages,
+        prompt_ids: list[int],
+        model: str,
+        tools: list[OpenAITool] | None,
+        sampling_args: dict,
+        extra_headers: Mapping[str, str] | None,
+    ) -> OpenAIChatResponse:
+        """Post stitched ``prompt_ids`` to Dynamo's chat-completions route.
+
+        ``prompt_ids`` travel in ``nvext.token_data``. ``prompt`` is forwarded
+        unchanged as ``messages`` (the engine is expected to ignore it when
+        ``token_data`` is present), so the request is only as small as the
+        ``prompt`` the caller passes. Response token IDs come back under
+        ``response.nvext.engine_data`` and are grafted onto the response by
+        ``OpenAIChatCompletionsClient.from_native_response``.
+        ``sampling_args`` is read, never mutated.
+        """
+        extra_body = dict(sampling_args.get("extra_body") or {})
+
+        # nvext.token_data is the canonical pre-tokenized-prompt channel.
+        # Merge with caller-provided nvext (extra_fields etc.) rather than
+        # overwriting it. On the get_native_response path the normalized
+        # sampling args already carry extra_fields=["engine_data"], so this
+        # just adds token_data to that same mapping.
+        nvext = dict(extra_body.pop("nvext", None) or {})
+        nvext["token_data"] = prompt_ids
+
+        body: dict[str, Any] = {
+            "model": model,
+            "messages": prompt,  # placeholder; engine ignores when token_data present
+            "stream": False,
+            "nvext": nvext,
+        }
+        if tools:
+            body["tools"] = tools
+
+        # Forward the full normalized sampling_args (parity with the vLLM path,
+        # which spreads all of sampling_args), then remaining extra_body keys —
+        # minus vLLM-only keys Dynamo's strict validator rejects (return_token_ids).
+        # Unknown keys ride through the dynamo frontend's PASSTHROUGH_EXTRA_FIELDS.
+        # extra_body itself is excluded here: its contents are flattened below.
+        top_level = {k: v for k, v in sampling_args.items() if k != "extra_body"}
+        for source in (top_level, extra_body):
+            for key, value in source.items():
+                if value is None or key in _DYNAMO_DROP_KEYS or key in body:
+                    continue
+                body[key] = value
+
+        # Use the sidecar-aware post (same as the vLLM TITO + MITO paths) so any
+        # routed_experts blob is streamed, not JSON-parsed. dynamo_chat opts into
+        # extra_fields=["engine_data"] only, so routed_experts is normally absent.
+        return await post_chat_completion_with_routed_experts_sidecar(
+            self.client,
+            "/chat/completions",
+            body=body,
+            extra_headers=extra_headers,
+        )
+
     async def get_prompt_ids(
         self,
         state: State,
@@ -176,6 +359,15 @@ def normalize_for_comparison(value: Any) -> Any:
                 # prefix-match equality is unaffected.
                 if normalized.get("content") == "":
                     normalized["content"] = None
+                # Drop None-valued keys so model_dump's exhaustive view (which
+                # carries e.g. thinking_blocks=None on AssistantMessage) is
+                # equivalent to to_native_prompt's slimmer view (which omits
+                # the field entirely). Without this, vf.Message-shaped input
+                # (what MultiTurnEnv produces after maybe_normalize_messages)
+                # never matches the to_native_prompt-normalized step messages,
+                # which breaks the prefix match and forces TITO to fall back
+                # to MITO every turn-2+.
+                normalized = {k: v for k, v in normalized.items() if v is not None}
                 return normalized
             if isinstance(value, list):
                 return [normalize_for_comparison(item) for item in value]
@@ -369,9 +561,28 @@ async def tokenize(
         extra_kwargs: dict | None = None,
         **kwargs,
     ) -> list[int]:
-        """Tokenize messages using the vLLM /tokenize API."""
+        """Tokenize messages for bridge-token computation.
+
+        Dispatched by ``renderer_transport``:
+
+        * ``vllm_generate`` (default): POST to vLLM's ``/tokenize`` route.
+        * ``dynamo_chat``: local HF fast-tokenizer call. Dynamo doesn't
+          expose ``/tokenize``; running locally also saves two HTTP RTTs per
+          turn (the bridge computes both ``add_generation_prompt=True`` and
+          ``False`` views). The HF Rust encode releases the GIL so the
+          ``asyncio.to_thread`` wrap gives the event loop real parallelism.
+        """
         if extra_kwargs is None:
             extra_kwargs = {}
+
+        if self.renderer_transport == "dynamo_chat":
+            return await self._local_tokenize(
+                messages=messages,
+                tools=tools,
+                model=model,
+                extra_kwargs=extra_kwargs,
+            )
+
         if isinstance(messages, str):
             body = dict(
                 model=model,
@@ -392,3 +603,47 @@ async def tokenize(
                 "/tokenize", body=body, cast_to=TokenizeResponse
             )
         return tokenize_response.tokens
+
+    async def _local_tokenize(
+        self,
+        messages: str | OpenAIChatMessages,
+        tools: list[OpenAITool] | None,
+        model: str,
+        extra_kwargs: dict,
+    ) -> list[int]:
+        """Tokenize in-process for the ``dynamo_chat`` transport.
+
+        A ``str`` is encoded without special tokens; a message list is run
+        through the tokenizer's chat template with ``tools``, the
+        ``add_generation_prompt`` flag (default ``True``) and any
+        ``chat_template_kwargs`` taken from ``extra_kwargs``. All tokenizer
+        work happens in ``asyncio.to_thread`` so the event loop stays free.
+        """
+        import asyncio
+
+        with_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True))
+        template_options = dict(extra_kwargs.get("chat_template_kwargs") or {})
+
+        def _encode() -> list[int]:
+            # Fetch the tokenizer inside the worker thread: on a cache miss it
+            # runs the synchronous AutoTokenizer.from_pretrained, which must
+            # not block the loop.
+            tokenizer = self._get_local_tokenizer(model)
+            if isinstance(messages, str):
+                return list(tokenizer.encode(messages, add_special_tokens=False))
+
+            rendered = tokenizer.apply_chat_template(
+                messages,
+                tools=tools,
+                add_generation_prompt=with_generation_prompt,
+                tokenize=True,
+                **template_options,
+            )
+            # Unwrap an ``input_ids`` holder, then take the first row when
+            # the result is a list of lists.
+            rendered = getattr(rendered, "input_ids", rendered)
+            if rendered and isinstance(rendered[0], list):
+                rendered = rendered[0]
+            return [int(token) for token in rendered]
+
+        return await asyncio.to_thread(_encode)
diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py
index 64ca4ec89d..cc0acd3556 100644
--- a/verifiers/clients/renderer_client.py
+++ b/verifiers/clients/renderer_client.py
@@ -603,15 +603,22 @@ async def get_native_response(
             multi_modal_data = None
             prompt_attribution = None
 
-        # ``renderers.client.generate`` discovers the engine's context-length
-        # cap on its own (via ``GET /v1/models``, cached) and raises
-        # ``renderers.OverlongPromptError`` on pre-flight overflow. Rebadge
-        # that into the verifiers-native ``OverlongPromptError`` so the
-        # ``MultiTurnEnv.prompt_too_long`` stop condition picks it up via
-        # the ``vf.Error`` hierarchy. The ``@handle_openai_overlong_prompt``
-        # decorator still handles the fallback case (cap unknown → engine
-        # 4xx → vf.OverlongPromptError) for engines whose ``/v1/models``
-        # doesn't expose ``max_model_len``.
+        # Thread renderer_transport from ClientConfig into generate() so the
+        # renderer client works against Dynamo's /v1/chat/completions surface
+        # as well as vLLM's /inference/v1/generate. setup_clients auto-picks
+        # "dynamo_chat" when client_config.backend == "dynamo".
+        # ``renderers.client.generate`` raises ``renderers.OverlongPromptError``
+        # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops.
+        transport = (
+            self._config.renderer_transport
+            if self._config is not None
+            else "vllm_generate"
+        )
+        # Only pass transport= when non-default: a pinned ``renderers`` may
+        # predate the kwarg, so the default path must use the upstream signature.
+        generate_kwargs: dict[str, Any] = {}
+        if transport != "vllm_generate":
+            generate_kwargs["transport"] = transport
         try:
             return await generate(
                 client=self.client,
@@ -627,6 +634,7 @@ async def get_native_response(
                 or sampling_params.pop("cache_salt", None),
                 priority=args.get("priority") or sampling_params.pop("priority", None),
                 extra_headers=extra_headers or None,
+                **generate_kwargs,
             )
         except RendererOverlongPromptError as exc:
             raise OverlongPromptError(str(exc)) from exc
diff --git a/verifiers/types.py b/verifiers/types.py
index 4242f8a86f..fd511603c3 100644
--- a/verifiers/types.py
+++ b/verifiers/types.py
@@ -78,6 +78,23 @@
 EndpointClient: TypeAlias = AsyncOpenAI | OpenAI | AsyncAnthropic | Anthropic
 MessageType = Literal["chat", "completion"]  # deprecated
 
+# Wire-shape selector shared between RendererClient and
+# OpenAIChatCompletionsTokenClient. Picks which inference-server surface the
+# client targets at request-build time. Same flag drives both clients so a
+# single `ClientConfig.renderer_transport` setting routes consistently.
+#
+# - "vllm_generate" (default): vLLM's TITO surface. For RendererClient
+#   that's POST /inference/v1/generate with a renderer-flavored request body.
+#   For OpenAIChatCompletionsTokenClient that's POST
+#   /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge
+#   tokenization via the server's /tokenize route.
+# - "dynamo_chat": Dynamo's standard chat-completions route with
+#   pre-tokenized prompt carried in `nvext.token_data`. Server-side token
+#   IDs come back via `nvext.engine_data.completion_token_ids` (the
+#   canonical Dynamo channel). Bridge tokenization runs locally via the
+#   transformers fast tokenizer; no /tokenize HTTP round-trip.
+RendererTransport = Literal["vllm_generate", "dynamo_chat"]
+
 
 # Provider-agnostic message + response types
 class CustomBaseModel(BaseModel):
@@ -211,6 +228,10 @@ class RoutedExpertsPayload(TypedDict):
     data: Any
     shape: list[int]
     start: int
+    # Element dtype of the decoded expert-id buffer. NotRequired so payloads
+    # serialized before this field still validate; a decoder that doesn't see
+    # it falls back to "uint8" (the historical encoding).
+    dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]]
 
 
 class ResponseTokens(CustomBaseModel):
@@ -1269,6 +1290,7 @@ class ClientConfig(BaseModel):
     Drives the renderer pool when ``client_type == "renderer"``. Defaults
     to ``None`` so non-renderer clients aren't forced to declare it; the
     renderer client treats ``None`` as ``AutoRendererConfig()``."""
+    renderer_transport: RendererTransport = "vllm_generate"
     renderer_model_name: str | None = None
     """Override the tokenizer model name used to instantiate the renderer
     pool. Defaults to the model used in API requests."""
diff --git a/verifiers/utils/response_utils.py b/verifiers/utils/response_utils.py
index 7bc13bc22d..64539bda2a 100644
--- a/verifiers/utils/response_utils.py
+++ b/verifiers/utils/response_utils.py
@@ -9,15 +9,32 @@
     TrajectoryStepTokens,
 )
 
-ROUTED_EXPERTS_DATA_PREFIX = b'"routed_experts":{"data":"'
+ROUTED_EXPERTS_OBJ_PREFIX = b'"routed_experts":{'
+ROUTED_EXPERTS_DATA_KEY = b'"data":"'
 
 
 def strip_routed_experts_data(raw: bytes) -> tuple[bytes, memoryview | None]:
-    data_start = raw.find(ROUTED_EXPERTS_DATA_PREFIX)
-    if data_start < 0:
+    # Zero-copy fast path for the large base64 routed_experts blob: find the
+    # "data" value inside the routed_experts object regardless of key order
+    # (shape/start/dtype may precede it), slice it out before JSON parsing.
+    # No-op fallback (consumer b64-decodes the string) if the shape isn't found.
+    obj_start = raw.find(ROUTED_EXPERTS_OBJ_PREFIX)
+    if obj_start < 0:
         return raw, None
 
-    data_start += len(ROUTED_EXPERTS_DATA_PREFIX)
+    # Bound the search to the routed_experts object so a missing `data` here
+    # can't match an unrelated sibling's `data` later in the response. The
+    # object's values (base64 string, int shape/start, dtype) contain no `}`,
+    # so the first `}` after the prefix closes it.
+    obj_end = raw.find(b"}", obj_start)
+    if obj_end < 0:
+        return raw, None
+
+    data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start, obj_end)
+    if data_key < 0:
+        return raw, None
+
+    data_start = data_key + len(ROUTED_EXPERTS_DATA_KEY)
     data_end = raw.index(b'"', data_start)
     routed_data = memoryview(raw)[data_start:data_end]
     stripped = raw[:data_start] + raw[data_end:]