PrimeIntellect-ai · biswapanda · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 14, 2026
diff --git a/docs/reference.md b/docs/reference.md
@@ -215,6 +215,7 @@ class RoutedExpertsPayload(TypedDict):
     data: Any  # actually memoryview; kept opaque so Pydantic skips schema validation
     shape: list[int]
     start: int
+    dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]]  # optional; absent → uint8
 ```
 
 ### TrajectoryStepTokens

diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py
@@ -293,3 +293,79 @@ async def fake_get_prompt_ids(  # noqa: ANN001
     assert len(recording_client.calls) == 1
     assert recording_client.calls[0]["path"] == "/chat/completions/tokens"
     assert recording_client.calls[0]["body"]["tokens"] == [10, 20]
+
+
+@pytest.mark.asyncio
+async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
+    """Inspect the dynamo_chat wire body: vLLM-only keys are scrubbed, other
+    sampling args are forwarded, nvext token_data + passthrough survive."""
+    transport = _RecordingClient()
+    token_client = OpenAIChatCompletionsTokenClient(transport)
+    sampling_args = {
+        "temperature": 0.5,
+        "presence_penalty": 0.2,
+        "reasoning_effort": "high",  # arbitrary key: full parity, not an allowlist
+        "spaces_between_special_tokens": False,  # vLLM-only — must be scrubbed
+        "extra_body": {
+            "return_token_ids": True,  # vLLM-only — must be scrubbed
+            "nvext": {"extra_fields": ["engine_data"]},
+            "cache_salt": "ckpt-1",
+        },
+    }
+
+    await token_client._post_dynamo_chat(
+        prompt=cast(Any, [{"role": "user", "content": ""}]),
+        prompt_ids=[1, 2, 3],
+        model="test-model",
+        tools=None,
+        sampling_args=sampling_args,
+        extra_headers=None,
+    )
+
+    sent = transport.calls[0]["body"]
+    # vLLM-only keys must not appear in the recorded request body.
+    scrubbed = ("return_token_ids", "spaces_between_special_tokens")
+    assert all(key not in sent for key in scrubbed)
+    assert (sent["temperature"], sent["presence_penalty"]) == (0.5, 0.2)
+    assert (sent["reasoning_effort"], sent["cache_salt"]) == ("high", "ckpt-1")
+    assert sent["nvext"]["token_data"] == [1, 2, 3]
+    assert sent["nvext"]["extra_fields"] == ["engine_data"]
+
+
+@pytest.mark.asyncio
+async def test_graft_engine_data_synthesizes_logprobs_when_content_less():
+    """engine_data.completion_logprobs must reach the parsed tokens when the
+    choice carries a logprobs object without content, not only when logprobs
+    is missing altogether."""
+    from openai.types.chat import ChatCompletion
+
+    client = OpenAIChatCompletionsClient(_NoopClient())
+
+    # Token IDs and logprobs are supplied only under nvext.engine_data.
+    engine_data = {
+        "completion_token_ids": [10, 11],
+        "prompt_token_ids": [1, 2, 3],
+        "completion_logprobs": [-0.1, -0.2],
+    }
+    only_choice = {
+        "index": 0,
+        "message": {"role": "assistant", "content": "ok"},
+        "finish_reason": "stop",
+        "logprobs": {"content": None},  # present but content-less
+    }
+    payload = {
+        "id": "x",
+        "object": "chat.completion",
+        "created": 1,
+        "model": "test-model",
+        "choices": [only_choice],
+        "nvext": {"engine_data": engine_data},
+    }
+    native = ChatCompletion.model_validate(payload)
+
+    vf_response = await client.from_native_response(native)
+    parsed = vf_response.message.tokens
+    assert parsed is not None  # would be None before the fix (TITO lost)
+    assert parsed.completion_ids == [10, 11]
+    assert parsed.prompt_ids == [1, 2, 3]
+    assert parsed.completion_logprobs == [-0.1, -0.2]
diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py
@@ -483,3 +483,32 @@ def test_trajectory_step_mask_combining():
     assert token_ids == [1, 2, 3, 4, 5]
     assert mask == [0, 0, 0, 1, 1]
     assert logprobs == [0.0, 0.0, 0.0, -0.1, -0.2]
+
+
+def test_strip_routed_experts_data_key_order_robust():
+    """The zero-copy stripper must locate ``data`` wherever it sits among the
+    ``routed_experts`` keys (``dtype``/``shape``/``start`` may come first) and
+    hand the payload back unchanged when there is nothing to strip."""
+    from verifiers.utils.response_utils import strip_routed_experts_data
+
+    # Blob present: it must be stripped whether ``data`` comes first (fast
+    # path) or after ``dtype``/``shape``/``start``.
+    head = b'{"routed_experts":{"data":"QUJD","shape":[3],"start":0,"dtype":"uint8"}}'
+    tail = b'{"routed_experts":{"dtype":"uint16","shape":[3],"start":0,"data":"WFla"}}'
+    for payload, expected in ((head, b"QUJD"), (tail, b"WFla")):
+        cleaned, blob = strip_routed_experts_data(payload)
+        assert blob is not None and blob.tobytes() == expected
+        assert b'"data":""' in cleaned
+
+    # routed_experts object lacks data; an unrelated sibling has data — must
+    # NOT be mistaken for routed experts (search bounded to the object).
+    sibling = b'{"routed_experts":{"shape":[3],"start":0},"other":{"data":"UNRELATED"}}'
+    cleaned, blob = strip_routed_experts_data(sibling)
+    assert blob is None
+    assert cleaned == sibling
+
+    # absent — no-op passthrough
+    absent = b'{"choices":[{"token_ids":[1,2]}]}'
+    cleaned, blob = strip_routed_experts_data(absent)
+    assert blob is None
+    assert cleaned == absent
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
@@ -469,8 +469,100 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason:
                 case _:
                     return None
 
+        def _graft_engine_data(response: OpenAIChatResponse) -> None:
+            """Graft engine-side token IDs onto top-level response fields.
+
+            Three coexisting wire shapes from dynamo's vLLM/SGLang backends:
+
+              1. ``response.nvext.engine_data.{completion_token_ids,
+                 completion_logprobs, prompt_token_ids}``
+                 (opt-in: ``nvext.extra_fields=["engine_data"]``).
+              2. ``response.nvext.completion_token_ids`` — top-level shape
+                 (opt-in: ``nvext.extra_fields=["completion_token_ids"]``).
+                 No logprobs in this shape; logprobs ride the standard
+                 ``choices[0].logprobs.content[*].logprob`` channel.
+              3. Older vLLM-native paths set ``response.choices[0].token_ids``
+                 / ``response.prompt_token_ids`` directly (no grafting needed).
+
+            This helper bridges (1) and (2) onto the top-level fields the
+            rest of ``parse_tokens`` reads via the standard openai SDK
+            attribute path. ``engine_data`` wins when both are present (it
+            carries more — including logprobs + prompt_token_ids).
+
+            Mutates ``response`` in place; token IDs that are already set and
+            logprobs that already carry content are left as they are.
+            """
+            def _assign(target: object, name: str, value: object) -> None:
+                # Try a normal assignment first; if the model rejects it, write
+                # past its ``__setattr__`` so the graft stays best-effort.
+                try:
+                    setattr(target, name, value)
+                except Exception:
+                    object.__setattr__(target, name, value)
+
+            nvext = getattr(response, "nvext", None)
+            if nvext is None:
+                # Check the extras mapping before serialising the whole response;
+                # ``model_dump()`` remains only for objects without ``model_extra``.
+                extras = getattr(response, "model_extra", None)
+                if isinstance(extras, dict):
+                    nvext = extras.get("nvext")
+                elif hasattr(response, "model_dump"):
+                    nvext = response.model_dump().get("nvext")
+            if not isinstance(nvext, dict):
+                return
+            choice = response.choices[0]
+            engine_data = nvext.get("engine_data")
+            if not isinstance(engine_data, dict):
+                engine_data = {}
+
+            def _pick_ids(key: str) -> list[int] | None:
+                # engine_data wins; the top-level nvext key is the fallback.
+                ids = engine_data.get(key)
+                if ids is None:
+                    ids = nvext.get(key)
+                return None if ids is None else list(ids)
+
+            completion_token_ids = _pick_ids("completion_token_ids")
+            prompt_token_ids = _pick_ids("prompt_token_ids")
+            raw_logprobs = engine_data.get("completion_logprobs")
+            completion_logprobs = (
+                None if raw_logprobs is None else [float(lp) for lp in raw_logprobs]
+            )
+
+            if (
+                getattr(choice, "token_ids", None) is None
+                and completion_token_ids is not None
+            ):
+                _assign(choice, "token_ids", completion_token_ids)
+            if (
+                getattr(response, "prompt_token_ids", None) is None
+                and prompt_token_ids is not None
+            ):
+                _assign(response, "prompt_token_ids", prompt_token_ids)
+
+            # Dynamo returns logprobs only under engine_data, not under
+            # choices[0].logprobs. Synthesize the standard shape that
+            # parse_tokens reads whenever the choice has no usable logprobs
+            # content — logprobs missing OR present with empty/None content.
+            existing_lp = getattr(choice, "logprobs", None)
+            existing_content = (
+                existing_lp.get("content")
+                if isinstance(existing_lp, dict)
+                else getattr(existing_lp, "content", None)
+            )
+            if (
+                completion_logprobs is not None
+                and completion_token_ids is not None
+                and len(completion_logprobs) == len(completion_token_ids)
+                and not existing_content
+            ):
+                synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]}
+                _assign(choice, "logprobs", synthesized)
+
         def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
             assert len(response.choices) == 1, "Response should always have one choice"
+            _graft_engine_data(response)
             choice = response.choices[0]
             if not hasattr(choice, "token_ids"):
                 return None
@@ -508,6 +600,11 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
                 logprobs_content = response.choices[0].logprobs["content"]
                 completion_logprobs = [token["logprob"] for token in logprobs_content]
 
+            if len(completion_logprobs) != len(completion_ids):
+                # Engine returned mismatched logprobs/ids — drop rather than emit
+                # out-of-sync ResponseTokens.
+                return None
+
             choice_extra = choice.model_extra or {}
             return ResponseTokens(
                 prompt_ids=prompt_ids,