Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
230384a
feat(types): add RendererTransport literal + ClientConfig.renderer_tr…
biswapanda May 13, 2026
1311096
feat(clients): graft nvext.engine_data onto OpenAI response in parse_…
biswapanda May 13, 2026
c766529
feat(tito): add dynamo_chat_nvext transport + local bridge tokenize
biswapanda May 13, 2026
f12bf63
feat(clients): graft top-level nvext.completion_token_ids + prompt_to…
biswapanda May 14, 2026
ee3482a
feat(clients): thread renderer_transport from ClientConfig to rendere…
biswapanda May 14, 2026
3b58bf9
fix(clients): address PR review R1-R5 (guard transport kwarg, import …
biswapanda Jun 9, 2026
7a85b84
fix(clients): graft engine_data logprobs even when choice logprobs is…
biswapanda Jun 9, 2026
7cbb603
fix(clients): dynamo_chat forwards full normalized sampling_args (dro…
biswapanda Jun 9, 2026
6b2dfbb
fix(clients): centralize Dynamo denylist scrub (MITO+TITO), guard log…
biswapanda Jun 9, 2026
9d260d3
fix(clients): enforce logprobs/ids length invariant in parse_tokens (…
biswapanda Jun 9, 2026
4aa48a4
fix(clients): centralize tokenizer override in _get_local_tokenizer; …
biswapanda Jun 9, 2026
d713edc
fix(clients): load HF tokenizer inside worker thread (cache-miss from…
biswapanda Jun 9, 2026
193c549
feat(types): add dtype to RoutedExpertsPayload contract
biswapanda Jun 10, 2026
c30dad2
fix(routed_experts): tighten dtype to Literal and make sidecar stripp…
biswapanda Jun 10, 2026
ea53210
fix(routed_experts): bound sidecar stripper to the routed_experts obj…
biswapanda Jun 10, 2026
b31ff2d
docs(clients): drop PR-number and branch/plan references from dynamo_…
biswapanda Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ class RoutedExpertsPayload(TypedDict):
data: Any # actually memoryview; kept opaque so Pydantic skips schema validation
shape: list[int]
start: int
dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]] # optional; absent → uint8
```

### TrajectoryStepTokens
Expand Down
76 changes: 76 additions & 0 deletions tests/test_openai_chat_completions_token_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,79 @@ async def fake_get_prompt_ids( # noqa: ANN001
assert len(recording_client.calls) == 1
assert recording_client.calls[0]["path"] == "/chat/completions/tokens"
assert recording_client.calls[0]["body"]["tokens"] == [10, 20]


@pytest.mark.asyncio
async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling():
    """Check the request body that ``_post_dynamo_chat`` hands to the client.

    Keys marked vLLM-only must be absent from the recorded body, the other
    sampling arguments must arrive unchanged, and ``nvext`` must carry both
    the prompt token IDs (``token_data``) and the caller's passthrough
    ``extra_fields``.
    """
    recorder = _RecordingClient()
    token_client = OpenAIChatCompletionsTokenClient(recorder)

    messages = cast(Any, [{"role": "user", "content": ""}])
    extra_body = {
        "return_token_ids": True,  # vLLM-only: expected to be scrubbed
        "nvext": {"extra_fields": ["engine_data"]},
        "cache_salt": "ckpt-1",
    }
    sampling_args = {
        "temperature": 0.5,
        "presence_penalty": 0.2,
        # Arbitrary key: forwarding is full parity, not an allowlist.
        "reasoning_effort": "high",
        # vLLM-only: expected to be scrubbed.
        "spaces_between_special_tokens": False,
        "extra_body": extra_body,
    }

    await token_client._post_dynamo_chat(
        prompt=messages,
        prompt_ids=[1, 2, 3],
        model="test-model",
        tools=None,
        sampling_args=sampling_args,
        extra_headers=None,
    )

    wire_body = recorder.calls[0]["body"]

    # Scrubbed keys never reach the wire.
    for scrubbed_key in ("return_token_ids", "spaces_between_special_tokens"):
        assert scrubbed_key not in wire_body

    # Sampling arguments are forwarded verbatim.
    forwarded = (
        ("presence_penalty", 0.2),
        ("temperature", 0.5),
        ("reasoning_effort", "high"),
    )
    for key, expected in forwarded:
        assert wire_body[key] == expected

    # nvext holds the prompt token IDs next to the caller's extra_fields.
    nvext = wire_body["nvext"]
    assert nvext["token_data"] == [1, 2, 3]
    assert nvext["extra_fields"] == ["engine_data"]
    assert wire_body["cache_salt"] == "ckpt-1"


@pytest.mark.asyncio
async def test_graft_engine_data_synthesizes_logprobs_when_content_less():
    """Logprobs from ``nvext.engine_data`` are grafted over an empty logprobs object.

    The choice here carries a ``logprobs`` object whose ``content`` is
    ``None``; grafting must treat that the same as a missing ``logprobs``.
    """
    from openai.types.chat import ChatCompletion

    chat_client = OpenAIChatCompletionsClient(_NoopClient())

    only_choice = {
        "index": 0,
        "message": {"role": "assistant", "content": "ok"},
        "finish_reason": "stop",
        "logprobs": {"content": None},  # present, but carries no content
    }
    engine_data = {
        "completion_token_ids": [10, 11],
        "prompt_token_ids": [1, 2, 3],
        "completion_logprobs": [-0.1, -0.2],
    }
    payload = {
        "id": "x",
        "object": "chat.completion",
        "created": 1,
        "model": "test-model",
        "choices": [only_choice],
        "nvext": {"engine_data": engine_data},
    }
    native_response = ChatCompletion.model_validate(payload)

    converted = await chat_client.from_native_response(native_response)
    parsed_tokens = converted.message.tokens

    # None here would mean the engine-side token data was dropped.
    assert parsed_tokens is not None
    assert parsed_tokens.completion_ids == [10, 11]
    assert parsed_tokens.prompt_ids == [1, 2, 3]
    assert parsed_tokens.completion_logprobs == [-0.1, -0.2]
29 changes: 29 additions & 0 deletions tests/test_trajectory_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,32 @@ def test_trajectory_step_mask_combining():
assert token_ids == [1, 2, 3, 4, 5]
assert mask == [0, 0, 0, 1, 1]
assert logprobs == [0.0, 0.0, 0.0, -0.1, -0.2]


def test_strip_routed_experts_data_key_order_robust():
    """``strip_routed_experts_data`` locates ``data`` wherever it sits in the object.

    The blob must be extracted whether ``data`` is the first key or follows
    ``dtype``/``shape``/``start``, and the input must come back untouched
    when the ``routed_experts`` object has no ``data`` key or is absent.
    """
    from verifiers.utils.response_utils import strip_routed_experts_data

    # (raw payload, expected blob bytes): ``data`` first, then ``data`` last.
    strip_cases = [
        (
            b'{"routed_experts":{"data":"QUJD","shape":[3],"start":0,"dtype":"uint8"}}',
            b"QUJD",
        ),
        (
            b'{"routed_experts":{"dtype":"uint16","shape":[3],"start":0,"data":"WFla"}}',
            b"WFla",
        ),
    ]
    for payload, expected_blob in strip_cases:
        remainder, blob = strip_routed_experts_data(payload)
        assert blob is not None and blob.tobytes() == expected_blob
        assert b'"data":""' in remainder

    # Payloads that must pass through unchanged with no blob extracted.
    passthrough_cases = [
        # routed_experts has no data; the sibling's data must not be picked up
        # (the search is bounded to the routed_experts object).
        b'{"routed_experts":{"shape":[3],"start":0},"other":{"data":"UNRELATED"}}',
        # No routed_experts object at all.
        b'{"choices":[{"token_ids":[1,2]}]}',
    ]
    for payload in passthrough_cases:
        remainder, blob = strip_routed_experts_data(payload)
        assert blob is None and remainder == payload
97 changes: 97 additions & 0 deletions verifiers/clients/openai_chat_completions_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,8 +469,100 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason:
case _:
return None

def _graft_engine_data(response: OpenAIChatResponse) -> None:
    """Graft engine-side token IDs and logprobs onto the standard response fields.

    Three coexisting wire shapes from dynamo's vLLM/SGLang backends:

    1. ``response.nvext.engine_data.{completion_token_ids,
       completion_logprobs, prompt_token_ids}``
       (opt-in: ``nvext.extra_fields=["engine_data"]``).
    2. ``response.nvext.completion_token_ids`` — top-level shape
       (opt-in: ``nvext.extra_fields=["completion_token_ids"]``). No
       logprobs in this shape; logprobs ride the standard
       ``choices[0].logprobs.content[*].logprob`` channel.
    3. Older vLLM-native paths set ``response.choices[0].token_ids`` /
       ``response.prompt_token_ids`` directly (no grafting needed).

    This helper bridges (1) and (2) onto ``choices[0].token_ids``,
    ``response.prompt_token_ids`` and ``choices[0].logprobs``, which is
    where ``parse_tokens`` reads them. ``engine_data`` wins when both (1)
    and (2) are present (it carries more, including logprobs and
    ``prompt_token_ids``).

    The response is mutated in place and values already present on it are
    never overwritten. Grafting is best-effort: a missing or non-dict
    ``nvext``, an empty ``choices`` list, or malformed
    ``completion_logprobs`` leaves the affected fields untouched instead of
    raising.

    Args:
        response: Chat completion response to patch in place.
    """

    def _assign(target: object, name: str, value: object) -> None:
        # Try a normal assignment first; if the target's class rejects it,
        # fall back to object.__setattr__, which bypasses any __setattr__
        # override on that class.
        try:
            setattr(target, name, value)
        except (AttributeError, TypeError, ValueError):
            object.__setattr__(target, name, value)

    nvext = getattr(response, "nvext", None)
    if nvext is None and hasattr(response, "model_dump"):
        nvext = response.model_dump().get("nvext")
    if not isinstance(nvext, dict):
        return
    choices = getattr(response, "choices", None)
    if not choices:
        # Nothing to graft onto.
        return
    choice = choices[0]

    # Prefer engine_data over the top-level shape when both arrive:
    # engine_data bundles logprobs + prompt_token_ids in one place.
    completion_token_ids: list[int] | None = None
    prompt_token_ids: list[int] | None = None
    completion_logprobs: list[float] | None = None

    engine_data = nvext.get("engine_data")
    if isinstance(engine_data, dict):
        if engine_data.get("completion_token_ids") is not None:
            completion_token_ids = list(engine_data["completion_token_ids"])
        if engine_data.get("prompt_token_ids") is not None:
            prompt_token_ids = list(engine_data["prompt_token_ids"])
        raw_logprobs = engine_data.get("completion_logprobs")
        if raw_logprobs is not None:
            try:
                completion_logprobs = [float(x) for x in raw_logprobs]
            except (TypeError, ValueError):
                # A None or non-numeric entry (or a non-iterable payload)
                # makes the logprobs unusable; skip them rather than abort
                # the whole response parse.
                completion_logprobs = None

    # Fall back to the top-level nvext shape for anything engine_data lacked.
    if completion_token_ids is None and nvext.get("completion_token_ids") is not None:
        completion_token_ids = list(nvext["completion_token_ids"])
    if prompt_token_ids is None and nvext.get("prompt_token_ids") is not None:
        prompt_token_ids = list(nvext["prompt_token_ids"])

    if (
        completion_token_ids is not None
        and getattr(choice, "token_ids", None) is None
    ):
        _assign(choice, "token_ids", completion_token_ids)
    if (
        prompt_token_ids is not None
        and getattr(response, "prompt_token_ids", None) is None
    ):
        _assign(response, "prompt_token_ids", prompt_token_ids)

    # Dynamo returns logprobs only under engine_data, not
    # choices[0].logprobs. Synthesize the standard shape so parse_tokens
    # (which requires choices[0].logprobs.content) can read them. Graft
    # whenever the choice has no usable logprobs content, i.e. logprobs is
    # missing OR present-but-content-less (empty/None content), not only
    # when it is absent entirely.
    if completion_logprobs is None or completion_token_ids is None:
        return
    if len(completion_logprobs) != len(completion_token_ids):
        return
    existing_lp = getattr(choice, "logprobs", None)
    if isinstance(existing_lp, dict):
        existing_content = existing_lp.get("content")
    else:
        existing_content = getattr(existing_lp, "content", None)
    if existing_content:
        return
    _assign(
        choice,
        "logprobs",
        {"content": [{"logprob": lp} for lp in completion_logprobs]},
    )
Comment thread
cursor[bot] marked this conversation as resolved.

def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
assert len(response.choices) == 1, "Response should always have one choice"
_graft_engine_data(response)
choice = response.choices[0]
if not hasattr(choice, "token_ids"):
return None
Expand Down Expand Up @@ -508,6 +600,11 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
logprobs_content = response.choices[0].logprobs["content"]
completion_logprobs = [token["logprob"] for token in logprobs_content]

if len(completion_logprobs) != len(completion_ids):
# Engine returned mismatched logprobs/ids — drop rather than emit
# out-of-sync ResponseTokens.
return None

choice_extra = choice.model_extra or {}
return ResponseTokens(
prompt_ids=prompt_ids,
Expand Down
Loading