Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 43 additions & 4 deletions .github/workflows/task-11-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,45 @@ jobs:
fi

git diff --name-only "${{ github.event.pull_request.base.sha }}" HEAD > /tmp/task-11-changed-files.txt
if grep -Eq '^(packages/|bench/|scripts/run-pr530-benchmarks\.mjs$|package\.json$|package-lock\.json$|tsconfig|vitest)' /tmp/task-11-changed-files.txt; then
suites=()
# Append the suite name given as $1 to the global `suites` array,
# unless an identical entry is already present (keeps the list duplicate-free).
add_suite() {
  local candidate="$1"
  for existing in "${suites[@]}"; do
    # Already listed: nothing to add.
    [[ "$existing" != "$candidate" ]] || return 0
  done
  suites+=("$candidate")
}

if grep -Eq '^(bench/|scripts/run-pr530-benchmarks\.mjs$|package\.json$|package-lock\.json$|tsconfig|vitest)' /tmp/task-11-changed-files.txt; then
echo "run=true" >> "$GITHUB_OUTPUT"
echo "suites=all" >> "$GITHUB_OUTPUT"
exit 0
fi

if grep -Eq '^packages/ai/(src/|test/|bench/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then
add_suite "ai-event-stream"
add_suite "ai-model-registry"
fi
if grep -Eq '^packages/tui/(src/|test/|bench/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then
add_suite "tui-editor"
add_suite "tui-markdown"
fi
if grep -Eq '^packages/coding-agent/(src/|test/|bench/|examples/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then
add_suite "coding-agent-render-transcript"
add_suite "coding-agent-bash-output"
add_suite "coding-agent-jsonl-parse"
add_suite "coding-agent-rpc-event-emit"
add_suite "emit-context-clone"
add_suite "compaction-trim"
add_suite "word-diff"
fi

if [ "${#suites[@]}" -gt 0 ]; then
suite_list="$(IFS=,; echo "${suites[*]}")"
echo "run=true" >> "$GITHUB_OUTPUT"
echo "suites=${suite_list}" >> "$GITHUB_OUTPUT"
else
echo "run=false" >> "$GITHUB_OUTPUT"
{
Expand Down Expand Up @@ -69,6 +106,7 @@ jobs:
shell: bash
run: |
mkdir -p /tmp/task-11-bench
benchmark_suites="${{ steps.scope.outputs.suites }}"

if [ -n "${{ github.event.pull_request.base.sha }}" ]; then
base_ref="${{ github.event.pull_request.base.sha }}"
Expand All @@ -81,13 +119,13 @@ jobs:
cd /tmp/senpi-base
npm ci --ignore-scripts
node scripts/run-pr530-benchmarks.mjs \
--suite all \
--suite "$benchmark_suites" \
--iterations 30 \
--json /tmp/task-11-bench/base.json
)

node scripts/run-pr530-benchmarks.mjs \
--suite all \
--suite "$benchmark_suites" \
--iterations 30 \
--json /tmp/task-11-bench/head.json

Expand Down Expand Up @@ -148,7 +186,7 @@ jobs:

set +e
node scripts/run-pr530-benchmarks.mjs \
--suite all \
--suite "$benchmark_suites" \
--iterations 30 \
--baseline bench/baseline/all-baseline.json \
--allow-regression-pct 0 \
Expand All @@ -167,6 +205,7 @@ jobs:
echo "## Task 11 benchmark gate"
echo
echo "- same-run status: pass"
echo "- suites: ${benchmark_suites}"
echo "- same-run median regression tolerance: 5%"
echo "- same-run absolute median regression tolerance: 0.01ms"
echo "- static baseline status: ${static_status}"
Expand Down
2 changes: 2 additions & 0 deletions packages/ai/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

### Fixed

- Fixed Anthropic same-model replay to preserve provider-native server tool blocks around signed thinking, avoiding `thinking` / `redacted_thinking` modification errors on follow-up tool-result requests.

### Removed

## [2026.6.17-2] - 2026-06-17
Expand Down
19 changes: 19 additions & 0 deletions packages/ai/src/providers/anthropic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,21 @@ function isRecord(value: unknown): value is Record<string, unknown> {
return typeof value === "object" && value !== null;
}

/**
 * Raw block `type` values that `isReplayableAnthropicProviderNativeBlock` accepts.
 * A raw provider-native block whose `type` is not in this set is rejected by that guard.
 */
const REPLAYABLE_ANTHROPIC_PROVIDER_NATIVE_TYPES: ReadonlySet<string> = new Set([
"server_tool_use",
"web_search_tool_result",
"web_fetch_tool_result",
"code_execution_tool_result",
"bash_code_execution_tool_result",
"text_editor_code_execution_tool_result",
"tool_search_tool_result",
"container_upload",
]);

/**
 * Type guard for raw provider-native blocks that may be replayed as-is.
 *
 * Only the `type` discriminator is inspected: the value must be a non-null object whose
 * string `type` is listed in `REPLAYABLE_ANTHROPIC_PROVIDER_NATIVE_TYPES`. No other
 * field of the block is validated here.
 *
 * @param raw - The stored raw block, of unknown shape.
 * @returns `true` when the block's `type` is in the replayable set.
 */
function isReplayableAnthropicProviderNativeBlock(raw: unknown): raw is ContentBlockParam {
	if (!isRecord(raw)) {
		return false;
	}
	const blockType = raw.type;
	if (typeof blockType !== "string") {
		return false;
	}
	return REPLAYABLE_ANTHROPIC_PROVIDER_NATIVE_TYPES.has(blockType);
}

function stringRecord(value: unknown): Record<string, string> | undefined {
if (!isRecord(value)) {
return undefined;
Expand Down Expand Up @@ -1461,6 +1476,7 @@ function convertMessages(
}
} else if (msg.role === "assistant") {
const blocks: ContentBlockParam[] = [];
const isSameModel = msg.provider === model.provider && msg.api === model.api && msg.model === model.id;

for (const block of msg.content) {
if (block.type === "text") {
Expand Down Expand Up @@ -1510,6 +1526,9 @@ function convertMessages(
input: block.arguments ?? {},
});
} else if (block.type === "providerNative") {
if (isSameModel && isReplayableAnthropicProviderNativeBlock(block.raw)) {
blocks.push(block.raw);
}
}
}
if (blocks.length === 0) continue;
Expand Down
163 changes: 163 additions & 0 deletions packages/ai/test/anthropic-provider-native-replay.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import { describe, expect, it } from "vitest";
import { getModel } from "../src/models.ts";
import { streamSimple } from "../src/stream.ts";
import type { AssistantMessage, Context, Model, SimpleStreamOptions } from "../src/types.ts";

/** One message taken from a captured request payload: its role plus the content left unparsed. */
interface CapturedAnthropicMessage {
readonly role: string;
readonly content: unknown;
}

/** The part of a captured request payload these tests look at; `messages` is absent when none could be read. */
interface CapturedAnthropicPayload {
readonly messages?: readonly CapturedAnthropicMessage[];
}

/** All-zero usage record used as the `usage` field of the assistant-message fixtures built in this file. */
const usage = {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
} as const;

/**
 * Narrows an unknown value to a string-keyed record.
 *
 * Any non-null value whose `typeof` is "object" qualifies, which includes arrays.
 *
 * @param value - The value to inspect.
 * @returns `true` for non-null objects, `false` for everything else.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
	if (value === null) {
		return false;
	}
	return typeof value === "object";
}

/**
 * Reduces an arbitrary payload value to the `{ messages }` shape the tests inspect.
 *
 * Returns an empty object when the value is not an object or has no `messages` array.
 * Entries that are not objects, or that lack a string `role`, are dropped; for the
 * rest only `role` and `content` are kept.
 *
 * @param value - The payload as handed to the capture hook.
 * @returns The captured messages, or `{}` when none could be read.
 */
function parsePayload(value: unknown): CapturedAnthropicPayload {
	const rawMessages = isRecord(value) ? value.messages : undefined;
	if (!Array.isArray(rawMessages)) {
		return {};
	}

	const kept: CapturedAnthropicMessage[] = [];
	for (const entry of rawMessages) {
		if (isRecord(entry) && typeof entry.role === "string") {
			kept.push({ role: entry.role, content: entry.content });
		}
	}
	return { messages: kept };
}

/**
 * Builds an assistant-message fixture.
 *
 * Defaults describe an `anthropic` / `anthropic-messages` / `claude-haiku-4-5` message
 * with zero usage, `stopReason: "stop"` and `timestamp: 1`. Any field in `overrides`
 * replaces the corresponding default.
 *
 * @param content - Content blocks for the message.
 * @param overrides - Optional fields that take precedence over the defaults.
 * @returns The assembled assistant message.
 */
function assistantMessage(
	content: AssistantMessage["content"],
	overrides?: Partial<AssistantMessage>,
): AssistantMessage {
	const defaults: AssistantMessage = {
		role: "assistant",
		api: "anthropic-messages",
		provider: "anthropic",
		model: "claude-haiku-4-5",
		content,
		usage,
		stopReason: "stop",
		timestamp: 1,
	};
	// Overrides are spread last so they win over the defaults.
	return { ...defaults, ...overrides };
}

/**
 * Runs `streamSimple` against a copy of `model` whose `baseUrl` is `http://127.0.0.1:9`
 * and returns the request payload seen by the `onPayload` hook.
 *
 * The hook stores a parsed copy of the payload and hands the original back unchanged.
 * The request itself is not expected to succeed (see the error message below); only
 * the captured payload matters.
 *
 * @param model - Model to copy; every field except `baseUrl` is kept.
 * @param messages - Conversation passed as the stream context.
 * @param options - Extra stream options; `apiKey` and `onPayload` are always overridden.
 * @returns The payload captured by the hook.
 * @throws Error when the stream result settles without the hook having fired.
 */
async function capturePayload(
	model: Model<"anthropic-messages">,
	messages: Context["messages"],
	options?: SimpleStreamOptions,
): Promise<CapturedAnthropicPayload> {
	let seenPayload: CapturedAnthropicPayload | undefined;
	const redirectedModel: Model<"anthropic-messages"> = { ...model, baseUrl: "http://127.0.0.1:9" };

	await streamSimple(
		redirectedModel,
		{ messages },
		{
			...options,
			apiKey: "fake-key",
			onPayload: (payload) => {
				seenPayload = parsePayload(payload);
				return payload;
			},
		},
	).result();

	if (seenPayload) {
		return seenPayload;
	}
	throw new Error("Expected payload to be captured before request failure");
}

// Verifies how assistant `providerNative` blocks show up in the outgoing request payload,
// as observed through `capturePayload`.
describe("Anthropic provider-native replay", () => {
// The assistant fixture uses the helper defaults (anthropic / anthropic-messages /
// claude-haiku-4-5), matching the model under test, so the raw blocks should be replayed.
it("preserves same-model server tool blocks around signed thinking", async () => {
const model = getModel("anthropic", "claude-haiku-4-5");
const serverToolUse = { type: "server_tool_use", id: "srvu_1", name: "web_search", input: { query: "hi" } };
const webSearchToolResult = {
type: "web_search_tool_result",
tool_use_id: "srvu_1",
content: [
{ type: "web_search_result", title: "Example", url: "https://example.com", encrypted_content: "enc" },
],
};
const assistant = assistantMessage(
[
{ type: "providerNative", subtype: "server_tool_use", raw: serverToolUse },
{ type: "providerNative", subtype: "web_search_tool_result", raw: webSearchToolResult },
{ type: "thinking", thinking: "protected thinking", thinkingSignature: "sig_1" },
{ type: "text", text: "kept" },
{ type: "toolCall", id: "toolu_1", name: "read", arguments: { path: "README.md" } },
],
{ stopReason: "toolUse" },
);

// Follow-up request: user turn, the assistant turn above, then the result for its tool call.
const payload = await capturePayload(model, [
{ role: "user", content: "hello", timestamp: 1 },
assistant,
{
role: "toolResult",
toolCallId: "toolu_1",
toolName: "read",
content: [{ type: "text", text: "tool output" }],
isError: false,
timestamp: 2,
},
]);

// Expected: raw server-tool blocks verbatim and in their original positions, followed by
// the converted blocks (`thinkingSignature` -> `signature`, `toolCall` -> `tool_use`).
const assistantPayload = payload.messages?.find((message) => message.role === "assistant");
expect(assistantPayload?.content).toEqual([
serverToolUse,
webSearchToolResult,
{ type: "thinking", thinking: "protected thinking", signature: "sig_1" },
{ type: "text", text: "kept" },
{ type: "tool_use", id: "toolu_1", name: "read", input: { path: "README.md" } },
]);
});

// The assistant fixture is attributed to openai / openai-responses / gpt-5.4, which does
// not match the Anthropic model under test.
it("drops cross-provider provider-native blocks", async () => {
const model = getModel("anthropic", "claude-haiku-4-5");
const assistant = assistantMessage(
[
{ type: "providerNative", subtype: "web_search_call", raw: { type: "web_search_call", id: "ws_1" } },
{ type: "text", text: "kept" },
],
{
api: "openai-responses",
provider: "openai",
model: "gpt-5.4",
},
);

const payload = await capturePayload(model, [
{ role: "user", content: "hello", timestamp: 1 },
assistant,
{ role: "user", content: "follow up", timestamp: 2 },
]);

// Expected: only the text block remains in the assistant turn.
const assistantPayload = payload.messages?.find((message) => message.role === "assistant");
expect(assistantPayload?.content).toEqual([{ type: "text", text: "kept" }]);
});
});
93 changes: 0 additions & 93 deletions packages/ai/test/anthropic.provider-native.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,97 +109,4 @@ describe("Anthropic provider-native content blocks", () => {
raw: webSearchToolResultBlock,
});
});

it("skips providerNative blocks when converting assistant messages for replay", async () => {
const model = getModel("anthropic", "claude-haiku-4-5");
const assistantContent: Context["messages"][number] = {
role: "assistant",
content: [
{ type: "text", text: "kept" },
{ type: "providerNative", subtype: "server_tool_use", raw: { type: "server_tool_use", id: "srvu_1" } },
{
type: "providerNative",
subtype: "web_search_tool_result",
raw: { type: "web_search_tool_result", tool_use_id: "srvu_1" },
},
],
api: "anthropic-messages",
provider: "anthropic",
model: "claude-haiku-4-5",
usage: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
},
stopReason: "stop",
timestamp: Date.now(),
};

const context: Context = {
messages: [{ role: "user", content: "hello", timestamp: Date.now() }, assistantContent],
};

let capturedPayload: unknown;
const response = createSseResponse([
{
event: "message_start",
data: JSON.stringify({
type: "message_start",
message: {
id: "msg_replay",
usage: {
input_tokens: 2,
output_tokens: 0,
cache_read_input_tokens: 0,
cache_creation_input_tokens: 0,
},
},
}),
},
{
event: "content_block_start",
data: JSON.stringify({
type: "content_block_start",
index: 0,
content_block: { type: "text", text: "" },
}),
},
{
event: "content_block_delta",
data: JSON.stringify({ type: "content_block_delta", index: 0, delta: { type: "text_delta", text: "ok" } }),
},
{ event: "content_block_stop", data: JSON.stringify({ type: "content_block_stop", index: 0 }) },
{
event: "message_delta",
data: JSON.stringify({
type: "message_delta",
delta: { stop_reason: "end_turn" },
usage: {
input_tokens: 2,
output_tokens: 2,
cache_read_input_tokens: 0,
cache_creation_input_tokens: 0,
},
}),
},
{ event: "message_stop", data: JSON.stringify({ type: "message_stop" }) },
]);

const stream = streamAnthropic(model, context, {
client: createFakeAnthropicClient(response, (params) => {
capturedPayload = params;
}),
});
await stream.result();

const payload = capturedPayload as {
messages: Array<{ role: string; content: Array<{ type: string; text?: string }> }>;
};
const assistantMessage = payload.messages.find((message) => message.role === "assistant");
expect(assistantMessage).toBeDefined();
expect(assistantMessage?.content).toEqual([{ type: "text", text: "kept" }]);
});
});
Loading
Loading