From a7fabccac6630678d2eeb7027e0ca7a01406508f Mon Sep 17 00:00:00 2001 From: YeonGyu-Kim Date: Wed, 17 Jun 2026 19:00:56 +0900 Subject: [PATCH 1/2] fix(ai): preserve Anthropic server tool replay --- packages/ai/CHANGELOG.md | 2 + packages/ai/src/providers/anthropic.ts | 19 ++ .../anthropic-provider-native-replay.test.ts | 163 ++++++++++++++++++ .../ai/test/anthropic.provider-native.test.ts | 93 ---------- packages/coding-agent/CHANGELOG.md | 2 + 5 files changed, 186 insertions(+), 93 deletions(-) create mode 100644 packages/ai/test/anthropic-provider-native-replay.test.ts diff --git a/packages/ai/CHANGELOG.md b/packages/ai/CHANGELOG.md index e332250e0..4c032665e 100644 --- a/packages/ai/CHANGELOG.md +++ b/packages/ai/CHANGELOG.md @@ -10,6 +10,8 @@ ### Fixed +- Fixed Anthropic same-model replay to preserve provider-native server tool blocks around signed thinking, avoiding `thinking` / `redacted_thinking` modification errors on follow-up tool-result requests. + ### Removed ## [2026.6.17-2] - 2026-06-17 diff --git a/packages/ai/src/providers/anthropic.ts b/packages/ai/src/providers/anthropic.ts index f83ee926d..8282f3960 100644 --- a/packages/ai/src/providers/anthropic.ts +++ b/packages/ai/src/providers/anthropic.ts @@ -317,6 +317,21 @@ function isRecord(value: unknown): value is Record<string, unknown> { return typeof value === "object" && value !== null; } +const REPLAYABLE_ANTHROPIC_PROVIDER_NATIVE_TYPES: ReadonlySet<string> = new Set([ + "server_tool_use", + "web_search_tool_result", + "web_fetch_tool_result", + "code_execution_tool_result", + "bash_code_execution_tool_result", + "text_editor_code_execution_tool_result", + "tool_search_tool_result", + "container_upload", +]); + +function isReplayableAnthropicProviderNativeBlock(raw: unknown): raw is ContentBlockParam { + return isRecord(raw) && typeof raw.type === "string" && REPLAYABLE_ANTHROPIC_PROVIDER_NATIVE_TYPES.has(raw.type); +} + function stringRecord(value: unknown): Record | undefined { if (!isRecord(value)) { return undefined; 
@@ -1461,6 +1476,7 @@ function convertMessages( } } else if (msg.role === "assistant") { const blocks: ContentBlockParam[] = []; + const isSameModel = msg.provider === model.provider && msg.api === model.api && msg.model === model.id; for (const block of msg.content) { if (block.type === "text") { @@ -1510,6 +1526,9 @@ function convertMessages( input: block.arguments ?? {}, }); } else if (block.type === "providerNative") { + if (isSameModel && isReplayableAnthropicProviderNativeBlock(block.raw)) { + blocks.push(block.raw); + } } } if (blocks.length === 0) continue; diff --git a/packages/ai/test/anthropic-provider-native-replay.test.ts b/packages/ai/test/anthropic-provider-native-replay.test.ts new file mode 100644 index 000000000..fa4c4ea3b --- /dev/null +++ b/packages/ai/test/anthropic-provider-native-replay.test.ts @@ -0,0 +1,163 @@ +import { describe, expect, it } from "vitest"; +import { getModel } from "../src/models.ts"; +import { streamSimple } from "../src/stream.ts"; +import type { AssistantMessage, Context, Model, SimpleStreamOptions } from "../src/types.ts"; + +interface CapturedAnthropicMessage { + readonly role: string; + readonly content: unknown; +} + +interface CapturedAnthropicPayload { + readonly messages?: readonly CapturedAnthropicMessage[]; +} + +const usage = { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + totalTokens: 0, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, +} as const; + +function isRecord(value: unknown): value is Record<string, unknown> { + return typeof value === "object" && value !== null; +} + +function parsePayload(value: unknown): CapturedAnthropicPayload { + if (!isRecord(value)) { + return {}; + } + const messages = value.messages; + if (!Array.isArray(messages)) { + return {}; + } + return { + messages: messages.flatMap((message) => { + if (!isRecord(message) || typeof message.role !== "string") { + return []; + } + return [{ role: message.role, content: message.content }]; + }), + }; +} + +function 
assistantMessage( + content: AssistantMessage["content"], + overrides?: Partial<AssistantMessage>, +): AssistantMessage { + return { + role: "assistant", + api: "anthropic-messages", + provider: "anthropic", + model: "claude-haiku-4-5", + content, + usage, + stopReason: "stop", + timestamp: 1, + ...overrides, + }; +} + +async function capturePayload( + model: Model<"anthropic-messages">, + messages: Context["messages"], + options?: SimpleStreamOptions, +): Promise<CapturedAnthropicPayload> { + let capturedPayload: CapturedAnthropicPayload | undefined; + const payloadCaptureModel: Model<"anthropic-messages"> = { + ...model, + baseUrl: "http://127.0.0.1:9", + }; + const stream = streamSimple( + payloadCaptureModel, + { messages }, + { + ...options, + apiKey: "fake-key", + onPayload: (payload) => { + capturedPayload = parsePayload(payload); + return payload; + }, + }, + ); + + await stream.result(); + + if (!capturedPayload) { + throw new Error("Expected payload to be captured before request failure"); + } + + return capturedPayload; +} + +describe("Anthropic provider-native replay", () => { + it("preserves same-model server tool blocks around signed thinking", async () => { + const model = getModel("anthropic", "claude-haiku-4-5"); + const serverToolUse = { type: "server_tool_use", id: "srvu_1", name: "web_search", input: { query: "hi" } }; + const webSearchToolResult = { + type: "web_search_tool_result", + tool_use_id: "srvu_1", + content: [ + { type: "web_search_result", title: "Example", url: "https://example.com", encrypted_content: "enc" }, + ], + }; + const assistant = assistantMessage( + [ + { type: "providerNative", subtype: "server_tool_use", raw: serverToolUse }, + { type: "providerNative", subtype: "web_search_tool_result", raw: webSearchToolResult }, + { type: "thinking", thinking: "protected thinking", thinkingSignature: "sig_1" }, + { type: "text", text: "kept" }, + { type: "toolCall", id: "toolu_1", name: "read", arguments: { path: "README.md" } }, + ], + { stopReason: "toolUse" }, + ); + + 
const payload = await capturePayload(model, [ + { role: "user", content: "hello", timestamp: 1 }, + assistant, + { + role: "toolResult", + toolCallId: "toolu_1", + toolName: "read", + content: [{ type: "text", text: "tool output" }], + isError: false, + timestamp: 2, + }, + ]); + + const assistantPayload = payload.messages?.find((message) => message.role === "assistant"); + expect(assistantPayload?.content).toEqual([ + serverToolUse, + webSearchToolResult, + { type: "thinking", thinking: "protected thinking", signature: "sig_1" }, + { type: "text", text: "kept" }, + { type: "tool_use", id: "toolu_1", name: "read", input: { path: "README.md" } }, + ]); + }); + + it("drops cross-provider provider-native blocks", async () => { + const model = getModel("anthropic", "claude-haiku-4-5"); + const assistant = assistantMessage( + [ + { type: "providerNative", subtype: "web_search_call", raw: { type: "web_search_call", id: "ws_1" } }, + { type: "text", text: "kept" }, + ], + { + api: "openai-responses", + provider: "openai", + model: "gpt-5.4", + }, + ); + + const payload = await capturePayload(model, [ + { role: "user", content: "hello", timestamp: 1 }, + assistant, + { role: "user", content: "follow up", timestamp: 2 }, + ]); + + const assistantPayload = payload.messages?.find((message) => message.role === "assistant"); + expect(assistantPayload?.content).toEqual([{ type: "text", text: "kept" }]); + }); +}); diff --git a/packages/ai/test/anthropic.provider-native.test.ts b/packages/ai/test/anthropic.provider-native.test.ts index 53e229f1b..2f67ae12a 100644 --- a/packages/ai/test/anthropic.provider-native.test.ts +++ b/packages/ai/test/anthropic.provider-native.test.ts @@ -109,97 +109,4 @@ describe("Anthropic provider-native content blocks", () => { raw: webSearchToolResultBlock, }); }); - - it("skips providerNative blocks when converting assistant messages for replay", async () => { - const model = getModel("anthropic", "claude-haiku-4-5"); - const assistantContent: 
Context["messages"][number] = { - role: "assistant", - content: [ - { type: "text", text: "kept" }, - { type: "providerNative", subtype: "server_tool_use", raw: { type: "server_tool_use", id: "srvu_1" } }, - { - type: "providerNative", - subtype: "web_search_tool_result", - raw: { type: "web_search_tool_result", tool_use_id: "srvu_1" }, - }, - ], - api: "anthropic-messages", - provider: "anthropic", - model: "claude-haiku-4-5", - usage: { - input: 0, - output: 0, - cacheRead: 0, - cacheWrite: 0, - totalTokens: 0, - cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, - }, - stopReason: "stop", - timestamp: Date.now(), - }; - - const context: Context = { - messages: [{ role: "user", content: "hello", timestamp: Date.now() }, assistantContent], - }; - - let capturedPayload: unknown; - const response = createSseResponse([ - { - event: "message_start", - data: JSON.stringify({ - type: "message_start", - message: { - id: "msg_replay", - usage: { - input_tokens: 2, - output_tokens: 0, - cache_read_input_tokens: 0, - cache_creation_input_tokens: 0, - }, - }, - }), - }, - { - event: "content_block_start", - data: JSON.stringify({ - type: "content_block_start", - index: 0, - content_block: { type: "text", text: "" }, - }), - }, - { - event: "content_block_delta", - data: JSON.stringify({ type: "content_block_delta", index: 0, delta: { type: "text_delta", text: "ok" } }), - }, - { event: "content_block_stop", data: JSON.stringify({ type: "content_block_stop", index: 0 }) }, - { - event: "message_delta", - data: JSON.stringify({ - type: "message_delta", - delta: { stop_reason: "end_turn" }, - usage: { - input_tokens: 2, - output_tokens: 2, - cache_read_input_tokens: 0, - cache_creation_input_tokens: 0, - }, - }), - }, - { event: "message_stop", data: JSON.stringify({ type: "message_stop" }) }, - ]); - - const stream = streamAnthropic(model, context, { - client: createFakeAnthropicClient(response, (params) => { - capturedPayload = params; - }), - }); - await 
stream.result(); - - const payload = capturedPayload as { - messages: Array<{ role: string; content: Array<{ type: string; text?: string }> }>; - }; - const assistantMessage = payload.messages.find((message) => message.role === "assistant"); - expect(assistantMessage).toBeDefined(); - expect(assistantMessage?.content).toEqual([{ type: "text", text: "kept" }]); - }); }); diff --git a/packages/coding-agent/CHANGELOG.md b/packages/coding-agent/CHANGELOG.md index 5e0f6bc18..5c297f8d6 100644 --- a/packages/coding-agent/CHANGELOG.md +++ b/packages/coding-agent/CHANGELOG.md @@ -6,6 +6,8 @@ ### Fixed +- Fixed inherited Anthropic same-model replay to preserve provider-native server tool blocks around signed thinking, avoiding `thinking` / `redacted_thinking` modification errors on follow-up tool-result requests. + ### Changed ### Removed From 856288b653503bd199f9ddfdcc736669418db057 Mon Sep 17 00:00:00 2001 From: YeonGyu-Kim Date: Wed, 17 Jun 2026 19:08:43 +0900 Subject: [PATCH 2/2] ci: scope PR530 benchmark suites by package --- .github/workflows/task-11-benchmarks.yml | 47 ++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/.github/workflows/task-11-benchmarks.yml b/.github/workflows/task-11-benchmarks.yml index 315ca3519..7618b111d 100644 --- a/.github/workflows/task-11-benchmarks.yml +++ b/.github/workflows/task-11-benchmarks.yml @@ -29,8 +29,45 @@ jobs: fi git diff --name-only "${{ github.event.pull_request.base.sha }}" HEAD > /tmp/task-11-changed-files.txt - if grep -Eq '^(packages/|bench/|scripts/run-pr530-benchmarks\.mjs$|package\.json$|package-lock\.json$|tsconfig|vitest)' /tmp/task-11-changed-files.txt; then + suites=() + add_suite() { + local suite="$1" + for existing in "${suites[@]}"; do + if [ "$existing" = "$suite" ]; then + return + fi + done + suites+=("$suite") + } + + if grep -Eq '^(bench/|scripts/run-pr530-benchmarks\.mjs$|package\.json$|package-lock\.json$|tsconfig|vitest)' /tmp/task-11-changed-files.txt; then + echo 
"run=true" >> "$GITHUB_OUTPUT" + echo "suites=all" >> "$GITHUB_OUTPUT" + exit 0 + fi + + if grep -Eq '^packages/ai/(src/|test/|bench/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then + add_suite "ai-event-stream" + add_suite "ai-model-registry" + fi + if grep -Eq '^packages/tui/(src/|test/|bench/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then + add_suite "tui-editor" + add_suite "tui-markdown" + fi + if grep -Eq '^packages/coding-agent/(src/|test/|bench/|examples/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then + add_suite "coding-agent-render-transcript" + add_suite "coding-agent-bash-output" + add_suite "coding-agent-jsonl-parse" + add_suite "coding-agent-rpc-event-emit" + add_suite "emit-context-clone" + add_suite "compaction-trim" + add_suite "word-diff" + fi + + if [ "${#suites[@]}" -gt 0 ]; then + suite_list="$(IFS=,; echo "${suites[*]}")" echo "run=true" >> "$GITHUB_OUTPUT" + echo "suites=${suite_list}" >> "$GITHUB_OUTPUT" else echo "run=false" >> "$GITHUB_OUTPUT" { @@ -69,6 +106,7 @@ jobs: shell: bash run: | mkdir -p /tmp/task-11-bench + benchmark_suites="${{ steps.scope.outputs.suites }}" if [ -n "${{ github.event.pull_request.base.sha }}" ]; then base_ref="${{ github.event.pull_request.base.sha }}" @@ -81,13 +119,13 @@ jobs: cd /tmp/senpi-base npm ci --ignore-scripts node scripts/run-pr530-benchmarks.mjs \ - --suite all \ + --suite "$benchmark_suites" \ --iterations 30 \ --json /tmp/task-11-bench/base.json ) node scripts/run-pr530-benchmarks.mjs \ - --suite all \ + --suite "$benchmark_suites" \ --iterations 30 \ --json /tmp/task-11-bench/head.json @@ -148,7 +186,7 @@ jobs: set +e node scripts/run-pr530-benchmarks.mjs \ - --suite all \ + --suite "$benchmark_suites" \ --iterations 30 \ --baseline bench/baseline/all-baseline.json \ --allow-regression-pct 0 \ @@ -167,6 +205,7 @@ jobs: echo "## Task 11 benchmark gate" echo echo "- same-run status: pass" + echo "- suites: ${benchmark_suites}" echo "- same-run median 
regression tolerance: 5%" echo "- same-run absolute median regression tolerance: 0.01ms" echo "- static baseline status: ${static_status}"