From a7fabccac6630678d2eeb7027e0ca7a01406508f Mon Sep 17 00:00:00 2001
From: YeonGyu-Kim <code.yeon.gyu@gmail.com>
Date: Wed, 17 Jun 2026 19:00:56 +0900
Subject: [PATCH 1/2] fix(ai): preserve Anthropic server tool replay

---
 packages/ai/CHANGELOG.md                      |   2 +
 packages/ai/src/providers/anthropic.ts        |  19 ++
 .../anthropic-provider-native-replay.test.ts  | 163 ++++++++++++++++++
 .../ai/test/anthropic.provider-native.test.ts |  93 ----------
 packages/coding-agent/CHANGELOG.md            |   2 +
 5 files changed, 186 insertions(+), 93 deletions(-)
 create mode 100644 packages/ai/test/anthropic-provider-native-replay.test.ts
diff --git a/packages/ai/CHANGELOG.md b/packages/ai/CHANGELOG.md
index e332250e0..4c032665e 100644
--- a/packages/ai/CHANGELOG.md
+++ b/packages/ai/CHANGELOG.md
@@ -10,6 +10,8 @@
 
 ### Fixed
 
+- Fixed Anthropic same-model replay to preserve provider-native server tool blocks around signed thinking, avoiding `thinking` / `redacted_thinking` modification errors on follow-up tool-result requests.
+
 ### Removed
 
 ## [2026.6.17-2] - 2026-06-17
diff --git a/packages/ai/src/providers/anthropic.ts b/packages/ai/src/providers/anthropic.ts
index f83ee926d..8282f3960 100644
--- a/packages/ai/src/providers/anthropic.ts
+++ b/packages/ai/src/providers/anthropic.ts
@@ -317,6 +317,21 @@ function isRecord(value: unknown): value is Record<string, unknown> {
 	return typeof value === "object" && value !== null;
 }
 
+const REPLAYABLE_ANTHROPIC_PROVIDER_NATIVE_TYPES: ReadonlySet<string> = new Set([ // raw block `type` values eligible for verbatim replay
+	"server_tool_use",
+	"web_search_tool_result",
+	"web_fetch_tool_result",
+	"code_execution_tool_result",
+	"bash_code_execution_tool_result",
+	"text_editor_code_execution_tool_result",
+	"tool_search_tool_result",
+	"container_upload",
+]); // NOTE(review): presumably mirrors Anthropic's server-tool block types — confirm against the SDK's ContentBlockParam union when new server tools are added
+
+function isReplayableAnthropicProviderNativeBlock(raw: unknown): raw is ContentBlockParam { // true only for objects whose string `type` is on the replay allow-list
+	return isRecord(raw) ? typeof raw.type === "string" && REPLAYABLE_ANTHROPIC_PROVIDER_NATIVE_TYPES.has(raw.type) : false;
+}
+
 function stringRecord(value: unknown): Record<string, string> | undefined {
 	if (!isRecord(value)) {
 		return undefined;
@@ -1461,6 +1476,7 @@ function convertMessages(
 			}
 		} else if (msg.role === "assistant") {
 			const blocks: ContentBlockParam[] = [];
+			const isSameModel = msg.provider === model.provider && msg.api === model.api && msg.model === model.id;
 
 			for (const block of msg.content) {
 				if (block.type === "text") {
@@ -1510,6 +1526,9 @@ function convertMessages(
 						input: block.arguments ?? {},
 					});
 				} else if (block.type === "providerNative") {
+					if (isSameModel && isReplayableAnthropicProviderNativeBlock(block.raw)) {
+						blocks.push(block.raw);
+					}
 				}
 			}
 			if (blocks.length === 0) continue;
diff --git a/packages/ai/test/anthropic-provider-native-replay.test.ts b/packages/ai/test/anthropic-provider-native-replay.test.ts
new file mode 100644
index 000000000..fa4c4ea3b
--- /dev/null
+++ b/packages/ai/test/anthropic-provider-native-replay.test.ts
@@ -0,0 +1,163 @@
+import { describe, expect, it } from "vitest";
+import { getModel } from "../src/models.ts";
+import { streamSimple } from "../src/stream.ts";
+import type { AssistantMessage, Context, Model, SimpleStreamOptions } from "../src/types.ts";
+
+interface CapturedAnthropicMessage { // one captured request message, reduced to the two fields the assertions read
+	readonly role: string;
+	readonly content: unknown; // left as unknown: compared structurally with toEqual, never narrowed
+}
+
+interface CapturedAnthropicPayload { // subset of the outgoing request payload that these tests inspect
+	readonly messages?: readonly CapturedAnthropicMessage[]; // absent when the payload had no `messages` array
+}
+
+const usage = { // zeroed usage fixture reused by every assistantMessage() result
+	input: 0,
+	output: 0,
+	cacheRead: 0,
+	cacheWrite: 0,
+	totalTokens: 0,
+	cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
+} as const;
+
+function isRecord(value: unknown): value is Record<string, unknown> { // any non-null object, arrays included
+	return value !== null && typeof value === "object";
+}
+
+function parsePayload(value: unknown): CapturedAnthropicPayload { // reduce an arbitrary payload to its well-formed { role, content } messages
+	const rawMessages = isRecord(value) ? value.messages : undefined;
+	if (!Array.isArray(rawMessages)) {
+		return {}; // not an object, or no `messages` array to inspect
+	}
+
+	const captured: CapturedAnthropicMessage[] = [];
+	for (const entry of rawMessages) {
+		if (!isRecord(entry)) {
+			continue;
+		}
+		if (typeof entry.role === "string") {
+			captured.push({ role: entry.role, content: entry.content });
+		}
+	}
+	return { messages: captured };
+}
+
+function assistantMessage(
+	content: AssistantMessage["content"],
+	overrides?: Partial<AssistantMessage>,
+): AssistantMessage {
+	const defaults: AssistantMessage = {
+		role: "assistant",
+		api: "anthropic-messages",
+		provider: "anthropic",
+		model: "claude-haiku-4-5",
+		content,
+		usage,
+		stopReason: "stop",
+		timestamp: 1,
+	};
+	return { ...defaults, ...overrides }; // caller overrides win over the Anthropic fixture defaults
+}
+
+async function capturePayload( // runs streamSimple against the given messages and returns the payload seen by onPayload
+	model: Model<"anthropic-messages">,
+	messages: Context["messages"],
+	options?: SimpleStreamOptions,
+): Promise<CapturedAnthropicPayload> {
+	let capturedPayload: CapturedAnthropicPayload | undefined;
+	const payloadCaptureModel: Model<"anthropic-messages"> = {
+		...model,
+		baseUrl: "http://127.0.0.1:9", // NOTE(review): presumably an unreachable local endpoint so the request itself fails — confirm
+	};
+	const stream = streamSimple(
+		payloadCaptureModel,
+		{ messages },
+		{
+			...options,
+			apiKey: "fake-key", // placed after the spread, so it replaces any apiKey supplied in options
+			onPayload: (payload) => {
+				capturedPayload = parsePayload(payload);
+				return payload; // hand the payload back unchanged
+			},
+		},
+	);
+
+	await stream.result(); // NOTE(review): assumes result() resolves rather than rejects when the request fails — confirm
+
+	if (!capturedPayload) {
+		throw new Error("Expected payload to be captured before request failure");
+	}
+
+	return capturedPayload;
+}
+
+describe("Anthropic provider-native replay", () => {
+	it("preserves same-model server tool blocks around signed thinking", async () => {
+		const model = getModel("anthropic", "claude-haiku-4-5");
+		const serverToolUse = { type: "server_tool_use", id: "srvu_1", name: "web_search", input: { query: "hi" } };
+		const webSearchToolResult = {
+			type: "web_search_tool_result",
+			tool_use_id: "srvu_1",
+			content: [
+				{ type: "web_search_result", title: "Example", url: "https://example.com", encrypted_content: "enc" },
+			],
+		};
+		const assistant = assistantMessage( // fixture defaults give this message the same provider, api and model id passed to getModel above
+			[
+				{ type: "providerNative", subtype: "server_tool_use", raw: serverToolUse },
+				{ type: "providerNative", subtype: "web_search_tool_result", raw: webSearchToolResult },
+				{ type: "thinking", thinking: "protected thinking", thinkingSignature: "sig_1" },
+				{ type: "text", text: "kept" },
+				{ type: "toolCall", id: "toolu_1", name: "read", arguments: { path: "README.md" } },
+			],
+			{ stopReason: "toolUse" },
+		);
+
+		const payload = await capturePayload(model, [
+			{ role: "user", content: "hello", timestamp: 1 },
+			assistant,
+			{
+				role: "toolResult",
+				toolCallId: "toolu_1",
+				toolName: "read",
+				content: [{ type: "text", text: "tool output" }],
+				isError: false,
+				timestamp: 2,
+			},
+		]);
+
+		const assistantPayload = payload.messages?.find((message) => message.role === "assistant");
+		expect(assistantPayload?.content).toEqual([ // raw server-tool blocks come back verbatim, in their original position before thinking
+			serverToolUse,
+			webSearchToolResult,
+			{ type: "thinking", thinking: "protected thinking", signature: "sig_1" },
+			{ type: "text", text: "kept" },
+			{ type: "tool_use", id: "toolu_1", name: "read", input: { path: "README.md" } },
+		]);
+	});
+
+	it("drops cross-provider provider-native blocks", async () => { // NOTE(review): same-provider/different-model and non-allow-listed raw types are not exercised here — consider adding cases
+		const model = getModel("anthropic", "claude-haiku-4-5");
+		const assistant = assistantMessage(
+			[
+				{ type: "providerNative", subtype: "web_search_call", raw: { type: "web_search_call", id: "ws_1" } },
+				{ type: "text", text: "kept" },
+			],
+			{
+				api: "openai-responses",
+				provider: "openai",
+				model: "gpt-5.4",
+			},
+		);
+
+		const payload = await capturePayload(model, [
+			{ role: "user", content: "hello", timestamp: 1 },
+			assistant,
+			{ role: "user", content: "follow up", timestamp: 2 },
+		]);
+
+		const assistantPayload = payload.messages?.find((message) => message.role === "assistant");
+		expect(assistantPayload?.content).toEqual([{ type: "text", text: "kept" }]); // only the text block survives conversion
+	});
+});
diff --git a/packages/ai/test/anthropic.provider-native.test.ts b/packages/ai/test/anthropic.provider-native.test.ts
index 53e229f1b..2f67ae12a 100644
--- a/packages/ai/test/anthropic.provider-native.test.ts
+++ b/packages/ai/test/anthropic.provider-native.test.ts
@@ -109,97 +109,4 @@ describe("Anthropic provider-native content blocks", () => {
 			raw: webSearchToolResultBlock,
 		});
 	});
-
-	it("skips providerNative blocks when converting assistant messages for replay", async () => {
-		const model = getModel("anthropic", "claude-haiku-4-5");
-		const assistantContent: Context["messages"][number] = {
-			role: "assistant",
-			content: [
-				{ type: "text", text: "kept" },
-				{ type: "providerNative", subtype: "server_tool_use", raw: { type: "server_tool_use", id: "srvu_1" } },
-				{
-					type: "providerNative",
-					subtype: "web_search_tool_result",
-					raw: { type: "web_search_tool_result", tool_use_id: "srvu_1" },
-				},
-			],
-			api: "anthropic-messages",
-			provider: "anthropic",
-			model: "claude-haiku-4-5",
-			usage: {
-				input: 0,
-				output: 0,
-				cacheRead: 0,
-				cacheWrite: 0,
-				totalTokens: 0,
-				cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
-			},
-			stopReason: "stop",
-			timestamp: Date.now(),
-		};
-
-		const context: Context = {
-			messages: [{ role: "user", content: "hello", timestamp: Date.now() }, assistantContent],
-		};
-
-		let capturedPayload: unknown;
-		const response = createSseResponse([
-			{
-				event: "message_start",
-				data: JSON.stringify({
-					type: "message_start",
-					message: {
-						id: "msg_replay",
-						usage: {
-							input_tokens: 2,
-							output_tokens: 0,
-							cache_read_input_tokens: 0,
-							cache_creation_input_tokens: 0,
-						},
-					},
-				}),
-			},
-			{
-				event: "content_block_start",
-				data: JSON.stringify({
-					type: "content_block_start",
-					index: 0,
-					content_block: { type: "text", text: "" },
-				}),
-			},
-			{
-				event: "content_block_delta",
-				data: JSON.stringify({ type: "content_block_delta", index: 0, delta: { type: "text_delta", text: "ok" } }),
-			},
-			{ event: "content_block_stop", data: JSON.stringify({ type: "content_block_stop", index: 0 }) },
-			{
-				event: "message_delta",
-				data: JSON.stringify({
-					type: "message_delta",
-					delta: { stop_reason: "end_turn" },
-					usage: {
-						input_tokens: 2,
-						output_tokens: 2,
-						cache_read_input_tokens: 0,
-						cache_creation_input_tokens: 0,
-					},
-				}),
-			},
-			{ event: "message_stop", data: JSON.stringify({ type: "message_stop" }) },
-		]);
-
-		const stream = streamAnthropic(model, context, {
-			client: createFakeAnthropicClient(response, (params) => {
-				capturedPayload = params;
-			}),
-		});
-		await stream.result();
-
-		const payload = capturedPayload as {
-			messages: Array<{ role: string; content: Array<{ type: string; text?: string }> }>;
-		};
-		const assistantMessage = payload.messages.find((message) => message.role === "assistant");
-		expect(assistantMessage).toBeDefined();
-		expect(assistantMessage?.content).toEqual([{ type: "text", text: "kept" }]);
-	});
 });
diff --git a/packages/coding-agent/CHANGELOG.md b/packages/coding-agent/CHANGELOG.md
index 5e0f6bc18..5c297f8d6 100644
--- a/packages/coding-agent/CHANGELOG.md
+++ b/packages/coding-agent/CHANGELOG.md
@@ -6,6 +6,8 @@
 
 ### Fixed
 
+- Fixed inherited Anthropic same-model replay to preserve provider-native server tool blocks around signed thinking, avoiding `thinking` / `redacted_thinking` modification errors on follow-up tool-result requests.
+
 ### Changed
 
 ### Removed

From 856288b653503bd199f9ddfdcc736669418db057 Mon Sep 17 00:00:00 2001
From: YeonGyu-Kim <code.yeon.gyu@gmail.com>
Date: Wed, 17 Jun 2026 19:08:43 +0900
Subject: [PATCH 2/2] ci: scope PR530 benchmark suites by package

---
 .github/workflows/task-11-benchmarks.yml | 47 ++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/task-11-benchmarks.yml b/.github/workflows/task-11-benchmarks.yml
index 315ca3519..7618b111d 100644
--- a/.github/workflows/task-11-benchmarks.yml
+++ b/.github/workflows/task-11-benchmarks.yml
@@ -29,8 +29,45 @@ jobs:
           fi
 
           git diff --name-only "${{ github.event.pull_request.base.sha }}" HEAD > /tmp/task-11-changed-files.txt
-          if grep -Eq '^(packages/|bench/|scripts/run-pr530-benchmarks\.mjs$|package\.json$|package-lock\.json$|tsconfig|vitest)' /tmp/task-11-changed-files.txt; then
+          suites=()
+          add_suite() { # append $1 to the global suites array unless it is already listed
+            local suite="$1" existing # keep the loop variable out of the step's global shell scope
+            for existing in "${suites[@]}"; do
+              if [ "$existing" = "$suite" ]; then
+                return 0
+              fi
+            done
+            suites+=("$suite")
+          }
+
+          if grep -Eq '^(bench/|scripts/run-pr530-benchmarks\.mjs$|package\.json$|package-lock\.json$|tsconfig|vitest)' /tmp/task-11-changed-files.txt; then
+            echo "run=true" >> "$GITHUB_OUTPUT"
+            echo "suites=all" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          if grep -Eq '^packages/ai/(src/|test/|bench/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then
+            add_suite "ai-event-stream"
+            add_suite "ai-model-registry"
+          fi
+          if grep -Eq '^packages/tui/(src/|test/|bench/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then
+            add_suite "tui-editor"
+            add_suite "tui-markdown"
+          fi
+          if grep -Eq '^packages/coding-agent/(src/|test/|bench/|examples/|package\.json$|tsconfig)' /tmp/task-11-changed-files.txt; then
+            add_suite "coding-agent-render-transcript"
+            add_suite "coding-agent-bash-output"
+            add_suite "coding-agent-jsonl-parse"
+            add_suite "coding-agent-rpc-event-emit"
+            add_suite "emit-context-clone"
+            add_suite "compaction-trim"
+            add_suite "word-diff"
+          fi
+
+          if [ "${#suites[@]}" -gt 0 ]; then
+            suite_list="$(IFS=,; echo "${suites[*]}")"
             echo "run=true" >> "$GITHUB_OUTPUT"
+            echo "suites=${suite_list}" >> "$GITHUB_OUTPUT"
           else
             echo "run=false" >> "$GITHUB_OUTPUT"
             {
@@ -69,6 +106,7 @@ jobs:
         shell: bash
         run: |
           mkdir -p /tmp/task-11-bench
+          benchmark_suites="${{ steps.scope.outputs.suites }}" # NOTE(review): shell variables do not cross steps; confirm every later use of $benchmark_suites runs in this same step and that the scope step is in this job
 
           if [ -n "${{ github.event.pull_request.base.sha }}" ]; then
             base_ref="${{ github.event.pull_request.base.sha }}"
@@ -81,13 +119,13 @@ jobs:
             cd /tmp/senpi-base
             npm ci --ignore-scripts
             node scripts/run-pr530-benchmarks.mjs \
-              --suite all \
+              --suite "$benchmark_suites" \
               --iterations 30 \
               --json /tmp/task-11-bench/base.json
           )
 
           node scripts/run-pr530-benchmarks.mjs \
-            --suite all \
+            --suite "$benchmark_suites" \
             --iterations 30 \
             --json /tmp/task-11-bench/head.json
 
@@ -148,7 +186,7 @@ jobs:
 
           set +e
           node scripts/run-pr530-benchmarks.mjs \
-            --suite all \
+            --suite "$benchmark_suites" \
             --iterations 30 \
             --baseline bench/baseline/all-baseline.json \
             --allow-regression-pct 0 \
@@ -167,6 +205,7 @@ jobs:
             echo "## Task 11 benchmark gate"
             echo
             echo "- same-run status: pass"
+            echo "- suites: ${benchmark_suites}"
             echo "- same-run median regression tolerance: 5%"
             echo "- same-run absolute median regression tolerance: 0.01ms"
             echo "- static baseline status: ${static_status}"