diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
index 78a14e4c1..ea975ead1 100644
--- a/.github/workflows/eval.yml
+++ b/.github/workflows/eval.yml
@@ -5,13 +5,15 @@ on:
   push:
     branches: [main]
     paths:
-      - "packages/mcp-core/src/tools*"
+      - "packages/mcp-core/src/tools/**"
+      - "packages/mcp-core/src/internal/agents/**"
       - "packages/mcp-server-evals/**"
       - "packages/mcp-server-mocks/**"
       - ".github/workflows/eval.yml"
   pull_request:
     paths:
-      - "packages/mcp-core/src/tools*"
+      - "packages/mcp-core/src/tools/**"
+      - "packages/mcp-core/src/internal/agents/**"
       - "packages/mcp-server-evals/**"
       - "packages/mcp-server-mocks/**"
       - ".github/workflows/eval.yml"
@@ -57,140 +59,17 @@ jobs:
         run: pnpm build
 
       - name: Run evals
-        run: pnpm eval:ci evals
+        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
+        run: pnpm --filter @sentry/mcp-server-evals eval:ci
         continue-on-error: true
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
-      - name: Create eval status check
-        uses: actions/github-script@v7
-        # Skip for fork PRs (no write permissions) but still run for pushes, workflow_dispatch, and same-repo PRs
+      - name: Report eval results
+        uses: getsentry/vitest-evals@v0
         if: ${{ !cancelled() && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }}
-        continue-on-error: true # Don't fail workflow if check creation fails
         with:
-          script: |
-            const fs = require('fs');
-            const path = require('path');
-
-            // Read eval results
-            const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
-            console.log(`Reading eval results from: ${resultsPath}`);
-
-            let vitestResults;
-            try {
-              vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
-            } catch (error) {
-              if (error.code === 'ENOENT') {
-                throw new Error(
-                  `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
-                );
-              }
-              throw new Error(`Failed to read/parse eval results: ${error.message}`);
-            }
-
-            // Extract eval results from vitest format
-            const evalResults = [];
-            for (const testFile of vitestResults.testResults || []) {
-              for (const test of testFile.assertionResults || []) {
-                if (test.meta?.eval) {
-                  evalResults.push({
-                    name: test.fullName || test.title,
-                    file: testFile.name,
-                    avgScore: test.meta.eval.avgScore ?? null,
-                    scores: test.meta.eval.scores || [],
-                    passed: test.status === 'passed',
-                    duration: test.duration,
-                  });
-                }
-              }
-            }
-
-            // Calculate statistics
-            const totalTests = evalResults.length;
-            // Treat null scores as 0.0 for consistent categorization
-            const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0);
-
-            const avgScore = scores.length > 0
-              ? scores.reduce((sum, score) => sum + score, 0) / scores.length
-              : 0;
-
-            const green = scores.filter(s => s >= 0.75).length;
-            const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length;
-            const red = scores.filter(s => s < 0.5).length;
-
-            // Determine conclusion
-            const conclusion = avgScore >= 0.5 ? 'success' : 'failure';
-
-            // Format score helper
-            function formatScore(score) {
-              if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
-              if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
-              return `🔴 ${score.toFixed(2)}`;
-            }
-
-            // Build title
-            const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;
-
-            // Build summary
-            const summary = [
-              `## Overall Statistics`,
-              ``,
-              `- **Total Evaluations**: ${totalTests}`,
-              `- **Average Score**: ${formatScore(avgScore)}`,
-              `- **Pass Threshold**: 0.50 (catastrophic failure)`,
-              ``,
-              `### Score Distribution`,
-              `- 🟢 Green (≥0.75): ${green} evals`,
-              `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
-              `- 🔴 Red (<0.50): ${red} evals`,
-            ].join('\n');
-
-            // Build detailed results
-            const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
-            const details = [
-              `## Individual Eval Scores`,
-              ``,
-              ...detailsByScore.map(result => {
-                const score = result.avgScore !== null ? result.avgScore : 0;
-                const statusIcon = result.passed ? '✅' : '❌';
-                const scoreDisplay = formatScore(score);
-
-                let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;
-
-                // Add rationale for failed or low-scoring tests
-                if (!result.passed || score < 0.75) {
-                  const firstScore = result.scores[0];
-                  if (firstScore?.metadata?.rationale) {
-                    line += `\n   - ${firstScore.metadata.rationale}`;
-                  }
-                }
-
-                return line;
-              }),
-              ``,
-              `---`,
-              ``,
-              `### Conclusion`,
-              ``,
-              conclusion === 'success'
-                ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
-                : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
-            ].join('\n');
-
-            // Create check run
-            await github.rest.checks.create({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'Evaluation Results',
-              head_sha: context.sha,
-              status: 'completed',
-              conclusion: conclusion,
-              output: {
-                title: title,
-                summary: summary,
-                text: details,
-              },
-            });
-
-            console.log(`✅ Check run created with conclusion: ${conclusion}`);
-            console.log(`   Average Score: ${avgScore.toFixed(2)}`);
\ No newline at end of file
+          results: packages/mcp-server-evals/eval-results.json
+          publish-check: true
+          check-name: Evaluation Results
+          fail-on-failures: true
diff --git a/docs/adding-tools.md b/docs/adding-tools.md
index aa8be1e59..1252e43d4 100644
--- a/docs/adding-tools.md
+++ b/docs/adding-tools.md
@@ -255,20 +255,25 @@ See [api-patterns.md](api-patterns.md#mock-patterns) for validation examples.
 **⚠️ Each eval costs time and API credits. Only test core functionality!**
 
 ```typescript
-describeEval("your-tool", {
-  data: async () => [
-    {
-      input: `Primary use case in ${FIXTURES.organizationSlug}`,
-      expected: "Expected response"
-    },
-    // Maximum 2-3 scenarios!
-  ],
-  task: TaskRunner(),
-  scorers: [Factuality()],
-  threshold: 0.6,
-});
+import { describeToolPredictionEval, FIXTURES } from "./utils";
+
+describeToolPredictionEval("your-tool", [
+  {
+    input: `Primary use case in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "your_tool",
+        arguments: { organizationSlug: FIXTURES.organizationSlug },
+      },
+    ],
+  },
+  // Maximum 2-3 scenarios!
+]);
 ```
 
+Use `describeMcpToolCallEval` instead when the eval needs to execute the full
+MCP harness and validate actual tool calls, usage data, and traces.
+
 ## Testing Workflow
 
 ```bash
@@ -279,7 +284,7 @@ pnpm test tools.test
 pnpm inspector
 
 # 3. Run minimal evals
-pnpm eval your-tool
+pnpm --filter @sentry/mcp-server-evals eval your-tool
 ```
 
 ## Checklist
diff --git a/docs/pr-management.md b/docs/pr-management.md
index b5b90f248..2c817e335 100644
--- a/docs/pr-management.md
+++ b/docs/pr-management.md
@@ -184,11 +184,11 @@ datasets: errors, logs, and spans.
 Co-Authored-By: Codex CLI Agent <noreply@openai.com>"
 
 # Bug fix
-git commit -m "fix(evals): update search-events eval to use available exports
+git commit -m "fix(evals): migrate search-events eval to shared harness
 
-Replace missing TaskRunner and Factuality imports with NoOpTaskRunner 
-and ToolPredictionScorer to resolve CI build failures after factuality 
-checker removal.
+Replace bespoke prediction scoring with describeToolPredictionEval so the
+suite uses the shared vitest-evals harness, report metadata, and GitHub check
+output.
 
 Co-Authored-By: Codex CLI Agent <noreply@openai.com>"
 
diff --git a/docs/testing.md b/docs/testing.md
index 827707809..8704fbd60 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -253,23 +253,26 @@ expect(result.timestamp).toMatchInlineSnapshot(); // ❌
 ### Eval Test Structure
 
 ```typescript
-import { describeEval } from "vitest-evals";
-import { TaskRunner, Factuality } from "./utils";
-
-describeEval("tool-name", {
-  data: async () => [
-    {
-      input: "Natural language request",
-      expected: "Expected response content"
-    }
-  ],
-  task: TaskRunner(),      // Uses AI to call tools
-  scorers: [Factuality()], // Validates output
-  threshold: 0.6,
-  timeout: 30000
-});
+import { describeToolPredictionEval, FIXTURES } from "./utils";
+
+describeToolPredictionEval("tool-name", [
+  {
+    input: `Natural language request in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "tool_name",
+        arguments: { organizationSlug: FIXTURES.organizationSlug },
+      },
+    ],
+  },
+]);
 ```
 
+Use `describeToolPredictionEval` for fast tool-selection coverage. Use
+`describeMcpToolCallEval` when the eval must run the full MCP harness and
+capture actual tool calls, usage, and traces. Use `describeSearchAgentEval` for
+embedded search agents that return structured query output.
+
 ### Running Evals
 
 ```bash
@@ -277,9 +280,15 @@ describeEval("tool-name", {
 pnpm eval
 
 # Run specific eval
-pnpm eval tool-name
+pnpm --filter @sentry/mcp-server-evals eval tool-name
+
+# Serve the last JSON report locally
+pnpm eval:report
 ```
 
+Eval runs write `packages/mcp-server-evals/eval-results.json`; CI and the local
+report UI both read that JSON artifact.
+
 ## Test Data Management
 
 ### Using Fixtures
diff --git a/package.json b/package.json
index d26fc15a2..c8cbd4325 100644
--- a/package.json
+++ b/package.json
@@ -27,6 +27,8 @@
     "deploy": "turbo deploy",
     "eval": "dotenv -e .env -e .env.local -- turbo eval",
     "eval:ci": "CI=true dotenv -e .env -e .env.local -- pnpm --stream -r run eval:ci",
+    "eval:report": "pnpm --filter @sentry/mcp-server-evals eval:report",
+    "eval:ui": "pnpm --filter @sentry/mcp-server-evals eval:ui",
     "flue:issue-triage": "flue run issue-triage --target node",
     "format": "biome format --write",
     "lint": "biome lint",
diff --git a/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts b/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts
index 845d46484..1c0139f09 100644
--- a/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts
+++ b/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts
@@ -2,6 +2,7 @@ import {
   generateText,
   Output,
   type Tool,
+  type GenerateTextResult,
   APICallError,
   NoObjectGeneratedError,
   stepCountIs,
@@ -16,9 +17,17 @@ export type ToolCall = {
   args: unknown;
 };
 
+type EmbeddedAgentGenerateResult = GenerateTextResult<
+  Record<string, Tool>,
+  ReturnType<typeof Output.object>
+>;
+
 interface EmbeddedAgentResult<T> {
   result: T;
   toolCalls: ToolCall[];
+  steps?: EmbeddedAgentGenerateResult["steps"];
+  usage?: EmbeddedAgentGenerateResult["usage"];
+  totalUsage?: EmbeddedAgentGenerateResult["totalUsage"];
 }
 
 /**
@@ -54,7 +63,7 @@ export async function callEmbeddedAgent<
       system,
       prompt,
       tools,
-      stopWhen: stepCountIs(5),
+      stopWhen: stepCountIs(7),
       experimental_output: Output.object({ schema }),
       experimental_telemetry: {
         isEnabled: true,
@@ -101,6 +110,9 @@ export async function callEmbeddedAgent<
     return {
       result: parsedResult.data,
       toolCalls: capturedToolCalls,
+      steps: result.steps,
+      usage: result.usage,
+      totalUsage: result.totalUsage,
     };
   } catch (error: unknown) {
     // Rescue NoObjectGeneratedError: try to parse the raw LLM text through the schema
diff --git a/packages/mcp-core/src/skillDefinitions.json b/packages/mcp-core/src/skillDefinitions.json
index 74a6b4060..83ccf9438 100644
--- a/packages/mcp-core/src/skillDefinitions.json
+++ b/packages/mcp-core/src/skillDefinitions.json
@@ -64,7 +64,7 @@
       },
       {
         "name": "get_sentry_resource",
-        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
+        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- breadcrumbs: issue shortId or event ID\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
         "requiredScopes": ["event:read", "project:read"]
       },
       {
@@ -129,7 +129,7 @@
       },
       {
         "name": "get_sentry_resource",
-        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
+        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- breadcrumbs: issue shortId or event ID\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
         "requiredScopes": ["event:read", "project:read"]
       },
       {
@@ -219,7 +219,7 @@
       },
       {
         "name": "get_sentry_resource",
-        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
+        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- breadcrumbs: issue shortId or event ID\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
         "requiredScopes": ["event:read", "project:read"]
       },
       {
@@ -329,7 +329,7 @@
       },
       {
         "name": "get_sentry_resource",
-        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Use the Sentry tool `get_snapshot_image(organizationSlug='<organization_slug>', snapshotId='<snapshot_id>', imageIdentifier='<image_file_name>', imageResolution='full')` for full-resolution image bytes.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
+        "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Use the Sentry tool `get_snapshot_image(organizationSlug='<organization_slug>', snapshotId='<snapshot_id>', imageIdentifier='<image_file_name>', imageResolution='full')` for full-resolution image bytes.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- breadcrumbs: issue shortId or event ID\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
         "requiredScopes": ["event:read", "project:read"]
       },
       {
diff --git a/packages/mcp-core/src/toolDefinitions.json b/packages/mcp-core/src/toolDefinitions.json
index d767e70df..e91775915 100644
--- a/packages/mcp-core/src/toolDefinitions.json
+++ b/packages/mcp-core/src/toolDefinitions.json
@@ -587,7 +587,7 @@
   },
   {
     "name": "get_sentry_resource",
-    "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
+    "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=<image_file_name>: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: <traceId>:<spanId>\n- breadcrumbs: issue shortId or event ID\n- snapshot: <snapshotId>\n- snapshotImage: <snapshotId>:<image_file_name>\n\n<examples>\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n</examples>",
     "inputSchema": {
       "type": "object",
       "properties": {
@@ -613,7 +613,7 @@
         },
         "resourceId": {
           "type": "string",
-          "description": "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, `<snapshotId>:<image_file_name>` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL."
+          "description": "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, issue shortId or event ID for breadcrumbs, `<snapshotId>:<image_file_name>` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL."
         },
         "organizationSlug": {
           "type": "string",
diff --git a/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts b/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts
index d40e00e13..b545cb28f 100644
--- a/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts
+++ b/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts
@@ -529,7 +529,7 @@ export default defineTool({
       "Supports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.",
       "Trace lookups return a condensed overview by default.",
       "",
-      "AI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.",
+      "AI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.",
       "",
       "For preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):",
       "- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)",
@@ -537,12 +537,14 @@ export default defineTool({
       "",
       "Resource IDs:",
       "- span: <traceId>:<spanId>",
+      "- breadcrumbs: issue shortId or event ID",
       "- snapshot: <snapshotId>",
       "- snapshotImage: <snapshotId>:<image_file_name>",
       "",
       "<examples>",
       "get_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')",
       "get_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')",
+      "get_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')",
       "get_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId='<traceId>:<spanId>')",
       "get_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')",
       "get_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')",
@@ -582,7 +584,7 @@ export default defineTool({
       .trim()
       .optional()
       .describe(
-        "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, `<snapshotId>:<image_file_name>` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL.",
+        "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, issue shortId or event ID for breadcrumbs, `<snapshotId>:<image_file_name>` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL.",
       ),
 
     organizationSlug: ParamOrganizationSlug.optional(),
diff --git a/packages/mcp-core/src/tools/support/search-events/agent.ts b/packages/mcp-core/src/tools/support/search-events/agent.ts
index 15acc7d7b..abb3b991f 100644
--- a/packages/mcp-core/src/tools/support/search-events/agent.ts
+++ b/packages/mcp-core/src/tools/support/search-events/agent.ts
@@ -7,6 +7,7 @@ import { createWhoamiTool } from "../../../internal/agents/tools/whoami";
 import { createDatasetAttributesTool } from "./utils";
 import { systemPrompt } from "./config";
 import { PUBLIC_EVENTS_DATASETS } from "../../../utils/events-datasets";
+import type { ToolCall } from "../../../internal/agents/callEmbeddedAgent";
 
 const SEARCH_EVENTS_DATASETS = [...PUBLIC_EVENTS_DATASETS, "replays"] as const;
 
@@ -91,7 +92,7 @@ export async function searchEventsAgent(
   options: SearchEventsAgentOptions,
 ): Promise<{
   result: z.output<typeof searchEventsAgentOutputSchema>;
-  toolCalls: any[];
+  toolCalls: ToolCall[];
 }> {
   // Provider check happens in callEmbeddedAgent via getAgentProvider()
   // Create tools pre-bound with the provided API service and organization
diff --git a/packages/mcp-core/src/tools/support/search-events/config.ts b/packages/mcp-core/src/tools/support/search-events/config.ts
index 3575c3459..408bee95d 100644
--- a/packages/mcp-core/src/tools/support/search-events/config.ts
+++ b/packages/mcp-core/src/tools/support/search-events/config.ts
@@ -228,7 +228,9 @@ CORRECT QUERY PATTERNS (FOLLOW THESE):
 - For field existence: Use has:field_name (NOT field_name IS NOT NULL)
 - For field absence: Use !has:field_name (NOT field_name IS NULL)
 - For time periods: Use timeRange parameter (NOT SQL date functions)
+- For numeric thresholds: Use comparison operators like field:>value, field:<value, field:>=value, or field:<=value (NOT wildcard/string prefixes)
 - Example: "items processed yesterday" → query: "has:item.processed", timeRange: {"statsPeriod": "24h"}
+- Example: "temperature above 0.7" → query: "gen_ai.request.temperature:>0.7"
 
 PROCESS:
 1. Analyze the user's query
@@ -241,6 +243,7 @@ COMMON ERRORS TO AVOID:
 - Using SQL syntax (IS NOT NULL, IS NULL, yesterday(), today(), etc.) - Use has: operator and timeRange instead
 - Using numeric functions (sum, avg, min, max, percentiles) on non-numeric fields
 - Using incorrect field names (use the otelSemantics tool to look up correct names)
+- Approximating numeric thresholds with wildcard strings (use field:>value or field:<value comparisons)
 - Missing required fields in the fields array for aggregate queries
 - Invalid sort parameter not included in fields array
 - Putting replay environment filters inside the replay query instead of the separate environment field
@@ -271,6 +274,7 @@ export const NUMERIC_FIELDS: Record<string, Set<string>> = {
     "gen_ai.usage.input_tokens",
     "gen_ai.usage.output_tokens",
     "gen_ai.request.max_tokens",
+    "gen_ai.request.temperature",
     // Web Vitals measurements
     "measurements.lcp",
     "measurements.cls",
@@ -329,6 +333,7 @@ export const DATASET_FIELDS = {
     "gen_ai.provider.name": "AI provider name (e.g., anthropic, openai)",
     "gen_ai.request.model": "Model name (e.g., claude-3-5-sonnet-20241022)",
     "gen_ai.operation.name": "Operation type (e.g., chat, completion)",
+    "gen_ai.request.temperature": "LLM sampling temperature (numeric)",
     "gen_ai.usage.input_tokens": "Number of input tokens (numeric)",
     "gen_ai.usage.output_tokens": "Number of output tokens (numeric)",
     "gen_ai.tool.name": "Tool name (e.g., search_issues, search_events)",
@@ -584,6 +589,21 @@ export const DATASET_EXAMPLES: Record<
         sort: "-sum(gen_ai.usage.input_tokens)",
       },
     },
+    {
+      description: "LLM calls where temperature is above 0.7",
+      output: {
+        query: "gen_ai.request.temperature:>0.7",
+        fields: [
+          "gen_ai.request.model",
+          "gen_ai.request.temperature",
+          "gen_ai.operation.name",
+          "span.duration",
+          "timestamp",
+          "trace",
+        ],
+        sort: "-span.duration",
+      },
+    },
     {
       description: "top MCP tool calls by usage",
       output: {
diff --git a/packages/mcp-core/src/tools/support/search-issue-events/agent.ts b/packages/mcp-core/src/tools/support/search-issue-events/agent.ts
index 37991efcf..5a1885689 100644
--- a/packages/mcp-core/src/tools/support/search-issue-events/agent.ts
+++ b/packages/mcp-core/src/tools/support/search-issue-events/agent.ts
@@ -1,5 +1,6 @@
 import { z } from "zod";
 import { callEmbeddedAgent } from "../../../internal/agents/callEmbeddedAgent";
+import type { ToolCall } from "../../../internal/agents/callEmbeddedAgent";
 import type { SentryApiService } from "../../../api-client";
 import { createWhoamiTool } from "../../../internal/agents/tools/whoami";
 import { createIssueEventFieldsTool } from "./utils";
@@ -76,7 +77,7 @@ export async function searchIssueEventsAgent(
   options: SearchIssueEventsAgentOptions,
 ): Promise<{
   result: z.output<typeof searchIssueEventsAgentOutputSchema>;
-  toolCalls: any[];
+  toolCalls: ToolCall[];
 }> {
   // Provider check happens in callEmbeddedAgent via getAgentProvider()
   // Create tools pre-bound with the provided API service and organization
diff --git a/packages/mcp-core/src/tools/support/search-issues/agent.ts b/packages/mcp-core/src/tools/support/search-issues/agent.ts
index 75f5967c3..34448154d 100644
--- a/packages/mcp-core/src/tools/support/search-issues/agent.ts
+++ b/packages/mcp-core/src/tools/support/search-issues/agent.ts
@@ -1,6 +1,7 @@
 import { z } from "zod";
 import type { SentryApiService } from "../../../api-client";
 import { callEmbeddedAgent } from "../../../internal/agents/callEmbeddedAgent";
+import type { ToolCall } from "../../../internal/agents/callEmbeddedAgent";
 import { createDatasetFieldsTool } from "../../../internal/agents/tools/dataset-fields";
 import { createWhoamiTool } from "../../../internal/agents/tools/whoami";
 import { systemPrompt } from "./config";
@@ -35,7 +36,7 @@ export async function searchIssuesAgent(
   options: SearchIssuesAgentOptions,
 ): Promise<{
   result: z.output<typeof searchIssuesAgentOutputSchema>;
-  toolCalls: any[];
+  toolCalls: ToolCall[];
 }> {
   // Provider check happens in callEmbeddedAgent via getAgentProvider()
   // Create tools pre-bound with the provided API service and organization
diff --git a/packages/mcp-server-evals/README.md b/packages/mcp-server-evals/README.md
index 526af9ee0..7804afdfb 100644
--- a/packages/mcp-server-evals/README.md
+++ b/packages/mcp-server-evals/README.md
@@ -2,6 +2,75 @@
 
 Evaluation helpers and a local mock stdio runner used when developing and validating the Sentry MCP server.
 
+## Running evals
+
+The suite uses the harness-first `vitest-evals` API through repo-local helpers
+in `src/evals/utils`. Keep eval files focused on fixture cases; the helpers
+own harness selection, judges, thresholds, timeouts, usage capture, and traces.
+
+```bash
+# Requires OPENAI_API_KEY in .env or .env.local
+pnpm eval
+
+# Run a single eval file/suite pattern
+pnpm --filter @sentry/mcp-server-evals eval search-issues
+
+# Print expanded tool/output detail in the terminal report
+pnpm --filter @sentry/mcp-server-evals eval:info
+```
+
+Eval runs write `packages/mcp-server-evals/eval-results.json`, which is the
+artifact used by both the local report UI and GitHub Actions.
+
+## Writing evals
+
+Use the smallest helper that exercises the behavior you need:
+
+- `describeToolPredictionEval` for fast prediction suites that ask a model to
+  predict which MCP tools should be called. The harness output is
+  `{ predictedTools, rationale }`; a deterministic judge compares it with
+  `expectedTools`.
+- `describeMcpToolCallEval` for full MCP harness runs through the mock stdio
+  server. Use this when actual tool interception, usage data, and traces matter.
+- `describeSearchAgentEval` for embedded search agents that return structured
+  query output plus captured tool calls.
+
+```typescript
+import { describeToolPredictionEval, FIXTURES } from "./utils";
+
+describeToolPredictionEval("list-projects", [
+  {
+    input: `What projects do I have access to in ${FIXTURES.organizationSlug}?`,
+    expectedTools: [
+      {
+        name: "find_projects",
+        arguments: { organizationSlug: FIXTURES.organizationSlug },
+      },
+    ],
+  },
+]);
+```
+
+## Local report UI
+
+After running evals, open the report UI from the repository root with either shortcut:
+
+```bash
+pnpm eval:report
+pnpm eval:ui
+```
+
+Both commands serve `packages/mcp-server-evals/eval-results.json` with
+`vitest-evals serve`.
+
+## CI reporting
+
+`.github/workflows/eval.yml` emits Vitest JSON and JUnit XML, then uses
+`getsentry/vitest-evals@v0` to publish the GitHub Actions summary,
+annotations, and the `Evaluation Results` check run. The JSON artifact is the
+source of truth because it preserves eval scores and metadata; JUnit is kept
+for tools that expect XML.
+
 ## Mock stdio runner
 
 - Command: `pnpm --filter @sentry/mcp-server-evals start`
diff --git a/packages/mcp-server-evals/package.json b/packages/mcp-server-evals/package.json
index dbc5cf6db..fbdc7641e 100644
--- a/packages/mcp-server-evals/package.json
+++ b/packages/mcp-server-evals/package.json
@@ -11,8 +11,15 @@
     "build": "tsc -b",
     "dev": "tsc -w",
     "start": "tsx src/bin/start-mock-stdio.ts",
-    "eval": "vitest --config=vitest.config.ts",
-    "eval:ci": "vitest run --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml"
+    "test": "vitest run --config=vitest.unit.config.ts",
+    "test:ci": "vitest run --config=vitest.unit.config.ts --reporter=default --reporter=junit --outputFile=tests.junit.xml",
+    "test:watch": "vitest --config=vitest.unit.config.ts",
+    "eval": "vitest run --config=vitest.config.ts --reporter=vitest-evals/reporter --reporter=json --outputFile.json=eval-results.json",
+    "eval:ci": "vitest run --config=vitest.config.ts --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml",
+    "eval:info": "VITEST_EVALS_REPORT_LEVEL=info vitest run --config=vitest.config.ts --reporter=vitest-evals/reporter --reporter=json --outputFile.json=eval-results.json",
+    "eval:report": "vitest-evals serve eval-results.json",
+    "eval:ui": "vitest-evals serve eval-results.json",
+    "eval:watch": "vitest --config=vitest.config.ts"
   },
   "dependencies": {
     "@ai-sdk/mcp": "catalog:",
@@ -22,6 +29,7 @@
     "@sentry/mcp-server": "workspace:*",
     "@sentry/mcp-server-mocks": "workspace:*",
     "@sentry/mcp-server-tsconfig": "workspace:*",
+    "@vitest-evals/harness-ai-sdk": "catalog:",
     "ai": "catalog:",
     "dotenv": "catalog:",
     "msw": "catalog:",
diff --git a/packages/mcp-server-evals/src/evals/autofix.eval.ts b/packages/mcp-server-evals/src/evals/autofix.eval.ts
index d6a4590c8..1400e689f 100644
--- a/packages/mcp-server-evals/src/evals/autofix.eval.ts
+++ b/packages/mcp-server-evals/src/evals/autofix.eval.ts
@@ -1,35 +1,26 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("begin-issue-fix", {
-  data: async () => {
-    return [
+describeToolPredictionEval("begin-issue-fix", [
+  {
+    input: `Whats the status on root causing this issue in Sentry?\n${FIXTURES.testIssueUrl}`,
+    expectedTools: [
       {
-        input: `Whats the status on root causing this issue in Sentry?\n${FIXTURES.testIssueUrl}`,
-        expectedTools: [
-          {
-            name: "analyze_issue_with_seer",
-            arguments: {
-              issueUrl: FIXTURES.testIssueUrl,
-            },
-          },
-        ],
+        name: "analyze_issue_with_seer",
+        arguments: {
+          issueUrl: FIXTURES.testIssueUrl,
+        },
       },
+    ],
+  },
+  {
+    input: `Can you root cause this issue and retrieve the analysis?\n${FIXTURES.testIssueUrl}`,
+    expectedTools: [
       {
-        input: `Can you root cause this issue and retrieve the analysis?\n${FIXTURES.testIssueUrl}`,
-        expectedTools: [
-          {
-            name: "analyze_issue_with_seer",
-            arguments: {
-              issueUrl: FIXTURES.testIssueUrl,
-            },
-          },
-        ],
+        name: "analyze_issue_with_seer",
+        arguments: {
+          issueUrl: FIXTURES.testIssueUrl,
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/create-dsn.eval.ts b/packages/mcp-server-evals/src/evals/create-dsn.eval.ts
index 5fa59f61a..ae146e91c 100644
--- a/packages/mcp-server-evals/src/evals/create-dsn.eval.ts
+++ b/packages/mcp-server-evals/src/evals/create-dsn.eval.ts
@@ -1,26 +1,17 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("create-dsn", {
-  data: async () => {
-    return [
+describeToolPredictionEval("create-dsn", [
+  {
+    input: `Create a new DSN named "Production" for '${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}'`,
+    expectedTools: [
       {
-        input: `Create a new DSN named "Production" for '${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}'`,
-        expectedTools: [
-          {
-            name: "create_dsn",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              projectSlug: FIXTURES.projectSlug,
-              name: "Production",
-            },
-          },
-        ],
+        name: "create_dsn",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          projectSlug: FIXTURES.projectSlug,
+          name: "Production",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/create-project.eval.ts b/packages/mcp-server-evals/src/evals/create-project.eval.ts
index f551c7ded..20258277c 100644
--- a/packages/mcp-server-evals/src/evals/create-project.eval.ts
+++ b/packages/mcp-server-evals/src/evals/create-project.eval.ts
@@ -1,38 +1,29 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("create-project", {
-  data: async () => {
-    return [
+describeToolPredictionEval("create-project", [
+  {
+    input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n<PROJECT_SLUG>\n<SENTRY_DSN>`,
+    expectedTools: [
       {
-        input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n<PROJECT_SLUG>\n<SENTRY_DSN>`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_teams",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-          {
-            name: "create_project",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-              teamSlug: FIXTURES.teamSlug,
-              name: FIXTURES.projectSlug,
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-    ];
+      {
+        name: "find_teams",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+      {
+        name: "create_project",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+          teamSlug: FIXTURES.teamSlug,
+          name: FIXTURES.projectSlug,
+        },
+      },
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/create-team.eval.ts b/packages/mcp-server-evals/src/evals/create-team.eval.ts
index 2a789f505..a109f898d 100644
--- a/packages/mcp-server-evals/src/evals/create-team.eval.ts
+++ b/packages/mcp-server-evals/src/evals/create-team.eval.ts
@@ -1,30 +1,21 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("create-team", {
-  data: async () => {
-    return [
+describeToolPredictionEval("create-team", [
+  {
+    input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called 'the-goats' response with **only** the team slug and no other text.`,
+    expectedTools: [
       {
-        input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called 'the-goats' response with **only** the team slug and no other text.`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "create_team",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              name: "the-goats",
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-    ];
+      {
+        name: "create_team",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          name: "the-goats",
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/get-issue.eval.ts b/packages/mcp-server-evals/src/evals/get-issue.eval.ts
index 03c877c68..4d7efac94 100644
--- a/packages/mcp-server-evals/src/evals/get-issue.eval.ts
+++ b/packages/mcp-server-evals/src/evals/get-issue.eval.ts
@@ -1,55 +1,46 @@
-import { describeEval, ToolCallScorer } from "vitest-evals";
-import { FIXTURES, McpToolCallTaskRunner } from "./utils";
+import { describeMcpToolCallEval, FIXTURES } from "./utils";
 
-describeEval("get-issue", {
-  data: async () => {
-    return [
+describeMcpToolCallEval("get-issue", [
+  {
+    input: `Explain CLOUDFLARE-MCP-41 from Sentry in ${FIXTURES.organizationSlug}.`,
+    expectedTools: [
       {
-        input: `Explain CLOUDFLARE-MCP-41 from Sentry in ${FIXTURES.organizationSlug}.`,
-        expectedTools: [
-          {
-            name: "search_tools",
-            arguments: {
-              query: "issue",
-            },
-          },
-          {
-            name: "execute_tool",
-            arguments: {
-              name: "get_issue_details",
-              arguments: {
-                organizationSlug: FIXTURES.organizationSlug,
-                issueId: "CLOUDFLARE-MCP-41",
-              },
-            },
-          },
-        ],
+        name: "search_tools",
+        arguments: {
+          query: /issue|get_issue_details/,
+        },
       },
       {
-        input: `Explain the event with ID 7ca573c0f4814912aaa9bdc77d1a7d51 from Sentry in ${FIXTURES.organizationSlug}.`,
-        expectedTools: [
-          {
-            name: "search_tools",
-            arguments: {
-              query: "issue",
-            },
+        name: "execute_tool",
+        arguments: {
+          name: "get_issue_details",
+          arguments: {
+            organizationSlug: FIXTURES.organizationSlug,
+            issueId: "CLOUDFLARE-MCP-41",
           },
-          {
-            name: "execute_tool",
-            arguments: {
-              name: "get_issue_details",
-              arguments: {
-                organizationSlug: FIXTURES.organizationSlug,
-                eventId: "7ca573c0f4814912aaa9bdc77d1a7d51",
-              },
-            },
+        },
+      },
+    ],
+  },
+  {
+    input: `Explain the event with ID 7ca573c0f4814912aaa9bdc77d1a7d51 from Sentry in ${FIXTURES.organizationSlug}.`,
+    expectedTools: [
+      {
+        name: "search_tools",
+        arguments: {
+          query: /issue|event|get_issue_details/,
+        },
+      },
+      {
+        name: "execute_tool",
+        arguments: {
+          name: "get_issue_details",
+          arguments: {
+            organizationSlug: FIXTURES.organizationSlug,
+            eventId: "7ca573c0f4814912aaa9bdc77d1a7d51",
           },
-        ],
+        },
       },
-    ];
+    ],
   },
-  task: McpToolCallTaskRunner(),
-  scorers: [ToolCallScorer({ ordered: true, params: "fuzzy" })],
-  threshold: 0.6,
-  timeout: 90000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts b/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts
index 42437788e..625deabe8 100644
--- a/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts
+++ b/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts
@@ -1,60 +1,51 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("get-sentry-resource", {
-  data: async () => {
-    return [
+describeToolPredictionEval("get-sentry-resource", [
+  {
+    input: `What's happening in this Sentry issue? ${FIXTURES.issueUrl}`,
+    expectedTools: [
       {
-        input: `What's happening in this Sentry issue? ${FIXTURES.issueUrl}`,
-        expectedTools: [
-          {
-            name: "get_sentry_resource",
-            arguments: {
-              url: FIXTURES.issueUrl,
-            },
-          },
-        ],
+        name: "get_sentry_resource",
+        arguments: {
+          url: FIXTURES.issueUrl,
+        },
       },
+    ],
+  },
+  {
+    input: `Show me the breadcrumbs for ${FIXTURES.issueUrl}`,
+    expectedTools: [
       {
-        input: `Show me the breadcrumbs for ${FIXTURES.issueUrl}`,
-        expectedTools: [
-          {
-            name: "get_sentry_resource",
-            arguments: {
-              url: FIXTURES.issueUrl,
-              resourceType: "breadcrumbs",
-            },
-          },
-        ],
+        name: "get_sentry_resource",
+        arguments: {
+          url: FIXTURES.issueUrl,
+          resourceType: "breadcrumbs",
+        },
       },
+    ],
+  },
+  {
+    input: `Fetch the breadcrumbs for issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}.`,
+    expectedTools: [
       {
-        input: `Fetch the breadcrumbs for issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}.`,
-        expectedTools: [
-          {
-            name: "get_sentry_resource",
-            arguments: {
-              resourceType: "breadcrumbs",
-              organizationSlug: FIXTURES.organizationSlug,
-              resourceId: FIXTURES.issueId,
-            },
-          },
-        ],
+        name: "get_sentry_resource",
+        arguments: {
+          resourceType: "breadcrumbs",
+          organizationSlug: FIXTURES.organizationSlug,
+          resourceId: FIXTURES.issueId,
+        },
       },
+    ],
+  },
+  {
+    input: `Show me what happened in this trace: ${FIXTURES.traceUrl}`,
+    expectedTools: [
       {
-        input: `Show me what happened in this trace: ${FIXTURES.traceUrl}`,
-        expectedTools: [
-          {
-            name: "get_sentry_resource",
-            arguments: {
-              url: FIXTURES.traceUrl,
-            },
-          },
-        ],
+        name: "get_sentry_resource",
+        arguments: {
+          url: FIXTURES.traceUrl,
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts b/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts
index 86678bed4..61d82e7f8 100644
--- a/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts
+++ b/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts
@@ -1,55 +1,46 @@
-import { describeEval, ToolCallScorer } from "vitest-evals";
-import { FIXTURES, McpToolCallTaskRunner } from "./utils";
+import { describeMcpToolCallEval, FIXTURES } from "./utils";
 
-describeEval("get-trace-details", {
-  data: async () => {
-    return [
+describeMcpToolCallEval("get-trace-details", [
+  {
+    input: `Show me trace ${FIXTURES.traceId} from Sentry in ${FIXTURES.organizationSlug}.`,
+    expectedTools: [
       {
-        input: `Show me trace ${FIXTURES.traceId} from Sentry in ${FIXTURES.organizationSlug}.`,
-        expectedTools: [
-          {
-            name: "search_tools",
-            arguments: {
-              query: "trace",
-            },
-          },
-          {
-            name: "execute_tool",
-            arguments: {
-              name: "get_trace_details",
-              arguments: {
-                organizationSlug: FIXTURES.organizationSlug,
-                traceId: FIXTURES.traceId,
-              },
-            },
-          },
-        ],
+        name: "search_tools",
+        arguments: {
+          query: "trace",
+        },
       },
       {
-        input: `Explain trace ${FIXTURES.traceId} in ${FIXTURES.organizationSlug}.`,
-        expectedTools: [
-          {
-            name: "search_tools",
-            arguments: {
-              query: "trace",
-            },
+        name: "execute_tool",
+        arguments: {
+          name: "get_trace_details",
+          arguments: {
+            organizationSlug: FIXTURES.organizationSlug,
+            traceId: FIXTURES.traceId,
           },
-          {
-            name: "execute_tool",
-            arguments: {
-              name: "get_trace_details",
-              arguments: {
-                organizationSlug: FIXTURES.organizationSlug,
-                traceId: FIXTURES.traceId,
-              },
-            },
+        },
+      },
+    ],
+  },
+  {
+    input: `Explain trace ${FIXTURES.traceId} in ${FIXTURES.organizationSlug}.`,
+    expectedTools: [
+      {
+        name: "search_tools",
+        arguments: {
+          query: "trace",
+        },
+      },
+      {
+        name: "execute_tool",
+        arguments: {
+          name: "get_trace_details",
+          arguments: {
+            organizationSlug: FIXTURES.organizationSlug,
+            traceId: FIXTURES.traceId,
           },
-        ],
+        },
       },
-    ];
+    ],
   },
-  task: McpToolCallTaskRunner(),
-  scorers: [ToolCallScorer({ ordered: true, params: "fuzzy" })],
-  threshold: 0.6,
-  timeout: 90000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/list-dsns.eval.ts b/packages/mcp-server-evals/src/evals/list-dsns.eval.ts
index ad9341666..84103a5b8 100644
--- a/packages/mcp-server-evals/src/evals/list-dsns.eval.ts
+++ b/packages/mcp-server-evals/src/evals/list-dsns.eval.ts
@@ -1,25 +1,16 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("list-dsns", {
-  data: async () => {
-    return [
+describeToolPredictionEval("list-dsns", [
+  {
+    input: `What is the SENTRY_DSN for ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}?`,
+    expectedTools: [
       {
-        input: `What is the SENTRY_DSN for ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}?`,
-        expectedTools: [
-          {
-            name: "find_dsns",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              projectSlug: FIXTURES.projectSlug,
-            },
-          },
-        ],
+        name: "find_dsns",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          projectSlug: FIXTURES.projectSlug,
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/list-issues.eval.ts b/packages/mcp-server-evals/src/evals/list-issues.eval.ts
index 64295d64c..377ea66cc 100644
--- a/packages/mcp-server-evals/src/evals/list-issues.eval.ts
+++ b/packages/mcp-server-evals/src/evals/list-issues.eval.ts
@@ -1,94 +1,85 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("list-issues", {
-  data: async () => {
-    return [
+describeToolPredictionEval("list-issues", [
+  {
+    input: `What are the most common production errors in ${FIXTURES.organizationSlug}?`,
+    expectedTools: [
       {
-        input: `What are the most common production errors in ${FIXTURES.organizationSlug}?`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "is:unresolved",
-              sort: "freq",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
       {
-        input: `Show me the top issues in ${FIXTURES.organizationSlug} organization`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              sort: "freq",
-            },
-          },
-        ],
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "is:unresolved",
+          sort: "freq",
+        },
       },
+    ],
+  },
+  {
+    input: `Show me the top issues in ${FIXTURES.organizationSlug} organization`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          sort: "freq",
+        },
+      },
+    ],
+  },
+  {
+    input: `What are the most recent issues in ${FIXTURES.organizationSlug}?`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          sort: "date",
+        },
+      },
+    ],
+  },
+  {
+    input: `Find the newest production issues in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
       {
-        input: `What are the most recent issues in ${FIXTURES.organizationSlug}?`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              sort: "date",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
       {
-        input: `Find the newest production issues in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              sort: "new",
-            },
-          },
-        ],
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          sort: "new",
+        },
+      },
+    ],
+  },
+  {
+    input: `What issues is david@sentry.io experiencing in ${FIXTURES.organizationSlug}?`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
       },
       {
-        input: `What issues is david@sentry.io experiencing in ${FIXTURES.organizationSlug}?`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "user.email:david@sentry.io",
-            },
-          },
-        ],
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "user.email:david@sentry.io",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/list-organizations.eval.ts b/packages/mcp-server-evals/src/evals/list-organizations.eval.ts
index 826e53402..f5238fd39 100644
--- a/packages/mcp-server-evals/src/evals/list-organizations.eval.ts
+++ b/packages/mcp-server-evals/src/evals/list-organizations.eval.ts
@@ -1,22 +1,13 @@
-import { describeEval } from "vitest-evals";
-import { NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval } from "./utils";
 
-describeEval("list-organizations", {
-  data: async () => {
-    return [
+describeToolPredictionEval("list-organizations", [
+  {
+    input: `What organizations do I have access to in Sentry`,
+    expectedTools: [
       {
-        input: `What organizations do I have access to in Sentry`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/list-projects.eval.ts b/packages/mcp-server-evals/src/evals/list-projects.eval.ts
index 50c698034..e98cfccaf 100644
--- a/packages/mcp-server-evals/src/evals/list-projects.eval.ts
+++ b/packages/mcp-server-evals/src/evals/list-projects.eval.ts
@@ -1,29 +1,20 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("list-projects", {
-  data: async () => {
-    return [
+describeToolPredictionEval("list-projects", [
+  {
+    input: `What projects do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
+    expectedTools: [
       {
-        input: `What projects do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_projects",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-    ];
+      {
+        name: "find_projects",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/list-releases.eval.ts b/packages/mcp-server-evals/src/evals/list-releases.eval.ts
index bba7d48da..7c1972896 100644
--- a/packages/mcp-server-evals/src/evals/list-releases.eval.ts
+++ b/packages/mcp-server-evals/src/evals/list-releases.eval.ts
@@ -1,53 +1,44 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("list-releases", {
-  data: async () => {
-    return [
+describeToolPredictionEval("list-releases", [
+  {
+    input: `Show me the releases in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
       {
-        input: `Show me the releases in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_releases",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
       {
-        input: `Show me a list of versions in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_projects",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-          {
-            name: "find_releases",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              projectSlug: FIXTURES.projectSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_releases",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+  {
+    input: `Show me a list of versions in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "find_projects",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+      {
+        name: "find_releases",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          projectSlug: FIXTURES.projectSlug,
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
+  },
+]);
diff --git a/packages/mcp-server-evals/src/evals/list-tags.eval.ts b/packages/mcp-server-evals/src/evals/list-tags.eval.ts
index 3470c83c8..fee738162 100644
--- a/packages/mcp-server-evals/src/evals/list-tags.eval.ts
+++ b/packages/mcp-server-evals/src/evals/list-tags.eval.ts
@@ -1,29 +1,22 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("list-tags", {
-  data: async () => {
-    return [
+describeToolPredictionEval("get-issue-tag-values", [
+  {
+    input: `What are common values for the url tag on issue CLOUDFLARE-MCP-41 in ${FIXTURES.organizationSlug}?`,
+    expectedTools: [
       {
-        input: `What are common tags in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_tags",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-    ];
+      {
+        name: "get_issue_tag_values",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+          issueId: "CLOUDFLARE-MCP-41",
+          tagKey: "url",
+        },
+      },
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/list-teams.eval.ts b/packages/mcp-server-evals/src/evals/list-teams.eval.ts
index 3e598dbe0..d28f329a1 100644
--- a/packages/mcp-server-evals/src/evals/list-teams.eval.ts
+++ b/packages/mcp-server-evals/src/evals/list-teams.eval.ts
@@ -1,61 +1,52 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("list-teams", {
-  data: async () => {
-    return [
+describeToolPredictionEval("list-teams", [
+  {
+    input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
+    expectedTools: [
       {
-        input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_teams",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
       {
-        input: `Do I have access to the team '${FIXTURES.teamSlug}' for '${FIXTURES.organizationSlug}'`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_teams",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_teams",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
+  },
+  {
+    input: `Do I have access to the team '${FIXTURES.teamSlug}' for '${FIXTURES.organizationSlug}'`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "find_teams",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
+  },
+  {
+    input: `Do I have access to the team 'an-imaginary-team' for '${FIXTURES.organizationSlug}'`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
       },
       {
-        input: `Do I have access to the team 'an-imaginary-team' for '${FIXTURES.organizationSlug}'`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "find_teams",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_teams",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          regionUrl: "https://us.sentry.io",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/search-docs.eval.ts b/packages/mcp-server-evals/src/evals/search-docs.eval.ts
index 2d9454dca..b7cbdb817 100644
--- a/packages/mcp-server-evals/src/evals/search-docs.eval.ts
+++ b/packages/mcp-server-evals/src/evals/search-docs.eval.ts
@@ -1,51 +1,42 @@
-import { describeEval } from "vitest-evals";
-import { NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval } from "./utils";
 
-describeEval("search-docs", {
-  data: async () => {
-    return [
+describeToolPredictionEval("search-docs", [
+  {
+    input:
+      "I need documentation on how to set up error tracking with Sentry in JavaScript",
+    expectedTools: [
       {
-        input:
-          "I need documentation on how to set up error tracking with Sentry in JavaScript",
-        expectedTools: [
-          {
-            name: "search_docs",
-            arguments: {
-              query: "set up error tracking JavaScript",
-              maxResults: 3,
-            },
-          },
-        ],
+        name: "search_docs",
+        arguments: {
+          query: "set up error tracking JavaScript",
+          maxResults: 3,
+        },
       },
+    ],
+  },
+  {
+    input:
+      "I need help configuring Sentry with React components and error boundaries",
+    expectedTools: [
       {
-        input:
-          "I need help configuring Sentry with React components and error boundaries",
-        expectedTools: [
-          {
-            name: "search_docs",
-            arguments: {
-              query: "React components error boundaries",
-              maxResults: 3,
-            },
-          },
-        ],
+        name: "search_docs",
+        arguments: {
+          query: "React components error boundaries",
+          maxResults: 3,
+        },
       },
+    ],
+  },
+  {
+    input: "What is Sentry's rate limiting and how does it work?",
+    expectedTools: [
       {
-        input: "What is Sentry's rate limiting and how does it work?",
-        expectedTools: [
-          {
-            name: "search_docs",
-            arguments: {
-              query: "rate limiting",
-              maxResults: 3,
-            },
-          },
-        ],
+        name: "search_docs",
+        arguments: {
+          query: "rate limiting",
+          maxResults: 3,
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts b/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts
index 9ca562017..d7786d1cc 100644
--- a/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts
+++ b/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts
@@ -1,227 +1,198 @@
-import { describeEval } from "vitest-evals";
-import { ToolCallScorer } from "vitest-evals";
-import { searchEventsAgent } from "@sentry/mcp-core/tools/search-events/agent";
-import { SentryApiService } from "@sentry/mcp-core/api-client";
-import { StructuredOutputScorer } from "./utils/structuredOutputScorer";
+import { describeSearchAgentEval, searchEventsAgentHarness } from "./utils";
 import "../setup-env";
 
 // The shared MSW server is already started in setup-env.ts
 
-describeEval("search-events-agent", {
-  data: async () => {
-    return [
-      {
-        // Simple query with common fields - should NOT require tool calls
-        input: "Show me all errors from today",
-        expectedTools: [],
-        expected: {
-          dataset: "errors",
-          query: "", // No filters, just time range
-          sort: "-timestamp",
-          timeRange: { statsPeriod: "24h" },
-        },
-      },
-      {
-        // Query with "me" reference - should only require whoami
-        input: "Show me my errors from last week",
-        expectedTools: [
-          {
-            name: "whoami",
-            arguments: {},
-          },
-        ],
-        expected: {
-          dataset: "errors",
-          query: /user\.email:test@example\.com|user\.id:123456/, // Can be either
-          sort: "-timestamp",
-          timeRange: { statsPeriod: "7d" },
-        },
-      },
+// biome-ignore format: keep the long eval case list diff stable.
+describeSearchAgentEval("search-events-agent", searchEventsAgentHarness, [
+  {
+    // Simple query with common fields - should NOT require tool calls
+    input: "Show me all errors from today",
+    expectedTools: [],
+    expected: {
+      dataset: "errors",
+      query: "", // No filters, just time range
+      sort: "-timestamp",
+      timeRange: { statsPeriod: "24h" },
+    },
+  },
+  {
+    // Query with "me" reference may use direct Sentry syntax or resolve whoami.
+    input: "Show me my errors from last week",
+    expectedTools: [],
+    expected: {
+      dataset: "errors",
+      query:
+        /assignedTo:me|user\.email:"?test@example\.com"?|user\.id:"?123456"?/, // Can be direct shorthand or resolved identity
+      sort: "-timestamp",
+      timeRange: { statsPeriod: "7d" },
+    },
+  },
+  {
+    // Common performance query - should NOT require tool calls
+    input: "Show me slow API calls taking more than 1 second",
+    expectedTools: [],
+    expected: {
+      dataset: "spans",
+      query: /span\.duration:>1000|span\.duration:>1s/, // Can express as ms or seconds
+      sort: "-span.duration",
+    },
+  },
+  {
+    // Query with OpenTelemetry attributes that need discovery
+    input: "Show me LLM calls where temperature setting is above 0.7",
+    expectedTools: [
       {
-        // Common performance query - should NOT require tool calls
-        input: "Show me slow API calls taking more than 1 second",
-        expectedTools: [],
-        expected: {
+        name: "datasetAttributes",
+        arguments: {
           dataset: "spans",
-          query: /span\.duration:>1000|span\.duration:>1s/, // Can express as ms or seconds
-          sort: "-span.duration",
         },
       },
       {
-        // Query with OpenTelemetry attributes that need discovery
-        input: "Show me LLM calls where temperature setting is above 0.7",
-        expectedTools: [
-          {
-            name: "datasetAttributes",
-            arguments: {
-              dataset: "spans",
-            },
-          },
-          {
-            name: "otelSemantics",
-            arguments: {
-              namespace: "gen_ai",
-              dataset: "spans",
-            },
-          },
-        ],
-        expected: {
+        name: "otelSemantics",
+        arguments: {
+          namespace: "gen_ai",
           dataset: "spans",
-          query: "gen_ai.request.temperature:>0.7",
-          sort: "-span.duration",
         },
       },
+    ],
+    expected: {
+      dataset: "spans",
+      query: /gen_ai\.request\.temperature:>0\.7/,
+      sort: "-span.duration",
+    },
+  },
+  {
+    // Query with custom field requiring discovery
+    input: "Find errors with custom.payment.processor field",
+    expectedTools: [
       {
-        // Query with custom field requiring discovery
-        input: "Find errors with custom.payment.processor field",
-        expectedTools: [
-          {
-            name: "datasetAttributes",
-            arguments: {
-              dataset: "errors",
-            },
-          },
-        ],
-        expected: {
+        name: "datasetAttributes",
+        arguments: {
           dataset: "errors",
-          query: "has:custom.payment.processor",
-          sort: "-timestamp",
         },
       },
+    ],
+    expected: {
+      dataset: "errors",
+      query:
+        /has:custom\.payment\.processor|has:tags\[custom\.payment\.processor\]/,
+      sort: "-timestamp",
+    },
+  },
+  {
+    // Query with custom field requiring discovery
+    input: "Show me spans where custom.db.pool_size is greater than 10",
+    expectedTools: [
       {
-        // Query with custom field requiring discovery
-        input: "Show me spans where custom.db.pool_size is greater than 10",
-        expectedTools: [
-          {
-            name: "datasetAttributes",
-            arguments: {
-              dataset: "spans",
-            },
-          },
-        ],
-        expected: {
+        name: "datasetAttributes",
+        arguments: {
           dataset: "spans",
-          query: "custom.db.pool_size:>10",
-          sort: "-span.duration",
         },
       },
+    ],
+    expected: {
+      dataset: "spans",
+      query: /custom\.db\.pool_size:>10|has:custom\.db\.pool_size/,
+      sort: /-span\.duration|-custom\.db\.pool_size|-timestamp/,
+    },
+  },
+  {
+    // User-supplied Sentry syntax should remain authoritative. The agent
+    // can validate fields, but it should not rewrite or drop explicit
+    // filters/fields while translating the request.
+    input:
+      'In spans, search for transaction:"VPN connections" tags[type]:Unified tags[country]:CN over the last 7 days. Return tags[type], tags[sequence], and count(), sorted by count descending.',
+    expectedTools: [
       {
-        // User-supplied Sentry syntax should remain authoritative. The agent
-        // can validate fields, but it should not rewrite or drop explicit
-        // filters/fields while translating the request.
-        input:
-          'In spans, search for transaction:"VPN connections" tags[type]:Unified tags[country]:CN over the last 7 days. Return tags[type], tags[sequence], and count(), sorted by count descending.',
-        expectedTools: [
-          {
-            name: "datasetAttributes",
-          },
-        ],
-        expected: {
-          dataset: "spans",
-          query: (value: unknown) =>
-            typeof value === "string" &&
-            [
-              'transaction:"VPN connections"',
-              "tags[type]:Unified",
-              "tags[country]:CN",
-            ].every((token) => value.includes(token)),
-          fields: (value: unknown) =>
-            Array.isArray(value) &&
-            ["tags[type]", "tags[sequence]", "count()"].every((field) =>
-              value.includes(field),
-            ),
-          sort: "-count()",
-          timeRange: { statsPeriod: "7d" },
-        },
+        name: "datasetAttributes",
       },
+    ],
+    expected: {
+      dataset: "spans",
+      query: (value: unknown) =>
+        typeof value === "string" &&
+        [
+          'transaction:"VPN connections"',
+          "tags[type]:Unified",
+          "tags[country]:CN",
+        ].every((token) => value.includes(token)),
+      fields: (value: unknown) =>
+        Array.isArray(value) &&
+        ["tags[type]", "tags[sequence]", "count()"].every((field) =>
+          value.includes(field),
+        ),
+      sort: "-count()",
+      timeRange: { statsPeriod: "7d" },
+    },
+  },
+  {
+    // Query requiring equation field calculation
+    input: "How many total tokens did we consume yesterday",
+    expectedTools: [
       {
-        // Query requiring equation field calculation
-        input: "How many total tokens did we consume yesterday",
-        expectedTools: [
-          {
-            name: "datasetAttributes",
-            arguments: {
-              dataset: "spans",
-            },
-          },
-          // Agent may find gen_ai fields and use them for calculation
-        ],
-        expected: {
+        name: "datasetAttributes",
+        arguments: {
           dataset: "spans",
-          // For aggregations, query filter is optional - empty query gets all spans
-          query: /^$|has:gen_ai\.usage\.(input_tokens|output_tokens)/,
-          // Equation to sum both token types
-          fields: [
-            "equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)",
-          ],
-          // Sort by the equation result in descending order
-          sort: "-equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)",
-          timeRange: { statsPeriod: "24h" },
-        },
-      },
-      {
-        // Query that tests sort field self-correction
-        // Agent should self-correct by adding count() to fields when sorting by it
-        input: "Show me the top 10 most frequent error types",
-        expectedTools: [],
-        expected: {
-          dataset: "errors",
-          query: "", // No specific filter, just aggregate all errors
-          // Agent should include count() in fields since we're sorting by it
-          fields: ["error.type", "count()"],
-          // Sort by count in descending order to get "most frequent"
-          sort: "-count()",
-          // timeRange can be null or have a default period
         },
       },
+      // Agent may find gen_ai fields and use them for calculation
+    ],
+    expected: {
+      dataset: "spans",
+      // For aggregations, query filter is optional - empty query gets all spans
+      query: /^$|has:gen_ai\.usage\.(input_tokens|output_tokens)/,
+      // Equation to sum both token types
+      fields: [
+        "equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)",
+      ],
+      // Sort by the equation result in descending order
+      sort: "-equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)",
+      timeRange: { statsPeriod: "24h" },
+    },
+  },
+  {
+    // Query that tests sort field self-correction
+    // Agent should self-correct by adding count() to fields when sorting by it
+    input: "Show me the top 10 most frequent error types",
+    expectedTools: [],
+    expected: {
+      dataset: "errors",
+      // Empty query is ideal, but filtering to rows with error.type is also a
+      // valid way to protect the grouping field.
+      query: /^$|has:error\.type/,
+      // Agent should include count() in fields since we're sorting by it
+      fields: ["error.type", "count()"],
+      // Sort by count in descending order to get "most frequent"
+      sort: "-count()",
+      // timeRange can be null or have a default period
+    },
+  },
+  {
+    // Complex aggregate query that tests sort field self-correction
+    // Agent should self-correct by including avg(span.duration) in fields
+    input:
+      "Show me database operations grouped by type, sorted by average duration",
+    expectedTools: [
       {
-        // Complex aggregate query that tests sort field self-correction
-        // Agent should self-correct by including avg(span.duration) in fields
-        input:
-          "Show me database operations grouped by type, sorted by average duration",
-        expectedTools: [
-          {
-            name: "datasetAttributes",
-            arguments: {
-              dataset: "spans",
-            },
-          },
-        ],
-        expected: {
+        name: "datasetAttributes",
+        arguments: {
           dataset: "spans",
-          query: "has:db.operation",
-          // Agent must include avg(span.duration) since we're sorting by it
-          // Use db.operation as the grouping field (span.op is deprecated)
-          fields: ["db.operation", "avg(span.duration)"],
-          // Sort by average duration
-          sort: "-avg(span.duration)",
-          // timeRange is optional
         },
       },
-    ];
-  },
-  task: async (input) => {
-    // Create a real API service that will use MSW mocks
-    const apiService = new SentryApiService({
-      accessToken: "test-token",
-    });
-
-    const agentResult = await searchEventsAgent({
-      query: input,
-      organizationSlug: "sentry-mcp-evals",
-      apiService,
-    });
-
-    return {
-      result: JSON.stringify(agentResult.result),
-      toolCalls: agentResult.toolCalls.map((call: any) => ({
-        name: call.toolName,
-        arguments: call.args,
-      })),
-    };
+    ],
+    expected: {
+      dataset: "spans",
+      query: /has:db\.operation|has:db\.system/,
+      // Agent must include avg(span.duration) since we're sorting by it
+      // Group by db.operation or db.system (span.op is deprecated)
+      fields: (value: unknown) =>
+        Array.isArray(value) &&
+        ["avg(span.duration)"].every((field) => value.includes(field)) &&
+        (value.includes("db.operation") || value.includes("db.system")),
+      // Sort by average duration
+      sort: "-avg(span.duration)",
+      // timeRange is optional
+    },
   },
-  scorers: [
-    ToolCallScorer(), // Validates tool calls
-    StructuredOutputScorer({ match: "fuzzy" }), // Validates the structured query output with flexible matching
-  ],
-});
+], { timeout: 180000 });
diff --git a/packages/mcp-server-evals/src/evals/search-events.eval.ts b/packages/mcp-server-evals/src/evals/search-events.eval.ts
index 79f06d2dd..b00152389 100644
--- a/packages/mcp-server-evals/src/evals/search-events.eval.ts
+++ b/packages/mcp-server-evals/src/evals/search-events.eval.ts
@@ -1,110 +1,101 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
 // Note: This eval requires OPENAI_API_KEY to be set in the environment
 // The search_events tool uses the AI SDK to translate natural language queries
-describeEval("search-events", {
-  data: async () => {
-    return [
-      // Core test: Basic error event search
+describeToolPredictionEval("search-events", [
+  // Core test: Basic error event search
+  {
+    input: `Find database timeouts in ${FIXTURES.organizationSlug} from the last week`,
+    expectedTools: [
       {
-        input: `Find database timeouts in ${FIXTURES.organizationSlug} from the last week`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "database timeouts from the last week",
-              dataset: "errors",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Core test: Performance spans search
       {
-        input: `Find slow API calls taking over 5 seconds in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "slow API calls taking over 5 seconds",
-              dataset: "spans",
-            },
-          },
-        ],
+        name: "search_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "database timeouts from the last week",
+          dataset: "errors",
+        },
       },
-      // Core test: Logs search
+    ],
+  },
+  // Core test: Performance spans search
+  {
+    input: `Find slow API calls taking over 5 seconds in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "slow API calls taking over 5 seconds",
+          dataset: "spans",
+        },
+      },
+    ],
+  },
+  // Core test: Logs search
+  {
+    input: `Show me error logs from the last hour in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "error logs from the last hour",
+          dataset: "logs",
+        },
+      },
+    ],
+  },
+  // Core test: Project-specific search
+  {
+    input: `Show me authentication errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          projectSlug: FIXTURES.projectSlug,
+          query: "authentication errors",
+          dataset: "errors",
+        },
+      },
+    ],
+  },
+  // Core test: Search with 'me' reference
+  {
+    input: `Show me errors affecting me in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
       {
-        input: `Show me error logs from the last hour in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "error logs from the last hour",
-              dataset: "logs",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Core test: Project-specific search
       {
-        input: `Show me authentication errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              projectSlug: FIXTURES.projectSlug,
-              query: "authentication errors",
-              dataset: "errors",
-            },
-          },
-        ],
+        name: "whoami",
+        arguments: {},
       },
-      // Core test: Search with 'me' reference
       {
-        input: `Show me errors affecting me in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "whoami",
-            arguments: {},
-          },
-          {
-            name: "search_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "errors affecting user.id:12345",
-              dataset: "errors",
-            },
-          },
-        ],
+        name: "search_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "errors affecting user.id:12345",
+          dataset: "errors",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts b/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts
index 7e32c449f..4514502ed 100644
--- a/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts
+++ b/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts
@@ -1,128 +1,98 @@
-import { describeEval } from "vitest-evals";
-import { ToolCallScorer } from "vitest-evals";
-import { searchIssueEventsAgent } from "@sentry/mcp-core/tools/search-issue-events/agent";
-import { SentryApiService } from "@sentry/mcp-core/api-client";
-import { StructuredOutputScorer } from "./utils/structuredOutputScorer";
+import {
+  describeSearchAgentEval,
+  searchIssueEventsAgentHarness,
+} from "./utils";
 import "../setup-env";
 
 // The shared MSW server is already started in setup-env.ts
 
-describeEval("search-issue-events-agent", {
-  data: async () => {
-    return [
-      {
-        // Simple time-based query - should NOT require tool calls
-        input: "Show me events from the last hour",
-        expectedTools: [],
-        expected: {
-          query: "", // No additional filters beyond issue constraint
-          sort: "-timestamp",
-          timeRange: { statsPeriod: "1h" },
-        },
+describeSearchAgentEval(
+  "search-issue-events-agent",
+  searchIssueEventsAgentHarness,
+  [
+    {
+      // Simple time-based query - should NOT require tool calls
+      input: "Show me events from the last hour",
+      expectedTools: [],
+      expected: {
+        query: "", // No additional filters beyond issue constraint
+        sort: "-timestamp",
+        timeRange: { statsPeriod: "1h" },
       },
-      {
-        // Environment and release filtering - should NOT require tool calls
-        input: "Find production events with release v1.0.5",
-        expectedTools: [],
-        expected: {
-          query:
-            /environment:production.*release:v1\.0\.5|release:v1\.0\.5.*environment:production/,
-          sort: "-timestamp",
-        },
+    },
+    {
+      // Environment and release filtering - should NOT require tool calls
+      input: "Find production events with release v1.0.5",
+      expectedTools: [],
+      expected: {
+        query:
+          /environment:production.*release:v1\.0\.5|release:v1\.0\.5.*environment:production/,
+        sort: "-timestamp",
       },
-      {
-        // User-specific filtering - may require whoami if query uses "me"
-        input: "Show me events affecting user alice@example.com",
-        expectedTools: [],
-        expected: {
-          query: "user.email:alice@example.com",
-          sort: "-timestamp",
-        },
+    },
+    {
+      // User-specific filtering by explicit email - should NOT require tool calls
+      input: "Show me events affecting user alice@example.com",
+      expectedTools: [],
+      expected: {
+        query: "user.email:alice@example.com",
+        sort: "-timestamp",
       },
-      {
-        // Query with "me" reference - should require whoami
-        input: "Show me events from my user",
-        expectedTools: [
-          {
-            name: "whoami",
-            arguments: {},
-          },
-        ],
-        expected: {
-          query: /user\.email:test@example\.com|user:test@example\.com/, // Various valid forms
-          sort: "-timestamp",
+    },
+    {
+      // Query with "me" reference - should require whoami
+      input: "Show me events from my user",
+      expectedTools: [
+        {
+          name: "whoami",
         },
+      ],
+      expected: {
+        query: /user\.email:"?test@example\.com"?|user:"?test@example\.com"?/, // Various valid forms
+        sort: "-timestamp",
       },
-      {
-        // Trace ID filtering - should NOT require tool calls
-        input: "Find events with trace ID abc123def456",
-        expectedTools: [],
-        expected: {
-          query: "trace:abc123def456",
-          sort: "-timestamp",
-        },
+    },
+    {
+      // Trace ID filtering - should NOT require tool calls
+      input: "Find events with trace ID abc123def456",
+      expectedTools: [],
+      expected: {
+        query: "trace:abc123def456",
+        sort: "-timestamp",
       },
-      {
-        // URL pattern filtering - should NOT require tool calls
-        input: "Show me events from the /checkout/ page",
-        expectedTools: [],
-        expected: {
-          query: /"url:.*\/checkout\/.*"|url:".*checkout.*"/, // URL pattern with wildcard
-          sort: "-timestamp",
-        },
+    },
+    {
+      // URL pattern filtering - should NOT require tool calls
+      input: "Show me events from the /checkout/ page",
+      expectedTools: [],
+      expected: {
+        query: /"url:.*\/checkout\/.*"|url:".*checkout.*"/, // URL pattern with wildcard
+        sort: "-timestamp",
       },
-      {
-        // Combined filters with time range
-        input: "Production events from yesterday with specific release",
-        expectedTools: [],
-        expected: {
-          query:
-            /environment:production.*release:|release:.*environment:production/,
-          sort: "-timestamp",
-          timeRange: { statsPeriod: "24h" },
-        },
+    },
+    {
+      // Combined filters with time range
+      input: "Production events from yesterday with specific release",
+      expectedTools: [],
+      expected: {
+        query:
+          /^$|^environment:production$|environment:production.*(?:release:|has:release)|(?:release:|has:release).*environment:production/,
+        sort: "-timestamp",
+        timeRange: { statsPeriod: "24h" },
       },
-      {
-        // Query that might need field discovery for uncommon tags
-        input: "Events where device family is mobile",
-        expectedTools: [
-          {
-            name: "issueEventFields",
-            arguments: {},
-          },
-        ],
-        expected: {
-          query: /device\.family:mobile|device:mobile/,
-          sort: "-timestamp",
+    },
+    {
+      // Query that might need field discovery for uncommon tags
+      input: "Events where device family is mobile",
+      expectedTools: [
+        {
+          name: "issueEventFields",
         },
+      ],
+      expected: {
+        query: /device\.family:mobile|device:mobile/,
+        sort: "-timestamp",
       },
-    ];
-  },
-  task: async (input) => {
-    // Create a real API service that will use MSW mocks
-    const apiService = new SentryApiService({
-      accessToken: "test-token",
-    });
-
-    const agentResult = await searchIssueEventsAgent({
-      query: input,
-      organizationSlug: "sentry-mcp-evals",
-      apiService,
-    });
-
-    // Return in the format expected by ToolCallScorer
-    return {
-      result: JSON.stringify(agentResult.result),
-      toolCalls: agentResult.toolCalls.map((call: any) => ({
-        name: call.toolName,
-        arguments: call.args,
-      })),
-    };
-  },
-  scorers: [
-    ToolCallScorer(), // Validates tool calls
-    StructuredOutputScorer({ match: "fuzzy" }), // Validates the structured query output with flexible matching
+    },
   ],
-  threshold: 0.6,
-  timeout: 30000,
-});
+);
diff --git a/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts b/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts
index 61f693939..9e278da31 100644
--- a/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts
+++ b/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts
@@ -1,87 +1,78 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
 // Note: This eval requires OPENAI_API_KEY to be set in the environment
 // The search_issue_events tool uses the AI SDK to translate natural language queries
-describeEval("search-issue-events", {
-  data: async () => {
-    return [
-      // Core test: Basic time-based filtering within an issue
+describeToolPredictionEval("search-issue-events", [
+  // Core test: Basic time-based filtering within an issue
+  {
+    input: `Show me events from the last hour in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
       {
-        input: `Show me events from the last hour in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issue_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              query: "from the last hour",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Core test: Environment and release filtering
       {
-        input: `Find production events with release v1.0 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issue_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              query: "production events with release v1.0",
-            },
-          },
-        ],
+        name: "search_issue_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          query: "from the last hour",
+        },
       },
-      // Core test: User-specific filtering
+    ],
+  },
+  // Core test: Environment and release filtering
+  {
+    input: `Find production events with release v1.0 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_issue_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          query: "production events with release v1.0",
+        },
+      },
+    ],
+  },
+  // Core test: User-specific filtering
+  {
+    input: `Show me events affecting user alice@example.com in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_issue_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          query: "affecting user alice@example.com",
+        },
+      },
+    ],
+  },
+  // Core test: Trace ID filtering
+  {
+    input: `Find events with trace ID abc123 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
       {
-        input: `Show me events affecting user alice@example.com in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issue_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              query: "affecting user alice@example.com",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Core test: Trace ID filtering
       {
-        input: `Find events with trace ID abc123 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issue_events",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              query: "with trace ID abc123",
-            },
-          },
-        ],
+        name: "search_issue_events",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          query: "with trace ID abc123",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts b/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts
index 56622f257..a1bc00cc5 100644
--- a/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts
+++ b/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts
@@ -1,155 +1,111 @@
-import { SentryApiService } from "@sentry/mcp-core/api-client";
-import { searchIssuesAgent } from "@sentry/mcp-core/tools/search-issues/agent";
-import { describeEval } from "vitest-evals";
-import { ToolCallScorer } from "vitest-evals";
-import { StructuredOutputScorer } from "./utils/structuredOutputScorer";
+import { describeSearchAgentEval, searchIssuesAgentHarness } from "./utils";
 import "../setup-env";
 
 // The shared MSW server is already started in setup-env.ts
 
-describeEval("search-issues-agent", {
-  data: async () => {
-    return [
-      {
-        // Simple query with common fields - should NOT require tool calls
-        input: "Show me unresolved issues",
-        expectedTools: [],
-        expected: {
-          query: "is:unresolved",
-          sort: "date", // Agent uses "date" as default
-        },
-      },
-      {
-        // Natural-language "me" reference should resolve through whoami.
-        input: "Show me issues assigned to me",
-        expectedTools: [
-          {
-            name: "whoami",
-            arguments: {},
-          },
-        ],
-        expected: {
-          query:
-            /assigned_or_suggested:test@example\.com|assigned:test@example\.com|assigned:me/, // Various valid forms
-          sort: "date",
-        },
-      },
-      {
-        // Explicit "me" is valid Sentry syntax and should not be resolved.
-        input: "assigned:me is:unresolved",
-        expectedTools: [],
-        expected: {
-          query: /(?=.*assigned:me)(?=.*is:unresolved)/,
-          sort: "date",
-        },
-      },
-      {
-        // Complex query but with common fields - should NOT require tool calls
-        input: "Show me critical unhandled errors from the last 24 hours",
-        expectedTools: [],
-        expected: {
-          query:
-            /(?=.*is:unresolved)(?=.*error\.handled:false)(?=.*lastSeen:-24h)/,
-          sort: /date|user/,
-        },
-      },
-      {
-        // Tag-presence query can be expressed directly with has:
-        input: "Show me issues with custom.payment.failed tag",
-        expectedTools: [],
-        expected: {
-          query:
-            /has:custom\.payment\.failed|custom\.payment\.failed|tags\[custom\.payment\.failed\]/, // All are valid tag forms
-          sort: "date", // Agent should always return a sort value
-        },
-      },
-      {
-        // Another query requiring field discovery
-        input: "Find issues where the kafka.consumer.group is orders-processor",
-        expectedTools: [
-          {
-            name: "issueFields",
-            arguments: {}, // No arguments needed anymore
-          },
-        ],
-        expected: {
-          query:
-            /kafka\.consumer\.group:orders-processor|tags\[kafka\.consumer\.group\]:orders-processor/,
-          sort: "date", // Agent should always return a sort value
-        },
-      },
-      {
-        // Easy to fix issues - should use seer_actionability filter
-        input: "Show me easy to fix bugs",
-        expectedTools: [],
-        expected: {
-          query: /issue\.seer_actionability/,
-          sort: "date",
-        },
-      },
-      {
-        // Quick wins query - should combine actionability with unresolved
-        input: "Show me quick wins in production",
-        expectedTools: [],
-        expected: {
-          query:
-            /issue\.seer_actionability.*environment:production|environment:production.*issue\.seer_actionability/,
-          sort: /date|user/,
-        },
-      },
-      {
-        // Explicit issue-search syntax should be preserved, not broadened.
-        input: "is:for_review release:latest assigned:me issue.priority:high",
-        expectedTools: [],
-        expected: {
-          query:
-            /(?=.*is:for_review)(?=.*release:latest)(?=.*assigned:me)(?=.*issue\.priority:high)/,
-          sort: "date",
-        },
-      },
-      {
-        // Mixed natural language may set sort, but explicit filters stay intact.
-        input: "sort by users is:for_review release:latest",
-        expectedTools: [],
-        expected: {
-          query: /^(?!.*sort:)(?=.*is:for_review)(?=.*release:latest)/,
-          sort: "user",
-        },
-      },
+describeSearchAgentEval("search-issues-agent", searchIssuesAgentHarness, [
+  {
+    // Simple query with common fields - should NOT require tool calls
+    input: "Show me unresolved issues",
+    expectedTools: [],
+    expected: {
+      query: "is:unresolved",
+      sort: "date", // Agent uses "date" as default
+    },
+  },
+  {
+    // Natural-language "me" reference should resolve through whoami.
+    input: "Show me issues assigned to me",
+    expectedTools: [
       {
-        // Valid inbox/substatus filters should not be generalized.
-        input: "is:new is:regressed",
-        expectedTools: [],
-        expected: {
-          query: /^(?!.*is:unresolved)(?=.*is:new)(?=.*is:regressed)/,
-          sort: "date",
-        },
+        name: "whoami",
       },
-    ];
+    ],
+    expected: {
+      query:
+        /assigned_or_suggested:test@example\.com|assigned:test@example\.com|assigned:me/, // Various valid forms
+    },
   },
-  task: async (input) => {
-    // Create a real API service that will use MSW mocks
-    const apiService = new SentryApiService({
-      accessToken: "test-token",
-    });
-
-    const agentResult = await searchIssuesAgent({
-      query: input,
-      organizationSlug: "sentry-mcp-evals",
-      apiService,
-    });
-
-    // Return in the format expected by ToolCallScorer
-    return {
-      result: JSON.stringify(agentResult.result),
-      toolCalls: agentResult.toolCalls.map((call: any) => ({
-        name: call.toolName,
-        arguments: call.args,
-      })),
-    };
+  {
+    // Explicit "me" is valid Sentry syntax and should not be resolved.
+    input: "assigned:me is:unresolved",
+    expectedTools: [],
+    expected: {
+      query: /(?=.*assigned:me)(?=.*is:unresolved)/,
+    },
+  },
+  {
+    // Complex query but with common fields - should NOT require tool calls
+    input: "Show me critical unhandled errors from the last 24 hours",
+    expectedTools: [],
+    expected: {
+      query:
+        /(?=.*is:unresolved)(?=.*(?:error\.handled:false|error\.unhandled:true))(?=.*lastSeen:(?:-24h|>=?-24h))/,
+      sort: /date|user/,
+    },
+  },
+  {
+    // Tag-presence query can be expressed directly with has:
+    input: "Show me issues with custom.payment.failed tag",
+    expectedTools: [],
+    expected: {
+      query:
+        /has:custom\.payment\.failed|custom\.payment\.failed|tags\[custom\.payment\.failed\]/, // All are valid tag forms
+      sort: (value: unknown) => value === null || value === "date",
+    },
+  },
+  {
+    // Custom tag queries may either use field discovery or direct tag syntax.
+    input: "Find issues where the kafka.consumer.group is orders-processor",
+    expectedTools: [],
+    expected: {
+      query:
+        /kafka\.consumer\.group:orders-processor|tags\[kafka\.consumer\.group\]:orders-processor/,
+    },
+  },
+  {
+    // Easy to fix issues - should use seer_actionability filter
+    input: "Show me easy to fix bugs",
+    expectedTools: [],
+    expected: {
+      query: /issue\.seer_actionability/,
+      sort: "date",
+    },
+  },
+  {
+    // Quick wins query - should combine actionability with the production environment filter
+    input: "Show me quick wins in production",
+    expectedTools: [],
+    expected: {
+      query:
+        /issue\.seer_actionability.*environment:production|environment:production.*issue\.seer_actionability/,
+      sort: /date|user/,
+    },
+  },
+  {
+    // Explicit issue-search syntax should be preserved, not broadened.
+    input: "is:for_review release:latest assigned:me issue.priority:high",
+    expectedTools: [],
+    expected: {
+      query:
+        /(?=.*is:for_review)(?=.*release:latest)(?=.*assigned:me)(?=.*issue\.priority:high)/,
+    },
+  },
+  {
+    // Mixed natural language may set sort, but explicit filters stay intact.
+    input: "sort by users is:for_review release:latest",
+    expectedTools: [],
+    expected: {
+      query: /^(?!.*sort:)(?=.*is:for_review)(?=.*release:latest)/,
+      sort: "user",
+    },
+  },
+  {
+    // Valid inbox/substatus filters should not be generalized.
+    input: "is:new is:regressed",
+    expectedTools: [],
+    expected: {
+      query: /^(?!.*is:unresolved)(?=.*is:new)(?=.*is:regressed)/,
+    },
   },
-  scorers: [
-    ToolCallScorer(), // Validates tool calls
-    StructuredOutputScorer({ match: "fuzzy" }), // Validates the structured query output with flexible matching
-  ],
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/search-issues.eval.ts b/packages/mcp-server-evals/src/evals/search-issues.eval.ts
index c504c165a..c24ed4e6c 100644
--- a/packages/mcp-server-evals/src/evals/search-issues.eval.ts
+++ b/packages/mcp-server-evals/src/evals/search-issues.eval.ts
@@ -1,88 +1,79 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
 // Note: This eval requires OPENAI_API_KEY to be set in the environment
 // The search_issues tool uses the AI SDK to translate natural language queries
-describeEval("search-issues", {
-  data: async () => {
-    return [
-      // Core test: Basic issue search
+describeToolPredictionEval("search-issues", [
+  // Core test: Basic issue search
+  {
+    input: `Show me unresolved issues in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
       {
-        input: `Show me unresolved issues in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "unresolved issues",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Core test: Search with 'me' reference (tests whoami integration)
       {
-        input: `Find issues assigned to me in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "whoami",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "issues assigned to me",
-            },
-          },
-        ],
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "unresolved issues",
+        },
       },
-      // Core test: Project-specific search
+    ],
+  },
+  // Core test: Search with 'me' reference (tests whoami integration)
+  {
+    input: `Find issues assigned to me in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "whoami",
+        arguments: {},
+      },
+      {
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "issues assigned to me",
+        },
+      },
+    ],
+  },
+  // Core test: Project-specific search
+  {
+    input: `Search for database errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          projectSlugOrId: FIXTURES.projectSlug,
+          query: "database errors",
+        },
+      },
+    ],
+  },
+  // Core test: Complex natural language query
+  {
+    input: `Find critical production errors affecting more than 100 users in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
       {
-        input: `Search for database errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              projectSlugOrId: FIXTURES.projectSlug,
-              query: "database errors",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Core test: Complex natural language query
       {
-        input: `Find critical production errors affecting more than 100 users in ${FIXTURES.organizationSlug}`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "search_issues",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              query: "critical production errors affecting more than 100 users",
-            },
-          },
-        ],
+        name: "search_issues",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          query: "critical production errors affecting more than 100 users",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/update-issue.eval.ts b/packages/mcp-server-evals/src/evals/update-issue.eval.ts
index e5cb3174b..af4b15513 100644
--- a/packages/mcp-server-evals/src/evals/update-issue.eval.ts
+++ b/packages/mcp-server-evals/src/evals/update-issue.eval.ts
@@ -1,125 +1,116 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("update-issue", {
-  data: async () => {
-    return [
-      // Core use case: Resolve an issue
+describeToolPredictionEval("update-issue", [
+  // Core use case: Resolve an issue
+  {
+    input: `Resolve the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug}. Output only the new status as a single word.`,
+    expectedTools: [
       {
-        input: `Resolve the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug}. Output only the new status as a single word.`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "update_issue",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              status: "resolved",
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Core use case: Assign an issue
       {
-        input: `Assign the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} to 'john.doe'. Output only the assigned username.`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "update_issue",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              assignedTo: "john.doe",
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "update_issue",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          status: "resolved",
+          regionUrl: "https://us.sentry.io",
+        },
       },
-      // Core use case: Using issue URL (alternative input method)
+    ],
+  },
+  // Core use case: Assign an issue
+  {
+    input: `Assign the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} to 'john.doe'. Output only the assigned username.`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "update_issue",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          assignedTo: "john.doe",
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
+  },
+  // Core use case: Using issue URL (alternative input method)
+  {
+    input: `Resolve the issue at ${FIXTURES.issueUrl}. Output only the new status as a single word.`,
+    expectedTools: [
       {
-        input: `Resolve the issue at ${FIXTURES.issueUrl}. Output only the new status as a single word.`,
-        expectedTools: [
-          {
-            name: "update_issue",
-            arguments: {
-              issueUrl: FIXTURES.issueUrl,
-              status: "resolved",
-            },
-          },
-        ],
+        name: "update_issue",
+        arguments: {
+          issueUrl: FIXTURES.issueUrl,
+          status: "resolved",
+        },
       },
-      // Regression: default ignored status should map to "until escalating"
+    ],
+  },
+  // Regression: default ignored status should map to "until escalating"
+  {
+    input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it escalates. Output only the new status as a single word.`,
+    expectedTools: [
       {
-        input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it escalates. Output only the new status as a single word.`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "update_issue",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              status: "ignored",
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "find_organizations",
+        arguments: {},
       },
-      // Regression: permanent ignores need the explicit forever mode
       {
-        input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} forever. Output only the new status as a single word.`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "update_issue",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              status: "ignored",
-              ignoreMode: "forever",
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "update_issue",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          status: "ignored",
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
+  },
+  // Regression: permanent ignores need the explicit forever mode
+  {
+    input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} forever. Output only the new status as a single word.`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+      {
+        name: "update_issue",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          status: "ignored",
+          ignoreMode: "forever",
+          regionUrl: "https://us.sentry.io",
+        },
+      },
+    ],
+  },
+  // Regression: count-based ignores should use the structured ignore fields
+  {
+    input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it happens 100 times in 60 minutes. Output only the new status as a single word.`,
+    expectedTools: [
+      {
+        name: "find_organizations",
+        arguments: {},
       },
-      // Regression: count-based ignores should use the structured ignore fields
       {
-        input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it happens 100 times in 60 minutes. Output only the new status as a single word.`,
-        expectedTools: [
-          {
-            name: "find_organizations",
-            arguments: {},
-          },
-          {
-            name: "update_issue",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              issueId: FIXTURES.issueId,
-              status: "ignored",
-              ignoreMode: "untilOccurrenceCount",
-              ignoreCount: 100,
-              ignoreWindowMinutes: 60,
-              regionUrl: "https://us.sentry.io",
-            },
-          },
-        ],
+        name: "update_issue",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          issueId: FIXTURES.issueId,
+          status: "ignored",
+          ignoreMode: "untilOccurrenceCount",
+          ignoreCount: 100,
+          ignoreWindowMinutes: 60,
+          regionUrl: "https://us.sentry.io",
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/update-project.eval.ts b/packages/mcp-server-evals/src/evals/update-project.eval.ts
index 2f979007e..4f4c11364 100644
--- a/packages/mcp-server-evals/src/evals/update-project.eval.ts
+++ b/packages/mcp-server-evals/src/evals/update-project.eval.ts
@@ -1,40 +1,31 @@
-import { describeEval } from "vitest-evals";
-import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils";
+import { describeToolPredictionEval, FIXTURES } from "./utils";
 
-describeEval("update-project", {
-  data: async () => {
-    return [
+describeToolPredictionEval("update-project", [
+  {
+    input: `Update the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to change its name to 'Updated Project Name' and slug to 'updated-project-slug'. Output only the new project slug as plain text without any formatting:\nupdated-project-slug`,
+    expectedTools: [
       {
-        input: `Update the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to change its name to 'Updated Project Name' and slug to 'updated-project-slug'. Output only the new project slug as plain text without any formatting:\nupdated-project-slug`,
-        expectedTools: [
-          {
-            name: "update_project",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              projectSlug: FIXTURES.projectSlug,
-              name: "Updated Project Name",
-              slug: "updated-project-slug",
-            },
-          },
-        ],
+        name: "update_project",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          projectSlug: FIXTURES.projectSlug,
+          name: "Updated Project Name",
+          slug: "updated-project-slug",
+        },
       },
+    ],
+  },
+  {
+    input: `Assign the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to the team '${FIXTURES.teamSlug}'. Output only the team slug as plain text without any formatting:\nthe-goats`,
+    expectedTools: [
       {
-        input: `Assign the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to the team '${FIXTURES.teamSlug}'. Output only the team slug as plain text without any formatting:\nthe-goats`,
-        expectedTools: [
-          {
-            name: "update_project",
-            arguments: {
-              organizationSlug: FIXTURES.organizationSlug,
-              projectSlug: FIXTURES.projectSlug,
-              teamSlug: FIXTURES.teamSlug,
-            },
-          },
-        ],
+        name: "update_project",
+        arguments: {
+          organizationSlug: FIXTURES.organizationSlug,
+          projectSlug: FIXTURES.projectSlug,
+          teamSlug: FIXTURES.teamSlug,
+        },
       },
-    ];
+    ],
   },
-  task: NoOpTaskRunner(),
-  scorers: [ToolPredictionScorer()],
-  threshold: 0.6,
-  timeout: 30000,
-});
+]);
diff --git a/packages/mcp-server-evals/src/evals/utils/describe.ts b/packages/mcp-server-evals/src/evals/utils/describe.ts
new file mode 100644
index 000000000..43b79b80e
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/describe.ts
@@ -0,0 +1,118 @@
+import {
+  describeEval,
+  StructuredOutputJudge,
+  ToolCallJudge,
+  type Harness,
+  type JsonValue,
+} from "vitest-evals";
+import {
+  ToolPredictionJudge,
+  toolPredictionHarness,
+} from "./toolPredictionHarness";
+import { mcpToolCallHarness } from "./mcpToolCallHarness";
+import type {
+  EvalCase,
+  StructuredEvalMetadata,
+  ToolCallEvalMetadata,
+  ToolPredictionMetadata,
+} from "./types";
+
+type EvalOptions = { // Optional per-suite overrides accepted by the describe*Eval helpers in this file.
+  threshold?: number | null; // forwarded as judgeThreshold; an explicit null is passed through (see resolveThreshold)
+  timeout?: number; // per-case timeout given to `it` (presumably milliseconds — confirm); each helper has its own default
+};
+
+function resolveThreshold( // Chooses the judge threshold: the default applies only when none was supplied.
+  threshold: number | null | undefined,
+  defaultThreshold: number,
+) {
+  return threshold === undefined ? defaultThreshold : threshold; // strict undefined check: an explicit null is returned unchanged
+}
+
+export function describeToolPredictionEval( // Registers one eval suite that runs every case through toolPredictionHarness.
+  name: string,
+  cases: EvalCase<ToolPredictionMetadata>[],
+  options: EvalOptions = {},
+) {
+  describeEval(
+    name,
+    {
+      harness: toolPredictionHarness,
+      judges: [ToolPredictionJudge],
+      judgeThreshold: resolveThreshold(options.threshold, 0.6), // 0.6 unless the caller supplies a threshold (null included)
+    },
+    (it) => {
+      for (const testCase of cases) {
+        const { input, name: testName, ...metadata } = testCase;
+        // `metadata` holds every case field except `input` and `name`; it is handed to run() below.
+        it(
+          testName ?? input, // the prompt doubles as the test title when the case has no name
+          { timeout: options.timeout ?? 30000 },
+          async ({ run }) => {
+            await run(input, { metadata });
+          },
+        );
+      }
+    },
+  );
+}
+
+export function describeMcpToolCallEval( // Registers one eval suite that runs every case through mcpToolCallHarness.
+  name: string,
+  cases: EvalCase<ToolCallEvalMetadata>[],
+  options: EvalOptions = {},
+) {
+  describeEval(
+    name,
+    {
+      harness: mcpToolCallHarness,
+      judges: [ToolCallJudge({ ordered: true, params: "fuzzy" })], // NOTE(review): presumably order-sensitive with lenient argument matching — confirm against vitest-evals
+      judgeThreshold: resolveThreshold(options.threshold, 0.6),
+    },
+    (it) => {
+      for (const testCase of cases) {
+        const { input, name: testName, ...metadata } = testCase;
+        // `metadata` holds every case field except `input` and `name`; it is handed to run() below.
+        it(
+          testName ?? input, // the prompt doubles as the test title when the case has no name
+          { timeout: options.timeout ?? 90000 }, // larger default than the 30000 used by describeToolPredictionEval
+          async ({ run }) => {
+            await run(input, { metadata });
+          },
+        );
+      }
+    },
+  );
+}
+
+export function describeSearchAgentEval( // Registers one eval suite for a caller-supplied search-agent harness.
+  name: string,
+  harness: Harness<string, JsonValue, StructuredEvalMetadata>, // unlike the sibling helpers, the harness is a parameter here
+  cases: EvalCase<StructuredEvalMetadata>[],
+  options: EvalOptions = {},
+) {
+  describeEval(
+    name,
+    {
+      harness,
+      judges: [
+        ToolCallJudge({ params: "fuzzy" }), // no `ordered` option is passed here, unlike describeMcpToolCallEval
+        StructuredOutputJudge({ match: "fuzzy" }),
+      ],
+      judgeThreshold: resolveThreshold(options.threshold, 0.6),
+    },
+    (it) => {
+      for (const testCase of cases) {
+        const { input, name: testName, ...metadata } = testCase;
+        // `metadata` holds every case field except `input` and `name`; it is handed to run() below.
+        it(
+          testName ?? input, // the prompt doubles as the test title when the case has no name
+          { timeout: options.timeout ?? 150000 }, // largest default timeout of the three helpers in this file
+          async ({ run }) => {
+            await run(input, { metadata });
+          },
+        );
+      }
+    },
+  );
+}
diff --git a/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.test.ts b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.test.ts
new file mode 100644
index 000000000..cddab5931
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.test.ts
@@ -0,0 +1,77 @@
+import { describe, expect, it } from "vitest";
+import { createEmbeddedSearchAgentHarness } from "./embeddedAgentHarness";
+
+function createHarnessContext() { // Builds the stub context passed as the second argument to harness.run in this test.
+  const artifacts = {};
+  // Only empty metadata, an empty artifacts bag and a no-op setArtifact are provided.
+  return {
+    metadata: {},
+    artifacts,
+    setArtifact: () => {},
+  };
+}
+
+describe("createEmbeddedSearchAgentHarness", () => {
+  it("uses a fallback session when AI SDK steps lack harness model metadata", async () => {
+    const harness = createEmbeddedSearchAgentHarness(
+      "test-embedded-agent",
+      async () => ({ // stub agent: fixed result, one tool call, and a step that has usage but no `model`
+        result: {
+          query: "is:unresolved",
+        },
+        toolCalls: [
+          {
+            toolName: "whoami",
+            args: {},
+          },
+        ],
+        steps: [
+          {
+            usage: {
+              inputTokens: 1,
+              outputTokens: 2,
+              totalTokens: 3,
+            },
+          },
+        ],
+        totalUsage: {
+          inputTokens: 1,
+          outputTokens: 2,
+          totalTokens: 3,
+        },
+      }),
+    );
+    // Run the harness once with the stub context.
+    const run = await harness.run(
+      "show unresolved issues",
+      createHarnessContext(),
+    );
+    // The output is the agent's `result`; the session is the synthetic user/assistant pair.
+    expect(run.output).toEqual({
+      query: "is:unresolved",
+    });
+    expect(run.session.messages).toEqual([
+      {
+        role: "user",
+        content: "show unresolved issues",
+      },
+      {
+        role: "assistant",
+        content: {
+          query: "is:unresolved",
+        },
+        toolCalls: [ // captured { toolName, args } is expected in { name, arguments } form
+          {
+            name: "whoami",
+            arguments: {},
+          },
+        ],
+      },
+    ]);
+    expect(run.usage).toEqual({
+      inputTokens: 1,
+      outputTokens: 2,
+      totalTokens: 3,
+    });
+  });
+});
diff --git a/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.ts b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.ts
new file mode 100644
index 000000000..44b320d06
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.ts
@@ -0,0 +1,91 @@
+import { SentryApiService } from "@sentry/mcp-core/api-client";
+import { searchEventsAgent } from "@sentry/mcp-core/tools/search-events/agent";
+import { searchIssueEventsAgent } from "@sentry/mcp-core/tools/search-issue-events/agent";
+import { searchIssuesAgent } from "@sentry/mcp-core/tools/search-issues/agent";
+import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk";
+import type { JsonValue, ToolCallRecord } from "vitest-evals";
+import { withFallbackSession } from "./fallbackSession";
+import { FIXTURES } from "./fixtures";
+import { requireJsonValue, toJsonRecord } from "./json";
+import type { StructuredEvalMetadata } from "./types";
+
+type CapturedToolCall = { // Tool call as reported by an embedded agent, before conversion to ToolCallRecord.
+  toolName: string;
+  args: unknown;
+};
+
+type EmbeddedSearchAgentOptions = { // Arguments an embedded search agent is invoked with.
+  query: string;
+  organizationSlug: string;
+  apiService: SentryApiService;
+  projectId?: string; // optional; the harness in this file never sets it
+};
+
+type EmbeddedSearchAgentResult = { // Shape an embedded search agent is expected to resolve with.
+  result: unknown;
+  toolCalls: CapturedToolCall[];
+  steps?: unknown[];
+  usage?: unknown;
+  totalUsage?: unknown;
+};
+
+type EmbeddedSearchAgent = ( // Async agent function: options in, result out.
+  options: EmbeddedSearchAgentOptions,
+) => Promise<EmbeddedSearchAgentResult>;
+
+function toToolCallRecord(call: CapturedToolCall): ToolCallRecord { // Renames { toolName, args } to the { name, arguments } record shape.
+  return {
+    name: call.toolName,
+    arguments: toJsonRecord(call.args), // toJsonRecord yields {} when args does not normalize to a plain object
+  };
+}
+
+export function createEmbeddedSearchAgentHarness( // Wraps an embedded search agent in an aiSdkHarness registered under `name`.
+  name: string,
+  agent: EmbeddedSearchAgent,
+) {
+  return aiSdkHarness<
+    undefined,
+    string,
+    StructuredEvalMetadata,
+    EmbeddedSearchAgentResult,
+    Record<string, never>,
+    JsonValue
+  >({
+    name,
+    run: async ({ input }) => {
+      const apiService = new SentryApiService({
+        accessToken: "test-token", // placeholder token; NOTE(review): presumably API traffic is served by mocks — confirm
+      });
+      // The eval input is used as the agent query; the organization always comes from FIXTURES.
+      const result = await agent({
+        query: input,
+        organizationSlug: FIXTURES.organizationSlug,
+        apiService,
+      });
+      // Attach a synthetic session built from the input, the JSON output and the captured tool calls.
+      return withFallbackSession(
+        input,
+        result,
+        requireJsonValue(result.result, "agent output"), // throws when the agent output is not JSON-serializable
+        result.toolCalls.map(toToolCallRecord),
+      );
+    },
+    output: ({ result }) => requireJsonValue(result.result, "agent output"),
+  });
+}
+
+export const searchEventsAgentHarness = createEmbeddedSearchAgentHarness( // harness around searchEventsAgent
+  "search-events-agent",
+  searchEventsAgent,
+);
+
+export const searchIssueEventsAgentHarness = createEmbeddedSearchAgentHarness( // harness around searchIssueEventsAgent
+  "search-issue-events-agent",
+  searchIssueEventsAgent,
+);
+
+export const searchIssuesAgentHarness = createEmbeddedSearchAgentHarness( // harness around searchIssuesAgent
+  "search-issues-agent",
+  searchIssuesAgent,
+);
diff --git a/packages/mcp-server-evals/src/evals/utils/fallbackSession.ts b/packages/mcp-server-evals/src/evals/utils/fallbackSession.ts
new file mode 100644
index 000000000..0187e6b8c
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/fallbackSession.ts
@@ -0,0 +1,69 @@
+import type {
+  JsonValue,
+  NormalizedSession,
+  ToolCallRecord,
+} from "vitest-evals";
+
+export function createFallbackSession( // Builds a two-message session: the user input followed by the assistant output.
+  input: string,
+  output: JsonValue,
+  toolCalls: ToolCallRecord[] = [],
+): NormalizedSession {
+  return {
+    messages: [
+      {
+        role: "user",
+        content: input,
+      },
+      {
+        role: "assistant",
+        content: output,
+        ...(toolCalls.length > 0 ? { toolCalls } : {}), // the toolCalls key is omitted entirely when there are none
+      },
+    ],
+  };
+}
+
+function hasHarnessStepModel(step: unknown) { // True only when step.model is an object with string `provider` and `modelId`.
+  if (!step || typeof step !== "object" || !("model" in step)) {
+    return false;
+  }
+  // `model` itself must be a non-null object before its fields are inspected.
+  const { model } = step;
+  if (!model || typeof model !== "object") {
+    return false;
+  }
+  // Both identifying fields have to be present and be strings.
+  return (
+    "provider" in model &&
+    typeof model.provider === "string" &&
+    "modelId" in model &&
+    typeof model.modelId === "string"
+  );
+}
+
+export function withFallbackSession<Result extends { steps?: unknown[] }>( // Returns a copy of `result` with a synthetic `session` attached.
+  input: string,
+  result: Result,
+  output: JsonValue,
+  toolCalls: ToolCallRecord[] = [],
+) {
+  const session = createFallbackSession(input, output, toolCalls);
+  // Steps are kept only when the list is non-empty and every step passes hasHarnessStepModel.
+  if (
+    Array.isArray(result.steps) &&
+    result.steps.length > 0 &&
+    result.steps.every(hasHarnessStepModel)
+  ) {
+    return {
+      ...result,
+      session,
+    };
+  }
+  // Otherwise `steps` is overwritten with undefined, so only the session travels with the result.
+  return {
+    ...result,
+    steps: undefined,
+    session,
+  };
+}
diff --git a/packages/mcp-server-evals/src/evals/utils/index.ts b/packages/mcp-server-evals/src/evals/utils/index.ts
index 0316b2a61..01c2cc246 100644
--- a/packages/mcp-server-evals/src/evals/utils/index.ts
+++ b/packages/mcp-server-evals/src/evals/utils/index.ts
@@ -1,7 +1,17 @@
 export { FIXTURES } from "./fixtures";
-export { McpToolCallTaskRunner } from "./mcpToolCallRunner";
-export { NoOpTaskRunner } from "./runner";
 export {
-  ToolPredictionScorer,
-  type ExpectedToolCall,
-} from "./toolPredictionScorer";
+  describeMcpToolCallEval,
+  describeSearchAgentEval,
+  describeToolPredictionEval,
+} from "./describe";
+export {
+  searchEventsAgentHarness,
+  searchIssueEventsAgentHarness,
+  searchIssuesAgentHarness,
+} from "./embeddedAgentHarness";
+export {
+  ToolPredictionJudge,
+  toolPredictionHarness,
+} from "./toolPredictionHarness";
+export { mcpToolCallHarness } from "./mcpToolCallHarness";
+export type { ExpectedToolCall } from "./types";
diff --git a/packages/mcp-server-evals/src/evals/utils/json.ts b/packages/mcp-server-evals/src/evals/utils/json.ts
new file mode 100644
index 000000000..176eba04f
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/json.ts
@@ -0,0 +1,25 @@
+import { toJsonValue, type JsonValue } from "vitest-evals";
+
+export function toJsonRecord(value: unknown): Record<string, JsonValue> { // Normalizes `value` via toJsonValue and returns it only if it is a plain object.
+  const normalized = toJsonValue(value);
+  // Accept truthy, non-array objects only.
+  if (
+    normalized &&
+    typeof normalized === "object" &&
+    !Array.isArray(normalized)
+  ) {
+    return normalized;
+  }
+  // Anything else (primitives, arrays, null/undefined) collapses to an empty record.
+  return {};
+}
+
+export function requireJsonValue(value: unknown, label: string): JsonValue { // Returns toJsonValue(value), or throws if that yields undefined.
+  const normalized = toJsonValue(value);
+  // `label` only feeds the error message, naming what failed to serialize.
+  if (normalized === undefined) {
+    throw new Error(`${label} is not JSON-serializable`);
+  }
+  // Note: only undefined is rejected; a null result is returned as-is.
+  return normalized;
+}
diff --git a/packages/mcp-server-evals/src/evals/utils/mcpClient.test.ts b/packages/mcp-server-evals/src/evals/utils/mcpClient.test.ts
new file mode 100644
index 000000000..eceaa0c39
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/mcpClient.test.ts
@@ -0,0 +1,15 @@
+import { describe, expect, it } from "vitest";
+import { getAvailableToolDescriptions } from "./mcpClient";
+
+describe("getAvailableToolDescriptions", () => {
+  it("uses stable tool definitions for prediction prompts", async () => {
+    const descriptions = await getAvailableToolDescriptions();
+    const toolNames = descriptions.map((description) =>
+      description.slice(0, description.indexOf(" - ")), // keep the text before the first " - " separator, i.e. the tool name
+    );
+    // Spot-check that three known tool names are present.
+    expect(toolNames).toContain("find_teams");
+    expect(toolNames).toContain("create_project");
+    expect(toolNames).toContain("find_releases");
+  });
+});
diff --git a/packages/mcp-server-evals/src/evals/utils/mcpClient.ts b/packages/mcp-server-evals/src/evals/utils/mcpClient.ts
new file mode 100644
index 000000000..0193d8ffa
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/mcpClient.ts
@@ -0,0 +1,54 @@
+import { experimental_createMCPClient } from "@ai-sdk/mcp";
+import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio";
+import toolDefinitions from "@sentry/mcp-core/toolDefinitions";
+
+type MockMcpClient = Awaited<ReturnType<typeof experimental_createMCPClient>>; // resolved client type of experimental_createMCPClient
+
+let cachedToolDescriptions: Promise<string[]> | null = null; // module-level memo used by getAvailableToolDescriptions; null means nothing cached
+
+function createMockTransport() { // Builds a stdio MCP transport configured to run the evals package's `start` script through pnpm.
+  return new Experimental_StdioMCPTransport({
+    command: "pnpm",
+    args: ["--filter", "@sentry/mcp-server-evals", "start"],
+    env: {
+      ...process.env, // start from the current environment, then override the two Sentry settings below
+      SENTRY_ACCESS_TOKEN: "mocked-access-token",
+      SENTRY_HOST: "sentry.io",
+    },
+  });
+}
+
+function getShortDescription(description: string): string { // Returns the first line of a possibly multi-line description.
+  return description.split("\n")[0] ?? ""; // the `?? ""` fallback guards the indexed access
+}
+
+export async function withMockMcpClient<T>( // Creates an MCP client on the mock transport, runs `callback` with it, then closes it.
+  callback: (client: MockMcpClient) => Promise<T>,
+): Promise<T> {
+  const client = await experimental_createMCPClient({
+    transport: createMockTransport(),
+  });
+  // The callback's resolved value is returned; close() runs whether it resolves or rejects.
+  try {
+    return await callback(client);
+  } finally {
+    await client.close();
+  }
+}
+
+async function loadAvailableToolDescriptions() { // Formats each imported tool definition as "<name> - <first description line>".
+  return toolDefinitions.map(
+    (tool) => `${tool.name} - ${getShortDescription(tool.description)}`,
+  );
+}
+
+export async function getAvailableToolDescriptions(): Promise<string[]> { // Memoized accessor for the formatted tool descriptions.
+  cachedToolDescriptions ??= loadAvailableToolDescriptions().catch( // the promise itself is cached, so concurrent callers share one load
+    (error: unknown) => {
+      cachedToolDescriptions = null; // drop the failed promise so the next call loads again
+      throw error;
+    },
+  );
+
+  return cachedToolDescriptions;
+}
diff --git a/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.test.ts b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.test.ts
new file mode 100644
index 000000000..08c334284
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.test.ts
@@ -0,0 +1,222 @@
+import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk";
+import {
+  dynamicTool,
+  type LanguageModelUsage,
+  type ToolExecutionOptions,
+} from "ai";
+import {
+  toolCalls as collectToolCalls,
+  type ToolCallRecord,
+} from "vitest-evals";
+import { describe, expect, it } from "vitest";
+import { z } from "zod";
+import {
+  captureMcpToolCalls,
+  createMcpToolCallRun,
+  prepareMcpToolCallStep,
+} from "./mcpToolCallHarness";
+
+function createToolOptions(toolCallId: string): ToolExecutionOptions { // Test helper: execution options carrying the given call id and no messages.
+  return {
+    toolCallId,
+    messages: [],
+  };
+}
+
+function createUsage(): LanguageModelUsage { // Test helper: fixed usage fixture of 10 input + 5 output = 15 total tokens.
+  return {
+    inputTokens: 10,
+    inputTokenDetails: {
+      noCacheTokens: 10,
+      cacheReadTokens: undefined,
+      cacheWriteTokens: undefined,
+    },
+    outputTokens: 5,
+    outputTokenDetails: {
+      textTokens: 5,
+      reasoningTokens: undefined,
+    },
+    totalTokens: 15,
+  };
+}
+
+describe("captureMcpToolCalls", () => {
+  it("captures dynamic MCP tool execution before delegating", async () => {
+    const capturedToolCalls: ToolCallRecord[] = [];
+    const tools = captureMcpToolCalls(
+      {
+        search_tools: dynamicTool({
+          inputSchema: z.object({
+            query: z.string(),
+          }),
+          execute: async (input) => ({ // echoes its input so the delegated result is easy to assert on
+            name: "get_issue_details",
+            input,
+          }),
+        }),
+      },
+      capturedToolCalls,
+    );
+    // Invoke the wrapped tool directly, as the AI SDK would.
+    const result = await tools.search_tools.execute?.(
+      { query: "issue" },
+      createToolOptions("call_1"),
+    );
+    // The wrapper must return the underlying tool's result unchanged.
+    expect(result).toEqual({
+      name: "get_issue_details",
+      input: {
+        query: "issue",
+      },
+    });
+    expect(capturedToolCalls).toMatchObject([
+      {
+        id: "call_1",
+        name: "search_tools",
+        arguments: {
+          query: "issue",
+        },
+        result: {
+          name: "get_issue_details",
+          input: {
+            query: "issue",
+          },
+        },
+      },
+    ]);
+    expect(capturedToolCalls[0].startedAt).toEqual(expect.any(String)); // timing fields are checked by type only
+    expect(capturedToolCalls[0].finishedAt).toEqual(expect.any(String));
+    expect(capturedToolCalls[0].durationMs).toEqual(expect.any(Number));
+  });
+
+  it("records tool errors before rethrowing", async () => {
+    const capturedToolCalls: ToolCallRecord[] = [];
+    const tools = captureMcpToolCalls(
+      {
+        execute_tool: dynamicTool({
+          inputSchema: z.object({
+            name: z.string(),
+          }),
+          execute: async () => {
+            throw new Error("tool failed");
+          },
+        }),
+      },
+      capturedToolCalls,
+    );
+    // The original error must still reach the caller.
+    await expect(
+      tools.execute_tool.execute?.(
+        { name: "get_issue_details" },
+        createToolOptions("call_2"),
+      ),
+    ).rejects.toThrow("tool failed");
+    // The failed call is still recorded, with the error reduced to { type, message }.
+    expect(capturedToolCalls).toMatchObject([
+      {
+        id: "call_2",
+        name: "execute_tool",
+        arguments: {
+          name: "get_issue_details",
+        },
+        error: {
+          type: "Error",
+          message: "tool failed",
+        },
+      },
+    ]);
+  });
+});
+
+describe("prepareMcpToolCallStep", () => {
+  it("forces discovery before catalog execution", () => {
+    expect(prepareMcpToolCallStep(0)).toEqual({ // argument 0: only search_tools is active and it is the forced choice
+      toolChoice: {
+        type: "tool",
+        toolName: "search_tools",
+      },
+      activeTools: ["search_tools"],
+    });
+    expect(prepareMcpToolCallStep(1)).toEqual({ // argument 1: only execute_tool is active and it is the forced choice
+      toolChoice: {
+        type: "tool",
+        toolName: "execute_tool",
+      },
+      activeTools: ["execute_tool"],
+    });
+    expect(prepareMcpToolCallStep(2)).toBeUndefined(); // argument 2: no override is returned
+  });
+});
+
+describe("createMcpToolCallRun", () => {
+  it("preserves the captured sequence when raw AI SDK steps only expose the last call", async () => {
+    const capturedToolCalls: ToolCallRecord[] = [ // two captured calls, in the order they ran
+      {
+        id: "call_1",
+        name: "search_tools",
+        arguments: {
+          query: "issue",
+        },
+      },
+      {
+        id: "call_2",
+        name: "execute_tool",
+        arguments: {
+          name: "get_issue_details",
+        },
+      },
+    ];
+    const result = { // raw SDK-style result whose single step lists only call_2
+      text: "Issue summary",
+      steps: [
+        {
+          model: {
+            provider: "openai",
+            modelId: "gpt-4o",
+          },
+          toolCalls: [
+            {
+              toolCallId: "call_2",
+              toolName: "execute_tool",
+              input: {
+                name: "get_issue_details",
+              },
+            },
+          ],
+          usage: createUsage(),
+        },
+      ],
+      totalUsage: createUsage(),
+    };
+    const harness = aiSdkHarness({
+      name: "mcp-tool-call-test",
+      run: async () =>
+        createMcpToolCallRun("Explain an issue", result, capturedToolCalls),
+    });
+    // Run through a real aiSdkHarness so session, usage and traces are derived the normal way.
+    const run = await harness.run("Explain an issue", {
+      metadata: {},
+      artifacts: {},
+      setArtifact: () => {},
+    });
+    // Both captured calls must appear, even though the raw step exposed only the second.
+    expect(collectToolCalls(run.session).map(({ name }) => name)).toEqual([
+      "search_tools",
+      "execute_tool",
+    ]);
+    expect(run.usage).toMatchObject({
+      provider: "openai",
+      model: "gpt-4o",
+      inputTokens: 10,
+      outputTokens: 5,
+      totalTokens: 15,
+      toolCalls: 2, // counts the captured calls, not the single call in the raw step
+    });
+    expect(
+      (run.traces ?? [])
+        .flatMap((trace) => trace.spans)
+        .filter((span) => span.kind === "tool")
+        .map((span) => span.name),
+    ).toEqual(["search_tools", "execute_tool"]);
+  });
+});
diff --git a/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.ts b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.ts
new file mode 100644
index 000000000..512dcffee
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.ts
@@ -0,0 +1,239 @@
+import { openai } from "@ai-sdk/openai";
+import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk";
+import {
+  generateText,
+  stepCountIs,
+  type LanguageModelUsage,
+  type PrepareStepResult,
+  type ToolExecutionOptions,
+  type ToolSet,
+} from "ai";
+import type { Harness, HarnessRun, ToolCallRecord } from "vitest-evals";
+import { toJsonValue } from "vitest-evals";
+import { createFallbackSession } from "./fallbackSession";
+import { toJsonRecord } from "./json";
+import { withMockMcpClient } from "./mcpClient";
+import type { ToolCallEvalMetadata } from "./types";
+
+const defaultModel = openai("gpt-4o");
+
+type AiSdkResultWithUsage = {
+  text: string;
+  steps?: unknown;
+  totalUsage?: LanguageModelUsage;
+  usage?: LanguageModelUsage;
+};
+
+type ExecutableTool = ToolSet[string] & {
+  execute: (input: unknown, options: ToolExecutionOptions) => unknown;
+};
+// Type guard: narrows to ExecutableTool when `execute` is a function.
+function isExecutableTool(tool: ToolSet[string]): tool is ExecutableTool {
+  return typeof tool.execute === "function";
+}
+
+/**
+ * Maps any thrown value onto the `{ type, message }` error shape of a
+ * ToolCallRecord.
+ */
+function toToolCallError(error: unknown): NonNullable<ToolCallRecord["error"]> {
+  // Real Error instances report their own name and message.
+  if (error instanceof Error) {
+    const { name, message } = error;
+    return { type: name, message };
+  }
+
+  // Non-array objects with a string `message` keep their other fields too.
+  const json = toJsonValue(error);
+  if (
+    json &&
+    typeof json === "object" &&
+    !Array.isArray(json) &&
+    typeof json.message === "string"
+  ) {
+    const type = typeof json.type === "string" ? json.type : "Error";
+    return { ...json, type, message: json.message };
+  }
+
+  // Anything else is stringified; null/undefined get a fixed placeholder.
+  const message = String(error ?? "Unknown tool call error");
+  return { type: "Error", message };
+}
+// Wraps executable tools so every call is appended to capturedToolCalls.
+export function captureMcpToolCalls<TTools extends ToolSet>(
+  tools: TTools,
+  capturedToolCalls: ToolCallRecord[],
+): TTools {
+  return Object.fromEntries(
+    Object.entries(tools).map(([toolName, tool]) => {
+      if (!isExecutableTool(tool)) {
+        return [toolName, tool];
+      }
+      // Keep a reference to the original handler; the wrapper delegates to it.
+      const execute = tool.execute;
+      const wrappedTool = {
+        ...tool,
+        execute: async (
+          toolInput: unknown,
+          execution: ToolExecutionOptions,
+        ) => {
+          const startedAt = new Date();
+          const toolCall: ToolCallRecord = {
+            id: execution.toolCallId,
+            name: toolName,
+            arguments: toJsonRecord(toolInput),
+            startedAt: startedAt.toISOString(),
+          };
+          capturedToolCalls.push(toolCall);
+          // Pushed before the call runs; outcome and timing are filled in after.
+          try {
+            const result = await execute(toolInput, execution);
+            const finishedAt = new Date();
+            const normalizedResult = toJsonValue(result);
+
+            if (normalizedResult !== undefined) {
+              toolCall.result = normalizedResult;
+            }
+            toolCall.finishedAt = finishedAt.toISOString();
+            toolCall.durationMs = finishedAt.getTime() - startedAt.getTime();
+            // The caller receives the original result, not the normalized copy.
+            return result;
+          } catch (error) {
+            const finishedAt = new Date();
+            toolCall.error = toToolCallError(error);
+            toolCall.finishedAt = finishedAt.toISOString();
+            toolCall.durationMs = finishedAt.getTime() - startedAt.getTime();
+            throw error;
+          }
+        },
+      };
+
+      return [toolName, wrappedTool];
+    }),
+  ) as TTools;
+}
+
+// Reads provider/model ids off the last step's `model`; yields {} if absent.
+function getLastStepModel(result: AiSdkResultWithUsage) {
+  const { steps } = result;
+  const lastStep = Array.isArray(steps) ? steps.at(-1) : undefined;
+  // A missing step, a non-object step, or a step without `model` all leave
+  // `stepModel` undefined and are rejected by the single guard below.
+  const stepModel =
+    lastStep && typeof lastStep === "object" && "model" in lastStep
+      ? lastStep.model
+      : undefined;
+  if (!stepModel || typeof stepModel !== "object") {
+    return {};
+  }
+  const provider =
+    "provider" in stepModel ? String(stepModel.provider) : undefined;
+  const model = "modelId" in stepModel ? String(stepModel.modelId) : undefined;
+  return { provider, model };
+}
+
+// Total tokens for a run, falling back to input + output when not reported.
+function getTotalTokens(usage: LanguageModelUsage | undefined) {
+  if (!usage) {
+    return undefined;
+  }
+  const { totalTokens, inputTokens, outputTokens } = usage;
+  // Missing input/output counts contribute 0 to the fallback sum.
+  return totalTokens ?? (inputTokens ?? 0) + (outputTokens ?? 0);
+}
+
+// Builds a HarnessRun from an AI SDK result, preferring totalUsage over usage.
+export function createMcpToolCallRun(
+  input: string,
+  result: AiSdkResultWithUsage,
+  capturedToolCalls: ToolCallRecord[],
+): HarnessRun<string> {
+  const tokens = result.totalUsage ?? result.usage;
+  const reasoningTokens =
+    tokens?.outputTokenDetails?.reasoningTokens ?? tokens?.reasoningTokens;
+  return {
+    session: createFallbackSession(input, result.text, capturedToolCalls),
+    output: result.text,
+    usage: {
+      ...getLastStepModel(result),
+      inputTokens: tokens?.inputTokens,
+      outputTokens: tokens?.outputTokens,
+      reasoningTokens,
+      totalTokens: getTotalTokens(tokens),
+      toolCalls: capturedToolCalls.length,
+      metadata: toJsonRecord({
+        cacheReadTokens:
+          tokens?.inputTokenDetails?.cacheReadTokens ?? tokens?.cachedInputTokens,
+        cacheWriteTokens: tokens?.inputTokenDetails?.cacheWriteTokens,
+        raw: tokens?.raw,
+      }),
+    },
+    errors: [],
+  };
+}
+
+/**
+ * Step settings that select `toolName` as the tool choice and only active tool.
+ */
+function forcedToolStep(toolName: "search_tools" | "execute_tool") {
+  return {
+    toolChoice: { type: "tool", toolName },
+    activeTools: [toolName],
+  } satisfies PrepareStepResult<ToolSet>;
+}
+
+// Step 0 is pinned to search_tools, step 1 to execute_tool; others get undefined.
+export function prepareMcpToolCallStep(
+  stepNumber: number,
+): PrepareStepResult<ToolSet> | undefined {
+  switch (stepNumber) {
+    case 0:
+      return forcedToolStep("search_tools");
+    case 1:
+      return forcedToolStep("execute_tool");
+  }
+}
+
+// Harness running generateText on gpt-4o with withMockMcpClient's tools.
+export function createMcpToolCallHarness(
+  maxSteps = 6,
+): Harness<string, string, ToolCallEvalMetadata> {
+  const system = [
+    "You are a Sentry assistant with access to Sentry MCP tools.",
+    "Use search_tools before execute_tool when the needed Sentry operation is not directly listed as a tool.",
+    "When search_tools returns a tool, call execute_tool with that returned tool name and arguments matching the returned schema.",
+    "When the user says 'from Sentry in <organization>', Sentry is the product name and <organization> is the organizationSlug.",
+  ].join("\n");
+
+  return aiSdkHarness<
+    undefined,
+    string,
+    ToolCallEvalMetadata,
+    HarnessRun<string>
+  >({
+    name: "mcp-tool-call",
+    run: async ({ input, context }) =>
+      withMockMcpClient(async (client) => {
+        // Tools are wrapped so that each execution is recorded in `calls`.
+        const calls: ToolCallRecord[] = [];
+        const tools = captureMcpToolCalls(await client.tools(), calls);
+        const result = await generateText({
+          model: defaultModel,
+          tools,
+          system,
+          prompt: input,
+          stopWhen: stepCountIs(maxSteps),
+          abortSignal: context.signal,
+          prepareStep: ({ stepNumber }) => prepareMcpToolCallStep(stepNumber),
+          experimental_telemetry: {
+            isEnabled: true,
+            functionId: "catalog_tool_behavior_eval",
+          },
+        });
+
+        return createMcpToolCallRun(input, result, calls);
+      }),
+  });
+}
+
+export const mcpToolCallHarness = createMcpToolCallHarness();
diff --git a/packages/mcp-server-evals/src/evals/utils/mcpToolCallRunner.ts b/packages/mcp-server-evals/src/evals/utils/mcpToolCallRunner.ts
deleted file mode 100644
index 2c674c9a7..000000000
--- a/packages/mcp-server-evals/src/evals/utils/mcpToolCallRunner.ts
+++ /dev/null
@@ -1,64 +0,0 @@
-import { experimental_createMCPClient } from "@ai-sdk/mcp";
-import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio";
-import { openai } from "@ai-sdk/openai";
-import { generateText, stepCountIs, type LanguageModel } from "ai";
-
-const defaultModel = openai("gpt-4o");
-
-function toToolCall(call: { toolName: string; input: unknown }) {
-  const input =
-    call.input && typeof call.input === "object" && !Array.isArray(call.input)
-      ? (call.input as Record<string, unknown>)
-      : {};
-
-  return {
-    name: call.toolName,
-    arguments: input,
-  };
-}
-
-export function McpToolCallTaskRunner(
-  model: LanguageModel = defaultModel,
-  maxSteps = 6,
-) {
-  return async function McpToolCallTaskRunner(input: string) {
-    const transport = new Experimental_StdioMCPTransport({
-      command: "pnpm",
-      args: ["--filter", "@sentry/mcp-server-evals", "start"],
-      env: {
-        ...process.env,
-        SENTRY_ACCESS_TOKEN: "mocked-access-token",
-        SENTRY_HOST: "sentry.io",
-      },
-    });
-    const client = await experimental_createMCPClient({ transport });
-
-    try {
-      const tools = await client.tools();
-      const result = await generateText({
-        model,
-        tools,
-        system: [
-          "You are a Sentry assistant with access to Sentry MCP tools.",
-          "Use search_tools before execute_tool when the needed Sentry operation is not directly listed as a tool.",
-          "When search_tools returns a tool, call execute_tool with that returned tool name and arguments matching the returned schema.",
-        ].join("\n"),
-        prompt: input,
-        stopWhen: stepCountIs(maxSteps),
-        experimental_telemetry: {
-          isEnabled: true,
-          functionId: "catalog_tool_behavior_eval",
-        },
-      });
-
-      return {
-        result: result.text,
-        toolCalls: result.steps.flatMap((step) =>
-          step.toolCalls.map(toToolCall),
-        ),
-      };
-    } finally {
-      await client.close();
-    }
-  };
-}
diff --git a/packages/mcp-server-evals/src/evals/utils/runner.ts b/packages/mcp-server-evals/src/evals/utils/runner.ts
deleted file mode 100644
index 7a8e6d105..000000000
--- a/packages/mcp-server-evals/src/evals/utils/runner.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-/**
- * A no-op task runner that doesn't execute tools, just returns the input
- * for use with ToolPredictionScorer. This allows tests to focus on predicting
- * which tools would be called without actually executing them.
- */
-export function NoOpTaskRunner() {
-  return async function NoOpTaskRunner(input: string) {
-    // Just return the input as the result, no tool execution
-    return {
-      result: input,
-      toolCalls: [],
-    };
-  };
-}
diff --git a/packages/mcp-server-evals/src/evals/utils/structuredOutputScorer.ts b/packages/mcp-server-evals/src/evals/utils/structuredOutputScorer.ts
deleted file mode 100644
index 65fdf4cd9..000000000
--- a/packages/mcp-server-evals/src/evals/utils/structuredOutputScorer.ts
+++ /dev/null
@@ -1,282 +0,0 @@
-import type { Score, ScoreFn, BaseScorerOptions } from "vitest-evals";
-
-interface StructuredOutputScorerOptions extends BaseScorerOptions {
-  expected?: Record<string, any>;
-}
-
-interface StructuredOutputScorerConfig {
-  /**
-   * How to match field values
-   * - "strict": Exact equality required (default)
-   * - "fuzzy": More flexible matching (regex patterns, type coercion)
-   * - Custom function: Your own comparison logic
-   * @default "strict"
-   */
-  match?:
-    | "strict"
-    | "fuzzy"
-    | ((expected: any, actual: any, key: string) => boolean);
-
-  /**
-   * Whether all expected fields must be present for a passing score
-   * When false: gives partial credit based on fields matched
-   * @default true
-   */
-  requireAll?: boolean;
-
-  /**
-   * Whether to allow additional fields beyond those expected
-   * @default true
-   */
-  allowExtras?: boolean;
-
-  /**
-   * Enable debug logging
-   * @default false
-   */
-  debug?: boolean;
-}
-
-/**
- * A configurable scorer for evaluating structured outputs (e.g., JSON) from LLM responses.
- *
- * Similar to ToolCallScorer but for validating structured data outputs like API queries.
- *
- * @param config - Configuration options for the scorer
- * @param config.match - How to match field values: "strict", "fuzzy", or custom function
- * @param config.requireAll - Require all expected fields (vs partial credit)
- * @param config.allowExtras - Allow additional fields in output
- * @param config.debug - Enable debug logging
- *
- * @example
- * // Default: strict matching
- * describeEval("query generation", {
- *   data: async () => [{
- *     input: "Show me errors from today",
- *     expected: {
- *       dataset: "errors",
- *       query: "",
- *       sort: "-timestamp",
- *       timeRange: { statsPeriod: "24h" }
- *     }
- *   }],
- *   task: myTask,
- *   scorers: [StructuredOutputScorer()]
- * });
- *
- * @example
- * // Fuzzy matching with regex patterns
- * describeEval("flexible query matching", {
- *   data: async () => [{
- *     input: "Find slow API calls",
- *     expected: {
- *       dataset: "spans",
- *       query: /span\.duration:>1000|span\.duration:>1s/,
- *       sort: "-span.duration"
- *     }
- *   }],
- *   task: myTask,
- *   scorers: [StructuredOutputScorer({ match: "fuzzy" })]
- * });
- */
-export function StructuredOutputScorer(
-  config: StructuredOutputScorerConfig = {},
-): ScoreFn<StructuredOutputScorerOptions> {
-  const {
-    match = "strict",
-    requireAll = true,
-    allowExtras = true,
-    debug = false,
-  } = config;
-
-  return async (opts: StructuredOutputScorerOptions): Promise<Score> => {
-    const { output, expected } = opts;
-
-    // If no expected output provided, just check if we got valid JSON
-    if (!expected) {
-      try {
-        JSON.parse(output);
-        return { score: 1, metadata: { rationale: "Valid JSON output" } };
-      } catch {
-        return { score: 0, metadata: { rationale: "Invalid JSON output" } };
-      }
-    }
-
-    let parsed: Record<string, any>;
-    try {
-      parsed = JSON.parse(output);
-    } catch (error) {
-      return {
-        score: 0,
-        metadata: { rationale: `Failed to parse output as JSON: ${error}` },
-      };
-    }
-
-    // Check for error field in output
-    if (parsed.error && parsed.error !== "" && parsed.error !== null) {
-      return {
-        score: 0,
-        metadata: { rationale: `Output contains error: ${parsed.error}` },
-      };
-    }
-
-    const matchFn = getMatchFunction(match);
-    const { matches, mismatches, extras } = compareObjects(
-      expected,
-      parsed,
-      matchFn,
-    );
-
-    if (debug) {
-      console.log("StructuredOutputScorer debug:");
-      console.log("Expected:", expected);
-      console.log("Actual:", parsed);
-      console.log("Matches:", matches);
-      console.log("Mismatches:", mismatches);
-      console.log("Extras:", extras);
-    }
-
-    // Calculate score
-    const totalExpected = Object.keys(expected).length;
-    const totalMatched = matches.length;
-    const hasExtras = extras.length > 0;
-
-    let score: number;
-    let rationale: string;
-
-    if (requireAll && mismatches.length > 0) {
-      score = 0;
-      rationale = `Missing required fields: ${mismatches.map((m) => m.key).join(", ")}`;
-    } else if (!allowExtras && hasExtras) {
-      score = 0;
-      rationale = `Unexpected extra fields: ${extras.join(", ")}`;
-    } else if (totalExpected === 0) {
-      score = 1;
-      rationale = "No expected fields to match";
-    } else {
-      score = totalMatched / totalExpected;
-      if (score === 1) {
-        rationale = "All expected fields match";
-      } else {
-        rationale = `Matched ${totalMatched}/${totalExpected} fields`;
-      }
-    }
-
-    // Add mismatch details to rationale
-    if (mismatches.length > 0 && score < 1) {
-      const details = mismatches
-        .map(
-          (m) =>
-            `${m.key}: expected ${formatValue(m.expected)}, got ${formatValue(m.actual)}`,
-        )
-        .join("; ");
-      rationale += ` - ${details}`;
-    }
-
-    return {
-      score,
-      metadata: {
-        rationale,
-        output,
-      },
-    };
-  };
-}
-
-function getMatchFunction(
-  match: StructuredOutputScorerConfig["match"],
-): (expected: any, actual: any, key: string) => boolean {
-  if (typeof match === "function") {
-    return match;
-  }
-
-  if (match === "fuzzy") {
-    return fuzzyMatch;
-  }
-
-  return strictMatch;
-}
-
-function strictMatch(expected: any, actual: any): boolean {
-  return JSON.stringify(expected) === JSON.stringify(actual);
-}
-
-function fuzzyMatch(expected: any, actual: any): boolean {
-  // Handle regex patterns
-  if (expected instanceof RegExp) {
-    return typeof actual === "string" && expected.test(actual);
-  }
-
-  // Handle functions (custom validators)
-  if (typeof expected === "function") {
-    return expected(actual);
-  }
-
-  // Handle null/undefined (intentionally using == for null/undefined check)
-  if (
-    expected === null ||
-    expected === undefined ||
-    actual === null ||
-    actual === undefined
-  ) {
-    return expected === actual;
-  }
-
-  // Handle arrays
-  if (Array.isArray(expected) && Array.isArray(actual)) {
-    if (expected.length !== actual.length) return false;
-    return expected.every((exp, i) => fuzzyMatch(exp, actual[i]));
-  }
-
-  // Handle objects
-  if (typeof expected === "object" && typeof actual === "object") {
-    return Object.keys(expected).every((key) =>
-      fuzzyMatch(expected[key], actual[key]),
-    );
-  }
-
-  // Handle primitives - fuzzy match allows type coercion (e.g., "1" matches 1)
-  // biome-ignore lint/suspicious/noDoubleEquals: Intentional for fuzzy matching with type coercion
-  return expected == actual;
-}
-
-interface ComparisonResult {
-  matches: Array<{ key: string; expected: any; actual: any }>;
-  mismatches: Array<{ key: string; expected: any; actual: any }>;
-  extras: string[];
-}
-
-function compareObjects(
-  expected: Record<string, any>,
-  actual: Record<string, any>,
-  matchFn: (expected: any, actual: any, key: string) => boolean,
-): ComparisonResult {
-  const matches: ComparisonResult["matches"] = [];
-  const mismatches: ComparisonResult["mismatches"] = [];
-
-  // Check expected fields
-  for (const [key, expectedValue] of Object.entries(expected)) {
-    const actualValue = actual[key];
-
-    if (matchFn(expectedValue, actualValue, key)) {
-      matches.push({ key, expected: expectedValue, actual: actualValue });
-    } else {
-      mismatches.push({ key, expected: expectedValue, actual: actualValue });
-    }
-  }
-
-  // Find extra fields
-  const expectedKeys = new Set(Object.keys(expected));
-  const extras = Object.keys(actual).filter((key) => !expectedKeys.has(key));
-
-  return { matches, mismatches, extras };
-}
-
-function formatValue(value: any): string {
-  if (value === undefined) return "undefined";
-  if (value === null) return "null";
-  if (value instanceof RegExp) return value.toString();
-  if (typeof value === "string") return `"${value}"`;
-  if (typeof value === "object") return JSON.stringify(value);
-  return String(value);
-}
diff --git a/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.test.ts b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.test.ts
new file mode 100644
index 000000000..5b7f28c4b
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.test.ts
@@ -0,0 +1,185 @@
+import { describe, expect, it } from "vitest";
+import type { Harness, HarnessRun } from "vitest-evals";
+import {
+  ToolPredictionJudge,
+  generatePredictionPrompt,
+} from "./toolPredictionHarness";
+import type { ToolPredictionMetadata, ToolPredictionOutput } from "./types";
+// Builds a minimal judge context around a stub run and harness for `output`.
+function createJudgeContext(
+  output: ToolPredictionOutput,
+  metadata: ToolPredictionMetadata,
+): Parameters<typeof ToolPredictionJudge.assess>[0] {
+  const run: HarnessRun<ToolPredictionOutput> = {
+    output,
+    session: { messages: [] },
+    usage: {},
+    errors: [],
+  };
+  const harness: Harness<string, ToolPredictionOutput, ToolPredictionMetadata> =
+    {
+      name: "test-tool-prediction",
+      run: async () => run,
+    };
+  // `session` reuses the run's session object; `toolCalls` starts out empty.
+  return {
+    input: "test input",
+    output,
+    toolCalls: [],
+    metadata,
+    run,
+    session: run.session,
+    harness,
+  };
+}
+
+describe("ToolPredictionJudge", () => {
+  it("does not leak expected tool calls into the prediction prompt", () => {
+    const prompt = generatePredictionPrompt(
+      ["- search_issues: Search Sentry issues"],
+      "Find recent crashes in production",
+    );
+    // The prompt must carry the catalog and task but no expected-call wording.
+    expect(prompt).toContain("- search_issues: Search Sentry issues");
+    expect(prompt).toContain("Find recent crashes in production");
+    expect(prompt).not.toContain("EXPECTED TOOL CALLS");
+    expect(prompt).not.toContain("follow them exactly");
+    expect(prompt).not.toContain("expected tools");
+  });
+
+  it("scores matching predicted tools", async () => {
+    const result = await ToolPredictionJudge.assess(
+      createJudgeContext(
+        {
+          score: 1,
+          rationale: "The task asks for accessible organizations.",
+          predictedTools: [
+            {
+              name: "find_organizations",
+              arguments: {},
+            },
+          ],
+        },
+        {
+          expectedTools: [
+            {
+              name: "find_organizations",
+              arguments: {},
+            },
+          ],
+        },
+      ),
+    );
+    // Predicted and expected calls are identical, so the full score is expected.
+    expect(result.score).toBe(1);
+    expect(result.metadata?.predictedTools).toEqual([
+      {
+        name: "find_organizations",
+        arguments: {},
+      },
+    ]);
+  });
+
+  it("uses deterministic score when the model underrates matching tools", async () => {
+    const result = await ToolPredictionJudge.assess(
+      createJudgeContext(
+        {
+          score: 0,
+          rationale: "The expected discovery call is not necessary.",
+          predictedTools: [
+            {
+              name: "find_organizations",
+              arguments: {},
+            },
+          ],
+        },
+        {
+          expectedTools: [
+            {
+              name: "find_organizations",
+              arguments: {},
+            },
+          ],
+        },
+      ),
+    );
+    // The final score follows the tool-call match (1), not the model's own 0.
+    expect(result.score).toBe(1);
+    expect(result.metadata?.modelScore).toBe(0);
+    expect(result.metadata?.deterministicScore).toBe(1);
+  });
+
+  it("ignores inflated model scores for wrong predicted tools", async () => {
+    const result = await ToolPredictionJudge.assess(
+      createJudgeContext(
+        {
+          score: 0.8,
+          rationale: "The prediction picked the wrong lookup path.",
+          predictedTools: [
+            {
+              name: "find_organizations",
+              arguments: {},
+            },
+          ],
+        },
+        {
+          expectedTools: [
+            {
+              name: "search_docs",
+              arguments: {
+                query: "rate limiting",
+              },
+            },
+          ],
+        },
+      ),
+    );
+    // The model's self-reported 0.8 must not influence the final score of 0.
+    expect(result.score).toBe(0);
+    expect(result.metadata?.rationale).toContain("wrong lookup path");
+    expect(result.metadata?.deterministicRationale).toContain(
+      "Partial match: 0/1",
+    );
+    expect(result.metadata?.deterministicScore).toBe(0);
+  });
+
+  it("uses deterministic partial scores for incomplete multi-step predictions", async () => {
+    const result = await ToolPredictionJudge.assess(
+      createJudgeContext(
+        {
+          score: 0.6,
+          rationale: "The prediction found the issue but missed the update.",
+          predictedTools: [
+            {
+              name: "search_issues",
+              arguments: {
+                organizationSlug: "sentry",
+              },
+            },
+          ],
+        },
+        {
+          expectedTools: [
+            {
+              name: "search_issues",
+              arguments: {
+                organizationSlug: "sentry",
+              },
+            },
+            {
+              name: "update_issue",
+              arguments: {
+                organizationSlug: "sentry",
+              },
+            },
+          ],
+        },
+      ),
+    );
+    // One of two expected calls was predicted: 0.5, not the model's own 0.6.
+    expect(result.score).toBe(0.5);
+    expect(result.metadata?.rationale).toContain("missed the update");
+    expect(result.metadata?.deterministicRationale).toContain("Partial match");
+    expect(result.metadata?.deterministicScore).toBe(0.5);
+  });
+});
diff --git a/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.ts b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.ts
new file mode 100644
index 000000000..255247069
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.ts
@@ -0,0 +1,189 @@
+import { openai } from "@ai-sdk/openai";
+import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk";
+import { generateObject, type GenerateObjectResult } from "ai";
+import {
+  createJudge,
+  ToolCallJudge,
+  type JudgeContext,
+  type JsonValue,
+  type ToolCallRecord,
+} from "vitest-evals";
+import { z } from "zod";
+import { requireJsonValue, toJsonRecord } from "./json";
+import { getAvailableToolDescriptions } from "./mcpClient";
+import type {
+  ExpectedToolCall,
+  PredictedToolCall,
+  ToolPredictionMetadata,
+  ToolPredictionOutput,
+} from "./types";
+
+const defaultModel = openai("gpt-4o");
+// Non-recursive JSON schemas: primitives, then one and two levels of nesting.
+const jsonPrimitiveSchema = z.union([
+  z.string(),
+  z.number(),
+  z.boolean(),
+  z.null(),
+]);
+const shallowJsonValueSchema = z.union([
+  jsonPrimitiveSchema,
+  z.array(jsonPrimitiveSchema),
+  z.record(jsonPrimitiveSchema),
+]);
+const jsonValueSchema: z.ZodType<JsonValue> = z.union([
+  shallowJsonValueSchema,
+  z.array(shallowJsonValueSchema),
+  z.record(shallowJsonValueSchema),
+]);
+// Object schema for a prediction: score in [0, 1], rationale, ordered tool calls.
+const predictionSchema = z.object({
+  score: z
+    .number()
+    .min(0)
+    .max(1)
+    .describe("Confidence score for the predicted tool calls from 0 to 1"),
+  rationale: z
+    .string()
+    .describe("Brief explanation of the score and predicted tool calls"),
+  predictedTools: z
+    .array(
+      z.object({
+        name: z.string().describe("Sentry MCP tool name"),
+        arguments: z.record(jsonValueSchema).optional().default({}),
+      }),
+    )
+    .describe("Ordered Sentry MCP tool calls the assistant would likely make"),
+});
+
+type RawToolPredictionOutput = z.infer<typeof predictionSchema>;
+type ToolPredictionResult = GenerateObjectResult<RawToolPredictionOutput>;
+// Builds the prediction prompt from the tool descriptions and the user task only.
+export function generatePredictionPrompt(
+  availableTools: string[],
+  task: string,
+) {
+  return `You are predicting which Sentry MCP tools an AI assistant would call for a user task.
+
+[AVAILABLE TOOLS]
+${availableTools.join("\n")}
+
+[USER TASK]
+${task}
+
+Return the ordered tool calls the assistant would likely make and a confidence score for your prediction. Do not answer the user task directly.
+
+Guidance:
+- Use only the available tool descriptions and the user task to decide.
+- Predict discovery calls only when an assistant would need them before the final action.
+- If the task does not require Sentry MCP tools, return an empty predictedTools array.
+- Include arguments only when they are available or strongly implied by the task.
+- Extra parameters like regionUrl are acceptable only when the assistant would have learned them from an earlier discovery call.
+- For natural-language search queries, preserve the user's meaning rather than inventing exact syntax.
+
+Score confidence as follows:
+- 1.0: The tool sequence is obvious from the task and catalog.
+- 0.8: The likely tools are clear, with minor uncertainty in arguments.
+- 0.6: The broad tool choice is plausible, but ordering or arguments are uncertain.
+- 0.3: A tool may be needed, but the task is ambiguous.
+- 0.0: No reliable tool prediction can be made.`;
+}
+
+// Normalizes one raw predicted call: the name is kept as-is and the
+// arguments are passed through toJsonRecord.
+function normalizePredictedToolCall(
+  toolCall: RawToolPredictionOutput["predictedTools"][number],
+): PredictedToolCall {
+  const { name, arguments: rawArguments } = toolCall;
+  return { name, arguments: toJsonRecord(rawArguments) };
+}
+
+// Converts the raw prediction object into a ToolPredictionOutput, copying
+// score and rationale and normalizing each predicted tool call.
+function normalizePredictionOutput(
+  output: RawToolPredictionOutput,
+): ToolPredictionOutput {
+  const { score, rationale, predictedTools } = output;
+  const normalizedTools = predictedTools.map(normalizePredictedToolCall);
+  return { score, rationale, predictedTools: normalizedTools };
+}
+
+// Reshapes a predicted call into a ToolCallRecord.
+// Only `name` and `arguments` are copied over.
+function toToolCallRecord(toolCall: PredictedToolCall): ToolCallRecord {
+  const { name, arguments: args } = toolCall;
+  return { name, arguments: args };
+}
+
+// Maps expected calls to { name, arguments }, running arguments through toJsonRecord.
+function normalizeExpectedToolCalls(expectedTools: ExpectedToolCall[] = []) {
+  return expectedTools.map(({ name, arguments: args }) => {
+    return { name, arguments: toJsonRecord(args) };
+  });
+}
+
+// Harness that asks gpt-4o for a schema-shaped tool-call prediction of the input.
+export function createToolPredictionHarness() {
+  return aiSdkHarness<
+    undefined,
+    string,
+    ToolPredictionMetadata,
+    ToolPredictionResult,
+    Record<string, never>,
+    ToolPredictionOutput
+  >({
+    name: "tool-prediction",
+    run: async ({ input, context }) => {
+      const toolDescriptions = await getAvailableToolDescriptions();
+      context.setArtifact("availableTools", toolDescriptions);
+      return await generateObject({
+        model: defaultModel,
+        prompt: generatePredictionPrompt(toolDescriptions, input),
+        schema: predictionSchema,
+        abortSignal: context.signal,
+        experimental_telemetry: {
+          isEnabled: true,
+          functionId: "tool_prediction_harness",
+        },
+      });
+    },
+    output: ({ result: { object } }) => normalizePredictionOutput(object),
+  });
+}
+
+const toolCallJudge = ToolCallJudge({
+  ordered: true,
+  params: "fuzzy",
+  requireAll: false,
+});
+
+// Scores predictions with toolCallJudge; the model's own score is metadata only.
+export const ToolPredictionJudge = createJudge<
+  JudgeContext<string, ToolPredictionOutput, ToolPredictionMetadata>
+>("ToolPredictionJudge", async (context) => {
+  const { output, metadata: evalMetadata } = context;
+  const predictedCalls = output.predictedTools.map(toToolCallRecord);
+  const verdict = await toolCallJudge.assess({
+    ...context,
+    toolCalls: predictedCalls,
+    expectedTools: evalMetadata.expectedTools,
+  });
+  const deterministicScore = verdict.score ?? 0;
+  return {
+    score: deterministicScore,
+    metadata: {
+      ...verdict.metadata,
+      rationale: output.rationale,
+      modelScore: output.score,
+      predictedTools: requireJsonValue(predictedCalls, "predictedTools"),
+      expectedTools: requireJsonValue(
+        normalizeExpectedToolCalls(evalMetadata.expectedTools),
+        "expectedTools",
+      ),
+      deterministicScore,
+      deterministicRationale: verdict.metadata?.rationale,
+    },
+  };
+});
+
+export const toolPredictionHarness = createToolPredictionHarness();
diff --git a/packages/mcp-server-evals/src/evals/utils/toolPredictionScorer.ts b/packages/mcp-server-evals/src/evals/utils/toolPredictionScorer.ts
deleted file mode 100644
index dcfaf1bbe..000000000
--- a/packages/mcp-server-evals/src/evals/utils/toolPredictionScorer.ts
+++ /dev/null
@@ -1,223 +0,0 @@
-import { openai } from "@ai-sdk/openai";
-import { generateObject, type LanguageModel } from "ai";
-import { z } from "zod";
-import { experimental_createMCPClient } from "@ai-sdk/mcp";
-import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio";
-
-// Cache for available tools to avoid reconnecting for each test
-let cachedTools: string[] | null = null;
-
-/**
- * Get available tools from the MCP server by connecting to it directly.
- * This ensures the tool list stays in sync with what's actually registered.
- */
-async function getAvailableTools(): Promise<string[]> {
-  if (cachedTools) {
-    return cachedTools;
-  }
-
-  // Use pnpm exec to run the binary from the workspace
-  const transport = new Experimental_StdioMCPTransport({
-    command: "pnpm",
-    args: [
-      "exec",
-      "sentry-mcp",
-      "--access-token=mocked-access-token",
-      "--all-scopes",
-    ],
-    env: {
-      ...process.env,
-      SENTRY_ACCESS_TOKEN: "mocked-access-token",
-      SENTRY_HOST: "sentry.io",
-    },
-  });
-
-  const client = await experimental_createMCPClient({
-    transport,
-  });
-
-  // Discover available tools
-  const toolsMap = await client.tools();
-
-  // Convert tools to the format expected by the scorer
-  cachedTools = Object.entries(toolsMap).map(([name, tool]) => {
-    // Extract the first line of description for a concise summary
-    const shortDescription = (tool as any).description?.split("\n")[0] || "";
-    return `${name} - ${shortDescription}`;
-  });
-
-  // Clean up
-  await client.close();
-
-  return cachedTools;
-}
-
-export interface ExpectedToolCall {
-  name: string;
-  arguments: Record<string, any>;
-}
-
-interface ToolPredictionScorerOptions {
-  input: string;
-  output: string;
-  expectedTools?: ExpectedToolCall[];
-  result?: any;
-}
-
-const defaultModel = openai("gpt-4o");
-
-const predictionSchema = z.object({
-  score: z.number().min(0).max(1).describe("Score from 0 to 1"),
-  rationale: z.string().describe("Explanation of the score"),
-  predictedTools: z
-    .array(
-      z.object({
-        name: z.string(),
-        arguments: z.record(z.any()).optional().default({}),
-      }),
-    )
-    .describe("What tools the AI would likely call"),
-});
-
-function generateSystemPrompt(
-  availableTools: string[],
-  task: string,
-  expectedDescription: string,
-): string {
-  return `You are evaluating whether an AI assistant with access to Sentry MCP tools would make the correct tool calls for a given task.
-
-[AVAILABLE TOOLS]
-${availableTools.join("\n")}
-
-[TASK]
-${task}
-
-[EXPECTED TOOL CALLS]
-${expectedDescription}
-
-Based on the task and available tools, predict what tools the AI would call to complete this task.
-
-IMPORTANT: Look at what information is already provided in the task:
-- When only an organization name is given (e.g., "in sentry-mcp-evals"), discovery calls ARE typically needed
-- When organization/project are given in "org/project" format, the AI may skip discovery if confident
-- The expected tool calls show what is ACTUALLY expected for this specific case - follow them exactly
-- Discovery calls (find_organizations, find_projects) are commonly used to get regionUrl and verify access
-- Match the expected tool sequence exactly - if expected includes discovery, predict discovery
-
-Consider:
-1. Match the expected tool sequence exactly - the expected tools show realistic AI behavior
-2. When a value like "sentry-mcp-evals" appears alone, it's typically an organizationSlug, not a projectSlug
-3. Arguments should match expected values (organizationSlug, projectSlug, name, etc.)
-4. For natural language queries in search_events, exact phrasing doesn't need to match
-5. Extra parameters like regionUrl are acceptable
-6. The AI commonly does discovery calls even when slugs appear to be provided, to get region info
-
-Score as follows:
-- 1.0: All expected tools would be called with correct arguments in the right order
-- 0.8: All expected tools would be called, minor differences (extra params, slight variations)
-- 0.6: Most expected tools would be called but missing some or wrong order
-- 0.3: Some expected tools would be called but significant issues
-- 0.0: Wrong tools or critical tools missing
-
-CRITICAL: The expected tools represent the actual realistic behavior for this specific case. Follow the expected sequence exactly:
-- If expected tools include discovery calls, predict discovery calls
-- If expected tools do NOT include discovery calls, do NOT predict them
-- The test author has determined what's appropriate for each specific scenario`;
-}
-
-/**
- * A scorer that uses AI to predict what tools would be called without executing them.
- * This is much faster than actually running the tools and checking what was called.
- *
- * @param model - Optional language model to use for predictions (defaults to gpt-4o)
- * @returns A scorer function that compares predicted vs expected tool calls
- *
- * @example
- * ```typescript
- * import { ToolPredictionScorer } from './utils/toolPredictionScorer';
- * import { NoOpTaskRunner } from './utils/runner';
- * import { describeEval } from 'vitest-evals';
- *
- * describeEval("Sentry issue search", {
- *   data: async () => [
- *     {
- *       input: "Find the newest issues in my-org",
- *       expectedTools: [
- *         { name: "find_organizations", arguments: {} },
- *         { name: "find_issues", arguments: { organizationSlug: "my-org", sortBy: "first_seen" } }
- *       ]
- *     }
- *   ],
- *   task: NoOpTaskRunner(), // Don't execute tools, just predict them
- *   scorers: [ToolPredictionScorer()],
- *   threshold: 0.8
- * });
- * ```
- *
- * The scorer works by:
- * 1. Connecting to the MCP server to get available tools and their descriptions
- * 2. Using AI to predict what tools would be called for the given task
- * 3. Comparing predictions against the expectedTools array
- * 4. Returning a score from 0.0 to 1.0 based on accuracy
- *
- * Scoring criteria:
- * - 1.0: All expected tools predicted with correct arguments in right order
- * - 0.8: All expected tools predicted, minor differences (extra params, slight variations)
- * - 0.6: Most expected tools predicted but missing some or wrong order
- * - 0.3: Some expected tools predicted but significant issues
- * - 0.0: Wrong tools or critical tools missing
- *
- * If `expectedTools` is not provided in test data, the scorer is automatically skipped
- * and returns `{ score: null }` to allow other scorers to run without interference.
- */
-export function ToolPredictionScorer(model: LanguageModel = defaultModel) {
-  return async function ToolPredictionScorer(
-    opts: ToolPredictionScorerOptions,
-  ) {
-    // If expectedTools is not defined, skip this scorer
-    if (!opts.expectedTools) {
-      return {
-        score: null,
-        metadata: {
-          rationale: "Skipped: No expectedTools defined for this test case",
-        },
-      };
-    }
-
-    const expectedTools = opts.expectedTools;
-
-    // Get available tools from the MCP server
-    const AVAILABLE_TOOLS = await getAvailableTools();
-
-    // Generate a description of the expected tools for the prompt
-    const expectedDescription = expectedTools
-      .map(
-        (tool) =>
-          `- ${tool.name} with arguments: ${JSON.stringify(tool.arguments)}`,
-      )
-      .join("\n");
-
-    const { object } = await generateObject({
-      model,
-      prompt: generateSystemPrompt(
-        AVAILABLE_TOOLS,
-        opts.input,
-        expectedDescription,
-      ),
-      schema: predictionSchema,
-      experimental_telemetry: {
-        isEnabled: true,
-        functionId: "tool_prediction_scorer",
-      },
-    });
-
-    return {
-      score: object.score,
-      metadata: {
-        rationale: object.rationale,
-        predictedTools: object.predictedTools,
-        expectedTools: expectedTools,
-      },
-    };
-  };
-}
diff --git a/packages/mcp-server-evals/src/evals/utils/types.ts b/packages/mcp-server-evals/src/evals/utils/types.ts
new file mode 100644
index 000000000..b73005d04
--- /dev/null
+++ b/packages/mcp-server-evals/src/evals/utils/types.ts
@@ -0,0 +1,36 @@
+import type { JsonValue } from "vitest-evals";
+
+export type JsonRecord = Record<string, JsonValue>; // object whose values are all JsonValue
+
+export interface ExpectedToolCall {
+  name: string;
+  arguments?: Record<string, unknown>; // optional: a case may name a tool without arguments
+}
+
+export type PredictedToolCall = {
+  name: string;
+  arguments?: JsonRecord;
+};
+
+export type ToolPredictionOutput = {
+  score: number; // model's own score; ToolPredictionJudge reports it as metadata.modelScore
+  rationale: string; // model's explanation; reported as metadata.rationale
+  predictedTools: PredictedToolCall[]; // judged against the case's expectedTools
+};
+
+// Same shape as ToolCallEvalMetadata; aliased so the two definitions cannot
+// drift apart.
+export type ToolPredictionMetadata = ToolCallEvalMetadata;
+
+export type ToolCallEvalMetadata = Record<string, unknown> & {
+  expectedTools?: ExpectedToolCall[];
+};
+
+export type StructuredEvalMetadata = ToolCallEvalMetadata & {
+  expected?: Record<string, unknown>;
+};
+
+export type EvalCase<TMetadata extends Record<string, unknown>> = {
+  input: string;
+  name?: string;
+} & TMetadata; // metadata fields sit at the top level of the case, next to input/name
diff --git a/packages/mcp-server-evals/vitest.config.ts b/packages/mcp-server-evals/vitest.config.ts
index 8d0f7cab7..cdbf92da4 100644
--- a/packages/mcp-server-evals/vitest.config.ts
+++ b/packages/mcp-server-evals/vitest.config.ts
@@ -3,7 +3,7 @@ import { defineConfig } from "vitest/config";
 
 export default defineConfig({
   test: {
-    include: ["**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}"],
+    include: ["src/**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}"],
     reporters: ["vitest-evals/reporter"],
     coverage: {
       provider: "v8",
diff --git a/packages/mcp-server-evals/vitest.unit.config.ts b/packages/mcp-server-evals/vitest.unit.config.ts
new file mode 100644
index 000000000..6ca4a5286
--- /dev/null
+++ b/packages/mcp-server-evals/vitest.unit.config.ts
@@ -0,0 +1,8 @@
+/// <reference types="vitest" />
+import { defineConfig } from "vitest/config";
+
+export default defineConfig({
+  test: {
+    include: ["src/**/*.test.ts"], // unit tests only; *.eval.* files are matched by vitest.config.ts
+  },
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 1574dbc70..86590370d 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -75,6 +75,9 @@ catalogs:
     '@vitejs/plugin-react':
       specifier: ^4.6.0
       version: 4.6.0
+    '@vitest-evals/harness-ai-sdk':
+      specifier: ^0.12.0
+      version: 0.12.0
     agents:
       specifier: ^0.3.10
       version: 0.3.10
@@ -166,8 +169,8 @@ catalogs:
       specifier: ^4.1.2
       version: 4.1.2
     vitest-evals:
-      specifier: ^0.4.0
-      version: 0.4.0
+      specifier: ^0.12.0
+      version: 0.12.0
     workers-mcp:
       specifier: 0.1.0-3
       version: 0.1.0-3
@@ -228,7 +231,7 @@ importers:
         version: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))
       vitest-evals:
         specifier: 'catalog:'
-        version: 0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))
+        version: 0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76)
     devDependencies:
       '@flue/cli':
         specifier: 'catalog:'
@@ -531,6 +534,9 @@ importers:
       '@sentry/mcp-server-tsconfig':
         specifier: workspace:*
         version: link:../mcp-server-tsconfig
+      '@vitest-evals/harness-ai-sdk':
+        specifier: 'catalog:'
+        version: 0.12.0(ai@6.0.64(zod@3.25.76))(vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76))
       ai:
         specifier: 'catalog:'
         version: 6.0.64(zod@3.25.76)
@@ -548,7 +554,7 @@ importers:
         version: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))
       vitest-evals:
         specifier: 'catalog:'
-        version: 0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))
+        version: 0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76)
       zod:
         specifier: 'catalog:'
         version: 3.25.76
@@ -2891,6 +2897,18 @@ packages:
     peerDependencies:
       vite: ^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0-beta.0
 
+  '@vitest-evals/core@0.12.0':
+    resolution: {integrity: sha512-JOatlrVw4jcP9VCBAFcM07pGxUA2iLt4Ks5jaRYqyATjkNwPYnyNDL+YHgvelANfPA0BBX8MzRfs6vEkzJgC+A==}
+
+  '@vitest-evals/harness-ai-sdk@0.12.0':
+    resolution: {integrity: sha512-0yvM80vMqhCl+bc9j3tlDQfOc5H3rL3VNO2RUX8fRgDuWJ3iORW+WDENP+L4PO85GHvLgvUVGDhx+IJBfb26DA==}
+    peerDependencies:
+      ai: '>=4 <7'
+      vitest-evals: '*'
+
+  '@vitest-evals/report-ui@0.12.0':
+    resolution: {integrity: sha512-rjWKnB+WL1ekiIvHdcnEX0tfaCwfeG3BNU6jvGKuJsHqkf8JRtuTyy/xgUKKsb56CokcZ3K3hmeo6RKik/KBrQ==}
+
   '@vitest/expect@4.1.2':
     resolution: {integrity: sha512-gbu+7B0YgUJ2nkdsRJrFFW6X7NTP44WlhiclHniUhxADQJH5Szt9mZ9hWnJPJ8YwOK5zUOSSlSvyzRf0u1DSBQ==}
 
@@ -5657,11 +5675,19 @@ packages:
       yaml:
         optional: true
 
-  vitest-evals@0.4.0:
-    resolution: {integrity: sha512-tvKIc8sCtK7LZnSTFLh5C7BlDzSZhefKzCR68QNShVa7gkiepg7CZH8j3T6ZBWwIa5VgfmFkZ1Iv5NKzUpSfGQ==}
+  vitest-evals@0.12.0:
+    resolution: {integrity: sha512-pyVA4N8gM+T2JB+SGFNSuXcgf/CHbBygAXkXR1fEPEfleKyMacJXPF9gLWIyyC1x5BCrt0r4zkwzkdjZrdpwZQ==}
+    hasBin: true
     peerDependencies:
-      tinyrainbow: '*'
-      vitest: '*'
+      ai: '>=4 <7'
+      tinyrainbow: '>=2 <4'
+      vitest: '>=4 <5'
+      zod: '>=3 <5'
+    peerDependenciesMeta:
+      ai:
+        optional: true
+      zod:
+        optional: true
 
   vitest@4.1.2:
     resolution: {integrity: sha512-xjR1dMTVHlFLh98JE3i/f/WePqJsah4A0FK9cc8Ehp9Udk0AZk6ccpIZhh1qJ/yxVWRZ+Q54ocnD8TXmkhspGg==}
@@ -8325,6 +8351,19 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  '@vitest-evals/core@0.12.0':
+    dependencies:
+      zod: 3.25.76
+
+  '@vitest-evals/harness-ai-sdk@0.12.0(ai@6.0.64(zod@3.25.76))(vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76))':
+    dependencies:
+      ai: 6.0.64(zod@3.25.76)
+      vitest-evals: 0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76)
+
+  '@vitest-evals/report-ui@0.12.0':
+    dependencies:
+      '@vitest-evals/core': 0.12.0
+
   '@vitest/expect@4.1.2':
     dependencies:
       '@standard-schema/spec': 1.1.0
@@ -11572,15 +11611,25 @@ snapshots:
       tsx: 4.20.3
       yaml: 2.8.3
 
-  vitest-evals@0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))):
+  vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76):
     dependencies:
+      '@vitest-evals/core': 0.12.0
+      '@vitest-evals/report-ui': 0.12.0
       tinyrainbow: 3.1.0
       vitest: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))
+    optionalDependencies:
+      ai: 6.0.64(zod@3.25.76)
+      zod: 3.25.76
 
-  vitest-evals@0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))):
+  vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76):
     dependencies:
+      '@vitest-evals/core': 0.12.0
+      '@vitest-evals/report-ui': 0.12.0
       tinyrainbow: 3.1.0
       vitest: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))
+    optionalDependencies:
+      ai: 6.0.64(zod@3.25.76)
+      zod: 3.25.76
 
   vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)):
     dependencies:
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
index 2a85df454..f46ec8a1b 100644
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@@ -58,7 +58,8 @@ catalog:
   valibot: ^1.4.0
   vite: ^6.3.5
   vitest: ^4.1.2
-  vitest-evals: ^0.4.0
+  '@vitest-evals/harness-ai-sdk': ^0.12.0
+  vitest-evals: ^0.12.0
   workers-mcp: 0.1.0-3
   wrangler: 4.80.0
   zod: ^3.25.67
diff --git a/turbo.json b/turbo.json
index a52f6860d..9d61d733b 100644
--- a/turbo.json
+++ b/turbo.json
@@ -47,6 +47,7 @@
         "**/*.test.ts",
         "**/*.spec.ts",
         "vitest.config.ts",
+        "vitest.unit.config.ts",
         "package.json"
       ],
       "outputs": ["coverage/**", "*.junit.xml"],
@@ -61,7 +62,7 @@
     "eval": {
       "dependsOn": ["^build"],
       "outputs": [],
-      "cache": true
+      "cache": false
     },
     "build": {
       "dependsOn": ["^build"],
@@ -87,6 +88,7 @@
     "NODE_ENV",
     "CI",
     "OPENAI_API_KEY",
+    "VITEST_EVALS_REPORT_LEVEL",
     "COOKIE_SECRET",
     "SENTRY_CLIENT_ID",
     "SENTRY_CLIENT_SECRET",