getsentry · dcramer · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -5,13 +5,15 @@ on:
   push:
     branches: [main]
     paths:
-      - "packages/mcp-core/src/tools*"
+      - "packages/mcp-core/src/tools/**"
+      - "packages/mcp-core/src/internal/agents/**"
       - "packages/mcp-server-evals/**"
       - "packages/mcp-server-mocks/**"
       - ".github/workflows/eval.yml"
   pull_request:
     paths:
-      - "packages/mcp-core/src/tools*"
+      - "packages/mcp-core/src/tools/**"
+      - "packages/mcp-core/src/internal/agents/**"
       - "packages/mcp-server-evals/**"
       - "packages/mcp-server-mocks/**"
       - ".github/workflows/eval.yml"
@@ -57,140 +59,17 @@ jobs:
         run: pnpm build
 
       - name: Run evals
-        run: pnpm eval:ci evals
+        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
+        run: pnpm --filter @sentry/mcp-server-evals eval:ci
         continue-on-error: true
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
-      - name: Create eval status check
-        uses: actions/github-script@v7
-        # Skip for fork PRs (no write permissions) but still run for pushes, workflow_dispatch, and same-repo PRs
+      - name: Report eval results
+        uses: getsentry/vitest-evals@v0
         if: ${{ !cancelled() && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }}
-        continue-on-error: true # Don't fail workflow if check creation fails
         with:
-          script: |
-            const fs = require('fs');
-            const path = require('path');
-
-            // Read eval results
-            const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
-            console.log(`Reading eval results from: ${resultsPath}`);
-
-            let vitestResults;
-            try {
-              vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
-            } catch (error) {
-              if (error.code === 'ENOENT') {
-                throw new Error(
-                  `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
-                );
-              }
-              throw new Error(`Failed to read/parse eval results: ${error.message}`);
-            }
-
-            // Extract eval results from vitest format
-            const evalResults = [];
-            for (const testFile of vitestResults.testResults || []) {
-              for (const test of testFile.assertionResults || []) {
-                if (test.meta?.eval) {
-                  evalResults.push({
-                    name: test.fullName || test.title,
-                    file: testFile.name,
-                    avgScore: test.meta.eval.avgScore ?? null,
-                    scores: test.meta.eval.scores || [],
-                    passed: test.status === 'passed',
-                    duration: test.duration,
-                  });
-                }
-              }
-            }
-
-            // Calculate statistics
-            const totalTests = evalResults.length;
-            // Treat null scores as 0.0 for consistent categorization
-            const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0);
-
-            const avgScore = scores.length > 0
-              ? scores.reduce((sum, score) => sum + score, 0) / scores.length
-              : 0;
-
-            const green = scores.filter(s => s >= 0.75).length;
-            const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length;
-            const red = scores.filter(s => s < 0.5).length;
-
-            // Determine conclusion
-            const conclusion = avgScore >= 0.5 ? 'success' : 'failure';
-
-            // Format score helper
-            function formatScore(score) {
-              if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
-              if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
-              return `🔴 ${score.toFixed(2)}`;
-            }
-
-            // Build title
-            const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;
-
-            // Build summary
-            const summary = [
-              `## Overall Statistics`,
-              ``,
-              `- **Total Evaluations**: ${totalTests}`,
-              `- **Average Score**: ${formatScore(avgScore)}`,
-              `- **Pass Threshold**: 0.50 (catastrophic failure)`,
-              ``,
-              `### Score Distribution`,
-              `- 🟢 Green (≥0.75): ${green} evals`,
-              `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
-              `- 🔴 Red (<0.50): ${red} evals`,
-            ].join('\n');
-
-            // Build detailed results
-            const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
-            const details = [
-              `## Individual Eval Scores`,
-              ``,
-              ...detailsByScore.map(result => {
-                const score = result.avgScore !== null ? result.avgScore : 0;
-                const statusIcon = result.passed ? '✅' : '❌';
-                const scoreDisplay = formatScore(score);
-
-                let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;
-
-                // Add rationale for failed or low-scoring tests
-                if (!result.passed || score < 0.75) {
-                  const firstScore = result.scores[0];
-                  if (firstScore?.metadata?.rationale) {
-                    line += `\n   - ${firstScore.metadata.rationale}`;
-                  }
-                }
-
-                return line;
-              }),
-              ``,
-              `---`,
-              ``,
-              `### Conclusion`,
-              ``,
-              conclusion === 'success'
-                ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
-                : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
-            ].join('\n');
-
-            // Create check run
-            await github.rest.checks.create({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'Evaluation Results',
-              head_sha: context.sha,
-              status: 'completed',
-              conclusion: conclusion,
-              output: {
-                title: title,
-                summary: summary,
-                text: details,
-              },
-            });
-
-            console.log(`✅ Check run created with conclusion: ${conclusion}`);
-            console.log(`   Average Score: ${avgScore.toFixed(2)}`);
+          results: packages/mcp-server-evals/eval-results.json
+          publish-check: true
+          check-name: Evaluation Results
+          fail-on-failures: true
diff --git a/docs/adding-tools.md b/docs/adding-tools.md
@@ -255,20 +255,25 @@ See [api-patterns.md](api-patterns.md#mock-patterns) for validation examples.
 **⚠️ Each eval costs time and API credits. Only test core functionality!**
 
 ```typescript
-describeEval("your-tool", {
-  data: async () => [
-    {
-      input: `Primary use case in ${FIXTURES.organizationSlug}`,
-      expected: "Expected response"
-    },
-    // Maximum 2-3 scenarios!
-  ],
-  task: TaskRunner(),
-  scorers: [Factuality()],
-  threshold: 0.6,
-});
+import { describeToolPredictionEval, FIXTURES } from "./utils";
+
+describeToolPredictionEval("your-tool", [
+  {
+    input: `Primary use case in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "your_tool",
+        arguments: { organizationSlug: FIXTURES.organizationSlug },
+      },
+    ],
+  },
+  // Maximum 2-3 scenarios!
+]);
 ```
 
+Use `describeMcpToolCallEval` instead when the eval needs to execute the full
+MCP harness and validate actual tool calls, usage data, and traces.
+
 ## Testing Workflow
 
 ```bash
@@ -279,7 +284,7 @@ pnpm test tools.test
 pnpm inspector
 
 # 3. Run minimal evals
-pnpm eval your-tool
+pnpm --filter @sentry/mcp-server-evals eval your-tool
 ```
 
 ## Checklist

diff --git a/docs/pr-management.md b/docs/pr-management.md
@@ -184,11 +184,11 @@ datasets: errors, logs, and spans.
 Co-Authored-By: Codex CLI Agent <noreply@openai.com>"
 
 # Bug fix
-git commit -m "fix(evals): update search-events eval to use available exports
+git commit -m "fix(evals): migrate search-events eval to shared harness
 
-Replace missing TaskRunner and Factuality imports with NoOpTaskRunner 
-and ToolPredictionScorer to resolve CI build failures after factuality 
-checker removal.
+Replace bespoke prediction scoring with describeToolPredictionEval so the
+suite uses the shared vitest-evals harness, report metadata, and GitHub check
+output.
 
 Co-Authored-By: Codex CLI Agent <noreply@openai.com>"
 

diff --git a/docs/testing.md b/docs/testing.md
@@ -253,33 +253,42 @@ expect(result.timestamp).toMatchInlineSnapshot(); // ❌
 ### Eval Test Structure
 
 ```typescript
-import { describeEval } from "vitest-evals";
-import { TaskRunner, Factuality } from "./utils";
-
-describeEval("tool-name", {
-  data: async () => [
-    {
-      input: "Natural language request",
-      expected: "Expected response content"
-    }
-  ],
-  task: TaskRunner(),      // Uses AI to call tools
-  scorers: [Factuality()], // Validates output
-  threshold: 0.6,
-  timeout: 30000
-});
+import { describeToolPredictionEval, FIXTURES } from "./utils";
+
+describeToolPredictionEval("tool-name", [
+  {
+    input: `Natural language request in ${FIXTURES.organizationSlug}`,
+    expectedTools: [
+      {
+        name: "tool_name",
+        arguments: { organizationSlug: FIXTURES.organizationSlug },
+      },
+    ],
+  },
+]);
 ```
 
+Use `describeToolPredictionEval` for fast tool-selection coverage. Use
+`describeMcpToolCallEval` when the eval must run the full MCP harness and
+capture actual tool calls, usage, and traces. Use `describeSearchAgentEval` for
+embedded search agents that return structured query output.
+
 ### Running Evals
 
 ```bash
 # Requires OPENAI_API_KEY in .env
 pnpm eval
 
 # Run specific eval
-pnpm eval tool-name
+pnpm --filter @sentry/mcp-server-evals eval tool-name
+
+# Serve the last JSON report locally
+pnpm eval:report
 ```
 
+Eval runs write `packages/mcp-server-evals/eval-results.json`; CI and the local
+report UI both read that JSON artifact.
+
 ## Test Data Management
 
 ### Using Fixtures

diff --git a/package.json b/package.json
@@ -27,6 +27,8 @@
     "deploy": "turbo deploy",
     "eval": "dotenv -e .env -e .env.local -- turbo eval",
     "eval:ci": "CI=true dotenv -e .env -e .env.local -- pnpm --stream -r run eval:ci",
+    "eval:report": "pnpm --filter @sentry/mcp-server-evals eval:report",
+    "eval:ui": "pnpm --filter @sentry/mcp-server-evals eval:ui",
     "flue:issue-triage": "flue run issue-triage --target node",
     "format": "biome format --write",
     "lint": "biome lint",

diff --git a/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts b/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts
@@ -2,6 +2,7 @@ import {
   generateText,
   Output,
   type Tool,
+  type GenerateTextResult,
   APICallError,
   NoObjectGeneratedError,
   stepCountIs,
@@ -16,9 +17,17 @@ export type ToolCall = {
   args: unknown;
 };
 
+type EmbeddedAgentGenerateResult = GenerateTextResult<
+  Record<string, Tool>,
+  ReturnType<typeof Output.object>
+>;
+
 interface EmbeddedAgentResult<T> {
   result: T;
   toolCalls: ToolCall[];
+  steps?: EmbeddedAgentGenerateResult["steps"]; // copied from result.steps on the success return; optional — NOTE(review): presumably absent on the error-rescue path, confirm
+  usage?: EmbeddedAgentGenerateResult["usage"]; // copied from result.usage on the success return
+  totalUsage?: EmbeddedAgentGenerateResult["totalUsage"]; // copied from result.totalUsage on the success return
 }
 
 /**
@@ -54,7 +63,7 @@ export async function callEmbeddedAgent<
       system,
       prompt,
       tools,
-      stopWhen: stepCountIs(5),
+      stopWhen: stepCountIs(7),
       experimental_output: Output.object({ schema }),
       experimental_telemetry: {
         isEnabled: true,
@@ -101,6 +110,9 @@ export async function callEmbeddedAgent<
     return {
       result: parsedResult.data,
       toolCalls: capturedToolCalls,
+      steps: result.steps,
+      usage: result.usage,
+      totalUsage: result.totalUsage,
     };
   } catch (error: unknown) {
     // Rescue NoObjectGeneratedError: try to parse the raw LLM text through the schema