diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 78a14e4c1..ea975ead1 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -5,13 +5,15 @@ on: push: branches: [main] paths: - - "packages/mcp-core/src/tools*" + - "packages/mcp-core/src/tools/**" + - "packages/mcp-core/src/internal/agents/**" - "packages/mcp-server-evals/**" - "packages/mcp-server-mocks/**" - ".github/workflows/eval.yml" pull_request: paths: - - "packages/mcp-core/src/tools*" + - "packages/mcp-core/src/tools/**" + - "packages/mcp-core/src/internal/agents/**" - "packages/mcp-server-evals/**" - "packages/mcp-server-mocks/**" - ".github/workflows/eval.yml" @@ -57,140 +59,17 @@ jobs: run: pnpm build - name: Run evals - run: pnpm eval:ci evals + if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }} + run: pnpm --filter @sentry/mcp-server-evals eval:ci continue-on-error: true env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - - name: Create eval status check - uses: actions/github-script@v7 - # Skip for fork PRs (no write permissions) but still run for pushes, workflow_dispatch, and same-repo PRs + - name: Report eval results + uses: getsentry/vitest-evals@v0 if: ${{ !cancelled() && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }} - continue-on-error: true # Don't fail workflow if check creation fails with: - script: | - const fs = require('fs'); - const path = require('path'); - - // Read eval results - const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json'); - console.log(`Reading eval results from: ${resultsPath}`); - - let vitestResults; - try { - vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8')); - } catch (error) { - if (error.code === 'ENOENT') { - throw new Error( - `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.` - ); - } - throw new Error(`Failed to read/parse eval results: ${error.message}`); - } - - // Extract eval results from vitest format - const evalResults = []; - for (const testFile of vitestResults.testResults || []) { - for (const test of testFile.assertionResults || []) { - if (test.meta?.eval) { - evalResults.push({ - name: test.fullName || test.title, - file: testFile.name, - avgScore: test.meta.eval.avgScore ?? null, - scores: test.meta.eval.scores || [], - passed: test.status === 'passed', - duration: test.duration, - }); - } - } - } - - // Calculate statistics - const totalTests = evalResults.length; - // Treat null scores as 0.0 for consistent categorization - const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0); - - const avgScore = scores.length > 0 - ? scores.reduce((sum, score) => sum + score, 0) / scores.length - : 0; - - const green = scores.filter(s => s >= 0.75).length; - const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length; - const red = scores.filter(s => s < 0.5).length; - - // Determine conclusion - const conclusion = avgScore >= 0.5 ? 'success' : 'failure'; - - // Format score helper - function formatScore(score) { - if (score >= 0.75) return `🟢 ${score.toFixed(2)}`; - if (score >= 0.5) return `🟡 ${score.toFixed(2)}`; - return `🔴 ${score.toFixed(2)}`; - } - - // Build title - const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`; - - // Build summary - const summary = [ - `## Overall Statistics`, - ``, - `- **Total Evaluations**: ${totalTests}`, - `- **Average Score**: ${formatScore(avgScore)}`, - `- **Pass Threshold**: 0.50 (catastrophic failure)`, - ``, - `### Score Distribution`, - `- 🟢 Green (≥0.75): ${green} evals`, - `- 🟡 Yellow (0.50-0.74): ${yellow} evals`, - `- 🔴 Red (<0.50): ${red} evals`, - ].join('\n'); - - // Build detailed results - const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0)); - const details = [ - `## Individual Eval Scores`, - ``, - ...detailsByScore.map(result => { - const score = result.avgScore !== null ? result.avgScore : 0; - const statusIcon = result.passed ? '✅' : '❌'; - const scoreDisplay = formatScore(score); - - let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`; - - // Add rationale for failed or low-scoring tests - if (!result.passed || score < 0.75) { - const firstScore = result.scores[0]; - if (firstScore?.metadata?.rationale) { - line += `\n - ${firstScore.metadata.rationale}`; - } - } - - return line; - }), - ``, - `---`, - ``, - `### Conclusion`, - ``, - conclusion === 'success' - ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)` - : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`, - ].join('\n'); - - // Create check run - await github.rest.checks.create({ - owner: context.repo.owner, - repo: context.repo.repo, - name: 'Evaluation Results', - head_sha: context.sha, - status: 'completed', - conclusion: conclusion, - output: { - title: title, - summary: summary, - text: details, - }, - }); - - console.log(`✅ Check run created with conclusion: ${conclusion}`); - console.log(` Average Score: ${avgScore.toFixed(2)}`); \ No newline at end of file + results: packages/mcp-server-evals/eval-results.json + publish-check: true + check-name: Evaluation Results + fail-on-failures: true diff --git a/docs/adding-tools.md b/docs/adding-tools.md index aa8be1e59..1252e43d4 100644 --- a/docs/adding-tools.md +++ b/docs/adding-tools.md @@ -255,20 +255,25 @@ See [api-patterns.md](api-patterns.md#mock-patterns) for validation examples. **⚠️ Each eval costs time and API credits. Only test core functionality!** ```typescript -describeEval("your-tool", { - data: async () => [ - { - input: `Primary use case in ${FIXTURES.organizationSlug}`, - expected: "Expected response" - }, - // Maximum 2-3 scenarios! - ], - task: TaskRunner(), - scorers: [Factuality()], - threshold: 0.6, -}); +import { describeToolPredictionEval, FIXTURES } from "./utils"; + +describeToolPredictionEval("your-tool", [ + { + input: `Primary use case in ${FIXTURES.organizationSlug}`, + expectedTools: [ + { + name: "your_tool", + arguments: { organizationSlug: FIXTURES.organizationSlug }, + }, + ], + }, + // Maximum 2-3 scenarios! +]); ``` +Use `describeMcpToolCallEval` instead when the eval needs to execute the full +MCP harness and validate actual tool calls, usage data, and traces. + ## Testing Workflow ```bash @@ -279,7 +284,7 @@ pnpm test tools.test pnpm inspector # 3. Run minimal evals -pnpm eval your-tool +pnpm --filter @sentry/mcp-server-evals eval your-tool ``` ## Checklist diff --git a/docs/pr-management.md b/docs/pr-management.md index b5b90f248..2c817e335 100644 --- a/docs/pr-management.md +++ b/docs/pr-management.md @@ -184,11 +184,11 @@ datasets: errors, logs, and spans. Co-Authored-By: Codex CLI Agent " # Bug fix -git commit -m "fix(evals): update search-events eval to use available exports +git commit -m "fix(evals): migrate search-events eval to shared harness -Replace missing TaskRunner and Factuality imports with NoOpTaskRunner -and ToolPredictionScorer to resolve CI build failures after factuality -checker removal. +Replace bespoke prediction scoring with describeToolPredictionEval so the +suite uses the shared vitest-evals harness, report metadata, and GitHub check +output. Co-Authored-By: Codex CLI Agent " diff --git a/docs/testing.md b/docs/testing.md index 827707809..8704fbd60 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -253,23 +253,26 @@ expect(result.timestamp).toMatchInlineSnapshot(); // ❌ ### Eval Test Structure ```typescript -import { describeEval } from "vitest-evals"; -import { TaskRunner, Factuality } from "./utils"; - -describeEval("tool-name", { - data: async () => [ - { - input: "Natural language request", - expected: "Expected response content" - } - ], - task: TaskRunner(), // Uses AI to call tools - scorers: [Factuality()], // Validates output - threshold: 0.6, - timeout: 30000 -}); +import { describeToolPredictionEval, FIXTURES } from "./utils"; + +describeToolPredictionEval("tool-name", [ + { + input: `Natural language request in ${FIXTURES.organizationSlug}`, + expectedTools: [ + { + name: "your_tool", + arguments: { organizationSlug: FIXTURES.organizationSlug }, + }, + ], + }, +]); ``` +Use `describeToolPredictionEval` for fast tool-selection coverage. Use +`describeMcpToolCallEval` when the eval must run the full MCP harness and +capture actual tool calls, usage, and traces. Use `describeSearchAgentEval` for +embedded search agents that return structured query output. + ### Running Evals ```bash @@ -277,9 +280,15 @@ describeEval("tool-name", { pnpm eval # Run specific eval -pnpm eval tool-name +pnpm --filter @sentry/mcp-server-evals eval tool-name + +# Serve the last JSON report locally +pnpm eval:report ``` +Eval runs write `packages/mcp-server-evals/eval-results.json`; CI and the local +report UI both read that JSON artifact. + ## Test Data Management ### Using Fixtures diff --git a/package.json b/package.json index d26fc15a2..c8cbd4325 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,8 @@ "deploy": "turbo deploy", "eval": "dotenv -e .env -e .env.local -- turbo eval", "eval:ci": "CI=true dotenv -e .env -e .env.local -- pnpm --stream -r run eval:ci", + "eval:report": "pnpm --filter @sentry/mcp-server-evals eval:report", + "eval:ui": "pnpm --filter @sentry/mcp-server-evals eval:ui", "flue:issue-triage": "flue run issue-triage --target node", "format": "biome format --write", "lint": "biome lint", diff --git a/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts b/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts index 845d46484..1c0139f09 100644 --- a/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts +++ b/packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts @@ -2,6 +2,7 @@ import { generateText, Output, type Tool, + type GenerateTextResult, APICallError, NoObjectGeneratedError, stepCountIs, @@ -16,9 +17,17 @@ export type ToolCall = { args: unknown; }; +type EmbeddedAgentGenerateResult = GenerateTextResult< + Record, + ReturnType +>; + interface EmbeddedAgentResult { result: T; toolCalls: ToolCall[]; + steps?: EmbeddedAgentGenerateResult["steps"]; + usage?: EmbeddedAgentGenerateResult["usage"]; + totalUsage?: EmbeddedAgentGenerateResult["totalUsage"]; } /** @@ -54,7 +63,7 @@ export async function callEmbeddedAgent< system, prompt, tools, - stopWhen: stepCountIs(5), + stopWhen: stepCountIs(7), experimental_output: Output.object({ schema }), experimental_telemetry: { isEnabled: true, @@ -101,6 +110,9 @@ export async function callEmbeddedAgent< return { result: parsedResult.data, toolCalls: capturedToolCalls, + steps: result.steps, + usage: result.usage, + totalUsage: result.totalUsage, }; } catch (error: unknown) { // Rescue NoObjectGeneratedError: try to parse the raw LLM text through the schema diff --git a/packages/mcp-core/src/skillDefinitions.json b/packages/mcp-core/src/skillDefinitions.json index 74a6b4060..83ccf9438 100644 --- a/packages/mcp-core/src/skillDefinitions.json +++ b/packages/mcp-core/src/skillDefinitions.json @@ -64,7 +64,7 @@ }, { "name": "get_sentry_resource", - "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", + "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- breadcrumbs: issue shortId or event ID\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", "requiredScopes": ["event:read", "project:read"] }, { @@ -129,7 +129,7 @@ }, { "name": "get_sentry_resource", - "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", + "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- breadcrumbs: issue shortId or event ID\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", "requiredScopes": ["event:read", "project:read"] }, { @@ -219,7 +219,7 @@ }, { "name": "get_sentry_resource", - "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", + "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- breadcrumbs: issue shortId or event ID\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", "requiredScopes": ["event:read", "project:read"] }, { @@ -329,7 +329,7 @@ }, { "name": "get_sentry_resource", - "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Use the Sentry tool `get_snapshot_image(organizationSlug='', snapshotId='', imageIdentifier='', imageResolution='full')` for full-resolution image bytes.\n\nResource IDs:\n- span: :\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", + "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Use the Sentry tool `get_snapshot_image(organizationSlug='', snapshotId='', imageIdentifier='', imageResolution='full')` for full-resolution image bytes.\n\nResource IDs:\n- span: :\n- breadcrumbs: issue shortId or event ID\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", "requiredScopes": ["event:read", "project:read"] }, { diff --git a/packages/mcp-core/src/toolDefinitions.json b/packages/mcp-core/src/toolDefinitions.json index d767e70df..e91775915 100644 --- a/packages/mcp-core/src/toolDefinitions.json +++ b/packages/mcp-core/src/toolDefinitions.json @@ -587,7 +587,7 @@ }, { "name": "get_sentry_resource", - "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", + "description": "Fetch a Sentry resource by URL, or by resourceType plus resourceId.\nPass a Sentry URL directly when possible; the resource type is auto-detected.\n\nSupports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.\nTrace lookups return a condensed overview by default.\n\nAI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.\n\nFor preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):\n- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)\n- With ?selectedSnapshot=: returns the image preview and metadata. Full-resolution snapshot image bytes are not available in this session.\n\nResource IDs:\n- span: :\n- breadcrumbs: issue shortId or event ID\n- snapshot: \n- snapshotImage: :\n\n\nget_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')\nget_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')\nget_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')\nget_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')\nget_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/?selectedSnapshot=login_screen.png')\n", "inputSchema": { "type": "object", "properties": { @@ -613,7 +613,7 @@ }, "resourceId": { "type": "string", - "description": "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, `:` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL." + "description": "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, issue shortId or event ID for breadcrumbs, `:` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL." }, "organizationSlug": { "type": "string", diff --git a/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts b/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts index d40e00e13..b545cb28f 100644 --- a/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts +++ b/packages/mcp-core/src/tools/catalog/get-sentry-resource.ts @@ -529,7 +529,7 @@ export default defineTool({ "Supports issues, events, traces, spans, AI conversations, breadcrumbs, replays, preprod snapshots, and snapshot images.", "Trace lookups return a condensed overview by default.", "", - "AI Conversations: A conversation is a set of spans sharing the same gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID to fetch all spans for that conversation. To discover or list conversation IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues — do not use search_issues for conversation queries.", + "AI Conversations: spans sharing gen_ai.conversation.id. Use resourceType='ai_conversation' with a conversation ID. To discover IDs, use search_events with dataset='spans' and query='has:gen_ai.conversation.id'. Conversations are NOT issues.", "", "For preprod snapshot URLs (matching 'sentry.io/preprod/snapshots/'):", "- Without ?selectedSnapshot=: returns the snapshot diff summary (changed, added, removed images)", @@ -537,12 +537,14 @@ export default defineTool({ "", "Resource IDs:", "- span: :", + "- breadcrumbs: issue shortId or event ID", "- snapshot: ", "- snapshotImage: :", "", "", "get_sentry_resource(url='https://sentry.io/issues/PROJECT-123/')", "get_sentry_resource(resourceType='issue', organizationSlug='my-org', resourceId='PROJECT-123')", + "get_sentry_resource(resourceType='breadcrumbs', organizationSlug='my-org', resourceId='PROJECT-123')", "get_sentry_resource(resourceType='span', organizationSlug='my-org', resourceId=':')", "get_sentry_resource(resourceType='ai_conversation', organizationSlug='my-org', resourceId='conversation-123')", "get_sentry_resource(url='https://sentry.sentry.io/preprod/snapshots/123/')", @@ -582,7 +584,7 @@ export default defineTool({ .trim() .optional() .describe( - "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, `:` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL.", + "Resource identifier: issue shortId (e.g., 'PROJECT-123'), event ID, trace ID, AI conversation ID, replay ID, snapshot artifact ID, issue shortId or event ID for breadcrumbs, `:` for snapshot image resources, or `traceId:spanId` for span resources. Required when not using a URL.", ), organizationSlug: ParamOrganizationSlug.optional(), diff --git a/packages/mcp-core/src/tools/support/search-events/agent.ts b/packages/mcp-core/src/tools/support/search-events/agent.ts index 15acc7d7b..abb3b991f 100644 --- a/packages/mcp-core/src/tools/support/search-events/agent.ts +++ b/packages/mcp-core/src/tools/support/search-events/agent.ts @@ -7,6 +7,7 @@ import { createWhoamiTool } from "../../../internal/agents/tools/whoami"; import { createDatasetAttributesTool } from "./utils"; import { systemPrompt } from "./config"; import { PUBLIC_EVENTS_DATASETS } from "../../../utils/events-datasets"; +import type { ToolCall } from "../../../internal/agents/callEmbeddedAgent"; const SEARCH_EVENTS_DATASETS = [...PUBLIC_EVENTS_DATASETS, "replays"] as const; @@ -91,7 +92,7 @@ export async function searchEventsAgent( options: SearchEventsAgentOptions, ): Promise<{ result: z.output; - toolCalls: any[]; + toolCalls: ToolCall[]; }> { // Provider check happens in callEmbeddedAgent via getAgentProvider() // Create tools pre-bound with the provided API service and organization diff --git a/packages/mcp-core/src/tools/support/search-events/config.ts b/packages/mcp-core/src/tools/support/search-events/config.ts index 3575c3459..408bee95d 100644 --- a/packages/mcp-core/src/tools/support/search-events/config.ts +++ b/packages/mcp-core/src/tools/support/search-events/config.ts @@ -228,7 +228,9 @@ CORRECT QUERY PATTERNS (FOLLOW THESE): - For field existence: Use has:field_name (NOT field_name IS NOT NULL) - For field absence: Use !has:field_name (NOT field_name IS NULL) - For time periods: Use timeRange parameter (NOT SQL date functions) +- For numeric thresholds: Use comparison operators like field:>value, field:=value, or field:<=value (NOT wildcard/string prefixes) - Example: "items processed yesterday" → query: "has:item.processed", timeRange: {"statsPeriod": "24h"} +- Example: "temperature above 0.7" → query: "gen_ai.request.temperature:>0.7" PROCESS: 1. Analyze the user's query @@ -241,6 +243,7 @@ COMMON ERRORS TO AVOID: - Using SQL syntax (IS NOT NULL, IS NULL, yesterday(), today(), etc.) - Use has: operator and timeRange instead - Using numeric functions (sum, avg, min, max, percentiles) on non-numeric fields - Using incorrect field names (use the otelSemantics tool to look up correct names) +- Approximating numeric thresholds with wildcard strings (use field:>value or field:> = { "gen_ai.usage.input_tokens", "gen_ai.usage.output_tokens", "gen_ai.request.max_tokens", + "gen_ai.request.temperature", // Web Vitals measurements "measurements.lcp", "measurements.cls", @@ -329,6 +333,7 @@ export const DATASET_FIELDS = { "gen_ai.provider.name": "AI provider name (e.g., anthropic, openai)", "gen_ai.request.model": "Model name (e.g., claude-3-5-sonnet-20241022)", "gen_ai.operation.name": "Operation type (e.g., chat, completion)", + "gen_ai.request.temperature": "LLM sampling temperature (numeric)", "gen_ai.usage.input_tokens": "Number of input tokens (numeric)", "gen_ai.usage.output_tokens": "Number of output tokens (numeric)", "gen_ai.tool.name": "Tool name (e.g., search_issues, search_events)", @@ -584,6 +589,21 @@ export const DATASET_EXAMPLES: Record< sort: "-sum(gen_ai.usage.input_tokens)", }, }, + { + description: "LLM calls where temperature is above 0.7", + output: { + query: "gen_ai.request.temperature:>0.7", + fields: [ + "gen_ai.request.model", + "gen_ai.request.temperature", + "gen_ai.operation.name", + "span.duration", + "timestamp", + "trace", + ], + sort: "-span.duration", + }, + }, { description: "top MCP tool calls by usage", output: { diff --git a/packages/mcp-core/src/tools/support/search-issue-events/agent.ts b/packages/mcp-core/src/tools/support/search-issue-events/agent.ts index 37991efcf..5a1885689 100644 --- a/packages/mcp-core/src/tools/support/search-issue-events/agent.ts +++ b/packages/mcp-core/src/tools/support/search-issue-events/agent.ts @@ -1,5 +1,6 @@ import { z } from "zod"; import { callEmbeddedAgent } from "../../../internal/agents/callEmbeddedAgent"; +import type { ToolCall } from "../../../internal/agents/callEmbeddedAgent"; import type { SentryApiService } from "../../../api-client"; import { createWhoamiTool } from "../../../internal/agents/tools/whoami"; import { createIssueEventFieldsTool } from "./utils"; @@ -76,7 +77,7 @@ export async function searchIssueEventsAgent( options: SearchIssueEventsAgentOptions, ): Promise<{ result: z.output; - toolCalls: any[]; + toolCalls: ToolCall[]; }> { // Provider check happens in callEmbeddedAgent via getAgentProvider() // Create tools pre-bound with the provided API service and organization diff --git a/packages/mcp-core/src/tools/support/search-issues/agent.ts b/packages/mcp-core/src/tools/support/search-issues/agent.ts index 75f5967c3..34448154d 100644 --- a/packages/mcp-core/src/tools/support/search-issues/agent.ts +++ b/packages/mcp-core/src/tools/support/search-issues/agent.ts @@ -1,6 +1,7 @@ import { z } from "zod"; import type { SentryApiService } from "../../../api-client"; import { callEmbeddedAgent } from "../../../internal/agents/callEmbeddedAgent"; +import type { ToolCall } from "../../../internal/agents/callEmbeddedAgent"; import { createDatasetFieldsTool } from "../../../internal/agents/tools/dataset-fields"; import { createWhoamiTool } from "../../../internal/agents/tools/whoami"; import { systemPrompt } from "./config"; @@ -35,7 +36,7 @@ export async function searchIssuesAgent( options: SearchIssuesAgentOptions, ): Promise<{ result: z.output; - toolCalls: any[]; + toolCalls: ToolCall[]; }> { // Provider check happens in callEmbeddedAgent via getAgentProvider() // Create tools pre-bound with the provided API service and organization diff --git a/packages/mcp-server-evals/README.md b/packages/mcp-server-evals/README.md index 526af9ee0..7804afdfb 100644 --- a/packages/mcp-server-evals/README.md +++ b/packages/mcp-server-evals/README.md @@ -2,6 +2,75 @@ Evaluation helpers and a local mock stdio runner used when developing and validating the Sentry MCP server. +## Running evals + +The suite uses the harness-first `vitest-evals` API through repo-local helpers +in `src/evals/utils`. Keep eval files focused on fixture cases; the helpers +own harness selection, judges, thresholds, timeouts, usage capture, and traces. + +```bash +# Requires OPENAI_API_KEY in .env or .env.local +pnpm eval + +# Run a single eval file/suite pattern +pnpm --filter @sentry/mcp-server-evals eval search-issues + +# Print expanded tool/output detail in the terminal report +pnpm --filter @sentry/mcp-server-evals eval:info +``` + +Eval runs write `packages/mcp-server-evals/eval-results.json`, which is the +artifact used by both the local report UI and GitHub Actions. + +## Writing evals + +Use the smallest helper that exercises the behavior you need: + +- `describeToolPredictionEval` for fast prediction suites that ask a model to + predict which MCP tools should be called. The harness output is + `{ predictedTools, rationale }`; a deterministic judge compares it with + `expectedTools`. +- `describeMcpToolCallEval` for full MCP harness runs through the mock stdio + server. Use this when actual tool interception, usage data, and traces matter. +- `describeSearchAgentEval` for embedded search agents that return structured + query output plus captured tool calls. + +```typescript +import { describeToolPredictionEval, FIXTURES } from "./utils"; + +describeToolPredictionEval("list-projects", [ + { + input: `What projects do I have access to in ${FIXTURES.organizationSlug}?`, + expectedTools: [ + { + name: "find_projects", + arguments: { organizationSlug: FIXTURES.organizationSlug }, + }, + ], + }, +]); +``` + +## Local report UI + +After running evals, open the report UI with either root shortcut: + +```bash +pnpm eval:report +pnpm eval:ui +``` + +Both commands serve `packages/mcp-server-evals/eval-results.json` with +`vitest-evals serve`. + +## CI reporting + +`.github/workflows/eval.yml` emits Vitest JSON and JUnit XML, then uses +`getsentry/vitest-evals@v0` to publish the GitHub Actions summary, +annotations, and the `Evaluation Results` check run. The JSON artifact is the +source of truth because it preserves eval scores and metadata; JUnit is kept +for tools that expect XML. + ## Mock stdio runner - Command: `pnpm --filter @sentry/mcp-server-evals start` diff --git a/packages/mcp-server-evals/package.json b/packages/mcp-server-evals/package.json index dbc5cf6db..fbdc7641e 100644 --- a/packages/mcp-server-evals/package.json +++ b/packages/mcp-server-evals/package.json @@ -11,8 +11,15 @@ "build": "tsc -b", "dev": "tsc -w", "start": "tsx src/bin/start-mock-stdio.ts", - "eval": "vitest --config=vitest.config.ts", - "eval:ci": "vitest run --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml" + "test": "vitest run --config=vitest.unit.config.ts", + "test:ci": "vitest run --config=vitest.unit.config.ts --reporter=default --reporter=junit --outputFile=tests.junit.xml", + "test:watch": "vitest --config=vitest.unit.config.ts", + "eval": "vitest run --config=vitest.config.ts --reporter=vitest-evals/reporter --reporter=json --outputFile.json=eval-results.json", + "eval:ci": "vitest run --config=vitest.config.ts --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml", + "eval:info": "VITEST_EVALS_REPORT_LEVEL=info vitest run --config=vitest.config.ts --reporter=vitest-evals/reporter --reporter=json --outputFile.json=eval-results.json", + "eval:report": "vitest-evals serve eval-results.json", + "eval:ui": "vitest-evals serve eval-results.json", + "eval:watch": "vitest --config=vitest.config.ts" }, "dependencies": { "@ai-sdk/mcp": "catalog:", @@ -22,6 +29,7 @@ "@sentry/mcp-server": "workspace:*", "@sentry/mcp-server-mocks": "workspace:*", "@sentry/mcp-server-tsconfig": "workspace:*", + "@vitest-evals/harness-ai-sdk": "catalog:", "ai": "catalog:", "dotenv": "catalog:", "msw": "catalog:", diff --git a/packages/mcp-server-evals/src/evals/autofix.eval.ts b/packages/mcp-server-evals/src/evals/autofix.eval.ts index d6a4590c8..1400e689f 100644 --- a/packages/mcp-server-evals/src/evals/autofix.eval.ts +++ b/packages/mcp-server-evals/src/evals/autofix.eval.ts @@ -1,35 +1,26 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("begin-issue-fix", { - data: async () => { - return [ +describeToolPredictionEval("begin-issue-fix", [ + { + input: `Whats the status on root causing this issue in Sentry?\n${FIXTURES.testIssueUrl}`, + expectedTools: [ { - input: `Whats the status on root causing this issue in Sentry?\n${FIXTURES.testIssueUrl}`, - expectedTools: [ - { - name: "analyze_issue_with_seer", - arguments: { - issueUrl: FIXTURES.testIssueUrl, - }, - }, - ], + name: "analyze_issue_with_seer", + arguments: { + issueUrl: FIXTURES.testIssueUrl, + }, }, + ], + }, + { + input: `Can you root cause this issue and retrieve the analysis?\n${FIXTURES.testIssueUrl}`, + expectedTools: [ { - input: `Can you root cause this issue and retrieve the analysis?\n${FIXTURES.testIssueUrl}`, - expectedTools: [ - { - name: "analyze_issue_with_seer", - arguments: { - issueUrl: FIXTURES.testIssueUrl, - }, - }, - ], + name: "analyze_issue_with_seer", + arguments: { + issueUrl: FIXTURES.testIssueUrl, + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/create-dsn.eval.ts b/packages/mcp-server-evals/src/evals/create-dsn.eval.ts index 5fa59f61a..ae146e91c 100644 --- a/packages/mcp-server-evals/src/evals/create-dsn.eval.ts +++ b/packages/mcp-server-evals/src/evals/create-dsn.eval.ts @@ -1,26 +1,17 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("create-dsn", { - data: async () => { - return [ +describeToolPredictionEval("create-dsn", [ + { + input: `Create a new DSN named "Production" for '${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}'`, + expectedTools: [ { - input: `Create a new DSN named "Production" for '${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}'`, - expectedTools: [ - { - name: "create_dsn", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - projectSlug: FIXTURES.projectSlug, - name: "Production", - }, - }, - ], + name: "create_dsn", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + projectSlug: FIXTURES.projectSlug, + name: "Production", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/create-project.eval.ts b/packages/mcp-server-evals/src/evals/create-project.eval.ts index f551c7ded..20258277c 100644 --- a/packages/mcp-server-evals/src/evals/create-project.eval.ts +++ b/packages/mcp-server-evals/src/evals/create-project.eval.ts @@ -1,38 +1,29 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("create-project", { - data: async () => { - return [ +describeToolPredictionEval("create-project", [ + { + input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n\n`, + expectedTools: [ { - input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n\n`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_teams", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - { - name: "create_project", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - teamSlug: FIXTURES.teamSlug, - name: FIXTURES.projectSlug, - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - ]; + { + name: "find_teams", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + }, + }, + { + name: "create_project", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + teamSlug: FIXTURES.teamSlug, + name: FIXTURES.projectSlug, + }, + }, + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/create-team.eval.ts b/packages/mcp-server-evals/src/evals/create-team.eval.ts index 2a789f505..a109f898d 100644 --- a/packages/mcp-server-evals/src/evals/create-team.eval.ts +++ b/packages/mcp-server-evals/src/evals/create-team.eval.ts @@ -1,30 +1,21 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("create-team", { - data: async () => { - return [ +describeToolPredictionEval("create-team", [ + { + input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called 'the-goats' response with **only** the team slug and no other text.`, + expectedTools: [ { - input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called 'the-goats' response with **only** the team slug and no other text.`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "create_team", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - name: "the-goats", - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - ]; + { + name: "create_team", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + name: "the-goats", + regionUrl: "https://us.sentry.io", + }, + }, + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/get-issue.eval.ts b/packages/mcp-server-evals/src/evals/get-issue.eval.ts index 03c877c68..4d7efac94 100644 --- a/packages/mcp-server-evals/src/evals/get-issue.eval.ts +++ b/packages/mcp-server-evals/src/evals/get-issue.eval.ts @@ -1,55 +1,46 @@ -import { describeEval, ToolCallScorer } from "vitest-evals"; -import { FIXTURES, McpToolCallTaskRunner } from "./utils"; +import { describeMcpToolCallEval, FIXTURES } from "./utils"; -describeEval("get-issue", { - data: async () => { - return [ +describeMcpToolCallEval("get-issue", [ + { + input: `Explain CLOUDFLARE-MCP-41 from Sentry in ${FIXTURES.organizationSlug}.`, + expectedTools: [ { - input: `Explain CLOUDFLARE-MCP-41 from Sentry in ${FIXTURES.organizationSlug}.`, - expectedTools: [ - { - name: "search_tools", - arguments: { - query: "issue", - }, - }, - { - name: "execute_tool", - arguments: { - name: "get_issue_details", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: "CLOUDFLARE-MCP-41", - }, - }, - }, - ], + name: "search_tools", + arguments: { + query: /issue|get_issue_details/, + }, }, { - input: `Explain the event with ID 7ca573c0f4814912aaa9bdc77d1a7d51 from Sentry in ${FIXTURES.organizationSlug}.`, - expectedTools: [ - { - name: "search_tools", - arguments: { - query: "issue", - }, + name: "execute_tool", + arguments: { + name: "get_issue_details", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: "CLOUDFLARE-MCP-41", }, - { - name: "execute_tool", - arguments: { - name: "get_issue_details", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - eventId: "7ca573c0f4814912aaa9bdc77d1a7d51", - }, - }, + }, + }, + ], + }, + { + input: `Explain the event with ID 7ca573c0f4814912aaa9bdc77d1a7d51 from Sentry in ${FIXTURES.organizationSlug}.`, + expectedTools: [ + { + name: "search_tools", + arguments: { + query: /issue|event|get_issue_details/, + }, + }, + { + name: "execute_tool", + arguments: { + name: "get_issue_details", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + eventId: "7ca573c0f4814912aaa9bdc77d1a7d51", }, - ], + }, }, - ]; + ], }, - task: McpToolCallTaskRunner(), - scorers: [ToolCallScorer({ ordered: true, params: "fuzzy" })], - threshold: 0.6, - timeout: 90000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts b/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts index 42437788e..625deabe8 100644 --- a/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts +++ b/packages/mcp-server-evals/src/evals/get-sentry-resource.eval.ts @@ -1,60 +1,51 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("get-sentry-resource", { - data: async () => { - return [ +describeToolPredictionEval("get-sentry-resource", [ + { + input: `What's happening in this Sentry issue? ${FIXTURES.issueUrl}`, + expectedTools: [ { - input: `What's happening in this Sentry issue? ${FIXTURES.issueUrl}`, - expectedTools: [ - { - name: "get_sentry_resource", - arguments: { - url: FIXTURES.issueUrl, - }, - }, - ], + name: "get_sentry_resource", + arguments: { + url: FIXTURES.issueUrl, + }, }, + ], + }, + { + input: `Show me the breadcrumbs for ${FIXTURES.issueUrl}`, + expectedTools: [ { - input: `Show me the breadcrumbs for ${FIXTURES.issueUrl}`, - expectedTools: [ - { - name: "get_sentry_resource", - arguments: { - url: FIXTURES.issueUrl, - resourceType: "breadcrumbs", - }, - }, - ], + name: "get_sentry_resource", + arguments: { + url: FIXTURES.issueUrl, + resourceType: "breadcrumbs", + }, }, + ], + }, + { + input: `Fetch the breadcrumbs for issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}.`, + expectedTools: [ { - input: `Fetch the breadcrumbs for issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}.`, - expectedTools: [ - { - name: "get_sentry_resource", - arguments: { - resourceType: "breadcrumbs", - organizationSlug: FIXTURES.organizationSlug, - resourceId: FIXTURES.issueId, - }, - }, - ], + name: "get_sentry_resource", + arguments: { + resourceType: "breadcrumbs", + organizationSlug: FIXTURES.organizationSlug, + resourceId: FIXTURES.issueId, + }, }, + ], + }, + { + input: `Show me what happened in this trace: ${FIXTURES.traceUrl}`, + expectedTools: [ { - input: `Show me what happened in this trace: ${FIXTURES.traceUrl}`, - expectedTools: [ - { - name: "get_sentry_resource", - arguments: { - url: FIXTURES.traceUrl, - }, - }, - ], + name: "get_sentry_resource", + arguments: { + url: FIXTURES.traceUrl, + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts b/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts index 86678bed4..61d82e7f8 100644 --- a/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts +++ b/packages/mcp-server-evals/src/evals/get-trace-details.eval.ts @@ -1,55 +1,46 @@ -import { describeEval, ToolCallScorer } from "vitest-evals"; -import { FIXTURES, McpToolCallTaskRunner } from "./utils"; +import { describeMcpToolCallEval, FIXTURES } from "./utils"; -describeEval("get-trace-details", { - data: async () => { - return [ +describeMcpToolCallEval("get-trace-details", [ + { + input: `Show me trace ${FIXTURES.traceId} from Sentry in ${FIXTURES.organizationSlug}.`, + expectedTools: [ { - input: `Show me trace ${FIXTURES.traceId} from Sentry in ${FIXTURES.organizationSlug}.`, - expectedTools: [ - { - name: "search_tools", - arguments: { - query: "trace", - }, - }, - { - name: "execute_tool", - arguments: { - name: "get_trace_details", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - traceId: FIXTURES.traceId, - }, - }, - }, - ], + name: "search_tools", + arguments: { + query: "trace", + }, }, { - input: `Explain trace ${FIXTURES.traceId} in ${FIXTURES.organizationSlug}.`, - expectedTools: [ - { - name: "search_tools", - arguments: { - query: "trace", - }, + name: "execute_tool", + arguments: { + name: "get_trace_details", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + traceId: FIXTURES.traceId, }, - { - name: "execute_tool", - arguments: { - name: "get_trace_details", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - traceId: FIXTURES.traceId, - }, - }, + }, + }, + ], + }, + { + input: `Explain trace ${FIXTURES.traceId} in ${FIXTURES.organizationSlug}.`, + expectedTools: [ + { + name: "search_tools", + arguments: { + query: "trace", + }, + }, + { + name: "execute_tool", + arguments: { + name: "get_trace_details", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + traceId: FIXTURES.traceId, }, - ], + }, }, - ]; + ], }, - task: McpToolCallTaskRunner(), - scorers: [ToolCallScorer({ ordered: true, params: "fuzzy" })], - threshold: 0.6, - timeout: 90000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/list-dsns.eval.ts b/packages/mcp-server-evals/src/evals/list-dsns.eval.ts index ad9341666..84103a5b8 100644 --- a/packages/mcp-server-evals/src/evals/list-dsns.eval.ts +++ b/packages/mcp-server-evals/src/evals/list-dsns.eval.ts @@ -1,25 +1,16 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("list-dsns", { - data: async () => { - return [ +describeToolPredictionEval("list-dsns", [ + { + input: `What is the SENTRY_DSN for ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}?`, + expectedTools: [ { - input: `What is the SENTRY_DSN for ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}?`, - expectedTools: [ - { - name: "find_dsns", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - projectSlug: FIXTURES.projectSlug, - }, - }, - ], + name: "find_dsns", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + projectSlug: FIXTURES.projectSlug, + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/list-issues.eval.ts b/packages/mcp-server-evals/src/evals/list-issues.eval.ts index 64295d64c..377ea66cc 100644 --- a/packages/mcp-server-evals/src/evals/list-issues.eval.ts +++ b/packages/mcp-server-evals/src/evals/list-issues.eval.ts @@ -1,94 +1,85 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("list-issues", { - data: async () => { - return [ +describeToolPredictionEval("list-issues", [ + { + input: `What are the most common production errors in ${FIXTURES.organizationSlug}?`, + expectedTools: [ { - input: `What are the most common production errors in ${FIXTURES.organizationSlug}?`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "is:unresolved", - sort: "freq", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, { - input: `Show me the top issues in ${FIXTURES.organizationSlug} organization`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - sort: "freq", - }, - }, - ], + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "is:unresolved", + sort: "freq", + }, }, + ], + }, + { + input: `Show me the top issues in ${FIXTURES.organizationSlug} organization`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + sort: "freq", + }, + }, + ], + }, + { + input: `What are the most recent issues in ${FIXTURES.organizationSlug}?`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + sort: "date", + }, + }, + ], + }, + { + input: `Find the newest production issues in ${FIXTURES.organizationSlug}`, + expectedTools: [ { - input: `What are the most recent issues in ${FIXTURES.organizationSlug}?`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - sort: "date", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, { - input: `Find the newest production issues in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - sort: "new", - }, - }, - ], + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + sort: "new", + }, + }, + ], + }, + { + input: `What issues is david@sentry.io experiencing in ${FIXTURES.organizationSlug}?`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, }, { - input: `What issues is david@sentry.io experiencing in ${FIXTURES.organizationSlug}?`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "user.email:david@sentry.io", - }, - }, - ], + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "user.email:david@sentry.io", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/list-organizations.eval.ts b/packages/mcp-server-evals/src/evals/list-organizations.eval.ts index 826e53402..f5238fd39 100644 --- a/packages/mcp-server-evals/src/evals/list-organizations.eval.ts +++ b/packages/mcp-server-evals/src/evals/list-organizations.eval.ts @@ -1,22 +1,13 @@ -import { describeEval } from "vitest-evals"; -import { NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval } from "./utils"; -describeEval("list-organizations", { - data: async () => { - return [ +describeToolPredictionEval("list-organizations", [ + { + input: `What organizations do I have access to in Sentry`, + expectedTools: [ { - input: `What organizations do I have access to in Sentry`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - ], + name: "find_organizations", + arguments: {}, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/list-projects.eval.ts b/packages/mcp-server-evals/src/evals/list-projects.eval.ts index 50c698034..e98cfccaf 100644 --- a/packages/mcp-server-evals/src/evals/list-projects.eval.ts +++ b/packages/mcp-server-evals/src/evals/list-projects.eval.ts @@ -1,29 +1,20 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("list-projects", { - data: async () => { - return [ +describeToolPredictionEval("list-projects", [ + { + input: `What projects do I have access to in Sentry for '${FIXTURES.organizationSlug}'`, + expectedTools: [ { - input: `What projects do I have access to in Sentry for '${FIXTURES.organizationSlug}'`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_projects", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - ]; + { + name: "find_projects", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + }, + }, + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/list-releases.eval.ts b/packages/mcp-server-evals/src/evals/list-releases.eval.ts index bba7d48da..7c1972896 100644 --- a/packages/mcp-server-evals/src/evals/list-releases.eval.ts +++ b/packages/mcp-server-evals/src/evals/list-releases.eval.ts @@ -1,53 +1,44 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("list-releases", { - data: async () => { - return [ +describeToolPredictionEval("list-releases", [ + { + input: `Show me the releases in ${FIXTURES.organizationSlug}`, + expectedTools: [ { - input: `Show me the releases in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_releases", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, { - input: `Show me a list of versions in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_projects", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - { - name: "find_releases", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - projectSlug: FIXTURES.projectSlug, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_releases", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); + { + input: `Show me a list of versions in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "find_projects", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + }, + }, + { + name: "find_releases", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + projectSlug: FIXTURES.projectSlug, + regionUrl: "https://us.sentry.io", + }, + }, + ], + }, +]); diff --git a/packages/mcp-server-evals/src/evals/list-tags.eval.ts b/packages/mcp-server-evals/src/evals/list-tags.eval.ts index 3470c83c8..fee738162 100644 --- a/packages/mcp-server-evals/src/evals/list-tags.eval.ts +++ b/packages/mcp-server-evals/src/evals/list-tags.eval.ts @@ -1,29 +1,22 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("list-tags", { - data: async () => { - return [ +describeToolPredictionEval("get-issue-tag-values", [ + { + input: `What are common values for the url tag on issue CLOUDFLARE-MCP-41 in ${FIXTURES.organizationSlug}?`, + expectedTools: [ { - input: `What are common tags in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_tags", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - ]; + { + name: "get_issue_tag_values", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + issueId: "CLOUDFLARE-MCP-41", + tagKey: "url", + }, + }, + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/list-teams.eval.ts b/packages/mcp-server-evals/src/evals/list-teams.eval.ts index 3e598dbe0..d28f329a1 100644 --- a/packages/mcp-server-evals/src/evals/list-teams.eval.ts +++ b/packages/mcp-server-evals/src/evals/list-teams.eval.ts @@ -1,61 +1,52 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("list-teams", { - data: async () => { - return [ +describeToolPredictionEval("list-teams", [ + { + input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`, + expectedTools: [ { - input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_teams", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, { - input: `Do I have access to the team '${FIXTURES.teamSlug}' for '${FIXTURES.organizationSlug}'`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_teams", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_teams", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + }, + }, + ], + }, + { + input: `Do I have access to the team '${FIXTURES.teamSlug}' for '${FIXTURES.organizationSlug}'`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "find_teams", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + }, + }, + ], + }, + { + input: `Do I have access to the team 'an-imaginary-team' for '${FIXTURES.organizationSlug}'`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, }, { - input: `Do I have access to the team 'an-imaginary-team' for '${FIXTURES.organizationSlug}'`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "find_teams", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_teams", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + regionUrl: "https://us.sentry.io", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/search-docs.eval.ts b/packages/mcp-server-evals/src/evals/search-docs.eval.ts index 2d9454dca..b7cbdb817 100644 --- a/packages/mcp-server-evals/src/evals/search-docs.eval.ts +++ b/packages/mcp-server-evals/src/evals/search-docs.eval.ts @@ -1,51 +1,42 @@ -import { describeEval } from "vitest-evals"; -import { NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval } from "./utils"; -describeEval("search-docs", { - data: async () => { - return [ +describeToolPredictionEval("search-docs", [ + { + input: + "I need documentation on how to set up error tracking with Sentry in JavaScript", + expectedTools: [ { - input: - "I need documentation on how to set up error tracking with Sentry in JavaScript", - expectedTools: [ - { - name: "search_docs", - arguments: { - query: "set up error tracking JavaScript", - maxResults: 3, - }, - }, - ], + name: "search_docs", + arguments: { + query: "set up error tracking JavaScript", + maxResults: 3, + }, }, + ], + }, + { + input: + "I need help configuring Sentry with React components and error boundaries", + expectedTools: [ { - input: - "I need help configuring Sentry with React components and error boundaries", - expectedTools: [ - { - name: "search_docs", - arguments: { - query: "React components error boundaries", - maxResults: 3, - }, - }, - ], + name: "search_docs", + arguments: { + query: "React components error boundaries", + maxResults: 3, + }, }, + ], + }, + { + input: "What is Sentry's rate limiting and how does it work?", + expectedTools: [ { - input: "What is Sentry's rate limiting and how does it work?", - expectedTools: [ - { - name: "search_docs", - arguments: { - query: "rate limiting", - maxResults: 3, - }, - }, - ], + name: "search_docs", + arguments: { + query: "rate limiting", + maxResults: 3, + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts b/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts index 9ca562017..d7786d1cc 100644 --- a/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts +++ b/packages/mcp-server-evals/src/evals/search-events-agent.eval.ts @@ -1,227 +1,198 @@ -import { describeEval } from "vitest-evals"; -import { ToolCallScorer } from "vitest-evals"; -import { searchEventsAgent } from "@sentry/mcp-core/tools/search-events/agent"; -import { SentryApiService } from "@sentry/mcp-core/api-client"; -import { StructuredOutputScorer } from "./utils/structuredOutputScorer"; +import { describeSearchAgentEval, searchEventsAgentHarness } from "./utils"; import "../setup-env"; // The shared MSW server is already started in setup-env.ts -describeEval("search-events-agent", { - data: async () => { - return [ - { - // Simple query with common fields - should NOT require tool calls - input: "Show me all errors from today", - expectedTools: [], - expected: { - dataset: "errors", - query: "", // No filters, just time range - sort: "-timestamp", - timeRange: { statsPeriod: "24h" }, - }, - }, - { - // Query with "me" reference - should only require whoami - input: "Show me my errors from last week", - expectedTools: [ - { - name: "whoami", - arguments: {}, - }, - ], - expected: { - dataset: "errors", - query: /user\.email:test@example\.com|user\.id:123456/, // Can be either - sort: "-timestamp", - timeRange: { statsPeriod: "7d" }, - }, - }, +// biome-ignore format: keep the long eval case list diff stable. +describeSearchAgentEval("search-events-agent", searchEventsAgentHarness, [ + { + // Simple query with common fields - should NOT require tool calls + input: "Show me all errors from today", + expectedTools: [], + expected: { + dataset: "errors", + query: "", // No filters, just time range + sort: "-timestamp", + timeRange: { statsPeriod: "24h" }, + }, + }, + { + // Query with "me" reference may use direct Sentry syntax or resolve whoami. + input: "Show me my errors from last week", + expectedTools: [], + expected: { + dataset: "errors", + query: + /assignedTo:me|user\.email:"?test@example\.com"?|user\.id:"?123456"?/, // Can be direct shorthand or resolved identity + sort: "-timestamp", + timeRange: { statsPeriod: "7d" }, + }, + }, + { + // Common performance query - should NOT require tool calls + input: "Show me slow API calls taking more than 1 second", + expectedTools: [], + expected: { + dataset: "spans", + query: /span\.duration:>1000|span\.duration:>1s/, // Can express as ms or seconds + sort: "-span.duration", + }, + }, + { + // Query with OpenTelemetry attributes that need discovery + input: "Show me LLM calls where temperature setting is above 0.7", + expectedTools: [ { - // Common performance query - should NOT require tool calls - input: "Show me slow API calls taking more than 1 second", - expectedTools: [], - expected: { + name: "datasetAttributes", + arguments: { dataset: "spans", - query: /span\.duration:>1000|span\.duration:>1s/, // Can express as ms or seconds - sort: "-span.duration", }, }, { - // Query with OpenTelemetry attributes that need discovery - input: "Show me LLM calls where temperature setting is above 0.7", - expectedTools: [ - { - name: "datasetAttributes", - arguments: { - dataset: "spans", - }, - }, - { - name: "otelSemantics", - arguments: { - namespace: "gen_ai", - dataset: "spans", - }, - }, - ], - expected: { + name: "otelSemantics", + arguments: { + namespace: "gen_ai", dataset: "spans", - query: "gen_ai.request.temperature:>0.7", - sort: "-span.duration", }, }, + ], + expected: { + dataset: "spans", + query: /gen_ai\.request\.temperature:>0\.7/, + sort: "-span.duration", + }, + }, + { + // Query with custom field requiring discovery + input: "Find errors with custom.payment.processor field", + expectedTools: [ { - // Query with custom field requiring discovery - input: "Find errors with custom.payment.processor field", - expectedTools: [ - { - name: "datasetAttributes", - arguments: { - dataset: "errors", - }, - }, - ], - expected: { + name: "datasetAttributes", + arguments: { dataset: "errors", - query: "has:custom.payment.processor", - sort: "-timestamp", }, }, + ], + expected: { + dataset: "errors", + query: + /has:custom\.payment\.processor|has:tags\[custom\.payment\.processor\]/, + sort: "-timestamp", + }, + }, + { + // Query with custom field requiring discovery + input: "Show me spans where custom.db.pool_size is greater than 10", + expectedTools: [ { - // Query with custom field requiring discovery - input: "Show me spans where custom.db.pool_size is greater than 10", - expectedTools: [ - { - name: "datasetAttributes", - arguments: { - dataset: "spans", - }, - }, - ], - expected: { + name: "datasetAttributes", + arguments: { dataset: "spans", - query: "custom.db.pool_size:>10", - sort: "-span.duration", }, }, + ], + expected: { + dataset: "spans", + query: /custom\.db\.pool_size:>10|has:custom\.db\.pool_size/, + sort: /-span\.duration|-custom\.db\.pool_size|-timestamp/, + }, + }, + { + // User-supplied Sentry syntax should remain authoritative. The agent + // can validate fields, but it should not rewrite or drop explicit + // filters/fields while translating the request. + input: + 'In spans, search for transaction:"VPN connections" tags[type]:Unified tags[country]:CN over the last 7 days. Return tags[type], tags[sequence], and count(), sorted by count descending.', + expectedTools: [ { - // User-supplied Sentry syntax should remain authoritative. The agent - // can validate fields, but it should not rewrite or drop explicit - // filters/fields while translating the request. - input: - 'In spans, search for transaction:"VPN connections" tags[type]:Unified tags[country]:CN over the last 7 days. Return tags[type], tags[sequence], and count(), sorted by count descending.', - expectedTools: [ - { - name: "datasetAttributes", - }, - ], - expected: { - dataset: "spans", - query: (value: unknown) => - typeof value === "string" && - [ - 'transaction:"VPN connections"', - "tags[type]:Unified", - "tags[country]:CN", - ].every((token) => value.includes(token)), - fields: (value: unknown) => - Array.isArray(value) && - ["tags[type]", "tags[sequence]", "count()"].every((field) => - value.includes(field), - ), - sort: "-count()", - timeRange: { statsPeriod: "7d" }, - }, + name: "datasetAttributes", }, + ], + expected: { + dataset: "spans", + query: (value: unknown) => + typeof value === "string" && + [ + 'transaction:"VPN connections"', + "tags[type]:Unified", + "tags[country]:CN", + ].every((token) => value.includes(token)), + fields: (value: unknown) => + Array.isArray(value) && + ["tags[type]", "tags[sequence]", "count()"].every((field) => + value.includes(field), + ), + sort: "-count()", + timeRange: { statsPeriod: "7d" }, + }, + }, + { + // Query requiring equation field calculation + input: "How many total tokens did we consume yesterday", + expectedTools: [ { - // Query requiring equation field calculation - input: "How many total tokens did we consume yesterday", - expectedTools: [ - { - name: "datasetAttributes", - arguments: { - dataset: "spans", - }, - }, - // Agent may find gen_ai fields and use them for calculation - ], - expected: { + name: "datasetAttributes", + arguments: { dataset: "spans", - // For aggregations, query filter is optional - empty query gets all spans - query: /^$|has:gen_ai\.usage\.(input_tokens|output_tokens)/, - // Equation to sum both token types - fields: [ - "equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)", - ], - // Sort by the equation result in descending order - sort: "-equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)", - timeRange: { statsPeriod: "24h" }, - }, - }, - { - // Query that tests sort field self-correction - // Agent should self-correct by adding count() to fields when sorting by it - input: "Show me the top 10 most frequent error types", - expectedTools: [], - expected: { - dataset: "errors", - query: "", // No specific filter, just aggregate all errors - // Agent should include count() in fields since we're sorting by it - fields: ["error.type", "count()"], - // Sort by count in descending order to get "most frequent" - sort: "-count()", - // timeRange can be null or have a default period }, }, + // Agent may find gen_ai fields and use them for calculation + ], + expected: { + dataset: "spans", + // For aggregations, query filter is optional - empty query gets all spans + query: /^$|has:gen_ai\.usage\.(input_tokens|output_tokens)/, + // Equation to sum both token types + fields: [ + "equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)", + ], + // Sort by the equation result in descending order + sort: "-equation|sum(gen_ai.usage.input_tokens) + sum(gen_ai.usage.output_tokens)", + timeRange: { statsPeriod: "24h" }, + }, + }, + { + // Query that tests sort field self-correction + // Agent should self-correct by adding count() to fields when sorting by it + input: "Show me the top 10 most frequent error types", + expectedTools: [], + expected: { + dataset: "errors", + // Empty query is ideal, but filtering to rows with error.type is also a + // valid way to protect the grouping field. + query: /^$|has:error\.type/, + // Agent should include count() in fields since we're sorting by it + fields: ["error.type", "count()"], + // Sort by count in descending order to get "most frequent" + sort: "-count()", + // timeRange can be null or have a default period + }, + }, + { + // Complex aggregate query that tests sort field self-correction + // Agent should self-correct by including avg(span.duration) in fields + input: + "Show me database operations grouped by type, sorted by average duration", + expectedTools: [ { - // Complex aggregate query that tests sort field self-correction - // Agent should self-correct by including avg(span.duration) in fields - input: - "Show me database operations grouped by type, sorted by average duration", - expectedTools: [ - { - name: "datasetAttributes", - arguments: { - dataset: "spans", - }, - }, - ], - expected: { + name: "datasetAttributes", + arguments: { dataset: "spans", - query: "has:db.operation", - // Agent must include avg(span.duration) since we're sorting by it - // Use db.operation as the grouping field (span.op is deprecated) - fields: ["db.operation", "avg(span.duration)"], - // Sort by average duration - sort: "-avg(span.duration)", - // timeRange is optional }, }, - ]; - }, - task: async (input) => { - // Create a real API service that will use MSW mocks - const apiService = new SentryApiService({ - accessToken: "test-token", - }); - - const agentResult = await searchEventsAgent({ - query: input, - organizationSlug: "sentry-mcp-evals", - apiService, - }); - - return { - result: JSON.stringify(agentResult.result), - toolCalls: agentResult.toolCalls.map((call: any) => ({ - name: call.toolName, - arguments: call.args, - })), - }; + ], + expected: { + dataset: "spans", + query: /has:db\.operation|has:db\.system/, + // Agent must include avg(span.duration) since we're sorting by it + // Use db.operation as the grouping field (span.op is deprecated) + fields: (value: unknown) => + Array.isArray(value) && + ["avg(span.duration)"].every((field) => value.includes(field)) && + (value.includes("db.operation") || value.includes("db.system")), + // Sort by average duration + sort: "-avg(span.duration)", + // timeRange is optional + }, }, - scorers: [ - ToolCallScorer(), // Validates tool calls - StructuredOutputScorer({ match: "fuzzy" }), // Validates the structured query output with flexible matching - ], -}); +], { timeout: 180000 }); diff --git a/packages/mcp-server-evals/src/evals/search-events.eval.ts b/packages/mcp-server-evals/src/evals/search-events.eval.ts index 79f06d2dd..b00152389 100644 --- a/packages/mcp-server-evals/src/evals/search-events.eval.ts +++ b/packages/mcp-server-evals/src/evals/search-events.eval.ts @@ -1,110 +1,101 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; // Note: This eval requires OPENAI_API_KEY to be set in the environment // The search_events tool uses the AI SDK to translate natural language queries -describeEval("search-events", { - data: async () => { - return [ - // Core test: Basic error event search +describeToolPredictionEval("search-events", [ + // Core test: Basic error event search + { + input: `Find database timeouts in ${FIXTURES.organizationSlug} from the last week`, + expectedTools: [ { - input: `Find database timeouts in ${FIXTURES.organizationSlug} from the last week`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "database timeouts from the last week", - dataset: "errors", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Core test: Performance spans search { - input: `Find slow API calls taking over 5 seconds in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "slow API calls taking over 5 seconds", - dataset: "spans", - }, - }, - ], + name: "search_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "database timeouts from the last week", + dataset: "errors", + }, }, - // Core test: Logs search + ], + }, + // Core test: Performance spans search + { + input: `Find slow API calls taking over 5 seconds in ${FIXTURES.organizationSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "slow API calls taking over 5 seconds", + dataset: "spans", + }, + }, + ], + }, + // Core test: Logs search + { + input: `Show me error logs from the last hour in ${FIXTURES.organizationSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "error logs from the last hour", + dataset: "logs", + }, + }, + ], + }, + // Core test: Project-specific search + { + input: `Show me authentication errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + projectSlug: FIXTURES.projectSlug, + query: "authentication errors", + dataset: "errors", + }, + }, + ], + }, + // Core test: Search with 'me' reference + { + input: `Show me errors affecting me in ${FIXTURES.organizationSlug}`, + expectedTools: [ { - input: `Show me error logs from the last hour in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "error logs from the last hour", - dataset: "logs", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Core test: Project-specific search { - input: `Show me authentication errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - projectSlug: FIXTURES.projectSlug, - query: "authentication errors", - dataset: "errors", - }, - }, - ], + name: "whoami", + arguments: {}, }, - // Core test: Search with 'me' reference { - input: `Show me errors affecting me in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "whoami", - arguments: {}, - }, - { - name: "search_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "errors affecting user.id:12345", - dataset: "errors", - }, - }, - ], + name: "search_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "errors affecting user.id:12345", + dataset: "errors", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts b/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts index 7e32c449f..4514502ed 100644 --- a/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts +++ b/packages/mcp-server-evals/src/evals/search-issue-events-agent.eval.ts @@ -1,128 +1,98 @@ -import { describeEval } from "vitest-evals"; -import { ToolCallScorer } from "vitest-evals"; -import { searchIssueEventsAgent } from "@sentry/mcp-core/tools/search-issue-events/agent"; -import { SentryApiService } from "@sentry/mcp-core/api-client"; -import { StructuredOutputScorer } from "./utils/structuredOutputScorer"; +import { + describeSearchAgentEval, + searchIssueEventsAgentHarness, +} from "./utils"; import "../setup-env"; // The shared MSW server is already started in setup-env.ts -describeEval("search-issue-events-agent", { - data: async () => { - return [ - { - // Simple time-based query - should NOT require tool calls - input: "Show me events from the last hour", - expectedTools: [], - expected: { - query: "", // No additional filters beyond issue constraint - sort: "-timestamp", - timeRange: { statsPeriod: "1h" }, - }, +describeSearchAgentEval( + "search-issue-events-agent", + searchIssueEventsAgentHarness, + [ + { + // Simple time-based query - should NOT require tool calls + input: "Show me events from the last hour", + expectedTools: [], + expected: { + query: "", // No additional filters beyond issue constraint + sort: "-timestamp", + timeRange: { statsPeriod: "1h" }, }, - { - // Environment and release filtering - should NOT require tool calls - input: "Find production events with release v1.0.5", - expectedTools: [], - expected: { - query: - /environment:production.*release:v1\.0\.5|release:v1\.0\.5.*environment:production/, - sort: "-timestamp", - }, + }, + { + // Environment and release filtering - should NOT require tool calls + input: "Find production events with release v1.0.5", + expectedTools: [], + expected: { + query: + /environment:production.*release:v1\.0\.5|release:v1\.0\.5.*environment:production/, + sort: "-timestamp", }, - { - // User-specific filtering - may require whoami if query uses "me" - input: "Show me events affecting user alice@example.com", - expectedTools: [], - expected: { - query: "user.email:alice@example.com", - sort: "-timestamp", - }, + }, + { + // User-specific filtering - may require whoami if query uses "me" + input: "Show me events affecting user alice@example.com", + expectedTools: [], + expected: { + query: "user.email:alice@example.com", + sort: "-timestamp", }, - { - // Query with "me" reference - should require whoami - input: "Show me events from my user", - expectedTools: [ - { - name: "whoami", - arguments: {}, - }, - ], - expected: { - query: /user\.email:test@example\.com|user:test@example\.com/, // Various valid forms - sort: "-timestamp", + }, + { + // Query with "me" reference - should require whoami + input: "Show me events from my user", + expectedTools: [ + { + name: "whoami", }, + ], + expected: { + query: /user\.email:"?test@example\.com"?|user:"?test@example\.com"?/, // Various valid forms + sort: "-timestamp", }, - { - // Trace ID filtering - should NOT require tool calls - input: "Find events with trace ID abc123def456", - expectedTools: [], - expected: { - query: "trace:abc123def456", - sort: "-timestamp", - }, + }, + { + // Trace ID filtering - should NOT require tool calls + input: "Find events with trace ID abc123def456", + expectedTools: [], + expected: { + query: "trace:abc123def456", + sort: "-timestamp", }, - { - // URL pattern filtering - should NOT require tool calls - input: "Show me events from the /checkout/ page", - expectedTools: [], - expected: { - query: /"url:.*\/checkout\/.*"|url:".*checkout.*"/, // URL pattern with wildcard - sort: "-timestamp", - }, + }, + { + // URL pattern filtering - should NOT require tool calls + input: "Show me events from the /checkout/ page", + expectedTools: [], + expected: { + query: /"url:.*\/checkout\/.*"|url:".*checkout.*"/, // URL pattern with wildcard + sort: "-timestamp", }, - { - // Combined filters with time range - input: "Production events from yesterday with specific release", - expectedTools: [], - expected: { - query: - /environment:production.*release:|release:.*environment:production/, - sort: "-timestamp", - timeRange: { statsPeriod: "24h" }, - }, + }, + { + // Combined filters with time range + input: "Production events from yesterday with specific release", + expectedTools: [], + expected: { + query: + /^$|^environment:production$|environment:production.*(?:release:|has:release)|(?:release:|has:release).*environment:production/, + sort: "-timestamp", + timeRange: { statsPeriod: "24h" }, }, - { - // Query that might need field discovery for uncommon tags - input: "Events where device family is mobile", - expectedTools: [ - { - name: "issueEventFields", - arguments: {}, - }, - ], - expected: { - query: /device\.family:mobile|device:mobile/, - sort: "-timestamp", + }, + { + // Query that might need field discovery for uncommon tags + input: "Events where device family is mobile", + expectedTools: [ + { + name: "issueEventFields", }, + ], + expected: { + query: /device\.family:mobile|device:mobile/, + sort: "-timestamp", }, - ]; - }, - task: async (input) => { - // Create a real API service that will use MSW mocks - const apiService = new SentryApiService({ - accessToken: "test-token", - }); - - const agentResult = await searchIssueEventsAgent({ - query: input, - organizationSlug: "sentry-mcp-evals", - apiService, - }); - - // Return in the format expected by ToolCallScorer - return { - result: JSON.stringify(agentResult.result), - toolCalls: agentResult.toolCalls.map((call: any) => ({ - name: call.toolName, - arguments: call.args, - })), - }; - }, - scorers: [ - ToolCallScorer(), // Validates tool calls - StructuredOutputScorer({ match: "fuzzy" }), // Validates the structured query output with flexible matching + }, ], - threshold: 0.6, - timeout: 30000, -}); +); diff --git a/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts b/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts index 61f693939..9e278da31 100644 --- a/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts +++ b/packages/mcp-server-evals/src/evals/search-issue-events.eval.ts @@ -1,87 +1,78 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; // Note: This eval requires OPENAI_API_KEY to be set in the environment // The search_issue_events tool uses the AI SDK to translate natural language queries -describeEval("search-issue-events", { - data: async () => { - return [ - // Core test: Basic time-based filtering within an issue +describeToolPredictionEval("search-issue-events", [ + // Core test: Basic time-based filtering within an issue + { + input: `Show me events from the last hour in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, + expectedTools: [ { - input: `Show me events from the last hour in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issue_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - query: "from the last hour", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Core test: Environment and release filtering { - input: `Find production events with release v1.0 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issue_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - query: "production events with release v1.0", - }, - }, - ], + name: "search_issue_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + query: "from the last hour", + }, }, - // Core test: User-specific filtering + ], + }, + // Core test: Environment and release filtering + { + input: `Find production events with release v1.0 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_issue_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + query: "production events with release v1.0", + }, + }, + ], + }, + // Core test: User-specific filtering + { + input: `Show me events affecting user alice@example.com in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_issue_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + query: "affecting user alice@example.com", + }, + }, + ], + }, + // Core test: Trace ID filtering + { + input: `Find events with trace ID abc123 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, + expectedTools: [ { - input: `Show me events affecting user alice@example.com in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issue_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - query: "affecting user alice@example.com", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Core test: Trace ID filtering { - input: `Find events with trace ID abc123 in issue ${FIXTURES.issueId} in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issue_events", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - query: "with trace ID abc123", - }, - }, - ], + name: "search_issue_events", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + query: "with trace ID abc123", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts b/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts index 56622f257..a1bc00cc5 100644 --- a/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts +++ b/packages/mcp-server-evals/src/evals/search-issues-agent.eval.ts @@ -1,155 +1,111 @@ -import { SentryApiService } from "@sentry/mcp-core/api-client"; -import { searchIssuesAgent } from "@sentry/mcp-core/tools/search-issues/agent"; -import { describeEval } from "vitest-evals"; -import { ToolCallScorer } from "vitest-evals"; -import { StructuredOutputScorer } from "./utils/structuredOutputScorer"; +import { describeSearchAgentEval, searchIssuesAgentHarness } from "./utils"; import "../setup-env"; // The shared MSW server is already started in setup-env.ts -describeEval("search-issues-agent", { - data: async () => { - return [ - { - // Simple query with common fields - should NOT require tool calls - input: "Show me unresolved issues", - expectedTools: [], - expected: { - query: "is:unresolved", - sort: "date", // Agent uses "date" as default - }, - }, - { - // Natural-language "me" reference should resolve through whoami. - input: "Show me issues assigned to me", - expectedTools: [ - { - name: "whoami", - arguments: {}, - }, - ], - expected: { - query: - /assigned_or_suggested:test@example\.com|assigned:test@example\.com|assigned:me/, // Various valid forms - sort: "date", - }, - }, - { - // Explicit "me" is valid Sentry syntax and should not be resolved. - input: "assigned:me is:unresolved", - expectedTools: [], - expected: { - query: /(?=.*assigned:me)(?=.*is:unresolved)/, - sort: "date", - }, - }, - { - // Complex query but with common fields - should NOT require tool calls - input: "Show me critical unhandled errors from the last 24 hours", - expectedTools: [], - expected: { - query: - /(?=.*is:unresolved)(?=.*error\.handled:false)(?=.*lastSeen:-24h)/, - sort: /date|user/, - }, - }, - { - // Tag-presence query can be expressed directly with has: - input: "Show me issues with custom.payment.failed tag", - expectedTools: [], - expected: { - query: - /has:custom\.payment\.failed|custom\.payment\.failed|tags\[custom\.payment\.failed\]/, // All are valid tag forms - sort: "date", // Agent should always return a sort value - }, - }, - { - // Another query requiring field discovery - input: "Find issues where the kafka.consumer.group is orders-processor", - expectedTools: [ - { - name: "issueFields", - arguments: {}, // No arguments needed anymore - }, - ], - expected: { - query: - /kafka\.consumer\.group:orders-processor|tags\[kafka\.consumer\.group\]:orders-processor/, - sort: "date", // Agent should always return a sort value - }, - }, - { - // Easy to fix issues - should use seer_actionability filter - input: "Show me easy to fix bugs", - expectedTools: [], - expected: { - query: /issue\.seer_actionability/, - sort: "date", - }, - }, - { - // Quick wins query - should combine actionability with unresolved - input: "Show me quick wins in production", - expectedTools: [], - expected: { - query: - /issue\.seer_actionability.*environment:production|environment:production.*issue\.seer_actionability/, - sort: /date|user/, - }, - }, - { - // Explicit issue-search syntax should be preserved, not broadened. - input: "is:for_review release:latest assigned:me issue.priority:high", - expectedTools: [], - expected: { - query: - /(?=.*is:for_review)(?=.*release:latest)(?=.*assigned:me)(?=.*issue\.priority:high)/, - sort: "date", - }, - }, - { - // Mixed natural language may set sort, but explicit filters stay intact. - input: "sort by users is:for_review release:latest", - expectedTools: [], - expected: { - query: /^(?!.*sort:)(?=.*is:for_review)(?=.*release:latest)/, - sort: "user", - }, - }, +describeSearchAgentEval("search-issues-agent", searchIssuesAgentHarness, [ + { + // Simple query with common fields - should NOT require tool calls + input: "Show me unresolved issues", + expectedTools: [], + expected: { + query: "is:unresolved", + sort: "date", // Agent uses "date" as default + }, + }, + { + // Natural-language "me" reference should resolve through whoami. + input: "Show me issues assigned to me", + expectedTools: [ { - // Valid inbox/substatus filters should not be generalized. - input: "is:new is:regressed", - expectedTools: [], - expected: { - query: /^(?!.*is:unresolved)(?=.*is:new)(?=.*is:regressed)/, - sort: "date", - }, + name: "whoami", }, - ]; + ], + expected: { + query: + /assigned_or_suggested:test@example\.com|assigned:test@example\.com|assigned:me/, // Various valid forms + }, }, - task: async (input) => { - // Create a real API service that will use MSW mocks - const apiService = new SentryApiService({ - accessToken: "test-token", - }); - - const agentResult = await searchIssuesAgent({ - query: input, - organizationSlug: "sentry-mcp-evals", - apiService, - }); - - // Return in the format expected by ToolCallScorer - return { - result: JSON.stringify(agentResult.result), - toolCalls: agentResult.toolCalls.map((call: any) => ({ - name: call.toolName, - arguments: call.args, - })), - }; + { + // Explicit "me" is valid Sentry syntax and should not be resolved. + input: "assigned:me is:unresolved", + expectedTools: [], + expected: { + query: /(?=.*assigned:me)(?=.*is:unresolved)/, + }, + }, + { + // Complex query but with common fields - should NOT require tool calls + input: "Show me critical unhandled errors from the last 24 hours", + expectedTools: [], + expected: { + query: + /(?=.*is:unresolved)(?=.*(?:error\.handled:false|error\.unhandled:true))(?=.*lastSeen:(?:-24h|>=?-24h))/, + sort: /date|user/, + }, + }, + { + // Tag-presence query can be expressed directly with has: + input: "Show me issues with custom.payment.failed tag", + expectedTools: [], + expected: { + query: + /has:custom\.payment\.failed|custom\.payment\.failed|tags\[custom\.payment\.failed\]/, // All are valid tag forms + sort: (value: unknown) => value === null || value === "date", + }, + }, + { + // Custom tag queries may either use field discovery or direct tag syntax. + input: "Find issues where the kafka.consumer.group is orders-processor", + expectedTools: [], + expected: { + query: + /kafka\.consumer\.group:orders-processor|tags\[kafka\.consumer\.group\]:orders-processor/, + }, + }, + { + // Easy to fix issues - should use seer_actionability filter + input: "Show me easy to fix bugs", + expectedTools: [], + expected: { + query: /issue\.seer_actionability/, + sort: "date", + }, + }, + { + // Quick wins query - should combine actionability with unresolved + input: "Show me quick wins in production", + expectedTools: [], + expected: { + query: + /issue\.seer_actionability.*environment:production|environment:production.*issue\.seer_actionability/, + sort: /date|user/, + }, + }, + { + // Explicit issue-search syntax should be preserved, not broadened. + input: "is:for_review release:latest assigned:me issue.priority:high", + expectedTools: [], + expected: { + query: + /(?=.*is:for_review)(?=.*release:latest)(?=.*assigned:me)(?=.*issue\.priority:high)/, + }, + }, + { + // Mixed natural language may set sort, but explicit filters stay intact. + input: "sort by users is:for_review release:latest", + expectedTools: [], + expected: { + query: /^(?!.*sort:)(?=.*is:for_review)(?=.*release:latest)/, + sort: "user", + }, + }, + { + // Valid inbox/substatus filters should not be generalized. + input: "is:new is:regressed", + expectedTools: [], + expected: { + query: /^(?!.*is:unresolved)(?=.*is:new)(?=.*is:regressed)/, + }, }, - scorers: [ - ToolCallScorer(), // Validates tool calls - StructuredOutputScorer({ match: "fuzzy" }), // Validates the structured query output with flexible matching - ], -}); +]); diff --git a/packages/mcp-server-evals/src/evals/search-issues.eval.ts b/packages/mcp-server-evals/src/evals/search-issues.eval.ts index c504c165a..c24ed4e6c 100644 --- a/packages/mcp-server-evals/src/evals/search-issues.eval.ts +++ b/packages/mcp-server-evals/src/evals/search-issues.eval.ts @@ -1,88 +1,79 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; // Note: This eval requires OPENAI_API_KEY to be set in the environment // The search_issues tool uses the AI SDK to translate natural language queries -describeEval("search-issues", { - data: async () => { - return [ - // Core test: Basic issue search +describeToolPredictionEval("search-issues", [ + // Core test: Basic issue search + { + input: `Show me unresolved issues in ${FIXTURES.organizationSlug}`, + expectedTools: [ { - input: `Show me unresolved issues in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "unresolved issues", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Core test: Search with 'me' reference (tests whoami integration) { - input: `Find issues assigned to me in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "whoami", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "issues assigned to me", - }, - }, - ], + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "unresolved issues", + }, }, - // Core test: Project-specific search + ], + }, + // Core test: Search with 'me' reference (tests whoami integration) + { + input: `Find issues assigned to me in ${FIXTURES.organizationSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "whoami", + arguments: {}, + }, + { + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "issues assigned to me", + }, + }, + ], + }, + // Core test: Project-specific search + { + input: `Search for database errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + projectSlugOrId: FIXTURES.projectSlug, + query: "database errors", + }, + }, + ], + }, + // Core test: Complex natural language query + { + input: `Find critical production errors affecting more than 100 users in ${FIXTURES.organizationSlug}`, + expectedTools: [ { - input: `Search for database errors in ${FIXTURES.organizationSlug}/${FIXTURES.projectSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - projectSlugOrId: FIXTURES.projectSlug, - query: "database errors", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Core test: Complex natural language query { - input: `Find critical production errors affecting more than 100 users in ${FIXTURES.organizationSlug}`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "search_issues", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - query: "critical production errors affecting more than 100 users", - }, - }, - ], + name: "search_issues", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + query: "critical production errors affecting more than 100 users", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/update-issue.eval.ts b/packages/mcp-server-evals/src/evals/update-issue.eval.ts index e5cb3174b..af4b15513 100644 --- a/packages/mcp-server-evals/src/evals/update-issue.eval.ts +++ b/packages/mcp-server-evals/src/evals/update-issue.eval.ts @@ -1,125 +1,116 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("update-issue", { - data: async () => { - return [ - // Core use case: Resolve an issue +describeToolPredictionEval("update-issue", [ + // Core use case: Resolve an issue + { + input: `Resolve the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug}. Output only the new status as a single word.`, + expectedTools: [ { - input: `Resolve the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug}. Output only the new status as a single word.`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "update_issue", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - status: "resolved", - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Core use case: Assign an issue { - input: `Assign the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} to 'john.doe'. Output only the assigned username.`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "update_issue", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - assignedTo: "john.doe", - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "update_issue", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + status: "resolved", + regionUrl: "https://us.sentry.io", + }, }, - // Core use case: Using issue URL (alternative input method) + ], + }, + // Core use case: Assign an issue + { + input: `Assign the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} to 'john.doe'. Output only the assigned username.`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "update_issue", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + assignedTo: "john.doe", + regionUrl: "https://us.sentry.io", + }, + }, + ], + }, + // Core use case: Using issue URL (alternative input method) + { + input: `Resolve the issue at ${FIXTURES.issueUrl}. Output only the new status as a single word.`, + expectedTools: [ { - input: `Resolve the issue at ${FIXTURES.issueUrl}. Output only the new status as a single word.`, - expectedTools: [ - { - name: "update_issue", - arguments: { - issueUrl: FIXTURES.issueUrl, - status: "resolved", - }, - }, - ], + name: "update_issue", + arguments: { + issueUrl: FIXTURES.issueUrl, + status: "resolved", + }, }, - // Regression: default ignored status should map to "until escalating" + ], + }, + // Regression: default ignored status should map to "until escalating" + { + input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it escalates. Output only the new status as a single word.`, + expectedTools: [ { - input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it escalates. Output only the new status as a single word.`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "update_issue", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - status: "ignored", - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "find_organizations", + arguments: {}, }, - // Regression: permanent ignores need the explicit forever mode { - input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} forever. Output only the new status as a single word.`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "update_issue", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - status: "ignored", - ignoreMode: "forever", - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "update_issue", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + status: "ignored", + regionUrl: "https://us.sentry.io", + }, + }, + ], + }, + // Regression: permanent ignores need the explicit forever mode + { + input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} forever. Output only the new status as a single word.`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + { + name: "update_issue", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + status: "ignored", + ignoreMode: "forever", + regionUrl: "https://us.sentry.io", + }, + }, + ], + }, + // Regression: count-based ignores should use the structured ignore fields + { + input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it happens 100 times in 60 minutes. Output only the new status as a single word.`, + expectedTools: [ + { + name: "find_organizations", + arguments: {}, }, - // Regression: count-based ignores should use the structured ignore fields { - input: `Ignore the issue ${FIXTURES.issueId} in organization ${FIXTURES.organizationSlug} until it happens 100 times in 60 minutes. Output only the new status as a single word.`, - expectedTools: [ - { - name: "find_organizations", - arguments: {}, - }, - { - name: "update_issue", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - issueId: FIXTURES.issueId, - status: "ignored", - ignoreMode: "untilOccurrenceCount", - ignoreCount: 100, - ignoreWindowMinutes: 60, - regionUrl: "https://us.sentry.io", - }, - }, - ], + name: "update_issue", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + issueId: FIXTURES.issueId, + status: "ignored", + ignoreMode: "untilOccurrenceCount", + ignoreCount: 100, + ignoreWindowMinutes: 60, + regionUrl: "https://us.sentry.io", + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/update-project.eval.ts b/packages/mcp-server-evals/src/evals/update-project.eval.ts index 2f979007e..4f4c11364 100644 --- a/packages/mcp-server-evals/src/evals/update-project.eval.ts +++ b/packages/mcp-server-evals/src/evals/update-project.eval.ts @@ -1,40 +1,31 @@ -import { describeEval } from "vitest-evals"; -import { FIXTURES, NoOpTaskRunner, ToolPredictionScorer } from "./utils"; +import { describeToolPredictionEval, FIXTURES } from "./utils"; -describeEval("update-project", { - data: async () => { - return [ +describeToolPredictionEval("update-project", [ + { + input: `Update the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to change its name to 'Updated Project Name' and slug to 'updated-project-slug'. Output only the new project slug as plain text without any formatting:\nupdated-project-slug`, + expectedTools: [ { - input: `Update the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to change its name to 'Updated Project Name' and slug to 'updated-project-slug'. Output only the new project slug as plain text without any formatting:\nupdated-project-slug`, - expectedTools: [ - { - name: "update_project", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - projectSlug: FIXTURES.projectSlug, - name: "Updated Project Name", - slug: "updated-project-slug", - }, - }, - ], + name: "update_project", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + projectSlug: FIXTURES.projectSlug, + name: "Updated Project Name", + slug: "updated-project-slug", + }, }, + ], + }, + { + input: `Assign the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to the team '${FIXTURES.teamSlug}'. Output only the team slug as plain text without any formatting:\nthe-goats`, + expectedTools: [ { - input: `Assign the project '${FIXTURES.projectSlug}' in organization '${FIXTURES.organizationSlug}' to the team '${FIXTURES.teamSlug}'. Output only the team slug as plain text without any formatting:\nthe-goats`, - expectedTools: [ - { - name: "update_project", - arguments: { - organizationSlug: FIXTURES.organizationSlug, - projectSlug: FIXTURES.projectSlug, - teamSlug: FIXTURES.teamSlug, - }, - }, - ], + name: "update_project", + arguments: { + organizationSlug: FIXTURES.organizationSlug, + projectSlug: FIXTURES.projectSlug, + teamSlug: FIXTURES.teamSlug, + }, }, - ]; + ], }, - task: NoOpTaskRunner(), - scorers: [ToolPredictionScorer()], - threshold: 0.6, - timeout: 30000, -}); +]); diff --git a/packages/mcp-server-evals/src/evals/utils/describe.ts b/packages/mcp-server-evals/src/evals/utils/describe.ts new file mode 100644 index 000000000..43b79b80e --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/describe.ts @@ -0,0 +1,118 @@ +import { + describeEval, + StructuredOutputJudge, + ToolCallJudge, + type Harness, + type JsonValue, +} from "vitest-evals"; +import { + ToolPredictionJudge, + toolPredictionHarness, +} from "./toolPredictionHarness"; +import { mcpToolCallHarness } from "./mcpToolCallHarness"; +import type { + EvalCase, + StructuredEvalMetadata, + ToolCallEvalMetadata, + ToolPredictionMetadata, +} from "./types"; + +type EvalOptions = { + threshold?: number | null; + timeout?: number; +}; + +function resolveThreshold( + threshold: number | null | undefined, + defaultThreshold: number, +) { + return threshold === undefined ? defaultThreshold : threshold; +} + +export function describeToolPredictionEval( + name: string, + cases: EvalCase[], + options: EvalOptions = {}, +) { + describeEval( + name, + { + harness: toolPredictionHarness, + judges: [ToolPredictionJudge], + judgeThreshold: resolveThreshold(options.threshold, 0.6), + }, + (it) => { + for (const testCase of cases) { + const { input, name: testName, ...metadata } = testCase; + + it( + testName ?? input, + { timeout: options.timeout ?? 30000 }, + async ({ run }) => { + await run(input, { metadata }); + }, + ); + } + }, + ); +} + +export function describeMcpToolCallEval( + name: string, + cases: EvalCase[], + options: EvalOptions = {}, +) { + describeEval( + name, + { + harness: mcpToolCallHarness, + judges: [ToolCallJudge({ ordered: true, params: "fuzzy" })], + judgeThreshold: resolveThreshold(options.threshold, 0.6), + }, + (it) => { + for (const testCase of cases) { + const { input, name: testName, ...metadata } = testCase; + + it( + testName ?? input, + { timeout: options.timeout ?? 90000 }, + async ({ run }) => { + await run(input, { metadata }); + }, + ); + } + }, + ); +} + +export function describeSearchAgentEval( + name: string, + harness: Harness, + cases: EvalCase[], + options: EvalOptions = {}, +) { + describeEval( + name, + { + harness, + judges: [ + ToolCallJudge({ params: "fuzzy" }), + StructuredOutputJudge({ match: "fuzzy" }), + ], + judgeThreshold: resolveThreshold(options.threshold, 0.6), + }, + (it) => { + for (const testCase of cases) { + const { input, name: testName, ...metadata } = testCase; + + it( + testName ?? input, + { timeout: options.timeout ?? 150000 }, + async ({ run }) => { + await run(input, { metadata }); + }, + ); + } + }, + ); +} diff --git a/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.test.ts b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.test.ts new file mode 100644 index 000000000..cddab5931 --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.test.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from "vitest"; +import { createEmbeddedSearchAgentHarness } from "./embeddedAgentHarness"; + +function createHarnessContext() { + const artifacts = {}; + + return { + metadata: {}, + artifacts, + setArtifact: () => {}, + }; +} + +describe("createEmbeddedSearchAgentHarness", () => { + it("uses a fallback session when AI SDK steps lack harness model metadata", async () => { + const harness = createEmbeddedSearchAgentHarness( + "test-embedded-agent", + async () => ({ + result: { + query: "is:unresolved", + }, + toolCalls: [ + { + toolName: "whoami", + args: {}, + }, + ], + steps: [ + { + usage: { + inputTokens: 1, + outputTokens: 2, + totalTokens: 3, + }, + }, + ], + totalUsage: { + inputTokens: 1, + outputTokens: 2, + totalTokens: 3, + }, + }), + ); + + const run = await harness.run( + "show unresolved issues", + createHarnessContext(), + ); + + expect(run.output).toEqual({ + query: "is:unresolved", + }); + expect(run.session.messages).toEqual([ + { + role: "user", + content: "show unresolved issues", + }, + { + role: "assistant", + content: { + query: "is:unresolved", + }, + toolCalls: [ + { + name: "whoami", + arguments: {}, + }, + ], + }, + ]); + expect(run.usage).toEqual({ + inputTokens: 1, + outputTokens: 2, + totalTokens: 3, + }); + }); +}); diff --git a/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.ts b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.ts new file mode 100644 index 000000000..44b320d06 --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/embeddedAgentHarness.ts @@ -0,0 +1,91 @@ +import { SentryApiService } from "@sentry/mcp-core/api-client"; +import { searchEventsAgent } from "@sentry/mcp-core/tools/search-events/agent"; +import { searchIssueEventsAgent } from "@sentry/mcp-core/tools/search-issue-events/agent"; +import { searchIssuesAgent } from "@sentry/mcp-core/tools/search-issues/agent"; +import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk"; +import type { JsonValue, ToolCallRecord } from "vitest-evals"; +import { withFallbackSession } from "./fallbackSession"; +import { FIXTURES } from "./fixtures"; +import { requireJsonValue, toJsonRecord } from "./json"; +import type { StructuredEvalMetadata } from "./types"; + +type CapturedToolCall = { + toolName: string; + args: unknown; +}; + +type EmbeddedSearchAgentOptions = { + query: string; + organizationSlug: string; + apiService: SentryApiService; + projectId?: string; +}; + +type EmbeddedSearchAgentResult = { + result: unknown; + toolCalls: CapturedToolCall[]; + steps?: unknown[]; + usage?: unknown; + totalUsage?: unknown; +}; + +type EmbeddedSearchAgent = ( + options: EmbeddedSearchAgentOptions, +) => Promise; + +function toToolCallRecord(call: CapturedToolCall): ToolCallRecord { + return { + name: call.toolName, + arguments: toJsonRecord(call.args), + }; +} + +export function createEmbeddedSearchAgentHarness( + name: string, + agent: EmbeddedSearchAgent, +) { + return aiSdkHarness< + undefined, + string, + StructuredEvalMetadata, + EmbeddedSearchAgentResult, + Record, + JsonValue + >({ + name, + run: async ({ input }) => { + const apiService = new SentryApiService({ + accessToken: "test-token", + }); + + const result = await agent({ + query: input, + organizationSlug: FIXTURES.organizationSlug, + apiService, + }); + + return withFallbackSession( + input, + result, + requireJsonValue(result.result, "agent output"), + result.toolCalls.map(toToolCallRecord), + ); + }, + output: ({ result }) => requireJsonValue(result.result, "agent output"), + }); +} + +export const searchEventsAgentHarness = createEmbeddedSearchAgentHarness( + "search-events-agent", + searchEventsAgent, +); + +export const searchIssueEventsAgentHarness = createEmbeddedSearchAgentHarness( + "search-issue-events-agent", + searchIssueEventsAgent, +); + +export const searchIssuesAgentHarness = createEmbeddedSearchAgentHarness( + "search-issues-agent", + searchIssuesAgent, +); diff --git a/packages/mcp-server-evals/src/evals/utils/fallbackSession.ts b/packages/mcp-server-evals/src/evals/utils/fallbackSession.ts new file mode 100644 index 000000000..0187e6b8c --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/fallbackSession.ts @@ -0,0 +1,69 @@ +import type { + JsonValue, + NormalizedSession, + ToolCallRecord, +} from "vitest-evals"; + +export function createFallbackSession( + input: string, + output: JsonValue, + toolCalls: ToolCallRecord[] = [], +): NormalizedSession { + return { + messages: [ + { + role: "user", + content: input, + }, + { + role: "assistant", + content: output, + ...(toolCalls.length > 0 ? { toolCalls } : {}), + }, + ], + }; +} + +function hasHarnessStepModel(step: unknown) { + if (!step || typeof step !== "object" || !("model" in step)) { + return false; + } + + const { model } = step; + if (!model || typeof model !== "object") { + return false; + } + + return ( + "provider" in model && + typeof model.provider === "string" && + "modelId" in model && + typeof model.modelId === "string" + ); +} + +export function withFallbackSession( + input: string, + result: Result, + output: JsonValue, + toolCalls: ToolCallRecord[] = [], +) { + const session = createFallbackSession(input, output, toolCalls); + + if ( + Array.isArray(result.steps) && + result.steps.length > 0 && + result.steps.every(hasHarnessStepModel) + ) { + return { + ...result, + session, + }; + } + + return { + ...result, + steps: undefined, + session, + }; +} diff --git a/packages/mcp-server-evals/src/evals/utils/index.ts b/packages/mcp-server-evals/src/evals/utils/index.ts index 0316b2a61..01c2cc246 100644 --- a/packages/mcp-server-evals/src/evals/utils/index.ts +++ b/packages/mcp-server-evals/src/evals/utils/index.ts @@ -1,7 +1,17 @@ export { FIXTURES } from "./fixtures"; -export { McpToolCallTaskRunner } from "./mcpToolCallRunner"; -export { NoOpTaskRunner } from "./runner"; export { - ToolPredictionScorer, - type ExpectedToolCall, -} from "./toolPredictionScorer"; + describeMcpToolCallEval, + describeSearchAgentEval, + describeToolPredictionEval, +} from "./describe"; +export { + searchEventsAgentHarness, + searchIssueEventsAgentHarness, + searchIssuesAgentHarness, +} from "./embeddedAgentHarness"; +export { + ToolPredictionJudge, + toolPredictionHarness, +} from "./toolPredictionHarness"; +export { mcpToolCallHarness } from "./mcpToolCallHarness"; +export type { ExpectedToolCall } from "./types"; diff --git a/packages/mcp-server-evals/src/evals/utils/json.ts b/packages/mcp-server-evals/src/evals/utils/json.ts new file mode 100644 index 000000000..176eba04f --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/json.ts @@ -0,0 +1,25 @@ +import { toJsonValue, type JsonValue } from "vitest-evals"; + +export function toJsonRecord(value: unknown): Record { + const normalized = toJsonValue(value); + + if ( + normalized && + typeof normalized === "object" && + !Array.isArray(normalized) + ) { + return normalized; + } + + return {}; +} + +export function requireJsonValue(value: unknown, label: string): JsonValue { + const normalized = toJsonValue(value); + + if (normalized === undefined) { + throw new Error(`${label} is not JSON-serializable`); + } + + return normalized; +} diff --git a/packages/mcp-server-evals/src/evals/utils/mcpClient.test.ts b/packages/mcp-server-evals/src/evals/utils/mcpClient.test.ts new file mode 100644 index 000000000..eceaa0c39 --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/mcpClient.test.ts @@ -0,0 +1,15 @@ +import { describe, expect, it } from "vitest"; +import { getAvailableToolDescriptions } from "./mcpClient"; + +describe("getAvailableToolDescriptions", () => { + it("uses stable tool definitions for prediction prompts", async () => { + const descriptions = await getAvailableToolDescriptions(); + const toolNames = descriptions.map((description) => + description.slice(0, description.indexOf(" - ")), + ); + + expect(toolNames).toContain("find_teams"); + expect(toolNames).toContain("create_project"); + expect(toolNames).toContain("find_releases"); + }); +}); diff --git a/packages/mcp-server-evals/src/evals/utils/mcpClient.ts b/packages/mcp-server-evals/src/evals/utils/mcpClient.ts new file mode 100644 index 000000000..0193d8ffa --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/mcpClient.ts @@ -0,0 +1,54 @@ +import { experimental_createMCPClient } from "@ai-sdk/mcp"; +import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio"; +import toolDefinitions from "@sentry/mcp-core/toolDefinitions"; + +type MockMcpClient = Awaited>; + +let cachedToolDescriptions: Promise | null = null; + +function createMockTransport() { + return new Experimental_StdioMCPTransport({ + command: "pnpm", + args: ["--filter", "@sentry/mcp-server-evals", "start"], + env: { + ...process.env, + SENTRY_ACCESS_TOKEN: "mocked-access-token", + SENTRY_HOST: "sentry.io", + }, + }); +} + +function getShortDescription(description: string): string { + return description.split("\n")[0] ?? ""; +} + +export async function withMockMcpClient( + callback: (client: MockMcpClient) => Promise, +): Promise { + const client = await experimental_createMCPClient({ + transport: createMockTransport(), + }); + + try { + return await callback(client); + } finally { + await client.close(); + } +} + +async function loadAvailableToolDescriptions() { + return toolDefinitions.map( + (tool) => `${tool.name} - ${getShortDescription(tool.description)}`, + ); +} + +export async function getAvailableToolDescriptions(): Promise { + cachedToolDescriptions ??= loadAvailableToolDescriptions().catch( + (error: unknown) => { + cachedToolDescriptions = null; + throw error; + }, + ); + + return cachedToolDescriptions; +} diff --git a/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.test.ts b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.test.ts new file mode 100644 index 000000000..08c334284 --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.test.ts @@ -0,0 +1,222 @@ +import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk"; +import { + dynamicTool, + type LanguageModelUsage, + type ToolExecutionOptions, +} from "ai"; +import { + toolCalls as collectToolCalls, + type ToolCallRecord, +} from "vitest-evals"; +import { describe, expect, it } from "vitest"; +import { z } from "zod"; +import { + captureMcpToolCalls, + createMcpToolCallRun, + prepareMcpToolCallStep, +} from "./mcpToolCallHarness"; + +function createToolOptions(toolCallId: string): ToolExecutionOptions { + return { + toolCallId, + messages: [], + }; +} + +function createUsage(): LanguageModelUsage { + return { + inputTokens: 10, + inputTokenDetails: { + noCacheTokens: 10, + cacheReadTokens: undefined, + cacheWriteTokens: undefined, + }, + outputTokens: 5, + outputTokenDetails: { + textTokens: 5, + reasoningTokens: undefined, + }, + totalTokens: 15, + }; +} + +describe("captureMcpToolCalls", () => { + it("captures dynamic MCP tool execution before delegating", async () => { + const capturedToolCalls: ToolCallRecord[] = []; + const tools = captureMcpToolCalls( + { + search_tools: dynamicTool({ + inputSchema: z.object({ + query: z.string(), + }), + execute: async (input) => ({ + name: "get_issue_details", + input, + }), + }), + }, + capturedToolCalls, + ); + + const result = await tools.search_tools.execute?.( + { query: "issue" }, + createToolOptions("call_1"), + ); + + expect(result).toEqual({ + name: "get_issue_details", + input: { + query: "issue", + }, + }); + expect(capturedToolCalls).toMatchObject([ + { + id: "call_1", + name: "search_tools", + arguments: { + query: "issue", + }, + result: { + name: "get_issue_details", + input: { + query: "issue", + }, + }, + }, + ]); + expect(capturedToolCalls[0].startedAt).toEqual(expect.any(String)); + expect(capturedToolCalls[0].finishedAt).toEqual(expect.any(String)); + expect(capturedToolCalls[0].durationMs).toEqual(expect.any(Number)); + }); + + it("records tool errors before rethrowing", async () => { + const capturedToolCalls: ToolCallRecord[] = []; + const tools = captureMcpToolCalls( + { + execute_tool: dynamicTool({ + inputSchema: z.object({ + name: z.string(), + }), + execute: async () => { + throw new Error("tool failed"); + }, + }), + }, + capturedToolCalls, + ); + + await expect( + tools.execute_tool.execute?.( + { name: "get_issue_details" }, + createToolOptions("call_2"), + ), + ).rejects.toThrow("tool failed"); + + expect(capturedToolCalls).toMatchObject([ + { + id: "call_2", + name: "execute_tool", + arguments: { + name: "get_issue_details", + }, + error: { + type: "Error", + message: "tool failed", + }, + }, + ]); + }); +}); + +describe("prepareMcpToolCallStep", () => { + it("forces discovery before catalog execution", () => { + expect(prepareMcpToolCallStep(0)).toEqual({ + toolChoice: { + type: "tool", + toolName: "search_tools", + }, + activeTools: ["search_tools"], + }); + expect(prepareMcpToolCallStep(1)).toEqual({ + toolChoice: { + type: "tool", + toolName: "execute_tool", + }, + activeTools: ["execute_tool"], + }); + expect(prepareMcpToolCallStep(2)).toBeUndefined(); + }); +}); + +describe("createMcpToolCallRun", () => { + it("preserves the captured sequence when raw AI SDK steps only expose the last call", async () => { + const capturedToolCalls: ToolCallRecord[] = [ + { + id: "call_1", + name: "search_tools", + arguments: { + query: "issue", + }, + }, + { + id: "call_2", + name: "execute_tool", + arguments: { + name: "get_issue_details", + }, + }, + ]; + const result = { + text: "Issue summary", + steps: [ + { + model: { + provider: "openai", + modelId: "gpt-4o", + }, + toolCalls: [ + { + toolCallId: "call_2", + toolName: "execute_tool", + input: { + name: "get_issue_details", + }, + }, + ], + usage: createUsage(), + }, + ], + totalUsage: createUsage(), + }; + const harness = aiSdkHarness({ + name: "mcp-tool-call-test", + run: async () => + createMcpToolCallRun("Explain an issue", result, capturedToolCalls), + }); + + const run = await harness.run("Explain an issue", { + metadata: {}, + artifacts: {}, + setArtifact: () => {}, + }); + + expect(collectToolCalls(run.session).map(({ name }) => name)).toEqual([ + "search_tools", + "execute_tool", + ]); + expect(run.usage).toMatchObject({ + provider: "openai", + model: "gpt-4o", + inputTokens: 10, + outputTokens: 5, + totalTokens: 15, + toolCalls: 2, + }); + expect( + (run.traces ?? []) + .flatMap((trace) => trace.spans) + .filter((span) => span.kind === "tool") + .map((span) => span.name), + ).toEqual(["search_tools", "execute_tool"]); + }); +}); diff --git a/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.ts b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.ts new file mode 100644 index 000000000..512dcffee --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/mcpToolCallHarness.ts @@ -0,0 +1,239 @@ +import { openai } from "@ai-sdk/openai"; +import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk"; +import { + generateText, + stepCountIs, + type LanguageModelUsage, + type PrepareStepResult, + type ToolExecutionOptions, + type ToolSet, +} from "ai"; +import type { Harness, HarnessRun, ToolCallRecord } from "vitest-evals"; +import { toJsonValue } from "vitest-evals"; +import { createFallbackSession } from "./fallbackSession"; +import { toJsonRecord } from "./json"; +import { withMockMcpClient } from "./mcpClient"; +import type { ToolCallEvalMetadata } from "./types"; + +const defaultModel = openai("gpt-4o"); + +type AiSdkResultWithUsage = { + text: string; + steps?: unknown; + totalUsage?: LanguageModelUsage; + usage?: LanguageModelUsage; +}; + +type ExecutableTool = ToolSet[string] & { + execute: (input: unknown, options: ToolExecutionOptions) => unknown; +}; + +function isExecutableTool(tool: ToolSet[string]): tool is ExecutableTool { + return typeof tool.execute === "function"; +} + +function toToolCallError(error: unknown): NonNullable { + if (error instanceof Error) { + return { + type: error.name, + message: error.message, + }; + } + + const normalized = toJsonValue(error); + if ( + normalized && + typeof normalized === "object" && + !Array.isArray(normalized) && + typeof normalized.message === "string" + ) { + return { + ...normalized, + type: typeof normalized.type === "string" ? normalized.type : "Error", + message: normalized.message, + }; + } + + return { + type: "Error", + message: String(error ?? "Unknown tool call error"), + }; +} + +export function captureMcpToolCalls( + tools: TTools, + capturedToolCalls: ToolCallRecord[], +): TTools { + return Object.fromEntries( + Object.entries(tools).map(([toolName, tool]) => { + if (!isExecutableTool(tool)) { + return [toolName, tool]; + } + + const execute = tool.execute; + const wrappedTool = { + ...tool, + execute: async ( + toolInput: unknown, + execution: ToolExecutionOptions, + ) => { + const startedAt = new Date(); + const toolCall: ToolCallRecord = { + id: execution.toolCallId, + name: toolName, + arguments: toJsonRecord(toolInput), + startedAt: startedAt.toISOString(), + }; + capturedToolCalls.push(toolCall); + + try { + const result = await execute(toolInput, execution); + const finishedAt = new Date(); + const normalizedResult = toJsonValue(result); + + if (normalizedResult !== undefined) { + toolCall.result = normalizedResult; + } + toolCall.finishedAt = finishedAt.toISOString(); + toolCall.durationMs = finishedAt.getTime() - startedAt.getTime(); + + return result; + } catch (error) { + const finishedAt = new Date(); + toolCall.error = toToolCallError(error); + toolCall.finishedAt = finishedAt.toISOString(); + toolCall.durationMs = finishedAt.getTime() - startedAt.getTime(); + throw error; + } + }, + }; + + return [toolName, wrappedTool]; + }), + ) as TTools; +} + +function getLastStepModel(result: AiSdkResultWithUsage) { + const steps = Array.isArray(result.steps) ? result.steps : []; + const lastStep = steps.at(-1); + + if (!lastStep || typeof lastStep !== "object" || !("model" in lastStep)) { + return {}; + } + + const { model } = lastStep; + if (!model || typeof model !== "object") { + return {}; + } + + return { + provider: "provider" in model ? String(model.provider) : undefined, + model: "modelId" in model ? String(model.modelId) : undefined, + }; +} + +function getTotalTokens(usage: LanguageModelUsage | undefined) { + if (!usage) { + return undefined; + } + + return ( + usage.totalTokens ?? (usage.inputTokens ?? 0) + (usage.outputTokens ?? 0) + ); +} + +export function createMcpToolCallRun( + input: string, + result: AiSdkResultWithUsage, + capturedToolCalls: ToolCallRecord[], +): HarnessRun { + const usage = result.totalUsage ?? result.usage; + const model = getLastStepModel(result); + + return { + session: createFallbackSession(input, result.text, capturedToolCalls), + output: result.text, + usage: { + ...model, + inputTokens: usage?.inputTokens, + outputTokens: usage?.outputTokens, + reasoningTokens: + usage?.outputTokenDetails?.reasoningTokens ?? usage?.reasoningTokens, + totalTokens: getTotalTokens(usage), + toolCalls: capturedToolCalls.length, + metadata: toJsonRecord({ + cacheReadTokens: + usage?.inputTokenDetails?.cacheReadTokens ?? usage?.cachedInputTokens, + cacheWriteTokens: usage?.inputTokenDetails?.cacheWriteTokens, + raw: usage?.raw, + }), + }, + errors: [], + }; +} + +function forcedToolStep(toolName: "search_tools" | "execute_tool") { + return { + toolChoice: { + type: "tool", + toolName, + }, + activeTools: [toolName], + } satisfies PrepareStepResult; +} + +export function prepareMcpToolCallStep( + stepNumber: number, +): PrepareStepResult | undefined { + if (stepNumber === 0) { + return forcedToolStep("search_tools"); + } + + if (stepNumber === 1) { + return forcedToolStep("execute_tool"); + } +} + +export function createMcpToolCallHarness( + maxSteps = 6, +): Harness { + return aiSdkHarness< + undefined, + string, + ToolCallEvalMetadata, + HarnessRun + >({ + name: "mcp-tool-call", + run: async ({ input, context }) => { + return await withMockMcpClient(async (client) => { + const capturedToolCalls: ToolCallRecord[] = []; + const tools = captureMcpToolCalls( + await client.tools(), + capturedToolCalls, + ); + const result = await generateText({ + model: defaultModel, + tools, + system: [ + "You are a Sentry assistant with access to Sentry MCP tools.", + "Use search_tools before execute_tool when the needed Sentry operation is not directly listed as a tool.", + "When search_tools returns a tool, call execute_tool with that returned tool name and arguments matching the returned schema.", + "When the user says 'from Sentry in ', Sentry is the product name and is the organizationSlug.", + ].join("\n"), + prompt: input, + stopWhen: stepCountIs(maxSteps), + abortSignal: context.signal, + prepareStep: ({ stepNumber }) => prepareMcpToolCallStep(stepNumber), + experimental_telemetry: { + isEnabled: true, + functionId: "catalog_tool_behavior_eval", + }, + }); + + return createMcpToolCallRun(input, result, capturedToolCalls); + }); + }, + }); +} + +export const mcpToolCallHarness = createMcpToolCallHarness(); diff --git a/packages/mcp-server-evals/src/evals/utils/mcpToolCallRunner.ts b/packages/mcp-server-evals/src/evals/utils/mcpToolCallRunner.ts deleted file mode 100644 index 2c674c9a7..000000000 --- a/packages/mcp-server-evals/src/evals/utils/mcpToolCallRunner.ts +++ /dev/null @@ -1,64 +0,0 @@ -import { experimental_createMCPClient } from "@ai-sdk/mcp"; -import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio"; -import { openai } from "@ai-sdk/openai"; -import { generateText, stepCountIs, type LanguageModel } from "ai"; - -const defaultModel = openai("gpt-4o"); - -function toToolCall(call: { toolName: string; input: unknown }) { - const input = - call.input && typeof call.input === "object" && !Array.isArray(call.input) - ? (call.input as Record) - : {}; - - return { - name: call.toolName, - arguments: input, - }; -} - -export function McpToolCallTaskRunner( - model: LanguageModel = defaultModel, - maxSteps = 6, -) { - return async function McpToolCallTaskRunner(input: string) { - const transport = new Experimental_StdioMCPTransport({ - command: "pnpm", - args: ["--filter", "@sentry/mcp-server-evals", "start"], - env: { - ...process.env, - SENTRY_ACCESS_TOKEN: "mocked-access-token", - SENTRY_HOST: "sentry.io", - }, - }); - const client = await experimental_createMCPClient({ transport }); - - try { - const tools = await client.tools(); - const result = await generateText({ - model, - tools, - system: [ - "You are a Sentry assistant with access to Sentry MCP tools.", - "Use search_tools before execute_tool when the needed Sentry operation is not directly listed as a tool.", - "When search_tools returns a tool, call execute_tool with that returned tool name and arguments matching the returned schema.", - ].join("\n"), - prompt: input, - stopWhen: stepCountIs(maxSteps), - experimental_telemetry: { - isEnabled: true, - functionId: "catalog_tool_behavior_eval", - }, - }); - - return { - result: result.text, - toolCalls: result.steps.flatMap((step) => - step.toolCalls.map(toToolCall), - ), - }; - } finally { - await client.close(); - } - }; -} diff --git a/packages/mcp-server-evals/src/evals/utils/runner.ts b/packages/mcp-server-evals/src/evals/utils/runner.ts deleted file mode 100644 index 7a8e6d105..000000000 --- a/packages/mcp-server-evals/src/evals/utils/runner.ts +++ /dev/null @@ -1,14 +0,0 @@ -/** - * A no-op task runner that doesn't execute tools, just returns the input - * for use with ToolPredictionScorer. This allows tests to focus on predicting - * which tools would be called without actually executing them. - */ -export function NoOpTaskRunner() { - return async function NoOpTaskRunner(input: string) { - // Just return the input as the result, no tool execution - return { - result: input, - toolCalls: [], - }; - }; -} diff --git a/packages/mcp-server-evals/src/evals/utils/structuredOutputScorer.ts b/packages/mcp-server-evals/src/evals/utils/structuredOutputScorer.ts deleted file mode 100644 index 65fdf4cd9..000000000 --- a/packages/mcp-server-evals/src/evals/utils/structuredOutputScorer.ts +++ /dev/null @@ -1,282 +0,0 @@ -import type { Score, ScoreFn, BaseScorerOptions } from "vitest-evals"; - -interface StructuredOutputScorerOptions extends BaseScorerOptions { - expected?: Record; -} - -interface StructuredOutputScorerConfig { - /** - * How to match field values - * - "strict": Exact equality required (default) - * - "fuzzy": More flexible matching (regex patterns, type coercion) - * - Custom function: Your own comparison logic - * @default "strict" - */ - match?: - | "strict" - | "fuzzy" - | ((expected: any, actual: any, key: string) => boolean); - - /** - * Whether all expected fields must be present for a passing score - * When false: gives partial credit based on fields matched - * @default true - */ - requireAll?: boolean; - - /** - * Whether to allow additional fields beyond those expected - * @default true - */ - allowExtras?: boolean; - - /** - * Enable debug logging - * @default false - */ - debug?: boolean; -} - -/** - * A configurable scorer for evaluating structured outputs (e.g., JSON) from LLM responses. - * - * Similar to ToolCallScorer but for validating structured data outputs like API queries. - * - * @param config - Configuration options for the scorer - * @param config.match - How to match field values: "strict", "fuzzy", or custom function - * @param config.requireAll - Require all expected fields (vs partial credit) - * @param config.allowExtras - Allow additional fields in output - * @param config.debug - Enable debug logging - * - * @example - * // Default: strict matching - * describeEval("query generation", { - * data: async () => [{ - * input: "Show me errors from today", - * expected: { - * dataset: "errors", - * query: "", - * sort: "-timestamp", - * timeRange: { statsPeriod: "24h" } - * } - * }], - * task: myTask, - * scorers: [StructuredOutputScorer()] - * }); - * - * @example - * // Fuzzy matching with regex patterns - * describeEval("flexible query matching", { - * data: async () => [{ - * input: "Find slow API calls", - * expected: { - * dataset: "spans", - * query: /span\.duration:>1000|span\.duration:>1s/, - * sort: "-span.duration" - * } - * }], - * task: myTask, - * scorers: [StructuredOutputScorer({ match: "fuzzy" })] - * }); - */ -export function StructuredOutputScorer( - config: StructuredOutputScorerConfig = {}, -): ScoreFn { - const { - match = "strict", - requireAll = true, - allowExtras = true, - debug = false, - } = config; - - return async (opts: StructuredOutputScorerOptions): Promise => { - const { output, expected } = opts; - - // If no expected output provided, just check if we got valid JSON - if (!expected) { - try { - JSON.parse(output); - return { score: 1, metadata: { rationale: "Valid JSON output" } }; - } catch { - return { score: 0, metadata: { rationale: "Invalid JSON output" } }; - } - } - - let parsed: Record; - try { - parsed = JSON.parse(output); - } catch (error) { - return { - score: 0, - metadata: { rationale: `Failed to parse output as JSON: ${error}` }, - }; - } - - // Check for error field in output - if (parsed.error && parsed.error !== "" && parsed.error !== null) { - return { - score: 0, - metadata: { rationale: `Output contains error: ${parsed.error}` }, - }; - } - - const matchFn = getMatchFunction(match); - const { matches, mismatches, extras } = compareObjects( - expected, - parsed, - matchFn, - ); - - if (debug) { - console.log("StructuredOutputScorer debug:"); - console.log("Expected:", expected); - console.log("Actual:", parsed); - console.log("Matches:", matches); - console.log("Mismatches:", mismatches); - console.log("Extras:", extras); - } - - // Calculate score - const totalExpected = Object.keys(expected).length; - const totalMatched = matches.length; - const hasExtras = extras.length > 0; - - let score: number; - let rationale: string; - - if (requireAll && mismatches.length > 0) { - score = 0; - rationale = `Missing required fields: ${mismatches.map((m) => m.key).join(", ")}`; - } else if (!allowExtras && hasExtras) { - score = 0; - rationale = `Unexpected extra fields: ${extras.join(", ")}`; - } else if (totalExpected === 0) { - score = 1; - rationale = "No expected fields to match"; - } else { - score = totalMatched / totalExpected; - if (score === 1) { - rationale = "All expected fields match"; - } else { - rationale = `Matched ${totalMatched}/${totalExpected} fields`; - } - } - - // Add mismatch details to rationale - if (mismatches.length > 0 && score < 1) { - const details = mismatches - .map( - (m) => - `${m.key}: expected ${formatValue(m.expected)}, got ${formatValue(m.actual)}`, - ) - .join("; "); - rationale += ` - ${details}`; - } - - return { - score, - metadata: { - rationale, - output, - }, - }; - }; -} - -function getMatchFunction( - match: StructuredOutputScorerConfig["match"], -): (expected: any, actual: any, key: string) => boolean { - if (typeof match === "function") { - return match; - } - - if (match === "fuzzy") { - return fuzzyMatch; - } - - return strictMatch; -} - -function strictMatch(expected: any, actual: any): boolean { - return JSON.stringify(expected) === JSON.stringify(actual); -} - -function fuzzyMatch(expected: any, actual: any): boolean { - // Handle regex patterns - if (expected instanceof RegExp) { - return typeof actual === "string" && expected.test(actual); - } - - // Handle functions (custom validators) - if (typeof expected === "function") { - return expected(actual); - } - - // Handle null/undefined (intentionally using == for null/undefined check) - if ( - expected === null || - expected === undefined || - actual === null || - actual === undefined - ) { - return expected === actual; - } - - // Handle arrays - if (Array.isArray(expected) && Array.isArray(actual)) { - if (expected.length !== actual.length) return false; - return expected.every((exp, i) => fuzzyMatch(exp, actual[i])); - } - - // Handle objects - if (typeof expected === "object" && typeof actual === "object") { - return Object.keys(expected).every((key) => - fuzzyMatch(expected[key], actual[key]), - ); - } - - // Handle primitives - fuzzy match allows type coercion (e.g., "1" matches 1) - // biome-ignore lint/suspicious/noDoubleEquals: Intentional for fuzzy matching with type coercion - return expected == actual; -} - -interface ComparisonResult { - matches: Array<{ key: string; expected: any; actual: any }>; - mismatches: Array<{ key: string; expected: any; actual: any }>; - extras: string[]; -} - -function compareObjects( - expected: Record, - actual: Record, - matchFn: (expected: any, actual: any, key: string) => boolean, -): ComparisonResult { - const matches: ComparisonResult["matches"] = []; - const mismatches: ComparisonResult["mismatches"] = []; - - // Check expected fields - for (const [key, expectedValue] of Object.entries(expected)) { - const actualValue = actual[key]; - - if (matchFn(expectedValue, actualValue, key)) { - matches.push({ key, expected: expectedValue, actual: actualValue }); - } else { - mismatches.push({ key, expected: expectedValue, actual: actualValue }); - } - } - - // Find extra fields - const expectedKeys = new Set(Object.keys(expected)); - const extras = Object.keys(actual).filter((key) => !expectedKeys.has(key)); - - return { matches, mismatches, extras }; -} - -function formatValue(value: any): string { - if (value === undefined) return "undefined"; - if (value === null) return "null"; - if (value instanceof RegExp) return value.toString(); - if (typeof value === "string") return `"${value}"`; - if (typeof value === "object") return JSON.stringify(value); - return String(value); -} diff --git a/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.test.ts b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.test.ts new file mode 100644 index 000000000..5b7f28c4b --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.test.ts @@ -0,0 +1,185 @@ +import { describe, expect, it } from "vitest"; +import type { Harness, HarnessRun } from "vitest-evals"; +import { + ToolPredictionJudge, + generatePredictionPrompt, +} from "./toolPredictionHarness"; +import type { ToolPredictionMetadata, ToolPredictionOutput } from "./types"; + +function createJudgeContext( + output: ToolPredictionOutput, + metadata: ToolPredictionMetadata, +): Parameters[0] { + const run: HarnessRun = { + output, + session: { messages: [] }, + usage: {}, + errors: [], + }; + const harness: Harness = + { + name: "test-tool-prediction", + run: async () => run, + }; + + return { + input: "test input", + output, + toolCalls: [], + metadata, + run, + session: run.session, + harness, + }; +} + +describe("ToolPredictionJudge", () => { + it("does not leak expected tool calls into the prediction prompt", () => { + const prompt = generatePredictionPrompt( + ["- search_issues: Search Sentry issues"], + "Find recent crashes in production", + ); + + expect(prompt).toContain("- search_issues: Search Sentry issues"); + expect(prompt).toContain("Find recent crashes in production"); + expect(prompt).not.toContain("EXPECTED TOOL CALLS"); + expect(prompt).not.toContain("follow them exactly"); + expect(prompt).not.toContain("expected tools"); + }); + + it("scores matching predicted tools", async () => { + const result = await ToolPredictionJudge.assess( + createJudgeContext( + { + score: 1, + rationale: "The task asks for accessible organizations.", + predictedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + ], + }, + { + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + ], + }, + ), + ); + + expect(result.score).toBe(1); + expect(result.metadata?.predictedTools).toEqual([ + { + name: "find_organizations", + arguments: {}, + }, + ]); + }); + + it("uses deterministic score when the model underrates matching tools", async () => { + const result = await ToolPredictionJudge.assess( + createJudgeContext( + { + score: 0, + rationale: "The expected discovery call is not necessary.", + predictedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + ], + }, + { + expectedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + ], + }, + ), + ); + + expect(result.score).toBe(1); + expect(result.metadata?.modelScore).toBe(0); + expect(result.metadata?.deterministicScore).toBe(1); + }); + + it("ignores inflated model scores for wrong predicted tools", async () => { + const result = await ToolPredictionJudge.assess( + createJudgeContext( + { + score: 0.8, + rationale: "The prediction picked the wrong lookup path.", + predictedTools: [ + { + name: "find_organizations", + arguments: {}, + }, + ], + }, + { + expectedTools: [ + { + name: "search_docs", + arguments: { + query: "rate limiting", + }, + }, + ], + }, + ), + ); + + expect(result.score).toBe(0); + expect(result.metadata?.rationale).toContain("wrong lookup path"); + expect(result.metadata?.deterministicRationale).toContain( + "Partial match: 0/1", + ); + expect(result.metadata?.deterministicScore).toBe(0); + }); + + it("uses deterministic partial scores for incomplete multi-step predictions", async () => { + const result = await ToolPredictionJudge.assess( + createJudgeContext( + { + score: 0.6, + rationale: "The prediction found the issue but missed the update.", + predictedTools: [ + { + name: "search_issues", + arguments: { + organizationSlug: "sentry", + }, + }, + ], + }, + { + expectedTools: [ + { + name: "search_issues", + arguments: { + organizationSlug: "sentry", + }, + }, + { + name: "update_issue", + arguments: { + organizationSlug: "sentry", + }, + }, + ], + }, + ), + ); + + expect(result.score).toBe(0.5); + expect(result.metadata?.rationale).toContain("missed the update"); + expect(result.metadata?.deterministicRationale).toContain("Partial match"); + expect(result.metadata?.deterministicScore).toBe(0.5); + }); +}); diff --git a/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.ts b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.ts new file mode 100644 index 000000000..255247069 --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/toolPredictionHarness.ts @@ -0,0 +1,189 @@ +import { openai } from "@ai-sdk/openai"; +import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk"; +import { generateObject, type GenerateObjectResult } from "ai"; +import { + createJudge, + ToolCallJudge, + type JudgeContext, + type JsonValue, + type ToolCallRecord, +} from "vitest-evals"; +import { z } from "zod"; +import { requireJsonValue, toJsonRecord } from "./json"; +import { getAvailableToolDescriptions } from "./mcpClient"; +import type { + ExpectedToolCall, + PredictedToolCall, + ToolPredictionMetadata, + ToolPredictionOutput, +} from "./types"; + +const defaultModel = openai("gpt-4o"); + +const jsonPrimitiveSchema = z.union([ + z.string(), + z.number(), + z.boolean(), + z.null(), +]); +const shallowJsonValueSchema = z.union([ + jsonPrimitiveSchema, + z.array(jsonPrimitiveSchema), + z.record(jsonPrimitiveSchema), +]); +const jsonValueSchema: z.ZodType = z.union([ + shallowJsonValueSchema, + z.array(shallowJsonValueSchema), + z.record(shallowJsonValueSchema), +]); + +const predictionSchema = z.object({ + score: z + .number() + .min(0) + .max(1) + .describe("Confidence score for the predicted tool calls from 0 to 1"), + rationale: z + .string() + .describe("Brief explanation of the score and predicted tool calls"), + predictedTools: z + .array( + z.object({ + name: z.string().describe("Sentry MCP tool name"), + arguments: z.record(jsonValueSchema).optional().default({}), + }), + ) + .describe("Ordered Sentry MCP tool calls the assistant would likely make"), +}); + +type RawToolPredictionOutput = z.infer; +type ToolPredictionResult = GenerateObjectResult; + +export function generatePredictionPrompt( + availableTools: string[], + task: string, +) { + return `You are predicting which Sentry MCP tools an AI assistant would call for a user task. + +[AVAILABLE TOOLS] +${availableTools.join("\n")} + +[USER TASK] +${task} + +Return the ordered tool calls the assistant would likely make and a confidence score for your prediction. Do not answer the user task directly. + +Guidance: +- Use only the available tool descriptions and the user task to decide. +- Predict discovery calls only when an assistant would need them before the final action. +- If the task does not require Sentry MCP tools, return an empty predictedTools array. +- Include arguments only when they are available or strongly implied by the task. +- Extra parameters like regionUrl are acceptable only when the assistant would have learned them from an earlier discovery call. +- For natural-language search queries, preserve the user's meaning rather than inventing exact syntax. + +Score confidence as follows: +- 1.0: The tool sequence is obvious from the task and catalog. +- 0.8: The likely tools are clear, with minor uncertainty in arguments. +- 0.6: The broad tool choice is plausible, but ordering or arguments are uncertain. +- 0.3: A tool may be needed, but the task is ambiguous. +- 0.0: No reliable tool prediction can be made.`; +} + +function normalizePredictedToolCall( + toolCall: RawToolPredictionOutput["predictedTools"][number], +): PredictedToolCall { + return { + name: toolCall.name, + arguments: toJsonRecord(toolCall.arguments), + }; +} + +function normalizePredictionOutput( + output: RawToolPredictionOutput, +): ToolPredictionOutput { + return { + score: output.score, + rationale: output.rationale, + predictedTools: output.predictedTools.map(normalizePredictedToolCall), + }; +} + +function toToolCallRecord(toolCall: PredictedToolCall): ToolCallRecord { + return { + name: toolCall.name, + arguments: toolCall.arguments, + }; +} + +function normalizeExpectedToolCalls(expectedTools: ExpectedToolCall[] = []) { + return expectedTools.map((toolCall) => ({ + name: toolCall.name, + arguments: toJsonRecord(toolCall.arguments), + })); +} + +export function createToolPredictionHarness() { + return aiSdkHarness< + undefined, + string, + ToolPredictionMetadata, + ToolPredictionResult, + Record, + ToolPredictionOutput + >({ + name: "tool-prediction", + run: async ({ input, context }) => { + const availableTools = await getAvailableToolDescriptions(); + context.setArtifact("availableTools", availableTools); + + return await generateObject({ + model: defaultModel, + prompt: generatePredictionPrompt(availableTools, input), + schema: predictionSchema, + abortSignal: context.signal, + experimental_telemetry: { + isEnabled: true, + functionId: "tool_prediction_harness", + }, + }); + }, + output: ({ result }) => normalizePredictionOutput(result.object), + }); +} + +const toolCallJudge = ToolCallJudge({ + ordered: true, + params: "fuzzy", + requireAll: false, +}); + +export const ToolPredictionJudge = createJudge< + JudgeContext +>("ToolPredictionJudge", async (context) => { + const predictedToolCalls = + context.output.predictedTools.map(toToolCallRecord); + const toolCallJudgeResult = await toolCallJudge.assess({ + ...context, + toolCalls: predictedToolCalls, + expectedTools: context.metadata.expectedTools, + }); + const deterministicScore = toolCallJudgeResult.score ?? 0; + + return { + score: deterministicScore, + metadata: { + ...toolCallJudgeResult.metadata, + rationale: context.output.rationale, + modelScore: context.output.score, + predictedTools: requireJsonValue(predictedToolCalls, "predictedTools"), + expectedTools: requireJsonValue( + normalizeExpectedToolCalls(context.metadata.expectedTools), + "expectedTools", + ), + deterministicScore, + deterministicRationale: toolCallJudgeResult.metadata?.rationale, + }, + }; +}); + +export const toolPredictionHarness = createToolPredictionHarness(); diff --git a/packages/mcp-server-evals/src/evals/utils/toolPredictionScorer.ts b/packages/mcp-server-evals/src/evals/utils/toolPredictionScorer.ts deleted file mode 100644 index dcfaf1bbe..000000000 --- a/packages/mcp-server-evals/src/evals/utils/toolPredictionScorer.ts +++ /dev/null @@ -1,223 +0,0 @@ -import { openai } from "@ai-sdk/openai"; -import { generateObject, type LanguageModel } from "ai"; -import { z } from "zod"; -import { experimental_createMCPClient } from "@ai-sdk/mcp"; -import { Experimental_StdioMCPTransport } from "@ai-sdk/mcp/mcp-stdio"; - -// Cache for available tools to avoid reconnecting for each test -let cachedTools: string[] | null = null; - -/** - * Get available tools from the MCP server by connecting to it directly. - * This ensures the tool list stays in sync with what's actually registered. - */ -async function getAvailableTools(): Promise { - if (cachedTools) { - return cachedTools; - } - - // Use pnpm exec to run the binary from the workspace - const transport = new Experimental_StdioMCPTransport({ - command: "pnpm", - args: [ - "exec", - "sentry-mcp", - "--access-token=mocked-access-token", - "--all-scopes", - ], - env: { - ...process.env, - SENTRY_ACCESS_TOKEN: "mocked-access-token", - SENTRY_HOST: "sentry.io", - }, - }); - - const client = await experimental_createMCPClient({ - transport, - }); - - // Discover available tools - const toolsMap = await client.tools(); - - // Convert tools to the format expected by the scorer - cachedTools = Object.entries(toolsMap).map(([name, tool]) => { - // Extract the first line of description for a concise summary - const shortDescription = (tool as any).description?.split("\n")[0] || ""; - return `${name} - ${shortDescription}`; - }); - - // Clean up - await client.close(); - - return cachedTools; -} - -export interface ExpectedToolCall { - name: string; - arguments: Record; -} - -interface ToolPredictionScorerOptions { - input: string; - output: string; - expectedTools?: ExpectedToolCall[]; - result?: any; -} - -const defaultModel = openai("gpt-4o"); - -const predictionSchema = z.object({ - score: z.number().min(0).max(1).describe("Score from 0 to 1"), - rationale: z.string().describe("Explanation of the score"), - predictedTools: z - .array( - z.object({ - name: z.string(), - arguments: z.record(z.any()).optional().default({}), - }), - ) - .describe("What tools the AI would likely call"), -}); - -function generateSystemPrompt( - availableTools: string[], - task: string, - expectedDescription: string, -): string { - return `You are evaluating whether an AI assistant with access to Sentry MCP tools would make the correct tool calls for a given task. - -[AVAILABLE TOOLS] -${availableTools.join("\n")} - -[TASK] -${task} - -[EXPECTED TOOL CALLS] -${expectedDescription} - -Based on the task and available tools, predict what tools the AI would call to complete this task. - -IMPORTANT: Look at what information is already provided in the task: -- When only an organization name is given (e.g., "in sentry-mcp-evals"), discovery calls ARE typically needed -- When organization/project are given in "org/project" format, the AI may skip discovery if confident -- The expected tool calls show what is ACTUALLY expected for this specific case - follow them exactly -- Discovery calls (find_organizations, find_projects) are commonly used to get regionUrl and verify access -- Match the expected tool sequence exactly - if expected includes discovery, predict discovery - -Consider: -1. Match the expected tool sequence exactly - the expected tools show realistic AI behavior -2. When a value like "sentry-mcp-evals" appears alone, it's typically an organizationSlug, not a projectSlug -3. Arguments should match expected values (organizationSlug, projectSlug, name, etc.) -4. For natural language queries in search_events, exact phrasing doesn't need to match -5. Extra parameters like regionUrl are acceptable -6. The AI commonly does discovery calls even when slugs appear to be provided, to get region info - -Score as follows: -- 1.0: All expected tools would be called with correct arguments in the right order -- 0.8: All expected tools would be called, minor differences (extra params, slight variations) -- 0.6: Most expected tools would be called but missing some or wrong order -- 0.3: Some expected tools would be called but significant issues -- 0.0: Wrong tools or critical tools missing - -CRITICAL: The expected tools represent the actual realistic behavior for this specific case. Follow the expected sequence exactly: -- If expected tools include discovery calls, predict discovery calls -- If expected tools do NOT include discovery calls, do NOT predict them -- The test author has determined what's appropriate for each specific scenario`; -} - -/** - * A scorer that uses AI to predict what tools would be called without executing them. - * This is much faster than actually running the tools and checking what was called. - * - * @param model - Optional language model to use for predictions (defaults to gpt-4o) - * @returns A scorer function that compares predicted vs expected tool calls - * - * @example - * ```typescript - * import { ToolPredictionScorer } from './utils/toolPredictionScorer'; - * import { NoOpTaskRunner } from './utils/runner'; - * import { describeEval } from 'vitest-evals'; - * - * describeEval("Sentry issue search", { - * data: async () => [ - * { - * input: "Find the newest issues in my-org", - * expectedTools: [ - * { name: "find_organizations", arguments: {} }, - * { name: "find_issues", arguments: { organizationSlug: "my-org", sortBy: "first_seen" } } - * ] - * } - * ], - * task: NoOpTaskRunner(), // Don't execute tools, just predict them - * scorers: [ToolPredictionScorer()], - * threshold: 0.8 - * }); - * ``` - * - * The scorer works by: - * 1. Connecting to the MCP server to get available tools and their descriptions - * 2. Using AI to predict what tools would be called for the given task - * 3. Comparing predictions against the expectedTools array - * 4. Returning a score from 0.0 to 1.0 based on accuracy - * - * Scoring criteria: - * - 1.0: All expected tools predicted with correct arguments in right order - * - 0.8: All expected tools predicted, minor differences (extra params, slight variations) - * - 0.6: Most expected tools predicted but missing some or wrong order - * - 0.3: Some expected tools predicted but significant issues - * - 0.0: Wrong tools or critical tools missing - * - * If `expectedTools` is not provided in test data, the scorer is automatically skipped - * and returns `{ score: null }` to allow other scorers to run without interference. - */ -export function ToolPredictionScorer(model: LanguageModel = defaultModel) { - return async function ToolPredictionScorer( - opts: ToolPredictionScorerOptions, - ) { - // If expectedTools is not defined, skip this scorer - if (!opts.expectedTools) { - return { - score: null, - metadata: { - rationale: "Skipped: No expectedTools defined for this test case", - }, - }; - } - - const expectedTools = opts.expectedTools; - - // Get available tools from the MCP server - const AVAILABLE_TOOLS = await getAvailableTools(); - - // Generate a description of the expected tools for the prompt - const expectedDescription = expectedTools - .map( - (tool) => - `- ${tool.name} with arguments: ${JSON.stringify(tool.arguments)}`, - ) - .join("\n"); - - const { object } = await generateObject({ - model, - prompt: generateSystemPrompt( - AVAILABLE_TOOLS, - opts.input, - expectedDescription, - ), - schema: predictionSchema, - experimental_telemetry: { - isEnabled: true, - functionId: "tool_prediction_scorer", - }, - }); - - return { - score: object.score, - metadata: { - rationale: object.rationale, - predictedTools: object.predictedTools, - expectedTools: expectedTools, - }, - }; - }; -} diff --git a/packages/mcp-server-evals/src/evals/utils/types.ts b/packages/mcp-server-evals/src/evals/utils/types.ts new file mode 100644 index 000000000..b73005d04 --- /dev/null +++ b/packages/mcp-server-evals/src/evals/utils/types.ts @@ -0,0 +1,36 @@ +import type { JsonValue } from "vitest-evals"; + +export type JsonRecord = Record; + +export interface ExpectedToolCall { + name: string; + arguments?: Record; +} + +export type PredictedToolCall = { + name: string; + arguments?: JsonRecord; +}; + +export type ToolPredictionOutput = { + score: number; + rationale: string; + predictedTools: PredictedToolCall[]; +}; + +export type ToolPredictionMetadata = Record & { + expectedTools?: ExpectedToolCall[]; +}; + +export type ToolCallEvalMetadata = Record & { + expectedTools?: ExpectedToolCall[]; +}; + +export type StructuredEvalMetadata = ToolCallEvalMetadata & { + expected?: Record; +}; + +export type EvalCase> = { + input: string; + name?: string; +} & TMetadata; diff --git a/packages/mcp-server-evals/vitest.config.ts b/packages/mcp-server-evals/vitest.config.ts index 8d0f7cab7..cdbf92da4 100644 --- a/packages/mcp-server-evals/vitest.config.ts +++ b/packages/mcp-server-evals/vitest.config.ts @@ -3,7 +3,7 @@ import { defineConfig } from "vitest/config"; export default defineConfig({ test: { - include: ["**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}"], + include: ["src/**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}"], reporters: ["vitest-evals/reporter"], coverage: { provider: "v8", diff --git a/packages/mcp-server-evals/vitest.unit.config.ts b/packages/mcp-server-evals/vitest.unit.config.ts new file mode 100644 index 000000000..6ca4a5286 --- /dev/null +++ b/packages/mcp-server-evals/vitest.unit.config.ts @@ -0,0 +1,8 @@ +/// +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + include: ["src/**/*.test.ts"], + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1574dbc70..86590370d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -75,6 +75,9 @@ catalogs: '@vitejs/plugin-react': specifier: ^4.6.0 version: 4.6.0 + '@vitest-evals/harness-ai-sdk': + specifier: ^0.12.0 + version: 0.12.0 agents: specifier: ^0.3.10 version: 0.3.10 @@ -166,8 +169,8 @@ catalogs: specifier: ^4.1.2 version: 4.1.2 vitest-evals: - specifier: ^0.4.0 - version: 0.4.0 + specifier: ^0.12.0 + version: 0.12.0 workers-mcp: specifier: 0.1.0-3 version: 0.1.0-3 @@ -228,7 +231,7 @@ importers: version: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)) vitest-evals: specifier: 'catalog:' - version: 0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))) + version: 0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76) devDependencies: '@flue/cli': specifier: 'catalog:' @@ -531,6 +534,9 @@ importers: '@sentry/mcp-server-tsconfig': specifier: workspace:* version: link:../mcp-server-tsconfig + '@vitest-evals/harness-ai-sdk': + specifier: 'catalog:' + version: 0.12.0(ai@6.0.64(zod@3.25.76))(vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76)) ai: specifier: 'catalog:' version: 6.0.64(zod@3.25.76) @@ -548,7 +554,7 @@ importers: version: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)) vitest-evals: specifier: 'catalog:' - version: 0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))) + version: 0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76) zod: specifier: 'catalog:' version: 3.25.76 @@ -2891,6 +2897,18 @@ packages: peerDependencies: vite: ^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0-beta.0 + '@vitest-evals/core@0.12.0': + resolution: {integrity: sha512-JOatlrVw4jcP9VCBAFcM07pGxUA2iLt4Ks5jaRYqyATjkNwPYnyNDL+YHgvelANfPA0BBX8MzRfs6vEkzJgC+A==} + + '@vitest-evals/harness-ai-sdk@0.12.0': + resolution: {integrity: sha512-0yvM80vMqhCl+bc9j3tlDQfOc5H3rL3VNO2RUX8fRgDuWJ3iORW+WDENP+L4PO85GHvLgvUVGDhx+IJBfb26DA==} + peerDependencies: + ai: '>=4 <7' + vitest-evals: '*' + + '@vitest-evals/report-ui@0.12.0': + resolution: {integrity: sha512-rjWKnB+WL1ekiIvHdcnEX0tfaCwfeG3BNU6jvGKuJsHqkf8JRtuTyy/xgUKKsb56CokcZ3K3hmeo6RKik/KBrQ==} + '@vitest/expect@4.1.2': resolution: {integrity: sha512-gbu+7B0YgUJ2nkdsRJrFFW6X7NTP44WlhiclHniUhxADQJH5Szt9mZ9hWnJPJ8YwOK5zUOSSlSvyzRf0u1DSBQ==} @@ -5657,11 +5675,19 @@ packages: yaml: optional: true - vitest-evals@0.4.0: - resolution: {integrity: sha512-tvKIc8sCtK7LZnSTFLh5C7BlDzSZhefKzCR68QNShVa7gkiepg7CZH8j3T6ZBWwIa5VgfmFkZ1Iv5NKzUpSfGQ==} + vitest-evals@0.12.0: + resolution: {integrity: sha512-pyVA4N8gM+T2JB+SGFNSuXcgf/CHbBygAXkXR1fEPEfleKyMacJXPF9gLWIyyC1x5BCrt0r4zkwzkdjZrdpwZQ==} + hasBin: true peerDependencies: - tinyrainbow: '*' - vitest: '*' + ai: '>=4 <7' + tinyrainbow: '>=2 <4' + vitest: '>=4 <5' + zod: '>=3 <5' + peerDependenciesMeta: + ai: + optional: true + zod: + optional: true vitest@4.1.2: resolution: {integrity: sha512-xjR1dMTVHlFLh98JE3i/f/WePqJsah4A0FK9cc8Ehp9Udk0AZk6ccpIZhh1qJ/yxVWRZ+Q54ocnD8TXmkhspGg==} @@ -8325,6 +8351,19 @@ snapshots: transitivePeerDependencies: - supports-color + '@vitest-evals/core@0.12.0': + dependencies: + zod: 3.25.76 + + '@vitest-evals/harness-ai-sdk@0.12.0(ai@6.0.64(zod@3.25.76))(vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76))': + dependencies: + ai: 6.0.64(zod@3.25.76) + vitest-evals: 0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76) + + '@vitest-evals/report-ui@0.12.0': + dependencies: + '@vitest-evals/core': 0.12.0 + '@vitest/expect@4.1.2': dependencies: '@standard-schema/spec': 1.1.0 @@ -11572,15 +11611,25 @@ snapshots: tsx: 4.20.3 yaml: 2.8.3 - vitest-evals@0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))): + vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76): dependencies: + '@vitest-evals/core': 0.12.0 + '@vitest-evals/report-ui': 0.12.0 tinyrainbow: 3.1.0 vitest: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)) + optionalDependencies: + ai: 6.0.64(zod@3.25.76) + zod: 3.25.76 - vitest-evals@0.4.0(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3))): + vitest-evals@0.12.0(ai@6.0.64(zod@3.25.76))(tinyrainbow@3.1.0)(vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)))(zod@3.25.76): dependencies: + '@vitest-evals/core': 0.12.0 + '@vitest-evals/report-ui': 0.12.0 tinyrainbow: 3.1.0 vitest: 4.1.2(@opentelemetry/api@1.9.1)(@types/node@24.0.10)(msw@2.10.2(@types/node@24.0.10)(typescript@5.8.3))(vite@6.3.5(@types/node@24.0.10)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)) + optionalDependencies: + ai: 6.0.64(zod@3.25.76) + zod: 3.25.76 vitest@4.1.2(@opentelemetry/api@1.9.1)(@types/node@22.16.0)(msw@2.10.2(@types/node@22.16.0)(typescript@5.8.3))(vite@6.3.5(@types/node@22.16.0)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.20.3)(yaml@2.8.3)): dependencies: diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 2a85df454..f46ec8a1b 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -58,7 +58,8 @@ catalog: valibot: ^1.4.0 vite: ^6.3.5 vitest: ^4.1.2 - vitest-evals: ^0.4.0 + '@vitest-evals/harness-ai-sdk': ^0.12.0 + vitest-evals: ^0.12.0 workers-mcp: 0.1.0-3 wrangler: 4.80.0 zod: ^3.25.67 diff --git a/turbo.json b/turbo.json index a52f6860d..9d61d733b 100644 --- a/turbo.json +++ b/turbo.json @@ -47,6 +47,7 @@ "**/*.test.ts", "**/*.spec.ts", "vitest.config.ts", + "vitest.unit.config.ts", "package.json" ], "outputs": ["coverage/**", "*.junit.xml"], @@ -61,7 +62,7 @@ "eval": { "dependsOn": ["^build"], "outputs": [], - "cache": true + "cache": false }, "build": { "dependsOn": ["^build"], @@ -87,6 +88,7 @@ "NODE_ENV", "CI", "OPENAI_API_KEY", + "VITEST_EVALS_REPORT_LEVEL", "COOKIE_SECRET", "SENTRY_CLIENT_ID", "SENTRY_CLIENT_SECRET",