Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 12 additions & 133 deletions .github/workflows/eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@ on:
push:
branches: [main]
paths:
- "packages/mcp-core/src/tools*"
- "packages/mcp-core/src/tools/**"
- "packages/mcp-core/src/internal/agents/**"
- "packages/mcp-server-evals/**"
- "packages/mcp-server-mocks/**"
- ".github/workflows/eval.yml"
pull_request:
paths:
- "packages/mcp-core/src/tools*"
- "packages/mcp-core/src/tools/**"
- "packages/mcp-core/src/internal/agents/**"
- "packages/mcp-server-evals/**"
- "packages/mcp-server-mocks/**"
- ".github/workflows/eval.yml"
Expand Down Expand Up @@ -57,140 +59,17 @@ jobs:
run: pnpm build

- name: Run evals
run: pnpm eval:ci evals
if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
run: pnpm --filter @sentry/mcp-server-evals eval:ci
continue-on-error: true
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

- name: Create eval status check
uses: actions/github-script@v7
# Skip for fork PRs (no write permissions) but still run for pushes, workflow_dispatch, and same-repo PRs
- name: Report eval results
uses: getsentry/vitest-evals@v0
if: ${{ !cancelled() && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }}
continue-on-error: true # Don't fail workflow if check creation fails
with:
script: |
const fs = require('fs');
const path = require('path');

// Read eval results
const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
console.log(`Reading eval results from: ${resultsPath}`);

let vitestResults;
try {
vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
} catch (error) {
if (error.code === 'ENOENT') {
throw new Error(
`Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
);
}
throw new Error(`Failed to read/parse eval results: ${error.message}`);
}

// Extract eval results from vitest format
const evalResults = [];
for (const testFile of vitestResults.testResults || []) {
for (const test of testFile.assertionResults || []) {
if (test.meta?.eval) {
evalResults.push({
name: test.fullName || test.title,
file: testFile.name,
avgScore: test.meta.eval.avgScore ?? null,
scores: test.meta.eval.scores || [],
passed: test.status === 'passed',
duration: test.duration,
});
}
}
}

// Calculate statistics
const totalTests = evalResults.length;
// Treat null scores as 0.0 for consistent categorization
const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0);

const avgScore = scores.length > 0
? scores.reduce((sum, score) => sum + score, 0) / scores.length
: 0;

const green = scores.filter(s => s >= 0.75).length;
const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length;
const red = scores.filter(s => s < 0.5).length;

// Determine conclusion
const conclusion = avgScore >= 0.5 ? 'success' : 'failure';

// Format score helper
/**
 * Render a score as a traffic-light emoji followed by the score fixed to
 * two decimal places.
 *
 * Bands are checked against the unrounded value: >= 0.75 is green,
 * >= 0.5 is yellow, anything else (including NaN) is red.
 */
function formatScore(score) {
  const rounded = score.toFixed(2);

  let marker = '🔴';
  if (score >= 0.75) {
    marker = '🟢';
  } else if (score >= 0.5) {
    marker = '🟡';
  }

  return `${marker} ${rounded}`;
}

// Build title
const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;

// Build summary
const summary = [
`## Overall Statistics`,
``,
`- **Total Evaluations**: ${totalTests}`,
`- **Average Score**: ${formatScore(avgScore)}`,
`- **Pass Threshold**: 0.50 (catastrophic failure)`,
``,
`### Score Distribution`,
`- 🟢 Green (≥0.75): ${green} evals`,
`- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
`- 🔴 Red (<0.50): ${red} evals`,
].join('\n');

// Build detailed results
const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
const details = [
`## Individual Eval Scores`,
``,
...detailsByScore.map(result => {
const score = result.avgScore !== null ? result.avgScore : 0;
const statusIcon = result.passed ? '✅' : '❌';
const scoreDisplay = formatScore(score);

let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;

// Add rationale for failed or low-scoring tests
if (!result.passed || score < 0.75) {
const firstScore = result.scores[0];
if (firstScore?.metadata?.rationale) {
line += `\n - ${firstScore.metadata.rationale}`;
}
}

return line;
}),
``,
`---`,
``,
`### Conclusion`,
``,
conclusion === 'success'
? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
: `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
].join('\n');

// Create check run
await github.rest.checks.create({
owner: context.repo.owner,
repo: context.repo.repo,
name: 'Evaluation Results',
head_sha: context.sha,
status: 'completed',
conclusion: conclusion,
output: {
title: title,
summary: summary,
text: details,
},
});

console.log(`✅ Check run created with conclusion: ${conclusion}`);
console.log(` Average Score: ${avgScore.toFixed(2)}`);
results: packages/mcp-server-evals/eval-results.json
publish-check: true
check-name: Evaluation Results
fail-on-failures: true
31 changes: 18 additions & 13 deletions docs/adding-tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,20 +255,25 @@ See [api-patterns.md](api-patterns.md#mock-patterns) for validation examples.
**⚠️ Each eval costs time and API credits. Only test core functionality!**

```typescript
describeEval("your-tool", {
data: async () => [
{
input: `Primary use case in ${FIXTURES.organizationSlug}`,
expected: "Expected response"
},
// Maximum 2-3 scenarios!
],
task: TaskRunner(),
scorers: [Factuality()],
threshold: 0.6,
});
import { describeToolPredictionEval, FIXTURES } from "./utils";

describeToolPredictionEval("your-tool", [
{
input: `Primary use case in ${FIXTURES.organizationSlug}`,
expectedTools: [
{
name: "your_tool",
arguments: { organizationSlug: FIXTURES.organizationSlug },
},
],
},
// Maximum 2-3 scenarios!
]);
```

Use `describeMcpToolCallEval` instead when the eval needs to execute the full
MCP harness and validate actual tool calls, usage data, and traces.

## Testing Workflow

```bash
Expand All @@ -279,7 +284,7 @@ pnpm test tools.test
pnpm inspector

# 3. Run minimal evals
pnpm eval your-tool
pnpm --filter @sentry/mcp-server-evals eval your-tool
```

## Checklist
Expand Down
8 changes: 4 additions & 4 deletions docs/pr-management.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,11 @@ datasets: errors, logs, and spans.
Co-Authored-By: Codex CLI Agent <noreply@openai.com>"

# Bug fix
git commit -m "fix(evals): update search-events eval to use available exports
git commit -m "fix(evals): migrate search-events eval to shared harness

Replace missing TaskRunner and Factuality imports with NoOpTaskRunner
and ToolPredictionScorer to resolve CI build failures after factuality
checker removal.
Replace bespoke prediction scoring with describeToolPredictionEval so the
suite uses the shared vitest-evals harness, report metadata, and GitHub check
output.

Co-Authored-By: Codex CLI Agent <noreply@openai.com>"

Expand Down
41 changes: 25 additions & 16 deletions docs/testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,33 +253,42 @@ expect(result.timestamp).toMatchInlineSnapshot(); // ❌
### Eval Test Structure

```typescript
import { describeEval } from "vitest-evals";
import { TaskRunner, Factuality } from "./utils";

describeEval("tool-name", {
data: async () => [
{
input: "Natural language request",
expected: "Expected response content"
}
],
task: TaskRunner(), // Uses AI to call tools
scorers: [Factuality()], // Validates output
threshold: 0.6,
timeout: 30000
});
import { describeToolPredictionEval, FIXTURES } from "./utils";

describeToolPredictionEval("tool-name", [
{
input: `Natural language request in ${FIXTURES.organizationSlug}`,
expectedTools: [
{
name: "your_tool",
arguments: { organizationSlug: FIXTURES.organizationSlug },
},
],
},
]);
```

Use `describeToolPredictionEval` for fast tool-selection coverage. Use
`describeMcpToolCallEval` when the eval must run the full MCP harness and
capture actual tool calls, usage, and traces. Use `describeSearchAgentEval` for
embedded search agents that return structured query output.

### Running Evals

```bash
# Requires OPENAI_API_KEY in .env or .env.local
pnpm eval

# Run specific eval
pnpm eval tool-name
pnpm --filter @sentry/mcp-server-evals eval tool-name

# Serve the last JSON report locally
pnpm eval:report
```

Eval runs write `packages/mcp-server-evals/eval-results.json`; CI and the local
report UI both read that JSON artifact.

## Test Data Management

### Using Fixtures
Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
"deploy": "turbo deploy",
"eval": "dotenv -e .env -e .env.local -- turbo eval",
"eval:ci": "CI=true dotenv -e .env -e .env.local -- pnpm --stream -r run eval:ci",
"eval:report": "pnpm --filter @sentry/mcp-server-evals eval:report",
"eval:ui": "pnpm --filter @sentry/mcp-server-evals eval:ui",
"flue:issue-triage": "flue run issue-triage --target node",
"format": "biome format --write",
"lint": "biome lint",
Expand Down
14 changes: 13 additions & 1 deletion packages/mcp-core/src/internal/agents/callEmbeddedAgent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
generateText,
Output,
type Tool,
type GenerateTextResult,
APICallError,
NoObjectGeneratedError,
stepCountIs,
Expand All @@ -16,9 +17,17 @@ export type ToolCall = {
args: unknown;
};

/**
 * The SDK's `GenerateTextResult` type, instantiated for an arbitrary
 * name-to-`Tool` map and the return type of `Output.object`.
 *
 * Within the visible code it is only used to look up the field types of
 * `steps`, `usage`, and `totalUsage`.
 */
type EmbeddedAgentGenerateResult = GenerateTextResult<
Record<string, Tool>,
ReturnType<typeof Output.object>
>;

/**
 * Value handed back by the embedded agent call: the parsed result plus the
 * tool calls captured during the run and optional run metadata.
 */
interface EmbeddedAgentResult<T> {
// Parsed result data of type T.
result: T;
// Tool calls captured while the agent ran.
toolCalls: ToolCall[];
// The three fields below are copied from the generate result on the normal
// return path. They are optional.
// NOTE(review): presumably other return paths (e.g. the error-rescue branch)
// may omit them — confirm.
steps?: EmbeddedAgentGenerateResult["steps"];
// NOTE(review): presumably `usage` and `totalUsage` differ as last-step vs.
// aggregated token usage per the SDK — confirm against the SDK docs.
usage?: EmbeddedAgentGenerateResult["usage"];
totalUsage?: EmbeddedAgentGenerateResult["totalUsage"];
}

/**
Expand Down Expand Up @@ -54,7 +63,7 @@ export async function callEmbeddedAgent<
system,
prompt,
tools,
stopWhen: stepCountIs(5),
stopWhen: stepCountIs(7),
experimental_output: Output.object({ schema }),
experimental_telemetry: {
isEnabled: true,
Expand Down Expand Up @@ -101,6 +110,9 @@ export async function callEmbeddedAgent<
return {
result: parsedResult.data,
toolCalls: capturedToolCalls,
steps: result.steps,
usage: result.usage,
totalUsage: result.totalUsage,
};
} catch (error: unknown) {
// Rescue NoObjectGeneratedError: try to parse the raw LLM text through the schema
Expand Down
Loading
Loading