Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions packages/app/cypress/e2e/evaluation-drawer-share.cy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/**
* E2E tests for the eval-samples drawer share-link feature.
*
* When `E2E_FIXTURES=1`, the Next.js server itself returns fixture data from
* `cypress/fixtures/api/*.json` for every API route, so these tests just
* visit pages and assert on the rendered UI — no `cy.intercept` needed.
*
* Coverage:
* - Share button is visible inside the open drawer.
* - Opening the drawer mirrors e_drawer to the share URL.
* - Setting a filter / search also appears in the share URL.
* - Visiting with e_drawer + e_dfilter + e_dq restores drawer + filter + search.
* - Missing e_drawer key → silent no-op (drawer stays closed).
*/

/**
 * `onBeforeLoad` callback: stores the current timestamp (as a string) under
 * the `inferencex-star-modal-dismissed` localStorage key of the given window.
 * Presumably the app reads this key to keep the star modal from covering the
 * page under test — confirm against the modal component.
 */
const dismissModal = (win: Window) => {
  const dismissedAt = `${Date.now()}`;
  win.localStorage.setItem('inferencex-star-modal-dismissed', dismissedAt);
};

/**
 * Opens `/evaluation` (with an optional query string appended verbatim) and
 * switches the page to its table view.
 *
 * @param queryString Text appended directly after `/evaluation`; callers pass
 *   it including the leading `?`. Defaults to no query string.
 */
function visitEvalTable(queryString = '') {
  const chartDisplay = '[data-testid="evaluation-chart-display"]';
  const viewToggle = '[data-testid="evaluation-view-toggle"]';
  const resultsTable = '[data-testid="evaluation-results-table"]';

  const target = `/evaluation${queryString}`;
  cy.visit(target, { onBeforeLoad: dismissModal });

  // Wait for the chart display to be visible before touching the view toggle.
  cy.get(chartDisplay).should('be.visible');

  // Flip to the table view and wait for the results table to show up.
  cy.get(viewToggle).contains('Table').click();
  cy.get(resultsTable).should('be.visible');
}

/**
 * Clicks the first "Prompts" button found in the evaluation results table and
 * waits until the drawer's Share button is visible, which is used as the
 * signal that the drawer dialog has mounted.
 */
function openFirstDrawer() {
  const resultsTable = '[data-testid="evaluation-results-table"]';
  const shareButton = '[data-testid="eval-drawer-share-button"]';

  cy.get(resultsTable).find('button').contains('Prompts').first().click();

  // Block until the drawer dialog has rendered its Share button.
  cy.get(shareButton).should('be.visible');
}

// ---------------------------------------------------------------------------
// Share button presence
// ---------------------------------------------------------------------------

describe('Eval Drawer — Share button', () => {
  // Visit before every test rather than once in `before`. With Cypress test
  // isolation (enabled by default since v12) the page is reset to about:blank
  // ahead of each test, so a page loaded in `before` is only available to the
  // first `it`. Visiting per test also keeps the tests order-independent.
  beforeEach(() => {
    visitEvalTable();
  });

  it('shows at least one Prompts button in the evaluation table', () => {
    cy.get('[data-testid="evaluation-results-table"]')
      .find('button')
      .contains('Prompts')
      .should('exist');
  });

  it('opens the drawer and renders the Share button', () => {
    openFirstDrawer();
    cy.get('[data-testid="eval-drawer-share-button"]').should('be.visible');
    // Send Escape to the page body afterwards (presumably dismisses the drawer).
    cy.get('body').type('{esc}');
  });
});

// ---------------------------------------------------------------------------
// Share URL encoding
// ---------------------------------------------------------------------------

describe('Eval Drawer — Share URL encodes filter and search', () => {
  const shareButton = '[data-testid="eval-drawer-share-button"]';
  const shareUrlInput = '[data-testid="eval-drawer-share-button-url-input"]';

  /** Clicks the Share button and yields the current value of the share URL input. */
  const readShareUrl = () => {
    cy.get(shareButton).click();
    return cy.get(shareUrlInput).invoke('val');
  };

  // Every test starts from a freshly loaded table with the first drawer open.
  beforeEach(() => {
    visitEvalTable();
    openFirstDrawer();
  });

  it('share URL includes e_drawer after opening a row', () => {
    readShareUrl().should('match', /[?&]e_drawer=[^&]+/u);
  });

  it('share URL includes e_dfilter=failed after switching filter', () => {
    cy.contains('button', 'Failed').click();
    readShareUrl().should('include', 'e_dfilter=failed');
  });

  it('share URL includes e_dq after typing a search', () => {
    const searchBox = cy.get('[aria-label="Search samples on this page"]');
    searchBox.clear().type('lemon');
    readShareUrl().should('include', 'e_dq=lemon');
  });
});

// ---------------------------------------------------------------------------
// Restore from URL params
// ---------------------------------------------------------------------------

describe('Eval Drawer — Restore from URL params', () => {
  // Capture the composite drawer key dynamically from the first row so the
  // test is not coupled to a specific fixture value.
  let drawerKey: string;

  /**
   * Builds the query string that re-opens the captured drawer.
   * @param extra Additional, already-encoded params including their leading `&`.
   */
  const drawerQuery = (extra = '') => `?e_drawer=${encodeURIComponent(drawerKey)}${extra}`;

  before(() => {
    visitEvalTable();
    openFirstDrawer();

    cy.get('[data-testid="eval-drawer-share-button"]').click();
    cy.get('[data-testid="eval-drawer-share-button-url-input"]')
      .invoke('val')
      .then((url) => {
        const shareUrl = String(url);
        const match = /[?&]e_drawer=([^&]+)/u.exec(shareUrl);
        // Fail here, with the offending URL, instead of letting every test
        // below visit `?e_drawer=undefined` and time out for an unrelated-looking reason.
        if (!match) {
          throw new Error(`Share URL does not contain an e_drawer param: ${shareUrl}`);
        }
        drawerKey = decodeURIComponent(match[1]);
      });
  });

  // The `before` hook's commands have fully run before any `it` body is
  // invoked, so `drawerKey` can be read directly when building the URL.

  it('re-opens the drawer when e_drawer is in the URL', () => {
    visitEvalTable(drawerQuery());
    cy.get('[data-testid="eval-drawer-share-button"]', { timeout: 8000 }).should('be.visible');
  });

  it('restores filter=failed when e_dfilter=failed is in the URL', () => {
    visitEvalTable(drawerQuery('&e_dfilter=failed'));
    cy.get('[data-testid="eval-drawer-share-button"]', { timeout: 8000 }).should('be.visible');
    cy.contains('button', 'Failed').should('have.attr', 'aria-pressed', 'true');
  });

  it('restores search text when e_dq is in the URL', () => {
    visitEvalTable(drawerQuery('&e_dq=lemon'));
    cy.get('[data-testid="eval-drawer-share-button"]', { timeout: 8000 }).should('be.visible');
    cy.get('[aria-label="Search samples on this page"]').should('have.value', 'lemon');
  });
});

// ---------------------------------------------------------------------------
// Missing-row fallback — silent no-op
// ---------------------------------------------------------------------------

describe('Eval Drawer — Missing row is a silent no-op', () => {
  it('leaves the drawer closed when the e_drawer key has no match', () => {
    const unmatchedQuery = '?e_drawer=nonexistent~row~fp4~sglang~none~0~1~8~';
    const settleMs = 1500;

    visitEvalTable(unmatchedQuery);

    // `not.exist` passes as soon as the element is absent, so pause first;
    // presumably this gives a wrongly restored drawer time to mount.
    cy.wait(settleMs);

    cy.get('[data-testid="eval-drawer-share-button"]').should('not.exist');
  });
});
63 changes: 63 additions & 0 deletions packages/app/cypress/fixtures/api/eval-samples.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
{
"total": 5,
"passedTotal": 4,
"failedTotal": 1,
"source": "db",
"samples": [
{
"docId": 1,
"prompt": "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers market daily for $2 per fresh duck egg. How much does she make every day at the farmers market?",
"target": "18",
"response": "18",
"rawResponse": "18",
"demonstrations": null,
"passed": true,
"score": 1,
"metrics": { "em_strict": 1 }
},
{
"docId": 2,
"prompt": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?",
"target": "3",
"response": "3",
"rawResponse": "3",
"demonstrations": null,
"passed": true,
"score": 1,
"metrics": { "em_strict": 1 }
},
{
"docId": 3,
"prompt": "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?",
"target": "540",
"response": "540",
"rawResponse": "540",
"demonstrations": null,
"passed": true,
"score": 1,
"metrics": { "em_strict": 1 }
},
{
"docId": 4,
"prompt": "Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?",
"target": "260",
"response": "260",
"rawResponse": "260",
"demonstrations": null,
"passed": true,
"score": 1,
"metrics": { "em_strict": 1 }
},
{
"docId": 5,
"prompt": "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?",
"target": "11",
"response": "12",
"rawResponse": "12",
"demonstrations": null,
"passed": false,
"score": 0,
"metrics": { "em_strict": 0 }
}
]
}
73 changes: 72 additions & 1 deletion packages/app/scripts/capture-cypress-fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@ const fixturesDir = resolve(__dirname, '..', 'cypress', 'fixtures', 'api');
// doesn't assert on specific values, so any realistic snapshot suffices.
const BENCHMARK_MODEL = 'DeepSeek-R1-0528';

// eval-samples: the fixture represents a single eval_result_id. We pick the
// first non-disaggregated dsr1 gsm8k row (most likely to have samples ingested).
// In fixtures mode the eval-samples API route serves this one fixture for
// every request, whatever eval_result_id is asked for, so the specific ID
// doesn't matter — only the shape matters.
const EVAL_SAMPLES_MODEL = 'dsr1';
const EVAL_SAMPLES_TASK = 'gsm8k';
// Maximum number of sample rows to capture. Only the filter='all' page is
// fetched, so passed and failed samples share this budget. Enough to render
// the drawer and exercise filter-chip counts; not so many that the fixture
// gets large.
const EVAL_SAMPLES_LIMIT = 10;

// History must cover every (isl, osl) combo that appears in the benchmarks
// fixture, otherwise the drill-down trend modal shows "no historical data"
// when the user double-clicks a scatter point with a non-default (isl, osl).
Expand Down Expand Up @@ -140,9 +151,35 @@ async function main() {
`Latest date: ${latestDate}; keeping top ${TOP_DATES_PER_PARTITION} dates per partition`,
);

// One entry of `EvalSamplesResponse.samples`, i.e. a single sample as
// returned by `/api/v1/eval-samples`.
interface EvalSampleRow {
docId: number;
prompt: string | null;
target: string | null;
response: string | null;
rawResponse: string | null;
// Always overwritten with `null` before the fixture is written, to keep it small.
demonstrations: { question: string; answer: string }[] | null;
// Nullable; presumably `null` means the sample has no pass/fail verdict — TODO confirm.
passed: boolean | null;
score: number | null;
// Metric values keyed by metric name.
metrics: Record<string, number>;
}
// Body of a `/api/v1/eval-samples` response; this is also the shape written
// to the `eval-samples` fixture.
interface EvalSamplesResponse {
// Totals come from the API response and are copied into the fixture as-is.
total: number;
passedTotal: number;
failedTotal: number;
source: string;
// The samples of the requested page.
samples: EvalSampleRow[];
}
// One element of the `/api/v1/evaluations` response. The eval-samples capture
// below reads `model`, `task` and `disagg` to pick a source row, and passes
// its `id` as the `eval_result_id` query parameter.
interface EvalRow {
id: string;
model: string;
task: string;
disagg: boolean;
date: string;
}

const availability = await fetchJson<{ date: string; model: string }[]>('/api/v1/availability');
const reliability = await fetchJson<{ date: string; hardware: string }[]>('/api/v1/reliability');
const evaluations = await fetchJson<{ date: string; model: string }[]>('/api/v1/evaluations');
const evaluations = await fetchJson<EvalRow[]>('/api/v1/evaluations');

// Latest-snapshot: already deduped to one row per config, no date filter.
// ~20 conc levels per (hw, fw, prec, isl, osl) — sample down to keep the
Expand Down Expand Up @@ -180,6 +217,37 @@ async function main() {
);
}

// Find a non-disaggregated eval row with samples likely to be ingested.
const evalSampleSourceRow = evaluations.find(
(r) => r.model === EVAL_SAMPLES_MODEL && r.task === EVAL_SAMPLES_TASK && !r.disagg,
);
let evalSamples: EvalSamplesResponse | null = null;
if (evalSampleSourceRow) {
const params = new URLSearchParams({
eval_result_id: evalSampleSourceRow.id,
filter: 'all',
offset: '0',
limit: String(EVAL_SAMPLES_LIMIT),
});
try {
const raw = await fetchJson<EvalSamplesResponse>(`/api/v1/eval-samples?${params}`);
// Null out the demonstrations field on each sample to keep the fixture small.
evalSamples = {
...raw,
samples: raw.samples.slice(0, EVAL_SAMPLES_LIMIT).map((s) => ({
...s,
demonstrations: null,
})),
};
} catch (error) {
console.warn(`eval-samples fetch failed (${evalSampleSourceRow.id}): ${error}; skipping`);
}
} else {
console.warn(
`No non-disagg ${EVAL_SAMPLES_MODEL}/${EVAL_SAMPLES_TASK} row found; skipping eval-samples fixture`,
);
}

const submissions = await fetchJson<{ summary: unknown[]; volume: unknown[] }>(
'/api/v1/submissions',
);
Expand Down Expand Up @@ -250,6 +318,9 @@ async function main() {
}),
],
['workflow-info', await writeFixture('workflow-info', workflowInfo)],
...(evalSamples
? ([['eval-samples', await writeFixture('eval-samples', evalSamples)]] as [string, number][])
: []),
];

for (const [name, bytes] of sizes) {
Expand Down
38 changes: 37 additions & 1 deletion packages/app/src/app/api/v1/eval-samples/route.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import { type NextRequest, NextResponse } from 'next/server';

import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
import { FIXTURES_MODE, JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
import { getEvalSamples } from '@semianalysisai/inferencex-db/queries/eval-samples';

import { cachedJson, cachedQuery } from '@/lib/api-cache';
import { extractDemonstrations } from '@/lib/eval-sample-utils';
import { loadFixture } from '@/lib/test-fixtures';

export const dynamic = 'force-dynamic';

Expand Down Expand Up @@ -56,6 +57,41 @@ export async function GET(request: NextRequest) {
}
const filter = filterParam as 'all' | 'passed' | 'failed';

if (FIXTURES_MODE) {
// The fixture is captured for filter='all'. Recompute the per-filter view
// (samples + total) here so chip counts and the filter chip itself match
// what the live route would return.
const fx = loadFixture<{
samples: {
docId: number;
prompt: string | null;
target: string | null;
response: string | null;
rawResponse: string | null;
demonstrations: { question: string; answer: string }[] | null;
passed: boolean | null;
score: number | null;
metrics: Record<string, number>;
}[];
total: number;
passedTotal: number;
failedTotal: number;
source: 'db' | 'github_artifact';
}>('eval-samples');
const filtered =
filter === 'all'
? fx.samples
: fx.samples.filter((s) => (filter === 'passed' ? s.passed === true : s.passed === false));
const sliced = filtered.slice(offset, offset + limit);
return cachedJson({
samples: sliced,
total: filter === 'all' ? fx.total : filter === 'passed' ? fx.passedTotal : fx.failedTotal,
passedTotal: fx.passedTotal,
failedTotal: fx.failedTotal,
source: fx.source,
});
}

try {
const result = await getCachedEvalSamples(evalResultId, filter, offset, limit);

Expand Down
Loading