diff --git a/packages/app/cypress/e2e/evaluation-drawer-share.cy.ts b/packages/app/cypress/e2e/evaluation-drawer-share.cy.ts new file mode 100644 index 00000000..00fabc88 --- /dev/null +++ b/packages/app/cypress/e2e/evaluation-drawer-share.cy.ts @@ -0,0 +1,150 @@ +/** + * E2E tests for the eval-samples drawer share-link feature. + * + * When `E2E_FIXTURES=1`, the Next.js server itself returns fixture data from + * `cypress/fixtures/api/*.json` for every API route, so these tests just + * visit pages and assert on the rendered UI — no `cy.intercept` needed. + * + * Coverage: + * - Share button is visible inside the open drawer. + * - Opening the drawer mirrors e_drawer to the share URL. + * - Setting a filter / search also appears in the share URL. + * - Visiting with e_drawer + e_dfilter + e_dq restores drawer + filter + search. + * - Missing e_drawer key → silent no-op (drawer stays closed). + */ + +const dismissModal = (win: Window) => { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); +}; + +function visitEvalTable(queryString = '') { + cy.visit(`/evaluation${queryString}`, { onBeforeLoad: dismissModal }); + cy.get('[data-testid="evaluation-chart-display"]').should('be.visible'); + cy.get('[data-testid="evaluation-view-toggle"]').contains('Table').click(); + cy.get('[data-testid="evaluation-results-table"]').should('be.visible'); +} + +function openFirstDrawer() { + cy.get('[data-testid="evaluation-results-table"]') + .find('button') + .contains('Prompts') + .first() + .click(); + // Wait for drawer dialog to mount + cy.get('[data-testid="eval-drawer-share-button"]').should('be.visible'); +} + +// --------------------------------------------------------------------------- +// Share button presence +// --------------------------------------------------------------------------- + +describe('Eval Drawer — Share button', () => { + before(() => { + visitEvalTable(); + }); + + it('shows at least one Prompts button in the evaluation 
table', () => { + cy.get('[data-testid="evaluation-results-table"]') + .find('button') + .contains('Prompts') + .should('exist'); + }); + + it('opens the drawer and renders the Share button', () => { + openFirstDrawer(); + cy.get('[data-testid="eval-drawer-share-button"]').should('be.visible'); + cy.get('body').type('{esc}'); + }); +}); + +// --------------------------------------------------------------------------- +// Share URL encoding +// --------------------------------------------------------------------------- + +describe('Eval Drawer — Share URL encodes filter and search', () => { + beforeEach(() => { + visitEvalTable(); + openFirstDrawer(); + }); + + it('share URL includes e_drawer after opening a row', () => { + cy.get('[data-testid="eval-drawer-share-button"]').click(); + cy.get('[data-testid="eval-drawer-share-button-url-input"]') + .invoke('val') + .should('match', /[?&]e_drawer=[^&]+/u); + }); + + it('share URL includes e_dfilter=failed after switching filter', () => { + cy.contains('button', 'Failed').click(); + cy.get('[data-testid="eval-drawer-share-button"]').click(); + cy.get('[data-testid="eval-drawer-share-button-url-input"]') + .invoke('val') + .should('include', 'e_dfilter=failed'); + }); + + it('share URL includes e_dq after typing a search', () => { + cy.get('[aria-label="Search samples on this page"]').clear().type('lemon'); + cy.get('[data-testid="eval-drawer-share-button"]').click(); + cy.get('[data-testid="eval-drawer-share-button-url-input"]') + .invoke('val') + .should('include', 'e_dq=lemon'); + }); +}); + +// --------------------------------------------------------------------------- +// Restore from URL params +// --------------------------------------------------------------------------- + +describe('Eval Drawer — Restore from URL params', () => { + // Capture the composite drawer key dynamically from the first row so the + // test is not coupled to a specific fixture value. 
+ let drawerKey: string; + + before(() => { + visitEvalTable(); + openFirstDrawer(); + + cy.get('[data-testid="eval-drawer-share-button"]').click(); + cy.get('[data-testid="eval-drawer-share-button-url-input"]') + .invoke('val') + .then((url) => { + const match = /[?&]e_drawer=([^&]+)/u.exec(String(url)); + if (match) drawerKey = decodeURIComponent(match[1]); + }); + }); + + it('re-opens the drawer when e_drawer is in the URL', () => { + cy.then(() => { + visitEvalTable(`?e_drawer=${encodeURIComponent(drawerKey)}`); + cy.get('[data-testid="eval-drawer-share-button"]', { timeout: 8000 }).should('be.visible'); + }); + }); + + it('restores filter=failed when e_dfilter=failed is in the URL', () => { + cy.then(() => { + visitEvalTable(`?e_drawer=${encodeURIComponent(drawerKey)}&e_dfilter=failed`); + cy.get('[data-testid="eval-drawer-share-button"]', { timeout: 8000 }).should('be.visible'); + cy.contains('button', 'Failed').should('have.attr', 'aria-pressed', 'true'); + }); + }); + + it('restores search text when e_dq is in the URL', () => { + cy.then(() => { + visitEvalTable(`?e_drawer=${encodeURIComponent(drawerKey)}&e_dq=lemon`); + cy.get('[data-testid="eval-drawer-share-button"]', { timeout: 8000 }).should('be.visible'); + cy.get('[aria-label="Search samples on this page"]').should('have.value', 'lemon'); + }); + }); +}); + +// --------------------------------------------------------------------------- +// Missing-row fallback — silent no-op +// --------------------------------------------------------------------------- + +describe('Eval Drawer — Missing row is a silent no-op', () => { + it('leaves the drawer closed when the e_drawer key has no match', () => { + visitEvalTable('?e_drawer=nonexistent~row~fp4~sglang~none~0~1~8~'); + cy.wait(1500); + cy.get('[data-testid="eval-drawer-share-button"]').should('not.exist'); + }); +}); diff --git a/packages/app/cypress/fixtures/api/eval-samples.json b/packages/app/cypress/fixtures/api/eval-samples.json new file mode 100644 
index 00000000..f78d2c71 --- /dev/null +++ b/packages/app/cypress/fixtures/api/eval-samples.json @@ -0,0 +1,63 @@ +{ + "total": 5, + "passedTotal": 4, + "failedTotal": 1, + "source": "db", + "samples": [ + { + "docId": 1, + "prompt": "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers market daily for $2 per fresh duck egg. How much does she make every day at the farmers market?", + "target": "18", + "response": "18", + "rawResponse": "18", + "demonstrations": null, + "passed": true, + "score": 1, + "metrics": { "em_strict": 1 } + }, + { + "docId": 2, + "prompt": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?", + "target": "3", + "response": "3", + "rawResponse": "3", + "demonstrations": null, + "passed": true, + "score": 1, + "metrics": { "em_strict": 1 } + }, + { + "docId": 3, + "prompt": "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?", + "target": "540", + "response": "540", + "rawResponse": "540", + "demonstrations": null, + "passed": true, + "score": 1, + "metrics": { "em_strict": 1 } + }, + { + "docId": 4, + "prompt": "Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?", + "target": "260", + "response": "260", + "rawResponse": "260", + "demonstrations": null, + "passed": true, + "score": 1, + "metrics": { "em_strict": 1 } + }, + { + "docId": 5, + "prompt": "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. 
How many years will it take before he starts earning money on the lemon tree?", + "target": "11", + "response": "12", + "rawResponse": "12", + "demonstrations": null, + "passed": false, + "score": 0, + "metrics": { "em_strict": 0 } + } + ] +} diff --git a/packages/app/scripts/capture-cypress-fixtures.ts b/packages/app/scripts/capture-cypress-fixtures.ts index 5f149289..02b7c57d 100644 --- a/packages/app/scripts/capture-cypress-fixtures.ts +++ b/packages/app/scripts/capture-cypress-fixtures.ts @@ -26,6 +26,17 @@ const fixturesDir = resolve(__dirname, '..', 'cypress', 'fixtures', 'api'); // doesn't assert on specific values, so any realistic snapshot suffices. const BENCHMARK_MODEL = 'DeepSeek-R1-0528'; +// eval-samples: the fixture represents a single eval_result_id. We pick the +// first non-disaggregated dsr1 gsm8k row (most likely to have samples ingested). +// In fixtures mode the eval-samples API route serves this one fixture for ALL requests, so +// the specific ID doesn't matter — only the shape matters. +const EVAL_SAMPLES_MODEL = 'dsr1'; +const EVAL_SAMPLES_TASK = 'gsm8k'; +// How many sample rows to capture from the single filter=all request (the fixture +// route derives the passed / failed views from it). Enough to render the drawer and +// exercise the filter chips; not so many that the fixture gets large. +const EVAL_SAMPLES_LIMIT = 10; + // History must cover every (isl, osl) combo that appears in the benchmarks // fixture, otherwise the drill-down trend modal shows "no historical data" // when the user double-clicks a scatter point with a non-default (isl, osl). 
@@ -140,9 +151,35 @@ async function main() { `Latest date: ${latestDate}; keeping top ${TOP_DATES_PER_PARTITION} dates per partition`, ); + interface EvalSampleRow { + docId: number; + prompt: string | null; + target: string | null; + response: string | null; + rawResponse: string | null; + demonstrations: { question: string; answer: string }[] | null; + passed: boolean | null; + score: number | null; + metrics: Record; + } + interface EvalSamplesResponse { + total: number; + passedTotal: number; + failedTotal: number; + source: string; + samples: EvalSampleRow[]; + } + interface EvalRow { + id: string; + model: string; + task: string; + disagg: boolean; + date: string; + } + const availability = await fetchJson<{ date: string; model: string }[]>('/api/v1/availability'); const reliability = await fetchJson<{ date: string; hardware: string }[]>('/api/v1/reliability'); - const evaluations = await fetchJson<{ date: string; model: string }[]>('/api/v1/evaluations'); + const evaluations = await fetchJson('/api/v1/evaluations'); // Latest-snapshot: already deduped to one row per config, no date filter. // ~20 conc levels per (hw, fw, prec, isl, osl) — sample down to keep the @@ -180,6 +217,37 @@ async function main() { ); } + // Find a non-disaggregated eval row with samples likely to be ingested. + const evalSampleSourceRow = evaluations.find( + (r) => r.model === EVAL_SAMPLES_MODEL && r.task === EVAL_SAMPLES_TASK && !r.disagg, + ); + let evalSamples: EvalSamplesResponse | null = null; + if (evalSampleSourceRow) { + const params = new URLSearchParams({ + eval_result_id: evalSampleSourceRow.id, + filter: 'all', + offset: '0', + limit: String(EVAL_SAMPLES_LIMIT), + }); + try { + const raw = await fetchJson(`/api/v1/eval-samples?${params}`); + // Strip the demonstrations field from each sample to keep the fixture small. 
+ evalSamples = { + ...raw, + samples: raw.samples.slice(0, EVAL_SAMPLES_LIMIT).map((s) => ({ + ...s, + demonstrations: null, + })), + }; + } catch (error) { + console.warn(`eval-samples fetch failed (${evalSampleSourceRow.id}): ${error}; skipping`); + } + } else { + console.warn( + `No non-disagg ${EVAL_SAMPLES_MODEL}/${EVAL_SAMPLES_TASK} row found; skipping eval-samples fixture`, + ); + } + const submissions = await fetchJson<{ summary: unknown[]; volume: unknown[] }>( '/api/v1/submissions', ); @@ -250,6 +318,9 @@ async function main() { }), ], ['workflow-info', await writeFixture('workflow-info', workflowInfo)], + ...(evalSamples + ? ([['eval-samples', await writeFixture('eval-samples', evalSamples)]] as [string, number][]) + : []), ]; for (const [name, bytes] of sizes) { diff --git a/packages/app/src/app/api/v1/eval-samples/route.ts b/packages/app/src/app/api/v1/eval-samples/route.ts index 0dde813d..30f8682a 100644 --- a/packages/app/src/app/api/v1/eval-samples/route.ts +++ b/packages/app/src/app/api/v1/eval-samples/route.ts @@ -1,10 +1,11 @@ import { type NextRequest, NextResponse } from 'next/server'; -import { JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; +import { FIXTURES_MODE, JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; import { getEvalSamples } from '@semianalysisai/inferencex-db/queries/eval-samples'; import { cachedJson, cachedQuery } from '@/lib/api-cache'; import { extractDemonstrations } from '@/lib/eval-sample-utils'; +import { loadFixture } from '@/lib/test-fixtures'; export const dynamic = 'force-dynamic'; @@ -56,6 +57,41 @@ export async function GET(request: NextRequest) { } const filter = filterParam as 'all' | 'passed' | 'failed'; + if (FIXTURES_MODE) { + // The fixture is captured for filter='all'. Recompute the per-filter view + // (samples + total) here so chip counts and the filter chip itself match + // what the live route would return. 
+ const fx = loadFixture<{ + samples: { + docId: number; + prompt: string | null; + target: string | null; + response: string | null; + rawResponse: string | null; + demonstrations: { question: string; answer: string }[] | null; + passed: boolean | null; + score: number | null; + metrics: Record; + }[]; + total: number; + passedTotal: number; + failedTotal: number; + source: 'db' | 'github_artifact'; + }>('eval-samples'); + const filtered = + filter === 'all' + ? fx.samples + : fx.samples.filter((s) => (filter === 'passed' ? s.passed === true : s.passed === false)); + const sliced = filtered.slice(offset, offset + limit); + return cachedJson({ + samples: sliced, + total: filter === 'all' ? fx.total : filter === 'passed' ? fx.passedTotal : fx.failedTotal, + passedTotal: fx.passedTotal, + failedTotal: fx.failedTotal, + source: fx.source, + }); + } + try { const result = await getCachedEvalSamples(evalResultId, filter, offset, limit); diff --git a/packages/app/src/components/evaluation/EvaluationContext.tsx b/packages/app/src/components/evaluation/EvaluationContext.tsx index 897dd156..74a30331 100644 --- a/packages/app/src/components/evaluation/EvaluationContext.tsx +++ b/packages/app/src/components/evaluation/EvaluationContext.tsx @@ -6,11 +6,15 @@ import { useCallback, useContext, useEffect, + useLayoutEffect, useMemo, useRef, useState, } from 'react'; +// useLayoutEffect warns during SSR; alias to useEffect on the server (no-op there anyway). +const useIsomorphicLayoutEffect = typeof window === 'undefined' ? useEffect : useLayoutEffect; + import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; @@ -59,9 +63,9 @@ export function EvaluationProvider({ children }: { children: ReactNode }) { const rawData: EvalRow[] = rawRows ?? []; const unofficialRawData: EvalRow[] = unofficialEvalRows ?? 
[]; - const [selectedRunDate, setSelectedRunDate] = useState( - () => getUrlParam('e_rundate') || globalRunDate || '', - ); + // Initialize with safe defaults that match SSR output — URL-param values + // are applied in useIsomorphicLayoutEffect below to avoid hydration mismatches. + const [selectedRunDate, setSelectedRunDate] = useState(''); const handleSetSelectedRunDate = useCallback( (date: string) => { @@ -73,15 +77,13 @@ export function EvaluationProvider({ children }: { children: ReactNode }) { [inferenceAvailableDates, setGlobalRunDate], ); - const [selectedBenchmark, setSelectedBenchmark] = useState( - () => getUrlParam('e_bench') || undefined, - ); + const [selectedBenchmark, setSelectedBenchmark] = useState(undefined); const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({ urlPrefix: 'e_', }); - const [showLabels, setShowLabels] = useState(() => getUrlParam('e_labels') === '1'); + const [showLabels, setShowLabels] = useState(false); const { activeSet: enabledHardware, @@ -93,12 +95,25 @@ export function EvaluationProvider({ children }: { children: ReactNode }) { // Pending legend-active selection restored from `e_active` URL param. // Consumed once when hwTypesWithData first populates. - const [pendingActiveHardware, setPendingActiveHardware] = useState | null>(() => { - const v = getUrlParam('e_active'); - if (!v) return null; - const set = new Set(v.split(',').filter(Boolean)); - return set.size > 0 ? set : null; - }); + const [pendingActiveHardware, setPendingActiveHardware] = useState | null>(null); + + // Apply URL-param overrides client-side only (avoids SSR/hydration mismatch). + // Runs synchronously before paint via useIsomorphicLayoutEffect. 
+ const urlInitRef = useRef(false); + useIsomorphicLayoutEffect(() => { + if (urlInitRef.current) return; + urlInitRef.current = true; + const urlRunDate = getUrlParam('e_rundate'); + if (urlRunDate) setSelectedRunDate(urlRunDate); + const urlBench = getUrlParam('e_bench'); + if (urlBench) setSelectedBenchmark(urlBench); + if (getUrlParam('e_labels') === '1') setShowLabels(true); + const urlActive = getUrlParam('e_active'); + if (urlActive) { + const set = new Set(urlActive.split(',').filter(Boolean)); + if (set.size > 0) setPendingActiveHardware(set); + } + }, []); const availableBenchmarks = useMemo(() => { const tasks = new Set([ diff --git a/packages/app/src/components/evaluation/eval-drawer-key.test.ts b/packages/app/src/components/evaluation/eval-drawer-key.test.ts new file mode 100644 index 00000000..57311262 --- /dev/null +++ b/packages/app/src/components/evaluation/eval-drawer-key.test.ts @@ -0,0 +1,121 @@ +import { describe, expect, it } from 'vitest'; + +import type { EvaluationChartData } from '@/components/evaluation/types'; +import { DRAWER_KEY_DELIMITER, findRowByDrawerKey, rowToDrawerKey } from './eval-drawer-key'; + +const BASE_ROW: EvaluationChartData = { + evalResultId: 42, + configId: 1, + hwKey: 'mi355x_vllm' as any, + hardware: 'mi355x', + configLabel: 'MI355X (ATOM!) 
C32 T4 E1', + score: 0.96, + scoreError: 0.01, + minScore: 0.95, + maxScore: 0.97, + errorMin: 0.95, + errorMax: 0.97, + model: 'DeepSeek-R1-0528', + benchmark: 'gsm8k', + specDecode: 'none', + date: '2026-03-28', + datetime: '2026-03-28T00:00:00Z', + precision: 'fp4', + framework: 'vllm', + tp: 4, + ep: 0, + dp_attention: false, + conc: 32, + disagg: false, + isMultinode: false, + prefillTp: 4, + prefillEp: 0, + prefillDpAttention: false, + prefillNumWorkers: 0, + decodeNumWorkers: 0, + numPrefillGpu: 0, + numDecodeGpu: 0, +}; + +const UNOFFICIAL_ROW: EvaluationChartData = { + ...BASE_ROW, + evalResultId: -1, + runUrl: 'https://github.com/owner/repo/actions/runs/12345678', +}; + +describe('rowToDrawerKey', () => { + it('builds the expected composite key for an official row', () => { + const key = rowToDrawerKey(BASE_ROW); + expect(key).toBe('gsm8k~mi355x~fp4~vllm~none~0~32~4~'); + }); + + it('includes the runId for an unofficial row', () => { + const key = rowToDrawerKey(UNOFFICIAL_ROW); + expect(key).toBe('gsm8k~mi355x~fp4~vllm~none~0~32~4~12345678'); + }); + + it('encodes disagg=true as "1"', () => { + const key = rowToDrawerKey({ ...BASE_ROW, disagg: true }); + expect(key).toContain(`${DRAWER_KEY_DELIMITER}1${DRAWER_KEY_DELIMITER}`); + }); + + it('produces different keys for rows that differ only in tp', () => { + const key1 = rowToDrawerKey(BASE_ROW); + const key2 = rowToDrawerKey({ ...BASE_ROW, tp: 8 }); + expect(key1).not.toBe(key2); + }); + + it('produces different keys for rows that differ only in conc', () => { + const key1 = rowToDrawerKey(BASE_ROW); + const key2 = rowToDrawerKey({ ...BASE_ROW, conc: 256 }); + expect(key1).not.toBe(key2); + }); + + it('none of the built-in field values contain the delimiter', () => { + const fields = [ + BASE_ROW.benchmark, + BASE_ROW.hardware, + BASE_ROW.precision, + BASE_ROW.framework, + BASE_ROW.specDecode, + String(BASE_ROW.conc), + String(BASE_ROW.tp), + ]; + for (const f of fields) { + 
expect(f).not.toContain(DRAWER_KEY_DELIMITER); + } + }); +}); + +describe('findRowByDrawerKey', () => { + const rows: EvaluationChartData[] = [ + BASE_ROW, + { ...BASE_ROW, evalResultId: 99, tp: 8, conc: 256 }, + UNOFFICIAL_ROW, + ]; + + it('finds an official row by its composite key', () => { + const key = rowToDrawerKey(BASE_ROW); + expect(findRowByDrawerKey(rows, key)).toBe(BASE_ROW); + }); + + it('finds an unofficial row by its composite key', () => { + const key = rowToDrawerKey(UNOFFICIAL_ROW); + expect(findRowByDrawerKey(rows, key)).toBe(UNOFFICIAL_ROW); + }); + + it('returns null on a miss', () => { + expect(findRowByDrawerKey(rows, 'nonexistent~key')).toBeNull(); + }); + + it('returns null on an empty list', () => { + expect(findRowByDrawerKey([], rowToDrawerKey(BASE_ROW))).toBeNull(); + }); + + it('round-trips: key from row → find back same row', () => { + for (const row of rows) { + const key = rowToDrawerKey(row); + expect(findRowByDrawerKey(rows, key)).toBe(row); + } + }); +}); diff --git a/packages/app/src/components/evaluation/eval-drawer-key.ts b/packages/app/src/components/evaluation/eval-drawer-key.ts new file mode 100644 index 00000000..a0ec2c45 --- /dev/null +++ b/packages/app/src/components/evaluation/eval-drawer-key.ts @@ -0,0 +1,63 @@ +/** + * Composite key helpers for the eval samples drawer share link. + * + * The key encodes the unique aggregation dimensions so links survive + * re-ingests (evalResultId churns on every run, but these fields are stable). + * + * Format: {benchmark}~{hardware}~{precision}~{framework}~{spec}~{disagg}~{conc}~{tp}~{runId} + * + * Fields: + * benchmark — e.g. "gsm8k" + * hardware — bare hardware key, e.g. "mi355x" + * precision — e.g. "fp4" + * framework — e.g. "vllm" + * spec — spec-decode method, e.g. "none" + * disagg — "1" | "0" + * conc — concurrency, e.g. "32" + * tp — tensor-parallelism, e.g. 
"8" + * runId — GitHub Actions run ID for unofficial rows; "" for official rows + */ + +import type { EvaluationChartData } from '@/components/evaluation/types'; + +export const DRAWER_KEY_DELIMITER = '~'; + +/** + * Builds the composite drawer key from a table row. + * Works for both official rows (runId = "") and unofficial rows (runId from runUrl). + */ +export function rowToDrawerKey(row: EvaluationChartData): string { + const runId = extractRunIdFromUrl(row.runUrl); + const parts = [ + row.benchmark, + row.hardware, + row.precision, + row.framework, + row.specDecode, + row.disagg ? '1' : '0', + String(row.conc), + String(row.tp), + runId ?? '', + ]; + return parts.join(DRAWER_KEY_DELIMITER); +} + +/** + * Finds the first row in `rows` whose composite key matches `key`. + * Returns null if no match is found. + */ +export function findRowByDrawerKey( + rows: EvaluationChartData[], + key: string, +): EvaluationChartData | null { + for (const row of rows) { + if (rowToDrawerKey(row) === key) return row; + } + return null; +} + +function extractRunIdFromUrl(url: string | undefined): string | null { + if (!url) return null; + const m = url.match(/\/actions\/runs\/(\d+)/u); + return m ? 
m[1] : null; +} diff --git a/packages/app/src/components/evaluation/ui/EvalSamplesDrawer.tsx b/packages/app/src/components/evaluation/ui/EvalSamplesDrawer.tsx index 8bef7b03..3fb39451 100644 --- a/packages/app/src/components/evaluation/ui/EvalSamplesDrawer.tsx +++ b/packages/app/src/components/evaluation/ui/EvalSamplesDrawer.tsx @@ -1,13 +1,16 @@ 'use client'; import { ChevronLeft, ChevronRight, Search } from 'lucide-react'; -import { useEffect, useMemo, useState } from 'react'; +import { useEffect, useMemo, useRef, useState } from 'react'; import type { EvaluationChartData } from '@/components/evaluation/types'; import { Dialog, DialogContent, DialogTitle } from '@/components/ui/dialog'; +import { ShareButton } from '@/components/ui/share-button'; import { useEvalSamples } from '@/hooks/api/use-eval-samples'; +import { useUrlState } from '@/hooks/useUrlState'; import { track } from '@/lib/analytics'; import type { EvalSamplesFilter, EvalSamplesLiveContext } from '@/lib/api'; +import { writeUrlParams } from '@/lib/url-state'; const PAGE_SIZE = 50; @@ -28,19 +31,46 @@ interface EvalSamplesDrawerProps { */ export default function EvalSamplesDrawer({ row, onClose }: EvalSamplesDrawerProps) { const open = row !== null; + const { getUrlParam } = useUrlState(); const [filter, setFilter] = useState('all'); const [page, setPage] = useState(0); const [search, setSearch] = useState(''); const [expanded, setExpanded] = useState>(new Set()); + // Track whether we've already seeded from URL params (only happens once per page lifetime). + const urlParamsConsumedRef = useRef(false); + // Reset transient state whenever a new row is opened. + // On the very first open this session, seed filter/search from URL params instead. useEffect(() => { if (!open) return; - setFilter('all'); + // These always reset regardless of whether we're seeding from URL params. 
setPage(0); - setSearch(''); setExpanded(new Set()); - }, [row?.evalResultId, open]); + if (urlParamsConsumedRef.current) { + setFilter('all'); + setSearch(''); + } else { + urlParamsConsumedRef.current = true; + const rawFilter = getUrlParam('e_dfilter'); + const rawSearch = getUrlParam('e_dq'); + const validFilter: EvalSamplesFilter = + rawFilter === 'passed' || rawFilter === 'failed' ? rawFilter : 'all'; + setFilter(validFilter); + setSearch(rawSearch ?? ''); + } + }, [row?.evalResultId, open]); // eslint-disable-line react-hooks/exhaustive-deps -- getUrlParam is stable + + // Mirror filter/search to the in-memory URL store so buildShareUrl() picks them up. + useEffect(() => { + if (!open) return; + writeUrlParams({ e_dfilter: filter === 'all' ? '' : filter }); + }, [filter, open]); + + useEffect(() => { + if (!open) return; + writeUrlParams({ e_dq: search }); + }, [search, open]); // Build a live-fetch context for unofficial runs from the row's identifying // fields. The hook ignores this when `evalResultId > 0` (DB-backed path). @@ -140,8 +170,9 @@ export default function EvalSamplesDrawer({ row, onClose }: EvalSamplesDrawerPro aria-describedby={undefined} > {/* Header — `DialogContent` renders its own absolute-positioned close - button in the top-right, so we leave room with `pr-10`. */} -
+ button at right-4. We render a Share button at right-11 and leave + pr-20 so neither overlaps the title text. */} +
{row ? ( @@ -160,6 +191,10 @@ export default function EvalSamplesDrawer({ row, onClose }: EvalSamplesDrawerPro
)}
+ {/* Share button — positioned to the left of Radix's close X (right-4) */} +
+ +
{/* Filter chips + search */} diff --git a/packages/app/src/components/evaluation/ui/EvaluationTable.tsx b/packages/app/src/components/evaluation/ui/EvaluationTable.tsx index 514ec384..f2d3f4d1 100644 --- a/packages/app/src/components/evaluation/ui/EvaluationTable.tsx +++ b/packages/app/src/components/evaluation/ui/EvaluationTable.tsx @@ -1,14 +1,17 @@ 'use client'; import { MessageSquareText } from 'lucide-react'; -import { useMemo, useState } from 'react'; +import { useEffect, useMemo, useRef, useState } from 'react'; import EvalSamplesDrawer from '@/components/evaluation/ui/EvalSamplesDrawer'; +import { findRowByDrawerKey, rowToDrawerKey } from '@/components/evaluation/eval-drawer-key'; import type { EvaluationChartData } from '@/components/evaluation/types'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { type DataTableColumn, DataTable } from '@/components/ui/data-table'; import { track } from '@/lib/analytics'; import { overlayRunColor, overlayRunIndex } from '@/lib/overlay-run-style'; +import { useUrlState } from '@/hooks/useUrlState'; +import { writeUrlParams } from '@/lib/url-state'; interface EvaluationTableProps { data: EvaluationChartData[]; @@ -16,12 +19,32 @@ interface EvaluationTableProps { export default function EvaluationTable({ data }: EvaluationTableProps) { const { runIndexByUrl } = useUnofficialRun(); + const { getUrlParam } = useUrlState(); const sorted = useMemo(() => [...data].toSorted((a, b) => b.score - a.score), [data]); const hasDisaggConfigs = useMemo(() => data.some((d) => d.disagg), [data]); const [drawerRow, setDrawerRow] = useState(null); + // Auto-open the drawer when the page loads with an e_drawer URL param. + // We guard with a ref so we only attempt once — after the first successful + // match (or after data has loaded with no match), we stop trying. 
+ const drawerKeyParam = getUrlParam('e_drawer'); + const autoOpenConsumedRef = useRef(false); + useEffect(() => { + if (autoOpenConsumedRef.current) return; + if (!drawerKeyParam) { + autoOpenConsumedRef.current = true; + return; + } + if (sorted.length === 0) return; // wait for data to populate + autoOpenConsumedRef.current = true; + const match = findRowByDrawerKey(sorted, drawerKeyParam); + if (match) setDrawerRow(match); + // No match → silent no-op (row may have been removed from this run-date). + }, [sorted, drawerKeyParam]); + const openDrawer = (row: EvaluationChartData) => { setDrawerRow(row); + writeUrlParams({ e_drawer: rowToDrawerKey(row) }); // Notify the first-visit nudge to dismiss itself once the user has // discovered the affordance on their own. if (typeof window !== 'undefined') { @@ -34,6 +57,11 @@ export default function EvaluationTable({ data }: EvaluationTableProps) { }); }; + const closeDrawer = () => { + setDrawerRow(null); + writeUrlParams({ e_drawer: '', e_dfilter: '', e_dq: '' }); + }; + const columns = useMemo[]>( () => [ { @@ -167,7 +195,7 @@ export default function EvaluationTable({ data }: EvaluationTableProps) { testId="evaluation-results-table" analyticsPrefix="evaluation_table" /> - setDrawerRow(null)} /> + ); } diff --git a/packages/app/src/components/ui/share-button.test.tsx b/packages/app/src/components/ui/share-button.test.tsx index f581818b..1aeca89d 100644 --- a/packages/app/src/components/ui/share-button.test.tsx +++ b/packages/app/src/components/ui/share-button.test.tsx @@ -39,7 +39,7 @@ describe('ShareButton', () => { expect(trigger).not.toBeNull(); expect(trigger?.textContent).toContain('Share'); // Popover content lives in a portal and is not in the DOM until opened. 
- expect(document.querySelector('[data-testid="share-popover"]')).toBeNull(); + expect(document.querySelector('[data-testid="share-button-popover"]')).toBeNull(); }); it('opens the popover with the share URL pre-filled when the trigger is clicked', () => { @@ -50,12 +50,14 @@ describe('ShareButton', () => { act(() => trigger?.click()); - const input = document.querySelector('[data-testid="share-url-input"]'); + const input = document.querySelector( + '[data-testid="share-button-url-input"]', + ); expect(input).not.toBeNull(); expect(input?.value).toBe('https://inferencex.semianalysis.com/?g_model=dsr1#inference'); // Copy + social buttons live inside the popover content. - expect(document.querySelector('[data-testid="share-copy-button"]')).not.toBeNull(); + expect(document.querySelector('[data-testid="share-button-copy-button"]')).not.toBeNull(); expect(document.querySelector('[data-testid="share-twitter"]')).not.toBeNull(); expect(document.querySelector('[data-testid="share-linkedin"]')).not.toBeNull(); }); diff --git a/packages/app/src/components/ui/share-button.tsx b/packages/app/src/components/ui/share-button.tsx index 867c383c..247cdb3c 100644 --- a/packages/app/src/components/ui/share-button.tsx +++ b/packages/app/src/components/ui/share-button.tsx @@ -10,7 +10,11 @@ import { buildShareUrl } from '@/lib/url-state'; import { Button } from './button'; import { Popover, PopoverContent, PopoverTrigger } from './popover'; -export function ShareButton() { +interface ShareButtonProps { + testId?: string; +} + +export function ShareButton({ testId = 'share-button' }: ShareButtonProps = {}) { const [open, setOpen] = useState(false); const [copied, setCopied] = useState(false); const [url, setUrl] = useState(''); @@ -53,7 +57,7 @@ export function ShareButton() {