From 179501f867e22076a21834bb47f34734f5c7aedc Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 21:32:51 -0400
Subject: [PATCH 01/31] feat(phase-3): wire classification harness to live
 on-device model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements the harness validation rig from Handoff Phase 3, ported from
the standalone HTML file into the React app so it reuses the already-
loaded worker (no second model download needed).

- worker: new 'classify' task type. One-shot completion, low max_tokens
  (80), temperature 0.1, KV-cache reset between calls. Returns
  { type: 'classify_done', taskId, raw, latencyMs } or 'classify_error'.
- useClassifier hook: thin wrapper that posts 'classify' to the shared
  worker, tracks pending tasks by taskId, resolves with raw + latency.
  Caller parses the verdict.
- ClassificationHarness component: ports the standalone harness UI to
  React + Tailwind. Inputs: user description, prompt template, trials
  JSON (fixture preset), concurrency (1/2/3/5), eligibility max chars.
  Renders results table with verdict pill (LIKELY/POSSIBLE/UNLIKELY/
  PARSE_FAIL), latency, raw output, expected vs actual checkmark.
  Stats: parse rate, avg/max latency, agreement.
- App.jsx: dev-only ?test=classify route, lazy-loaded so production
  bundle stays unaffected.

Pass criteria from Handoff (parse ≥90%, avg latency <1.5s, agreement
≥80%) shown inline so the user can validate before deciding to wire
classification into the actual results UI.

This is the validation gate, not the in-app classification pipeline
itself. Once the harness numbers look good, a follow-up PR adds
classifyAll() to ResultsList and the fit meter to TriageRow.
---
 src/App.jsx                              |  14 +-
 src/components/ClassificationHarness.jsx | 420 +++++++++++++++++++++++
 src/hooks/useClassifier.js               |  49 +++
 src/workers/nlp.worker.js                |  28 ++
 4 files changed, 510 insertions(+), 1 deletion(-)
 create mode 100644 src/components/ClassificationHarness.jsx
 create mode 100644 src/hooks/useClassifier.js
diff --git a/src/App.jsx b/src/App.jsx
index 83666b7..b4ef669 100644
--- a/src/App.jsx
+++ b/src/App.jsx
@@ -18,12 +18,15 @@ const NLPTestPanel = import.meta.env.DEV
 const ProdScenarioTestPanel = import.meta.env.DEV
   ? lazy(() => import('./components/ProdScenarioTestPanel'))
   : null
+const ClassificationHarness = import.meta.env.DEV
+  ? lazy(() => import('./components/ClassificationHarness'))
+  : null
 
 function getTestRoute() {
   if (typeof window === 'undefined') return null
   if (!import.meta.env.DEV) return null
   const t = new URLSearchParams(window.location.search).get('test')
-  return t === 'nlp' || t === 'scenarios' ? t : null
+  return t === 'nlp' || t === 'scenarios' || t === 'classify' ? t : null
 }
 
 function IrisApp() {
@@ -45,6 +48,15 @@ function IrisApp() {
       </div>
     )
   }
+  if (testRoute === 'classify' && ClassificationHarness) {
+    return (
+      <div className="min-h-screen bg-parchment-50">
+        <Suspense fallback={<div className="p-6 text-sm">Loading classification harness…</div>}>
+          <ClassificationHarness />
+        </Suspense>
+      </div>
+    )
+  }
   if (testRoute === 'scenarios' && ProdScenarioTestPanel) {
     // ProdScenarioTestPanel calls fetch directly, so it doesn't need a query
     // client. IrisApp is already inside the App() QueryClientProvider, so no
diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
new file mode 100644
index 0000000..c220dd4
--- /dev/null
+++ b/src/components/ClassificationHarness.jsx
@@ -0,0 +1,420 @@
+import { useState, useEffect } from 'react'
+import { useNLP } from '../hooks/useNLP'
+import { useClassifier } from '../hooks/useClassifier'
+import { NLP_MODELS, resolveModelKey } from '../utils/nlpModels'
+
+const SAMPLE_TRIALS = [
+  {
+    nctId: 'NCT05952557',
+    title: 'Phase IIIb Study of Ribociclib + Endocrine Therapy in Early Breast Cancer',
+    eligibility: 'Inclusion: Adult female, ≥18 years. HR-positive, HER2-negative early breast cancer. Completed definitive surgery. Postmenopausal status confirmed. ECOG 0-1. Adequate organ function. Exclusion: Prior CDK4/6 inhibitor. Pregnancy or breastfeeding. Active second malignancy.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT06104020',
+    title: 'Sacituzumab Govitecan in Metastatic Triple-Negative Breast Cancer',
+    eligibility: 'Inclusion: Adult, any sex. Histologically confirmed metastatic triple-negative breast cancer (ER<1%, PR<1%, HER2-negative). At least one prior line of systemic therapy in metastatic setting. ECOG 0-2. Measurable disease per RECIST 1.1. Exclusion: Active CNS metastases. Prior topoisomerase I inhibitor.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05887492',
+    title: 'Adaptive Radiation Boost in Locally Advanced HER2+ Breast Cancer',
+    eligibility: 'Inclusion: Adult female. HER2-positive breast cancer confirmed by IHC 3+ or FISH-positive. Stage II-III disease. Completed neoadjuvant chemotherapy. ECOG 0-1. Exclusion: Prior radiation to chest. Pregnancy.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT06221340',
+    title: 'Aerobic Exercise During Adjuvant Chemo for Breast Cancer Survivors',
+    eligibility: 'Inclusion: Adult, any sex. Breast cancer, any stage. Currently receiving or scheduled for adjuvant chemotherapy. Cleared by oncologist for moderate exercise. Exclusion: Cardiac contraindications.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT04123456',
+    title: 'Pembrolizumab in Advanced Non-Small Cell Lung Cancer',
+    eligibility: 'Inclusion: Adult. Histologically confirmed advanced NSCLC. PD-L1 expression ≥50%. ECOG 0-1. Exclusion: Active autoimmune disease. Prior immunotherapy.',
+    expected: 'UNLIKELY',
+  },
+  {
+    nctId: 'NCT05123987',
+    title: 'Targeted Therapy in Pediatric Acute Lymphoblastic Leukemia',
+    eligibility: 'Inclusion: Pediatric patients aged 2-17 years. Newly diagnosed ALL. Exclusion: Adults. Prior chemotherapy.',
+    expected: 'UNLIKELY',
+  },
+]
+
+const DEFAULT_PROMPT = `You are evaluating clinical trial fit.
+
+User: {{user}}
+Trial title: {{title}}
+Eligibility (excerpt): {{eligibility}}
+
+Reply on one line, exactly: VERDICT | one-sentence reason
+where VERDICT is LIKELY, POSSIBLE, or UNLIKELY.`
+
+const DEFAULT_USER_DESC = "I'm 58 years old with breast cancer in Boston"
+
+function parseVerdict(raw) {
+  if (!raw || typeof raw !== 'string') return { verdict: 'PARSE_FAIL', reason: '(empty output)' }
+  const m = raw.match(/^\s*(LIKELY|POSSIBLE|UNLIKELY)\s*[|:\-—]\s*(.+?)\s*$/im)
+  if (m) return { verdict: m[1].toUpperCase(), reason: m[2].trim() }
+  const w = raw.match(/\b(LIKELY|POSSIBLE|UNLIKELY)\b/i)
+  if (w) {
+    return {
+      verdict: w[1].toUpperCase(),
+      reason: raw.replace(w[0], '').replace(/^[\s|:\-—]+/, '').trim() || '(no reason)',
+    }
+  }
+  return { verdict: 'PARSE_FAIL', reason: raw.slice(0, 120) }
+}
+
+const VERDICT_STYLES = {
+  LIKELY:     'bg-signal-good-bg text-signal-good',
+  POSSIBLE:   'bg-signal-warn-bg text-signal-warn',
+  UNLIKELY:   'bg-parchment-200 text-parchment-700',
+  PARSE_FAIL: 'bg-signal-bad-bg text-signal-bad',
+  PENDING:    'bg-parchment-100 text-parchment-700',
+}
+
+export default function ClassificationHarness() {
+  const [modelKey] = useState(() =>
+    resolveModelKey(typeof window !== 'undefined' ? window.location.search : '')
+  )
+  const model = NLP_MODELS[modelKey]
+  const { status, progress, error, load, webGPUSupported } = useNLP()
+  const { classifyOne } = useClassifier()
+
+  const [userDesc, setUserDesc] = useState(DEFAULT_USER_DESC)
+  const [promptTemplate, setPromptTemplate] = useState(DEFAULT_PROMPT)
+  const [trialsJson, setTrialsJson] = useState(JSON.stringify(SAMPLE_TRIALS, null, 2))
+  const [concurrency, setConcurrency] = useState(3)
+  const [eligMax, setEligMax] = useState(1500)
+  const [results, setResults] = useState([])
+  const [running, setRunning] = useState(false)
+  const [startT, setStartT] = useState(0)
+  const [, setTick] = useState(0)
+
+  // Lightweight ticker so elapsed time updates while a run is in flight.
+  useEffect(() => {
+    if (!running) return
+    const id = setInterval(() => setTick(t => t + 1), 250)
+    return () => clearInterval(id)
+  }, [running])
+
+  function getProgressLabel() {
+    if (!progress) return 'Loading model…'
+    return progress.text || `Loading model… ${Math.round((progress.progress ?? 0) * 100)}%`
+  }
+
+  async function run() {
+    let trials
+    try {
+      trials = JSON.parse(trialsJson)
+      if (!Array.isArray(trials)) throw new Error('Not an array')
+    } catch (e) {
+      alert('Trials JSON is invalid: ' + e.message)
+      return
+    }
+
+    setRunning(true)
+    setStartT(performance.now())
+    const initial = trials.map(trial => ({ trial, status: 'PENDING' }))
+    setResults(initial)
+
+    const queue = trials.map((trial, idx) => ({ idx, trial }))
+    const workersN = Math.min(concurrency, trials.length)
+
+    async function worker() {
+      while (queue.length) {
+        const { idx, trial } = queue.shift()
+        const elig = (trial.eligibility || '').slice(0, eligMax)
+        const prompt = promptTemplate
+          .replace('{{user}}', userDesc)
+          .replace('{{title}}', trial.title || trial.briefTitle || '')
+          .replace('{{eligibility}}', elig)
+        try {
+          const { raw, latencyMs } = await classifyOne(prompt)
+          const parsed = parseVerdict(raw)
+          setResults(prev => {
+            const next = [...prev]
+            next[idx] = { trial, status: 'DONE', raw, latencyMs, ...parsed }
+            return next
+          })
+        } catch (err) {
+          setResults(prev => {
+            const next = [...prev]
+            next[idx] = {
+              trial,
+              status: 'DONE',
+              raw: '',
+              latencyMs: 0,
+              verdict: 'PARSE_FAIL',
+              reason: err?.message ?? 'classify error',
+            }
+            return next
+          })
+        }
+      }
+    }
+
+    await Promise.all(Array.from({ length: workersN }, worker))
+    setRunning(false)
+  }
+
+  function reset() {
+    setTrialsJson(JSON.stringify(SAMPLE_TRIALS, null, 2))
+    setResults([])
+  }
+
+  // ───────── stats ─────────
+  const done = results.filter(r => r.status === 'DONE')
+  const lats = done.map(r => r.latencyMs).filter(n => n != null)
+  const avgLat = lats.length ? Math.round(lats.reduce((a, b) => a + b, 0) / lats.length) : 0
+  const maxLat = lats.length ? Math.round(Math.max(...lats)) : 0
+  const parseFails = done.filter(r => r.verdict === 'PARSE_FAIL').length
+  const parseRate = done.length ? Math.round(((done.length - parseFails) / done.length) * 100) : 0
+  const elapsed = startT ? ((performance.now() - startT) / 1000).toFixed(1) : '0.0'
+  const withExpected = done.filter(r => r.trial.expected)
+  const matches = withExpected.filter(r => r.verdict === r.trial.expected).length
+  const agreementPct = withExpected.length ? Math.round((matches / withExpected.length) * 100) : null
+
+  const canRun = status === 'ready' && !running
+
+  return (
+    <div className="max-w-[1200px] mx-auto px-6 py-7 pb-20">
+      <h1 className="font-serif font-semibold text-[28px] tracking-tight text-parchment-950 mb-1">
+        Classification harness
+      </h1>
+      <p className="text-[13px] text-parchment-700 max-w-[640px] leading-relaxed mb-6">
+        Validate the proposed Stage-1 classifier (LIKELY / POSSIBLE / UNLIKELY) against real
+        ClinicalTrials.gov payloads using the on-device {model.label}. Pass criteria from the
+        Handoff: parse rate ≥ 90%, avg latency &lt; 1.5s, agreement ≥ 80%.
+      </p>
+
+      <div className="bg-white border border-parchment-200 rounded-xl p-5 mb-4">
+        <div className="flex flex-wrap items-center justify-between gap-3">
+          <div>
+            <div className="font-mono text-[10px] uppercase tracking-[0.08em] text-iris-700 mb-1">model</div>
+            <div className="font-mono text-[12px] text-parchment-900">
+              {model.label} ({model.sizeLabel}) · status:{' '}
+              <strong className={status === 'ready' ? 'text-signal-good' : 'text-parchment-700'}>
+                {status}
+              </strong>
+              {status === 'downloading' && progress && (
+                <span className="text-parchment-500"> · {Math.round((progress.progress ?? 0) * 100)}%</span>
+              )}
+            </div>
+            {status === 'downloading' && (
+              <p className="font-mono text-[11px] text-parchment-700 mt-1">{getProgressLabel()}</p>
+            )}
+            {!webGPUSupported && (
+              <p className="text-[12px] text-signal-bad mt-1">WebGPU unavailable in this browser.</p>
+            )}
+            {error && <p className="text-[12px] text-signal-bad mt-1">{error}</p>}
+          </div>
+          {status !== 'ready' && status !== 'downloading' && webGPUSupported && (
+            <button
+              type="button"
+              onClick={() => load(model.id, { isThinking: model.isThinking, chatOpts: model.chatOpts })}
+              className="bg-iris-600 text-white px-4 py-2 rounded-md text-[13px] font-semibold hover:bg-iris-700"
+            >
+              Load model
+            </button>
+          )}
+        </div>
+      </div>
+
+      <div className="bg-white border border-parchment-200 rounded-xl p-5 mb-4">
+        <h2 className="font-serif font-semibold text-base mb-3">Inputs</h2>
+        <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
+          <div>
+            <label className="font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-iris-700 mb-1.5 block">
+              User description
+            </label>
+            <textarea
+              rows={3}
+              value={userDesc}
+              onChange={e => setUserDesc(e.target.value)}
+              className="w-full text-[13px] px-3 py-2.5 border border-parchment-300 rounded-lg bg-parchment-50 text-parchment-900 resize-y focus:outline-none focus:ring-2 focus:ring-iris-500"
+            />
+          </div>
+          <div>
+            <label className="font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-iris-700 mb-1.5 block">
+              Classify prompt template
+            </label>
+            <textarea
+              rows={6}
+              value={promptTemplate}
+              onChange={e => setPromptTemplate(e.target.value)}
+              className="w-full font-mono text-[12px] px-3 py-2.5 border border-parchment-300 rounded-lg bg-parchment-50 text-parchment-900 resize-y focus:outline-none focus:ring-2 focus:ring-iris-500"
+            />
+          </div>
+        </div>
+
+        <div className="mt-4">
+          <label className="font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-iris-700 mb-1.5 block">
+            Trials (JSON array — fixture loaded by default)
+          </label>
+          <textarea
+            rows={10}
+            value={trialsJson}
+            onChange={e => setTrialsJson(e.target.value)}
+            className="w-full font-mono text-[11.5px] px-3 py-2.5 border border-parchment-300 rounded-lg bg-parchment-50 text-parchment-900 resize-y focus:outline-none focus:ring-2 focus:ring-iris-500"
+          />
+        </div>
+
+        <div className="flex flex-wrap items-center gap-3 mt-4">
+          <button
+            type="button"
+            disabled={!canRun}
+            onClick={run}
+            className="bg-iris-600 text-white px-5 py-2.5 rounded-lg text-[13px] font-semibold hover:bg-iris-700 disabled:opacity-50 disabled:cursor-not-allowed"
+          >
+            {running ? `Running… (${done.length}/${results.length})` : 'Run classification'}
+          </button>
+          <button
+            type="button"
+            onClick={reset}
+            disabled={running}
+            className="border border-parchment-300 text-parchment-900 px-4 py-2 rounded-lg text-[12px] hover:bg-parchment-100 disabled:opacity-50"
+          >
+            Reset trials
+          </button>
+          <label className="inline-flex items-center gap-2 text-[12px] text-parchment-700">
+            Concurrency
+            <select
+              value={concurrency}
+              onChange={e => setConcurrency(parseInt(e.target.value, 10))}
+              disabled={running}
+              className="px-2 py-1 text-[12px] border border-parchment-300 rounded bg-white"
+            >
+              <option value={1}>1 (serial)</option>
+              <option value={2}>2</option>
+              <option value={3}>3</option>
+              <option value={5}>5</option>
+            </select>
+          </label>
+          <label className="inline-flex items-center gap-2 text-[12px] text-parchment-700">
+            Eligibility max chars
+            <input
+              type="number"
+              min={200}
+              max={8000}
+              step={100}
+              value={eligMax}
+              onChange={e => setEligMax(parseInt(e.target.value, 10) || 1500)}
+              disabled={running}
+              className="w-[90px] px-2 py-1 text-[12px] border border-parchment-300 rounded bg-white"
+            />
+          </label>
+        </div>
+
+        {(running || done.length > 0) && (
+          <div className="flex flex-wrap gap-4 font-mono text-[11px] text-parchment-700 mt-3">
+            <span><strong className="text-parchment-950">{done.length} / {results.length}</strong> done</span>
+            <span>elapsed <strong className="text-parchment-950">{elapsed}s</strong></span>
+            <span>avg latency <strong className="text-parchment-950">{avgLat}ms</strong></span>
+            <span>max latency <strong className="text-parchment-950">{maxLat}ms</strong></span>
+            <span>parse rate <strong className="text-parchment-950">{parseRate}%</strong></span>
+            <span>parse fails <strong className="text-parchment-950">{parseFails}</strong></span>
+          </div>
+        )}
+      </div>
+
+      <div className="bg-white border border-parchment-200 rounded-xl p-5 mb-4">
+        <h2 className="font-serif font-semibold text-base mb-3">Results</h2>
+        {results.length === 0 ? (
+          <p className="text-parchment-500 italic text-[13px] py-6 text-center">
+            No results yet — click <strong>Run classification</strong>.
+          </p>
+        ) : (
+          <ResultsTable rows={results} />
+        )}
+        {agreementPct != null && !running && (
+          <div className="font-mono text-[11px] text-parchment-700 mt-3 px-3 py-2.5 bg-iris-50 border border-iris-100 rounded-lg leading-relaxed">
+            <strong className="text-iris-700">Agreement with expected:</strong>{' '}
+            {matches} / {withExpected.length} ({agreementPct}%) — useful as a smoke test on a labeled
+            held-out set. Below ~80% means the prompt or model needs work before this drives sort order.
+          </div>
+        )}
+      </div>
+
+      <details className="text-[12px] text-parchment-700">
+        <summary className="cursor-pointer font-mono text-iris-700">Pass criteria (from Handoff)</summary>
+        <ul className="mt-2 ml-4 list-disc space-y-1">
+          <li>Parse rate ≥ 90% on 50+ real trials</li>
+          <li>Avg latency &lt; 1.5s per trial on a mid-range laptop</li>
+          <li>Agreement ≥ 80% on a labeled held-out set</li>
+          <li>No catastrophic UNLIKELY false-negatives (a viable trial ranked as UNLIKELY)</li>
+        </ul>
+      </details>
+    </div>
+  )
+}
+
+function ResultsTable({ rows }) {
+  return (
+    <table className="w-full border-collapse text-[13px]">
+      <thead>
+        <tr>
+          <th className="text-left font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-parchment-700 px-2.5 py-2 border-b border-parchment-200" style={{ width: '38%' }}>
+            Trial
+          </th>
+          <th className="text-left font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-parchment-700 px-2.5 py-2 border-b border-parchment-200" style={{ width: '14%' }}>
+            Verdict
+          </th>
+          <th className="text-left font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-parchment-700 px-2.5 py-2 border-b border-parchment-200" style={{ width: '12%' }}>
+            Latency
+          </th>
+          <th className="text-left font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-parchment-700 px-2.5 py-2 border-b border-parchment-200">
+            Raw output / reason
+          </th>
+          <th className="text-left font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-parchment-700 px-2.5 py-2 border-b border-parchment-200" style={{ width: '12%' }}>
+            Expected
+          </th>
+        </tr>
+      </thead>
+      <tbody>
+        {rows.map((r, i) => {
+          const verdict = r.status === 'PENDING' ? 'PENDING' : (r.verdict || 'PARSE_FAIL')
+          const expected = r.trial.expected || '—'
+          const match = r.verdict && r.trial.expected
+            ? (r.verdict === r.trial.expected ? '✓' : '✗')
+            : ''
+          const matchColor = match === '✓' ? 'text-signal-good' : match === '✗' ? 'text-signal-bad' : 'text-parchment-500'
+          return (
+            <tr key={i}>
+              <td className="px-2.5 py-3 border-b border-parchment-100 align-top">
+                <div className="font-serif font-semibold text-parchment-950 text-[13.5px] leading-snug">
+                  {r.trial.title || r.trial.briefTitle || r.trial.nctId}
+                </div>
+                <div className="font-mono text-[10.5px] text-parchment-500 mt-0.5">
+                  {r.trial.nctId || ''}
+                </div>
+              </td>
+              <td className="px-2.5 py-3 border-b border-parchment-100 align-top">
+                <span className={`inline-flex items-center gap-1.5 font-mono text-[11px] font-semibold px-2 py-0.5 rounded-full tracking-[0.04em] ${VERDICT_STYLES[verdict] ?? ''}`}>
+                  {verdict}
+                </span>
+              </td>
+              <td className="px-2.5 py-3 border-b border-parchment-100 align-top font-mono text-[12px] text-parchment-700">
+                {r.latencyMs != null ? `${Math.round(r.latencyMs)}ms` : '—'}
+              </td>
+              <td className="px-2.5 py-3 border-b border-parchment-100 align-top">
+                <div className="text-[12.5px] text-parchment-900 leading-relaxed">{r.reason || '—'}</div>
+                {r.raw && r.raw !== r.reason && (
+                  <div className="font-mono text-[11px] text-parchment-700 mt-1 whitespace-pre-wrap break-words max-w-[380px]">
+                    raw: {r.raw}
+                  </div>
+                )}
+              </td>
+              <td className="px-2.5 py-3 border-b border-parchment-100 align-top font-mono text-[11px] text-parchment-700">
+                {expected}
+                {match && <span className={`ml-1.5 font-semibold ${matchColor}`}>{match}</span>}
+              </td>
+            </tr>
+          )
+        })}
+      </tbody>
+    </table>
+  )
+}
diff --git a/src/hooks/useClassifier.js b/src/hooks/useClassifier.js
new file mode 100644
index 0000000..f84b177
--- /dev/null
+++ b/src/hooks/useClassifier.js
@@ -0,0 +1,49 @@
+import { useRef, useEffect, useCallback } from 'react'
+import { getSharedWorker, attachListener } from '../workers/sharedNlpWorker'
+
+// Stage-1 classifier hook. Posts a 'classify' task to the shared NLP worker
+// and resolves with { raw, latencyMs }. The caller parses the verdict from
+// raw — keeps the worker dumb and the parsing rules co-located with the
+// harness/UI.
+//
+// The worker must already have the model loaded (use NL tab + consent first,
+// or call useNLP().load() somewhere). classifyOne will reject with
+// 'Engine not loaded' otherwise.
+export function useClassifier() {
+  const pendingRef = useRef(new Map())
+  const detachRef = useRef(null)
+  const taskIdRef = useRef(0)
+
+  function ensureSubscribed() {
+    if (detachRef.current) return
+    detachRef.current = attachListener(handleMessage)
+  }
+
+  function handleMessage(event) {
+    const { type, taskId, raw, latencyMs, message } = event.data ?? {}
+    if (type !== 'classify_done' && type !== 'classify_error') return
+    const pending = pendingRef.current.get(taskId)
+    if (!pending) return
+    pendingRef.current.delete(taskId)
+    if (type === 'classify_done') pending.resolve({ raw, latencyMs })
+    else pending.reject(new Error(message ?? 'classify failed'))
+  }
+
+  useEffect(() => {
+    return () => {
+      detachRef.current?.()
+      detachRef.current = null
+    }
+  }, [])
+
+  const classifyOne = useCallback((prompt) => {
+    ensureSubscribed()
+    const taskId = `classify-${++taskIdRef.current}`
+    return new Promise((resolve, reject) => {
+      pendingRef.current.set(taskId, { resolve, reject })
+      getSharedWorker().postMessage({ type: 'classify', taskId, prompt })
+    })
+  }, [])
+
+  return { classifyOne }
+}
diff --git a/src/workers/nlp.worker.js b/src/workers/nlp.worker.js
index 259237b..c8385b7 100644
--- a/src/workers/nlp.worker.js
+++ b/src/workers/nlp.worker.js
@@ -108,6 +108,34 @@ self.onmessage = async (event) => {
     return
   }
 
+  if (type === 'classify') {
+    if (!engine) {
+      self.postMessage({ type: 'classify_error', taskId, message: 'Engine not loaded' })
+      return
+    }
+    try {
+      const t0 = Date.now()
+      // Reset KV cache between classifications so they're independent.
+      if (typeof engine.resetChat === 'function') {
+        try { await engine.resetChat() } catch { /* best effort */ }
+      }
+      const request = {
+        messages: [{ role: 'user', content: prompt }],
+        // Stage-1 verdict + one-sentence reason fits comfortably in ~60 tokens.
+        // Generous headroom (80) covers preamble drift from the smaller models.
+        max_tokens: 80,
+        temperature: 0.1,
+      }
+      if (isThinkingModel) request.extra_body = { enable_thinking: false }
+      const reply = await engine.chat.completions.create(request)
+      const raw = reply.choices?.[0]?.message?.content ?? ''
+      self.postMessage({ type: 'classify_done', taskId, raw, latencyMs: Date.now() - t0 })
+    } catch (err) {
+      self.postMessage({ type: 'classify_error', taskId, message: err?.message ?? String(err) })
+    }
+    return
+  }
+
   if (type === 'summarize' || type === 'assess_fit') {
     if (!engine) {
       self.postMessage({ type: 'task_error', taskId, message: 'Engine not loaded' })

From 26daf7f420a1b6d3c7b21901df92074fd2595b02 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 21:37:38 -0400
Subject: [PATCH 02/31] =?UTF-8?q?fix(phase-3):=20serialize=20classify=20ca?=
 =?UTF-8?q?lls=20=E2=80=94=20WebLLM=20engine=20is=20not=20parallel-safe?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User reported 33% agreement, 0ms latencies, and "Message error should
not be 0" failures from the harness. Root cause: WebLLM's MLCEngine
is single-threaded; concurrent engine.chat.completions.create() calls
collide on the engine's internal KV cache and sampling state, returning
errors instantly. The harness was firing 3 in parallel by default, so
the first 3 always failed and later batches only sometimes got through.

Fix at the hook level via a promise chain in useClassifier — each
request waits for the previous one to settle. Caller-side concurrency
becomes a no-op for actual parallelism but still controls queue depth.
This matches how useSimplifier already serializes through a single
inFlightRef. The .catch() on the chain prevents one failure from
breaking the whole queue.

Harness UI: replaced the misleading concurrency dropdown with a
static caption noting the engine constraint. Re-running with the
fixture should now produce real latencies (1-2s per trial on
mid-range hardware) and meaningful agreement numbers.
---
 src/components/ClassificationHarness.jsx | 21 ++++++----------
 src/hooks/useClassifier.js               | 31 ++++++++++++++++--------
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index c220dd4..0e5af45 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -279,20 +279,13 @@ export default function ClassificationHarness() {
           >
             Reset trials
           </button>
-          <label className="inline-flex items-center gap-2 text-[12px] text-parchment-700">
-            Concurrency
-            <select
-              value={concurrency}
-              onChange={e => setConcurrency(parseInt(e.target.value, 10))}
-              disabled={running}
-              className="px-2 py-1 text-[12px] border border-parchment-300 rounded bg-white"
-            >
-              <option value={1}>1 (serial)</option>
-              <option value={2}>2</option>
-              <option value={3}>3</option>
-              <option value={5}>5</option>
-            </select>
-          </label>
+          <span
+            className="inline-flex items-center gap-2 text-[11px] text-parchment-700"
+            title="WebLLM's MLCEngine is single-threaded — parallel inference clobbers state. Requests serialize through a hook-level promise chain regardless of caller concurrency."
+          >
+            <span className="font-mono text-[10px] uppercase tracking-[0.04em]">execution</span>
+            serial (engine constraint)
+          </span>
           <label className="inline-flex items-center gap-2 text-[12px] text-parchment-700">
             Eligibility max chars
             <input
diff --git a/src/hooks/useClassifier.js b/src/hooks/useClassifier.js
index f84b177..52b1ba1 100644
--- a/src/hooks/useClassifier.js
+++ b/src/hooks/useClassifier.js
@@ -2,17 +2,22 @@ import { useRef, useEffect, useCallback } from 'react'
 import { getSharedWorker, attachListener } from '../workers/sharedNlpWorker'
 
 // Stage-1 classifier hook. Posts a 'classify' task to the shared NLP worker
-// and resolves with { raw, latencyMs }. The caller parses the verdict from
-// raw — keeps the worker dumb and the parsing rules co-located with the
-// harness/UI.
+// and resolves with { raw, latencyMs }. The caller parses the verdict.
 //
-// The worker must already have the model loaded (use NL tab + consent first,
-// or call useNLP().load() somewhere). classifyOne will reject with
-// 'Engine not loaded' otherwise.
+// IMPORTANT: WebLLM's MLCEngine is NOT parallel-safe. Concurrent
+// engine.chat.completions.create() calls clobber each other's state and
+// produce "Message error should not be 0" failures. We serialize all
+// classify requests through a single promise chain at the hook level —
+// callers can fire-and-forget concurrently, but each request waits its
+// turn. Caller-side concurrency knobs become a no-op for actual
+// parallelism, but still control queue capacity.
+//
+// The worker must already have the model loaded.
 export function useClassifier() {
   const pendingRef = useRef(new Map())
   const detachRef = useRef(null)
   const taskIdRef = useRef(0)
+  const chainRef = useRef(Promise.resolve())
 
   function ensureSubscribed() {
     if (detachRef.current) return
@@ -39,10 +44,16 @@ export function useClassifier() {
   const classifyOne = useCallback((prompt) => {
     ensureSubscribed()
     const taskId = `classify-${++taskIdRef.current}`
-    return new Promise((resolve, reject) => {
-      pendingRef.current.set(taskId, { resolve, reject })
-      getSharedWorker().postMessage({ type: 'classify', taskId, prompt })
-    })
+    // Chain onto the previous request so only one inference runs at a time.
+    // .catch in the chain prevents one failure from breaking the whole queue.
+    const next = chainRef.current.catch(() => {}).then(() =>
+      new Promise((resolve, reject) => {
+        pendingRef.current.set(taskId, { resolve, reject })
+        getSharedWorker().postMessage({ type: 'classify', taskId, prompt })
+      })
+    )
+    chainRef.current = next
+    return next
   }, [])
 
   return { classifyOne }

From f15a55583ff803d5c6b2204c7d4817011ab6f6b1 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 21:43:01 -0400
Subject: [PATCH 03/31] feat(phase-3): expand fixture from 6 to 20 trials
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original 6-trial smoke test wasn't enough surface to evaluate
prompt changes — every tweak swung agreement by ±17% on a single
trial. Add 14 trials grouped by failure mode:

- Subtype-gated breast cancer trials (3): Tucatinib HER2+, Olaparib
  BRCA, CDK4/6 switch HR+. All POSSIBLE for a generic "breast cancer"
  user — exercises whether the model penalizes unknown subtype.
- Strong matches for a 58yo with breast cancer (4): CBT for fatigue,
  lymphedema surveillance, mindfulness for survivors, vaginal estrogen
  in postmenopausal survivors. All LIKELY — these are the "obvious
  yes" cases that should never get marked POSSIBLE.
- Wrong condition / wrong demographic (5): melanoma, AFib, Type 2
  diabetes, prostate cancer, pediatric vaccines. All UNLIKELY — these
  are the "obvious no" cases that catastrophic-false-negative would
  hide.
- Edge cases (2): palliative care for any advanced solid tumor
  (POSSIBLE — depends on stage which is unknown), and premenopausal
  breast cancer (UNLIKELY — a 58yo is outside the trial's 18-45 age
  range and almost certainly postmenopausal, which the trial
  explicitly excludes).

Distribution: 6 LIKELY, 6 POSSIBLE, 8 UNLIKELY. The Handoff calls for
50+ trials for real validation; 20 is a workable middle ground for
prompt iteration without crushing test cycle time.
---
 src/components/ClassificationHarness.jsx | 92 ++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 0e5af45..899382a 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -40,6 +40,98 @@ const SAMPLE_TRIALS = [
     eligibility: 'Inclusion: Pediatric patients aged 2-17 years. Newly diagnosed ALL. Exclusion: Adults. Prior chemotherapy.',
     expected: 'UNLIKELY',
   },
+
+  // ─── Subtype-gated breast cancer trials — POSSIBLE without confirmed subtype ───
+  {
+    nctId: 'NCT05300100',
+    title: 'Tucatinib + Trastuzumab in HER2-Positive Metastatic Breast Cancer',
+    eligibility: 'Inclusion: Adult, any sex, ≥18 years. Histologically confirmed HER2-positive metastatic breast cancer (IHC 3+ or FISH-amplified). At least 2 prior HER2-directed therapies. ECOG 0-1. Exclusion: Untreated brain metastases. Prior tucatinib.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05400201',
+    title: 'Olaparib Maintenance in BRCA-Mutated HER2-Negative Breast Cancer',
+    eligibility: 'Inclusion: Adult female. HER2-negative breast cancer with germline BRCA1 or BRCA2 mutation (confirmed by central testing). High-risk early disease following adjuvant chemotherapy. Postmenopausal or premenopausal with ovarian suppression. Exclusion: Prior PARP inhibitor.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05511223',
+    title: 'CDK4/6 Inhibitor Switch in Hormone-Receptor-Positive Advanced Breast Cancer',
+    eligibility: 'Inclusion: Adult women, postmenopausal. HR-positive, HER2-negative advanced or metastatic breast cancer. Disease progression on a prior CDK4/6 inhibitor. ECOG 0-2.',
+    expected: 'POSSIBLE',
+  },
+
+  // ─── Strong matches for a 58yo with breast cancer ───
+  {
+    nctId: 'NCT05633445',
+    title: 'Cognitive Behavioral Therapy for Cancer-Related Fatigue',
+    eligibility: 'Inclusion: Adults ≥18 years with any solid tumor diagnosis (breast, colon, lung, prostate, etc.). Currently in active treatment or within 5 years of treatment completion. Self-reported fatigue ≥4 on a 0-10 scale. Exclusion: Severe untreated depression. Inability to attend weekly sessions.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT05755677',
+    title: 'Lymphedema Surveillance Program After Breast Cancer Surgery',
+    eligibility: 'Inclusion: Adult female ≥18 years. History of breast cancer treated with axillary surgery (sentinel lymph node biopsy or axillary dissection). Within 3 years of surgery. Exclusion: Pre-existing lymphedema. Current breast cancer recurrence.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT05822334',
+    title: 'Mindfulness-Based Stress Reduction for Breast Cancer Survivors',
+    eligibility: 'Inclusion: Adult women ≥21 years. Diagnosed with breast cancer (any stage). Completed primary treatment within the past 5 years OR currently on adjuvant endocrine therapy. Exclusion: Active psychosis. Prior MBSR participation.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT05901128',
+    title: 'Vaginal Estrogen Safety Study in Postmenopausal Breast Cancer Survivors',
+    eligibility: 'Inclusion: Postmenopausal women ages 45-75 with a history of HR-positive or HR-negative breast cancer. Disease-free for ≥1 year. Genitourinary symptoms of menopause. Stable on aromatase inhibitor or tamoxifen, or treatment-free. Exclusion: Current metastatic disease.',
+    expected: 'LIKELY',
+  },
+
+  // ─── Wrong condition / wrong demographic — clear UNLIKELY ───
+  {
+    nctId: 'NCT04567890',
+    title: 'Pembrolizumab in Advanced Melanoma',
+    eligibility: 'Inclusion: Adults with histologically confirmed unresectable Stage III or Stage IV melanoma. ECOG 0-1. No prior systemic therapy for advanced disease. Exclusion: Active autoimmune disease.',
+    expected: 'UNLIKELY',
+  },
+  {
+    nctId: 'NCT04678901',
+    title: 'Apixaban vs. Warfarin in Atrial Fibrillation',
+    eligibility: 'Inclusion: Adults ≥18 years with non-valvular atrial fibrillation. CHA2DS2-VASc score ≥2. Exclusion: Mechanical heart valve. Active bleeding.',
+    expected: 'UNLIKELY',
+  },
+  {
+    nctId: 'NCT04789012',
+    title: 'GLP-1 Agonist for Weight Management in Type 2 Diabetes',
+    eligibility: 'Inclusion: Adults 18-75 with Type 2 diabetes mellitus. BMI ≥30. HbA1c 7.0-10.0%. Exclusion: Type 1 diabetes. Active malignancy within 5 years. History of pancreatitis.',
+    expected: 'UNLIKELY',
+  },
+  {
+    nctId: 'NCT04890123',
+    title: 'Robotic Prostatectomy Outcomes in Localized Prostate Cancer',
+    eligibility: 'Inclusion: Men ≥40 years with biopsy-confirmed clinically localized prostate cancer (T1-T2). Candidate for radical prostatectomy. Exclusion: Prior pelvic surgery or radiation.',
+    expected: 'UNLIKELY',
+  },
+  {
+    nctId: 'NCT04901234',
+    title: 'Pediatric Vaccine Immunogenicity Study',
+    eligibility: 'Inclusion: Healthy children aged 6 months to 5 years. Up to date on routine immunizations. Exclusion: Immunocompromised. Recent illness within 14 days.',
+    expected: 'UNLIKELY',
+  },
+
+  // ─── Edge cases — should challenge the model ───
+  {
+    nctId: 'NCT05012345',
+    title: 'Palliative Care Integration in Patients with Advanced Solid Tumors',
+    eligibility: 'Inclusion: Adults ≥18 years with advanced (Stage IV) solid tumor of any primary site (breast, lung, GI, GU, GYN). Estimated prognosis 6-24 months. ECOG 0-3. Exclusion: Currently enrolled in hospice.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05123450',
+    title: 'Premenopausal Breast Cancer: Ovarian Function Suppression Trial',
+    eligibility: 'Inclusion: Premenopausal women ages 18-45 with newly diagnosed HR-positive early breast cancer. Confirmed premenopausal by FSH and estradiol levels. Exclusion: Postmenopausal status. Prior ovarian suppression therapy.',
+    expected: 'UNLIKELY',
+  },
 ]
 
 const DEFAULT_PROMPT = `You are evaluating clinical trial fit.

From 34f8665122432633d0155b3f907299ca9739e378 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 21:46:31 -0400
Subject: [PATCH 04/31] feat(phase-3): add 4 realistic-length trials (~3kB
 eligibility each)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fixture trials use 200-400 char eligibility blurbs, which masks
two real-world failure modes: latency growth with longer prompts and
the model losing the signal in CT.gov noise (formal numbered lists,
lab cutoffs, prior-therapy specifics, washout periods).

Add 4 trials with realistic CT.gov-style eligibility (~2-3.5kB each)
spanning all three verdicts:

- LONG-01 Sacituzumab metastatic HR+/HER2- after CDK4/6 (POSSIBLE):
  subtype-gated, 13 inclusion + 12 exclusion criteria with full
  organ-function lab ranges. Tests whether the model still penalizes
  unknown HR/HER2 status when buried under three pages of text.
- LONG-02 Adjuvant abemaciclib node-positive postmenopausal (LIKELY):
  the case of a postmenopausal woman with high-risk HR+ early breast
  cancer. Many specific lab cutoffs and prior-therapy washout rules.
  Tests whether the model picks up on the clear inclusion match
  without getting confused by exclusion noise.
- LONG-03 Pembrolizumab metastatic squamous NSCLC (UNLIKELY): wrong
  cancer entirely; the only cancer-history text is a generic
  prior-malignancy exclusion ("in situ disease" is allowed; breast is
  not named — only LONG-01's exclusion list names it). Tests whether
  the model anchors on incidental mentions vs the core indication.
- LONG-04 Empagliflozin HFpEF + T2DM (UNLIKELY): wrong condition
  entirely (cardiology + diabetes), but excludes only "active
  malignancy within 12 months" — disease-free history is allowed.
  Tests whether the model conflates "cancer history allowed" with
  "this trial is for cancer patients."

Combined with the eligMax knob (default 1500), these let you sweep:
800 → 1500 → 3000 → 6000 chars and watch latency vs accuracy. The
Handoff suggests dropping to 800 if latency is a problem; this is
where you'd validate that.
---
 src/components/ClassificationHarness.jsx | 166 +++++++++++++++++++++++
 1 file changed, 166 insertions(+)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 899382a..93946ce 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -132,6 +132,172 @@ const SAMPLE_TRIALS = [
     eligibility: 'Inclusion: Premenopausal women ages 18-45 with newly diagnosed HR-positive early breast cancer. Confirmed premenopausal by FSH and estradiol levels. Exclusion: Postmenopausal status. Prior ovarian suppression therapy.',
     expected: 'UNLIKELY',
   },
+
+  // ─── Realistic-length eligibility (~2-3.5kB each) — stress-tests how the
+  //     model handles formal CT.gov noise and how truncation affects accuracy.
+  //     Try these with eligMax = 800 vs 3000 vs 6000 to see the trade-off.
+  {
+    nctId: 'NCT-LONG-01',
+    title: 'Phase II Study of Sacituzumab Govitecan-hziy in Patients with HR-Positive, HER2-Negative Metastatic Breast Cancer After Endocrine Therapy and CDK4/6 Inhibitor',
+    eligibility: `Inclusion Criteria:
+
+1. Female participants ≥18 years of age at the time of signing informed consent.
+2. Histologically or cytologically confirmed adenocarcinoma of the breast that is metastatic or locally advanced and not amenable to curative resection or radiotherapy.
+3. Documentation of estrogen receptor (ER)-positive (≥1% staining by IHC) and/or progesterone receptor (PR)-positive (≥1% staining by IHC) tumor status, in accordance with ASCO/CAP guidelines.
+4. Documentation of HER2-negative status defined as IHC 0, IHC 1+, or IHC 2+ with negative in situ hybridization (ISH), per ASCO/CAP guidelines.
+5. Disease progression on or after at least one prior CDK4/6 inhibitor (palbociclib, ribociclib, or abemaciclib) administered for advanced or metastatic disease, in combination with an aromatase inhibitor or fulvestrant.
+6. Disease progression on or after at least one and no more than two prior endocrine therapies (e.g., aromatase inhibitor, fulvestrant, tamoxifen) for advanced or metastatic disease.
+7. No more than one prior chemotherapy regimen for metastatic disease.
+8. Postmenopausal status, OR premenopausal/perimenopausal women who agree to receive concurrent ovarian function suppression with a luteinizing hormone-releasing hormone (LHRH) agonist throughout study treatment.
+9. Measurable disease per RECIST v1.1, or non-measurable bone-only disease assessable per protocol-specified criteria.
+10. ECOG performance status 0 or 1.
+11. Adequate organ function:
+    - Absolute neutrophil count (ANC) ≥1.5 × 10^9/L
+    - Platelets ≥100 × 10^9/L
+    - Hemoglobin ≥9.0 g/dL (transfusion permitted)
+    - Total bilirubin ≤1.5 × ULN (≤3 × ULN for participants with documented Gilbert syndrome)
+    - AST and ALT ≤2.5 × ULN (≤5 × ULN if liver metastases present)
+    - Creatinine clearance ≥50 mL/min by Cockcroft-Gault equation
+    - INR and aPTT ≤1.5 × ULN unless on anticoagulants
+12. Resolution of all acute toxic effects of prior anti-cancer therapy or surgical procedures to NCI CTCAE v5.0 Grade ≤1 (except alopecia and Grade 2 neuropathy).
+13. Willingness to provide tumor tissue (archival or fresh biopsy) for biomarker analyses.
+
+Exclusion Criteria:
+
+1. Prior treatment with sacituzumab govitecan or any other Trop-2-directed therapy.
+2. Prior treatment with an antibody-drug conjugate containing a topoisomerase I inhibitor payload (e.g., trastuzumab deruxtecan).
+3. Active CNS metastases. Participants with previously treated, asymptomatic CNS metastases are eligible if clinically stable for ≥4 weeks off corticosteroids and anticonvulsants.
+4. Leptomeningeal disease.
+5. Known active infection requiring systemic therapy, including untreated HIV, active HBV (HBsAg positive or HBV DNA detectable), or active HCV (HCV RNA detectable).
+6. Significant cardiovascular disease, including: NYHA Class III or IV congestive heart failure, myocardial infarction or unstable angina within 6 months, uncontrolled arrhythmia, baseline QTcF >470 ms.
+7. History of another malignancy within 3 years, except for adequately treated non-melanoma skin cancer, in situ cervical or breast cancer, or low-risk localized prostate cancer on active surveillance.
+8. Known hypersensitivity to irinotecan or any component of the study drug formulation.
+9. Pregnant or breastfeeding women. Women of childbearing potential must agree to use highly effective contraception during the study and for 6 months after the last dose.
+10. Concurrent participation in another therapeutic clinical trial.
+11. Major surgery within 4 weeks prior to first dose.
+12. Live vaccines within 30 days prior to first dose.`,
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT-LONG-02',
+    title: 'Randomized Phase III Trial of Adjuvant Endocrine Therapy ± Abemaciclib in Postmenopausal Women with HR-Positive, HER2-Negative, Node-Positive Early Breast Cancer at High Risk of Recurrence',
+    eligibility: `Inclusion Criteria:
+
+1. Female, postmenopausal at the time of randomization. Postmenopausal status defined as: (a) prior bilateral oophorectomy, (b) age ≥60 years, OR (c) age <60 with amenorrhea ≥12 months in the absence of chemotherapy, tamoxifen, or ovarian suppression AND FSH and estradiol in the postmenopausal range.
+2. Age 18 to 75 years inclusive at the time of consent.
+3. ECOG performance status of 0, 1, or 2.
+4. Histologically confirmed invasive breast carcinoma. Multicentric or multifocal disease is allowed if all foci meet eligibility.
+5. Hormone receptor-positive disease, defined as ≥1% of tumor cells staining positive for estrogen receptor and/or progesterone receptor by IHC, per ASCO/CAP guidelines.
+6. HER2-negative disease, defined as IHC 0, 1+, or 2+ with negative reflex ISH testing per ASCO/CAP guidelines.
+7. Stage II or III disease with high-risk pathologic features, defined as ≥1 of the following:
+    - ≥4 positive axillary lymph nodes, OR
+    - 1-3 positive axillary lymph nodes AND tumor size ≥5 cm, OR
+    - 1-3 positive axillary lymph nodes AND histologic grade 3, OR
+    - 1-3 positive axillary lymph nodes AND Ki-67 ≥20%
+8. Definitive surgical treatment of primary tumor with negative margins (lumpectomy with whole-breast irradiation OR mastectomy with or without post-mastectomy radiation per institutional standard).
+9. Completion of any neoadjuvant or adjuvant chemotherapy at least 21 days but no more than 16 months prior to randomization.
+10. Initiation of adjuvant endocrine therapy (aromatase inhibitor, with or without LHRH agonist) is permitted, but participants must not have received endocrine therapy for more than 12 weeks prior to randomization.
+11. Adequate organ function within 14 days of randomization:
+    - ANC ≥1.5 × 10^9/L
+    - Platelets ≥100 × 10^9/L
+    - Hemoglobin ≥10.0 g/dL
+    - Total bilirubin ≤1.5 × ULN
+    - AST/ALT ≤2.5 × ULN
+    - Creatinine clearance ≥50 mL/min
+12. Negative serum or urine pregnancy test for participants of childbearing potential.
+
+Exclusion Criteria:
+
+1. Stage IV (metastatic) breast cancer or evidence of distant metastases on staging imaging.
+2. Inflammatory breast cancer.
+3. Bilateral invasive breast cancer.
+4. Prior treatment with any CDK4/6 inhibitor in any setting.
+5. Prior anti-cancer therapy other than chemotherapy and locoregional therapy for the current breast cancer diagnosis.
+6. History of another malignancy within 5 years prior to randomization, except adequately treated non-melanoma skin cancer, in situ cervical cancer, or contralateral DCIS.
+7. Active or chronic hepatitis B or C infection, or known HIV infection.
+8. Significant uncontrolled cardiovascular disease: NYHA Class III/IV heart failure, myocardial infarction within 6 months, ventricular arrhythmia requiring treatment.
+9. History of interstitial lung disease or pneumonitis requiring corticosteroids.
+10. Major surgery (other than breast cancer surgery) within 28 days of randomization.
+11. Receiving strong CYP3A inhibitors or inducers within 14 days that cannot be discontinued.
+12. Inability to swallow oral medications or significant malabsorption.
+13. Pregnant or breastfeeding (premenopausal participants only — see inclusion criterion 1).`,
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT-LONG-03',
+    title: 'Phase III Study of Pembrolizumab Plus Chemotherapy versus Chemotherapy Alone for First-Line Treatment of Metastatic Squamous Non-Small Cell Lung Cancer',
+    eligibility: `Inclusion Criteria:
+
+1. Histologically or cytologically confirmed Stage IV squamous non-small cell lung cancer (NSCLC) per AJCC 8th edition.
+2. Male or female ≥18 years of age.
+3. No prior systemic therapy for metastatic NSCLC. Prior adjuvant or neoadjuvant chemotherapy is allowed if completed ≥6 months prior to enrollment.
+4. Measurable disease per RECIST v1.1.
+5. Provision of a tumor tissue sample (archival or fresh biopsy) adequate for PD-L1 IHC testing using the 22C3 pharmDx assay.
+6. ECOG performance status 0 or 1.
+7. Life expectancy ≥3 months.
+8. Adequate organ function within 10 days of randomization:
+    - ANC ≥1.5 × 10^9/L without G-CSF support
+    - Platelets ≥100 × 10^9/L without transfusion
+    - Hemoglobin ≥9.0 g/dL
+    - Total bilirubin ≤1.5 × ULN
+    - AST/ALT ≤2.5 × ULN (≤5 × ULN if liver involvement)
+    - Creatinine clearance ≥45 mL/min
+    - INR/aPTT ≤1.5 × ULN
+9. Female participants of childbearing potential and male participants with partners of childbearing potential must agree to use effective contraception throughout treatment and for 120 days after last dose.
+
+Exclusion Criteria:
+
+1. Histology of mixed small cell and non-small cell lung cancer, or predominantly non-squamous histology.
+2. Known sensitizing EGFR mutation, ALK rearrangement, ROS1 rearrangement, BRAF V600E mutation, or other actionable alteration for which an approved targeted therapy is the standard of care.
+3. Prior treatment with any PD-1, PD-L1, PD-L2, or CTLA-4 inhibitor.
+4. Active autoimmune disease requiring systemic immunosuppression within 2 years. Replacement therapy (e.g., thyroxine, insulin, physiologic corticosteroids) is permitted.
+5. History of pneumonitis requiring corticosteroids, or active pneumonitis.
+6. Active CNS metastases or carcinomatous meningitis. Participants with previously treated, asymptomatic CNS metastases stable for ≥4 weeks may be eligible.
+7. Active infection requiring systemic therapy.
+8. Known active HIV, HBV, or HCV infection.
+9. Live vaccine within 30 days of first dose.
+10. History of solid organ or allogeneic stem cell transplant.
+11. Pregnant or breastfeeding women.
+12. History of another malignancy within 3 years, except for adequately treated non-melanoma skin cancer or in situ disease.`,
+    expected: 'UNLIKELY',
+  },
+  {
+    nctId: 'NCT-LONG-04',
+    title: 'Multicenter Randomized Trial of Empagliflozin in Patients with Heart Failure with Preserved Ejection Fraction and Type 2 Diabetes',
+    eligibility: `Inclusion Criteria:
+
+1. Adults aged 40 to 85 years at consent.
+2. Documented diagnosis of heart failure with preserved ejection fraction (HFpEF):
+    - Left ventricular ejection fraction (LVEF) ≥50% on echocardiogram within the past 12 months
+    - NYHA functional class II, III, or IV
+    - Elevated NT-proBNP ≥300 pg/mL (or ≥600 pg/mL if atrial fibrillation present)
+    - Structural heart disease on echocardiography (LV hypertrophy or left atrial enlargement) OR documented prior HF hospitalization
+3. Documented Type 2 diabetes mellitus (T2DM) per ADA criteria, with HbA1c 6.5% to 10.0% at screening.
+4. Stable background heart failure therapy for ≥4 weeks (diuretic if indicated; ACEi/ARB/ARNI per guideline; beta-blocker per guideline).
+5. eGFR ≥25 mL/min/1.73m^2 by CKD-EPI equation.
+6. Body mass index 20 to 45 kg/m^2.
+7. Able and willing to provide written informed consent and adhere to study procedures.
+
+Exclusion Criteria:
+
+1. Type 1 diabetes mellitus.
+2. History of diabetic ketoacidosis within 12 months.
+3. LVEF <50% on most recent echocardiogram.
+4. Acute decompensated heart failure requiring IV diuretics within 4 weeks of screening.
+5. Acute coronary syndrome, stroke, or transient ischemic attack within 90 days.
+6. Planned cardiac surgery, percutaneous coronary intervention, or device implantation within 90 days.
+7. Symptomatic hypotension or systolic blood pressure <100 mmHg at screening.
+8. Significant valvular heart disease (severe aortic stenosis, severe mitral regurgitation requiring surgery).
+9. Hypertrophic cardiomyopathy, infiltrative cardiomyopathy, or constrictive pericarditis.
+10. eGFR <25 mL/min/1.73m^2 or end-stage renal disease requiring dialysis.
+11. Known active malignancy requiring treatment within the past 12 months. Participants with a history of cancer who are disease-free for >12 months are eligible.
+12. Severe hepatic impairment (Child-Pugh C).
+13. Pregnancy or breastfeeding.
+14. Known hypersensitivity to SGLT2 inhibitors.
+15. Participation in another interventional clinical trial within 30 days.
+16. Life expectancy <12 months due to non-cardiovascular cause.`,
+    expected: 'UNLIKELY',
+  },
 ]
 
 const DEFAULT_PROMPT = `You are evaluating clinical trial fit.

From fa405553516031dfb630aed678f16e278267bbaf Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 21:53:32 -0400
Subject: [PATCH 05/31] feat(phase-3): "copy results as markdown" button
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After a run completes, surfaces a button in the stats row that copies
a formatted markdown report to the clipboard. Includes:

- Model + user description + eligMax (so a different prompt run is
  reproducible)
- Stats table (done, elapsed, latencies, parse rate, agreement)
- Per-trial results table (verdict, expected, match check, latency,
  reason)
- Collapsed prompt template in a <details> block

Pastes directly into chat or a PR comment for sharing harness runs
without screenshotting. The button shows a confirmation state:
"✓ copied" for 1.8s on success, "copy failed" for 2.4s on error.
---
 src/components/ClassificationHarness.jsx | 83 +++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 93946ce..9ebaee3 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -423,6 +423,26 @@ export default function ClassificationHarness() {
     setResults([])
   }
 
+  const [copyState, setCopyState] = useState('idle') // idle | copied | error
+  async function copyMarkdown() {
+    const md = buildMarkdownReport({
+      userDesc,
+      promptTemplate,
+      eligMax,
+      modelLabel: model.label,
+      results,
+      stats: { done: done.length, total: results.length, elapsed, avgLat, maxLat, parseRate, parseFails, agreementPct, matches, withExpected: withExpected.length },
+    })
+    try {
+      await navigator.clipboard.writeText(md)
+      setCopyState('copied')
+      setTimeout(() => setCopyState('idle'), 1800)
+    } catch {
+      setCopyState('error')
+      setTimeout(() => setCopyState('idle'), 2400)
+    }
+  }
+
   // ───────── stats ─────────
   const done = results.filter(r => r.status === 'DONE')
   const lats = done.map(r => r.latencyMs).filter(n => n != null)
@@ -560,13 +580,23 @@ export default function ClassificationHarness() {
         </div>
 
         {(running || done.length > 0) && (
-          <div className="flex flex-wrap gap-4 font-mono text-[11px] text-parchment-700 mt-3">
+          <div className="flex flex-wrap items-center gap-4 font-mono text-[11px] text-parchment-700 mt-3">
             <span><strong className="text-parchment-950">{done.length} / {results.length}</strong> done</span>
             <span>elapsed <strong className="text-parchment-950">{elapsed}s</strong></span>
             <span>avg latency <strong className="text-parchment-950">{avgLat}ms</strong></span>
             <span>max latency <strong className="text-parchment-950">{maxLat}ms</strong></span>
             <span>parse rate <strong className="text-parchment-950">{parseRate}%</strong></span>
             <span>parse fails <strong className="text-parchment-950">{parseFails}</strong></span>
+            {done.length > 0 && !running && (
+              <button
+                type="button"
+                onClick={copyMarkdown}
+                className="ml-auto inline-flex items-center gap-1.5 border border-iris-300 text-iris-700 hover:bg-iris-50 px-2.5 py-1 rounded text-[11px] transition-colors"
+                title="Copy a shareable markdown summary of this run to your clipboard"
+              >
+                {copyState === 'copied' ? '✓ copied' : copyState === 'error' ? 'copy failed' : 'copy results as markdown'}
+              </button>
+            )}
           </div>
         )}
       </div>
@@ -602,6 +632,57 @@ export default function ClassificationHarness() {
   )
 }
 
+function buildMarkdownReport({ userDesc, promptTemplate, eligMax, modelLabel, results, stats }) {
+  const escape = (s) => String(s ?? '').replace(/\|/g, '\\|').replace(/\n/g, ' ').trim()
+  const truncate = (s, n) => {
+    const t = escape(s)
+    return t.length > n ? t.slice(0, n - 1) + '…' : t
+  }
+
+  const lines = []
+  lines.push('# Classification harness run')
+  lines.push('')
+  lines.push(`**Model:** ${modelLabel}`)
+  lines.push(`**User description:** ${userDesc}`)
+  lines.push(`**Eligibility max chars:** ${eligMax}`)
+  lines.push('')
+  lines.push('## Stats')
+  lines.push('')
+  lines.push(`| Metric | Value |`)
+  lines.push(`|---|---|`)
+  lines.push(`| Done | ${stats.done} / ${stats.total} |`)
+  lines.push(`| Elapsed | ${stats.elapsed}s |`)
+  lines.push(`| Avg latency | ${stats.avgLat}ms |`)
+  lines.push(`| Max latency | ${stats.maxLat}ms |`)
+  lines.push(`| Parse rate | ${stats.parseRate}% (${stats.parseFails} fails) |`)
+  if (stats.agreementPct != null) {
+    lines.push(`| Agreement | ${stats.matches} / ${stats.withExpected} (${stats.agreementPct}%) |`)
+  }
+  lines.push('')
+  lines.push('## Results')
+  lines.push('')
+  lines.push(`| Trial | NCT | Verdict | Expected | Match | Latency | Reason / Raw |`)
+  lines.push(`|---|---|---|---|---|---|---|`)
+  for (const r of results) {
+    if (r.status !== 'DONE') continue
+    const v = r.verdict || 'PARSE_FAIL'
+    const exp = r.trial.expected || '—'
+    const match = r.trial.expected ? (r.verdict === r.trial.expected ? '✓' : '✗') : ''
+    const latency = r.latencyMs != null ? `${Math.round(r.latencyMs)}ms` : '—'
+    const reasonOrRaw = r.reason && r.reason !== '(no reason)' ? r.reason : `raw: ${r.raw || '—'}`
+    lines.push(`| ${truncate(r.trial.title || r.trial.briefTitle || r.trial.nctId, 80)} | ${escape(r.trial.nctId || '')} | ${v} | ${exp} | ${match} | ${latency} | ${truncate(reasonOrRaw, 140)} |`)
+  }
+  lines.push('')
+  lines.push('<details>')
+  lines.push('<summary>Prompt template used</summary>')
+  lines.push('')
+  lines.push('```')
+  lines.push(promptTemplate)
+  lines.push('```')
+  lines.push('</details>')
+  return lines.join('\n')
+}
+
 function ResultsTable({ rows }) {
   return (
     <table className="w-full border-collapse text-[13px]">

From 379ff3b0d96ec0a8d90d12f454235092305400ca Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 21:55:06 -0400
Subject: [PATCH 06/31] tweak(phase-3): default prompt biases toward POSSIBLE
 over LIKELY

Initial run had 67% agreement with two LIKELY-vs-POSSIBLE drift errors
on subtype-gated trials (Sacituzumab metastatic TNBC, HER2+ radiation
boost). Both required subtype the user hadn't confirmed; the model
called them LIKELY because "breast cancer" matched the broad indication.

Update the default prompt to make the verdict ladder explicit:
- LIKELY = slam-dunk, every gating criterion confirmed
- POSSIBLE = broad condition matches, key criteria unknown
- UNLIKELY = clear mismatch (wrong condition / sex / age / comorbidity)

Also tightens the response format requirement (one line, pipe-separated,
reason required) since 2/6 outputs in the first run dropped the reason.
"raw: LIKELY" with no reason isn't useful for a user reading the fit
meter; tightening here propagates to the eventual in-app integration.
---
 src/components/ClassificationHarness.jsx | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 9ebaee3..cdd924e 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -300,14 +300,16 @@ Exclusion Criteria:
   },
 ]
 
-const DEFAULT_PROMPT = `You are evaluating clinical trial fit.
+const DEFAULT_PROMPT = `You are evaluating clinical trial fit. Be conservative. LIKELY is reserved for slam-dunk matches where every gating criterion in the eligibility is clearly met by the user description. If the trial requires a specific subtype, biomarker, stage, mutation, prior treatment, or measurement that the user has NOT explicitly confirmed, the verdict is POSSIBLE — even if the broad condition matches. UNLIKELY is for clear mismatches (wrong condition, wrong sex, wrong age range, excluded comorbidity).
 
 User: {{user}}
 Trial title: {{title}}
 Eligibility (excerpt): {{eligibility}}
 
-Reply on one line, exactly: VERDICT | one-sentence reason
-where VERDICT is LIKELY, POSSIBLE, or UNLIKELY.`
+Reply on EXACTLY one line in this format:
+VERDICT | one-sentence reason
+
+VERDICT must be LIKELY, POSSIBLE, or UNLIKELY. The reason is required.`
 
 const DEFAULT_USER_DESC = "I'm 58 years old with breast cancer in Boston"
 

From c66fc36bc3f49d3b6ddbaf7c9b2a8c6aa4202288 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 21:58:10 -0400
Subject: [PATCH 07/31] tweak(phase-3): rewrite prompt with ordered rules +
 no-assume guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Run 2 (verbose-bias prompt) dropped to 50% agreement with three new
failure modes:

1. Model hallucinated user attributes ("user has prior radiation",
   "user is HR-positive") that weren't in the description, then
   reasoned about them, driving false UNLIKELY verdicts.
2. Model misread exclusion semantics on Lymphedema Surveillance —
   said the user "does not meet the exclusion criteria" as a NEGATIVE,
   producing a catastrophic false UNLIKELY (the user-doesn't-see-it
   failure mode the Handoff specifically calls out).
3. UNLIKELY filter went leaky — melanoma, NSCLC, HFpEF + T2DM all
   moved to POSSIBLE on "user's age is within the range" alone,
   ignoring the actual disease mismatch.

New prompt structure:
- Explicit "user description is ONLY what you know — do not assume"
  guard against hallucination
- Five ordered rules: condition mismatch → UNLIKELY (highest
  priority), demographic exclusion → UNLIKELY, missing-info →
  POSSIBLE, all-met → LIKELY, default POSSIBLE
- Explicit "UNLIKELY ≠ many requirements; UNLIKELY = user is clearly
  disqualified" to fix the exclusion-semantics confusion

Trade-off: slightly longer than the original baseline prompt, but
more declarative than the verbose run-2 prompt. Latency should sit
between the two.
---
 src/components/ClassificationHarness.jsx | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index cdd924e..4e662d2 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -300,16 +300,24 @@ Exclusion Criteria:
   },
 ]
 
-const DEFAULT_PROMPT = `You are evaluating clinical trial fit. Be conservative. LIKELY is reserved for slam-dunk matches where every gating criterion in the eligibility is clearly met by the user description. If the trial requires a specific subtype, biomarker, stage, mutation, prior treatment, or measurement that the user has NOT explicitly confirmed, the verdict is POSSIBLE — even if the broad condition matches. UNLIKELY is for clear mismatches (wrong condition, wrong sex, wrong age range, excluded comorbidity).
+const DEFAULT_PROMPT = `Decide whether to show this clinical trial to the user. The user description is the ONLY thing you know about them — do not assume any subtype, biomarker, stage, treatment history, or comorbidity that the user did not state.
+
+Apply these rules in order:
+1. If the trial studies a condition the user does NOT have → UNLIKELY
+2. If the trial's stated demographic requirements (sex, age range) exclude the user → UNLIKELY
+3. If the trial requires a subtype, biomarker, stage, mutation, or prior treatment the user did not mention → POSSIBLE (the user might still qualify; we just don't know)
+4. If the user clearly meets every stated requirement → LIKELY
+5. Otherwise → POSSIBLE
+
+Do NOT mark UNLIKELY just because the trial has many requirements. UNLIKELY means the user is clearly disqualified, not that information is missing.
 
 User: {{user}}
 Trial title: {{title}}
-Eligibility (excerpt): {{eligibility}}
+Eligibility: {{eligibility}}
 
-Reply on EXACTLY one line in this format:
+Reply on EXACTLY one line:
 VERDICT | one-sentence reason
-
-VERDICT must be LIKELY, POSSIBLE, or UNLIKELY. The reason is required.`
+VERDICT is LIKELY, POSSIBLE, or UNLIKELY.`
 
 const DEFAULT_USER_DESC = "I'm 58 years old with breast cancer in Boston"
 

From 8244252a282232af6c6e24f8ff1e72a06df44b02 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:01:23 -0400
Subject: [PATCH 08/31] tweak(phase-3): few-shot prompt to fix literal-VERDICT
 bug + collapse rules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Run 3 (rules-list) hit 54% with three failure modes:

1. The "VERDICT | reason" instruction made Gemma emit the literal word
   "VERDICT" as a placeholder name, dropping the actual verdict from
   most outputs. raw: "VERDICT:" with no verdict, or "VERDICT: LIKELY"
   with no reason.
2. Five ordered rules were too many for Gemma 2B — it appears to
   default to "user meets all requirements -> LIKELY" and stop
   reasoning. Pembrolizumab Melanoma flipped to LIKELY, four
   subtype-gated breast cancer trials flipped to LIKELY.
3. Long eligibility (LONG-03 NSCLC, LONG-04 HFpEF) degraded the
   wrong-condition signal — both flipped POSSIBLE/LIKELY despite the
   shorter NSCLC version correctly classifying UNLIKELY.

New strategy:
- Drop the literal "VERDICT" placeholder. Use "<LABEL> | <reason>"
  with angle brackets so it reads as a template, not a literal.
- Drop the numbered rules. Use 3 short bullet definitions instead.
- Add 3 worked examples covering POSSIBLE / UNLIKELY / LIKELY for a
  consistent patient profile (62yo man, prostate cancer). Examples
  demonstrate the no-assume rule, the wrong-condition rule, and the
  positive-match rule by behavior, not by abstract instruction.
- "Now classify:" cue separates examples from the actual task,
  reducing the chance Gemma copies the example patient.

Trade-off: prompt is longer, latency may rise modestly. If accuracy
holds, that's a fair price.
---
 src/components/ClassificationHarness.jsx | 36 +++++++++++++++---------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 4e662d2..872aced 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -300,24 +300,34 @@ Exclusion Criteria:
   },
 ]
 
-const DEFAULT_PROMPT = `Decide whether to show this clinical trial to the user. The user description is the ONLY thing you know about them — do not assume any subtype, biomarker, stage, treatment history, or comorbidity that the user did not state.
+const DEFAULT_PROMPT = `You decide whether to show a clinical trial to a patient based on the patient's short self-description. Be conservative: the patient only told you what they explicitly stated — do not assume HER2/HR/BRCA status, stage, prior treatment, or other facts.
 
-Apply these rules in order:
-1. If the trial studies a condition the user does NOT have → UNLIKELY
-2. If the trial's stated demographic requirements (sex, age range) exclude the user → UNLIKELY
-3. If the trial requires a subtype, biomarker, stage, mutation, or prior treatment the user did not mention → POSSIBLE (the user might still qualify; we just don't know)
-4. If the user clearly meets every stated requirement → LIKELY
-5. Otherwise → POSSIBLE
+Use these labels:
+- LIKELY: the trial's primary indication matches the patient's condition AND the patient meets every demographic requirement.
+- POSSIBLE: the trial's indication matches but at least one criterion (subtype, biomarker, stage, mutation, prior therapy) is unstated by the patient.
+- UNLIKELY: the trial is for a different disease, or the patient is the wrong sex/age.
 
-Do NOT mark UNLIKELY just because the trial has many requirements. UNLIKELY means the user is clearly disqualified, not that information is missing.
+Examples:
 
-User: {{user}}
-Trial title: {{title}}
+Patient: "62-year-old man with prostate cancer"
+Trial: Phase III Olaparib in BRCA-Mutated Metastatic Prostate Cancer (Eligibility: men with BRCA mutation, metastatic prostate cancer, prior androgen therapy)
+Answer: POSSIBLE | matches prostate cancer in a man, but BRCA status not stated
+
+Patient: "62-year-old man with prostate cancer"
+Trial: Trastuzumab in HER2+ Breast Cancer (Eligibility: adult women with HER2+ breast cancer)
+Answer: UNLIKELY | trial is for breast cancer in women; patient has prostate cancer
+
+Patient: "62-year-old man with prostate cancer"
+Trial: Exercise Intervention for Prostate Cancer Survivors (Eligibility: adult men with any-stage prostate cancer history)
+Answer: LIKELY | adult man with prostate cancer history matches the inclusion criteria
+
+Now classify:
+
+Patient: {{user}}
+Trial: {{title}}
 Eligibility: {{eligibility}}
 
-Reply on EXACTLY one line:
-VERDICT | one-sentence reason
-VERDICT is LIKELY, POSSIBLE, or UNLIKELY.`
+Answer (one line, format exactly "<LABEL> | <one short reason>"):`
 
 const DEFAULT_USER_DESC = "I'm 58 years old with breast cancer in Boston"
 

From d41cd7889eb93072da54650bd3c19e7187415b4a Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:05:35 -0400
Subject: [PATCH 09/31] tweak(phase-3): collapse to binary LIKELY/UNLIKELY
 classification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Run 4 (3-class few-shot) hit 67%, but mapping LIKELY+POSSIBLE -> SHOW
already gave 88% on the same outputs. The LIKELY-vs-POSSIBLE judgment
is the source of most disagreement noise and isn't actually a product
question — the user-facing decision is binary (show this trial or
collapse it under "less likely matches"). POSSIBLE adds cognitive
load to a panicking patient without buying useful signal.

Pivot the default prompt to binary:
- LIKELY: trial studies the patient's condition AND nothing clearly
  excludes them. Worth showing.
- UNLIKELY: different disease, OR clearly wrong sex/age/population.
  Not worth showing.

Explicit "be inclusive on LIKELY" instruction: missing subtype/
biomarker/stage info still counts as LIKELY since the patient or
their doctor can verify. UNLIKELY is reserved for clear
disqualification on something the patient DID state.

Three worked examples: LIKELY (matched indication w/ unstated
biomarker), UNLIKELY (wrong sex), UNLIKELY (wrong condition entirely).

Parser side: POSSIBLE is still recognized for backward compatibility
but normalized to LIKELY. Fixture trials keep their 3-class expected
values (informationally rich) but the agreement check uses binary
mapping (POSSIBLE expected counts as LIKELY).

Markdown report uses the same binary mapping in the match column.
---
 src/components/ClassificationHarness.jsx | 52 ++++++++++++++++--------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 872aced..4206858 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -300,26 +300,26 @@ Exclusion Criteria:
   },
 ]
 
-const DEFAULT_PROMPT = `You decide whether to show a clinical trial to a patient based on the patient's short self-description. Be conservative: the patient only told you what they explicitly stated — do not assume HER2/HR/BRCA status, stage, prior treatment, or other facts.
+const DEFAULT_PROMPT = `You decide whether a clinical trial is worth showing to a patient. Output one of two labels:
 
-Use these labels:
-- LIKELY: the trial's primary indication matches the patient's condition AND the patient meets every demographic requirement.
-- POSSIBLE: the trial's indication matches but at least one criterion (subtype, biomarker, stage, mutation, prior therapy) is unstated by the patient.
-- UNLIKELY: the trial is for a different disease, or the patient is the wrong sex/age.
+- LIKELY: the trial studies the patient's condition AND nothing in the eligibility clearly excludes the patient based on what they stated. Worth showing.
+- UNLIKELY: the trial studies a different disease, OR the patient is clearly the wrong sex / age / population. Not worth showing.
+
+Be inclusive on LIKELY: if the trial requires a subtype, biomarker, stage, or prior treatment the patient did NOT mention, still call it LIKELY — the patient or their doctor can verify. Only use UNLIKELY when the patient is clearly disqualified by something they DID state.
 
 Examples:
 
 Patient: "62-year-old man with prostate cancer"
-Trial: Phase III Olaparib in BRCA-Mutated Metastatic Prostate Cancer (Eligibility: men with BRCA mutation, metastatic prostate cancer, prior androgen therapy)
-Answer: POSSIBLE | matches prostate cancer in a man, but BRCA status not stated
+Trial: Olaparib in BRCA-Mutated Metastatic Prostate Cancer (Eligibility: men with BRCA mutation, metastatic prostate cancer)
+Answer: LIKELY | matches prostate cancer in a man; BRCA status can be verified
 
 Patient: "62-year-old man with prostate cancer"
 Trial: Trastuzumab in HER2+ Breast Cancer (Eligibility: adult women with HER2+ breast cancer)
 Answer: UNLIKELY | trial is for breast cancer in women; patient has prostate cancer
 
 Patient: "62-year-old man with prostate cancer"
-Trial: Exercise Intervention for Prostate Cancer Survivors (Eligibility: adult men with any-stage prostate cancer history)
-Answer: LIKELY | adult man with prostate cancer history matches the inclusion criteria
+Trial: GLP-1 Agonist for Type 2 Diabetes (Eligibility: adults 18-75 with T2DM)
+Answer: UNLIKELY | trial is for type 2 diabetes; patient has prostate cancer
 
 Now classify:
 
@@ -331,20 +331,36 @@ Answer (one line, format exactly "<LABEL> | <one short reason>"):`
 
 const DEFAULT_USER_DESC = "I'm 58 years old with breast cancer in Boston"
 
+// Parser still accepts POSSIBLE in case the model emits it (older prompts,
+// instruction drift) — POSSIBLE is normalized to LIKELY since the binary
+// product question is "show or hide".
 function parseVerdict(raw) {
   if (!raw || typeof raw !== 'string') return { verdict: 'PARSE_FAIL', reason: '(empty output)' }
   const m = raw.match(/^\s*(LIKELY|POSSIBLE|UNLIKELY)\s*[|:\-—]\s*(.+?)\s*$/im)
-  if (m) return { verdict: m[1].toUpperCase(), reason: m[2].trim() }
+  if (m) {
+    const v = m[1].toUpperCase()
+    return { verdict: v === 'POSSIBLE' ? 'LIKELY' : v, reason: m[2].trim() }
+  }
   const w = raw.match(/\b(LIKELY|POSSIBLE|UNLIKELY)\b/i)
   if (w) {
+    const v = w[1].toUpperCase()
     return {
-      verdict: w[1].toUpperCase(),
+      verdict: v === 'POSSIBLE' ? 'LIKELY' : v,
       reason: raw.replace(w[0], '').replace(/^[\s|:\-—]+/, '').trim() || '(no reason)',
     }
   }
   return { verdict: 'PARSE_FAIL', reason: raw.slice(0, 120) }
 }
 
+// Normalize fixture-side expected values for binary agreement: POSSIBLE
+// counts as LIKELY (both = "show this trial"). Keeps the fixture data
+// informationally rich (3-class) while letting the binary model output
+// be evaluated correctly.
+function expectedBinary(expected) {
+  if (expected === 'POSSIBLE') return 'LIKELY'
+  return expected
+}
+
 const VERDICT_STYLES = {
   LIKELY:     'bg-signal-good-bg text-signal-good',
   POSSIBLE:   'bg-signal-warn-bg text-signal-warn',
@@ -472,7 +488,7 @@ export default function ClassificationHarness() {
   const parseRate = done.length ? Math.round(((done.length - parseFails) / done.length) * 100) : 0
   const elapsed = startT ? ((performance.now() - startT) / 1000).toFixed(1) : '0.0'
   const withExpected = done.filter(r => r.trial.expected)
-  const matches = withExpected.filter(r => r.verdict === r.trial.expected).length
+  const matches = withExpected.filter(r => r.verdict === expectedBinary(r.trial.expected)).length
   const agreementPct = withExpected.length ? Math.round((matches / withExpected.length) * 100) : null
 
   const canRun = status === 'ready' && !running
@@ -687,7 +703,8 @@ function buildMarkdownReport({ userDesc, promptTemplate, eligMax, modelLabel, re
     if (r.status !== 'DONE') continue
     const v = r.verdict || 'PARSE_FAIL'
     const exp = r.trial.expected || '—'
-    const match = r.trial.expected ? (r.verdict === r.trial.expected ? '✓' : '✗') : ''
+    const expBinary = r.trial.expected ? (r.trial.expected === 'POSSIBLE' ? 'LIKELY' : r.trial.expected) : null
+    const match = expBinary ? (r.verdict === expBinary ? '✓' : '✗') : ''
     const latency = r.latencyMs != null ? `${Math.round(r.latencyMs)}ms` : '—'
     const reasonOrRaw = r.reason && r.reason !== '(no reason)' ? r.reason : `raw: ${r.raw || '—'}`
     lines.push(`| ${truncate(r.trial.title || r.trial.briefTitle || r.trial.nctId, 80)} | ${escape(r.trial.nctId || '')} | ${v} | ${exp} | ${match} | ${latency} | ${truncate(reasonOrRaw, 140)} |`)
@@ -728,9 +745,12 @@ function ResultsTable({ rows }) {
       <tbody>
         {rows.map((r, i) => {
           const verdict = r.status === 'PENDING' ? 'PENDING' : (r.verdict || 'PARSE_FAIL')
-          const expected = r.trial.expected || '—'
-          const match = r.verdict && r.trial.expected
-            ? (r.verdict === r.trial.expected ? '✓' : '✗')
+          const rawExpected = r.trial.expected
+          // Display: keep original 3-class label so the fixture still reads
+          // informationally; ✓/✗ uses binary mapping (POSSIBLE counts as LIKELY).
+          const expected = rawExpected || '—'
+          const match = r.verdict && rawExpected
+            ? (r.verdict === expectedBinary(rawExpected) ? '✓' : '✗')
             : ''
           const matchColor = match === '✓' ? 'text-signal-good' : match === '✗' ? 'text-signal-bad' : 'text-parchment-500'
           return (

From 15d05b476e610469b4cb84c3481c1ac7436fa050 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:12:15 -0400
Subject: [PATCH 10/31] feat(phase-3): register Llama 3.2 3B Instruct as a
 model option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gemma 2 2B has plateaued at 79% binary agreement on the harness with
recurring hallucination of patient attributes ("user has AFib") that
five prompt iterations didn't fix — looks like a structural limit of
the small instruct model.

Llama 3.2 3B Instruct is the next sensible rung: ~50% more
parameters, much better instruction following per published
benchmarks, and well-tested in the WebLLM ecosystem. q4f16_1 quant
keeps the download to ~1.9 GB (vs 1.3 GB for current Gemma).

Switch via ?model=llama32 in any IRIS URL — including the harness:
http://localhost:5173/iris/?test=classify&model=llama32

First load downloads ~1.9 GB (cached after). Re-run the harness with
the same prompt to see if accuracy improves on the same fixture.
---
 src/utils/nlpModels.js | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/utils/nlpModels.js b/src/utils/nlpModels.js
index 67f9eee..9ad2f16 100644
--- a/src/utils/nlpModels.js
+++ b/src/utils/nlpModels.js
@@ -17,6 +17,14 @@ export const NLP_MODELS = {
     // <think>…</think> block before the answer, which breaks JSON parsing.
     isThinking: true,
   },
+  llama32: {
+    id: 'Llama-3.2-3B-Instruct-q4f16_1-MLC',
+    label: 'Llama 3.2 3B',
+    // q4f16_1 instead of q4f32_1: smaller (~1.9 GB vs ~2.4 GB) and faster on
+    // most GPUs with effectively no quality difference for instruction tasks.
+    sizeLabel: '~1.9 GB',
+    isThinking: false,
+  },
 }
 
 export const DEFAULT_MODEL_KEY = 'gemma'

From c0051a3e71c19e15993e1afeca84b33973ea37f9 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:17:04 -0400
Subject: [PATCH 11/31] fix(phase-3): register Qwen2.5-1.5B + diversify
 few-shot examples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Llama 3.2 3B run was catastrophic — 42% agreement, marked nearly
everything UNLIKELY. Reasons revealed the failure mode: the model
pattern-matched on the few-shot examples (all "62yo man with prostate
cancer") and emitted fragments of the example patient as if they
applied to the actual patient ("patient is a man", "trial is for
atrial fibrillation in men; patient is a woman"). Small instruct
models are notorious for this kind of example contamination.

Two changes:

1. Register Qwen2.5-1.5B-Instruct (~900 MB, q4f16_1) as ?model=qwen25.
   Smaller than Gemma 2B but the Qwen2.5 series is known for strong
   instruction-following per parameter. Worth a shot before
   concluding small models can't do this.

2. Rewrite the few-shot examples (three → four) to use FOUR different patients
   (45yo woman ovarian, 70yo man T2D, 8yo child asthma, 55yo man
   hypertension) covering LIKELY/UNLIKELY/UNLIKELY/LIKELY. Explicit
   "each example uses a DIFFERENT patient — focus on reasoning, not
   patient details" header. Removes the gravity well that made Llama
   anchor.

Run any model against the same fixture to compare. The example fix
should help all models, not just Qwen.
---
 src/components/ClassificationHarness.jsx | 24 ++++++++++++++----------
 src/utils/nlpModels.js                   |  6 ++++++
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 4206858..3a75208 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -307,19 +307,23 @@ const DEFAULT_PROMPT = `You decide whether a clinical trial is worth showing to
 
 Be inclusive on LIKELY: if the trial requires a subtype, biomarker, stage, or prior treatment the patient did NOT mention, still call it LIKELY — the patient or their doctor can verify. Only use UNLIKELY when the patient is clearly disqualified by something they DID state.
 
-Examples:
+Examples (note: each example uses a DIFFERENT patient — focus on the reasoning, not the patient details):
 
-Patient: "62-year-old man with prostate cancer"
-Trial: Olaparib in BRCA-Mutated Metastatic Prostate Cancer (Eligibility: men with BRCA mutation, metastatic prostate cancer)
-Answer: LIKELY | matches prostate cancer in a man; BRCA status can be verified
+Patient: "45-year-old woman with ovarian cancer"
+Trial: PARP Inhibitor in BRCA-Mutated Ovarian Cancer (Eligibility: women with ovarian cancer and BRCA mutation)
+Answer: LIKELY | matches ovarian cancer in a woman; BRCA status can be verified
 
-Patient: "62-year-old man with prostate cancer"
-Trial: Trastuzumab in HER2+ Breast Cancer (Eligibility: adult women with HER2+ breast cancer)
-Answer: UNLIKELY | trial is for breast cancer in women; patient has prostate cancer
+Patient: "70-year-old man with type 2 diabetes"
+Trial: Tamoxifen in Premenopausal Breast Cancer (Eligibility: premenopausal women with breast cancer)
+Answer: UNLIKELY | trial is for breast cancer in women; patient has diabetes
 
-Patient: "62-year-old man with prostate cancer"
-Trial: GLP-1 Agonist for Type 2 Diabetes (Eligibility: adults 18-75 with T2DM)
-Answer: UNLIKELY | trial is for type 2 diabetes; patient has prostate cancer
+Patient: "8-year-old child with asthma"
+Trial: Adult Anti-Inflammatory for Asthma (Eligibility: adults 18+ with persistent asthma)
+Answer: UNLIKELY | trial is for adults; patient is a child
+
+Patient: "55-year-old man with hypertension"
+Trial: Yoga Intervention for Adults with Chronic Conditions (Eligibility: adults 40-75 with any chronic condition)
+Answer: LIKELY | adult with chronic condition matches the broad inclusion
 
 Now classify:
 
diff --git a/src/utils/nlpModels.js b/src/utils/nlpModels.js
index 9ad2f16..dea5712 100644
--- a/src/utils/nlpModels.js
+++ b/src/utils/nlpModels.js
@@ -25,6 +25,12 @@ export const NLP_MODELS = {
     sizeLabel: '~1.9 GB',
     isThinking: false,
   },
+  qwen25: {
+    id: 'Qwen2.5-1.5B-Instruct-q4f16_1-MLC',
+    label: 'Qwen2.5 1.5B',
+    sizeLabel: '~900 MB',
+    isThinking: false,
+  },
 }
 
 export const DEFAULT_MODEL_KEY = 'gemma'

From b0f088f6d52038bbdf1941db16abc4a106d1be18 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:27:50 -0400
Subject: [PATCH 12/31] feat(phase-3): multilingual preset switcher +
 custom-model stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups for Phase 3 validation:

1. ClassificationHarness: dropdown above the user description with 9
   patient presets. Same 58yo woman with breast cancer in Boston in
   English, Spanish, Mandarin, Arabic, Portuguese, French, plus two
   English variants (terse "58F, BC, Boston" and a more detailed
   version) and a more-detailed Spanish. RTL direction auto-detected
   for Arabic. Lets us validate Qwen2.5-1.5B's multilingual handling
   without manually pasting translations. Selecting a preset replaces
   the description; manual edits show as "— custom —".

2. nlp.worker.js: commented-out appConfig stub for serving a custom
   model via prebuiltAppConfig extension. Walk-through comment points
   at the LoRA training doc in the vault. Drop-in: uncomment the three
   marked spots in the worker (the prebuiltAppConfig import, the
   appConfig block, and the appConfig engine option), add a matching
   entry in nlpModels.js, swap via ?model=iris-finetune. No code path
   is active until those spots are uncommented.
---
 src/components/ClassificationHarness.jsx | 41 ++++++++++++++++++++++--
 src/workers/nlp.worker.js                | 24 +++++++++++++-
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 3a75208..50897a3 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -335,6 +335,22 @@ Answer (one line, format exactly "<LABEL> | <one short reason>"):`
 
 const DEFAULT_USER_DESC = "I'm 58 years old with breast cancer in Boston"
 
+// Patient description presets for multilingual + edge-case validation. Same
+// 58yo woman with breast cancer in Boston, expressed in different languages
+// and registers (formal, terse, etc.) so we can stress-test the model's
+// understanding without changing the underlying clinical signal.
+const USER_PRESETS = [
+  { id: 'en',     label: 'English',                  text: "I'm 58 years old with breast cancer in Boston" },
+  { id: 'en-2',   label: 'English (more detail)',    text: "58-year-old woman in Boston, postmenopausal, recently diagnosed with breast cancer, looking for post-chemo treatment options" },
+  { id: 'es',     label: 'Spanish (Español)',        text: 'Tengo 58 años, vivo en Boston y tengo cáncer de mama' },
+  { id: 'es-2',   label: 'Spanish (more detail)',    text: 'Soy mujer de 58 años, posmenopáusica, vivo en Boston. Me diagnosticaron cáncer de mama y busco opciones de tratamiento después de quimioterapia.' },
+  { id: 'zh',     label: 'Mandarin (中文)',          text: '我58岁，住在波士顿，患有乳腺癌' },
+  { id: 'ar',     label: 'Arabic (العربية)',        text: 'أنا امرأة عمري 58 عامًا أعيش في بوسطن ومصابة بسرطان الثدي' },
+  { id: 'pt',     label: 'Portuguese (Português)',   text: 'Tenho 58 anos, moro em Boston e tenho câncer de mama' },
+  { id: 'fr',     label: 'French (Français)',        text: "J'ai 58 ans, je vis à Boston et j'ai un cancer du sein" },
+  { id: 'terse',  label: 'Terse / fragments',        text: '58F, BC, Boston' },
+]
+
 // Parser still accepts POSSIBLE in case the model emits it (older prompts,
 // instruction drift) — POSSIBLE is normalized to LIKELY since the binary
 // product question is "show or hide".
@@ -545,13 +561,32 @@ export default function ClassificationHarness() {
         <h2 className="font-serif font-semibold text-base mb-3">Inputs</h2>
         <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
           <div>
-            <label className="font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-iris-700 mb-1.5 block">
-              User description
-            </label>
+            <div className="flex items-center justify-between mb-1.5 gap-2 flex-wrap">
+              <label className="font-mono text-[10px] font-semibold uppercase tracking-[0.08em] text-iris-700">
+                User description
+              </label>
+              <select
+                value={USER_PRESETS.find(p => p.text === userDesc)?.id ?? 'custom'}
+                onChange={e => {
+                  const preset = USER_PRESETS.find(p => p.id === e.target.value)
+                  if (preset) setUserDesc(preset.text)
+                }}
+                className="text-[11px] px-2 py-1 border border-parchment-300 rounded bg-white text-parchment-700"
+                title="Swap the patient description to test multilingual handling and edge cases"
+              >
+                {!USER_PRESETS.some(p => p.text === userDesc) && (
+                  <option value="custom">— custom —</option>
+                )}
+                {USER_PRESETS.map(p => (
+                  <option key={p.id} value={p.id}>{p.label}</option>
+                ))}
+              </select>
+            </div>
             <textarea
               rows={3}
               value={userDesc}
               onChange={e => setUserDesc(e.target.value)}
+              dir={userDesc.match(/[؀-ۿ]/) ? 'rtl' : 'ltr'}
               className="w-full text-[13px] px-3 py-2.5 border border-parchment-300 rounded-lg bg-parchment-50 text-parchment-900 resize-y focus:outline-none focus:ring-2 focus:ring-iris-500"
             />
           </div>
diff --git a/src/workers/nlp.worker.js b/src/workers/nlp.worker.js
index c8385b7..1541580 100644
--- a/src/workers/nlp.worker.js
+++ b/src/workers/nlp.worker.js
@@ -30,18 +30,40 @@ self.onmessage = async (event) => {
     loading = true
     isThinkingModel = Boolean(isThinking)
     try {
-      const { CreateMLCEngine } = await import('@mlc-ai/web-llm')
+      const { CreateMLCEngine /* , prebuiltAppConfig */ } = await import('@mlc-ai/web-llm')
       // CreateMLCEngine signature: (modelId, engineConfig, chatOpts).
       // chatOpts is per-model config override (e.g. sliding_window_size:-1
       // for gemma3, whose prebuilt record sets context_window_size:4096
       // alongside sliding_window_size:512 — the engine rejects both being
       // positive).
+      //
+      // ─── Custom model wiring (stub) ──────────────────────────────────
+      // To serve a fine-tuned model (e.g. a domain-specific LoRA merged
+      // back into Qwen2.5-1.5B), uncomment the appConfig block below and
+      // add a matching entry to nlpModels.js with model_id matching the
+      // one here. The model and model_lib URLs must be CORS-accessible —
+      // HuggingFace Hub serves MLC artifacts with the right headers; a
+      // self-hosted bucket needs explicit CORS config. See
+      // ~/Documents/Github/sevry_vault/Work/ClaudeCode/iris/lora-training-process.md
+      // for the end-to-end LoRA → MLC → WebLLM pipeline.
+      //
+      // const appConfig = {
+      //   model_list: [
+      //     ...prebuiltAppConfig.model_list,
+      //     {
+      //       model: 'https://huggingface.co/USER/iris-classifier-q4f16_1-MLC/resolve/main/',
+      //       model_id: 'iris-classifier-q4f16_1-MLC',
+      //       model_lib: 'https://huggingface.co/USER/iris-classifier-q4f16_1-MLC/resolve/main/iris-classifier-q4f16_1-ctx4k_cs1k-webgpu.wasm',
+      //     },
+      //   ],
+      // }
       engine = await CreateMLCEngine(
         modelId ?? DEFAULT_MODEL_ID,
         {
           initProgressCallback: (progress) => {
             self.postMessage({ type: 'progress', progress })
           },
+          // appConfig, // ← uncomment alongside the block above
         },
         chatOpts ?? undefined,
       )

From 2ce00be836fb272444dc449275bdef0b6192b606 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:34:43 -0400
Subject: [PATCH 13/31] feat(phase-3): translate-then-classify pipeline as
 harness toggle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Spanish run on Qwen2.5-1.5B dropped 16 points vs English (67% vs 83%)
because the model stops anchoring on the patient's stated condition
when reasoning across languages — it confabulates "matches the
patient's condition" for trials about prostate cancer, T2D, NSCLC.

Translation is the model's strength (well-defined task) and runs ONCE
per batch (the patient description), not per trial. So the cost is
~1s amortized across all N classifications.

Adds a "translate to English first" checkbox to the harness controls.
When enabled:
1. Before the classify queue starts, run a translation prompt on the
   user description.
2. Show the result in an iris-tinted callout above the stats.
3. Use the translated text for all per-trial classifications.

If translation fails the run aborts with an alert (rare but possible).

This is harness-only for now — the eventual app integration would
trigger translation only when detectInputLanguage(userDescription)
returns non-English. Path A (gate on en-only) is still on the table
if translation doesn't recover enough accuracy.
---
 src/components/ClassificationHarness.jsx | 46 +++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 50897a3..19e6209 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -402,6 +402,8 @@ export default function ClassificationHarness() {
   const [trialsJson, setTrialsJson] = useState(JSON.stringify(SAMPLE_TRIALS, null, 2))
   const [concurrency, setConcurrency] = useState(3)
   const [eligMax, setEligMax] = useState(1500)
+  const [translateFirst, setTranslateFirst] = useState(false)
+  const [translatedDesc, setTranslatedDesc] = useState(null)
   const [results, setResults] = useState([])
   const [running, setRunning] = useState(false)
   const [startT, setStartT] = useState(0)
@@ -433,6 +435,28 @@ export default function ClassificationHarness() {
     setStartT(performance.now())
     const initial = trials.map(trial => ({ trial, status: 'PENDING' }))
     setResults(initial)
+    setTranslatedDesc(null)
+
+    // Translate user description to English once before classification, so the
+    // model anchors on a single language at inference time. Runs only once per
+    // batch — amortized cost across all N trials.
+    let effectiveUserDesc = userDesc
+    if (translateFirst) {
+      const translatePrompt = `Translate the following patient description into clear, clinical English. Preserve all medical and demographic facts (age, sex, condition, treatments, location). Do not add or remove information. Output ONLY the English translation, nothing else.
+
+Patient description: ${userDesc}
+
+English translation:`
+      try {
+        const { raw } = await classifyOne(translatePrompt)
+        effectiveUserDesc = (raw || '').trim().replace(/^["']|["']$/g, '')
+        setTranslatedDesc(effectiveUserDesc)
+      } catch (e) {
+        alert('Translation failed: ' + (e?.message ?? 'unknown error'))
+        setRunning(false)
+        return
+      }
+    }
 
     const queue = trials.map((trial, idx) => ({ idx, trial }))
     const workersN = Math.min(concurrency, trials.length)
@@ -442,7 +466,7 @@ export default function ClassificationHarness() {
         const { idx, trial } = queue.shift()
         const elig = (trial.eligibility || '').slice(0, eligMax)
         const prompt = promptTemplate
-          .replace('{{user}}', userDesc)
+          .replace('{{user}}', effectiveUserDesc)
           .replace('{{title}}', trial.title || trial.briefTitle || '')
           .replace('{{eligibility}}', elig)
         try {
@@ -639,6 +663,19 @@ export default function ClassificationHarness() {
             <span className="font-mono text-[10px] uppercase tracking-[0.04em]">execution</span>
             serial (engine constraint)
           </span>
+          <label
+            className="inline-flex items-center gap-1.5 text-[12px] text-parchment-900 cursor-pointer"
+            title="Translate the patient description to English once before classification. Runs only once per batch — adds ~1s amortized."
+          >
+            <input
+              type="checkbox"
+              checked={translateFirst}
+              onChange={e => setTranslateFirst(e.target.checked)}
+              disabled={running}
+              className="accent-iris-500"
+            />
+            translate to English first
+          </label>
           <label className="inline-flex items-center gap-2 text-[12px] text-parchment-700">
             Eligibility max chars
             <input
@@ -654,6 +691,13 @@ export default function ClassificationHarness() {
           </label>
         </div>
 
+        {translatedDesc && (
+          <div className="mt-3 px-3 py-2.5 bg-iris-50 border border-iris-100 rounded-lg text-[12px] text-parchment-900 leading-relaxed">
+            <span className="font-mono text-[10px] uppercase tracking-[0.08em] text-iris-700 mr-2">translated</span>
+            {translatedDesc}
+          </div>
+        )}
+
         {(running || done.length > 0) && (
           <div className="flex flex-wrap items-center gap-4 font-mono text-[11px] text-parchment-700 mt-3">
             <span><strong className="text-parchment-950">{done.length} / {results.length}</strong> done</span>

From 6a9948fd028de96281c0ad22c2ce5f3b7c9e6860 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:37:30 -0400
Subject: [PATCH 14/31] fix(phase-3): include translate-first state in markdown
 report

Couldn't tell from the previous Spanish run whether the translate-first
toggle was active because the markdown report omitted that state. Add
both the toggle status and the resulting translated text to the
report header so runs are unambiguously reproducible.
---
 src/components/ClassificationHarness.jsx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 19e6209..3d757b2 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -763,6 +763,10 @@ function buildMarkdownReport({ userDesc, promptTemplate, eligMax, modelLabel, re
   lines.push('')
   lines.push(`**Model:** ${modelLabel}`)
   lines.push(`**User description:** ${userDesc}`)
+  lines.push(`**Translate-first:** ${translateFirst ? 'ON' : 'off'}`)
+  if (translatedDesc) {
+    lines.push(`**Translated to:** ${translatedDesc}`)
+  }
   lines.push(`**Eligibility max chars:** ${eligMax}`)
   lines.push('')
   lines.push('## Stats')

From c05ac7464f0c046f20309ff643634dc707904007 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:38:07 -0400
Subject: [PATCH 15/31] fix(phase-3): wire translate-first state through
 markdown report (followup to 6a9948f)

---
 src/components/ClassificationHarness.jsx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 3d757b2..74c0a75 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -507,6 +507,8 @@ English translation:`
   async function copyMarkdown() {
     const md = buildMarkdownReport({
       userDesc,
+      translatedDesc,
+      translateFirst,
       promptTemplate,
       eligMax,
       modelLabel: model.label,
@@ -751,7 +753,7 @@ English translation:`
   )
 }
 
-function buildMarkdownReport({ userDesc, promptTemplate, eligMax, modelLabel, results, stats }) {
+function buildMarkdownReport({ userDesc, translatedDesc, translateFirst, promptTemplate, eligMax, modelLabel, results, stats }) {
   const escape = (s) => String(s ?? '').replace(/\|/g, '\\|').replace(/\n/g, ' ').trim()
   const truncate = (s, n) => {
     const t = escape(s)

From 6f04fb4068e53107bcb73ca16afda19d90ae23d9 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:49:11 -0400
Subject: [PATCH 16/31] feat(phase-3): production-realistic agreement toggle

The headline agreement was being dragged down by stress-test trials
the CT.gov API would never return for the user's stated condition
(melanoma, NSCLC, T2D, HFpEF, prostate, etc. for a breast-cancer
search). Headline 83% / Spanish 71% understated the actual production
quality, which is closer to 94% on trials the user would actually see.

- Tag 9 wrong-condition fixture trials with outOfScope: true
- Add a "production-realistic agreement" checkbox (default ON)
- When ON: agreement % only counts in-scope trials. Out-of-scope trials
  are still classified and shown in the table but are excluded from the
  headline; the agreement note states how many were excluded.
- When OFF: previous all-trials behavior (useful for stress-testing
  prompt changes).
- Markdown report includes the toggle state and exclusion count so
  shared runs are unambiguous.
---
 src/components/ClassificationHarness.jsx | 53 +++++++++++++++++++++---
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 74c0a75..6b8cc36 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -33,12 +33,14 @@ const SAMPLE_TRIALS = [
     title: 'Pembrolizumab in Advanced Non-Small Cell Lung Cancer',
     eligibility: 'Inclusion: Adult. Histologically confirmed advanced NSCLC. PD-L1 expression ≥50%. ECOG 0-1. Exclusion: Active autoimmune disease. Prior immunotherapy.',
     expected: 'UNLIKELY',
+    outOfScope: true, // NSCLC — wouldn't appear in a breast-cancer API search
   },
   {
     nctId: 'NCT05123987',
     title: 'Targeted Therapy in Pediatric Acute Lymphoblastic Leukemia',
     eligibility: 'Inclusion: Pediatric patients aged 2-17 years. Newly diagnosed ALL. Exclusion: Adults. Prior chemotherapy.',
     expected: 'UNLIKELY',
+    outOfScope: true, // Pediatric ALL — wouldn't appear in a breast-cancer API search
   },
 
   // ─── Subtype-gated breast cancer trials — POSSIBLE without confirmed subtype ───
@@ -93,30 +95,35 @@ const SAMPLE_TRIALS = [
     title: 'Pembrolizumab in Advanced Melanoma',
     eligibility: 'Inclusion: Adults with histologically confirmed unresectable Stage III or Stage IV melanoma. ECOG 0-1. No prior systemic therapy for advanced disease. Exclusion: Active autoimmune disease.',
     expected: 'UNLIKELY',
+    outOfScope: true,
   },
   {
     nctId: 'NCT04678901',
     title: 'Apixaban vs. Warfarin in Atrial Fibrillation',
     eligibility: 'Inclusion: Adults ≥18 years with non-valvular atrial fibrillation. CHA2DS2-VASc score ≥2. Exclusion: Mechanical heart valve. Active bleeding.',
     expected: 'UNLIKELY',
+    outOfScope: true,
   },
   {
     nctId: 'NCT04789012',
     title: 'GLP-1 Agonist for Weight Management in Type 2 Diabetes',
     eligibility: 'Inclusion: Adults 18-75 with Type 2 diabetes mellitus. BMI ≥30. HbA1c 7.0-10.0%. Exclusion: Type 1 diabetes. Active malignancy within 5 years. History of pancreatitis.',
     expected: 'UNLIKELY',
+    outOfScope: true,
   },
   {
     nctId: 'NCT04890123',
     title: 'Robotic Prostatectomy Outcomes in Localized Prostate Cancer',
     eligibility: 'Inclusion: Men ≥40 years with biopsy-confirmed clinically localized prostate cancer (T1-T2). Candidate for radical prostatectomy. Exclusion: Prior pelvic surgery or radiation.',
     expected: 'UNLIKELY',
+    outOfScope: true,
   },
   {
     nctId: 'NCT04901234',
     title: 'Pediatric Vaccine Immunogenicity Study',
     eligibility: 'Inclusion: Healthy children aged 6 months to 5 years. Up to date on routine immunizations. Exclusion: Immunocompromised. Recent illness within 14 days.',
     expected: 'UNLIKELY',
+    outOfScope: true,
   },
 
   // ─── Edge cases — should challenge the model ───
@@ -226,6 +233,7 @@ Exclusion Criteria:
   {
     nctId: 'NCT-LONG-03',
     title: 'Phase III Study of Pembrolizumab Plus Chemotherapy versus Chemotherapy Alone for First-Line Treatment of Metastatic Squamous Non-Small Cell Lung Cancer',
+    outOfScope: true,
     eligibility: `Inclusion Criteria:
 
 1. Histologically or cytologically confirmed Stage IV squamous non-small cell lung cancer (NSCLC) per AJCC 8th edition.
@@ -264,6 +272,7 @@ Exclusion Criteria:
   {
     nctId: 'NCT-LONG-04',
     title: 'Multicenter Randomized Trial of Empagliflozin in Patients with Heart Failure with Preserved Ejection Fraction and Type 2 Diabetes',
+    outOfScope: true,
     eligibility: `Inclusion Criteria:
 
 1. Adults aged 40 to 85 years at consent.
@@ -404,6 +413,7 @@ export default function ClassificationHarness() {
   const [eligMax, setEligMax] = useState(1500)
   const [translateFirst, setTranslateFirst] = useState(false)
   const [translatedDesc, setTranslatedDesc] = useState(null)
+  const [productionMode, setProductionMode] = useState(true)
   const [results, setResults] = useState([])
   const [running, setRunning] = useState(false)
   const [startT, setStartT] = useState(0)
@@ -509,6 +519,8 @@ English translation:`
       userDesc,
       translatedDesc,
       translateFirst,
+      productionMode,
+      hiddenCount,
       promptTemplate,
       eligMax,
       modelLabel: model.label,
@@ -533,9 +545,15 @@ English translation:`
   const parseFails = done.filter(r => r.verdict === 'PARSE_FAIL').length
   const parseRate = done.length ? Math.round(((done.length - parseFails) / done.length) * 100) : 0
   const elapsed = startT ? ((performance.now() - startT) / 1000).toFixed(1) : '0.0'
-  const withExpected = done.filter(r => r.trial.expected)
+  // Production mode excludes from the agreement % any trial the CT.gov API
+  // would never return for the user's stated condition (e.g., melanoma trials
+  // in a breast-cancer search); those rows are still classified and shown.
+  // The headline then reflects what users would actually see, not stress tests.
+  const inScope = (r) => !productionMode || !r.trial.outOfScope
+  const withExpected = done.filter(r => r.trial.expected && inScope(r))
   const matches = withExpected.filter(r => r.verdict === expectedBinary(r.trial.expected)).length
   const agreementPct = withExpected.length ? Math.round((matches / withExpected.length) * 100) : null
+  const hiddenCount = done.filter(r => r.trial.outOfScope).length
 
   const canRun = status === 'ready' && !running
 
@@ -678,6 +696,18 @@ English translation:`
             />
             translate to English first
           </label>
+          <label
+            className="inline-flex items-center gap-1.5 text-[12px] text-parchment-900 cursor-pointer"
+            title="Production mode: agreement % only counts trials the CT.gov API would actually return for the patient's condition. Out-of-scope stress-test trials (different cancers, unrelated diseases) are still classified and shown but excluded from the headline."
+          >
+            <input
+              type="checkbox"
+              checked={productionMode}
+              onChange={e => setProductionMode(e.target.checked)}
+              className="accent-iris-500"
+            />
+            production-realistic agreement
+          </label>
           <label className="inline-flex items-center gap-2 text-[12px] text-parchment-700">
             Eligibility max chars
             <input
@@ -734,8 +764,18 @@ English translation:`
         {agreementPct != null && !running && (
           <div className="font-mono text-[11px] text-parchment-700 mt-3 px-3 py-2.5 bg-iris-50 border border-iris-100 rounded-lg leading-relaxed">
             <strong className="text-iris-700">Agreement with expected:</strong>{' '}
-            {matches} / {withExpected.length} ({agreementPct}%) — useful as a smoke test on a labeled
-            held-out set. Below ~80% means the prompt or model needs work before this drives sort order.
+            {matches} / {withExpected.length} ({agreementPct}%)
+            {productionMode && hiddenCount > 0 && (
+              <span className="text-parchment-700">
+                {' '}— {hiddenCount} out-of-scope trial{hiddenCount !== 1 ? 's' : ''} excluded
+                (the CT.gov API would not return them for this condition).
+              </span>
+            )}
+            {!productionMode && (
+              <span className="text-parchment-700">
+                {' '}— includes out-of-scope stress-test trials. Toggle <em>production-realistic</em> for the user-facing number.
+              </span>
+            )}
           </div>
         )}
       </div>
@@ -753,7 +793,7 @@ English translation:`
   )
 }
 
-function buildMarkdownReport({ userDesc, translatedDesc, translateFirst, promptTemplate, eligMax, modelLabel, results, stats }) {
+function buildMarkdownReport({ userDesc, translatedDesc, translateFirst, productionMode, hiddenCount, promptTemplate, eligMax, modelLabel, results, stats }) {
   const escape = (s) => String(s ?? '').replace(/\|/g, '\\|').replace(/\n/g, ' ').trim()
   const truncate = (s, n) => {
     const t = escape(s)
@@ -781,7 +821,10 @@ function buildMarkdownReport({ userDesc, translatedDesc, translateFirst, promptT
   lines.push(`| Max latency | ${stats.maxLat}ms |`)
   lines.push(`| Parse rate | ${stats.parseRate}% (${stats.parseFails} fails) |`)
   if (stats.agreementPct != null) {
-    lines.push(`| Agreement | ${stats.matches} / ${stats.withExpected} (${stats.agreementPct}%) |`)
+    const note = productionMode
+      ? ` — ${hiddenCount || 0} out-of-scope trial(s) excluded`
+      : ' — includes out-of-scope stress-test trials'
+    lines.push(`| Agreement | ${stats.matches} / ${stats.withExpected} (${stats.agreementPct}%)${note} |`)
   }
   lines.push('')
   lines.push('## Results')

From 69f6faea64be1d4c4282b22bc6544d8da0b50b65 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:52:30 -0400
Subject: [PATCH 17/31] feat(phase-3): wire stage-1 classifyAll into
 ResultsList + fit dot in TriageRow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-to-end Phase 3 integration. After search results land, kick off
binary classification for every trial; surface verdicts as a small
fit dot in each row + a live "evaluating fit · X of N" caption in
the toolbar.

Architecture:
- New utils/classifyTrial.js holds DEFAULT_CLASSIFY_PROMPT and
  parseVerdict, shared between the harness and the in-app flow so
  the model sees identical prompts in both contexts.
- ResultsList uses useNLP (idempotent — worker fast-returns ready
  if model already loaded by NL extraction) + useClassifier.
- Classification only fires when:
  1. iris_nlp_enabled === 'true' in localStorage (user previously
     consented to the on-device model — we never auto-load without
     consent), AND
  2. patientDesc is available (from userDescription if NL was used,
     or synthesized from extractedFields), AND
  3. WebGPU is supported.
  Structured-form-only sessions skip classification silently — no
  covert worker init, no model download surprise.
- Pagination: only newly-arrived NCTs get classified (classifiedRef
  Set dedups). Search-param change resets state and cancels any
  in-flight batch.

UI:
- TriageRow grows a FitDot:
  • iris-violet filled circle = LIKELY (with reason in tooltip)
  • parchment-400 hollow ring = UNLIKELY (with reason)
  • shimmer dot = pending
  • nothing rendered for PARSE_FAIL (don't surface model errors to users)
- Toolbar shows "evaluating fit · 7 of 24" while running, then
  "fit evaluated for 24" when done.

What's NOT in this PR (deferred):
- Sort by best fit (Handoff Phase 3 step 5)
- Collapse UNLIKELY under "12 less likely matches" disclosure (step 6)
Both need more UX consideration; the dots + caption are the safe
first slice.
---
 src/components/ResultsList.jsx | 112 ++++++++++++++++++++++++++++++++-
 src/components/TriageRow.jsx   |  36 ++++++++++-
 src/utils/classifyTrial.js     |  77 +++++++++++++++++++++++
 3 files changed, 222 insertions(+), 3 deletions(-)
 create mode 100644 src/utils/classifyTrial.js

diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index e0baa05..ce6bed5 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -1,7 +1,11 @@
-import { useEffect, useMemo, useState } from 'react'
+import { useEffect, useMemo, useRef, useState } from 'react'
 import { useGeocode } from '../hooks/useGeocode'
 import { useClinicalTrials } from '../hooks/useClinicalTrials'
 import { useSimplifier } from '../hooks/useSimplifier'
+import { useNLP } from '../hooks/useNLP'
+import { useClassifier } from '../hooks/useClassifier'
+import { NLP_MODELS } from '../utils/nlpModels'
+import { buildClassifyPrompt, parseVerdict } from '../utils/classifyTrial'
 import ResultCard from './ResultCard'
 import TriageRow from './TriageRow'
 import MobileSheet from './MobileSheet'
@@ -11,6 +15,20 @@ import {
   SUPPORTED_SIMPLIFICATION_LANGUAGES,
 } from '../utils/detectInputLanguage'
 
+const NLP_CONSENT_KEY = 'iris_nlp_enabled'
+
+// Synthesize a patient description from extracted fields for users who came in
+// via the structured form but had previously used NL (so consent exists).
+function patientDescFromFields(fields) {
+  if (!fields) return null
+  const { age, sex, condition, location } = fields
+  const phrase = [
+    age != null && `${age}-year-old`, sex && sex !== 'ALL' && sex.toLowerCase(),
+    condition && `with ${condition}`, location && `in ${location}`,
+  ].filter(Boolean).join(' ')
+  return phrase.length > 0 ? phrase : null
+}
+
 const EAGER_BATCH_SIZE = 5
 const MOBILE_BREAKPOINT_PX = 820
 const LIST_WIDTH_PX = 400
@@ -65,6 +83,42 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
   const [sheetOpen, setSheetOpen] = useState(false)
   const [compareSet, setCompareSet] = useState(() => new Set())
 
+  // ─── Stage-1 classification ───────────────────────────────────────
+  // Only fires when the user previously consented to the on-device model
+  // (iris_nlp_enabled localStorage key, set during NL flow). Structured-
+  // form-only sessions skip classification entirely — no auto-load,
+  // no covert worker initialization. Verdicts surface as fit dots in
+  // TriageRow + a "evaluating fit · X of N" caption in the toolbar.
+  const nlp = useNLP()
+  const { classifyOne } = useClassifier()
+  const [classifications, setClassifications] = useState(new Map())
+  const [classifyProgress, setClassifyProgress] = useState({ done: 0, total: 0 })
+  const classifiedRef = useRef(new Set())
+  const cancelClassifyRef = useRef(null)
+
+  const consented = useMemo(() => {
+    try { return localStorage.getItem(NLP_CONSENT_KEY) === 'true' } catch { return false }
+  }, [])
+  const patientDesc = userDescription || patientDescFromFields(extractedFields)
+  const canClassify = consented && nlp.webGPUSupported && Boolean(patientDesc)
+
+  // Idempotent: worker fast-returns 'ready' if engine already loaded
+  // (e.g. NL extraction loaded it earlier this session).
+  useEffect(() => {
+    if (!canClassify) return
+    if (nlp.status !== 'idle') return
+    const model = NLP_MODELS[modelKey] ?? NLP_MODELS.gemma
+    nlp.load(model.id, { isThinking: model.isThinking, chatOpts: model.chatOpts })
+  }, [canClassify, nlp.status, modelKey, nlp])
+
+  // Reset classification state when the search itself changes.
+  useEffect(() => {
+    classifiedRef.current = new Set()
+    setClassifications(new Map())
+    setClassifyProgress({ done: 0, total: 0 })
+    if (cancelClassifyRef.current) cancelClassifyRef.current()
+  }, [searchParams])
+
   function toggleCompare(nctId) {
     setCompareSet(prev => {
       const next = new Set(prev)
@@ -93,6 +147,49 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
     if (isMobile) setSheetOpen(true)
   }
 
+  // Classify newly-arrived trials. Pagination appends → classify only new
+  // NCTs. Engine-not-loaded check is via nlp.status !== 'ready'.
+  const trialKeyAll = allTrials.map(t => t.nctId).join(',')
+  useEffect(() => {
+    if (!canClassify || nlp.status !== 'ready' || !patientDesc) return
+    const newTrials = allTrials.filter(t => !classifiedRef.current.has(t.nctId))
+    if (newTrials.length === 0) return
+    for (const t of newTrials) classifiedRef.current.add(t.nctId)
+    setClassifyProgress(prev => ({ done: prev.done, total: prev.total + newTrials.length }))
+
+    let cancelled = false
+    // Chain onto the previous cancel handle rather than overwriting it: pagination can
+    // start this batch while an older one still runs, and a reset must stop them all.
+    const cancelEarlier = cancelClassifyRef.current
+    cancelClassifyRef.current = () => { cancelled = true; cancelEarlier?.() }
+    ;(async () => {
+      for (const trial of newTrials) {
+        if (cancelled) return
+        try {
+          const prompt = buildClassifyPrompt(patientDesc, trial)
+          const { raw } = await classifyOne(prompt)
+          const parsed = parseVerdict(raw)
+          if (cancelled) return
+          setClassifications(prev => {
+            const next = new Map(prev)
+            next.set(trial.nctId, { status: 'done', ...parsed, raw })
+            return next
+          })
+        } catch (err) {
+          if (cancelled) return
+          setClassifications(prev => {
+            const next = new Map(prev)
+            next.set(trial.nctId, { status: 'done', verdict: 'PARSE_FAIL', reason: err?.message ?? 'classify error' })
+            return next
+          })
+        } finally {
+          if (!cancelled) setClassifyProgress(prev => ({ ...prev, done: prev.done + 1 }))
+        }
+      }
+    })()
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [canClassify, nlp.status, patientDesc, trialKeyAll])
+
   // Fire when the result set changes — keyed on the first 5 NCT IDs.
   // Using searchParams as the key would fire too early (before data arrives);
   // using allTrials would re-fire on every pagination append.
@@ -184,6 +281,7 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
       <ResultsToolbar
         totalCount={totalCount}
         searchParams={searchParams}
+        classifyProgress={canClassify ? classifyProgress : null}
       />
 
       <div
@@ -204,6 +302,8 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
                   comparing={compareSet.has(trial.nctId)}
                   onToggleCompare={toggleCompare}
                   compareDisabled={compareSet.size >= 3}
+                  classification={canClassify ? classifications.get(trial.nctId) : null}
+                  classifyPending={canClassify && !classifications.has(trial.nctId)}
                 />
               </li>
             ))}
@@ -269,7 +369,7 @@ const SORT_OPTIONS = [
   { id: 'recent',   label: 'Most recent',  disabled: true, title: 'Sort wiring coming in a follow-up' },
 ]
 
-function ResultsToolbar({ totalCount, searchParams }) {
+function ResultsToolbar({ totalCount, searchParams, classifyProgress }) {
   const [sort, setSort] = useState('recent')
 
   const summaryParts = [`${totalCount.toLocaleString()} trial${totalCount !== 1 ? 's' : ''}`]
@@ -290,6 +390,14 @@ function ResultsToolbar({ totalCount, searchParams }) {
             {part}
           </span>
         ))}
+        {classifyProgress && classifyProgress.total > 0 && (
+          <span className="ml-3 text-iris-700">
+            <span className="text-parchment-300 mr-1.5" aria-hidden="true">·</span>
+            {classifyProgress.done < classifyProgress.total
+              ? `evaluating fit · ${classifyProgress.done} of ${classifyProgress.total}`
+              : `fit evaluated for ${classifyProgress.total}`}
+          </span>
+        )}
       </p>
       <div className="hidden sm:flex items-center gap-1" role="group" aria-label="Sort results">
         <span className="font-mono text-[10px] uppercase tracking-[0.08em] text-parchment-700 mr-2">
diff --git a/src/components/TriageRow.jsx b/src/components/TriageRow.jsx
index da37199..2246c11 100644
--- a/src/components/TriageRow.jsx
+++ b/src/components/TriageRow.jsx
@@ -1,5 +1,36 @@
 import { nearestLocation } from '../utils/apiHelpers'
 
+// Small fit indicator for a triage row: shimmer while pending, filled dot for
+// LIKELY, hollow ring for any other verdict; nothing for PARSE_FAIL or no data.
+function FitDot({ classification, pending }) {
+  if (!classification) {
+    if (!pending) return null
+    return (
+      <span
+        className="iris-shimmer-text inline-block w-2 h-2 rounded-full mr-1"
+        title="Evaluating fit…"
+        aria-label="Evaluating fit"
+      >&nbsp;</span>
+    )
+  }
+
+  const { verdict, reason } = classification
+  // A verdict the parser could not read renders nothing at all.
+  if (verdict === 'PARSE_FAIL') return null
+
+  const likely = verdict === 'LIKELY'
+  const label = likely ? 'Likely fit' : 'Less likely fit'
+  const tone = likely ? 'bg-iris-500' : 'border border-parchment-400'
+  const detail = reason || (likely ? 'matches your description' : 'may not match')
+  return (
+    <span
+      className={`inline-block w-2 h-2 rounded-full mr-1 shrink-0 ${tone}`}
+      title={`${label} — ${detail}`}
+      aria-label={label}
+    />
+  )
+}
+
 const PHASE_SHORT = {
   EARLY_PHASE1: 'Early Phase 1',
   PHASE1: 'Phase 1',
@@ -22,6 +53,8 @@ export default function TriageRow({
   comparing = false,
   onToggleCompare,
   compareDisabled = false,
+  classification = null,
+  classifyPending = false,
 }) {
   const nearest = nearestLocation(trial.locations, coords)
   const phase = formatPhase(trial.phases)
@@ -51,7 +84,8 @@ export default function TriageRow({
         >
           {trial.title}
         </h3>
-        <span className="font-mono text-[11px] text-parchment-700 flex flex-wrap gap-x-1.5">
+        <span className="font-mono text-[11px] text-parchment-700 flex flex-wrap items-center gap-x-1.5">
+          <FitDot classification={classification} pending={classifyPending} />
           {nearest?.distanceMi != null && <span>{nearest.distanceMi} mi</span>}
           {nearest?.distanceMi != null && phase && <span aria-hidden="true">·</span>}
           {phase && <span>{phase}</span>}
diff --git a/src/utils/classifyTrial.js b/src/utils/classifyTrial.js
new file mode 100644
index 0000000..bb9b86d
--- /dev/null
+++ b/src/utils/classifyTrial.js
@@ -0,0 +1,77 @@
+// Stage-1 classification prompt + verdict parser.
+// Shared between the harness (?test=classify) and the in-app
+// classifyAll flow in ResultsList so the model sees the same prompt
+// in both contexts. Keep the wording in sync with the validated
+// harness baseline (Qwen2.5-1.5B → ~83% binary agreement, 0
+// catastrophic UNLIKELY).
+
+export const DEFAULT_CLASSIFY_PROMPT = `You decide whether a clinical trial is worth showing to a patient. Output one of two labels:
+
+- LIKELY: the trial studies the patient's condition AND nothing in the eligibility clearly excludes the patient based on what they stated. Worth showing.
+- UNLIKELY: the trial studies a different disease, OR the patient is clearly the wrong sex / age / population. Not worth showing.
+
+Be inclusive on LIKELY: if the trial requires a subtype, biomarker, stage, or prior treatment the patient did NOT mention, still call it LIKELY — the patient or their doctor can verify. Only use UNLIKELY when the patient is clearly disqualified by something they DID state.
+
+Examples (note: each example uses a DIFFERENT patient — focus on the reasoning, not the patient details):
+
+Patient: "45-year-old woman with ovarian cancer"
+Trial: PARP Inhibitor in BRCA-Mutated Ovarian Cancer (Eligibility: women with ovarian cancer and BRCA mutation)
+Answer: LIKELY | matches ovarian cancer in a woman; BRCA status can be verified
+
+Patient: "70-year-old man with type 2 diabetes"
+Trial: Tamoxifen in Premenopausal Breast Cancer (Eligibility: premenopausal women with breast cancer)
+Answer: UNLIKELY | trial is for breast cancer in women; patient has diabetes
+
+Patient: "8-year-old child with asthma"
+Trial: Adult Anti-Inflammatory for Asthma (Eligibility: adults 18+ with persistent asthma)
+Answer: UNLIKELY | trial is for adults; patient is a child
+
+Patient: "55-year-old man with hypertension"
+Trial: Yoga Intervention for Adults with Chronic Conditions (Eligibility: adults 40-75 with any chronic condition)
+Answer: LIKELY | adult with chronic condition matches the broad inclusion
+
+Now classify:
+
+Patient: {{user}}
+Trial: {{title}}
+Eligibility: {{eligibility}}
+
+Answer (one line, format exactly "<LABEL> | <one short reason>"):`
+
+export const ELIG_MAX_CHARS = 1500
+
+export function buildClassifyPrompt(userDesc, trial, eligMax = ELIG_MAX_CHARS) {
+  // Eligibility: trial.eligibility when it is a string, else the structured
+  // eligibility.criteria field that useClinicalTrials emits; capped at eligMax chars.
+  const elig = (
+    typeof trial.eligibility === 'string'
+      ? trial.eligibility
+      : trial.eligibility?.criteria ?? ''
+  ).slice(0, eligMax)
+  const title = trial.title || trial.briefTitle || ''
+  // One pass with a function replacer: "$&" / "$'" / "$$" in the inserted text stay
+  // literal, and a "{{title}}" typed by the user is never expanded as a placeholder.
+  const fill = { user: userDesc ?? '', title, eligibility: elig }
+  return DEFAULT_CLASSIFY_PROMPT.replace(/\{\{(user|title|eligibility)\}\}/g, (_, key) => fill[key])
+}
+
+// Parse raw model output into { verdict, reason }. POSSIBLE (older prompts,
+// instruction drift) is normalized to LIKELY since the binary product question
+// is "show or hide". An echoed "Answer:" prefix is tolerated in both passes.
+export function parseVerdict(raw) {
+  if (!raw || typeof raw !== 'string') return { verdict: 'PARSE_FAIL', reason: '(empty output)' }
+  const m = raw.match(/^\s*(?:answer\s*:\s*)?(LIKELY|POSSIBLE|UNLIKELY)\s*[|:\-—]\s*(.+?)\s*$/im)
+  if (m) {
+    const v = m[1].toUpperCase()
+    return { verdict: v === 'POSSIBLE' ? 'LIKELY' : v, reason: m[2].trim() }
+  }
+  const w = raw.match(/\b(LIKELY|POSSIBLE|UNLIKELY)\b/i)
+  if (w) {
+    const v = w[1].toUpperCase()
+    return {
+      verdict: v === 'POSSIBLE' ? 'LIKELY' : v,
+      reason: raw.replace(w[0], '').replace(/^(?:\s*answer\s*:)?[\s|:\-—]*/i, '').trim() || '(no reason)',
+    }
+  }
+  return { verdict: 'PARSE_FAIL', reason: raw.slice(0, 120) }
+}

From 44b9e7bd01fa6e20c0e865db356aa0cb3be524e3 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 22:58:22 -0400
Subject: [PATCH 18/31] fix(simplifier): replace bare trailing labels with
 directive instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Qwen2.5-1.5B was hallucinating Python tutorial fragments in the
"What this study is testing" section because the simplifier prompt
ended with a bare label ("PLAIN-LANGUAGE OUTPUT:") that chat-tuned
models read as a topic header — then they pattern-match training
data that follows similar headers and emit unrelated content.

Two changes to both buildSummarizePrompt and buildAssessFitPrompt:

1. Rename the example block label from "PLAIN-LANGUAGE OUTPUT:" /
   "ASSESSMENT:" to "EXAMPLE PLAIN-LANGUAGE OUTPUT:" /
   "EXAMPLE ASSESSMENT:" — clarifies its role for the model.

2. Replace the trailing bare label with a directive instruction
   ("Now write the plain-language summary…", "Begin your response
   with \"## What this study is testing\":") that names the action
   and pins the first header. This is what chat-instruct models like
   Qwen and Gemma both expect — a clear command, not a label.

Updated the language-reminder ordering test to look for the new
trailing directive instead of the old label.
---
 src/utils/simplifyHelpers.js      | 22 ++++++++++++++++------
 src/utils/simplifyHelpers.test.js |  5 ++++-
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/utils/simplifyHelpers.js b/src/utils/simplifyHelpers.js
index 94db74b..441da31 100644
--- a/src/utils/simplifyHelpers.js
+++ b/src/utils/simplifyHelpers.js
@@ -97,6 +97,13 @@ export function buildSummarizePrompt(trial, options = {}) {
   const ex = SUMMARIZE_EXEMPLAR
   const langPre = languagePreGenInstruction(options.outputLanguage)
   const langPost = languagePostGenReminder(options.outputLanguage)
+  // The trailing instruction is intentionally directive ("Write the
+  // plain-language summary now…") rather than a bare label like
+  // "PLAIN-LANGUAGE OUTPUT:". Some chat-tuned models (Qwen2.5-1.5B) read
+  // a trailing label as a topic header and hallucinate unrelated tutorial
+  // content from training data instead of continuing the task. Naming
+  // the action — and pinning the first header the model must emit — fixes
+  // it for Qwen and stays compatible with Gemma's behavior on this prompt.
   return `${SUMMARIZE_INSTRUCTIONS}
 
 Here is an example.
@@ -105,17 +112,17 @@ SOURCE TRIAL:
 Brief summary: ${ex.input.briefSummary}
 Eligibility: ${ex.input.eligibility}
 
-PLAIN-LANGUAGE OUTPUT:
+EXAMPLE PLAIN-LANGUAGE OUTPUT:
 ${ex.output}
 ${langPre}
 
-Now do the same for this trial.
+Now write the plain-language summary for the SOURCE TRIAL below, using the same two-section structure (## What this study is testing, then ## Who can join). Use only facts from this trial.
 
 SOURCE TRIAL:
 Brief summary: ${trial.summary ?? ''}
 Eligibility: ${trial.eligibility?.criteria ?? ''}
 ${langPost}
-PLAIN-LANGUAGE OUTPUT:
+Begin your response with "## What this study is testing":
 `
 }
 
@@ -135,6 +142,9 @@ Keep drug names (e.g. trastuzumab deruxtecan, pembrolizumab) and gene names (e.g
 Do NOT write the assessment in English.`
     : ''
   const langPost = isNonEnglish ? `\nReminder: write your assessment in ${options.outputLanguage}.` : ''
+  // Same trailing-marker fix as buildSummarizePrompt — replace the bare
+  // "ASSESSMENT:" label with a directive instruction so chat-tuned models
+  // don't read it as a topic header and hallucinate.
   return `${ASSESS_FIT_INSTRUCTIONS}
 
 Here is an example.
@@ -147,11 +157,11 @@ TRIAL:
 Brief summary: ${ex.input.briefSummary}
 Eligibility: ${ex.input.eligibility}
 
-ASSESSMENT:
+EXAMPLE ASSESSMENT:
 ${ex.output}
 ${langPre}
 
-Now do the same.
+Now write the assessment for the patient and trial below.
 
 PATIENT:
 ${descLine}
@@ -161,6 +171,6 @@ TRIAL:
 Brief summary: ${trial.summary ?? ''}
 Eligibility: ${trial.eligibility?.criteria ?? ''}
 ${langPost}
-ASSESSMENT:
+Write your 2-4 sentence assessment now:
 `
 }
diff --git a/src/utils/simplifyHelpers.test.js b/src/utils/simplifyHelpers.test.js
index ccd6737..bdececc 100644
--- a/src/utils/simplifyHelpers.test.js
+++ b/src/utils/simplifyHelpers.test.js
@@ -170,7 +170,10 @@ describe('buildSummarizePrompt', () => {
   it('also includes a final pre-generation reminder so the directive is the most recent token', () => {
     const prompt = buildSummarizePrompt(trial, { outputLanguage: 'Spanish' })
     const reminderIdx = prompt.lastIndexOf('Reminder: write the body in Spanish')
-    const finalCueIdx = prompt.lastIndexOf('PLAIN-LANGUAGE OUTPUT:')
+    // The trailing cue used to be the bare label "PLAIN-LANGUAGE OUTPUT:" but
+    // chat-tuned models read that as a header and hallucinate. The new cue is
+    // a directive ending with the first header the model must emit.
+    const finalCueIdx = prompt.lastIndexOf('Begin your response with "## What this study is testing"')
     expect(reminderIdx).toBeGreaterThan(-1)
     expect(reminderIdx).toBeLessThan(finalCueIdx)
   })

From a5aca29688c3c30977a066b5b5d0e832744dfd85 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 23:03:36 -0400
Subject: [PATCH 19/31] feat(phase-3): register Gemma 2 2B q4f16_1 as a faster
 alternative

Same Gemma 2 2B model, just q4f16_1 quantization instead of q4f32_1:
~900 MB weights (vs 1.3 GB) and fp16 activations. WebGPU's native
compute type is fp16 on most GPUs, so inference latency drops
measurably with no quality difference on short instruction tasks.

Registered as ?model=gemma_fast so it can be A/B tested against the
current q4f32_1 default without forcing a re-download for anyone
who already has the larger weights cached. Promote to default once
validated in the harness + actual app flow.
---
 src/utils/nlpModels.js | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/utils/nlpModels.js b/src/utils/nlpModels.js
index dea5712..7b8f15a 100644
--- a/src/utils/nlpModels.js
+++ b/src/utils/nlpModels.js
@@ -9,6 +9,18 @@ export const NLP_MODELS = {
     sizeLabel: '~1.3 GB',
     isThinking: false,
   },
+  gemma_fast: {
+    // q4f16_1 vs q4f32_1: same model, smaller weights and fp16 activations.
+    // WebGPU's native compute type is fp16 on most GPUs, so inference is
+    // measurably faster (often 30-40% fewer ms/token) with no measurable
+    // quality loss on short instruction tasks. Try via ?model=gemma_fast;
+    // if it holds up in real classification + simplification, promote it
+    // to the registry default in a follow-up.
+    id: 'gemma-2-2b-it-q4f16_1-MLC',
+    label: 'Gemma 2 2B (fast)',
+    sizeLabel: '~900 MB',
+    isThinking: false,
+  },
   qwen3: {
     id: 'Qwen3-1.7B-q4f32_1-MLC',
     label: 'Qwen3 1.7B',

From 287599219ae97c8fd5800d3df832633243dc590e Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Wed, 6 May 2026 23:22:06 -0400
Subject: [PATCH 20/31] =?UTF-8?q?revert(phase-3):=20remove=20gemma=5Ffast?=
 =?UTF-8?q?=20=E2=80=94=20q4f16=5F1=20broke=20simplifier=20output?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User tested ?model=gemma_fast (gemma-2-2b-it-q4f16_1-MLC) and the
simplifier produced malformed output. The reduced activation precision
(fp16 vs fp32) loses the grip on the structured "## What this study
is testing" / "## Who can join" section headers the streaming parser
needs.

Since WebLLM can only have one model loaded at a time and the
simplifier is critical UX, mixing quants for different tasks isn't
viable. Drop the entry from the registry; leave a comment so future-
us doesn't re-discover the same dead end.
---
 src/utils/nlpModels.js | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/utils/nlpModels.js b/src/utils/nlpModels.js
index 7b8f15a..17e286a 100644
--- a/src/utils/nlpModels.js
+++ b/src/utils/nlpModels.js
@@ -9,18 +9,14 @@ export const NLP_MODELS = {
     sizeLabel: '~1.3 GB',
     isThinking: false,
   },
-  gemma_fast: {
-    // q4f16_1 vs q4f32_1: same model, smaller weights and fp16 activations.
-    // WebGPU's native compute type is fp16 on most GPUs, so inference is
-    // measurably faster (often 30-40% fewer ms/token) with no measurable
-    // quality loss on short instruction tasks. Try via ?model=gemma_fast;
-    // if it holds up in real classification + simplification, promote it
-    // to the registry default in a follow-up.
-    id: 'gemma-2-2b-it-q4f16_1-MLC',
-    label: 'Gemma 2 2B (fast)',
-    sizeLabel: '~900 MB',
-    isThinking: false,
-  },
+  // Gemma 2 2B q4f16_1 was tried as a faster alternative (~30% lower
+  // latency from native-fp16 WebGPU compute) but the lower activation
+  // precision degraded the simplifier's structured output — section
+  // headers ("## What this study is testing" / "## Who can join") came
+  // out malformed, and WebLLM can only have ONE model loaded at a time
+  // so we can't mix q4f16_1 for classification and q4f32_1 for
+  // simplification. q4f32_1 stays as the sole Gemma 2 2B variant.
+
   qwen3: {
     id: 'Qwen3-1.7B-q4f32_1-MLC',
     label: 'Qwen3 1.7B',

From 38453322796844f92a0220a7497e8915307ac67a Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:12:09 -0400
Subject: [PATCH 21/31] revert(simplifier): restore PLAIN-LANGUAGE OUTPUT:
 label
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

19e5d34 changed the trailing simplifier prompt label to a directive
instruction ("Begin your response with \"## What this study is
testing\":") to fix Qwen2.5-1.5B hallucinating training data. That
fix worked for Qwen but broke Gemma 2B — Gemma got stuck in a
repetition loop emitting bare ## patterns ("##############  ## ## ##
## ##...") because the trailing instruction over-anchored on the ##
header pattern.

Since we ship Gemma by default and Qwen is opt-in (?model=qwen25,
mainly for the harness), the right call is to restore the original
prompt that Gemma is validated against. Qwen will continue to
hallucinate in the simplifier — but that's a known issue with no
production exposure as long as Gemma stays default.

If we ever need to ship Qwen for production, the right fix is
probably structural (proper chat-message turns with role:assistant
for the example) rather than a trailing-label tweak.
---
 src/utils/simplifyHelpers.js      | 22 ++++++----------------
 src/utils/simplifyHelpers.test.js |  5 +----
 2 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/src/utils/simplifyHelpers.js b/src/utils/simplifyHelpers.js
index 441da31..94db74b 100644
--- a/src/utils/simplifyHelpers.js
+++ b/src/utils/simplifyHelpers.js
@@ -97,13 +97,6 @@ export function buildSummarizePrompt(trial, options = {}) {
   const ex = SUMMARIZE_EXEMPLAR
   const langPre = languagePreGenInstruction(options.outputLanguage)
   const langPost = languagePostGenReminder(options.outputLanguage)
-  // The trailing instruction is intentionally directive ("Write the
-  // plain-language summary now…") rather than a bare label like
-  // "PLAIN-LANGUAGE OUTPUT:". Some chat-tuned models (Qwen2.5-1.5B) read
-  // a trailing label as a topic header and hallucinate unrelated tutorial
-  // content from training data instead of continuing the task. Naming
-  // the action — and pinning the first header the model must emit — fixes
-  // it for Qwen and stays compatible with Gemma's behavior on this prompt.
   return `${SUMMARIZE_INSTRUCTIONS}
 
 Here is an example.
@@ -112,17 +105,17 @@ SOURCE TRIAL:
 Brief summary: ${ex.input.briefSummary}
 Eligibility: ${ex.input.eligibility}
 
-EXAMPLE PLAIN-LANGUAGE OUTPUT:
+PLAIN-LANGUAGE OUTPUT:
 ${ex.output}
 ${langPre}
 
-Now write the plain-language summary for the SOURCE TRIAL below, using the same two-section structure (## What this study is testing, then ## Who can join). Use only facts from this trial.
+Now do the same for this trial.
 
 SOURCE TRIAL:
 Brief summary: ${trial.summary ?? ''}
 Eligibility: ${trial.eligibility?.criteria ?? ''}
 ${langPost}
-Begin your response with "## What this study is testing":
+PLAIN-LANGUAGE OUTPUT:
 `
 }
 
@@ -142,9 +135,6 @@ Keep drug names (e.g. trastuzumab deruxtecan, pembrolizumab) and gene names (e.g
 Do NOT write the assessment in English.`
     : ''
   const langPost = isNonEnglish ? `\nReminder: write your assessment in ${options.outputLanguage}.` : ''
-  // Same trailing-marker fix as buildSummarizePrompt — replace the bare
-  // "ASSESSMENT:" label with a directive instruction so chat-tuned models
-  // don't read it as a topic header and hallucinate.
   return `${ASSESS_FIT_INSTRUCTIONS}
 
 Here is an example.
@@ -157,11 +147,11 @@ TRIAL:
 Brief summary: ${ex.input.briefSummary}
 Eligibility: ${ex.input.eligibility}
 
-EXAMPLE ASSESSMENT:
+ASSESSMENT:
 ${ex.output}
 ${langPre}
 
-Now write the assessment for the patient and trial below.
+Now do the same.
 
 PATIENT:
 ${descLine}
@@ -171,6 +161,6 @@ TRIAL:
 Brief summary: ${trial.summary ?? ''}
 Eligibility: ${trial.eligibility?.criteria ?? ''}
 ${langPost}
-Write your 2-4 sentence assessment now:
+ASSESSMENT:
 `
 }
diff --git a/src/utils/simplifyHelpers.test.js b/src/utils/simplifyHelpers.test.js
index bdececc..ccd6737 100644
--- a/src/utils/simplifyHelpers.test.js
+++ b/src/utils/simplifyHelpers.test.js
@@ -170,10 +170,7 @@ describe('buildSummarizePrompt', () => {
   it('also includes a final pre-generation reminder so the directive is the most recent token', () => {
     const prompt = buildSummarizePrompt(trial, { outputLanguage: 'Spanish' })
     const reminderIdx = prompt.lastIndexOf('Reminder: write the body in Spanish')
-    // The trailing cue used to be the bare label "PLAIN-LANGUAGE OUTPUT:" but
-    // chat-tuned models read that as a header and hallucinate. The new cue is
-    // a directive ending with the first header the model must emit.
-    const finalCueIdx = prompt.lastIndexOf('Begin your response with "## What this study is testing"')
+    const finalCueIdx = prompt.lastIndexOf('PLAIN-LANGUAGE OUTPUT:')
     expect(reminderIdx).toBeGreaterThan(-1)
     expect(reminderIdx).toBeLessThan(finalCueIdx)
   })

From 1f093312aac4522892470c46d96abc8b5a2dfbd9 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:17:15 -0400
Subject: [PATCH 22/31] fix(phase-3): defer simplification until classification
 completes + raise frequency_penalty
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two issues from end-to-end testing:

1. Toolbar showed "evaluating fit · 0 of 10" while the simplifier had
   already produced output — classifier and simplifier were both
   queueing into the single-threaded worker on results land. The
   simplifier's eager-batch-of-5 ran first, the classifier got stuck
   behind it, and per-trial fit dots never appeared until the
   simplifier finished all 10 eager-batch tasks (5 summaries plus
   5 fit assessments).

2. Gemma 2 2B was hitting degenerate repetition loops on the
   simplify prompt — "## What this study is testing" header rendered
   correctly, then the body was just "############ ## ## ## ##..."
   for the full max_tokens budget.

Two changes per Handoff Phase 3 step 6 ("Stage 2 simplification only
fires for the currently-selected trial in the detail pane"):

- ResultsList: removed the eager-batch-of-5 simplifier call. Simplifier
  now only enqueues for the currently-selected trial, and only after
  classification has completed (or is not running, e.g. structured-
  form-only sessions). Eliminates the classifier/simplifier race AND
  drastically cuts how often the simplifier runs (so when Gemma does
  hit a repetition loop, it affects one trial not five).

- nlp.worker.js: bumped frequency_penalty for summarize from 0.3 → 0.6.
  Higher penalty discourages the same n-gram from re-firing, breaking
  the "## ## ##" loop. Assess-fit stays at 0 so its hedging language
  ("may", "might") doesn't get penalized.
---
 src/components/ResultsList.jsx | 30 ++++++++++++++++++++++--------
 src/workers/nlp.worker.js      |  8 +++++++-
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index ce6bed5..0ac26b9 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -190,22 +190,36 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [canClassify, nlp.status, patientDesc, trialKeyAll])
 
-  // Fire when the result set changes — keyed on the first 5 NCT IDs.
-  // Using searchParams as the key would fire too early (before data arrives);
-  // using allTrials would re-fire on every pagination append.
+  // Reset the simplifier when the result set changes (new search). The
+  // per-trial enqueue happens below in the selected-trial effect.
   const eagerKey = allTrials.slice(0, EAGER_BATCH_SIZE).map(t => t.nctId).join(',')
   useEffect(() => {
     simplifier.cancelPending()
     simplifier.resetCache()
-    if (allTrials.length === 0) return
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [eagerKey])
+
+  // Per Handoff Phase 3 step 6: stage-2 simplification only fires for the
+  // currently-selected trial. Critically, it WAITS for stage-1
+  // classification to finish first — otherwise both compete for the
+  // single-threaded worker, the classifier appears to stall, and the
+  // simplifier (running first) produces noisier output under contention.
+  // For structured-form-only sessions canClassify is false and
+  // classifyProgress.total stays 0, so the gate falls through to "true"
+  // and simplification runs immediately on selection.
+  const classifyDone = !canClassify || (
+    classifyProgress.total > 0 && classifyProgress.done >= classifyProgress.total
+  )
+  useEffect(() => {
     if (!simplificationSupported) return
-    const eager = allTrials.slice(0, EAGER_BATCH_SIZE)
-    for (const t of eager) simplifier.enqueueSummarize(t, { outputLanguage })
+    if (!selected) return
+    if (!classifyDone) return
+    simplifier.enqueueSummarize(selected, { outputLanguage })
     if (extractedFields) {
-      for (const t of eager) simplifier.enqueueAssessFit(t, { outputLanguage })
+      simplifier.enqueueAssessFit(selected, { outputLanguage })
     }
     // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [eagerKey, simplificationSupported, outputLanguage])
+  }, [selected?.nctId, simplificationSupported, outputLanguage, classifyDone])
 
   if (isLoading) {
     return (
diff --git a/src/workers/nlp.worker.js b/src/workers/nlp.worker.js
index 1541580..fc93e77 100644
--- a/src/workers/nlp.worker.js
+++ b/src/workers/nlp.worker.js
@@ -184,7 +184,13 @@ self.onmessage = async (event) => {
         // its hedging language ("may", "might") doesn't collapse into a
         // single deterministic phrase across trials.
         temperature: type === 'summarize' ? 0.1 : 0.2,
-        frequency_penalty: type === 'summarize' ? 0.3 : 0,
+        // Bumped from 0.3 → 0.6 because Gemma 2 2B was hitting degenerate
+        // loops on the simplify prompt — emitting strings of "##" header
+        // markers ("############# ## ## ## …") instead of the body content.
+        // A higher frequency penalty down-weights each token by how often it
+        // has already been emitted, breaking the loop. Assess-fit stays at 0
+        // so its hedging language ("may", "might") doesn't get penalized.
+        frequency_penalty: type === 'summarize' ? 0.6 : 0,
         stream: true,
       }
       if (isThinkingModel) request.extra_body = { enable_thinking: false }

From cd30a674214fe787117557e83ea3ea2a270fcddd Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:21:49 -0400
Subject: [PATCH 23/31] feat(phase-3): pipeline progress caption in detail pane
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User saw an empty "What this study is testing" area and an inaccurate
summary, with no explanation of WHY the simplifier hadn't run yet
(classifier was still working through 10 trials in the background).

Adds an explicit pipeline-stage caption above the simplifier section
in the detail pane:

- Stage "classifying": iris-violet pill with shimmer dot:
  "evaluating fit · 3 of 10 · plain-language summary will follow"
  Tells the user the per-trial fit is still being computed and the
  summary is queued to come after.

- Stage "awaiting-summary": parchment pill: "generating plain-language
  summary…" — fires after classification completes but before the
  selected trial's summary stream starts producing tokens.

- Once tokens land (status === 'streaming' / 'complete'): caption
  disappears, real content renders.

ResultsList computes the stage from canClassify + classifyDone +
the simplifier's per-trial status, then passes pipelineStage +
classifyProgress as new ResultCard props (pane mode only — rows
already have their own fit dot).

Doesn't address the underlying small-model accuracy issue (Gemma
turning "Early Breast Cancer" into "spread to other parts" in the
summary). That's the next escalation — short-term fix would be lower
max_tokens for summarize; real fix is the LoRA pipeline in the vault
doc.
---
 src/components/ResultCard.jsx  | 36 ++++++++++++++++++++++++++++++++++
 src/components/ResultsList.jsx | 20 ++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/components/ResultCard.jsx b/src/components/ResultCard.jsx
index a520860..94dd9b4 100644
--- a/src/components/ResultCard.jsx
+++ b/src/components/ResultCard.jsx
@@ -36,6 +36,36 @@ function SectionLabel({ children, pane }) {
   )
 }
 
+// Two-stage on-device pipeline status, rendered by ResultCard only in pane
+// (detail) view because the row already has a fit dot. stage: 'classifying' |
+// 'awaiting-summary' (anything else renders nothing); progress: { done, total }
+// or null. Names the in-flight stage so the empty area below doesn't read as "broken".
+function PipelineCaption({ stage, progress }) {
+  if (stage === 'classifying') {
+    return (
+      <div className="mb-5 flex items-center gap-2 px-3 py-2 rounded-lg bg-iris-50 border border-iris-100">
+        <span className="iris-shimmer-text inline-block w-2 h-2 rounded-full" aria-hidden="true">&nbsp;</span>
+        <span className="font-mono text-[11px] text-iris-700">
+          evaluating fit
+          {progress && progress.total > 0 && ` · ${progress.done} of ${progress.total}`}
+          <span className="text-parchment-700"> · plain-language summary will follow</span>
+        </span>
+      </div>
+    )
+  }
+  if (stage === 'awaiting-summary') {
+    return (
+      <div className="mb-5 flex items-center gap-2 px-3 py-2 rounded-lg bg-parchment-100 border border-parchment-200">
+        <span className="iris-shimmer-text inline-block w-2 h-2 rounded-full" aria-hidden="true">&nbsp;</span>
+        <span className="font-mono text-[11px] text-parchment-700">
+          generating plain-language summary…
+        </span>
+      </div>
+    )
+  }
+  return null
+}
+
 function MetaLine({ trial, nearest, pane }) {
   const sep = (
     <span aria-hidden="true" className={pane ? 'text-parchment-300' : 'text-parchment-500'}>
@@ -80,6 +110,8 @@ export default function ResultCard({
   inputLanguage = 'en',
   simplificationSupported = true,
   pane = false,
+  pipelineStage = null, // 'classifying' | 'awaiting-summary' | null
+  classifyProgress = null, // { done, total }
 }) {
   const nearest = nearestLocation(trial.locations, coords)
   const wrapperClass = pane
@@ -113,6 +145,10 @@ export default function ResultCard({
 
       <MetaLine trial={trial} nearest={nearest} pane={pane} />
 
+      {pane && pipelineStage && (
+        <PipelineCaption stage={pipelineStage} progress={classifyProgress} />
+      )}
+
       {showPlainLanguage && (
         <div className={pane ? 'mb-4' : 'mb-3'}>
           <div className={pane ? 'mb-4' : ''}>
diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index 0ac26b9..d6801a4 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -271,15 +271,33 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
   }
 
   function renderDetail(trial) {
+    // Tell ResultCard which pipeline stage is in flight so it can render
+    // an explicit progress caption above the empty content area instead of
+    // showing the trial's raw summary (which can look like the model
+    // already replied with the wrong text).
+    let pipelineStage = null
+    const sim = simplifier.states.get(trial.nctId)
+    const simStatus = sim?.summarize?.status
+    if (canClassify && !classifyDone) {
+      pipelineStage = 'classifying'
+    } else if (
+      simplificationSupported &&
+      (!simStatus || simStatus === 'queued') &&
+      classifyDone
+    ) {
+      pipelineStage = 'awaiting-summary'
+    }
     return (
       <ResultCard
         trial={trial}
         coords={coords ?? null}
-        simplification={simplifier.states.get(trial.nctId)}
+        simplification={sim}
         onRequestSimplify={simplificationSupported ? handleRequestSimplify : null}
         inputLanguage={inputLanguage}
         simplificationSupported={simplificationSupported}
         pane
+        pipelineStage={pipelineStage}
+        classifyProgress={canClassify ? classifyProgress : null}
       />
     )
   }

From 959733412b43ad338a63840614b6cd61f212a769 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:25:07 -0400
Subject: [PATCH 24/31] feat(phase-3): drop fit narrative section + add
 oncologist disclaimer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes that work together to set honest expectations about the
on-device model's reliability:

1. Removed the "Why this might or might not fit you" section from the
   detail pane. Gemma 2B's accuracy on the fit narrative isn't
   reliable enough to ship — recent E2E testing showed it occasionally
   flips disease stage (early vs metastatic) or treatment history.
   The TriageRow fit dot (driven by the binary classifier in
   useClassifier) is the safer signal. ResultsList no longer enqueues
   assess_fit; ResultCard no longer renders the fit paragraph; the
   useSimplifier code path is preserved in case we re-enable it after
   a fine-tune.

2. Added an unconditional oncologist disclaimer above the contact
   block in pane mode: "Talk to your oncologist if you think you
   might qualify for this trial. The plain-language summary above is
   generated on-device by a small AI model — it can miss or misstate
   eligibility details. Your care team can confirm whether the trial
   fits your specific situation."

   Iris-tinted callout, always visible. Sets the right expectation:
   the AI summary is a starting point, not a recommendation.

Test that previously asserted the fit paragraph rendered now asserts
its absence. The previous "fit hidden when in error" test was
redundant (the fit section is unconditionally hidden now) so it was
folded into the single assertion.
---
 src/components/ResultCard.jsx      | 28 +++++++++++++++++-----------
 src/components/ResultCard.test.jsx | 19 ++++++++-----------
 src/components/ResultsList.jsx     | 11 +++++++----
 3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/src/components/ResultCard.jsx b/src/components/ResultCard.jsx
index 94dd9b4..aca075c 100644
--- a/src/components/ResultCard.jsx
+++ b/src/components/ResultCard.jsx
@@ -171,16 +171,11 @@ export default function ResultCard({
             </div>
           )}
 
-          {showFit && (
-            <div className={pane ? 'mb-4' : ''}>
-              <SectionLabel pane={pane}>Why this might or might not fit you</SectionLabel>
-              <p className={pane
-                ? 'text-[15px] text-parchment-900 leading-[1.6] whitespace-pre-wrap'
-                : 'text-sm text-parchment-900 leading-relaxed mb-3 whitespace-pre-wrap'}>
-                {fitState.text}
-              </p>
-            </div>
-          )}
+          {/* "Why this might or might not fit you" intentionally omitted —
+              Gemma 2B's accuracy on the fit narrative isn't reliable
+              enough to ship. The TriageRow fit dot (driven by the
+              classifier) is the safer signal. The oncologist disclaimer
+              further down (pane mode only) sets expectations. */}
 
           {(sumState.status === 'queued' || sumState.status === 'streaming') && (
             <p className="font-mono text-[11px] text-parchment-700 italic mb-2">
@@ -260,8 +255,19 @@ export default function ResultCard({
         </p>
       )}
 
+      {pane && (
+        <div className="mt-6 mb-2 px-4 py-3 rounded-lg bg-iris-50 border border-iris-100">
+          <p className="text-[13px] text-parchment-900 leading-relaxed">
+            <strong className="font-semibold text-iris-700">Talk to your oncologist</strong> if you
+            think you might qualify for this trial. The plain-language summary above is generated
+            on-device by a small AI model — it can miss or misstate eligibility details. Your care
+            team can confirm whether the trial fits your specific situation.
+          </p>
+        </div>
+      )}
+
       {pane ? (
-        <div className="mt-6 pt-5 border-t border-parchment-200 flex flex-col gap-1.5 text-[13px]">
+        <div className="mt-4 pt-5 border-t border-parchment-200 flex flex-col gap-1.5 text-[13px]">
           <div className="font-mono text-[11px] text-parchment-700 mb-1">contact</div>
           {trial.contact.phone && (
             <span className="text-parchment-900">{trial.contact.phone}</span>
diff --git a/src/components/ResultCard.test.jsx b/src/components/ResultCard.test.jsx
index fdc91e3..99326c5 100644
--- a/src/components/ResultCard.test.jsx
+++ b/src/components/ResultCard.test.jsx
@@ -140,22 +140,19 @@ describe('ResultCard — Phase 3 simplification', () => {
     expect(screen.getByText(/Plain-language version unavailable/i)).toBeInTheDocument()
   })
 
-  it('renders the fit paragraph when fit state is complete', () => {
+  // The "Why this might or might not fit you" section was removed because
+  // Gemma 2B's accuracy on the fit narrative wasn't reliable enough to
+  // ship — it occasionally flipped disease stage or treatment history.
+  // The TriageRow fit dot (driven by the binary classifier in
+  // useClassifier) is the safer signal. The simplifier still computes
+  // assess_fit when called; ResultCard just no longer renders it.
+  it('does not render the fit paragraph even when fit state is complete', () => {
     const simplification = {
       summarize: { status: 'complete', summary: 'Sum.', eligibility: 'Elig.', error: null },
       fit: { status: 'complete', text: 'This may fit you because…', error: null },
     }
     render(<ResultCard trial={trial} coords={null} simplification={simplification} />)
-    expect(screen.getByText(/Why this might or might not fit you/i)).toBeInTheDocument()
-    expect(screen.getByText('This may fit you because…')).toBeInTheDocument()
-  })
-
-  it('does not render fit section when fit is in error', () => {
-    const simplification = {
-      summarize: { status: 'complete', summary: 'Sum.', eligibility: 'Elig.', error: null },
-      fit: { status: 'error', text: '', error: 'failed' },
-    }
-    render(<ResultCard trial={trial} coords={null} simplification={simplification} />)
     expect(screen.queryByText(/Why this might or might not fit you/i)).not.toBeInTheDocument()
+    expect(screen.queryByText('This may fit you because…')).not.toBeInTheDocument()
   })
 })
diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index d6801a4..6a12335 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -215,9 +215,12 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
     if (!selected) return
     if (!classifyDone) return
     simplifier.enqueueSummarize(selected, { outputLanguage })
-    if (extractedFields) {
-      simplifier.enqueueAssessFit(selected, { outputLanguage })
-    }
+    // assess_fit ("Why this might or might not fit you") intentionally not
+    // enqueued — Gemma 2B's accuracy on the fit narrative isn't reliable
+    // enough to ship (it occasionally flips disease stage / treatment
+    // history). The classifier's binary verdict + dot is the safer signal.
+    // The assess_fit pipeline itself stays in useSimplifier in case we
+    // re-enable it on a fine-tuned model later.
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [selected?.nctId, simplificationSupported, outputLanguage, classifyDone])
 
@@ -267,7 +270,7 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
   function handleRequestSimplify(trial) {
     if (!simplificationSupported) return
     simplifier.enqueueSummarize(trial, { outputLanguage })
-    if (extractedFields) simplifier.enqueueAssessFit(trial, { outputLanguage })
+    // assess_fit deliberately omitted — see selected-trial effect above.
   }
 
   function renderDetail(trial) {

From 45d88393380f8530d9c5b35d44606033ef5e52c3 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:39:39 -0400
Subject: [PATCH 25/31] feat(phase-3): two-tier disclaimer + carry redesign
 hot-fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Disclaimer reshaped per the brainstorm + user pick (variant 7 with
user-rewritten surface line):

  Surface (always visible, in <summary>):
    "Check with your doctor when exploring treatment options —
     this AI summary uses plain language to explain the treatment
     but can miss eligibility details."  with a small "why?" hint.

  Expandable (in <details> body):
    "The plain-language summary above was generated on your device
     by a small AI model. It can miss or misstate who qualifies for
     a trial. Your care team has your full medical picture and can
     confirm whether this one actually fits."

Frames the AI as a helper rather than the decider: the in-crisis
reader gets minimal cognitive load by default, with the full
honesty + privacy explanation one tap away. The group-open:hidden
utility hides the "why?" hint when the details are expanded so the
chrome stays clean.

Also carrying redesign-branch fix forward: tightened the auto-load
gate in NaturalLanguageInput from (idle || downloading) → idle
only. The widened gate caused redundant load() messages mid-download
because the cleanup-reset pattern would re-fire the effect on every
status flip while still in the gate. Worker dropped them as
duplicates so harmless in practice, but cleaner this way. Same fix
went in on redesign branch as 3c2e750; keeping branches in sync.
---
 src/components/ResultCard.jsx | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/components/ResultCard.jsx b/src/components/ResultCard.jsx
index aca075c..49fd93c 100644
--- a/src/components/ResultCard.jsx
+++ b/src/components/ResultCard.jsx
@@ -256,14 +256,21 @@ export default function ResultCard({
       )}
 
       {pane && (
-        <div className="mt-6 mb-2 px-4 py-3 rounded-lg bg-iris-50 border border-iris-100">
-          <p className="text-[13px] text-parchment-900 leading-relaxed">
-            <strong className="font-semibold text-iris-700">Talk to your oncologist</strong> if you
-            think you might qualify for this trial. The plain-language summary above is generated
-            on-device by a small AI model — it can miss or misstate eligibility details. Your care
-            team can confirm whether the trial fits your specific situation.
+        <details className="mt-6 mb-2 px-4 py-3 rounded-lg bg-iris-50 border border-iris-100 group">
+          <summary className="cursor-pointer list-none text-[13px] text-parchment-900 leading-relaxed select-none">
+            <span className="font-semibold text-iris-700">Check with your doctor when exploring treatment options</span>
+            {' '}— this AI summary uses plain language to explain the treatment but can miss
+            eligibility details.
+            <span className="font-mono text-[11px] text-iris-700 ml-2 opacity-70 group-open:hidden">
+              why?
+            </span>
+          </summary>
+          <p className="mt-3 text-[13px] text-parchment-900 leading-relaxed">
+            The plain-language summary above was generated on your device by a small AI model. It
+            can miss or misstate who qualifies for a trial. Your care team has your full medical
+            picture and can confirm whether this one actually fits.
           </p>
-        </div>
+        </details>
       )}
 
       {pane ? (

From 73b809bc20451f485ec91132d8ba11b71625604a Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:43:14 -0400
Subject: [PATCH 26/31] chore(phase-3): remove disabled sort chips from toolbar

The sort UI was visible-but-permanently-disabled placeholders for
"Best fit" / "Distance" / "Phase" / "Most recent" with tooltips like
"Sort wiring coming in a follow-up." Reads as broken UI to users.
Better to ship without than ship with disabled controls.

When the wiring lands, restore the chips from git history. CT.gov
v2 API supports sort= for distance (when a location is set) and
LastUpdatePostDate; "Best fit" depends on having per-trial classifier
verdicts, which now exist post-Phase-3 wiring.

Toolbar now shows just the search summary line + classification
progress, which is the load-bearing info anyway.
---
 src/components/ResultsList.jsx | 44 ++++++----------------------------
 1 file changed, 7 insertions(+), 37 deletions(-)

diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index 6a12335..61fda0e 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -397,16 +397,14 @@ const PHASE_LABELS = {
   PHASE4: 'Phase 4',
 }
 
-const SORT_OPTIONS = [
-  { id: 'fit',      label: 'Best fit',     disabled: true, title: 'Available once on-device classification runs' },
-  { id: 'distance', label: 'Distance',     disabled: true, title: 'Sort wiring coming in a follow-up' },
-  { id: 'phase',    label: 'Phase',        disabled: true, title: 'Sort wiring coming in a follow-up' },
-  { id: 'recent',   label: 'Most recent',  disabled: true, title: 'Sort wiring coming in a follow-up' },
-]
+// Sort UI removed — the chips were visible-but-disabled placeholders for
+// "Best fit" / "Distance" / "Phase" / "Most recent" which read as broken
+// to users. When sort wiring lands (CT.gov API supports `sort=` for
+// distance and last-update; "Best fit" needs the classifier verdicts
+// per-trial), restore from git history at 67d5fc8 and wire onClick →
+// re-fetch through useClinicalTrials with the new sort token.
 
 function ResultsToolbar({ totalCount, searchParams, classifyProgress }) {
-  const [sort, setSort] = useState('recent')
-
   const summaryParts = [`${totalCount.toLocaleString()} trial${totalCount !== 1 ? 's' : ''}`]
   if (searchParams.location) summaryParts.push(`near ${searchParams.location}`)
   if (searchParams.location && searchParams.radius) summaryParts.push(`within ${searchParams.radius} mi`)
@@ -417,7 +415,7 @@ function ResultsToolbar({ totalCount, searchParams, classifyProgress }) {
   }
 
   return (
-    <div className="px-4 sm:px-6 py-3 border-b border-parchment-200 flex flex-wrap items-center justify-between gap-x-6 gap-y-2">
+    <div className="px-4 sm:px-6 py-3 border-b border-parchment-200 flex flex-wrap items-center gap-x-6 gap-y-2">
       <p className="font-mono text-[11px] text-parchment-700 leading-snug">
         {summaryParts.map((part, i) => (
           <span key={i}>
@@ -434,34 +432,6 @@ function ResultsToolbar({ totalCount, searchParams, classifyProgress }) {
           </span>
         )}
       </p>
-      <div className="hidden sm:flex items-center gap-1" role="group" aria-label="Sort results">
-        <span className="font-mono text-[10px] uppercase tracking-[0.08em] text-parchment-700 mr-2">
-          sort
-        </span>
-        {SORT_OPTIONS.map(opt => {
-          const active = sort === opt.id
-          return (
-            <button
-              key={opt.id}
-              type="button"
-              onClick={() => !opt.disabled && setSort(opt.id)}
-              disabled={opt.disabled}
-              title={opt.title}
-              {...(opt.disabled ? {} : { 'aria-pressed': active })}
-              className={[
-                'text-[11px] px-2 py-0.5 rounded-md transition-colors',
-                opt.disabled
-                  ? 'text-parchment-500 cursor-not-allowed'
-                  : active
-                    ? 'bg-iris-50 text-iris-700 font-medium'
-                    : 'text-parchment-700 hover:text-parchment-950 hover:bg-parchment-100',
-              ].join(' ')}
-            >
-              {opt.label}
-            </button>
-          )
-        })}
-      </div>
     </div>
   )
 }

From 1b3e7598f2d8e52a181fa68a6c808846637da22a Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:54:49 -0400
Subject: [PATCH 27/31] chore: post-merge follow-ups (matchMedia, regression
 test, doc move)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three of the deferred items from PR #1's review, addressed in one
follow-up to keep the phase-3 PR from carrying open feedback debt:

1. useIsMobile listens for the MediaQueryList 'change' event (via
   window.matchMedia) instead of window 'resize'
   (ResultsList.jsx:36-50). iOS Safari fires 'resize' inconsistently
   on rotation; the media-query 'change' event is the reliable signal
   and also catches iPad split-screen + browser-window mode switches.
   src/test/setup.js stubs matchMedia (jsdom doesn't ship it) so
   the existing ResultsList tests keep rendering the desktop two-pane
   path.

2. New regression test for the queued-submit drain in
   NaturalLanguageInput. Indirect coverage for the StrictMode listener
   re-attach fix in useNLP — if the listener doesn't reach the worker
   'ready' message, the queued submit never fires and this test fails.
   The bug surfaced this session: status got stuck at 'downloading'
   forever in dev because the listener, once detached by StrictMode's
   first cleanup, was never re-attached. This test pins the contract.

3. shared/iris-shared.jsx (478-line design reference, never imported
   by src/) moved to docs/design-references-shared/ with a README
   explaining its role. Stops future readers (LLM or human) from
   trying to "fix" or "consolidate" it as if it were live code.

Contrast-check (#3 in the review): computed iris-700 on parchment-50
yields ~9.6:1 (WCAG AAA). iris-700 on iris-50 is similar (~8.5:1).
All iris-violet links and the model badge clear AA easily; no
palette change needed. Lighthouse can confirm at deploy time.

Compare-state lift (#1 in the review): deferred to its own follow-up
PR alongside the actual compare view (currently a placeholder).
---
 docs/design-references-shared/README.md       | 17 +++++++++
 .../design-references-shared}/iris-shared.jsx |  0
 src/components/NaturalLanguageInput.test.jsx  | 38 +++++++++++++++++++
 src/components/ResultsList.jsx                | 14 +++++--
 src/test/setup.js                             | 18 ++++++++-
 5 files changed, 82 insertions(+), 5 deletions(-)
 create mode 100644 docs/design-references-shared/README.md
 rename {shared => docs/design-references-shared}/iris-shared.jsx (100%)

diff --git a/docs/design-references-shared/README.md b/docs/design-references-shared/README.md
new file mode 100644
index 0000000..489ae34
--- /dev/null
+++ b/docs/design-references-shared/README.md
@@ -0,0 +1,17 @@
+# iris-shared.jsx (formerly shared/iris-shared.jsx) — design reference, not source
+
+Reference implementations from the original Claude.ai design exploration.
+Components in this file (`IrisHeader`, `IrisSearchBar`, `LocalAIBadge`,
+`FitMeter`, `StatusPill`, `ActionRow`, `StreamingText`, …) were ported into
+the live React app under `src/components/` and `src/utils/` — the versions
+here are kept verbatim so a future reader can compare implementations
+against the original prototype.
+
+**Do not import from this file in `src/`.** It runs against a Babel-standalone
+environment in `IRIS Triage.html` and uses inline-style patterns the live
+app intentionally moved away from (the live app uses Tailwind utility
+classes on top of CSS custom properties from `styles/tokens.css`).
+
+If you're trying to "fix" or "consolidate" this file: stop. Edit the live
+component under `src/components/` instead. The existence of this file is
+documentation, not duplication.
diff --git a/shared/iris-shared.jsx b/docs/design-references-shared/iris-shared.jsx
similarity index 100%
rename from shared/iris-shared.jsx
rename to docs/design-references-shared/iris-shared.jsx
diff --git a/src/components/NaturalLanguageInput.test.jsx b/src/components/NaturalLanguageInput.test.jsx
index ca59891..a9fd691 100644
--- a/src/components/NaturalLanguageInput.test.jsx
+++ b/src/components/NaturalLanguageInput.test.jsx
@@ -158,3 +158,41 @@ describe('NaturalLanguageInput — error state', () => {
     expect(screen.getByText(/try again/i)).toBeInTheDocument()
   })
 })
+
+describe('NaturalLanguageInput — queued submit during download', () => {
+  // Locks in the typing-while-loading flow: a user can hit Find trials while
+  // the model is still downloading; the intent is held until status flips to
+  // 'ready' and then auto-fires. Indirect smoke test for the StrictMode
+  // listener fix in useNLP — if the listener didn't re-attach after the dev
+  // double-invoke, the real-world status would never reach 'ready' and the
+  // drain effect (deps [status, pendingSubmit]) would never fire.
+  it('queues submit while downloading, fires extract once status flips to ready', async () => {
+    const extract = vi.fn().mockResolvedValue({
+      condition: 'breast cancer', location: null, age: 58, sex: 'FEMALE',
+      status: 'RECRUITING', phases: [],
+    })
+    useNLP.mockReturnValue({ ...baseHook, status: 'downloading', extract })
+    localStorage.setItem('iris_nlp_enabled', 'true')
+
+    const onExtract = vi.fn()
+    const { rerender } = render(<NaturalLanguageInput onExtract={onExtract} />)
+    fireEvent.click(screen.getByRole('button', { name: /describe in your own words/i }))
+
+    fireEvent.change(screen.getByRole('textbox', { name: /natural language search/i }), {
+      target: { value: '58 with breast cancer' },
+    })
+
+    // Submit while downloading — should queue, NOT fire extract yet.
+    fireEvent.click(screen.getByRole('button', { name: /Run when ready/i }))
+    expect(extract).not.toHaveBeenCalled()
+    expect(screen.getByRole('button', { name: /Queued/i })).toBeInTheDocument()
+
+    // Worker reports ready. In production this comes via the listener that
+    // the StrictMode fix ensures stays attached after the cleanup-remount.
+    useNLP.mockReturnValue({ ...baseHook, status: 'ready', extract })
+    rerender(<NaturalLanguageInput onExtract={onExtract} />)
+
+    await waitFor(() => expect(extract).toHaveBeenCalledWith('58 with breast cancer'))
+    await waitFor(() => expect(onExtract).toHaveBeenCalled())
+  })
+})
diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index 61fda0e..8dfb4bf 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -33,14 +33,20 @@ const EAGER_BATCH_SIZE = 5
 const MOBILE_BREAKPOINT_PX = 820
 const LIST_WIDTH_PX = 400
 
+// matchMedia (not 'resize'): iOS Safari fires 'resize' inconsistently on
+// rotation; matchMedia.change is the reliable signal. Also catches iPad
+// split-screen and browser-window mode switches without a manual resize.
 function useIsMobile() {
+  const query = `(max-width: ${MOBILE_BREAKPOINT_PX}px)`
   const [isMobile, setIsMobile] = useState(() =>
-    typeof window !== 'undefined' && window.innerWidth <= MOBILE_BREAKPOINT_PX
+    typeof window !== 'undefined' && window.matchMedia(query).matches
   )
   useEffect(() => {
-    const onResize = () => setIsMobile(window.innerWidth <= MOBILE_BREAKPOINT_PX)
-    window.addEventListener('resize', onResize)
-    return () => window.removeEventListener('resize', onResize)
+    const mq = window.matchMedia(query)
+    const onChange = (e) => setIsMobile(e.matches)
+    mq.addEventListener('change', onChange)
+    return () => mq.removeEventListener('change', onChange)
+    // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [])
   return isMobile
 }
diff --git a/src/test/setup.js b/src/test/setup.js
index 7c891de..d8865f2 100644
--- a/src/test/setup.js
+++ b/src/test/setup.js
@@ -1,7 +1,23 @@
 import '@testing-library/jest-dom'
-import { afterEach } from 'vitest'
+import { afterEach, vi } from 'vitest'
 import { cleanup } from '@testing-library/react'
 
+// jsdom doesn't ship matchMedia; ResultsList uses it for the mobile
+// breakpoint detector. Stub it to "desktop" (does-not-match) by default
+// so the two-pane code path renders in tests.
+if (typeof window !== 'undefined' && !window.matchMedia) {
+  window.matchMedia = vi.fn().mockImplementation((query) => ({
+    matches: false,
+    media: query,
+    onchange: null,
+    addEventListener: vi.fn(),
+    removeEventListener: vi.fn(),
+    addListener: vi.fn(),       // legacy
+    removeListener: vi.fn(),    // legacy
+    dispatchEvent: vi.fn(),
+  }))
+}
+
 afterEach(() => {
   cleanup()
 })

From d473033a4881f4359a91b9bb6c2398154b9058fa Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 00:56:58 -0400
Subject: [PATCH 28/31] fix(phase-3): re-run classification when patient
 description changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User reported: hitting "Find trials" again with a refined prompt did
nothing visible. The classifier kept its old verdicts; the toolbar
sat at "fit evaluated for N" against a stale patient description.

The classification-reset effect was watching only searchParams. If
the refined prompt extracted to the same condition (e.g., "breast
cancer" → still "breast cancer"), the API result set was cached and
the trials list didn't change either. classifiedRef carried the old
NCT IDs into the next render, so newTrials.length === 0 and the
classification effect short-circuited.

Two changes:
- Add patientDesc to the reset effect's deps so a prompt change
  alone (no condition change) wipes classifications + classifiedRef
  and cancels the in-flight batch via cancelClassifyRef.
- Cancel the simplifier's pending queue too — otherwise an in-flight
  summary keeps the worker busy while the re-classification waits
  for it to drain.

Effect: change the prompt → classifications wipe → fit dots reset to
shimmer → toolbar shows "evaluating fit · 0 of N" again → classifier
re-runs against the new patient → fit dots refill.
---
 src/components/ResultsList.jsx | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index 8dfb4bf..2b72a00 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -117,13 +117,23 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
     nlp.load(model.id, { isThinking: model.isThinking, chatOpts: model.chatOpts })
   }, [canClassify, nlp.status, modelKey, nlp])
 
-  // Reset classification state when the search itself changes.
+  // Reset classification state when EITHER the search params OR the patient
+  // description changes. Including patientDesc handles the case where a user
+  // hits "Find trials" again with a refined prompt that happens to extract
+  // to the same condition: the API result set may be cached (same trials)
+  // but the verdicts are now stale w.r.t. the new patient description, so
+  // classifications + classifiedRef must be wiped and the in-flight batch
+  // cancelled so the next pass re-classifies against the new patient.
+  // Also resets the simplifier so any in-flight summary stops competing
+  // with the re-classification pass.
   useEffect(() => {
     classifiedRef.current = new Set()
     setClassifications(new Map())
     setClassifyProgress({ done: 0, total: 0 })
     if (cancelClassifyRef.current) cancelClassifyRef.current()
-  }, [searchParams])
+    simplifier.cancelPending()
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [searchParams, patientDesc])
 
   function toggleCompare(nctId) {
     setCompareSet(prev => {

From 71926e66cb0e3da9e9d05d4d681ed476379663a6 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 01:03:04 -0400
Subject: [PATCH 29/31] fix(phase-3): address PR #2 review must-fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two correctness fixes from code review:

1. useClassifier.js:34-49 — listener cleanup now also rejects every
   in-flight classify in pendingRef. Previously: on unmount (or
   StrictMode dev cleanup) the listener detached but pendingRef Map
   still held resolve/reject handles — those promises hung forever.
   Now any awaiting caller sees a clean rejection with
   "classifier unmounted" so error paths fire and the queue clears.

2. ResultsList.jsx:118 — replaced `nlp` (whole hook return object)
   in the load-trigger effect deps with `nlpLoad` destructured. The
   useNLP hook doesn't memoize its return object, so each render
   produced a new ref → effect ran every render. The status guard
   prevented redundant load() calls but the body still ran. Now the
   effect only fires when load callback identity changes (it's
   useCallback'd with stable deps so essentially never), or when
   canClassify / status / modelKey change.

Both surfaced by post-merge code review of PR #2; neither blocks
shipping but both are trivial to land before merge.
---
 src/components/ResultsList.jsx | 11 ++++++++---
 src/hooks/useClassifier.js     | 10 ++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index 2b72a00..ad99d6c 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -109,13 +109,18 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
   const canClassify = consented && nlp.webGPUSupported && Boolean(patientDesc)
 
   // Idempotent: worker fast-returns 'ready' if engine already loaded
-  // (e.g. NL extraction loaded it earlier this session).
+  // (e.g. NL extraction loaded it earlier this session). Destructure
+  // load() out of nlp so we can list it in deps directly — `nlp` itself
+  // is a fresh object on every render (useNLP doesn't memoize its
+  // return), and listing the whole hook would re-fire the effect on
+  // every render even when nothing relevant changed.
+  const nlpLoad = nlp.load
   useEffect(() => {
     if (!canClassify) return
     if (nlp.status !== 'idle') return
     const model = NLP_MODELS[modelKey] ?? NLP_MODELS.gemma
-    nlp.load(model.id, { isThinking: model.isThinking, chatOpts: model.chatOpts })
-  }, [canClassify, nlp.status, modelKey, nlp])
+    nlpLoad(model.id, { isThinking: model.isThinking, chatOpts: model.chatOpts })
+  }, [canClassify, nlp.status, modelKey, nlpLoad])
 
   // Reset classification state when EITHER the search params OR the patient
   // description changes. Including patientDesc handles the case where a user
diff --git a/src/hooks/useClassifier.js b/src/hooks/useClassifier.js
index 52b1ba1..2004aac 100644
--- a/src/hooks/useClassifier.js
+++ b/src/hooks/useClassifier.js
@@ -35,9 +35,19 @@ export function useClassifier() {
   }
 
   useEffect(() => {
+    const pending = pendingRef.current
     return () => {
       detachRef.current?.()
       detachRef.current = null
+      // Reject every in-flight classify so awaiting callers don't hang
+      // forever when the component unmounts mid-batch (or during a
+      // StrictMode dev double-invoke). Without this, the listener
+      // detaches but the pendingRef Map still holds resolve/reject
+      // handles whose promise will never settle.
+      for (const { reject } of pending.values()) {
+        reject(new Error('classifier unmounted'))
+      }
+      pending.clear()
     }
   }, [])
 

From e3cefa6787a467cffa40f1e655578883187a5577 Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 01:13:30 -0400
Subject: [PATCH 30/31] chore(phase-3): post-review cleanup + disable
 classify-in-results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bundles all eight follow-up items the reviewer flagged on PR #2,
plus the user-requested decision to disable classify-driven UI in
the results view (the fit dots had no actionable consequence
without sort wiring, so they were just decoration).

User-visible
- Classify-in-results gated behind ENABLE_CLASSIFY_IN_RESULTS = false
  (ResultsList.jsx). The classifier hook + worker task + ?test=classify
  harness all stay live for prompt iteration; only the in-app fit
  dots and "evaluating fit · X of N" caption are suppressed. Flip
  the constant to true once "Best fit" sort lands so the dots become
  actionable.

Refactors / dedupe
- Moved 305-line SAMPLE_TRIALS array + USER_PRESETS list out of
  ClassificationHarness.jsx into a sibling fixtures file
  (ClassificationHarness.fixtures.js). Harness file dropped from
  924 → ~550 lines.
- Harness now imports DEFAULT_CLASSIFY_PROMPT and parseVerdict from
  utils/classifyTrial.js instead of duplicating both verbatim. Single
  source of truth — prompt tweaks no longer have to be made in two
  places.
- useIsMobile hoisted out of ResultsList.jsx into hooks/useIsMobile.js
  so any other component that needs the same breakpoint can import
  it without copy-pasting.

Small fixes
- patientDesc in ResultsList wrapped in useMemo (was recomputed every
  render — value-equality made it work but the implicit reliance
  was fragile).
- FitDot in TriageRow folds the model's reason into aria-label so
  screen readers and keyboard-focused users get the same context as
  a sighted hover (title alone reaches neither group reliably). Same
  string in both attrs. Added role="img" since it carries semantic
  content now.

New worker task type
- Added 'translate' message type to nlp.worker.js. max_tokens 200
  (vs classify's 80) so verbose-language paraphrases fit. Same low
  temperature (0.1) since translation wants fidelity, not creativity.
- useClassifier now exposes both classifyOne and translateOne, sharing
  the single promise chain (engine is single-threaded regardless of
  task type). handleMessage routes done/error events for both via a
  single isDone/isError predicate.
- Harness translate-first toggle now uses translateOne instead of
  overloading classifyOne — clarifies intent and lets the worker
  apply the right max_tokens budget.

New tests
- useClassifier.test.js: three tests covering serialization
  (concurrent calls post FIFO), error isolation (one rejection
  doesn't poison the queue), and unmount cleanup (pending tasks
  reject with 'classifier unmounted'). Mock the shared worker via
  vi.mock so the tests don't touch real WebLLM. 197/197 pass.

Skipped from the review
- "Simplifier idle gap during model load" — the gap is intentional
  per Handoff Phase 3 step 6 (stage-2 only after stage-1 completes),
  and the pipeline-stage caption already addresses the UX. Reviewer's
  suggestion to "let the simplifier proceed" would re-introduce the
  classifier-vs-simplifier worker contention we explicitly fixed.
---
 .../ClassificationHarness.fixtures.js         | 332 +++++++++++++++
 src/components/ClassificationHarness.jsx      | 388 +-----------------
 src/components/ResultsList.jsx                |  35 +-
 src/components/TriageRow.jsx                  |  17 +-
 src/hooks/useClassifier.js                    |  55 +--
 src/hooks/useClassifier.test.js               |  94 +++++
 src/hooks/useIsMobile.js                      |  21 +
 src/workers/nlp.worker.js                     |  29 ++
 8 files changed, 538 insertions(+), 433 deletions(-)
 create mode 100644 src/components/ClassificationHarness.fixtures.js
 create mode 100644 src/hooks/useClassifier.test.js
 create mode 100644 src/hooks/useIsMobile.js

diff --git a/src/components/ClassificationHarness.fixtures.js b/src/components/ClassificationHarness.fixtures.js
new file mode 100644
index 0000000..c33b3bc
--- /dev/null
+++ b/src/components/ClassificationHarness.fixtures.js
@@ -0,0 +1,332 @@
+// Fixture data for the dev-only Classification Harness (?test=classify).
+// Lives next to the component but split out because the trial array is
+// 300+ lines and made the harness file hard to navigate when iterating
+// on prompts vs data.
+//
+// `outOfScope: true` flags trials the CT.gov API would NOT return for
+// a breast-cancer search — kept in the fixture as wrong-condition
+// stress tests, but the harness's "production-realistic agreement"
+// toggle excludes them from the headline metric.
+
+export const SAMPLE_TRIALS = [
+  {
+    nctId: 'NCT05952557',
+    title: 'Phase IIIb Study of Ribociclib + Endocrine Therapy in Early Breast Cancer',
+    eligibility: 'Inclusion: Adult female, ≥18 years. HR-positive, HER2-negative early breast cancer. Completed definitive surgery. Postmenopausal status confirmed. ECOG 0-1. Adequate organ function. Exclusion: Prior CDK4/6 inhibitor. Pregnancy or breastfeeding. Active second malignancy.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT06104020',
+    title: 'Sacituzumab Govitecan in Metastatic Triple-Negative Breast Cancer',
+    eligibility: 'Inclusion: Adult, any sex. Histologically confirmed metastatic triple-negative breast cancer (ER<1%, PR<1%, HER2-negative). At least one prior line of systemic therapy in metastatic setting. ECOG 0-2. Measurable disease per RECIST 1.1. Exclusion: Active CNS metastases. Prior topoisomerase I inhibitor.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05887492',
+    title: 'Adaptive Radiation Boost in Locally Advanced HER2+ Breast Cancer',
+    eligibility: 'Inclusion: Adult female. HER2-positive breast cancer confirmed by IHC 3+ or FISH-positive. Stage II-III disease. Completed neoadjuvant chemotherapy. ECOG 0-1. Exclusion: Prior radiation to chest. Pregnancy.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT06221340',
+    title: 'Aerobic Exercise During Adjuvant Chemo for Breast Cancer Survivors',
+    eligibility: 'Inclusion: Adult, any sex. Breast cancer, any stage. Currently receiving or scheduled for adjuvant chemotherapy. Cleared by oncologist for moderate exercise. Exclusion: Cardiac contraindications.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT04123456',
+    title: 'Pembrolizumab in Advanced Non-Small Cell Lung Cancer',
+    eligibility: 'Inclusion: Adult. Histologically confirmed advanced NSCLC. PD-L1 expression ≥50%. ECOG 0-1. Exclusion: Active autoimmune disease. Prior immunotherapy.',
+    expected: 'UNLIKELY',
+    outOfScope: true, // NSCLC — wouldn't appear in a breast-cancer API search
+  },
+  {
+    nctId: 'NCT05123987',
+    title: 'Targeted Therapy in Pediatric Acute Lymphoblastic Leukemia',
+    eligibility: 'Inclusion: Pediatric patients aged 2-17 years. Newly diagnosed ALL. Exclusion: Adults. Prior chemotherapy.',
+    expected: 'UNLIKELY',
+    outOfScope: true, // Pediatric ALL — wouldn't appear in a breast-cancer API search
+  },
+
+  // ─── Subtype-gated breast cancer trials — POSSIBLE without confirmed subtype ───
+  {
+    nctId: 'NCT05300100',
+    title: 'Tucatinib + Trastuzumab in HER2-Positive Metastatic Breast Cancer',
+    eligibility: 'Inclusion: Adult, any sex, ≥18 years. Histologically confirmed HER2-positive metastatic breast cancer (IHC 3+ or FISH-amplified). At least 2 prior HER2-directed therapies. ECOG 0-1. Exclusion: Untreated brain metastases. Prior tucatinib.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05400201',
+    title: 'Olaparib Maintenance in BRCA-Mutated HER2-Negative Breast Cancer',
+    eligibility: 'Inclusion: Adult female. HER2-negative breast cancer with germline BRCA1 or BRCA2 mutation (confirmed by central testing). High-risk early disease following adjuvant chemotherapy. Postmenopausal or premenopausal with ovarian suppression. Exclusion: Prior PARP inhibitor.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05511223',
+    title: 'CDK4/6 Inhibitor Switch in Hormone-Receptor-Positive Advanced Breast Cancer',
+    eligibility: 'Inclusion: Adult women, postmenopausal. HR-positive, HER2-negative advanced or metastatic breast cancer. Disease progression on a prior CDK4/6 inhibitor. ECOG 0-2.',
+    expected: 'POSSIBLE',
+  },
+
+  // ─── Strong matches for a 58yo with breast cancer ───
+  {
+    nctId: 'NCT05633445',
+    title: 'Cognitive Behavioral Therapy for Cancer-Related Fatigue',
+    eligibility: 'Inclusion: Adults ≥18 years with any solid tumor diagnosis (breast, colon, lung, prostate, etc.). Currently in active treatment or within 5 years of treatment completion. Self-reported fatigue ≥4 on a 0-10 scale. Exclusion: Severe untreated depression. Inability to attend weekly sessions.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT05755677',
+    title: 'Lymphedema Surveillance Program After Breast Cancer Surgery',
+    eligibility: 'Inclusion: Adult female ≥18 years. History of breast cancer treated with axillary surgery (sentinel lymph node biopsy or axillary dissection). Within 3 years of surgery. Exclusion: Pre-existing lymphedema. Current breast cancer recurrence.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT05822334',
+    title: 'Mindfulness-Based Stress Reduction for Breast Cancer Survivors',
+    eligibility: 'Inclusion: Adult women ≥21 years. Diagnosed with breast cancer (any stage). Completed primary treatment within the past 5 years OR currently on adjuvant endocrine therapy. Exclusion: Active psychosis. Prior MBSR participation.',
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT05901128',
+    title: 'Vaginal Estrogen Safety Study in Postmenopausal Breast Cancer Survivors',
+    eligibility: 'Inclusion: Postmenopausal women ages 45-75 with a history of HR-positive or HR-negative breast cancer. Disease-free for ≥1 year. Genitourinary symptoms of menopause. Stable on aromatase inhibitor or tamoxifen, or treatment-free. Exclusion: Current metastatic disease.',
+    expected: 'LIKELY',
+  },
+
+  // ─── Wrong condition / wrong demographic — clear UNLIKELY ───
+  {
+    nctId: 'NCT04567890',
+    title: 'Pembrolizumab in Advanced Melanoma',
+    eligibility: 'Inclusion: Adults with histologically confirmed unresectable Stage III or Stage IV melanoma. ECOG 0-1. No prior systemic therapy for advanced disease. Exclusion: Active autoimmune disease.',
+    expected: 'UNLIKELY',
+    outOfScope: true,
+  },
+  {
+    nctId: 'NCT04678901',
+    title: 'Apixaban vs. Warfarin in Atrial Fibrillation',
+    eligibility: 'Inclusion: Adults ≥18 years with non-valvular atrial fibrillation. CHA2DS2-VASc score ≥2. Exclusion: Mechanical heart valve. Active bleeding.',
+    expected: 'UNLIKELY',
+    outOfScope: true,
+  },
+  {
+    nctId: 'NCT04789012',
+    title: 'GLP-1 Agonist for Weight Management in Type 2 Diabetes',
+    eligibility: 'Inclusion: Adults 18-75 with Type 2 diabetes mellitus. BMI ≥30. HbA1c 7.0-10.0%. Exclusion: Type 1 diabetes. Active malignancy within 5 years. History of pancreatitis.',
+    expected: 'UNLIKELY',
+    outOfScope: true,
+  },
+  {
+    nctId: 'NCT04890123',
+    title: 'Robotic Prostatectomy Outcomes in Localized Prostate Cancer',
+    eligibility: 'Inclusion: Men ≥40 years with biopsy-confirmed clinically localized prostate cancer (T1-T2). Candidate for radical prostatectomy. Exclusion: Prior pelvic surgery or radiation.',
+    expected: 'UNLIKELY',
+    outOfScope: true,
+  },
+  {
+    nctId: 'NCT04901234',
+    title: 'Pediatric Vaccine Immunogenicity Study',
+    eligibility: 'Inclusion: Healthy children aged 6 months to 5 years. Up to date on routine immunizations. Exclusion: Immunocompromised. Recent illness within 14 days.',
+    expected: 'UNLIKELY',
+    outOfScope: true,
+  },
+
+  // ─── Edge cases — should challenge the model ───
+  {
+    nctId: 'NCT05012345',
+    title: 'Palliative Care Integration in Patients with Advanced Solid Tumors',
+    eligibility: 'Inclusion: Adults ≥18 years with advanced (Stage IV) solid tumor of any primary site (breast, lung, GI, GU, GYN). Estimated prognosis 6-24 months. ECOG 0-3. Exclusion: Currently enrolled in hospice.',
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT05123450',
+    title: 'Premenopausal Breast Cancer: Ovarian Function Suppression Trial',
+    eligibility: 'Inclusion: Premenopausal women ages 18-45 with newly diagnosed HR-positive early breast cancer. Confirmed premenopausal by FSH and estradiol levels. Exclusion: Postmenopausal status. Prior ovarian suppression therapy.',
+    expected: 'UNLIKELY',
+  },
+
+  // ─── Realistic-length eligibility (~2-3.5kB each) — stress-tests how the
+  //     model handles formal CT.gov noise and how truncation affects accuracy.
+  //     Try these with eligMax = 800 vs 3000 vs 6000 to see the trade-off.
+  {
+    nctId: 'NCT-LONG-01',
+    title: 'Phase II Study of Sacituzumab Govitecan-hziy in Patients with HR-Positive, HER2-Negative Metastatic Breast Cancer After Endocrine Therapy and CDK4/6 Inhibitor',
+    eligibility: `Inclusion Criteria:
+
+1. Female participants ≥18 years of age at the time of signing informed consent.
+2. Histologically or cytologically confirmed adenocarcinoma of the breast that is metastatic or locally advanced and not amenable to curative resection or radiotherapy.
+3. Documentation of estrogen receptor (ER)-positive (≥1% staining by IHC) and/or progesterone receptor (PR)-positive (≥1% staining by IHC) tumor status, in accordance with ASCO/CAP guidelines.
+4. Documentation of HER2-negative status defined as IHC 0, IHC 1+, or IHC 2+ with negative in situ hybridization (ISH), per ASCO/CAP guidelines.
+5. Disease progression on or after at least one prior CDK4/6 inhibitor (palbociclib, ribociclib, or abemaciclib) administered for advanced or metastatic disease, in combination with an aromatase inhibitor or fulvestrant.
+6. Disease progression on or after at least one and no more than two prior endocrine therapies (e.g., aromatase inhibitor, fulvestrant, tamoxifen) for advanced or metastatic disease.
+7. No more than one prior chemotherapy regimen for metastatic disease.
+8. Postmenopausal status, OR premenopausal/perimenopausal women who agree to receive concurrent ovarian function suppression with a luteinizing hormone-releasing hormone (LHRH) agonist throughout study treatment.
+9. Measurable disease per RECIST v1.1, or non-measurable bone-only disease assessable per protocol-specified criteria.
+10. ECOG performance status 0 or 1.
+11. Adequate organ function:
+    - Absolute neutrophil count (ANC) ≥1.5 × 10^9/L
+    - Platelets ≥100 × 10^9/L
+    - Hemoglobin ≥9.0 g/dL (transfusion permitted)
+    - Total bilirubin ≤1.5 × ULN (≤3 × ULN for participants with documented Gilbert syndrome)
+    - AST and ALT ≤2.5 × ULN (≤5 × ULN if liver metastases present)
+    - Creatinine clearance ≥50 mL/min by Cockcroft-Gault equation
+    - INR and aPTT ≤1.5 × ULN unless on anticoagulants
+12. Resolution of all acute toxic effects of prior anti-cancer therapy or surgical procedures to NCI CTCAE v5.0 Grade ≤1 (except alopecia and Grade 2 neuropathy).
+13. Willingness to provide tumor tissue (archival or fresh biopsy) for biomarker analyses.
+
+Exclusion Criteria:
+
+1. Prior treatment with sacituzumab govitecan or any other Trop-2-directed therapy.
+2. Prior treatment with an antibody-drug conjugate containing a topoisomerase I inhibitor payload (e.g., trastuzumab deruxtecan).
+3. Active CNS metastases. Participants with previously treated, asymptomatic CNS metastases are eligible if clinically stable for ≥4 weeks off corticosteroids and anticonvulsants.
+4. Leptomeningeal disease.
+5. Known active infection requiring systemic therapy, including untreated HIV, active HBV (HBsAg positive or HBV DNA detectable), or active HCV (HCV RNA detectable).
+6. Significant cardiovascular disease, including: NYHA Class III or IV congestive heart failure, myocardial infarction or unstable angina within 6 months, uncontrolled arrhythmia, baseline QTcF >470 ms.
+7. History of another malignancy within 3 years, except for adequately treated non-melanoma skin cancer, in situ cervical or breast cancer, or low-risk localized prostate cancer on active surveillance.
+8. Known hypersensitivity to irinotecan or any component of the study drug formulation.
+9. Pregnant or breastfeeding women. Women of childbearing potential must agree to use highly effective contraception during the study and for 6 months after the last dose.
+10. Concurrent participation in another therapeutic clinical trial.
+11. Major surgery within 4 weeks prior to first dose.
+12. Live vaccines within 30 days prior to first dose.`,
+    expected: 'POSSIBLE',
+  },
+  {
+    nctId: 'NCT-LONG-02',
+    title: 'Randomized Phase III Trial of Adjuvant Endocrine Therapy ± Abemaciclib in Postmenopausal Women with HR-Positive, HER2-Negative, Node-Positive Early Breast Cancer at High Risk of Recurrence',
+    eligibility: `Inclusion Criteria:
+
+1. Female, postmenopausal at the time of randomization. Postmenopausal status defined as: (a) prior bilateral oophorectomy, (b) age ≥60 years, OR (c) age <60 with amenorrhea ≥12 months in the absence of chemotherapy, tamoxifen, or ovarian suppression AND FSH and estradiol in the postmenopausal range.
+2. Age 18 to 75 years inclusive at the time of consent.
+3. ECOG performance status of 0, 1, or 2.
+4. Histologically confirmed invasive breast carcinoma. Multicentric or multifocal disease is allowed if all foci meet eligibility.
+5. Hormone receptor-positive disease, defined as ≥1% of tumor cells staining positive for estrogen receptor and/or progesterone receptor by IHC, per ASCO/CAP guidelines.
+6. HER2-negative disease, defined as IHC 0, 1+, or 2+ with negative reflex ISH testing per ASCO/CAP guidelines.
+7. Stage II or III disease with high-risk pathologic features, defined as ≥1 of the following:
+    - ≥4 positive axillary lymph nodes, OR
+    - 1-3 positive axillary lymph nodes AND tumor size ≥5 cm, OR
+    - 1-3 positive axillary lymph nodes AND histologic grade 3, OR
+    - 1-3 positive axillary lymph nodes AND Ki-67 ≥20%
+8. Definitive surgical treatment of primary tumor with negative margins (lumpectomy with whole-breast irradiation OR mastectomy with or without post-mastectomy radiation per institutional standard).
+9. Completion of any neoadjuvant or adjuvant chemotherapy at least 21 days but no more than 16 months prior to randomization.
+10. Initiation of adjuvant endocrine therapy (aromatase inhibitor, with or without LHRH agonist) is permitted, but participants must not have received endocrine therapy for more than 12 weeks prior to randomization.
+11. Adequate organ function within 14 days of randomization:
+    - ANC ≥1.5 × 10^9/L
+    - Platelets ≥100 × 10^9/L
+    - Hemoglobin ≥10.0 g/dL
+    - Total bilirubin ≤1.5 × ULN
+    - AST/ALT ≤2.5 × ULN
+    - Creatinine clearance ≥50 mL/min
+12. Negative serum or urine pregnancy test for participants of childbearing potential.
+
+Exclusion Criteria:
+
+1. Stage IV (metastatic) breast cancer or evidence of distant metastases on staging imaging.
+2. Inflammatory breast cancer.
+3. Bilateral invasive breast cancer.
+4. Prior treatment with any CDK4/6 inhibitor in any setting.
+5. Prior anti-cancer therapy other than chemotherapy and locoregional therapy for the current breast cancer diagnosis.
+6. History of another malignancy within 5 years prior to randomization, except adequately treated non-melanoma skin cancer, in situ cervical cancer, or contralateral DCIS.
+7. Active or chronic hepatitis B or C infection, or known HIV infection.
+8. Significant uncontrolled cardiovascular disease: NYHA Class III/IV heart failure, myocardial infarction within 6 months, ventricular arrhythmia requiring treatment.
+9. History of interstitial lung disease or pneumonitis requiring corticosteroids.
+10. Major surgery (other than breast cancer surgery) within 28 days of randomization.
+11. Receiving strong CYP3A inhibitors or inducers within 14 days that cannot be discontinued.
+12. Inability to swallow oral medications or significant malabsorption.
+13. Pregnant or breastfeeding (premenopausal participants only — see inclusion criterion 1).`,
+    expected: 'LIKELY',
+  },
+  {
+    nctId: 'NCT-LONG-03',
+    title: 'Phase III Study of Pembrolizumab Plus Chemotherapy versus Chemotherapy Alone for First-Line Treatment of Metastatic Squamous Non-Small Cell Lung Cancer',
+    outOfScope: true,
+    eligibility: `Inclusion Criteria:
+
+1. Histologically or cytologically confirmed Stage IV squamous non-small cell lung cancer (NSCLC) per AJCC 8th edition.
+2. Male or female ≥18 years of age.
+3. No prior systemic therapy for metastatic NSCLC. Prior adjuvant or neoadjuvant chemotherapy is allowed if completed ≥6 months prior to enrollment.
+4. Measurable disease per RECIST v1.1.
+5. Provision of a tumor tissue sample (archival or fresh biopsy) adequate for PD-L1 IHC testing using the 22C3 pharmDx assay.
+6. ECOG performance status 0 or 1.
+7. Life expectancy ≥3 months.
+8. Adequate organ function within 10 days of randomization:
+    - ANC ≥1.5 × 10^9/L without G-CSF support
+    - Platelets ≥100 × 10^9/L without transfusion
+    - Hemoglobin ≥9.0 g/dL
+    - Total bilirubin ≤1.5 × ULN
+    - AST/ALT ≤2.5 × ULN (≤5 × ULN if liver involvement)
+    - Creatinine clearance ≥45 mL/min
+    - INR/aPTT ≤1.5 × ULN
+9. Female participants of childbearing potential and male participants with partners of childbearing potential must agree to use effective contraception throughout treatment and for 120 days after last dose.
+
+Exclusion Criteria:
+
+1. Histology of mixed small cell and non-small cell lung cancer, or predominantly non-squamous histology.
+2. Known sensitizing EGFR mutation, ALK rearrangement, ROS1 rearrangement, BRAF V600E mutation, or other actionable alteration for which an approved targeted therapy is the standard of care.
+3. Prior treatment with any PD-1, PD-L1, PD-L2, or CTLA-4 inhibitor.
+4. Active autoimmune disease requiring systemic immunosuppression within 2 years. Replacement therapy (e.g., thyroxine, insulin, physiologic corticosteroids) is permitted.
+5. History of pneumonitis requiring corticosteroids, or active pneumonitis.
+6. Active CNS metastases or carcinomatous meningitis. Participants with previously treated, asymptomatic CNS metastases stable for ≥4 weeks may be eligible.
+7. Active infection requiring systemic therapy.
+8. Known active HIV, HBV, or HCV infection.
+9. Live vaccine within 30 days of first dose.
+10. History of solid organ or allogeneic stem cell transplant.
+11. Pregnant or breastfeeding women.
+12. History of another malignancy within 3 years, except for adequately treated non-melanoma skin cancer or in situ disease.`,
+    expected: 'UNLIKELY',
+  },
+  {
+    nctId: 'NCT-LONG-04',
+    title: 'Multicenter Randomized Trial of Empagliflozin in Patients with Heart Failure with Preserved Ejection Fraction and Type 2 Diabetes',
+    outOfScope: true,
+    eligibility: `Inclusion Criteria:
+
+1. Adults aged 40 to 85 years at consent.
+2. Documented diagnosis of heart failure with preserved ejection fraction (HFpEF):
+    - Left ventricular ejection fraction (LVEF) ≥50% on echocardiogram within the past 12 months
+    - NYHA functional class II, III, or IV
+    - Elevated NT-proBNP ≥300 pg/mL (or ≥600 pg/mL if atrial fibrillation present)
+    - Structural heart disease on echocardiography (LV hypertrophy or left atrial enlargement) OR documented prior HF hospitalization
+3. Documented Type 2 diabetes mellitus (T2DM) per ADA criteria, with HbA1c 6.5% to 10.0% at screening.
+4. Stable background heart failure therapy for ≥4 weeks (diuretic if indicated; ACEi/ARB/ARNI per guideline; beta-blocker per guideline).
+5. eGFR ≥25 mL/min/1.73m^2 by CKD-EPI equation.
+6. Body mass index 20 to 45 kg/m^2.
+7. Able and willing to provide written informed consent and adhere to study procedures.
+
+Exclusion Criteria:
+
+1. Type 1 diabetes mellitus.
+2. History of diabetic ketoacidosis within 12 months.
+3. LVEF <50% on most recent echocardiogram.
+4. Acute decompensated heart failure requiring IV diuretics within 4 weeks of screening.
+5. Acute coronary syndrome, stroke, or transient ischemic attack within 90 days.
+6. Planned cardiac surgery, percutaneous coronary intervention, or device implantation within 90 days.
+7. Symptomatic hypotension or systolic blood pressure <100 mmHg at screening.
+8. Significant valvular heart disease (severe aortic stenosis, severe mitral regurgitation requiring surgery).
+9. Hypertrophic cardiomyopathy, infiltrative cardiomyopathy, or constrictive pericarditis.
+10. eGFR <25 mL/min/1.73m^2 or end-stage renal disease requiring dialysis.
+11. Known active malignancy requiring treatment within the past 12 months. Participants with a history of cancer who are disease-free for >12 months are eligible.
+12. Severe hepatic impairment (Child-Pugh C).
+13. Pregnancy or breastfeeding.
+14. Known hypersensitivity to SGLT2 inhibitors.
+15. Participation in another interventional clinical trial within 30 days.
+16. Life expectancy <12 months due to non-cardiovascular cause.`,
+    expected: 'UNLIKELY',
+  },
+]
+
+// Patient description presets for multilingual + edge-case validation. Same
+// 58yo woman with breast cancer in Boston, expressed in different languages
+// and registers (formal, terse, etc.) so we can stress-test the model's
+// understanding without changing the underlying clinical signal.
+export const USER_PRESETS = [
+  { id: 'en',     label: 'English',                  text: "I'm 58 years old with breast cancer in Boston" },
+  { id: 'en-2',   label: 'English (more detail)',    text: "58-year-old woman in Boston, postmenopausal, recently diagnosed with breast cancer, looking for post-chemo treatment options" },
+  { id: 'es',     label: 'Spanish (Español)',        text: 'Tengo 58 años, vivo en Boston y tengo cáncer de mama' },
+  { id: 'es-2',   label: 'Spanish (more detail)',    text: 'Soy mujer de 58 años, posmenopáusica, vivo en Boston. Me diagnosticaron cáncer de mama y busco opciones de tratamiento después de quimioterapia.' },
+  { id: 'zh',     label: 'Mandarin (中文)',          text: '我58岁，住在波士顿，患有乳腺癌' },
+  { id: 'ar',     label: 'Arabic (العربية)',        text: 'أنا امرأة عمري 58 عامًا أعيش في بوسطن ومصابة بسرطان الثدي' },
+  { id: 'pt',     label: 'Portuguese (Português)',   text: 'Tenho 58 anos, moro em Boston e tenho câncer de mama' },
+  { id: 'fr',     label: 'French (Français)',        text: "J'ai 58 ans, je vis à Boston et j'ai un cancer du sein" },
+  { id: 'terse',  label: 'Terse / fragments',        text: '58F, BC, Boston' },
+]
+
diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 6b8cc36..0a5bdd5 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -2,384 +2,8 @@ import { useState, useEffect } from 'react'
 import { useNLP } from '../hooks/useNLP'
 import { useClassifier } from '../hooks/useClassifier'
 import { NLP_MODELS, resolveModelKey } from '../utils/nlpModels'
-
-const SAMPLE_TRIALS = [
-  {
-    nctId: 'NCT05952557',
-    title: 'Phase IIIb Study of Ribociclib + Endocrine Therapy in Early Breast Cancer',
-    eligibility: 'Inclusion: Adult female, ≥18 years. HR-positive, HER2-negative early breast cancer. Completed definitive surgery. Postmenopausal status confirmed. ECOG 0-1. Adequate organ function. Exclusion: Prior CDK4/6 inhibitor. Pregnancy or breastfeeding. Active second malignancy.',
-    expected: 'LIKELY',
-  },
-  {
-    nctId: 'NCT06104020',
-    title: 'Sacituzumab Govitecan in Metastatic Triple-Negative Breast Cancer',
-    eligibility: 'Inclusion: Adult, any sex. Histologically confirmed metastatic triple-negative breast cancer (ER<1%, PR<1%, HER2-negative). At least one prior line of systemic therapy in metastatic setting. ECOG 0-2. Measurable disease per RECIST 1.1. Exclusion: Active CNS metastases. Prior topoisomerase I inhibitor.',
-    expected: 'POSSIBLE',
-  },
-  {
-    nctId: 'NCT05887492',
-    title: 'Adaptive Radiation Boost in Locally Advanced HER2+ Breast Cancer',
-    eligibility: 'Inclusion: Adult female. HER2-positive breast cancer confirmed by IHC 3+ or FISH-positive. Stage II-III disease. Completed neoadjuvant chemotherapy. ECOG 0-1. Exclusion: Prior radiation to chest. Pregnancy.',
-    expected: 'POSSIBLE',
-  },
-  {
-    nctId: 'NCT06221340',
-    title: 'Aerobic Exercise During Adjuvant Chemo for Breast Cancer Survivors',
-    eligibility: 'Inclusion: Adult, any sex. Breast cancer, any stage. Currently receiving or scheduled for adjuvant chemotherapy. Cleared by oncologist for moderate exercise. Exclusion: Cardiac contraindications.',
-    expected: 'LIKELY',
-  },
-  {
-    nctId: 'NCT04123456',
-    title: 'Pembrolizumab in Advanced Non-Small Cell Lung Cancer',
-    eligibility: 'Inclusion: Adult. Histologically confirmed advanced NSCLC. PD-L1 expression ≥50%. ECOG 0-1. Exclusion: Active autoimmune disease. Prior immunotherapy.',
-    expected: 'UNLIKELY',
-    outOfScope: true, // NSCLC — wouldn't appear in a breast-cancer API search
-  },
-  {
-    nctId: 'NCT05123987',
-    title: 'Targeted Therapy in Pediatric Acute Lymphoblastic Leukemia',
-    eligibility: 'Inclusion: Pediatric patients aged 2-17 years. Newly diagnosed ALL. Exclusion: Adults. Prior chemotherapy.',
-    expected: 'UNLIKELY',
-    outOfScope: true, // Pediatric ALL — wouldn't appear in a breast-cancer API search
-  },
-
-  // ─── Subtype-gated breast cancer trials — POSSIBLE without confirmed subtype ───
-  {
-    nctId: 'NCT05300100',
-    title: 'Tucatinib + Trastuzumab in HER2-Positive Metastatic Breast Cancer',
-    eligibility: 'Inclusion: Adult, any sex, ≥18 years. Histologically confirmed HER2-positive metastatic breast cancer (IHC 3+ or FISH-amplified). At least 2 prior HER2-directed therapies. ECOG 0-1. Exclusion: Untreated brain metastases. Prior tucatinib.',
-    expected: 'POSSIBLE',
-  },
-  {
-    nctId: 'NCT05400201',
-    title: 'Olaparib Maintenance in BRCA-Mutated HER2-Negative Breast Cancer',
-    eligibility: 'Inclusion: Adult female. HER2-negative breast cancer with germline BRCA1 or BRCA2 mutation (confirmed by central testing). High-risk early disease following adjuvant chemotherapy. Postmenopausal or premenopausal with ovarian suppression. Exclusion: Prior PARP inhibitor.',
-    expected: 'POSSIBLE',
-  },
-  {
-    nctId: 'NCT05511223',
-    title: 'CDK4/6 Inhibitor Switch in Hormone-Receptor-Positive Advanced Breast Cancer',
-    eligibility: 'Inclusion: Adult women, postmenopausal. HR-positive, HER2-negative advanced or metastatic breast cancer. Disease progression on a prior CDK4/6 inhibitor. ECOG 0-2.',
-    expected: 'POSSIBLE',
-  },
-
-  // ─── Strong matches for a 58yo with breast cancer ───
-  {
-    nctId: 'NCT05633445',
-    title: 'Cognitive Behavioral Therapy for Cancer-Related Fatigue',
-    eligibility: 'Inclusion: Adults ≥18 years with any solid tumor diagnosis (breast, colon, lung, prostate, etc.). Currently in active treatment or within 5 years of treatment completion. Self-reported fatigue ≥4 on a 0-10 scale. Exclusion: Severe untreated depression. Inability to attend weekly sessions.',
-    expected: 'LIKELY',
-  },
-  {
-    nctId: 'NCT05755677',
-    title: 'Lymphedema Surveillance Program After Breast Cancer Surgery',
-    eligibility: 'Inclusion: Adult female ≥18 years. History of breast cancer treated with axillary surgery (sentinel lymph node biopsy or axillary dissection). Within 3 years of surgery. Exclusion: Pre-existing lymphedema. Current breast cancer recurrence.',
-    expected: 'LIKELY',
-  },
-  {
-    nctId: 'NCT05822334',
-    title: 'Mindfulness-Based Stress Reduction for Breast Cancer Survivors',
-    eligibility: 'Inclusion: Adult women ≥21 years. Diagnosed with breast cancer (any stage). Completed primary treatment within the past 5 years OR currently on adjuvant endocrine therapy. Exclusion: Active psychosis. Prior MBSR participation.',
-    expected: 'LIKELY',
-  },
-  {
-    nctId: 'NCT05901128',
-    title: 'Vaginal Estrogen Safety Study in Postmenopausal Breast Cancer Survivors',
-    eligibility: 'Inclusion: Postmenopausal women ages 45-75 with a history of HR-positive or HR-negative breast cancer. Disease-free for ≥1 year. Genitourinary symptoms of menopause. Stable on aromatase inhibitor or tamoxifen, or treatment-free. Exclusion: Current metastatic disease.',
-    expected: 'LIKELY',
-  },
-
-  // ─── Wrong condition / wrong demographic — clear UNLIKELY ───
-  {
-    nctId: 'NCT04567890',
-    title: 'Pembrolizumab in Advanced Melanoma',
-    eligibility: 'Inclusion: Adults with histologically confirmed unresectable Stage III or Stage IV melanoma. ECOG 0-1. No prior systemic therapy for advanced disease. Exclusion: Active autoimmune disease.',
-    expected: 'UNLIKELY',
-    outOfScope: true,
-  },
-  {
-    nctId: 'NCT04678901',
-    title: 'Apixaban vs. Warfarin in Atrial Fibrillation',
-    eligibility: 'Inclusion: Adults ≥18 years with non-valvular atrial fibrillation. CHA2DS2-VASc score ≥2. Exclusion: Mechanical heart valve. Active bleeding.',
-    expected: 'UNLIKELY',
-    outOfScope: true,
-  },
-  {
-    nctId: 'NCT04789012',
-    title: 'GLP-1 Agonist for Weight Management in Type 2 Diabetes',
-    eligibility: 'Inclusion: Adults 18-75 with Type 2 diabetes mellitus. BMI ≥30. HbA1c 7.0-10.0%. Exclusion: Type 1 diabetes. Active malignancy within 5 years. History of pancreatitis.',
-    expected: 'UNLIKELY',
-    outOfScope: true,
-  },
-  {
-    nctId: 'NCT04890123',
-    title: 'Robotic Prostatectomy Outcomes in Localized Prostate Cancer',
-    eligibility: 'Inclusion: Men ≥40 years with biopsy-confirmed clinically localized prostate cancer (T1-T2). Candidate for radical prostatectomy. Exclusion: Prior pelvic surgery or radiation.',
-    expected: 'UNLIKELY',
-    outOfScope: true,
-  },
-  {
-    nctId: 'NCT04901234',
-    title: 'Pediatric Vaccine Immunogenicity Study',
-    eligibility: 'Inclusion: Healthy children aged 6 months to 5 years. Up to date on routine immunizations. Exclusion: Immunocompromised. Recent illness within 14 days.',
-    expected: 'UNLIKELY',
-    outOfScope: true,
-  },
-
-  // ─── Edge cases — should challenge the model ───
-  {
-    nctId: 'NCT05012345',
-    title: 'Palliative Care Integration in Patients with Advanced Solid Tumors',
-    eligibility: 'Inclusion: Adults ≥18 years with advanced (Stage IV) solid tumor of any primary site (breast, lung, GI, GU, GYN). Estimated prognosis 6-24 months. ECOG 0-3. Exclusion: Currently enrolled in hospice.',
-    expected: 'POSSIBLE',
-  },
-  {
-    nctId: 'NCT05123450',
-    title: 'Premenopausal Breast Cancer: Ovarian Function Suppression Trial',
-    eligibility: 'Inclusion: Premenopausal women ages 18-45 with newly diagnosed HR-positive early breast cancer. Confirmed premenopausal by FSH and estradiol levels. Exclusion: Postmenopausal status. Prior ovarian suppression therapy.',
-    expected: 'UNLIKELY',
-  },
-
-  // ─── Realistic-length eligibility (~2-3.5kB each) — stress-tests how the
-  //     model handles formal CT.gov noise and how truncation affects accuracy.
-  //     Try these with eligMax = 800 vs 3000 vs 6000 to see the trade-off.
-  {
-    nctId: 'NCT-LONG-01',
-    title: 'Phase II Study of Sacituzumab Govitecan-hziy in Patients with HR-Positive, HER2-Negative Metastatic Breast Cancer After Endocrine Therapy and CDK4/6 Inhibitor',
-    eligibility: `Inclusion Criteria:
-
-1. Female participants ≥18 years of age at the time of signing informed consent.
-2. Histologically or cytologically confirmed adenocarcinoma of the breast that is metastatic or locally advanced and not amenable to curative resection or radiotherapy.
-3. Documentation of estrogen receptor (ER)-positive (≥1% staining by IHC) and/or progesterone receptor (PR)-positive (≥1% staining by IHC) tumor status, in accordance with ASCO/CAP guidelines.
-4. Documentation of HER2-negative status defined as IHC 0, IHC 1+, or IHC 2+ with negative in situ hybridization (ISH), per ASCO/CAP guidelines.
-5. Disease progression on or after at least one prior CDK4/6 inhibitor (palbociclib, ribociclib, or abemaciclib) administered for advanced or metastatic disease, in combination with an aromatase inhibitor or fulvestrant.
-6. Disease progression on or after at least one and no more than two prior endocrine therapies (e.g., aromatase inhibitor, fulvestrant, tamoxifen) for advanced or metastatic disease.
-7. No more than one prior chemotherapy regimen for metastatic disease.
-8. Postmenopausal status, OR premenopausal/perimenopausal women who agree to receive concurrent ovarian function suppression with a luteinizing hormone-releasing hormone (LHRH) agonist throughout study treatment.
-9. Measurable disease per RECIST v1.1, or non-measurable bone-only disease assessable per protocol-specified criteria.
-10. ECOG performance status 0 or 1.
-11. Adequate organ function:
-    - Absolute neutrophil count (ANC) ≥1.5 × 10^9/L
-    - Platelets ≥100 × 10^9/L
-    - Hemoglobin ≥9.0 g/dL (transfusion permitted)
-    - Total bilirubin ≤1.5 × ULN (≤3 × ULN for participants with documented Gilbert syndrome)
-    - AST and ALT ≤2.5 × ULN (≤5 × ULN if liver metastases present)
-    - Creatinine clearance ≥50 mL/min by Cockcroft-Gault equation
-    - INR and aPTT ≤1.5 × ULN unless on anticoagulants
-12. Resolution of all acute toxic effects of prior anti-cancer therapy or surgical procedures to NCI CTCAE v5.0 Grade ≤1 (except alopecia and Grade 2 neuropathy).
-13. Willingness to provide tumor tissue (archival or fresh biopsy) for biomarker analyses.
-
-Exclusion Criteria:
-
-1. Prior treatment with sacituzumab govitecan or any other Trop-2-directed therapy.
-2. Prior treatment with an antibody-drug conjugate containing a topoisomerase I inhibitor payload (e.g., trastuzumab deruxtecan).
-3. Active CNS metastases. Participants with previously treated, asymptomatic CNS metastases are eligible if clinically stable for ≥4 weeks off corticosteroids and anticonvulsants.
-4. Leptomeningeal disease.
-5. Known active infection requiring systemic therapy, including untreated HIV, active HBV (HBsAg positive or HBV DNA detectable), or active HCV (HCV RNA detectable).
-6. Significant cardiovascular disease, including: NYHA Class III or IV congestive heart failure, myocardial infarction or unstable angina within 6 months, uncontrolled arrhythmia, baseline QTcF >470 ms.
-7. History of another malignancy within 3 years, except for adequately treated non-melanoma skin cancer, in situ cervical or breast cancer, or low-risk localized prostate cancer on active surveillance.
-8. Known hypersensitivity to irinotecan or any component of the study drug formulation.
-9. Pregnant or breastfeeding women. Women of childbearing potential must agree to use highly effective contraception during the study and for 6 months after the last dose.
-10. Concurrent participation in another therapeutic clinical trial.
-11. Major surgery within 4 weeks prior to first dose.
-12. Live vaccines within 30 days prior to first dose.`,
-    expected: 'POSSIBLE',
-  },
-  {
-    nctId: 'NCT-LONG-02',
-    title: 'Randomized Phase III Trial of Adjuvant Endocrine Therapy ± Abemaciclib in Postmenopausal Women with HR-Positive, HER2-Negative, Node-Positive Early Breast Cancer at High Risk of Recurrence',
-    eligibility: `Inclusion Criteria:
-
-1. Female, postmenopausal at the time of randomization. Postmenopausal status defined as: (a) prior bilateral oophorectomy, (b) age ≥60 years, OR (c) age <60 with amenorrhea ≥12 months in the absence of chemotherapy, tamoxifen, or ovarian suppression AND FSH and estradiol in the postmenopausal range.
-2. Age 18 to 75 years inclusive at the time of consent.
-3. ECOG performance status of 0, 1, or 2.
-4. Histologically confirmed invasive breast carcinoma. Multicentric or multifocal disease is allowed if all foci meet eligibility.
-5. Hormone receptor-positive disease, defined as ≥1% of tumor cells staining positive for estrogen receptor and/or progesterone receptor by IHC, per ASCO/CAP guidelines.
-6. HER2-negative disease, defined as IHC 0, 1+, or 2+ with negative reflex ISH testing per ASCO/CAP guidelines.
-7. Stage II or III disease with high-risk pathologic features, defined as ≥1 of the following:
-    - ≥4 positive axillary lymph nodes, OR
-    - 1-3 positive axillary lymph nodes AND tumor size ≥5 cm, OR
-    - 1-3 positive axillary lymph nodes AND histologic grade 3, OR
-    - 1-3 positive axillary lymph nodes AND Ki-67 ≥20%
-8. Definitive surgical treatment of primary tumor with negative margins (lumpectomy with whole-breast irradiation OR mastectomy with or without post-mastectomy radiation per institutional standard).
-9. Completion of any neoadjuvant or adjuvant chemotherapy at least 21 days but no more than 16 months prior to randomization.
-10. Initiation of adjuvant endocrine therapy (aromatase inhibitor, with or without LHRH agonist) is permitted, but participants must not have received endocrine therapy for more than 12 weeks prior to randomization.
-11. Adequate organ function within 14 days of randomization:
-    - ANC ≥1.5 × 10^9/L
-    - Platelets ≥100 × 10^9/L
-    - Hemoglobin ≥10.0 g/dL
-    - Total bilirubin ≤1.5 × ULN
-    - AST/ALT ≤2.5 × ULN
-    - Creatinine clearance ≥50 mL/min
-12. Negative serum or urine pregnancy test for participants of childbearing potential.
-
-Exclusion Criteria:
-
-1. Stage IV (metastatic) breast cancer or evidence of distant metastases on staging imaging.
-2. Inflammatory breast cancer.
-3. Bilateral invasive breast cancer.
-4. Prior treatment with any CDK4/6 inhibitor in any setting.
-5. Prior anti-cancer therapy other than chemotherapy and locoregional therapy for the current breast cancer diagnosis.
-6. History of another malignancy within 5 years prior to randomization, except adequately treated non-melanoma skin cancer, in situ cervical cancer, or contralateral DCIS.
-7. Active or chronic hepatitis B or C infection, or known HIV infection.
-8. Significant uncontrolled cardiovascular disease: NYHA Class III/IV heart failure, myocardial infarction within 6 months, ventricular arrhythmia requiring treatment.
-9. History of interstitial lung disease or pneumonitis requiring corticosteroids.
-10. Major surgery (other than breast cancer surgery) within 28 days of randomization.
-11. Receiving strong CYP3A inhibitors or inducers within 14 days that cannot be discontinued.
-12. Inability to swallow oral medications or significant malabsorption.
-13. Pregnant or breastfeeding (premenopausal participants only — see inclusion criterion 1).`,
-    expected: 'LIKELY',
-  },
-  {
-    nctId: 'NCT-LONG-03',
-    title: 'Phase III Study of Pembrolizumab Plus Chemotherapy versus Chemotherapy Alone for First-Line Treatment of Metastatic Squamous Non-Small Cell Lung Cancer',
-    outOfScope: true,
-    eligibility: `Inclusion Criteria:
-
-1. Histologically or cytologically confirmed Stage IV squamous non-small cell lung cancer (NSCLC) per AJCC 8th edition.
-2. Male or female ≥18 years of age.
-3. No prior systemic therapy for metastatic NSCLC. Prior adjuvant or neoadjuvant chemotherapy is allowed if completed ≥6 months prior to enrollment.
-4. Measurable disease per RECIST v1.1.
-5. Provision of a tumor tissue sample (archival or fresh biopsy) adequate for PD-L1 IHC testing using the 22C3 pharmDx assay.
-6. ECOG performance status 0 or 1.
-7. Life expectancy ≥3 months.
-8. Adequate organ function within 10 days of randomization:
-    - ANC ≥1.5 × 10^9/L without G-CSF support
-    - Platelets ≥100 × 10^9/L without transfusion
-    - Hemoglobin ≥9.0 g/dL
-    - Total bilirubin ≤1.5 × ULN
-    - AST/ALT ≤2.5 × ULN (≤5 × ULN if liver involvement)
-    - Creatinine clearance ≥45 mL/min
-    - INR/aPTT ≤1.5 × ULN
-9. Female participants of childbearing potential and male participants with partners of childbearing potential must agree to use effective contraception throughout treatment and for 120 days after last dose.
-
-Exclusion Criteria:
-
-1. Histology of mixed small cell and non-small cell lung cancer, or predominantly non-squamous histology.
-2. Known sensitizing EGFR mutation, ALK rearrangement, ROS1 rearrangement, BRAF V600E mutation, or other actionable alteration for which an approved targeted therapy is the standard of care.
-3. Prior treatment with any PD-1, PD-L1, PD-L2, or CTLA-4 inhibitor.
-4. Active autoimmune disease requiring systemic immunosuppression within 2 years. Replacement therapy (e.g., thyroxine, insulin, physiologic corticosteroids) is permitted.
-5. History of pneumonitis requiring corticosteroids, or active pneumonitis.
-6. Active CNS metastases or carcinomatous meningitis. Participants with previously treated, asymptomatic CNS metastases stable for ≥4 weeks may be eligible.
-7. Active infection requiring systemic therapy.
-8. Known active HIV, HBV, or HCV infection.
-9. Live vaccine within 30 days of first dose.
-10. History of solid organ or allogeneic stem cell transplant.
-11. Pregnant or breastfeeding women.
-12. History of another malignancy within 3 years, except for adequately treated non-melanoma skin cancer or in situ disease.`,
-    expected: 'UNLIKELY',
-  },
-  {
-    nctId: 'NCT-LONG-04',
-    title: 'Multicenter Randomized Trial of Empagliflozin in Patients with Heart Failure with Preserved Ejection Fraction and Type 2 Diabetes',
-    outOfScope: true,
-    eligibility: `Inclusion Criteria:
-
-1. Adults aged 40 to 85 years at consent.
-2. Documented diagnosis of heart failure with preserved ejection fraction (HFpEF):
-    - Left ventricular ejection fraction (LVEF) ≥50% on echocardiogram within the past 12 months
-    - NYHA functional class II, III, or IV
-    - Elevated NT-proBNP ≥300 pg/mL (or ≥600 pg/mL if atrial fibrillation present)
-    - Structural heart disease on echocardiography (LV hypertrophy or left atrial enlargement) OR documented prior HF hospitalization
-3. Documented Type 2 diabetes mellitus (T2DM) per ADA criteria, with HbA1c 6.5% to 10.0% at screening.
-4. Stable background heart failure therapy for ≥4 weeks (diuretic if indicated; ACEi/ARB/ARNI per guideline; beta-blocker per guideline).
-5. eGFR ≥25 mL/min/1.73m^2 by CKD-EPI equation.
-6. Body mass index 20 to 45 kg/m^2.
-7. Able and willing to provide written informed consent and adhere to study procedures.
-
-Exclusion Criteria:
-
-1. Type 1 diabetes mellitus.
-2. History of diabetic ketoacidosis within 12 months.
-3. LVEF <50% on most recent echocardiogram.
-4. Acute decompensated heart failure requiring IV diuretics within 4 weeks of screening.
-5. Acute coronary syndrome, stroke, or transient ischemic attack within 90 days.
-6. Planned cardiac surgery, percutaneous coronary intervention, or device implantation within 90 days.
-7. Symptomatic hypotension or systolic blood pressure <100 mmHg at screening.
-8. Significant valvular heart disease (severe aortic stenosis, severe mitral regurgitation requiring surgery).
-9. Hypertrophic cardiomyopathy, infiltrative cardiomyopathy, or constrictive pericarditis.
-10. eGFR <25 mL/min/1.73m^2 or end-stage renal disease requiring dialysis.
-11. Known active malignancy requiring treatment within the past 12 months. Participants with a history of cancer who are disease-free for >12 months are eligible.
-12. Severe hepatic impairment (Child-Pugh C).
-13. Pregnancy or breastfeeding.
-14. Known hypersensitivity to SGLT2 inhibitors.
-15. Participation in another interventional clinical trial within 30 days.
-16. Life expectancy <12 months due to non-cardiovascular cause.`,
-    expected: 'UNLIKELY',
-  },
-]
-
-const DEFAULT_PROMPT = `You decide whether a clinical trial is worth showing to a patient. Output one of two labels:
-
-- LIKELY: the trial studies the patient's condition AND nothing in the eligibility clearly excludes the patient based on what they stated. Worth showing.
-- UNLIKELY: the trial studies a different disease, OR the patient is clearly the wrong sex / age / population. Not worth showing.
-
-Be inclusive on LIKELY: if the trial requires a subtype, biomarker, stage, or prior treatment the patient did NOT mention, still call it LIKELY — the patient or their doctor can verify. Only use UNLIKELY when the patient is clearly disqualified by something they DID state.
-
-Examples (note: each example uses a DIFFERENT patient — focus on the reasoning, not the patient details):
-
-Patient: "45-year-old woman with ovarian cancer"
-Trial: PARP Inhibitor in BRCA-Mutated Ovarian Cancer (Eligibility: women with ovarian cancer and BRCA mutation)
-Answer: LIKELY | matches ovarian cancer in a woman; BRCA status can be verified
-
-Patient: "70-year-old man with type 2 diabetes"
-Trial: Tamoxifen in Premenopausal Breast Cancer (Eligibility: premenopausal women with breast cancer)
-Answer: UNLIKELY | trial is for breast cancer in women; patient has diabetes
-
-Patient: "8-year-old child with asthma"
-Trial: Adult Anti-Inflammatory for Asthma (Eligibility: adults 18+ with persistent asthma)
-Answer: UNLIKELY | trial is for adults; patient is a child
-
-Patient: "55-year-old man with hypertension"
-Trial: Yoga Intervention for Adults with Chronic Conditions (Eligibility: adults 40-75 with any chronic condition)
-Answer: LIKELY | adult with chronic condition matches the broad inclusion
-
-Now classify:
-
-Patient: {{user}}
-Trial: {{title}}
-Eligibility: {{eligibility}}
-
-Answer (one line, format exactly "<LABEL> | <one short reason>"):`
-
-const DEFAULT_USER_DESC = "I'm 58 years old with breast cancer in Boston"
-
-// Patient description presets for multilingual + edge-case validation. Same
-// 58yo woman with breast cancer in Boston, expressed in different languages
-// and registers (formal, terse, etc.) so we can stress-test the model's
-// understanding without changing the underlying clinical signal.
-const USER_PRESETS = [
-  { id: 'en',     label: 'English',                  text: "I'm 58 years old with breast cancer in Boston" },
-  { id: 'en-2',   label: 'English (more detail)',    text: "58-year-old woman in Boston, postmenopausal, recently diagnosed with breast cancer, looking for post-chemo treatment options" },
-  { id: 'es',     label: 'Spanish (Español)',        text: 'Tengo 58 años, vivo en Boston y tengo cáncer de mama' },
-  { id: 'es-2',   label: 'Spanish (more detail)',    text: 'Soy mujer de 58 años, posmenopáusica, vivo en Boston. Me diagnosticaron cáncer de mama y busco opciones de tratamiento después de quimioterapia.' },
-  { id: 'zh',     label: 'Mandarin (中文)',          text: '我58岁，住在波士顿，患有乳腺癌' },
-  { id: 'ar',     label: 'Arabic (العربية)',        text: 'أنا امرأة عمري 58 عامًا أعيش في بوسطن ومصابة بسرطان الثدي' },
-  { id: 'pt',     label: 'Portuguese (Português)',   text: 'Tenho 58 anos, moro em Boston e tenho câncer de mama' },
-  { id: 'fr',     label: 'French (Français)',        text: "J'ai 58 ans, je vis à Boston et j'ai un cancer du sein" },
-  { id: 'terse',  label: 'Terse / fragments',        text: '58F, BC, Boston' },
-]
-
-// Parser still accepts POSSIBLE in case the model emits it (older prompts,
-// instruction drift) — POSSIBLE is normalized to LIKELY since the binary
-// product question is "show or hide".
-function parseVerdict(raw) {
-  if (!raw || typeof raw !== 'string') return { verdict: 'PARSE_FAIL', reason: '(empty output)' }
-  const m = raw.match(/^\s*(LIKELY|POSSIBLE|UNLIKELY)\s*[|:\-—]\s*(.+?)\s*$/im)
-  if (m) {
-    const v = m[1].toUpperCase()
-    return { verdict: v === 'POSSIBLE' ? 'LIKELY' : v, reason: m[2].trim() }
-  }
-  const w = raw.match(/\b(LIKELY|POSSIBLE|UNLIKELY)\b/i)
-  if (w) {
-    const v = w[1].toUpperCase()
-    return {
-      verdict: v === 'POSSIBLE' ? 'LIKELY' : v,
-      reason: raw.replace(w[0], '').replace(/^[\s|:\-—]+/, '').trim() || '(no reason)',
-    }
-  }
-  return { verdict: 'PARSE_FAIL', reason: raw.slice(0, 120) }
-}
+import { DEFAULT_CLASSIFY_PROMPT, parseVerdict } from '../utils/classifyTrial'
+import { SAMPLE_TRIALS, USER_PRESETS } from './ClassificationHarness.fixtures'
 
 // Normalize fixture-side expected values for binary agreement: POSSIBLE
 // counts as LIKELY (both = "show this trial"). Keeps the fixture data
@@ -404,10 +28,10 @@ export default function ClassificationHarness() {
   )
   const model = NLP_MODELS[modelKey]
   const { status, progress, error, load, webGPUSupported } = useNLP()
-  const { classifyOne } = useClassifier()
+  const { classifyOne, translateOne } = useClassifier()
 
-  const [userDesc, setUserDesc] = useState(DEFAULT_USER_DESC)
-  const [promptTemplate, setPromptTemplate] = useState(DEFAULT_PROMPT)
+  const [userDesc, setUserDesc] = useState(USER_PRESETS[0].text)
+  const [promptTemplate, setPromptTemplate] = useState(DEFAULT_CLASSIFY_PROMPT)
   const [trialsJson, setTrialsJson] = useState(JSON.stringify(SAMPLE_TRIALS, null, 2))
   const [concurrency, setConcurrency] = useState(3)
   const [eligMax, setEligMax] = useState(1500)
@@ -458,7 +82,7 @@ Patient description: ${userDesc}
 
 English translation:`
       try {
-        const { raw } = await classifyOne(translatePrompt)
+        const { raw } = await translateOne(translatePrompt)
         effectiveUserDesc = (raw || '').trim().replace(/^["']|["']$/g, '')
         setTranslatedDesc(effectiveUserDesc)
       } catch (e) {
diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index ad99d6c..c12a4d9 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -4,6 +4,7 @@ import { useClinicalTrials } from '../hooks/useClinicalTrials'
 import { useSimplifier } from '../hooks/useSimplifier'
 import { useNLP } from '../hooks/useNLP'
 import { useClassifier } from '../hooks/useClassifier'
+import { useIsMobile } from '../hooks/useIsMobile'
 import { NLP_MODELS } from '../utils/nlpModels'
 import { buildClassifyPrompt, parseVerdict } from '../utils/classifyTrial'
 import ResultCard from './ResultCard'
@@ -17,6 +18,14 @@ import {
 
 const NLP_CONSENT_KEY = 'iris_nlp_enabled'
 
+// Stage-1 classification is wired end-to-end (worker, hook, harness) but
+// not yet surfaced in the in-app results UI. Reason: without sort wiring
+// the fit dots don't drive any user-visible behavior — they're just
+// decoration. The harness at ?test=classify still uses the full pipeline
+// for prompt iteration and validation. Flip this to true once "Best fit"
+// sort is wired so the dots become actionable.
+const ENABLE_CLASSIFY_IN_RESULTS = false
+
 // Build a synthetic patient description from extracted fields when the user
 // came in via structured form but had previously used NL (so consent exists).
 function patientDescFromFields(fields) {
@@ -30,27 +39,8 @@ function patientDescFromFields(fields) {
 }
 
 const EAGER_BATCH_SIZE = 5
-const MOBILE_BREAKPOINT_PX = 820
 const LIST_WIDTH_PX = 400
 
-// matchMedia (not 'resize'): iOS Safari fires 'resize' inconsistently on
-// rotation; matchMedia.change is the reliable signal. Also catches iPad
-// split-screen and browser-window mode switches without a manual resize.
-function useIsMobile() {
-  const query = `(max-width: ${MOBILE_BREAKPOINT_PX}px)`
-  const [isMobile, setIsMobile] = useState(() =>
-    typeof window !== 'undefined' && window.matchMedia(query).matches
-  )
-  useEffect(() => {
-    const mq = window.matchMedia(query)
-    const onChange = (e) => setIsMobile(e.matches)
-    mq.addEventListener('change', onChange)
-    return () => mq.removeEventListener('change', onChange)
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [])
-  return isMobile
-}
-
 export default function ResultsList({ searchParams, modelKey, userDescription, extractedFields }) {
   // Phase 3 simplification only ships for English and Spanish — those are
   // the languages we've verified the local model produces accurately.
@@ -105,8 +95,11 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
   const consented = useMemo(() => {
     try { return localStorage.getItem(NLP_CONSENT_KEY) === 'true' } catch { return false }
   }, [])
-  const patientDesc = userDescription || patientDescFromFields(extractedFields)
-  const canClassify = consented && nlp.webGPUSupported && Boolean(patientDesc)
+  const patientDesc = useMemo(
+    () => userDescription || patientDescFromFields(extractedFields),
+    [userDescription, extractedFields]
+  )
+  const canClassify = ENABLE_CLASSIFY_IN_RESULTS && consented && nlp.webGPUSupported && Boolean(patientDesc)
 
   // Idempotent: worker fast-returns 'ready' if engine already loaded
   // (e.g. NL extraction loaded it earlier this session). Destructure
diff --git a/src/components/TriageRow.jsx b/src/components/TriageRow.jsx
index 2246c11..e7d183d 100644
--- a/src/components/TriageRow.jsx
+++ b/src/components/TriageRow.jsx
@@ -15,18 +15,23 @@ function FitDot({ classification, pending }) {
   if (!classification) return null
 
   const isLikely = classification.verdict === 'LIKELY'
+  // Fold the model's reason into aria-label so SR/keyboard users get the
+  // same context as a sighted hover. title alone wasn't reaching either
+  // group reliably (title isn't announced by most screen readers, isn't
+  // keyboard-discoverable). Same string in both attrs means verdict +
+  // reason are the unit a user perceives, not just the verdict.
+  const label = isLikely
+    ? `Likely fit — ${classification.reason || 'matches your description'}`
+    : `Less likely fit — ${classification.reason || 'may not match'}`
   return (
     <span
+      role="img"
       className={[
         'inline-block w-2 h-2 rounded-full mr-1 shrink-0',
         isLikely ? 'bg-iris-500' : 'border border-parchment-400',
       ].join(' ')}
-      title={
-        isLikely
-          ? `Likely fit — ${classification.reason || 'matches your description'}`
-          : `Less likely fit — ${classification.reason || 'may not match'}`
-      }
-      aria-label={isLikely ? 'Likely fit' : 'Less likely fit'}
+      title={label}
+      aria-label={label}
     />
   )
 }
diff --git a/src/hooks/useClassifier.js b/src/hooks/useClassifier.js
index 2004aac..e899970 100644
--- a/src/hooks/useClassifier.js
+++ b/src/hooks/useClassifier.js
@@ -1,18 +1,19 @@
 import { useRef, useEffect, useCallback } from 'react'
 import { getSharedWorker, attachListener } from '../workers/sharedNlpWorker'
 
-// Stage-1 classifier hook. Posts a 'classify' task to the shared NLP worker
-// and resolves with { raw, latencyMs }. The caller parses the verdict.
+// Two task hooks (classifyOne, translateOne) share a single promise chain
+// because WebLLM's MLCEngine is NOT parallel-safe. Concurrent
+// engine.chat.completions.create() calls clobber state and produce
+// "Message error should not be 0" failures. Callers can fire-and-forget
+// concurrently; each request waits its turn behind the chain.
 //
-// IMPORTANT: WebLLM's MLCEngine is NOT parallel-safe. Concurrent
-// engine.chat.completions.create() calls clobber each other's state and
-// produce "Message error should not be 0" failures. We serialize all
-// classify requests through a single promise chain at the hook level —
-// callers can fire-and-forget concurrently, but each request waits its
-// turn. Caller-side concurrency knobs become a no-op for actual
-// parallelism, but still control queue capacity.
+// The two task types are functionally similar (one-shot completion with
+// raw + latencyMs return) but conceptually distinct, so they get distinct
+// worker message types ('classify' vs 'translate') for clarity and so the
+// worker can use different max_tokens budgets.
 //
-// The worker must already have the model loaded.
+// The worker must already have the model loaded; otherwise classifyOne
+// and translateOne reject with 'Engine not loaded'.
 export function useClassifier() {
   const pendingRef = useRef(new Map())
   const detachRef = useRef(null)
@@ -26,12 +27,14 @@ export function useClassifier() {
 
   function handleMessage(event) {
     const { type, taskId, raw, latencyMs, message } = event.data ?? {}
-    if (type !== 'classify_done' && type !== 'classify_error') return
+    const isDone = type === 'classify_done' || type === 'translate_done'
+    const isError = type === 'classify_error' || type === 'translate_error'
+    if (!isDone && !isError) return
     const pending = pendingRef.current.get(taskId)
     if (!pending) return
     pendingRef.current.delete(taskId)
-    if (type === 'classify_done') pending.resolve({ raw, latencyMs })
-    else pending.reject(new Error(message ?? 'classify failed'))
+    if (isDone) pending.resolve({ raw, latencyMs })
+    else pending.reject(new Error(message ?? 'task failed'))
   }
 
   useEffect(() => {
@@ -39,11 +42,9 @@ export function useClassifier() {
     return () => {
       detachRef.current?.()
       detachRef.current = null
-      // Reject every in-flight classify so awaiting callers don't hang
+      // Reject every in-flight task so awaiting callers don't hang
       // forever when the component unmounts mid-batch (or during a
-      // StrictMode dev double-invoke). Without this, the listener
-      // detaches but the pendingRef Map still holds resolve/reject
-      // handles whose promise will never settle.
+      // StrictMode dev double-invoke).
       for (const { reject } of pending.values()) {
         reject(new Error('classifier unmounted'))
       }
@@ -51,20 +52,26 @@ export function useClassifier() {
     }
   }, [])
 
-  const classifyOne = useCallback((prompt) => {
+  // Generic task runner — same chain semantics, different worker message
+  // type. handleMessage routes done/error replies by the full taskId (the
+  // pending Map key), and the shared counter keeps ids unique across task
+  // types, so taskIdPrefix is not needed for routing; it only labels the
+  // id with its task type for readability.
+  function runTask(workerType, taskIdPrefix, prompt) {
     ensureSubscribed()
-    const taskId = `classify-${++taskIdRef.current}`
-    // Chain onto the previous request so only one inference runs at a time.
-    // .catch in the chain prevents one failure from breaking the whole queue.
+    const taskId = `${taskIdPrefix}-${++taskIdRef.current}`
     const next = chainRef.current.catch(() => {}).then(() =>
       new Promise((resolve, reject) => {
         pendingRef.current.set(taskId, { resolve, reject })
-        getSharedWorker().postMessage({ type: 'classify', taskId, prompt })
+        getSharedWorker().postMessage({ type: workerType, taskId, prompt })
       })
     )
     chainRef.current = next
     return next
-  }, [])
+  }
+
+  const classifyOne = useCallback((prompt) => runTask('classify', 'classify', prompt), [])
+  const translateOne = useCallback((prompt) => runTask('translate', 'translate', prompt), [])
 
-  return { classifyOne }
+  return { classifyOne, translateOne }
 }
diff --git a/src/hooks/useClassifier.test.js b/src/hooks/useClassifier.test.js
new file mode 100644
index 0000000..61f0c85
--- /dev/null
+++ b/src/hooks/useClassifier.test.js
@@ -0,0 +1,94 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest'
+import { renderHook, act, waitFor } from '@testing-library/react'
+import { useClassifier } from './useClassifier'
+
+// Mock the shared worker so tests don't touch the real WebLLM worker.
+// We intercept postMessage to capture call order, and we expose a way for
+// the test to invoke the listener with synthetic 'classify_done' messages.
+let capturedListener = null
+let postedMessages = []
+
+vi.mock('../workers/sharedNlpWorker', () => ({
+  getSharedWorker: () => ({
+    postMessage: (msg) => { postedMessages.push(msg) },
+  }),
+  attachListener: (fn) => {
+    capturedListener = fn
+    return () => { capturedListener = null }
+  },
+}))
+
+beforeEach(() => {
+  capturedListener = null
+  postedMessages = []
+})
+
+// Helper: construct a 'classify_done' worker message and pass it to whatever
+// useClassifier registered as its listener.
+function dispatchDone(taskId, raw = 'LIKELY | mock', latencyMs = 100) {
+  capturedListener({ data: { type: 'classify_done', taskId, raw, latencyMs } })
+}
+
+describe('useClassifier — promise chain serialization', () => {
+  it('posts only the first request to the worker until it settles', async () => {
+    const { result } = renderHook(() => useClassifier())
+
+    // Fire 3 concurrent classifyOne calls.
+    let p1, p2, p3
+    p1 = result.current.classifyOne('prompt-1')
+    p2 = result.current.classifyOne('prompt-2')
+    p3 = result.current.classifyOne('prompt-3')
+
+    // Only the first task should be in flight.
+    await waitFor(() => expect(postedMessages.length).toBe(1))
+    expect(postedMessages[0].prompt).toBe('prompt-1')
+
+    // Settle task 1; task 2 should now post.
+    dispatchDone(postedMessages[0].taskId, 'LIKELY | one')
+    await p1
+    await waitFor(() => expect(postedMessages.length).toBe(2))
+    expect(postedMessages[1].prompt).toBe('prompt-2')
+
+    // Settle task 2; task 3 posts.
+    dispatchDone(postedMessages[1].taskId, 'UNLIKELY | two')
+    await p2
+    await waitFor(() => expect(postedMessages.length).toBe(3))
+    expect(postedMessages[2].prompt).toBe('prompt-3')
+
+    // Settle task 3.
+    dispatchDone(postedMessages[2].taskId, 'LIKELY | three')
+    const r3 = await p3
+    expect(r3.raw).toBe('LIKELY | three')
+  })
+
+  it('does not poison the queue when one task rejects', async () => {
+    const { result } = renderHook(() => useClassifier())
+
+    const p1 = result.current.classifyOne('prompt-A')
+    const p2 = result.current.classifyOne('prompt-B')
+
+    await waitFor(() => expect(postedMessages.length).toBe(1))
+
+    // Reject task 1 via classify_error.
+    capturedListener({ data: { type: 'classify_error', taskId: postedMessages[0].taskId, message: 'boom' } })
+    await expect(p1).rejects.toThrow('boom')
+
+    // Task 2 should still post and resolve.
+    await waitFor(() => expect(postedMessages.length).toBe(2))
+    dispatchDone(postedMessages[1].taskId, 'LIKELY | recovered')
+    const r2 = await p2
+    expect(r2.raw).toBe('LIKELY | recovered')
+  })
+
+  it('rejects pending tasks when the hook unmounts', async () => {
+    const { result, unmount } = renderHook(() => useClassifier())
+
+    const p1 = result.current.classifyOne('prompt-pending')
+    await waitFor(() => expect(postedMessages.length).toBe(1))
+
+    // Mid-flight: unmount.
+    act(() => unmount())
+
+    await expect(p1).rejects.toThrow(/unmounted/)
+  })
+})
diff --git a/src/hooks/useIsMobile.js b/src/hooks/useIsMobile.js
new file mode 100644
index 0000000..a5fb4ee
--- /dev/null
+++ b/src/hooks/useIsMobile.js
@@ -0,0 +1,21 @@
+import { useEffect, useState } from 'react'
+
+export const MOBILE_BREAKPOINT_PX = 820
+
+// matchMedia (not 'resize'): iOS Safari fires 'resize' inconsistently on
+// rotation; MediaQueryList's 'change' event is the reliable signal. Also catches
+// iPad split-screen and browser-window mode switches without a manual resize.
+export function useIsMobile() {
+  const query = `(max-width: ${MOBILE_BREAKPOINT_PX}px)`
+  const [isMobile, setIsMobile] = useState(() =>
+    typeof window !== 'undefined' && window.matchMedia(query).matches
+  )
+  useEffect(() => {
+    const mq = window.matchMedia(query)
+    const onChange = (e) => setIsMobile(e.matches)
+    mq.addEventListener('change', onChange)
+    return () => mq.removeEventListener('change', onChange)
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [])
+  return isMobile
+}
diff --git a/src/workers/nlp.worker.js b/src/workers/nlp.worker.js
index fc93e77..d541141 100644
--- a/src/workers/nlp.worker.js
+++ b/src/workers/nlp.worker.js
@@ -130,6 +130,35 @@ self.onmessage = async (event) => {
     return
   }
 
+  if (type === 'translate') {
+    if (!engine) {
+      self.postMessage({ type: 'translate_error', taskId, message: 'Engine not loaded' })
+      return
+    }
+    try {
+      const t0 = Date.now()
+      if (typeof engine.resetChat === 'function') {
+        try { await engine.resetChat() } catch { /* best effort */ }
+      }
+      // Translation typically needs more headroom than classification (one
+      // verdict word + reason fits in 80; a paraphrased clinical sentence
+      // can run 100-200 tokens for verbose languages). Same low temperature
+      // since we want fidelity, not creativity.
+      const request = {
+        messages: [{ role: 'user', content: prompt }],
+        max_tokens: 200,
+        temperature: 0.1,
+      }
+      if (isThinkingModel) request.extra_body = { enable_thinking: false }
+      const reply = await engine.chat.completions.create(request)
+      const raw = reply.choices?.[0]?.message?.content ?? ''
+      self.postMessage({ type: 'translate_done', taskId, raw, latencyMs: Date.now() - t0 })
+    } catch (err) {
+      self.postMessage({ type: 'translate_error', taskId, message: err?.message ?? String(err) })
+    }
+    return
+  }
+
   if (type === 'classify') {
     if (!engine) {
       self.postMessage({ type: 'classify_error', taskId, message: 'Engine not loaded' })

From f99743b11fb128dee3dab7ab66354b7bfe978adf Mon Sep 17 00:00:00 2001
From: John Orgera <65687576+johnoooh@users.noreply.github.com>
Date: Thu, 7 May 2026 01:24:22 -0400
Subject: [PATCH 31/31] chore(phase-3): clear lint diagnostics introduced by
 this PR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #3 added 4 new lint hits on top of main's pre-existing baseline.
All trivial — fixing them keeps the CI lint output clean for future
reviewers (the workflow runs lint as continue-on-error so they don't
block, but fewer ignorable lines is fewer ignorable lines).

- ClassificationHarness.jsx:36 — setConcurrency unused since the
  concurrency dropdown was removed (serialization happens in the
  hook now). Drop the setter, keep the value as a const.
- ResultCard.jsx:126 — showFit unused since the "Why this might or
  might not fit you" section was dropped. Drop the var; comment
  notes the path back if a fine-tuned model lets us re-introduce.
- ResultsList.jsx:75 — wrap allTrials in useMemo. react-query keeps
  data ref stable across non-data renders so the memo identity is
  stable too; without the memo, every render produced a new array
  and effect dep arrays comparing against allTrials would have
  thrashed (the actual classify trigger effect depends on a derived
  trialKeyAll string so this was cosmetic, but cleaner this way).
- useClassifier.js:73-74 — exhaustive-deps disable on the
  classifyOne/translateOne useCallbacks. runTask only closes over
  refs (stable); the linter can't see through that.

Lint count: 29 → 24 (14 errors, 10 warnings — one fewer than main's
baseline). All remaining are pre-existing.
---
 src/components/ClassificationHarness.jsx | 6 +++++-
 src/components/ResultCard.jsx            | 7 ++++---
 src/components/ResultsList.jsx           | 8 +++++++-
 src/hooks/useClassifier.js               | 6 ++++++
 4 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/components/ClassificationHarness.jsx b/src/components/ClassificationHarness.jsx
index 0a5bdd5..0c2cb67 100644
--- a/src/components/ClassificationHarness.jsx
+++ b/src/components/ClassificationHarness.jsx
@@ -33,7 +33,11 @@ export default function ClassificationHarness() {
   const [userDesc, setUserDesc] = useState(USER_PRESETS[0].text)
   const [promptTemplate, setPromptTemplate] = useState(DEFAULT_CLASSIFY_PROMPT)
   const [trialsJson, setTrialsJson] = useState(JSON.stringify(SAMPLE_TRIALS, null, 2))
-  const [concurrency, setConcurrency] = useState(3)
+  // Concurrency was a UI dropdown until we serialized at the hook level
+  // (WebLLM engine is single-threaded). Kept as a constant so the worker
+  // loop still controls fan-out at the harness level — the real
+  // serialization happens in useClassifier's promise chain.
+  const concurrency = 3
   const [eligMax, setEligMax] = useState(1500)
   const [translateFirst, setTranslateFirst] = useState(false)
   const [translatedDesc, setTranslatedDesc] = useState(null)
diff --git a/src/components/ResultCard.jsx b/src/components/ResultCard.jsx
index 49fd93c..e9cc060 100644
--- a/src/components/ResultCard.jsx
+++ b/src/components/ResultCard.jsx
@@ -119,11 +119,12 @@ export default function ResultCard({
     : 'bg-white border border-parchment-400 rounded-lg p-5 mb-3 max-w-3xl'
 
   const sumState = simplification?.summarize
-  const fitState = simplification?.fit
-
+  // fitState/showFit removed when the "Why this might or might not fit you"
+  // section was dropped — Gemma 2B's accuracy on the fit narrative wasn't
+  // reliable enough to ship. Re-introduce both if the fit section comes
+  // back behind a fine-tuned model.
   const showPlainLanguage = sumState && sumState.status !== 'error'
   const showFallbackHint = sumState?.status === 'error'
-  const showFit = fitState && fitState.status !== 'error' && fitState.text
 
   return (
     <article className={wrapperClass}>
diff --git a/src/components/ResultsList.jsx b/src/components/ResultsList.jsx
index c12a4d9..12813dd 100644
--- a/src/components/ResultsList.jsx
+++ b/src/components/ResultsList.jsx
@@ -72,7 +72,13 @@ export default function ResultsList({ searchParams, modelKey, userDescription, e
     extractedFields,
   })
 
-  const allTrials = data?.pages.flatMap(p => p.trials) ?? []
+  // Memoized so effect dep arrays comparing against allTrials don't churn
+  // every render — react-query returns the same `data` ref while data is
+  // unchanged, so memo identity is stable across non-data renders.
+  const allTrials = useMemo(
+    () => data?.pages.flatMap(p => p.trials) ?? [],
+    [data]
+  )
 
   const isMobile = useIsMobile()
   const [selectedNctId, setSelectedNctId] = useState(null)
diff --git a/src/hooks/useClassifier.js b/src/hooks/useClassifier.js
index e899970..69042dd 100644
--- a/src/hooks/useClassifier.js
+++ b/src/hooks/useClassifier.js
@@ -70,7 +70,13 @@ export function useClassifier() {
     return next
   }
 
+  // runTask only closes over refs (pendingRef, chainRef, taskIdRef, detachRef)
+  // which are stable across renders, so it's safe to omit from useCallback
+  // deps. The exhaustive-deps lint can't see through this because runTask
+  // is defined in the function body each render.
+  // eslint-disable-next-line react-hooks/exhaustive-deps
   const classifyOne = useCallback((prompt) => runTask('classify', 'classify', prompt), [])
+  // eslint-disable-next-line react-hooks/exhaustive-deps
   const translateOne = useCallback((prompt) => runTask('translate', 'translate', prompt), [])
 
   return { classifyOne, translateOne }