diff --git a/ai-research-tool-evaluation-harness/README.md b/ai-research-tool-evaluation-harness/README.md new file mode 100644 index 0000000..8adb2e0 --- /dev/null +++ b/ai-research-tool-evaluation-harness/README.md @@ -0,0 +1,25 @@ +# AI Research Tool Evaluation Harness + +This module adds a focused quality gate for the AI-Assisted Research Tools MVP. It does not call a live model. Instead, it evaluates proposed summarizer, peer-review, and citation-tool outputs against deterministic manuscript evidence so reviewers can see whether AI outputs are grounded, compliant, and insertion-ready. + +## What it checks + +- Summary mode contracts for abstract, executive, and layperson outputs +- Evidence-span coverage for findings, implications, next steps, and recommendations +- Peer-review diagnostics for statistical, ethics, data availability, clarity, and similarity risks +- Citation recommendations for claim coverage, confidence, DOI metadata, and APA/Nature formatting +- Actionable quality gates with blocker/warning severities, readiness score, and audit digest + +## Run locally + +```bash +npm run check +npm test +npm run demo +``` + +The sample bundle intentionally triggers two blockers (one required claim with no citation recommendation and one missing data availability statement) and two warnings (layperson-summary jargon and a missing executive summary). Its peer-review output already reports the p-value interpretation risk, so the harness accepts that diagnostic rather than raising its own finding. + +## Demo video + +The reviewer demo is at `docs/research-tool-evaluation-demo.mp4`. It shows a terminal-style walkthrough of `npm run check`, `npm test`, and `npm run demo` using the current module output. 
diff --git a/ai-research-tool-evaluation-harness/demo.js b/ai-research-tool-evaluation-harness/demo.js new file mode 100644 index 0000000..d6ecf44 --- /dev/null +++ b/ai-research-tool-evaluation-harness/demo.js @@ -0,0 +1,23 @@ +"use strict"; + +const sampleBundle = require("./sample-data.json"); +const { evaluateResearchToolRun } = require("./src/research-tool-evaluation-harness"); + +const report = evaluateResearchToolRun(sampleBundle); + +console.log(JSON.stringify({ + manuscriptId: report.manuscriptId, + decision: report.decision, + score: report.score, + summaryDecisions: report.summaryReports.map((summary) => ({ + mode: summary.mode, + decision: summary.decision, + findingIds: summary.findings.map((finding) => finding.id), + })), + peerReviewDecision: report.peerReviewReport.decision, + citationDecision: report.citationReport.decision, + blockerActions: report.actionQueue + .filter((action) => action.severity === "blocker") + .map((action) => action.findingId), + auditDigest: report.auditDigest, +}, null, 2)); diff --git a/ai-research-tool-evaluation-harness/docs/requirement-map.md b/ai-research-tool-evaluation-harness/docs/requirement-map.md new file mode 100644 index 0000000..f56fe6f --- /dev/null +++ b/ai-research-tool-evaluation-harness/docs/requirement-map.md @@ -0,0 +1,14 @@ +# Requirement Map + +Issue: SCIBASE-AI/SCIBASE.AI#13 - AI-Assisted Research Tools (MVP Level) + +| Requirement | Implementation | +| --- | --- | +| AI paper summarizer | `evaluateSummaryOutputs` checks mode-specific summaries, key findings, implications, next steps, and grounding to manuscript evidence. | +| Abstract, executive, layperson modes | `evaluateSummaryMode` applies deterministic constraints for each mode, including layperson jargon checks. | +| Domain-aware output | The sample manuscript carries domain metadata, and the evaluator requires evidence-backed findings rather than generic prose. 
| +| AI peer review aid | `evaluatePeerReviewAid` checks statistical diagnostics, compliance diagnostics, similarity risk, clarity findings, and template coverage. | +| Statistical and compliance checks | The harness flags undetected p-value interpretation risks, missing ethics statements, and missing data availability statements; missing confidence intervals are covered only by remediation guidance, not by a dedicated check. | +| AI citation tool | `evaluateCitationRecommendations` maps required claims to citation candidates and verifies evidence IDs, confidence, DOI metadata, and formatting. | +| Multiple reference styles | APA and Nature formatting are validated with style-specific checks. | +| Immediate reviewer value | `evaluateResearchToolRun` returns a readiness decision, action queue, score, and SHA-256 audit digest suitable for PR/demo review. | diff --git a/ai-research-tool-evaluation-harness/docs/research-tool-evaluation-demo.mp4 b/ai-research-tool-evaluation-harness/docs/research-tool-evaluation-demo.mp4 new file mode 100644 index 0000000..fcedbfb Binary files /dev/null and b/ai-research-tool-evaluation-harness/docs/research-tool-evaluation-demo.mp4 differ diff --git a/ai-research-tool-evaluation-harness/package.json b/ai-research-tool-evaluation-harness/package.json new file mode 100644 index 0000000..8f9845c --- /dev/null +++ b/ai-research-tool-evaluation-harness/package.json @@ -0,0 +1,13 @@ +{ + "name": "ai-research-tool-evaluation-harness", + "version": "1.0.0", + "private": true, + "description": "Deterministic evaluation harness for AI-assisted research tool outputs.", + "main": "src/research-tool-evaluation-harness.js", + "scripts": { + "check": "node --check src/research-tool-evaluation-harness.js && node --check test.js && node --check demo.js", + "test": "node test.js", + "demo": "node demo.js" + }, + "license": "MIT" +} diff --git a/ai-research-tool-evaluation-harness/sample-data.json b/ai-research-tool-evaluation-harness/sample-data.json new file mode 100644 index 0000000..e4c0a4b --- /dev/null +++ 
b/ai-research-tool-evaluation-harness/sample-data.json @@ -0,0 +1,128 @@ +{ + "now": "2026-05-15T17:30:00.000Z", + "manuscript": { + "id": "ms-neuro-metabolomics-042", + "title": "Metabolomic markers of sleep disruption in shift workers", + "domain": "biomedicine", + "requiresEthicsStatement": true, + "requiresDataAvailability": true + }, + "evidenceCorpus": [ + { + "id": "ev-objective", + "sourceType": "manuscript", + "title": "Abstract objective", + "excerpt": "We evaluated whether plasma metabolomic profiles differed between night-shift and day-shift hospital staff.", + "authors": ["Nguyen Lab"], + "year": 2026 + }, + { + "id": "ev-method", + "sourceType": "manuscript", + "title": "Methods", + "excerpt": "The cohort included 184 participants with repeated fasting plasma samples and actigraphy-derived sleep measures.", + "authors": ["Nguyen Lab"], + "year": 2026 + }, + { + "id": "ev-result", + "sourceType": "manuscript", + "title": "Results", + "excerpt": "Night-shift workers showed higher kynurenine and lower phosphatidylcholine levels after adjustment for age, sex, caffeine intake, and body mass index.", + "authors": ["Nguyen Lab"], + "year": 2026 + }, + { + "id": "ev-stat-risk", + "sourceType": "manuscript", + "title": "Statistical notes", + "excerpt": "The draft describes p = 0.08 as significant for the exploratory lipid panel and omits confidence intervals for the adjusted odds ratio.", + "authors": ["Nguyen Lab"], + "year": 2026 + }, + { + "id": "ref-kynurenine", + "sourceType": "reference", + "title": "Kynurenine pathway changes in circadian disruption", + "excerpt": "Prior work links circadian disruption with kynurenine pathway remodeling and inflammatory signaling.", + "authors": ["Patel", "Singh"], + "year": 2024, + "doi": "10.5555/circadian.kynurenine", + "openAccess": true + } + ], + "claims": [ + { + "id": "claim-kynurenine", + "text": "Night-shift work is associated with higher kynurenine concentrations.", + "citationRequired": true, + 
"evidenceIds": ["ev-result", "ref-kynurenine"] + }, + { + "id": "claim-causal-sleep", + "text": "Sleep disruption causes the observed phosphatidylcholine decrease.", + "citationRequired": true, + "evidenceIds": [] + } + ], + "toolOutputs": { + "summaries": [ + { + "mode": "abstract", + "text": "Objective: evaluate plasma metabolomic profiles in night-shift and day-shift hospital staff. Methods: 184 participants provided repeated fasting plasma samples with actigraphy-derived sleep measures. Results: night-shift workers showed higher kynurenine and lower phosphatidylcholine after adjustment. Conclusion: the data support a metabolomic association with shift work, but causal sleep claims need stronger evidence.", + "evidenceIds": ["ev-objective", "ev-method", "ev-result"], + "keyFindings": ["Higher kynurenine in night-shift workers"], + "implications": ["Metabolomic screening may help prioritize follow-up studies"], + "nextSteps": ["Add confidence intervals and avoid causal wording"] + }, + { + "mode": "layperson", + "text": "This paper says night-shift hospital staff had metabolomic perturbations in kynurenine and phosphatidylcholine.", + "evidenceIds": ["ev-result"], + "keyFindings": ["Night-shift workers had different blood chemistry"], + "implications": ["Work schedules may be linked with health-related chemistry changes"], + "nextSteps": ["Explain the result with simpler words"] + } + ], + "peerReview": { + "template": "biomedicine", + "diagnostics": [ + { + "id": "stat-pvalue-001", + "category": "statistics", + "severity": "blocker", + "message": "The draft calls p = 0.08 significant.", + "evidenceIds": ["ev-stat-risk"] + }, + { + "id": "clarity-001", + "category": "clarity", + "severity": "warning", + "message": "Lay summary contains domain jargon.", + "evidenceIds": ["ev-result"] + } + ], + "similarity": { + "score": 0.18, + "matchedSources": [] + }, + "compliance": { + "ethicsStatement": "IRB-2026-041 approved the protocol.", + "dataAvailability": "", + 
"fundingStatement": "Supported by institutional pilot funding.", + "conflictStatement": "The authors report no conflicts." + } + }, + "citations": [ + { + "claimId": "claim-kynurenine", + "sourceId": "ref-kynurenine", + "style": "APA", + "formatted": "Patel, & Singh. (2024). Kynurenine pathway changes in circadian disruption. https://doi.org/10.5555/circadian.kynurenine", + "confidence": 0.91, + "evidenceIds": ["ev-result", "ref-kynurenine"], + "insertAfter": "higher kynurenine concentrations" + } + ] + } +} diff --git a/ai-research-tool-evaluation-harness/src/research-tool-evaluation-harness.js b/ai-research-tool-evaluation-harness/src/research-tool-evaluation-harness.js new file mode 100644 index 0000000..19fb4f9 --- /dev/null +++ b/ai-research-tool-evaluation-harness/src/research-tool-evaluation-harness.js @@ -0,0 +1,469 @@ +"use strict"; + +const crypto = require("node:crypto"); + +const REQUIRED_SUMMARY_MODES = new Set(["abstract", "executive", "layperson"]); +const PEER_REVIEW_CATEGORIES = new Set(["statistics", "ethics", "data-availability", "clarity", "similarity"]); +const LAYPERSON_JARGON = [ + "metabolomic", + "kynurenine", + "phosphatidylcholine", + "perturbations", + "actigraphy", +]; + +function stableStringify(value) { + if (Array.isArray(value)) return `[${value.map(stableStringify).join(",")}]`; + if (value && typeof value === "object") { + return `{${Object.keys(value) + .sort() + .map((key) => `${JSON.stringify(key)}:${stableStringify(value[key])}`) + .join(",")}}`; + } + return JSON.stringify(value); +} + +function stableHash(value) { + return crypto.createHash("sha256").update(stableStringify(value)).digest("hex"); +} + +function requireFields(object, fields, label) { + const missing = fields.filter((field) => object[field] === undefined || object[field] === null); + if (missing.length > 0) throw new Error(`${label} is missing required field(s): ${missing.join(", ")}`); +} + +function finding(severity, id, title, detail, remediation, targetIds 
= []) { + return { severity, id, title, detail, remediation, targetIds }; +} + +function tokenize(text) { + return new Set(String(text || "") + .toLowerCase() + .replace(/[^a-z0-9\s-]/g, " ") + .split(/\s+/) + .filter((token) => token.length > 3)); +} + +function wordCount(text) { + return String(text || "").trim().split(/\s+/).filter(Boolean).length; +} + +function indexById(items, label) { + return new Map(items.map((item) => { + requireFields(item, ["id"], label); + return [item.id, item]; + })); +} + +function evidenceOverlap(text, evidence) { + const textTokens = tokenize(text); + const evidenceTokens = tokenize(evidence.excerpt || evidence.title || ""); + let overlap = 0; + for (const token of textTokens) { + if (evidenceTokens.has(token)) overlap += 1; + } + return overlap; +} + +function validateEvidenceIds(evidenceIds, evidenceIndex, targetId) { + const findings = []; + for (const evidenceId of evidenceIds || []) { + if (!evidenceIndex.has(evidenceId)) { + findings.push(finding( + "blocker", + "unknown-evidence-id", + "Output references unknown evidence", + `${targetId} references evidence ${evidenceId}, which is not in the evidence corpus.`, + "Use only registered manuscript or reference evidence IDs.", + [targetId, evidenceId], + )); + } + } + return findings; +} + +function evaluateSummaryMode(summary, evidenceIndex) { + requireFields(summary, ["mode", "text", "evidenceIds"], "summary"); + const findings = []; + const summaryId = `summary:${summary.mode}`; + const count = wordCount(summary.text); + + findings.push(...validateEvidenceIds(summary.evidenceIds, evidenceIndex, summaryId)); + + if (!REQUIRED_SUMMARY_MODES.has(summary.mode)) { + findings.push(finding( + "warning", + "unknown-summary-mode", + "Summary mode is not part of the MVP contract", + `${summary.mode} is not one of abstract, executive, or layperson.`, + "Use one of the supported MVP summary modes.", + [summary.mode], + )); + } + + if (summary.evidenceIds.length === 0) { + 
findings.push(finding( + "blocker", + "summary-evidence-missing", + "Summary has no evidence spans", + `${summaryId} does not cite any manuscript evidence.`, + "Attach evidence IDs for every summary mode before exposing the output.", + [summary.mode], + )); + } + + const supported = summary.evidenceIds.some((evidenceId) => { + const evidence = evidenceIndex.get(evidenceId); + return evidence && evidenceOverlap(summary.text, evidence) >= 2; + }); + if (!supported) { + findings.push(finding( + "blocker", + "summary-not-grounded", + "Summary text is weakly grounded", + `${summaryId} has evidence IDs but little lexical overlap with the cited evidence.`, + "Rewrite the summary with direct support from the cited evidence spans.", + [summary.mode], + )); + } + + if (summary.mode === "abstract") { + for (const marker of ["objective", "methods", "results", "conclusion"]) { + if (!summary.text.toLowerCase().includes(marker)) { + findings.push(finding( + "warning", + "abstract-section-missing", + "Abstract summary is missing a structured marker", + `The abstract-style summary does not include ${marker}.`, + "Include objective, methods, results, and conclusion markers.", + [summary.mode, marker], + )); + } + } + if (count > 180) { + findings.push(finding( + "warning", + "abstract-too-long", + "Abstract summary is too long", + `The abstract summary has ${count} words.`, + "Keep the abstract-style summary concise.", + [summary.mode], + )); + } + } + + if (summary.mode === "layperson") { + const jargon = LAYPERSON_JARGON.filter((term) => summary.text.toLowerCase().includes(term)); + if (jargon.length > 0) { + findings.push(finding( + "warning", + "layperson-jargon", + "Layperson summary contains unexplained jargon", + `The layperson summary contains: ${jargon.join(", ")}.`, + "Replace jargon with plain-language descriptions or define it inline.", + [summary.mode, ...jargon], + )); + } + } + + for (const field of ["keyFindings", "implications", "nextSteps"]) { + if 
(!Array.isArray(summary[field]) || summary[field].length === 0) { + findings.push(finding( + "warning", + "summary-output-section-missing", + "Summary omits an expected output section", + `${summaryId} has no ${field}.`, + "Return key findings, implications, and next steps for collaborator-ready summaries.", + [summary.mode, field], + )); + } + } + + return { + mode: summary.mode, + wordCount: count, + decision: findings.some((item) => item.severity === "blocker") + ? "blocked" + : findings.length > 0 ? "needs-review" : "ready", + findings, + }; +} + +function evaluateSummaryOutputs(summaries, evidenceIndex) { + const reports = summaries.map((summary) => evaluateSummaryMode(summary, evidenceIndex)); + const modes = new Set(summaries.map((summary) => summary.mode)); + for (const requiredMode of REQUIRED_SUMMARY_MODES) { + if (!modes.has(requiredMode)) { + reports.push({ + mode: requiredMode, + wordCount: 0, + decision: "needs-review", + findings: [finding( + "warning", + "summary-mode-missing", + "Expected summary mode is missing", + `No ${requiredMode} summary was provided.`, + "Generate all MVP summary modes before marking the workflow complete.", + [requiredMode], + )], + }); + } + } + return reports; +} + +function evaluatePeerReviewAid(peerReview, manuscript, evidenceIndex) { + requireFields(peerReview, ["template", "diagnostics", "compliance"], "peer review output"); + const findings = []; + const categories = new Set(peerReview.diagnostics.map((diagnostic) => diagnostic.category)); + + for (const diagnostic of peerReview.diagnostics) { + requireFields(diagnostic, ["id", "category", "severity", "message", "evidenceIds"], "peer review diagnostic"); + findings.push(...validateEvidenceIds(diagnostic.evidenceIds, evidenceIndex, diagnostic.id)); + if (!PEER_REVIEW_CATEGORIES.has(diagnostic.category)) { + findings.push(finding( + "warning", + "unknown-peer-review-category", + "Peer-review diagnostic category is not recognized", + `${diagnostic.id} uses category 
${diagnostic.category}.`, + "Use one of the standard MVP diagnostic categories.", + [diagnostic.id, diagnostic.category], + )); + } + } + + if (!categories.has("statistics")) { + findings.push(finding( + "warning", + "statistics-diagnostic-missing", + "Statistical diagnostics are missing", + "The peer-review aid did not evaluate statistical risks.", + "Check p-values, confidence intervals, effect sizes, and sample-size reporting.", + ["statistics"], + )); + } + + const statEvidence = [...evidenceIndex.values()].find((item) => /p\s*=\s*0\.08/i.test(item.excerpt || "")); + const statDiagnostic = peerReview.diagnostics.some((item) => item.category === "statistics" && item.severity !== "info"); + if (statEvidence && !statDiagnostic) { + findings.push(finding( + "blocker", + "pvalue-risk-undetected", + "P-value interpretation risk was not detected", + "The evidence corpus contains p = 0.08 described as significant, but no statistical warning was returned.", + "Add a statistics diagnostic for p-value and confidence interval consistency.", + ["statistics", statEvidence.id], + )); + } + + if (manuscript.requiresEthicsStatement && !peerReview.compliance.ethicsStatement) { + findings.push(finding( + "blocker", + "ethics-statement-missing", + "Required ethics statement is missing", + "The manuscript requires ethics documentation but the peer-review output did not find one.", + "Add or request an ethics statement before submission.", + ["ethicsStatement"], + )); + } + + if (manuscript.requiresDataAvailability && !peerReview.compliance.dataAvailability) { + findings.push(finding( + "blocker", + "data-availability-missing", + "Required data availability statement is missing", + "The manuscript requires a data availability statement but none was detected.", + "Add a data availability statement or explain controlled-access restrictions.", + ["dataAvailability"], + )); + } + + if (peerReview.similarity && peerReview.similarity.score >= 0.35) { + findings.push(finding( + "blocker", 
+ "similarity-score-high", + "Similarity score requires plagiarism review", + `Similarity score is ${peerReview.similarity.score}.`, + "Review matched sources before submission.", + ["similarity"], + )); + } + + return { + template: peerReview.template, + diagnosticsChecked: peerReview.diagnostics.length, + decision: findings.some((item) => item.severity === "blocker") + ? "blocked" + : findings.length > 0 ? "needs-review" : "ready", + findings, + }; +} + +function formatLooksValid(style, formatted) { + if (style === "APA") return /\(\d{4}\)/.test(formatted) && /doi\.org\//.test(formatted); + if (style === "Nature") return /^\d+\.\s+/.test(formatted) || /\.\s+\d{4};/.test(formatted); + return formatted.length > 20; +} + +function evaluateCitationRecommendations(citations, claims, evidenceIndex) { + const findings = []; + const citationByClaim = new Map(citations.map((citation) => [citation.claimId, citation])); + + for (const claim of claims) { + requireFields(claim, ["id", "text", "citationRequired"], "claim"); + const citation = citationByClaim.get(claim.id); + if (claim.citationRequired && !citation) { + findings.push(finding( + "blocker", + "required-claim-citation-missing", + "Required claim has no citation recommendation", + `${claim.id} requires a citation but no recommendation was provided.`, + "Return a citation recommendation or downgrade unsupported claim language.", + [claim.id], + )); + continue; + } + + if (!citation) continue; + requireFields(citation, ["claimId", "sourceId", "style", "formatted", "confidence", "evidenceIds"], "citation"); + findings.push(...validateEvidenceIds(citation.evidenceIds, evidenceIndex, citation.claimId)); + + const source = evidenceIndex.get(citation.sourceId); + if (!source) { + findings.push(finding( + "blocker", + "citation-source-missing", + "Citation source is not registered", + `${citation.claimId} references source ${citation.sourceId}, which is not in the evidence corpus.`, + "Recommend citations from registered 
references.", + [citation.claimId, citation.sourceId], + )); + } else if (source.sourceType === "reference" && !source.doi) { + findings.push(finding( + "warning", + "citation-doi-missing", + "External reference lacks DOI", + `${source.id} has no DOI metadata.`, + "Attach DOI metadata when available for reliable reference formatting.", + [citation.claimId, source.id], + )); + } + + if (citation.confidence < 0.75) { + findings.push(finding( + "warning", + "citation-confidence-low", + "Citation confidence is below review threshold", + `${citation.claimId} confidence is ${citation.confidence}.`, + "Ask a reviewer to confirm the citation before insertion.", + [citation.claimId], + )); + } + + if (!formatLooksValid(citation.style, citation.formatted)) { + findings.push(finding( + "warning", + "citation-format-invalid", + "Citation formatting does not match requested style", + `${citation.claimId} has a ${citation.style} citation that failed deterministic formatting checks.`, + "Regenerate the reference in the requested journal style.", + [citation.claimId, citation.style], + )); + } + + const claimEvidence = new Set(claim.evidenceIds || []); + const overlapsClaim = citation.evidenceIds.some((evidenceId) => claimEvidence.has(evidenceId)); + if (claim.evidenceIds.length > 0 && !overlapsClaim) { + findings.push(finding( + "blocker", + "citation-not-linked-to-claim-evidence", + "Citation is not linked to claim evidence", + `${citation.claimId} recommendation does not reference any expected claim evidence span.`, + "Tie citation recommendations to the claim's supporting manuscript or reference evidence.", + [citation.claimId], + )); + } + } + + return { + citationCount: citations.length, + requiredClaimCount: claims.filter((claim) => claim.citationRequired).length, + decision: findings.some((item) => item.severity === "blocker") + ? "blocked" + : findings.length > 0 ? 
"needs-review" : "ready", + findings, + }; +} + +function buildActionQueue(reports) { + return reports.flatMap((report) => { + return report.findings.map((item) => ({ + severity: item.severity, + findingId: item.id, + title: item.title, + remediation: item.remediation, + targetIds: item.targetIds, + })); + }).sort((a, b) => { + const rank = { blocker: 0, warning: 1, info: 2 }; + return rank[a.severity] - rank[b.severity] || a.findingId.localeCompare(b.findingId); + }); +} + +function calculateReadinessScore(reports) { + const findings = reports.flatMap((report) => report.findings); + const penalty = findings.reduce((score, item) => { + if (item.severity === "blocker") return score + 18; + if (item.severity === "warning") return score + 7; + return score + 2; + }, 0); + return Math.max(0, 100 - penalty); +} + +function evaluateResearchToolRun(bundle, options = {}) { + requireFields(bundle, ["manuscript", "evidenceCorpus", "claims", "toolOutputs"], "research tool bundle"); + requireFields(bundle.manuscript, ["id", "title", "domain"], "manuscript"); + requireFields(bundle.toolOutputs, ["summaries", "peerReview", "citations"], "tool outputs"); + + const evidenceIndex = indexById(bundle.evidenceCorpus, "evidence"); + const summaryReports = evaluateSummaryOutputs(bundle.toolOutputs.summaries, evidenceIndex); + const peerReviewReport = evaluatePeerReviewAid(bundle.toolOutputs.peerReview, bundle.manuscript, evidenceIndex); + const citationReport = evaluateCitationRecommendations(bundle.toolOutputs.citations, bundle.claims, evidenceIndex); + + const reports = [...summaryReports, peerReviewReport, citationReport]; + const actionQueue = buildActionQueue(reports); + const score = calculateReadinessScore(reports); + const decision = actionQueue.some((item) => item.severity === "blocker") + ? "review-required" + : actionQueue.length > 0 ? 
"human-check-recommended" : "ready-for-assisted-use"; + const evaluatedAt = options.now || bundle.now || new Date().toISOString(); + const auditDigest = stableHash({ + manuscriptId: bundle.manuscript.id, + evaluatedAt, + decision, + score, + summaryReports, + peerReviewReport, + citationReport, + actionQueue, + }); + + return { + manuscriptId: bundle.manuscript.id, + title: bundle.manuscript.title, + evaluatedAt, + decision, + score, + summaryReports, + peerReviewReport, + citationReport, + actionQueue, + auditDigest: `sha256:${auditDigest}`, + }; +} + +module.exports = { + evaluateResearchToolRun, + stableHash, + stableStringify, +}; diff --git a/ai-research-tool-evaluation-harness/test.js b/ai-research-tool-evaluation-harness/test.js new file mode 100644 index 0000000..bf3f943 --- /dev/null +++ b/ai-research-tool-evaluation-harness/test.js @@ -0,0 +1,59 @@ +"use strict"; + +const assert = require("node:assert/strict"); +const sampleBundle = require("./sample-data.json"); +const { + evaluateResearchToolRun, + stableHash, +} = require("./src/research-tool-evaluation-harness"); + +function clone(value) { + return JSON.parse(JSON.stringify(value)); +} + +const blocked = evaluateResearchToolRun(sampleBundle); + +assert.equal(blocked.decision, "review-required"); +assert.equal(blocked.score < 100, true); +assert.match(blocked.auditDigest, /^sha256:[a-f0-9]{64}$/); +assert(blocked.actionQueue.some((item) => item.findingId === "data-availability-missing")); +assert(blocked.actionQueue.some((item) => item.findingId === "required-claim-citation-missing")); +assert(blocked.actionQueue.some((item) => item.findingId === "layperson-jargon")); +assert.equal(blocked.peerReviewReport.decision, "blocked"); +assert.equal(blocked.citationReport.decision, "blocked"); + +const readyBundle = clone(sampleBundle); +readyBundle.toolOutputs.summaries.push({ + mode: "executive", + text: "Night-shift workers showed higher kynurenine and lower phosphatidylcholine levels after adjustment for 
age, caffeine intake, and body mass index.", + evidenceIds: ["ev-objective", "ev-result"], + keyFindings: ["Night-shift staff had different blood chemistry markers"], + implications: ["The result can guide follow-up studies"], + nextSteps: ["Add confidence intervals and data availability text"], +}); +readyBundle.toolOutputs.summaries[1].text = "This paper says night-shift workers showed higher blood marker levels and lower lipid levels after adjustment for age, caffeine, and body size."; +readyBundle.toolOutputs.peerReview.compliance.dataAvailability = "De-identified tables and analysis scripts will be deposited in the project repository before publication."; +readyBundle.toolOutputs.citations.push({ + claimId: "claim-causal-sleep", + sourceId: "ev-result", + style: "Nature", + formatted: "1. Nguyen Lab. Results. 2026;", + confidence: 0.82, + evidenceIds: ["ev-result"], + insertAfter: "phosphatidylcholine decrease", +}); +readyBundle.claims[1].text = "Sleep disruption may be associated with the observed phosphatidylcholine decrease."; +readyBundle.claims[1].evidenceIds = ["ev-result"]; + +const ready = evaluateResearchToolRun(readyBundle); + +assert.equal(ready.decision, "ready-for-assisted-use"); +assert.equal(ready.score, 100); +assert.equal(ready.actionQueue.length, 0); +assert.equal(ready.summaryReports.every((report) => report.decision === "ready"), true); +assert.equal(ready.peerReviewReport.decision, "ready"); +assert.equal(ready.citationReport.decision, "ready"); +assert.notEqual(blocked.auditDigest, ready.auditDigest); +assert.equal(stableHash({ b: 2, a: 1 }), stableHash({ a: 1, b: 2 })); + +console.log("ai research tool evaluation harness tests passed");