SCIBASE-AI · zhengjynicolas · May 15, 2026
diff --git a/ai-research-tool-evaluation-harness/README.md b/ai-research-tool-evaluation-harness/README.md
@@ -0,0 +1,25 @@
+# AI Research Tool Evaluation Harness
+
+This module adds a focused quality gate for the AI-Assisted Research Tools MVP. It does not call a live model. Instead, it evaluates proposed summarizer, peer-review, and citation-tool outputs against deterministic manuscript evidence so reviewers can see whether AI outputs are grounded, compliant, and insertion-ready.
+
+## What it checks
+
+- Summary mode contracts for abstract, executive, and layperson outputs
+- Evidence-span coverage for findings, implications, next steps, and recommendations
+- Peer-review diagnostics for statistical, ethics, data availability, clarity, and similarity risks
+- Citation recommendations for claim coverage, confidence, DOI metadata, and APA/Nature formatting
+- Actionable quality gates with blocker/warning severities, readiness score, and audit digest
+
+## Run locally
+
+```bash
+npm run check
+npm test
+npm run demo
+```
+
+The sample bundle is intentionally seeded with defects so that the harness flags one unsupported citation gap, one missing data availability statement, one p-value interpretation risk, and one layperson-summary jargon issue.
+
+## Demo video
+
+The reviewer demo is at `docs/research-tool-evaluation-demo.mp4`. It shows a terminal-style walkthrough of `npm run check`, `npm test`, and `npm run demo` using the current module output.
diff --git a/ai-research-tool-evaluation-harness/demo.js b/ai-research-tool-evaluation-harness/demo.js
@@ -0,0 +1,23 @@
+"use strict";
+// Demo script: evaluates the bundled sample data and prints a condensed JSON view of the resulting report.
+const sampleBundle = require("./sample-data.json"); // fixture bundle shipped alongside this script
+const { evaluateResearchToolRun } = require("./src/research-tool-evaluation-harness");
+
+const report = evaluateResearchToolRun(sampleBundle); // full report; only selected fields are printed below
+
+console.log(JSON.stringify({ // serialized with 2-space indentation (see the trailing `null, 2`)
+  manuscriptId: report.manuscriptId,
+  decision: report.decision,
+  score: report.score,
+  summaryDecisions: report.summaryReports.map((summary) => ({ // one condensed entry per summary report
+    mode: summary.mode,
+    decision: summary.decision,
+    findingIds: summary.findings.map((finding) => finding.id), // IDs only, not the full finding objects
+  })),
+  peerReviewDecision: report.peerReviewReport.decision,
+  citationDecision: report.citationReport.decision,
+  blockerActions: report.actionQueue // keep only blocker-severity actions, reduced to their finding IDs
+    .filter((action) => action.severity === "blocker")
+    .map((action) => action.findingId),
+  auditDigest: report.auditDigest, // NOTE(review): docs describe this as a SHA-256 digest — confirm in the harness
+}, null, 2));
diff --git a/ai-research-tool-evaluation-harness/docs/requirement-map.md b/ai-research-tool-evaluation-harness/docs/requirement-map.md
@@ -0,0 +1,14 @@
+# Requirement Map
+
+Issue: SCIBASE-AI/SCIBASE.AI#13 - AI-Assisted Research Tools (MVP Level)
+
+| Requirement | Implementation |
+| --- | --- |
+| AI paper summarizer | `evaluateSummaryOutputs` checks mode-specific summaries, key findings, implications, next steps, and grounding to manuscript evidence. |
+| Abstract, executive, layperson modes | `evaluateSummaryMode` applies deterministic constraints for each mode, including layperson jargon checks. |
+| Domain-aware output | The sample manuscript carries domain metadata, and the evaluator requires evidence-backed findings rather than generic prose. |
+| AI peer review aid | `evaluatePeerReviewAid` checks statistical diagnostics, compliance diagnostics, similarity risk, clarity findings, and template coverage. |
+| Statistical and compliance checks | The harness flags p-value interpretation risks, missing confidence intervals, missing ethics statements, and missing data availability statements. |
+| AI citation tool | `evaluateCitationRecommendations` maps required claims to citation candidates and verifies evidence IDs, confidence, DOI metadata, and formatting. |
+| Multiple reference styles | APA and Nature formatting are validated with style-specific checks. |
+| Immediate reviewer value | `evaluateResearchToolRun` returns a readiness decision, action queue, score, and SHA-256 audit digest suitable for PR/demo review. |
diff --git a/ai-research-tool-evaluation-harness/docs/research-tool-evaluation-demo.mp4 b/ai-research-tool-evaluation-harness/docs/research-tool-evaluation-demo.mp4
diff --git a/ai-research-tool-evaluation-harness/package.json b/ai-research-tool-evaluation-harness/package.json
@@ -0,0 +1,13 @@
+{
+  "name": "ai-research-tool-evaluation-harness",
+  "version": "1.0.0",
+  "private": true,
+  "description": "Deterministic evaluation harness for AI-assisted research tool outputs.",
+  "main": "src/research-tool-evaluation-harness.js",
+  "scripts": {
+    "check": "node --check src/research-tool-evaluation-harness.js && node --check test.js && node --check demo.js",
+    "test": "node test.js",
+    "demo": "node demo.js"
+  },
+  "license": "MIT"
+}
diff --git a/ai-research-tool-evaluation-harness/sample-data.json b/ai-research-tool-evaluation-harness/sample-data.json
@@ -0,0 +1,128 @@
+{
+  "now": "2026-05-15T17:30:00.000Z",
+  "manuscript": {
+    "id": "ms-neuro-metabolomics-042",
+    "title": "Metabolomic markers of sleep disruption in shift workers",
+    "domain": "biomedicine",
+    "requiresEthicsStatement": true,
+    "requiresDataAvailability": true
+  },
+  "evidenceCorpus": [
+    {
+      "id": "ev-objective",
+      "sourceType": "manuscript",
+      "title": "Abstract objective",
+      "excerpt": "We evaluated whether plasma metabolomic profiles differed between night-shift and day-shift hospital staff.",
+      "authors": ["Nguyen Lab"],
+      "year": 2026
+    },
+    {
+      "id": "ev-method",
+      "sourceType": "manuscript",
+      "title": "Methods",
+      "excerpt": "The cohort included 184 participants with repeated fasting plasma samples and actigraphy-derived sleep measures.",
+      "authors": ["Nguyen Lab"],
+      "year": 2026
+    },
+    {
+      "id": "ev-result",
+      "sourceType": "manuscript",
+      "title": "Results",
+      "excerpt": "Night-shift workers showed higher kynurenine and lower phosphatidylcholine levels after adjustment for age, sex, caffeine intake, and body mass index.",
+      "authors": ["Nguyen Lab"],
+      "year": 2026
+    },
+    {
+      "id": "ev-stat-risk",
+      "sourceType": "manuscript",
+      "title": "Statistical notes",
+      "excerpt": "The draft describes p = 0.08 as significant for the exploratory lipid panel and omits confidence intervals for the adjusted odds ratio.",
+      "authors": ["Nguyen Lab"],
+      "year": 2026
+    },
+    {
+      "id": "ref-kynurenine",
+      "sourceType": "reference",
+      "title": "Kynurenine pathway changes in circadian disruption",
+      "excerpt": "Prior work links circadian disruption with kynurenine pathway remodeling and inflammatory signaling.",
+      "authors": ["Patel", "Singh"],
+      "year": 2024,
+      "doi": "10.5555/circadian.kynurenine",
+      "openAccess": true
+    }
+  ],
+  "claims": [
+    {
+      "id": "claim-kynurenine",
+      "text": "Night-shift work is associated with higher kynurenine concentrations.",
+      "citationRequired": true,
+      "evidenceIds": ["ev-result", "ref-kynurenine"]
+    },
+    {
+      "id": "claim-causal-sleep",
+      "text": "Sleep disruption causes the observed phosphatidylcholine decrease.",
+      "citationRequired": true,
+      "evidenceIds": []
+    }
+  ],
+  "toolOutputs": {
+    "summaries": [
+      {
+        "mode": "abstract",
+        "text": "Objective: evaluate plasma metabolomic profiles in night-shift and day-shift hospital staff. Methods: 184 participants provided repeated fasting plasma samples with actigraphy-derived sleep measures. Results: night-shift workers showed higher kynurenine and lower phosphatidylcholine after adjustment. Conclusion: the data support a metabolomic association with shift work, but causal sleep claims need stronger evidence.",
+        "evidenceIds": ["ev-objective", "ev-method", "ev-result"],
+        "keyFindings": ["Higher kynurenine in night-shift workers"],
+        "implications": ["Metabolomic screening may help prioritize follow-up studies"],
+        "nextSteps": ["Add confidence intervals and avoid causal wording"]
+      },
+      {
+        "mode": "layperson",
+        "text": "This paper says night-shift hospital staff had metabolomic perturbations in kynurenine and phosphatidylcholine.",
+        "evidenceIds": ["ev-result"],
+        "keyFindings": ["Night-shift workers had different blood chemistry"],
+        "implications": ["Work schedules may be linked with health-related chemistry changes"],
+        "nextSteps": ["Explain the result with simpler words"]
+      }
+    ],
+    "peerReview": {
+      "template": "biomedicine",
+      "diagnostics": [
+        {
+          "id": "stat-pvalue-001",
+          "category": "statistics",
+          "severity": "blocker",
+          "message": "The draft calls p = 0.08 significant.",
+          "evidenceIds": ["ev-stat-risk"]
+        },
+        {
+          "id": "clarity-001",
+          "category": "clarity",
+          "severity": "warning",
+          "message": "Lay summary contains domain jargon.",
+          "evidenceIds": ["ev-result"]
+        }
+      ],
+      "similarity": {
+        "score": 0.18,
+        "matchedSources": []
+      },
+      "compliance": {
+        "ethicsStatement": "IRB-2026-041 approved the protocol.",
+        "dataAvailability": "",
+        "fundingStatement": "Supported by institutional pilot funding.",
+        "conflictStatement": "The authors report no conflicts."
+      }
+    },
+    "citations": [
+      {
+        "claimId": "claim-kynurenine",
+        "sourceId": "ref-kynurenine",
+        "style": "APA",
+        "formatted": "Patel, & Singh. (2024). Kynurenine pathway changes in circadian disruption. https://doi.org/10.5555/circadian.kynurenine",
+        "confidence": 0.91,
+        "evidenceIds": ["ev-result", "ref-kynurenine"],
+        "insertAfter": "higher kynurenine concentrations"
+      }
+    ]
+  }
+}