diff --git a/review-calibration-bench/README.md b/review-calibration-bench/README.md new file mode 100644 index 0000000..95b0d70 --- /dev/null +++ b/review-calibration-bench/README.md @@ -0,0 +1,35 @@ +# Review Calibration Bench + +Dependency-free peer-review calibration and coaching signals for the Community & Reputation System bounty. + +This module focuses on the quality gate before peer-review activity increases a researcher's public reputation. It compares structured review rubric scores against consensus panels, identifies leniency/severity drift, flags reproducibility blind spots, and emits transparent trust-adjustment and coaching actions. + +## Run + +```bash +npm run check +npm test +npm run demo +``` + +## Demo Output + +```text +Program: community-review-q2-2026 +Status: coaching-needed +Reviewers calibrated: 3 +Coaching actions: 7 +Quarantined reviewers: 1 +Top reviewer: ada (trusted-reviewer) +Top action: Review practice set: rigor scores are higher than consensus. +``` + +## Files + +- `src/review-calibration.js` builds calibration reports, leaderboard scores, coaching actions, trust adjustments, dashboard summary, audit trail, and stable digest. +- `data/sample-calibration.json` contains synthetic structured reviews, consensus panels, reviewer modes, and contributor signals. +- `test/review-calibration.test.js` verifies calibration scores, drift classification, blind-spot detection, quarantine behavior, leaderboard ordering, and digest stability. +- `docs/requirement-map.md` maps this slice to issue #15. +- `docs/demo.svg` and `docs/demo.mp4` provide a short visual artifact for review. + +No real private review content, identity secret, or external service credential is used. 
diff --git a/review-calibration-bench/data/sample-calibration.json b/review-calibration-bench/data/sample-calibration.json new file mode 100644 index 0000000..915de21 --- /dev/null +++ b/review-calibration-bench/data/sample-calibration.json @@ -0,0 +1,171 @@ +{ + "programId": "community-review-q2-2026", + "asOf": "2026-05-15T00:00:00Z", + "rubric": { + "criteria": [ + "clarity", + "rigor", + "novelty", + "reproducibility" + ], + "weights": { + "clarity": 0.2, + "rigor": 0.3, + "novelty": 0.2, + "reproducibility": 0.3 + }, + "calibrationThreshold": 0.7 + }, + "projects": [ + { + "id": "project-organoid-benchmark", + "domain": "biology", + "consensus": { + "clarity": 4, + "rigor": 4, + "novelty": 3, + "reproducibility": 2 + } + }, + { + "id": "project-graph-protocol", + "domain": "computational-science", + "consensus": { + "clarity": 3, + "rigor": 5, + "novelty": 4, + "reproducibility": 4 + } + }, + { + "id": "project-open-dataset-release", + "domain": "data-science", + "consensus": { + "clarity": 5, + "rigor": 3, + "novelty": 2, + "reproducibility": 5 + } + } + ], + "reviews": [ + { + "id": "review-ada-1", + "reviewerId": "ada", + "projectId": "project-organoid-benchmark", + "mode": "public", + "scores": { + "clarity": 4, + "rigor": 4, + "novelty": 3, + "reproducibility": 3 + }, + "comments": [ + "Strong protocol trace, but execution container still needs one replication note." + ] + }, + { + "id": "review-ada-2", + "reviewerId": "ada", + "projectId": "project-graph-protocol", + "mode": "public", + "scores": { + "clarity": 3, + "rigor": 4, + "novelty": 4, + "reproducibility": 4 + }, + "comments": [ + "Good evidence paths and adequate reproducibility metadata." + ] + }, + { + "id": "review-bohr-1", + "reviewerId": "bohr", + "projectId": "project-organoid-benchmark", + "mode": "anonymous", + "scores": { + "clarity": 5, + "rigor": 5, + "novelty": 5, + "reproducibility": 5 + }, + "comments": [ + "Excellent across all dimensions." 
+ ] + }, + { + "id": "review-bohr-2", + "reviewerId": "bohr", + "projectId": "project-open-dataset-release", + "mode": "anonymous", + "scores": { + "clarity": 5, + "rigor": 5, + "novelty": 4, + "reproducibility": 5 + }, + "comments": [ + "Dataset appears ready for reuse." + ] + }, + { + "id": "review-curie-1", + "reviewerId": "curie", + "projectId": "project-graph-protocol", + "mode": "double-blind", + "scores": { + "clarity": 2, + "rigor": 4, + "novelty": 3, + "reproducibility": 2 + }, + "comments": [ + "Promising, but missing a complete notebook execution trail." + ] + }, + { + "id": "review-curie-2", + "reviewerId": "curie", + "projectId": "project-open-dataset-release", + "mode": "double-blind", + "scores": { + "clarity": 4, + "rigor": 3, + "novelty": 2, + "reproducibility": 3 + }, + "comments": [ + "Good data dictionary, but independent rerun evidence is incomplete." + ] + } + ], + "contributors": [ + { + "reviewerId": "ada", + "roles": [ + "review", + "validation" + ], + "completedBounties": 2, + "endorsements": 5 + }, + { + "reviewerId": "bohr", + "roles": [ + "review" + ], + "completedBounties": 4, + "endorsements": 8 + }, + { + "reviewerId": "curie", + "roles": [ + "review", + "reproducibility" + ], + "completedBounties": 1, + "endorsements": 3 + } + ] +} diff --git a/review-calibration-bench/docs/demo.mp4 b/review-calibration-bench/docs/demo.mp4 new file mode 100644 index 0000000..e9f885b Binary files /dev/null and b/review-calibration-bench/docs/demo.mp4 differ diff --git a/review-calibration-bench/docs/demo.svg b/review-calibration-bench/docs/demo.svg new file mode 100644 index 0000000..9ac20e1 --- /dev/null +++ b/review-calibration-bench/docs/demo.svg @@ -0,0 +1,29 @@ + + Review calibration bench demo + Dashboard-style summary of peer-review calibration, coaching queue, reviewer quarantine, and trusted reviewer tier. 
+ + + Review Calibration Bench + Structured peer-review quality signals before reputation credit + + + Reviewers + 3 calibrated + + + + Coaching + 7 actions + + + + Quarantine + 1 reviewer + + + + Top action + Review practice set: rigor scores are higher than consensus. + Transparent trust adjustment · reproducibility blind spot detected + + diff --git a/review-calibration-bench/docs/requirement-map.md b/review-calibration-bench/docs/requirement-map.md new file mode 100644 index 0000000..1feda74 --- /dev/null +++ b/review-calibration-bench/docs/requirement-map.md @@ -0,0 +1,30 @@ +# Requirement Map + +This module contributes a focused structured-review quality layer for issue #15, "Community & Reputation System." + +| Issue area | Covered by this module | +| --- | --- | +| Peer reviews and comments | Scores structured reviews against discipline-neutral rubric criteria: clarity, rigor, novelty, reproducibility | +| Optional scoring quality | Compares reviewer scores against consensus panels and identifies systematic leniency, severity, and inconsistency | +| Review history on profiles | Builds reviewer calibration reports, modes used, review counts, and audit-trail events | +| Contributor credits | Includes CRediT-style reviewer roles and completed bounty / endorsement contribution signals | +| Reputation scoring | Emits transparent trust adjustments, tiers, and quarantine decisions when calibration is weak | +| Incentive tiers | Produces trusted-reviewer, calibrated-reviewer, needs-light-coaching, and mentor-required tiers | + +## Distinctness + +Existing #15 submissions cover broad community reputation ledgers, CRediT graphs, badges, leaderboards, abuse detection, and appeals. This module focuses on the quality gate before peer-review activity increases reputation: + +- Does the reviewer score close to consensus? +- Is a reviewer systematically too lenient or too severe? +- Does a reviewer overlook reproducibility? 
+- Should the review count toward reputation immediately, or enter a coaching queue first? + +## Verification + +```bash +cd review-calibration-bench +npm run check +npm test +npm run demo +``` diff --git a/review-calibration-bench/package.json b/review-calibration-bench/package.json new file mode 100644 index 0000000..286aa01 --- /dev/null +++ b/review-calibration-bench/package.json @@ -0,0 +1,18 @@ +{ + "name": "review-calibration-bench", + "version": "1.0.0", + "private": true, + "description": "Dependency-free peer-review calibration and coaching signals for scientific reputation systems.", + "scripts": { + "check": "node --check src/review-calibration.js && node --check scripts/demo.js && node --check test/review-calibration.test.js", + "demo": "node scripts/demo.js", + "test": "node test/review-calibration.test.js" + }, + "keywords": [ + "peer-review", + "reputation", + "calibration", + "review-quality" + ], + "license": "MIT" +} diff --git a/review-calibration-bench/scripts/demo.js b/review-calibration-bench/scripts/demo.js new file mode 100644 index 0000000..30a9312 --- /dev/null +++ b/review-calibration-bench/scripts/demo.js @@ -0,0 +1,16 @@ +const fs = require("node:fs"); +const path = require("node:path"); +const { buildCalibrationBench } = require("../src/review-calibration"); + +const samplePath = path.join(__dirname, "..", "data", "sample-calibration.json"); +const input = JSON.parse(fs.readFileSync(samplePath, "utf8")); +const report = buildCalibrationBench(input); + +console.log(`Program: ${report.programId}`); +console.log(`Status: ${report.dashboard.status}`); +console.log(`Reviewers calibrated: ${report.dashboard.reviewerCount}`); +console.log(`Coaching actions: ${report.dashboard.coachingActionCount}`); +console.log(`Quarantined reviewers: ${report.dashboard.quarantinedReviewerCount}`); +console.log(`Top reviewer: ${report.leaderboard[0].reviewerId} (${report.leaderboard[0].tier})`); +console.log(`Top action: ${report.dashboard.topAction}`); 
+console.log(`Digest: ${report.digest}`); diff --git a/review-calibration-bench/src/review-calibration.js b/review-calibration-bench/src/review-calibration.js new file mode 100644 index 0000000..d09a9d7 --- /dev/null +++ b/review-calibration-bench/src/review-calibration.js @@ -0,0 +1,274 @@ +const crypto = require("node:crypto"); + +function buildCalibrationBench(input) { + const validation = validateCalibrationInput(input); + const projectIndex = new Map((input.projects || []).map((project) => [project.id, project])); + const reviewerGroups = groupReviewsByReviewer(input.reviews || []); + const reviewerReports = Array.from(reviewerGroups.entries()).map(([reviewerId, reviews]) => + evaluateReviewer(reviewerId, reviews, projectIndex, input.rubric || {}, input.contributors || []) + ); + const leaderboard = buildLeaderboard(reviewerReports); + const coachingQueue = buildCoachingQueue(reviewerReports); + const trustAdjustments = buildTrustAdjustments(reviewerReports, input.rubric || {}); + const dashboard = buildDashboard(input, reviewerReports, coachingQueue, trustAdjustments); + + const report = { + programId: input.programId, + asOf: input.asOf, + validation, + reviewerReports, + leaderboard, + coachingQueue, + trustAdjustments, + dashboard, + auditTrail: buildAuditTrail(input, reviewerReports, coachingQueue, trustAdjustments) + }; + + report.digest = stableDigest(report); + return report; +} + +function validateCalibrationInput(input) { + const required = [ + ["programId", input.programId], + ["rubric.criteria", input.rubric && (input.rubric.criteria || []).length], + ["projects", (input.projects || []).length], + ["reviews", (input.reviews || []).length] + ]; + const missing = required.filter(([, value]) => !value).map(([field]) => field); + const reviewIssues = (input.reviews || []).flatMap((review) => { + const issues = []; + if (!review.id) issues.push("review.id"); + if (!review.reviewerId) issues.push( `${review.id || "unknown"}.reviewerId` + ); + if 
(!review.projectId) issues.push(`${review.id || "unknown"}.projectId`); + if (!review.scores) issues.push(`${review.id || "unknown"}.scores`); + return issues; + }); + + return { + status: missing.length === 0 && reviewIssues.length === 0 ? "passed" : "incomplete", + score: Math.max(0, 100 - missing.length * 15 - reviewIssues.length * 5), + missing, + reviewIssues + }; +} + +function evaluateReviewer(reviewerId, reviews, projectIndex, rubric, contributors) { + const criteria = rubric.criteria || []; + const deltas = reviews.flatMap((review) => { + const project = projectIndex.get(review.projectId); + if (!project) return []; + return criteria.map((criterion) => ({ + reviewId: review.id, + projectId: review.projectId, + criterion, + score: Number(review.scores[criterion] || 0), + consensus: Number(project.consensus[criterion] || 0), + delta: Number(review.scores[criterion] || 0) - Number(project.consensus[criterion] || 0) + })); + }); + const byCriterion = criteria.map((criterion) => { + const criterionDeltas = deltas.filter((item) => item.criterion === criterion); + const averageDelta = average(criterionDeltas.map((item) => item.delta)); + const meanAbsoluteError = average(criterionDeltas.map((item) => Math.abs(item.delta))); + return { + criterion, + averageDelta: round(averageDelta), + meanAbsoluteError: round(meanAbsoluteError), + drift: classifyDrift(averageDelta, meanAbsoluteError) + }; + }); + const weightedError = weightedMean(byCriterion, rubric.weights || {}); + const calibrationScore = round(Math.max(0, 1 - weightedError / 4)); + const reproducibilityBlindSpot = byCriterion.find((item) => item.criterion === "reproducibility" && item.averageDelta > 0.75); + const contributor = contributors.find((item) => item.reviewerId === reviewerId) || {}; + const reputationSignal = buildReputationSignal(calibrationScore, contributor, reproducibilityBlindSpot); + + return { + reviewerId, + reviewCount: reviews.length, + modes: Array.from(new Set(reviews.map((review) => 
review.mode))), + byCriterion, + weightedError: round(weightedError), + calibrationScore, + reproducibilityBlindSpot: Boolean(reproducibilityBlindSpot), + reputationSignal, + coachingActions: buildReviewerCoachingActions(reviewerId, byCriterion, calibrationScore, reproducibilityBlindSpot) + }; +} + +function buildReputationSignal(calibrationScore, contributor, reproducibilityBlindSpot) { + const contributionBonus = Math.min(0.12, (contributor.completedBounties || 0) * 0.02 + (contributor.endorsements || 0) * 0.005); + const blindSpotPenalty = reproducibilityBlindSpot ? 0.08 : 0; + const calibratedScore = round(Math.max(0, Math.min(1, calibrationScore + contributionBonus - blindSpotPenalty))); + return { + calibratedScore, + contributionBonus: round(contributionBonus), + blindSpotPenalty, + roles: contributor.roles || [], + tier: chooseTier(calibratedScore) + }; +} + +function buildReviewerCoachingActions(reviewerId, byCriterion, calibrationScore, reproducibilityBlindSpot) { + const actions = []; + for (const item of byCriterion) { + if (item.drift === "lenient") { + actions.push({ + reviewerId, + type: "leniency-calibration", + criterion: item.criterion, + message: `Review practice set: ${item.criterion} scores are higher than consensus.` + }); + } + if (item.drift === "severe") { + actions.push({ + reviewerId, + type: "severity-calibration", + criterion: item.criterion, + message: `Review practice set: ${item.criterion} scores are lower than consensus.` + }); + } + } + if (reproducibilityBlindSpot) { + actions.push({ + reviewerId, + type: "reproducibility-blind-spot", + criterion: "reproducibility", + message: "Add reproducibility evidence checklist before assigning Trusted Reviewer status." + }); + } + if (calibrationScore < 0.7) { + actions.push({ + reviewerId, + type: "mentor-review", + criterion: "overall", + message: "Pair with a calibrated reviewer for the next structured peer review." 
+ }); + } + return actions; +} + +function buildLeaderboard(reviewerReports) { + return reviewerReports + .map((report) => ({ + reviewerId: report.reviewerId, + calibrationScore: report.calibrationScore, + trustScore: report.reputationSignal.calibratedScore, + tier: report.reputationSignal.tier, + reviewCount: report.reviewCount + })) + .sort((a, b) => b.trustScore - a.trustScore || b.calibrationScore - a.calibrationScore); +} + +function buildCoachingQueue(reviewerReports) { + return reviewerReports.flatMap((report) => report.coachingActions); +} + +function buildTrustAdjustments(reviewerReports, rubric) { + const threshold = Number(rubric.calibrationThreshold || 0.65); + return reviewerReports.map((report) => ({ + reviewerId: report.reviewerId, + status: report.calibrationScore >= threshold ? "eligible" : "quarantine-until-coached", + calibrationScore: report.calibrationScore, + trustScore: report.reputationSignal.calibratedScore, + tier: report.reputationSignal.tier, + reason: report.calibrationScore >= threshold + ? "Structured review scores are close enough to consensus for reputation credit." + : "Review scores require calibration before they increase public reputation." + })); +} + +function buildDashboard(input, reviewerReports, coachingQueue, trustAdjustments) { + const quarantined = trustAdjustments.filter((item) => item.status === "quarantine-until-coached"); + return { + title: `Review calibration ${input.programId}`, + status: coachingQueue.length === 0 ? "ready-for-reputation-credit" : "coaching-needed", + reviewerCount: reviewerReports.length, + coachingActionCount: coachingQueue.length, + quarantinedReviewerCount: quarantined.length, + topAction: coachingQueue[0] ? coachingQueue[0].message : "Publish calibrated reputation scores." 
+ }; +} + +function buildAuditTrail(input, reviewerReports, coachingQueue, trustAdjustments) { + return [ + { + type: "calibration-built", + programId: input.programId, + reviewerCount: reviewerReports.length, + coachingActionCount: coachingQueue.length + }, + ...reviewerReports.map((report) => ({ + type: "reviewer-scored", + reviewerId: report.reviewerId, + calibrationScore: report.calibrationScore, + tier: report.reputationSignal.tier + })), + ...trustAdjustments.map((adjustment) => ({ + type: "trust-adjustment", + reviewerId: adjustment.reviewerId, + status: adjustment.status, + trustScore: adjustment.trustScore + })) + ]; +} + +function groupReviewsByReviewer(reviews) { + const groups = new Map(); + for (const review of reviews) { + const current = groups.get(review.reviewerId) || []; + current.push(review); + groups.set(review.reviewerId, current); + } + return groups; +} + +function weightedMean(byCriterion, weights) { + const totalWeight = byCriterion.reduce((sum, item) => sum + Number(weights[item.criterion] || 1), 0); + return byCriterion.reduce((sum, item) => sum + item.meanAbsoluteError * Number(weights[item.criterion] || 1), 0) / totalWeight; +} + +function classifyDrift(averageDelta, meanAbsoluteError) { + if (averageDelta >= 0.75 && meanAbsoluteError >= 0.75) return "lenient"; + if (averageDelta <= -0.75 && meanAbsoluteError >= 0.75) return "severe"; + if (meanAbsoluteError >= 1.25) return "inconsistent"; + return "calibrated"; +} + +function chooseTier(score) { + if (score >= 0.88) return "trusted-reviewer"; + if (score >= 0.75) return "calibrated-reviewer"; + if (score >= 0.6) return "needs-light-coaching"; + return "mentor-required"; +} + +function average(values) { + if (values.length === 0) return 0; + return values.reduce((sum, value) => sum + value, 0) / values.length; +} + +function round(value) { + return Math.round((value + Number.EPSILON) * 1000) / 1000; +} + +function stableDigest(value) { + return 
crypto.createHash("sha256").update(stableStringify(value)).digest("hex"); +} + +function stableStringify(value) { + if (Array.isArray(value)) return `[${value.map(stableStringify).join(",")}]`; + if (value && typeof value === "object") { + return `{${Object.keys(value).sort().map((key) => `${JSON.stringify(key)}:${stableStringify(value[key])}`).join(",")} }`.replace(' }', '}'); + } + return JSON.stringify(value); +} + +module.exports = { + buildCalibrationBench, + validateCalibrationInput, + evaluateReviewer, + classifyDrift, + stableDigest +}; diff --git a/review-calibration-bench/test/review-calibration.test.js b/review-calibration-bench/test/review-calibration.test.js new file mode 100644 index 0000000..8523310 --- /dev/null +++ b/review-calibration-bench/test/review-calibration.test.js @@ -0,0 +1,49 @@ +const assert = require("node:assert/strict"); +const fs = require("node:fs"); +const path = require("node:path"); +const { + buildCalibrationBench, + classifyDrift, + validateCalibrationInput +} = require("../src/review-calibration"); + +const samplePath = path.join(__dirname, "..", "data", "sample-calibration.json"); +const input = JSON.parse(fs.readFileSync(samplePath, "utf8")); +const report = buildCalibrationBench(input); + +assert.equal(report.validation.status, "passed"); +assert.equal(report.reviewerReports.length, 3); +assert.equal(report.dashboard.status, "coaching-needed"); +assert.equal(report.dashboard.reviewerCount, 3); +assert.equal(report.dashboard.quarantinedReviewerCount, 1); +assert.ok(report.coachingQueue.some((action) => action.type === "reproducibility-blind-spot")); + +const ada = report.reviewerReports.find((item) => item.reviewerId === "ada"); +assert.equal(ada.calibrationScore, 0.925); +assert.equal(ada.reputationSignal.tier, "trusted-reviewer"); +assert.equal(ada.coachingActions.length, 0); + +const bohr = report.reviewerReports.find((item) => item.reviewerId === "bohr"); +assert.equal(bohr.reproducibilityBlindSpot, true); 
+assert.equal(bohr.reputationSignal.tier, "needs-light-coaching"); +assert.ok(bohr.coachingActions.some((action) => action.type === "leniency-calibration")); +assert.ok(bohr.coachingActions.some((action) => action.type === "reproducibility-blind-spot")); + +const curie = report.reviewerReports.find((item) => item.reviewerId === "curie"); +assert.equal(curie.reputationSignal.tier, "calibrated-reviewer"); +assert.ok(curie.coachingActions.some((action) => action.type === "severity-calibration")); + +assert.equal(report.leaderboard[0].reviewerId, "ada"); +assert.equal(report.trustAdjustments.find((item) => item.reviewerId === "bohr").status, "quarantine-until-coached"); +assert.equal(report.digest, buildCalibrationBench(input).digest); + +assert.equal(classifyDrift(1, 1), "lenient"); +assert.equal(classifyDrift(-1, 1), "severe"); +assert.equal(classifyDrift(0.2, 1.5), "inconsistent"); +assert.equal(classifyDrift(0.2, 0.3), "calibrated"); + +const incomplete = validateCalibrationInput({ programId: "draft" }); +assert.equal(incomplete.status, "incomplete"); +assert.ok(incomplete.missing.includes("rubric.criteria")); + +console.log("review-calibration-bench tests passed");