diff --git a/scientific-artifact-provenance-chain/README.md b/scientific-artifact-provenance-chain/README.md
new file mode 100644
index 0000000..fef52dd
--- /dev/null
+++ b/scientific-artifact-provenance-chain/README.md
@@ -0,0 +1,64 @@
+# Scientific Artifact Provenance Chain
+
+This module adds a dependency-free provenance receipt builder for scientific data and code hosting. It gives reviewers a deterministic way to prove which datasets, code files, environments, and outputs belong to a reproducible research package.
+
+## What It Covers
+
+- Content-hashed artifact manifests for datasets, code, figures, models, supplements, and runtime environments.
+- Metadata-aware previews for tabular data and source files.
+- Dataset/code version diffs with hash, size, line, and metadata change signals.
+- PROV-like activity chains that connect raw inputs, analysis commands, environments, and generated outputs.
+- FAIR scoring and validation warnings for missing license, metadata, runtime, or provenance references.
+- JSON-LD and DataCite-style exports for machine discovery and DOI registration workflows.
+- Rerun plans that tell reviewers which commands and environments are ready to execute.
+
+## Usage
+
+```bash
+node scientific-artifact-provenance-chain/test.js
+node scientific-artifact-provenance-chain/demo.js
+```
+
+```js
+const {
+  buildProvenanceReceipt,
+  validateProvenanceReceipt,
+  exportJsonLd,
+} = require("./index");
+
+const receipt = buildProvenanceReceipt({
+  projectId: "scibase:project:enzyme-kinetics",
+  title: "Enzyme kinetics reproducibility bundle",
+  doi: "10.5555/scibase.enzyme.2026",
+  license: "CC-BY-4.0",
+  creators: ["Researcher"],
+  artifacts: [
+    {
+      path: "data/raw/assay.csv",
+      content: "sample,rate\nA,1.2\n",
+      license: "CC-BY-4.0",
+      metadata: { variables: ["sample", "rate"] },
+    },
+  ],
+  activities: [],
+});
+
+console.log(validateProvenanceReceipt(receipt));
+console.log(exportJsonLd(receipt));
+```
+
+## Requirement Mapping
+
+| Issue #14 requirement | Implementation |
+| --- | --- |
+| Store datasets, code, models, figures, and supplementary files | `makeArtifactRecord()` classifies supported artifact types and records path, role, size, hash, license, and metadata. |
+| Folder organization and versioning | Artifact paths are normalized as project-relative paths; `compareArtifactVersions()` produces deterministic version diffs. |
+| Metadata-aware previews | CSV/TSV previews expose columns and line counts; code previews expose language and line counts. |
+| JSON-LD, DataCite, schema.org | `exportJsonLd()` and `exportDataCite()` produce machine-readable metadata payloads. |
+| FAIR compliance | `scoreFair()` and `validateProvenanceReceipt()` check findability, accessibility, interoperability, and reusability signals. |
+| Executable environments and reruns | Activities include commands and environment references; `buildRerunPlan()` reports reviewer-ready execution steps. |
+| Reproducibility review | Provenance validation detects missing references, unlicensed artifacts, missing runtime metadata, and derived artifacts without generation activity. |
+
+## Design Notes
+
+The module stores only content hashes and caller-provided content in memory for deterministic tests. It does not read arbitrary local files, execute commands, call external services, or require credentials. That makes it safe to run in CI and suitable as a foundation for a future upload/API layer.
\ No newline at end of file
diff --git a/scientific-artifact-provenance-chain/demo.js b/scientific-artifact-provenance-chain/demo.js
new file mode 100644
index 0000000..bcc5ecf
--- /dev/null
+++ b/scientific-artifact-provenance-chain/demo.js
@@ -0,0 +1,61 @@
+"use strict";
+
+const {
+  buildProvenanceReceipt,
+  buildRerunPlan,
+  exportDataCite,
+  exportJsonLd,
+  validateProvenanceReceipt,
+} = require("./index");
+
+const receipt = buildProvenanceReceipt({
+  projectId: "scibase:demo:soil-carbon",
+  title: "Soil carbon notebook replay package",
+  doi: "10.5555/scibase.soil-carbon.demo",
+  license: "CC-BY-4.0",
+  creators: ["SCIBASE demo researcher"],
+  artifacts: [
+    {
+      path: "data/raw/soil_samples.csv",
+      content: "plot,carbon_pct\nnorth,2.1\nsouth,1.8\n",
+      license: "CC-BY-4.0",
+      metadata: { variables: ["plot", "carbon_pct"], instrument: "field-sampling" },
+    },
+    {
+      path: "notebooks/soil_carbon.ipynb",
+      content: "{\"cells\":[{\"cell_type\":\"code\",\"source\":\"print('soil carbon')\"}]}",
+      license: "MIT",
+      metadata: { runtime: "python>=3.11", kernel: "python3" },
+    },
+    {
+      path: "outputs/soil_summary.csv",
+      content: "metric,value\nmean_carbon_pct,1.95\n",
+      license: "CC-BY-4.0",
+      derivedFrom: ["data/raw/soil_samples.csv", "notebooks/soil_carbon.ipynb"],
+      metadata: { variables: ["metric", "value"] },
+    },
+    {
+      path: "env/environment.yml",
+      content: "name: soil-carbon\ndependencies:\n - python=3.11\n - pandas\n",
+      kind: "environment",
+    },
+  ],
+  activities: [
+    {
+      id: "activity:notebook-run",
+      type: "notebook-replay",
+      actor: "reviewer-workflow",
+      command: "jupyter nbconvert --execute notebooks/soil_carbon.ipynb",
+      environment: "env/environment.yml",
+      used: ["data/raw/soil_samples.csv", "notebooks/soil_carbon.ipynb"],
+      generated: ["outputs/soil_summary.csv"],
+    },
+  ],
+});
+
+console.log(JSON.stringify({
+  validation: validateProvenanceReceipt(receipt),
+  rerunPlan: buildRerunPlan(receipt),
+  jsonLd: exportJsonLd(receipt),
+  datacite: exportDataCite(receipt),
+}, null, 2));
\ No newline at end of file
diff --git a/scientific-artifact-provenance-chain/index.js b/scientific-artifact-provenance-chain/index.js
new file mode 100644
--- /dev/null
+++ b/scientific-artifact-provenance-chain/index.js
@@ -0,0 +1,509 @@
+"use strict";
+
+const crypto = require("crypto");
+const path = require("path");
+
+const DATA_EXTENSIONS = new Set([".csv", ".tsv", ".xlsx", ".json", ".jsonl", ".parquet", ".feather"]);
+const CODE_EXTENSIONS = new Set([".py", ".r", ".jl", ".ipynb", ".sh", ".sql"]);
+const FIGURE_EXTENSIONS = new Set([".png", ".jpg", ".jpeg", ".svg", ".pdf", ".tif", ".tiff"]);
+const MODEL_EXTENSIONS = new Set([".onnx", ".pkl", ".pt", ".pth", ".joblib", ".h5", ".safetensors"]);
+const ENVIRONMENT_EXTENSIONS = new Set([".yml", ".yaml", ".toml", ".lock", ".dockerfile"]);
+
+/**
+ * Serialize a value as JSON with object keys sorted, so structurally equal
+ * values always produce the same string (and therefore the same hash).
+ * Like JSON.stringify, undefined members are dropped from objects and become
+ * null inside arrays, which keeps the output valid JSON and unambiguous.
+ *
+ * @param {*} value - Any JSON-compatible value.
+ * @returns {string|undefined} Canonical JSON text, or undefined when the value itself is not serializable.
+ */
+function stableStringify(value) {
+  if (Array.isArray(value)) {
+    return `[${value.map((item) => stableStringify(item) ?? "null").join(",")}]`;
+  }
+  if (value && typeof value === "object") {
+    if (typeof value.toJSON === "function") {
+      return stableStringify(value.toJSON());
+    }
+    const members = [];
+    for (const key of Object.keys(value).sort()) {
+      const serialized = stableStringify(value[key]);
+      if (serialized !== undefined) {
+        members.push(`${JSON.stringify(key)}:${serialized}`);
+      }
+    }
+    return `{${members.join(",")}}`;
+  }
+  return JSON.stringify(value);
+}
+
+/**
+ * Hex-encoded SHA-256 digest of the string form of the input.
+ *
+ * @param {*} input - Value to hash; coerced with String().
+ * @returns {string} Hex digest.
+ */
+function sha256(input) {
+  return crypto.createHash("sha256").update(String(input)).digest("hex");
+}
+
+/**
+ * Normalize an artifact path to a project-relative POSIX path.
+ * Backslashes become "/", and leading slashes, empty segments, and "." segments
+ * are removed. Only a whole ".." segment counts as traversal, so file names
+ * that merely contain two dots (e.g. "assay..v2.csv") are accepted.
+ *
+ * @param {string} artifactPath - Caller-supplied path.
+ * @returns {string} Normalized project-relative path.
+ * @throws {Error} When the path is missing, not a string, empty after normalization, or contains a ".." segment.
+ */
+function normalizeArtifactPath(artifactPath) {
+  if (!artifactPath || typeof artifactPath !== "string") {
+    throw new Error("artifact path is required");
+  }
+
+  const segments = artifactPath
+    .replace(/\\/g, "/")
+    .split("/")
+    .filter((segment) => segment !== "" && segment !== ".");
+  if (segments.includes("..")) {
+    throw new Error(`artifact path cannot escape the project: ${artifactPath}`);
+  }
+  if (segments.length === 0) {
+    throw new Error("artifact path is required");
+  }
+  return segments.join("/");
+}
+
+/**
+ * Classify an artifact by file extension unless the caller supplies a kind.
+ *
+ * @param {string} artifactPath - Artifact path; matched case-insensitively.
+ * @param {string} [explicitKind] - Caller-provided kind that overrides inference.
+ * @returns {string} "dataset", "code", "figure", "model", "environment", "supplement", or explicitKind.
+ */
+function inferArtifactKind(artifactPath, explicitKind) {
+  if (explicitKind) {
+    return explicitKind;
+  }
+
+  const lower = artifactPath.toLowerCase();
+  const ext = path.extname(lower);
+  const basename = path.basename(lower);
+
+  if (DATA_EXTENSIONS.has(ext)) return "dataset";
+  if (CODE_EXTENSIONS.has(ext)) return "code";
+  if (FIGURE_EXTENSIONS.has(ext)) return "figure";
+  if (MODEL_EXTENSIONS.has(ext)) return "model";
+  if (ENVIRONMENT_EXTENSIONS.has(ext) || basename === "dockerfile") return "environment";
+  return "supplement";
+}
+
+/**
+ * Count lines and non-blank lines in text content.
+ * A trailing newline terminates the last line rather than starting a new one,
+ * so "a\nb\n" and "a\nb" both count as two lines.
+ *
+ * @param {*} content - Text content; coerced with String().
+ * @returns {{lineCount: number, nonEmptyLineCount: number}} Line statistics.
+ */
+function lineStats(content) {
+  const text = String(content);
+  const lines = text.length === 0 ? [] : text.split(/\r?\n/);
+  if (lines.length > 0 && lines[lines.length - 1] === "") {
+    lines.pop();
+  }
+  return {
+    lineCount: lines.length,
+    nonEmptyLineCount: lines.filter((line) => line.trim()).length,
+  };
+}
+
+/**
+ * Build a manifest record (hash, size, kind, role, preview) for one artifact.
+ * The default id hashes the path together with the content hash, so two files
+ * with identical content (for example empty placeholders) get distinct ids.
+ *
+ * @param {object} input - Artifact description: path (required), plus optional content, kind, id, role, license, createdBy, derivedFrom, and metadata.
+ * @returns {object} Artifact record.
+ * @throws {Error} When path or any derivedFrom entry is not a valid project-relative path.
+ */
+function makeArtifactRecord(input) {
+  const artifactPath = normalizeArtifactPath(input.path);
+  const content = input.content == null ? "" : String(input.content);
+  const hash = sha256(content);
+  const kind = inferArtifactKind(artifactPath, input.kind);
+  const stats = lineStats(content);
+
+  return {
+    id: input.id || `artifact:${sha256(`${artifactPath}\n${hash}`).slice(0, 16)}`,
+    path: artifactPath,
+    kind,
+    role: input.role || defaultRoleForKind(kind),
+    sha256: hash,
+    sizeBytes: Buffer.byteLength(content),
+    license: input.license || null,
+    createdBy: input.createdBy || "unknown",
+    derivedFrom: (input.derivedFrom || []).map((ref) => normalizeArtifactPath(ref)),
+    metadata: input.metadata || {},
+    preview: buildPreviewDescriptor(artifactPath, kind, content, stats),
+  };
+}
+
+/**
+ * Map an artifact kind to its default role label.
+ *
+ * @param {string} kind - Artifact kind.
+ * @returns {string} Role label; "supplementary-material" for unrecognized kinds.
+ */
+function defaultRoleForKind(kind) {
+  if (kind === "dataset") return "research-data";
+  if (kind === "code") return "analysis-code";
+  if (kind === "figure") return "research-output";
+  if (kind === "environment") return "execution-environment";
+  if (kind === "model") return "trained-model";
+  return "supplementary-material";
+}
+
+/**
+ * Describe how an artifact can be previewed without embedding its content.
+ *
+ * @param {string} artifactPath - Normalized artifact path.
+ * @param {string} kind - Artifact kind.
+ * @param {string} content - Artifact content; only the first line is read, for CSV/TSV headers.
+ * @param {{lineCount: number, nonEmptyLineCount: number}} stats - Precomputed line statistics.
+ * @returns {object} Preview descriptor with a type, the line statistics, and columns or language where applicable.
+ */
+function buildPreviewDescriptor(artifactPath, kind, content, stats) {
+  const ext = path.extname(artifactPath).toLowerCase();
+  if (kind === "dataset" && (ext === ".csv" || ext === ".tsv")) {
+    const delimiter = ext === ".tsv" ? "\t" : ",";
+    const firstLine = String(content).split(/\r?\n/)[0] || "";
+    return {
+      type: "tabular-preview",
+      columns: firstLine ? firstLine.split(delimiter).map((column) => column.trim()).filter(Boolean) : [],
+      ...stats,
+    };
+  }
+  if (kind === "code") {
+    return {
+      type: "source-preview",
+      // Lower-cased so "Fit.PY" and "fit.py" report the same language.
+      language: ext.replace(".", "") || "text",
+      ...stats,
+    };
+  }
+  return {
+    type: `${kind}-preview`,
+    ...stats,
+  };
+}
+
+/**
+ * Compare two records of the same artifact and report what changed.
+ *
+ * @param {object} previous - Earlier artifact record.
+ * @param {object} next - Later artifact record.
+ * @returns {object} Diff with both hashes, a changed flag, size and line deltas, and a metadataChanged flag.
+ */
+function compareArtifactVersions(previous, next) {
+  const sizeDeltaBytes = next.sizeBytes - previous.sizeBytes;
+  const changed = previous.sha256 !== next.sha256;
+  return {
+    path: next.path,
+    previousSha256: previous.sha256,
+    nextSha256: next.sha256,
+    changed,
+    sizeDeltaBytes,
+    lineDelta: (next.preview.lineCount || 0) - (previous.preview.lineCount || 0),
+    metadataChanged: stableStringify(previous.metadata) !== stableStringify(next.metadata),
+  };
+}
+
+/**
+ * Build a provenance receipt from artifact and activity descriptions.
+ * The receipt id is derived from the receipt content, so pass generatedAt
+ * explicitly when a reproducible id is needed; it defaults to the current time.
+ *
+ * @param {object} input - projectId, title, doi, license, creators, generatedAt, artifacts, and activities.
+ * @returns {object} Receipt with artifact records, resolved activities, receiptId, and fairScore.
+ * @throws {Error} When an artifact path is invalid.
+ */
+function buildProvenanceReceipt(input) {
+  const artifacts = (input.artifacts || []).map((artifact) => makeArtifactRecord(artifact));
+  const artifactIndex = new Map();
+  for (const artifact of artifacts) {
+    artifactIndex.set(artifact.id, artifact);
+    artifactIndex.set(artifact.path, artifact);
+    // Identical content may live at several paths; the first artifact keeps the
+    // hash key so hash references do not silently resolve to the last duplicate.
+    if (!artifactIndex.has(artifact.sha256)) {
+      artifactIndex.set(artifact.sha256, artifact);
+    }
+  }
+
+  const activities = (input.activities || []).map((activity, index) => normalizeActivity(activity, index, artifactIndex));
+  const generatedAt = input.generatedAt || new Date().toISOString();
+  const receiptCore = {
+    projectId: input.projectId,
+    title: input.title,
+    doi: input.doi || null,
+    license: input.license || null,
+    creators: input.creators || [],
+    generatedAt,
+    artifacts,
+    activities,
+  };
+
+  return {
+    ...receiptCore,
+    receiptId: `provenance:${sha256(stableStringify(receiptCore)).slice(0, 24)}`,
+    fairScore: scoreFair(receiptCore),
+  };
+}
+
+/**
+ * Fill in activity defaults and resolve its artifact references.
+ *
+ * @param {object} activity - Caller-supplied activity.
+ * @param {number} index - Position in the activity list; used for the fallback id.
+ * @param {Map<string, object>} artifactIndex - Artifacts keyed by id, path, and sha256.
+ * @returns {object} Normalized activity with resolved used and generated references.
+ */
+function normalizeActivity(activity, index, artifactIndex) {
+  const used = (activity.used || []).map((ref) => resolveArtifactRef(ref, artifactIndex));
+  const generated = (activity.generated || []).map((ref) => resolveArtifactRef(ref, artifactIndex));
+  return {
+    id: activity.id || `activity:${index + 1}`,
+    type: activity.type || "analysis",
+    actor: activity.actor || "unknown",
+    command: activity.command || null,
+    environment: activity.environment || null,
+    used,
+    generated,
+    startedAt: activity.startedAt || null,
+    completedAt: activity.completedAt || null,
+    notes: activity.notes || null,
+  };
+}
+
+/**
+ * Resolve a reference (an id, path, or sha256, or an object carrying one) to an artifact summary.
+ * Artifact paths are normalized when records are built, so a reference that does not
+ * match verbatim is retried as a normalized path (for example "./data.csv").
+ *
+ * @param {string|object} ref - Reference to resolve.
+ * @param {Map<string, object>} artifactIndex - Artifacts keyed by id, path, and sha256.
+ * @returns {object} { id, path, sha256, kind } when found, otherwise { missing: true, ref }.
+ */
+function resolveArtifactRef(ref, artifactIndex) {
+  const key = ref == null || typeof ref === "string" ? ref : ref.id || ref.path || ref.sha256;
+  let artifact = artifactIndex.get(key);
+  if (!artifact && typeof key === "string") {
+    try {
+      artifact = artifactIndex.get(normalizeArtifactPath(key));
+    } catch {
+      // Not a usable project path; reported below as a missing reference.
+    }
+  }
+  if (!artifact) {
+    return { missing: true, ref: key ?? null };
+  }
+  return { id: artifact.id, path: artifact.path, sha256: artifact.sha256, kind: artifact.kind };
+}
+
+/**
+ * Check a receipt for structural errors and reproducibility warnings.
+ * Errors: missing projectId or title, duplicate artifact paths, activity references
+ * to unknown artifacts, and derivedFrom entries that name unknown artifacts.
+ * Warnings cover missing licenses, metadata, commands, and environments.
+ *
+ * @param {object} receipt - Receipt to validate.
+ * @returns {{valid: boolean, errors: string[], warnings: string[], fairScore: object}} valid is true when errors is empty.
+ */
+function validateProvenanceReceipt(receipt) {
+  const errors = [];
+  const warnings = [];
+  const artifactPaths = new Set();
+  const generatedArtifactIds = new Set();
+
+  if (!receipt.projectId) errors.push("projectId is required");
+  if (!receipt.title) errors.push("title is required");
+  if (!receipt.license) warnings.push("project license is missing");
+  if (!receipt.creators || receipt.creators.length === 0) warnings.push("at least one creator is recommended");
+
+  for (const artifact of receipt.artifacts || []) {
+    // Receipts may have been deserialized or hand-written, so metadata can be absent.
+    const metadata = artifact.metadata || {};
+    if (artifactPaths.has(artifact.path)) {
+      errors.push(`duplicate artifact path: ${artifact.path}`);
+    }
+    artifactPaths.add(artifact.path);
+
+    if (!artifact.license && artifact.kind !== "environment") {
+      warnings.push(`${artifact.path} has no artifact-level license`);
+    }
+    if (artifact.kind === "dataset" && !metadata.variables) {
+      warnings.push(`${artifact.path} dataset is missing variable metadata`);
+    }
+    if (artifact.kind === "code" && !metadata.runtime) {
+      warnings.push(`${artifact.path} code is missing runtime metadata`);
+    }
+  }
+
+  for (const activity of receipt.activities || []) {
+    const used = activity.used || [];
+    const generated = activity.generated || [];
+    for (const ref of [...used, ...generated]) {
+      if (ref.missing) {
+        errors.push(`${activity.id} references missing artifact: ${ref.ref}`);
+      }
+    }
+    for (const ref of generated) {
+      if (!ref.missing) {
+        generatedArtifactIds.add(ref.id);
+      }
+    }
+    if (generated.length > 0 && !activity.command) {
+      warnings.push(`${activity.id} generates artifacts without a command`);
+    }
+    if (activity.command && !activity.environment) {
+      warnings.push(`${activity.id} has a command but no execution environment`);
+    }
+  }
+
+  for (const artifact of receipt.artifacts || []) {
+    const derivedFrom = artifact.derivedFrom || [];
+    for (const source of derivedFrom) {
+      if (!artifactPaths.has(source)) {
+        errors.push(`${artifact.path} is derived from missing artifact: ${source}`);
+      }
+    }
+    if (derivedFrom.length > 0 && !generatedArtifactIds.has(artifact.id)) {
+      warnings.push(`${artifact.path} declares derivedFrom but is not generated by an activity`);
+    }
+  }
+
+  return {
+    valid: errors.length === 0,
+    errors,
+    warnings,
+    fairScore: receipt.fairScore,
+  };
+}
+
+/**
+ * Score a receipt against four FAIR checks, each worth 25 points.
+ *
+ * @param {object} receipt - Receipt, or receipt core, with doi, projectId, license, and artifacts.
+ * @returns {{score: number, checks: object}} Percentage score and the per-principle booleans.
+ */
+function scoreFair(receipt) {
+  const checks = {
+    findable: Boolean(receipt.doi || receipt.projectId) && (receipt.artifacts || []).every((artifact) => artifact.sha256),
+    accessible: Boolean(receipt.license) && (receipt.artifacts || []).every((artifact) => artifact.path),
+    interoperable: (receipt.artifacts || []).some((artifact) => artifact.metadata && Object.keys(artifact.metadata).length > 0),
+    reusable: (receipt.artifacts || []).every((artifact) => artifact.license || artifact.kind === "environment"),
+  };
+  const passed = Object.values(checks).filter(Boolean).length;
+  return {
+    score: Math.round((passed / Object.keys(checks).length) * 100),
+    checks,
+  };
+}
+
+/**
+ * List the activities of a receipt as ordered rerun steps.
+ * A step is ready when it has both a command and an environment and none of its inputs is missing.
+ *
+ * @param {object} receipt - Receipt whose activities carry resolved references.
+ * @returns {object[]} One step per activity, in receipt order.
+ */
+function buildRerunPlan(receipt) {
+  return (receipt.activities || []).map((activity, index) => ({
+    step: index + 1,
+    activityId: activity.id,
+    type: activity.type,
+    command: activity.command,
+    environment: activity.environment,
+    inputs: activity.used.map((artifact) => artifact.path || artifact.ref),
+    outputs: activity.generated.map((artifact) => artifact.path || artifact.ref),
+    ready: Boolean(activity.command && activity.environment) && !activity.used.some((artifact) => artifact.missing),
+  }));
+}
+
+/**
+ * Export the receipt as a JSON-LD document using schema.org and PROV terms.
+ *
+ * @param {object} receipt - Receipt to export.
+ * @returns {object} JSON-LD object; artifacts without a license inherit the project license.
+ */
+function exportJsonLd(receipt) {
+  return {
+    "@context": {
+      schema: "https://schema.org/",
+      prov: "https://www.w3.org/ns/prov#",
+      datacite: "https://purl.org/spar/datacite/",
+    },
+    "@id": receipt.doi || receipt.projectId,
+    "@type": "schema:Dataset",
+    "schema:name": receipt.title,
+    "schema:identifier": receipt.doi || receipt.projectId,
+    "schema:license": receipt.license,
+    "schema:creator": receipt.creators,
+    "prov:generatedAtTime": receipt.generatedAt,
+    "schema:hasPart": receipt.artifacts.map((artifact) => ({
+      "@id": artifact.id,
+      "@type": artifact.kind === "code" ? "schema:SoftwareSourceCode" : "schema:CreativeWork",
+      "schema:name": artifact.path,
+      "schema:encodingFormat": artifact.kind,
+      "schema:sha256": artifact.sha256,
+      "schema:license": artifact.license || receipt.license,
+    })),
+    "prov:wasGeneratedBy": receipt.activities.map((activity) => ({
+      "@id": activity.id,
+      "@type": "prov:Activity",
+      "prov:used": activity.used.map((artifact) => artifact.id || artifact.ref),
+      "prov:generated": activity.generated.map((artifact) => artifact.id || artifact.ref),
+      "prov:wasAssociatedWith": activity.actor,
+    })),
+  };
+}
+
+/**
+ * Export DataCite-style metadata for DOI registration workflows.
+ *
+ * @param {object} receipt - Receipt to export.
+ * @returns {object} Metadata with identifiers, creators, titles, publisher, publicationYear, types, rightsList, and descriptions.
+ */
+function exportDataCite(receipt) {
+  return {
+    identifiers: receipt.doi ? [{ identifier: receipt.doi, identifierType: "DOI" }] : [],
+    creators: receipt.creators.map((creator) => ({ name: creator })),
+    titles: [{ title: receipt.title }],
+    publisher: "SCIBASE.AI artifact provenance chain",
+    publicationYear: new Date(receipt.generatedAt).getUTCFullYear(),
+    types: { resourceTypeGeneral: "Dataset", resourceType: "Research artifact bundle" },
+    rightsList: receipt.license ? [{ rights: receipt.license }] : [],
+    descriptions: [
+      {
+        descriptionType: "TechnicalInfo",
+        description: `Receipt ${receipt.receiptId} tracks ${receipt.artifacts.length} artifacts and ${receipt.activities.length} provenance activities.`,
+      },
+    ],
+  };
+}
+
+module.exports = {
+  buildProvenanceReceipt,
+  buildRerunPlan,
+  compareArtifactVersions,
+  exportDataCite,
+  exportJsonLd,
+  inferArtifactKind,
+  makeArtifactRecord,
+  sha256,
+  stableStringify,
+  validateProvenanceReceipt,
+};
\ No newline at end of file
diff --git a/scientific-artifact-provenance-chain/test.js b/scientific-artifact-provenance-chain/test.js
new file mode 100644
--- /dev/null
+++ b/scientific-artifact-provenance-chain/test.js
@@ -0,0 +1,173 @@
+"use strict";
+
+const assert = require("assert");
+const {
+  buildProvenanceReceipt,
+  buildRerunPlan,
+  compareArtifactVersions,
+  exportDataCite,
+  exportJsonLd,
+  inferArtifactKind,
+  makeArtifactRecord,
+  validateProvenanceReceipt,
+} = require("./index");
+
+function fixtureReceipt() {
+  return buildProvenanceReceipt({
+    projectId: "scibase:project:enzyme-kinetics",
+    title: "Enzyme kinetics reproducibility bundle",
+    doi: "10.5555/scibase.enzyme.2026",
+    license: "CC-BY-4.0",
+    creators: ["SCIBASE QA"],
+    generatedAt: "2026-05-15T00:00:00.000Z",
+    artifacts: [
+      {
+        path: "data/raw/assay.csv",
+        content: "sample,rate\nA,1.2\nB,1.4\n",
+        license: "CC-BY-4.0",
+        metadata: { variables: ["sample", "rate"], instrument: "plate-reader" },
+      },
+      {
+        path: "analysis/fit_model.py",
+        content: "import csv\nprint('fit model')\n",
+        license: "MIT",
+        metadata: { runtime: "python>=3.11" },
+      },
+      {
+        path: "results/fit.csv",
+        content: "parameter,value\nkm,0.42\nvmax,1.8\n",
+        license: "CC-BY-4.0",
+        derivedFrom: ["data/raw/assay.csv", "analysis/fit_model.py"],
+        metadata: { variables: ["parameter", "value"] },
+      },
+      {
+        path: "env/environment.yml",
+        content: "name: enzyme\nchannels: [conda-forge]\n",
+        kind: "environment",
+        metadata: { runtime: "conda" },
+      },
+    ],
+    activities: [
+      {
+        id: "activity:fit",
+        type: "analysis",
+        actor: "workflow:enzyme-fit",
+        command: "python analysis/fit_model.py data/raw/assay.csv results/fit.csv",
+        environment: "env/environment.yml",
+        used: ["data/raw/assay.csv", "analysis/fit_model.py"],
+        generated: ["results/fit.csv"],
+      },
+    ],
+  });
+}
+
+function testKindInference() {
+  assert.equal(inferArtifactKind("raw/table.csv"), "dataset");
+  assert.equal(inferArtifactKind("notebooks/model.ipynb"), "code");
+  assert.equal(inferArtifactKind("figures/chart.svg"), "figure");
+  assert.equal(inferArtifactKind("models/checkpoint.onnx"), "model");
+  assert.equal(inferArtifactKind("Dockerfile"), "environment");
+}
+
+function testReceiptValidation() {
+  const receipt = fixtureReceipt();
+  const result = validateProvenanceReceipt(receipt);
+  assert.equal(result.valid, true);
+  assert.deepEqual(result.errors, []);
+  assert.equal(result.fairScore.score, 100);
+  assert.ok(receipt.receiptId.startsWith("provenance:"));
+}
+
+function testMissingReferenceFailsValidation() {
+  const receipt = buildProvenanceReceipt({
+    projectId: "scibase:project:bad-ref",
+    title: "Bad reference",
+    license: "CC0-1.0",
+    artifacts: [{ path: "data.csv", content: "x\n1\n", license: "CC0-1.0", metadata: { variables: ["x"] } }],
+    activities: [{ id: "activity:missing", command: "python run.py", environment: "python:3.11", used: ["missing.csv"] }],
+  });
+  const result = validateProvenanceReceipt(receipt);
+  assert.equal(result.valid, false);
+  assert.match(result.errors.join("\n"), /missing\.csv/);
+}
+
+function testVersionDiff() {
+  const previous = makeArtifactRecord({ path: "data.csv", content: "x\n1\n", license: "CC0-1.0" });
+  const next = makeArtifactRecord({ path: "data.csv", content: "x\n1\n2\n", license: "CC0-1.0" });
+  const diff = compareArtifactVersions(previous, next);
+  assert.equal(diff.changed, true);
+  assert.equal(diff.lineDelta, 1);
+  assert.ok(diff.sizeDeltaBytes > 0);
+}
+
+function testExportsAndRerunPlan() {
+  const receipt = fixtureReceipt();
+  const jsonLd = exportJsonLd(receipt);
+  const datacite = exportDataCite(receipt);
+  const plan = buildRerunPlan(receipt);
+
+  assert.equal(jsonLd["@type"], "schema:Dataset");
+  assert.equal(jsonLd["schema:hasPart"].length, 4);
+  assert.equal(datacite.identifiers[0].identifier, "10.5555/scibase.enzyme.2026");
+  assert.equal(plan[0].ready, true);
+  assert.deepEqual(plan[0].outputs, ["results/fit.csv"]);
+}
+
+function testPathNormalization() {
+  assert.equal(makeArtifactRecord({ path: "./data\\raw//assay.csv" }).path, "data/raw/assay.csv");
+  assert.equal(makeArtifactRecord({ path: "data/assay..v2.csv" }).path, "data/assay..v2.csv");
+  assert.throws(() => makeArtifactRecord({ path: "data/../../etc/passwd" }), /cannot escape/);
+}
+
+function testLineCountIgnoresTrailingNewline() {
+  assert.equal(makeArtifactRecord({ path: "data.csv", content: "x\n1\n" }).preview.lineCount, 2);
+  assert.equal(makeArtifactRecord({ path: "data.csv", content: "x\n1" }).preview.lineCount, 2);
+  assert.equal(makeArtifactRecord({ path: "empty.txt", content: "" }).preview.lineCount, 0);
+}
+
+function testIdenticalContentGetsDistinctIds() {
+  const first = makeArtifactRecord({ path: "pkg/a/__init__.py", content: "" });
+  const second = makeArtifactRecord({ path: "pkg/b/__init__.py", content: "" });
+  assert.equal(first.sha256, second.sha256);
+  assert.notEqual(first.id, second.id);
+}
+
+function testReferencesAreNormalized() {
+  const receipt = buildProvenanceReceipt({
+    projectId: "scibase:project:refs",
+    title: "Normalized references",
+    license: "CC0-1.0",
+    artifacts: [{ path: "data.csv", content: "x\n1\n", license: "CC0-1.0", metadata: { variables: ["x"] } }],
+    activities: [{ id: "activity:refs", command: "python run.py", environment: "python:3.11", used: ["./data.csv"] }],
+  });
+  assert.equal(validateProvenanceReceipt(receipt).valid, true);
+  assert.deepEqual(buildRerunPlan(receipt)[0].inputs, ["data.csv"]);
+}
+
+function testDanglingDerivedFromFailsValidation() {
+  const receipt = buildProvenanceReceipt({
+    projectId: "scibase:project:dangling",
+    title: "Dangling derivation",
+    license: "CC0-1.0",
+    artifacts: [{ path: "out.csv", content: "x\n1\n", license: "CC0-1.0", derivedFrom: ["gone.csv"], metadata: { variables: ["x"] } }],
+  });
+  const result = validateProvenanceReceipt(receipt);
+  assert.equal(result.valid, false);
+  assert.match(result.errors.join("\n"), /gone\.csv/);
+}
+
+function run() {
+  testKindInference();
+  testReceiptValidation();
+  testMissingReferenceFailsValidation();
+  testVersionDiff();
+  testExportsAndRerunPlan();
+  testPathNormalization();
+  testLineCountIgnoresTrailingNewline();
+  testIdenticalContentGetsDistinctIds();
+  testReferencesAreNormalized();
+  testDanglingDerivedFromFailsValidation();
+  console.log("scientific-artifact-provenance-chain tests passed");
+}
+
+run();
\ No newline at end of file