/**
 * Render an assessment report as a Markdown document.
 *
 * @param {string} name - Scenario label printed on the "Scenario:" line.
 * @param {object} report - Report object; reads `repositoryId`, `releaseTag`,
 *   `decision`, `summary.componentsReviewed`, `summary.bundleEntries`,
 *   `findings` (each with `severity`, `code`, `message`) and `releaseCriteria`.
 * @returns {string} Markdown text terminated by a single newline.
 */
function markdownReport(name, report) {
  const { summary } = report;

  // One bullet per finding, or a single placeholder bullet when there are none.
  const findingsBlock =
    report.findings.length === 0
      ? "- No reproducibility bundle findings."
      : report.findings
          .map((entry) => `- ${entry.severity.toUpperCase()} ${entry.code}: ${entry.message}`)
          .join("\n");

  const criteriaBlock = report.releaseCriteria.map((criterion) => `- ${criterion}`).join("\n");

  // Each element is one output line (blocks may hold several); the trailing
  // empty string yields the final newline of the document.
  const lines = [
    "# Repository Reproducibility Bundle Guard",
    "",
    `Scenario: ${name}`,
    "",
    `Repository: ${report.repositoryId}`,
    `Release tag: ${report.releaseTag}`,
    `Decision: ${report.decision.toUpperCase()}`,
    "",
    `Reviewed ${summary.componentsReviewed} manifest components and ${summary.bundleEntries} bundle entries.`,
    "",
    "## Findings",
    "",
    findingsBlock,
    "",
    "## Release Criteria",
    "",
    criteriaBlock,
    "",
  ];
  return lines.join("\n");
}
def font(size):
    """Return a font for slide text at the requested size.

    Tries each named TrueType font in turn and falls back to Pillow's built-in
    default font when none of them can be opened.
    """
    for name in ("arial.ttf", "segoeui.ttf"):
        try:
            return ImageFont.truetype(name, size)
        except OSError:
            # This font cannot be opened here; try the next candidate.
            continue
    try:
        # Pass the size through so the fallback honours it as well; calling
        # load_default() without it ignores the requested size entirely.
        return ImageFont.load_default(size)
    except TypeError:
        # Pillow releases whose load_default() takes no size argument.
        return ImageFont.load_default()
packet", fill="#f0fdf4", font=font(21)) + draw.text((80, 438), f"Slide {index}/4 - synthetic reviewer artifact", fill="#cbd5e1", font=font(20)) + frames.extend([image] * 14) + +gif_path = REPORTS / "demo.gif" +mp4_path = REPORTS / "demo.mp4" +frames[0].save(gif_path, save_all=True, append_images=frames[1:], duration=120, loop=0) +iio.imwrite(mp4_path, [np.asarray(frame) for frame in frames], fps=8, codec="libx264") +print(f"wrote {gif_path}") +print(f"wrote {mp4_path}") diff --git a/repository-reproducibility-bundle-guard/index.js b/repository-reproducibility-bundle-guard/index.js new file mode 100644 index 00000000..f5e864f5 --- /dev/null +++ b/repository-reproducibility-bundle-guard/index.js @@ -0,0 +1,272 @@ +const HIGH = "high"; +const MEDIUM = "medium"; +const LOW = "low"; + +const REQUIRED_COMPONENT_TYPES = [ + "manuscript", + "data", + "code", + "notebook", + "results", + "protocol", + "metadata", +]; + +function requiredString(value, field) { + if (typeof value !== "string" || value.trim() === "") { + throw new TypeError(`${field} must be a non-empty string`); + } + return value.trim(); +} + +function array(value, field) { + if (!Array.isArray(value)) { + throw new TypeError(`${field} must be an array`); + } + return value; +} + +function unique(values) { + return [...new Set(values.map(String))]; +} + +function isSha256(value) { + return typeof value === "string" && /^[a-f0-9]{64}$/.test(value); +} + +function isPinnedRuntime(value) { + return typeof value === "string" && /@sha256:[a-f0-9]{64}$/.test(value); +} + +function normalizeComponent(raw, index) { + return { + id: requiredString(raw.id, `components[${index}].id`), + type: requiredString(raw.type, `components[${index}].type`), + path: requiredString(raw.path, `components[${index}].path`), + sha256: requiredString(raw.sha256, `components[${index}].sha256`).toLowerCase(), + bytes: Number(raw.bytes || 0), + versioned: raw.versioned !== false, + provenanceRefs: unique(raw.provenanceRefs || []), + 
runtimeImage: raw.runtimeImage ? String(raw.runtimeImage) : "", + lockfiles: unique(raw.lockfiles || []), + exportTargets: unique(raw.exportTargets || []), + }; +} + +function normalizeCandidate(raw) { + return { + repositoryId: requiredString(raw.repositoryId, "repositoryId"), + releaseTag: requiredString(raw.releaseTag, "releaseTag"), + commit: requiredString(raw.commit, "commit"), + metadata: { + doi: raw.metadata && raw.metadata.doi ? String(raw.metadata.doi) : "", + schemaOrg: Boolean(raw.metadata && raw.metadata.schemaOrg), + dataCite: Boolean(raw.metadata && raw.metadata.dataCite), + license: raw.metadata && raw.metadata.license ? String(raw.metadata.license) : "", + authors: unique((raw.metadata && raw.metadata.authors) || []), + }, + components: array(raw.components || [], "components").map(normalizeComponent), + bundle: { + archivePath: raw.bundle && raw.bundle.archivePath ? String(raw.bundle.archivePath) : "", + sha256: raw.bundle && raw.bundle.sha256 ? String(raw.bundle.sha256).toLowerCase() : "", + includes: unique((raw.bundle && raw.bundle.includes) || []), + generatedAt: raw.bundle && raw.bundle.generatedAt ? 
String(raw.bundle.generatedAt) : "", + }, + }; +} + +function finding(code, severity, sourceId, message, remediation) { + return { code, severity, sourceId, message, remediation }; +} + +function assessReproducibilityBundle(rawCandidate) { + const candidate = normalizeCandidate(rawCandidate); + const findings = []; + const componentTypes = new Set(candidate.components.map((item) => item.type)); + const componentPaths = candidate.components.map((item) => item.path); + const duplicatePaths = componentPaths.filter((path, index) => componentPaths.indexOf(path) !== index); + + for (const type of REQUIRED_COMPONENT_TYPES) { + if (!componentTypes.has(type)) { + findings.push( + finding( + "MISSING_REQUIRED_COMPONENT", + HIGH, + type, + `${type} component is absent from the release manifest.`, + "Add the required component or explicitly hold the release until the research packet is complete." + ) + ); + } + } + + for (const path of unique(duplicatePaths)) { + findings.push( + finding( + "DUPLICATE_MANIFEST_PATH", + HIGH, + path, + `${path} appears more than once in the release manifest.`, + "Keep one canonical manifest row per path so rollback and integrity checks are deterministic." + ) + ); + } + + for (const component of candidate.components) { + if (!isSha256(component.sha256)) { + findings.push( + finding( + "INVALID_COMPONENT_DIGEST", + HIGH, + component.id, + `${component.path} does not carry a valid SHA-256 digest.`, + "Record the lowercase SHA-256 digest before tagging the release." + ) + ); + } + + if (!component.versioned) { + findings.push( + finding( + "UNVERSIONED_COMPONENT", + HIGH, + component.id, + `${component.path} is not marked as version-controlled.`, + "Commit, tag, or archive the component under repository version control before release." 
+ ) + ); + } + + if (["code", "notebook"].includes(component.type) && component.lockfiles.length === 0) { + findings.push( + finding( + "MISSING_RUNTIME_LOCKFILE", + HIGH, + component.id, + `${component.path} lacks lockfile evidence for executable reproducibility.`, + "Attach package-lock, requirements lock, renv, Manifest.toml, or equivalent runtime lock evidence." + ) + ); + } + + if (["code", "notebook"].includes(component.type) && !isPinnedRuntime(component.runtimeImage)) { + findings.push( + finding( + "UNPINNED_RUNTIME_IMAGE", + MEDIUM, + component.id, + `${component.path} runtime image is not digest-pinned.`, + "Pin the container/runtime image by immutable digest instead of a mutable tag." + ) + ); + } + + if (["data", "results"].includes(component.type) && component.provenanceRefs.length === 0) { + findings.push( + finding( + "MISSING_DATA_PROVENANCE", + MEDIUM, + component.id, + `${component.path} lacks provenance references to inputs, instruments, or analysis commits.`, + "Link the dataset/result to source instruments, upstream datasets, notebooks, or analysis commits." + ) + ); + } + + if (component.type === "metadata" && component.exportTargets.length === 0) { + findings.push( + finding( + "MISSING_METADATA_EXPORT_TARGETS", + LOW, + component.id, + `${component.path} does not declare export targets.`, + "Declare DOI, schema.org, DataCite, or repository export targets for discovery." + ) + ); + } + } + + if (!candidate.metadata.doi || !candidate.metadata.schemaOrg || !candidate.metadata.dataCite || !candidate.metadata.license) { + findings.push( + finding( + "INCOMPLETE_RELEASE_METADATA", + HIGH, + "metadata", + "Release metadata is missing DOI, schema.org, DataCite, or license evidence.", + "Complete persistent identifier, discovery metadata, citation metadata, and license fields before publication." 
+ ) + ); + } + + if (candidate.metadata.authors.length === 0) { + findings.push( + finding( + "MISSING_AUTHOR_ATTRIBUTION", + MEDIUM, + "metadata.authors", + "Release metadata does not include author attribution.", + "Attach author identifiers before the repository release is exported or assigned a DOI." + ) + ); + } + + if (!candidate.bundle.archivePath || !isSha256(candidate.bundle.sha256)) { + findings.push( + finding( + "INVALID_EXPORT_BUNDLE", + HIGH, + "bundle", + "Release bundle archive path or digest is missing/invalid.", + "Generate a release archive with a stable SHA-256 digest before tagging." + ) + ); + } + + const missingFromBundle = unique( + candidate.components + .filter((component) => !candidate.bundle.includes.includes(component.path)) + .map((component) => component.path) + ); + if (missingFromBundle.length > 0) { + findings.push( + finding( + "BUNDLE_OMITS_MANIFEST_COMPONENTS", + HIGH, + "bundle.includes", + `Release bundle omits manifest paths: ${missingFromBundle.join(", ")}.`, + "Regenerate the archive so every manifest component is present in the export bundle." + ) + ); + } + + const high = findings.filter((item) => item.severity === HIGH).length; + const medium = findings.filter((item) => item.severity === MEDIUM).length; + return { + repositoryId: candidate.repositoryId, + releaseTag: candidate.releaseTag, + commit: candidate.commit, + decision: high > 0 ? "hold" : medium > 0 ? 
"revise" : "release", + summary: { + componentsReviewed: candidate.components.length, + requiredTypesCovered: REQUIRED_COMPONENT_TYPES.filter((type) => componentTypes.has(type)).length, + bundleEntries: candidate.bundle.includes.length, + findings: findings.length, + high, + medium, + low: findings.filter((item) => item.severity === LOW).length, + }, + findings, + releaseCriteria: [ + "Every required scientific repository component is present in the manifest.", + "Every manifest entry has a stable digest and version-control status.", + "Executable code and notebooks include lockfile evidence and pinned runtimes.", + "Datasets and results link back to provenance inputs.", + "The export bundle includes every manifest path and has its own SHA-256 digest.", + ], + }; +} + +module.exports = { + assessReproducibilityBundle, + normalizeCandidate, +}; diff --git a/repository-reproducibility-bundle-guard/package.json b/repository-reproducibility-bundle-guard/package.json new file mode 100644 index 00000000..b0cbbdfb --- /dev/null +++ b/repository-reproducibility-bundle-guard/package.json @@ -0,0 +1,13 @@ +{ + "name": "repository-reproducibility-bundle-guard", + "version": "1.0.0", + "description": "Reproducibility bundle guard for SCIBASE project repositories and version control", + "main": "index.js", + "type": "commonjs", + "scripts": { + "test": "node test.js", + "demo": "node demo.js", + "demo:video": "python demo_video.py" + }, + "license": "MIT" +} diff --git a/repository-reproducibility-bundle-guard/reports/demo.gif b/repository-reproducibility-bundle-guard/reports/demo.gif new file mode 100644 index 00000000..870cedb6 Binary files /dev/null and b/repository-reproducibility-bundle-guard/reports/demo.gif differ diff --git a/repository-reproducibility-bundle-guard/reports/demo.mp4 b/repository-reproducibility-bundle-guard/reports/demo.mp4 new file mode 100644 index 00000000..6fdfe969 Binary files /dev/null and b/repository-reproducibility-bundle-guard/reports/demo.mp4 
differ diff --git a/repository-reproducibility-bundle-guard/reports/release-candidate.json b/repository-reproducibility-bundle-guard/reports/release-candidate.json new file mode 100644 index 00000000..0e469f39 --- /dev/null +++ b/repository-reproducibility-bundle-guard/reports/release-candidate.json @@ -0,0 +1,23 @@ +{ + "repositoryId": "repo-climate-forecasting", + "releaseTag": "preprint-v2.1.0", + "commit": "9f8c7d6e5b4a3210", + "decision": "release", + "summary": { + "componentsReviewed": 7, + "requiredTypesCovered": 7, + "bundleEntries": 7, + "findings": 0, + "high": 0, + "medium": 0, + "low": 0 + }, + "findings": [], + "releaseCriteria": [ + "Every required scientific repository component is present in the manifest.", + "Every manifest entry has a stable digest and version-control status.", + "Executable code and notebooks include lockfile evidence and pinned runtimes.", + "Datasets and results link back to provenance inputs.", + "The export bundle includes every manifest path and has its own SHA-256 digest." + ] +} \ No newline at end of file diff --git a/repository-reproducibility-bundle-guard/reports/release-candidate.md b/repository-reproducibility-bundle-guard/reports/release-candidate.md new file mode 100644 index 00000000..425fc4c2 --- /dev/null +++ b/repository-reproducibility-bundle-guard/reports/release-candidate.md @@ -0,0 +1,21 @@ +# Repository Reproducibility Bundle Guard + +Scenario: release-candidate + +Repository: repo-climate-forecasting +Release tag: preprint-v2.1.0 +Decision: RELEASE + +Reviewed 7 manifest components and 7 bundle entries. + +## Findings + +- No reproducibility bundle findings. + +## Release Criteria + +- Every required scientific repository component is present in the manifest. +- Every manifest entry has a stable digest and version-control status. +- Executable code and notebooks include lockfile evidence and pinned runtimes. +- Datasets and results link back to provenance inputs. 
+- The export bundle includes every manifest path and has its own SHA-256 digest. diff --git a/repository-reproducibility-bundle-guard/reports/release-candidate.svg b/repository-reproducibility-bundle-guard/reports/release-candidate.svg new file mode 100644 index 00000000..987300c8 --- /dev/null +++ b/repository-reproducibility-bundle-guard/reports/release-candidate.svg @@ -0,0 +1,12 @@ + + + Repository Reproducibility Bundle Guard + repo-climate-forecasting / preprint-v2.1.0 + + RELEASE + Components: 7 + High: 0 + Medium: 0 + Low: 0 + Synthetic release packets only. No private data or external services. + \ No newline at end of file diff --git a/repository-reproducibility-bundle-guard/reports/risky-candidate.json b/repository-reproducibility-bundle-guard/reports/risky-candidate.json new file mode 100644 index 00000000..066048f9 --- /dev/null +++ b/repository-reproducibility-bundle-guard/reports/risky-candidate.json @@ -0,0 +1,136 @@ +{ + "repositoryId": "repo-incomplete-export", + "releaseTag": "preprint-v2.2.0", + "commit": "9f8c7d6e5b4a3210", + "decision": "hold", + "summary": { + "componentsReviewed": 5, + "requiredTypesCovered": 4, + "bundleEntries": 2, + "findings": 16, + "high": 11, + "medium": 4, + "low": 1 + }, + "findings": [ + { + "code": "MISSING_REQUIRED_COMPONENT", + "severity": "high", + "sourceId": "notebook", + "message": "notebook component is absent from the release manifest.", + "remediation": "Add the required component or explicitly hold the release until the research packet is complete." + }, + { + "code": "MISSING_REQUIRED_COMPONENT", + "severity": "high", + "sourceId": "results", + "message": "results component is absent from the release manifest.", + "remediation": "Add the required component or explicitly hold the release until the research packet is complete." 
+ }, + { + "code": "MISSING_REQUIRED_COMPONENT", + "severity": "high", + "sourceId": "protocol", + "message": "protocol component is absent from the release manifest.", + "remediation": "Add the required component or explicitly hold the release until the research packet is complete." + }, + { + "code": "DUPLICATE_MANIFEST_PATH", + "severity": "high", + "sourceId": "code/train.py", + "message": "code/train.py appears more than once in the release manifest.", + "remediation": "Keep one canonical manifest row per path so rollback and integrity checks are deterministic." + }, + { + "code": "INVALID_COMPONENT_DIGEST", + "severity": "high", + "sourceId": "manuscript-main", + "message": "manuscript/main.md does not carry a valid SHA-256 digest.", + "remediation": "Record the lowercase SHA-256 digest before tagging the release." + }, + { + "code": "MISSING_DATA_PROVENANCE", + "severity": "medium", + "sourceId": "observations", + "message": "data/observations.parquet lacks provenance references to inputs, instruments, or analysis commits.", + "remediation": "Link the dataset/result to source instruments, upstream datasets, notebooks, or analysis commits." + }, + { + "code": "MISSING_RUNTIME_LOCKFILE", + "severity": "high", + "sourceId": "analysis-code", + "message": "code/train.py lacks lockfile evidence for executable reproducibility.", + "remediation": "Attach package-lock, requirements lock, renv, Manifest.toml, or equivalent runtime lock evidence." + }, + { + "code": "UNPINNED_RUNTIME_IMAGE", + "severity": "medium", + "sourceId": "analysis-code", + "message": "code/train.py runtime image is not digest-pinned.", + "remediation": "Pin the container/runtime image by immutable digest instead of a mutable tag." 
+ }, + { + "code": "MISSING_RUNTIME_LOCKFILE", + "severity": "high", + "sourceId": "duplicate-code", + "message": "code/train.py lacks lockfile evidence for executable reproducibility.", + "remediation": "Attach package-lock, requirements lock, renv, Manifest.toml, or equivalent runtime lock evidence." + }, + { + "code": "UNPINNED_RUNTIME_IMAGE", + "severity": "medium", + "sourceId": "duplicate-code", + "message": "code/train.py runtime image is not digest-pinned.", + "remediation": "Pin the container/runtime image by immutable digest instead of a mutable tag." + }, + { + "code": "UNVERSIONED_COMPONENT", + "severity": "high", + "sourceId": "metadata", + "message": "metadata.json is not marked as version-controlled.", + "remediation": "Commit, tag, or archive the component under repository version control before release." + }, + { + "code": "MISSING_METADATA_EXPORT_TARGETS", + "severity": "low", + "sourceId": "metadata", + "message": "metadata.json does not declare export targets.", + "remediation": "Declare DOI, schema.org, DataCite, or repository export targets for discovery." + }, + { + "code": "INCOMPLETE_RELEASE_METADATA", + "severity": "high", + "sourceId": "metadata", + "message": "Release metadata is missing DOI, schema.org, DataCite, or license evidence.", + "remediation": "Complete persistent identifier, discovery metadata, citation metadata, and license fields before publication." + }, + { + "code": "MISSING_AUTHOR_ATTRIBUTION", + "severity": "medium", + "sourceId": "metadata.authors", + "message": "Release metadata does not include author attribution.", + "remediation": "Attach author identifiers before the repository release is exported or assigned a DOI." + }, + { + "code": "INVALID_EXPORT_BUNDLE", + "severity": "high", + "sourceId": "bundle", + "message": "Release bundle archive path or digest is missing/invalid.", + "remediation": "Generate a release archive with a stable SHA-256 digest before tagging." 
+ }, + { + "code": "BUNDLE_OMITS_MANIFEST_COMPONENTS", + "severity": "high", + "sourceId": "bundle.includes", + "message": "Release bundle omits manifest paths: data/observations.parquet, code/train.py.", + "remediation": "Regenerate the archive so every manifest component is present in the export bundle." + } + ], + "releaseCriteria": [ + "Every required scientific repository component is present in the manifest.", + "Every manifest entry has a stable digest and version-control status.", + "Executable code and notebooks include lockfile evidence and pinned runtimes.", + "Datasets and results link back to provenance inputs.", + "The export bundle includes every manifest path and has its own SHA-256 digest." + ] +} \ No newline at end of file diff --git a/repository-reproducibility-bundle-guard/reports/risky-candidate.md b/repository-reproducibility-bundle-guard/reports/risky-candidate.md new file mode 100644 index 00000000..60ecdd3c --- /dev/null +++ b/repository-reproducibility-bundle-guard/reports/risky-candidate.md @@ -0,0 +1,36 @@ +# Repository Reproducibility Bundle Guard + +Scenario: risky-candidate + +Repository: repo-incomplete-export +Release tag: preprint-v2.2.0 +Decision: HOLD + +Reviewed 5 manifest components and 2 bundle entries. + +## Findings + +- HIGH MISSING_REQUIRED_COMPONENT: notebook component is absent from the release manifest. +- HIGH MISSING_REQUIRED_COMPONENT: results component is absent from the release manifest. +- HIGH MISSING_REQUIRED_COMPONENT: protocol component is absent from the release manifest. +- HIGH DUPLICATE_MANIFEST_PATH: code/train.py appears more than once in the release manifest. +- HIGH INVALID_COMPONENT_DIGEST: manuscript/main.md does not carry a valid SHA-256 digest. +- MEDIUM MISSING_DATA_PROVENANCE: data/observations.parquet lacks provenance references to inputs, instruments, or analysis commits. +- HIGH MISSING_RUNTIME_LOCKFILE: code/train.py lacks lockfile evidence for executable reproducibility. 
+- MEDIUM UNPINNED_RUNTIME_IMAGE: code/train.py runtime image is not digest-pinned. +- HIGH MISSING_RUNTIME_LOCKFILE: code/train.py lacks lockfile evidence for executable reproducibility. +- MEDIUM UNPINNED_RUNTIME_IMAGE: code/train.py runtime image is not digest-pinned. +- HIGH UNVERSIONED_COMPONENT: metadata.json is not marked as version-controlled. +- LOW MISSING_METADATA_EXPORT_TARGETS: metadata.json does not declare export targets. +- HIGH INCOMPLETE_RELEASE_METADATA: Release metadata is missing DOI, schema.org, DataCite, or license evidence. +- MEDIUM MISSING_AUTHOR_ATTRIBUTION: Release metadata does not include author attribution. +- HIGH INVALID_EXPORT_BUNDLE: Release bundle archive path or digest is missing/invalid. +- HIGH BUNDLE_OMITS_MANIFEST_COMPONENTS: Release bundle omits manifest paths: data/observations.parquet, code/train.py. + +## Release Criteria + +- Every required scientific repository component is present in the manifest. +- Every manifest entry has a stable digest and version-control status. +- Executable code and notebooks include lockfile evidence and pinned runtimes. +- Datasets and results link back to provenance inputs. +- The export bundle includes every manifest path and has its own SHA-256 digest. diff --git a/repository-reproducibility-bundle-guard/reports/risky-candidate.svg b/repository-reproducibility-bundle-guard/reports/risky-candidate.svg new file mode 100644 index 00000000..28b96f95 --- /dev/null +++ b/repository-reproducibility-bundle-guard/reports/risky-candidate.svg @@ -0,0 +1,12 @@ + + + Repository Reproducibility Bundle Guard + repo-incomplete-export / preprint-v2.2.0 + + HOLD + Components: 5 + High: 11 + Medium: 4 + Low: 1 + Synthetic release packets only. No private data or external services. 
+ \ No newline at end of file diff --git a/repository-reproducibility-bundle-guard/requirements-map.md b/repository-reproducibility-bundle-guard/requirements-map.md new file mode 100644 index 00000000..36552554 --- /dev/null +++ b/repository-reproducibility-bundle-guard/requirements-map.md @@ -0,0 +1,14 @@ +# Requirements Map + +Issue #10 asks for project repositories that support structured scientific components, versioning, integrity, collaboration, metadata, reproducibility, and export readiness. + +This slice covers a focused release gate: + +- Repository structure: validates that manuscript, data, code, notebooks, results, protocols, and metadata are all present in the release manifest. +- File and metadata versioning: requires each component to be versioned and carry a stable SHA-256 digest. +- Reproducibility: requires executable code and notebooks to include lockfile evidence and digest-pinned runtime images. +- Provenance: requires datasets and results to reference upstream inputs, instruments, notebooks, or commits. +- Metadata export: requires DOI, schema.org, DataCite, license, author attribution, and metadata export targets. +- Publication/export readiness: requires the release archive to have its own digest and include every manifest path. + +Out of scope by design: branch protection, merge requests, release signatures, Git LFS pointer integrity, restore rehearsals, credential rotation, and review-decision provenance, because those are already covered by separate same-issue slices. 
diff --git a/repository-reproducibility-bundle-guard/sample-data.js b/repository-reproducibility-bundle-guard/sample-data.js new file mode 100644 index 00000000..adfabf78 --- /dev/null +++ b/repository-reproducibility-bundle-guard/sample-data.js @@ -0,0 +1,171 @@ +const goodDigest = "a".repeat(64); +const secondDigest = "b".repeat(64); +const thirdDigest = "c".repeat(64); +const fourthDigest = "d".repeat(64); +const fifthDigest = "e".repeat(64); +const sixthDigest = "f".repeat(64); +const seventhDigest = "1".repeat(64); +const bundleDigest = "2".repeat(64); +const runtimeDigest = `ghcr.io/scibase/python-research@sha256:${"3".repeat(64)}`; + +const releaseCandidate = { + repositoryId: "repo-climate-forecasting", + releaseTag: "preprint-v2.1.0", + commit: "9f8c7d6e5b4a3210", + metadata: { + doi: "10.5555/scibase.climate.2026.002", + schemaOrg: true, + dataCite: true, + license: "CC-BY-4.0", + authors: ["orcid:0000-0002-1825-0097", "orcid:0000-0003-1415-9265"], + }, + components: [ + { + id: "manuscript-main", + type: "manuscript", + path: "manuscript/main.md", + sha256: goodDigest, + bytes: 18422, + provenanceRefs: ["protocols/forecast-protocol.md"], + exportTargets: ["pdf", "html"], + }, + { + id: "observations", + type: "data", + path: "data/observations.parquet", + sha256: secondDigest, + bytes: 90021, + provenanceRefs: ["instrument:NOAA-GHCN", "commit:9f8c7d6e5b4a3210"], + exportTargets: ["parquet", "csv-preview"], + }, + { + id: "analysis-code", + type: "code", + path: "code/train.py", + sha256: thirdDigest, + bytes: 15420, + runtimeImage: runtimeDigest, + lockfiles: ["code/requirements.lock"], + provenanceRefs: ["data/observations.parquet"], + }, + { + id: "notebook-validation", + type: "notebook", + path: "notebooks/validation.ipynb", + sha256: fourthDigest, + bytes: 74210, + runtimeImage: runtimeDigest, + lockfiles: ["code/requirements.lock"], + provenanceRefs: ["code/train.py"], + }, + { + id: "figure-pack", + type: "results", + path: "results/figures.zip", 
+ sha256: fifthDigest, + bytes: 25001, + provenanceRefs: ["notebooks/validation.ipynb"], + exportTargets: ["png", "svg"], + }, + { + id: "forecast-protocol", + type: "protocol", + path: "protocols/forecast-protocol.md", + sha256: sixthDigest, + bytes: 9360, + provenanceRefs: ["doi:10.5555/scibase.climate.2026.001"], + }, + { + id: "metadata", + type: "metadata", + path: "metadata.json", + sha256: seventhDigest, + bytes: 4200, + exportTargets: ["doi", "schema.org", "datacite"], + }, + ], + bundle: { + archivePath: "exports/repo-climate-forecasting-preprint-v2.1.0.tar.gz", + sha256: bundleDigest, + generatedAt: "2026-06-01T18:00:00Z", + includes: [ + "manuscript/main.md", + "data/observations.parquet", + "code/train.py", + "notebooks/validation.ipynb", + "results/figures.zip", + "protocols/forecast-protocol.md", + "metadata.json", + ], + }, +}; + +const riskyCandidate = { + ...releaseCandidate, + repositoryId: "repo-incomplete-export", + releaseTag: "preprint-v2.2.0", + metadata: { + doi: "", + schemaOrg: false, + dataCite: true, + license: "", + authors: [], + }, + components: [ + { + id: "manuscript-main", + type: "manuscript", + path: "manuscript/main.md", + sha256: "not-a-digest", + bytes: 18422, + provenanceRefs: [], + }, + { + id: "observations", + type: "data", + path: "data/observations.parquet", + sha256: secondDigest, + bytes: 90021, + provenanceRefs: [], + }, + { + id: "analysis-code", + type: "code", + path: "code/train.py", + sha256: thirdDigest, + bytes: 15420, + runtimeImage: "ghcr.io/scibase/python-research:latest", + lockfiles: [], + provenanceRefs: ["data/observations.parquet"], + }, + { + id: "duplicate-code", + type: "code", + path: "code/train.py", + sha256: thirdDigest, + bytes: 15420, + runtimeImage: "ghcr.io/scibase/python-research:latest", + lockfiles: [], + }, + { + id: "metadata", + type: "metadata", + path: "metadata.json", + sha256: seventhDigest, + bytes: 4200, + exportTargets: [], + versioned: false, + }, + ], + bundle: { + archivePath: 
"exports/repo-incomplete-export.tar.gz", + sha256: "bad-bundle-digest", + generatedAt: "2026-06-01T18:00:00Z", + includes: ["manuscript/main.md", "metadata.json"], + }, +}; + +module.exports = { + releaseCandidate, + riskyCandidate, +}; diff --git a/repository-reproducibility-bundle-guard/test.js b/repository-reproducibility-bundle-guard/test.js new file mode 100644 index 00000000..ca1a03e6 --- /dev/null +++ b/repository-reproducibility-bundle-guard/test.js @@ -0,0 +1,46 @@ +const assert = require("assert"); + +const { assessReproducibilityBundle, normalizeCandidate } = require("./index"); +const { releaseCandidate, riskyCandidate } = require("./sample-data"); + +const clean = assessReproducibilityBundle(releaseCandidate); +assert.strictEqual(clean.decision, "release"); +assert.strictEqual(clean.summary.findings, 0); +assert.strictEqual(clean.summary.requiredTypesCovered, 7); + +const risky = assessReproducibilityBundle(riskyCandidate); +assert.strictEqual(risky.decision, "hold"); +for (const code of [ + "MISSING_REQUIRED_COMPONENT", + "DUPLICATE_MANIFEST_PATH", + "INVALID_COMPONENT_DIGEST", + "UNVERSIONED_COMPONENT", + "MISSING_RUNTIME_LOCKFILE", + "UNPINNED_RUNTIME_IMAGE", + "MISSING_DATA_PROVENANCE", + "MISSING_METADATA_EXPORT_TARGETS", + "INCOMPLETE_RELEASE_METADATA", + "MISSING_AUTHOR_ATTRIBUTION", + "INVALID_EXPORT_BUNDLE", + "BUNDLE_OMITS_MANIFEST_COMPONENTS", +]) { + assert(risky.findings.some((finding) => finding.code === code), `missing ${code}`); +} + +const reviseOnly = assessReproducibilityBundle({ + ...releaseCandidate, + components: releaseCandidate.components.map((component) => + component.id === "analysis-code" + ? 
{ ...component, runtimeImage: "ghcr.io/scibase/python-research:latest" } + : component + ), +}); +assert.strictEqual(reviseOnly.decision, "revise"); +assert(reviseOnly.findings.some((finding) => finding.code === "UNPINNED_RUNTIME_IMAGE")); + +assert.throws( + () => normalizeCandidate({ ...releaseCandidate, repositoryId: "" }), + /repositoryId must be a non-empty string/ +); + +console.log("repository reproducibility bundle guard tests passed");