diff --git a/geospatial-spatial-autocorrelation-assistant/.gitignore b/geospatial-spatial-autocorrelation-assistant/.gitignore new file mode 100644 index 00000000..2bf074d6 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/.gitignore @@ -0,0 +1 @@ +reports/frames/ diff --git a/geospatial-spatial-autocorrelation-assistant/README.md b/geospatial-spatial-autocorrelation-assistant/README.md new file mode 100644 index 00000000..d04aa6b9 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/README.md @@ -0,0 +1,47 @@ +# Geospatial Spatial-Autocorrelation Review Assistant + +Self-contained reviewer utility for SCIBASE issue #16, AI-Powered Research Assistant Suite. It reviews synthetic geospatial manuscript packets before AI peer-review, reproducibility, or research-gap recommendations are released to researchers. + +## What It Checks + +- Coordinate reference system and analysis projection evidence. +- Invalid or over-precise coordinates for sensitive human-subject or protected-location studies. +- Spatial train/test leakage from nearby train and holdout samples. +- High Moran's I paired with random validation splits. +- Preprocessing fitted on the full dataset for spatial covariates. +- Test-set tuning, missing external spatial validation, stale covariate windows, and missing raster/vector source metadata. +- Reproducibility artifacts: data manifest, code commit, environment spec, and spatial block map. +- Research-gap prompts for under-sampled regions and missing spatial validation benchmarks. + +## Why This Is Distinct + +Existing #16 work covers broad assistant orchestration, evidence binding, structured abstracts, randomization/blinding, survival analysis, missing-data sensitivity, causal adjustment, genomic/proteomics/single-cell review, and related peer-review checks. Existing #17 geospatial work validates sample-provenance graph edges. 
This module is a separate #16 peer-review layer for manuscript-method validity: spatial autocorrelation, blocked validation, coordinate/projection evidence, and geography-aware reproducibility. + +## Usage + +```bash +npm run check +npm test +npm run demo +npm run verify-video +``` + +Generated reviewer artifacts are written to `reports/`: + +- `risky-audit.json` +- `clean-audit.json` +- `risky-review.md` +- `summary.svg` +- `demo.mp4` + +## API + +```js +const { + evaluateGeospatialReviewPacket, + renderMarkdownReport, + renderSvgSummary +} = require("./index"); +``` + +The evaluator returns a deterministic status (`READY`, `REVIEW`, or `HOLD`), finding counts, manuscript decisions, reproducibility scores, remediation actions, research-gap opportunities, and a stable fingerprint. diff --git a/geospatial-spatial-autocorrelation-assistant/demo.js b/geospatial-spatial-autocorrelation-assistant/demo.js new file mode 100644 index 00000000..4521dbe4 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/demo.js @@ -0,0 +1,47 @@ +"use strict"; + +const fs = require("node:fs"); +const path = require("node:path"); +const { + evaluateGeospatialReviewPacket, + renderMarkdownReport, + renderSvgSummary +} = require("./index"); +const { riskyPacket, cleanPacket } = require("./sample-data"); + +const reportsDir = path.join(__dirname, "reports"); +fs.mkdirSync(reportsDir, { recursive: true }); + +const now = "2026-06-01T10:30:00.000Z"; +const risky = evaluateGeospatialReviewPacket(riskyPacket, { now }); +const clean = evaluateGeospatialReviewPacket(cleanPacket, { now }); + +fs.writeFileSync(path.join(reportsDir, "risky-audit.json"), `${JSON.stringify(risky, null, 2)}\n`); +fs.writeFileSync(path.join(reportsDir, "clean-audit.json"), `${JSON.stringify(clean, null, 2)}\n`); +fs.writeFileSync(path.join(reportsDir, "risky-review.md"), renderMarkdownReport(risky, riskyPacket)); +fs.writeFileSync(path.join(reportsDir, "summary.svg"), renderSvgSummary(risky)); 
+fs.writeFileSync( + path.join(reportsDir, "manifest.json"), + `${JSON.stringify( + { + generatedAt: now, + artifacts: [ + "risky-audit.json", + "clean-audit.json", + "risky-review.md", + "summary.svg", + "demo.mp4" + ], + riskyStatus: risky.status, + cleanStatus: clean.status, + riskyFingerprint: risky.fingerprint, + cleanFingerprint: clean.fingerprint + }, + null, + 2 + )}\n` +); + +console.log(`Risky packet: ${risky.status} (${risky.findings.length} findings)`); +console.log(`Clean packet: ${clean.status} (${clean.findings.length} findings)`); +console.log(`Wrote reports to ${reportsDir}`); diff --git a/geospatial-spatial-autocorrelation-assistant/index.js b/geospatial-spatial-autocorrelation-assistant/index.js new file mode 100644 index 00000000..6712ff47 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/index.js @@ -0,0 +1,690 @@ +"use strict"; + +const crypto = require("node:crypto"); + +const SEVERITY_ORDER = ["critical", "high", "warning", "info"]; + +const DEFAULT_POLICY = { + minSpatialHoldoutKm: 25, + highMoransI: 0.35, + maxSensitivePrecisionDecimals: 4, + minRegionsForBroadClaims: 3, + maxCovariateWindowDays: 365 +}; + +function evaluateGeospatialReviewPacket(packet, options = {}) { + if (!isPlainObject(packet)) { + throw new TypeError("evaluateGeospatialReviewPacket expects a packet object"); + } + + const now = options.now ?? new Date().toISOString(); + const policy = { ...DEFAULT_POLICY, ...(isPlainObject(packet.policy) ? 
packet.policy : {}) }; + const manuscripts = asArray(packet.manuscripts); + const findings = []; + + if (manuscripts.length === 0) { + findings.push( + finding( + "PACKET_SCHEMA_MISSING_MANUSCRIPTS", + "high", + "The geospatial review packet has no manuscripts to inspect.", + "AI peer review output needs at least one manuscript or study packet.", + "manuscripts", + "Attach manuscript metadata, spatial design, samples, model split evidence, and reproducibility artifacts.", + "research assistant owner" + ) + ); + } + + manuscripts.forEach((manuscript, index) => inspectManuscript(manuscript, index, policy, findings)); + + const sortedFindings = sortFindings(findings); + const status = determineStatus(sortedFindings); + const reviewDecisions = manuscripts.map((manuscript, index) => + buildReviewDecision(manuscript, index, sortedFindings) + ); + const researchGapOpportunities = buildResearchGapOpportunities(manuscripts, sortedFindings, policy); + const remediationActions = sortedFindings.map((item) => ({ + code: item.code, + manuscriptId: item.manuscriptId ?? null, + modelId: item.modelId ?? null, + owner: item.owner, + action: item.remediation + })); + + const fingerprint = crypto + .createHash("sha256") + .update( + JSON.stringify({ + policy, + manuscripts: manuscripts.map((manuscript) => ({ + id: manuscript.id, + spatialDesign: manuscript.spatialDesign, + claims: manuscript.claims, + models: manuscript.models, + artifactKeys: Object.keys(isPlainObject(manuscript.reproducibilityArtifacts) ? 
manuscript.reproducibilityArtifacts : {}) + })), + codes: sortedFindings.map((item) => item.code) + }) + ) + .digest("hex") + .slice(0, 16); + + return { + generatedAt: now, + status, + summary: summarize(status, sortedFindings, manuscripts.length, researchGapOpportunities.length), + findingCounts: countBySeverity(sortedFindings), + findings: sortedFindings, + reviewDecisions, + researchGapOpportunities, + remediationActions, + fingerprint + }; +} + +function renderMarkdownReport(result, packet) { + const lines = [ + "# Geospatial Spatial-Autocorrelation Review Assistant", + "", + `Packet: ${packet.id ?? "unknown"}`, + `Status: ${result.status}`, + `Fingerprint: ${result.fingerprint}`, + "", + "## Summary", + "", + result.summary, + "", + "## Manuscript Decisions", + "" + ]; + + result.reviewDecisions.forEach((decision) => { + lines.push( + `- ${decision.manuscriptId}: ${decision.decision}; reproducibility score ${decision.reproducibilityScore}/100; ${decision.reasonCodes.length} finding(s)` + ); + }); + + lines.push("", "## Findings", ""); + if (result.findings.length === 0) { + lines.push("- No geospatial peer-review blockers found."); + } else { + result.findings.forEach((item) => { + lines.push(`- ${item.severity.toUpperCase()} ${item.code}: ${item.message}`); + lines.push(` - Evidence: ${item.evidence}`); + lines.push(` - Remediation: ${item.remediation}`); + }); + } + + lines.push("", "## Research Gap Opportunities", ""); + if (result.researchGapOpportunities.length === 0) { + lines.push("- No under-sampled geography or replication opportunities were generated."); + } else { + result.researchGapOpportunities.forEach((gap) => { + lines.push(`- ${gap.id}: ${gap.title}`); + lines.push(` - Rationale: ${gap.rationale}`); + lines.push(` - First action: ${gap.firstAction}`); + }); + } + + return `${lines.join("\n")}\n`; +} + +function renderSvgSummary(result) { + const counts = result.findingCounts; + const critical = counts.critical ?? 
0; + const high = counts.high ?? 0; + const warning = counts.warning ?? 0; + const ready = result.status === "READY"; + const statusColor = ready ? "#16794c" : result.status === "REVIEW" ? "#a15c00" : "#a11b32"; + const holdWidth = Math.min(330, (critical + high) * 54); + const warningWidth = Math.min(220, warning * 42); + const readyWidth = ready ? 300 : Math.max(80, 300 - holdWidth); + + return [ + ``, + ``, + ``, + `Geospatial review assistant`, + `Status ${escapeXml(result.status)} - fingerprint ${escapeXml(result.fingerprint)}`, + ``, + ``, + ``, + `SPATIAL QA`, + `Critical/high blockers: ${critical + high}`, + `Research gaps: ${result.researchGapOpportunities.length}`, + `Manuscripts checked: ${result.reviewDecisions.length}`, + `` + ].join("\n"); +} + +function inspectManuscript(manuscript, index, policy, findings) { + const manuscriptId = manuscript.id ?? `manuscript-${index}`; + const path = `manuscripts[${index}]`; + const spatialDesign = isPlainObject(manuscript.spatialDesign) ? manuscript.spatialDesign : {}; + const samples = asArray(manuscript.samples); + const models = asArray(manuscript.models); + const claims = asArray(manuscript.claims); + const artifacts = isPlainObject(manuscript.reproducibilityArtifacts) ? 
manuscript.reproducibilityArtifacts : {}; + + if (!manuscript.id) { + findings.push( + finding( + "MANUSCRIPT_MISSING_ID", + "high", + `Manuscript at index ${index} has no stable id.`, + "Reviewer packets need stable manuscript ids for traceability.", + `${path}.id`, + "Assign a stable manuscript id before releasing assistant output.", + "research assistant owner", + manuscriptId + ) + ); + } + + if (!spatialDesign.crs && !spatialDesign.epsg) { + findings.push( + finding( + "MISSING_CRS_EVIDENCE", + "high", + `${manuscriptId} does not declare a coordinate reference system.`, + "Spatial distances, joins, and raster overlays cannot be reviewed without CRS evidence.", + `${path}.spatialDesign.crs`, + "Declare the source CRS/EPSG code and any analysis projection used for distance or area operations.", + "geospatial methods reviewer", + manuscriptId + ) + ); + } + + if (samples.length === 0) { + findings.push( + finding( + "MISSING_SPATIAL_SAMPLE_TABLE", + "high", + `${manuscriptId} has no sample table with coordinates and split labels.`, + "Spatial leakage and regional coverage checks need sample-level geography.", + `${path}.samples`, + "Attach synthetic-safe sample coordinates, split labels, site ids, and region labels.", + "data steward", + manuscriptId + ) + ); + } + + inspectCoordinates(samples, path, manuscriptId, findings); + inspectSensitivePrecision(manuscript, spatialDesign, path, manuscriptId, policy, findings); + inspectBroadClaims(manuscript, claims, samples, path, manuscriptId, policy, findings); + + models.forEach((model, modelIndex) => + inspectModel(model, modelIndex, manuscript, samples, path, manuscriptId, policy, findings) + ); + + inspectArtifacts(artifacts, models, path, manuscriptId, findings); +} + +function inspectCoordinates(samples, path, manuscriptId, findings) { + samples.forEach((sample, sampleIndex) => { + const lat = Number(sample.lat); + const lon = Number(sample.lon); + if (!Number.isFinite(lat) || !Number.isFinite(lon) || lat < -90 || 
lat > 90 || lon < -180 || lon > 180) { + findings.push( + finding( + "INVALID_COORDINATE", + "critical", + `${manuscriptId} has an invalid coordinate at sample ${sample.id ?? sampleIndex}.`, + `Observed lat=${sample.lat}, lon=${sample.lon}.`, + `${path}.samples[${sampleIndex}]`, + "Correct or exclude invalid coordinates before AI peer review summarizes spatial findings.", + "data steward", + manuscriptId + ) + ); + } + }); +} + +function inspectSensitivePrecision(manuscript, spatialDesign, path, manuscriptId, policy, findings) { + const sensitivity = String(manuscript.sensitivity ?? "").toLowerCase(); + const sensitive = sensitivity.includes("human") || sensitivity.includes("protected") || sensitivity.includes("restricted"); + const decimals = Number(spatialDesign.coordinatePrecisionDecimals); + if (sensitive && Number.isFinite(decimals) && decimals > policy.maxSensitivePrecisionDecimals) { + findings.push( + finding( + "SENSITIVE_COORDINATE_OVERPRECISION", + "high", + `${manuscriptId} exposes sensitive locations at ${decimals} decimal places.`, + "Human-subject or protected-species locations should be generalized before reviewer packets or public summaries.", + `${path}.spatialDesign.coordinatePrecisionDecimals`, + `Round or jitter coordinates to ${policy.maxSensitivePrecisionDecimals} decimals or provide an approved restricted-location access path.`, + "privacy reviewer", + manuscriptId + ) + ); + } +} + +function inspectBroadClaims(manuscript, claims, samples, path, manuscriptId, policy, findings) { + const broadClaims = claims.filter(isBroadClaim); + if (broadClaims.length === 0) { + return; + } + + const regions = new Set(samples.map((sample) => sample.region).filter(Boolean)); + broadClaims.forEach((claim, claimIndex) => { + const claimedRegions = asArray(claim.claimedRegions).filter(Boolean); + const expectedRegions = Math.max(policy.minRegionsForBroadClaims, claimedRegions.length || 0); + if (regions.size < expectedRegions) { + findings.push( + finding( + 
"OVERBROAD_GEOGRAPHIC_CLAIM", + "high", + `${manuscriptId} makes a broad geographic claim with only ${regions.size} observed region(s).`, + claim.text ?? "Broad geographic claim without matching sampled-region coverage.", + `${path}.claims[${claimIndex}]`, + "Limit the claim to sampled regions or add external validation sites covering the claimed geography.", + "methods reviewer", + manuscriptId + ) + ); + } + }); +} + +function inspectModel(model, modelIndex, manuscript, samples, path, manuscriptId, policy, findings) { + const modelId = model.id ?? `model-${modelIndex}`; + const modelPath = `${path}.models[${modelIndex}]`; + const splitStrategy = String(model.splitStrategy ?? "").toLowerCase(); + const moransI = Number(model.moransI); + const spatialSplit = isSpatialSplit(model); + const minDistance = minimumTrainTestDistanceKm(samples); + + if (Number.isFinite(minDistance) && minDistance < policy.minSpatialHoldoutKm && !spatialSplit) { + findings.push( + finding( + "SPATIAL_SPLIT_LEAKAGE", + "critical", + `${manuscriptId}/${modelId} has train/test samples only ${minDistance.toFixed(1)} km apart without spatial blocking.`, + `Policy requires at least ${policy.minSpatialHoldoutKm} km or explicit spatial block validation.`, + `${modelPath}.splitStrategy`, + "Use spatial block, leave-site-out, or regional holdout validation and regenerate performance claims.", + "model reviewer", + manuscriptId, + modelId + ) + ); + } + + if (Number.isFinite(moransI) && moransI >= policy.highMoransI && !spatialSplit) { + findings.push( + finding( + "HIGH_SPATIAL_AUTOCORRELATION_RANDOM_SPLIT", + "high", + `${manuscriptId}/${modelId} reports Moran's I ${moransI.toFixed(2)} with a ${model.splitStrategy ?? 
"missing"} split.`, + "High spatial autocorrelation inflates random train/test validation.", + `${modelPath}.moransI`, + "Run spatial block cross-validation or leave-region-out validation before presenting performance as reviewer-ready.", + "spatial statistics reviewer", + manuscriptId, + modelId + ) + ); + } + + const preprocessingFitScope = String(model.preprocessingFitScope ?? "").toLowerCase(); + if (preprocessingFitScope.includes("full") && hasSpatialCovariates(model)) { + findings.push( + finding( + "FULL_DATASET_PREPROCESSING_LEAKAGE", + "high", + `${manuscriptId}/${modelId} fits spatial preprocessing on the full dataset.`, + "Raster normalization, imputation, or feature selection must be learned inside each training fold.", + `${modelPath}.preprocessingFitScope`, + "Refit preprocessing inside training folds and attach fold-specific transformation hashes.", + "reproducibility reviewer", + manuscriptId, + modelId + ) + ); + } + + const tunedOn = String(model.hyperparameterTunedOn ?? "").toLowerCase(); + if (tunedOn.includes("test") || tunedOn.includes("holdout")) { + findings.push( + finding( + "TEST_SET_TUNING", + "critical", + `${manuscriptId}/${modelId} tunes model choices on the test/holdout set.`, + "Reviewer-facing performance claims require a locked final test set.", + `${modelPath}.hyperparameterTunedOn`, + "Move tuning to inner validation folds and rerun the locked test set once.", + "model reviewer", + manuscriptId, + modelId + ) + ); + } + + inspectCovariates(model, modelPath, manuscriptId, modelId, policy, findings); + + const needsExternalValidation = asArray(manuscript.claims).some(isBroadClaim) || String(model.deploymentContext ?? 
"").length > 0; + if (needsExternalValidation && asArray(model.externalValidationSites).length === 0) { + findings.push( + finding( + "MISSING_EXTERNAL_SPATIAL_VALIDATION", + "high", + `${manuscriptId}/${modelId} lacks external spatial validation for broader deployment claims.`, + "Broad geographic or deployment claims should be checked outside the training geography.", + `${modelPath}.externalValidationSites`, + "Add an out-of-region validation site or limit the manuscript claim to the sampled geography.", + "methods reviewer", + manuscriptId, + modelId + ) + ); + } + + if (!splitStrategy) { + findings.push( + finding( + "MISSING_SPLIT_STRATEGY", + "warning", + `${manuscriptId}/${modelId} does not describe its spatial validation split strategy.`, + "Peer review needs the split design to reason about leakage and autocorrelation.", + `${modelPath}.splitStrategy`, + "Document random, blocked, leave-site-out, or external validation split evidence.", + "methods reviewer", + manuscriptId, + modelId + ) + ); + } +} + +function inspectCovariates(model, modelPath, manuscriptId, modelId, policy, findings) { + asArray(model.covariates).forEach((covariate, covariateIndex) => { + if (!covariate.source) { + findings.push( + finding( + "COVARIATE_SOURCE_MISSING", + "warning", + `${manuscriptId}/${modelId} covariate ${covariate.name ?? covariateIndex} has no source citation or artifact id.`, + "Raster/vector covariates should be traceable for reproducibility and recency review.", + `${modelPath}.covariates[${covariateIndex}].source`, + "Attach a source DOI, artifact id, or repository path for each spatial covariate.", + "data steward", + manuscriptId, + modelId + ) + ); + } + + const windowDays = Number(covariate.acquisitionWindowDays); + if (Number.isFinite(windowDays) && windowDays > policy.maxCovariateWindowDays) { + findings.push( + finding( + "STALE_COVARIATE_WINDOW", + "warning", + `${manuscriptId}/${modelId} covariate ${covariate.name ?? 
covariateIndex} spans ${windowDays} acquisition days.`, + "Long covariate windows can hide temporal drift in geospatial models.", + `${modelPath}.covariates[${covariateIndex}].acquisitionWindowDays`, + "Use period-matched covariates or report temporal-drift sensitivity checks.", + "methods reviewer", + manuscriptId, + modelId + ) + ); + } + + const resolutionMeters = Number(covariate.resolutionMeters); + if (!Number.isFinite(resolutionMeters) || resolutionMeters <= 0) { + findings.push( + finding( + "COVARIATE_RESOLUTION_MISSING", + "warning", + `${manuscriptId}/${modelId} covariate ${covariate.name ?? covariateIndex} lacks raster/vector resolution evidence.`, + "Spatial scale mismatch cannot be reviewed without resolution metadata.", + `${modelPath}.covariates[${covariateIndex}].resolutionMeters`, + "Attach spatial resolution, aggregation rules, and resampling method for each covariate.", + "geospatial methods reviewer", + manuscriptId, + modelId + ) + ); + } + }); +} + +function inspectArtifacts(artifacts, models, path, manuscriptId, findings) { + const required = [ + ["dataManifest", "DATA_MANIFEST_MISSING", "Attach a data manifest with sample ids, coordinates, split labels, and hashes."], + ["codeCommit", "CODE_COMMIT_MISSING", "Attach the analysis code commit or immutable archive hash."], + ["environmentSpec", "ENVIRONMENT_SPEC_MISSING", "Attach a pinned environment or container digest for spatial libraries."] + ]; + + required.forEach(([key, code, remediation]) => { + if (!artifacts[key]) { + findings.push( + finding( + code, + "high", + `${manuscriptId} is missing reproducibility artifact ${key}.`, + "Geospatial results depend on data, code, and environment parity.", + `${path}.reproducibilityArtifacts.${key}`, + remediation, + "reproducibility reviewer", + manuscriptId + ) + ); + } + }); + + const hasSpatialModel = models.some((model) => isSpatialSplit(model) || Number.isFinite(Number(model.moransI))); + if (hasSpatialModel && !artifacts.spatialBlockMap) { 
+ findings.push( + finding( + "SPATIAL_BLOCK_MAP_MISSING", + "warning", + `${manuscriptId} has spatial validation claims without a block map artifact.`, + "Reviewers need the held-out geometry or block map to audit leakage.", + `${path}.reproducibilityArtifacts.spatialBlockMap`, + "Attach a block-map artifact id, geometry hash, or leave-site-out manifest.", + "geospatial methods reviewer", + manuscriptId + ) + ); + } +} + +function buildReviewDecision(manuscript, index, findings) { + const manuscriptId = manuscript.id ?? `manuscript-${index}`; + const manuscriptFindings = findings.filter((item) => item.manuscriptId === manuscriptId || !item.manuscriptId); + const decision = manuscriptFindings.some((item) => item.severity === "critical" || item.severity === "high") + ? "HOLD" + : manuscriptFindings.some((item) => item.severity === "warning") + ? "REVIEW" + : "READY"; + + return { + manuscriptId, + decision, + reasonCodes: manuscriptFindings.map((item) => item.code), + reproducibilityScore: scoreFindings(manuscriptFindings) + }; +} + +function buildResearchGapOpportunities(manuscripts, findings, policy) { + const gaps = []; + manuscripts.forEach((manuscript, index) => { + const manuscriptId = manuscript.id ?? `manuscript-${index}`; + const samples = asArray(manuscript.samples); + const regions = new Set(samples.map((sample) => sample.region).filter(Boolean)); + const broad = asArray(manuscript.claims).some(isBroadClaim); + const manuscriptFindings = findings.filter((item) => item.manuscriptId === manuscriptId); + + if (broad && regions.size < policy.minRegionsForBroadClaims) { + gaps.push({ + id: `${manuscriptId}-regional-replication`, + title: "Prioritize out-of-region replication before broad geographic claims", + rationale: `${manuscriptId} samples ${regions.size} region(s), below the ${policy.minRegionsForBroadClaims}-region policy for broad claims.`, + firstAction: "Recruit or simulate a holdout site in the least represented claimed region." 
+ }); + } + + if (manuscriptFindings.some((item) => item.code === "HIGH_SPATIAL_AUTOCORRELATION_RANDOM_SPLIT")) { + gaps.push({ + id: `${manuscriptId}-spatial-validation-gap`, + title: "Add spatial block validation benchmark", + rationale: "High autocorrelation with random validation means reported accuracy may be optimistic.", + firstAction: "Create a leave-region-out benchmark and compare it to the random split baseline." + }); + } + }); + return gaps; +} + +function summarize(status, findings, manuscriptCount, gapCount) { + if (findings.length === 0) { + return `${manuscriptCount} manuscript(s) are ready for geospatial peer-review release with no spatial leakage or reproducibility findings.`; + } + + const counts = countBySeverity(findings); + return `${status}: ${manuscriptCount} manuscript(s) produced ${findings.length} finding(s): ${counts.critical ?? 0} critical, ${counts.high ?? 0} high, ${counts.warning ?? 0} warning, and ${gapCount} research gap prompt(s).`; +} + +function finding(code, severity, message, evidence, path, remediation, owner, manuscriptId = null, modelId = null) { + return { + code, + severity, + message, + evidence, + path, + remediation, + owner, + manuscriptId, + modelId + }; +} + +function determineStatus(findings) { + if (findings.some((item) => item.severity === "critical" || item.severity === "high")) { + return "HOLD"; + } + if (findings.some((item) => item.severity === "warning")) { + return "REVIEW"; + } + return "READY"; +} + +function scoreFindings(findings) { + const counts = countBySeverity(findings); + return Math.max( + 0, + 100 - (counts.critical ?? 0) * 30 - (counts.high ?? 0) * 17 - (counts.warning ?? 
0) * 7 + ); +} + +function sortFindings(findings) { + return [...findings].sort((a, b) => { + const severityDiff = SEVERITY_ORDER.indexOf(a.severity) - SEVERITY_ORDER.indexOf(b.severity); + if (severityDiff !== 0) { + return severityDiff; + } + return a.code.localeCompare(b.code); + }); +} + +function countBySeverity(findings) { + return findings.reduce((counts, item) => { + counts[item.severity] = (counts[item.severity] ?? 0) + 1; + return counts; + }, {}); +} + +function minimumTrainTestDistanceKm(samples) { + const train = samples.filter((sample) => String(sample.split ?? "").toLowerCase() === "train"); + const test = samples.filter((sample) => String(sample.split ?? "").toLowerCase() === "test"); + if (train.length === 0 || test.length === 0) { + return Infinity; + } + + let minimum = Infinity; + train.forEach((trainSample) => { + test.forEach((testSample) => { + const distance = haversineKm(trainSample.lat, trainSample.lon, testSample.lat, testSample.lon); + if (distance < minimum) { + minimum = distance; + } + }); + }); + return minimum; +} + +function haversineKm(latA, lonA, latB, lonB) { + const aLat = Number(latA); + const aLon = Number(lonA); + const bLat = Number(latB); + const bLon = Number(lonB); + if (![aLat, aLon, bLat, bLon].every(Number.isFinite)) { + return Infinity; + } + + const earthRadiusKm = 6371; + const dLat = radians(bLat - aLat); + const dLon = radians(bLon - aLon); + const startLat = radians(aLat); + const endLat = radians(bLat); + const a = + Math.sin(dLat / 2) ** 2 + + Math.cos(startLat) * Math.cos(endLat) * Math.sin(dLon / 2) ** 2; + return 2 * earthRadiusKm * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a)); +} + +function radians(value) { + return (value * Math.PI) / 180; +} + +function isSpatialSplit(model) { + const split = String(model.splitStrategy ?? 
"").toLowerCase(); + return split.includes("spatial") || split.includes("block") || split.includes("leave-site") || split.includes("leave_region"); +} + +function hasSpatialCovariates(model) { + return Boolean(model.spatialCovariates) || asArray(model.covariates).length > 0; +} + +function isBroadClaim(claim) { + const scope = String(claim.scope ?? "").toLowerCase(); + const text = String(claim.text ?? "").toLowerCase(); + return ( + ["global", "continental", "multi-region", "national", "deployment"].includes(scope) || + text.includes("generalize") || + text.includes("across regions") || + text.includes("continent") || + text.includes("nationwide") || + text.includes("global") + ); +} + +function isPlainObject(value) { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function asArray(value) { + return Array.isArray(value) ? value : []; +} + +function escapeXml(value) { + return String(value) + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """); +} + +module.exports = { + evaluateGeospatialReviewPacket, + renderMarkdownReport, + renderSvgSummary, + haversineKm +}; diff --git a/geospatial-spatial-autocorrelation-assistant/make-demo-video.js b/geospatial-spatial-autocorrelation-assistant/make-demo-video.js new file mode 100644 index 00000000..f690b872 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/make-demo-video.js @@ -0,0 +1,128 @@ +"use strict"; + +const { execFileSync } = require("node:child_process"); +const fs = require("node:fs"); +const path = require("node:path"); + +const WIDTH = 960; +const HEIGHT = 540; +const FONT = { + A: ["01110", "10001", "10001", "11111", "10001", "10001", "10001"], + B: ["11110", "10001", "10001", "11110", "10001", "10001", "11110"], + C: ["01111", "10000", "10000", "10000", "10000", "10000", "01111"], + D: ["11110", "10001", "10001", "10001", "10001", "10001", "11110"], + E: ["11111", "10000", "10000", "11110", "10000", "10000", 
"11111"], + G: ["01111", "10000", "10000", "10111", "10001", "10001", "01111"], + I: ["11111", "00100", "00100", "00100", "00100", "00100", "11111"], + K: ["10001", "10010", "10100", "11000", "10100", "10010", "10001"], + L: ["10000", "10000", "10000", "10000", "10000", "10000", "11111"], + O: ["01110", "10001", "10001", "10001", "10001", "10001", "01110"], + P: ["11110", "10001", "10001", "11110", "10000", "10000", "10000"], + R: ["11110", "10001", "10001", "11110", "10100", "10010", "10001"], + S: ["01111", "10000", "10000", "01110", "00001", "00001", "11110"], + T: ["11111", "00100", "00100", "00100", "00100", "00100", "00100"], + V: ["10001", "10001", "10001", "10001", "01010", "01010", "00100"], + W: ["10001", "10001", "10001", "10101", "10101", "10101", "01010"], + Y: ["10001", "01010", "00100", "00100", "00100", "00100", "00100"] +}; + +const reportsDir = path.join(__dirname, "reports"); +const framesDir = path.join(reportsDir, "frames"); +fs.mkdirSync(framesDir, { recursive: true }); + +for (const file of fs.readdirSync(framesDir)) { + fs.unlinkSync(path.join(framesDir, file)); +} + +const slides = [ + { label: "CRS READY", color: [22, 121, 76], fill: 0.72 }, + { label: "LEAKAGE", color: [161, 27, 50], fill: 0.88 }, + { label: "BLOCK SPLIT", color: [22, 121, 76], fill: 0.78 }, + { label: "GAP MAP", color: [161, 92, 0], fill: 0.64 } +]; + +let frameIndex = 0; +for (const slide of slides) { + for (let i = 0; i < 8; i += 1) { + const progress = (i + 1) / 8; + const buffer = createFrame(slide, progress); + fs.writeFileSync(path.join(framesDir, `frame-${String(frameIndex).padStart(3, "0")}.ppm`), buffer); + frameIndex += 1; + } +} + +const output = path.join(reportsDir, "demo.mp4"); +execFileSync( + "ffmpeg", + [ + "-y", + "-framerate", + "8", + "-i", + path.join(framesDir, "frame-%03d.ppm"), + "-pix_fmt", + "yuv420p", + "-movflags", + "+faststart", + output + ], + { stdio: "ignore" } +); + +const stats = fs.statSync(output); +console.log(`Wrote ${output} 
(${stats.size} bytes)`); + +function createFrame(slide, progress) { + const pixels = Buffer.alloc(WIDTH * HEIGHT * 3); + fillRect(pixels, 0, 0, WIDTH, HEIGHT, [16, 24, 32]); + fillRect(pixels, 48, 48, 864, 444, [248, 250, 252]); + fillRect(pixels, 80, 190, 800, 88, [226, 232, 240]); + fillRect(pixels, 80, 190, Math.round(800 * slide.fill * progress), 88, slide.color); + fillRect(pixels, 80, 322, 220, 42, [226, 232, 240]); + fillRect(pixels, 332, 322, 220, 42, [226, 232, 240]); + fillRect(pixels, 584, 322, 220, 42, [226, 232, 240]); + fillRect(pixels, 80, 322, 130, 42, [161, 27, 50]); + fillRect(pixels, 332, 322, 150, 42, [161, 92, 0]); + fillRect(pixels, 584, 322, 200, 42, [22, 121, 76]); + drawText(pixels, "SPATIAL REVIEW", 82, 104, 5, [17, 24, 39]); + drawText(pixels, slide.label, 108, 214, 7, [255, 255, 255]); + drawText(pixels, "PEER READY", 82, 414, 4, [51, 65, 85]); + return Buffer.concat([Buffer.from(`P6\n${WIDTH} ${HEIGHT}\n255\n`, "ascii"), pixels]); +} + +function fillRect(pixels, x, y, width, height, color) { + const x2 = Math.min(WIDTH, x + width); + const y2 = Math.min(HEIGHT, y + height); + for (let row = Math.max(0, y); row < y2; row += 1) { + for (let col = Math.max(0, x); col < x2; col += 1) { + const offset = (row * WIDTH + col) * 3; + pixels[offset] = color[0]; + pixels[offset + 1] = color[1]; + pixels[offset + 2] = color[2]; + } + } +} + +function drawText(pixels, text, x, y, scale, color) { + let cursor = x; + for (const rawChar of text) { + const char = rawChar.toUpperCase(); + if (char === " ") { + cursor += 4 * scale; + continue; + } + const glyph = FONT[char]; + if (!glyph) { + cursor += 6 * scale; + continue; + } + glyph.forEach((row, rowIndex) => { + for (let colIndex = 0; colIndex < row.length; colIndex += 1) { + if (row[colIndex] === "1") { + fillRect(pixels, cursor + colIndex * scale, y + rowIndex * scale, scale, scale, color); + } + } + }); + cursor += 6 * scale; + } +} diff --git 
a/geospatial-spatial-autocorrelation-assistant/package.json b/geospatial-spatial-autocorrelation-assistant/package.json new file mode 100644 index 00000000..a583ce99 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/package.json @@ -0,0 +1,21 @@ +{ + "name": "geospatial-spatial-autocorrelation-assistant", + "version": "1.0.0", + "private": true, + "description": "Synthetic geospatial peer-review assistant for spatial autocorrelation, split leakage, CRS, and reproducibility risk.", + "main": "index.js", + "scripts": { + "check": "node --check index.js && node --check sample-data.js && node --check test.js && node --check demo.js && node --check make-demo-video.js", + "test": "node test.js", + "demo": "node demo.js && node make-demo-video.js", + "verify-video": "ffprobe -v error -show_entries stream=codec_name,width,height,duration -of default=nokey=1:noprint_wrappers=1 reports/demo.mp4" + }, + "keywords": [ + "geospatial", + "peer-review", + "spatial-autocorrelation", + "reproducibility", + "synthetic" + ], + "license": "MIT" +} diff --git a/geospatial-spatial-autocorrelation-assistant/reports/clean-audit.json b/geospatial-spatial-autocorrelation-assistant/reports/clean-audit.json new file mode 100644 index 00000000..0faadf2c --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/reports/clean-audit.json @@ -0,0 +1,18 @@ +{ + "generatedAt": "2026-06-01T10:30:00.000Z", + "status": "READY", + "summary": "1 manuscript(s) are ready for geospatial peer-review release with no spatial leakage or reproducibility findings.", + "findingCounts": {}, + "findings": [], + "reviewDecisions": [ + { + "manuscriptId": "rangeland-blocked-validation", + "decision": "READY", + "reasonCodes": [], + "reproducibilityScore": 100 + } + ], + "researchGapOpportunities": [], + "remediationActions": [], + "fingerprint": "aa2c187bd4b36628" +} diff --git a/geospatial-spatial-autocorrelation-assistant/reports/demo.mp4 
b/geospatial-spatial-autocorrelation-assistant/reports/demo.mp4 new file mode 100644 index 00000000..0e79ef6b Binary files /dev/null and b/geospatial-spatial-autocorrelation-assistant/reports/demo.mp4 differ diff --git a/geospatial-spatial-autocorrelation-assistant/reports/manifest.json b/geospatial-spatial-autocorrelation-assistant/reports/manifest.json new file mode 100644 index 00000000..c0dd10c0 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/reports/manifest.json @@ -0,0 +1,14 @@ +{ + "generatedAt": "2026-06-01T10:30:00.000Z", + "artifacts": [ + "risky-audit.json", + "clean-audit.json", + "risky-review.md", + "summary.svg", + "demo.mp4" + ], + "riskyStatus": "HOLD", + "cleanStatus": "READY", + "riskyFingerprint": "e036107e72f70a7e", + "cleanFingerprint": "aa2c187bd4b36628" +} diff --git a/geospatial-spatial-autocorrelation-assistant/reports/risky-audit.json b/geospatial-spatial-autocorrelation-assistant/reports/risky-audit.json new file mode 100644 index 00000000..0ea41478 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/reports/risky-audit.json @@ -0,0 +1,285 @@ +{ + "generatedAt": "2026-06-01T10:30:00.000Z", + "status": "HOLD", + "summary": "HOLD: 1 manuscript(s) produced 13 finding(s): 2 critical, 8 high, 3 warning, and 2 research gap prompt(s).", + "findingCounts": { + "critical": 2, + "high": 8, + "warning": 3 + }, + "findings": [ + { + "code": "SPATIAL_SPLIT_LEAKAGE", + "severity": "critical", + "message": "urban-heat-random-split/rf-heat-risk has train/test samples only 0.6 km apart without spatial blocking.", + "evidence": "Policy requires at least 35 km or explicit spatial block validation.", + "path": "manuscripts[0].models[0].splitStrategy", + "remediation": "Use spatial block, leave-site-out, or regional holdout validation and regenerate performance claims.", + "owner": "model reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk" + }, + { + "code": "TEST_SET_TUNING", + "severity": 
"critical", + "message": "urban-heat-random-split/rf-heat-risk tunes model choices on the test/holdout set.", + "evidence": "Reviewer-facing performance claims require a locked final test set.", + "path": "manuscripts[0].models[0].hyperparameterTunedOn", + "remediation": "Move tuning to inner validation folds and rerun the locked test set once.", + "owner": "model reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk" + }, + { + "code": "DATA_MANIFEST_MISSING", + "severity": "high", + "message": "urban-heat-random-split is missing reproducibility artifact dataManifest.", + "evidence": "Geospatial results depend on data, code, and environment parity.", + "path": "manuscripts[0].reproducibilityArtifacts.dataManifest", + "remediation": "Attach a data manifest with sample ids, coordinates, split labels, and hashes.", + "owner": "reproducibility reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": null + }, + { + "code": "ENVIRONMENT_SPEC_MISSING", + "severity": "high", + "message": "urban-heat-random-split is missing reproducibility artifact environmentSpec.", + "evidence": "Geospatial results depend on data, code, and environment parity.", + "path": "manuscripts[0].reproducibilityArtifacts.environmentSpec", + "remediation": "Attach a pinned environment or container digest for spatial libraries.", + "owner": "reproducibility reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": null + }, + { + "code": "FULL_DATASET_PREPROCESSING_LEAKAGE", + "severity": "high", + "message": "urban-heat-random-split/rf-heat-risk fits spatial preprocessing on the full dataset.", + "evidence": "Raster normalization, imputation, or feature selection must be learned inside each training fold.", + "path": "manuscripts[0].models[0].preprocessingFitScope", + "remediation": "Refit preprocessing inside training folds and attach fold-specific transformation hashes.", + "owner": "reproducibility reviewer", + "manuscriptId": 
"urban-heat-random-split", + "modelId": "rf-heat-risk" + }, + { + "code": "HIGH_SPATIAL_AUTOCORRELATION_RANDOM_SPLIT", + "severity": "high", + "message": "urban-heat-random-split/rf-heat-risk reports Moran's I 0.62 with a random split.", + "evidence": "High spatial autocorrelation inflates random train/test validation.", + "path": "manuscripts[0].models[0].moransI", + "remediation": "Run spatial block cross-validation or leave-region-out validation before presenting performance as reviewer-ready.", + "owner": "spatial statistics reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk" + }, + { + "code": "MISSING_CRS_EVIDENCE", + "severity": "high", + "message": "urban-heat-random-split does not declare a coordinate reference system.", + "evidence": "Spatial distances, joins, and raster overlays cannot be reviewed without CRS evidence.", + "path": "manuscripts[0].spatialDesign.crs", + "remediation": "Declare the source CRS/EPSG code and any analysis projection used for distance or area operations.", + "owner": "geospatial methods reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": null + }, + { + "code": "MISSING_EXTERNAL_SPATIAL_VALIDATION", + "severity": "high", + "message": "urban-heat-random-split/rf-heat-risk lacks external spatial validation for broader deployment claims.", + "evidence": "Broad geographic or deployment claims should be checked outside the training geography.", + "path": "manuscripts[0].models[0].externalValidationSites", + "remediation": "Add an out-of-region validation site or limit the manuscript claim to the sampled geography.", + "owner": "methods reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk" + }, + { + "code": "OVERBROAD_GEOGRAPHIC_CLAIM", + "severity": "high", + "message": "urban-heat-random-split makes a broad geographic claim with only 1 observed region(s).", + "evidence": "The model generalizes across continental urban heat islands.", + "path": 
"manuscripts[0].claims[0]", + "remediation": "Limit the claim to sampled regions or add external validation sites covering the claimed geography.", + "owner": "methods reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": null + }, + { + "code": "SENSITIVE_COORDINATE_OVERPRECISION", + "severity": "high", + "message": "urban-heat-random-split exposes sensitive locations at 6 decimal places.", + "evidence": "Human-subject or protected-species locations should be generalized before reviewer packets or public summaries.", + "path": "manuscripts[0].spatialDesign.coordinatePrecisionDecimals", + "remediation": "Round or jitter coordinates to 4 decimals or provide an approved restricted-location access path.", + "owner": "privacy reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": null + }, + { + "code": "COVARIATE_SOURCE_MISSING", + "severity": "warning", + "message": "urban-heat-random-split/rf-heat-risk covariate NDVI has no source citation or artifact id.", + "evidence": "Raster/vector covariates should be traceable for reproducibility and recency review.", + "path": "manuscripts[0].models[0].covariates[0].source", + "remediation": "Attach a source DOI, artifact id, or repository path for each spatial covariate.", + "owner": "data steward", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk" + }, + { + "code": "SPATIAL_BLOCK_MAP_MISSING", + "severity": "warning", + "message": "urban-heat-random-split has spatial validation claims without a block map artifact.", + "evidence": "Reviewers need the held-out geometry or block map to audit leakage.", + "path": "manuscripts[0].reproducibilityArtifacts.spatialBlockMap", + "remediation": "Attach a block-map artifact id, geometry hash, or leave-site-out manifest.", + "owner": "geospatial methods reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": null + }, + { + "code": "STALE_COVARIATE_WINDOW", + "severity": "warning", + "message": 
"urban-heat-random-split/rf-heat-risk covariate NDVI spans 540 acquisition days.", + "evidence": "Long covariate windows can hide temporal drift in geospatial models.", + "path": "manuscripts[0].models[0].covariates[0].acquisitionWindowDays", + "remediation": "Use period-matched covariates or report temporal-drift sensitivity checks.", + "owner": "methods reviewer", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk" + } + ], + "reviewDecisions": [ + { + "manuscriptId": "urban-heat-random-split", + "decision": "HOLD", + "reasonCodes": [ + "SPATIAL_SPLIT_LEAKAGE", + "TEST_SET_TUNING", + "DATA_MANIFEST_MISSING", + "ENVIRONMENT_SPEC_MISSING", + "FULL_DATASET_PREPROCESSING_LEAKAGE", + "HIGH_SPATIAL_AUTOCORRELATION_RANDOM_SPLIT", + "MISSING_CRS_EVIDENCE", + "MISSING_EXTERNAL_SPATIAL_VALIDATION", + "OVERBROAD_GEOGRAPHIC_CLAIM", + "SENSITIVE_COORDINATE_OVERPRECISION", + "COVARIATE_SOURCE_MISSING", + "SPATIAL_BLOCK_MAP_MISSING", + "STALE_COVARIATE_WINDOW" + ], + "reproducibilityScore": 0 + } + ], + "researchGapOpportunities": [ + { + "id": "urban-heat-random-split-regional-replication", + "title": "Prioritize out-of-region replication before broad geographic claims", + "rationale": "urban-heat-random-split samples 1 region(s), below the 3-region policy for broad claims.", + "firstAction": "Recruit or simulate a holdout site in the least represented claimed region." + }, + { + "id": "urban-heat-random-split-spatial-validation-gap", + "title": "Add spatial block validation benchmark", + "rationale": "High autocorrelation with random validation means reported accuracy may be optimistic.", + "firstAction": "Create a leave-region-out benchmark and compare it to the random split baseline." 
+ } + ], + "remediationActions": [ + { + "code": "SPATIAL_SPLIT_LEAKAGE", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk", + "owner": "model reviewer", + "action": "Use spatial block, leave-site-out, or regional holdout validation and regenerate performance claims." + }, + { + "code": "TEST_SET_TUNING", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk", + "owner": "model reviewer", + "action": "Move tuning to inner validation folds and rerun the locked test set once." + }, + { + "code": "DATA_MANIFEST_MISSING", + "manuscriptId": "urban-heat-random-split", + "modelId": null, + "owner": "reproducibility reviewer", + "action": "Attach a data manifest with sample ids, coordinates, split labels, and hashes." + }, + { + "code": "ENVIRONMENT_SPEC_MISSING", + "manuscriptId": "urban-heat-random-split", + "modelId": null, + "owner": "reproducibility reviewer", + "action": "Attach a pinned environment or container digest for spatial libraries." + }, + { + "code": "FULL_DATASET_PREPROCESSING_LEAKAGE", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk", + "owner": "reproducibility reviewer", + "action": "Refit preprocessing inside training folds and attach fold-specific transformation hashes." + }, + { + "code": "HIGH_SPATIAL_AUTOCORRELATION_RANDOM_SPLIT", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk", + "owner": "spatial statistics reviewer", + "action": "Run spatial block cross-validation or leave-region-out validation before presenting performance as reviewer-ready." + }, + { + "code": "MISSING_CRS_EVIDENCE", + "manuscriptId": "urban-heat-random-split", + "modelId": null, + "owner": "geospatial methods reviewer", + "action": "Declare the source CRS/EPSG code and any analysis projection used for distance or area operations." 
+ }, + { + "code": "MISSING_EXTERNAL_SPATIAL_VALIDATION", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk", + "owner": "methods reviewer", + "action": "Add an out-of-region validation site or limit the manuscript claim to the sampled geography." + }, + { + "code": "OVERBROAD_GEOGRAPHIC_CLAIM", + "manuscriptId": "urban-heat-random-split", + "modelId": null, + "owner": "methods reviewer", + "action": "Limit the claim to sampled regions or add external validation sites covering the claimed geography." + }, + { + "code": "SENSITIVE_COORDINATE_OVERPRECISION", + "manuscriptId": "urban-heat-random-split", + "modelId": null, + "owner": "privacy reviewer", + "action": "Round or jitter coordinates to 4 decimals or provide an approved restricted-location access path." + }, + { + "code": "COVARIATE_SOURCE_MISSING", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk", + "owner": "data steward", + "action": "Attach a source DOI, artifact id, or repository path for each spatial covariate." + }, + { + "code": "SPATIAL_BLOCK_MAP_MISSING", + "manuscriptId": "urban-heat-random-split", + "modelId": null, + "owner": "geospatial methods reviewer", + "action": "Attach a block-map artifact id, geometry hash, or leave-site-out manifest." + }, + { + "code": "STALE_COVARIATE_WINDOW", + "manuscriptId": "urban-heat-random-split", + "modelId": "rf-heat-risk", + "owner": "methods reviewer", + "action": "Use period-matched covariates or report temporal-drift sensitivity checks." 
+ } + ], + "fingerprint": "e036107e72f70a7e" +} diff --git a/geospatial-spatial-autocorrelation-assistant/reports/risky-review.md b/geospatial-spatial-autocorrelation-assistant/reports/risky-review.md new file mode 100644 index 00000000..e4579b20 --- /dev/null +++ b/geospatial-spatial-autocorrelation-assistant/reports/risky-review.md @@ -0,0 +1,64 @@ +# Geospatial Spatial-Autocorrelation Review Assistant + +Packet: geo-review-risky-2026-06 +Status: HOLD +Fingerprint: e036107e72f70a7e + +## Summary + +HOLD: 1 manuscript(s) produced 13 finding(s): 2 critical, 8 high, 3 warning, and 2 research gap prompt(s). + +## Manuscript Decisions + +- urban-heat-random-split: HOLD; reproducibility score 0/100; 13 finding(s) + +## Findings + +- CRITICAL SPATIAL_SPLIT_LEAKAGE: urban-heat-random-split/rf-heat-risk has train/test samples only 0.6 km apart without spatial blocking. + - Evidence: Policy requires at least 35 km or explicit spatial block validation. + - Remediation: Use spatial block, leave-site-out, or regional holdout validation and regenerate performance claims. +- CRITICAL TEST_SET_TUNING: urban-heat-random-split/rf-heat-risk tunes model choices on the test/holdout set. + - Evidence: Reviewer-facing performance claims require a locked final test set. + - Remediation: Move tuning to inner validation folds and rerun the locked test set once. +- HIGH DATA_MANIFEST_MISSING: urban-heat-random-split is missing reproducibility artifact dataManifest. + - Evidence: Geospatial results depend on data, code, and environment parity. + - Remediation: Attach a data manifest with sample ids, coordinates, split labels, and hashes. +- HIGH ENVIRONMENT_SPEC_MISSING: urban-heat-random-split is missing reproducibility artifact environmentSpec. + - Evidence: Geospatial results depend on data, code, and environment parity. + - Remediation: Attach a pinned environment or container digest for spatial libraries. 
+- HIGH FULL_DATASET_PREPROCESSING_LEAKAGE: urban-heat-random-split/rf-heat-risk fits spatial preprocessing on the full dataset. + - Evidence: Raster normalization, imputation, or feature selection must be learned inside each training fold. + - Remediation: Refit preprocessing inside training folds and attach fold-specific transformation hashes. +- HIGH HIGH_SPATIAL_AUTOCORRELATION_RANDOM_SPLIT: urban-heat-random-split/rf-heat-risk reports Moran's I 0.62 with a random split. + - Evidence: High spatial autocorrelation inflates random train/test validation. + - Remediation: Run spatial block cross-validation or leave-region-out validation before presenting performance as reviewer-ready. +- HIGH MISSING_CRS_EVIDENCE: urban-heat-random-split does not declare a coordinate reference system. + - Evidence: Spatial distances, joins, and raster overlays cannot be reviewed without CRS evidence. + - Remediation: Declare the source CRS/EPSG code and any analysis projection used for distance or area operations. +- HIGH MISSING_EXTERNAL_SPATIAL_VALIDATION: urban-heat-random-split/rf-heat-risk lacks external spatial validation for broader deployment claims. + - Evidence: Broad geographic or deployment claims should be checked outside the training geography. + - Remediation: Add an out-of-region validation site or limit the manuscript claim to the sampled geography. +- HIGH OVERBROAD_GEOGRAPHIC_CLAIM: urban-heat-random-split makes a broad geographic claim with only 1 observed region(s). + - Evidence: The model generalizes across continental urban heat islands. + - Remediation: Limit the claim to sampled regions or add external validation sites covering the claimed geography. +- HIGH SENSITIVE_COORDINATE_OVERPRECISION: urban-heat-random-split exposes sensitive locations at 6 decimal places. + - Evidence: Human-subject or protected-species locations should be generalized before reviewer packets or public summaries. 
+ - Remediation: Round or jitter coordinates to 4 decimals or provide an approved restricted-location access path. +- WARNING COVARIATE_SOURCE_MISSING: urban-heat-random-split/rf-heat-risk covariate NDVI has no source citation or artifact id. + - Evidence: Raster/vector covariates should be traceable for reproducibility and recency review. + - Remediation: Attach a source DOI, artifact id, or repository path for each spatial covariate. +- WARNING SPATIAL_BLOCK_MAP_MISSING: urban-heat-random-split has spatial validation claims without a block map artifact. + - Evidence: Reviewers need the held-out geometry or block map to audit leakage. + - Remediation: Attach a block-map artifact id, geometry hash, or leave-site-out manifest. +- WARNING STALE_COVARIATE_WINDOW: urban-heat-random-split/rf-heat-risk covariate NDVI spans 540 acquisition days. + - Evidence: Long covariate windows can hide temporal drift in geospatial models. + - Remediation: Use period-matched covariates or report temporal-drift sensitivity checks. + +## Research Gap Opportunities + +- urban-heat-random-split-regional-replication: Prioritize out-of-region replication before broad geographic claims + - Rationale: urban-heat-random-split samples 1 region(s), below the 3-region policy for broad claims. + - First action: Recruit or simulate a holdout site in the least represented claimed region. +- urban-heat-random-split-spatial-validation-gap: Add spatial block validation benchmark + - Rationale: High autocorrelation with random validation means reported accuracy may be optimistic. + - First action: Create a leave-region-out benchmark and compare it to the random split baseline. 
diff --git a/geospatial-spatial-autocorrelation-assistant/reports/summary.svg b/geospatial-spatial-autocorrelation-assistant/reports/summary.svg
new file mode 100644
index 00000000..1e389e92
--- /dev/null
+++ b/geospatial-spatial-autocorrelation-assistant/reports/summary.svg
@@ -0,0 +1,13 @@
+
+
+
+Geospatial review assistant
+Status HOLD - fingerprint e036107e72f70a7e
+
+
+
+SPATIAL QA
+Critical/high blockers: 10
+Research gaps: 2
+Manuscripts checked: 1
+
\ No newline at end of file
diff --git a/geospatial-spatial-autocorrelation-assistant/sample-data.js b/geospatial-spatial-autocorrelation-assistant/sample-data.js
new file mode 100644
index 00000000..60fd69d9
--- /dev/null
+++ b/geospatial-spatial-autocorrelation-assistant/sample-data.js
@@ -0,0 +1,122 @@
+"use strict";
+// Synthetic review packets used as fixtures: one deliberately risky manuscript and one clean manuscript.
+const riskyPacket = {
+  id: "geo-review-risky-2026-06",
+  policy: {
+    minSpatialHoldoutKm: 35, // the committed risky audit cites this as the required train/test separation
+    highMoransI: 0.35, // the risky model reports 0.62 and is flagged in the committed audit
+    maxSensitivePrecisionDecimals: 4, // the risky manuscript uses 6 decimals and is flagged
+    minRegionsForBroadClaims: 3, // the risky samples cover a single region
+    maxCovariateWindowDays: 365 // the 540-day NDVI window is flagged as stale
+  },
+  manuscripts: [
+    {
+      id: "urban-heat-random-split",
+      title: "Continental urban heat risk from neighborhood satellite features",
+      field: "environmental epidemiology",
+      sensitivity: "human-subjects",
+      spatialDesign: {
+        crs: "",
+        projection: "web map tiles",
+        coordinatePrecisionDecimals: 6,
+        samplingFrame: "three volunteer neighborhoods"
+      },
+      claims: [
+        {
+          id: "claim-generalization",
+          scope: "continental",
+          claimedRegions: ["Northeast", "Midwest", "South", "West"],
+          text: "The model generalizes across continental urban heat islands."
+        }
+      ],
+      samples: [
+        { id: "s-001", lat: 40.712776, lon: -74.005974, split: "train", region: "Northeast", site: "NYC-A" },
+        { id: "s-002", lat: 40.734112, lon: -73.98742, split: "test", region: "Northeast", site: "NYC-B" },
+        { id: "s-003", lat: 40.75891, lon: -73.98513, split: "train", region: "Northeast", site: "NYC-C" },
+        { id: "s-004", lat: 40.76172, lon: -73.97864, split: "test", region: "Northeast", site: "NYC-D" }
+      ],
+      models: [
+        {
+          id: "rf-heat-risk",
+          splitStrategy: "random",
+          moransI: 0.62,
+          preprocessingFitScope: "full_dataset",
+          hyperparameterTunedOn: "test",
+          deploymentContext: "national heat-risk triage",
+          spatialCovariates: true,
+          externalValidationSites: [],
+          covariates: [
+            { name: "NDVI", source: "", resolutionMeters: 1000, acquisitionWindowDays: 540 },
+            { name: "impervious_surface", source: "city-open-data:impervious-v1", resolutionMeters: 30, acquisitionWindowDays: 90 }
+          ]
+        }
+      ],
+      reproducibilityArtifacts: {
+        dataManifest: "",
+        codeCommit: "2f7c91e",
+        environmentSpec: "",
+        spatialBlockMap: ""
+      }
+    }
+  ]
+};
+// Clean counterpart: the committed reports/clean-audit.json records READY with no findings for this packet.
+const cleanPacket = {
+  id: "geo-review-clean-2026-06",
+  policy: riskyPacket.policy, // same object as riskyPacket.policy (shared by reference, not copied)
+  manuscripts: [
+    {
+      id: "rangeland-blocked-validation",
+      title: "Regional rangeland recovery forecasts with blocked spatial validation",
+      field: "ecology",
+      sensitivity: "public-environmental",
+      spatialDesign: {
+        crs: "EPSG:4326 WGS84 source coordinates; EPSG:5070 equal-area analysis projection",
+        projection: "EPSG:5070",
+        coordinatePrecisionDecimals: 3,
+        samplingFrame: "blocked stratified ecological sites"
+      },
+      claims: [
+        {
+          id: "claim-regional",
+          scope: "regional",
+          claimedRegions: ["Colorado Front Range", "New Mexico Plateau", "Utah Basin"],
+          text: "The blocked model supports regional recovery forecasts for sampled western rangeland systems."
+        }
+      ],
+      samples: [
+        { id: "co-001", lat: 39.739, lon: -104.99, split: "train", region: "Colorado Front Range", site: "CO-A" },
+        { id: "co-002", lat: 39.231, lon: -105.02, split: "train", region: "Colorado Front Range", site: "CO-B" },
+        { id: "nm-001", lat: 35.084, lon: -106.65, split: "test", region: "New Mexico Plateau", site: "NM-A" },
+        { id: "ut-001", lat: 40.760, lon: -111.89, split: "test", region: "Utah Basin", site: "UT-A" }
+      ],
+      models: [
+        {
+          id: "blocked-gbm-recovery",
+          splitStrategy: "spatial_block_leave_region_out",
+          moransI: 0.18,
+          preprocessingFitScope: "training_fold",
+          hyperparameterTunedOn: "inner_validation",
+          deploymentContext: "",
+          spatialCovariates: true,
+          externalValidationSites: ["New Mexico Plateau", "Utah Basin"],
+          covariates: [
+            { name: "soil_moisture", source: "doi:10.1234/soil-moisture-v3", resolutionMeters: 250, acquisitionWindowDays: 30 },
+            { name: "burn_severity", source: "artifact:burn-severity-2026-05", resolutionMeters: 30, acquisitionWindowDays: 12 }
+          ]
+        }
+      ],
+      reproducibilityArtifacts: {
+        dataManifest: "artifact:geo-sample-manifest-v2",
+        codeCommit: "d6b0e3c",
+        environmentSpec: "container:ghcr.io/scibase/geo-review@sha256:abc123",
+        spatialBlockMap: "artifact:block-map-v2"
+      }
+    }
+  ]
+};
+// Both fixtures are required by demo.js and test.js.
+module.exports = {
+  riskyPacket,
+  cleanPacket
+};
diff --git a/geospatial-spatial-autocorrelation-assistant/test.js b/geospatial-spatial-autocorrelation-assistant/test.js
new file mode 100644
index 00000000..9628888f
--- /dev/null
+++ b/geospatial-spatial-autocorrelation-assistant/test.js
@@ -0,0 +1,60 @@
+"use strict";
+// Plain-assert test script (package.json runs it as node test.js); the first failed assertion throws.
+const assert = require("node:assert/strict");
+const {
+  evaluateGeospatialReviewPacket,
+  renderMarkdownReport,
+  renderSvgSummary,
+  haversineKm
+} = require("./index");
+const { riskyPacket, cleanPacket } = require("./sample-data");
+// A null packet must be rejected with an error whose message matches the pattern below.
+assert.throws(() => evaluateGeospatialReviewPacket(null), /expects a packet object/);
+// Risky fixture: expect HOLD plus the leakage, CRS, autocorrelation and test-set-tuning findings.
+const risky = evaluateGeospatialReviewPacket(riskyPacket, { now: 
"2026-06-01T10:30:00.000Z" });
+assert.equal(risky.status, "HOLD");
+assert.equal(risky.reviewDecisions[0].decision, "HOLD");
+assert.ok(risky.findings.some((item) => item.code === "SPATIAL_SPLIT_LEAKAGE"));
+assert.ok(risky.findings.some((item) => item.code === "MISSING_CRS_EVIDENCE"));
+assert.ok(risky.findings.some((item) => item.code === "HIGH_SPATIAL_AUTOCORRELATION_RANDOM_SPLIT"));
+assert.ok(risky.findings.some((item) => item.code === "TEST_SET_TUNING"));
+assert.ok(risky.researchGapOpportunities.length >= 2);
+assert.ok(risky.reviewDecisions[0].reproducibilityScore < 50);
+// The fingerprint must be identical when only the now timestamp differs.
+const riskyRepeat = evaluateGeospatialReviewPacket(riskyPacket, { now: "2026-06-01T10:31:00.000Z" });
+assert.equal(risky.fingerprint, riskyRepeat.fingerprint);
+// Clean fixture: expect READY, zero findings and a reproducibility score of 100.
+const clean = evaluateGeospatialReviewPacket(cleanPacket, { now: "2026-06-01T10:30:00.000Z" });
+assert.equal(clean.status, "READY");
+assert.equal(clean.findings.length, 0);
+assert.equal(clean.reviewDecisions[0].reproducibilityScore, 100);
+// Minimal packet, without a policy, whose only sample has lat 110; it must yield INVALID_COORDINATE.
+const invalidCoordinatePacket = {
+  id: "invalid-coordinate",
+  manuscripts: [
+    {
+      id: "bad-coordinate",
+      spatialDesign: { crs: "EPSG:4326", coordinatePrecisionDecimals: 2 },
+      claims: [],
+      samples: [{ id: "bad", lat: 110, lon: -74, split: "train", region: "X" }],
+      models: [],
+      reproducibilityArtifacts: { dataManifest: "m", codeCommit: "c", environmentSpec: "e" }
+    }
+  ]
+};
+const invalid = evaluateGeospatialReviewPacket(invalidCoordinatePacket, { now: "2026-06-01T10:30:00.000Z" });
+assert.ok(invalid.findings.some((item) => item.code === "INVALID_COORDINATE"));
+// The markdown report must carry the title fragment, a finding code and the research-gap heading.
+const markdown = renderMarkdownReport(risky, riskyPacket);
+assert.ok(markdown.includes("Spatial-Autocorrelation"));
+assert.ok(markdown.includes("SPATIAL_SPLIT_LEAKAGE"));
+assert.ok(markdown.includes("Research Gap Opportunities"));
+// NOTE(review): the assertion after renderSvgSummary is truncated in this text (distance is never declared, haversineKm is never called); restore it from the original test.js.
+const svg = renderSvgSummary(risky);
+assert.ok(svg.includes(" 2 && distance < 4);
+
+console.log("All geospatial spatial-autocorrelation assistant tests passed.");