From d62ff1da88a29c6e25053a3a3f8b64428d40ce70 Mon Sep 17 00:00:00 2001
From: luisleo526 <luisleo52655@gmail.com>
Date: Thu, 2 Jul 2026 22:53:49 +0800
Subject: [PATCH 1/2] harness: derive chart feeds from the corpus's single
 committed 1m feed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The corpus now ships exactly one feed: data/ohlcv_ETH-USDT-USDT_1m.csv
(full-history Binance ETH perp 1m, Git LFS). New
scripts/derive_corpus_feeds.py materializes the 15m chart feeds into
corpus/data/derived/ (gitignored): a full-history 900s resample (the
default chart feed) and its comparison-window slice (cold-start probes,
benchmark runners, window-bounds reference). Derivation is idempotent
(mtime-gated), pure-local, and guards against unsmudged LFS pointers.

- run_strategy.py: feed constants point at the derived files;
  ensure_derived() runs at main() entry (import stays side-effect free
  for ABI-mirror consumers like crossvalidate_metrics).
- run_corpus.sh: derive step before the build.
- crossvalidate_metrics.py: ensure_derived() after arg parse.
- benchmarks: corpus fallback paths now the derived window slice, so
  bench inputs stay bar-identical with the historical baseline.

Native-15m engine execution is unchanged — the resample was verified
bar-identical to the old committed 15m files except two warmup bars on
the 2024-10-28 outage day, one 0.01 in-window open (Binance's 15m kline
disagrees with its own 1m klines during a flat outage), and a fuller
final partial bucket. Gate: ctest 78/78; corpus 252/252 ok,
excellent=251/anomaly=1 exact.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01CnvqmHPmgUpeu2fz6A1mMU
---
 CONTRIBUTING.md                               |   8 +-
 benchmarks/compare.py                         |   6 +-
 .../runners/run_pineforge_canonical.cpp       |   2 +-
 benchmarks/runners/run_pinets_canonical.mjs   |   2 +-
 benchmarks/runners/run_pynecore.py            |   3 +-
 scripts/crossvalidate_metrics.py              |   1 +
 scripts/derive_corpus_feeds.py                | 114 ++++++++++++++++++
 scripts/run_corpus.sh                         |   9 +-
 scripts/run_strategy.py                       |  16 ++-
 9 files changed, 148 insertions(+), 13 deletions(-)
 create mode 100644 scripts/derive_corpus_feeds.py
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index beab26f..69187a2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -158,9 +158,11 @@ bash scripts/run_corpus.sh
 ```
 
 It builds every `corpus/validation/<probe>/generated.cpp` into a
-`strategy.dylib` / `strategy.so`, runs each against
-`corpus/data/ohlcv_ETH-USDT-USDT_15m_warmup6m.csv` when present
-(falling back to `corpus/data/ohlcv_ETH-USDT-USDT_15m.csv`), and
+`strategy.dylib` / `strategy.so`, runs each against the 15m chart feed
+derived from the corpus's single committed 1m feed
+(`corpus/data/ohlcv_ETH-USDT-USDT_1m.csv`, Git LFS; the harness
+materializes `corpus/data/derived/ohlcv_ETH-USDT-USDT_15m.csv` via
+`scripts/derive_corpus_feeds.py`), and
 rewrites the regenerated `engine_trades.csv` files. It also prints a
 canonical `scripts/verify_corpus.py --all --quiet` summary with the five
 parity labels (`excellent`, `strong`, `moderate`, `weak`, `minimal`).
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
index 54a7dfd..3372494 100755
--- a/benchmarks/compare.py
+++ b/benchmarks/compare.py
@@ -67,11 +67,13 @@
 # OHLCV resolution order (first existing wins):
 #   1. DATA/ETHUSDT_15.csv — snapshot (paths: benchmarks/assets/data or benchmarks/data)
 #   2. benchmarks/_workdir/data/ETHUSDT_15.csv — working copy from run_all.sh
-#   3. corpus/data/ohlcv_ETH-USDT-USDT_15m.csv — fallback
+#   3. corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv — fallback
+#      (derived from the single committed 1m feed by
+#      scripts/derive_corpus_feeds.py)
 _CANDIDATE_OHLCV = [
     DATA / "ETHUSDT_15.csv",
     BENCH_DIR / "_workdir" / "data" / "ETHUSDT_15.csv",
-    REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_15m.csv",
+    REPO_ROOT / "corpus" / "data" / "derived" / "ohlcv_ETH-USDT-USDT_15m_window.csv",
 ]
 OHLCV_PATH = next((p for p in _CANDIDATE_OHLCV if p.exists()), _CANDIDATE_OHLCV[-1])
 
diff --git a/benchmarks/runners/run_pineforge_canonical.cpp b/benchmarks/runners/run_pineforge_canonical.cpp
index 66f03e3..c5b332f 100755
--- a/benchmarks/runners/run_pineforge_canonical.cpp
+++ b/benchmarks/runners/run_pineforge_canonical.cpp
@@ -68,7 +68,7 @@ static std::string fmt(double v) {
 
 int main(int argc, char** argv) {
     const char* in_path  = argc > 1 ? argv[1]
-        : "../corpus/data/ohlcv_ETH-USDT-USDT_15m.csv";
+        : "../corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv";
     const char* out_path = argc > 2 ? argv[2]
         : "strategies/_indicators/canonical_pineforge.csv";
 
diff --git a/benchmarks/runners/run_pinets_canonical.mjs b/benchmarks/runners/run_pinets_canonical.mjs
index 881e181..7747387 100755
--- a/benchmarks/runners/run_pinets_canonical.mjs
+++ b/benchmarks/runners/run_pinets_canonical.mjs
@@ -25,7 +25,7 @@ const candidates = [
     resolve(REPO, 'benchmarks/assets/data/ETHUSDT_15.csv'),
     resolve(REPO, 'benchmarks/data/ETHUSDT_15.csv'),
     resolve(REPO, 'benchmarks/_workdir/data/ETHUSDT_15.csv'),
-    resolve(REPO, 'corpus/data/ohlcv_ETH-USDT-USDT_15m.csv'),
+    resolve(REPO, 'corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv'),
 ];
 const csvPath = candidates.find(existsSync) ?? candidates.at(-1);
 console.log(`pinets: using OHLCV ${csvPath.replace(REPO + '/', '')}`);
diff --git a/benchmarks/runners/run_pynecore.py b/benchmarks/runners/run_pynecore.py
index c4c8f11..bd8aa41 100755
--- a/benchmarks/runners/run_pynecore.py
+++ b/benchmarks/runners/run_pynecore.py
@@ -15,7 +15,8 @@
     {strategy_dir}/pynecore_stats.csv   — strategy stats (verbatim from pyne)
 
 The CLI invokes the locally-installed `pyne run` against the corpus
-OHLCV (`corpus/data/ohlcv_ETH-USDT-USDT_15m.csv`, pre-converted to
+OHLCV (`corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv`,
+derived from the committed 1m feed, pre-converted to
 PyneCore's `.ohlcv` format under `benchmarks/_workdir/data/`). It then
 re-emits the resulting trade list in PineForge's TV-mirror schema —
 same column names, same exit-then-entry row order, same reverse-
diff --git a/scripts/crossvalidate_metrics.py b/scripts/crossvalidate_metrics.py
index 6735006..8c18689 100644
--- a/scripts/crossvalidate_metrics.py
+++ b/scripts/crossvalidate_metrics.py
@@ -702,6 +702,7 @@ def main() -> int:
                          "generated.cpp, corpus convention 1e6; pf_report_t "
                          "does not expose it)")
     args = ap.parse_args()
+    rs.ensure_derived()
     if args.all:
         return crossvalidate_all(args.corpus_root.resolve(), args.ohlcv.resolve())
     if args.strategy_dir is None:
diff --git a/scripts/derive_corpus_feeds.py b/scripts/derive_corpus_feeds.py
new file mode 100644
index 0000000..22c416e
--- /dev/null
+++ b/scripts/derive_corpus_feeds.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""Materialize the derived corpus feeds from the single committed 1m feed.
+
+The corpus ships exactly ONE reference feed:
+    corpus/data/ohlcv_ETH-USDT-USDT_1m.csv        (Git LFS)
+1-minute Binance ETH-USDT-USDT perp bars, full exchange history
+(2020-01-01 onward) through the end of the comparison window.
+
+Everything else the harnesses consume is derived deterministically from
+it into corpus/data/derived/ (gitignored):
+
+    ohlcv_ETH-USDT-USDT_15m.csv         900s resample, full history —
+                                        the default chart feed
+    ohlcv_ETH-USDT-USDT_15m_window.csv  comparison-window slice of the
+                                        above — cold-start probes and
+                                        benchmark runners (bench inputs
+                                        must stay historically
+                                        comparable), and the harness's
+                                        window-bounds fallback
+
+Resample rule: open=first, high=max, low=min, close=last, volume=sum
+(rounded to 6dp), timestamp=bucket start; a trailing partial bucket is
+kept. Idempotent: both files are rewritten only when either is missing
+or older than the source feed. Import ensure_derived() or run as a script.
+"""
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+SOURCE_1M = REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_1m.csv"
+DERIVED_DIR = REPO_ROOT / "corpus" / "data" / "derived"
+DERIVED_15M = DERIVED_DIR / "ohlcv_ETH-USDT-USDT_15m.csv"
+DERIVED_15M_WINDOW = DERIVED_DIR / "ohlcv_ETH-USDT-USDT_15m_window.csv"
+
+# Comparison window (epoch ms, inclusive), pinned from the historical
+# window-only 15m feed the corpus used to ship: first bar 2025-04-20
+# 21:00 UTC, last bar 2026-05-04 15:00 UTC bucket.
+WINDOW_START_MS = 1745182800000
+WINDOW_END_MS = 1777906800000
+
+HEADER = "timestamp,open,high,low,close,volume"
+
+
+def _fmt(v: float) -> str:
+    """Shortest-repr float, integral values written bare (2382 not 2382.0)
+    — matches the committed feed's formatting."""
+    if v == int(v) and abs(v) < 1e15:  # whole number below 1e15: no ".0"
+        return str(int(v))
+    return repr(v)  # any other value: Python's float repr
+
+
+def _stale(target: Path, source: Path) -> bool:  # True => rebuild target
+    return (not target.exists()  # never generated
+            or target.stat().st_mtime < source.stat().st_mtime)
+
+
+def ensure_derived(verbose: bool = False) -> None:
+    """Create/refresh the derived feeds. Cheap no-op when up to date."""
+    if not SOURCE_1M.exists():
+        raise FileNotFoundError(
+            f"{SOURCE_1M} missing — is the corpus submodule checked out "
+            "with git-lfs installed? (file should be ~176 MB, not a "
+            "small LFS pointer)")
+    if SOURCE_1M.stat().st_size < 1_000_000:  # size gate for pointer stubs
+        raise RuntimeError(
+            f"{SOURCE_1M} is suspiciously small — likely an unsmudged "
+            "Git LFS pointer. Run: git lfs install && git lfs pull "
+            "(inside the corpus submodule).")
+    if not (_stale(DERIVED_15M, SOURCE_1M)
+            or _stale(DERIVED_15M_WINDOW, SOURCE_1M)):
+        return  # both outputs exist and neither is older than the source
+
+    if verbose:
+        print(f"[derive] resampling {SOURCE_1M.name} -> 15m ...")
+    buckets = []  # [ts, o, h, l, c, v] per 900s bucket, in order
+    cur_key = None  # start timestamp of the bucket currently being filled
+    with SOURCE_1M.open() as fh:
+        next(fh)  # header
+        for line in fh:
+            ts_s, o, h, l, c, v = line.rstrip("\n").split(",")
+            ts = int(ts_s)
+            key = ts - ts % 900_000  # floor to the 900 000 ms bucket start
+            # NOTE(review): a key change always opens a new bucket, so rows
+            # are assumed to be time-ordered — confirm for the source feed.
+            if key != cur_key:
+                # open/close stay as the source text; h/l/v become floats
+                buckets.append([key, o, float(h), float(l), c, float(v)])
+                cur_key = key
+            else:
+                b = buckets[-1]
+                hf, lf = float(h), float(l)
+                if hf > b[2]:
+                    b[2] = hf
+                if lf < b[3]:
+                    b[3] = lf
+                b[4] = c  # close = most recent row seen in this bucket
+                b[5] += float(v)
+
+    DERIVED_DIR.mkdir(parents=True, exist_ok=True)
+    full_lines = [HEADER]
+    window_lines = [HEADER]
+    for ts, o, h, l, c, v in buckets:
+        row = f"{ts},{o},{_fmt(h)},{_fmt(l)},{c},{_fmt(round(v, 6))}"
+        full_lines.append(row)
+        if WINDOW_START_MS <= ts <= WINDOW_END_MS:  # both bounds inclusive
+            window_lines.append(row)
+    for path, lines in ((DERIVED_15M, full_lines),
+                        (DERIVED_15M_WINDOW, window_lines)):
+        tmp = path.with_suffix(".csv.new")  # sibling temp file
+        tmp.write_text("\n".join(lines) + "\n")
+        tmp.replace(path)  # rename the finished temp file over the target
+        if verbose:
+            print(f"[derive] wrote {path.relative_to(REPO_ROOT)} "
+                  f"({len(lines) - 1} bars)")
+
+
+if __name__ == "__main__":
+    ensure_derived(verbose=True)
diff --git a/scripts/run_corpus.sh b/scripts/run_corpus.sh
index 4dff840..8d8c430 100755
--- a/scripts/run_corpus.sh
+++ b/scripts/run_corpus.sh
@@ -45,7 +45,14 @@ Run:  git submodule update --init corpus
 (the TV validation corpus is a PUBLIC submodule: https://github.com/pineforge-4pass/pineforge-corpus)"
 fi
 
-# --- 0) (optional) regenerate generated.cpp from strategy.pine --------
+# --- 0) derive chart feeds from the single committed 1m feed ----------
+# The corpus ships one Git-LFS feed (full-history 1m); the 15m chart
+# feeds live in corpus/data/derived/ and are rebuilt here when stale.
+
+log "materializing derived corpus feeds"
+"$PY" scripts/derive_corpus_feeds.py
+
+# --- 0b) (optional) regenerate generated.cpp from strategy.pine -------
 # REGEN=1 re-derives every corpus/*/*/generated.cpp from its strategy.pine
 # through the pineforge-release Docker image (engine + bundled transpiler), so
 # the build below compiles freshly-transpiled C++ instead of the committed copy.
diff --git a/scripts/run_strategy.py b/scripts/run_strategy.py
index d0e0b81..c915ea5 100644
--- a/scripts/run_strategy.py
+++ b/scripts/run_strategy.py
@@ -22,7 +22,7 @@
 
     # Custom OHLCV input
     python scripts/run_strategy.py corpus/basic/greedy \\
-        --ohlcv corpus/data/ohlcv_ETH-USDT-USDT_15m.csv
+        --ohlcv corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv
 
     # Don't overwrite engine_trades.csv if it already exists
     python scripts/run_strategy.py corpus/basic/greedy --no-overwrite
@@ -52,9 +52,15 @@
 from pathlib import Path
 
 REPO_ROOT = Path(__file__).resolve().parent.parent
-REFERENCE_OHLCV = REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_15m.csv"
-WARMUP_OHLCV = REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_15m_warmup6m.csv"
-DEFAULT_OHLCV = WARMUP_OHLCV if WARMUP_OHLCV.exists() else REFERENCE_OHLCV
+# The corpus ships a single committed feed (full-history 1m, Git LFS);
+# the 15m chart feeds are derived from it locally. ensure_derived() is
+# called from main() — importing this module stays side-effect free for
+# consumers that only want the ABI mirrors.
+from derive_corpus_feeds import (  # noqa: E402
+    DERIVED_15M, DERIVED_15M_WINDOW, ensure_derived)
+REFERENCE_OHLCV = DERIVED_15M_WINDOW
+WARMUP_OHLCV = DERIVED_15M
+DEFAULT_OHLCV = WARMUP_OHLCV
 
 # Keys in inputs.json that are validator/harness metadata, not Pine input()
 # values. Mirrors the canonical validator's VALIDATION_INPUT_META_KEYS so
@@ -1168,6 +1174,8 @@ def main() -> int:
                          "$PINEFORGE_RELEASE_IMAGE or ghcr .../pineforge-release:latest).")
     args = ap.parse_args()
 
+    ensure_derived()
+
     strategy_dir = args.strategy_dir.resolve()
     out_path = (args.output.resolve() if args.output
                 else strategy_dir / "engine_trades.csv")

From 8bb9abc57c015e89cd2d45419c439dc9fa74f0a7 Mon Sep 17 00:00:00 2001
From: luisleo526 <luisleo52655@gmail.com>
Date: Thu, 2 Jul 2026 22:56:49 +0800
Subject: [PATCH 2/2] corpus: bump submodule to single-1m-feed layout (a84307d)

Corpus PR #5: one committed feed (data/ohlcv_ETH-USDT-USDT_1m.csv,
full-history 1m via Git LFS); 15m chart feeds derived locally by
scripts/derive_corpus_feeds.py; private-infra rebuild script removed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01CnvqmHPmgUpeu2fz6A1mMU
---
 corpus | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/corpus b/corpus
index 9d1d71c..a84307d 160000
--- a/corpus
+++ b/corpus
@@ -1 +1 @@
-Subproject commit 9d1d71c588d99ea6eb32aaca5e212fd359fa07e0
+Subproject commit a84307d1afd9e02f6eb95b4ae6818ebae6652851