From d62ff1da88a29c6e25053a3a3f8b64428d40ce70 Mon Sep 17 00:00:00 2001 From: luisleo526 Date: Thu, 2 Jul 2026 22:53:49 +0800 Subject: [PATCH 1/2] harness: derive chart feeds from the corpus's single committed 1m feed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The corpus now ships exactly one feed: data/ohlcv_ETH-USDT-USDT_1m.csv (full-history Binance ETH perp 1m, Git LFS). New scripts/derive_corpus_feeds.py materializes the 15m chart feeds into corpus/data/derived/ (gitignored): a full-history 900s resample (the default chart feed) and its comparison-window slice (cold-start probes, benchmark runners, window-bounds reference). Derivation is idempotent (mtime-gated), pure-local, and guards against unsmudged LFS pointers. - run_strategy.py: feed constants point at the derived files; ensure_derived() runs at main() entry (import stays side-effect free for ABI-mirror consumers like crossvalidate_metrics). - run_corpus.sh: derive step before the build. - crossvalidate_metrics.py: ensure_derived() after arg parse. - benchmarks: corpus fallback paths now the derived window slice, so bench inputs stay bar-identical with the historical baseline. Native-15m engine execution is unchanged — the resample was verified bar-identical to the old committed 15m files except two warmup bars on the 2024-10-28 outage day, one 0.01 in-window open (Binance's 15m kline disagrees with its own 1m klines during a flat outage), and a fuller final partial bucket. Gate: ctest 78/78; corpus 252/252 ok, excellent=251/anomaly=1 exact. 
Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01CnvqmHPmgUpeu2fz6A1mMU --- CONTRIBUTING.md | 8 +- benchmarks/compare.py | 6 +- .../runners/run_pineforge_canonical.cpp | 2 +- benchmarks/runners/run_pinets_canonical.mjs | 2 +- benchmarks/runners/run_pynecore.py | 3 +- scripts/crossvalidate_metrics.py | 1 + scripts/derive_corpus_feeds.py | 114 ++++++++++++++++++ scripts/run_corpus.sh | 9 +- scripts/run_strategy.py | 16 ++- 9 files changed, 148 insertions(+), 13 deletions(-) create mode 100644 scripts/derive_corpus_feeds.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index beab26f..69187a2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -158,9 +158,11 @@ bash scripts/run_corpus.sh ``` It builds every `corpus/validation//generated.cpp` into a -`strategy.dylib` / `strategy.so`, runs each against -`corpus/data/ohlcv_ETH-USDT-USDT_15m_warmup6m.csv` when present -(falling back to `corpus/data/ohlcv_ETH-USDT-USDT_15m.csv`), and +`strategy.dylib` / `strategy.so`, runs each against the 15m chart feed +derived from the corpus's single committed 1m feed +(`corpus/data/ohlcv_ETH-USDT-USDT_1m.csv`, Git LFS; the harness +materializes `corpus/data/derived/ohlcv_ETH-USDT-USDT_15m.csv` via +`scripts/derive_corpus_feeds.py`), and rewrites the regenerated `engine_trades.csv` files. It also prints a canonical `scripts/verify_corpus.py --all --quiet` summary with the five parity labels (`excellent`, `strong`, `moderate`, `weak`, `minimal`). diff --git a/benchmarks/compare.py b/benchmarks/compare.py index 54a7dfd..3372494 100755 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -67,11 +67,13 @@ # OHLCV resolution order (first existing wins): # 1. DATA/ETHUSDT_15.csv — snapshot (paths: benchmarks/assets/data or benchmarks/data) # 2. benchmarks/_workdir/data/ETHUSDT_15.csv — working copy from run_all.sh -# 3. corpus/data/ohlcv_ETH-USDT-USDT_15m.csv — fallback +# 3. 
corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv — fallback +# (derived from the single committed 1m feed by +# scripts/derive_corpus_feeds.py) _CANDIDATE_OHLCV = [ DATA / "ETHUSDT_15.csv", BENCH_DIR / "_workdir" / "data" / "ETHUSDT_15.csv", - REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_15m.csv", + REPO_ROOT / "corpus" / "data" / "derived" / "ohlcv_ETH-USDT-USDT_15m_window.csv", ] OHLCV_PATH = next((p for p in _CANDIDATE_OHLCV if p.exists()), _CANDIDATE_OHLCV[-1]) diff --git a/benchmarks/runners/run_pineforge_canonical.cpp b/benchmarks/runners/run_pineforge_canonical.cpp index 66f03e3..c5b332f 100755 --- a/benchmarks/runners/run_pineforge_canonical.cpp +++ b/benchmarks/runners/run_pineforge_canonical.cpp @@ -68,7 +68,7 @@ static std::string fmt(double v) { int main(int argc, char** argv) { const char* in_path = argc > 1 ? argv[1] - : "../corpus/data/ohlcv_ETH-USDT-USDT_15m.csv"; + : "../corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv"; const char* out_path = argc > 2 ? argv[2] : "strategies/_indicators/canonical_pineforge.csv"; diff --git a/benchmarks/runners/run_pinets_canonical.mjs b/benchmarks/runners/run_pinets_canonical.mjs index 881e181..7747387 100755 --- a/benchmarks/runners/run_pinets_canonical.mjs +++ b/benchmarks/runners/run_pinets_canonical.mjs @@ -25,7 +25,7 @@ const candidates = [ resolve(REPO, 'benchmarks/assets/data/ETHUSDT_15.csv'), resolve(REPO, 'benchmarks/data/ETHUSDT_15.csv'), resolve(REPO, 'benchmarks/_workdir/data/ETHUSDT_15.csv'), - resolve(REPO, 'corpus/data/ohlcv_ETH-USDT-USDT_15m.csv'), + resolve(REPO, 'corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv'), ]; const csvPath = candidates.find(existsSync) ?? 
candidates.at(-1); console.log(`pinets: using OHLCV ${csvPath.replace(REPO + '/', '')}`); diff --git a/benchmarks/runners/run_pynecore.py b/benchmarks/runners/run_pynecore.py index c4c8f11..bd8aa41 100755 --- a/benchmarks/runners/run_pynecore.py +++ b/benchmarks/runners/run_pynecore.py @@ -15,7 +15,8 @@ {strategy_dir}/pynecore_stats.csv — strategy stats (verbatim from pyne) The CLI invokes the locally-installed `pyne run` against the corpus -OHLCV (`corpus/data/ohlcv_ETH-USDT-USDT_15m.csv`, pre-converted to +OHLCV (`corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv`, +derived from the committed 1m feed, pre-converted to PyneCore's `.ohlcv` format under `benchmarks/_workdir/data/`). It then re-emits the resulting trade list in PineForge's TV-mirror schema — same column names, same exit-then-entry row order, same reverse- diff --git a/scripts/crossvalidate_metrics.py b/scripts/crossvalidate_metrics.py index 6735006..8c18689 100644 --- a/scripts/crossvalidate_metrics.py +++ b/scripts/crossvalidate_metrics.py @@ -702,6 +702,7 @@ def main() -> int: "generated.cpp, corpus convention 1e6; pf_report_t " "does not expose it)") args = ap.parse_args() + rs.ensure_derived() if args.all: return crossvalidate_all(args.corpus_root.resolve(), args.ohlcv.resolve()) if args.strategy_dir is None: diff --git a/scripts/derive_corpus_feeds.py b/scripts/derive_corpus_feeds.py new file mode 100644 index 0000000..22c416e --- /dev/null +++ b/scripts/derive_corpus_feeds.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""Materialize the derived corpus feeds from the single committed 1m feed. + +The corpus ships exactly ONE reference feed: + corpus/data/ohlcv_ETH-USDT-USDT_1m.csv (Git LFS) +1-minute Binance ETH-USDT-USDT perp bars, full exchange history +(2020-01-01 onward) through the end of the comparison window. 
+ +Everything else the harnesses consume is derived deterministically from +it into corpus/data/derived/ (gitignored): + + ohlcv_ETH-USDT-USDT_15m.csv 900s resample, full history — + the default chart feed + ohlcv_ETH-USDT-USDT_15m_window.csv comparison-window slice of the + above — cold-start probes and + benchmark runners (bench inputs + must stay historically + comparable), and the harness's + window-bounds fallback + +Resample rule: open=first, high=max, low=min, close=last, volume=sum +(rounded to 6dp), timestamp=bucket start; a trailing partial bucket is +kept. Idempotent: files are only rewritten when missing or older than +the source feed. Import ensure_derived() or run as a script. +""" +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +SOURCE_1M = REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_1m.csv" +DERIVED_DIR = REPO_ROOT / "corpus" / "data" / "derived" +DERIVED_15M = DERIVED_DIR / "ohlcv_ETH-USDT-USDT_15m.csv" +DERIVED_15M_WINDOW = DERIVED_DIR / "ohlcv_ETH-USDT-USDT_15m_window.csv" + +# Comparison window (epoch ms, inclusive), pinned from the historical +# window-only 15m feed the corpus used to ship: first bar 2025-04-20 +# 21:00 UTC, last bar 2026-05-04 15:00 UTC bucket. +WINDOW_START_MS = 1745182800000 +WINDOW_END_MS = 1777906800000 + +HEADER = "timestamp,open,high,low,close,volume" + + +def _fmt(v: float) -> str: + """Shortest-repr float, integral values written bare (2382 not 2382.0) + — matches the committed feed's formatting.""" + if v == int(v) and abs(v) < 1e15: + return str(int(v)) + return repr(v) + + +def _stale(target: Path, source: Path) -> bool: + return (not target.exists() + or target.stat().st_mtime < source.stat().st_mtime) + + +def ensure_derived(verbose: bool = False) -> None: + """Create/refresh the derived feeds. Cheap no-op when up to date.""" + if not SOURCE_1M.exists(): + raise FileNotFoundError( + f"{SOURCE_1M} missing — is the corpus submodule checked out " + "with git-lfs installed? 
(file should be ~176 MB, not a " + "small LFS pointer)") + if SOURCE_1M.stat().st_size < 1_000_000: + raise RuntimeError( + f"{SOURCE_1M} is suspiciously small — likely an unsmudged " + "Git LFS pointer. Run: git lfs install && git lfs pull " + "(inside the corpus submodule).") + if not (_stale(DERIVED_15M, SOURCE_1M) + or _stale(DERIVED_15M_WINDOW, SOURCE_1M)): + return + + if verbose: + print(f"[derive] resampling {SOURCE_1M.name} -> 15m ...") + buckets = [] # [ts, o, h, l, c, v] per 900s bucket, in order + cur_key = None + with SOURCE_1M.open() as fh: + next(fh) # header + for line in fh: + ts_s, o, h, l, c, v = line.rstrip("\n").split(",") + ts = int(ts_s) + key = ts - ts % 900_000 + if key != cur_key: + buckets.append([key, o, float(h), float(l), c, float(v)]) + cur_key = key + else: + b = buckets[-1] + hf, lf = float(h), float(l) + if hf > b[2]: + b[2] = hf + if lf < b[3]: + b[3] = lf + b[4] = c + b[5] += float(v) + + DERIVED_DIR.mkdir(parents=True, exist_ok=True) + full_lines = [HEADER] + window_lines = [HEADER] + for ts, o, h, l, c, v in buckets: + row = f"{ts},{o},{_fmt(h)},{_fmt(l)},{c},{_fmt(round(v, 6))}" + full_lines.append(row) + if WINDOW_START_MS <= ts <= WINDOW_END_MS: + window_lines.append(row) + for path, lines in ((DERIVED_15M, full_lines), + (DERIVED_15M_WINDOW, window_lines)): + tmp = path.with_suffix(".csv.new") + tmp.write_text("\n".join(lines) + "\n") + tmp.replace(path) + if verbose: + print(f"[derive] wrote {path.relative_to(REPO_ROOT)} " + f"({len(lines) - 1} bars)") + + +if __name__ == "__main__": + ensure_derived(verbose=True) diff --git a/scripts/run_corpus.sh b/scripts/run_corpus.sh index 4dff840..8d8c430 100755 --- a/scripts/run_corpus.sh +++ b/scripts/run_corpus.sh @@ -45,7 +45,14 @@ Run: git submodule update --init corpus (the TV validation corpus is a PUBLIC submodule: https://github.com/pineforge-4pass/pineforge-corpus)" fi -# --- 0) (optional) regenerate generated.cpp from strategy.pine -------- +# --- 0) derive chart feeds 
from the single committed 1m feed ---------- +# The corpus ships one Git-LFS feed (full-history 1m); the 15m chart +# feeds live in corpus/data/derived/ and are rebuilt here when stale. + +log "materializing derived corpus feeds" +"$PY" scripts/derive_corpus_feeds.py + +# --- 0b) (optional) regenerate generated.cpp from strategy.pine ------- # REGEN=1 re-derives every corpus/*/*/generated.cpp from its strategy.pine # through the pineforge-release Docker image (engine + bundled transpiler), so # the build below compiles freshly-transpiled C++ instead of the committed copy. diff --git a/scripts/run_strategy.py b/scripts/run_strategy.py index d0e0b81..c915ea5 100644 --- a/scripts/run_strategy.py +++ b/scripts/run_strategy.py @@ -22,7 +22,7 @@ # Custom OHLCV input python scripts/run_strategy.py corpus/basic/greedy \\ - --ohlcv corpus/data/ohlcv_ETH-USDT-USDT_15m.csv + --ohlcv corpus/data/derived/ohlcv_ETH-USDT-USDT_15m_window.csv # Don't overwrite engine_trades.csv if it already exists python scripts/run_strategy.py corpus/basic/greedy --no-overwrite @@ -52,9 +52,15 @@ from pathlib import Path REPO_ROOT = Path(__file__).resolve().parent.parent -REFERENCE_OHLCV = REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_15m.csv" -WARMUP_OHLCV = REPO_ROOT / "corpus" / "data" / "ohlcv_ETH-USDT-USDT_15m_warmup6m.csv" -DEFAULT_OHLCV = WARMUP_OHLCV if WARMUP_OHLCV.exists() else REFERENCE_OHLCV +# The corpus ships a single committed feed (full-history 1m, Git LFS); +# the 15m chart feeds are derived from it locally. ensure_derived() is +# called from main() — importing this module stays side-effect free for +# consumers that only want the ABI mirrors. +from derive_corpus_feeds import ( # noqa: E402 + DERIVED_15M, DERIVED_15M_WINDOW, ensure_derived) +REFERENCE_OHLCV = DERIVED_15M_WINDOW +WARMUP_OHLCV = DERIVED_15M +DEFAULT_OHLCV = WARMUP_OHLCV # Keys in inputs.json that are validator/harness metadata, not Pine input() # values. 
Mirrors the canonical validator's VALIDATION_INPUT_META_KEYS so @@ -1168,6 +1174,8 @@ def main() -> int: "$PINEFORGE_RELEASE_IMAGE or ghcr .../pineforge-release:latest).") args = ap.parse_args() + ensure_derived() + strategy_dir = args.strategy_dir.resolve() out_path = (args.output.resolve() if args.output else strategy_dir / "engine_trades.csv") From 8bb9abc57c015e89cd2d45419c439dc9fa74f0a7 Mon Sep 17 00:00:00 2001 From: luisleo526 Date: Thu, 2 Jul 2026 22:56:49 +0800 Subject: [PATCH 2/2] corpus: bump submodule to single-1m-feed layout (a84307d) Corpus PR #5: one committed feed (data/ohlcv_ETH-USDT-USDT_1m.csv, full-history 1m via Git LFS); 15m chart feeds derived locally by scripts/derive_corpus_feeds.py; private-infra rebuild script removed. Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01CnvqmHPmgUpeu2fz6A1mMU --- corpus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corpus b/corpus index 9d1d71c..a84307d 160000 --- a/corpus +++ b/corpus @@ -1 +1 @@ -Subproject commit 9d1d71c588d99ea6eb32aaca5e212fd359fa07e0 +Subproject commit a84307d1afd9e02f6eb95b4ae6818ebae6652851