From 90e14714ebf8fd2863e8556b725829ff4af2e5b3 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 26 Apr 2026 16:27:59 -0400 Subject: [PATCH 1/2] Add yatchew_hr_test(null="mean_independence") mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors R YatchewTest::yatchew_test(order=0). Closes the placebo Yatchew R-parity gap from PR #392. - New keyword-only `null: Literal["linearity", "mean_independence"]` on `yatchew_hr_test` (default `"linearity"` is bit-exact backcompat). - `"mean_independence"` fits intercept-only OLS (residuals = dy - mean(dy)); the downstream sigma2_diff / sigma2_W / sort-by-d machinery is shared. - Wired through both unweighted and survey-weighted code paths (4-arm dispatch on (null × weighted)). - `YatchewTestResults` gained `null_form: str = "linearity"` field; `summary()` renders the correct null-hypothesis title; `__repr__` and `to_dict()` updated. - `tests/test_did_had_parity.py::TestYatchewParity` removed the placebo skip; routes effect rows through `null="linearity"` (R order=1) and placebo rows through `null="mean_independence"` (R order=0); both modes share the documented `× G/(G-1)` finite-sample convention shift and parity holds at `atol=1e-10`. - New `TestYatchewHRTestMeanIndependence` class (15 tests) covering happy path, naive Python baseline at `atol=1e-12`, population-variance closed form, invalid value, default backcompat, mode-agnostic tie/constant-d rejection, NaN handling, weighted reduction at w=ones(G) at `atol=1e-14`, weighted non-uniform baseline, default-under-weights, survey×null orthogonality, the (linearity, weighted) baseline (4-arm coverage), zero/replicate-weight rejection, and G<3 mode-agnostic. One additive backcompat case in each of `TestYatchewHRTest` and `TestYatchewHRTestSurvey`. - REGISTRY.md HAD § Yatchew note: TODO marker replaced with shipped description. CHANGELOG.md and TODO.md updated. Patch-level (additive keyword-only kwarg + additive dataclass field with default). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 3 +- TODO.md | 2 +- diff_diff/had_pretests.py | 132 +++++++++++++-- docs/methodology/REGISTRY.md | 2 +- tests/test_did_had_parity.py | 32 ++-- tests/test_had_pretests.py | 301 +++++++++++++++++++++++++++++++++++ 6 files changed, 439 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1797f815..40220bc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - **`ChaisemartinDHaultfoeuille.by_path` is now compatible with `trends_linear` (DID^{fd} group-specific linear trends) and `trends_nonparam` (state-set trends).** For `trends_linear`, the first-differencing transform runs once globally before path enumeration, so per-path raw second-differences `DID^{fd}_{path, l}` surface on `path_effects[path]["horizons"][l]` automatically. Per-path **cumulated level effects** `delta_{path, l} = sum_{l'=1..l} DID^{fd}_{path, l'}` (the quantity R returns under `did_multiplegt_dyn(..., by_path, trends_lin)`) surface on the new `results.path_cumulated_event_study[path][l]` field, mirroring the global `linear_trends_effects` cumulation. `to_dataframe(level="by_path")` exposes `cumulated_effect` / `cumulated_se` columns (always present, NaN-when-None — mirrors the `cband_*` convention from PR #374); `summary()` renders a "Cumulated Level Effects (DID^{fd}, trends_linear)" sub-section under each per-path block. SE on the cumulated layer is the conservative upper bound (sum of per-horizon component SEs, NaN-consistent), matching the global `linear_trends_effects` convention. Path enumeration runs on the post-first-differenced `N_mat_fd`: switchers with `F_g==2` fail the window-eligibility check and are dropped from path enumeration entirely (the existing global `F_g >= 3` warning still surfaces the issue), so a path whose switchers all have `F_g < 3` is silently absent from `path_effects` rather than present-with-NaN. Placebo under `trends_linear` returns RAW per-horizon values — there is no per-path placebo cumulation surface in either Python or R. For `trends_nonparam`, the set membership column is validated and stored once globally as `set_ids_arr`; the `set_ids` parameter is now threaded through the four per-path IF helpers (`_compute_path_effects`, `_compute_path_placebos`, `_collect_path_bootstrap_inputs`, `_collect_path_placebo_bootstrap_inputs`) so per-path analytical SE, bootstrap, placebos, and sup-t bands all consume the set-restricted control pool automatically. Per-period effects remain unadjusted under both extensions, consistent with the existing per-period DID contract. Validated against R via two new golden-value scenarios: `single_baseline_multi_path_by_path_trends_lin` (n_periods=13, F_g >= 4, cohort-single-path; per-path cumulated point estimates match R bit-exactly with `POINT_RTOL=1e-9`, cumulated SE within `CUM_SE_RTOL=0.20`) and `multi_path_reversible_by_path_trends_nonparam` (per-path point estimates AND placebos match R bit-exactly with `POINT_RTOL=1e-9`, per-path SE within `SE_RTOL=0.15`). **F_g=3 boundary-case divergence (`by_path + trends_linear`):** `F_g=3` switchers have only 1 valid pre-window Z value after first-differencing, triggering 30%+ relative divergence between Python and R per-path point estimates on paths whose switchers include `F_g=3`. A targeted `UserWarning` fires at fit-time on this regime; R parity is asserted only on the `F_g >= 4` parity fixture. Placebo parity for `trends_linear` is intentionally skipped (R's per-path placebo computation re-runs on the path-restricted subsample with different control eligibility than Python's global-then-disaggregate architecture surfaces; placebo + `trends_linear` is exercised via internal regression only). Cross-path cohort-sharing SE deviation from R documented for `path_effects` is inherited unchanged. Gates at `chaisemartin_dhaultfoeuille.py:1014-1023` removed; `by_path` docstring updated to add the two new compatibility paragraphs and remove `trends_linear` / `trends_nonparam` from the incompatible list. R-parity tests at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathTrendsLinear` and `::TestDCDHDynRParityByPathTrendsNonparam`; cross-surface regressions at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathTrendsLinear` and `::TestByPathTrendsNonparam`. See `docs/methodology/REGISTRY.md` §ChaisemartinDHaultfoeuille `Note (Phase 3 by_path ...)` → "Per-path linear-trends DID^{fd}" and "Per-path state-set trends" for the full contract. +- **`yatchew_hr_test(null="mean_independence")` mode** mirroring R `YatchewTest::yatchew_test(order=0)`. Adds a `null: Literal["linearity", "mean_independence"]` keyword-only kwarg to `yatchew_hr_test`. Default `"linearity"` is bit-exact backcompat (residuals from OLS `dy = a + b·d + eps`, paper Assumption 8 / Theorem 7). New `"mean_independence"` fits intercept-only OLS (`dy = a + eps`, residuals `= dy - mean(dy)`); the downstream `sigma2_diff` / `sigma2_W` / sort-by-`d` machinery is identical between the two modes. Exposed on both unweighted and survey-weighted code paths (`weights=` / `survey_design=` compose orthogonally with `null=`). Adds a `null_form: str` field to `YatchewTestResults` so `summary()` renders the correct null-hypothesis description; `__repr__` and `to_dict()` updated. Closes the placebo Yatchew R-parity gap from PR #392 — `tests/test_did_had_parity.py::TestYatchewParity` now routes effect rows through `null="linearity"` (R `order=1`) and placebo rows through `null="mean_independence"` (R `order=0`); both modes share the documented `× G/(G-1)` finite-sample convention shift and parity holds at `atol=1e-10`. Patch-level (additive keyword-only kwarg + additive dataclass field with default). - **HAD `trends_lin=True` linear-trend detrending mode** on `HeterogeneousAdoptionDiD.fit(aggregate="event_study")`, `joint_pretrends_test`, and `joint_homogeneity_test`. Mirrors R `DIDHAD::did_had(..., trends_lin=TRUE)` (paper Eq. 17 / Eq. 18 / page 32 joint-Stute homogeneity-with-trends). Per-group linear-trend slope estimated as `Y[g, F-1] - Y[g, F-2]` and applied as `(t - base) × slope` adjustment to per-event-time outcome evolutions. Requires F ≥ 3 (panel must contain F-2). The "consumed" placebo at our event-time `e=-2` is auto-dropped (R reduces max placebo lag by 1 with the same effect). Mutually exclusive with survey weighting (`survey_design` / `survey` / `weights`): raises `NotImplementedError` per `feedback_per_method_survey_element_contract` (weighted slope estimator not derived from paper; tracked in TODO.md as a follow-up). Bit-exact backcompat for `trends_lin=False` (default). Patch-level (additive keyword-only kwarg). -- **HAD R-package end-to-end parity test** vs `DIDHAD` v2.0.0 (`Credible-Answers/did_had`) on the **`design="continuous_at_zero"` (Design 1') surface**. New parity fixture `benchmarks/data/did_had_golden.json` generated by `benchmarks/R/generate_did_had_golden.R` covers 3 paper-derived synthetic DGPs (Uniform, Beta(2,2), Beta(0.5,1)) × 5 method combinations (overall, event-study, placebo, yatchew, trends_lin). The harness explicitly forces `HeterogeneousAdoptionDiD(design="continuous_at_zero")` because R `did_had` always evaluates the local-linear at `d=0` regardless of dose distribution; our default `design="auto"` may legitimately choose `continuous_near_d_lower` or `mass_point` on dose distributions with boundary density bounded away from zero (e.g., Beta(2,2)) and thereby diverge from R numerically — that divergence is methodologically defensible but out of scope for this parity test. Python parity test `tests/test_did_had_parity.py` asserts point estimate / SE / CI bounds at `atol=1e-8` and Yatchew T-stat at `atol=1e-10` after a documented `× G/(G-1)` finite-sample convention shift. Two intentional convention deviations from R, documented in `docs/methodology/REGISTRY.md`: (a) we report the bias-corrected point estimate (modern CCF 2018 convention; R's `Estimate` column reports the conventional estimate with the bias-corrected CI separately — our `att` matches R's CI midpoint); (b) Yatchew uses paper Appendix E's literal (1/G) variance-denominator convention while R uses base-R `var()`'s (1/(N-1)) sample-variance convention (parity is bit-exact after the `× G/(G-1)` shift). Yatchew on placebos with R's mean-independence null (`order=0`) is not yet exposed in our `yatchew_hr_test` (we currently only support the linearity null) and is skipped in the parity test; tracked as TODO follow-up. +- **HAD R-package end-to-end parity test** vs `DIDHAD` v2.0.0 (`Credible-Answers/did_had`) on the **`design="continuous_at_zero"` (Design 1') surface**. New parity fixture `benchmarks/data/did_had_golden.json` generated by `benchmarks/R/generate_did_had_golden.R` covers 3 paper-derived synthetic DGPs (Uniform, Beta(2,2), Beta(0.5,1)) × 5 method combinations (overall, event-study, placebo, yatchew, trends_lin). The harness explicitly forces `HeterogeneousAdoptionDiD(design="continuous_at_zero")` because R `did_had` always evaluates the local-linear at `d=0` regardless of dose distribution; our default `design="auto"` may legitimately choose `continuous_near_d_lower` or `mass_point` on dose distributions with boundary density bounded away from zero (e.g., Beta(2,2)) and thereby diverge from R numerically — that divergence is methodologically defensible but out of scope for this parity test. Python parity test `tests/test_did_had_parity.py` asserts point estimate / SE / CI bounds at `atol=1e-8` and Yatchew T-stat at `atol=1e-10` after a documented `× G/(G-1)` finite-sample convention shift. Two intentional convention deviations from R, documented in `docs/methodology/REGISTRY.md`: (a) we report the bias-corrected point estimate (modern CCF 2018 convention; R's `Estimate` column reports the conventional estimate with the bias-corrected CI separately — our `att` matches R's CI midpoint); (b) Yatchew uses paper Appendix E's literal (1/G) variance-denominator convention while R uses base-R `var()`'s (1/(N-1)) sample-variance convention (parity is bit-exact after the `× G/(G-1)` shift). Yatchew on placebos with R's mean-independence null (`order=0`) was not exposed in `yatchew_hr_test` at the PR #392 cut and was skipped in the parity test; the follow-up `yatchew_hr_test(null="mean_independence")` entry above closes that gap (placebo rows now routed through `null="mean_independence"` and parity holds at the same `atol=1e-10`). - **Tutorial 20: HAD for National Brand Campaign with Regional Spend Intensity** (`docs/tutorials/20_had_brand_campaign.ipynb`) — end-to-end practitioner walkthrough for `HeterogeneousAdoptionDiD` on a 60-DMA panel where every market is treated at a different dose level and no never-treated unit exists; comparison comes from dose variation across markets, not from an untreated holdout. The DGP uses Uniform[\$5K, \$50K] regional add-on spend per DMA (every DMA participates, no DMA at exactly \$0), so `design="auto"` resolves to `continuous_near_d_lower` (Design 1) with target `WAS_d_lower` — interpreted as the average per-dollar marginal effect of regional spend above the lightest-touch DMA's spend (`d_lower` ≈ \$5K). Covers the headline `WAS_d_lower` fit on a 2-period collapse, the multi-week event study with per-week pointwise CIs and pre-launch placebos, and a stakeholder communication template that flags the Assumption 5/6 caveat (non-testable local-linearity at the boundary). Companion drift-test file `tests/test_t20_had_brand_campaign_drift.py` (13 tests pinning panel composition / sample median, design auto-detection / target / `d_lower`, overall `WAS_d_lower`, CI endpoints, dose mean, n_units, full event-study horizon presence, and per-horizon coverage). T20 wired into the existing `had.py` entry in `docs/doc-deps.yaml`; cross-link added from `docs/practitioner_decision_tree.rst` § "Universal Rollout (No Untreated Markets)" via a `.. tip::` block. ### Changed diff --git a/TODO.md b/TODO.md index 1d6cd918..988f96cc 100644 --- a/TODO.md +++ b/TODO.md @@ -103,7 +103,7 @@ Deferred items from PR reviews that were not addressed before merge. | `HeterogeneousAdoptionDiD` continuous paths: thread `cluster=` through `bias_corrected_local_linear` (Phase 1c's wrapper already supports cluster; Phase 2a ignores it with a `UserWarning` on the continuous path to keep scope tight). | `diff_diff/had.py`, `diff_diff/local_linear.py` | Phase 2a | Low | | `HeterogeneousAdoptionDiD` Eq 17 / Eq 18 linear-trend detrending: SHIPPED in PR #389 (Phase 4 R-parity, 2026-04). Exposed as `trends_lin: bool = False` keyword-only kwarg on `HeterogeneousAdoptionDiD.fit(aggregate="event_study")`, `joint_pretrends_test`, `joint_homogeneity_test`. Mirrors R `DIDHAD::did_had(..., trends_lin=TRUE)`. Pierce-Schott published-number parity (paper p=0.51 / p=0.40) deferred indefinitely (LBD-restricted analysis panel); replaced by end-to-end R-package parity at `tests/test_did_had_parity.py`. | `diff_diff/had_pretests.py::joint_pretrends_test`, `diff_diff/had.py` | Phase 4 (shipped) | Done | | `HeterogeneousAdoptionDiD` `trends_lin × survey_design` follow-up: per-group linear-trend slope under survey weighting (weighted slope estimator? per-PSU slope?) is not derived from the paper. PR #389 raises `NotImplementedError` on the combination across all 3 trends_lin surfaces. If user demand emerges, derive the weighted variant and lift the gate. | `diff_diff/had.py::HeterogeneousAdoptionDiD.fit`, `diff_diff/had_pretests.py::joint_pretrends_test`, `diff_diff/had_pretests.py::joint_homogeneity_test` | follow-up | Low | -| `HeterogeneousAdoptionDiD` `yatchew_hr_test(null="mean_independence")` mode: R `YatchewTest::yatchew_test(order=0)` fits `Y ~ 1` (intercept-only baseline) and tests mean-independence of Y from D; R's `DIDHAD::did_had(yatchew=TRUE)` uses this on placebo rows ("non-parametric pre-trends test"). Our `yatchew_hr_test` always fits `Y ~ D` (linearity null) — no `null=` parameter exposed. Adding the mean-independence mode would (a) give practitioners a more conventional pre-trends test surface, and (b) close the PR #389 R-parity feature gap on the placebo-Yatchew rows (currently skipped in `tests/test_did_had_parity.py::TestYatchewParity` because the two tests are not the same statistic). | `diff_diff/had_pretests.py::yatchew_hr_test` | follow-up | Medium | +| `HeterogeneousAdoptionDiD` `yatchew_hr_test(null="mean_independence")` mode: SHIPPED post-PR #392 (2026-04). Adds `null: Literal["linearity", "mean_independence"]` keyword-only kwarg mirroring R `YatchewTest::yatchew_test(order=0)`. Default `"linearity"` is bit-exact backcompat. `tests/test_did_had_parity.py::TestYatchewParity` now routes placebo rows through `null="mean_independence"` (R `order=0`) and effect rows through `null="linearity"` (R `order=1`); parity holds at `atol=1e-10` after the documented `× G/(G-1)` finite-sample convention shift. | `diff_diff/had_pretests.py::yatchew_hr_test` | follow-up (shipped) | Done | | `HeterogeneousAdoptionDiD` Stute family Stata-bridge parity: PR #389 R-parity covers the full HAD fit + Yatchew surfaces but skips Stute family (`stute_test`, `stute_joint_pretest`, `joint_pretrends_test`, `joint_homogeneity_test`) because no R `Stutetest` package exists publicly (chaisemartinPackages publishes only the Stata `stute_test` module; the paper cites a 2024c R Stutetest module that is not on GitHub or CRAN). Stata-bridge parity would add `benchmarks/stata/generate_stute_golden.do` + a Stata installation requirement. Low priority unless user demand emerges. | `benchmarks/stata/`, `tests/test_stute_test_parity.py` | follow-up | Low | | `HeterogeneousAdoptionDiD` Phase 3 Stute performance: Appendix D vectorized matrix form replaces the per-iteration OLS refit with a single precomputed `M = I - X(X'X)^{-1}X'` applied to `eps * eta`. Functionally identical, ~2x faster. Shipped literal-refit form in Phase 3 to match paper text and keep reviewer surface small. | `diff_diff/had_pretests.py::stute_test` | Phase 3 | Low | | `HeterogeneousAdoptionDiD` Phase 3 R-parity: Phase 3 ships coverage-rate validation on synthetic DGPs (not tight point parity against `chaisemartin::stute_test` / `yatchew_test`). Tight numerical parity requires aligning bootstrap seed semantics and `B` across numpy/R and is deferred. | `tests/test_had_pretests.py` | Phase 3 | Low | diff --git a/diff_diff/had_pretests.py b/diff_diff/had_pretests.py index 8feda360..4c63a88c 100644 --- a/diff_diff/had_pretests.py +++ b/diff_diff/had_pretests.py @@ -65,7 +65,7 @@ import warnings from dataclasses import dataclass -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional import numpy as np import pandas as pd @@ -344,7 +344,10 @@ class YatchewTestResults: critical_value : float One-sided standard-normal critical value ``z_{1 - alpha}``. sigma2_lin : float - Residual variance from OLS of ``dy`` on ``d``. + Residual variance under the chosen null. Under + ``null_form="linearity"``: residual variance from OLS of ``dy`` + on ``d``. Under ``null_form="mean_independence"``: ``(1/G) * + sum((dy - mean(dy))^2)``, the population variance of ``dy``. sigma2_diff : float Yatchew differencing variance ``(1 / (2G)) * sum((dy_{(g)} - dy_{(g-1)})^2)`` - divisor is ``2G`` @@ -354,6 +357,13 @@ class YatchewTestResults: ``sqrt((1 / (G-1)) * sum(eps_{(g)}^2 * eps_{(g-1)}^2))``. n_obs : int Number of observations. + null_form : str + ``"linearity"`` (default; H_0: ``E[dY|D]`` is linear in ``D``, + residuals from OLS ``dy ~ 1 + d``) or ``"mean_independence"`` + (H_0: ``E[dY|D] = E[dY]``, residuals from intercept-only OLS + ``dy ~ 1``). Mirrors R ``YatchewTest::yatchew_test``'s + ``order`` argument (``order=1`` ↔ ``"linearity"``; ``order=0`` + ↔ ``"mean_independence"``). """ t_stat_hr: float @@ -365,20 +375,26 @@ class YatchewTestResults: sigma2_diff: float sigma2_W: float n_obs: int + null_form: str = "linearity" def __repr__(self) -> str: return ( f"YatchewTestResults(t_stat_hr={self.t_stat_hr:.4f}, " f"p_value={self.p_value:.4f}, reject={self.reject}, " - f"alpha={self.alpha}, n_obs={self.n_obs})" + f"alpha={self.alpha}, null_form={self.null_form!r}, " + f"n_obs={self.n_obs})" ) def summary(self) -> str: """Formatted summary table.""" width = 64 + title = { + "linearity": "Yatchew-HR linearity test (H_0: linear E[dY|D])", + "mean_independence": ("Yatchew-HR mean-independence test (H_0: E[dY|D] = E[dY])"), + }.get(self.null_form, f"Yatchew-HR test (null_form={self.null_form!r})") lines = [ "=" * width, - "Yatchew-HR linearity test (H_0: linear E[dY|D])".center(width), + title.center(width), "=" * width, f"{'T_hr statistic:':<30} {self.t_stat_hr:>20.4f}", f"{'p-value:':<30} {self.p_value:>20.4f}", @@ -410,6 +426,7 @@ def to_dict(self) -> Dict[str, Any]: "sigma2_diff": _json_safe_scalar(self.sigma2_diff), "sigma2_W": _json_safe_scalar(self.sigma2_W), "n_obs": int(self.n_obs), + "null_form": str(self.null_form), } def to_dataframe(self) -> pd.DataFrame: @@ -996,6 +1013,43 @@ def _fit_weighted_ols_intercept_slope( return a_hat, b_hat, residuals +def _fit_ols_intercept_only(dy: np.ndarray) -> "tuple[float, float, np.ndarray]": + """Fit ``dy = a + eps`` (intercept-only OLS, mean-independence null). + + Returns ``(a_hat, 0.0, residuals)`` where ``a_hat = mean(dy)`` and + ``residuals = dy - a_hat`` in the ORIGINAL input order. Slope + ``b_hat = 0.0`` is returned for tuple-symmetry with + :func:`_fit_ols_intercept_slope`; the downstream Yatchew variance code + consumes only ``residuals``. + + Mirrors R ``YatchewTest::yatchew_test(order=0)``. + """ + a_hat = float(np.mean(dy)) + residuals = dy - a_hat + return a_hat, 0.0, residuals + + +def _fit_weighted_ols_intercept_only( + dy: np.ndarray, w: np.ndarray +) -> "tuple[float, float, np.ndarray]": + """Weighted intercept-only OLS analog of :func:`_fit_ols_intercept_only`. + + Returns ``(a_hat, 0.0, residuals)`` where ``a_hat = sum(w * dy) / sum(w)`` + (the weighted mean) and ``residuals = dy - a_hat`` in the ORIGINAL input + order on the un-weighted scale. At ``w = ones(G)`` reduces bit-exactly + to :func:`_fit_ols_intercept_only`. + """ + sw = float(np.sum(w)) + if sw <= 0.0: + raise ValueError( + f"_fit_weighted_ols_intercept_only: sum(w) = {sw} <= 0; " + "weighted OLS requires positive total mass." + ) + a_hat = float(np.sum(w * dy) / sw) + residuals = dy - a_hat + return a_hat, 0.0, residuals + + def _cvm_statistic(eps_sorted: np.ndarray, d_sorted: np.ndarray) -> float: """Compute the tie-safe Cramer-von Mises cusum statistic. @@ -1956,25 +2010,32 @@ def yatchew_hr_test( dy: np.ndarray, alpha: float = 0.05, *, + null: Literal["linearity", "mean_independence"] = "linearity", survey_design: Any = None, survey: Any = None, weights: Optional[np.ndarray] = None, ) -> YatchewTestResults: - """Run the Yatchew heteroskedasticity-robust linearity test. + """Run the Yatchew heteroskedasticity-robust specification test. - Tests ``H_0: E[ΔY | D_2]`` is linear in ``D_2`` (paper Assumption 8, - Theorem 7) via the variance-ratio statistic + Tests one of two nulls (selected via ``null=``) using the + variance-ratio statistic T_hr = sqrt(G) * (sigma2_lin - sigma2_diff) / sigma2_W where - sigma2_lin = (1/G) * sum(eps^2) # OLS residuals + sigma2_lin = (1/G) * sum(eps^2) # residuals under chosen null sigma2_diff = (1/(2G)) * sum((dy_{(g)} - dy_{(g-1)})^2) # Yatchew differencing sigma2_W = sqrt((1/(G-1)) * sum(eps_{(g)}^2 * eps_{(g-1)}^2)) - and ``_{(g)}`` denotes sort by ``d``. Rejection uses the one-sided - standard-normal critical value ``z_{1-alpha}``. + and ``_{(g)}`` denotes sort by ``d``. Under ``null="linearity"`` + (default, paper Assumption 8 / Theorem 7) ``eps`` are residuals from + OLS ``dy = a + b*d + eps``. Under ``null="mean_independence"`` ``eps + = dy - mean(dy)`` (intercept-only OLS), mirroring R + ``YatchewTest::yatchew_test(order=0)``. The ``sigma2_diff`` and + ``sigma2_W`` formulas are identical between the two modes - + the only delta is the residual definition. Rejection uses the + one-sided standard-normal critical value ``z_{1-alpha}``. Parameters ---------- @@ -1982,6 +2043,24 @@ def yatchew_hr_test( Dose and first-difference outcome vectors. alpha : float, default 0.05 One-sided significance level. + null : {"linearity", "mean_independence"}, keyword-only, default "linearity" + Which null hypothesis the test targets: + + - ``"linearity"`` (default): H_0 ``E[dY | D]`` is linear in ``D`` + (paper Assumption 8, Theorem 7). Residuals come from OLS + ``dy = a + b*d + eps``. Bit-exact backcompat with pre-PR calls. + - ``"mean_independence"``: H_0 ``E[dY | D] = E[dY]`` (mean + independence of ``dY`` from ``D``). Residuals come from + intercept-only OLS ``dy = a + eps``, so + ``eps = dy - mean(dy)``. Mirrors R + ``YatchewTest::yatchew_test(order=0)``. Used by the + R-parity test on placebo Yatchew rows + (``Credible-Answers/did_had`` runs ``order=0`` on placebos + to test pre-trends as a non-parametric mean-independence + assertion). + + ``d`` is required under both modes (the sort-by-``d`` + differencing step is null-agnostic). survey_design : ResolvedSurveyDesign or None, keyword-only, default None Already-resolved survey design (per-unit). Array-in helpers accept ``ResolvedSurveyDesign`` ONLY; passing a ``SurveyDesign`` raises @@ -2094,6 +2173,12 @@ def yatchew_hr_test( if not (0.0 < alpha < 1.0): raise ValueError(f"alpha must satisfy 0 < alpha < 1, got {alpha}.") + if null not in ("linearity", "mean_independence"): + raise ValueError( + f"yatchew_hr_test: null must be one of " + f"('linearity', 'mean_independence'), got {null!r}." + ) + # Three-way mutex on survey_design / survey / weights (array-in pattern). n_set = sum(x is not None for x in (survey_design, survey, weights)) if n_set > 1: @@ -2225,6 +2310,7 @@ def yatchew_hr_test( sigma2_diff=float("nan"), sigma2_W=float("nan"), n_obs=G, + null_form=null, ) # Tie / constant-dose front-door guard. Yatchew's difference-based @@ -2258,16 +2344,28 @@ def yatchew_hr_test( sigma2_diff=float("nan"), sigma2_W=float("nan"), n_obs=G, + null_form=null, ) - # Phase 4.5 C: weighted vs unweighted OLS dispatch. Unweighted path is - # bit-exact pre-PR (stability invariant #1). + # 4-arm dispatch on (null, weighted). Unweighted-linearity path is + # bit-exact pre-PR (stability invariant #1). Mean-independence mode + # mirrors R YatchewTest::yatchew_test(order=0) and uses intercept-only + # OLS residuals; the downstream sigma2_diff / sigma2_W path is + # identical across nulls. + if null == "linearity": + if w_arr is None: + _, _, eps = _fit_ols_intercept_slope(d_arr, dy_arr) + else: + _, _, eps = _fit_weighted_ols_intercept_slope(d_arr, dy_arr, w_arr) + else: # null == "mean_independence" + if w_arr is None: + _, _, eps = _fit_ols_intercept_only(dy_arr) + else: + _, _, eps = _fit_weighted_ols_intercept_only(dy_arr, w_arr) if w_arr is None: - _, _, eps = _fit_ols_intercept_slope(d_arr, dy_arr) sigma2_lin = float(np.mean(eps * eps)) sum_w = float(G) # uniform-weights effective sample size = G else: - _, _, eps = _fit_weighted_ols_intercept_slope(d_arr, dy_arr, w_arr) sum_w = float(np.sum(w_arr)) sigma2_lin = float(np.sum(w_arr * eps * eps) / sum_w) @@ -2307,6 +2405,7 @@ def yatchew_hr_test( sigma2_diff=sigma2_diff_exact, sigma2_W=0.0, n_obs=G, + null_form=null, ) idx = np.argsort(d_arr, kind="stable") @@ -2375,6 +2474,7 @@ def yatchew_hr_test( sigma2_diff=sigma2_diff, sigma2_W=0.0, n_obs=G, + null_form=null, ) sigma2_W = float(np.sqrt(sigma4_W)) @@ -2394,6 +2494,7 @@ def yatchew_hr_test( sigma2_diff=sigma2_diff, sigma2_W=sigma2_W, n_obs=G, + null_form=null, ) @@ -4791,6 +4892,9 @@ def did_had_pretest_workflow( seed=seed, survey_design=per_test_survey_design, ) + # Linearity null is correct for the workflow's post-treatment Yatchew step + # (paper Theorem 7); placebo mean-independence routing lives in the + # R-parity test, not the workflow. yatchew_res = yatchew_hr_test( d_arr, dy_arr, diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 9060def0..6c7d5968 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -2499,7 +2499,7 @@ Shipped in `diff_diff/had_pretests.py` as `stute_joint_pretest()` (residuals-in - **Note:** Sum-of-CvMs aggregation is a standard joint specification-test construction (Delgado 1993; Escanciano 2006); the paper does not prescribe an aggregation rule. Sum-of-CvMs balances power across diffuse vs concentrated alternatives and bootstraps cleanly with shared-η. - **Note:** Event-study dispatch adjudicates step 3 via joint Stute only; there is no joint Yatchew variant because the paper does not derive one. The overall two-period path still uses the Phase 3 "Stute OR Yatchew" adjudication. Users who need Yatchew-style adjacent-difference variance-ratio robustness under multi-period data can run `yatchew_hr_test` on each (base, post) pair manually. - **Note (Phase 4 — Eq 17 / Eq 18 linear-trend detrending shipped):** `trends_lin: bool = False` (keyword-only) on `HeterogeneousAdoptionDiD.fit(aggregate="event_study")`, `joint_pretrends_test`, and `joint_homogeneity_test` (PR #389, 2026-04). Mirrors R `DIDHAD::did_had(..., trends_lin=TRUE)` (Credible-Answers/did_had v2.0.0, SHA `edc09197`). Per-group linear-trend slope estimated as `Y[g, F-1] - Y[g, F-2]` and applied as `(t - base) × slope` adjustment to per-event-time outcome evolutions (HAD.fit) or to `Y[g, t] - Y[g, base]` directly (joint pretests). The "consumed" placebo at our event-time `e=-2` is auto-dropped (R reduces max placebo lag by 1 with the same effect). Requires F ≥ 3 / `base_period - 1` in panel — front-door `ValueError` if not. Mutually exclusive with survey weighting (raises `NotImplementedError` per `feedback_per_method_survey_element_contract`; weighted slope estimator not derived from paper). Pierce-Schott published-number replication (paper p=0.51 / p=0.40 anchors) deferred indefinitely — primary analysis panel is LBD-restricted (Census FSRDC); the public-deposit proxy panel has filtering ambiguity that prevents exact published-number parity. Replaced by end-to-end R-package parity below, which is a strictly stronger correctness signal. -- **Note (R-package end-to-end parity, PR #389):** Validated against `DIDHAD` v2.0.0 (Credible-Answers/did_had, SHA `edc09197`) on the **`design="continuous_at_zero"` (Design 1') surface**, on 3 paper-derived synthetic DGPs (Uniform, Beta(2,2), Beta(0.5,1)) × 5 method combinations (overall, event-study, placebo, yatchew, trends_lin). Generator: `benchmarks/R/generate_did_had_golden.R`; fixture: `benchmarks/data/did_had_golden.json`; test: `tests/test_did_had_parity.py`. **Scope qualifier (PR #392 R8 P3):** the harness explicitly forces `HeterogeneousAdoptionDiD(design="continuous_at_zero")` because R `did_had` always evaluates the local-linear at `d=0` regardless of dose distribution. Our default `design="auto"` may legitimately resolve to `continuous_near_d_lower` (`d_lower=d.min()`, Design 1) or `mass_point` (Design 2) on dose distributions with boundary density bounded away from zero (e.g., Beta(2,2) at G=200), in which case the WAS estimand evaluates at a different point and diverges from R's `did_had` numerically. That divergence is methodologically defensible — our auto-detect uses more information when boundary mass is sparse — but is out of scope for this parity contract. Tolerances: point estimate / SE / CI bounds at `atol=1e-8`; closed-form Yatchew T-stat at `atol=1e-10` after a documented `× G/(G-1)` finite-sample convention shift. Two intentional convention deviations from R: **(a)** we report the **bias-corrected** point estimate `att = (mean(ΔY) - tau.bc) / mean(D)` (modern CCF 2018 convention); R's `Estimate` column reports the **conventional** estimate `(mean(ΔY) - tau.us) / mean(D)` with the bias-corrected CI separately — our `att` matches R's CI midpoint, our `se` / `conf_int_low` / `conf_int_high` match R's `se` / `ci_lo` / `ci_hi` directly. **(b)** Our `yatchew_hr_test` follows paper Appendix E's literal `(1/G)` and `(1/(2G))` variance-denominator convention; R's `YatchewTest::yatchew_test` uses base-R `var()`'s `(1/(N-1))` sample-variance convention. Ratio is exactly `N/(N-1)`; both converge to the same asymptotic null distribution. Yatchew on placebos with R's mean-independence null (`order=0`, fits `Y ~ 1`) is not yet exposed in our `yatchew_hr_test` (we always fit `Y ~ D`, the linearity null) and is skipped in the parity test; tracked as TODO follow-up to add a `null="mean_independence"` mode. +- **Note (R-package end-to-end parity, PR #389):** Validated against `DIDHAD` v2.0.0 (Credible-Answers/did_had, SHA `edc09197`) on the **`design="continuous_at_zero"` (Design 1') surface**, on 3 paper-derived synthetic DGPs (Uniform, Beta(2,2), Beta(0.5,1)) × 5 method combinations (overall, event-study, placebo, yatchew, trends_lin). Generator: `benchmarks/R/generate_did_had_golden.R`; fixture: `benchmarks/data/did_had_golden.json`; test: `tests/test_did_had_parity.py`. **Scope qualifier (PR #392 R8 P3):** the harness explicitly forces `HeterogeneousAdoptionDiD(design="continuous_at_zero")` because R `did_had` always evaluates the local-linear at `d=0` regardless of dose distribution. Our default `design="auto"` may legitimately resolve to `continuous_near_d_lower` (`d_lower=d.min()`, Design 1) or `mass_point` (Design 2) on dose distributions with boundary density bounded away from zero (e.g., Beta(2,2) at G=200), in which case the WAS estimand evaluates at a different point and diverges from R's `did_had` numerically. That divergence is methodologically defensible — our auto-detect uses more information when boundary mass is sparse — but is out of scope for this parity contract. Tolerances: point estimate / SE / CI bounds at `atol=1e-8`; closed-form Yatchew T-stat at `atol=1e-10` after a documented `× G/(G-1)` finite-sample convention shift. Two intentional convention deviations from R: **(a)** we report the **bias-corrected** point estimate `att = (mean(ΔY) - tau.bc) / mean(D)` (modern CCF 2018 convention); R's `Estimate` column reports the **conventional** estimate `(mean(ΔY) - tau.us) / mean(D)` with the bias-corrected CI separately — our `att` matches R's CI midpoint, our `se` / `conf_int_low` / `conf_int_high` match R's `se` / `ci_lo` / `ci_hi` directly. **(b)** Our `yatchew_hr_test` follows paper Appendix E's literal `(1/G)` and `(1/(2G))` variance-denominator convention; R's `YatchewTest::yatchew_test` uses base-R `var()`'s `(1/(N-1))` sample-variance convention. Ratio is exactly `N/(N-1)`; both converge to the same asymptotic null distribution. Yatchew on placebos with R's mean-independence null (`order=0`, fits `Y ~ 1`) is exposed via `yatchew_hr_test(null="mean_independence")` (added post-PR #392). The parity test routes effect rows through `null="linearity"` (R `order=1`) and placebo rows through `null="mean_independence"` (R `order=0`); both modes share the same `(1/G)` vs `(1/(N-1))` finite-sample convention shift and parity holds at `atol=1e-10` after the documented `× G/(G-1)` transform. - **Note:** Horizon labels in `StuteJointResult.horizon_labels` are `str(t)` verbatim and carry STRING IDENTITY ONLY — NOT a chronological ordering key. Callers who need chronological order must preserve the original period values alongside (e.g. from the `pre_periods` / `post_periods` argument). - **Note:** NaN propagation is explicit: when any horizon has NaN in residuals, `cvm_stat_joint=NaN`, `p_value=NaN`, `reject=False`, AND `per_horizon_stats={label: np.nan for every horizon}` (full dict preserved with NaN values — not empty, not partial). diff --git a/tests/test_did_had_parity.py b/tests/test_did_had_parity.py index 31cc9dbf..1bacf8aa 100644 --- a/tests/test_did_had_parity.py +++ b/tests/test_did_had_parity.py @@ -361,13 +361,13 @@ class TestYatchewParity: the yatchew flag on placebo tests "the null being tested is that groups' F-1 to F-1-ℓ outcome evolution is mean independent of their F-1+ℓ treatment". R's `YatchewTest::yatchew_test(order=0)` - fits Y ~ 1 (intercept only) instead of Y ~ D, producing a - fundamentally different residual. Our `yatchew_hr_test` always - fits Y ~ D (the linearity null). For parity we restrict to the - EFFECT rows (post-treatment, where both R and Python use the - Y ~ D linearity null). Placebo Yatchew parity requires a separate - Python helper exposing the mean-independence null and is - deferred.""" + fits Y ~ 1 (intercept only) instead of Y ~ D. Effect rows + (post-treatment) use R's `order=1` (linearity null) which matches + our default `yatchew_hr_test(null="linearity")`. Placebo rows are + routed through `yatchew_hr_test(null="mean_independence")` (added + post-PR #392), which mirrors R's `order=0`. Parity holds at the + same `atol=1e-10` after the documented N/(N-1) convention shift + on both modes.""" @pytest.mark.parametrize( "dgp_name,combo_name,effects,placebo,trends_lin", @@ -435,15 +435,14 @@ def test_yatchew_t_stat_parity( e = _r_id_to_event_time(int(r_id), trends_lin) if e not in dy_dict: continue - # Skip placebo rows: R uses `order=0` (Y ~ 1, intercept-only - # baseline, mean-independence null) on placebos; our - # `yatchew_hr_test` always fits Y ~ D (linearity null). - # Different test, not a parity opportunity. - if int(r_id) < 0: - continue + # Effect rows (R ID > 0): linearity null (Y ~ 1 + D), R's + # YatchewTest::yatchew_test(order=1). + # Placebo rows (R ID < 0): mean-independence null (Y ~ 1), + # R's YatchewTest::yatchew_test(order=0). Both modes share + # the same N/(N-1) convention shift downstream. + null_mode = "mean_independence" if int(r_id) < 0 else "linearity" dy_e = dy_dict[e] - # Yatchew test on (d, dy_e) with no weights. - r = yatchew_hr_test(d_arr, dy_e) + r = yatchew_hr_test(d_arr, dy_e, null=null_mode) # Apply documented convention shift: R's T = our T × G/(G-1) # (sample-variance vs population-variance denominators). G_horizon = int(r_yatchew_n[r_idx]) @@ -455,7 +454,8 @@ def test_yatchew_t_stat_parity( rtol=0, err_msg=( f"{dgp_name}/{combo_name}/Yatchew row {r_idx} " - f"(R ID={r_id}, our e={e}, G={G_horizon}): T_hr mismatch " + f"(R ID={r_id}, our e={e}, G={G_horizon}, " + f"null={null_mode!r}): T_hr mismatch " f"after N/(N-1) convention shift" ), ) diff --git a/tests/test_had_pretests.py b/tests/test_had_pretests.py index dac0e84a..bfc07bcb 100644 --- a/tests/test_had_pretests.py +++ b/tests/test_had_pretests.py @@ -443,6 +443,19 @@ def test_does_not_reject_linear_dgp(self): r = yatchew_hr_test(d, dy, alpha=0.05) assert r.reject is False + def test_default_null_is_linearity_with_correct_label(self): + """Default `null=` (omitted) tags result as ``null_form='linearity'`` + and is bit-exact with explicit `null='linearity'`.""" + d, dy = _linear_dgp(G=50, beta=2.0, sigma=0.3, seed=42) + r_default = yatchew_hr_test(d, dy, alpha=0.05) + r_explicit = yatchew_hr_test(d, dy, alpha=0.05, null="linearity") + assert r_default.null_form == "linearity" + assert r_explicit.null_form == "linearity" + np.testing.assert_array_equal(r_default.t_stat_hr, r_explicit.t_stat_hr) + np.testing.assert_array_equal(r_default.sigma2_lin, r_explicit.sigma2_lin) + np.testing.assert_array_equal(r_default.sigma2_diff, r_explicit.sigma2_diff) + np.testing.assert_array_equal(r_default.sigma2_W, r_explicit.sigma2_W) + def test_rejects_quadratic_dgp(self): """Commit criterion 9 (single-realization): quadratic -> reject.""" d, dy = _quadratic_dgp(G=200, beta=2.0, gamma=5.0, sigma=0.3, seed=42) @@ -3138,6 +3151,18 @@ def test_unweighted_bit_exact_after_kwargs_added(self): # captured); just check finiteness. assert np.isfinite(r_unweighted.t_stat_hr) + def test_default_null_under_weights_is_linearity_with_correct_label(self): + """Under weights, default `null=` (omitted) tags result as + ``null_form='linearity'`` and is bit-exact with explicit linearity.""" + d, dy = self._setup() + w = np.random.default_rng(7).uniform(0.5, 2.0, size=30) + r_default = yatchew_hr_test(d, dy, weights=w) + r_explicit = yatchew_hr_test(d, dy, weights=w, null="linearity") + assert r_default.null_form == "linearity" + assert r_explicit.null_form == "linearity" + np.testing.assert_array_equal(r_default.t_stat_hr, r_explicit.t_stat_hr) + np.testing.assert_array_equal(r_default.sigma2_lin, r_explicit.sigma2_lin) + def test_weighted_reduces_to_unweighted_at_uniform_weights(self): """Reviewer CRITICAL #2 lock: at w=ones(G), weighted variance components reduce to the unweighted formulas EXACTLY (atol=1e-14).""" @@ -3214,6 +3239,282 @@ def test_replicate_weights_raises(self): yatchew_hr_test(d, dy, survey=resolved_with_rep) +class TestYatchewHRTestMeanIndependence: + """``null="mean_independence"`` mode mirroring R + ``YatchewTest::yatchew_test(order=0)``. + + Closes the placebo Yatchew R-parity gap from PR #392 by exposing + the intercept-only OLS (``Y ~ 1``) residual path. Default + ``null="linearity"`` retains bit-exact backcompat across all existing + test classes. + """ + + def _setup(self, G=30, seed=42): + rng = np.random.default_rng(seed) + d = rng.uniform(0.0, 1.0, size=G) + dy = 2.0 * d + rng.normal(0.0, 0.3, size=G) + return d, dy + + def test_happy_path_returns_finite_with_correct_null_form(self): + d, dy = self._setup() + r = yatchew_hr_test(d, dy, null="mean_independence") + assert np.isfinite(r.t_stat_hr) + assert 0.0 <= r.p_value <= 1.0 + assert r.null_form == "mean_independence" + + def test_naive_python_baseline_atol_1e_12(self): + """Hand-compute residuals = dy - dy.mean(), sigma2_lin = (1/G) sum(eps^2), + sort by d, adjacent diffs, HR T-stat. Lock at atol=1e-12.""" + # G = 8 deterministic input. + d = np.array([0.10, 0.20, 0.30, 0.40, 0.55, 0.70, 0.85, 0.95]) + dy = np.array([1.2, 2.1, 1.8, 3.4, 2.9, 4.1, 3.7, 5.0]) + G = d.shape[0] + + eps = dy - dy.mean() + idx = np.argsort(d, kind="stable") + dy_s = dy[idx] + eps_s = eps[idx] + diff_dy = np.diff(dy_s) + sigma2_lin_expected = float(np.mean(eps * eps)) + sigma2_diff_expected = float(np.sum(diff_dy * diff_dy) / (2.0 * G)) + sigma4_W_expected = float(np.mean(eps_s[1:] ** 2 * eps_s[:-1] ** 2)) + sigma2_W_expected = float(np.sqrt(sigma4_W_expected)) + t_expected = float( + np.sqrt(G) * (sigma2_lin_expected - sigma2_diff_expected) / sigma2_W_expected + ) + + r = yatchew_hr_test(d, dy, null="mean_independence") + np.testing.assert_allclose(r.sigma2_lin, sigma2_lin_expected, atol=1e-12) + np.testing.assert_allclose(r.sigma2_diff, sigma2_diff_expected, atol=1e-12) + np.testing.assert_allclose(r.sigma2_W, sigma2_W_expected, atol=1e-12) + np.testing.assert_allclose(r.t_stat_hr, t_expected, atol=1e-12) + + def test_sigma2_lin_equals_population_variance_exactly(self): + """Under mean-independence, sigma2_lin = (1/G) sum((dy - mean(dy))^2) + = np.var(dy, ddof=0). Locks the (1/G) normalizer convention.""" + d, dy = self._setup() + G = d.shape[0] + r = yatchew_hr_test(d, dy, null="mean_independence") + expected = float(np.sum((dy - dy.mean()) ** 2) / G) + np.testing.assert_allclose(r.sigma2_lin, expected, atol=1e-14, rtol=1e-14) + # Equivalent np.var(ddof=0) form. + np.testing.assert_allclose(r.sigma2_lin, float(np.var(dy, ddof=0)), atol=1e-14) + + def test_invalid_null_value_raises(self): + d, dy = self._setup() + with pytest.raises(ValueError, match="null must be one of"): + yatchew_hr_test(d, dy, null="bogus") # type: ignore[arg-type] + + def test_default_null_matches_explicit_linearity_bit_exact(self): + """Omitting `null=` must be bit-identical to `null='linearity'`.""" + d, dy = self._setup() + r_default = yatchew_hr_test(d, dy, alpha=0.05) + r_explicit = yatchew_hr_test(d, dy, alpha=0.05, null="linearity") + np.testing.assert_array_equal(r_default.sigma2_lin, r_explicit.sigma2_lin) + np.testing.assert_array_equal(r_default.sigma2_diff, r_explicit.sigma2_diff) + np.testing.assert_array_equal(r_default.sigma2_W, r_explicit.sigma2_W) + np.testing.assert_array_equal(r_default.t_stat_hr, r_explicit.t_stat_hr) + assert r_default.null_form == "linearity" + assert r_explicit.null_form == "linearity" + + def test_tie_and_constant_d_rejection_is_mode_agnostic(self): + """Front-door tie / constant-d guard rejects under BOTH nulls + (the sort-by-d differencing step is null-agnostic).""" + # Constant d. + d_const = np.full(20, 0.5) + dy = np.linspace(0.0, 1.0, 20) + with warnings.catch_warnings(record=True) as caught_lin: + warnings.simplefilter("always") + r_lin = yatchew_hr_test(d_const, dy, null="linearity") + msgs_lin = [str(w.message) for w in caught_lin] + with warnings.catch_warnings(record=True) as caught_mi: + warnings.simplefilter("always") + r_mi = yatchew_hr_test(d_const, dy, null="mean_independence") + msgs_mi = [str(w.message) for w in caught_mi] + assert any("duplicate" in m for m in msgs_lin) + assert any("duplicate" in m for m in msgs_mi) + assert np.isnan(r_lin.t_stat_hr) and np.isnan(r_mi.t_stat_hr) + # Both record null_form correctly even on the NaN return path. + assert r_lin.null_form == "linearity" + assert r_mi.null_form == "mean_independence" + + # Ties (some duplicates). + d_ties = np.array([0.1, 0.2, 0.2, 0.5, 0.8, 0.9]) + dy_ties = np.array([1.0, 2.0, 2.5, 3.0, 4.0, 5.0]) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + r = yatchew_hr_test(d_ties, dy_ties, null="mean_independence") + assert np.isnan(r.t_stat_hr) + assert r.null_form == "mean_independence" + + def test_nan_dy_input_raises(self): + """NaN in dy raises (mode-agnostic; same as linearity path).""" + d = np.linspace(0.0, 1.0, 20) + dy = np.linspace(0.0, 2.0, 20) + dy[3] = np.nan + with pytest.raises(ValueError, match="contains NaN"): + yatchew_hr_test(d, dy, null="mean_independence") + + # ─── Weighted ──────────────────────────────────────────────────────────── + + def test_weighted_reduces_to_unweighted_at_uniform_weights(self): + """At w=ones(G), weighted mean-independence reduces bit-exactly to + unweighted mean-independence (atol=1e-14).""" + d, dy = self._setup() + r_unw = yatchew_hr_test(d, dy, null="mean_independence") + r_w = yatchew_hr_test(d, dy, null="mean_independence", weights=np.ones(d.shape[0])) + np.testing.assert_allclose(r_unw.sigma2_lin, r_w.sigma2_lin, atol=1e-14, rtol=1e-14) + np.testing.assert_allclose(r_unw.sigma2_diff, r_w.sigma2_diff, atol=1e-14, rtol=1e-14) + np.testing.assert_allclose(r_unw.sigma2_W, r_w.sigma2_W, atol=1e-14, rtol=1e-14) + np.testing.assert_allclose(r_unw.t_stat_hr, r_w.t_stat_hr, atol=1e-14, rtol=1e-14) + assert r_w.null_form == "mean_independence" + + def test_weighted_non_uniform_baseline_atol_1e_12(self): + """Hand-compute weighted mean-independence on G=8 deterministic input + with non-uniform weights. Lock at atol=1e-12.""" + d = np.array([0.10, 0.20, 0.30, 0.40, 0.55, 0.70, 0.85, 0.95]) + dy = np.array([1.2, 2.1, 1.8, 3.4, 2.9, 4.1, 3.7, 5.0]) + w = np.array([1.0, 1.5, 0.8, 2.0, 1.2, 0.7, 1.3, 1.1]) + # Internal pweight normalization: w_eff = w * (G / sum(w)). + G = d.shape[0] + w_eff = w * (G / w.sum()) + sum_w_eff = float(np.sum(w_eff)) + + a_hat = float(np.sum(w_eff * dy) / sum_w_eff) + eps = dy - a_hat + sigma2_lin_expected = float(np.sum(w_eff * eps * eps) / sum_w_eff) + + idx = np.argsort(d, kind="stable") + dy_s = dy[idx] + eps_s = eps[idx] + w_s = w_eff[idx] + w_avg = 0.5 * (w_s[1:] + w_s[:-1]) + diff_dy = np.diff(dy_s) + sigma2_diff_expected = float(np.sum(w_avg * diff_dy * diff_dy) / (2.0 * sum_w_eff)) + sigma4_W_expected = float(np.sum(w_avg * eps_s[1:] ** 2 * eps_s[:-1] ** 2) / np.sum(w_avg)) + sigma2_W_expected = float(np.sqrt(sigma4_W_expected)) + t_expected = float( + np.sqrt(sum_w_eff) * (sigma2_lin_expected - sigma2_diff_expected) / sigma2_W_expected + ) + + r = yatchew_hr_test(d, dy, null="mean_independence", weights=w) + np.testing.assert_allclose(r.sigma2_lin, sigma2_lin_expected, atol=1e-12) + np.testing.assert_allclose(r.sigma2_diff, sigma2_diff_expected, atol=1e-12) + np.testing.assert_allclose(r.sigma2_W, sigma2_W_expected, atol=1e-12) + np.testing.assert_allclose(r.t_stat_hr, t_expected, atol=1e-12) + + def test_default_null_under_weights_matches_explicit_linearity(self): + d, dy = self._setup() + w = np.random.default_rng(7).uniform(0.5, 2.0, size=d.shape[0]) + r_default = yatchew_hr_test(d, dy, weights=w) + r_explicit = yatchew_hr_test(d, dy, weights=w, null="linearity") + np.testing.assert_array_equal(r_default.t_stat_hr, r_explicit.t_stat_hr) + np.testing.assert_array_equal(r_default.sigma2_lin, r_explicit.sigma2_lin) + assert r_default.null_form == "linearity" + + def test_survey_design_with_mean_independence_composes(self): + """`survey_design=` and `null='mean_independence'` are orthogonal; + passing both does NOT raise NotImplementedError.""" + from diff_diff.survey import make_pweight_design + + d, dy = self._setup() + w = np.random.default_rng(7).uniform(0.5, 2.0, size=d.shape[0]) + r = yatchew_hr_test( + d, + dy, + null="mean_independence", + survey_design=make_pweight_design(w), + ) + assert np.isfinite(r.t_stat_hr) + assert r.null_form == "mean_independence" + + def test_weighted_linearity_baseline_atol_1e_12(self): + """4-arm coverage gap fill: hand-compute weighted-linearity components + on G=8 deterministic input. Locks the (linearity, weighted) arm + explicitly (the other 3 arms locked by tests above + existing suite).""" + d = np.array([0.10, 0.20, 0.30, 0.40, 0.55, 0.70, 0.85, 0.95]) + dy = np.array([1.2, 2.1, 1.8, 3.4, 2.9, 4.1, 3.7, 5.0]) + w = np.array([1.0, 1.5, 0.8, 2.0, 1.2, 0.7, 1.3, 1.1]) + G = d.shape[0] + w_eff = w * (G / w.sum()) + sum_w_eff = float(np.sum(w_eff)) + + # Weighted OLS: dy = a + b*d + eps. + d_wm = float(np.sum(w_eff * d) / sum_w_eff) + dy_wm = float(np.sum(w_eff * dy) / sum_w_eff) + d_dev = d - d_wm + var_d_w = float(np.sum(w_eff * d_dev * d_dev)) + b_hat = float(np.sum(w_eff * d_dev * (dy - dy_wm)) / var_d_w) + a_hat = float(dy_wm - b_hat * d_wm) + eps = dy - a_hat - b_hat * d + sigma2_lin_expected = float(np.sum(w_eff * eps * eps) / sum_w_eff) + + idx = np.argsort(d, kind="stable") + dy_s = dy[idx] + eps_s = eps[idx] + w_s = w_eff[idx] + w_avg = 0.5 * (w_s[1:] + w_s[:-1]) + diff_dy = np.diff(dy_s) + sigma2_diff_expected = float(np.sum(w_avg * diff_dy * diff_dy) / (2.0 * sum_w_eff)) + sigma4_W_expected = float(np.sum(w_avg * eps_s[1:] ** 2 * eps_s[:-1] ** 2) / np.sum(w_avg)) + sigma2_W_expected = float(np.sqrt(sigma4_W_expected)) + t_expected = float( + np.sqrt(sum_w_eff) * (sigma2_lin_expected - sigma2_diff_expected) / sigma2_W_expected + ) + + r = yatchew_hr_test(d, dy, weights=w, null="linearity") + np.testing.assert_allclose(r.sigma2_lin, sigma2_lin_expected, atol=1e-12) + np.testing.assert_allclose(r.sigma2_diff, sigma2_diff_expected, atol=1e-12) + np.testing.assert_allclose(r.sigma2_W, sigma2_W_expected, atol=1e-12) + np.testing.assert_allclose(r.t_stat_hr, t_expected, atol=1e-12) + assert r.null_form == "linearity" + + def test_zero_weight_rejected_under_mean_independence(self): + """Strictly-positive weights still required (mode-agnostic).""" + d, dy = self._setup() + w = np.ones(d.shape[0]) + w[5] = 0.0 + with pytest.raises(ValueError, match="strictly positive"): + yatchew_hr_test(d, dy, null="mean_independence", weights=w) + + def test_replicate_weights_rejected_under_mean_independence(self): + from diff_diff.survey import ResolvedSurveyDesign + + d, dy = self._setup() + w = np.ones(d.shape[0]) + resolved_with_rep = ResolvedSurveyDesign( + weights=w, + weight_type="pweight", + strata=None, + psu=None, + fpc=None, + n_strata=0, + n_psu=d.shape[0], + lonely_psu="remove", + replicate_weights=np.tile(w[:, None], (1, 5)), + replicate_method="BRR", + n_replicates=5, + ) + with pytest.raises(NotImplementedError, match="replicate-weight"): + yatchew_hr_test(d, dy, null="mean_independence", survey=resolved_with_rep) + + # ─── Edge case ─────────────────────────────────────────────────────────── + + def test_G_below_min_returns_nan_under_both_nulls(self): + """G=2 returns NaN under both nulls (HR variance estimator + needs adjacent-pair products, undefined for G<3).""" + d = np.array([0.1, 0.5]) + dy = np.array([1.0, 2.0]) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + r_lin = yatchew_hr_test(d, dy, null="linearity") + r_mi = yatchew_hr_test(d, dy, null="mean_independence") + assert np.isnan(r_lin.t_stat_hr) + assert np.isnan(r_mi.t_stat_hr) + assert r_lin.null_form == "linearity" + assert r_mi.null_form == "mean_independence" + + class TestJointStuteSurvey: """Phase 4.5 C survey/weights on stute_joint_pretest + joint_pretrends_test + joint_homogeneity_test.""" From ebdceded75bcbbffd7f7c47837da7f8d58d3955a Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 26 Apr 2026 17:07:02 -0400 Subject: [PATCH 2/2] Address PR #397 R1 review (2 P3, doc + test polish) P3: REGISTRY.md / docstring narrative inconsistency. Updates main Yatchew algorithm block (REGISTRY.md:2469-2477), Phase 3 row (REGISTRY.md:2546), module-level docstring (had_pretests.py:14-16), and YatchewTestResults class docstring (had_pretests.py:323-330) to describe yatchew_hr_test as a specification test with both nulls, labelling mean_independence explicitly as the documented R-parity extension (vs paper-derived linearity). P3: null_form propagation lightly tested. Adds three explicit regression tests in the existing test classes the reviewer cited: - TestJSONSerialization.test_yatchew_to_dict_carries_null_form: locks to_dict()["null_form"] for both modes. - TestSummary.test_yatchew_summary_switches_title_on_null_form: locks summary() title switching with both positive and negative controls. - TestSummary.test_yatchew_repr_includes_null_form: locks null_form= appears in repr() for both modes. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/had_pretests.py | 29 +++++++++++++++++++++-------- docs/methodology/REGISTRY.md | 11 ++++++----- tests/test_had_pretests.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/diff_diff/had_pretests.py b/diff_diff/had_pretests.py index 4c63a88c..d3d887bc 100644 --- a/diff_diff/had_pretests.py +++ b/diff_diff/had_pretests.py @@ -12,8 +12,13 @@ ``E[ΔY | D_2]`` with Mammen (1993) wild bootstrap p-value (paper Appendix D). 3. :func:`yatchew_hr_test` - heteroskedasticity-robust variance-ratio - linearity test (paper Theorem 7 / Equation 29). Feasible at - ``G >= 100k``. + specification test (paper Theorem 7 / Equation 29). Feasible at + ``G >= 100k``. Two nulls via the keyword-only ``null=`` argument: + ``"linearity"`` (default; paper Theorem 7, fits ``Y ~ 1 + D``) and + ``"mean_independence"`` (R-parity extension mirroring R + ``YatchewTest::yatchew_test(order=0)``; fits ``Y ~ 1``). The + downstream variance-ratio machinery is shared between the two + modes — only the residual definition differs. Joint / multi-period tests (Phase 3 follow-up): @@ -323,12 +328,20 @@ def to_dataframe(self) -> pd.DataFrame: class YatchewTestResults: """Result of :func:`yatchew_hr_test` (paper Theorem 7 / Equation 29). - Heteroskedasticity-robust test of the same linearity null as - :func:`stute_test`, but using Yatchew's difference-based variance - estimator. The test statistic - ``T_hr = sqrt(G) * (sigma2_lin - sigma2_diff) / sigma2_W`` - is asymptotically N(0, 1) under H_0; rejection uses the one-sided - standard-normal critical value. + Heteroskedasticity-robust specification test using Yatchew's + difference-based variance estimator. Two nulls are supported via + the ``null=`` argument on :func:`yatchew_hr_test` and reflected on + the ``null_form`` attribute below: ``"linearity"`` (default; paper + Theorem 7, the same null as :func:`stute_test`, residuals from OLS + ``dy ~ 1 + d``) and ``"mean_independence"`` (R-parity extension + mirroring R ``YatchewTest::yatchew_test(order=0)``, residuals from + intercept-only OLS ``dy ~ 1``). The test statistic + ``T_hr = sqrt(G) * (sigma2_lin - sigma2_diff) / sigma2_W`` is + asymptotically N(0, 1) under H_0 in both modes; rejection uses the + one-sided standard-normal critical value. Only the residual + definition (and therefore ``sigma2_lin``) differs between modes — + the ``sigma2_diff`` / ``sigma2_W`` / sort-by-``d`` machinery is + shared. Attributes ---------- diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 6c7d5968..5e5b6824 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -2466,15 +2466,16 @@ Shipped in `diff_diff/had_pretests.py` as `stute_test()`. Tests whether `E(ΔY | 6. Vectorized implementation (Appendix D): with `L` a `G × G` lower-triangular matrix of ones, `S = (1/G²) · 1ᵀ (L · E)^{∘2}`. Bootstrap uses a `G × G` realization matrix `H` of Mammen weights; memory-bounded at `G ≈ 100,000`. - **Note:** Default `n_bootstrap = 999` is a diff-diff choice; the paper does not prescribe. A front-door `ValueError` is raised for `n_bootstrap < 99` (below which the discretised bootstrap p-value grid `1/(B+1)` is too coarse to be meaningful). -*Algorithm variant - Yatchew (1997) heteroskedasticity-robust linearity test (Appendix E, Theorem 7):* -Shipped in `diff_diff/had_pretests.py` as `yatchew_hr_test()`. Alternative to Stute when `G` is large or heteroskedasticity is suspected. +*Algorithm variant - Yatchew (1997) heteroskedasticity-robust specification test (Appendix E, Theorem 7):* +Shipped in `diff_diff/had_pretests.py` as `yatchew_hr_test()`. Alternative to Stute when `G` is large or heteroskedasticity is suspected. Two nulls supported via the keyword-only `null=` argument: `"linearity"` (default; paper Theorem 7) and `"mean_independence"` (R-parity extension, see Note below). 1. Sort `(D_{g,2}, ΔY_g)` by `D_{g,2}`. 2. Compute difference-based variance estimator: `σ̂²_{diff} := (1/(2G)) Σ_{g=2}^G [(Y_{2,(g)} - Y_{1,(g)}) - (Y_{2,(g-1)} - Y_{1,(g-1)})]²`. -3. Fit linear regression; compute residual variance `σ̂²_{lin}`. +3. Fit OLS; compute residual variance `σ̂²_{lin}`. Under `null="linearity"` (default): residuals from `dy ~ 1 + d` (paper Theorem 7). Under `null="mean_independence"`: residuals from intercept-only `dy ~ 1`, i.e. `eps = dy - mean(dy)` (R-parity extension). 4. Heteroskedasticity-robust variance: `σ̂⁴_W := (1/(G-1)) Σ_{g=2}^G ε̂²_{lin,(g)} ε̂²_{lin,(g-1)}`. -5. Robust test statistic: `T_{hr} := √G · (σ̂²_{lin} - σ̂²_{diff}) / σ̂²_W`. Reject linearity if `T_{hr} ≥ q_{1-α}` (Equation 29 and downstream in Theorem 7). +5. Robust test statistic: `T_{hr} := √G · (σ̂²_{lin} - σ̂²_{diff}) / σ̂²_W`. Under `null="linearity"`, reject linearity if `T_{hr} ≥ q_{1-α}` (Equation 29 and downstream in Theorem 7). Under `null="mean_independence"`, the same statistic and critical value reject the mean-independence null `H_0: E[dY|D] = E[dY]` — only the residual definition (and therefore `σ̂²_lin`) differs between modes; the `σ̂²_diff`, `σ̂⁴_W`, and sort-by-`d` machinery are shared. 6. Theorem 7: under `H_0`, `lim E[φ_α] = α`; under fixed alternative, `lim E[φ_α] = 1`; local power against alternatives at rate `G^{-1/4}` (slower than Stute's `G^{-1/2}` rate, but scales to `G ≥ 10⁵`). 7. Inference on `β̂_{fe}` conditional on accepting the linearity test is asymptotically valid (Theorem 7, Point 1; citing de Chaisemartin and D'Haultfœuille 2024 arXiv:2407.03725). +- **Note (mean-independence null is the R-parity extension, not paper-derived):** The paper (Appendix E, Theorem 7) defines `yatchew_hr_test` only under the linearity null. The `null="mean_independence"` mode mirrors R `YatchewTest::yatchew_test(order=0)` and is used by R `DIDHAD::did_had(yatchew=TRUE)` on placebo rows ("non-parametric pre-trends test" per the package README). Exposed for parity coverage of the placebo-Yatchew rows in `tests/test_did_had_parity.py::TestYatchewParity` (PR #397). The default `"linearity"` mode is paper-derived; users invoking `"mean_independence"` are running an R-parity extension, not a paper-prescribed test. *Four-step pre-testing workflow (Section 4.2-4.3):* Shipped as `did_had_pretest_workflow()` in Phase 3 (two-period `aggregate="overall"`) and extended in the Phase 3 follow-up with `aggregate="event_study"` dispatch that closes the step-2 pre-trends gap on multi-period panels. The paper's decision rule for TWFE reliability in HADs: @@ -2543,7 +2544,7 @@ Shipped in `diff_diff/had_pretests.py` as `stute_joint_pretest()` (residuals-in - **Note (Phase 3 exact-linear short-circuit):** Both `stute_test` and `yatchew_hr_test` detect numerically-exact linear fits (OLS residuals below machine precision relative to the signal) and short-circuit to `p=1.0, reject=False` without running the bootstrap / computing `T_hr`. The detection compares `sum(eps^2)` against the CENTERED total sum of squares `sum((dy - dybar)^2)` (ratio `<= 1e-24`), which is equivalent to `1 - R^2` and is TRANSLATION-INVARIANT under additive shifts in dy. Comparing against uncentered `sum(dy^2)` would NOT be translation-invariant: adding a large constant to dy inflates `sum(dy^2)` and can spuriously trip the short-circuit on genuinely noisy data. Regression tests pin (a) the exact-linear fail-to-reject behavior for `dy = a + b*d`, (b) translation-invariance under `dy -> dy + 1e12` (the short-circuit must not fire on noisy data regardless of the constant offset). - [x] Phase 3: `qug_test()` (`T = D_{2,(1)} / (D_{2,(2)} - D_{2,(1)})`, rejection `{T > 1/α - 1}`). One-sided asymptotic p-value `1 / (1 + T)` under the Exp(1)/Exp(1) limit law (Theorem 4). Zero-dose observations filtered upfront (with `UserWarning`); `D_{(1)} == D_{(2)}` tie returns all-NaN inference (conservative). Closed-form tight parity tested at `atol=1e-12`. - [x] Phase 3: `stute_test()` Cramér-von Mises with Mammen wild bootstrap. Statistic `S = (1/G^2) Σ (cumsum_g)^2` (algebraically equivalent to paper's `Σ(g/G)^2 · ((1/g) Σ eps_{(h)})^2`). Bootstrap follows paper Appendix D Algorithm literal (per-iteration OLS refit). `n_bootstrap=999` default, `n_bootstrap >= 99` validated. `G < 10` returns NaN; `G > 100_000` emits a `UserWarning` pointing to `yatchew_hr_test`. Appendix-D vectorized matrix form deferred as a performance follow-up (tracked in `TODO.md`). -- [x] Phase 3: `yatchew_hr_test()` heteroskedasticity-robust linearity test. Test statistic `T_hr = sqrt(G) · (σ̂²_lin - σ̂²_diff) / σ̂²_W` from paper Equation 29. Normalizer `σ̂²_diff` divides by `2G` (paper-literal Theorem 7), NOT `2(G-1)`; hand-computed tight parity asserted at `atol=1e-12`. One-sided standard-normal critical value. `G < 3` returns NaN. +- [x] Phase 3: `yatchew_hr_test()` heteroskedasticity-robust specification test. Test statistic `T_hr = sqrt(G) · (σ̂²_lin - σ̂²_diff) / σ̂²_W` from paper Equation 29. Normalizer `σ̂²_diff` divides by `2G` (paper-literal Theorem 7), NOT `2(G-1)`; hand-computed tight parity asserted at `atol=1e-12`. One-sided standard-normal critical value. `G < 3` returns NaN. Phase 3 shipped only the linearity null (paper Theorem 7); the `null="mean_independence"` R-parity extension shipped post-PR #392 (see the algorithm-variant block above for the contract). - [x] Phase 3: `did_had_pretest_workflow()` composite helper. Two-period `data`-only entry point (Phase 2a overall-path dispatch); reduces panel via `_aggregate_first_difference` and runs all three IMPLEMENTED tests at a shared `alpha`. `seed` forwards to `stute_test` only (QUG and Yatchew are deterministic). Returns `HADPretestReport` with priority-ordered verdict string. Because Phase 3 ships steps 1 + 3 of the paper's four-step workflow but **not** step 2 (Assumption 7 pre-trends test via Equation 18), the fail-to-reject verdict explicitly flags the Assumption 7 gap rather than claiming unconditional TWFE safety: `"QUG and linearity diagnostics fail-to-reject; Assumption 7 pre-trends test NOT run (paper step 2 deferred to Phase 3 follow-up)"`. Verdict priority follows the paper's one-way rule (TWFE admissible only if NO test rejects): **conclusive rejections are the primary verdict and are NEVER hidden by inconclusive status** — any unresolved-step note is appended via `"; additional steps unresolved: ..."` rather than replacing the rejection. The pure `"inconclusive - QUG NaN"` / `"inconclusive - both Stute and Yatchew linearity tests NaN"` forms only fire when NO conclusive test rejects AND a required step is unresolved. The partial-workflow fail-to-reject verdict may carry a `"(Yatchew NaN - skipped)"` (or Stute) suffix when one linearity test is NaN but the other is conclusive (step 3 resolved via the paper's "Stute OR Yatchew" wording). Bundled rejection-reason strings name each failed assumption in the conclusive-rejection case. `all_pass` is `True` iff QUG is conclusive AND at least one of Stute/Yatchew is conclusive AND no conclusive test rejects. **Non-negative-dose contract**: all three raw linearity helpers (`qug_test`, `stute_test`, `yatchew_hr_test`) raise a front-door `ValueError` on any `d < 0`, mirroring the `_validate_had_panel` guard (paper Section 2 HAD support restriction). Multi-period panels pre-slice to `(F-1, F)` before calling; joint-horizon dispatch deferred to Phase 3 follow-up. - [ ] Phase 4: Pierce-Schott (2016) replication harness reproduces Figure 2 values. - [ ] Phase 4: Full DGP 1/2/3 coverage-rate reproduction from Table 1. diff --git a/tests/test_had_pretests.py b/tests/test_had_pretests.py index bfc07bcb..3f25503e 100644 --- a/tests/test_had_pretests.py +++ b/tests/test_had_pretests.py @@ -842,6 +842,16 @@ def test_yatchew_to_dict_is_json_safe(self): r = yatchew_hr_test(d, dy) json.dumps(r.to_dict()) # must not raise + def test_yatchew_to_dict_carries_null_form(self): + """``to_dict()`` surfaces ``null_form`` for both modes.""" + d, dy = _linear_dgp(G=50) + r_lin = yatchew_hr_test(d, dy) # default = "linearity" + r_mi = yatchew_hr_test(d, dy, null="mean_independence") + d_lin = json.loads(json.dumps(r_lin.to_dict())) + d_mi = json.loads(json.dumps(r_mi.to_dict())) + assert d_lin["null_form"] == "linearity" + assert d_mi["null_form"] == "mean_independence" + def test_report_to_dict_is_json_safe(self): """Commit criterion 16: json.dumps(report.to_dict()) succeeds.""" d, dy = _linear_dgp(G=50) @@ -1193,6 +1203,26 @@ def test_yatchew_summary_non_empty(self): assert len(s) > 0 assert "Yatchew" in s + def test_yatchew_summary_switches_title_on_null_form(self): + """``summary()`` renders the linearity / mean-independence title + per ``null_form``.""" + d, dy = _linear_dgp(G=50) + s_lin = yatchew_hr_test(d, dy).summary() + s_mi = yatchew_hr_test(d, dy, null="mean_independence").summary() + assert "linearity test" in s_lin + assert "mean-independence test" in s_mi + # Negative controls — each title is mode-specific. + assert "mean-independence test" not in s_lin + assert "linearity test" not in s_mi + + def test_yatchew_repr_includes_null_form(self): + """``repr()`` carries ``null_form=`` for both modes.""" + d, dy = _linear_dgp(G=50) + r_lin = yatchew_hr_test(d, dy) + r_mi = yatchew_hr_test(d, dy, null="mean_independence") + assert "null_form='linearity'" in repr(r_lin) + assert "null_form='mean_independence'" in repr(r_mi) + def test_report_summary_bundles_all(self): d, dy = _linear_dgp(G=50) panel = _make_two_period_panel(50, d, dy, seed=42)