-
Notifications
You must be signed in to change notification settings - Fork 32
Incorporate statistical significance testing to benchmarks #614
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,12 +7,22 @@ | |
| """ | ||
| Compare two CSVs from the same benchmark suite. | ||
|
|
||
| Auto-detects metric columns (containing "TFLOPS" or "GB/s") and key columns. | ||
| Outputs a markdown <details> block to stdout with per-config results, | ||
| and optionally appends a summary table row to --summary-file. | ||
| Two modes: | ||
|
|
||
| 1. Default (ratio) mode: compares two *aggregate* CSVs. Auto-detects metric | ||
| columns (containing "TFLOPS" or "GB/s") and key columns, computes throughput | ||
| speedups, and emits a markdown <details> block (optionally appending a | ||
| summary row to --summary-file). | ||
|
|
||
| 2. --stats mode: compares two *samples* CSVs (written with --csv-samples) using | ||
| a statistical test (Brunner-Munzel by default) via the benchstats package. | ||
| Reports whether per-config timing differences are significant and exits 1 | ||
| when a significant regression is found (for CI gating). Requires | ||
| ``pip install -r requirements.txt``. | ||
|
|
||
| Usage: | ||
| python compare_results.py baseline.csv candidate.csv --bench-name NAME --summary-file FILE | ||
| python compare_results.py base_samples.csv cand_samples.csv --stats | ||
| """ | ||
|
|
||
| import argparse | ||
|
|
@@ -49,6 +59,75 @@ def print_key_table(title, rows_df, key_cols): | |
| print() | ||
|
|
||
|
|
||
| def run_stats(args): | ||
| """Compare two samples CSVs with a statistical test via benchstats. | ||
|
|
||
| Timing (``time_ms``) is the main metric and drives the exit code; throughput | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doc/code mismatch: this docstring says the main metric is |
||
| / bandwidth (``TFLOPS`` / ``GB/s``) is reported as a secondary metric when | ||
| present in the CSV. | ||
|
|
||
| Returns a process exit code: 1 if a significant difference is found in the | ||
| main (timing) metric, else 0. | ||
| """ | ||
| import os | ||
|
|
||
| import rich.table # noqa: F401 benchstats 3.4.0 render uses rich.table.Table without importing it | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a bug in benchstats? If so, can you report it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Benchstats imports There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All right, looks like I managed to mess this up. Thanks for noticing! Please use version 3.4.1 that has this imports fixed. Things shouldn't be that way. In the future please don't hesitate to ping me if you find something strange. |
||
| from parser_TEsamples import parser_TEsamples | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
import os, sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))so the script is invocable from any cwd — which matters for CI usage where the gate may not chdir into |
||
| from benchstats.compare import compareStats | ||
| from benchstats.render import renderComparisonResults | ||
| from benchstats.common import LoggingConsole, detectExportFormat | ||
|
|
||
| main_metrics = ["time_s"] | ||
|
|
||
| export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None | ||
| if export_fmt is not None and os.path.isfile(args.export_to): | ||
| os.remove(args.export_to) | ||
|
|
||
| console = LoggingConsole( | ||
| record=export_fmt is not None, | ||
| log_level=LoggingConsole.LogLevel.Warning, | ||
| ) | ||
|
|
||
| # metrics=None exposes every metric the CSV carries (time + throughput). | ||
| s1 = parser_TEsamples(args.baseline_csv, None, None, debug_log=console).getStats() | ||
| s2 = parser_TEsamples(args.candidate_csv, None, None, debug_log=console).getStats() | ||
|
|
||
| cr = compareStats( | ||
| s1, s2, | ||
| method=args.method, | ||
| alpha=args.alpha, | ||
| main_metrics=main_metrics, | ||
| debug_log=console, | ||
| ) | ||
|
|
||
| # Throughput metrics (e.g. TFLOPS / GB/s) are not times; blank benchstats' | ||
| # default per-value "s" suffix for them (the column header names the unit). | ||
| secondary = [m for m in cr.getMetrics() if m not in main_metrics] | ||
| style_overrides = {f"metric_{m}_unit": "" for m in secondary} | ||
|
|
||
| renderComparisonResults( | ||
| cr, console, | ||
| main_metrics=main_metrics, | ||
| style_overrides=style_overrides or None, | ||
| always_show_pvalues=args.always_show_pvalues, | ||
| ) | ||
|
|
||
| if export_fmt is not None: | ||
| if export_fmt == "txt": | ||
| console.save_text(args.export_to) | ||
| elif export_fmt == "svg": | ||
| console.save_svg(args.export_to, title="") | ||
| elif export_fmt == "html": | ||
| console.save_html(args.export_to) | ||
|
|
||
| if cr.at_least_one_differs: | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
if main_metrics is None or metric_name in main_metrics:
at_least_one_differs = at_least_one_differs or less_positive or greater_positiveSo a candidate that is significantly faster will also trigger If the intent is true regression gating, walk |
||
| console.warning( | ||
| "At least one significant timing difference was detected (exit 1)." | ||
| ) | ||
| return 1 | ||
| return 0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure the exit code is the best way to relay the information - non-0 exit codes usually indicate something has failed, while for benchmarks, a significant timing difference is often expected. Also, the granularity is a bit coarse - the exit code doesn't indicate in which 'direction' the difference goes. |
||
|
|
||
|
|
||
| def main(): | ||
| parser = argparse.ArgumentParser(description="Compare benchmark CSVs") | ||
| parser.add_argument("baseline_csv", help="Baseline CSV") | ||
|
|
@@ -67,8 +146,35 @@ def main(): | |
| "Set to 0 to disable the filter." | ||
| ), | ||
| ) | ||
|
|
||
| stats_group = parser.add_argument_group( | ||
| "statistical comparison (--stats mode; operates on --csv-samples CSVs)" | ||
| ) | ||
| stats_group.add_argument( | ||
| "--stats", action="store_true", | ||
| help="Compare per-sample CSVs with a statistical test via benchstats.", | ||
| ) | ||
| stats_group.add_argument( | ||
| "--alpha", type=float, default=0.001, | ||
| help="Significance level for the test (default: 0.001).", | ||
| ) | ||
| stats_group.add_argument( | ||
| "--method", default="brunnermunzel", | ||
| help="Statistical test to use (default: brunnermunzel).", | ||
| ) | ||
| stats_group.add_argument( | ||
| "--always-show-pvalues", action="store_true", | ||
| help="Always show p-values, including for non-significant results.", | ||
| ) | ||
| stats_group.add_argument( | ||
| "--export-to", default=None, metavar="FILE", | ||
| help="Export the report to a .txt/.svg/.html file (format from extension).", | ||
| ) | ||
| args = parser.parse_args() | ||
|
|
||
| if args.stats: | ||
| return run_stats(args) | ||
|
|
||
| baseline_df = pd.read_csv(args.baseline_csv) | ||
| candidate_df = pd.read_csv(args.candidate_csv) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,183 @@ | ||
| #!/usr/bin/env python | ||
| ############################################################################### | ||
| # Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. | ||
| # | ||
| # See LICENSE for license information. | ||
| ############################################################################### | ||
| """benchstats parser for Transformer Engine microbenchmark samples CSVs. | ||
|
|
||
| Reads the per-sample CSV produced by ``run_benchmarks(...)`` with the | ||
| ``--csv-samples`` flag and turns it into the ``{benchmark_name: {metric: ndarray}}`` | ||
| structure consumed by ``benchstats.compare.compareStats``. | ||
|
|
||
| Columns of the samples CSV: the benchmark parameter columns, plus ``label``, | ||
| ``sample_idx``, ``time_ms``, ``throughput`` and ``unit``. A benchmark name is | ||
| built by joining every parameter column and ``label``, so each unique | ||
| (parameters, label) combination becomes one benchmark. | ||
|
|
||
| Two metrics are exposed: | ||
|
|
||
| - ``time_s`` (seconds, lower is better) -- always present; intended as the | ||
| *main* metric. Exposed in seconds because benchstats' renderer auto-scales | ||
| time values (to ms/us/ns) assuming a seconds base unit. | ||
| - the throughput metric, keyed by its unit (e.g. ``TFLOPS`` or ``GB/s``; higher | ||
| is better) -- present when the CSV carries throughput values. | ||
|
|
||
| ``benchstats``' renderer requires every benchmark to expose the same metric set. | ||
| Records without throughput (the samples-only ``Forward+Backward`` composites) are | ||
| therefore dropped from the comparison when throughput is available for the other | ||
| benchmarks; their raw samples remain in the CSV for other downstream analysis. | ||
|
|
||
| The class name matches the file name (``parser_TEsamples``) so it can also be | ||
| loaded by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``. | ||
| """ | ||
|
|
||
| import re | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
|
|
||
| from benchstats.common import ParserBase, LoggingConsole | ||
|
|
||
| _TIME_COL = "time_ms" # column name in the samples CSV (milliseconds) | ||
| _TIME_KEY = "time_s" # metric key exposed to benchstats (seconds) | ||
| _THR_COL = "throughput" | ||
| _UNIT_COL = "unit" | ||
| _GENERIC_THR = "throughput" | ||
| _NON_NAME_COLS = ("sample_idx", _TIME_COL, _THR_COL, _UNIT_COL) | ||
| _NAME_DELIM = " | " | ||
|
|
||
|
|
||
| class parser_TEsamples(ParserBase): | ||
| def __init__(self, csv_file_path, filter, metrics=None, debug_log=True) -> None: | ||
| assert isinstance(csv_file_path, str) | ||
| assert filter is None or isinstance(filter, (str, re.Pattern)) | ||
| assert metrics is None or ( | ||
| isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics) | ||
| ) | ||
|
|
||
| if debug_log is None or (isinstance(debug_log, bool) and not debug_log): | ||
| self.debug_log = False | ||
| elif isinstance(debug_log, bool) and debug_log: | ||
| self.debug_log = True | ||
| self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug) | ||
| else: | ||
| self.debug_log = True | ||
| self.logger = debug_log | ||
|
|
||
| self.file = csv_file_path | ||
| self.filter = ( | ||
| filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter) | ||
| ) | ||
| self._requested_metrics = list(metrics) if metrics is not None else None | ||
| self._stats = self._build() | ||
|
|
||
| def getStats(self) -> dict[str, dict[str, np.ndarray]]: | ||
| return self._stats | ||
|
|
||
| def _log(self, level, msg): | ||
| if self.debug_log: | ||
| getattr(self.logger, level)(f"parser_TEsamples: {msg}") | ||
|
|
||
| def _build(self) -> dict[str, dict[str, np.ndarray]]: | ||
| df = pd.read_csv(self.file) | ||
|
|
||
| if _TIME_COL not in df.columns or "sample_idx" not in df.columns: | ||
| raise ValueError( | ||
| f"'{self.file}' is missing 'time_ms'/'sample_idx' columns. " | ||
| "Was it written with --csv-samples?" | ||
| ) | ||
|
|
||
| name_cols = [c for c in df.columns if c not in _NON_NAME_COLS] | ||
| if not name_cols: | ||
| raise ValueError(f"No benchmark-name columns found in '{self.file}'.") | ||
|
|
||
| df[_TIME_COL] = pd.to_numeric(df[_TIME_COL], errors="coerce") | ||
| has_thr_col = _THR_COL in df.columns | ||
| if has_thr_col: | ||
| df[_THR_COL] = pd.to_numeric(df[_THR_COL], errors="coerce") | ||
|
|
||
| # First pass: collect per-benchmark time samples, throughput samples and unit. | ||
| per_bm = {} # name -> {"time": ndarray, "thr": ndarray|None, "unit": str|None} | ||
| for key_vals, group in df.groupby(name_cols, sort=False): | ||
| if not isinstance(key_vals, tuple): | ||
| key_vals = (key_vals,) | ||
| bm_name = _NAME_DELIM.join(str(v) for v in key_vals) | ||
|
|
||
| if self.filter is not None and self.filter.search(bm_name) is None: | ||
| continue | ||
|
|
||
| time_ms = group[_TIME_COL].to_numpy(dtype=np.float64) | ||
| time_ms = time_ms[np.isfinite(time_ms)] | ||
| if time_ms.size == 0: | ||
| self._log("warning", f"benchmark '{bm_name}' has no finite time samples; skipping.") | ||
| continue | ||
| # benchstats' renderer auto-scales assuming seconds, so expose seconds. | ||
| time_s = time_ms / 1e3 | ||
|
|
||
| thr_s, unit = None, None | ||
| if has_thr_col: | ||
| thr_s = group[_THR_COL].to_numpy(dtype=np.float64) | ||
| thr_s = thr_s[np.isfinite(thr_s) & (thr_s > 0)] | ||
| if thr_s.size == 0: | ||
| thr_s = None | ||
| else: | ||
| units = [u for u in group[_UNIT_COL].astype(str).unique() | ||
| if u and u.lower() != "nan"] if _UNIT_COL in df.columns else [] | ||
| unit = units[0] if len(units) == 1 else (_GENERIC_THR if units else None) | ||
|
|
||
| if self.debug_log and time_s.size < 10: | ||
| self._log( | ||
| "warning", | ||
| f"benchmark '{bm_name}' has only {time_s.size} samples (>= 10 recommended); " | ||
| "re-run with a larger --min-samples.", | ||
| ) | ||
| per_bm[bm_name] = {"time": time_s, "thr": thr_s, "unit": unit} | ||
|
|
||
| if not per_bm: | ||
| self._log("warning", f"no benchmarks read from '{self.file}'.") | ||
| return {} | ||
|
|
||
| # Decide on a uniform metric set across all benchmarks. | ||
| with_thr = {n: d for n, d in per_bm.items() if d["thr"] is not None} | ||
| thr_key = self._resolve_throughput_key(with_thr) | ||
|
|
||
| if thr_key is not None and 0 < len(with_thr) < len(per_bm): | ||
| dropped = sorted(set(per_bm) - set(with_thr)) | ||
| self._log( | ||
| "warning", | ||
| f"excluding {len(dropped)} benchmark(s) without throughput from the comparison so " | ||
| f"throughput can be shown uniformly: {', '.join(dropped)}", | ||
| ) | ||
| per_bm = with_thr | ||
|
|
||
| # Build the result, honoring an explicit metric request if given. | ||
| stats = {} | ||
| for bm_name, d in per_bm.items(): | ||
| entry = {} | ||
| if self._metric_requested(_TIME_KEY, thr_key): | ||
| entry[_TIME_KEY] = d["time"] | ||
| if thr_key is not None and d["thr"] is not None and self._metric_requested(thr_key, thr_key): | ||
| entry[thr_key] = d["thr"] | ||
| if entry: | ||
| stats[bm_name] = entry | ||
| return stats | ||
|
|
||
| def _resolve_throughput_key(self, with_thr): | ||
| """Return a single throughput metric key shared by all throughput-bearing benchmarks.""" | ||
| if not with_thr: | ||
| return None | ||
| units = {d["unit"] for d in with_thr.values()} | ||
| if len(units) == 1: | ||
| return next(iter(units)) or _GENERIC_THR | ||
| return _GENERIC_THR # mixed units in one file (atypical) -> generic header | ||
|
|
||
| def _metric_requested(self, key, thr_key): | ||
| """Honor an explicit metrics= request (benchstats CLI), else expose everything.""" | ||
| if self._requested_metrics is None: | ||
| return True | ||
| req = self._requested_metrics | ||
| if key == _TIME_KEY: | ||
| return any(t in req for t in (_TIME_KEY, _TIME_COL, "time")) | ||
| # throughput: match the unit key, the generic name, or literal 'throughput' | ||
| return key in req or _GENERIC_THR in req or _THR_COL in req |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| # Extra dependencies for statistical benchmark comparison | ||
| # (compare_results.py --stats). benchstats pulls in rich, scipy and numpy. | ||
| benchstats>=3.4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
--statssounds a bit too generic, how about--statistical-testor so?