From 3e96c7e905892eda97e1227de0e516e3e4d08a6f Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Thu, 18 Jun 2026 18:45:53 +0000
Subject: [PATCH 1/4] Initial commit w/ interleaving and cold-caching

---
 benchmarks/asv/README.md              | 265 +++++++++++
 benchmarks/asv/__init__.py            |   0
 benchmarks/asv/asv.conf.json          |  16 +
 benchmarks/asv/bench_attention.py     | 102 +++++
 benchmarks/asv/bench_casting.py       | 100 +++++
 benchmarks/asv/bench_gemm.py          |  99 +++++
 benchmarks/asv/bench_gemm_fp8.py      | 104 +++++
 benchmarks/asv/bench_grouped_gemm.py  |  94 ++++
 benchmarks/asv/bench_normalization.py |  83 ++++
 benchmarks/asv/compare_results.py     | 143 ++++++
 benchmarks/asv/driver.py              | 613 ++++++++++++++++++++++++++
 benchmarks/asv/parser_TEasv.py        | 172 ++++++++
 benchmarks/asv/requirements.txt       |   3 +
 benchmarks/asv/run_benchmarks.sh      |  52 +++
 14 files changed, 1846 insertions(+)
 create mode 100644 benchmarks/asv/README.md
 create mode 100644 benchmarks/asv/__init__.py
 create mode 100644 benchmarks/asv/asv.conf.json
 create mode 100644 benchmarks/asv/bench_attention.py
 create mode 100644 benchmarks/asv/bench_casting.py
 create mode 100644 benchmarks/asv/bench_gemm.py
 create mode 100644 benchmarks/asv/bench_gemm_fp8.py
 create mode 100644 benchmarks/asv/bench_grouped_gemm.py
 create mode 100644 benchmarks/asv/bench_normalization.py
 create mode 100644 benchmarks/asv/compare_results.py
 create mode 100644 benchmarks/asv/driver.py
 create mode 100644 benchmarks/asv/parser_TEasv.py
 create mode 100644 benchmarks/asv/requirements.txt
 create mode 100755 benchmarks/asv/run_benchmarks.sh

diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md
new file mode 100644
index 000000000..eee8900e8
--- /dev/null
+++ b/benchmarks/asv/README.md
@@ -0,0 +1,265 @@
+# Benchmarks for TransformerEngine
+
+GPU microbenchmarks driven by `driver.py`. Results are written in
+[ASV (Air Speed Velocity)](https://asv.readthedocs.io/) JSON format so they
+can be browsed with `asv publish` / `asv preview`, but the `asv` CLI is **not**
+used to run benchmarks — `driver.py` runs everything in-process.
+
+## Prerequisites
+
+- TransformerEngine must already be built and installed in the current Python environment.
+- A ROCm or CUDA GPU must be available.
+- `asv` is only required if you want the HTML dashboard (`pip install asv`).
+
+## Running benchmarks
+
+Each `bench_*.py` file is directly executable, or you can drive them through
+`driver.py`. Results are saved to `benchmarks/.asv/results/` in ASV-compatible
+format by default.
+
+```bash
+cd benchmarks/asv
+python driver.py --all                      # run every suite
+python driver.py bench_gemm                 # run one suite via driver
+python bench_gemm.py                        # run one suite directly
+python bench_gemm.py time_forward           # filter to a specific method
+python bench_gemm.py -w 5 -n 20             # custom warmup/iteration counts
+python bench_casting.py --no-save           # skip saving results
+python bench_casting.py --cold-cache        # flush cache before each sample
+python bench_gemm.py --inner 50             # fix inner-loop count to 50
+python bench_gemm.py --target-window-ms 5   # tune inner so each window >=5 ms
+```
+
+### Timing model: inner loop and cache state
+
+Each `time_*` method runs the kernel `_inner` times inside a single CUDA event
+window and divides by `_inner`, so kernel-launch and CUDA-event jitter
+(`~0.5 µs` resolution on AMD) are amortized. By default the driver
+**auto-tunes** `_inner` per (combo, method) so each window lasts at least
+`--target-window-ms` (default `1.0 ms`):
+
+| Flag | Effect |
+|---|---|
+| `--inner auto` (default) | Probe a single invocation, then pick `_inner` so the next timed window lasts ≥ `--target-window-ms`. Capped at 10000. |
+| `--inner N` | Force a fixed `_inner = N` (overrides auto-tune). |
+| `--target-window-ms T` | Target window duration for `--inner auto` (default `1.0`). |
+| `--cold-cache` | Write to a scratch buffer of `--cache-flush-mb` megabytes before each sample to evict L2 + Infinity Cache. Implies `--inner=1` (otherwise iterations 2..N would run against the cache refilled by iteration 1 and the measurement degenerates back to warm-cache). |
+| `--cache-flush-mb M` | Scratch buffer size for `--cold-cache` (default `256`, sized for the MI300 Infinity Cache). |
+
+Choose the regime that matches the question you're asking:
+- **Warm cache, large `_inner`** (default): steady-state kernel throughput,
+  matches what a hot inner loop in a model sees. Lowest variance.
+- **Cold cache, `_inner=1`**: realistic cost of the kernel as an isolated
+  call into cold memory — closer to what `rocprofv3 --hip-trace` reports
+  on a freshly launched kernel. Higher variance; bandwidth-bound
+  benchmarks (cast, normalization) typically run 1.5–3× slower than warm.
+
+Caveat: the inner loop runs in Python, so each iteration carries
+~80–200 ns of interpreter overhead. For sub-microsecond kernels this is
+not removable without CUDA graph capture; pick `--inner` deliberately
+in that regime or use the cold-cache mode.
+
+### Sample scheduling: interleaving
+
+By default the driver does **not** collect a benchmark's samples in one
+contiguous block. It samples in round-robin chunks: it sets up a group of
+`(method, combo)` benchmarks, then takes one sample from each per round, for
+`-n` rounds. This is on by default because *sequential* scheduling (all of A,
+then all of B) makes wall-clock time a proxy for benchmark identity — so any
+time-correlated GPU noise (thermal warm-up ramp, DVFS throttle, a neighbor
+container on a shared GPU) becomes a systematic **bias** between benchmarks
+rather than noise. The Monte-Carlo study in `repro/transient_noise_sim.py`
+quantifies it: under a 5% thermal ramp a sequential Brunner-Munzel comparison
+fires a false positive 86% of the time (α=0.05), and a 20% ramp can flip a real
+5% speedup into a reported regression. Round-robin sampling spreads every
+benchmark across the same window, so a transient lands on one sample of each
+instead of corrupting one benchmark's whole block.
+
+The per-round visit order is also **randomly permuted** each round (a balanced
+randomized design, not a global shuffle). Fixed round-robin would still pin each
+benchmark to a constant phase within the round — so a monotonic ramp leaves a
+small constant per-benchmark offset, and each benchmark always sees the same
+predecessor's cache/clock state. Re-permuting each round makes both uniform in
+expectation, turning that residual bias into variance. The shuffle is seeded
+(`--seed`, default `0`) so runs stay reproducible.
+
+| Flag | Effect |
+|---|---|
+| `--interleave-group N` (default `8`) | Number of benchmarks sampled round-robin together. Each keeps a live GPU instance for the duration of the chunk, so **lower this if a group runs out of memory**; raise it to share the time window across more benchmarks. |
+| `--sequential` | Collect each benchmark's samples contiguously (≡ `--interleave-group 1`). Lowest memory, but biased under thermal drift — use only for quick local runs. |
+| `--seed S` (default `0`) | Seed for the per-round shuffle, fixed so runs are reproducible. |
+| `--no-shuffle` | Use a fixed round-robin order instead of permuting each round. Leaves a small residual ordering/predecessor bias; mainly for debugging. |
+
+Caveat: interleaving removes *within-run* time-position bias. It does **not**
+remove a whole-run thermal offset between two **separately produced** result
+files (e.g. a cold baseline run vs. a warm candidate run). For the statistical
+comparison below, produce the baseline and candidate result files back-to-back
+under similar conditions.
+
+### Helper script
+
+`run_benchmarks.sh` wraps common tasks and can be run from anywhere.
+
+```bash
+bash benchmarks/asv/run_benchmarks.sh <command> [options]
+```
+
+| Command | Description |
+|---|---|
+| `run [suite] [method]` | Run benchmarks in-process (saves ASV-compatible results) |
+| `view` | Build the ASV HTML dashboard from saved results and serve it on `localhost:8080` |
+| `list` | List available benchmark suites |
+| `compare BASE CAND` | Statistically compare two result JSONs (exits 1 on a significant regression) |
+
+## How results are stored
+
+ASV-format JSON files under `benchmarks/.asv/results/`:
+
+```
+benchmarks/.asv/results/
+  my-machine-name/
+    machine.json               # Hardware/OS metadata (auto-generated by driver)
+    <commit-hash>-<env>.json   # Timing results for that commit
+    <commit-hash>-<env>.json
+    ...
+```
+
+Each commit JSON contains the per-call timings (CUDA-event time returned by the `time_*`
+methods, not wall time) for every benchmark + parameter combination run on that machine,
+including the raw per-call samples (the ASV `samples` column) used by
+`compare_results.py`. The `benchmarks/.asv/` directory is in `.gitignore`.
+
+## Viewing results
+
+To browse historical results in a dashboard, point `asv` at the saved JSON:
+
+```bash
+bash benchmarks/asv/run_benchmarks.sh view
+# or, manually:
+asv publish --config benchmarks/asv/asv.conf.json
+asv preview --config benchmarks/asv/asv.conf.json
+```
+
+`asv.conf.json` exists only to support `publish` / `preview`; benchmarks
+themselves are not invoked through `asv`.
+
+## Comparing two checkouts statistically
+
+The dashboard plots point estimates (medians), which cannot tell a real
+regression from measurement noise. To test whether timing differences between
+two checkouts are statistically significant, the driver records the raw per-call
+samples in each result file (the ASV `samples` column), and `compare_results.py`
+compares them with a Brunner-Munzel test via the
+[benchstats](https://github.com/Arech/benchstats) package:
+
+```bash
+pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
+
+cd benchmarks/asv
+
+# baseline checkout — saves <baseline-hash>-<env>.json
+python driver.py --all -n 20
+# candidate checkout — saves <candidate-hash>-<env>.json
+python driver.py --all -n 20
+
+python compare_results.py \
+    ../.asv/results/<machine>/<baseline-hash>-<env>.json \
+    ../.asv/results/<machine>/<candidate-hash>-<env>.json
+```
+
+It prints a table marking each `(benchmark, parameter combination)` as faster
+(`<`), slower (`>`), or not significantly different (`~`), and exits `1` when a
+significant difference is found, so it can gate CI.
+
+By default the result filename is derived from the commit hash, so two runs on
+the **same** commit (e.g. prototyping against a dirty working tree, where `HEAD`
+is unchanged) would overwrite each other. Pass `--label` to fold a tag into the
+filename and keep them distinct:
+
+```bash
+python driver.py --all -n 20 --label base   # -> <hash>-base-<env>.json
+# ... edit code (HEAD stays the same) ...
+python driver.py --all -n 20 --label cand   # -> <hash>-cand-<env>.json
+
+python compare_results.py \
+    ../.asv/results/<machine>/<hash>-base-<env>.json \
+    ../.asv/results/<machine>/<hash>-cand-<env>.json
+```
+
+| Flag | Effect |
+|---|---|
+| `--alpha A` | Significance level for the test (default `0.001`). |
+| `--method M` | Statistical test to use (default `brunnermunzel`). |
+| `--filter REGEX` | Only compare benchmarks whose name matches `REGEX`. |
+| `--always-show-pvalues` | Show p-values for non-significant rows too. |
+| `--export-to FILE` | Save the report to a `.txt`/`.svg`/`.html` file. |
+
+The test is rank-based and needs a reasonable number of samples per benchmark
+(≥ ~10 recommended); the default `-n 20` timed iterations satisfies this. Only
+timing is tested — throughput (`TFLOPS`/`GB/s`) is a fixed per-call work divided
+by time, a monotonic transform, so a rank test on it gives the same verdict; the
+driver already prints throughput columns during a run.
+
+## Writing new benchmarks
+
+Create a new file in `benchmarks/asv/` following the naming convention `bench_<name>.py`.
+
+```python
+#!/usr/bin/env python3
+import torch
+import transformer_engine.pytorch as te
+
+class BenchSomething:
+    params = [[1024, 4096], ["config_a", "config_b"]]
+    param_names = ["M", "config"]
+    timeout = 300  # seconds, per parameter combination
+
+    # Driver overrides per (combo, method): _inner controls how many kernel
+    # invocations land in one CUDA event window; _scratch (when not None) is
+    # written to before each sample to evict the GPU cache.
+    _inner = 1
+    _scratch = None
+
+    def setup(self, M, config):
+        # Allocate tensors, create modules.
+        # This runs once per (combo, method); the same instance is reused for
+        # warmup and timed iterations.
+        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+        ...
+
+    def time_forward(self, M, config):
+        # Use CUDA events for accurate GPU timing.
+        # Return elapsed seconds per single invocation — the driver uses this
+        # instead of wall time. Looping inside the event window amortizes
+        # CUDA event resolution and kernel-launch overhead.
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)        # cold-cache mode
+        self._evt[0].record()
+        for _ in range(self._inner):
+            self.module(self.x)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
+
+    # Optional: define work_<name> to get throughput columns (TFLOPS / GB/s).
+    def work_forward(self, M, config):
+        return {"flops": 2 * M * self.N * self.K}   # compute-bound
+        # return {"bytes": M * self.hidden * 4}      # memory-bound
+
+if __name__ == "__main__":
+    from driver import run_as_main
+    run_as_main(__file__)
+```
+
+Key rules:
+- Method names starting with `time_` are automatically timed.
+- Use CUDA events and return elapsed seconds **per single invocation** —
+  divide the event delta by `self._inner` so the driver and the throughput
+  columns get per-call values regardless of inner-loop count.
+- Honor `self._inner` (loop the kernel) and `self._scratch` (write before
+  recording the start event) so the driver's `--inner` and `--cold-cache`
+  flags work for your benchmark.
+- Optionally define `work_<name>` companions to get TFLOPS or GB/s columns.
+  These return the per-call work, not per-window work.
+- Clear `.grad` attributes in backward benchmarks to prevent memory accumulation.
+- The `params` list defines a cross-product; keep the matrix size reasonable.
diff --git a/benchmarks/asv/__init__.py b/benchmarks/asv/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmarks/asv/asv.conf.json b/benchmarks/asv/asv.conf.json
new file mode 100644
index 000000000..3c1616aac
--- /dev/null
+++ b/benchmarks/asv/asv.conf.json
@@ -0,0 +1,16 @@
+{
+    "version": 1,
+    "project": "TransformerEngine",
+    "project_url": "https://github.com/ROCm/TransformerEngine",
+    "repo": "../..",
+    "branches": ["HEAD"],
+    "environment_type": "existing",
+    "install_command": [],
+    "build_command": [],
+    "benchmark_dir": ".",
+    "results_dir": "../.asv/results",
+    "html_dir": "../.asv/html",
+    "install_timeout": 600,
+    "benchmark_timeout": 1200,
+    "launch_method": "spawn"
+}
diff --git a/benchmarks/asv/bench_attention.py b/benchmarks/asv/bench_attention.py
new file mode 100644
index 000000000..df6314c43
--- /dev/null
+++ b/benchmarks/asv/bench_attention.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Attention micro-benchmark using te.DotProductAttention.
+
+Benchmarks fused multi-head attention (with flash attention backend) for
+model configurations with grouped-query attention (GQA).
+
+Models:
+  - Llama 3   8B (TP=1, TP=8), 70B (TP=8), 405B (TP=8)
+  - Qwen 2.5  7B (TP=1), 72B (TP=8)
+
+Forward FLOPs = 4 * batch * num_q_heads * seq_len^2 * head_dim
+  (two matmuls: Q@K^T and attn@V, each contributing 2*b*h*s^2*d)
+Backward FLOPs = 2 * Forward FLOPs (approximately)
+
+Sources for model configs:
+  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+BATCH = 2  # batch size: dim 1 of the q/k/v tensors built in setup()
+
+# model name -> (num_q_heads, num_kv_heads, head_dim, tp); setup() divides both head counts by tp
+MODELS = {
+    "Llama3-8B_TP1":   (32, 8, 128, 1),
+    "Llama3-8B_TP8":   (32, 8, 128, 8),
+    "Llama3-70B_TP8":  (64, 8, 128, 8),
+    "Llama3-405B_TP8": (128, 8, 128, 8),
+    "Qwen2.5-7B_TP1":  (28, 4, 128, 1),
+    "Qwen2.5-72B_TP8": (64, 8, 128, 8),
+}
+
+
+class BenchAttention:
+    """Time te.DotProductAttention (causal mask, GQA) forward and forward+backward in BF16."""
+    params = [[1024, 2048, 4096, 8192], list(MODELS)]
+    param_names = ["seq_len", "model"]
+    timeout = 300
+    _inner = 1  # kernel invocations per CUDA event window
+    _scratch = None  # when not None, fill_()ed before each timed window
+
+    def setup(self, seq_len, model):
+        """Create the attention module, q/k/v inputs, grad_out and the two timing events."""
+        n_q, n_kv, hd, tp = MODELS[model]
+        qh, kvh = n_q // tp, n_kv // tp  # head counts divided by tp
+        dtype = torch.bfloat16
+        self.attn = te.DotProductAttention(
+            num_attention_heads=qh, kv_channels=hd,
+            num_gqa_groups=kvh, attn_mask_type="causal",
+        ).to(device="cuda", dtype=dtype)
+        self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v))
+        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+
+    def work_forward(self, seq_len, model):
+        """Per-call forward FLOPs: 4 * BATCH * q_heads * seq_len^2 * head_dim."""
+        n_q, _, hd, tp = MODELS[model]
+        return {"flops": 4 * BATCH * (n_q // tp) * seq_len * seq_len * hd}
+
+    def work_forward_backward(self, seq_len, model):
+        """Per-call forward+backward FLOPs, counted as 3x forward (backward ~ 2x forward)."""
+        n_q, _, hd, tp = MODELS[model]
+        return {"flops": 3 * 4 * BATCH * (n_q // tp) * seq_len * seq_len * hd}
+
+    def time_forward(self, seq_len, model):  # returns seconds per single forward call
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)  # cold-cache mode
+        self._evt[0].record()
+        for _ in range(self._inner):
+            self.attn(self.q, self.k, self.v)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner  # ms -> s per call
+
+    def time_forward_backward(self, seq_len, model):  # seconds per forward+backward call
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)  # cold-cache mode
+        self._evt[0].record()
+        for _ in range(self._inner):
+            out = self.attn(self.q, self.k, self.v)
+            out.backward(self.grad_out)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        self.q.grad = self.k.grad = self.v.grad = None  # drop grads accumulated by the loop
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner  # ms -> s per call
+
+if __name__ == "__main__":  # allows `python bench_attention.py [args]`
+    from driver import run_as_main
+    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_casting.py b/benchmarks/asv/bench_casting.py
new file mode 100644
index 000000000..713aa498e
--- /dev/null
+++ b/benchmarks/asv/bench_casting.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Benchmarks quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) for
+both E4M3 (activations/weights) and E5M2 (gradients) formats.
+
+Shapes are (M, hidden_size) matching the activation tensors from models:
+  - Llama 3.1 8B, 70B, 405B
+  - Qwen 2.5  7B, 72B
+
+These casts are memory-bound; we report GB/s (input + output bytes).
+
+Sources for model configs:
+  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+"""
+
+import torch
+from transformer_engine.pytorch import Float8CurrentScalingQuantizer
+from transformer_engine_torch import DType as TE_DType
+
+HIDDEN_SIZES = {  # model name -> hidden size (second dim of the cast tensor)
+    "Llama3-8B": 4096,
+    "Llama3-70B": 8192,
+    "Llama3-405B": 16384,
+    "Qwen2.5-7B": 3584,
+    "Qwen2.5-72B": 8192,
+}
+
+CAST_CONFIGS = {  # cast name -> (direction, FP8 dtype given to the quantizer)
+    "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3),
+    "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3),
+    "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2),
+    "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2),
+}
+
+
+class BenchCasting:
+    """Time BF16 -> FP8 quantization and FP8 -> BF16 dequantization of an (M, hidden) tensor."""
+    params = [[1024, 2048, 4096, 8192], list(HIDDEN_SIZES), list(CAST_CONFIGS)]
+    param_names = ["M", "model", "cast"]
+    timeout = 120
+    # Driver overrides these per (combo, method): _inner is the number of
+    # kernel invocations per CUDA event window (amortizes launch overhead);
+    # _scratch, when not None, is fill_()ed before each sample to evict the
+    # GPU cache.
+    _inner = 1
+    _scratch = None
+
+    def setup(self, M, model, cast):
+        """Prepare the input (BF16, or pre-quantized FP8 for dequantize) and timing events."""
+        hidden = HIDDEN_SIZES[model]
+        direction, fp8_dtype = CAST_CONFIGS[cast]
+        self.direction = direction
+        quantizer = Float8CurrentScalingQuantizer(
+            fp8_dtype=fp8_dtype,
+            device=torch.device("cuda"),
+            rowwise=True,
+            columnwise=False,
+        )
+        if direction == "dequantize":
+            bf16_tensor = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self.x = quantizer.quantize(bf16_tensor)
+        else:
+            self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self.quantizer = quantizer
+        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+
+    def work_cast(self, M, model, cast):
+        """Per-call bytes moved; the same for both directions (2 B BF16 + 1 B FP8 per element)."""
+        # quantize:   read BF16 (2B) + write FP8 (1B)
+        # dequantize: read FP8 (1B) + write BF16 (2B)
+        # The scale read/write is not included in the count.
+        return {"bytes": M * HIDDEN_SIZES[model] * 3}
+
+    def time_cast(self, M, model, cast):
+        """Return seconds per single cast, averaged over `_inner` calls in one event window."""
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)
+        self._evt[0].record()
+        if self.direction == "quantize":
+            for _ in range(self._inner):
+                self.quantizer.quantize(self.x)
+        else:
+            for _ in range(self._inner):
+                self.x.dequantize(dtype=torch.bfloat16)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
+
+if __name__ == "__main__":  # allows `python bench_casting.py [args]`
+    from driver import run_as_main
+    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_gemm.py b/benchmarks/asv/bench_gemm.py
new file mode 100644
index 000000000..b1ad40f99
--- /dev/null
+++ b/benchmarks/asv/bench_gemm.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""BF16 GEMM benchmarks via te.Linear.
+
+GEMM shapes derived from transformer layer projections:
+  QKV, AttnOut, GateUp (SwiGLU), Down.
+
+Model configuration sources:
+- Llama 3 8B (hidden=4096, intermediate=14336, heads=32, kv_heads=8, head_dim=128)
+  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+
+- Llama 3 70B (hidden=8192, intermediate=28672, heads=64, kv_heads=8, head_dim=128)
+  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+
+- Llama 3 405B (hidden=16384, intermediate=53248, heads=128, kv_heads=8, head_dim=128)
+  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+
+- Qwen 2.5 7B (hidden=3584, intermediate=18944, heads=28, kv_heads=4, head_dim=128)
+  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+
+- Qwen 2.5 72B (hidden=8192, intermediate=29568, heads=64, kv_heads=8, head_dim=128)
+  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+  """
+
+import torch
+import transformer_engine.pytorch as te
+
+# model name -> (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp)
+MODELS = {
+    "Llama3-8B_TP1":   (4096, 14336, 32, 8, 128, 1),
+    "Llama3-8B_TP8":   (4096, 14336, 32, 8, 128, 8),
+    "Llama3-70B_TP8":  (8192, 28672, 64, 8, 128, 8),
+    "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8),
+    "Qwen2.5-7B_TP1":  (3584, 18944, 28, 4, 128, 1),
+    "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8),
+}
+
+# Pre-compute (N, K) for each GEMM shape; setup() builds te.Linear(K, N), one dim is divided by tp
+SHAPES = {}
+for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items():
+    SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h)  # N = (q + 2*kv heads)*hd / tp
+    SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp)  # K = q heads * hd / tp
+    SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h)  # N = 2 * intermediate / tp
+    SHAPES[f"{_name}-Down"] = (h, inter // tp)  # K = intermediate / tp
+
+
+class BenchGemm:  # BF16 te.Linear (no bias): forward and forward+backward timings
+    params = [[1024, 2048, 4096, 8192], list(SHAPES)]
+    param_names = ["M", "shape"]
+    timeout = 300
+    _inner = 1  # kernel invocations per CUDA event window
+    _scratch = None  # when not None, fill_()ed before each timed window
+
+    def setup(self, M, shape):  # build the layer, input, grad_out and timing events
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.linear(self.x))  # one forward to get the output shape
+        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+
+    def work_forward(self, M, shape):  # per-call FLOPs of one forward GEMM
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):  # counted as 3x the forward FLOPs
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def time_forward(self, M, shape):  # returns seconds per single forward call
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)  # cold-cache mode
+        self._evt[0].record()
+        for _ in range(self._inner):
+            self.linear(self.x)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner  # ms -> s per call
+
+    def time_forward_backward(self, M, shape):  # seconds per forward+backward call
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)  # cold-cache mode
+        self._evt[0].record()
+        for _ in range(self._inner):
+            out = self.linear(self.x)
+            out.backward(self.grad_out)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        self.x.grad = None  # drop grads accumulated by the loop
+        self.linear.weight.grad = None
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner  # ms -> s per call
+
+if __name__ == "__main__":  # allows `python bench_gemm.py [args]`
+    from driver import run_as_main
+    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_gemm_fp8.py b/benchmarks/asv/bench_gemm_fp8.py
new file mode 100644
index 000000000..8728695e4
--- /dev/null
+++ b/benchmarks/asv/bench_gemm_fp8.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+FP8 GEMM benchmarks via te.Linear under fp8_autocast.
+
+Same shapes as bench_gemm.py but with FP8 quantized compute:
+  - Llama 3   8B (TP=1, TP=8), 70B (TP=8), 405B (TP=8)
+  - Qwen 2.5  7B (TP=1), 72B (TP=8)
+
+Each model contributes four GEMM shapes:
+  QKV projection     (column-parallel)  N = (Qheads + 2*KVheads)*head_dim / TP, K = hidden
+  Attention output   (row-parallel)     N = hidden, K = Qheads*head_dim / TP
+  MLP Gate+Up        (column-parallel)  N = 2*intermediate / TP, K = hidden  (SwiGLU)
+  MLP Down           (row-parallel)     N = hidden, K = intermediate / TP
+
+Sources for model configs:
+  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+"""
+
+import torch
+import transformer_engine.pytorch as te
+from transformer_engine.common.recipe import DelayedScaling, Format
+
+# model name -> (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp)
+MODELS = {
+    "Llama3-8B_TP1":   (4096, 14336, 32, 8, 128, 1),
+    "Llama3-8B_TP8":   (4096, 14336, 32, 8, 128, 8),
+    "Llama3-70B_TP8":  (8192, 28672, 64, 8, 128, 8),
+    "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8),
+    "Qwen2.5-7B_TP1":  (3584, 18944, 28, 4, 128, 1),
+    "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8),
+}
+
+SHAPES = {}  # shape name -> (N, K); setup() builds te.Linear(K, N)
+for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items():
+    SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h)  # column-parallel
+    SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp)  # row-parallel
+    SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h)  # column-parallel (SwiGLU)
+    SHAPES[f"{_name}-Down"] = (h, inter // tp)  # row-parallel
+
+FP8_RECIPE = DelayedScaling(  # recipe passed to every fp8_autocast in this module
+    fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max",
+)
+
+
+class BenchGemmFP8:  # te.Linear (no bias) under fp8_autocast: forward and forward+backward
+    params = [[1024, 2048, 4096, 8192], list(SHAPES)]
+    param_names = ["M", "shape"]
+    timeout = 300
+    _inner = 1  # kernel invocations per CUDA event window
+    _scratch = None  # when not None, fill_()ed before each timed window
+
+    def setup(self, M, shape):  # build the layer, input, grad_out and timing events
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda")  # gradient fed to backward()
+        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+
+    def work_forward(self, M, shape):  # per-call FLOPs of one forward GEMM
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):  # counted as 3x the forward FLOPs
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def time_forward(self, M, shape):  # returns seconds per single forward call
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)  # cold-cache mode
+        self._evt[0].record()
+        for _ in range(self._inner):
+            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):  # one scope per call
+                self.linear(self.x)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner  # ms -> s per call
+
+    def time_forward_backward(self, M, shape):  # seconds per forward+backward call
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)  # cold-cache mode
+        self._evt[0].record()
+        for _ in range(self._inner):
+            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):  # one scope per call
+                out = self.linear(self.x)
+            out.backward(self.grad_out)  # backward runs outside the autocast scope
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        self.x.grad = None  # drop grads accumulated by the loop
+        self.linear.weight.grad = None
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner  # ms -> s per call
+
+if __name__ == "__main__":  # allows `python bench_gemm_fp8.py [args]`
+    from driver import run_as_main
+    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_grouped_gemm.py b/benchmarks/asv/bench_grouped_gemm.py
new file mode 100644
index 000000000..199f651c6
--- /dev/null
+++ b/benchmarks/asv/bench_grouped_gemm.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Grouped GEMM benchmarks via te.GroupedLinear.
+
+MoE model configurations with GateUp and Down projections.
+Configurations are based on:
+https://github.com/AMD-AGI/Primus-Turbo/blob/main/benchmark/ops/config.py
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+# (n_routed_experts, moe_intermediate_size, hidden_size)
+MOE_MODELS = {
+    "DSV2-Lite": (64, 1408, 2048),
+    "DSV2":      (160, 1536, 5120),
+    "DSV3":      (256, 2048, 7168),
+    "Grok-V2":   (8, 16384, 8192),
+}
+
+# Build (config_key -> (num_gemms, N, K)) mapping
+CONFIGS = {}
+for model, (n_experts, inter, hidden) in MOE_MODELS.items():
+    for ep in [32, 16, 8]:
+        if n_experts % ep != 0:
+            continue
+        B = n_experts // ep
+        CONFIGS[f"{model}_EP{ep}-GateUp"] = (B, 2 * inter, hidden)
+        CONFIGS[f"{model}_EP{ep}-Down"] = (B, hidden, inter)
+
+
+class BenchGroupedGemm:
+    """bf16 te.GroupedLinear forward / forward+backward over the MoE CONFIGS shapes."""
+    params = [[512, 1024, 2048, 4096], list(CONFIGS)]
+    param_names = ["M", "config"]
+    timeout = 300
+    _inner = 1
+    _scratch = None
+
+    def setup(self, M, config):
+        B, N, K = CONFIGS[config]
+        dtype = torch.bfloat16
+
+        self.module = te.GroupedLinear(
+            num_gemms=B, in_features=K, out_features=N, bias=False,
+        ).to(device="cuda", dtype=dtype)
+
+        # GroupedLinear.forward(inp, m_splits) takes ONE tensor holding every
+        # group's rows plus the per-group row counts, and returns one tensor.
+        self.m_splits = [M] * B
+        self.x = torch.randn(B * M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn(B * M, N, dtype=dtype, device="cuda")
+        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+
+    def work_forward(self, M, config):
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 2 * M * N * K}
+
+    def work_forward_backward(self, M, config):
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 3 * 2 * M * N * K}
+
+    def time_forward(self, M, config):
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)
+        self._evt[0].record()
+        for _ in range(self._inner):
+            self.module(self.x, self.m_splits)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
+
+    def time_forward_backward(self, M, config):
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)
+        self._evt[0].record()
+        for _ in range(self._inner):
+            out = self.module(self.x, self.m_splits)
+            out.backward(self.grad_out)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        # Drop accumulated gradients so they do not carry over into the next sample.
+        self.x.grad = None
+        for p in self.module.parameters():
+            p.grad = None
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
+
+if __name__ == "__main__":
+    from driver import run_as_main
+    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_normalization.py b/benchmarks/asv/bench_normalization.py
new file mode 100644
index 000000000..2b3608bac
--- /dev/null
+++ b/benchmarks/asv/bench_normalization.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+RMSNorm and LayerNorm benchmarks on activation-sized tensors.
+
+Shapes are derived from training workloads:
+  - Llama 3   8B, 70B, 405B (all use RMSNorm)
+  - Qwen 2.5  7B, 72B       (all use RMSNorm)
+
+Modern models predominantly use RMSNorm, but we benchmark both
+LayerNorm and RMSNorm since TE supports both and they share the
+same kernel infrastructure.
+
+The M dimension (batch * seq_len) is swept across typical training sizes.
+
+Sources for model configs:
+  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm}
+HIDDEN_SIZES = [3584, 4096, 8192, 16384]
+
+
+class BenchNormalization:
+    params = [[1024, 2048, 4096, 8192], HIDDEN_SIZES, list(NORMS)]
+    param_names = ["M", "hidden", "norm_type"]
+    timeout = 120
+    _inner = 1
+    _scratch = None
+
+    def setup(self, M, hidden, norm_type):
+        dtype = torch.bfloat16
+        self.norm = NORMS[norm_type](hidden).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, hidden, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.norm(self.x))
+        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+
+    def work_forward(self, M, hidden, norm_type):
+        # Read input (2B) + write output (2B) = 4 bytes per element
+        return {"bytes": M * hidden * 4}
+
+    def work_forward_backward(self, M, hidden, norm_type):
+        # Fwd: read+write (4B), Bwd: read input+grad_out+write grad_in (6B) = 10B
+        return {"bytes": M * hidden * 10}
+
+    def time_forward(self, M, hidden, norm_type):
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)
+        self._evt[0].record()
+        for _ in range(self._inner):
+            self.norm(self.x)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
+
+    def time_forward_backward(self, M, hidden, norm_type):
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)
+        self._evt[0].record()
+        for _ in range(self._inner):
+            out = self.norm(self.x)
+            out.backward(self.grad_out)
+        self._evt[1].record()
+        torch.cuda.synchronize()
+        self.x.grad = None
+        for p in self.norm.parameters():
+            p.grad = None
+        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
+
+if __name__ == "__main__":
+    from driver import run_as_main
+    run_as_main(__file__)
diff --git a/benchmarks/asv/compare_results.py b/benchmarks/asv/compare_results.py
new file mode 100644
index 000000000..c1313e1a2
--- /dev/null
+++ b/benchmarks/asv/compare_results.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Statistically compare two ASV result JSON files written by ``driver.py``.
+
+The point-estimate timings in the ASV dashboard cannot tell a real regression
+from measurement noise. This tool compares the raw per-call samples stored in
+two result files (one per checkout) using a statistical test (Brunner-Munzel by
+default) via the benchstats package. Each (benchmark, parameter combination) is
+marked ``<`` (baseline lower: candidate slower), ``>`` (candidate faster), or
+``~`` (not significantly different); the exit code is ``1`` when any significant
+timing difference is found, so it can gate CI. A summary reports how many were
+significantly faster, slower, or unchanged. Requires ``pip install -r requirements.txt``.
+
+Usage:
+    # run the suite on the baseline checkout, then on the candidate checkout,
+    # pointing each at its own results file, then:
+    python compare_results.py baseline.json candidate.json
+    python compare_results.py baseline.json candidate.json --alpha 0.01
+    python compare_results.py baseline.json candidate.json --export-to report.svg
+"""
+
+import argparse
+import os
+import sys
+
+
+def run_stats(args):
+    """Compare two ASV result JSONs with a statistical test via benchstats.
+
+    Returns a process exit code: 1 if a significant difference is found in the
+    timing metric, else 0.
+    """
+    import rich.table  # noqa: F401  benchstats 3.4.0 render uses rich.table.Table without importing it
+    from parser_TEasv import parser_TEasv
+    from benchstats.compare import compareStats
+    from benchstats.render import renderComparisonResults
+    from benchstats.common import LoggingConsole, detectExportFormat
+
+    main_metrics = ["time_s"]
+
+    export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None
+    if export_fmt is not None and os.path.isfile(args.export_to):
+        os.remove(args.export_to)
+
+    console = LoggingConsole(
+        record=export_fmt is not None,
+        log_level=LoggingConsole.LogLevel.Warning,
+    )
+
+    s1 = parser_TEasv(args.baseline_json, args.filter, None, debug_log=console).getStats()
+    s2 = parser_TEasv(args.candidate_json, args.filter, None, debug_log=console).getStats()
+
+    cr = compareStats(
+        s1, s2,
+        method=args.method,
+        alpha=args.alpha,
+        main_metrics=main_metrics,
+        debug_log=console,
+    )
+
+    renderComparisonResults(
+        cr, console,
+        main_metrics=main_metrics,
+        always_show_pvalues=args.always_show_pvalues,
+    )
+
+    # Tally significant results per direction for the timing metric. benchstats
+    # encodes the outcome of each comparison as set0-vs-set1: "<" means baseline
+    # < candidate (candidate's time is higher -> slower / a regression), ">"
+    # means baseline > candidate (candidate faster / a speedup), "~" means not
+    # significant at alpha. Printed via the console so it is captured by export.
+    for metric in main_metrics:
+        counts = {"<": 0, ">": 0, "~": 0}
+        for bm_res in cr.results.values():
+            res = bm_res.get(metric)
+            if res is not None:
+                counts[res.result] = counts.get(res.result, 0) + 1
+        total = counts["<"] + counts[">"] + counts["~"]
+        console.print(
+            f"\nSummary for '{metric}' ({cr.method}, alpha={cr.alpha:g}, "
+            f"{total} benchmarks):"
+        )
+        console.print(f"  candidate faster (significant, '>'): {counts['>']}")
+        console.print(f"  candidate slower (significant, '<'): {counts['<']}")
+        console.print(f"  no significant difference ('~'):     {counts['~']}")
+
+    if export_fmt is not None:
+        if export_fmt == "txt":
+            console.save_text(args.export_to)
+        elif export_fmt == "svg":
+            console.save_svg(args.export_to, title="")
+        elif export_fmt == "html":
+            console.save_html(args.export_to)
+
+    if cr.at_least_one_differs:
+        console.warning(
+            "At least one significant timing difference was detected (exit 1)."
+        )
+        return 1
+    return 0
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Statistically compare two ASV result JSON files via benchstats.")
+    parser.add_argument("baseline_json", help="Baseline ASV result JSON")
+    parser.add_argument("candidate_json", help="Candidate ASV result JSON")
+    parser.add_argument(
+        "--filter", default=None,
+        help="Only compare benchmarks whose name matches this regex.",
+    )
+    parser.add_argument(
+        "--alpha", type=float, default=0.001,
+        help="Significance level for the test (default: 0.001).",
+    )
+    parser.add_argument(
+        "--method", default="brunnermunzel",
+        help="Statistical test to use (default: brunnermunzel).",
+    )
+    parser.add_argument(
+        "--always-show-pvalues", action="store_true",
+        help="Always show p-values, including for non-significant results.",
+    )
+    parser.add_argument(
+        "--export-to", default=None, metavar="FILE",
+        help="Export the report to a .txt/.svg/.html file (format from extension).",
+    )
+    args = parser.parse_args()
+
+    # The benchstats parser is imported lazily from the script directory.
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    if script_dir not in sys.path:
+        sys.path.insert(0, script_dir)
+
+    return run_stats(args)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/asv/driver.py b/benchmarks/asv/driver.py
new file mode 100644
index 000000000..52abcda64
--- /dev/null
+++ b/benchmarks/asv/driver.py
@@ -0,0 +1,613 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""ASV benchmark driver — runs bench classes in-process and saves ASV-compatible results.
+
+Usage:
+    python driver.py <suite> [method_filter] [-w W] [-n N] [--no-save]
+    python driver.py --all [-w W] [-n N] [--no-save]
+    python bench_gemm.py [method_filter] [-w W] [-n N] [--no-save]
+"""
+
+import argparse
+import glob
+import hashlib
+import importlib
+import inspect
+import itertools
+import json
+import os
+import platform
+import random
+import re
+import subprocess
+import sys
+import textwrap
+import time
+import numpy as np
+
+
+# ---------------------------------------------------------------------------
+# ASV result generation
+# ---------------------------------------------------------------------------
+
+def _get_benchmark_code_and_version(cls, method_name):
+    """Return ``(code, version)`` for one benchmark, built the way ASV builds them.
+
+    *code* is the class header line followed by the source of the timed method
+    and then of ``setup``, each dedented and re-indented by four spaces, with
+    trailing newlines stripped.
+    *version* is the hex SHA-256 digest of *code*, so any edit to the source
+    text of either method changes the version.
+    """
+    def _member_source(name):
+        # Source of cls.<name>: dedented, then indented by four spaces.
+        return textwrap.indent(textwrap.dedent(inspect.getsource(getattr(cls, name))), "    ")
+
+    time_src, setup_src = _member_source(method_name), _member_source("setup")
+    code = f"class {cls.__name__}:\n{time_src}\n{setup_src}".rstrip("\n")
+    version = hashlib.sha256(code.encode()).hexdigest()
+    return code, version
+
+
+def _format_param_value(v):
+    """Render one parameter value as the string ASV stores in its JSON."""
+    # Strings are wrapped in single quotes verbatim (no escaping); every other
+    # value is rendered with repr().
+    return f"'{v}'" if isinstance(v, str) else repr(v)
+
+
+def _get_machine_info():
+    """Return ``(machine_name, info)``; *info* is the params/machine dict ASV expects.
+
+    ``cpu`` and ``ram`` are filled from ``/proc`` when readable, else left as "".
+    """
+    def _first_line_startingwith(path, prefix):
+        with open(path) as f:
+            return next((ln for ln in f if ln.startswith(prefix)), None)
+
+    machine = platform.node()
+    info = {
+        "arch": platform.machine(), "cpu": "", "machine": machine,
+        "num_cpu": str(os.cpu_count()),
+        "os": f"{platform.system()} {platform.release()}", "ram": "",
+    }
+    try:
+        cpu_line = _first_line_startingwith("/proc/cpuinfo", "model name")
+        if cpu_line is not None:
+            info["cpu"] = cpu_line.split(":", 1)[1].strip()
+        mem_line = _first_line_startingwith("/proc/meminfo", "MemTotal")
+        if mem_line is not None:
+            info["ram"] = mem_line.split()[1]  # kB
+    except OSError:
+        pass
+    return machine, info
+
+
+def _get_commit_hash():
+    """Return the git HEAD hash of the checkout holding this file, or "unknown"."""
+    here = os.path.dirname(os.path.abspath(__file__))  # not the caller's cwd
+    try:
+        out = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=here, stderr=subprocess.DEVNULL)
+        return out.decode().strip()
+    except Exception:  # best-effort: git missing, not a repository, ...
+        return "unknown"
+
+
+def _compute_stats(samples):
+    """Summarize *samples* as (median, mean, stdev, ci_lo, ci_hi, q25, q75).
+
+    The quantiles use numpy's default linear interpolation, stdev is the
+    population standard deviation (ddof=0), and [ci_lo, ci_hi] is the
+    normal-approximation 99% interval around the mean with ci_lo clamped at 0.
+    """
+    arr = np.asarray(samples, dtype=np.float64)
+    mean, stdev = float(arr.mean()), float(arr.std(ddof=0))
+    q25, median, q75 = map(float, np.quantile(arr, [0.25, 0.5, 0.75]))
+    half_width = 2.576 * stdev / np.sqrt(arr.size)  # 99% normal-approx half-width
+    ci_lo, ci_hi = max(0.0, mean - half_width), mean + half_width
+    return median, mean, stdev, ci_lo, ci_hi, q25, q75
+
+
+def _get_results_dir():
+    """Return asv.conf.json's ``results_dir`` as a normalized absolute path."""
+    # The config sits next to this file; results_dir is resolved against that directory.
+    conf_dir = os.path.dirname(os.path.abspath(__file__))
+    with open(os.path.join(conf_dir, "asv.conf.json")) as f:
+        results_dir = json.load(f)["results_dir"]
+    return os.path.normpath(os.path.join(conf_dir, results_dir))
+
+
+def save_asv_results(all_results, bench_meta, label=None):
+    """Write results and benchmark index to ASV's results directory.
+
+    *label*, when given, is folded into the result filename so multiple runs on
+    the same commit (e.g. prototyping with a dirty working tree, where the HEAD
+    hash is unchanged) land in distinct files that ``compare_results.py`` can
+    compare instead of overwriting each other.
+    """
+    commit_hash = _get_commit_hash()
+    machine_name, machine_info = _get_machine_info()
+    env_name = "existing-" + sys.executable.replace("/", "_").strip("_")
+    results_dir = _get_results_dir()
+    machine_dir = os.path.join(results_dir, machine_name)
+    os.makedirs(machine_dir, exist_ok=True)
+
+    # Write machine.json if missing
+    machine_json = os.path.join(machine_dir, "machine.json")
+    if not os.path.exists(machine_json):
+        with open(machine_json, "w") as f:
+            json.dump({**machine_info, "version": 1}, f, indent=4)
+
+    # Load existing result file or start fresh. A label is sanitized to keep the
+    # filename safe (no path separators / whitespace) and inserted after the hash.
+    if label:
+        safe_label = re.sub(r"[^A-Za-z0-9._-]+", "_", label).strip("_")
+        filename = f"{commit_hash[:8]}-{safe_label}-{env_name}.json"
+    else:
+        filename = f"{commit_hash[:8]}-{env_name}.json"
+    result_path = os.path.join(machine_dir, filename)
+    if os.path.exists(result_path):
+        with open(result_path) as f:
+            data = json.load(f)
+    else:
+        data = {
+            "commit_hash": commit_hash,
+            "env_name": env_name,
+            "date": int(time.time() * 1000),
+            "params": {**machine_info, "python": sys.executable},
+            "python": sys.executable,
+            "requirements": {},
+            "env_vars": {},
+            "result_columns": [
+                "result", "params", "version",
+                "started_at", "duration",
+                "stats_ci_99_a", "stats_ci_99_b",
+                "stats_q_25", "stats_q_75",
+                "stats_number", "stats_repeat",
+                "samples",
+            ],
+            "results": {},
+            "durations": {},
+            "version": 2,
+        }
+
+    # Merge new results
+    for bench_key, bench_data in all_results.items():
+        data["results"][bench_key] = bench_data
+
+    with open(result_path, "w") as f:
+        json.dump(data, f, indent=2)
+
+    print(f"\nResults saved to {result_path}")
+
+    # Update benchmarks.json index so ASV dashboard stays in sync
+    benchmarks_path = os.path.join(results_dir, "benchmarks.json")
+    if os.path.exists(benchmarks_path):
+        with open(benchmarks_path) as f:
+            benchmarks_data = json.load(f)
+    else:
+        benchmarks_data = {"version": 2}
+
+    benchmarks_data.update(bench_meta)
+
+    with open(benchmarks_path, "w") as f:
+        json.dump(benchmarks_data, f, indent=4)
+
+    print(f"Updated {benchmarks_path}")
+
+
+# ---------------------------------------------------------------------------
+# Benchmark runner
+# ---------------------------------------------------------------------------
+
+_ASV_META_DEFAULTS = {
+    "min_run_count": 2, "number": 0, "repeat": 0, "rounds": 2,
+    "sample_time": 0.01, "type": "time", "unit": "seconds", "warmup_time": -1,
+}
+
+
+def _make_scratch(mb):
+    """Allocate an uninitialized float32 CUDA buffer of ~*mb* MiB (at least 1 element).
+
+    Bench methods fill it before a timed sample to evict cached data; the 256 MB
+    default used by run_class is meant to cover MI300's Infinity Cache and L2.
+    """
+    import torch  # noqa: deferred import — only needed when cold-cache is on
+    n_floats = max(1, (mb * 1024 * 1024) // 4)  # float32 = 4 bytes
+    return torch.empty(n_floats, dtype=torch.float32, device="cuda")
+
+
+def _autotune_inner(instance, method_name, combo, target_s, max_inner=10000):
+    """Choose an inner-loop count so that one timed window lasts >= target_s.
+
+    The time_* method is probed twice with ``instance._inner`` forced to 1: the
+    first call is discarded (cold cache, algorithm selection) and the second
+    gives the seconds per invocation. ``instance._inner`` is restored afterwards.
+    Returns 1 when the probe yields None or a non-positive time.
+    """
+    probe = getattr(instance, method_name)
+    previous_inner = instance._inner
+    instance._inner = 1
+    try:
+        probe(*combo)                    # discarded warm-up call
+        per_call_s = probe(*combo)       # seconds per single invocation
+    finally:
+        instance._inner = previous_inner
+    if per_call_s is None or per_call_s <= 0:
+        return 1
+    return max(1, min(max_inner, int(target_s / per_call_s) + 1))
+
+
+def _free_gpu_cache():
+    """Best-effort release of torch's cached GPU memory between interleave chunks.
+
+    torch is looked up in ``sys.modules`` rather than imported, so this is a
+    no-op when nothing has imported torch; errors from ``empty_cache`` are ignored.
+    """
+    torch = sys.modules.get("torch")
+    if torch is None:
+        return
+    try:
+        torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+
+def run_class(
+    suite_name, cls, class_name, method_filter=None,
+    warmup=3, iters=7,
+    inner="auto", target_window_ms=1.0,
+    cold_cache=False, cache_flush_mb=256,
+    interleave_group=8, rng=None, shuffle=True,
+):
+    """Run all benchmarks in a class, returning (results, metadata) dicts.
+
+    Samples are collected in round-robin chunks of ``interleave_group``
+    ``(method, combo)`` benchmarks: one sample is taken from each benchmark in
+    the chunk per round, for ``iters`` rounds. This spreads every benchmark's
+    samples across the same wall-clock window so time-correlated GPU noise
+    (thermal ramp, DVFS throttle) becomes shared variance rather than a bias on
+    whichever benchmark happened to own a contiguous block of time. See
+    ``repro/transient_noise_sim.py``. ``interleave_group=1`` reproduces the
+    original contiguous (sequential) behavior; larger groups interleave more
+    benchmarks but keep that many GPU instances live at once.
+
+    When ``shuffle`` is true the per-round visit order is randomly permuted
+    (seeded by *rng*, a ``random.Random``; one is created with seed 0 if not
+    given). Fixed round-robin still pins each benchmark to a constant phase
+    within the round, so a monotonic ramp leaves a small constant per-benchmark
+    offset and each benchmark always sees the same predecessor's cache/clock
+    state. Permuting each round makes both uniform in expectation, turning that
+    residual bias into variance. The per-round structure is kept (each benchmark
+    still gets exactly ``iters`` evenly-spread samples) -- a balanced randomized
+    design, not a global shuffle that could re-cluster a benchmark's samples.
+    """
+    methods = sorted(m for m in dir(cls) if m.startswith("time_"))
+    if method_filter:
+        methods = [m for m in methods if method_filter in m]
+    if not methods:
+        return {}, {}
+
+    params = getattr(cls, "params", [[]])
+    param_names = getattr(cls, "param_names", [])
+    combos = list(itertools.product(*params))
+    asv_params = [[_format_param_value(v) for v in dim] for dim in params]
+
+    # Discover throughput columns from work_* companions
+    # Each entry: (dict_key, column_header, unit_divisor)
+    probe_keys = set()
+    for m in methods:
+        wfn = getattr(cls, "work_" + m[5:], None)
+        if wfn:
+            try:
+                probe_keys.update(wfn(cls(), *combos[0]))
+            except Exception:
+                pass
+    throughput_cols = []
+    if "flops" in probe_keys:
+        throughput_cols.append(("flops", "TFLOPS", 1e12))
+    if "bytes" in probe_keys:
+        throughput_cols.append(("bytes", "GB/s", 1e9))
+
+    # Print table header
+    target_window_s = target_window_ms / 1000.0
+    group = max(1, int(interleave_group))
+    if rng is None:
+        rng = random.Random(0)
+    inner_desc = (
+        "cold-cache (inner=1)" if cold_cache
+        else f"inner={inner}" if inner != "auto"
+        else f"inner=auto (>={target_window_ms:g}ms window)"
+    )
+    if group == 1:
+        sched_desc = "sequential"
+    else:
+        sched_desc = f"interleaved group={group}, " + ("shuffled" if shuffle else "fixed-order")
+    print(f"\n{class_name}  ({len(combos)} combos x {len(methods)} methods, "
+          f"{warmup} warmup, {iters} timed, {inner_desc}, {sched_desc})")
+    extra_hdr = "".join(f"  {label:>10}" for _, label, _ in throughput_cols)
+    HDR = (f"  {'median':>10}  {'mean':>10}  {'stdev':>10}"
+           f"  {'q25':>10}  {'q75':>10}  {'min':>10}  {'max':>10}"
+           + extra_hdr + f"  {'inner':>5}  {'method':<30}  params")
+    print("-" * len(HDR))
+    print(HDR)
+    print("-" * len(HDR))
+
+    all_results = {}
+    all_meta = {}
+
+    # Per-method result columns, indexed by combo position. Filling by index
+    # decouples the wire format from the order samples are actually collected in,
+    # so interleaved scheduling leaves the saved JSON identical to sequential.
+    n_combos = len(combos)
+    cols = {
+        m: {k: [None] * n_combos for k in
+            ("median", "ci_lo", "ci_hi", "q25", "q75", "number", "repeat", "samples")}
+        for m in methods
+    }
+    versions = {}
+    for method_name in methods:
+        bench_key = f"{suite_name}.{class_name}.{method_name}"
+        code, version = _get_benchmark_code_and_version(cls, method_name)
+        versions[method_name] = version
+        all_meta[bench_key] = {
+            **_ASV_META_DEFAULTS,
+            "code": code, "name": bench_key, "version": version,
+            "param_names": list(param_names), "params": asv_params,
+            "timeout": getattr(cls, "timeout", 300),
+        }
+
+    def _label(combo):
+        return ", ".join(f"{nm}={v}" for nm, v in zip(param_names, combo))
+
+    # Flatten to (method, combo) tasks, method-major so printed rows keep the
+    # same grouping as before, then sample them in round-robin chunks.
+    tasks = [(mi, ci) for mi in range(len(methods)) for ci in range(n_combos)]
+    started_at = int(time.time() * 1000)
+    t_start = time.perf_counter()
+
+    for chunk_start in range(0, len(tasks), group):
+        chunk = tasks[chunk_start:chunk_start + group]
+
+        # Setup phase: prepare every benchmark in the chunk (allocate tensors,
+        # pick _inner, warm up) and keep its instance live for round-robin timing.
+        live = []  # (instance, method_obj, method_name, combo, combo_idx)
+        for mi, ci in chunk:
+            method_name = methods[mi]
+            combo = combos[ci]
+            instance = cls()
+            try:
+                instance.setup(*combo)
+            except Exception as e:
+                print(f"  SKIP  {_label(combo)}  setup failed: {e}")
+                continue  # leaves None in this (method, combo) slot
+
+            # Inner-loop and cache configuration. Cold-cache mode forces
+            # inner=1 so only the first invocation in the window sees a
+            # cold cache; otherwise the 2nd..Nth invocations would refill
+            # it and we'd be back to a warm-cache measurement.
+            if cold_cache:
+                instance._scratch = _make_scratch(cache_flush_mb)
+                instance._inner = 1
+            elif inner == "auto":
+                instance._inner = _autotune_inner(
+                    instance, method_name, combo, target_window_s)
+            else:
+                instance._inner = max(1, int(inner))
+
+            method = getattr(instance, method_name)
+            for _ in range(warmup):
+                method(*combo)
+            live.append((instance, method, method_name, combo, ci))
+
+        # Timed phase: one sample from each live benchmark per round, so a
+        # transient spike lands on one sample of each rather than corrupting a
+        # whole benchmark's contiguous block. The visit order is re-permuted
+        # each round (when shuffle is on) so no benchmark is pinned to a fixed
+        # phase / predecessor; chunk_samples stays keyed by the stable index i.
+        chunk_samples = [[] for _ in live]
+        order = list(range(len(live)))
+        for _ in range(iters):
+            if shuffle and len(order) > 1:
+                rng.shuffle(order)
+            for i in order:
+                instance, method, method_name, combo, ci = live[i]
+                t0 = time.perf_counter()
+                result = method(*combo)
+                wall = time.perf_counter() - t0
+                chunk_samples[i].append(wall if result is None else result)
+
+        # Finalize phase: stats, throughput, print, store into the combo slot.
+        for i, (instance, method, method_name, combo, ci) in enumerate(live):
+            samples = chunk_samples[i]
+            median, mean, stdev, ci_lo, ci_hi, q25, q75 = _compute_stats(samples)
+            s_min, s_max = min(samples), max(samples)
+
+            c = cols[method_name]
+            c["median"][ci] = median
+            c["ci_lo"][ci] = ci_lo
+            c["ci_hi"][ci] = ci_hi
+            c["q25"][ci] = q25
+            c["q75"][ci] = q75
+            c["number"][ci] = instance._inner
+            c["repeat"][ci] = iters
+            # Keep the raw samples (seconds) for statistical comparison
+            # (compare_results.py). Rounded to 1 ns to keep the JSON compact
+            # without losing meaningful timing resolution.
+            c["samples"][ci] = [round(x, 9) for x in samples]
+
+            # Derive throughput from work_* companion
+            work = {}
+            wfn = getattr(instance, "work_" + method_name[5:], None)
+            if wfn and median > 0:
+                try:
+                    work = wfn(*combo)
+                except Exception:
+                    pass
+            extra_cols = ""
+            for key, _, divisor in throughput_cols:
+                if key in work and median > 0:
+                    extra_cols += f"  {work[key] / median / divisor:>10.1f}"
+                else:
+                    extra_cols += f"  {'':>10}"
+
+            print(f"  {median*1000:>8.3f}ms  {mean*1000:>8.3f}ms  "
+                  f"{stdev*1000:>8.3f}ms  {q25*1000:>8.3f}ms  {q75*1000:>8.3f}ms  "
+                  f"{s_min*1000:>8.3f}ms  {s_max*1000:>8.3f}ms"
+                  f"{extra_cols}  "
+                  f"{instance._inner:>5}  {method_name:<30}  {_label(combo)}")
+
+        # Release this chunk's GPU instances before setting up the next chunk.
+        live.clear()
+        _free_gpu_cache()
+
+    duration = time.perf_counter() - t_start
+    for method_name in methods:
+        bench_key = f"{suite_name}.{class_name}.{method_name}"
+        c = cols[method_name]
+        all_results[bench_key] = [
+            c["median"], asv_params, versions[method_name], started_at,
+            round(duration, 2),
+            c["ci_lo"], c["ci_hi"], c["q25"], c["q75"], c["number"], c["repeat"],
+            c["samples"],
+        ]
+
+    return all_results, all_meta
+
+
+def run_as_main(caller_file=None):
+    """Run benchmarks from a bench file or from the command line.
+
+    When called with a file path (from a bench file's ``__main__`` block),
+    the suite is derived from the filename.  When called without arguments
+    (i.e. ``python driver.py bench_gemm``), the suite is taken from argv.
+
+    Usage from a bench file::
+
+        if __name__ == "__main__":
+            from driver import run_as_main
+            run_as_main(__file__)
+    """
+    parser = argparse.ArgumentParser(
+        description="Run ASV benchmarks directly in-process (no subprocess overhead).")
+    if caller_file is None:
+        parser.add_argument("suite", nargs="?", default=None,
+                            help="Benchmark module name (e.g. bench_casting)")
+        parser.add_argument("--all", action="store_true",
+                            help="Run all bench_*.py suites in the directory")
+    parser.add_argument("method_filter", nargs="?", default=None,
+                        help="Only run time_* methods containing this string")
+    parser.add_argument("-w", "--warmup", type=int, default=10,
+                        help="Number of warmup iterations (default: 10)")
+    parser.add_argument("-n", "--iters", type=int, default=20,
+                        help="Number of timed iterations (default: 20)")
+    parser.add_argument("--inner", default="auto",
+                        help="Inner kernel invocations per timed window: "
+                             "'auto' (tune to --target-window-ms) or an integer "
+                             "(default: auto). Larger values amortize CUDA event "
+                             "and kernel-launch overhead.")
+    parser.add_argument("--target-window-ms", type=float, default=1.0,
+                        help="Target duration of one timed window when "
+                             "--inner=auto (default: 1.0 ms).")
+    parser.add_argument("--cold-cache", action="store_true",
+                        help="Flush the GPU cache (write a >LLC scratch buffer) "
+                             "before each sample. Forces --inner=1 because "
+                             "subsequent inner calls would refill the cache.")
+    parser.add_argument("--cache-flush-mb", type=int, default=256,
+                        help="Size in MB of the cache-flush buffer for "
+                             "--cold-cache (default: 256, sized for the MI300 "
+                             "Infinity Cache).")
+    parser.add_argument("--interleave-group", type=int, default=8,
+                        help="Number of (method, combo) benchmarks sampled "
+                             "round-robin together so time-correlated GPU noise "
+                             "(thermal ramp / DVFS throttle) is shared across "
+                             "them instead of biasing whichever benchmark owns a "
+                             "contiguous block of wall-clock time (default: 8). "
+                             "Each benchmark in a group keeps a live GPU "
+                             "instance, so lower this on out-of-memory. 1 = "
+                             "sequential. See repro/transient_noise_sim.py.")
+    parser.add_argument("--sequential", action="store_true",
+                        help="Collect each benchmark's samples in one contiguous "
+                             "block (equivalent to --interleave-group 1). Lowest "
+                             "memory, but biased under thermal drift.")
+    parser.add_argument("--seed", type=int, default=0,
+                        help="Seed for the per-round shuffle of the interleave "
+                             "order (default: 0), kept fixed so runs are "
+                             "reproducible.")
+    parser.add_argument("--no-shuffle", action="store_true",
+                        help="Disable the per-round random permutation and use a "
+                             "fixed round-robin order. Each benchmark then keeps "
+                             "a constant within-round phase and predecessor, "
+                             "leaving a small residual ordering bias.")
+    parser.add_argument("--no-save", action="store_true",
+                        help="Skip saving results to ASV format")
+    parser.add_argument("--label", default=None,
+                        help="Tag folded into the result filename "
+                             "(<hash>-<label>-<env>.json). Use it to keep "
+                             "multiple runs on the same commit (e.g. a dirty "
+                             "working tree) in distinct files for comparison.")
+    args = parser.parse_args()
+    if args.inner != "auto":
+        try:
+            args.inner = max(1, int(args.inner))
+        except ValueError:
+            parser.error("--inner must be 'auto' or a positive integer")
+    if args.sequential:
+        args.interleave_group = 1
+    args.interleave_group = max(1, args.interleave_group)
+
+    if caller_file is not None:
+        script_dir = os.path.dirname(os.path.abspath(caller_file))
+        suite_names = [os.path.splitext(os.path.basename(caller_file))[0]]
+    else:
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        run_all = getattr(args, "all", False)
+        if run_all:
+            suite_names = sorted(
+                os.path.splitext(os.path.basename(f))[0]
+                for f in glob.glob(os.path.join(script_dir, "bench_*.py"))
+            )
+        elif args.suite:
+            suite_names = [args.suite]
+        else:
+            parser.error("provide a suite name or use --all")
+
+    os.chdir(script_dir)
+    if script_dir not in sys.path:
+        sys.path.insert(0, script_dir)
+
+    # One RNG for the whole run so the interleave order is reproducible given
+    # --seed. Shared across classes so the stream is deterministic end-to-end.
+    rng = random.Random(args.seed)
+    shuffle = not args.no_shuffle
+    if args.interleave_group > 1 and shuffle:
+        print(f"Interleave: group={args.interleave_group}, shuffled (seed={args.seed})")
+
+    all_results = {}
+    all_meta = {}
+    for suite_name in suite_names:
+        mod = importlib.import_module(suite_name)
+        for name in sorted(dir(mod)):
+            obj = getattr(mod, name)
+            if isinstance(obj, type) and name.startswith("Bench"):
+                results, meta = run_class(
+                    suite_name, obj, name, args.method_filter,
+                    warmup=args.warmup, iters=args.iters,
+                    inner=args.inner, target_window_ms=args.target_window_ms,
+                    cold_cache=args.cold_cache,
+                    cache_flush_mb=args.cache_flush_mb,
+                    interleave_group=args.interleave_group,
+                    rng=rng, shuffle=shuffle,
+                )
+                all_results.update(results)
+                all_meta.update(meta)
+
+    if all_results and not args.no_save:
+        save_asv_results(all_results, all_meta, label=args.label)
+
+
+if __name__ == "__main__":
+    run_as_main()
diff --git a/benchmarks/asv/parser_TEasv.py b/benchmarks/asv/parser_TEasv.py
new file mode 100644
index 000000000..47bbc6799
--- /dev/null
+++ b/benchmarks/asv/parser_TEasv.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""benchstats parser for ASV-format result JSON files written by ``driver.py``.
+
+Reads one ASV result file (``<results_dir>/<machine>/<hash>-<env>.json``) and
+turns it into the ``{benchmark_name: {metric: ndarray}}`` structure consumed by
+``benchstats.compare.compareStats``.
+
+An ASV result file stores, per benchmark, a row whose columns are named by the
+file's ``result_columns`` list. The driver records raw per-call timing samples
+in the ``samples`` column (a list of sample-lists, one per parameter
+combination, in ``itertools.product`` order over ``params``). This parser flattens
+that into one benchstats "benchmark" per (benchmark, parameter-combination):
+
+- the benchmark name is ``<suite>.<Class>.<time_method> | name=val, ...`` where
+  the parameter names come from ``benchmarks.json`` (falling back to positional
+  ``p0, p1, ...`` when the index is unavailable).
+- a single metric ``time_s`` (seconds, lower is better) holds the raw samples.
+  Samples are already stored in seconds; benchstats' renderer auto-scales them
+  (to ms/us/ns) assuming a seconds base unit.
+
+Throughput is intentionally not exposed as a separate metric: the ASV result
+file carries no per-sample work, and because work is constant per parameter
+combination a rank-based test on throughput is identical to the test on time.
+The driver already prints throughput columns during a run.
+
+The class name matches the file name (``parser_TEasv``) so it can also be loaded
+by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``.
+"""
+
+import itertools
+import json
+import os
+import re
+
+import numpy as np
+
+from benchstats.common import ParserBase, LoggingConsole
+
+_TIME_KEY = "time_s"           # metric key exposed to benchstats (seconds)
+_NAME_DELIM = " | "
+
+
+class parser_TEasv(ParserBase):
+    def __init__(self, json_file_path, filter, metrics=None, debug_log=True) -> None:
+        assert isinstance(json_file_path, str)
+        assert filter is None or isinstance(filter, (str, re.Pattern))
+        assert metrics is None or (
+            isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics)
+        )
+
+        if debug_log is None or (isinstance(debug_log, bool) and not debug_log):
+            self.debug_log = False
+        elif isinstance(debug_log, bool) and debug_log:
+            self.debug_log = True
+            self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug)
+        else:
+            self.debug_log = True
+            self.logger = debug_log
+
+        self.file = json_file_path
+        self.filter = (
+            filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
+        )
+        self._requested_metrics = list(metrics) if metrics is not None else None
+        self._stats = self._build()
+
+    def getStats(self) -> dict[str, dict[str, np.ndarray]]:
+        return self._stats
+
+    def _log(self, level: str, msg: str) -> None:
+        if self.debug_log:
+            getattr(self.logger, level)(f"parser_TEasv: {msg}")
+
+    def _load_param_names(self) -> dict:
+        """Map ``bench_key -> [param_names]`` from the sibling ``benchmarks.json``.
+
+        Layout: ``<results_dir>/<machine>/<file>.json`` and
+        ``<results_dir>/benchmarks.json``. The names are only used for readable
+        labels, so a missing/unreadable index degrades gracefully to ``{}``.
+        """
+        results_dir = os.path.dirname(os.path.dirname(os.path.abspath(self.file)))
+        index_path = os.path.join(results_dir, "benchmarks.json")
+        try:
+            with open(index_path, encoding="utf-8") as f:
+                index = json.load(f)
+        except (OSError, ValueError):
+            self._log("warning", f"could not read '{index_path}'; using positional param names.")
+            return {}
+        return {
+            key: meta["param_names"]
+            for key, meta in index.items()
+            if isinstance(meta, dict) and "param_names" in meta
+        }
+
+    def _build(self) -> dict[str, dict[str, np.ndarray]]:
+        with open(self.file, encoding="utf-8") as f:
+            data = json.load(f)
+
+        columns = data.get("result_columns")
+        results = data.get("results", {})
+        if not isinstance(columns, list) or "samples" not in columns:
+            raise ValueError(
+                f"'{self.file}' has no 'samples' column. Re-run the benchmarks with a "
+                "driver.py that records raw samples."
+            )
+        i_params = columns.index("params")
+        i_samples = columns.index("samples")
+
+        names_map = self._load_param_names()
+        want_time = self._metric_requested(_TIME_KEY)
+
+        stats = {}
+        for bench_key, row in results.items():
+            if not row or len(row) <= i_samples:
+                continue
+            params = row[i_params] or []
+            sample_lists = row[i_samples]
+            if sample_lists is None:
+                self._log("warning", f"benchmark '{bench_key}' has no samples; skipping.")
+                continue
+
+            combos = list(itertools.product(*params)) if params else [()]
+            param_names = names_map.get(bench_key)
+
+            for combo, samples in itertools.zip_longest(combos, sample_lists):
+                if samples is None:
+                    continue
+                time_s = np.asarray(samples, dtype=np.float64)
+                time_s = time_s[np.isfinite(time_s)]
+                if time_s.size == 0:
+                    continue
+
+                label = self._format_combo(param_names, combo)
+                bm_name = bench_key + (_NAME_DELIM + label if label else "")
+                if self.filter is not None and self.filter.search(bm_name) is None:
+                    continue
+
+                if self.debug_log and time_s.size < 10:
+                    self._log(
+                        "warning",
+                        f"benchmark '{bm_name}' has only {time_s.size} samples "
+                        "(>= 10 recommended); re-run with a larger -n/--iters.",
+                    )
+                if want_time:
+                    stats[bm_name] = {_TIME_KEY: time_s}
+
+        if not stats:
+            self._log("warning", f"no benchmarks read from '{self.file}'.")
+        return stats
+
+    @staticmethod
+    def _format_combo(param_names, combo) -> str:
+        """Build a readable ``name=val, ...`` label for one parameter combination."""
+        if combo is None:
+            return ""
+        values = [str(v) for v in combo]
+        if param_names and len(param_names) == len(values):
+            return ", ".join(f"{n}={v}" for n, v in zip(param_names, values))
+        return ", ".join(values)
+
+    def _metric_requested(self, key: str) -> bool:
+        """Honor an explicit metrics= request (benchstats CLI), else expose everything."""
+        if self._requested_metrics is None:
+            return True
+        if key == _TIME_KEY:
+            return any(t in self._requested_metrics for t in (_TIME_KEY, "time_ms", "time"))
+        return key in self._requested_metrics
diff --git a/benchmarks/asv/requirements.txt b/benchmarks/asv/requirements.txt
new file mode 100644
index 000000000..7b70ea9f7
--- /dev/null
+++ b/benchmarks/asv/requirements.txt
@@ -0,0 +1,3 @@
+# Extra dependencies for statistical benchmark comparison
+# (compare_results.py). benchstats pulls in rich, scipy and numpy.
+benchstats>=3.4
diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh
new file mode 100755
index 000000000..07d1046df
--- /dev/null
+++ b/benchmarks/asv/run_benchmarks.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Helper script for common benchmark tasks (run / view / list / compare).
+set -euo pipefail
+
+cd "$(git rev-parse --show-toplevel)"
+
+BENCH_DIR="benchmarks/asv"
+ASV_CONF="$(pwd)/$BENCH_DIR/asv.conf.json"
+
+usage() {
+    cat <<EOF
+Usage: bash benchmarks/asv/run_benchmarks.sh <command> [options]
+
+Commands:
+  run [-w W] [-n N] [SUITE] [METHOD]
+                        Run benchmarks in-process (saves ASV-compatible results)
+  view                  Build the ASV HTML dashboard from saved results and serve it
+  list                  List available benchmark suites
+  compare BASE CAND [OPTS]
+                        Statistically compare two ASV result JSONs (benchstats);
+                        exits 1 on a significant timing regression (CI gating)
+
+EOF
+}
+
+case "${1:-}" in
+    run)
+        shift
+        if [[ $# -eq 0 ]]; then
+            python "$BENCH_DIR/driver.py" --all
+        else
+            python "$BENCH_DIR/driver.py" "$@"
+        fi
+        ;;
+    view)
+        asv publish --config "$ASV_CONF"
+        echo "Starting preview server at http://localhost:8080"
+        asv preview --config "$ASV_CONF"
+        ;;
+    list)
+        echo "Available benchmark suites:"
+        for f in "$BENCH_DIR"/bench_*.py; do if [[ -e "$f" ]]; then f="${f##*/}"; echo "  ${f%.py}"; fi; done
+        ;;
+    compare)
+        shift
+        python "$BENCH_DIR/compare_results.py" "$@"
+        ;;
+    *)
+        usage
+        exit 1
+        ;;
+esac

From 4f6bd45275fffcc86ce6fe21afa5686d3f1903de Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Thu, 18 Jun 2026 19:46:14 +0000
Subject: [PATCH 2/4] Moved benchmark files, streamlined

---
 .gitignore                                    |   1 +
 benchmarks/asv/README.md                      | 265 --------
 benchmarks/asv/__init__.py                    |   0
 benchmarks/asv/asv.conf.json                  |  16 -
 benchmarks/asv/bench_attention.py             | 102 ---
 benchmarks/asv/bench_casting.py               | 100 ---
 benchmarks/asv/bench_gemm.py                  |  99 ---
 benchmarks/asv/bench_gemm_fp8.py              | 104 ---
 benchmarks/asv/bench_grouped_gemm.py          |  94 ---
 benchmarks/asv/bench_normalization.py         |  83 ---
 benchmarks/asv/compare_results.py             | 143 ----
 benchmarks/asv/driver.py                      | 613 ------------------
 benchmarks/asv/parser_TEasv.py                | 172 -----
 benchmarks/asv/requirements.txt               |   3 -
 benchmarks/asv/run_benchmarks.sh              |  52 --
 benchmarks/microbenchmarks/asv/README.md      | 152 +++++
 .../microbenchmarks/asv/bench_attention.py    |  59 ++
 .../microbenchmarks/asv/bench_casting.py      |  59 ++
 benchmarks/microbenchmarks/asv/bench_gemm.py  |  52 ++
 .../microbenchmarks/asv/bench_gemm_fp8.py     |  59 ++
 .../microbenchmarks/asv/bench_grouped_gemm.py |  55 ++
 .../asv/bench_normalization.py                |  52 ++
 .../microbenchmarks/asv/compare_results.py    | 144 ++++
 benchmarks/microbenchmarks/asv/driver.py      | 469 ++++++++++++++
 benchmarks/microbenchmarks/asv/models.py      |  89 +++
 .../microbenchmarks/asv/requirements.txt      |   4 +
 26 files changed, 1195 insertions(+), 1846 deletions(-)
 delete mode 100644 benchmarks/asv/README.md
 delete mode 100644 benchmarks/asv/__init__.py
 delete mode 100644 benchmarks/asv/asv.conf.json
 delete mode 100644 benchmarks/asv/bench_attention.py
 delete mode 100644 benchmarks/asv/bench_casting.py
 delete mode 100644 benchmarks/asv/bench_gemm.py
 delete mode 100644 benchmarks/asv/bench_gemm_fp8.py
 delete mode 100644 benchmarks/asv/bench_grouped_gemm.py
 delete mode 100644 benchmarks/asv/bench_normalization.py
 delete mode 100644 benchmarks/asv/compare_results.py
 delete mode 100644 benchmarks/asv/driver.py
 delete mode 100644 benchmarks/asv/parser_TEasv.py
 delete mode 100644 benchmarks/asv/requirements.txt
 delete mode 100755 benchmarks/asv/run_benchmarks.sh
 create mode 100644 benchmarks/microbenchmarks/asv/README.md
 create mode 100644 benchmarks/microbenchmarks/asv/bench_attention.py
 create mode 100644 benchmarks/microbenchmarks/asv/bench_casting.py
 create mode 100644 benchmarks/microbenchmarks/asv/bench_gemm.py
 create mode 100644 benchmarks/microbenchmarks/asv/bench_gemm_fp8.py
 create mode 100644 benchmarks/microbenchmarks/asv/bench_grouped_gemm.py
 create mode 100644 benchmarks/microbenchmarks/asv/bench_normalization.py
 create mode 100755 benchmarks/microbenchmarks/asv/compare_results.py
 create mode 100644 benchmarks/microbenchmarks/asv/driver.py
 create mode 100644 benchmarks/microbenchmarks/asv/models.py
 create mode 100644 benchmarks/microbenchmarks/asv/requirements.txt

diff --git a/.gitignore b/.gitignore
index fca0d9389..6aae86cad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .venv
+benchmarks/microbenchmarks/asv/results/
 *.o
 *.swp
 *.ii
diff --git a/benchmarks/asv/README.md b/benchmarks/asv/README.md
deleted file mode 100644
index eee8900e8..000000000
--- a/benchmarks/asv/README.md
+++ /dev/null
@@ -1,265 +0,0 @@
-# Benchmarks for TransformerEngine
-
-GPU microbenchmarks driven by `driver.py`. Results are written in
-[ASV (Air Speed Velocity)](https://asv.readthedocs.io/) JSON format so they
-can be browsed with `asv publish` / `asv preview`, but the `asv` CLI is **not**
-used to run benchmarks — `driver.py` runs everything in-process.
-
-## Prerequisites
-
-- TransformerEngine must already be built and installed in the current Python environment.
-- A ROCm or CUDA GPU must be available.
-- `asv` is only required if you want the HTML dashboard (`pip install asv`).
-
-## Running benchmarks
-
-Each `bench_*.py` file is directly executable, or you can drive them through
-`driver.py`. Results are saved to `benchmarks/.asv/results/` in ASV-compatible
-format by default.
-
-```bash
-cd benchmarks/asv
-python driver.py --all                      # run every suite
-python driver.py bench_gemm                 # run one suite via driver
-python bench_gemm.py                        # run one suite directly
-python bench_gemm.py time_forward           # filter to a specific method
-python bench_gemm.py -w 5 -n 20             # custom warmup/iteration counts
-python bench_casting.py --no-save           # skip saving results
-python bench_casting.py --cold-cache        # flush cache before each sample
-python bench_gemm.py --inner 50             # fix inner-loop count to 50
-python bench_gemm.py --target-window-ms 5   # tune inner so each window >=5 ms
-```
-
-### Timing model: inner loop and cache state
-
-Each `time_*` method runs the kernel `_inner` times inside a single CUDA event
-window and divides by `_inner`, so kernel-launch and CUDA-event jitter
-(`~0.5 µs` resolution on AMD) are amortized. By default the driver
-**auto-tunes** `_inner` per (combo, method) so each window lasts at least
-`--target-window-ms` (default `1.0 ms`):
-
-| Flag | Effect |
-|---|---|
-| `--inner auto` (default) | Probe a single invocation, then pick `_inner` so the next timed window lasts ≥ `--target-window-ms`. Capped at 10000. |
-| `--inner N` | Force a fixed `_inner = N` (overrides auto-tune). |
-| `--target-window-ms T` | Target window duration for `--inner auto` (default `1.0`). |
-| `--cold-cache` | Write a `--cache-flush-mb` byte scratch buffer before each sample to evict L2 + Infinity Cache. Implies `--inner=1` (otherwise iterations 2..N would refill the cache and the measurement degenerates back to warm-cache). |
-| `--cache-flush-mb M` | Scratch buffer size for `--cold-cache` (default `256`, sized for the MI300 Infinity Cache). |
-
-Choose the regime that matches the question you're asking:
-- **Warm cache, large `_inner`** (default): steady-state kernel throughput,
-  matches what a hot inner loop in a model sees. Lowest variance.
-- **Cold cache, `_inner=1`**: realistic cost of the kernel as an isolated
-  call into cold memory — closer to what `rocprofv3 --hip-trace` reports
-  on a freshly launched kernel. Higher variance; bandwidth-bound
-  benchmarks (cast, normalization) typically run 1.5–3× slower than warm.
-
-Caveat: the inner loop runs in Python, so each iteration carries
-~80–200 ns of interpreter overhead. For sub-microsecond kernels this is
-not removable without CUDA graph capture; pick `--inner` deliberately
-in that regime or use the cold-cache mode.
-
-### Sample scheduling: interleaving
-
-By default the driver does **not** collect a benchmark's samples in one
-contiguous block. It samples in round-robin chunks: it sets up a group of
-`(method, combo)` benchmarks, then takes one sample from each per round, for
-`-n` rounds. This is on by default because *sequential* scheduling (all of A,
-then all of B) makes wall-clock time a proxy for benchmark identity — so any
-time-correlated GPU noise (thermal warm-up ramp, DVFS throttle, a neighbor
-container on a shared GPU) becomes a systematic **bias** between benchmarks
-rather than noise. The Monte-Carlo study in `repro/transient_noise_sim.py`
-quantifies it: under a 5% thermal ramp a sequential Brunner-Munzel comparison
-fires a false positive 86% of the time (α=0.05), and a 20% ramp can flip a real
-5% speedup into a reported regression. Round-robin sampling spreads every
-benchmark across the same window, so a transient lands on one sample of each
-instead of corrupting one benchmark's whole block.
-
-The per-round visit order is also **randomly permuted** each round (a balanced
-randomized design, not a global shuffle). Fixed round-robin would still pin each
-benchmark to a constant phase within the round — so a monotonic ramp leaves a
-small constant per-benchmark offset, and each benchmark always sees the same
-predecessor's cache/clock state. Re-permuting each round makes both uniform in
-expectation, turning that residual bias into variance. The shuffle is seeded
-(`--seed`, default `0`) so runs stay reproducible.
-
-| Flag | Effect |
-|---|---|
-| `--interleave-group N` (default `8`) | Number of benchmarks sampled round-robin together. Each keeps a live GPU instance for the duration of the chunk, so **lower this if a group runs out of memory**; raise it to share the time window across more benchmarks. |
-| `--sequential` | Collect each benchmark's samples contiguously (≡ `--interleave-group 1`). Lowest memory, but biased under thermal drift — use only for quick local runs. |
-| `--seed S` (default `0`) | Seed for the per-round shuffle, fixed so runs are reproducible. |
-| `--no-shuffle` | Use a fixed round-robin order instead of permuting each round. Leaves a small residual ordering/predecessor bias; mainly for debugging. |
-
-Caveat: interleaving removes *within-run* time-position bias. It does **not**
-remove a whole-run thermal offset between two **separately produced** result
-files (e.g. a cold baseline run vs. a warm candidate run). For the statistical
-comparison below, produce the baseline and candidate result files back-to-back
-under similar conditions.
-
-### Helper script
-
-`run_benchmarks.sh` wraps common tasks and can be run from anywhere.
-
-```bash
-bash benchmarks/asv/run_benchmarks.sh <command> [options]
-```
-
-| Command | Description |
-|---|---|
-| `run [suite] [method]` | Run benchmarks in-process (saves ASV-compatible results) |
-| `view` | Build the ASV HTML dashboard from saved results and serve it on `localhost:8080` |
-| `list` | List available benchmark suites |
-| `compare BASE CAND` | Statistically compare two result JSONs (exits 1 on a significant regression) |
-
-## How results are stored
-
-ASV-format JSON files under `benchmarks/.asv/results/`:
-
-```
-benchmarks/.asv/results/
-  my-machine-name/
-    machine.json           # Hardware/OS metadata (auto-generated by driver)
-    <commit-hash>.json     # Timing results for that commit
-    <commit-hash>.json
-    ...
-```
-
-Each commit JSON contains the wall-clock timings for every benchmark + parameter combination
-run on that machine, including the raw per-call samples (the ASV `samples`
-column) used by `compare_results.py`. The `benchmarks/.asv/` directory is in
-`.gitignore`.
-
-## Viewing results
-
-To browse historical results in a dashboard, point `asv` at the saved JSON:
-
-```bash
-bash benchmarks/asv/run_benchmarks.sh view
-# or, manually:
-asv publish --config benchmarks/asv/asv.conf.json
-asv preview --config benchmarks/asv/asv.conf.json
-```
-
-`asv.conf.json` exists only to support `publish` / `preview`; benchmarks
-themselves are not invoked through `asv`.
-
-## Comparing two checkouts statistically
-
-The dashboard plots point estimates (medians), which cannot tell a real
-regression from measurement noise. To test whether timing differences between
-two checkouts are statistically significant, the driver records the raw per-call
-samples in each result file (the ASV `samples` column), and `compare_results.py`
-compares them with a Brunner-Munzel test via the
-[benchstats](https://github.com/Arech/benchstats) package:
-
-```bash
-pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
-
-cd benchmarks/asv
-
-# baseline checkout — saves <baseline-hash>-<env>.json
-python driver.py --all -n 20
-# candidate checkout — saves <candidate-hash>-<env>.json
-python driver.py --all -n 20
-
-python compare_results.py \
-    ../.asv/results/<machine>/<baseline-hash>-<env>.json \
-    ../.asv/results/<machine>/<candidate-hash>-<env>.json
-```
-
-It prints a table marking each `(benchmark, parameter combination)` as faster
-(`<`), slower (`>`), or not significantly different (`~`), and exits `1` when a
-significant difference is found, so it can gate CI.
-
-By default the result filename is derived from the commit hash, so two runs on
-the **same** commit (e.g. prototyping against a dirty working tree, where `HEAD`
-is unchanged) would overwrite each other. Pass `--label` to fold a tag into the
-filename and keep them distinct:
-
-```bash
-python driver.py --all -n 20 --label base   # -> <hash>-base-<env>.json
-# ... edit code (HEAD stays the same) ...
-python driver.py --all -n 20 --label cand   # -> <hash>-cand-<env>.json
-
-python compare_results.py \
-    ../.asv/results/<machine>/<hash>-base-<env>.json \
-    ../.asv/results/<machine>/<hash>-cand-<env>.json
-```
-
-| Flag | Effect |
-|---|---|
-| `--alpha A` | Significance level for the test (default `0.001`). |
-| `--method M` | Statistical test to use (default `brunnermunzel`). |
-| `--filter REGEX` | Only compare benchmarks whose name matches `REGEX`. |
-| `--always-show-pvalues` | Show p-values for non-significant rows too. |
-| `--export-to FILE` | Save the report to a `.txt`/`.svg`/`.html` file. |
-
-The test is rank-based and needs a reasonable number of samples per benchmark
-(≥ ~10 recommended); the default `-n 20` timed iterations satisfies this. Only
-timing is tested — throughput (`TFLOPS`/`GB/s`) is a constant-work transform of
-time, so a rank test on it is identical; the driver already prints throughput
-columns during a run.
-
-## Writing new benchmarks
-
-Create a new file in `benchmarks/asv/` following the naming convention `bench_<name>.py`.
-
-```python
-#!/usr/bin/env python3
-import torch
-import transformer_engine.pytorch as te
-
-class BenchSomething:
-    params = [[1024, 4096], ["config_a", "config_b"]]
-    param_names = ["M", "config"]
-    timeout = 300  # seconds, per parameter combination
-
-    # Driver overrides per (combo, method): _inner controls how many kernel
-    # invocations land in one CUDA event window; _scratch (when not None) is
-    # written to before each sample to evict the GPU cache.
-    _inner = 1
-    _scratch = None
-
-    def setup(self, M, config):
-        # Allocate tensors, create modules.
-        # This runs once per (combo, method); the same instance is reused for
-        # warmup and timed iterations.
-        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
-        ...
-
-    def time_forward(self, M, config):
-        # Use CUDA events for accurate GPU timing.
-        # Return elapsed seconds per single invocation — the driver uses this
-        # instead of wall time. Looping inside the event window amortizes
-        # CUDA event resolution and kernel-launch overhead.
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)        # cold-cache mode
-        self._evt[0].record()
-        for _ in range(self._inner):
-            self.module(self.x)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-    # Optional: define work_<name> to get throughput columns (TFLOPS / GB/s).
-    def work_forward(self, M, config):
-        return {"flops": 2 * M * self.N * self.K}   # compute-bound
-        # return {"bytes": M * self.hidden * 4}      # memory-bound
-
-if __name__ == "__main__":
-    from driver import run_as_main
-    run_as_main(__file__)
-```
-
-Key rules:
-- Method names starting with `time_` are automatically timed.
-- Use CUDA events and return elapsed seconds **per single invocation** —
-  divide the event delta by `self._inner` so the driver and the throughput
-  columns get per-call values regardless of inner-loop count.
-- Honor `self._inner` (loop the kernel) and `self._scratch` (write before
-  recording the start event) so the driver's `--inner` and `--cold-cache`
-  flags work for your benchmark.
-- Optionally define `work_<name>` companions to get TFLOPS or GB/s columns.
-  These return the per-call work, not per-window work.
-- Clear `.grad` attributes in backward benchmarks to prevent memory accumulation.
-- The `params` list defines a cross-product; keep the matrix size reasonable.
diff --git a/benchmarks/asv/__init__.py b/benchmarks/asv/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmarks/asv/asv.conf.json b/benchmarks/asv/asv.conf.json
deleted file mode 100644
index 3c1616aac..000000000
--- a/benchmarks/asv/asv.conf.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "version": 1,
-    "project": "TransformerEngine",
-    "project_url": "https://github.com/ROCm/TransformerEngine",
-    "repo": "../..",
-    "branches": ["HEAD"],
-    "environment_type": "existing",
-    "install_command": [],
-    "build_command": [],
-    "benchmark_dir": ".",
-    "results_dir": "../.asv/results",
-    "html_dir": "../.asv/html",
-    "install_timeout": 600,
-    "benchmark_timeout": 1200,
-    "launch_method": "spawn"
-}
diff --git a/benchmarks/asv/bench_attention.py b/benchmarks/asv/bench_attention.py
deleted file mode 100644
index df6314c43..000000000
--- a/benchmarks/asv/bench_attention.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""
-Attention micro-benchmark using te.DotProductAttention.
-
-Benchmarks fused multi-head attention (with flash attention backend) for
-model configurations with grouped-query attention (GQA).
-
-Models:
-  - Llama 3   8B (TP=1, TP=8), 70B (TP=8), 405B (TP=8)
-  - Qwen 2.5  7B (TP=1), 72B (TP=8)
-
-Forward FLOPs = 4 * batch * num_q_heads * seq_len^2 * head_dim
-  (two matmuls: Q@K^T and attn@V, each contributing 2*b*h*s^2*d)
-Backward FLOPs = 2 * Forward FLOPs (approximately)
-
-Sources for model configs:
-  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
-"""
-
-import torch
-import transformer_engine.pytorch as te
-
-BATCH = 2
-
-# (num_q_heads, num_kv_heads, head_dim, tp)
-MODELS = {
-    "Llama3-8B_TP1":   (32, 8, 128, 1),
-    "Llama3-8B_TP8":   (32, 8, 128, 8),
-    "Llama3-70B_TP8":  (64, 8, 128, 8),
-    "Llama3-405B_TP8": (128, 8, 128, 8),
-    "Qwen2.5-7B_TP1":  (28, 4, 128, 1),
-    "Qwen2.5-72B_TP8": (64, 8, 128, 8),
-}
-
-
-class BenchAttention:
-    params = [[1024, 2048, 4096, 8192], list(MODELS)]
-    param_names = ["seq_len", "model"]
-    timeout = 300
-    _inner = 1
-    _scratch = None
-
-    def setup(self, seq_len, model):
-        n_q, n_kv, hd, tp = MODELS[model]
-        qh, kvh = n_q // tp, n_kv // tp
-        dtype = torch.bfloat16
-
-        self.attn = te.DotProductAttention(
-            num_attention_heads=qh, kv_channels=hd,
-            num_gqa_groups=kvh, attn_mask_type="causal",
-        ).to(device="cuda", dtype=dtype)
-
-        self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True)
-        self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
-        self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
-        self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v))
-        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
-
-    def work_forward(self, seq_len, model):
-        n_q, n_kv, hd, tp = MODELS[model]
-        qh = n_q // tp
-        return {"flops": 4 * BATCH * qh * seq_len * seq_len * hd}
-
-    def work_forward_backward(self, seq_len, model):
-        n_q, n_kv, hd, tp = MODELS[model]
-        qh = n_q // tp
-        return {"flops": 3 * 4 * BATCH * qh * seq_len * seq_len * hd}
-
-    def time_forward(self, seq_len, model):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            self.attn(self.q, self.k, self.v)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-    def time_forward_backward(self, seq_len, model):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            out = self.attn(self.q, self.k, self.v)
-            out.backward(self.grad_out)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        self.q.grad = self.k.grad = self.v.grad = None
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-if __name__ == "__main__":
-    from driver import run_as_main
-    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_casting.py b/benchmarks/asv/bench_casting.py
deleted file mode 100644
index 713aa498e..000000000
--- a/benchmarks/asv/bench_casting.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""
-Benchmarks quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) for
-both E4M3 (activations/weights) and E5M2 (gradients) formats.
-
-Shapes are (M, hidden_size) matching the activation tensors from models:
-  - Llama 3.1 8B, 70B, 405B
-  - Qwen 2.5  7B, 72B
-
-These casts are memory-bound; we report GB/s (input + output bytes).
-
-Sources for model configs:
-  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
-"""
-
-import torch
-from transformer_engine.pytorch import Float8CurrentScalingQuantizer
-from transformer_engine_torch import DType as TE_DType
-
-HIDDEN_SIZES = {
-    "Llama3-8B": 4096,
-    "Llama3-70B": 8192,
-    "Llama3-405B": 16384,
-    "Qwen2.5-7B": 3584,
-    "Qwen2.5-72B": 8192,
-}
-
-CAST_CONFIGS = {
-    "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3),
-    "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3),
-    "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2),
-    "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2),
-}
-
-
-class BenchCasting:
-    params = [[1024, 2048, 4096, 8192], list(HIDDEN_SIZES), list(CAST_CONFIGS)]
-    param_names = ["M", "model", "cast"]
-    timeout = 120
-    # Driver overrides these per (combo, method): _inner is the number of
-    # kernel invocations per CUDA event window (amortizes launch overhead);
-    # _scratch, when not None, is fill_()ed before each sample to evict the
-    # GPU cache.
-    _inner = 1
-    _scratch = None
-
-    def setup(self, M, model, cast):
-        hidden = HIDDEN_SIZES[model]
-        direction, fp8_dtype = CAST_CONFIGS[cast]
-        self.direction = direction
-        quantizer = Float8CurrentScalingQuantizer(
-            fp8_dtype=fp8_dtype,
-            device=torch.device("cuda"),
-            rowwise=True,
-            columnwise=False,
-        )
-        if direction == "dequantize":
-            bf16_tensor = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
-            self.x = quantizer.quantize(bf16_tensor)
-        else:
-            self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
-            self.quantizer = quantizer
-        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
-
-    def work_cast(self, M, model, cast):
-        hidden = HIDDEN_SIZES[model]
-        direction = CAST_CONFIGS[cast][0]
-        if direction == "quantize":
-            # Read BF16 (2B) + write FP8 (1B) + write scale
-            return {"bytes": M * hidden * 3}
-        else:
-            # Read FP8 (1B) + read scale + write BF16 (2B)
-            return {"bytes": M * hidden * 3}
-
-    def time_cast(self, M, model, cast):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        if self.direction == "quantize":
-            for _ in range(self._inner):
-                self.quantizer.quantize(self.x)
-        else:
-            for _ in range(self._inner):
-                self.x.dequantize(dtype=torch.bfloat16)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-if __name__ == "__main__":
-    from driver import run_as_main
-    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_gemm.py b/benchmarks/asv/bench_gemm.py
deleted file mode 100644
index b1ad40f99..000000000
--- a/benchmarks/asv/bench_gemm.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""BF16 GEMM benchmarks via te.Linear.
-
-GEMM shapes derived from transformer layer projections:
-  QKV, AttnOut, GateUp (SwiGLU), Down.
-
-Model configuration sources:
-- Llama 3 8B (hidden=4096, intermediate=14336, heads=32, kv_heads=8, head_dim=128)
-  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
-
-- Llama 3 70B (hidden=8192, intermediate=28672, heads=64, kv_heads=8, head_dim=128)
-  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
-
-- Llama 3 405B (hidden=16384, intermediate=53248, heads=128, kv_heads=8, head_dim=128)
-  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
-
-- Qwen 2.5 7B (hidden=3584, intermediate=18944, heads=28, kv_heads=4, head_dim=128)
-  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
-
-- Qwen 2.5 72B (hidden=8192, intermediate=29568, heads=64, kv_heads=8, head_dim=128)
-  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
-  """
-
-import torch
-import transformer_engine.pytorch as te
-
-# (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp)
-MODELS = {
-    "Llama3-8B_TP1":   (4096, 14336, 32, 8, 128, 1),
-    "Llama3-8B_TP8":   (4096, 14336, 32, 8, 128, 8),
-    "Llama3-70B_TP8":  (8192, 28672, 64, 8, 128, 8),
-    "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8),
-    "Qwen2.5-7B_TP1":  (3584, 18944, 28, 4, 128, 1),
-    "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8),
-}
-
-# Pre-compute (N, K) for each GEMM shape
-SHAPES = {}
-for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items():
-    SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h)
-    SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp)
-    SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h)
-    SHAPES[f"{_name}-Down"] = (h, inter // tp)
-
-
-class BenchGemm:
-    params = [[1024, 2048, 4096, 8192], list(SHAPES)]
-    param_names = ["M", "shape"]
-    timeout = 300
-    _inner = 1
-    _scratch = None
-
-    def setup(self, M, shape):
-        N, K = SHAPES[shape]
-        dtype = torch.bfloat16
-        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
-        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
-        self.grad_out = torch.randn_like(self.linear(self.x))
-        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
-
-    def work_forward(self, M, shape):
-        N, K = SHAPES[shape]
-        return {"flops": 2 * M * N * K}
-
-    def work_forward_backward(self, M, shape):
-        N, K = SHAPES[shape]
-        return {"flops": 3 * 2 * M * N * K}
-
-    def time_forward(self, M, shape):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            self.linear(self.x)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-    def time_forward_backward(self, M, shape):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            out = self.linear(self.x)
-            out.backward(self.grad_out)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        self.x.grad = None
-        self.linear.weight.grad = None
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-if __name__ == "__main__":
-    from driver import run_as_main
-    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_gemm_fp8.py b/benchmarks/asv/bench_gemm_fp8.py
deleted file mode 100644
index 8728695e4..000000000
--- a/benchmarks/asv/bench_gemm_fp8.py
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""
-FP8 GEMM benchmarks via te.Linear under fp8_autocast.
-
-Same shapes as bench_gemm.py but with FP8 quantized compute:
-  - Llama 3   8B (TP=1, TP=8), 70B (TP=8), 405B (TP=8)
-  - Qwen 2.5  7B (TP=1), 72B (TP=8)
-
-Each model contributes four GEMM shapes:
-  QKV projection     (column-parallel)  N = (Qheads + 2*KVheads)*head_dim / TP, K = hidden
-  Attention output   (row-parallel)     N = hidden, K = Qheads*head_dim / TP
-  MLP Gate+Up        (column-parallel)  N = 2*intermediate / TP, K = hidden  (SwiGLU)
-  MLP Down           (row-parallel)     N = hidden, K = intermediate / TP
-
-Sources for model configs:
-  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
-"""
-
-import torch
-import transformer_engine.pytorch as te
-from transformer_engine.common.recipe import DelayedScaling, Format
-
-# (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp)
-MODELS = {
-    "Llama3-8B_TP1":   (4096, 14336, 32, 8, 128, 1),
-    "Llama3-8B_TP8":   (4096, 14336, 32, 8, 128, 8),
-    "Llama3-70B_TP8":  (8192, 28672, 64, 8, 128, 8),
-    "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8),
-    "Qwen2.5-7B_TP1":  (3584, 18944, 28, 4, 128, 1),
-    "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8),
-}
-
-SHAPES = {}
-for _name, (h, inter, nq, nkv, hd, tp) in MODELS.items():
-    SHAPES[f"{_name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h)
-    SHAPES[f"{_name}-AttnOut"] = (h, (nq * hd) // tp)
-    SHAPES[f"{_name}-GateUp"] = ((2 * inter) // tp, h)
-    SHAPES[f"{_name}-Down"] = (h, inter // tp)
-
-FP8_RECIPE = DelayedScaling(
-    fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max",
-)
-
-
-class BenchGemmFP8:
-    params = [[1024, 2048, 4096, 8192], list(SHAPES)]
-    param_names = ["M", "shape"]
-    timeout = 300
-    _inner = 1
-    _scratch = None
-
-    def setup(self, M, shape):
-        N, K = SHAPES[shape]
-        dtype = torch.bfloat16
-        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
-        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
-        self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda")
-        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
-
-    def work_forward(self, M, shape):
-        N, K = SHAPES[shape]
-        return {"flops": 2 * M * N * K}
-
-    def work_forward_backward(self, M, shape):
-        N, K = SHAPES[shape]
-        return {"flops": 3 * 2 * M * N * K}
-
-    def time_forward(self, M, shape):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
-            for _ in range(self._inner):
-                self.linear(self.x)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-    def time_forward_backward(self, M, shape):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
-                out = self.linear(self.x)
-            out.backward(self.grad_out)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        self.x.grad = None
-        self.linear.weight.grad = None
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-if __name__ == "__main__":
-    from driver import run_as_main
-    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_grouped_gemm.py b/benchmarks/asv/bench_grouped_gemm.py
deleted file mode 100644
index 199f651c6..000000000
--- a/benchmarks/asv/bench_grouped_gemm.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""Grouped GEMM benchmarks via te.GroupedLinear.
-
-MoE model configurations with GateUp and Down projections.
-Configurations are based on:
-https://github.com/AMD-AGI/Primus-Turbo/blob/main/benchmark/ops/config.py
-"""
-
-import torch
-import transformer_engine.pytorch as te
-
-# (n_routed_experts, moe_intermediate_size, hidden_size)
-MOE_MODELS = {
-    "DSV2-Lite": (64, 1408, 2048),
-    "DSV2":      (160, 1536, 5120),
-    "DSV3":      (256, 2048, 7168),
-    "Grok-V2":   (8, 16384, 8192),
-}
-
-# Build (config_key -> (num_gemms, N, K)) mapping
-CONFIGS = {}
-for model, (n_experts, inter, hidden) in MOE_MODELS.items():
-    for ep in [32, 16, 8]:
-        if n_experts % ep != 0:
-            continue
-        B = n_experts // ep
-        CONFIGS[f"{model}_EP{ep}-GateUp"] = (B, 2 * inter, hidden)
-        CONFIGS[f"{model}_EP{ep}-Down"] = (B, hidden, inter)
-
-
-class BenchGroupedGemm:
-    params = [[512, 1024, 2048, 4096], list(CONFIGS)]
-    param_names = ["M", "config"]
-    timeout = 300
-    _inner = 1
-    _scratch = None
-
-    def setup(self, M, config):
-        B, N, K = CONFIGS[config]
-        dtype = torch.bfloat16
-
-        self.module = te.GroupedLinear(
-            num_gemms=B, in_features=K, out_features=N, bias=False,
-        ).to(device="cuda", dtype=dtype)
-
-        self.xs = [
-            torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
-            for _ in range(B)
-        ]
-        outs = self.module(self.xs)
-        self.grad_outs = [torch.randn_like(o) for o in outs]
-        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
-
-    def work_forward(self, M, config):
-        B, N, K = CONFIGS[config]
-        return {"flops": B * 2 * M * N * K}
-
-    def work_forward_backward(self, M, config):
-        B, N, K = CONFIGS[config]
-        return {"flops": B * 3 * 2 * M * N * K}
-
-    def time_forward(self, M, config):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            self.module(self.xs)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-    def time_forward_backward(self, M, config):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            outs = self.module(self.xs)
-            torch.autograd.backward(outs, self.grad_outs)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        for x in self.xs:
-            x.grad = None
-        for p in self.module.parameters():
-            p.grad = None
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-if __name__ == "__main__":
-    from driver import run_as_main
-    run_as_main(__file__)
diff --git a/benchmarks/asv/bench_normalization.py b/benchmarks/asv/bench_normalization.py
deleted file mode 100644
index 2b3608bac..000000000
--- a/benchmarks/asv/bench_normalization.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""
-RMSNorm and LayerNorm benchmarks on activation-sized tensors.
-
-Shapes are derived from training workloads:
-  - Llama 3   8B, 70B, 405B (all use RMSNorm)
-  - Qwen 2.5  7B, 72B       (all use RMSNorm)
-
-Modern models predominantly use RMSNorm, but we benchmark both
-LayerNorm and RMSNorm since TE supports both and they share the
-same kernel infrastructure.
-
-The M dimension (batch * seq_len) is swept across typical training sizes.
-
-Sources for model configs:
-  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
-  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
-  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
-"""
-
-import torch
-import transformer_engine.pytorch as te
-
-NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm}
-HIDDEN_SIZES = [3584, 4096, 8192, 16384]
-
-
-class BenchNormalization:
-    params = [[1024, 2048, 4096, 8192], HIDDEN_SIZES, list(NORMS)]
-    param_names = ["M", "hidden", "norm_type"]
-    timeout = 120
-    _inner = 1
-    _scratch = None
-
-    def setup(self, M, hidden, norm_type):
-        dtype = torch.bfloat16
-        self.norm = NORMS[norm_type](hidden).to(device="cuda", dtype=dtype)
-        self.x = torch.randn(M, hidden, dtype=dtype, device="cuda", requires_grad=True)
-        self.grad_out = torch.randn_like(self.norm(self.x))
-        self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
-
-    def work_forward(self, M, hidden, norm_type):
-        # Read input (2B) + write output (2B) = 4 bytes per element
-        return {"bytes": M * hidden * 4}
-
-    def work_forward_backward(self, M, hidden, norm_type):
-        # Fwd: read+write (4B), Bwd: read input+grad_out+write grad_in (6B) = 10B
-        return {"bytes": M * hidden * 10}
-
-    def time_forward(self, M, hidden, norm_type):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            self.norm(self.x)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-    def time_forward_backward(self, M, hidden, norm_type):
-        if self._scratch is not None:
-            self._scratch.fill_(1.0)
-        self._evt[0].record()
-        for _ in range(self._inner):
-            out = self.norm(self.x)
-            out.backward(self.grad_out)
-        self._evt[1].record()
-        torch.cuda.synchronize()
-        self.x.grad = None
-        for p in self.norm.parameters():
-            p.grad = None
-        return self._evt[0].elapsed_time(self._evt[1]) / 1000 / self._inner
-
-if __name__ == "__main__":
-    from driver import run_as_main
-    run_as_main(__file__)
diff --git a/benchmarks/asv/compare_results.py b/benchmarks/asv/compare_results.py
deleted file mode 100644
index c1313e1a2..000000000
--- a/benchmarks/asv/compare_results.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""Statistically compare two ASV result JSON files written by ``driver.py``.
-
-The point-estimate timings in the ASV dashboard cannot tell a real regression
-from measurement noise. This tool compares the raw per-call samples stored in
-two result files (one per checkout) using a statistical test (Brunner-Munzel by
-default) via the benchstats package. It marks each (benchmark, parameter
-combination) as faster (``<``), slower (``>``), or not significantly different
-(``~``) and exits ``1`` when a significant timing difference is found, so it can
-gate CI. A summary line reports how many benchmarks were significantly faster,
-significantly slower, or unchanged. Requires ``pip install -r requirements.txt``.
-
-Usage:
-    # run the suite on the baseline checkout, then on the candidate checkout,
-    # pointing each at its own results file, then:
-    python compare_results.py baseline.json candidate.json
-    python compare_results.py baseline.json candidate.json --alpha 0.01
-    python compare_results.py baseline.json candidate.json --export-to report.svg
-"""
-
-import argparse
-import os
-import sys
-
-
-def run_stats(args):
-    """Compare two ASV result JSONs with a statistical test via benchstats.
-
-    Returns a process exit code: 1 if a significant difference is found in the
-    timing metric, else 0.
-    """
-    import rich.table  # noqa: F401  benchstats 3.4.0 render uses rich.table.Table without importing it
-    from parser_TEasv import parser_TEasv
-    from benchstats.compare import compareStats
-    from benchstats.render import renderComparisonResults
-    from benchstats.common import LoggingConsole, detectExportFormat
-
-    main_metrics = ["time_s"]
-
-    export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None
-    if export_fmt is not None and os.path.isfile(args.export_to):
-        os.remove(args.export_to)
-
-    console = LoggingConsole(
-        record=export_fmt is not None,
-        log_level=LoggingConsole.LogLevel.Warning,
-    )
-
-    s1 = parser_TEasv(args.baseline_json, args.filter, None, debug_log=console).getStats()
-    s2 = parser_TEasv(args.candidate_json, args.filter, None, debug_log=console).getStats()
-
-    cr = compareStats(
-        s1, s2,
-        method=args.method,
-        alpha=args.alpha,
-        main_metrics=main_metrics,
-        debug_log=console,
-    )
-
-    renderComparisonResults(
-        cr, console,
-        main_metrics=main_metrics,
-        always_show_pvalues=args.always_show_pvalues,
-    )
-
-    # Tally significant results per direction for the timing metric. benchstats
-    # encodes the outcome of each comparison as set0-vs-set1: "<" means baseline
-    # < candidate (candidate's time is higher -> slower / a regression), ">"
-    # means baseline > candidate (candidate faster / a speedup), "~" means not
-    # significant at alpha. Printed via the console so it is captured by export.
-    for metric in main_metrics:
-        counts = {"<": 0, ">": 0, "~": 0}
-        for bm_res in cr.results.values():
-            res = bm_res.get(metric)
-            if res is not None:
-                counts[res.result] = counts.get(res.result, 0) + 1
-        total = counts["<"] + counts[">"] + counts["~"]
-        console.print(
-            f"\nSummary for '{metric}' ({cr.method}, alpha={cr.alpha:g}, "
-            f"{total} benchmarks):"
-        )
-        console.print(f"  candidate faster (significant, '>'): {counts['>']}")
-        console.print(f"  candidate slower (significant, '<'): {counts['<']}")
-        console.print(f"  no significant difference ('~'):     {counts['~']}")
-
-    if export_fmt is not None:
-        if export_fmt == "txt":
-            console.save_text(args.export_to)
-        elif export_fmt == "svg":
-            console.save_svg(args.export_to, title="")
-        elif export_fmt == "html":
-            console.save_html(args.export_to)
-
-    if cr.at_least_one_differs:
-        console.warning(
-            "At least one significant timing difference was detected (exit 1)."
-        )
-        return 1
-    return 0
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Statistically compare two ASV result JSON files via benchstats.")
-    parser.add_argument("baseline_json", help="Baseline ASV result JSON")
-    parser.add_argument("candidate_json", help="Candidate ASV result JSON")
-    parser.add_argument(
-        "--filter", default=None,
-        help="Only compare benchmarks whose name matches this regex.",
-    )
-    parser.add_argument(
-        "--alpha", type=float, default=0.001,
-        help="Significance level for the test (default: 0.001).",
-    )
-    parser.add_argument(
-        "--method", default="brunnermunzel",
-        help="Statistical test to use (default: brunnermunzel).",
-    )
-    parser.add_argument(
-        "--always-show-pvalues", action="store_true",
-        help="Always show p-values, including for non-significant results.",
-    )
-    parser.add_argument(
-        "--export-to", default=None, metavar="FILE",
-        help="Export the report to a .txt/.svg/.html file (format from extension).",
-    )
-    args = parser.parse_args()
-
-    # The benchstats parser is imported lazily from the script directory.
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    if script_dir not in sys.path:
-        sys.path.insert(0, script_dir)
-
-    return run_stats(args)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/benchmarks/asv/driver.py b/benchmarks/asv/driver.py
deleted file mode 100644
index 52abcda64..000000000
--- a/benchmarks/asv/driver.py
+++ /dev/null
@@ -1,613 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""ASV benchmark driver — runs bench classes in-process and saves ASV-compatible results.
-
-Usage:
-    python driver.py <suite> [method_filter] [-w W] [-n N] [--no-save]
-    python driver.py --all [-w W] [-n N] [--no-save]
-    python bench_gemm.py [method_filter] [-w W] [-n N] [--no-save]
-"""
-
-import argparse
-import glob
-import hashlib
-import importlib
-import inspect
-import itertools
-import json
-import os
-import platform
-import random
-import re
-import subprocess
-import sys
-import textwrap
-import time
-import numpy as np
-
-
-# ---------------------------------------------------------------------------
-# ASV result generation
-# ---------------------------------------------------------------------------
-
-def _get_benchmark_code_and_version(cls, method_name):
-    """Build the code string and version hash the same way ASV does.
-
-    ASV hashes a code string built from the time_* and setup methods.
-    The string is class header + indented time method + indented setup,
-    with no trailing newline.
-
-    Returns (code, version_hash).
-    """
-    time_src = textwrap.dedent(inspect.getsource(getattr(cls, method_name)))
-    setup_src = textwrap.dedent(inspect.getsource(cls.setup))
-    code = (
-        f"class {cls.__name__}:\n"
-        + textwrap.indent(time_src, "    ") + "\n"
-        + textwrap.indent(setup_src, "    ")
-    ).rstrip("\n")
-    return code, hashlib.sha256(code.encode()).hexdigest()
-
-
-def _format_param_value(v):
-    """Format a parameter value the way ASV stores it in JSON."""
-    if isinstance(v, str):
-        return f"'{v}'"
-    return repr(v)
-
-
-def _get_machine_info():
-    """Build the params/machine dict ASV expects."""
-    machine = platform.node()
-    info = {
-        "arch": platform.machine(),
-        "cpu": "",
-        "machine": machine,
-        "num_cpu": str(os.cpu_count()),
-        "os": f"{platform.system()} {platform.release()}",
-        "ram": "",
-    }
-    try:
-        with open("/proc/cpuinfo") as f:
-            for line in f:
-                if line.startswith("model name"):
-                    info["cpu"] = line.split(":", 1)[1].strip()
-                    break
-        with open("/proc/meminfo") as f:
-            for line in f:
-                if line.startswith("MemTotal"):
-                    info["ram"] = line.split()[1]  # kB
-                    break
-    except OSError:
-        pass
-    return machine, info
-
-
-def _get_commit_hash():
-    """Get the current git HEAD hash."""
-    try:
-        return subprocess.check_output(
-            ["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL
-        ).decode().strip()
-    except Exception:
-        return "unknown"
-
-
-def _compute_stats(samples):
-    """Return (median, mean, stdev, ci_lo, ci_hi, q25, q75) for *samples*.
-
-    Quartiles use linear interpolation (numpy default) — more meaningful at
-    small n than the index-floor approach. stdev is population stdev to
-    match the prior wire format; CI is a normal-approximation 99% half-width.
-    """
-    s = np.asarray(samples, dtype=np.float64)
-    mean = float(s.mean())
-    stdev = float(s.std(ddof=0))
-    median, q25, q75 = (float(x) for x in np.quantile(s, [0.5, 0.25, 0.75]))
-    ci = 2.576 * stdev / np.sqrt(s.size)  # 99% normal-approx half-width
-    return median, mean, stdev, max(0.0, mean - ci), mean + ci, q25, q75
-
-
-def _get_results_dir():
-    """Read results_dir from asv.conf.json, resolved to an absolute path."""
-    conf_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "asv.conf.json")
-    with open(conf_path) as f:
-        conf = json.load(f)
-    conf_dir = os.path.dirname(conf_path)
-    return os.path.normpath(os.path.join(conf_dir, conf["results_dir"]))
-
-
-def save_asv_results(all_results, bench_meta, label=None):
-    """Write results and benchmark index to ASV's results directory.
-
-    *label*, when given, is folded into the result filename so multiple runs on
-    the same commit (e.g. prototyping with a dirty working tree, where the HEAD
-    hash is unchanged) land in distinct files that ``compare_results.py`` can
-    compare instead of overwriting each other.
-    """
-    commit_hash = _get_commit_hash()
-    machine_name, machine_info = _get_machine_info()
-    env_name = "existing-" + sys.executable.replace("/", "_").strip("_")
-    results_dir = _get_results_dir()
-    machine_dir = os.path.join(results_dir, machine_name)
-    os.makedirs(machine_dir, exist_ok=True)
-
-    # Write machine.json if missing
-    machine_json = os.path.join(machine_dir, "machine.json")
-    if not os.path.exists(machine_json):
-        with open(machine_json, "w") as f:
-            json.dump({**machine_info, "version": 1}, f, indent=4)
-
-    # Load existing result file or start fresh. A label is sanitized to keep the
-    # filename safe (no path separators / whitespace) and inserted after the hash.
-    if label:
-        safe_label = re.sub(r"[^A-Za-z0-9._-]+", "_", label).strip("_")
-        filename = f"{commit_hash[:8]}-{safe_label}-{env_name}.json"
-    else:
-        filename = f"{commit_hash[:8]}-{env_name}.json"
-    result_path = os.path.join(machine_dir, filename)
-    if os.path.exists(result_path):
-        with open(result_path) as f:
-            data = json.load(f)
-    else:
-        data = {
-            "commit_hash": commit_hash,
-            "env_name": env_name,
-            "date": int(time.time() * 1000),
-            "params": {**machine_info, "python": sys.executable},
-            "python": sys.executable,
-            "requirements": {},
-            "env_vars": {},
-            "result_columns": [
-                "result", "params", "version",
-                "started_at", "duration",
-                "stats_ci_99_a", "stats_ci_99_b",
-                "stats_q_25", "stats_q_75",
-                "stats_number", "stats_repeat",
-                "samples",
-            ],
-            "results": {},
-            "durations": {},
-            "version": 2,
-        }
-
-    # Merge new results
-    for bench_key, bench_data in all_results.items():
-        data["results"][bench_key] = bench_data
-
-    with open(result_path, "w") as f:
-        json.dump(data, f, indent=2)
-
-    print(f"\nResults saved to {result_path}")
-
-    # Update benchmarks.json index so ASV dashboard stays in sync
-    benchmarks_path = os.path.join(results_dir, "benchmarks.json")
-    if os.path.exists(benchmarks_path):
-        with open(benchmarks_path) as f:
-            benchmarks_data = json.load(f)
-    else:
-        benchmarks_data = {"version": 2}
-
-    benchmarks_data.update(bench_meta)
-
-    with open(benchmarks_path, "w") as f:
-        json.dump(benchmarks_data, f, indent=4)
-
-    print(f"Updated {benchmarks_path}")
-
-
-# ---------------------------------------------------------------------------
-# Benchmark runner
-# ---------------------------------------------------------------------------
-
-_ASV_META_DEFAULTS = {
-    "min_run_count": 2, "number": 0, "repeat": 0, "rounds": 2,
-    "sample_time": 0.01, "type": "time", "unit": "seconds", "warmup_time": -1,
-}
-
-
-def _make_scratch(mb):
-    """Allocate a scratch buffer used to evict the GPU cache between samples.
-
-    Sized by default to exceed the MI300 Infinity Cache (256 MB) and the L2
-    (16 MB), so a single fill writes through every level of cache.
-    """
-    import torch  # noqa: deferred import — only needed when cold-cache is on
-    n = max(1, (mb * 1024 * 1024) // 4)  # float32 = 4 bytes
-    return torch.empty(n, dtype=torch.float32, device="cuda")
-
-
-def _autotune_inner(instance, method_name, combo, target_s, max_inner=10000):
-    """Pick an inner-loop count so one timed window lasts >= target_s.
-
-    The bench class is expected to honor instance._inner inside its time_*
-    method (loop the kernel that many times in one CUDA event window and
-    divide).  This probe runs two single invocations: one to settle algorithm
-    selection / cache state, and one to estimate the per-call cost.
-    """
-    method = getattr(instance, method_name)
-    saved_inner = instance._inner
-    instance._inner = 1
-    try:
-        method(*combo)               # discard: cold cache + autotuner warmup
-        t_per = method(*combo)       # seconds per single invocation
-    finally:
-        instance._inner = saved_inner
-    if t_per is None or t_per <= 0:
-        return 1
-    return max(1, min(max_inner, int(target_s / t_per) + 1))
-
-
-def _free_gpu_cache():
-    """Release cached GPU memory between interleave chunks.
-
-    No-op when torch was never imported (e.g. CPU-only test harnesses), so the
-    driver stays importable and runnable without torch present.
-    """
-    torch = sys.modules.get("torch")
-    if torch is not None:
-        try:
-            torch.cuda.empty_cache()
-        except Exception:
-            pass
-
-
-def run_class(
-    suite_name, cls, class_name, method_filter=None,
-    warmup=3, iters=7,
-    inner="auto", target_window_ms=1.0,
-    cold_cache=False, cache_flush_mb=256,
-    interleave_group=8, rng=None, shuffle=True,
-):
-    """Run all benchmarks in a class, returning (results, metadata) dicts.
-
-    Samples are collected in round-robin chunks of ``interleave_group``
-    ``(method, combo)`` benchmarks: one sample is taken from each benchmark in
-    the chunk per round, for ``iters`` rounds. This spreads every benchmark's
-    samples across the same wall-clock window so time-correlated GPU noise
-    (thermal ramp, DVFS throttle) becomes shared variance rather than a bias on
-    whichever benchmark happened to own a contiguous block of time. See
-    ``repro/transient_noise_sim.py``. ``interleave_group=1`` reproduces the
-    original contiguous (sequential) behavior; larger groups interleave more
-    benchmarks but keep that many GPU instances live at once.
-
-    When ``shuffle`` is true the per-round visit order is randomly permuted
-    (seeded by *rng*, a ``random.Random``; one is created with seed 0 if not
-    given). Fixed round-robin still pins each benchmark to a constant phase
-    within the round, so a monotonic ramp leaves a small constant per-benchmark
-    offset and each benchmark always sees the same predecessor's cache/clock
-    state. Permuting each round makes both uniform in expectation, turning that
-    residual bias into variance. The per-round structure is kept (each benchmark
-    still gets exactly ``iters`` evenly-spread samples) -- a balanced randomized
-    design, not a global shuffle that could re-cluster a benchmark's samples.
-    """
-    methods = sorted(m for m in dir(cls) if m.startswith("time_"))
-    if method_filter:
-        methods = [m for m in methods if method_filter in m]
-    if not methods:
-        return {}, {}
-
-    params = getattr(cls, "params", [[]])
-    param_names = getattr(cls, "param_names", [])
-    combos = list(itertools.product(*params))
-    asv_params = [[_format_param_value(v) for v in dim] for dim in params]
-
-    # Discover throughput columns from work_* companions
-    # Each entry: (dict_key, column_header, unit_divisor)
-    probe_keys = set()
-    for m in methods:
-        wfn = getattr(cls, "work_" + m[5:], None)
-        if wfn:
-            try:
-                probe_keys.update(wfn(cls(), *combos[0]))
-            except Exception:
-                pass
-    throughput_cols = []
-    if "flops" in probe_keys:
-        throughput_cols.append(("flops", "TFLOPS", 1e12))
-    if "bytes" in probe_keys:
-        throughput_cols.append(("bytes", "GB/s", 1e9))
-
-    # Print table header
-    target_window_s = target_window_ms / 1000.0
-    group = max(1, int(interleave_group))
-    if rng is None:
-        rng = random.Random(0)
-    inner_desc = (
-        "cold-cache (inner=1)" if cold_cache
-        else f"inner={inner}" if inner != "auto"
-        else f"inner=auto (>={target_window_ms:g}ms window)"
-    )
-    if group == 1:
-        sched_desc = "sequential"
-    else:
-        sched_desc = f"interleaved group={group}, " + ("shuffled" if shuffle else "fixed-order")
-    print(f"\n{class_name}  ({len(combos)} combos x {len(methods)} methods, "
-          f"{warmup} warmup, {iters} timed, {inner_desc}, {sched_desc})")
-    extra_hdr = "".join(f"  {label:>10}" for _, label, _ in throughput_cols)
-    HDR = (f"  {'median':>10}  {'mean':>10}  {'stdev':>10}"
-           f"  {'q25':>10}  {'q75':>10}  {'min':>10}  {'max':>10}"
-           + extra_hdr + f"  {'inner':>5}  {'method':<30}  params")
-    print("-" * len(HDR))
-    print(HDR)
-    print("-" * len(HDR))
-
-    all_results = {}
-    all_meta = {}
-
-    # Per-method result columns, indexed by combo position. Filling by index
-    # decouples the wire format from the order samples are actually collected in,
-    # so interleaved scheduling leaves the saved JSON identical to sequential.
-    n_combos = len(combos)
-    cols = {
-        m: {k: [None] * n_combos for k in
-            ("median", "ci_lo", "ci_hi", "q25", "q75", "number", "repeat", "samples")}
-        for m in methods
-    }
-    versions = {}
-    for method_name in methods:
-        bench_key = f"{suite_name}.{class_name}.{method_name}"
-        code, version = _get_benchmark_code_and_version(cls, method_name)
-        versions[method_name] = version
-        all_meta[bench_key] = {
-            **_ASV_META_DEFAULTS,
-            "code": code, "name": bench_key, "version": version,
-            "param_names": list(param_names), "params": asv_params,
-            "timeout": getattr(cls, "timeout", 300),
-        }
-
-    def _label(combo):
-        return ", ".join(f"{nm}={v}" for nm, v in zip(param_names, combo))
-
-    # Flatten to (method, combo) tasks, method-major so printed rows keep the
-    # same grouping as before, then sample them in round-robin chunks.
-    tasks = [(mi, ci) for mi in range(len(methods)) for ci in range(n_combos)]
-    started_at = int(time.time() * 1000)
-    t_start = time.perf_counter()
-
-    for chunk_start in range(0, len(tasks), group):
-        chunk = tasks[chunk_start:chunk_start + group]
-
-        # Setup phase: prepare every benchmark in the chunk (allocate tensors,
-        # pick _inner, warm up) and keep its instance live for round-robin timing.
-        live = []  # (instance, method_obj, method_name, combo, combo_idx)
-        for mi, ci in chunk:
-            method_name = methods[mi]
-            combo = combos[ci]
-            instance = cls()
-            try:
-                instance.setup(*combo)
-            except Exception as e:
-                print(f"  SKIP  {_label(combo)}  setup failed: {e}")
-                continue  # leaves None in this (method, combo) slot
-
-            # Inner-loop and cache configuration. Cold-cache mode forces
-            # inner=1 so only the first invocation in the window sees a
-            # cold cache; otherwise the 2nd..Nth invocations would refill
-            # it and we'd be back to a warm-cache measurement.
-            if cold_cache:
-                instance._scratch = _make_scratch(cache_flush_mb)
-                instance._inner = 1
-            elif inner == "auto":
-                instance._inner = _autotune_inner(
-                    instance, method_name, combo, target_window_s)
-            else:
-                instance._inner = max(1, int(inner))
-
-            method = getattr(instance, method_name)
-            for _ in range(warmup):
-                method(*combo)
-            live.append((instance, method, method_name, combo, ci))
-
-        # Timed phase: one sample from each live benchmark per round, so a
-        # transient spike lands on one sample of each rather than corrupting a
-        # whole benchmark's contiguous block. The visit order is re-permuted
-        # each round (when shuffle is on) so no benchmark is pinned to a fixed
-        # phase / predecessor; chunk_samples stays keyed by the stable index i.
-        chunk_samples = [[] for _ in live]
-        order = list(range(len(live)))
-        for _ in range(iters):
-            if shuffle and len(order) > 1:
-                rng.shuffle(order)
-            for i in order:
-                instance, method, method_name, combo, ci = live[i]
-                t0 = time.perf_counter()
-                result = method(*combo)
-                wall = time.perf_counter() - t0
-                chunk_samples[i].append(wall if result is None else result)
-
-        # Finalize phase: stats, throughput, print, store into the combo slot.
-        for i, (instance, method, method_name, combo, ci) in enumerate(live):
-            samples = chunk_samples[i]
-            median, mean, stdev, ci_lo, ci_hi, q25, q75 = _compute_stats(samples)
-            s_min, s_max = min(samples), max(samples)
-
-            c = cols[method_name]
-            c["median"][ci] = median
-            c["ci_lo"][ci] = ci_lo
-            c["ci_hi"][ci] = ci_hi
-            c["q25"][ci] = q25
-            c["q75"][ci] = q75
-            c["number"][ci] = instance._inner
-            c["repeat"][ci] = iters
-            # Keep the raw samples (seconds) for statistical comparison
-            # (compare_results.py). Rounded to 1 ns to keep the JSON compact
-            # without losing meaningful timing resolution.
-            c["samples"][ci] = [round(x, 9) for x in samples]
-
-            # Derive throughput from work_* companion
-            work = {}
-            wfn = getattr(instance, "work_" + method_name[5:], None)
-            if wfn and median > 0:
-                try:
-                    work = wfn(*combo)
-                except Exception:
-                    pass
-            extra_cols = ""
-            for key, _, divisor in throughput_cols:
-                if key in work and median > 0:
-                    extra_cols += f"  {work[key] / median / divisor:>10.1f}"
-                else:
-                    extra_cols += f"  {'':>10}"
-
-            print(f"  {median*1000:>8.3f}ms  {mean*1000:>8.3f}ms  "
-                  f"{stdev*1000:>8.3f}ms  {q25*1000:>8.3f}ms  {q75*1000:>8.3f}ms  "
-                  f"{s_min*1000:>8.3f}ms  {s_max*1000:>8.3f}ms"
-                  f"{extra_cols}  "
-                  f"{instance._inner:>5}  {method_name:<30}  {_label(combo)}")
-
-        # Release this chunk's GPU instances before setting up the next chunk.
-        live.clear()
-        _free_gpu_cache()
-
-    duration = time.perf_counter() - t_start
-    for method_name in methods:
-        bench_key = f"{suite_name}.{class_name}.{method_name}"
-        c = cols[method_name]
-        all_results[bench_key] = [
-            c["median"], asv_params, versions[method_name], started_at,
-            round(duration, 2),
-            c["ci_lo"], c["ci_hi"], c["q25"], c["q75"], c["number"], c["repeat"],
-            c["samples"],
-        ]
-
-    return all_results, all_meta
-
-
-def run_as_main(caller_file=None):
-    """Run benchmarks from a bench file or from the command line.
-
-    When called with a file path (from a bench file's ``__main__`` block),
-    the suite is derived from the filename.  When called without arguments
-    (i.e. ``python driver.py bench_gemm``), the suite is taken from argv.
-
-    Usage from a bench file::
-
-        if __name__ == "__main__":
-            from driver import run_as_main
-            run_as_main(__file__)
-    """
-    parser = argparse.ArgumentParser(
-        description="Run ASV benchmarks directly in-process (no subprocess overhead).")
-    if caller_file is None:
-        parser.add_argument("suite", nargs="?", default=None,
-                            help="Benchmark module name (e.g. bench_casting)")
-        parser.add_argument("--all", action="store_true",
-                            help="Run all bench_*.py suites in the directory")
-    parser.add_argument("method_filter", nargs="?", default=None,
-                        help="Only run time_* methods containing this string")
-    parser.add_argument("-w", "--warmup", type=int, default=10,
-                        help="Number of warmup iterations (default: 3)")
-    parser.add_argument("-n", "--iters", type=int, default=20,
-                        help="Number of timed iterations (default: 7)")
-    parser.add_argument("--inner", default="auto",
-                        help="Inner kernel invocations per timed window: "
-                             "'auto' (tune to --target-window-ms) or an integer "
-                             "(default: auto). Larger values amortize CUDA event "
-                             "and kernel-launch overhead.")
-    parser.add_argument("--target-window-ms", type=float, default=1.0,
-                        help="Target duration of one timed window when "
-                             "--inner=auto (default: 1.0 ms).")
-    parser.add_argument("--cold-cache", action="store_true",
-                        help="Flush the GPU cache (write a >LLC scratch buffer) "
-                             "before each sample. Forces --inner=1 because "
-                             "subsequent inner calls would refill the cache.")
-    parser.add_argument("--cache-flush-mb", type=int, default=256,
-                        help="Size in MB of the cache-flush buffer for "
-                             "--cold-cache (default: 256, sized for the MI300 "
-                             "Infinity Cache).")
-    parser.add_argument("--interleave-group", type=int, default=8,
-                        help="Number of (method, combo) benchmarks sampled "
-                             "round-robin together so time-correlated GPU noise "
-                             "(thermal ramp / DVFS throttle) is shared across "
-                             "them instead of biasing whichever benchmark owns a "
-                             "contiguous block of wall-clock time (default: 8). "
-                             "Each benchmark in a group keeps a live GPU "
-                             "instance, so lower this on out-of-memory. 1 = "
-                             "sequential. See repro/transient_noise_sim.py.")
-    parser.add_argument("--sequential", action="store_true",
-                        help="Collect each benchmark's samples in one contiguous "
-                             "block (equivalent to --interleave-group 1). Lowest "
-                             "memory, but biased under thermal drift.")
-    parser.add_argument("--seed", type=int, default=0,
-                        help="Seed for the per-round shuffle of the interleave "
-                             "order (default: 0), kept fixed so runs are "
-                             "reproducible.")
-    parser.add_argument("--no-shuffle", action="store_true",
-                        help="Disable the per-round random permutation and use a "
-                             "fixed round-robin order. Each benchmark then keeps "
-                             "a constant within-round phase and predecessor, "
-                             "leaving a small residual ordering bias.")
-    parser.add_argument("--no-save", action="store_true",
-                        help="Skip saving results to ASV format")
-    parser.add_argument("--label", default=None,
-                        help="Tag folded into the result filename "
-                             "(<hash>-<label>-<env>.json). Use it to keep "
-                             "multiple runs on the same commit (e.g. a dirty "
-                             "working tree) in distinct files for comparison.")
-    args = parser.parse_args()
-    if args.inner != "auto":
-        try:
-            args.inner = max(1, int(args.inner))
-        except ValueError:
-            parser.error("--inner must be 'auto' or a positive integer")
-    if args.sequential:
-        args.interleave_group = 1
-    args.interleave_group = max(1, args.interleave_group)
-
-    if caller_file is not None:
-        script_dir = os.path.dirname(os.path.abspath(caller_file))
-        suite_names = [os.path.splitext(os.path.basename(caller_file))[0]]
-    else:
-        script_dir = os.path.dirname(os.path.abspath(__file__))
-        run_all = getattr(args, "all", False)
-        if run_all:
-            suite_names = sorted(
-                os.path.splitext(os.path.basename(f))[0]
-                for f in glob.glob(os.path.join(script_dir, "bench_*.py"))
-            )
-        elif args.suite:
-            suite_names = [args.suite]
-        else:
-            parser.error("provide a suite name or use --all")
-
-    os.chdir(script_dir)
-    if script_dir not in sys.path:
-        sys.path.insert(0, script_dir)
-
-    # One RNG for the whole run so the interleave order is reproducible given
-    # --seed. Shared across classes so the stream is deterministic end-to-end.
-    rng = random.Random(args.seed)
-    shuffle = not args.no_shuffle
-    if args.interleave_group > 1 and shuffle:
-        print(f"Interleave: group={args.interleave_group}, shuffled (seed={args.seed})")
-
-    all_results = {}
-    all_meta = {}
-    for suite_name in suite_names:
-        mod = importlib.import_module(suite_name)
-        for name in sorted(dir(mod)):
-            obj = getattr(mod, name)
-            if isinstance(obj, type) and name.startswith("Bench"):
-                results, meta = run_class(
-                    suite_name, obj, name, args.method_filter,
-                    warmup=args.warmup, iters=args.iters,
-                    inner=args.inner, target_window_ms=args.target_window_ms,
-                    cold_cache=args.cold_cache,
-                    cache_flush_mb=args.cache_flush_mb,
-                    interleave_group=args.interleave_group,
-                    rng=rng, shuffle=shuffle,
-                )
-                all_results.update(results)
-                all_meta.update(meta)
-
-    if all_results and not args.no_save:
-        save_asv_results(all_results, all_meta, label=args.label)
-
-
-if __name__ == "__main__":
-    run_as_main()
diff --git a/benchmarks/asv/parser_TEasv.py b/benchmarks/asv/parser_TEasv.py
deleted file mode 100644
index 47bbc6799..000000000
--- a/benchmarks/asv/parser_TEasv.py
+++ /dev/null
@@ -1,172 +0,0 @@
-#!/usr/bin/env python3
-###############################################################################
-# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
-#
-# See LICENSE for license information.
-###############################################################################
-"""benchstats parser for ASV-format result JSON files written by ``driver.py``.
-
-Reads one ASV result file (``<results_dir>/<machine>/<hash>-<env>.json``) and
-turns it into the ``{benchmark_name: {metric: ndarray}}`` structure consumed by
-``benchstats.compare.compareStats``.
-
-An ASV result file stores, per benchmark, a row whose columns are named by the
-file's ``result_columns`` list. The driver records raw per-call timing samples
-in the ``samples`` column (a list of sample-lists, one per parameter
-combination, in ``itertools.product`` order over ``params``). This parser flattens
-that into one benchstats "benchmark" per (benchmark, parameter-combination):
-
-- the benchmark name is ``<suite>.<Class>.<time_method> | name=val, ...`` where
-  the parameter names come from ``benchmarks.json`` (falling back to positional
-  ``p0, p1, ...`` when the index is unavailable).
-- a single metric ``time_s`` (seconds, lower is better) holds the raw samples.
-  Samples are already stored in seconds; benchstats' renderer auto-scales them
-  (to ms/us/ns) assuming a seconds base unit.
-
-Throughput is intentionally not exposed as a separate metric: the ASV result
-file carries no per-sample work, and because work is constant per parameter
-combination a rank-based test on throughput is identical to the test on time.
-The driver already prints throughput columns during a run.
-
-The class name matches the file name (``parser_TEasv``) so it can also be loaded
-by the ``benchstats`` CLI via ``--files_parser`` / ``--file1_parser``.
-"""
-
-import itertools
-import json
-import os
-import re
-
-import numpy as np
-
-from benchstats.common import ParserBase, LoggingConsole
-
-_TIME_KEY = "time_s"           # metric key exposed to benchstats (seconds)
-_NAME_DELIM = " | "
-
-
-class parser_TEasv(ParserBase):
-    def __init__(self, json_file_path, filter, metrics=None, debug_log=True) -> None:
-        assert isinstance(json_file_path, str)
-        assert filter is None or isinstance(filter, (str, re.Pattern))
-        assert metrics is None or (
-            isinstance(metrics, (list, tuple)) and all(isinstance(m, str) for m in metrics)
-        )
-
-        if debug_log is None or (isinstance(debug_log, bool) and not debug_log):
-            self.debug_log = False
-        elif isinstance(debug_log, bool) and debug_log:
-            self.debug_log = True
-            self.logger = LoggingConsole(log_level=LoggingConsole.LogLevel.Debug)
-        else:
-            self.debug_log = True
-            self.logger = debug_log
-
-        self.file = json_file_path
-        self.filter = (
-            filter if filter is None or isinstance(filter, re.Pattern) else re.compile(filter)
-        )
-        self._requested_metrics = list(metrics) if metrics is not None else None
-        self._stats = self._build()
-
-    def getStats(self) -> dict[str, dict[str, np.ndarray]]:
-        return self._stats
-
-    def _log(self, level, msg):
-        if self.debug_log:
-            getattr(self.logger, level)(f"parser_TEasv: {msg}")
-
-    def _load_param_names(self) -> dict:
-        """Map ``bench_key -> [param_names]`` from the sibling ``benchmarks.json``.
-
-        Layout: ``<results_dir>/<machine>/<file>.json`` and
-        ``<results_dir>/benchmarks.json``. The names are only used for readable
-        labels, so a missing/unreadable index degrades gracefully to ``{}``.
-        """
-        results_dir = os.path.dirname(os.path.dirname(os.path.abspath(self.file)))
-        index_path = os.path.join(results_dir, "benchmarks.json")
-        try:
-            with open(index_path) as f:
-                index = json.load(f)
-        except (OSError, ValueError):
-            self._log("warning", f"could not read '{index_path}'; using positional param names.")
-            return {}
-        return {
-            key: meta["param_names"]
-            for key, meta in index.items()
-            if isinstance(meta, dict) and "param_names" in meta
-        }
-
-    def _build(self) -> dict[str, dict[str, np.ndarray]]:
-        with open(self.file) as f:
-            data = json.load(f)
-
-        columns = data.get("result_columns")
-        results = data.get("results", {})
-        if not isinstance(columns, list) or "samples" not in columns:
-            raise ValueError(
-                f"'{self.file}' has no 'samples' column. Re-run the benchmarks with a "
-                "driver.py that records raw samples."
-            )
-        i_params = columns.index("params")
-        i_samples = columns.index("samples")
-
-        names_map = self._load_param_names()
-        want_time = self._metric_requested(_TIME_KEY)
-
-        stats = {}
-        for bench_key, row in results.items():
-            if not row or len(row) <= i_samples:
-                continue
-            params = row[i_params] or []
-            sample_lists = row[i_samples]
-            if sample_lists is None:
-                self._log("warning", f"benchmark '{bench_key}' has no samples; skipping.")
-                continue
-
-            combos = list(itertools.product(*params)) if params else [()]
-            param_names = names_map.get(bench_key)
-
-            for combo, samples in itertools.zip_longest(combos, sample_lists):
-                if samples is None:
-                    continue
-                time_s = np.asarray(samples, dtype=np.float64)
-                time_s = time_s[np.isfinite(time_s)]
-                if time_s.size == 0:
-                    continue
-
-                label = self._format_combo(param_names, combo)
-                bm_name = bench_key + (_NAME_DELIM + label if label else "")
-                if self.filter is not None and self.filter.search(bm_name) is None:
-                    continue
-
-                if self.debug_log and time_s.size < 10:
-                    self._log(
-                        "warning",
-                        f"benchmark '{bm_name}' has only {time_s.size} samples "
-                        "(>= 10 recommended); re-run with a larger -n/--iters.",
-                    )
-                if want_time:
-                    stats[bm_name] = {_TIME_KEY: time_s}
-
-        if not stats:
-            self._log("warning", f"no benchmarks read from '{self.file}'.")
-        return stats
-
-    @staticmethod
-    def _format_combo(param_names, combo):
-        """Build a readable ``name=val, ...`` label for one parameter combination."""
-        if combo is None:
-            return ""
-        values = [str(v) for v in combo]
-        if param_names and len(param_names) == len(values):
-            return ", ".join(f"{n}={v}" for n, v in zip(param_names, values))
-        return ", ".join(values)
-
-    def _metric_requested(self, key):
-        """Honor an explicit metrics= request (benchstats CLI), else expose everything."""
-        if self._requested_metrics is None:
-            return True
-        if key == _TIME_KEY:
-            return any(t in self._requested_metrics for t in (_TIME_KEY, "time_ms", "time"))
-        return key in self._requested_metrics
diff --git a/benchmarks/asv/requirements.txt b/benchmarks/asv/requirements.txt
deleted file mode 100644
index 7b70ea9f7..000000000
--- a/benchmarks/asv/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-# Extra dependencies for statistical benchmark comparison
-# (compare_results.py). benchstats pulls in rich, scipy and numpy.
-benchstats>=3.4
diff --git a/benchmarks/asv/run_benchmarks.sh b/benchmarks/asv/run_benchmarks.sh
deleted file mode 100755
index 07d1046df..000000000
--- a/benchmarks/asv/run_benchmarks.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env bash
-# Helper script for common benchmark tasks.
-set -euo pipefail
-
-cd "$(git rev-parse --show-toplevel)"
-
-BENCH_DIR="benchmarks/asv"
-ASV_CONF="$(pwd)/$BENCH_DIR/asv.conf.json"
-
-usage() {
-    cat <<EOF
-Usage: bash benchmarks/asv/run_benchmarks.sh <command> [options]
-
-Commands:
-  run [-w W] [-n N] [SUITE] [METHOD]
-                        Run benchmarks in-process (saves ASV-compatible results)
-  view                  Build the ASV HTML dashboard from saved results and serve it
-  list                  List available benchmark suites
-  compare BASE CAND [OPTS]
-                        Statistically compare two ASV result JSONs (benchstats);
-                        exits 1 on a significant timing regression (CI gating)
-
-EOF
-}
-
-case "${1:-}" in
-    run)
-        shift
-        if [[ $# -eq 0 ]]; then
-            python "$BENCH_DIR/driver.py" --all
-        else
-            python "$BENCH_DIR/driver.py" "$@"
-        fi
-        ;;
-    view)
-        asv publish --config "$ASV_CONF"
-        echo "Starting preview server at http://localhost:8080"
-        asv preview --config "$ASV_CONF"
-        ;;
-    list)
-        echo "Available benchmark suites:"
-        ls "$BENCH_DIR"/bench_*.py 2>/dev/null | sed 's|.*/bench_|  bench_|;s|\.py$||'
-        ;;
-    compare)
-        shift
-        python "$BENCH_DIR/compare_results.py" "$@"
-        ;;
-    *)
-        usage
-        exit 1
-        ;;
-esac
diff --git a/benchmarks/microbenchmarks/asv/README.md b/benchmarks/microbenchmarks/asv/README.md
new file mode 100644
index 000000000..de82d85ae
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/README.md
@@ -0,0 +1,152 @@
+# TransformerEngine Microbenchmarks
+
+GPU microbenchmarks for TE ops (GEMM, FP8 GEMM, grouped GEMM, attention,
+casting, normalization), run in-process by `driver.py`. Each suite is a
+`bench_*.py` file with a `Bench*` class; the driver times every `time_*` method,
+prints a table with throughput, and saves raw per-call samples to JSON for
+statistical comparison.
+
+## Prerequisites
+
+- TransformerEngine built and installed in the current Python environment.
+- A ROCm or CUDA GPU.
+
+## Running
+
+```bash
+cd benchmarks/microbenchmarks/asv
+python driver.py --all                    # run every suite
+python driver.py bench_gemm               # run one suite via the driver
+python bench_gemm.py                      # run one suite directly
+python bench_gemm.py time_forward         # filter to methods containing a string
+python bench_gemm.py -w 5 -n 20           # custom warmup / timed iterations
+python bench_casting.py --no-save         # don't write a result file
+python bench_casting.py --cold-cache      # flush GPU cache before each sample
+python bench_gemm.py --inner 50           # fix the inner-loop count to 50
+```
+
+Results are written to `benchmarks/microbenchmarks/asv/results/<commit-hash>.json`
+(gitignored), one raw-sample record per benchmark + parameter combination.
+
+## Timing model: inner loop and cache state
+
+Each `time_*` method runs its kernel `_inner` times inside one CUDA-event window
+and divides by `_inner`, amortizing kernel-launch and CUDA-event jitter
+(`~0.5 µs` on AMD). By default the driver auto-tunes `_inner` per (combo, method)
+so each window lasts at least `--target-window-ms` (default `1.0 ms`).
+
+| Flag | Effect |
+|---|---|
+| `--inner auto` (default) | Probe one invocation, then pick `_inner` so the next window lasts ≥ `--target-window-ms` (capped at 10000). |
+| `--inner N` | Force a fixed `_inner = N`. |
+| `--target-window-ms T` | Target window duration for `--inner auto` (default `1.0`). |
+| `--cold-cache` | Write a `--cache-flush-mb` scratch buffer before each sample to evict L2 + Infinity Cache. Implies `--inner=1` (otherwise later inner iterations refill the cache). |
+| `--cache-flush-mb M` | Scratch buffer size for `--cold-cache` (default `256`, sized for the MI300 Infinity Cache). |
+
+- **Warm cache, large `_inner`** (default): steady-state throughput, lowest variance.
+- **Cold cache, `_inner=1`**: isolated cold-memory cost — higher variance; bandwidth-bound benches (cast, norm) run ~1.5–3× slower than warm.
+
+## Sample scheduling: interleaving
+
+By default the driver does **not** collect a benchmark's samples in one
+contiguous block. It samples in round-robin chunks: it sets up a group of
+`(method, combo)` benchmarks, then takes one sample from each per round, for `-n`
+rounds. Sequential scheduling (all of A, then all of B) makes wall-clock time a
+proxy for benchmark identity, so any time-correlated GPU noise (thermal ramp,
+DVFS throttle, a neighbor on a shared GPU) becomes a systematic **bias** between
+benchmarks rather than noise. Round-robin spreads every benchmark across the same
+window, so a transient lands on one sample of each. The per-round visit order is
+also randomly permuted (seeded, so runs are reproducible) to remove residual
+within-round phase/predecessor bias.
+
+| Flag | Effect |
+|---|---|
+| `--interleave-group N` (default `8`) | Number of benchmarks sampled round-robin together. Each keeps a live GPU instance, so **lower this if a group runs out of memory**. |
+| `--sequential` | Collect each benchmark's samples contiguously (≡ `--interleave-group 1`). Lowest memory, biased under thermal drift. |
+| `--seed S` (default `0`) | Seed for the per-round shuffle. |
+| `--no-shuffle` | Fixed round-robin order instead of permuting each round (debugging). |
+
+Interleaving removes *within-run* time-position bias. It does **not** remove a
+whole-run thermal offset between two separately produced result files, so for the
+comparison below, produce the baseline and candidate files back-to-back under
+similar conditions.
+
+## Comparing two checkouts statistically
+
+The driver records raw per-call samples; `compare_results.py` compares two result
+files with a Brunner-Munzel test via
+[benchstats](https://github.com/Arech/benchstats):
+
+```bash
+cd benchmarks/microbenchmarks/asv
+pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
+
+python driver.py --all -n 20      # on the baseline checkout -> results/<base>.json
+python driver.py --all -n 20      # on the candidate checkout -> results/<cand>.json
+python compare_results.py results/<base>.json results/<cand>.json
+```
+
+For each `(benchmark, parameter combination)` it marks the candidate faster (`>`),
+slower (`<`), or not significant (`~`), and exits `1` on a significant difference (CI gating).
+
+Two runs on the **same** commit (e.g. a dirty working tree, where `HEAD` is
+unchanged) would overwrite each other; pass `--label` to keep them distinct:
+
+```bash
+python driver.py --all -n 20 --label base   # -> results/<hash>-base.json
+python driver.py --all -n 20 --label cand   # -> results/<hash>-cand.json
+python compare_results.py results/<hash>-base.json results/<hash>-cand.json
+```
+
+| Flag | Effect |
+|---|---|
+| `--alpha A` | Significance level (default `0.001`). |
+| `--method M` | Statistical test (default `brunnermunzel`). |
+| `--filter REGEX` | Only compare benchmarks whose name matches `REGEX`. |
+| `--always-show-pvalues` | Show p-values for non-significant rows too. |
+| `--export-to FILE` | Save the report to `.txt`/`.svg`/`.html`. |
+
+The rank test needs a reasonable sample count (≥ ~10); the default `-n 20`
+satisfies this. Only timing is tested — throughput is a constant-work transform
+of time, so a rank test on it is identical.
+
+## Writing a new benchmark
+
+Add `bench_<name>.py` with a `Bench*` class subclassing `BenchBase`. Pull model
+shapes from `models.py` so configs stay in one place.
+
+```python
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES
+
+class BenchSomething(BenchBase):
+    params = [M_SIZES, ["config_a", "config_b"]]
+    param_names = ["M", "config"]
+
+    def setup(self, M, config):
+        # Allocate tensors / modules. Runs once per (combo, method); the same
+        # instance is reused for warmup and timed iterations.
+        self.module = ...
+        self.x = ...
+
+    def time_forward(self, M, config):
+        # self._time runs the callable _inner times in one CUDA-event window
+        # and returns seconds per single invocation (handles --cold-cache).
+        return self._time(lambda: self.module(self.x))
+
+    # Optional: work_<name> returns per-call work for throughput columns.
+    def work_forward(self, M, config):
+        return {"flops": 2 * M * self.N * self.K}   # or {"bytes": ...}
+
+if __name__ == "__main__":
+    run_as_main(__file__)
+```
+
+Rules:
+- `time_*` methods are timed automatically; time through `self._time(fn)`.
+- `work_<name>` companions return **per-call** work and yield TFLOPS (`flops`) or GB/s (`bytes`) columns.
+- Clear `.grad` attributes in backward benchmarks to prevent accumulation.
+- `params` is a cross-product — keep the matrix size reasonable.
diff --git a/benchmarks/microbenchmarks/asv/bench_attention.py b/benchmarks/microbenchmarks/asv/bench_attention.py
new file mode 100644
index 000000000..395bb07cb
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_attention.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Attention benchmarks via te.DotProductAttention (causal, GQA).
+
+Forward FLOPs  = 4 * batch * num_q_heads * seq_len^2 * head_dim
+  (Q@K^T and attn@V, each 2*b*h*s^2*d).
+Backward FLOPs ~= 2 * Forward FLOPs.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, attention_configs
+
+BATCH = 2
+MODELS = attention_configs()  # name -> (num_q_heads, num_kv_heads, head_dim, tp)
+
+
+class BenchAttention(BenchBase):
+    params = [M_SIZES, list(MODELS)]  # M_SIZES used as seq_len
+    param_names = ["seq_len", "model"]
+
+    def setup(self, seq_len, model):
+        n_q, n_kv, hd, tp = MODELS[model]
+        qh, kvh = n_q // tp, n_kv // tp
+        dtype = torch.bfloat16
+        self.attn = te.DotProductAttention(
+            num_attention_heads=qh, kv_channels=hd,
+            num_gqa_groups=kvh, attn_mask_type="causal",
+        ).to(device="cuda", dtype=dtype)
+        self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v))
+
+    def work_forward(self, seq_len, model):
+        n_q, _, hd, tp = MODELS[model]
+        return {"flops": 4 * BATCH * (n_q // tp) * seq_len * seq_len * hd}
+
+    def work_forward_backward(self, seq_len, model):
+        n_q, _, hd, tp = MODELS[model]
+        return {"flops": 3 * 4 * BATCH * (n_q // tp) * seq_len * seq_len * hd}
+
+    def time_forward(self, seq_len, model):
+        return self._time(lambda: self.attn(self.q, self.k, self.v))
+
+    def time_forward_backward(self, seq_len, model):
+        t = self._time(lambda: self.attn(self.q, self.k, self.v).backward(self.grad_out))
+        self.q.grad = self.k.grad = self.v.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_casting.py b/benchmarks/microbenchmarks/asv/bench_casting.py
new file mode 100644
index 000000000..9f4399b03
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_casting.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) benchmarks.
+
+Covers E4M3 (activations/weights) and E5M2 (gradients). These casts are
+memory-bound, so we report GB/s (input + output bytes).
+"""
+
+import torch
+from transformer_engine.pytorch import Float8CurrentScalingQuantizer
+from transformer_engine_torch import DType as TE_DType
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, hidden_sizes
+
+HIDDEN = hidden_sizes()
+
+# cast name -> (direction, fp8 dtype)
+CAST_CONFIGS = {
+    "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3),
+    "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3),
+    "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2),
+    "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2),
+}
+
+
+class BenchCasting(BenchBase):
+    params = [M_SIZES, list(HIDDEN), list(CAST_CONFIGS)]
+    param_names = ["M", "model", "cast"]
+
+    def setup(self, M, model, cast):
+        hidden = HIDDEN[model]
+        direction, fp8_dtype = CAST_CONFIGS[cast]
+        quantizer = Float8CurrentScalingQuantizer(
+            fp8_dtype=fp8_dtype, device=torch.device("cuda"),
+            rowwise=True, columnwise=False,
+        )
+        if direction == "dequantize":
+            x = quantizer.quantize(torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda"))
+            self._call = lambda: x.dequantize(dtype=torch.bfloat16)
+        else:
+            x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self._call = lambda: quantizer.quantize(x)
+
+    def work_cast(self, M, model, cast):
+        # quantize: read BF16 (2B) + write FP8 (1B) + scale; dequantize: the
+        # reverse -- 3 bytes/element either way.
+        return {"bytes": M * HIDDEN[model] * 3}
+
+    def time_cast(self, M, model, cast):
+        return self._time(self._call)
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_gemm.py b/benchmarks/microbenchmarks/asv/bench_gemm.py
new file mode 100644
index 000000000..24319cf80
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_gemm.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""BF16 GEMM benchmarks via te.Linear.
+
+Shapes are the four transformer projections (QKV, AttnOut, GateUp, Down)
+derived from the models in models.py.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, gemm_shapes
+
+SHAPES = gemm_shapes()
+
+
+class BenchGemm(BenchBase):
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+
+    def setup(self, M, shape):
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.linear(self.x))
+
+    def work_forward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def time_forward(self, M, shape):
+        return self._time(lambda: self.linear(self.x))
+
+    def time_forward_backward(self, M, shape):
+        t = self._time(lambda: self.linear(self.x).backward(self.grad_out))
+        self.x.grad = None
+        self.linear.weight.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_gemm_fp8.py b/benchmarks/microbenchmarks/asv/bench_gemm_fp8.py
new file mode 100644
index 000000000..a6f761afa
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_gemm_fp8.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""FP8 GEMM benchmarks via te.Linear under fp8_autocast.
+
+Same shapes as bench_gemm.py but with FP8 (HYBRID) quantized compute.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+from transformer_engine.common.recipe import DelayedScaling, Format
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, gemm_shapes
+
+SHAPES = gemm_shapes()
+FP8_RECIPE = DelayedScaling(
+    fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max",
+)
+
+
+class BenchGemmFP8(BenchBase):
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+
+    def setup(self, M, shape):
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda")
+
+    def work_forward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def _forward(self):
+        with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+            return self.linear(self.x)
+
+    def time_forward(self, M, shape):
+        return self._time(self._forward)
+
+    def time_forward_backward(self, M, shape):
+        t = self._time(lambda: self._forward().backward(self.grad_out))
+        self.x.grad = None
+        self.linear.weight.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_grouped_gemm.py b/benchmarks/microbenchmarks/asv/bench_grouped_gemm.py
new file mode 100644
index 000000000..58b1d27fb
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_grouped_gemm.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Grouped GEMM benchmarks via te.GroupedLinear (MoE GateUp / Down)."""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES_MOE, grouped_gemm_configs
+
+CONFIGS = grouped_gemm_configs()  # name -> (num_gemms, N, K)
+
+
+class BenchGroupedGemm(BenchBase):
+    params = [M_SIZES_MOE, list(CONFIGS)]
+    param_names = ["M", "config"]
+
+    def setup(self, M, config):
+        B, N, K = CONFIGS[config]
+        dtype = torch.bfloat16
+        self.module = te.GroupedLinear(
+            num_gemms=B, in_features=K, out_features=N, bias=False,
+        ).to(device="cuda", dtype=dtype)
+        self.xs = [
+            torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+            for _ in range(B)
+        ]
+        self.grad_outs = [torch.randn_like(o) for o in self.module(self.xs)]
+
+    def work_forward(self, M, config):
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 2 * M * N * K}
+
+    def work_forward_backward(self, M, config):
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 3 * 2 * M * N * K}
+
+    def time_forward(self, M, config):
+        return self._time(lambda: self.module(self.xs))
+
+    def time_forward_backward(self, M, config):
+        t = self._time(lambda: torch.autograd.backward(self.module(self.xs), self.grad_outs))
+        for x in self.xs:
+            x.grad = None
+        for p in self.module.parameters():
+            p.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_normalization.py b/benchmarks/microbenchmarks/asv/bench_normalization.py
new file mode 100644
index 000000000..3412e4170
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_normalization.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""RMSNorm and LayerNorm benchmarks on activation-sized tensors.
+
+Memory-bound; we report GB/s. The hidden dimension is swept over the distinct
+model hidden sizes and M (batch * seq_len) over typical training sizes.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, unique_hidden_sizes
+
+NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm}
+
+
+class BenchNormalization(BenchBase):
+    params = [M_SIZES, unique_hidden_sizes(), list(NORMS)]
+    param_names = ["M", "hidden", "norm_type"]
+
+    def setup(self, M, hidden, norm_type):
+        dtype = torch.bfloat16
+        self.norm = NORMS[norm_type](hidden).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, hidden, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.norm(self.x))
+
+    def work_forward(self, M, hidden, norm_type):
+        # read input (2B) + write output (2B)
+        return {"bytes": M * hidden * 4}
+
+    def work_forward_backward(self, M, hidden, norm_type):
+        # fwd read+write (4B) + bwd read input+grad_out, write grad_in (6B)
+        return {"bytes": M * hidden * 10}
+
+    def time_forward(self, M, hidden, norm_type):
+        return self._time(lambda: self.norm(self.x))
+
+    def time_forward_backward(self, M, hidden, norm_type):
+        t = self._time(lambda: self.norm(self.x).backward(self.grad_out))
+        self.x.grad = None
+        for p in self.norm.parameters():
+            p.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/compare_results.py b/benchmarks/microbenchmarks/asv/compare_results.py
new file mode 100755
index 000000000..5558e278a
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/compare_results.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Statistically compare two result JSON files written by ``driver.py``.
+
+A point-estimate (median) cannot tell a real regression from measurement noise.
+This tool compares the raw per-call samples stored in two result files (one per
+checkout) with a statistical test (Brunner-Munzel by default) via the benchstats
+package. It marks each (benchmark, parameter combination) as faster (``>``),
+slower (``<``), or not significantly different (``~``), prints a per-direction
+summary, and exits ``1`` when a significant timing difference is found so it can
+gate CI. Requires ``pip install -r requirements.txt``.
+
+Usage:
+    # run the suite on each checkout (each saves <hash>.json), then:
+    python compare_results.py results/<base>.json results/<cand>.json
+    python compare_results.py base.json cand.json --alpha 0.01
+    python compare_results.py base.json cand.json --export-to report.svg
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+
+import numpy as np
+
+_TIME_KEY = "time_s"  # metric exposed to benchstats (seconds, lower is better)
+
+
+def _load_samples(path, name_filter=None):
+    """Load a driver result JSON into ``{bench_name: {"time_s": ndarray}}``.
+
+    One benchstats "benchmark" per (benchmark, parameter combination); the name
+    is ``<suite>.<Class>.<method> | name=val, ...``. Only timing is exposed:
+    throughput is a constant-work transform of time, so a rank test on it is
+    identical.
+    """
+    with open(path) as f:
+        data = json.load(f)
+    pattern = re.compile(name_filter) if name_filter else None
+
+    stats = {}
+    for bench_key, rec in data.get("results", {}).items():
+        param_names = rec.get("param_names") or []
+        for combo, samples in zip(rec.get("combos") or [], rec.get("samples") or []):
+            if not samples:
+                continue
+            arr = np.asarray(samples, dtype=np.float64)
+            arr = arr[np.isfinite(arr)]
+            if arr.size == 0:
+                continue
+            if param_names and len(param_names) == len(combo):
+                label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo))
+            else:
+                label = ", ".join(str(v) for v in combo)
+            name = bench_key + (" | " + label if label else "")
+            if pattern is not None and pattern.search(name) is None:
+                continue
+            stats[name] = {_TIME_KEY: arr}
+    return stats
+
+
+def run_stats(args):
+    """Compare two result JSONs; return 1 if a significant difference is found."""
+    import rich.table  # noqa: F401  benchstats render uses rich.table without importing it
+    from benchstats.compare import compareStats
+    from benchstats.render import renderComparisonResults
+    from benchstats.common import LoggingConsole, detectExportFormat
+
+    main_metrics = [_TIME_KEY]
+    export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None
+    if export_fmt is not None and os.path.isfile(args.export_to):
+        os.remove(args.export_to)
+
+    console = LoggingConsole(
+        record=export_fmt is not None, log_level=LoggingConsole.LogLevel.Warning,
+    )
+
+    s1 = _load_samples(args.baseline_json, args.filter)
+    s2 = _load_samples(args.candidate_json, args.filter)
+
+    cr = compareStats(
+        s1, s2, method=args.method, alpha=args.alpha,
+        main_metrics=main_metrics, debug_log=console,
+    )
+    renderComparisonResults(
+        cr, console, main_metrics=main_metrics,
+        always_show_pvalues=args.always_show_pvalues,
+    )
+
+    # benchstats encodes each comparison as baseline-vs-candidate: "<" means
+    # baseline < candidate (candidate slower -> regression), ">" means candidate
+    # faster, "~" means not significant at alpha.
+    for metric in main_metrics:
+        counts = {"<": 0, ">": 0, "~": 0}
+        for bm_res in cr.results.values():
+            res = bm_res.get(metric)
+            if res is not None:
+                counts[res.result] = counts.get(res.result, 0) + 1
+        total = sum(counts.values())
+        console.print(
+            f"\nSummary for '{metric}' ({cr.method}, alpha={cr.alpha:g}, "
+            f"{total} benchmarks):"
+        )
+        console.print(f"  candidate faster (significant, '>'): {counts['>']}")
+        console.print(f"  candidate slower (significant, '<'): {counts['<']}")
+        console.print(f"  no significant difference ('~'):     {counts['~']}")
+
+    if export_fmt is not None:
+        {"txt": lambda: console.save_text(args.export_to),
+         "svg": lambda: console.save_svg(args.export_to, title=""),
+         "html": lambda: console.save_html(args.export_to)}[export_fmt]()
+
+    if cr.at_least_one_differs:
+        console.warning("At least one significant timing difference was detected (exit 1).")
+        return 1
+    return 0
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Statistically compare two driver result JSONs via benchstats.")
+    parser.add_argument("baseline_json", help="Baseline result JSON")
+    parser.add_argument("candidate_json", help="Candidate result JSON")
+    parser.add_argument("--filter", default=None,
+                        help="Only compare benchmarks whose name matches this regex.")
+    parser.add_argument("--alpha", type=float, default=0.001,
+                        help="Significance level for the test (default: 0.001).")
+    parser.add_argument("--method", default="brunnermunzel",
+                        help="Statistical test to use (default: brunnermunzel).")
+    parser.add_argument("--always-show-pvalues", action="store_true",
+                        help="Show p-values for non-significant rows too.")
+    parser.add_argument("--export-to", default=None, metavar="FILE",
+                        help="Export the report to a .txt/.svg/.html file (format from extension).")
+    return run_stats(parser.parse_args())
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/microbenchmarks/asv/driver.py b/benchmarks/microbenchmarks/asv/driver.py
new file mode 100644
index 000000000..aa9f2f5b4
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/driver.py
@@ -0,0 +1,469 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""In-process microbenchmark driver.
+
+Discovers ``Bench*`` classes in ``bench_*.py`` files, runs their ``time_*``
+methods with robust GPU timing (inner-loop amortization, optional cold cache,
+round-robin interleaving), prints a table with throughput, and saves the raw
+per-call samples to JSON for ``compare_results.py``.
+
+Usage:
+    python driver.py <suite> [method_filter] [-w W] [-n N] [--no-save]
+    python driver.py --all [-w W] [-n N]
+    python bench_gemm.py [method_filter] [-w W] [-n N]      # bench file as main
+"""
+
+import argparse
+import glob
+import importlib
+import itertools
+import json
+import os
+import random
+import re
+import subprocess
+import sys
+import time
+
+import numpy as np
+
+
+# ---------------------------------------------------------------------------
+# Benchmark base class
+# ---------------------------------------------------------------------------
+
+class BenchBase:
+    """Common parent of benchmark classes; owns the CUDA-event timing helper.
+
+    Two attributes are assigned by the driver per (combo, method): ``_inner``,
+    the number of kernel invocations inside one CUDA-event window (amortizes
+    launch + event overhead), and ``_scratch``, a buffer overwritten ahead of
+    each sample in ``--cold-cache`` mode. Subclasses call :meth:`_time`.
+    """
+
+    _scratch = None
+    _inner = 1
+
+    def _time(self, fn):
+        """Time ``_inner`` back-to-back calls of *fn*; return seconds per call.
+
+        If ``_scratch`` is set it is filled before the start event is recorded
+        (the ``--cold-cache`` flush). The event-to-event time is divided by
+        ``_inner``, so callers always receive a per-call figure.
+        """
+        import torch  # deferred: driver stays importable without torch
+        if getattr(self, "_evt", None) is None:
+            self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+        start, stop = self._evt
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)
+        start.record()
+        for _ in range(self._inner):
+            fn()
+        stop.record()
+        torch.cuda.synchronize()
+        return start.elapsed_time(stop) / 1000 / self._inner
+
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+def _get_commit_hash():
+    """Return the git HEAD hash for the current directory, or ``'unknown'``."""
+    cmd = ["git", "rev-parse", "HEAD"]
+    try:
+        raw = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
+        return raw.decode().strip()
+    except Exception:
+        return "unknown"
+
+
+def _results_dir():
+    return os.path.join(os.path.split(os.path.abspath(__file__))[0], "results")
+
+
+def save_results(all_results, label=None, results_dir=None):
+    """Merge raw per-call samples into ``<results_dir>/<hash>[-<label>].json``.
+
+    *label*, when given, is sanitized and folded into the filename so several
+    runs on one commit (e.g. a dirty tree, where HEAD is unchanged) stay apart.
+    An existing file is updated; an unusable one is replaced, with a warning.
+    """
+    commit = _get_commit_hash()
+    results_dir = results_dir or _results_dir()
+    os.makedirs(results_dir, exist_ok=True)
+    suffix = ""
+    if label:
+        suffix = "-" + re.sub(r"[^A-Za-z0-9._-]+", "_", label).strip("_")
+    path = os.path.join(results_dir, f"{commit[:8]}{suffix}.json")
+    try:
+        with open(path) as f:
+            data = json.load(f)
+        data["results"].update(all_results)
+    except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
+        if not isinstance(e, FileNotFoundError):  # corrupt / foreign file
+            print(f"  WARNING: replacing unusable {path}: {e}")
+        data = {"commit_hash": commit, "date": int(time.time() * 1000),
+                "results": dict(all_results)}
+    with open(path, "w") as f:
+        json.dump(data, f, indent=2)
+    print(f"\nResults saved to {path}")
+
+
+def _compute_stats(samples):
+    """Summarize *samples* as ``(median, mean, stdev, q25, q75)`` floats.
+
+    The quantiles come from ``np.quantile`` (linear interpolation by default)
+    and the standard deviation is the population one (``ddof=0``).
+    """
+    arr = np.asarray(samples, dtype=np.float64)
+    q25, median, q75 = np.quantile(arr, [0.25, 0.5, 0.75]).tolist()
+    return median, float(arr.mean()), float(arr.std(ddof=0)), q25, q75
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+def _make_scratch(mb):
+    """Allocate the float32 CUDA buffer whose fill evicts the GPU cache per sample.
+
+    *mb* is the size in MiB, with a floor of one element. Callers default it to
+    256: equal to the MI300 Infinity Cache size and far above the 16 MB L2.
+    """
+    import torch  # deferred: only needed when cold-cache is on
+    numel = max(1, mb * 2**20 // 4)  # 4 bytes per float32 element
+    return torch.empty(numel, dtype=torch.float32, device="cuda")
+
+
+def _autotune_inner(instance, method_name, combo, target_s, max_inner=10000):
+    """Choose how many calls fit one timed window of at least *target_s* seconds.
+
+    The method is invoked twice with ``_inner`` forced to 1: the first call is
+    thrown away (warm-up), the second supplies the per-call estimate.
+    """
+    bench = getattr(instance, method_name)
+    previous = instance._inner
+    instance._inner = 1
+    try:
+        bench(*combo)                   # warm-up call, result ignored
+        per_call = bench(*combo)        # seconds for one invocation
+    finally:
+        instance._inner = previous
+    if per_call is None or per_call <= 0:
+        return 1
+    return max(1, min(max_inner, int(target_s / per_call) + 1))
+
+
+def _free_gpu_cache():
+    """Best-effort release of cached GPU memory; does nothing if torch is not loaded."""
+    try:
+        # Look torch up instead of importing it so this stays a no-op without it.
+        loaded_torch = sys.modules["torch"]
+        loaded_torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+
+def run_class(
+    suite_name, cls, class_name, method_filter=None,
+    warmup=3, iters=7,
+    inner="auto", target_window_ms=1.0,
+    cold_cache=False, cache_flush_mb=256,
+    interleave_group=8, rng=None, shuffle=True,
+):
+    """Run all ``time_*`` methods in *cls*, returning a ``{bench_key: record}`` dict.
+
+    Samples are collected in round-robin chunks of ``interleave_group``
+    ``(method, combo)`` benchmarks: one sample from each per round, for *iters*
+    rounds. This spreads every benchmark's samples across the same wall-clock
+    window so time-correlated GPU noise (thermal ramp, DVFS throttle) becomes
+    shared variance rather than a bias on whichever benchmark owned a contiguous
+    block of time. ``interleave_group=1`` reproduces sequential behavior; larger
+    groups interleave more but keep that many GPU instances live at once.
+
+    When *shuffle* is true the per-round visit order is randomly permuted (seeded
+    by *rng*), making each benchmark's within-round phase and predecessor uniform
+    in expectation, turning residual ordering bias into variance. The per-round
+    structure is kept (each benchmark still gets exactly *iters* evenly-spread
+    samples) -- a balanced randomized design, not a global shuffle.
+    """
+    methods = sorted(m for m in dir(cls) if m.startswith("time_"))
+    if method_filter:
+        methods = [m for m in methods if method_filter in m]
+    if not methods:
+        return {}
+
+    params = getattr(cls, "params", [])  # no params -> product() yields one empty combo
+    param_names = list(getattr(cls, "param_names", []))
+    combos = list(itertools.product(*params))
+    n_combos = len(combos)
+
+    # Discover throughput columns from work_* companions.
+    # Each entry: (dict_key, column_header, unit_divisor).
+    probe_keys = set()
+    for m in methods:
+        wfn = getattr(cls, "work_" + m[5:], None)
+        if wfn:
+            try:
+                probe_keys.update(wfn(cls(), *combos[0]))
+            except Exception:
+                pass
+    throughput_cols = []
+    if "flops" in probe_keys:
+        throughput_cols.append(("flops", "TFLOPS", 1e12))
+    if "bytes" in probe_keys:
+        throughput_cols.append(("bytes", "GB/s", 1e9))
+
+    target_window_s = target_window_ms / 1000.0
+    group = max(1, int(interleave_group))
+    if rng is None:
+        rng = random.Random(0)
+    inner_desc = (
+        "cold-cache (inner=1)" if cold_cache
+        else f"inner={inner}" if inner != "auto"
+        else f"inner=auto (>={target_window_ms:g}ms window)"
+    )
+    sched_desc = ("sequential" if group == 1
+                  else f"interleaved group={group}, " + ("shuffled" if shuffle else "fixed-order"))
+    print(f"\n{class_name}  ({len(combos)} combos x {len(methods)} methods, "
+          f"{warmup} warmup, {iters} timed, {inner_desc}, {sched_desc})")
+    extra_hdr = "".join(f"  {label:>10}" for _, label, _ in throughput_cols)
+    HDR = (f"  {'median':>10}  {'mean':>10}  {'stdev':>10}"
+           f"  {'q25':>10}  {'q75':>10}  {'min':>10}  {'max':>10}"
+           + extra_hdr + f"  {'inner':>5}  {'method':<30}  params")
+    print("-" * len(HDR))
+    print(HDR)
+    print("-" * len(HDR))
+
+    def _label(combo):
+        return ", ".join(f"{nm}={v}" for nm, v in zip(param_names, combo))
+
+    # Samples per method, indexed by combo position. Filling by index decouples
+    # the wire format from the order samples are actually collected in, so
+    # interleaved scheduling leaves the saved JSON identical to sequential.
+    samples_by_method = {m: [None] * n_combos for m in methods}
+
+    # Flatten to (method, combo) tasks, method-major so printed rows keep their
+    # grouping, then sample them in round-robin chunks.
+    tasks = [(mi, ci) for mi in range(len(methods)) for ci in range(n_combos)]
+    scratch = _make_scratch(cache_flush_mb) if cold_cache else None  # shared flush buffer
+
+    for chunk_start in range(0, len(tasks), group):
+        chunk = tasks[chunk_start:chunk_start + group]
+
+        # Setup phase: prepare every benchmark in the chunk (allocate tensors,
+        # pick _inner, warm up) and keep its instance live for round-robin timing.
+        live = []  # (instance, method_name, combo, combo_idx)
+        for mi, ci in chunk:
+            method_name = methods[mi]
+            combo = combos[ci]
+            instance = cls()
+            try:
+                getattr(instance, "setup", lambda *a: None)(*combo)  # setup is optional
+            except Exception as e:
+                print(f"  SKIP  {_label(combo)}  setup failed: {e}")
+                continue  # leaves None in this (method, combo) slot
+
+            # Cold-cache mode forces inner=1 (later calls in a window would see
+            # a warm cache); the flush buffer is shared, not one per instance.
+            if cold_cache:
+                instance._scratch, instance._inner = scratch, 1
+            elif inner == "auto":
+                instance._inner = _autotune_inner(
+                    instance, method_name, combo, target_window_s)
+            else:
+                instance._inner = max(1, int(inner))
+
+            method = getattr(instance, method_name)
+            for _ in range(warmup):
+                method(*combo)
+            live.append((instance, method_name, combo, ci))
+
+        # Timed phase: one sample from each live benchmark per round, so a
+        # transient spike lands on one sample of each rather than corrupting a
+        # whole benchmark's contiguous block. Visit order is re-permuted each
+        # round (when shuffle is on); chunk_samples stays keyed by index i.
+        chunk_samples = [[] for _ in live]
+        order = list(range(len(live)))
+        for _ in range(iters):
+            if shuffle and len(order) > 1:
+                rng.shuffle(order)
+            for i in order:
+                instance, method_name, combo, ci = live[i]
+                method = getattr(instance, method_name)
+                t0 = time.perf_counter()
+                result = method(*combo)
+                wall = time.perf_counter() - t0
+                chunk_samples[i].append(wall if result is None else result)
+
+        # Finalize: stats, throughput, print, store into the combo slot.
+        for i, (instance, method_name, combo, ci) in enumerate(live):
+            samples = chunk_samples[i]
+            median, mean, stdev, q25, q75 = _compute_stats(samples)
+            s_min, s_max = min(samples), max(samples)
+
+            # Raw samples (seconds) for statistical comparison; rounded to 1 ns
+            # to keep the JSON compact without losing timing resolution.
+            samples_by_method[method_name][ci] = [round(x, 9) for x in samples]
+
+            work = {}
+            wfn = getattr(instance, "work_" + method_name[5:], None)
+            if wfn and median > 0:
+                try:
+                    work = wfn(*combo)
+                except Exception:
+                    pass
+            extra_cols = ""
+            for key, _, divisor in throughput_cols:
+                if key in work and median > 0:
+                    extra_cols += f"  {work[key] / median / divisor:>10.1f}"
+                else:
+                    extra_cols += f"  {'':>10}"
+
+            print(f"  {median*1000:>8.3f}ms  {mean*1000:>8.3f}ms  "
+                  f"{stdev*1000:>8.3f}ms  {q25*1000:>8.3f}ms  {q75*1000:>8.3f}ms  "
+                  f"{s_min*1000:>8.3f}ms  {s_max*1000:>8.3f}ms"
+                  f"{extra_cols}  "
+                  f"{instance._inner:>5}  {method_name:<30}  {_label(combo)}")
+
+        live.clear()
+        _free_gpu_cache()
+
+    combos_json = [list(c) for c in combos]
+    return {
+        f"{suite_name}.{class_name}.{m}": {
+            "param_names": param_names,
+            "combos": combos_json,
+            "samples": samples_by_method[m],
+        }
+        for m in methods
+    }
+
+
+def run_as_main(caller_file=None):
+    """Run benchmarks from a bench file's ``__main__`` block or the command line.
+
+    From a bench file::
+
+        if __name__ == "__main__":
+            from driver import run_as_main
+            run_as_main(__file__)
+    """
+    parser = argparse.ArgumentParser(
+        description="Run microbenchmarks in-process (no subprocess overhead).")
+    if caller_file is None:
+        parser.add_argument("suite", nargs="?", default=None,
+                            help="Benchmark module name (e.g. bench_casting)")
+        parser.add_argument("--all", action="store_true",
+                            help="Run all bench_*.py suites in the directory")
+    parser.add_argument("method_filter", nargs="?", default=None,
+                        help="Only run time_* methods containing this string")
+    parser.add_argument("-w", "--warmup", type=int, default=10,
+                        help="Number of warmup iterations (default: 10)")
+    parser.add_argument("-n", "--iters", type=int, default=20,
+                        help="Number of timed iterations (default: 20)")
+    parser.add_argument("--inner", default="auto",
+                        help="Inner kernel invocations per timed window: 'auto' "
+                             "(tune to --target-window-ms) or an integer "
+                             "(default: auto). Larger values amortize CUDA event "
+                             "and kernel-launch overhead.")
+    parser.add_argument("--target-window-ms", type=float, default=1.0,
+                        help="Target duration of one timed window when "
+                             "--inner=auto (default: 1.0 ms).")
+    parser.add_argument("--cold-cache", action="store_true",
+                        help="Flush the GPU cache (write a >LLC scratch buffer) "
+                             "before each sample. Forces --inner=1 because "
+                             "subsequent inner calls would refill the cache.")
+    parser.add_argument("--cache-flush-mb", type=int, default=256,
+                        help="Size in MB of the cache-flush buffer for "
+                             "--cold-cache (default: 256, sized for the MI300 "
+                             "Infinity Cache).")
+    parser.add_argument("--interleave-group", type=int, default=8,
+                        help="Number of (method, combo) benchmarks sampled "
+                             "round-robin together so time-correlated GPU noise "
+                             "is shared across them instead of biasing whichever "
+                             "benchmark owns a contiguous block of time "
+                             "(default: 8). Each keeps a live GPU instance, so "
+                             "lower this on out-of-memory. 1 = sequential.")
+    parser.add_argument("--sequential", action="store_true",
+                        help="Collect each benchmark's samples contiguously "
+                             "(equivalent to --interleave-group 1). Lowest "
+                             "memory, but biased under thermal drift.")
+    parser.add_argument("--seed", type=int, default=0,
+                        help="Seed for the per-round shuffle of the interleave "
+                             "order (default: 0), kept fixed for reproducibility.")
+    parser.add_argument("--no-shuffle", action="store_true",
+                        help="Disable the per-round random permutation and use a "
+                             "fixed round-robin order, leaving a small residual "
+                             "ordering bias.")
+    parser.add_argument("--no-save", action="store_true",
+                        help="Skip saving results to JSON.")
+    parser.add_argument("--label", default=None,
+                        help="Tag folded into the result filename "
+                             "(<hash>-<label>.json). Use it to keep multiple runs "
+                             "on the same commit in distinct files for comparison.")
+    args = parser.parse_args()
+    if args.inner != "auto":
+        try:
+            args.inner = max(1, int(args.inner))
+        except ValueError:
+            parser.error("--inner must be 'auto' or a positive integer")
+    args.interleave_group = 1 if args.sequential else max(1, args.interleave_group)
+
+    if caller_file is not None:
+        script_dir = os.path.dirname(os.path.abspath(caller_file))
+        suite_names = [os.path.splitext(os.path.basename(caller_file))[0]]
+    else:
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        if getattr(args, "all", False):
+            suite_names = sorted(
+                os.path.splitext(os.path.basename(f))[0]
+                for f in glob.glob(os.path.join(script_dir, "bench_*.py"))
+            )
+            if args.suite and args.method_filter is None:
+                args.method_filter = args.suite  # `--all FILTER`: argparse bound it to `suite`
+        elif args.suite:
+            suite_names = [args.suite]
+        else:
+            parser.error("provide a suite name or use --all")
+
+    os.chdir(script_dir)
+    if script_dir not in sys.path:
+        sys.path.insert(0, script_dir)
+
+    # One RNG for the whole run so the interleave order is reproducible given
+    # --seed; shared across classes so the stream is deterministic end-to-end.
+    rng = random.Random(args.seed)
+    shuffle = not args.no_shuffle
+    if args.interleave_group > 1 and shuffle:
+        print(f"Interleave: group={args.interleave_group}, shuffled (seed={args.seed})")
+
+    all_results = {}
+    for suite_name in suite_names:
+        mod = importlib.import_module(suite_name)
+        for name in sorted(dir(mod)):
+            obj = getattr(mod, name)
+            # Any Bench* class that defines a time_* method (excludes BenchBase,
+            # and is robust to the bench-file/driver __main__ double-import).
+            if (isinstance(obj, type) and name.startswith("Bench")
+                    and any(m.startswith("time_") for m in dir(obj))):
+                all_results.update(run_class(
+                    suite_name, obj, name, args.method_filter,
+                    warmup=args.warmup, iters=args.iters,
+                    inner=args.inner, target_window_ms=args.target_window_ms,
+                    cold_cache=args.cold_cache, cache_flush_mb=args.cache_flush_mb,
+                    interleave_group=args.interleave_group, rng=rng, shuffle=shuffle,
+                ))
+
+    if all_results and not args.no_save:
+        save_results(all_results, label=args.label)
+
+
+if __name__ == "__main__":
+    run_as_main()
diff --git a/benchmarks/microbenchmarks/asv/models.py b/benchmarks/microbenchmarks/asv/models.py
new file mode 100644
index 000000000..50a71393a
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/models.py
@@ -0,0 +1,89 @@
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Shared model configurations and shape derivations for the microbenchmarks.
+
+Single source of truth for the model shapes every ``bench_*.py`` sweeps over,
+so a new model is added in one place. Config sources:
+
+  - Llama 3.1 8B   https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+  - Llama 3.1 70B  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+  - Llama 3.1 405B https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+  - Qwen 2.5 7B    https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+  - Qwen 2.5 72B   https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+  - MoE configs    https://github.com/AMD-AGI/Primus-Turbo/blob/main/benchmark/ops/config.py
+"""
+
+# Token-count (batch * seq_len) sweeps shared across suites.
+M_SIZES = [1024, 2048, 4096, 8192]
+M_SIZES_MOE = [512, 1024, 2048, 4096]
+
+# Dense transformer models, keyed by "<family>_TP<tp>".
+# Value = (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp).
+MODELS = {
+    "Llama3-8B_TP1":   (4096,  14336,  32, 8, 128, 1),
+    "Llama3-8B_TP8":   (4096,  14336,  32, 8, 128, 8),
+    "Llama3-70B_TP8":  (8192,  28672,  64, 8, 128, 8),
+    "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8),
+    "Qwen2.5-7B_TP1":  (3584,  18944,  28, 4, 128, 1),
+    "Qwen2.5-72B_TP8": (8192,  29568,  64, 8, 128, 8),
+}
+
+# MoE models for grouped GEMM: (num_routed_experts, moe_intermediate, hidden).
+MOE_MODELS = {
+    "DSV2-Lite": (64, 1408, 2048),
+    "DSV2":      (160, 1536, 5120),
+    "DSV3":      (256, 2048, 7168),
+    "Grok-V2":   (8, 16384, 8192),
+}
+
+
+def attention_configs(models=MODELS):
+    """Map each model name to its ``(num_q_heads, num_kv_heads, head_dim, tp)`` slice."""
+    return dict(zip(models, (cfg[2:6] for cfg in models.values())))
+
+
+def gemm_shapes(models=MODELS):
+    """Build ``{shape_name: (N, K)}`` for each model's four projection GEMMs.
+
+    Entries per model, in order: QKV, AttnOut, GateUp (SwiGLU), Down.
+    """
+    table = {}
+    for model, (hidden, inter, n_q, n_kv, hd, tp) in models.items():
+        table[f"{model}-QKV"] = ((n_q * hd + 2 * n_kv * hd) // tp, hidden)
+        table[f"{model}-AttnOut"] = (hidden, (n_q * hd) // tp)
+        table[f"{model}-GateUp"] = ((2 * inter) // tp, hidden)
+        table[f"{model}-Down"] = (hidden, inter // tp)
+    return table
+
+
+def grouped_gemm_configs(models=MOE_MODELS, eps=(32, 16, 8)):
+    """Build ``{config_name: (num_gemms, N, K)}`` for the MoE GateUp/Down GEMMs.
+
+    A (model, expert-parallel size) pair is listed only if EP divides the experts.
+    """
+    out = {}
+    for model, (n_experts, inter, hidden) in models.items():
+        for ep in eps:
+            per_rank, leftover = divmod(n_experts, ep)
+            if not leftover:
+                tag = f"{model}_EP{ep}"
+                out[f"{tag}-GateUp"] = (per_rank, 2 * inter, hidden)
+                out[f"{tag}-Down"] = (per_rank, hidden, inter)
+    return out
+
+
+def hidden_sizes(models=MODELS):
+    """Map each model family to its hidden size, ignoring the ``_TP<n>`` suffix."""
+    sizes = {}
+    for name, cfg in models.items():
+        # The first entry seen for a family wins; later TP variants are ignored.
+        sizes.setdefault(name.partition("_TP")[0], cfg[0])
+    return sizes
+
+
+def unique_hidden_sizes(models=MODELS):
+    """List every distinct hidden size found in *models*, in ascending order."""
+    return sorted({*hidden_sizes(models).values()})
diff --git a/benchmarks/microbenchmarks/asv/requirements.txt b/benchmarks/microbenchmarks/asv/requirements.txt
new file mode 100644
index 000000000..ac761879b
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/requirements.txt
@@ -0,0 +1,4 @@
+# Extra dependencies for statistical benchmark comparison (compare_results.py).
+# benchstats pulls in rich, scipy and numpy.
+numpy
+benchstats>=3.4

From 00aa27b54ce038137e6c507e9635053a093c0c73 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Thu, 18 Jun 2026 20:01:22 +0000
Subject: [PATCH 3/4] Updated benchstats version

---
 benchmarks/microbenchmarks/asv/compare_results.py | 1 -
 benchmarks/microbenchmarks/asv/requirements.txt   | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/benchmarks/microbenchmarks/asv/compare_results.py b/benchmarks/microbenchmarks/asv/compare_results.py
index 5558e278a..18ea2dd3b 100755
--- a/benchmarks/microbenchmarks/asv/compare_results.py
+++ b/benchmarks/microbenchmarks/asv/compare_results.py
@@ -67,7 +67,6 @@ def _load_samples(path, name_filter=None):
 
 def run_stats(args):
     """Compare two result JSONs; return 1 if a significant difference is found."""
-    import rich.table  # noqa: F401  benchstats render uses rich.table without importing it
     from benchstats.compare import compareStats
     from benchstats.render import renderComparisonResults
     from benchstats.common import LoggingConsole, detectExportFormat
diff --git a/benchmarks/microbenchmarks/asv/requirements.txt b/benchmarks/microbenchmarks/asv/requirements.txt
index ac761879b..fb32ecc15 100644
--- a/benchmarks/microbenchmarks/asv/requirements.txt
+++ b/benchmarks/microbenchmarks/asv/requirements.txt
@@ -1,4 +1,4 @@
 # Extra dependencies for statistical benchmark comparison (compare_results.py).
 # benchstats pulls in rich, scipy and numpy.
 numpy
-benchstats>=3.4
+benchstats>=3.4.1

From cdc60751711042a05dc029a28e759874ed007a81 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Thu, 18 Jun 2026 20:14:52 +0000
Subject: [PATCH 4/4] Added kernel-profiling option

---
 benchmarks/microbenchmarks/asv/README.md |  19 ++++
 benchmarks/microbenchmarks/asv/driver.py | 130 ++++++++++++++++++++++-
 2 files changed, 146 insertions(+), 3 deletions(-)

diff --git a/benchmarks/microbenchmarks/asv/README.md b/benchmarks/microbenchmarks/asv/README.md
index de82d85ae..10644f8f7 100644
--- a/benchmarks/microbenchmarks/asv/README.md
+++ b/benchmarks/microbenchmarks/asv/README.md
@@ -23,6 +23,7 @@ python bench_gemm.py -w 5 -n 20           # custom warmup / timed iterations
 python bench_casting.py --no-save         # don't write a result file
 python bench_casting.py --cold-cache      # flush GPU cache before each sample
 python bench_gemm.py --inner 50           # fix the inner-loop count to 50
+python bench_gemm.py --kernel-profile     # per-kernel CUDA-time breakdown
 ```
 
 Results are written to `benchmarks/microbenchmarks/asv/results/<commit-hash>.json`
@@ -46,6 +47,24 @@ so each window lasts at least `--target-window-ms` (default `1.0 ms`).
 - **Warm cache, large `_inner`** (default): steady-state throughput, lowest variance.
 - **Cold cache, `_inner=1`**: isolated cold-memory cost — higher variance; bandwidth-bound benches (cast, norm) run ~1.5–3× slower than warm.
 
+## Kernel profiling
+
+`--kernel-profile` runs each benchmark once under `torch.profiler` instead of
+collecting timing distributions, and prints the GPU kernels it launched, sorted
+by total device time:
+
+```bash
+python driver.py bench_gemm --kernel-profile
+python bench_attention.py time_forward --kernel-profile   # one method
+```
+
+For each `(method, parameter combo)` it reports per-kernel total/avg CUDA time,
+launch count, and share of total — useful for spotting which kernel dominates or
+whether an op is launch-bound. This bypasses the timing machinery (`--inner`,
+`--cold-cache`, interleaving); `--profile-inner N` sets how many invocations are
+profiled per run (default `1`). Output is saved to
+`results/<commit-hash>-kernelprofile.json` unless `--no-save`.
+
 ## Sample scheduling: interleaving
 
 By default the driver does **not** collect a benchmark's samples in one
diff --git a/benchmarks/microbenchmarks/asv/driver.py b/benchmarks/microbenchmarks/asv/driver.py
index aa9f2f5b4..1443515f7 100644
--- a/benchmarks/microbenchmarks/asv/driver.py
+++ b/benchmarks/microbenchmarks/asv/driver.py
@@ -346,6 +346,111 @@ def _label(combo):
     }
 
 
+# ---------------------------------------------------------------------------
+# Kernel profiling
+# ---------------------------------------------------------------------------
+
+_KERNEL_NAME_MAX_WIDTH = 80
+
+
+def _shorten_kernel_name(name):
+    """Shorten verbose C++/HIP kernel names for readable output.
+
+    Strips a leading 'void ', removes template arguments at any nesting depth,
+    collapses whitespace, and truncates to ``_KERNEL_NAME_MAX_WIDTH``.
+    """
+    s, prev = (name[5:] if name.startswith("void ") else name), None
+    while s != prev:  # peel innermost <...> groups until a pass changes nothing
+        prev, s = s, re.sub(r"<[^<>]*>", "", s)
+    s = " ".join(s.split())
+    w = _KERNEL_NAME_MAX_WIDTH
+    return s if len(s) <= w else s[:w - 3] + "..."
+
+
+def profile_class(suite_name, cls, class_name, method_filter=None, warmup=3, inner=1):
+    """Per-kernel CUDA-time breakdown for each time_* method x parameter combo.
+
+    Unlike :func:`run_class` (timing distributions), this runs each benchmark
+    once under ``torch.profiler`` and reports the GPU kernels it launched, sorted
+    by total device time. Returns ``{bench_key: {combo_label: [kernel_row, ...]}}``.
+    """
+    import torch
+    from torch.profiler import profile, ProfilerActivity
+
+    methods = sorted(m for m in dir(cls) if m.startswith("time_"))
+    if method_filter:
+        methods = [m for m in methods if method_filter in m]
+    if not methods:
+        return {}
+
+    params = getattr(cls, "params", [])  # no params -> product() yields one empty combo
+    param_names = list(getattr(cls, "param_names", []))
+    combos = list(itertools.product(*params))
+
+    def _label(combo):
+        return ", ".join(f"{nm}={v}" for nm, v in zip(param_names, combo))
+
+    out = {}
+    for method_name in methods:
+        bench_key = f"{suite_name}.{class_name}.{method_name}"
+        out[bench_key] = {}
+        for combo in combos:
+            instance = cls()
+            try:
+                getattr(instance, "setup", lambda *a: None)(*combo)  # setup is optional
+            except Exception as e:
+                print(f"  SKIP  {_label(combo)}  setup failed: {e}")
+                continue
+            instance._inner = max(1, int(inner))
+            method = getattr(instance, method_name)
+            for _ in range(warmup):
+                method(*combo)
+            with profile(activities=[ProfilerActivity.CUDA]) as prof:
+                method(*combo)
+                torch.cuda.synchronize()
+
+            events = [e for e in prof.key_averages() if e.self_device_time_total > 0]
+            events.sort(key=lambda e: e.self_device_time_total, reverse=True)
+            total = sum(e.self_device_time_total for e in events)
+
+            w = _KERNEL_NAME_MAX_WIDTH
+            hdr = (f"  {'kernel':<{w}}  {'total us':>11}  {'calls':>6}"
+                   f"  {'avg us':>10}  {'%':>6}")
+            print(f"\n{bench_key}  ({_label(combo)})")
+            print(hdr)
+            print("  " + "-" * (len(hdr) - 2))
+            rows = []
+            for e in events:
+                avg = e.self_device_time_total / e.count if e.count else 0.0
+                pct = 100.0 * e.self_device_time_total / total if total else 0.0
+                print(f"  {_shorten_kernel_name(e.key):<{w}}  {e.self_device_time_total:>11.1f}"
+                      f"  {e.count:>6}  {avg:>10.2f}  {pct:>5.1f}%")
+                rows.append({
+                    "kernel": e.key, "total_us": round(e.self_device_time_total, 1),
+                    "calls": e.count, "avg_us": round(avg, 2), "pct": round(pct, 1),
+                })
+            print(f"  {'TOTAL':<{w}}  {total:>11.1f}")
+            out[bench_key][_label(combo)] = rows
+    return out
+
+
+def save_kernel_profile(all_profiles, label=None, results_dir=None):
+    """Dump *all_profiles* to ``<results_dir>/<hash>[-<label>]-kernelprofile.json``."""
+    head = _get_commit_hash()
+    out_dir = results_dir or _results_dir()
+    os.makedirs(out_dir, exist_ok=True)
+    # Anything outside [A-Za-z0-9._-] in the label collapses to '_'.
+    tag = "-" + re.sub(r"[^A-Za-z0-9._-]+", "_", label).strip("_") if label else ""
+    path = os.path.join(out_dir, f"{head[:8]}{tag}-kernelprofile.json")
+    payload = {
+        "commit_hash": head, "date": int(time.time() * 1000),
+        "kernel_profile": all_profiles,
+    }
+    with open(path, "w") as f:
+        json.dump(payload, f, indent=2)
+    print(f"\nKernel profile saved to {path}")
+
+
 def run_as_main(caller_file=None):
     """Run benchmarks from a bench file's ``__main__`` block or the command line.
 
@@ -402,6 +507,14 @@ def run_as_main(caller_file=None):
                         help="Disable the per-round random permutation and use a "
                              "fixed round-robin order, leaving a small residual "
                              "ordering bias.")
+    parser.add_argument("--kernel-profile", action="store_true",
+                        help="Profile per-kernel CUDA time via torch.profiler "
+                             "instead of measuring timing distributions. Runs each "
+                             "benchmark once and prints a per-kernel breakdown "
+                             "(saved to <hash>-kernelprofile.json unless --no-save).")
+    parser.add_argument("--profile-inner", type=int, default=1,
+                        help="Kernel invocations per profiled run in "
+                             "--kernel-profile mode (default: 1).")
     parser.add_argument("--no-save", action="store_true",
                         help="Skip saving results to JSON.")
     parser.add_argument("--label", default=None,
@@ -441,18 +554,26 @@ def run_as_main(caller_file=None):
     # --seed; shared across classes so the stream is deterministic end-to-end.
     rng = random.Random(args.seed)
     shuffle = not args.no_shuffle
-    if args.interleave_group > 1 and shuffle:
+    if not args.kernel_profile and args.interleave_group > 1 and shuffle:
         print(f"Interleave: group={args.interleave_group}, shuffled (seed={args.seed})")
 
     all_results = {}
+    all_profiles = {}
     for suite_name in suite_names:
         mod = importlib.import_module(suite_name)
         for name in sorted(dir(mod)):
             obj = getattr(mod, name)
             # Any Bench* class that defines a time_* method (excludes BenchBase,
             # and is robust to the bench-file/driver __main__ double-import).
-            if (isinstance(obj, type) and name.startswith("Bench")
+            if not (isinstance(obj, type) and name.startswith("Bench")
                     and any(m.startswith("time_") for m in dir(obj))):
+                continue
+            if args.kernel_profile:
+                all_profiles.update(profile_class(
+                    suite_name, obj, name, args.method_filter,
+                    warmup=args.warmup, inner=args.profile_inner,
+                ))
+            else:
                 all_results.update(run_class(
                     suite_name, obj, name, args.method_filter,
                     warmup=args.warmup, iters=args.iters,
@@ -461,7 +582,10 @@ def run_as_main(caller_file=None):
                     interleave_group=args.interleave_group, rng=rng, shuffle=shuffle,
                 ))
 
-    if all_results and not args.no_save:
+    if args.kernel_profile:
+        if all_profiles and not args.no_save:
+            save_kernel_profile(all_profiles, label=args.label)
+    elif all_results and not args.no_save:
         save_results(all_results, label=args.label)