From 377ee554068bce758cff24512363800132d507c5 Mon Sep 17 00:00:00 2001
From: liuhaoyu35 <liuhaoyu35@huawei.com>
Date: Mon, 29 Jun 2026 15:47:46 +0800
Subject: [PATCH 1/2] feat: add multi-host collective profiling

---
 .gitignore                                    |   3 +
 docs/multi-host-profiling-report.md           | 282 ++++++++++++
 docs/multihost-profiling-usage.md             | 253 +++++++++++
 docs/single-node-profiling-report.md          | 148 +++++++
 src/collectives/host/collective_kernel.cpp    |  40 +-
 src/collectives/host/collective_utils.cpp     |   9 +
 src/collectives/host/collective_utils.h       |   2 +
 src/collectives/host/perf_trace_report.cpp    |  45 +-
 src/collectives/host/perf_trace_report.h      |   2 +
 src/collectives/host/perf_trace_session.cpp   |  37 +-
 src/collectives/host/tilexr_collectives.cpp   |  24 +
 src/collectives/kernels/CMakeLists.txt        |  11 +-
 src/collectives/kernels/collectives.h         |   2 +-
 src/collectives/kernels/datacopy_gm2gm.h      |  47 +-
 .../kernels/kernels/collectives.cce           |   8 +-
 .../kernels/kernels/lcal_profile_probe.cce    | 110 +++++
 src/collectives/kernels/lccl_op.h             |  12 +-
 src/collectives/kernels/perf_trace_kernel.h   |  96 +++-
 src/collectives/kernels/tilexr_lccl_op.cpp    |   3 +-
 src/comm/tilexr_internal.cpp                  |   4 +-
 src/include/tilexr_collectives.h              |   3 +
 src/include/tilexr_collectives_perf.h         |   1 +
 src/include/tilexr_perf_trace.h               |   5 +-
 src/include/tilexr_types.h                    |   2 +
 tests/collectives/CMakeLists.txt              |  23 +
 tests/collectives/README.md                   |  41 +-
 .../run_collective_perf_multihost.sh          | 217 ++++++++++
 .../tilexr-tests/tilexr_collective_perf.cpp   | 330 +++++++++++++-
 .../tilexr_collective_profile_report.py       | 409 +++++++++++++++++-
 .../unit/test_collective_perf_report.cpp      |  14 +-
 .../unit/test_collective_perf_session.cpp     |  13 +
 .../unit/test_collective_profile_report.py    | 349 ++++++++++++++-
 .../unit/test_prepare_host_launch_context.cpp |  56 +++
 .../unit/test_tilexr_collectives_api.cpp      |  47 ++
 ...st_tilexr_collectives_kernel_ownership.cpp |  36 ++
 .../test_tilexr_collectives_tools_sources.cpp |  61 ++-
 .../unit/test_tilexr_perf_trace_layout.cpp    |   9 +-
 tests/comm/unit/test_tilexr_source_guards.cpp |  11 +
 38 files changed, 2656 insertions(+), 109 deletions(-)
 create mode 100644 docs/multi-host-profiling-report.md
 create mode 100644 docs/multihost-profiling-usage.md
 create mode 100644 docs/single-node-profiling-report.md
 create mode 100644 src/collectives/kernels/kernels/lcal_profile_probe.cce
 create mode 100644 tests/collectives/run_collective_perf_multihost.sh

diff --git a/.gitignore b/.gitignore
index 1c01658..427b557 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,9 @@ MANIFEST.in
 .claude/scheduled_tasks.json
 .claude/settings.local.json
 
+# local transfer bundles for remote Ascend profiling/debug sync
+/.tilexr_*
+
 # custom
 .worktrees/
 op-simulator/build
diff --git a/docs/multi-host-profiling-report.md b/docs/multi-host-profiling-report.md
new file mode 100644
index 0000000..9e162d6
--- /dev/null
+++ b/docs/multi-host-profiling-report.md
@@ -0,0 +1,282 @@
+# TileXR 多机性能分析模块改动整理
+
+## 背景与目标
+
+本轮多机性能分析工作以 `C:\TileXR` 的 `lhy-single-profiling` 分支为基本盘，目标是在 62 和 70 两台机器上跑通 2 机 kernel-level profiling，并输出统一的 HTML 报告和 `perfetto_trace.json`。
+
+当前已验证链路：
+
+- 62：`root@141.62.24.62`，远端路径 `/home/l00929943/TileXR`。
+- 70：`root@141.62.24.70`，远端路径 `/home/l00929943/TileXR`。
+- 算子：`profile-probe`。
+- 输出目录：`/home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70-kernel-align`。
+- 本地报告：`C:\TileXR\run\prof\collectives-2host-profile-probe-62-70-kernel-align\report.html`。
+- 本地 Perfetto：`C:\TileXR\run\prof\collectives-2host-profile-probe-62-70-kernel-align\perfetto_trace.json`。
+
+注意：`profile-probe` 是多机 profiling/report smoke 算子。它会初始化 ACL 和 socket communicator，并在每个 rank 上执行真实 AIV kernel，记录与单机 profiling 一致的 7 个 kernel stage；但它只做本地 device memory copy，不触碰跨机 `peerMems[]`，不能当作 allgather 跨机带宽结果。
+
+## 必须保留的改动
+
+### 1. `profile-probe` 算子闭包
+
+这些改动是多机 profiling 能稳定跑通的核心，提交时必须保留：
+
+- `src/include/tilexr_types.h`
+  - 新增 `TileXRType::PROFILE_PROBE = 11`。
+  - 新增 `TileXRProfileProbe` 类型名映射。
+- `src/include/tilexr_collectives.h`
+  - 新增 public API `TileXRProfileProbe(...)`。
+- `src/collectives/host/tilexr_collectives.cpp`
+  - 新增 host 侧 `TileXRProfileProbe` 实现。
+  - 使用 `GetProfileProbeBlockNum` 固定 probe block 数。
+  - 以 `INT8` kernel signature launch，避免 profile probe 与 dtype 组合膨胀。
+- `src/collectives/host/collective_utils.{h,cpp}`
+  - 新增 `GetProfileProbeBlockNum`。
+- `src/collectives/host/collective_kernel.cpp`
+  - 注册 `PROFILE_PROBE` kernel。
+  - 对 `PROFILE_PROBE` 使用独立 funsig 和 kernel name。
+- `src/collectives/kernels/lccl_op.h`
+  - include `kernels/lcal_profile_probe.cce`。
+  - 新增 `LCCL_PROFILE_PROBE_FUNC_AUTO_DEF()`。
+- `src/collectives/kernels/tilexr_lccl_op.cpp`
+  - 注册 `TileXRProfileProbe` kernel TU。
+- `src/collectives/kernels/kernels/lcal_profile_probe.cce`
+  - 新增真实 AIV probe kernel。
+  - 输出和单机一致的 7 个 stage：`kernel_total`、`chunk_total`、`post_sync`、`local_input_to_ipc`、`flag_poll_wait`、`peer_ipc_to_output`、`chunk_barrier`。
+  - 只做本地 GM -> UB -> GM copy，不访问 `peerMems[]`。
+- `src/collectives/kernels/CMakeLists.txt`
+  - `TILEXR_COLLECTIVES_1OP_BIN_SIZE` 从 `5242880` 增大到 `10485760`，否则新增 CCE binary 后可能被截断。
+
+### 2. kernel trace 数据面修复
+
+这些改动保证多机和单机报告的 kernel 粒度一致，也是必须保留的：
+
+- `src/include/tilexr_perf_trace.h`
+  - 新增 `TILEXR_PERF_TRACE_STATS_OFFSET = 128`。
+  - `TileXRPerfCoreStageStats` 扩展到 96 bytes，并保持 32-byte 对齐。
+  - 新增 `aux2/aux3`，防止 ABI/layout 与 kernel 侧不一致。
+- `src/collectives/kernels/perf_trace_kernel.h`
+  - 使用固定 GM stats offset，避免 kernel 侧读取 header 字段导致不稳定。
+  - 通过 UB 临时 buffer 做 stats slot 的 GM <-> UB 更新。
+  - 使用 `GetBlockNum()` 和固定 stage count 计算 slot。
+  - 保留无 profiling 编译路径的 no-op overload。
+- `src/collectives/host/perf_trace_session.cpp`
+  - host 侧 header 使用同一个 `TILEXR_PERF_TRACE_STATS_OFFSET`。
+  - 支持 `TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING=1`，用于只保留 host metadata 的调试对照。
+  - 新增 incomplete report 写出入口。
+- `src/include/tilexr_collectives_perf.h`
+  - 对外声明 `TileXRCollectivePerfWriteIncompleteReport(...)`，让测试工具在 kernel launch 失败时仍能写出可聚合的 trace metadata。
+- `src/collectives/host/perf_trace_report.{h,cpp}`
+  - 只保留 `count != 0` 的 stats，避免 count=0 的脏 slot 进入报告。
+  - `trace.json` 写出 `aux2/aux3`。
+  - 支持 incomplete trace 的 `incomplete` / `incomplete_reason`。
+
+### 3. 多机运行入口与测试工具
+
+这些改动让 62/70 可以一键启动 2 机 profiling，必须保留：
+
+- `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp`
+  - 新增 `--comm-mode local|socket`。
+  - 新增 `--device-id`。
+  - 新增 `--op profile-probe`。
+  - socket 模式下使用 `TileXRCommInitRank`。
+  - 写出 `rank<N>/host_info.json`，包含 host/IP/comm_mode。
+  - 在 measured launch 失败时写 incomplete trace，便于聚合报告诊断。
+  - `profile-probe` 支持 `--check 1`，用于确认 profiling 没有导致执行错误。
+- `tests/collectives/run_collective_perf_multihost.sh`
+  - 读取 `TILEXR_MULTIHOST_PEERS`，按 rank 通过 SSH 启动远端进程。
+  - 设置 `--comm-mode socket`、`TILEXR_COMM_ID`、`ASCEND_PROCESS_LOG_PATH`。
+  - 优先用当前构建产物的 `LD_LIBRARY_PATH`，并将 driver lib 放前面。
+  - 远端 rank 完成后回收 `rank<N>/` profiling 目录。
+  - 调用 `tilexr_collective_profile_report.py` 生成聚合报告。
+- `tests/collectives/CMakeLists.txt`
+  - 安装 `run_collective_perf_multihost.sh`。
+  - 增加 rpath-link，避免测试工具链接时误用 CANN stub/缺失 driver 依赖。
+- `tests/collectives/README.md`
+  - 补充 multi-host profiling 使用说明和限制。
+
+### 4. 聚合报告和 Perfetto 输出
+
+这些改动让多机信息真正呈现在 HTML 和 Perfetto 中，必须保留：
+
+- `tests/collectives/tilexr_collective_profile_report.py`
+  - 聚合 `rank*/host_info.json`。
+  - HTML 显示 `rank@host` 和 host IP。
+  - `Rank-Level Summary` 用 `kernel_total` 汇总每个 rank 的 avg/max kernel us。
+  - `Trace Status` 显示 incomplete/synthetic trace。
+  - `perfetto_trace.json` 使用 host/rank/stage 命名，例如 `launch0/rank1@141.62.24.70/kernel_total`。
+  - 每个 rank 增加 `launch_windows` thread，便于 `ui.perfetto.dev` 中按 launch 对齐查看。
+  - 保留诊断：missing trace、incomplete trace、group incompatible 等。
+
+### 5. 回归测试与 source guard
+
+这些测试是提交多机 profiling 时保护行为不退化的必要部分，建议全部保留：
+
+- `tests/collectives/unit/test_collective_profile_report.py`
+  - 覆盖 host metadata、Perfetto、incomplete trace、count=0 过滤。
+  - 新增 `test_multihost_report_preserves_single_host_kernel_stage_granularity`，确保多机报告保留单机同款 7 个 kernel stage。
+- `tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp`
+  - 确认 profile probe CCE 文件、注册宏、kernel 注册和 stats layout 约束。
+- `tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp`
+  - 确认 CLI、multi-host script、README、Perfetto host label 等关键字符串存在。
+- `tests/collectives/unit/test_tilexr_perf_trace_layout.cpp`
+  - 锁定 trace header/stats ABI 和 32-byte 对齐。
+- `tests/collectives/unit/test_prepare_host_launch_context.cpp`
+  - 覆盖 stats offset 和 disable kernel profiling 环境变量。
+- `tests/collectives/unit/test_collective_perf_session.cpp`
+  - 覆盖 incomplete report 写出。
+- `tests/collectives/unit/test_collective_perf_report.cpp`
+  - 覆盖 count=0 stats 过滤和 aux 字段写出。
+
+## 提交前 staging 清单
+
+如果后续要提交“多机性能分析模块跑通”这一组改动，建议至少包含下面这些文件。这里按 git 状态拆分，方便提交前直接核对。
+
+新增文件必须保留：
+
+- `docs/multi-host-profiling-report.md`
+- `src/collectives/kernels/kernels/lcal_profile_probe.cce`
+- `tests/collectives/run_collective_perf_multihost.sh`
+
+已修改文件必须保留：
+
+- `src/collectives/host/collective_kernel.cpp`
+- `src/collectives/host/collective_utils.cpp`
+- `src/collectives/host/collective_utils.h`
+- `src/collectives/host/perf_trace_report.cpp`
+- `src/collectives/host/perf_trace_report.h`
+- `src/collectives/host/perf_trace_session.cpp`
+- `src/collectives/host/tilexr_collectives.cpp`
+- `src/collectives/kernels/CMakeLists.txt`
+- `src/collectives/kernels/lccl_op.h`
+- `src/collectives/kernels/perf_trace_kernel.h`
+- `src/collectives/kernels/tilexr_lccl_op.cpp`
+- `src/include/tilexr_collectives.h`
+- `src/include/tilexr_collectives_perf.h`
+- `src/include/tilexr_perf_trace.h`
+- `src/include/tilexr_types.h`
+- `tests/collectives/CMakeLists.txt`
+- `tests/collectives/README.md`
+- `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp`
+- `tests/collectives/tilexr_collective_profile_report.py`
+- `tests/collectives/unit/test_collective_perf_report.cpp`
+- `tests/collectives/unit/test_collective_perf_session.cpp`
+- `tests/collectives/unit/test_collective_profile_report.py`
+- `tests/collectives/unit/test_prepare_host_launch_context.cpp`
+- `tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp`
+- `tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp`
+- `tests/collectives/unit/test_tilexr_perf_trace_layout.cpp`
+
+这组文件形成闭环：算子枚举/API、host launch、device kernel、trace ABI、报告聚合、multi-host 启动脚本、README 和回归测试都必须同时存在。只提交其中一部分，容易出现编译可过但远端跑不通，或者能跑但 HTML/Perfetto 缺少 rank/kernel 粒度信息。
+
+## 不应提交的内容
+
+以下文件是同步/调试临时产物，不属于代码改动，应在提交前删除或保持 untracked：
+
+- `.tilexr_profile_sync.tar.gz`
+- `.tilexr_profile_sync.tar.gz.b64`
+- `.tilexr_profile_probe_sync.tar.gz`
+- `.tilexr_profile_probe_sync.tar.gz.b64`
+- `.tilexr_profile_probe_sync.ascii.b64`
+- `.tilexr_profile_probe_fix2.tar.gz`
+- `.tilexr_profile_probe_fix2.ascii.b64`
+- `.tilexr_profile_probe_fix3.tar.gz`
+- `.tilexr_profile_probe_fix3.ascii.b64`
+
+运行报告目录也不建议进代码提交，除非后续明确要提交样例报告：
+
+- `run/prof/collectives-2host-profile-probe-62-70*`
+- `run/prof/collectives-2host-profile-probe-ab-profile*`
+
+## 已验证命令
+
+本地报告聚合测试：
+
+```bash
+cd C:\TileXR
+python -m unittest tests.collectives.unit.test_collective_profile_report -v
+```
+
+62/70 远端构建：
+
+```bash
+cd /home/l00929943/TileXR
+source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true
+conda activate pt311 2>/dev/null || true
+source /usr/local/Ascend/cann/set_env.sh
+cmake --build build-profile-950 --target \
+  test_tilexr_collectives_kernel_ownership \
+  test_tilexr_collectives_tools_sources \
+  tilexr_collective_perf -j8
+```
+
+62/70 远端 source guard：
+
+```bash
+cd /home/l00929943/TileXR/build-profile-950/tests/collectives
+./test_tilexr_collectives_kernel_ownership
+./test_tilexr_collectives_tools_sources
+```
+
+2 机 profiling 运行：
+
+```bash
+cd /home/l00929943/TileXR
+source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true
+conda activate pt311 2>/dev/null || true
+source /usr/local/Ascend/cann/set_env.sh
+export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-}
+export TILEXR_MULTIHOST_PEERS='0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0'
+export TILEXR_COMM_ID='141.62.24.62:10067'
+export TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC=300
+bash tests/collectives/run_collective_perf_multihost.sh \
+  /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70-kernel-align \
+  /home/l00929943/TileXR/build-profile-950/tests/collectives \
+  --op profile-probe --min-bytes 4096 --max-bytes 4096 \
+  --iters 2 --warmup-iters 0 --datatype int32 --check 1 \
+  --profile 1 --profile-sample-every 1 --profile-ai-prompt 1
+```
+
+## 当前验证结论
+
+最新 2 机结果：
+
+- `diagnostics=[]`。
+- `op_name=TileXRProfileProbe`。
+- `rank_size=2`。
+- `launch_ids=[0,1]`。
+- `bars=112`。
+- 每个 rank/launch 都包含完整 7 个 kernel stage。
+- `perfetto_trace.json` 中每个 rank/launch/stage 都有 duration event。
+- `Rank-Level Summary` 示例：
+  - rank0 avg kernel `93.740 us`，max `112.580 us`。
+  - rank1 avg kernel `92.170 us`，max `106.740 us`。
+
+profiling on/off 对照：
+
+- `--profile 0`：`errors=0`，host avg `9.607 us`。
+- `--profile 1`：`errors=0`，host avg `393.650 us`，trace 完整。
+- 结论：profiling 会明显放大 host 端统计耗时，但没有导致算子校验失败、trace incomplete 或 kernel stage 缺失。
+
+plog 中可见 HCCP deinit/unimport ERROR 和 UDMA topology fallback WARN；这些在 profiling on/off 都存在，且 rank 日志 `errors=0`，目前判断为退出/资源回收阶段或 UDMA/HCCP 清理路径噪声，不是多机 profiling 模块引入的执行异常。
+
+## 已知限制
+
+- `profile-probe` 不是跨机 allgather 数据面，不报告跨机通信带宽。
+- 当前 report 将不同 rank/launch 的时间线分别归一化，不假设两台机器的 device cycle offset 同步。
+- 62 机器系统时间异常，会影响构建时间戳和 plog 文件名；必要时用 `touch` 关键源文件强制重编。
+- `run_collective_perf_multihost.sh` 当前面向 profiling run；若 `--profile 0`，rank profile 目录不会生成，脚本的回收阶段会失败。做 profiling off 对照时建议手动启动 rank 或后续单独增强脚本兼容。
+
+## 提交建议
+
+提交多机性能分析模块时建议包含：
+
+- 上述 `profile-probe` 算子闭包。
+- kernel trace 数据面修复。
+- `tilexr_collective_perf` socket/profile-probe/host_info/incomplete report 支持。
+- `run_collective_perf_multihost.sh`。
+- `tilexr_collective_profile_report.py` 的 host/rank/Perfetto/incomplete 聚合能力。
+- README 和单测/source guard。
+- 本文档 `docs/multi-host-profiling-report.md`。
+
+提交前建议清理 `.tilexr_*` 临时包，并确认没有把 `run/prof` 运行产物加入 git。
diff --git a/docs/multihost-profiling-usage.md b/docs/multihost-profiling-usage.md
new file mode 100644
index 0000000..8050fda
--- /dev/null
+++ b/docs/multihost-profiling-usage.md
@@ -0,0 +1,253 @@
+# TileXR 多机性能分析模块使用说明
+
+## 适用范围
+
+本文档对应上交分支 `lhy-multihost-profiling`，用于在 62/70 两台 Ascend 机器上跑通 2 机 kernel-level profiling，并生成统一的 HTML 报告和 `ui.perfetto.dev` 可打开的 `perfetto_trace.json`。
+
+当前推荐先使用 `profile-probe` 算子做多机 profiling 链路验证。它会初始化 ACL 和 socket communicator，并在每个 rank 上执行真实 AIV kernel，采集与单机 profiling 对齐的 kernel stage：
+
+- `kernel_total`
+- `chunk_total`
+- `post_sync`
+- `local_input_to_ipc`
+- `flag_poll_wait`
+- `peer_ipc_to_output`
+- `chunk_barrier`
+
+注意：`profile-probe` 是 profiling/report smoke 算子，只做本地 GM -> UB -> GM copy，不访问跨机 `peerMems[]`，不能作为跨机 allgather 带宽结果。如果需要看真实跨机 collective 数据面，需要在该链路基础上继续接入对应 collective kernel。
+
+## 分支与代码
+
+本地基本盘：
+
+```powershell
+cd C:\TileXR
+git switch lhy-multihost-profiling
+```
+
+建议提交/同步到远端的核心文件包括：
+
+- `src/collectives/host/*perf_trace*`
+- `src/collectives/host/tilexr_collectives.cpp`
+- `src/collectives/host/collective_kernel.cpp`
+- `src/collectives/kernels/perf_trace_kernel.h`
+- `src/collectives/kernels/kernels/lcal_profile_probe.cce`
+- `src/include/tilexr_collectives*.h`
+- `src/include/tilexr_perf_trace.h`
+- `src/include/tilexr_types.h`
+- `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp`
+- `tests/collectives/tilexr_collective_profile_report.py`
+- `tests/collectives/run_collective_perf_multihost.sh`
+- `tests/collectives/unit/*profile*`
+- `tests/collectives/README.md`
+- `docs/multi-host-profiling-report.md`
+- `docs/multihost-profiling-usage.md`
+
+不要提交根目录 `.tilexr_*` 同步包，也不要提交 `run/prof/` 运行产物。
+
+## 远端环境准备
+
+两台机器默认路径：
+
+- 62：`root@141.62.24.62:/home/l00929943/TileXR`
+- 70：`root@141.62.24.70:/home/l00929943/TileXR`
+
+每台机器进入仓库后加载环境：
+
+```bash
+cd /home/l00929943/TileXR
+source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true
+conda activate pt311 2>/dev/null || true
+source /usr/local/Ascend/cann/set_env.sh
+```
+
+如果 62 的系统时间异常导致增量编译没有重新生成 kernel，可以在远端执行：
+
+```bash
+touch src/collectives/kernels/kernels/lcal_profile_probe.cce
+touch src/collectives/kernels/perf_trace_kernel.h
+```
+
+## 构建
+
+在 62 和 70 两边都执行：
+
+```bash
+cd /home/l00929943/TileXR
+source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true
+conda activate pt311 2>/dev/null || true
+source /usr/local/Ascend/cann/set_env.sh
+
+cmake -S . -B build-profile-950 \
+  -DTILEXR_BUILD_COLLECTIVES=ON \
+  -DTILEXR_BUILD_TESTS=ON \
+  -DTILEXR_COLLECTIVES_ENABLE_PROFILING=ON \
+  -DBUILD_TESTING=OFF
+
+cmake --build build-profile-950 --target \
+  test_tilexr_collectives_kernel_ownership \
+  test_tilexr_collectives_tools_sources \
+  tilexr_collective_perf -j8
+```
+
+建议先跑 source guard：
+
+```bash
+cd /home/l00929943/TileXR/build-profile-950/tests/collectives
+./test_tilexr_collectives_kernel_ownership
+./test_tilexr_collectives_tools_sources
+```
+
+## 2 机 profiling 运行
+
+建议从 62 发起，`TILEXR_COMM_ID` 使用 62 的 IP 和一个空闲端口：
+
+```bash
+cd /home/l00929943/TileXR
+source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true
+conda activate pt311 2>/dev/null || true
+source /usr/local/Ascend/cann/set_env.sh
+
+export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-}
+export TILEXR_MULTIHOST_PEERS='0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0'
+export TILEXR_COMM_ID='141.62.24.62:10067'
+export TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC=300
+
+bash tests/collectives/run_collective_perf_multihost.sh \
+  /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 \
+  /home/l00929943/TileXR/build-profile-950/tests/collectives \
+  --op profile-probe --min-bytes 4096 --max-bytes 4096 \
+  --iters 2 --warmup-iters 0 --datatype int32 --check 1 \
+  --profile 1 --profile-sample-every 1 --profile-ai-prompt 1
+```
+
+`TILEXR_MULTIHOST_PEERS` 格式为：
+
+```text
+rank,ssh_target,host_ip,device_id;rank,ssh_target,host_ip,device_id
+```
+
+例如 `0,root@141.62.24.62,141.62.24.62,0` 表示 rank0 通过 `root@141.62.24.62` 登录，在 host IP `141.62.24.62` 上使用 device 0。
+
+## 产物说明
+
+运行完成后，profile 根目录会包含：
+
+```text
+run/prof/collectives-2host-profile-probe-62-70/
+  report.html
+  perfetto_trace.json
+  trace_index.json
+  analysis.md
+  ai_prompt.md
+  rank0/
+    host_info.json
+    launch0/trace.json
+    launch0/report.html
+  rank1/
+    host_info.json
+    launch0/trace.json
+    launch0/report.html
+  multihost_rank0.log
+  multihost_rank1.log
+  plog/
+```
+
+重点看两个聚合产物：
+
+- `report.html`：本地浏览器直接打开，先看 `Rank-Level Summary` 是否有慢 rank，再看 timeline 和 per-launch drilldown。
+- `perfetto_trace.json`：上传到 `https://ui.perfetto.dev`，按 `rank@host`、`launch`、`kernel_total` 搜索定位慢 rank/慢 stage。
+
+Perfetto 事件命名示例：
+
+```text
+launch0/rank0@141.62.24.62/kernel_total
+launch0/rank1@141.62.24.70/kernel_total
+launch0/rank1@141.62.24.70/flag_poll_wait
+```
+
+每个 rank 还会有 `launch_windows` thread，用于在 Perfetto 中对齐查看每次 launch。
+
+## 本地拉回查看
+
+如果需要从 62 拉回到 Windows：
+
+```powershell
+scp -r root@141.62.24.62:/home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 C:\TileXR\run\prof\
+```
+
+然后打开：
+
+```text
+C:\TileXR\run\prof\collectives-2host-profile-probe-62-70\report.html
+C:\TileXR\run\prof\collectives-2host-profile-probe-62-70\perfetto_trace.json
+```
+
+## 重新生成聚合报告
+
+如果 `rank*/launch*/trace.json` 已经存在，只想重新生成 HTML/Perfetto：
+
+```bash
+cd /home/l00929943/TileXR
+python3 tests/collectives/tilexr_collective_profile_report.py \
+  /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 \
+  --warmup-iters 0 \
+  --iters 2 \
+  --profile-sample-every 1 \
+  --emit-ai-prompt
+```
+
+## 判断结果是否正常
+
+一次正常的 `profile-probe` 结果应满足：
+
+- `rank_size=2`。
+- `diagnostics=[]`，或没有影响聚合的 fatal diagnostic。
+- 每个 rank/launch 都包含 7 个 kernel stage。
+- HTML 的 `Rank-Level Summary` 能看到每个 rank 的 avg/max kernel us。
+- Perfetto 中能搜索到 `rank0@.../kernel_total` 和 `rank1@.../kernel_total`。
+- `tilexr_collective_perf` 日志中 `errors=0`。
+
+profiling 会明显放大 host 端 `avg(us)`，这是因为采集、同步、copy-back 和报告写出都在测量路径附近发生。判断算子是否异常时优先看 `errors=0`、trace 是否完整、kernel stage 是否齐全，以及 profiling on/off 是否都存在相同的退出阶段 plog 噪声。
+
+## 常见问题
+
+### SSH 能连但脚本失败
+
+确认两台机器都能免密互连，并且 `TILEXR_MULTIHOST_PEERS` 里的 `ssh_target` 可以从发起机器直接登录。
+
+### `aclInit` 或 `ascend_hal` 相关失败
+
+确认执行过：
+
+```bash
+source /usr/local/Ascend/cann/set_env.sh
+export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-}
+```
+
+driver lib 需要排在 CANN stub lib 前面，避免误加载 stub `libascend_hal.so`。
+
+### 找 plog
+
+可以在 profile 目录或仓库下使用：
+
+```bash
+find $PWD -name "plog"
+```
+
+本脚本会设置：
+
+```bash
+ASCEND_PROCESS_LOG_PATH="${profile_dir}/plog/rank${rank}"
+```
+
+### `--profile 0` 对照
+
+当前 `run_collective_perf_multihost.sh` 面向 profiling run。`--profile 0` 不会生成 `rank<N>/` profile 目录，脚本回收阶段可能失败。需要做 profiling off 对照时，建议临时手动启动两个 rank，或后续单独增强脚本兼容 profile-off 模式。
+
+## 当前限制
+
+- 当前已跑通并建议上交的是多机 multi-rank kernel-level profiling 链路，不是多 ACL stream 并发性能分析工具。
+- `profile-probe` 不代表跨机 allgather 数据面性能。
+- 跨机器 device cycle 不假设同步，因此聚合报告按 rank/launch 独立归一化时间线。
+- 多机真实 collective 数据面 profiling 需要继续把同一套 trace 数据面接入跨机 collective kernel。
diff --git a/docs/single-node-profiling-report.md b/docs/single-node-profiling-report.md
new file mode 100644
index 0000000..6fabef5
--- /dev/null
+++ b/docs/single-node-profiling-report.md
@@ -0,0 +1,148 @@
+# TileXR 单机性能分析模块改动报告
+
+## 背景与目标
+
+本轮单机性能分析工作基于 `lhy-single-profiling` 分支，目标是在 Ascend950 单机多卡环境上跑通 standalone collectives 的算子内性能采集，并同时保留原有 HTML 报告与新增 `ui.perfetto.dev` 可打开的 trace JSON。
+
+当前已验证链路覆盖：
+
+- 70 环境：Ascend950PR_9599，单机 allgather profiling 跑通。
+- 62 环境：Ascend950PR_9589，补充芯片名映射后单机 allgather profiling 跑通。
+- 本地报告样例：`C:\TileXR\run\prof\collectives-950\report.html`、`C:\TileXR\run\prof\collectives-950-62\report.html`。
+- Perfetto 样例：对应目录下的 `perfetto_trace.json` 可上传到 `https://ui.perfetto.dev` 查看。
+
+## 代码改动概览
+
+主要改动来自提交 `f80a607 feat: enable Ascend950 single-node profiling`，涉及 collectives kernel 编译、Ascend950 kernel 兼容、profile report 聚合和 Perfetto trace 导出。
+
+当前工作区还有一个额外未提交补丁，用于 62 机器上的 `Ascend950PR_9589` 识别：
+
+- `src/comm/tilexr_internal.cpp`：增加 `Ascend950PR_9589 -> CHIP_950`。
+- `tests/comm/unit/test_tilexr_source_guards.cpp`：增加对应 source guard。
+
+## Ascend950 编译与运行适配
+
+Ascend950 单机 profiling 的关键编译适配在 `src/collectives/kernels/CMakeLists.txt`：
+
+- 新增 `TILEXR_COLLECTIVES_SOC_TYPE` CMake 变量。
+- 当 `TILEXR_COLLECTIVES_SOC_TYPE=Ascend950` 时，CCE AIV arch 使用 `dav-c310-vec`。
+- 其他平台默认仍使用原来的 `dav-c220-vec`。
+
+相关 CCE 源文件也扩展了 `__DAV_C310_VEC__` 条件，使 Ascend950 编译时能生成 collectives kernel：
+
+- `src/collectives/kernels/tilexr_lccl_op.cpp`
+- `src/collectives/kernels/lccl_op.h`
+- `src/collectives/kernels/collectives.h`
+- `src/collectives/kernels/kernels/collectives.cce`
+- `src/collectives/kernels/datacopy_gm2gm.h`
+
+`datacopy_gm2gm.h` 还增加了 `TileXRAtomicTypeSupported`，避免 `dav-c310-vec` 下对不支持类型生成 atomic 指令。
+
+## Profiling 采集链路
+
+运行入口仍然是 `tests/collectives/run_collective_perf.sh` 和 `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp`。
+
+典型构建命令：
+
+```bash
+source /usr/local/Ascend/cann/set_env.sh
+cmake -S . -B build-profile-950 \
+  -DTILEXR_BUILD_COLLECTIVES=ON \
+  -DTILEXR_COLLECTIVES_ENABLE_PROFILING=ON \
+  -DTILEXR_COLLECTIVES_SOC_TYPE=Ascend950
+cmake --build build-profile-950 --target tilexr_collective_perf -j"$(nproc)"
+```
+
+典型运行命令：
+
+```bash
+cd tests/collectives
+./run_collective_perf.sh 2 0 ../../build-profile-950/tests/collectives \
+  --op allgather \
+  --min-bytes 4096 \
+  --max-bytes 4096 \
+  --iters 2 \
+  --warmup-iters 1 \
+  --datatype int32 \
+  --check 0 \
+  --profile 1 \
+  --profile-dir ../../run/prof/collectives-950 \
+  --profile-sample-every 1 \
+  --profile-ai-prompt 1
+```
+
+采集流程：
+
+- 每个 measured launch 创建一个 `TileXRCollectivePerfSession`。
+- kernel launch 前通过 `PreparePerfTraceLaunch` 准备 device trace buffer。
+- kernel 内部写入 stage/core/rank 维度的统计信息。
+- launch 完成后调用 `TileXRCollectivePerfWriteReport` 输出单 launch 报告。
+- `run_collective_perf.sh` 等所有 rank 结束后调用 `tilexr_collective_profile_report.py` 生成聚合报告。
+
+## 输出产物
+
+每个 rank/launch 目录会生成：
+
+- `trace.json`：单次 launch 的结构化 trace，schema 为 `tilexr_perf_trace_report.v1`。
+- `summary.csv`：stage/core 维度统计表。
+- `analysis.md`：单 launch 文本分析。
+- `report.html`：单 launch HTML drilldown。
+- `ai_prompt.md`：可选，开启 `--profile-ai-prompt 1` 时生成。
+
+profile 根目录会生成聚合产物：
+
+- `report.html`：保留原有 HTML 呈现，包含 bottleneck-first 摘要、timeline、drilldown 链接。
+- `trace_index.json`：聚合后的中间索引，schema 为 `tilexr_perf_trace_run.v1`。
+- `analysis.md`：跨 rank/launch 的文本摘要。
+- `ai_prompt.md`：可选聚合 prompt。
+- `perfetto_trace.json`：新增 Perfetto/Chrome trace event 格式，给 `ui.perfetto.dev` 使用。
+
+聚合摘要现在额外包含 rank-level kernel summary：
+
+- `summary.rank_kernel`：按 rank 汇总 `kernel_total`，记录 launch 数、平均 kernel us、最大 kernel us、最慢 launch。
+- `summary.slowest_rank`：按平均 kernel us 排序选出的最慢 rank。
+- `report.html`：新增 `Rank-Level Summary` 表格，用于快速定位慢 rank 后再跳转到单 launch drilldown。
+- `analysis.md` / `ai_prompt.md`：同步输出 slowest rank 和 rank kernel totals。
+
+## Perfetto Trace 支持
+
+`tests/collectives/tilexr_collective_profile_report.py` 新增 `render_perfetto_trace()`：
+
+- 每个 rank 映射为 Perfetto process。
+- 每个 rank/core 映射为 Perfetto thread，thread 名为 `rankN/coreM`。
+- 每个 profiling stage 映射为 `ph: "X"` duration event，事件名为 `launchN/rankR/stage`。
+- 每个 rank 增加 `launch_windows` thread，并写入 `launchN/rankR/window` 对齐窗口。
+- measured launch 之间增加固定 gap，方便在 `ui.perfetto.dev` 中按 launch 展开和搜索。
+- 事件保留 `launch_id`、`launch_offset_us`、`normalized_ts`、`rank`、`core`、`stage`、`stage_id`、`sum_us`、`raw_cycles`、`max_cycles`、`message_bytes`、`rank_size` 和源 trace 路径。
+
+注意：不同 NPU 的原始 cycle offset 不假设同步，因此 HTML 和 Perfetto 都按 rank/launch 内部归一化时间展示，更适合观察单 rank 内核阶段耗时、stage 分布和慢 core，而不是直接比较跨 NPU 的绝对开始时间。
+
+## 测试覆盖
+
+已补充/更新的测试包括：
+
+- `tests/collectives/unit/test_collective_profile_report.py`
+  - 校验聚合 HTML、`trace_index.json`、`analysis.md`。
+  - 校验 rank-level summary、slowest rank、Perfetto launch window 和 launch/rank/stage 事件命名。
+  - 校验 sparse launch、multi-size launch、missing trace diagnostics。
+- `tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp`
+  - 校验 README 和 profile report helper 保留 rank summary / Perfetto marker 关键字段。
+- `tests/collectives/unit/test_tilexr_collectives_api.cpp`
+  - 校验 Ascend950 CCE arch 配置。
+  - 校验 `__DAV_C310_VEC__` 相关源码路径。
+- `tests/comm/unit/test_tilexr_source_guards.cpp`
+  - 校验已观察到的 `Ascend950PR`、`Ascend950PR_9589`、`Ascend950PR_9599` variant 被映射为 `CHIP_950`。
+
+## 当前限制
+
+- 当前单机 profiling 路径依赖 standalone collectives kernel 内的 trace hook，主要用于分析 allgather/allreduce/reducescatter/broadcast/alltoall 这类已有 standalone 算子。
+- warmup launch 不写 profiling trace，聚合报告只展示 measured launch。
+- rank-level summary 已能直接看慢 rank，但更细的自动归因仍需要结合 stage/core drilldown 分析。
+- 多机 62+70 的 communicator bootstrap 可以成功，但 2 机 collective kernel 目前会在 kernel 阶段失败或超时；这属于跨节点数据面支持问题，不属于本报告覆盖的单机 profiling 范围。
+
+## 建议后续
+
+- 短期：继续用 70 的单机 allgather/profile 结果检查 rank-level summary 的可读性，并按真实数据调整排序和字段展示。
+- 短期：在 HTML 中补充 rank 间差值/百分比，例如 slowest rank 相比 fastest rank 慢多少。
+- 中期：为 Perfetto 增加更多 marker，例如 message size group、op group、rank delta marker，进一步降低定位慢 stage 的成本。
+- 中期：另起多机 profiling 任务，先解决 collective kernel 跨节点数据面，再复用当前 profile report 聚合链路。
diff --git a/src/collectives/host/collective_kernel.cpp b/src/collectives/host/collective_kernel.cpp
index d5bb49d..834b153 100644
--- a/src/collectives/host/collective_kernel.cpp
+++ b/src/collectives/host/collective_kernel.cpp
@@ -11,6 +11,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <iostream>
 #include <mutex>
 #include <string>
 
@@ -57,6 +58,7 @@ const TileXR::TileXRType kRegisteredCollectiveTypes[] = {
     TileXR::TileXRType::ALL_REDUCE,
     TileXR::TileXRType::REDUCE_SCATTER,
     TileXR::TileXRType::BROADCAST,
+    TileXR::TileXRType::PROFILE_PROBE,
 };
 
 bool IsStandaloneCollectiveType(TileXR::TileXRType type)
@@ -65,11 +67,17 @@ bool IsStandaloneCollectiveType(TileXR::TileXRType type)
         type == TileXR::TileXRType::ALL2ALL ||
         type == TileXR::TileXRType::ALL_REDUCE ||
         type == TileXR::TileXRType::REDUCE_SCATTER ||
-        type == TileXR::TileXRType::BROADCAST;
+        type == TileXR::TileXRType::BROADCAST ||
+        type == TileXR::TileXRType::PROFILE_PROBE;
 }
 
 int8_t *GetFunSig(TileXR::TileXRType type, TileXR::TileXRDataType dataType)
 {
+    if (type == TileXR::TileXRType::PROFILE_PROBE) {
+        const uint64_t sig = (static_cast<uint64_t>(type) << FUNSIG_OFFSET_BITS << FUNSIG_OFFSET_BITS) +
+            FUNSIG_SKEW;
+        return reinterpret_cast<int8_t *>(sig);
+    }
     const uint64_t sig = (static_cast<uint64_t>(type) << FUNSIG_OFFSET_BITS << FUNSIG_OFFSET_BITS) +
         (static_cast<uint64_t>(dataType) << FUNSIG_OFFSET_BITS) + FUNSIG_SKEW;
     return reinterpret_cast<int8_t *>(sig);
@@ -77,6 +85,9 @@ int8_t *GetFunSig(TileXR::TileXRType type, TileXR::TileXRDataType dataType)
 
 std::string KernelName(TileXR::TileXRType type, const DataTypeRegistration &dataType)
 {
+    if (type == TileXR::TileXRType::PROFILE_PROBE) {
+        return TileXR::TILEXR_TYPE2NAME.at(type);
+    }
     if (type == TileXR::TileXRType::BROADCAST) {
         return TileXR::TILEXR_TYPE2NAME.at(type);
     }
@@ -102,16 +113,35 @@ int RegisterCollectivesKernelsLocked()
     void *binHandle = nullptr;
     rtError_t rtRet = rtDevBinaryRegister(&binary, &binHandle);
     if (rtRet != RT_ERROR_NONE) {
+        std::cerr << "TileXR collectives rtDevBinaryRegister failed, ret=" << rtRet
+                  << ", binarySize=" << TileXRCollectivesKernelBinarySize << std::endl;
         g_registrationStatus = TileXR::TILEXR_ERROR_MKIRT;
         return g_registrationStatus;
     }
 
     for (const auto type : kRegisteredCollectiveTypes) {
+        if (type == TileXR::TileXRType::PROFILE_PROBE) {
+            const std::string name = KernelName(type, kDataTypes[0]);
+            rtRet = rtFunctionRegister(binHandle, GetFunSig(type, TileXR::TILEXR_DATA_TYPE_INT8),
+                name.c_str(), name.c_str(), 0);
+            if (rtRet != RT_ERROR_NONE) {
+                std::cerr << "TileXR collectives rtFunctionRegister failed, ret=" << rtRet
+                          << ", kernel=" << name
+                          << ", type=" << static_cast<int>(type) << std::endl;
+                g_registrationStatus = TileXR::TILEXR_ERROR_MKIRT;
+                return g_registrationStatus;
+            }
+            continue;
+        }
         for (const auto &dataType : kDataTypes) {
             const std::string name = KernelName(type, dataType);
             rtRet = rtFunctionRegister(binHandle, GetFunSig(type, dataType.dataType),
                 name.c_str(), name.c_str(), 0);
             if (rtRet != RT_ERROR_NONE) {
+                std::cerr << "TileXR collectives rtFunctionRegister failed, ret=" << rtRet
+                          << ", kernel=" << name
+                          << ", type=" << static_cast<int>(type)
+                          << ", dataType=" << static_cast<int>(dataType.dataType) << std::endl;
                 g_registrationStatus = TileXR::TILEXR_ERROR_MKIRT;
                 return g_registrationStatus;
             }
@@ -181,6 +211,14 @@ int LaunchCollectiveKernel(TileXRCommPtr comm, TileXR::TileXRType type, const Ho
 
     const rtError_t ret = rtKernelLaunchWithFlagV2(GetFunSig(type, dataType), blockDim, &argsInfo, nullptr,
         static_cast<rtStream_t>(stream), 0, &cfgInfo);
+    if (ret != RT_ERROR_NONE) {
+        std::cerr << "TileXR collectives rtKernelLaunchWithFlagV2 failed, ret=" << ret
+                  << ", type=" << static_cast<int>(type)
+                  << ", dataType=" << static_cast<int>(dataType)
+                  << ", blockDim=" << blockDim
+                  << ", kernelCount=" << kernelCount
+                  << ", stream=" << stream << std::endl;
+    }
     return ret == RT_ERROR_NONE ? TileXR::TILEXR_SUCCESS : TileXR::TILEXR_ERROR_MKIRT;
 }
 
diff --git a/src/collectives/host/collective_utils.cpp b/src/collectives/host/collective_utils.cpp
index 1ff77e6..bef4646 100644
--- a/src/collectives/host/collective_utils.cpp
+++ b/src/collectives/host/collective_utils.cpp
@@ -201,5 +201,14 @@ uint32_t GetBroadcastBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize
     return rankSize;
 }
 
+uint32_t GetProfileProbeBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize)
+{
+    if (commArgs.rankSize <= 0 || dataSize < 0) {
+        return 0;
+    }
+    constexpr uint32_t kDefaultProbeBlocks = 4;
+    return kDefaultProbeBlocks;
+}
+
 } // namespace Host
 } // namespace TileXRCollectives
diff --git a/src/collectives/host/collective_utils.h b/src/collectives/host/collective_utils.h
index f68c1f0..33d1aca 100644
--- a/src/collectives/host/collective_utils.h
+++ b/src/collectives/host/collective_utils.h
@@ -34,6 +34,8 @@ uint32_t GetReduceScatterBlockNum(const TileXR::CommArgs &commArgs, int64_t data
 
 uint32_t GetBroadcastBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize);
 
+uint32_t GetProfileProbeBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize);
+
 } // namespace Host
 } // namespace TileXRCollectives
 
diff --git a/src/collectives/host/perf_trace_report.cpp b/src/collectives/host/perf_trace_report.cpp
index 69ee408..3c5da8f 100644
--- a/src/collectives/host/perf_trace_report.cpp
+++ b/src/collectives/host/perf_trace_report.cpp
@@ -163,7 +163,7 @@ std::vector<TileXR::TileXRPerfCoreStageStats> NonEmptyStats(
 {
     std::vector<TileXR::TileXRPerfCoreStageStats> result;
     for (const auto &stat : stats) {
-        if (stat.count != 0 || stat.sumCycles != 0 || stat.maxCycles != 0) {
+        if (stat.count != 0) {
             result.push_back(stat);
         }
     }
@@ -185,12 +185,17 @@ bool IsValidTraceHeader(const TileXR::TileXRPerfTraceHeader &header)
 }
 
 std::string BuildTraceJson(const TileXR::TileXRPerfTraceHeader &header,
-                           const std::vector<TileXR::TileXRPerfCoreStageStats> &stats)
+                           const std::vector<TileXR::TileXRPerfCoreStageStats> &stats,
+                           const PerfReportOptions &options)
 {
     const auto nonEmptyStats = NonEmptyStats(stats);
     std::ostringstream out;
     out << "{\n";
     out << "  \"schema\": \"tilexr_perf_trace_report.v1\",\n";
+    if (options.incomplete) {
+        out << "  \"incomplete\": true,\n";
+        out << "  \"incomplete_reason\": \"" << EscapeJson(options.incompleteReason) << "\",\n";
+    }
     out << "  \"op_type\": " << header.opType << ",\n";
     out << "  \"op_name\": \"" << EscapeJson(OpName(header.opType)) << "\",\n";
     out << "  \"rank_size\": " << header.rankSize << ",\n";
@@ -214,6 +219,8 @@ std::string BuildTraceJson(const TileXR::TileXRPerfTraceHeader &header,
             << ", \"last_end_cycle\": " << stat.lastEndCycle
             << ", \"aux0\": " << stat.aux0
             << ", \"aux1\": " << stat.aux1
+            << ", \"aux2\": " << stat.aux2
+            << ", \"aux3\": " << stat.aux3
             << ", \"sum_us\": " << StatSumUs(header, stat)
             << "}";
         if (i + 1 != nonEmptyStats.size()) {
@@ -245,7 +252,8 @@ std::string BuildSummaryCsv(const std::vector<PerfStageSummary> &summaries)
 
 std::string BuildAnalysisMarkdown(const TileXR::TileXRPerfTraceHeader &header,
                                   const std::vector<PerfStageSummary> &summaries,
-                                  const std::vector<std::string> &findings)
+                                  const std::vector<std::string> &findings,
+                                  const PerfReportOptions &options)
 {
     std::ostringstream out;
     out << "# TileXR Collective Perf Analysis\n\n";
@@ -253,6 +261,11 @@ std::string BuildAnalysisMarkdown(const TileXR::TileXRPerfTraceHeader &header,
     out << "- Message bytes: " << header.messageBytes << "\n";
     out << "- Rank size: " << header.rankSize << "\n";
     out << "- Block dim: " << header.blockDim << "\n\n";
+    if (options.incomplete) {
+        out << "## Trace Status\n\n";
+        out << "- Incomplete trace: " << options.incompleteReason << "\n";
+        out << "- Device-side stats were not copied back; metadata is preserved for run-level diagnostics.\n\n";
+    }
     out << "## Findings\n\n";
     for (const auto &finding : findings) {
         out << "- " << finding << "\n";
@@ -275,7 +288,8 @@ std::string BuildAnalysisMarkdown(const TileXR::TileXRPerfTraceHeader &header,
 std::string BuildHtmlReport(const TileXR::TileXRPerfTraceHeader &header,
                             const std::vector<PerfStageSummary> &summaries,
                             const std::vector<std::string> &findings,
-                            const std::vector<TileXR::TileXRPerfCoreStageStats> &stats)
+                            const std::vector<TileXR::TileXRPerfCoreStageStats> &stats,
+                            const PerfReportOptions &options)
 {
     const auto nonEmptyStats = NonEmptyStats(stats);
     std::ostringstream out;
@@ -288,6 +302,11 @@ std::string BuildHtmlReport(const TileXR::TileXRPerfTraceHeader &header,
     out << "<h2>Bottleneck First</h2>\n";
     out << "<p>Operation: " << EscapeHtml(OpName(header.opType)) << ", message bytes: "
         << header.messageBytes << "</p>\n";
+    if (options.incomplete) {
+        out << "<p><strong>Incomplete trace:</strong> "
+            << EscapeHtml(options.incompleteReason)
+            << ". Device-side stats were not copied back.</p>\n";
+    }
     out << "<ul>\n";
     for (const auto &finding : findings) {
         out << "<li>" << EscapeHtml(finding) << "</li>\n";
@@ -330,7 +349,8 @@ std::string BuildHtmlReport(const TileXR::TileXRPerfTraceHeader &header,
 
 std::string BuildAiPrompt(const TileXR::TileXRPerfTraceHeader &header,
                           const std::vector<PerfStageSummary> &summaries,
-                          const std::vector<std::string> &findings)
+                          const std::vector<std::string> &findings,
+                          const PerfReportOptions &options)
 {
     std::ostringstream out;
     out << "# TileXR collective profiling\n\n";
@@ -338,6 +358,10 @@ std::string BuildAiPrompt(const TileXR::TileXRPerfTraceHeader &header,
     out << "Operation: " << OpName(header.opType) << "\n";
     out << "Message bytes: " << header.messageBytes << "\n";
     out << "Cycle-to-us divisor: " << header.cycleToUsDivisor << "\n\n";
+    if (options.incomplete) {
+        out << "Trace status: incomplete\n";
+        out << "Incomplete reason: " << options.incompleteReason << "\n\n";
+    }
     out << "Findings:\n";
     for (const auto &finding : findings) {
         out << "- " << finding << "\n";
@@ -471,15 +495,18 @@ int WritePerfTraceReports(
     const auto summaries = SummarizePerfTrace(header, stats);
     const auto findings = AnalyzePerfTrace(header, summaries);
 
-    if (!WriteTextFile(JoinPath(options.outputDir, "trace.json"), BuildTraceJson(header, stats)) ||
+    if (!WriteTextFile(JoinPath(options.outputDir, "trace.json"), BuildTraceJson(header, stats, options)) ||
         !WriteTextFile(JoinPath(options.outputDir, "summary.csv"), BuildSummaryCsv(summaries)) ||
-        !WriteTextFile(JoinPath(options.outputDir, "analysis.md"), BuildAnalysisMarkdown(header, summaries, findings)) ||
-        !WriteTextFile(JoinPath(options.outputDir, "report.html"), BuildHtmlReport(header, summaries, findings, stats))) {
+        !WriteTextFile(JoinPath(options.outputDir, "analysis.md"),
+            BuildAnalysisMarkdown(header, summaries, findings, options)) ||
+        !WriteTextFile(JoinPath(options.outputDir, "report.html"),
+            BuildHtmlReport(header, summaries, findings, stats, options))) {
         return TileXR::TILEXR_ERROR_INTERNAL;
     }
 
     if (options.emitAiPrompt &&
-        !WriteTextFile(JoinPath(options.outputDir, "ai_prompt.md"), BuildAiPrompt(header, summaries, findings))) {
+        !WriteTextFile(JoinPath(options.outputDir, "ai_prompt.md"),
+            BuildAiPrompt(header, summaries, findings, options))) {
         return TileXR::TILEXR_ERROR_INTERNAL;
     }
     if (!options.emitAiPrompt && !RemoveIfExists(JoinPath(options.outputDir, "ai_prompt.md"))) {
diff --git a/src/collectives/host/perf_trace_report.h b/src/collectives/host/perf_trace_report.h
index 708ce8d..458b534 100644
--- a/src/collectives/host/perf_trace_report.h
+++ b/src/collectives/host/perf_trace_report.h
@@ -25,6 +25,8 @@ struct PerfStageSummary {
 struct PerfReportOptions {
     std::string outputDir;
     bool emitAiPrompt = false;
+    bool incomplete = false;
+    std::string incompleteReason;
 };
 
 const char *PerfStageName(uint32_t stageId);
diff --git a/src/collectives/host/perf_trace_session.cpp b/src/collectives/host/perf_trace_session.cpp
index 0bf8f69..104b04f 100644
--- a/src/collectives/host/perf_trace_session.cpp
+++ b/src/collectives/host/perf_trace_session.cpp
@@ -2,6 +2,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <exception>
 #include <limits>
 #include <memory>
@@ -81,6 +82,13 @@ bool ComputeRequiredBytes(uint64_t statsOffset, uint64_t statsBytes, size_t *req
     return true;
 }
 
+bool DisableKernelProfiling()
+{
+    const char *value = std::getenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING");
+    return value != nullptr &&
+        (std::string(value) == "1" || std::string(value) == "true" || std::string(value) == "yes");
+}
+
 int64_t ComputeLaunchMessageBytes(TileXR::TileXRType opType, TileXR::TileXRDataType dataType,
                                   int64_t count, int rankSize)
 {
@@ -201,7 +209,7 @@ int PreparePerfTraceLaunch(PerfTraceSession *session, const TileXR::CommArgs &co
         return TileXR::TILEXR_ERROR_INTERNAL;
     }
 
-    session->header.statsOffset = sizeof(TileXR::TileXRPerfTraceHeader);
+    session->header.statsOffset = TileXR::TILEXR_PERF_TRACE_STATS_OFFSET;
     size_t statsBytes = 0;
     if (!ComputeStatsBytes(session->hostStats.size(), &statsBytes)) {
         return TileXR::TILEXR_ERROR_INTERNAL;
@@ -211,6 +219,11 @@ int PreparePerfTraceLaunch(PerfTraceSession *session, const TileXR::CommArgs &co
     if (!ComputeRequiredBytes(session->header.statsOffset, session->header.statsBytes, &requiredBytes)) {
         return TileXR::TILEXR_ERROR_INTERNAL;
     }
+    if (DisableKernelProfiling()) {
+        session->deviceTraceReady = false;
+        *deviceTrace = nullptr;
+        return TileXR::TILEXR_SUCCESS;
+    }
     if (session->deviceBuffer == nullptr || !session->ownsDeviceBuffer || session->deviceBufferBytes < requiredBytes) {
         void *newBuffer = nullptr;
         aclError allocRet = g_runtimeHooks->mallocDevice(&newBuffer, requiredBytes);
@@ -342,3 +355,25 @@ extern "C" int TileXRCollectivePerfWriteReport(TileXRCollectivePerfSession sessi
         return TileXR::TILEXR_ERROR_INTERNAL;
     }
 }
+
+extern "C" int TileXRCollectivePerfWriteIncompleteReport(TileXRCollectivePerfSession session, const char *reason)
+{
+    if (session == nullptr) {
+        return TileXR::TILEXR_ERROR_PARA_CHECK_FAIL;
+    }
+
+    try {
+        TileXRCollectives::Host::PerfTraceSession *impl =
+            static_cast<TileXRCollectives::Host::PerfTraceSession *>(session);
+        TileXRCollectives::Host::PerfReportOptions options {};
+        options.outputDir = impl->outputDir;
+        options.emitAiPrompt = impl->config.emitAiPrompt != 0;
+        options.incomplete = true;
+        options.incompleteReason = reason == nullptr || reason[0] == '\0' ? "kernel launch did not complete" : reason;
+        return TileXRCollectives::Host::WritePerfTraceReports(impl->header, impl->hostStats, options);
+    } catch (const std::exception &) {
+        return TileXR::TILEXR_ERROR_INTERNAL;
+    } catch (...) {
+        return TileXR::TILEXR_ERROR_INTERNAL;
+    }
+}
diff --git a/src/collectives/host/tilexr_collectives.cpp b/src/collectives/host/tilexr_collectives.cpp
index 3da431e..97e9401 100644
--- a/src/collectives/host/tilexr_collectives.cpp
+++ b/src/collectives/host/tilexr_collectives.cpp
@@ -225,3 +225,27 @@ int TileXRBroadcast(void *buf, int64_t count,
         buf, buf, bytes, dataType, blockDim, stream,
         TileXRCollectives::Host::CollectiveLaunchAttrs { 0, root });
 }
+
+int TileXRProfileProbe(void *sendBuf, void *recvBuf, int64_t count,
+                       TileXR::TileXRDataType dataType, TileXRCommPtr comm,
+                       aclrtStream stream)
+{
+    int ret = ValidateCommon(sendBuf, recvBuf, count, dataType, comm);
+    if (ret != TileXR::TILEXR_SUCCESS) {
+        return ret;
+    }
+
+    TileXRCollectives::Host::HostLaunchContext context;
+    ret = TileXRCollectives::Host::PrepareHostLaunchContext(comm, context);
+    if (ret != TileXR::TILEXR_SUCCESS) {
+        return ret;
+    }
+
+    const int64_t bytes = TileXRCollectives::Host::CountToBytes(count, dataType);
+    const uint32_t blockDim = TileXRCollectives::Host::GetProfileProbeBlockNum(*context.hostArgs, bytes);
+    if (blockDim == 0) {
+        return TileXR::TILEXR_ERROR_PARA_CHECK_FAIL;
+    }
+    return TileXRCollectives::Host::LaunchCollectiveKernel(comm, TileXR::TileXRType::PROFILE_PROBE, context,
+        sendBuf, recvBuf, bytes, TileXR::TILEXR_DATA_TYPE_INT8, blockDim, stream);
+}
diff --git a/src/collectives/kernels/CMakeLists.txt b/src/collectives/kernels/CMakeLists.txt
index 4c9e53b..6275270 100644
--- a/src/collectives/kernels/CMakeLists.txt
+++ b/src/collectives/kernels/CMakeLists.txt
@@ -6,8 +6,9 @@
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 enable_language(CCE)
 
-set(TILEXR_COLLECTIVES_1OP_BIN_SIZE 5242880 CACHE STRING "Padded size for TileXR collectives CCE binary")
+set(TILEXR_COLLECTIVES_1OP_BIN_SIZE 10485760 CACHE STRING "Padded size for TileXR collectives CCE binary")
 option(TILEXR_COLLECTIVES_ENABLE_PROFILING "Enable TileXR collectives kernel profiling helpers" OFF)
+set(TILEXR_COLLECTIVES_SOC_TYPE "" CACHE STRING "SOC type used for TileXR collectives CCE kernels")
 
 set(CCE_COMPILE_OPTION
         -O2
@@ -21,7 +22,11 @@ set(CCE_COMPILE_OPTION
         "SHELL:-mllvm --cce-aicore-jump-expand=true"
 )
 
-set(AIV_ARCH dav-c220-vec)
+if(TILEXR_COLLECTIVES_SOC_TYPE STREQUAL "Ascend950")
+    set(TILEXR_COLLECTIVES_AICORE_ARCH dav-c310-vec)
+else()
+    set(TILEXR_COLLECTIVES_AICORE_ARCH dav-c220-vec)
+endif()
 set_source_files_properties(tilexr_lccl_op.cpp PROPERTIES LANGUAGE CCE)
 
 include_directories(
@@ -46,7 +51,7 @@ add_library(tilexr_collectives_op_tmp OBJECT
 
 target_compile_options(tilexr_collectives_op_tmp PRIVATE
         ${CCE_COMPILE_OPTION}
-        --cce-aicore-arch=${AIV_ARCH}
+        --cce-aicore-arch=${TILEXR_COLLECTIVES_AICORE_ARCH}
 )
 
 if(TILEXR_COLLECTIVES_ENABLE_PROFILING)
diff --git a/src/collectives/kernels/collectives.h b/src/collectives/kernels/collectives.h
index 26f3be5..a6d4be5 100644
--- a/src/collectives/kernels/collectives.h
+++ b/src/collectives/kernels/collectives.h
@@ -441,7 +441,7 @@ class Collectives {
     {
         PipeBarrier<PIPE_ALL>();
         if (op != -1) {
-#ifdef __DAV_C220_VEC__
+#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)
             SetAtomicOpType<T>(op);
 #endif
         }
diff --git a/src/collectives/kernels/datacopy_gm2gm.h b/src/collectives/kernels/datacopy_gm2gm.h
index c49e7e0..fbbeb51 100644
--- a/src/collectives/kernels/datacopy_gm2gm.h
+++ b/src/collectives/kernels/datacopy_gm2gm.h
@@ -21,25 +21,38 @@ constexpr int32_t BUFFER_NUM = 1;
 constexpr int32_t TILE_NUM = 2;
 constexpr int32_t BLOCK_SIZE = UB_SINGLE_DMA_SIZE_MAX / TILE_NUM / BUFFER_NUM;
 
+template <typename T>
+struct TileXRAtomicTypeSupported {
+#if defined(__DAV_C310_VEC__)
+    static constexpr bool value = std::is_same_v<T, float> || std::is_same_v<T, float16_t> ||
+        std::is_same_v<T, bfloat16_t> || std::is_same_v<T, int32_t> ||
+        std::is_same_v<T, int16_t> || std::is_same_v<T, int8_t>;
+#else
+    static constexpr bool value = true;
+#endif
+};
+
 template <typename T>
 FORCE_INLINE_AICORE void SetAtomicOpType(int op)
 {
-    switch (op) {
-        case ADD:
-            AscendC::SetAtomicAdd<T>();
-            break;
+    if constexpr (TileXRAtomicTypeSupported<T>::value) {
+        switch (op) {
+            case ADD:
+                AscendC::SetAtomicAdd<T>();
+                break;
 
-        case MUL:
-            break;
-        case MAX:
-            AscendC::SetAtomicMax<T>();
-            break;
-        case MIN:
-            AscendC::SetAtomicMin<T>();
-            break;
-        default:
-            AscendC::SetAtomicNone();
-            ;
+            case MUL:
+                break;
+            case MAX:
+                AscendC::SetAtomicMax<T>();
+                break;
+            case MIN:
+                AscendC::SetAtomicMin<T>();
+                break;
+            default:
+                AscendC::SetAtomicNone();
+                ;
+        }
     }
 }
 
@@ -222,7 +235,7 @@ class DataCopyGM2GM {
     {
         PipeBarrier<PIPE_ALL>();
         if (op != -1) {
-#ifdef __DAV_C220_VEC__
+#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)
             SetAtomicOpType<T>(op);
 #endif
         }
@@ -329,4 +342,4 @@ class DataCopyGM2GM {
     const __gm__ T* outputGm = nullptr;
     int op;
 };
-#endif // LCCL_DATACOPY_GM2GM_H
\ No newline at end of file
+#endif // LCCL_DATACOPY_GM2GM_H
diff --git a/src/collectives/kernels/kernels/collectives.cce b/src/collectives/kernels/kernels/collectives.cce
index ca9570e..1111a77 100644
--- a/src/collectives/kernels/kernels/collectives.cce
+++ b/src/collectives/kernels/kernels/collectives.cce
@@ -10,7 +10,7 @@
 #ifndef TILEXR_KERNEL_COLLECTIVES_CCE
 #define TILEXR_KERNEL_COLLECTIVES_CCE
 
-#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__)
+#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__)
 #define __aicore__
 #define __ubuf__
 #define __gm__
@@ -574,7 +574,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessData(int64_t dataSi
         return;
     }
     AscendC::PipeBarrier<PIPE_ALL>();
-    #ifdef __DAV_C220_VEC__
+    #if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)
     SetAtomicOpType<T>(op);
     #endif
     AscendC::PipeBarrier<PIPE_ALL>();
@@ -617,7 +617,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dat
     }
 
     AscendC::PipeBarrier<PIPE_ALL>();
-#ifdef __DAV_C220_VEC__
+#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)
     SetAtomicOpType<T>(op);
 #endif
     AscendC::PipeBarrier<PIPE_ALL>();
@@ -659,7 +659,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNewNonBarrier(i
 
     AscendC::SetFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
     AscendC::WaitFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
-#ifdef __DAV_C220_VEC__
+#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)
     SetAtomicOpType<T>(op);
 #endif
     AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
diff --git a/src/collectives/kernels/kernels/lcal_profile_probe.cce b/src/collectives/kernels/kernels/lcal_profile_probe.cce
new file mode 100644
index 0000000..96b8b9a
--- /dev/null
+++ b/src/collectives/kernels/kernels/lcal_profile_probe.cce
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2024-2026 TileXR Project
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details.
+ */
+#include <cstdint>
+#include "collectives.cce"
+
+constexpr int64_t TILEXR_PROFILE_PROBE_UB_OFFSET = 64;
+constexpr int64_t TILEXR_PROFILE_PROBE_STATS_GUARD_BYTES = 512;
+constexpr int64_t TILEXR_PROFILE_PROBE_COPY_UB_BYTES =
+    TileXR::TILEXR_PERF_TRACE_STATS_UB_OFFSET - TILEXR_PROFILE_PROBE_UB_OFFSET -
+    TILEXR_PROFILE_PROBE_STATS_GUARD_BYTES;
+
+__attribute__((always_inline)) inline __aicore__ void TileXRProfileProbeCopy(
+    GM_ADDR input, GM_ADDR output, int64_t len, __ubuf__ uint8_t *inputUB)
+{
+    if (input == nullptr || output == nullptr || len <= 0) {
+        return;
+    }
+    const int64_t blockNum = GetBlockNum();
+    if (blockNum <= 0) {
+        return;
+    }
+
+    const int64_t perCore = (len + blockNum - 1) / blockNum;
+    const int64_t start = static_cast<int64_t>(GetBlockIdx()) * perCore;
+    if (start >= len) {
+        return;
+    }
+    int64_t remain = len - start;
+    if (remain > perCore) {
+        remain = perCore;
+    }
+    if (remain > TILEXR_PROFILE_PROBE_COPY_UB_BYTES) {
+        remain = TILEXR_PROFILE_PROBE_COPY_UB_BYTES;
+    }
+
+    CpGM2UB(inputUB, reinterpret_cast<__gm__ uint8_t *>(input) + start, static_cast<uint32_t>(remain));
+    AscendC::SetFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+    CpUB2GM(reinterpret_cast<__gm__ uint8_t *>(output) + start, inputUB, static_cast<uint32_t>(remain));
+    AscendC::SetFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+}
+
+__attribute__((always_inline)) inline __aicore__ void TileXRProfileProbeKernel(
+    KERNELS_ARGS_FUN(), int rank, int rankSize, uint32_t extraFlag)
+{
+    (void)commArgs;
+    (void)magic;
+    (void)op;
+    (void)root;
+    (void)cycleCount;
+    (void)scale;
+    (void)scaleCount;
+    (void)offset;
+    (void)rankSize;
+    (void)extraFlag;
+
+    const uint32_t perfCore = static_cast<uint32_t>(GetBlockIdx());
+    const uint32_t perfRank = static_cast<uint32_t>(rank);
+    __ubuf__ uint8_t *inputUB = reinterpret_cast<__ubuf__ uint8_t *>(TILEXR_PROFILE_PROBE_UB_OFFSET);
+
+    auto kernelTotal = TileXR::TileXRPerfStageBegin(
+        perfTrace, TileXR::PerfStageId::KERNEL_TOTAL, TileXR::PerfBarrierPolicy::NO_BARRIER);
+    auto chunkTotal = TileXR::TileXRPerfStageBegin(
+        perfTrace, TileXR::PerfStageId::CHUNK_TOTAL, TileXR::PerfBarrierPolicy::NO_BARRIER);
+
+    auto postSync = TileXR::TileXRPerfStageBegin(
+        perfTrace, TileXR::PerfStageId::POST_SYNC, TileXR::PerfBarrierPolicy::BARRIERED);
+    AscendC::PipeBarrier<PIPE_ALL>();
+    TileXR::TileXRPerfStageEnd(
+        perfTrace, perfRank, perfCore, TileXR::PerfStageId::POST_SYNC,
+        postSync, TileXR::PerfBarrierPolicy::END_BARRIER_ONLY);
+
+    auto localInputToIpc = TileXR::TileXRPerfStageBegin(
+        perfTrace, TileXR::PerfStageId::LOCAL_INPUT_TO_IPC, TileXR::PerfBarrierPolicy::BARRIERED);
+    TileXRProfileProbeCopy(input, output, len, inputUB);
+    TileXR::TileXRPerfStageEnd(
+        perfTrace, perfRank, perfCore, TileXR::PerfStageId::LOCAL_INPUT_TO_IPC,
+        localInputToIpc, TileXR::PerfBarrierPolicy::END_BARRIER_ONLY);
+
+    const uint64_t pollStart = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    AscendC::PipeBarrier<PIPE_ALL>();
+    const uint64_t pollEnd = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    TileXR::TileXRPerfAccumulateDuration(
+        perfTrace, perfRank, perfCore, TileXR::PerfStageId::FLAG_POLL_WAIT, pollStart, pollEnd);
+
+    auto peerIpcToOutput = TileXR::TileXRPerfStageBegin(
+        perfTrace, TileXR::PerfStageId::PEER_IPC_TO_OUTPUT, TileXR::PerfBarrierPolicy::BARRIERED);
+    TileXRProfileProbeCopy(input, output, len, inputUB);
+    TileXR::TileXRPerfStageEnd(
+        perfTrace, perfRank, perfCore, TileXR::PerfStageId::PEER_IPC_TO_OUTPUT,
+        peerIpcToOutput, TileXR::PerfBarrierPolicy::END_BARRIER_ONLY);
+
+    auto chunkBarrier = TileXR::TileXRPerfStageBegin(
+        perfTrace, TileXR::PerfStageId::CHUNK_BARRIER, TileXR::PerfBarrierPolicy::NO_BARRIER);
+    AscendC::PipeBarrier<PIPE_ALL>();
+    TileXR::TileXRPerfStageEnd(
+        perfTrace, perfRank, perfCore, TileXR::PerfStageId::CHUNK_BARRIER,
+        chunkBarrier, TileXR::PerfBarrierPolicy::NO_BARRIER);
+    TileXR::TileXRPerfStageEnd(
+        perfTrace, perfRank, perfCore, TileXR::PerfStageId::CHUNK_TOTAL,
+        chunkTotal, TileXR::PerfBarrierPolicy::NO_BARRIER);
+    TileXR::TileXRPerfStageEnd(
+        perfTrace, perfRank, perfCore, TileXR::PerfStageId::KERNEL_TOTAL,
+        kernelTotal, TileXR::PerfBarrierPolicy::NO_BARRIER);
+}
diff --git a/src/collectives/kernels/lccl_op.h b/src/collectives/kernels/lccl_op.h
index 47a8178..555d999 100644
--- a/src/collectives/kernels/lccl_op.h
+++ b/src/collectives/kernels/lccl_op.h
@@ -10,7 +10,7 @@
 #ifndef TILEXR_LCCL_OP_H
 #define TILEXR_LCCL_OP_H
 
-#if defined(__DAV_C220_VEC__) || defined(__DAV_C220_CUBE__)
+#if defined(__DAV_C220_VEC__) || defined(__DAV_C220_CUBE__) || defined(__DAV_C310_VEC__)
 
 #include "op_def.h"
 #include "allgather.h"
@@ -49,6 +49,7 @@
 #include "kernels/lcal_broadcast_write.cce"
 #include "kernels/lcal_broadcast_big_data.cce"
 #include "kernels/lcal_all2all_transpose.cce"
+#include "kernels/lcal_profile_probe.cce"
 
 extern "C" __global__ __aicore__ __attribute__((section("Attr_Section_TileXR"))) void TileXRDescriptor() {}
 
@@ -74,6 +75,15 @@ extern "C" __global__ __aicore__ void TileXRBroadcast##suffix(KERNELS_ARGS_FUN()
     } \
 }
 
+#define LCCL_PROFILE_PROBE_FUNC_AUTO_DEF() \
+extern "C" __global__ __aicore__ void TileXRProfileProbe(KERNELS_ARGS_FUN()) \
+{ \
+    if ASCEND_IS_AIV { \
+        GET_COMM_ARGS; \
+        TileXRProfileProbeKernel(KERNELS_ARGS_CALL(), rank, rankSize, extraFlag); \
+    } \
+}
+
 #define LCCL_ALLGATHER_FUNC_AUTO_DEF(type, suffix) \
 extern "C" __global__ __aicore__ void TileXRAllGather_##type##suffix(KERNELS_ARGS_FUN()) { \
     if ASCEND_IS_AIV { \
diff --git a/src/collectives/kernels/perf_trace_kernel.h b/src/collectives/kernels/perf_trace_kernel.h
index 3df62ca..0ce3d35 100644
--- a/src/collectives/kernels/perf_trace_kernel.h
+++ b/src/collectives/kernels/perf_trace_kernel.h
@@ -3,6 +3,7 @@
 
 #include "comm_args.h"
 #include "kernel_operator.h"
+#include "datacopy_gm2gm.h"
 #include "tilexr_perf_trace.h"
 
 namespace TileXR {
@@ -11,6 +12,8 @@ struct TileXRPerfStageToken {
     uint64_t startCycle = 0;
 };
 
+constexpr int64_t TILEXR_PERF_TRACE_STATS_UB_OFFSET = 195616;
+
 #if defined(TILEXR_COLLECTIVES_ENABLE_PROFILING)
 
 __attribute__((always_inline)) inline __aicore__ bool TileXRPerfTraceEnabled(GM_ADDR trace)
@@ -25,15 +28,13 @@ __attribute__((always_inline)) inline __aicore__ __gm__ TileXRPerfCoreStageStats
         return nullptr;
     }
 
-    __gm__ TileXRPerfTraceHeader *header = reinterpret_cast<__gm__ TileXRPerfTraceHeader *>(trace);
-    const size_t slot = PerfTraceStatsOffset(rank, core, stage, header->maxCoreCount, header->stageCount);
-    return reinterpret_cast<__gm__ TileXRPerfCoreStageStats *>(trace + header->statsOffset) + slot;
+    const size_t slot = PerfTraceStatsOffset(rank, core, stage, GetBlockNum(), TILEXR_PERF_STAGE_COUNT);
+    return reinterpret_cast<__gm__ TileXRPerfCoreStageStats *>(trace + TILEXR_PERF_TRACE_STATS_OFFSET) + slot;
 }
 
 __attribute__((always_inline)) inline __aicore__ TileXRPerfStageToken TileXRPerfStageBegin(
     GM_ADDR trace, PerfStageId stage, PerfBarrierPolicy policy)
 {
-    (void)stage;
     TileXRPerfStageToken token {};
     if (trace == nullptr) {
         return token;
@@ -41,12 +42,14 @@ __attribute__((always_inline)) inline __aicore__ TileXRPerfStageToken TileXRPerf
     if (policy == PerfBarrierPolicy::BARRIERED) {
         AscendC::PipeBarrier<PIPE_ALL>();
     }
+    (void)stage;
     token.startCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
     return token;
 }
 
 __attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDuration(
-    GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle)
+    GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle,
+    __ubuf__ TileXRPerfCoreStageStats *statsUB)
 {
     if (endCycle < startCycle) {
         return;
@@ -58,35 +61,55 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDurati
         return;
     }
 
+    CpGM2UB(reinterpret_cast<__ubuf__ uint8_t *>(statsUB),
+        reinterpret_cast<__gm__ uint8_t *>(slot), sizeof(TileXRPerfCoreStageStats));
+    AscendC::SetFlag<AscendC::HardEvent::MTE2_S>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::MTE2_S>(EVENT_ID0);
+
     const uint64_t duration = endCycle - startCycle;
-    slot->rank = rank;
-    slot->core = core;
-    slot->stageId = stageId;
-    if (slot->count == 0) {
-        slot->minCycles = duration;
-        slot->maxCycles = duration;
-        slot->firstStartCycle = startCycle;
+    statsUB->rank = rank;
+    statsUB->core = core;
+    statsUB->stageId = stageId;
+    statsUB->reserved = 0;
+    if (statsUB->count == 0) {
+        statsUB->minCycles = duration;
+        statsUB->maxCycles = duration;
+        statsUB->firstStartCycle = startCycle;
     } else {
-        if (duration < slot->minCycles) {
-            slot->minCycles = duration;
+        if (duration < statsUB->minCycles) {
+            statsUB->minCycles = duration;
         }
-        if (duration > slot->maxCycles) {
-            slot->maxCycles = duration;
+        if (duration > statsUB->maxCycles) {
+            statsUB->maxCycles = duration;
         }
-        if (startCycle < slot->firstStartCycle) {
-            slot->firstStartCycle = startCycle;
+        if (startCycle < statsUB->firstStartCycle) {
+            statsUB->firstStartCycle = startCycle;
         }
     }
-    slot->count += 1;
-    slot->sumCycles += duration;
-    if (endCycle > slot->lastEndCycle) {
-        slot->lastEndCycle = endCycle;
+    statsUB->count += 1;
+    statsUB->sumCycles += duration;
+    if (endCycle > statsUB->lastEndCycle) {
+        statsUB->lastEndCycle = endCycle;
     }
+    AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
+    CpUB2GM(reinterpret_cast<__gm__ uint8_t *>(slot),
+        reinterpret_cast<__ubuf__ uint8_t *>(statsUB), sizeof(TileXRPerfCoreStageStats));
+    AscendC::SetFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+}
+
+__attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDuration(
+    GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle)
+{
+    TileXRPerfAccumulateDuration(
+        trace, rank, core, stage, startCycle, endCycle,
+        reinterpret_cast<__ubuf__ TileXRPerfCoreStageStats *>(TILEXR_PERF_TRACE_STATS_UB_OFFSET));
 }
 
 __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd(
     GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token,
-    PerfBarrierPolicy policy)
+    PerfBarrierPolicy policy, __ubuf__ TileXRPerfCoreStageStats *statsUB)
 {
     if (trace == nullptr) {
         return;
@@ -95,7 +118,16 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd(
         AscendC::PipeBarrier<PIPE_ALL>();
     }
     const uint64_t endCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
-    TileXRPerfAccumulateDuration(trace, rank, core, stage, token.startCycle, endCycle);
+    TileXRPerfAccumulateDuration(trace, rank, core, stage, token.startCycle, endCycle, statsUB);
+}
+
+__attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd(
+    GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token,
+    PerfBarrierPolicy policy)
+{
+    TileXRPerfStageEnd(
+        trace, rank, core, stage, token, policy,
+        reinterpret_cast<__ubuf__ TileXRPerfCoreStageStats *>(TILEXR_PERF_TRACE_STATS_UB_OFFSET));
 }
 
 #else
@@ -136,6 +168,14 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDurati
     (void)endCycle;
 }
 
+__attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDuration(
+    GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle,
+    __ubuf__ TileXRPerfCoreStageStats *statsUB)
+{
+    (void)statsUB;
+    TileXRPerfAccumulateDuration(trace, rank, core, stage, startCycle, endCycle);
+}
+
 __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd(
     GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token,
     PerfBarrierPolicy policy)
@@ -148,6 +188,14 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd(
     (void)policy;
 }
 
+__attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd(
+    GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token,
+    PerfBarrierPolicy policy, __ubuf__ TileXRPerfCoreStageStats *statsUB)
+{
+    (void)statsUB;
+    TileXRPerfStageEnd(trace, rank, core, stage, token, policy);
+}
+
 #endif
 
 } // namespace TileXR
diff --git a/src/collectives/kernels/tilexr_lccl_op.cpp b/src/collectives/kernels/tilexr_lccl_op.cpp
index 75c2272..10fa497 100644
--- a/src/collectives/kernels/tilexr_lccl_op.cpp
+++ b/src/collectives/kernels/tilexr_lccl_op.cpp
@@ -1,4 +1,4 @@
-#ifdef __DAV_C220_VEC__
+#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)
 
 #include "lccl_op.h"
 
@@ -7,5 +7,6 @@ LCCL_TYPE_AIV_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF);
 LCCL_TYPE_AIV_FUNC(LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF);
 LCCL_TYPE_AIV_FUNC(LCCL_ALL2ALL_FUNC_AUTO_DEF);
 LCCL_BROADCAST_FUNC_AUTO_DEF();
+LCCL_PROFILE_PROBE_FUNC_AUTO_DEF();
 
 #endif
diff --git a/src/comm/tilexr_internal.cpp b/src/comm/tilexr_internal.cpp
index 1e2bc34..cbc5146 100644
--- a/src/comm/tilexr_internal.cpp
+++ b/src/comm/tilexr_internal.cpp
@@ -39,7 +39,9 @@ const std::unordered_map<std::string, ChipName> CHIP_MAP = {
     {"Ascend950DT", ChipName::CHIP_950},
     {"Ascend950DT_9581", ChipName::CHIP_950},
     {"Ascend950DT_9584", ChipName::CHIP_950},
-    {"Ascend950PR", ChipName::CHIP_950}
+    {"Ascend950PR", ChipName::CHIP_950},
+    {"Ascend950PR_9589", ChipName::CHIP_950},
+    {"Ascend950PR_9599", ChipName::CHIP_950}
 };
 
 /**
diff --git a/src/include/tilexr_collectives.h b/src/include/tilexr_collectives.h
index 9faf687..50fa782 100644
--- a/src/include/tilexr_collectives.h
+++ b/src/include/tilexr_collectives.h
@@ -35,6 +35,9 @@ int TileXRReduceScatter(void *sendBuf, void *recvBuf, int64_t recvCount,
 int TileXRBroadcast(void *buf, int64_t count,
                     TileXR::TileXRDataType dataType, int root,
                     TileXRCommPtr comm, aclrtStream stream);
+int TileXRProfileProbe(void *sendBuf, void *recvBuf, int64_t count,
+                       TileXR::TileXRDataType dataType, TileXRCommPtr comm,
+                       aclrtStream stream);
 
 }
 
diff --git a/src/include/tilexr_collectives_perf.h b/src/include/tilexr_collectives_perf.h
index 42773ed..ee648b6 100644
--- a/src/include/tilexr_collectives_perf.h
+++ b/src/include/tilexr_collectives_perf.h
@@ -20,6 +20,7 @@ int TileXRCollectivePerfSessionCreate(const TileXRCollectivePerfConfig *config,
 int TileXRCollectivePerfSessionDestroy(TileXRCollectivePerfSession session);
 int TileXRCollectivePerfSetActiveSession(TileXRCollectivePerfSession session);
 int TileXRCollectivePerfWriteReport(TileXRCollectivePerfSession session);
+int TileXRCollectivePerfWriteIncompleteReport(TileXRCollectivePerfSession session, const char *reason);
 
 #ifdef __cplusplus
 }
diff --git a/src/include/tilexr_perf_trace.h b/src/include/tilexr_perf_trace.h
index c0b1e74..78d91fd 100644
--- a/src/include/tilexr_perf_trace.h
+++ b/src/include/tilexr_perf_trace.h
@@ -16,6 +16,7 @@ constexpr uint32_t TILEXR_PERF_TRACE_MAGIC = 0x54585054u; // TXPT
 constexpr uint32_t TILEXR_PERF_TRACE_VERSION = 1u;
 constexpr uint32_t TILEXR_PERF_MAX_STAGE_NAME = 32u;
 constexpr uint32_t TILEXR_PERF_STAGE_COUNT = 7u;
+constexpr uint32_t TILEXR_PERF_TRACE_STATS_OFFSET = 128u;
 
 enum class PerfChipClass : uint32_t {
     GENERIC = 0,
@@ -52,7 +53,7 @@ struct TileXRPerfTraceHeader {
     uint32_t headerSize = sizeof(TileXRPerfTraceHeader);
     // Stage descriptions are static schema metadata; this records their ABI size.
     uint32_t stageDescSize = sizeof(uint32_t) * 4 + TILEXR_PERF_MAX_STAGE_NAME;
-    uint32_t coreStageStatsSize = sizeof(uint32_t) * 4 + sizeof(uint64_t) * 8;
+    uint32_t coreStageStatsSize = sizeof(uint32_t) * 4 + sizeof(uint64_t) * 10;
     uint32_t flags = 0;
     uint32_t rank = 0;
     uint32_t rankSize = 0;
@@ -89,6 +90,8 @@ struct TileXRPerfCoreStageStats {
     uint64_t lastEndCycle = 0;
     uint64_t aux0 = 0;
     uint64_t aux1 = 0;
+    uint64_t aux2 = 0;
+    uint64_t aux3 = 0;
 };
 
 TILEXR_PERF_TRACE_INLINE size_t PerfTraceStatsOffset(uint32_t rank, uint32_t core, uint32_t stage,
diff --git a/src/include/tilexr_types.h b/src/include/tilexr_types.h
index 7fda01e..ac53458 100644
--- a/src/include/tilexr_types.h
+++ b/src/include/tilexr_types.h
@@ -102,6 +102,7 @@ enum class TileXRType {
     LOCAL_REDUCE = 8,
     SEND = 9,
     RECV = 10,
+    PROFILE_PROBE = 11,
     PURE_MATMUL = 101,
     MATMUL_ALL_REDUCE = 102,
     MATMUL_REDUCE_SCATTER = 103,
@@ -144,6 +145,7 @@ const std::map<TileXRType, std::string> TILEXR_TYPE2NAME = {
     { TileXRType::GATHER, "TileXRGather" },
     { TileXRType::SEND, "TileXRSend" },
     { TileXRType::RECV, "TileXRRecv" },
+    { TileXRType::PROFILE_PROBE, "TileXRProfileProbe" },
     { TileXRType::ALLTOALLV_ALLGATHER_MATMUL, "TileXRAllToAllVAllGatherMatmul" },
     { TileXRType::MATMUL_REDUCESCATTER_ALLTOALLV, "TileXRMatmulReduceScatterAllToAllV" },
 
diff --git a/tests/collectives/CMakeLists.txt b/tests/collectives/CMakeLists.txt
index 65aac8c..93e3b05 100644
--- a/tests/collectives/CMakeLists.txt
+++ b/tests/collectives/CMakeLists.txt
@@ -232,6 +232,13 @@ set(TILEXR_COLLECTIVES_TEST_LINK_DIRS
     ${ASCEND_DRIVER_PATH}/lib64/driver
 )
 
+function(TileXRAddAscendRpathLink target_name)
+    target_link_options(${target_name} PRIVATE
+        "LINKER:-rpath-link,${ASCEND_HOME_PATH}/${ARCH}-linux/lib64"
+        "LINKER:-rpath-link,${ASCEND_DRIVER_PATH}/lib64/driver"
+    )
+endfunction()
+
 target_link_libraries(test_tilexr_collectives_header_compile
     ${TILEXR_COLLECTIVES_TEST_TARGET}
 )
@@ -286,6 +293,21 @@ target_link_libraries(test_prepare_host_launch_context
     ${TILEXR_COLLECTIVES_TEST_TARGET}
 )
 
+foreach(_tilexr_collectives_link_target
+        test_tilexr_collectives_header_compile
+        test_tilexr_perf_trace_layout
+        test_collective_perf_report
+        test_collective_perf_session
+        test_tilexr_collectives_correctness
+        tilexr_collective_perf
+        test_tilexr_collectives_stub_behavior
+        test_tilexr_collectives_uninitialized_comm
+        test_collective_host_utils
+        test_tilexr_comm_next_magic
+        test_prepare_host_launch_context)
+    TileXRAddAscendRpathLink(${_tilexr_collectives_link_target})
+endforeach()
+
 add_test(NAME test_tilexr_collectives_api COMMAND test_tilexr_collectives_api)
 add_test(NAME test_tilexr_collectives_kernel_ownership COMMAND test_tilexr_collectives_kernel_ownership)
 add_test(NAME test_tilexr_collectives_tools_sources COMMAND test_tilexr_collectives_tools_sources)
@@ -332,6 +354,7 @@ install(TARGETS
 install(PROGRAMS
     run_collectives_correctness.sh
     run_collective_perf.sh
+    run_collective_perf_multihost.sh
     tilexr_collective_profile_report.py
     DESTINATION ${CMAKE_INSTALL_BINDIR}
 )
diff --git a/tests/collectives/README.md b/tests/collectives/README.md
index 42c7d9e..df6348d 100644
--- a/tests/collectives/README.md
+++ b/tests/collectives/README.md
@@ -58,7 +58,7 @@ cd tests/collectives
   --iters 20 --warmup-iters 5 --datatype int32 --check 1
 ```
 
-Main options are `--op allgather|alltoall|allreduce|reducescatter|broadcast`, `--min-bytes`, `--max-bytes`, `--step-factor`, `--iters`,
+Main options are `--op allgather|alltoall|allreduce|reducescatter|broadcast|profile-probe`, `--min-bytes`, `--max-bytes`, `--step-factor`, `--iters`,
 `--warmup-iters`, `--datatype int8|int16|int32|int64|fp16|fp32|bf16`, `--rank-size`, `--rank`,
 `--first-npu`, `--check 0|1`, and `--csv <path>`. Optional thresholds `--min-algbw` and
 `--max-latency-us` are available but are not set by default.
@@ -105,13 +105,15 @@ After all rank processes finish successfully, `run_collective_perf.sh` also writ
 run/prof/collectives/report.html
 run/prof/collectives/trace_index.json
 run/prof/collectives/analysis.md
+run/prof/collectives/perfetto_trace.json
 ```
 
 When prompt export is enabled, the aggregate prompt is written as `run/prof/collectives/ai_prompt.md`.
-The root-level report.html keeps the bottleneck-first summary and adds a zoomable chronological timeline across
-sampled measured iterations. Warmup execution is controlled by the existing `--warmup-iters` option and is reported
-as metadata; warmup launches are not profiled by this report path. The per-launch `rank<N>/launch<M>/report.html`
-files remain available for drilldown.
+The root-level report.html keeps the bottleneck-first summary, adds a rank-level summary for spotting slow ranks,
+and adds a zoomable chronological timeline across sampled measured iterations. `perfetto_trace.json` uses
+launch/rank/stage event names plus per-rank launch windows so `ui.perfetto.dev` can quickly filter by launch or rank.
+Warmup execution is controlled by the existing `--warmup-iters` option and is reported as metadata; warmup launches are not profiled by this report path.
+The per-launch `rank<N>/launch<M>/report.html` files remain available for drilldown.
 
 To regenerate the aggregate report from an existing profile directory:
 
@@ -120,6 +122,35 @@ python3 tilexr_collective_profile_report.py run/prof/collectives \
   --warmup-iters 5 --iters 20 --profile-sample-every 1 --emit-ai-prompt
 ```
 
+### Multi-Host Profiling
+
+`tilexr_collective_perf` also supports a socket bootstrap mode for one process per host. The helper script below
+starts each rank over SSH with `--comm-mode socket`, collects `rank<N>/` profile directories back to the first host,
+and then writes a single aggregate `report.html` plus `perfetto_trace.json` containing host/rank labels.
+
+Use `--op profile-probe` first as a multi-host profiling/report smoke mode. It initializes ACL and the socket
+communicator on all hosts, launches a real AIV kernel on each rank, and records the same kernel-level stages as the
+single-host collective report. The probe only performs local device-memory copies and does not touch cross-host
+`peerMems[]`, so it is useful for two-host profiling/report validation but should not be reported as allgather
+cross-host bandwidth.
+
+```bash
+cd tests/collectives
+TILEXR_MULTIHOST_PEERS="0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0" \
+TILEXR_COMM_ID=141.62.24.62:10067 \
+TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC=300 \
+bash ./run_collective_perf_multihost.sh /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 \
+  /home/l00929943/TileXR/build-profile-950/tests/collectives \
+  --op profile-probe --min-bytes 4096 --max-bytes 4096 \
+  --iters 2 --warmup-iters 1 --datatype int32 --check 0 \
+  --profile 1 --profile-sample-every 1 --profile-ai-prompt 1
+```
+
+The current report normalizes each rank/launch independently because cross-host device cycles are not assumed to be
+synchronized. Use the HTML rank-level summary to spot slow host/rank pairs, then drill into `rank<N>/launch<M>/`.
+The helper labels hosts from the SSH target/IP in `TILEXR_MULTIHOST_PEERS`, so Perfetto event names look like
+`launch0/rank1@141.62.24.70/kernel_total` in the example above.
+
 ## Skip Behavior
 
 Manual scripts are strict by default. If `npu-smi info -l` or `TILEXR_AVAILABLE_NPUS` reports too few devices,
diff --git a/tests/collectives/run_collective_perf_multihost.sh b/tests/collectives/run_collective_perf_multihost.sh
new file mode 100644
index 0000000..b526ffb
--- /dev/null
+++ b/tests/collectives/run_collective_perf_multihost.sh
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat >&2 <<'EOF'
+Usage:
+  TILEXR_MULTIHOST_PEERS="rank,host,ip,device;rank,host,ip,device" \
+  run_collective_perf_multihost.sh profile_dir bin_dir [extra tilexr_collective_perf args...]
+
+Example:
+  TILEXR_MULTIHOST_PEERS="0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0" \
+  TILEXR_COMM_ID=141.62.24.62:10067 \
+  bash run_collective_perf_multihost.sh /home/l00929943/TileXR/run/prof/collectives-2host \
+    /home/l00929943/TileXR/build-profile-950/tests/collectives \
+    --op allgather --min-bytes 4096 --max-bytes 4096 --iters 2 --warmup-iters 1 \
+    --datatype int32 --check 0 --profile 1 --profile-sample-every 1 --profile-ai-prompt 1
+
+Each peer entry is rank,ssh_target,host_ip,device_id. The first entry is used as rank0/server.
+EOF
+}
+
+if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+  usage
+  exit 0
+fi
+
+profile_dir="${1:?profile_dir required}"
+bin_dir="${2:?bin_dir required}"
+shift 2
+
+peers_spec="${TILEXR_MULTIHOST_PEERS:?TILEXR_MULTIHOST_PEERS is required}"
+comm_id="${TILEXR_COMM_ID:-141.62.24.62:10067}"
+timeout_sec="${TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC:-600}"
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+helper="${script_dir}/tilexr_collective_profile_report.py"
+rank_size=0
+warmup_iters=5
+measured_iters=20
+profile_sample_every=1
+profile_ai_prompt=0
+
+is_true_bool() {
+  [[ "${1:-}" == "1" || "${1:-}" == "true" || "${1:-}" == "yes" ]]
+}
+
+parse_profile_args() {
+  local args=("$@")
+  local i
+  for ((i = 0; i < ${#args[@]}; i++)); do
+    case "${args[$i]}" in
+      --warmup-iters)
+        if (( i + 1 < ${#args[@]} )); then warmup_iters="${args[$((i + 1))]}"; fi
+        ;;
+      --iters)
+        if (( i + 1 < ${#args[@]} )); then measured_iters="${args[$((i + 1))]}"; fi
+        ;;
+      --profile-sample-every)
+        if (( i + 1 < ${#args[@]} )); then profile_sample_every="${args[$((i + 1))]}"; fi
+        ;;
+      --profile-ai-prompt)
+        if (( i + 1 < ${#args[@]} )); then profile_ai_prompt="${args[$((i + 1))]}"; fi
+        ;;
+    esac
+  done
+}
+
+parse_profile_args "$@"
+
+IFS=';' read -r -a peers <<< "${peers_spec}"
+rank_size="${#peers[@]}"
+if (( rank_size < 2 )); then
+  echo "ERROR: TILEXR_MULTIHOST_PEERS must contain at least two peers" >&2
+  exit 1
+fi
+
+mkdir -p "${profile_dir}"
+
+ssh_pids=()
+logs=()
+targets=()
+ranks=()
+
+cleanup() {
+  local pid
+  for pid in "${ssh_pids[@]:-}"; do
+    kill "${pid}" 2>/dev/null || true
+  done
+}
+trap cleanup INT TERM
+
+copy_rank_profile() {
+  local target="$1"
+  local rank="$2"
+  local dest="${profile_dir}/rank${rank}"
+  mkdir -p "${dest}"
+
+  if command -v rsync >/dev/null 2>&1 &&
+     ssh -o BatchMode=yes "${target}" "command -v rsync >/dev/null 2>&1"; then
+    rsync -a -e "ssh -o BatchMode=yes" "${target}:${profile_dir}/rank${rank}/" "${dest}/"
+    return
+  fi
+
+  ssh -o BatchMode=yes "${target}" bash -s -- "${profile_dir}" "${rank}" <<'REMOTE' | tar -xf - -C "${dest}"
+set -euo pipefail
+profile_dir="$1"
+rank="$2"
+cd "${profile_dir}/rank${rank}"
+tar -cf - .
+REMOTE
+}
+
+launch_rank() {
+  local rank="$1"
+  local target="$2"
+  local host_ip="$3"
+  local device_id="$4"
+  local host_label="${target#*@}"
+  shift 4
+  local log="${profile_dir}/multihost_rank${rank}.log"
+  logs+=("${log}")
+  targets+=("${target}")
+  ranks+=("${rank}")
+  if [[ -z "${host_label}" || "${host_label}" == "${target}" ]]; then
+    host_label="${host_ip}"
+  fi
+
+  ssh -o BatchMode=yes "${target}" bash -s -- \
+    "${rank_size}" "${rank}" "${device_id}" "${comm_id}" "${profile_dir}" "${bin_dir}" "${host_ip}" "${host_label}" "$@" >"${log}" 2>&1 <<'REMOTE' &
+set -euo pipefail
+rank_size="$1"
+rank="$2"
+device_id="$3"
+comm_id="$4"
+profile_dir="$5"
+bin_dir="$6"
+host_ip="$7"
+host_label="$8"
+shift 8
+
+cd /home/l00929943/TileXR
+set +u
+source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true
+conda activate pt311 2>/dev/null || true
+source /usr/local/Ascend/cann/set_env.sh
+set -u
+export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-}
+export ASCEND_PROCESS_LOG_PATH="${profile_dir}/plog/rank${rank}"
+export ASCEND_GLOBAL_LOG_LEVEL="${ASCEND_GLOBAL_LOG_LEVEL:-3}"
+mkdir -p "${ASCEND_PROCESS_LOG_PATH}"
+export TILEXR_COMM_ID="${comm_id}"
+export TILEXR_PROFILE_HOST="${host_label}"
+export TILEXR_PROFILE_HOST_IP="${host_ip}"
+
+"${bin_dir}/tilexr_collective_perf" \
+  --rank-size "${rank_size}" \
+  --rank "${rank}" \
+  --device-id "${device_id}" \
+  --comm-mode socket \
+  --profile-dir "${profile_dir}" \
+  "$@"
+REMOTE
+  ssh_pids+=("$!")
+}
+
+for peer in "${peers[@]}"; do
+  IFS=',' read -r rank target host_ip device_id <<< "${peer}"
+  if [[ -z "${rank:-}" || -z "${target:-}" || -z "${host_ip:-}" || -z "${device_id:-}" ]]; then
+    echo "ERROR: invalid peer entry '${peer}', expected rank,target,ip,device" >&2
+    exit 1
+  fi
+  launch_rank "${rank}" "${target}" "${host_ip}" "${device_id}" "$@"
+done
+
+sleep "${timeout_sec}" >/dev/null 2>&1 &
+watchdog_pid="$!"
+
+completed=0
+while (( completed < rank_size )); do
+  if wait -n; then
+    if ! kill -0 "${watchdog_pid}" 2>/dev/null; then
+      echo "ERROR: timed out after ${timeout_sec}s" >&2
+      cleanup
+      exit 124
+    fi
+    completed=$((completed + 1))
+  else
+    rc="$?"
+    echo "ERROR: remote rank failed with ${rc}" >&2
+    for log in "${logs[@]}"; do
+      echo "===== ${log} =====" >&2
+      tail -n 120 "${log}" >&2 || true
+    done
+    cleanup
+    exit "${rc}"
+  fi
+done
+
+kill "${watchdog_pid}" 2>/dev/null || true
+wait "${watchdog_pid}" 2>/dev/null || true
+trap - INT TERM
+
+for i in "${!targets[@]}"; do
+  target="${targets[$i]}"
+  rank="${ranks[$i]}"
+  copy_rank_profile "${target}" "${rank}"
+done
+
+prompt_args=()
+if is_true_bool "${profile_ai_prompt}"; then
+  prompt_args+=(--emit-ai-prompt)
+fi
+
+python3 "${helper}" "${profile_dir}" \
+  --warmup-iters "${warmup_iters}" \
+  --iters "${measured_iters}" \
+  --profile-sample-every "${profile_sample_every}" \
+  "${prompt_args[@]}"
diff --git a/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp b/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp
index 0ee88bd..ff16b93 100644
--- a/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp
+++ b/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp
@@ -22,6 +22,10 @@
 #include <string>
 #include <vector>
 
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
 #include "acl/acl.h"
 #include "tilexr_collectives.h"
 #include "tilexr_collectives_perf.h"
@@ -37,6 +41,13 @@ enum class CollectiveOp {
     ALLREDUCE,
     REDUCESCATTER,
     BROADCAST,
+    NOOP,
+    PROFILE_PROBE,
+};
+
+enum class CommMode {
+    LOCAL,
+    SOCKET,
 };
 
 struct DataTypeInfo {
@@ -56,6 +67,8 @@ struct Options {
     int rankSize = 2;
     int rank = 0;
     int firstNpu = 0;
+    int deviceIdOverride = -1;
+    CommMode commMode = CommMode::LOCAL;
     bool check = true;
     std::string csvPath;
     double minAlgBw = -1.0;
@@ -88,12 +101,14 @@ struct Row {
 };
 
 std::string JoinPath(const std::string &base, const std::string &leaf);
+bool CreateDirectories(const std::string &path);
+bool WriteProfileHostInfo(const Options &options);
 std::string ResolveProfileOutputDir(const Options &options, uint64_t profileLaunchIndex);
 bool ProfileThisLaunch(const Options &options, uint64_t profileLaunchIndex);
 bool StartPerfSessionForLaunch(const Options &options, uint64_t profileLaunchIndex,
                                TileXRCollectivePerfSession &perfSession);
 void FinishPerfSession(TileXRCollectivePerfSession &perfSession, const Options &options, aclrtStream stream,
-                       int &totalErrors, bool skipWriteReport);
+                       int &totalErrors, bool skipWriteReport, const std::string &skipReason);
 
 using TileXRCollectivesTest::CanUseCollisionFreeInt32Pattern;
 using TileXRCollectivesTest::ExpectedAllGatherValue;
@@ -117,6 +132,21 @@ std::string OpName(CollectiveOp op)
             return "reducescatter";
         case CollectiveOp::BROADCAST:
             return "broadcast";
+        case CollectiveOp::NOOP:
+            return "noop";
+        case CollectiveOp::PROFILE_PROBE:
+            return "profile-probe";
+    }
+    return "unknown";
+}
+
+std::string CommModeName(CommMode mode)
+{
+    switch (mode) {
+        case CommMode::LOCAL:
+            return "local";
+        case CommMode::SOCKET:
+            return "socket";
     }
     return "unknown";
 }
@@ -161,13 +191,14 @@ void PrintUsage(const char *program)
 {
     std::cerr
         << "Usage: " << program << " [options]\n"
-        << "  --op allgather|alltoall|allreduce|reducescatter|broadcast\n"
+        << "  --op allgather|alltoall|allreduce|reducescatter|broadcast|profile-probe\n"
         << "  Message-size semantics: allgather/allreduce/broadcast: count * dtype_size; "
            "alltoall/reducescatter: count * rank_size * dtype_size\n"
         << "  --min-bytes N --max-bytes N --step-factor F\n"
         << "  --iters N --warmup-iters N\n"
         << "  --datatype int8|int16|int32|int64|fp16|fp32|bf16\n"
         << "  --rank-size N --rank R --first-npu D\n"
+        << "  [--device-id D] [--comm-mode local|socket]\n"
         << "  --check 0|1 [--csv path]\n"
         << "  [--min-algbw GB/s] [--max-latency-us us]\n"
         << "  [--profile 0|1] [--profile-dir path]\n"
@@ -209,6 +240,19 @@ bool ParseDataType(const std::string &value, DataTypeInfo &info)
     return true;
 }
 
+bool ParseCommMode(const std::string &value, CommMode &mode)
+{
+    if (value == "local") {
+        mode = CommMode::LOCAL;
+        return true;
+    }
+    if (value == "socket") {
+        mode = CommMode::SOCKET;
+        return true;
+    }
+    return false;
+}
+
 bool ParseInt64(const std::string &text, int64_t &out)
 {
     char *end = nullptr;
@@ -300,6 +344,10 @@ bool ComputeMessageSizes(const Options &options, int64_t count, int64_t &sendEle
             break;
         case CollectiveOp::BROADCAST:
             break;
+        case CollectiveOp::NOOP:
+            break;
+        case CollectiveOp::PROFILE_PROBE:
+            break;
     }
     return CheckedBytesForElements(sendElements, options.dtype.bytes, sendBytes) &&
         CheckedBytesForElements(recvElements, options.dtype.bytes, recvBytes) &&
@@ -328,6 +376,7 @@ bool ValidateMaxMessageSize(const Options &options)
         return false;
     }
     if (options.check && options.dtype.name == "int32" &&
+        options.op != CollectiveOp::NOOP && options.op != CollectiveOp::PROFILE_PROBE &&
         !CanUseCollisionFreeInt32Pattern(options.rankSize, validationCount)) {
         std::cerr << "ERROR: message size is too large for collision-free INT32 validation" << std::endl;
         return false;
@@ -399,9 +448,14 @@ bool ParseOptions(int argc, char **argv, Options &options)
                 options.op = CollectiveOp::REDUCESCATTER;
             } else if (op == "broadcast") {
                 options.op = CollectiveOp::BROADCAST;
+            } else if (op == "noop") {
+                options.op = CollectiveOp::NOOP;
+            } else if (op == "profile-probe") {
+                options.op = CollectiveOp::PROFILE_PROBE;
             } else {
-                std::cerr << "ERROR: --op must be allgather, alltoall, allreduce, reducescatter, or broadcast"
-                          << std::endl;
+                std::cerr
+                    << "ERROR: --op must be allgather, alltoall, allreduce, reducescatter, broadcast, or profile-probe"
+                    << std::endl;
                 return false;
             }
         } else if (arg == "--min-bytes") {
@@ -482,6 +536,21 @@ bool ParseOptions(int argc, char **argv, Options &options)
                 std::cerr << "ERROR: invalid --first-npu" << std::endl;
                 return false;
             }
+        } else if (arg == "--device-id") {
+            const char *value = requireValue(arg);
+            if (value == nullptr) {
+                return false;
+            }
+            if (!ParseInt(value, options.deviceIdOverride)) {
+                std::cerr << "ERROR: invalid --device-id" << std::endl;
+                return false;
+            }
+        } else if (arg == "--comm-mode") {
+            const char *value = requireValue(arg);
+            if (value == nullptr || !ParseCommMode(value, options.commMode)) {
+                std::cerr << "ERROR: --comm-mode must be local or socket" << std::endl;
+                return false;
+            }
         } else if (arg == "--check") {
             const char *value = requireValue(arg);
             if (value == nullptr || !ParseBool(value, options.check)) {
@@ -547,7 +616,8 @@ bool ParseOptions(int argc, char **argv, Options &options)
 
     if (options.minBytes <= 0 || options.maxBytes < options.minBytes || options.stepFactor <= 1.0 ||
         options.iters <= 0 || options.warmupIters < 0 || options.rankSize <= 0 ||
-        options.rank < 0 || options.rank >= options.rankSize || options.firstNpu < 0) {
+        options.rank < 0 || options.rank >= options.rankSize || options.firstNpu < 0 ||
+        options.deviceIdOverride < -1) {
         std::cerr << "ERROR: invalid option value" << std::endl;
         return false;
     }
@@ -604,6 +674,16 @@ bool CallCollective(const Options &options, void *sendBuf, void *recvBuf, int64_
         case CollectiveOp::BROADCAST:
             return CheckTileXR(options.rank, "TileXRBroadcast",
                 TileXRBroadcast(sendBuf, count, options.dtype.type, 0, comm, stream));
+        case CollectiveOp::NOOP:
+            (void)sendBuf;
+            (void)recvBuf;
+            (void)count;
+            (void)comm;
+            (void)stream;
+            return true;
+        case CollectiveOp::PROFILE_PROBE:
+            return CheckTileXR(options.rank, "TileXRProfileProbe",
+                TileXRProfileProbe(sendBuf, recvBuf, count, options.dtype.type, comm, stream));
     }
     return false;
 }
@@ -638,6 +718,13 @@ bool FillPattern(const Options &options, int64_t count, std::vector<uint8_t> &se
                     StoreInt32(send, i, value);
                 }
                 break;
+            case CollectiveOp::PROFILE_PROBE:
+                for (int64_t i = 0; i < count; ++i) {
+                    StoreInt32(send, i, ExpectedAllGatherValue(options.rankSize, options.rank, i));
+                }
+                break;
+            case CollectiveOp::NOOP:
+                break;
         }
         std::fill(recv.begin(), recv.end(), 0xff);
         return true;
@@ -666,6 +753,13 @@ bool FillPattern(const Options &options, int64_t count, std::vector<uint8_t> &se
                 send[i] = options.rank == 0 ? PatternByte(0, 0, static_cast<int64_t>(i)) : 0xff;
             }
             break;
+        case CollectiveOp::PROFILE_PROBE:
+            for (size_t i = 0; i < send.size(); ++i) {
+                send[i] = PatternByte(options.rank, 0, static_cast<int64_t>(i));
+            }
+            break;
+        case CollectiveOp::NOOP:
+            break;
     }
     std::fill(recv.begin(), recv.end(), 0xff);
     return true;
@@ -751,6 +845,22 @@ int ValidateInt32(const Options &options, int64_t count, const std::vector<uint8
                 }
             }
             break;
+        case CollectiveOp::PROFILE_PROBE:
+            for (int64_t i = 0; i < count; ++i) {
+                const int32_t expected = ExpectedAllGatherValue(options.rankSize, options.rank, i);
+                const int32_t actual = LoadInt32(recv, i);
+                if (actual != expected) {
+                    if (errors < 8) {
+                        std::cerr << "[rank " << options.rank << "] profile-probe mismatch"
+                                  << " i=" << i << " expected=" << expected
+                                  << " actual=" << actual << std::endl;
+                    }
+                    ++errors;
+                }
+            }
+            break;
+        case CollectiveOp::NOOP:
+            break;
     }
     return errors;
 }
@@ -806,12 +916,28 @@ int ValidateBytePattern(const Options &options, int64_t count, const std::vector
                 }
             }
             break;
+        case CollectiveOp::PROFILE_PROBE:
+            for (size_t i = 0; i < bytesPerPeer; ++i) {
+                const uint8_t expected = PatternByte(options.rank, 0, static_cast<int64_t>(i));
+                const uint8_t actual = recv[i];
+                if (actual != expected) {
+                    if (errors < 8) {
+                        std::cerr << "[rank " << options.rank << "] byte profile-probe mismatch"
+                                  << " byte=" << i << " expected=" << static_cast<int>(expected)
+                                  << " actual=" << static_cast<int>(actual) << std::endl;
+                    }
+                    ++errors;
+                }
+            }
+            break;
         case CollectiveOp::ALLREDUCE:
         case CollectiveOp::REDUCESCATTER:
             std::cerr << "[rank " << options.rank
                       << "] ERROR: byte validation is unsupported for allreduce/reducescatter" << std::endl;
             ++errors;
             break;
+        case CollectiveOp::NOOP:
+            break;
     }
     return errors;
 }
@@ -826,7 +952,9 @@ double ComputeAlgBandwidthGbps(int64_t outputBytesPerRank, double avgUs)
 
 double ComputeBusBandwidthGbps(CollectiveOp op, int rankSize, double algBwGbps)
 {
-    (void)op;
+    if (op == CollectiveOp::NOOP || op == CollectiveOp::PROFILE_PROBE) {
+        return 0.0;
+    }
     if (rankSize <= 0) {
         return 0.0;
     }
@@ -856,15 +984,21 @@ bool MeasureOnce(const Options &options, void *devSend, void *devRecv, int64_t c
         aclrtDestroyEvent(stop);
         return false;
     }
-    if (!CallCollective(options, devSend, devRecv, count, comm, stream) ||
-        !CheckAcl(options.rank, "aclrtRecordEvent stop", aclrtRecordEvent(stop, stream)) ||
-        !CheckAcl(options.rank, "aclrtSynchronizeEvent stop", aclrtSynchronizeEvent(stop))) {
-        FinishPerfSession(perfSession, options, stream, totalErrors, true);
+    std::string incompleteReason;
+    if (!CallCollective(options, devSend, devRecv, count, comm, stream)) {
+        incompleteReason = "CallCollective failed before measured launch completed";
+    } else if (!CheckAcl(options.rank, "aclrtRecordEvent stop", aclrtRecordEvent(stop, stream))) {
+        incompleteReason = "aclrtRecordEvent stop failed";
+    } else if (!CheckAcl(options.rank, "aclrtSynchronizeEvent stop", aclrtSynchronizeEvent(stop))) {
+        incompleteReason = "aclrtSynchronizeEvent stop failed";
+    }
+    if (!incompleteReason.empty()) {
+        FinishPerfSession(perfSession, options, stream, totalErrors, true, incompleteReason);
         aclrtDestroyEvent(start);
         aclrtDestroyEvent(stop);
         return false;
     }
-    FinishPerfSession(perfSession, options, stream, totalErrors, false);
+    FinishPerfSession(perfSession, options, stream, totalErrors, false, "");
 
     float elapsedMs = 0.0f;
     const bool ok = CheckAcl(options.rank, "aclrtEventElapsedTime",
@@ -872,7 +1006,10 @@ bool MeasureOnce(const Options &options, void *devSend, void *devRecv, int64_t c
     aclrtDestroyEvent(start);
     aclrtDestroyEvent(stop);
     us = static_cast<double>(elapsedMs) * 1000.0;
-    return ok;
+    if (!ok) {
+        return false;
+    }
+    return true;
 }
 
 bool Measure(const Options &options, void *devSend, void *devRecv, int64_t count, TileXRCommPtr comm,
@@ -956,6 +1093,120 @@ std::string JoinPath(const std::string &base, const std::string &leaf)
     return base + "/" + leaf;
 }
 
+bool CreateDirectories(const std::string &path)
+{
+    if (path.empty()) {
+        return false;
+    }
+
+    std::string current;
+    size_t index = 0;
+    if (path[0] == '/') {
+        current = "/";
+        index = 1;
+    }
+
+    while (index <= path.size()) {
+        const size_t next = path.find('/', index);
+        const std::string part = path.substr(index, next == std::string::npos ? std::string::npos : next - index);
+        if (!part.empty()) {
+            if (!current.empty() && current[current.size() - 1] != '/') {
+                current += "/";
+            }
+            current += part;
+            if (mkdir(current.c_str(), 0755) != 0 && errno != EEXIST) {
+                std::cerr << "ERROR: failed to create directory " << current << ": " << strerror(errno) << std::endl;
+                return false;
+            }
+        }
+        if (next == std::string::npos) {
+            break;
+        }
+        index = next + 1;
+    }
+    return true;
+}
+
+std::string JsonEscape(const std::string &text)
+{
+    std::ostringstream out;
+    for (const char ch : text) {
+        switch (ch) {
+            case '\\':
+                out << "\\\\";
+                break;
+            case '"':
+                out << "\\\"";
+                break;
+            case '\n':
+                out << "\\n";
+                break;
+            case '\r':
+                out << "\\r";
+                break;
+            case '\t':
+                out << "\\t";
+                break;
+            default:
+                out << ch;
+                break;
+        }
+    }
+    return out.str();
+}
+
+std::string EnvOrEmpty(const char *name)
+{
+    const char *value = std::getenv(name);
+    return value == nullptr ? std::string() : std::string(value);
+}
+
+std::string DefaultHostName()
+{
+    char hostname[256] = {};
+    if (gethostname(hostname, sizeof(hostname) - 1) == 0 && hostname[0] != '\0') {
+        return hostname;
+    }
+    return "unknown";
+}
+
+bool WriteProfileHostInfo(const Options &options)
+{
+    if (!options.profile) {
+        return true;
+    }
+    const std::string root = options.profileDir.empty() ? "run/prof/collectives" : options.profileDir;
+    const std::string rankDir = JoinPath(root, "rank" + std::to_string(options.rank));
+    if (!CreateDirectories(rankDir)) {
+        return false;
+    }
+
+    std::string host = EnvOrEmpty("TILEXR_PROFILE_HOST");
+    if (host.empty()) {
+        host = DefaultHostName();
+    }
+    std::string hostIp = EnvOrEmpty("TILEXR_PROFILE_HOST_IP");
+    if (hostIp.empty()) {
+        hostIp = EnvOrEmpty("TILEXR_NODE_IP");
+    }
+
+    const std::string path = JoinPath(rankDir, "host_info.json");
+    std::ofstream out(path.c_str());
+    if (!out.is_open()) {
+        std::cerr << "ERROR: failed to write " << path << std::endl;
+        return false;
+    }
+    out << "{\n"
+        << "  \"schema\": \"tilexr_collective_profile_host.v1\",\n"
+        << "  \"rank\": " << options.rank << ",\n"
+        << "  \"rank_size\": " << options.rankSize << ",\n"
+        << "  \"host\": \"" << JsonEscape(host) << "\",\n"
+        << "  \"ip\": \"" << JsonEscape(hostIp) << "\",\n"
+        << "  \"comm_mode\": \"" << CommModeName(options.commMode) << "\"\n"
+        << "}\n";
+    return true;
+}
+
 std::string ResolveProfileOutputDir(const Options &options, uint64_t profileLaunchIndex)
 {
     const std::string root = options.profileDir.empty() ? "run/prof/collectives" : options.profileDir;
@@ -969,6 +1220,13 @@ bool ProfileThisLaunch(const Options &options, uint64_t profileLaunchIndex)
         (profileLaunchIndex % static_cast<uint64_t>(options.profileSampleEvery)) == 0;
 }
 
+bool KernelProfilingDisabled()
+{
+    const char *value = std::getenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING");
+    return value != nullptr &&
+        (std::string(value) == "1" || std::string(value) == "true" || std::string(value) == "yes");
+}
+
 bool StartPerfSessionForLaunch(const Options &options, uint64_t profileLaunchIndex,
                                TileXRCollectivePerfSession &perfSession)
 {
@@ -1006,18 +1264,22 @@ void Cleanup(TileXRCommPtr comm, aclrtStream stream, int deviceId, bool deviceSe
 }
 
 void FinishPerfSession(TileXRCollectivePerfSession &perfSession, const Options &options, aclrtStream stream,
-                       int &totalErrors, bool skipWriteReport)
+                       int &totalErrors, bool skipWriteReport, const std::string &skipReason)
 {
     if (perfSession == nullptr) {
         return;
     }
+    std::string reason = skipReason;
     bool streamSynced = true;
-    if (stream != nullptr) {
+    if (stream != nullptr && !KernelProfilingDisabled()) {
         streamSynced = CheckAcl(options.rank, "aclrtSynchronizeStream before perf report",
             aclrtSynchronizeStream(stream));
         if (!streamSynced) {
             totalErrors += 1;
             skipWriteReport = true;
+            if (reason.empty()) {
+                reason = "aclrtSynchronizeStream before perf report failed";
+            }
         }
     }
     if (TileXRCollectivePerfSetActiveSession(nullptr) != TileXR::TILEXR_SUCCESS) {
@@ -1025,9 +1287,17 @@ void FinishPerfSession(TileXRCollectivePerfSession &perfSession, const Options &
                   << std::endl;
         totalErrors += 1;
     }
-    if (!skipWriteReport && TileXRCollectivePerfWriteReport(perfSession) != TileXR::TILEXR_SUCCESS) {
-        std::cerr << "[rank " << options.rank << "] ERROR: TileXRCollectivePerfWriteReport failed" << std::endl;
-        totalErrors += 1;
+    if (skipWriteReport) {
+        if (TileXRCollectivePerfWriteIncompleteReport(perfSession, reason.c_str()) != TileXR::TILEXR_SUCCESS) {
+            std::cerr << "[rank " << options.rank
+                      << "] ERROR: TileXRCollectivePerfWriteIncompleteReport failed" << std::endl;
+            totalErrors += 1;
+        }
+    } else {
+        if (TileXRCollectivePerfWriteReport(perfSession) != TileXR::TILEXR_SUCCESS) {
+            std::cerr << "[rank " << options.rank << "] ERROR: TileXRCollectivePerfWriteReport failed" << std::endl;
+            totalErrors += 1;
+        }
     }
     if (!streamSynced) {
         // Avoid freeing a trace buffer that may still be referenced by queued device work.
@@ -1051,7 +1321,7 @@ int main(int argc, char **argv)
         return 2;
     }
 
-    const int deviceId = options.firstNpu + options.rank;
+    const int deviceId = options.deviceIdOverride >= 0 ? options.deviceIdOverride : options.firstNpu + options.rank;
     TileXRCommPtr comm = nullptr;
     aclrtStream stream = nullptr;
     bool deviceSet = false;
@@ -1063,9 +1333,23 @@ int main(int argc, char **argv)
         return 1;
     }
     deviceSet = true;
-    if (!CheckAcl(options.rank, "aclrtCreateStream", aclrtCreateStream(&stream)) ||
-        !CheckTileXR(options.rank, "TileXRCommInitRankLocal",
-            TileXRCommInitRankLocal(options.rankSize, options.rank, &comm))) {
+    if (!WriteProfileHostInfo(options) ||
+        !CheckAcl(options.rank, "aclrtCreateStream", aclrtCreateStream(&stream))) {
+        Cleanup(comm, stream, deviceId, deviceSet);
+        return 1;
+    }
+
+    bool commOk = false;
+    if (options.commMode == CommMode::SOCKET) {
+        TileXRUniqueId uniqueId {};
+        commOk = CheckTileXR(options.rank, "TileXRGetUniqueId", TileXRGetUniqueId(&uniqueId, 0)) &&
+            CheckTileXR(options.rank, "TileXRCommInitRank",
+                TileXRCommInitRank(uniqueId, options.rankSize, options.rank, &comm));
+    } else {
+        commOk = CheckTileXR(options.rank, "TileXRCommInitRankLocal",
+            TileXRCommInitRankLocal(options.rankSize, options.rank, &comm));
+    }
+    if (!commOk) {
         Cleanup(comm, stream, deviceId, deviceSet);
         return 1;
     }
@@ -1105,7 +1389,7 @@ int main(int argc, char **argv)
                 aclrtMemcpy(devSend, static_cast<size_t>(sendBytes), hostSend.data(),
                     static_cast<size_t>(sendBytes), ACL_MEMCPY_HOST_TO_DEVICE));
         if (ok) {
-            if (options.op == CollectiveOp::BROADCAST) {
+            if (options.op == CollectiveOp::BROADCAST || options.op == CollectiveOp::NOOP) {
                 devRecv = devSend;
             } else {
                 ok = CheckAcl(options.rank, "aclrtMalloc recv",
@@ -1123,7 +1407,7 @@ int main(int argc, char **argv)
         }
 
         int errors = 0;
-        if (ok && options.check) {
+        if (ok && options.check && options.op != CollectiveOp::NOOP) {
             if (options.op != CollectiveOp::BROADCAST) {
                 std::fill(hostRecv.begin(), hostRecv.end(), 0xff);
                 ok = CheckAcl(options.rank, "aclrtMemcpy H2D devRecv sentinel",
diff --git a/tests/collectives/tilexr_collective_profile_report.py b/tests/collectives/tilexr_collective_profile_report.py
index f02699d..2ca1c05 100755
--- a/tests/collectives/tilexr_collective_profile_report.py
+++ b/tests/collectives/tilexr_collective_profile_report.py
@@ -10,7 +10,10 @@
 
 TRACE_SCHEMA = "tilexr_perf_trace_report.v1"
 RUN_SCHEMA = "tilexr_perf_trace_run.v1"
+HOST_SCHEMA = "tilexr_collective_profile_host.v1"
 RANK_LAUNCH_RE = re.compile(r"rank([0-9]+)[/\\]launch([0-9]+)[/\\]trace\.json$")
+PERFETTO_LAUNCH_GAP_US = 50.0
+PERFETTO_LAUNCH_WINDOW_TID = 1000000
 
 
 def parse_args():
@@ -34,6 +37,38 @@ def parse_rank_launch(path, root):
     return int(match.group(1)), int(match.group(2))
 
 
+def load_host_infos(root, diagnostics):
+    hosts = {}
+    for path in sorted(root.glob("rank*/host_info.json")):
+        source = relpath(path, root)
+        try:
+            info = json.loads(path.read_text(encoding="utf-8"))
+        except json.JSONDecodeError as exc:
+            diagnostics.append(f"invalid json in {source}: {exc}")
+            continue
+        if not isinstance(info, dict):
+            diagnostics.append(f"invalid top-level host info type in {source}: expected object")
+            continue
+        if info.get("schema") != HOST_SCHEMA:
+            diagnostics.append(f"invalid schema in {source}")
+            continue
+        rank = as_int(info.get("rank"), None)
+        if rank is None:
+            parsed = re.search(r"rank([0-9]+)[/\\]host_info\.json$", source)
+            rank = int(parsed.group(1)) if parsed else None
+        if rank is None:
+            diagnostics.append(f"invalid rank in {source}")
+            continue
+        hosts[rank] = {
+            "rank": rank,
+            "host": str(info.get("host") or f"rank{rank}"),
+            "ip": str(info.get("ip") or ""),
+            "comm_mode": str(info.get("comm_mode") or ""),
+            "source": source,
+        }
+    return hosts
+
+
 def load_traces(root):
     traces = []
     diagnostics = []
@@ -68,6 +103,10 @@ def load_traces(root):
 
 def sanitize_trace(trace, source, diagnostics):
     trace = dict(trace)
+    if trace.get("incomplete"):
+        reason = str(trace.get("incomplete_reason") or "unknown")
+        diagnostics.append(f"incomplete trace in {source}: {reason}")
+
     stats = trace.get("stats", [])
     if not isinstance(stats, list):
         diagnostics.append(f"invalid stats in {source}: expected list")
@@ -79,6 +118,13 @@ def sanitize_trace(trace, source, diagnostics):
         if not isinstance(stat, dict):
             diagnostics.append(f"invalid stat entry in {source} stats[{index}]: expected object")
             continue
+        if as_int(stat.get("count")) <= 0:
+            raw_cycles = as_int(stat.get("raw_cycles"))
+            max_cycles = as_int(stat.get("max_cycles"))
+            last_end = as_int(stat.get("last_end_cycle"))
+            if raw_cycles != 0 or max_cycles != 0 or last_end != 0:
+                diagnostics.append(f"ignored count=0 stat in {source} stats[{index}]")
+            continue
         valid_stats.append(stat)
     trace["stats"] = valid_stats
     return trace
@@ -90,6 +136,8 @@ def group_key(entry):
         trace.get("op_type"),
         trace.get("op_name", "Unknown"),
         trace.get("rank_size"),
+        trace.get("max_core_count"),
+        trace.get("block_dim"),
         trace.get("message_bytes"),
         trace.get("stage_count"),
         trace.get("cycle_to_us_divisor"),
@@ -135,7 +183,29 @@ def cycles_to_us(cycles, divisor):
     return float(cycles) / float(divisor)
 
 
-def normalized_bars(entries, root):
+def rank_host_info(hosts, rank):
+    return hosts.get(rank, {"rank": rank, "host": f"rank{rank}", "ip": "", "comm_mode": "", "source": ""})
+
+
+def rank_label(hosts, rank):
+    info = rank_host_info(hosts, rank)
+    host = info.get("host") or f"rank{rank}"
+    if host == f"rank{rank}":
+        return f"rank{rank}"
+    return f"rank{rank}@{host}"
+
+
+def format_hosts(hosts, rank_ids):
+    parts = []
+    for rank in rank_ids:
+        info = rank_host_info(hosts, rank)
+        label = rank_label(hosts, rank)
+        ip = info.get("ip") or "unknown-ip"
+        parts.append(f"{label}({ip})")
+    return ", ".join(parts) if parts else "unknown"
+
+
+def normalized_bars(entries, root, hosts):
     bars = []
     entries_by_rank_launch = defaultdict(list)
     for entry in entries:
@@ -169,6 +239,7 @@ def normalized_bars(entries, root):
                 duration_cycles = max(0, last - first)
                 core = as_int(stat.get("core"))
                 stat_rank = as_int(stat.get("rank"), rank)
+                host_info = rank_host_info(hosts, stat_rank)
                 raw_cycles = as_int(stat.get("raw_cycles"))
 
                 bars.append({
@@ -187,9 +258,11 @@ def normalized_bars(entries, root):
                     "raw_cycles": raw_cycles,
                     "count": count,
                     "max_cycles": as_int(stat.get("max_cycles")),
+                    "host": host_info["host"],
+                    "host_ip": host_info["ip"],
                     "source": source,
                     "drilldown": drilldown,
-                    "lane": f"rank{stat_rank}/core{core}",
+                    "lane": f"{rank_label(hosts, stat_rank)}/core{core}",
                 })
 
     return sorted(bars, key=lambda bar: (
@@ -206,11 +279,16 @@ def summarize_group(group):
     stage_totals = defaultdict(float)
     launch_kernel = defaultdict(float)
     rank_core_max = {"rank": 0, "core": 0, "stage": "", "duration_us": 0.0, "max_cycles": 0, "max_us": 0.0}
+    rank_launch_kernel = defaultdict(dict)
 
     for bar in group["bars"]:
         stage_totals[bar["stage"]] += bar["sum_us"]
         if bar["stage"] == "kernel_total":
             launch_kernel[bar["launch_id"]] = max(launch_kernel[bar["launch_id"]], bar["duration_us"])
+            rank_launch_kernel[bar["rank"]][bar["launch_id"]] = max(
+                rank_launch_kernel[bar["rank"]].get(bar["launch_id"], 0.0),
+                bar["duration_us"],
+            )
         max_us = cycles_to_us(bar["max_cycles"], group["cycle_to_us_divisor"])
         if max_us > rank_core_max["max_us"]:
             rank_core_max = {
@@ -224,16 +302,50 @@ def summarize_group(group):
 
     top_stage = max(stage_totals.items(), key=lambda item: item[1]) if stage_totals else ("none", 0.0)
     slowest_launch = max(launch_kernel.items(), key=lambda item: item[1]) if launch_kernel else (None, 0.0)
+    rank_kernel = summarize_rank_kernel(rank_launch_kernel)
+    slowest_rank = rank_kernel[0] if rank_kernel else {
+        "rank": None,
+        "launch_count": 0,
+        "avg_kernel_us": 0.0,
+        "max_kernel_us": 0.0,
+        "slowest_launch_id": None,
+    }
 
     return {
         "top_stage": {"stage": top_stage[0], "sum_us": top_stage[1]},
         "slowest_launch": {"launch_id": slowest_launch[0], "kernel_us": slowest_launch[1]},
         "rank_core_max": rank_core_max,
+        "rank_kernel": rank_kernel,
+        "slowest_rank": slowest_rank,
     }
 
 
+def summarize_rank_kernel(rank_launch_kernel):
+    summaries = []
+    for rank, launch_values in sorted(rank_launch_kernel.items()):
+        if not launch_values:
+            continue
+        values = list(launch_values.values())
+        slowest_launch_id, max_kernel_us = max(
+            launch_values.items(),
+            key=lambda item: (item[1], -item[0]),
+        )
+        summaries.append({
+            "rank": rank,
+            "launch_count": len(values),
+            "avg_kernel_us": sum(values) / len(values),
+            "max_kernel_us": max_kernel_us,
+            "slowest_launch_id": slowest_launch_id,
+        })
+    return sorted(
+        summaries,
+        key=lambda item: (-item["avg_kernel_us"], -item["max_kernel_us"], item["rank"]),
+    )
+
+
 def build_index(root, args):
     traces, diagnostics = load_traces(root)
+    hosts = load_host_infos(root, diagnostics)
     grouped = defaultdict(list)
     for entry in traces:
         grouped[group_key(entry)].append(entry)
@@ -249,11 +361,13 @@ def sort_key(item):
             as_int(key[3]),
             as_int(key[4]),
             as_int(key[5]),
+            as_int(key[6]),
+            as_int(key[7]),
             as_int(key[0]),
         )
 
     for key, entries in sorted(grouped.items(), key=sort_key):
-        op_type, op_name, rank_size, message_bytes, stage_count, divisor = key
+        op_type, op_name, rank_size, max_core_count, block_dim, message_bytes, stage_count, divisor = key
         rank_ids = sorted({entry["rank"] for entry in entries})
         launch_ids = sorted({entry["launch_id"] for entry in entries})
         effective_rank_size = as_int(rank_size, len(rank_ids))
@@ -269,16 +383,19 @@ def sort_key(item):
             "op_type": op_type,
             "op_name": op_name,
             "rank_size": effective_rank_size,
+            "max_core_count": as_int(max_core_count),
+            "block_dim": as_int(block_dim),
             "message_bytes": as_int(message_bytes),
             "stage_count": as_int(stage_count),
             "cycle_to_us_divisor": as_int(divisor),
             "rank_ids": rank_ids,
             "launch_ids": launch_ids,
+            "trace_statuses": trace_statuses(entries, root),
             "sources": [
                 relpath(entry["path"], root)
                 for entry in sorted(entries, key=lambda item: (item["launch_id"], item["rank"], relpath(item["path"], root)))
             ],
-            "bars": normalized_bars(entries, root),
+            "bars": normalized_bars(entries, root, hosts),
         }
         group["summary"] = summarize_group(group)
         groups.append(group)
@@ -289,6 +406,7 @@ def sort_key(item):
         "warmup_iters": args.warmup_iters,
         "measured_iters": args.iters,
         "profile_sample_every": args.profile_sample_every,
+        "hosts": public_hosts(hosts),
         "groups": groups,
         "diagnostics": diagnostics,
     }
@@ -311,6 +429,8 @@ def add_incompatible_group_diagnostics(grouped, root, diagnostics):
                 as_int(item[3]),
                 as_int(item[4]),
                 as_int(item[5]),
+                as_int(item[6]),
+                as_int(item[7]),
                 as_int(item[0]),
             ))
         ]
@@ -320,18 +440,34 @@ def add_incompatible_group_diagnostics(grouped, root, diagnostics):
 
 
 def describe_group(key, entries, root):
-    op_type, op_name, rank_size, message_bytes, stage_count, divisor = key
+    op_type, op_name, rank_size, max_core_count, block_dim, message_bytes, stage_count, divisor = key
     sources = ", ".join(
         relpath(entry["path"], root)
         for entry in sorted(entries, key=lambda item: (item["launch_id"], item["rank"], relpath(item["path"], root)))
     )
     return (
         f"op_type={op_type} op_name={op_name} rank_size={rank_size} "
+        f"max_core_count={max_core_count} block_dim={block_dim} "
         f"message_bytes={message_bytes} stage_count={stage_count} "
         f"cycle_to_us_divisor={divisor} sources=[{sources}]"
     )
 
 
+def trace_statuses(entries, root):
+    statuses = []
+    for entry in sorted(entries, key=lambda item: (item["launch_id"], item["rank"], relpath(item["path"], root))):
+        trace = entry["trace"]
+        statuses.append({
+            "rank": entry["rank"],
+            "launch_id": entry["launch_id"],
+            "source": relpath(entry["path"], root),
+            "incomplete": bool(trace.get("incomplete")),
+            "reason": str(trace.get("incomplete_reason") or ""),
+            "synthetic": bool(trace.get("synthetic")),
+        })
+    return statuses
+
+
 def format_launches(launch_ids):
     if not launch_ids:
         return "none"
@@ -344,6 +480,10 @@ def format_slowest_launch(slowest):
     return f"launch{slowest['launch_id']} at {slowest['kernel_us']:.3f} us"
 
 
+def public_hosts(hosts):
+    return {rank: dict(info) for rank, info in sorted(hosts.items())}
+
+
 def render_analysis(index):
     lines = [
         "# TileXR Collective Profile Run Analysis",
@@ -378,9 +518,12 @@ def render_analysis(index):
         )
         lines.append(f"## Group {group_index}: {group['op_name']} message_bytes={group['message_bytes']}")
         lines.append(f"- Launches: {format_launches(group['launch_ids'])}")
+        lines.append(f"- Hosts: {format_hosts(index.get('hosts', {}), group['rank_ids'])}")
         lines.append(f"- Slowest launch: {format_slowest_launch(slowest)}")
         lines.append(f"- Top stage: {summary['top_stage']['stage']} at {summary['top_stage']['sum_us']:.3f} us")
         lines.append(f"- Stage totals: {stage_summary}")
+        lines.append(f"- Slowest rank: {format_slowest_rank(summary['slowest_rank'], index.get('hosts', {}))}")
+        lines.append(f"- Rank kernel totals: {format_rank_kernel(summary['rank_kernel'], index.get('hosts', {}))}")
         lines.append(
             f"- Rank/core max: rank{rank_core['rank']} core{rank_core['core']} "
             f"{rank_core['stage']} max {rank_core['max_us']:.3f} us"
@@ -390,6 +533,28 @@ def render_analysis(index):
     return "\n".join(lines)
 
 
+def format_slowest_rank(slowest_rank, hosts=None):
+    rank = slowest_rank.get("rank")
+    if rank is None:
+        return "unavailable"
+    launch_id = slowest_rank.get("slowest_launch_id")
+    launch_text = "unknown launch" if launch_id is None else f"launch{launch_id}"
+    label = rank_label(hosts or {}, rank)
+    return (
+        f"{label} avg {slowest_rank.get('avg_kernel_us', 0.0):.3f} us "
+        f"max {slowest_rank.get('max_kernel_us', 0.0):.3f} us at {launch_text}"
+    )
+
+
+def format_rank_kernel(rank_kernel, hosts=None):
+    if not rank_kernel:
+        return "unavailable"
+    return "; ".join(
+        f"{rank_label(hosts or {}, item['rank'])} avg={item['avg_kernel_us']:.3f} us max={item['max_kernel_us']:.3f} us"
+        for item in rank_kernel
+    )
+
+
 def render_ai_prompt(index):
     lines = [
         "# TileXR collective profiling run",
@@ -412,6 +577,9 @@ def render_ai_prompt(index):
         else:
             lines.append(f"  slowest_launch=launch{slowest['launch_id']} kernel_us={slowest['kernel_us']:.3f}")
         lines.append(f"  top_stage={summary['top_stage']['stage']} sum_us={summary['top_stage']['sum_us']:.3f}")
+        lines.append(f"  hosts={format_hosts(index.get('hosts', {}), group['rank_ids'])}")
+        lines.append(f"  slowest_rank={format_slowest_rank(summary['slowest_rank'], index.get('hosts', {}))}")
+        lines.append(f"  rank_kernel_totals={format_rank_kernel(summary['rank_kernel'], index.get('hosts', {}))}")
 
     if index["diagnostics"]:
         lines.append("")
@@ -421,6 +589,164 @@ def render_ai_prompt(index):
     return "\n".join(lines) + "\n"
 
 
+def render_perfetto_trace(index):
+    events = []
+    seen_ranks = set()
+    seen_threads = set()
+    seen_launch_threads = set()
+    launch_offsets = compute_perfetto_launch_offsets(index)
+
+    for group in index["groups"]:
+        for rank in group["rank_ids"]:
+            if rank not in seen_ranks:
+                seen_ranks.add(rank)
+                events.append({
+                    "name": "process_name",
+                    "ph": "M",
+                    "pid": rank,
+                    "args": {"name": rank_label(index.get("hosts", {}), rank)},
+                })
+
+        for launch_id in group["launch_ids"]:
+            launch_rank_bars = defaultdict(list)
+            for bar in group["bars"]:
+                if bar["launch_id"] == launch_id:
+                    launch_rank_bars[bar["rank"]].append(bar)
+            for rank, rank_bars in sorted(launch_rank_bars.items()):
+                if rank not in seen_launch_threads:
+                    seen_launch_threads.add(rank)
+                    events.append({
+                        "name": "thread_name",
+                        "ph": "M",
+                        "pid": rank,
+                        "tid": PERFETTO_LAUNCH_WINDOW_TID,
+                        "args": {"name": f"{rank_label(index.get('hosts', {}), rank)}/launch_windows"},
+                    })
+                max_end_us = max((bar["end_us"] for bar in rank_bars), default=0.0)
+                if max_end_us <= 0:
+                    continue
+                launch_offset_us = launch_offsets.get(launch_id, 0.0)
+                events.append({
+                    "name": f"launch{launch_id}/{rank_label(index.get('hosts', {}), rank)}/window",
+                    "cat": "launch_window",
+                    "ph": "X",
+                    "pid": rank,
+                    "tid": PERFETTO_LAUNCH_WINDOW_TID,
+                    "ts": launch_offset_us,
+                    "dur": max_end_us,
+                    "args": {
+                        "launch_id": launch_id,
+                        "rank": rank,
+                        "host": rank_host_info(index.get("hosts", {}), rank).get("host", ""),
+                        "host_ip": rank_host_info(index.get("hosts", {}), rank).get("ip", ""),
+                        "launch_offset_us": launch_offset_us,
+                    },
+                })
+
+        for status in group.get("trace_statuses", []):
+            if not status.get("incomplete"):
+                continue
+            rank = as_int(status.get("rank"))
+            launch_id = as_int(status.get("launch_id"))
+            if rank not in seen_launch_threads:
+                seen_launch_threads.add(rank)
+                events.append({
+                    "name": "thread_name",
+                    "ph": "M",
+                    "pid": rank,
+                    "tid": PERFETTO_LAUNCH_WINDOW_TID,
+                    "args": {"name": f"{rank_label(index.get('hosts', {}), rank)}/launch_windows"},
+                })
+            rank_status_label = rank_label(index.get("hosts", {}), rank)
+            events.append({
+                "name": f"launch{launch_id}/{rank_status_label}/incomplete_trace",
+                "cat": "trace_status",
+                "ph": "X",
+                "pid": rank,
+                "tid": PERFETTO_LAUNCH_WINDOW_TID,
+                "ts": launch_offsets.get(launch_id, 0.0),
+                "dur": 1.0,
+                "args": {
+                    "launch_id": launch_id,
+                    "rank": rank,
+                    "rank_label": rank_status_label,
+                    "reason": str(status.get("reason") or ""),
+                    "source": status.get("source", ""),
+                    "op_name": group["op_name"],
+                    "message_bytes": group["message_bytes"],
+                    "rank_size": group["rank_size"],
+                    "block_dim": group.get("block_dim", 0),
+                    "max_core_count": group.get("max_core_count", 0),
+                },
+            })
+
+        for bar in group["bars"]:
+            thread_key = (bar["rank"], bar["core"])
+            if thread_key not in seen_threads:
+                seen_threads.add(thread_key)
+                events.append({
+                    "name": "thread_name",
+                    "ph": "M",
+                    "pid": bar["rank"],
+                    "tid": bar["core"],
+                    "args": {"name": f"{rank_label(index.get('hosts', {}), bar['rank'])}/core{bar['core']}"},
+                })
+
+            if bar["duration_us"] <= 0:
+                continue
+
+            launch_offset_us = launch_offsets.get(bar["launch_id"], 0.0)
+            bar_rank_label = rank_label(index.get("hosts", {}), bar["rank"])
+            events.append({
+                "name": f"launch{bar['launch_id']}/{bar_rank_label}/{bar['stage']}",
+                "cat": group["op_name"],
+                "ph": "X",
+                "pid": bar["rank"],
+                "tid": bar["core"],
+                "ts": launch_offset_us + bar["start_us"],
+                "dur": bar["duration_us"],
+                "args": {
+                    "launch_id": bar["launch_id"],
+                    "launch_offset_us": launch_offset_us,
+                    "normalized_ts": bar["start_us"],
+                    "rank": bar["rank"],
+                    "rank_label": bar_rank_label,
+                    "host": bar.get("host", ""),
+                    "host_ip": bar.get("host_ip", ""),
+                    "core": bar["core"],
+                    "stage": bar["stage"],
+                    "stage_id": bar["stage_id"],
+                    "sum_us": bar["sum_us"],
+                    "raw_cycles": bar["raw_cycles"],
+                    "max_cycles": bar["max_cycles"],
+                    "count": bar["count"],
+                    "source": bar["source"],
+                    "op_name": group["op_name"],
+                    "message_bytes": group["message_bytes"],
+                    "rank_size": group["rank_size"],
+                },
+            })
+
+    return {
+        "displayTimeUnit": "us",
+        "traceEvents": events,
+    }
+
+
+def compute_perfetto_launch_offsets(index):
+    max_end_by_launch = defaultdict(float)
+    for group in index["groups"]:
+        for bar in group["bars"]:
+            max_end_by_launch[bar["launch_id"]] = max(max_end_by_launch[bar["launch_id"]], bar["end_us"])
+
+    offsets = {}
+    cursor = 0.0
+    for launch_id in sorted(max_end_by_launch):
+        offsets[launch_id] = cursor
+        cursor += max_end_by_launch[launch_id] + PERFETTO_LAUNCH_GAP_US
+    return offsets
+
+
 def json_for_script(value):
     return (
         json.dumps(value, separators=(",", ":"), ensure_ascii=False)
@@ -435,6 +761,8 @@ def json_for_script(value):
 def render_html(index):
     data = json_for_script(index)
     summary_items = []
+    rank_summary_rows = []
+    trace_status_rows = []
     fallback_rows = []
 
     for group in index["groups"]:
@@ -445,17 +773,54 @@ def render_html(index):
             "<li>"
             f"{html.escape(str(group['op_name']))} bytes={group['message_bytes']}: "
             f"Slowest launch {html.escape(format_slowest_launch(slowest))}; "
+            f"Hosts {html.escape(format_hosts(index.get('hosts', {}), group['rank_ids']))}; "
+            f"Slowest rank {html.escape(format_slowest_rank(summary['slowest_rank'], index.get('hosts', {})))}; "
             f"Top stage {html.escape(str(summary['top_stage']['stage']))} {summary['top_stage']['sum_us']:.3f} us; "
             f"Rank/core max rank{rank_core['rank']} core{rank_core['core']} "
             f"{html.escape(str(rank_core['stage']))} max {rank_core['max_us']:.3f} us"
             "</li>"
         )
 
+        for rank_item in summary["rank_kernel"]:
+            launch_id = rank_item["slowest_launch_id"]
+            drilldown = find_rank_launch_drilldown(group["bars"], rank_item["rank"], launch_id)
+            rank_summary_rows.append(
+                "<tr>"
+                f"<td>{html.escape(str(group['op_name']))}</td>"
+                f"<td>{group['message_bytes']}</td>"
+                f"<td>{html.escape(rank_label(index.get('hosts', {}), rank_item['rank']))}</td>"
+                f"<td>{html.escape(rank_host_info(index.get('hosts', {}), rank_item['rank']).get('ip', ''))}</td>"
+                f"<td>{rank_item['launch_count']}</td>"
+                f"<td>{rank_item['avg_kernel_us']:.3f}</td>"
+                f"<td>{rank_item['max_kernel_us']:.3f}</td>"
+                f"<td>launch{launch_id}</td>"
+                f"<td><a href=\"{html.escape(drilldown)}\">open launch report</a></td>"
+                "</tr>"
+            )
+
+        for status in group.get("trace_statuses", []):
+            if not status.get("incomplete") and not status.get("synthetic"):
+                continue
+            status_text = "incomplete" if status.get("incomplete") else "synthetic"
+            trace_status_rows.append(
+                "<tr>"
+                f"<td>{html.escape(str(group['op_name']))}</td>"
+                f"<td>{group['message_bytes']}</td>"
+                f"<td>launch{status.get('launch_id')}</td>"
+                f"<td>{html.escape(rank_label(index.get('hosts', {}), as_int(status.get('rank'))))}</td>"
+                f"<td>{html.escape(status_text)}</td>"
+                f"<td>{html.escape(str(status.get('reason') or ''))}</td>"
+                f"<td>{html.escape(str(status.get('source') or ''))}</td>"
+                "</tr>"
+            )
+
         for bar in group["bars"]:
             fallback_rows.append(
                 "<tr>"
                 f"<td>launch{bar['launch_id']}</td>"
-                f"<td>rank{bar['rank']}/core{bar['core']}</td>"
+                f"<td>{html.escape(rank_label(index.get('hosts', {}), bar['rank']))}</td>"
+                f"<td>{html.escape(str(bar.get('host_ip', '')))}</td>"
+                f"<td>core{bar['core']}</td>"
                 f"<td>{html.escape(str(bar['stage']))}</td>"
                 f"<td>{bar['duration_us']:.3f}</td>"
                 f"<td>{bar['sum_us']:.3f}</td>"
@@ -468,7 +833,9 @@ def render_html(index):
 
     diagnostics = "".join(f"<li>{html.escape(item)}</li>" for item in index["diagnostics"])
     summary_html = "".join(summary_items) + diagnostics
-    fallback_html = "".join(fallback_rows) or "<tr><td colspan=\"6\">No trace bars available.</td></tr>"
+    rank_summary_html = "".join(rank_summary_rows) or "<tr><td colspan=\"9\">No rank kernel totals available.</td></tr>"
+    trace_status_html = "".join(trace_status_rows) or "<tr><td colspan=\"7\">No incomplete or synthetic traces.</td></tr>"
+    fallback_html = "".join(fallback_rows) or "<tr><td colspan=\"8\">No trace bars available.</td></tr>"
 
     stage_filter_html = "\n".join(
         f"<label><input type=\"checkbox\" checked onchange=\"toggleStage('{name}', this.checked)\"> {name}</label>"
@@ -500,6 +867,16 @@ def render_html(index):
 <ul>{summary_html}</ul>
 </section>
 <section class="panel">
+<h2>Rank-Level Summary</h2>
+<p>Kernel totals are grouped by rank using the slowest core per rank/launch. This is the fastest view for spotting a slow rank before drilling into core-level stages.</p>
+<table><thead><tr><th>Op</th><th>Bytes</th><th>Rank@Host</th><th>Host IP</th><th>Launches</th><th>Avg kernel us</th><th>Max kernel us</th><th>Slowest launch</th><th>Drilldown</th></tr></thead><tbody>{rank_summary_html}</tbody></table>
+</section>
+<section class="panel">
+<h2>Trace Status</h2>
+<p>Incomplete traces keep the same single-launch report schema but indicate that kernel execution failed before device stats could be copied back.</p>
+<table><thead><tr><th>Op</th><th>Bytes</th><th>Launch</th><th>Rank@Host</th><th>Status</th><th>Reason</th><th>Source</th></tr></thead><tbody>{trace_status_html}</tbody></table>
+</section>
+<section class="panel">
 <h2>Chronological Timeline</h2>
 <p>Each rank/launch lane is normalized independently; cross-NPU raw cycle offsets are not assumed to be synchronized.</p>
 <div>
@@ -514,7 +891,7 @@ def render_html(index):
 </section>
 <section class="panel">
 <h2>Fallback Table</h2>
-<table><thead><tr><th>Launch</th><th>Lane</th><th>Stage</th><th>Duration us</th><th>Sum us</th><th>Drilldown</th></tr></thead><tbody>{fallback_html}</tbody></table>
+<table><thead><tr><th>Launch</th><th>Rank@Host</th><th>Host IP</th><th>Core</th><th>Stage</th><th>Duration us</th><th>Sum us</th><th>Drilldown</th></tr></thead><tbody>{fallback_html}</tbody></table>
 </section>
 <script>
 const traceIndex = {data};
@@ -578,7 +955,8 @@ def render_html(index):
     line.textContent = `launch${{launchId}}`;
     root.appendChild(line);
     for (const bar of launchBars) {{
-      const lane = foldCores ? `rank${{bar.rank}}` : `rank${{bar.rank}}/core${{bar.core}}`;
+      const rankLabel = bar.host && bar.host !== `rank${{bar.rank}}` ? `rank${{bar.rank}}@${{bar.host}}` : `rank${{bar.rank}}`;
+      const lane = foldCores ? rankLabel : `${{rankLabel}}/core${{bar.core}}`;
       if (!lanes.has(lane)) lanes.set(lane, laneCount++);
       const cat = category(bar.stage);
       const item = document.createElement('a');
@@ -638,8 +1016,21 @@ def render_html(index):
 """
 
 
+def find_rank_launch_drilldown(bars, rank, launch_id):
+    for bar in bars:
+        if bar["rank"] == rank and bar["launch_id"] == launch_id and bar["stage"] == "kernel_total":
+            return bar["drilldown"]
+    for bar in bars:
+        if bar["rank"] == rank and bar["launch_id"] == launch_id:
+            return bar["drilldown"]
+    return "#"
+
+
 def write_outputs(root, index, emit_ai_prompt):
     (root / "trace_index.json").write_text(json.dumps(index, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    (root / "perfetto_trace.json").write_text(
+        json.dumps(render_perfetto_trace(index), indent=2, sort_keys=True) + "\n",
+        encoding="utf-8")
     (root / "analysis.md").write_text(render_analysis(index), encoding="utf-8")
     (root / "report.html").write_text(render_html(index), encoding="utf-8")
 
diff --git a/tests/collectives/unit/test_collective_perf_report.cpp b/tests/collectives/unit/test_collective_perf_report.cpp
index 641097d..1c5be3b 100644
--- a/tests/collectives/unit/test_collective_perf_report.cpp
+++ b/tests/collectives/unit/test_collective_perf_report.cpp
@@ -92,6 +92,8 @@ TileXR::TileXRPerfCoreStageStats MakeStat(uint32_t rank, uint32_t core, TileXR::
     stat.lastEndCycle = stat.firstStartCycle + sumCycles;
     stat.aux0 = 0x1000 + rank;
     stat.aux1 = 0x2000 + core;
+    stat.aux2 = 0x3000 + static_cast<uint32_t>(stage);
+    stat.aux3 = 0x4000 + count;
     return stat;
 }
 
@@ -126,6 +128,12 @@ void TestPerfReportSummariesAndFiles()
         header.maxCoreCount, header.stageCount);
     stats[peerOffset] = MakeStat(1, 1, TileXR::PerfStageId::PEER_IPC_TO_OUTPUT, 2, 4000, 2500);
 
+    const size_t staleOffset = TileXR::PerfTraceStatsOffset(
+        0, 1, static_cast<uint32_t>(TileXR::PerfStageId::KERNEL_TOTAL),
+        header.maxCoreCount, header.stageCount);
+    stats[staleOffset] = MakeStat(0, 1, TileXR::PerfStageId::KERNEL_TOTAL, 1, 9000, 9000);
+    stats[staleOffset].count = 0;
+
     const auto summaries = TileXRCollectives::Host::SummarizePerfTrace(header, stats);
     CHECK_TRUE(!summaries.empty());
     if (!summaries.empty()) {
@@ -155,13 +163,15 @@ void TestPerfReportSummariesAndFiles()
     CheckTextContains("trace.json", traceJson,
                       "\"min_cycles\": 1000, \"max_cycles\": 1000, \"first_start_cycle\": 100004");
     CheckTextContains("trace.json", traceJson,
-                      "\"last_end_cycle\": 101004, \"aux0\": 4096, \"aux1\": 8192, \"sum_us\": 20");
+                      "\"last_end_cycle\": 101004, \"aux0\": 4096, \"aux1\": 8192, \"aux2\": 12292, \"aux3\": 16385, \"sum_us\": 20");
     CheckTextContains("trace.json", traceJson,
                       "\"stage\": \"peer_ipc_to_output\", \"stage_id\": 5, \"count\": 2, \"raw_cycles\": 4000");
     CheckTextContains("trace.json", traceJson,
                       "\"min_cycles\": 2000, \"max_cycles\": 2500, \"first_start_cycle\": 101105");
     CheckTextContains("trace.json", traceJson,
-                      "\"last_end_cycle\": 105105, \"aux0\": 4097, \"aux1\": 8193, \"sum_us\": 80");
+                      "\"last_end_cycle\": 105105, \"aux0\": 4097, \"aux1\": 8193, \"aux2\": 12293, \"aux3\": 16386, \"sum_us\": 80");
+    CheckFileMissingOrDoesNotContain(options.outputDir + "/trace.json", "\"count\": 0");
+    CheckFileMissingOrDoesNotContain(options.outputDir + "/trace.json", "\"raw_cycles\": 9000");
     CheckContains(options.outputDir + "/summary.csv", "stage,rank,core");
     CheckContains(options.outputDir + "/analysis.md", "bottleneck");
     CheckContains(options.outputDir + "/report.html", "Bottleneck First");
diff --git a/tests/collectives/unit/test_collective_perf_session.cpp b/tests/collectives/unit/test_collective_perf_session.cpp
index 8c8aecf..b7dedc7 100644
--- a/tests/collectives/unit/test_collective_perf_session.cpp
+++ b/tests/collectives/unit/test_collective_perf_session.cpp
@@ -106,6 +106,19 @@ void TestCollectivePerfSessionLifecycle()
     CheckEq("write report succeeds", TileXRCollectivePerfWriteReport(session), TileXR::TILEXR_SUCCESS);
     CheckContains("report.html", ReadFile(outputDir + "/report.html"), "Bottleneck First");
     CheckContains("trace.json", ReadFile(outputDir + "/trace.json"), "tilexr_perf_trace_report.v1");
+    impl->header.rankSize = 2;
+    impl->header.maxCoreCount = 4;
+    impl->header.blockDim = 4;
+    impl->header.opType = static_cast<uint32_t>(TileXR::TileXRType::ALL_GATHER);
+    impl->header.messageBytes = 4096;
+    CheckEq("write incomplete report succeeds",
+            TileXRCollectivePerfWriteIncompleteReport(session, "aclrtSynchronizeEvent stop failed ret=507035"),
+            TileXR::TILEXR_SUCCESS);
+    CheckContains("incomplete trace.json", ReadFile(outputDir + "/trace.json"), "\"incomplete\": true");
+    CheckContains("incomplete trace reason", ReadFile(outputDir + "/trace.json"),
+                  "aclrtSynchronizeEvent stop failed ret=507035");
+    CheckContains("incomplete analysis", ReadFile(outputDir + "/analysis.md"), "Incomplete trace");
+    CheckContains("incomplete report", ReadFile(outputDir + "/report.html"), "Incomplete trace");
     CheckEq("set active again succeeds", TileXRCollectivePerfSetActiveSession(session), TileXR::TILEXR_SUCCESS);
     CheckEq("destroy succeeds", TileXRCollectivePerfSessionDestroy(session), TileXR::TILEXR_SUCCESS);
     CheckTrue("destroy clears active session", TileXRCollectives::Host::GetActivePerfTraceSession() == nullptr);
diff --git a/tests/collectives/unit/test_collective_profile_report.py b/tests/collectives/unit/test_collective_profile_report.py
index 649e827..9fe2c67 100644
--- a/tests/collectives/unit/test_collective_profile_report.py
+++ b/tests/collectives/unit/test_collective_profile_report.py
@@ -10,6 +10,15 @@
 
 COLLECTIVES_DIR = Path(__file__).resolve().parents[1]
 HELPER = COLLECTIVES_DIR / "tilexr_collective_profile_report.py"
+SINGLE_HOST_KERNEL_STAGES = [
+    ("kernel_total", 0),
+    ("chunk_total", 1),
+    ("post_sync", 2),
+    ("local_input_to_ipc", 3),
+    ("flag_poll_wait", 4),
+    ("peer_ipc_to_output", 5),
+    ("chunk_barrier", 6),
+]
 
 
 def make_stat(rank, core, stage, stage_id, first, duration, count=1, raw_cycles=None, max_cycles=None):
@@ -34,7 +43,8 @@ def make_stat(rank, core, stage, stage_id, first, duration, count=1, raw_cycles=
     }
 
 
-def write_trace(root, rank, launch, message_bytes=1024, schema="tilexr_perf_trace_report.v1", rank_size=2):
+def write_trace(root, rank, launch, message_bytes=1024, schema="tilexr_perf_trace_report.v1", rank_size=2,
+                block_dim=2, max_core_count=2):
     launch_dir = root / f"rank{rank}" / f"launch{launch}"
     launch_dir.mkdir(parents=True)
     base = 1000000 + rank * 100000 + launch * 10000
@@ -49,8 +59,8 @@ def write_trace(root, rank, launch, message_bytes=1024, schema="tilexr_perf_trac
         "op_type": 3,
         "op_name": "TileXRAllGather",
         "rank_size": rank_size,
-        "max_core_count": 2,
-        "block_dim": 2,
+        "max_core_count": max_core_count,
+        "block_dim": block_dim,
         "stage_count": 7,
         "cycle_to_us_divisor": 50,
         "message_bytes": message_bytes,
@@ -60,6 +70,48 @@ def write_trace(root, rank, launch, message_bytes=1024, schema="tilexr_perf_trac
     (launch_dir / "report.html").write_text("<html>single launch</html>\n", encoding="utf-8")
 
 
+def write_full_stage_trace(root, rank, launch, message_bytes=4096, rank_size=2, block_dim=2, max_core_count=2):
+    launch_dir = root / f"rank{rank}" / f"launch{launch}"
+    launch_dir.mkdir(parents=True)
+    base = 2000000 + rank * 100000 + launch * 10000
+    stats = []
+    for index, (stage, stage_id) in enumerate(SINGLE_HOST_KERNEL_STAGES):
+        first = base + index * 100
+        duration = 900 - index * 50 + rank * 10 + launch * 20
+        stats.append(make_stat(rank, 0, stage, stage_id, first, duration, count=1, max_cycles=duration))
+    trace = {
+        "schema": "tilexr_perf_trace_report.v1",
+        "op_type": 11,
+        "op_name": "TileXRProfileProbe",
+        "rank_size": rank_size,
+        "max_core_count": max_core_count,
+        "block_dim": block_dim,
+        "stage_count": len(SINGLE_HOST_KERNEL_STAGES),
+        "cycle_to_us_divisor": 50,
+        "message_bytes": message_bytes,
+        "stats": stats,
+    }
+    (launch_dir / "trace.json").write_text(json.dumps(trace, indent=2), encoding="utf-8")
+    (launch_dir / "report.html").write_text("<html>single launch</html>\n", encoding="utf-8")
+
+
+def write_host_info(root, rank, host, ip):
+    rank_dir = root / f"rank{rank}"
+    rank_dir.mkdir(parents=True, exist_ok=True)
+    (rank_dir / "host_info.json").write_text(
+        json.dumps(
+            {
+                "schema": "tilexr_collective_profile_host.v1",
+                "rank": rank,
+                "host": host,
+                "ip": ip,
+            },
+            indent=2,
+        ),
+        encoding="utf-8",
+    )
+
+
 def write_custom_trace(root, rank, launch, trace):
     launch_dir = root / f"rank{rank}" / f"launch{launch}"
     launch_dir.mkdir(parents=True)
@@ -121,6 +173,187 @@ def test_writes_chronological_multi_rank_report(self):
             self.assertIn("TileXR collective profiling run", prompt)
             self.assertIn("warmup iterations: 5", prompt)
 
+    def test_writes_perfetto_trace_for_ui_perfetto_dev(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            write_trace(root, 0, 0)
+            write_trace(root, 1, 0)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "1", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            perfetto = json.loads((root / "perfetto_trace.json").read_text(encoding="utf-8"))
+            self.assertIn("traceEvents", perfetto)
+            self.assertIn({"name": "process_name", "ph": "M", "pid": 0, "args": {"name": "rank0"}}, perfetto["traceEvents"])
+            self.assertIn({"name": "thread_name", "ph": "M", "pid": 0, "tid": 0, "args": {"name": "rank0/core0"}}, perfetto["traceEvents"])
+
+            kernel = next(
+                event
+                for event in perfetto["traceEvents"]
+                if event.get("ph") == "X" and event.get("name") == "launch0/rank0/kernel_total" and
+                event.get("pid") == 0 and event.get("tid") == 0
+            )
+            self.assertEqual(kernel["cat"], "TileXRAllGather")
+            self.assertEqual(kernel["ts"], 0.0)
+            self.assertEqual(kernel["dur"], 20.0)
+            self.assertEqual(kernel["args"]["stage"], "kernel_total")
+            self.assertEqual(kernel["args"]["launch_id"], 0)
+            self.assertEqual(kernel["args"]["source"], "rank0/launch0/trace.json")
+            self.assertEqual(kernel["args"]["message_bytes"], 1024)
+            self.assertEqual(kernel["args"]["rank_size"], 2)
+
+    def test_rank_level_summary_highlights_slowest_rank(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            for launch in (0, 1):
+                write_trace(root, 0, launch)
+                write_trace(root, 1, launch)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "2", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            index = json.loads((root / "trace_index.json").read_text(encoding="utf-8"))
+            summary = index["groups"][0]["summary"]
+            self.assertEqual(summary["slowest_rank"]["rank"], 1)
+            self.assertEqual(summary["slowest_rank"]["slowest_launch_id"], 1)
+            self.assertAlmostEqual(summary["slowest_rank"]["avg_kernel_us"], 21.2)
+            self.assertAlmostEqual(summary["slowest_rank"]["max_kernel_us"], 22.2)
+
+            rank_kernel = {item["rank"]: item for item in summary["rank_kernel"]}
+            self.assertEqual(rank_kernel[0]["launch_count"], 2)
+            self.assertAlmostEqual(rank_kernel[0]["avg_kernel_us"], 21.0)
+            self.assertAlmostEqual(rank_kernel[0]["max_kernel_us"], 22.0)
+            self.assertEqual(rank_kernel[1]["launch_count"], 2)
+            self.assertAlmostEqual(rank_kernel[1]["avg_kernel_us"], 21.2)
+            self.assertAlmostEqual(rank_kernel[1]["max_kernel_us"], 22.2)
+
+            analysis = (root / "analysis.md").read_text(encoding="utf-8")
+            self.assertIn("Slowest rank: rank1 avg 21.200 us max 22.200 us at launch1", analysis)
+            self.assertIn("Rank kernel totals: rank1 avg=21.200 us max=22.200 us; rank0 avg=21.000 us max=22.000 us", analysis)
+
+            html = (root / "report.html").read_text(encoding="utf-8")
+            self.assertIn("Rank-Level Summary", html)
+            self.assertIn("rank1", html)
+            self.assertIn("21.200", html)
+            self.assertIn("22.200", html)
+
+    def test_multihost_report_preserves_single_host_kernel_stage_granularity(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            for launch in (0, 1):
+                write_full_stage_trace(root, 0, launch)
+                write_full_stage_trace(root, 1, launch)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "2", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            expected_stage_names = [stage for stage, _ in SINGLE_HOST_KERNEL_STAGES]
+            index = json.loads((root / "trace_index.json").read_text(encoding="utf-8"))
+            group = index["groups"][0]
+            self.assertEqual(group["op_name"], "TileXRProfileProbe")
+            self.assertEqual(group["stage_count"], len(expected_stage_names))
+            self.assertFalse(index["diagnostics"])
+
+            stages_by_rank_launch = {}
+            for bar in group["bars"]:
+                stages_by_rank_launch.setdefault((bar["rank"], bar["launch_id"]), set()).add(bar["stage"])
+
+            for launch in (0, 1):
+                for rank in (0, 1):
+                    self.assertEqual(
+                        sorted(stages_by_rank_launch[(rank, launch)]),
+                        sorted(expected_stage_names),
+                    )
+
+            html = (root / "report.html").read_text(encoding="utf-8")
+            perfetto = json.loads((root / "perfetto_trace.json").read_text(encoding="utf-8"))
+            perfetto_names = {
+                event.get("name")
+                for event in perfetto["traceEvents"]
+                if event.get("ph") == "X"
+            }
+            for stage in expected_stage_names:
+                self.assertIn(stage, html)
+                self.assertIn(f"launch0/rank0/{stage}", perfetto_names)
+                self.assertIn(f"launch1/rank1/{stage}", perfetto_names)
+
+    def test_host_metadata_is_included_in_html_analysis_and_perfetto(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            write_host_info(root, 0, "host62", "141.62.24.62")
+            write_host_info(root, 1, "host70", "141.62.24.70")
+            write_trace(root, 0, 0)
+            write_trace(root, 1, 0)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "1", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            index = json.loads((root / "trace_index.json").read_text(encoding="utf-8"))
+            self.assertEqual(index["hosts"]["0"]["host"], "host62")
+            self.assertEqual(index["hosts"]["0"]["ip"], "141.62.24.62")
+            self.assertEqual(index["hosts"]["1"]["host"], "host70")
+            self.assertEqual(index["hosts"]["1"]["ip"], "141.62.24.70")
+            rank0_bar = next(bar for bar in index["groups"][0]["bars"] if bar["rank"] == 0)
+            rank1_bar = next(bar for bar in index["groups"][0]["bars"] if bar["rank"] == 1)
+            self.assertEqual(rank0_bar["host"], "host62")
+            self.assertEqual(rank1_bar["host_ip"], "141.62.24.70")
+
+            analysis = (root / "analysis.md").read_text(encoding="utf-8")
+            self.assertIn("Hosts: rank0@host62(141.62.24.62), rank1@host70(141.62.24.70)", analysis)
+            self.assertIn("Slowest rank: rank1@host70", analysis)
+
+            html = (root / "report.html").read_text(encoding="utf-8")
+            self.assertIn("Host", html)
+            self.assertIn("rank1@host70", html)
+            self.assertIn("141.62.24.70", html)
+
+            perfetto = json.loads((root / "perfetto_trace.json").read_text(encoding="utf-8"))
+            self.assertIn(
+                {"name": "process_name", "ph": "M", "pid": 1, "args": {"name": "rank1@host70"}},
+                perfetto["traceEvents"],
+            )
+            kernel = next(
+                event
+                for event in perfetto["traceEvents"]
+                if event.get("ph") == "X" and event.get("name") == "launch0/rank1@host70/kernel_total"
+            )
+            self.assertEqual(kernel["args"]["host"], "host70")
+            self.assertEqual(kernel["args"]["host_ip"], "141.62.24.70")
+
+    def test_perfetto_trace_offsets_launches_and_adds_alignment_windows(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            for launch in (0, 1):
+                write_trace(root, 0, launch)
+                write_trace(root, 1, launch)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "2", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            perfetto = json.loads((root / "perfetto_trace.json").read_text(encoding="utf-8"))
+            launch0_kernel = next(
+                event for event in perfetto["traceEvents"]
+                if event.get("ph") == "X" and event.get("name") == "launch0/rank0/kernel_total"
+            )
+            launch1_kernel = next(
+                event for event in perfetto["traceEvents"]
+                if event.get("ph") == "X" and event.get("name") == "launch1/rank0/kernel_total"
+            )
+            self.assertGreater(launch1_kernel["ts"], launch0_kernel["ts"])
+            self.assertEqual(launch1_kernel["args"]["normalized_ts"], 0.0)
+            self.assertEqual(launch1_kernel["args"]["launch_offset_us"], 70.2)
+
+            marker_thread = {"name": "thread_name", "ph": "M", "pid": 0, "tid": 1000000, "args": {"name": "rank0/launch_windows"}}
+            self.assertIn(marker_thread, perfetto["traceEvents"])
+            window = next(
+                event for event in perfetto["traceEvents"]
+                if event.get("ph") == "X" and event.get("name") == "launch1/rank0/window"
+            )
+            self.assertEqual(window["cat"], "launch_window")
+            self.assertEqual(window["ts"], 70.2)
+            self.assertEqual(window["args"]["rank"], 0)
+            self.assertEqual(window["args"]["launch_id"], 1)
+
     def test_preserves_sparse_launch_ids_and_reports_missing_trace(self):
         with tempfile.TemporaryDirectory() as tmp:
             root = Path(tmp)
@@ -205,6 +438,68 @@ def test_keeps_incompatible_message_bytes_out_of_valid_group(self):
             self.assertIn("message_bytes=2048", joined)
             self.assertIn("rank1/launch0/trace.json", joined)
 
+    def test_keeps_incompatible_kernel_dimensions_out_of_valid_group(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            write_trace(root, 0, 0, block_dim=4, max_core_count=4)
+            write_trace(root, 1, 0, block_dim=2, max_core_count=2)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "1", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            index = json.loads((root / "trace_index.json").read_text(encoding="utf-8"))
+            self.assertEqual(len(index["groups"]), 2)
+            joined = "\n".join(index["diagnostics"])
+            self.assertIn("incompatible trace groups detected", joined)
+            self.assertIn("max_core_count=4 block_dim=4", joined)
+            self.assertIn("rank0/launch0/trace.json", joined)
+            self.assertIn("max_core_count=2 block_dim=2", joined)
+            self.assertIn("rank1/launch0/trace.json", joined)
+
+    def test_incomplete_kernel_trace_is_reported_and_visible_in_perfetto(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            trace = {
+                "schema": "tilexr_perf_trace_report.v1",
+                "incomplete": True,
+                "incomplete_reason": "aclrtSynchronizeEvent stop failed ret=507035",
+                "op_type": 3,
+                "op_name": "TileXRAllGather",
+                "rank_size": 2,
+                "max_core_count": 4,
+                "block_dim": 4,
+                "stage_count": 7,
+                "cycle_to_us_divisor": 50,
+                "message_bytes": 4096,
+                "stats": [],
+            }
+            write_custom_trace(root, 0, 0, trace)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "1", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            index = json.loads((root / "trace_index.json").read_text(encoding="utf-8"))
+            self.assertEqual(index["groups"][0]["block_dim"], 4)
+            self.assertEqual(index["groups"][0]["max_core_count"], 4)
+            self.assertTrue(index["groups"][0]["trace_statuses"][0]["incomplete"])
+            self.assertIn("aclrtSynchronizeEvent stop failed", index["groups"][0]["trace_statuses"][0]["reason"])
+            joined = "\n".join(index["diagnostics"])
+            self.assertIn("incomplete trace in rank0/launch0/trace.json", joined)
+            self.assertIn("missing trace for rank1 launch0", joined)
+
+            analysis = (root / "analysis.md").read_text(encoding="utf-8")
+            self.assertIn("incomplete trace in rank0/launch0/trace.json", analysis)
+            html = (root / "report.html").read_text(encoding="utf-8")
+            self.assertIn("aclrtSynchronizeEvent stop failed ret=507035", html)
+
+            perfetto = json.loads((root / "perfetto_trace.json").read_text(encoding="utf-8"))
+            event = next(
+                event for event in perfetto["traceEvents"]
+                if event.get("ph") == "X" and event.get("name") == "launch0/rank0/incomplete_trace"
+            )
+            self.assertEqual(event["cat"], "trace_status")
+            self.assertEqual(event["args"]["reason"], "aclrtSynchronizeEvent stop failed ret=507035")
+
     def test_rank_core_max_uses_stage_max_cycles(self):
         with tempfile.TemporaryDirectory() as tmp:
             root = Path(tmp)
@@ -235,6 +530,54 @@ def test_rank_core_max_uses_stage_max_cycles(self):
             self.assertEqual(summary["rank_core_max"]["stage"], "short_peak")
             self.assertEqual(summary["rank_core_max"]["max_us"], 18.0)
 
+    def test_count_zero_slots_are_ignored_even_when_cycles_are_nonzero(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            trace = {
+                "schema": "tilexr_perf_trace_report.v1",
+                "op_type": 3,
+                "op_name": "TileXRAllGather",
+                "rank_size": 2,
+                "max_core_count": 2,
+                "block_dim": 2,
+                "stage_count": 7,
+                "cycle_to_us_divisor": 50,
+                "message_bytes": 1024,
+                "stats": [
+                    make_stat(1, 0, "kernel_total", 0, 1000, 500, count=1, max_cycles=500),
+                    {
+                        "rank": 0,
+                        "core": 0,
+                        "stage": "kernel_total",
+                        "stage_id": 0,
+                        "count": 0,
+                        "raw_cycles": 500000,
+                        "min_cycles": 500000,
+                        "max_cycles": 500000,
+                        "first_start_cycle": 1000,
+                        "last_end_cycle": 501000,
+                        "aux0": 0,
+                        "aux1": 0,
+                        "sum_us": 10000.0,
+                    },
+                ],
+            }
+            write_custom_trace(root, 1, 0, trace)
+
+            result = run_helper(root, "--warmup-iters", "0", "--iters", "1", "--profile-sample-every", "1")
+            self.assertEqual(result.returncode, 0, result.stderr)
+
+            index = json.loads((root / "trace_index.json").read_text(encoding="utf-8"))
+            bars = index["groups"][0]["bars"]
+            self.assertEqual(len(bars), 1)
+            self.assertEqual(bars[0]["rank"], 1)
+            self.assertEqual(index["groups"][0]["summary"]["slowest_rank"]["rank"], 1)
+
+            perfetto = json.loads((root / "perfetto_trace.json").read_text(encoding="utf-8"))
+            event_names = [event.get("name") for event in perfetto["traceEvents"] if event.get("ph") == "X"]
+            self.assertIn("launch0/rank1/kernel_total", event_names)
+            self.assertNotIn("launch0/rank0/kernel_total", event_names)
+
     def test_missing_kernel_total_renders_unavailable_slowest_launch(self):
         with tempfile.TemporaryDirectory() as tmp:
             root = Path(tmp)
diff --git a/tests/collectives/unit/test_prepare_host_launch_context.cpp b/tests/collectives/unit/test_prepare_host_launch_context.cpp
index 5060b84..5906871 100644
--- a/tests/collectives/unit/test_prepare_host_launch_context.cpp
+++ b/tests/collectives/unit/test_prepare_host_launch_context.cpp
@@ -1,5 +1,6 @@
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <vector>
@@ -208,6 +209,8 @@ void CheckPerfTraceLaunchMetadata()
     CheckMagic("trace cycleToUsDivisor", session.header.cycleToUsDivisor, 1000);
     CheckMagic("trace stats size", static_cast<int64_t>(session.hostStats.size()),
                static_cast<int64_t>(commArgs.rankSize) * 4 * TileXR::TILEXR_PERF_STAGE_COUNT);
+    CheckMagic("trace stats offset", static_cast<int64_t>(session.header.statsOffset),
+               TileXR::TILEXR_PERF_TRACE_STATS_OFFSET);
 
     deviceTrace = reinterpret_cast<const void *>(0x1);
     CheckStatus("PreparePerfTraceLaunch reducescatter message bytes",
@@ -367,6 +370,58 @@ void CheckPerfTraceRuntimeHooks()
     TileXRCollectives::Host::SetPerfTraceRuntimeHooksForTest(nullptr);
 }
 
+void CheckKernelProfilingCanBeDisabledWithoutLosingHostMetadata()
+{
+#if defined(_WIN32)
+    _putenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING=1");
+#else
+    setenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING", "1", 1);
+#endif
+    ResetFakeRuntime();
+    TileXRCollectives::Host::SetPerfTraceRuntimeHooksForTest(&kFakeRuntimeHooks);
+
+    TileXRCollectivePerfConfig config {};
+    config.enabled = 1;
+    config.outputDir = "/tmp/tilexr_perf_prepare_disabled_kernel";
+    config.sampleEveryN = 1;
+    TileXRCollectivePerfSession opaqueSession = nullptr;
+    CheckStatus("create perf session for disabled kernel profiling",
+                TileXRCollectivePerfSessionCreate(&config, &opaqueSession), TileXR::TILEXR_SUCCESS);
+    TileXRCollectives::Host::PerfTraceSession *session =
+        static_cast<TileXRCollectives::Host::PerfTraceSession *>(opaqueSession);
+
+    TileXR::CommArgs commArgs {};
+    commArgs.rank = 1;
+    commArgs.rankSize = 2;
+
+    const void *deviceTrace = reinterpret_cast<const void *>(0x1);
+    CheckStatus("PreparePerfTraceLaunch disabled kernel profiling",
+                TileXRCollectives::Host::PreparePerfTraceLaunch(
+                    session, commArgs, TileXR::TileXRType::ALL_GATHER,
+                    TileXR::TILEXR_DATA_TYPE_INT32, 4, 1024, nullptr, &deviceTrace),
+                TileXR::TILEXR_SUCCESS);
+    CheckPointer("disabled kernel profiling deviceTrace", deviceTrace, nullptr);
+    CheckTrue("disabled kernel profiling not ready", !session->deviceTraceReady);
+    CheckMagic("disabled kernel profiling malloc calls", g_mallocCalls, 0);
+    CheckMagic("disabled kernel profiling rank", session->header.rank, 1);
+    CheckMagic("disabled kernel profiling rankSize", session->header.rankSize, 2);
+    CheckMagic("disabled kernel profiling stats size", static_cast<int64_t>(session->hostStats.size()),
+               static_cast<int64_t>(commArgs.rankSize) * 4 * TileXR::TILEXR_PERF_STAGE_COUNT);
+    CheckMagic("disabled kernel profiling stats offset", static_cast<int64_t>(session->header.statsOffset),
+               TileXR::TILEXR_PERF_TRACE_STATS_OFFSET);
+
+    CheckStatus("write disabled kernel profiling report",
+                TileXRCollectivePerfWriteReport(opaqueSession), TileXR::TILEXR_SUCCESS);
+    CheckStatus("destroy disabled kernel profiling session",
+                TileXRCollectivePerfSessionDestroy(opaqueSession), TileXR::TILEXR_SUCCESS);
+    TileXRCollectives::Host::SetPerfTraceRuntimeHooksForTest(nullptr);
+#if defined(_WIN32)
+    _putenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING=");
+#else
+    unsetenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING");
+#endif
+}
+
 } // namespace
 
 int main()
@@ -374,6 +429,7 @@ int main()
     CheckKernelArgsHasPerfTrace();
     CheckPerfTraceLaunchMetadata();
     CheckPerfTraceRuntimeHooks();
+    CheckKernelProfilingCanBeDisabledWithoutLosingHostMetadata();
 
     TileXRCollectives::Host::HostLaunchContext context;
 
diff --git a/tests/collectives/unit/test_tilexr_collectives_api.cpp b/tests/collectives/unit/test_tilexr_collectives_api.cpp
index 3713d93..824affe 100644
--- a/tests/collectives/unit/test_tilexr_collectives_api.cpp
+++ b/tests/collectives/unit/test_tilexr_collectives_api.cpp
@@ -238,6 +238,51 @@ void TestCollectivesBuildDefinesSeparateSharedLibrary()
     CheckDoesNotContain(path, text, "${CMAKE_INSTALL_PREFIX}/include");
 }
 
+void TestCollectivesKernelBuildSupportsAscend950Arch()
+{
+    const std::string path = "src/collectives/kernels/CMakeLists.txt";
+    const auto text = ReadFile(path);
+
+    CheckContains(path, text, "TILEXR_COLLECTIVES_SOC_TYPE");
+    CheckContains(path, text, "Ascend950");
+    CheckContains(path, text, "dav-c310-vec");
+    CheckContains(path, text, "dav-c220-vec");
+    CheckContains(path, text, "--cce-aicore-arch=${TILEXR_COLLECTIVES_AICORE_ARCH}");
+}
+
+void TestCollectivesKernelSourcesAllowAscend950Macros()
+{
+    const std::string kernelTuPath = "src/collectives/kernels/tilexr_lccl_op.cpp";
+    const auto kernelTu = ReadFile(kernelTuPath);
+    CheckContains(kernelTuPath, kernelTu, "__DAV_C220_VEC__");
+    CheckContains(kernelTuPath, kernelTu, "__DAV_C310_VEC__");
+    CheckContains(kernelTuPath, kernelTu, "LCCL_TYPE_AIV_FUNC(LCCL_ALLGATHER_FUNC_AUTO_DEF)");
+
+    const std::string lcclOpPath = "src/collectives/kernels/lccl_op.h";
+    const auto lcclOp = ReadFile(lcclOpPath);
+    CheckContains(lcclOpPath, lcclOp, "__DAV_C220_VEC__");
+    CheckContains(lcclOpPath, lcclOp, "__DAV_C310_VEC__");
+    CheckContains(lcclOpPath, lcclOp, "TileXRAllGather_##type##suffix");
+
+    const std::string collectivesCcePath = "src/collectives/kernels/kernels/collectives.cce";
+    const auto collectivesCce = ReadFile(collectivesCcePath);
+    CheckContains(collectivesCcePath, collectivesCce, "__DAV_C220_VEC__");
+    CheckContains(collectivesCcePath, collectivesCce, "__DAV_C310_VEC__");
+    CheckContains(collectivesCcePath, collectivesCce, "#define __aicore__");
+    CheckContains(collectivesCcePath, collectivesCce, "#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)");
+
+    const std::string collectivesHeaderPath = "src/collectives/kernels/collectives.h";
+    const auto collectivesHeader = ReadFile(collectivesHeaderPath);
+    CheckContains(collectivesHeaderPath, collectivesHeader, "#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)");
+
+    const std::string dataCopyPath = "src/collectives/kernels/datacopy_gm2gm.h";
+    const auto dataCopy = ReadFile(dataCopyPath);
+    CheckContains(dataCopyPath, dataCopy, "#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__)");
+    CheckContains(dataCopyPath, dataCopy, "TileXRAtomicTypeSupported");
+    CheckContains(dataCopyPath, dataCopy, "std::is_same_v<T, int32_t>");
+    CheckContains(dataCopyPath, dataCopy, "if constexpr (TileXRAtomicTypeSupported<T>::value)");
+}
+
 void TestRootBuildRegistersCollectivesTests()
 {
     const std::string path = "CMakeLists.txt";
@@ -359,6 +404,8 @@ int main()
     TestCommInternalDoesNotContainCollectiveRegistration();
     TestCommBuildInstallsPublicHeadersAndKeepsLinksPrivate();
     TestCollectivesBuildDefinesSeparateSharedLibrary();
+    TestCollectivesKernelBuildSupportsAscend950Arch();
+    TestCollectivesKernelSourcesAllowAscend950Macros();
     TestRootBuildRegistersCollectivesTests();
     TestCollectivesTestBuildSupportsInTreeAndInstallPrefixModes();
     TestCollectivesInstallSmokeUsesStandaloneInstallPrefixMode();
diff --git a/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp b/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp
index 170934d..76f0f3f 100644
--- a/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp
+++ b/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp
@@ -149,6 +149,7 @@ void TestCollectivesOwnsCceBuild()
     CheckContains(kernelsCmakePath, kernelsCmake, "tilexr_collectives_op.o");
     CheckContains(kernelsCmakePath, kernelsCmake, "tilexr_collectives_op");
     CheckContains(kernelsCmakePath, kernelsCmake, "TILEXR_COLLECTIVES_ENABLE_PROFILING");
+    CheckContains(kernelsCmakePath, kernelsCmake, "TILEXR_COLLECTIVES_1OP_BIN_SIZE 10485760");
     CheckDoesNotContain(kernelsCmakePath, kernelsCmake, "src/comm");
 }
 
@@ -161,6 +162,7 @@ void TestCollectivesKernelSourcesAreScoped()
     CheckContains(kernelTuPath, kernelTu, "LCCL_TYPE_AIV_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF)");
     CheckContains(kernelTuPath, kernelTu, "LCCL_TYPE_AIV_FUNC(LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF)");
     CheckContains(kernelTuPath, kernelTu, "LCCL_BROADCAST_FUNC_AUTO_DEF()");
+    CheckContains(kernelTuPath, kernelTu, "LCCL_PROFILE_PROBE_FUNC_AUTO_DEF()");
 
     const std::string perfTraceKernelPath = "src/collectives/kernels/perf_trace_kernel.h";
     const auto perfTraceKernel = ReadFile(perfTraceKernelPath);
@@ -170,6 +172,34 @@ void TestCollectivesKernelSourcesAreScoped()
     CheckContains(perfTraceKernelPath, perfTraceKernel, "TileXRPerfAccumulateDuration");
     CheckContains(perfTraceKernelPath, perfTraceKernel, "TileXRPerfTraceEnabled");
     CheckContains(perfTraceKernelPath, perfTraceKernel, "#include \"comm_args.h\"");
+    CheckContains(perfTraceKernelPath, perfTraceKernel, "#include \"datacopy_gm2gm.h\"");
+    CheckContains(perfTraceKernelPath, perfTraceKernel, "TILEXR_PERF_TRACE_STATS_UB_OFFSET");
+    CheckContains(perfTraceKernelPath, perfTraceKernel, "TILEXR_PERF_TRACE_STATS_OFFSET");
+    CheckContains(perfTraceKernelPath, perfTraceKernel, "CpGM2UB");
+    CheckContains(perfTraceKernelPath, perfTraceKernel, "CpUB2GM");
+    CheckContains(perfTraceKernelPath, perfTraceKernel, "GetBlockNum()");
+    CheckDoesNotContain(perfTraceKernelPath, perfTraceKernel, "header->statsOffset");
+    CheckDoesNotContain(perfTraceKernelPath, perfTraceKernel, "header->maxCoreCount");
+    CheckDoesNotContain(perfTraceKernelPath, perfTraceKernel, "header->stageCount");
+    CheckDoesNotContain(perfTraceKernelPath, perfTraceKernel, "slot->count += 1");
+
+    const std::string profileProbePath = "src/collectives/kernels/kernels/lcal_profile_probe.cce";
+    const auto profileProbe = ReadFile(profileProbePath);
+    CheckContains(profileProbePath, profileProbe, "TileXRProfileProbeKernel");
+    CheckContains(profileProbePath, profileProbe, "PerfStageId::KERNEL_TOTAL");
+    CheckContains(profileProbePath, profileProbe, "PerfStageId::CHUNK_TOTAL");
+    CheckContains(profileProbePath, profileProbe, "PerfStageId::POST_SYNC");
+    CheckContains(profileProbePath, profileProbe, "PerfStageId::LOCAL_INPUT_TO_IPC");
+    CheckContains(profileProbePath, profileProbe, "PerfStageId::FLAG_POLL_WAIT");
+    CheckContains(profileProbePath, profileProbe, "PerfStageId::PEER_IPC_TO_OUTPUT");
+    CheckContains(profileProbePath, profileProbe, "PerfStageId::CHUNK_BARRIER");
+    CheckContains(profileProbePath, profileProbe, "TileXRPerfStageBegin");
+    CheckContains(profileProbePath, profileProbe, "TileXRPerfStageEnd");
+    CheckContains(profileProbePath, profileProbe, "TileXRPerfAccumulateDuration");
+    CheckContains(profileProbePath, profileProbe, "TILEXR_PROFILE_PROBE_COPY_UB_BYTES");
+    CheckContains(profileProbePath, profileProbe, "TileXR::TILEXR_PERF_TRACE_STATS_UB_OFFSET");
+    CheckDoesNotContain(profileProbePath, profileProbe, "peerMems");
+    CheckDoesNotContain(profileProbePath, profileProbe, "GET_IPC_MEM_ARGS");
 
     const std::string perfTraceLayoutPath = "src/include/tilexr_perf_trace.h";
     const auto perfTraceLayout = ReadFile(perfTraceLayoutPath);
@@ -184,6 +214,7 @@ void TestCollectivesKernelSourcesAreScoped()
     bool sawAllReduceCce = false;
     bool sawReduceScatterCce = false;
     bool sawBroadcastCce = false;
+    bool sawProfileProbeCce = false;
     for (const auto &path : kernelFiles) {
         const auto text = ReadFile(path);
         CheckDoesNotContain(path, text, "tilexr_comm.h");
@@ -204,12 +235,16 @@ void TestCollectivesKernelSourcesAreScoped()
         if (path.find("lcal_broadcast") != std::string::npos && path.find(".cce") != std::string::npos) {
             sawBroadcastCce = true;
         }
+        if (path.find("lcal_profile_probe.cce") != std::string::npos) {
+            sawProfileProbeCce = true;
+        }
     }
     CheckTrue(sawAllGatherCce, "expected copied allgather .cce sources under src/collectives/kernels");
     CheckTrue(sawAllToAllCce, "expected copied all2all .cce source under src/collectives/kernels");
     CheckTrue(sawAllReduceCce, "expected copied allreduce .cce sources under src/collectives/kernels");
     CheckTrue(sawReduceScatterCce, "expected copied reduce_scatter .cce sources under src/collectives/kernels");
     CheckTrue(sawBroadcastCce, "expected copied broadcast .cce sources under src/collectives/kernels");
+    CheckTrue(sawProfileProbeCce, "expected profile probe .cce source under src/collectives/kernels");
 }
 
 void TestHostRegistrationLivesInCollectives()
@@ -242,6 +277,7 @@ void TestHostRegistrationLivesInCollectives()
     CheckContains(registeredTypesPath, registeredTypes, "TileXR::TileXRType::ALL_REDUCE");
     CheckContains(registeredTypesPath, registeredTypes, "TileXR::TileXRType::REDUCE_SCATTER");
     CheckContains(registeredTypesPath, registeredTypes, "TileXR::TileXRType::BROADCAST");
+    CheckContains(registeredTypesPath, registeredTypes, "TileXR::TileXRType::PROFILE_PROBE");
 }
 
 void TestPerfTraceCycleDivisorIsA5Specific()
diff --git a/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp b/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp
index 18fe37e..f5b5962 100644
--- a/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp
+++ b/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp
@@ -122,7 +122,14 @@ void TestPerfToolSource()
     CheckContains(path, text, "--profile-dir");
     CheckContains(path, text, "--profile-ai-prompt");
     CheckContains(path, text, "--profile-sample-every");
-    CheckContains(path, text, "--op allgather|alltoall|allreduce|reducescatter|broadcast");
+    CheckContains(path, text, "--comm-mode");
+    CheckContains(path, text, "TileXRCommInitRank(");
+    CheckContains(path, text, "TileXRGetUniqueId(");
+    CheckContains(path, text, "WriteProfileHostInfo");
+    CheckContains(path, text, "--op allgather|alltoall|allreduce|reducescatter|broadcast|profile-probe");
+    CheckContains(path, text, "CollectiveOp::NOOP");
+    CheckContains(path, text, "CollectiveOp::PROFILE_PROBE");
+    CheckContains(path, text, "TileXRProfileProbe");
     CheckContains(path, text, "TileXRAllReduce");
     CheckContains(path, text, "TileXRReduceScatter");
     CheckContains(path, text, "TileXRBroadcast");
@@ -279,6 +286,26 @@ void TestLauncherScripts()
     CheckContains(perfPath, perf, "--profile-sample-every");
     CheckContains(perfPath, perf, "is_true_bool");
     CheckContains(perfPath, perf, "yes");
+
+    const std::string multiHostPerfPath = "tests/collectives/run_collective_perf_multihost.sh";
+    CheckFileExists(multiHostPerfPath);
+    const auto multiHostPerf = ReadFile(multiHostPerfPath);
+    CheckContains(multiHostPerfPath, multiHostPerf, "TILEXR_MULTIHOST_PEERS");
+    CheckContains(multiHostPerfPath, multiHostPerf, "TILEXR_COMM_ID");
+    CheckContains(multiHostPerfPath, multiHostPerf, "--comm-mode socket");
+    CheckContains(multiHostPerfPath, multiHostPerf, "tilexr_collective_profile_report.py");
+    CheckContains(multiHostPerfPath, multiHostPerf, "copy_rank_profile");
+    CheckContains(multiHostPerfPath, multiHostPerf, "command -v rsync");
+    CheckContains(multiHostPerfPath, multiHostPerf, "tar -cf - .");
+    CheckContains(multiHostPerfPath, multiHostPerf, "tar -xf - -C");
+    CheckContains(multiHostPerfPath, multiHostPerf, "rank${rank}");
+    CheckContains(multiHostPerfPath, multiHostPerf, "host_label=\"${target#*@}\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "shift 4");
+    CheckContains(multiHostPerfPath, multiHostPerf, "set +u");
+    CheckContains(multiHostPerfPath, multiHostPerf, "ASCEND_PROCESS_LOG_PATH");
+    CheckContains(multiHostPerfPath, multiHostPerf, "plog/rank${rank}");
+    CheckContains(multiHostPerfPath, multiHostPerf,
+        "LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives");
 }
 
 void TestCMakeWiring()
@@ -291,9 +318,13 @@ void TestCMakeWiring()
     CheckContains(path, text, "add_executable(tilexr_collective_perf");
     CheckContains(path, text, "target_link_libraries(test_tilexr_collectives_correctness");
     CheckContains(path, text, "target_link_libraries(tilexr_collective_perf");
+    CheckContains(path, text, "TileXRAddAscendRpathLink");
+    CheckContains(path, text, "LINKER:-rpath-link,${ASCEND_HOME_PATH}/${ARCH}-linux/lib64");
+    CheckContains(path, text, "LINKER:-rpath-link,${ASCEND_DRIVER_PATH}/lib64/driver");
     CheckContains(path, text, "test_tilexr_collectives_tools_sources");
     CheckContains(path, text, "run_collectives_correctness.sh");
     CheckContains(path, text, "run_collective_perf.sh");
+    CheckContains(path, text, "run_collective_perf_multihost.sh");
     CheckContains(path, text, "find_package(Python3 COMPONENTS Interpreter)");
     CheckContains(path, text, "test_collective_profile_report");
     CheckContains(path, text, "tilexr_collective_profile_report.py");
@@ -324,14 +355,22 @@ void TestReadmeDocumentsManualRuns()
     CheckContains(path, text, "algbw(GB/s)");
     CheckContains(path, text, "busbw(GB/s)");
     CheckContains(path, text, "actual send bytes per rank");
-    CheckContains(path, text, "--op allgather|alltoall|allreduce|reducescatter|broadcast");
+    CheckContains(path, text, "--op allgather|alltoall|allreduce|reducescatter|broadcast|profile-probe");
     CheckContains(path, text,
         "Message-size semantics: allgather/allreduce/broadcast: count * dtype_size; alltoall/reducescatter: count * rank_size * dtype_size.");
     CheckContains(path, text, "Checked");
     CheckContains(path, text, "allreduce/reducescatter runs require `--datatype int32`");
     CheckContains(path, text, "rank_size - 1");
     CheckContains(path, text, "trace_index.json");
+    CheckContains(path, text, "perfetto_trace.json");
     CheckContains(path, text, "root-level report.html");
+    CheckContains(path, text, "rank-level summary");
+    CheckContains(path, text, "launch/rank/stage event names");
+    CheckContains(path, text, "Multi-Host Profiling");
+    CheckContains(path, text, "TILEXR_MULTIHOST_PEERS");
+    CheckContains(path, text, "--comm-mode socket");
+    CheckContains(path, text, "--op profile-probe");
+    CheckContains(path, text, "launch0/rank1@141.62.24.70/kernel_total");
     CheckContains(path, text, "tilexr_collective_profile_report.py");
     CheckContains(path, text, "zoomable chronological timeline");
     CheckContains(path, text, "warmup launches are not profiled");
@@ -341,6 +380,23 @@ void TestReadmeDocumentsManualRuns()
     CheckContains(path, text, "manual");
 }
 
+void TestProfileReportHelperDocumentsSlowRankAndPerfettoEvents()
+{
+    const std::string path = "tests/collectives/tilexr_collective_profile_report.py";
+    CheckFileExists(path);
+    const auto text = ReadFile(path);
+    CheckContains(path, text, "rank_kernel");
+    CheckContains(path, text, "slowest_rank");
+    CheckContains(path, text, "Rank-Level Summary");
+    CheckContains(path, text, "PERFETTO_LAUNCH_WINDOW_TID");
+    CheckContains(path, text, "launch_window");
+    CheckContains(path, text, "launch_offset_us");
+    CheckContains(path, text, "normalized_ts");
+    CheckContains(path, text, "host_info.json");
+    CheckContains(path, text, "rank_label(index.get(\"hosts\", {}), bar[\"rank\"])");
+    CheckContains(path, text, "launch{bar['launch_id']}/{bar_rank_label}/{bar['stage']}");
+}
+
 } // namespace
 
 int main()
@@ -350,6 +406,7 @@ int main()
     TestLauncherScripts();
     TestCMakeWiring();
     TestReadmeDocumentsManualRuns();
+    TestProfileReportHelperDocumentsSlowRankAndPerfettoEvents();
     TestInt32PatternHasNoKnownCollisions();
     if (g_failures != 0) {
         std::cerr << g_failures << " collectives tools source checks failed" << std::endl;
diff --git a/tests/collectives/unit/test_tilexr_perf_trace_layout.cpp b/tests/collectives/unit/test_tilexr_perf_trace_layout.cpp
index d2cb45a..fe3d57e 100644
--- a/tests/collectives/unit/test_tilexr_perf_trace_layout.cpp
+++ b/tests/collectives/unit/test_tilexr_perf_trace_layout.cpp
@@ -34,6 +34,7 @@ int main()
     CheckEq("trace version", TileXR::TILEXR_PERF_TRACE_VERSION, 1u);
     CheckEq("max stage name", TileXR::TILEXR_PERF_MAX_STAGE_NAME, 32u);
     CheckEq("stage count", TileXR::TILEXR_PERF_STAGE_COUNT, 7u);
+    CheckEq("stats offset", TileXR::TILEXR_PERF_TRACE_STATS_OFFSET, 128u);
     CheckEq("a5 divisor", TileXR::PerfTraceCycleDivisor(TileXR::PerfChipClass::A5), 1000u);
     CheckEq("generic divisor", TileXR::PerfTraceCycleDivisor(TileXR::PerfChipClass::GENERIC), 50u);
     CheckEq("cycles to us", TileXR::PerfTraceCyclesToUs(2500, 50), 50.0);
@@ -51,7 +52,9 @@ int main()
               std::is_trivially_copyable<TileXR::TileXRPerfCoreStageStats>::value);
     CheckEq("header sizeof", sizeof(TileXR::TileXRPerfTraceHeader), static_cast<size_t>(88));
     CheckEq("stage desc sizeof", sizeof(TileXR::TileXRPerfStageDesc), static_cast<size_t>(48));
-    CheckEq("core stage stats sizeof", sizeof(TileXR::TileXRPerfCoreStageStats), static_cast<size_t>(80));
+    CheckEq("core stage stats sizeof", sizeof(TileXR::TileXRPerfCoreStageStats), static_cast<size_t>(96));
+    CheckEq("stats offset alignment", TileXR::TILEXR_PERF_TRACE_STATS_OFFSET % 32u, 0u);
+    CheckEq("core stage stats alignment", sizeof(TileXR::TileXRPerfCoreStageStats) % 32u, static_cast<size_t>(0));
 
     CheckEq("header magic offset", offsetof(TileXR::TileXRPerfTraceHeader, magic), static_cast<size_t>(0));
     CheckEq("header version offset", offsetof(TileXR::TileXRPerfTraceHeader, version), static_cast<size_t>(4));
@@ -109,6 +112,10 @@ int main()
             static_cast<size_t>(64));
     CheckEq("core stage stats aux1 offset", offsetof(TileXR::TileXRPerfCoreStageStats, aux1),
             static_cast<size_t>(72));
+    CheckEq("core stage stats aux2 offset", offsetof(TileXR::TileXRPerfCoreStageStats, aux2),
+            static_cast<size_t>(80));
+    CheckEq("core stage stats aux3 offset", offsetof(TileXR::TileXRPerfCoreStageStats, aux3),
+            static_cast<size_t>(88));
 
     TileXR::TileXRPerfTraceHeader header {};
     CheckEq("default header magic", header.magic, TileXR::TILEXR_PERF_TRACE_MAGIC);
diff --git a/tests/comm/unit/test_tilexr_source_guards.cpp b/tests/comm/unit/test_tilexr_source_guards.cpp
index cefd0f6..22d9f44 100644
--- a/tests/comm/unit/test_tilexr_source_guards.cpp
+++ b/tests/comm/unit/test_tilexr_source_guards.cpp
@@ -179,6 +179,16 @@ void TestCommBuildIncludesProfilingHeaders()
     CheckContains(commPath, commText, "${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/profiling/");
 }
 
+void TestChipMapCoversObservedAscend950Variants()
+{
+    const std::string path = "src/comm/tilexr_internal.cpp";
+    const auto text = ReadFile(path);
+
+    CheckContains(path, text, "{\"Ascend950PR\", ChipName::CHIP_950}");
+    CheckContains(path, text, "{\"Ascend950PR_9589\", ChipName::CHIP_950}");
+    CheckContains(path, text, "{\"Ascend950PR_9599\", ChipName::CHIP_950}");
+}
+
 } // namespace
 
 int main()
@@ -191,6 +201,7 @@ int main()
     TestRuntimeEnvDoesNotPrependCannDevlib();
     TestRootCMakeRespectsAscendDriverOverride();
     TestCommBuildIncludesProfilingHeaders();
+    TestChipMapCoversObservedAscend950Variants();
 
     if (g_failures != 0) {
         std::cerr << g_failures << " TileXR source guard checks failed" << std::endl;

From cb3145aeb35d0002e14df75e0123b14d0869d7ee Mon Sep 17 00:00:00 2001
From: liuhaoyu35 <liuhaoyu35@huawei.com>
Date: Tue, 30 Jun 2026 19:55:34 +0800
Subject: [PATCH 2/2] fix: complete multi-host profiling patch

---
 src/collectives/CMakeLists.txt                |  3 +
 .../lcal_allgather_2npu_big_data_write.cce    |  5 --
 .../kernels/lcal_allgather_big_data.cce       |  5 --
 src/collectives/kernels/lccl_op.h             | 28 ++++++++
 .../run_collective_perf_multihost.sh          | 71 ++++++++++++++++---
 .../tilexr-tests/tilexr_collective_perf.cpp   | 20 +++++-
 .../tilexr_collective_profile_report.py       | 22 ++++++
 ...st_tilexr_collectives_kernel_ownership.cpp | 29 +++++++-
 .../test_tilexr_collectives_tools_sources.cpp | 31 +++++++-
 9 files changed, 188 insertions(+), 26 deletions(-)

diff --git a/src/collectives/CMakeLists.txt b/src/collectives/CMakeLists.txt
index a3ebcde..9dcbde7 100644
--- a/src/collectives/CMakeLists.txt
+++ b/src/collectives/CMakeLists.txt
@@ -34,6 +34,9 @@ set(TILEXR_COLLECTIVES_SOURCE_FILE
 
 add_library(tilexr-collectives SHARED ${TILEXR_COLLECTIVES_SOURCE_FILE})
 add_dependencies(tilexr-collectives tilexr_collectives_op)
+set_source_files_properties(${TILEXR_COLLECTIVES_KERNEL_EMBED_CPP}
+        PROPERTIES OBJECT_DEPENDS "${TILEXR_COLLECTIVES_OP}"
+)
 
 target_include_directories(tilexr-collectives
         PUBLIC
diff --git a/src/collectives/kernels/kernels/lcal_allgather_2npu_big_data_write.cce b/src/collectives/kernels/kernels/lcal_allgather_2npu_big_data_write.cce
index dd51d75..1d398b7 100644
--- a/src/collectives/kernels/kernels/lcal_allgather_2npu_big_data_write.cce
+++ b/src/collectives/kernels/kernels/lcal_allgather_2npu_big_data_write.cce
@@ -111,8 +111,6 @@ inline __aicore__ void TileXRAllGather2npuBigDataWrite(ALLREDUCE_ARGS_FUN_16P(T)
 {
     const uint32_t perfCore = static_cast<uint32_t>(GetBlockIdx());
     const uint32_t perfRank = static_cast<uint32_t>(rank);
-    auto kernelTotal = TileXR::TileXRPerfStageBegin(
-        perfTrace, TileXR::PerfStageId::KERNEL_TOTAL, TileXR::PerfBarrierPolicy::NO_BARRIER);
     DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY);
     magic *= 1024;
     const int64_t dataOffsetNum = GetTileXRBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM;
@@ -167,7 +165,4 @@ inline __aicore__ void TileXRAllGather2npuBigDataWrite(ALLREDUCE_ARGS_FUN_16P(T)
             chunkTotal, TileXR::PerfBarrierPolicy::NO_BARRIER);
     }
     DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY);
-    TileXR::TileXRPerfStageEnd(
-        perfTrace, perfRank, perfCore, TileXR::PerfStageId::KERNEL_TOTAL,
-        kernelTotal, TileXR::PerfBarrierPolicy::NO_BARRIER);
 }
diff --git a/src/collectives/kernels/kernels/lcal_allgather_big_data.cce b/src/collectives/kernels/kernels/lcal_allgather_big_data.cce
index 94081ec..4a2715e 100644
--- a/src/collectives/kernels/kernels/lcal_allgather_big_data.cce
+++ b/src/collectives/kernels/kernels/lcal_allgather_big_data.cce
@@ -127,8 +127,6 @@ __attribute__((always_inline)) inline __aicore__ void TileXRAllGatherBigData(ALL
 {
     const uint32_t perfCore = static_cast<uint32_t>(GetBlockIdx());
     const uint32_t perfRank = static_cast<uint32_t>(rank);
-    auto kernelTotal = TileXR::TileXRPerfStageBegin(
-        perfTrace, TileXR::PerfStageId::KERNEL_TOTAL, TileXR::PerfBarrierPolicy::NO_BARRIER);
     DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY);
     magic *= 1024;
     const int64_t dataOffsetNum = GetTileXRBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM;
@@ -190,7 +188,4 @@ __attribute__((always_inline)) inline __aicore__ void TileXRAllGatherBigData(ALL
             chunkTotal, TileXR::PerfBarrierPolicy::NO_BARRIER);
     }
     DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY);
-    TileXR::TileXRPerfStageEnd(
-        perfTrace, perfRank, perfCore, TileXR::PerfStageId::KERNEL_TOTAL,
-        kernelTotal, TileXR::PerfBarrierPolicy::NO_BARRIER);
 }
diff --git a/src/collectives/kernels/lccl_op.h b/src/collectives/kernels/lccl_op.h
index 555d999..051c488 100644
--- a/src/collectives/kernels/lccl_op.h
+++ b/src/collectives/kernels/lccl_op.h
@@ -53,6 +53,24 @@
 
 extern "C" __global__ __aicore__ __attribute__((section("Attr_Section_TileXR"))) void TileXRDescriptor() {}
 
+struct TileXRCoarsePerfToken {
+    TileXR::TileXRPerfStageToken kernelTotal;
+    uint32_t perfRank = 0;
+    uint32_t perfCore = 0;
+};
+
+#define TILEXR_COARSE_PERF_BEGIN(tokenName) \
+    TileXRCoarsePerfToken tokenName {}; \
+    tokenName.perfRank = static_cast<uint32_t>(rank); \
+    tokenName.perfCore = static_cast<uint32_t>(GetBlockIdx()); \
+    tokenName.kernelTotal = TileXR::TileXRPerfStageBegin( \
+        perfTrace, TileXR::PerfStageId::KERNEL_TOTAL, TileXR::PerfBarrierPolicy::NO_BARRIER)
+
+#define TILEXR_COARSE_PERF_END(tokenName) \
+    TileXR::TileXRPerfStageEnd( \
+        perfTrace, tokenName.perfRank, tokenName.perfCore, TileXR::PerfStageId::KERNEL_TOTAL, \
+        tokenName.kernelTotal, TileXR::PerfBarrierPolicy::NO_BARRIER)
+
 #define CLASS_OP_QUANT_LAUNCH(name, outputType, inputType) \
 do { \
 name<outputType, inputType> opKernel(localRank, localRankSize, extraFlag); \
@@ -65,6 +83,7 @@ extern "C" __global__ __aicore__ void TileXRBroadcast##suffix(KERNELS_ARGS_FUN()
 { \
     if ASCEND_IS_AIV { \
     GET_COMM_ARGS; \
+    TILEXR_COARSE_PERF_BEGIN(coarsePerf); \
     __gm__ char * shareAddrs[TILEXR_MAX_RANK_SIZE]; \
     GET_IPC_MEM_ARGS(char); \
     if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \
@@ -72,6 +91,7 @@ extern "C" __global__ __aicore__ void TileXRBroadcast##suffix(KERNELS_ARGS_FUN()
     } else { \
         TileXRBroadcastBigData(ALLREDUCE_ARGS_CALL(char)); \
     } \
+    TILEXR_COARSE_PERF_END(coarsePerf); \
     } \
 }
 
@@ -88,6 +108,7 @@ extern "C" __global__ __aicore__ void TileXRProfileProbe(KERNELS_ARGS_FUN()) \
 extern "C" __global__ __aicore__ void TileXRAllGather_##type##suffix(KERNELS_ARGS_FUN()) { \
     if ASCEND_IS_AIV { \
     GET_COMM_ARGS; \
+    TILEXR_COARSE_PERF_BEGIN(coarsePerf); \
     constexpr int32_t quickOneshotRankSize = 2; \
     constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \
     constexpr int32_t smallRankSize = 8; \
@@ -118,6 +139,7 @@ extern "C" __global__ __aicore__ void TileXRAllGather_##type##suffix(KERNELS_ARG
             TileXRAllGatherBigData<type>(ALLREDUCE_ARGS_CALL_16P(type)); \
         } \
     } \
+    TILEXR_COARSE_PERF_END(coarsePerf); \
     } \
 }
 
@@ -125,6 +147,7 @@ extern "C" __global__ __aicore__ void TileXRAllGather_##type##suffix(KERNELS_ARG
 extern "C" __global__ __aicore__ void TileXRAllReduce_##type##suffix(KERNELS_ARGS_FUN()) { \
     if ASCEND_IS_AIV { \
     GET_COMM_ARGS; \
+    TILEXR_COARSE_PERF_BEGIN(coarsePerf); \
     constexpr int32_t quickOneshotRankSize = 2; \
     constexpr int32_t threeStepNum = 3; \
     constexpr int32_t smallRankSize = 8; \
@@ -198,6 +221,7 @@ extern "C" __global__ __aicore__ void TileXRAllReduce_##type##suffix(KERNELS_ARG
             TileXRAllReduceBigData<type>(ALLREDUCE_ARGS_CALL_16P(type)); \
         } \
     } \
+    TILEXR_COARSE_PERF_END(coarsePerf); \
     } \
 }
 
@@ -205,6 +229,7 @@ extern "C" __global__ __aicore__ void TileXRAllReduce_##type##suffix(KERNELS_ARG
 extern "C" __global__ __aicore__ void TileXRAll2All_##type##suffix(KERNELS_ARGS_FUN()) { \
     if ASCEND_IS_AIV { \
     GET_COMM_ARGS; \
+    TILEXR_COARSE_PERF_BEGIN(coarsePerf); \
     __gm__ type * shareAddrs[TILEXR_MAX_RANK_SIZE]; \
     GET_IPC_MEM_ARGS(type); \
     constexpr int32_t smallRankSize = 8; \
@@ -218,6 +243,7 @@ extern "C" __global__ __aicore__ void TileXRAll2All_##type##suffix(KERNELS_ARGS_
             CLASS_OP_LAUNCH(All2AllHierarchy, type); \
         } \
     } \
+    TILEXR_COARSE_PERF_END(coarsePerf); \
     } \
 }
 
@@ -225,6 +251,7 @@ extern "C" __global__ __aicore__ void TileXRAll2All_##type##suffix(KERNELS_ARGS_
 extern "C" __global__ __aicore__ void TileXRReduceScatter_##type##suffix(KERNELS_ARGS_FUN()) { \
     if ASCEND_IS_AIV { \
     GET_COMM_ARGS; \
+    TILEXR_COARSE_PERF_BEGIN(coarsePerf); \
     constexpr int32_t quickOneshotRankSize = 2; \
     constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \
     constexpr int32_t a3BigDataSize = 32 * 1024 * 1024; \
@@ -255,6 +282,7 @@ extern "C" __global__ __aicore__ void TileXRReduceScatter_##type##suffix(KERNELS
             TileXRReduceScatterBigData<type>(ALLREDUCE_ARGS_CALL_16P(type)); \
         } \
     } \
+    TILEXR_COARSE_PERF_END(coarsePerf); \
     } \
 }
 
diff --git a/tests/collectives/run_collective_perf_multihost.sh b/tests/collectives/run_collective_perf_multihost.sh
index b526ffb..3d3d1f7 100644
--- a/tests/collectives/run_collective_perf_multihost.sh
+++ b/tests/collectives/run_collective_perf_multihost.sh
@@ -4,18 +4,20 @@ set -euo pipefail
 usage() {
   cat >&2 <<'EOF'
 Usage:
+  TILEXR_MULTIHOST_REMOTE_REPO_DIR=/path/to/TileXR \
   TILEXR_MULTIHOST_PEERS="rank,host,ip,device;rank,host,ip,device" \
   run_collective_perf_multihost.sh profile_dir bin_dir [extra tilexr_collective_perf args...]
 
 Example:
-  TILEXR_MULTIHOST_PEERS="0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0" \
-  TILEXR_COMM_ID=141.62.24.62:10067 \
-  bash run_collective_perf_multihost.sh /home/l00929943/TileXR/run/prof/collectives-2host \
-    /home/l00929943/TileXR/build-profile-950/tests/collectives \
+  TILEXR_MULTIHOST_REMOTE_REPO_DIR=/path/to/TileXR \
+  TILEXR_MULTIHOST_PEERS="0,user@host-a,10.0.0.1,0;1,user@host-b,10.0.0.2,0" \
+  bash run_collective_perf_multihost.sh /path/to/TileXR/run/prof/collectives-2host \
+    /path/to/TileXR/build-profile-950/tests/collectives \
     --op allgather --min-bytes 4096 --max-bytes 4096 --iters 2 --warmup-iters 1 \
     --datatype int32 --check 0 --profile 1 --profile-sample-every 1 --profile-ai-prompt 1
 
 Each peer entry is rank,ssh_target,host_ip,device_id. The first entry is used as rank0/server.
+TILEXR_COMM_ID defaults to rank0 host_ip with port 10067.
 EOF
 }
 
@@ -29,10 +31,10 @@ bin_dir="${2:?bin_dir required}"
 shift 2
 
 peers_spec="${TILEXR_MULTIHOST_PEERS:?TILEXR_MULTIHOST_PEERS is required}"
-comm_id="${TILEXR_COMM_ID:-141.62.24.62:10067}"
 timeout_sec="${TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC:-600}"
 script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 helper="${script_dir}/tilexr_collective_profile_report.py"
+repo_dir="${TILEXR_MULTIHOST_REMOTE_REPO_DIR:-$(cd "${script_dir}/../.." && pwd)}"
 rank_size=0
 warmup_iters=5
 measured_iters=20
@@ -94,6 +96,10 @@ copy_rank_profile() {
   local dest="${profile_dir}/rank${rank}"
   mkdir -p "${dest}"
 
+  if ! ssh -o BatchMode=yes "${target}" "test -d '${profile_dir}/rank${rank}'"; then
+    return
+  fi
+
   if command -v rsync >/dev/null 2>&1 &&
      ssh -o BatchMode=yes "${target}" "command -v rsync >/dev/null 2>&1"; then
     rsync -a -e "ssh -o BatchMode=yes" "${target}:${profile_dir}/rank${rank}/" "${dest}/"
@@ -114,8 +120,10 @@ launch_rank() {
   local target="$2"
   local host_ip="$3"
   local device_id="$4"
+  local repo_dir="$5"
+  local clock_offset_ns="$6"
   local host_label="${target#*@}"
-  shift 4
+  shift 6
   local log="${profile_dir}/multihost_rank${rank}.log"
   logs+=("${log}")
   targets+=("${target}")
@@ -125,7 +133,7 @@ launch_rank() {
   fi
 
   ssh -o BatchMode=yes "${target}" bash -s -- \
-    "${rank_size}" "${rank}" "${device_id}" "${comm_id}" "${profile_dir}" "${bin_dir}" "${host_ip}" "${host_label}" "$@" >"${log}" 2>&1 <<'REMOTE' &
+    "${rank_size}" "${rank}" "${device_id}" "${comm_id}" "${profile_dir}" "${bin_dir}" "${host_ip}" "${host_label}" "${repo_dir}" "${clock_offset_ns}" "$@" >"${log}" 2>&1 <<'REMOTE' &
 set -euo pipefail
 rank_size="$1"
 rank="$2"
@@ -135,21 +143,26 @@ profile_dir="$5"
 bin_dir="$6"
 host_ip="$7"
 host_label="$8"
-shift 8
+repo_dir="$9"
+clock_offset_ns="${10}"
+shift 10
 
-cd /home/l00929943/TileXR
+cd "${repo_dir}"
+build_dir="$(cd "${bin_dir}/../.." && pwd)"
 set +u
 source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true
 conda activate pt311 2>/dev/null || true
 source /usr/local/Ascend/cann/set_env.sh
 set -u
-export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-}
+export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:${build_dir}/src/collectives:${build_dir}/src/comm:${LD_LIBRARY_PATH:-}
 export ASCEND_PROCESS_LOG_PATH="${profile_dir}/plog/rank${rank}"
 export ASCEND_GLOBAL_LOG_LEVEL="${ASCEND_GLOBAL_LOG_LEVEL:-3}"
 mkdir -p "${ASCEND_PROCESS_LOG_PATH}"
 export TILEXR_COMM_ID="${comm_id}"
 export TILEXR_PROFILE_HOST="${host_label}"
 export TILEXR_PROFILE_HOST_IP="${host_ip}"
+export TILEXR_PROFILE_CLOCK_OFFSET_NS="${clock_offset_ns}"
+export TILEXR_PROFILE_CLOCK_SYNC_REFERENCE="${comm_id%%:*}"
 
 "${bin_dir}/tilexr_collective_perf" \
   --rank-size "${rank_size}" \
@@ -162,13 +175,49 @@ REMOTE
   ssh_pids+=("$!")
 }
 
+remote_epoch_ns() {
+  local target="$1"
+  ssh -o BatchMode=yes "${target}" "date +%s%N"
+}
+
+reference_target=""
+reference_epoch_ns=""
+rank0_host_ip=""
 for peer in "${peers[@]}"; do
   IFS=',' read -r rank target host_ip device_id <<< "${peer}"
   if [[ -z "${rank:-}" || -z "${target:-}" || -z "${host_ip:-}" || -z "${device_id:-}" ]]; then
     echo "ERROR: invalid peer entry '${peer}', expected rank,target,ip,device" >&2
     exit 1
   fi
-  launch_rank "${rank}" "${target}" "${host_ip}" "${device_id}" "$@"
+  if [[ "${rank}" == "0" ]]; then
+    reference_target="${target}"
+    rank0_host_ip="${host_ip}"
+    reference_epoch_ns="$(remote_epoch_ns "${target}")"
+    break
+  fi
+done
+if [[ -z "${reference_target}" || -z "${reference_epoch_ns}" ]]; then
+  echo "ERROR: TILEXR_MULTIHOST_PEERS must contain rank 0" >&2
+  exit 1
+fi
+comm_id="${TILEXR_COMM_ID:-${rank0_host_ip}:10067}"
+
+for peer in "${peers[@]}"; do
+  IFS=',' read -r rank target host_ip device_id <<< "${peer}"
+  if [[ -z "${rank:-}" || -z "${target:-}" || -z "${host_ip:-}" || -z "${device_id:-}" ]]; then
+    echo "ERROR: invalid peer entry '${peer}', expected rank,target,ip,device" >&2
+    exit 1
+  fi
+  if [[ "${rank}" == "0" ]]; then
+    clock_offset_ns=0
+  else
+    reference_before_ns="$(remote_epoch_ns "${reference_target}")"
+    rank_epoch_ns="$(remote_epoch_ns "${target}")"
+    reference_after_ns="$(remote_epoch_ns "${reference_target}")"
+    reference_midpoint_ns="$(((reference_before_ns + reference_after_ns) / 2))"
+    clock_offset_ns="$((rank_epoch_ns - reference_midpoint_ns))"
+  fi
+  launch_rank "${rank}" "${target}" "${host_ip}" "${device_id}" "${repo_dir}" "${clock_offset_ns}" "$@"
 done
 
 sleep "${timeout_sec}" >/dev/null 2>&1 &
diff --git a/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp b/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp
index ff16b93..6ec5d51 100644
--- a/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp
+++ b/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp
@@ -1161,6 +1161,18 @@ std::string EnvOrEmpty(const char *name)
     return value == nullptr ? std::string() : std::string(value);
 }
 
+std::string HostEpochNs()
+{
+    struct timespec ts {};
+    if (clock_gettime(CLOCK_REALTIME, &ts) != 0) {
+        return "";
+    }
+    std::ostringstream out;
+    out << static_cast<long long>(ts.tv_sec) << std::setw(9) << std::setfill('0')
+        << static_cast<long long>(ts.tv_nsec);
+    return out.str();
+}
+
 std::string DefaultHostName()
 {
     char hostname[256] = {};
@@ -1189,6 +1201,9 @@ bool WriteProfileHostInfo(const Options &options)
     if (hostIp.empty()) {
         hostIp = EnvOrEmpty("TILEXR_NODE_IP");
     }
+    const std::string clockOffsetNs = EnvOrEmpty("TILEXR_PROFILE_CLOCK_OFFSET_NS");
+    const std::string clockReference = EnvOrEmpty("TILEXR_PROFILE_CLOCK_SYNC_REFERENCE");
+    const std::string epochNs = HostEpochNs();
 
     const std::string path = JoinPath(rankDir, "host_info.json");
     std::ofstream out(path.c_str());
@@ -1202,7 +1217,10 @@ bool WriteProfileHostInfo(const Options &options)
         << "  \"rank_size\": " << options.rankSize << ",\n"
         << "  \"host\": \"" << JsonEscape(host) << "\",\n"
         << "  \"ip\": \"" << JsonEscape(hostIp) << "\",\n"
-        << "  \"comm_mode\": \"" << CommModeName(options.commMode) << "\"\n"
+        << "  \"comm_mode\": \"" << CommModeName(options.commMode) << "\",\n"
+        << "  \"clock_offset_ns\": \"" << JsonEscape(clockOffsetNs) << "\",\n"
+        << "  \"clock_reference\": \"" << JsonEscape(clockReference) << "\",\n"
+        << "  \"epoch_ns\": \"" << JsonEscape(epochNs) << "\"\n"
         << "}\n";
     return true;
 }
diff --git a/tests/collectives/tilexr_collective_profile_report.py b/tests/collectives/tilexr_collective_profile_report.py
index 2ca1c05..2fe2620 100755
--- a/tests/collectives/tilexr_collective_profile_report.py
+++ b/tests/collectives/tilexr_collective_profile_report.py
@@ -64,6 +64,9 @@ def load_host_infos(root, diagnostics):
             "host": str(info.get("host") or f"rank{rank}"),
             "ip": str(info.get("ip") or ""),
             "comm_mode": str(info.get("comm_mode") or ""),
+            "clock_offset_ns": str(info.get("clock_offset_ns") or ""),
+            "clock_reference": str(info.get("clock_reference") or ""),
+            "epoch_ns": str(info.get("epoch_ns") or ""),
             "source": source,
         }
     return hosts
@@ -762,9 +765,22 @@ def render_html(index):
     data = json_for_script(index)
     summary_items = []
     rank_summary_rows = []
+    clock_rows = []
     trace_status_rows = []
     fallback_rows = []
 
+    for rank, host_info in sorted(index.get("hosts", {}).items()):
+        clock_rows.append(
+            "<tr>"
+            f"<td>rank{rank}</td>"
+            f"<td>{html.escape(str(host_info.get('host', '')))}</td>"
+            f"<td>{html.escape(str(host_info.get('ip', '')))}</td>"
+            f"<td>{html.escape(str(host_info.get('clock_offset_ns', '')))}</td>"
+            f"<td>{html.escape(str(host_info.get('clock_reference', '')))}</td>"
+            f"<td>{html.escape(str(host_info.get('epoch_ns', '')))}</td>"
+            "</tr>"
+        )
+
     for group in index["groups"]:
         summary = group["summary"]
         slowest = summary["slowest_launch"]
@@ -833,6 +849,7 @@ def render_html(index):
 
     diagnostics = "".join(f"<li>{html.escape(item)}</li>" for item in index["diagnostics"])
     summary_html = "".join(summary_items) + diagnostics
+    clock_html = "".join(clock_rows) or "<tr><td colspan=\"6\">No host clock metadata available.</td></tr>"
     rank_summary_html = "".join(rank_summary_rows) or "<tr><td colspan=\"9\">No rank kernel totals available.</td></tr>"
     trace_status_html = "".join(trace_status_rows) or "<tr><td colspan=\"7\">No incomplete or synthetic traces.</td></tr>"
     fallback_html = "".join(fallback_rows) or "<tr><td colspan=\"8\">No trace bars available.</td></tr>"
@@ -867,6 +884,11 @@ def render_html(index):
 <ul>{summary_html}</ul>
 </section>
 <section class="panel">
+<h2>Clock Sync</h2>
+<p>Offsets are sampled by the multi-host runner before launch. Positive offset means that rank host clock is ahead of the rank0 reference clock.</p>
+<table><thead><tr><th>Rank</th><th>Host</th><th>Host IP</th><th>Offset ns vs rank0</th><th>Reference</th><th>Rank epoch ns</th></tr></thead><tbody>{clock_html}</tbody></table>
+</section>
+<section class="panel">
 <h2>Rank-Level Summary</h2>
 <p>Kernel totals are grouped by rank using the slowest core per rank/launch. This is the fastest view for spotting a slow rank before drilling into core-level stages.</p>
 <table><thead><tr><th>Op</th><th>Bytes</th><th>Rank@Host</th><th>Host IP</th><th>Launches</th><th>Avg kernel us</th><th>Max kernel us</th><th>Slowest launch</th><th>Drilldown</th></tr></thead><tbody>{rank_summary_html}</tbody></table>
diff --git a/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp b/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp
index 76f0f3f..b6d8241 100644
--- a/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp
+++ b/tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp
@@ -141,6 +141,8 @@ void TestCollectivesOwnsCceBuild()
     CheckContains(collectivesCmakePath, collectivesCmake, "add_subdirectory(kernels)");
     CheckContains(collectivesCmakePath, collectivesCmake, "tilexr_collectives_kernel_embed");
     CheckContains(collectivesCmakePath, collectivesCmake, "tilexr_collectives_op");
+    CheckContains(collectivesCmakePath, collectivesCmake, "OBJECT_DEPENDS");
+    CheckContains(collectivesCmakePath, collectivesCmake, "${TILEXR_COLLECTIVES_OP}");
 
     const std::string kernelsCmakePath = "src/collectives/kernels/CMakeLists.txt";
     const auto kernelsCmake = ReadFile(kernelsCmakePath);
@@ -311,7 +313,7 @@ void TestBigDataAllGatherPerfStages()
 {
     const std::string path = "src/collectives/kernels/kernels/lcal_allgather_big_data.cce";
     const auto text = ReadFile(path);
-    CheckContains(path, text, "PerfStageId::KERNEL_TOTAL");
+    CheckDoesNotContain(path, text, "PerfStageId::KERNEL_TOTAL");
     CheckContains(path, text, "PerfStageId::CHUNK_TOTAL");
     CheckContains(path, text, "PerfStageId::POST_SYNC");
     CheckContains(path, text, "PerfStageId::LOCAL_INPUT_TO_IPC");
@@ -329,7 +331,7 @@ void TestTwoNpuBigDataAllGatherPerfStages()
     const std::string path = "src/collectives/kernels/kernels/lcal_allgather_2npu_big_data_write.cce";
     const auto text = ReadFile(path);
     CheckContains(path, text, "GM_ADDR perfTrace");
-    CheckContains(path, text, "PerfStageId::KERNEL_TOTAL");
+    CheckDoesNotContain(path, text, "PerfStageId::KERNEL_TOTAL");
     CheckContains(path, text, "PerfStageId::CHUNK_TOTAL");
     CheckContains(path, text, "PerfStageId::POST_SYNC");
     CheckContains(path, text, "PerfStageId::LOCAL_INPUT_TO_IPC");
@@ -342,6 +344,28 @@ void TestTwoNpuBigDataAllGatherPerfStages()
     CheckContains(path, text, "TileXRPerfTraceEnabled");
 }
 
+void TestStandaloneCollectiveWrappersHaveCoarsePerfTrace()
+{
+    const std::string path = "src/collectives/kernels/lccl_op.h";
+    const auto text = ReadFile(path);
+    CheckContains(path, text, "TileXRCoarsePerfToken");
+    CheckContains(path, text, "TILEXR_COARSE_PERF_BEGIN");
+    CheckContains(path, text, "TILEXR_COARSE_PERF_END");
+
+    const char *wrappers[] = {
+        "LCCL_ALLGATHER_FUNC_AUTO_DEF",
+        "LCCL_BROADCAST_FUNC_AUTO_DEF",
+        "LCCL_ALL_REDUCE_FUNC_AUTO_DEF",
+        "LCCL_ALL2ALL_FUNC_AUTO_DEF",
+        "LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF",
+    };
+    for (const char *wrapper : wrappers) {
+        const auto body = ExtractInitializer(path, text, wrapper);
+        CheckContains(std::string(path) + " " + wrapper, body, "TILEXR_COARSE_PERF_BEGIN");
+        CheckContains(std::string(path) + " " + wrapper, body, "TILEXR_COARSE_PERF_END");
+    }
+}
+
 void TestCommDoesNotOwnCollectiveRuntime()
 {
     const auto commFiles = CollectFiles("src/comm");
@@ -380,6 +404,7 @@ int main()
     TestDeviceKernelArgsMatchHostLaunchAbi();
     TestBigDataAllGatherPerfStages();
     TestTwoNpuBigDataAllGatherPerfStages();
+    TestStandaloneCollectiveWrappersHaveCoarsePerfTrace();
     TestCommDoesNotOwnCollectiveRuntime();
     return g_failures == 0 ? 0 : 1;
 }
diff --git a/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp b/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp
index f5b5962..9fdec60 100644
--- a/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp
+++ b/tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp
@@ -126,6 +126,10 @@ void TestPerfToolSource()
     CheckContains(path, text, "TileXRCommInitRank(");
     CheckContains(path, text, "TileXRGetUniqueId(");
     CheckContains(path, text, "WriteProfileHostInfo");
+    CheckContains(path, text, "TILEXR_PROFILE_CLOCK_OFFSET_NS");
+    CheckContains(path, text, "clock_offset_ns");
+    CheckContains(path, text, "clock_reference");
+    CheckContains(path, text, "epoch_ns");
     CheckContains(path, text, "--op allgather|alltoall|allreduce|reducescatter|broadcast|profile-probe");
     CheckContains(path, text, "CollectiveOp::NOOP");
     CheckContains(path, text, "CollectiveOp::PROFILE_PROBE");
@@ -295,17 +299,38 @@ void TestLauncherScripts()
     CheckContains(multiHostPerfPath, multiHostPerf, "--comm-mode socket");
     CheckContains(multiHostPerfPath, multiHostPerf, "tilexr_collective_profile_report.py");
     CheckContains(multiHostPerfPath, multiHostPerf, "copy_rank_profile");
+    CheckContains(multiHostPerfPath, multiHostPerf, "test -d '${profile_dir}/rank${rank}'");
+    CheckContains(multiHostPerfPath, multiHostPerf, "remote_epoch_ns");
+    CheckContains(multiHostPerfPath, multiHostPerf, "TILEXR_MULTIHOST_REMOTE_REPO_DIR");
+    CheckContains(multiHostPerfPath, multiHostPerf, "rank0_host_ip");
+    CheckContains(multiHostPerfPath, multiHostPerf, "comm_id=\"${TILEXR_COMM_ID:-${rank0_host_ip}:10067}\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "TILEXR_PROFILE_CLOCK_OFFSET_NS");
+    CheckContains(multiHostPerfPath, multiHostPerf, "TILEXR_PROFILE_CLOCK_SYNC_REFERENCE");
     CheckContains(multiHostPerfPath, multiHostPerf, "command -v rsync");
     CheckContains(multiHostPerfPath, multiHostPerf, "tar -cf - .");
     CheckContains(multiHostPerfPath, multiHostPerf, "tar -xf - -C");
     CheckContains(multiHostPerfPath, multiHostPerf, "rank${rank}");
     CheckContains(multiHostPerfPath, multiHostPerf, "host_label=\"${target#*@}\"");
-    CheckContains(multiHostPerfPath, multiHostPerf, "shift 4");
+    CheckContains(multiHostPerfPath, multiHostPerf, "local repo_dir=\"$5\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "local clock_offset_ns=\"$6\"");
+    CheckContains(multiHostPerfPath, multiHostPerf,
+        "\"${rank_size}\" \"${rank}\" \"${device_id}\" \"${comm_id}\" \"${profile_dir}\" \"${bin_dir}\" \"${host_ip}\" \"${host_label}\" \"${repo_dir}\" \"${clock_offset_ns}\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "repo_dir=\"$9\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "clock_offset_ns=\"${10}\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "cd \"${repo_dir}\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "build_dir=\"$(cd \"${bin_dir}/../..\" && pwd)\"");
+    CheckContains(multiHostPerfPath, multiHostPerf, "${build_dir}/src/collectives");
+    CheckContains(multiHostPerfPath, multiHostPerf, "clock_offset_ns=0");
+    CheckContains(multiHostPerfPath, multiHostPerf, "reference_midpoint_ns");
+    CheckContains(multiHostPerfPath, multiHostPerf, "shift 6");
+    CheckContains(multiHostPerfPath, multiHostPerf, "shift 10");
+    CheckDoesNotContain(multiHostPerfPath, multiHostPerf, "cd /home/l00929943/TileXR");
+    CheckDoesNotContain(multiHostPerfPath, multiHostPerf, "comm_id=\"${TILEXR_COMM_ID:-141.62.24.62:10067}\"");
     CheckContains(multiHostPerfPath, multiHostPerf, "set +u");
     CheckContains(multiHostPerfPath, multiHostPerf, "ASCEND_PROCESS_LOG_PATH");
     CheckContains(multiHostPerfPath, multiHostPerf, "plog/rank${rank}");
     CheckContains(multiHostPerfPath, multiHostPerf,
-        "LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives");
+        "LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:${build_dir}/src/collectives:${build_dir}/src/comm");
 }
 
 void TestCMakeWiring()
@@ -393,6 +418,8 @@ void TestProfileReportHelperDocumentsSlowRankAndPerfettoEvents()
     CheckContains(path, text, "launch_offset_us");
     CheckContains(path, text, "normalized_ts");
     CheckContains(path, text, "host_info.json");
+    CheckContains(path, text, "Clock Sync");
+    CheckContains(path, text, "clock_offset_ns");
     CheckContains(path, text, "rank_label(index.get(\"hosts\", {}), bar[\"rank\"])");
     CheckContains(path, text, "launch{bar['launch_id']}/{bar_rank_label}/{bar['stage']}");
 }