From 377ee554068bce758cff24512363800132d507c5 Mon Sep 17 00:00:00 2001 From: liuhaoyu35 Date: Mon, 29 Jun 2026 15:47:46 +0800 Subject: [PATCH 1/2] feat: add multi-host collective profiling --- .gitignore | 3 + docs/multi-host-profiling-report.md | 282 ++++++++++++ docs/multihost-profiling-usage.md | 253 +++++++++++ docs/single-node-profiling-report.md | 148 +++++++ src/collectives/host/collective_kernel.cpp | 40 +- src/collectives/host/collective_utils.cpp | 9 + src/collectives/host/collective_utils.h | 2 + src/collectives/host/perf_trace_report.cpp | 45 +- src/collectives/host/perf_trace_report.h | 2 + src/collectives/host/perf_trace_session.cpp | 37 +- src/collectives/host/tilexr_collectives.cpp | 24 + src/collectives/kernels/CMakeLists.txt | 11 +- src/collectives/kernels/collectives.h | 2 +- src/collectives/kernels/datacopy_gm2gm.h | 47 +- .../kernels/kernels/collectives.cce | 8 +- .../kernels/kernels/lcal_profile_probe.cce | 110 +++++ src/collectives/kernels/lccl_op.h | 12 +- src/collectives/kernels/perf_trace_kernel.h | 96 +++- src/collectives/kernels/tilexr_lccl_op.cpp | 3 +- src/comm/tilexr_internal.cpp | 4 +- src/include/tilexr_collectives.h | 3 + src/include/tilexr_collectives_perf.h | 1 + src/include/tilexr_perf_trace.h | 5 +- src/include/tilexr_types.h | 2 + tests/collectives/CMakeLists.txt | 23 + tests/collectives/README.md | 41 +- .../run_collective_perf_multihost.sh | 217 ++++++++++ .../tilexr-tests/tilexr_collective_perf.cpp | 330 +++++++++++++- .../tilexr_collective_profile_report.py | 409 +++++++++++++++++- .../unit/test_collective_perf_report.cpp | 14 +- .../unit/test_collective_perf_session.cpp | 13 + .../unit/test_collective_profile_report.py | 349 ++++++++++++++- .../unit/test_prepare_host_launch_context.cpp | 56 +++ .../unit/test_tilexr_collectives_api.cpp | 47 ++ ...st_tilexr_collectives_kernel_ownership.cpp | 36 ++ .../test_tilexr_collectives_tools_sources.cpp | 61 ++- .../unit/test_tilexr_perf_trace_layout.cpp | 9 +- tests/comm/unit/test_tilexr_source_guards.cpp | 11 + 38 files changed, 2656 insertions(+), 109 deletions(-) create mode 100644 docs/multi-host-profiling-report.md create mode 100644 docs/multihost-profiling-usage.md create mode 100644 docs/single-node-profiling-report.md create mode 100644 src/collectives/kernels/kernels/lcal_profile_probe.cce create mode 100644 tests/collectives/run_collective_perf_multihost.sh diff --git a/.gitignore b/.gitignore index 1c01658..427b557 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,9 @@ MANIFEST.in .claude/scheduled_tasks.json .claude/settings.local.json +# local transfer bundles for remote Ascend profiling/debug sync +/.tilexr_* + # custom .worktrees/ op-simulator/build diff --git a/docs/multi-host-profiling-report.md b/docs/multi-host-profiling-report.md new file mode 100644 index 0000000..9e162d6 --- /dev/null +++ b/docs/multi-host-profiling-report.md @@ -0,0 +1,282 @@ +# TileXR 多机性能分析模块改动整理 + +## 背景与目标 + +本轮多机性能分析工作以 `C:\TileXR` 的 `lhy-single-profiling` 分支为基本盘,目标是在 62 和 70 两台机器上跑通 2 机 kernel-level profiling,并输出统一的 HTML 报告和 `perfetto_trace.json`。 + +当前已验证链路: + +- 62:`root@141.62.24.62`,远端路径 `/home/l00929943/TileXR`。 +- 70:`root@141.62.24.70`,远端路径 `/home/l00929943/TileXR`。 +- 算子:`profile-probe`。 +- 输出目录:`/home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70-kernel-align`。 +- 本地报告:`C:\TileXR\run\prof\collectives-2host-profile-probe-62-70-kernel-align\report.html`。 +- 本地 Perfetto:`C:\TileXR\run\prof\collectives-2host-profile-probe-62-70-kernel-align\perfetto_trace.json`。 + +注意:`profile-probe` 是多机 profiling/report smoke 算子。它会初始化 ACL 和 socket communicator,并在每个 rank 上执行真实 AIV kernel,记录与单机 profiling 一致的 7 个 kernel stage;但它只做本地 device memory copy,不触碰跨机 `peerMems[]`,不能当作 allgather 跨机带宽结果。 + +## 必须保留的改动 + +### 1. `profile-probe` 算子闭包 + +这些改动是多机 profiling 能稳定跑通的核心,提交时必须保留: + +- `src/include/tilexr_types.h` + - 新增 `TileXRType::PROFILE_PROBE = 11`。 + - 新增 `TileXRProfileProbe` 类型名映射。 +- `src/include/tilexr_collectives.h` + - 新增 public API `TileXRProfileProbe(...)`。 +- `src/collectives/host/tilexr_collectives.cpp` + - 新增 host 侧 `TileXRProfileProbe` 实现。 + - 使用 `GetProfileProbeBlockNum` 固定 probe block 数。 + - 以 `INT8` kernel signature launch,避免 profile probe 与 dtype 组合膨胀。 +- `src/collectives/host/collective_utils.{h,cpp}` + - 新增 `GetProfileProbeBlockNum`。 +- `src/collectives/host/collective_kernel.cpp` + - 注册 `PROFILE_PROBE` kernel。 + - 对 `PROFILE_PROBE` 使用独立 funsig 和 kernel name。 +- `src/collectives/kernels/lccl_op.h` + - include `kernels/lcal_profile_probe.cce`。 + - 新增 `LCCL_PROFILE_PROBE_FUNC_AUTO_DEF()`。 +- `src/collectives/kernels/tilexr_lccl_op.cpp` + - 注册 `TileXRProfileProbe` kernel TU。 +- `src/collectives/kernels/kernels/lcal_profile_probe.cce` + - 新增真实 AIV probe kernel。 + - 输出和单机一致的 7 个 stage:`kernel_total`、`chunk_total`、`post_sync`、`local_input_to_ipc`、`flag_poll_wait`、`peer_ipc_to_output`、`chunk_barrier`。 + - 只做本地 GM -> UB -> GM copy,不访问 `peerMems[]`。 +- `src/collectives/kernels/CMakeLists.txt` + - `TILEXR_COLLECTIVES_1OP_BIN_SIZE` 从 `5242880` 增大到 `10485760`,否则新增 CCE binary 后可能被截断。 + +### 2. kernel trace 数据面修复 + +这些改动保证多机和单机报告的 kernel 粒度一致,也是必须保留的: + +- `src/include/tilexr_perf_trace.h` + - 新增 `TILEXR_PERF_TRACE_STATS_OFFSET = 128`。 + - `TileXRPerfCoreStageStats` 扩展到 96 bytes,并保持 32-byte 对齐。 + - 新增 `aux2/aux3`,防止 ABI/layout 与 kernel 侧不一致。 +- `src/collectives/kernels/perf_trace_kernel.h` + - 使用固定 GM stats offset,避免 kernel 侧读取 header 字段导致不稳定。 + - 通过 UB 临时 buffer 做 stats slot 的 GM <-> UB 更新。 + - 使用 `GetBlockNum()` 和固定 stage count 计算 slot。 + - 保留无 profiling 编译路径的 no-op overload。 +- `src/collectives/host/perf_trace_session.cpp` + - host 侧 header 使用同一个 `TILEXR_PERF_TRACE_STATS_OFFSET`。 + - 支持 `TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING=1`,用于只保留 host metadata 的调试对照。 + - 新增 incomplete report 写出入口。 +- `src/include/tilexr_collectives_perf.h` + - 对外声明 `TileXRCollectivePerfWriteIncompleteReport(...)`,让测试工具在 kernel launch 失败时仍能写出可聚合的 trace metadata。 +- `src/collectives/host/perf_trace_report.{h,cpp}` + - 只保留 `count != 0` 的 stats,避免 count=0 的脏 slot 进入报告。 + - `trace.json` 写出 `aux2/aux3`。 + - 支持 incomplete trace 的 `incomplete` / `incomplete_reason`。 + +### 3. 多机运行入口与测试工具 + +这些改动让 62/70 可以一键启动 2 机 profiling,必须保留: + +- `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp` + - 新增 `--comm-mode local|socket`。 + - 新增 `--device-id`。 + - 新增 `--op profile-probe`。 + - socket 模式下使用 `TileXRCommInitRank`。 + - 写出 `rank/host_info.json`,包含 host/IP/comm_mode。 + - 在 measured launch 失败时写 incomplete trace,便于聚合报告诊断。 + - `profile-probe` 支持 `--check 1`,用于确认 profiling 没有导致执行错误。 +- `tests/collectives/run_collective_perf_multihost.sh` + - 读取 `TILEXR_MULTIHOST_PEERS`,按 rank 通过 SSH 启动远端进程。 + - 设置 `--comm-mode socket`、`TILEXR_COMM_ID`、`ASCEND_PROCESS_LOG_PATH`。 + - 优先用当前构建产物的 `LD_LIBRARY_PATH`,并将 driver lib 放前面。 + - 远端 rank 完成后回收 `rank/` profiling 目录。 + - 调用 `tilexr_collective_profile_report.py` 生成聚合报告。 +- `tests/collectives/CMakeLists.txt` + - 安装 `run_collective_perf_multihost.sh`。 + - 增加 rpath-link,避免测试工具链接时误用 CANN stub/缺失 driver 依赖。 +- `tests/collectives/README.md` + - 补充 multi-host profiling 使用说明和限制。 + +### 4. 聚合报告和 Perfetto 输出 + +这些改动让多机信息真正呈现在 HTML 和 Perfetto 中,必须保留: + +- `tests/collectives/tilexr_collective_profile_report.py` + - 聚合 `rank*/host_info.json`。 + - HTML 显示 `rank@host` 和 host IP。 + - `Rank-Level Summary` 用 `kernel_total` 汇总每个 rank 的 avg/max kernel us。 + - `Trace Status` 显示 incomplete/synthetic trace。 + - `perfetto_trace.json` 使用 host/rank/stage 命名,例如 `launch0/rank1@141.62.24.70/kernel_total`。 + - 每个 rank 增加 `launch_windows` thread,便于 `ui.perfetto.dev` 中按 launch 对齐查看。 + - 保留诊断:missing trace、incomplete trace、group incompatible 等。 + +### 5. 回归测试与 source guard + +这些测试是提交多机 profiling 时保护行为不退化的必要部分,建议全部保留: + +- `tests/collectives/unit/test_collective_profile_report.py` + - 覆盖 host metadata、Perfetto、incomplete trace、count=0 过滤。 + - 新增 `test_multihost_report_preserves_single_host_kernel_stage_granularity`,确保多机报告保留单机同款 7 个 kernel stage。 +- `tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp` + - 确认 profile probe CCE 文件、注册宏、kernel 注册和 stats layout 约束。 +- `tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp` + - 确认 CLI、multi-host script、README、Perfetto host label 等关键字符串存在。 +- `tests/collectives/unit/test_tilexr_perf_trace_layout.cpp` + - 锁定 trace header/stats ABI 和 32-byte 对齐。 +- `tests/collectives/unit/test_prepare_host_launch_context.cpp` + - 覆盖 stats offset 和 disable kernel profiling 环境变量。 +- `tests/collectives/unit/test_collective_perf_session.cpp` + - 覆盖 incomplete report 写出。 +- `tests/collectives/unit/test_collective_perf_report.cpp` + - 覆盖 count=0 stats 过滤和 aux 字段写出。 + +## 提交前 staging 清单 + +如果后续要提交“多机性能分析模块跑通”这一组改动,建议至少包含下面这些文件。这里按 git 状态拆分,方便提交前直接核对。 + +新增文件必须保留: + +- `docs/multi-host-profiling-report.md` +- `src/collectives/kernels/kernels/lcal_profile_probe.cce` +- `tests/collectives/run_collective_perf_multihost.sh` + +已修改文件必须保留: + +- `src/collectives/host/collective_kernel.cpp` +- `src/collectives/host/collective_utils.cpp` +- `src/collectives/host/collective_utils.h` +- `src/collectives/host/perf_trace_report.cpp` +- `src/collectives/host/perf_trace_report.h` +- `src/collectives/host/perf_trace_session.cpp` +- `src/collectives/host/tilexr_collectives.cpp` +- `src/collectives/kernels/CMakeLists.txt` +- `src/collectives/kernels/lccl_op.h` +- `src/collectives/kernels/perf_trace_kernel.h` +- `src/collectives/kernels/tilexr_lccl_op.cpp` +- `src/include/tilexr_collectives.h` +- `src/include/tilexr_collectives_perf.h` +- `src/include/tilexr_perf_trace.h` +- `src/include/tilexr_types.h` +- `tests/collectives/CMakeLists.txt` +- `tests/collectives/README.md` +- `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp` +- `tests/collectives/tilexr_collective_profile_report.py` +- `tests/collectives/unit/test_collective_perf_report.cpp` +- `tests/collectives/unit/test_collective_perf_session.cpp` +- `tests/collectives/unit/test_collective_profile_report.py` +- `tests/collectives/unit/test_prepare_host_launch_context.cpp` +- `tests/collectives/unit/test_tilexr_collectives_kernel_ownership.cpp` +- `tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp` +- `tests/collectives/unit/test_tilexr_perf_trace_layout.cpp` + +这组文件形成闭环:算子枚举/API、host launch、device kernel、trace ABI、报告聚合、multi-host 启动脚本、README 和回归测试都必须同时存在。只提交其中一部分,容易出现编译可过但远端跑不通,或者能跑但 HTML/Perfetto 缺少 rank/kernel 粒度信息。 + +## 不应提交的内容 + +以下文件是同步/调试临时产物,不属于代码改动,应在提交前删除或保持 untracked: + +- `.tilexr_profile_sync.tar.gz` +- `.tilexr_profile_sync.tar.gz.b64` +- `.tilexr_profile_probe_sync.tar.gz` +- `.tilexr_profile_probe_sync.tar.gz.b64` +- `.tilexr_profile_probe_sync.ascii.b64` +- `.tilexr_profile_probe_fix2.tar.gz` +- `.tilexr_profile_probe_fix2.ascii.b64` +- `.tilexr_profile_probe_fix3.tar.gz` +- `.tilexr_profile_probe_fix3.ascii.b64` + +运行报告目录也不建议进代码提交,除非后续明确要提交样例报告: + +- `run/prof/collectives-2host-profile-probe-62-70*` +- `run/prof/collectives-2host-profile-probe-ab-profile*` + +## 已验证命令 + +本地报告聚合测试: + +```bash +cd C:\TileXR +python -m unittest tests.collectives.unit.test_collective_profile_report -v +``` + +62/70 远端构建: + +```bash +cd /home/l00929943/TileXR +source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true +conda activate pt311 2>/dev/null || true +source /usr/local/Ascend/cann/set_env.sh +cmake --build build-profile-950 --target \ + test_tilexr_collectives_kernel_ownership \ + test_tilexr_collectives_tools_sources \ + tilexr_collective_perf -j8 +``` + +62/70 远端 source guard: + +```bash +cd /home/l00929943/TileXR/build-profile-950/tests/collectives +./test_tilexr_collectives_kernel_ownership +./test_tilexr_collectives_tools_sources +``` + +2 机 profiling 运行: + +```bash +cd /home/l00929943/TileXR +source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true +conda activate pt311 2>/dev/null || true +source /usr/local/Ascend/cann/set_env.sh +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-} +export TILEXR_MULTIHOST_PEERS='0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0' +export TILEXR_COMM_ID='141.62.24.62:10067' +export TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC=300 +bash tests/collectives/run_collective_perf_multihost.sh \ + /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70-kernel-align \ + /home/l00929943/TileXR/build-profile-950/tests/collectives \ + --op profile-probe --min-bytes 4096 --max-bytes 4096 \ + --iters 2 --warmup-iters 0 --datatype int32 --check 1 \ + --profile 1 --profile-sample-every 1 --profile-ai-prompt 1 +``` + +## 当前验证结论 + +最新 2 机结果: + +- `diagnostics=[]`。 +- `op_name=TileXRProfileProbe`。 +- `rank_size=2`。 +- `launch_ids=[0,1]`。 +- `bars=112`。 +- 每个 rank/launch 都包含完整 7 个 kernel stage。 +- `perfetto_trace.json` 中每个 rank/launch/stage 都有 duration event。 +- `Rank-Level Summary` 示例: + - rank0 avg kernel `93.740 us`,max `112.580 us`。 + - rank1 avg kernel `92.170 us`,max `106.740 us`。 + +profiling on/off 对照: + +- `--profile 0`:`errors=0`,host avg `9.607 us`。 +- `--profile 1`:`errors=0`,host avg `393.650 us`,trace 完整。 +- 结论:profiling 会明显放大 host 端统计耗时,但没有导致算子校验失败、trace incomplete 或 kernel stage 缺失。 + +plog 中可见 HCCP deinit/unimport ERROR 和 UDMA topology fallback WARN;这些在 profiling on/off 都存在,且 rank 日志 `errors=0`,目前判断为退出/资源回收阶段或 UDMA/HCCP 清理路径噪声,不是多机 profiling 模块引入的执行异常。 + +## 已知限制 + +- `profile-probe` 不是跨机 allgather 数据面,不报告跨机通信带宽。 +- 当前 report 将不同 rank/launch 的时间线分别归一化,不假设两台机器的 device cycle offset 同步。 +- 62 机器系统时间异常,会影响构建时间戳和 plog 文件名;必要时用 `touch` 关键源文件强制重编。 +- `run_collective_perf_multihost.sh` 当前面向 profiling run;若 `--profile 0`,rank profile 目录不会生成,脚本的回收阶段会失败。做 profiling off 对照时建议手动启动 rank 或后续单独增强脚本兼容。 + +## 提交建议 + +提交多机性能分析模块时建议包含: + +- 上述 `profile-probe` 算子闭包。 +- kernel trace 数据面修复。 +- `tilexr_collective_perf` socket/profile-probe/host_info/incomplete report 支持。 +- `run_collective_perf_multihost.sh`。 +- `tilexr_collective_profile_report.py` 的 host/rank/Perfetto/incomplete 聚合能力。 +- README 和单测/source guard。 +- 本文档 `docs/multi-host-profiling-report.md`。 + +提交前建议清理 `.tilexr_*` 临时包,并确认没有把 `run/prof` 运行产物加入 git。 diff --git a/docs/multihost-profiling-usage.md b/docs/multihost-profiling-usage.md new file mode 100644 index 0000000..8050fda --- /dev/null +++ b/docs/multihost-profiling-usage.md @@ -0,0 +1,253 @@ +# TileXR 多机性能分析模块使用说明 + +## 适用范围 + +本文档对应上交分支 `lhy-multihost-profiling`,用于在 62/70 两台 Ascend 机器上跑通 2 机 kernel-level profiling,并生成统一的 HTML 报告和 `ui.perfetto.dev` 可打开的 `perfetto_trace.json`。 + +当前推荐先使用 `profile-probe` 算子做多机 profiling 链路验证。它会初始化 ACL 和 socket communicator,并在每个 rank 上执行真实 AIV kernel,采集与单机 profiling 对齐的 kernel stage: + +- `kernel_total` +- `chunk_total` +- `post_sync` +- `local_input_to_ipc` +- `flag_poll_wait` +- `peer_ipc_to_output` +- `chunk_barrier` + +注意:`profile-probe` 是 profiling/report smoke 算子,只做本地 GM -> UB -> GM copy,不访问跨机 `peerMems[]`,不能作为跨机 allgather 带宽结果。如果需要看真实跨机 collective 数据面,需要在该链路基础上继续接入对应 collective kernel。 + +## 分支与代码 + +本地基本盘: + +```powershell +cd C:\TileXR +git switch lhy-multihost-profiling +``` + +建议提交/同步到远端的核心文件包括: + +- `src/collectives/host/*perf_trace*` +- `src/collectives/host/tilexr_collectives.cpp` +- `src/collectives/host/collective_kernel.cpp` +- `src/collectives/kernels/perf_trace_kernel.h` +- `src/collectives/kernels/kernels/lcal_profile_probe.cce` +- `src/include/tilexr_collectives*.h` +- `src/include/tilexr_perf_trace.h` +- `src/include/tilexr_types.h` +- `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp` +- `tests/collectives/tilexr_collective_profile_report.py` +- `tests/collectives/run_collective_perf_multihost.sh` +- `tests/collectives/unit/*profile*` +- `tests/collectives/README.md` +- `docs/multi-host-profiling-report.md` +- `docs/multihost-profiling-usage.md` + +不要提交根目录 `.tilexr_*` 同步包,也不要提交 `run/prof/` 运行产物。 + +## 远端环境准备 + +两台机器默认路径: + +- 62:`root@141.62.24.62:/home/l00929943/TileXR` +- 70:`root@141.62.24.70:/home/l00929943/TileXR` + +每台机器进入仓库后加载环境: + +```bash +cd /home/l00929943/TileXR +source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true +conda activate pt311 2>/dev/null || true +source /usr/local/Ascend/cann/set_env.sh +``` + +如果 62 的系统时间异常导致增量编译没有重新生成 kernel,可以在远端执行: + +```bash +touch src/collectives/kernels/kernels/lcal_profile_probe.cce +touch src/collectives/kernels/perf_trace_kernel.h +``` + +## 构建 + +在 62 和 70 两边都执行: + +```bash +cd /home/l00929943/TileXR +source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true +conda activate pt311 2>/dev/null || true +source /usr/local/Ascend/cann/set_env.sh + +cmake -S . -B build-profile-950 \ + -DTILEXR_BUILD_COLLECTIVES=ON \ + -DTILEXR_BUILD_TESTS=ON \ + -DTILEXR_COLLECTIVES_ENABLE_PROFILING=ON \ + -DBUILD_TESTING=OFF + +cmake --build build-profile-950 --target \ + test_tilexr_collectives_kernel_ownership \ + test_tilexr_collectives_tools_sources \ + tilexr_collective_perf -j8 +``` + +建议先跑 source guard: + +```bash +cd /home/l00929943/TileXR/build-profile-950/tests/collectives +./test_tilexr_collectives_kernel_ownership +./test_tilexr_collectives_tools_sources +``` + +## 2 机 profiling 运行 + +建议从 62 发起,`TILEXR_COMM_ID` 使用 62 的 IP 和一个空闲端口: + +```bash +cd /home/l00929943/TileXR +source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true +conda activate pt311 2>/dev/null || true +source /usr/local/Ascend/cann/set_env.sh + +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-} +export TILEXR_MULTIHOST_PEERS='0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0' +export TILEXR_COMM_ID='141.62.24.62:10067' +export TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC=300 + +bash tests/collectives/run_collective_perf_multihost.sh \ + /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 \ + /home/l00929943/TileXR/build-profile-950/tests/collectives \ + --op profile-probe --min-bytes 4096 --max-bytes 4096 \ + --iters 2 --warmup-iters 0 --datatype int32 --check 1 \ + --profile 1 --profile-sample-every 1 --profile-ai-prompt 1 +``` + +`TILEXR_MULTIHOST_PEERS` 格式为: + +```text +rank,ssh_target,host_ip,device_id;rank,ssh_target,host_ip,device_id +``` + +例如 `0,root@141.62.24.62,141.62.24.62,0` 表示 rank0 通过 `root@141.62.24.62` 登录,在 host IP `141.62.24.62` 上使用 device 0。 + +## 产物说明 + +运行完成后,profile 根目录会包含: + +```text +run/prof/collectives-2host-profile-probe-62-70/ + report.html + perfetto_trace.json + trace_index.json + analysis.md + ai_prompt.md + rank0/ + host_info.json + launch0/trace.json + launch0/report.html + rank1/ + host_info.json + launch0/trace.json + launch0/report.html + multihost_rank0.log + multihost_rank1.log + plog/ +``` + +重点看两个聚合产物: + +- `report.html`:本地浏览器直接打开,先看 `Rank-Level Summary` 是否有慢 rank,再看 timeline 和 per-launch drilldown。 +- `perfetto_trace.json`:上传到 `https://ui.perfetto.dev`,按 `rank@host`、`launch`、`kernel_total` 搜索定位慢 rank/慢 stage。 + +Perfetto 事件命名示例: + +```text +launch0/rank0@141.62.24.62/kernel_total +launch0/rank1@141.62.24.70/kernel_total +launch0/rank1@141.62.24.70/flag_poll_wait +``` + +每个 rank 还会有 `launch_windows` thread,用于在 Perfetto 中对齐查看每次 launch。 + +## 本地拉回查看 + +如果需要从 62 拉回到 Windows: + +```powershell +scp -r root@141.62.24.62:/home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 C:\TileXR\run\prof\ +``` + +然后打开: + +```text +C:\TileXR\run\prof\collectives-2host-profile-probe-62-70\report.html +C:\TileXR\run\prof\collectives-2host-profile-probe-62-70\perfetto_trace.json +``` + +## 重新生成聚合报告 + +如果 `rank*/launch*/trace.json` 已经存在,只想重新生成 HTML/Perfetto: + +```bash +cd /home/l00929943/TileXR +python3 tests/collectives/tilexr_collective_profile_report.py \ + /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 \ + --warmup-iters 0 \ + --iters 2 \ + --profile-sample-every 1 \ + --emit-ai-prompt +``` + +## 判断结果是否正常 + +一次正常的 `profile-probe` 结果应满足: + +- `rank_size=2`。 +- `diagnostics=[]`,或没有影响聚合的 fatal diagnostic。 +- 每个 rank/launch 都包含 7 个 kernel stage。 +- HTML 的 `Rank-Level Summary` 能看到每个 rank 的 avg/max kernel us。 +- Perfetto 中能搜索到 `rank0@.../kernel_total` 和 `rank1@.../kernel_total`。 +- `tilexr_collective_perf` 日志中 `errors=0`。 + +profiling 会明显放大 host 端 `avg(us)`,这是因为采集、同步、copy-back 和报告写出都在测量路径附近发生。判断算子是否异常时优先看 `errors=0`、trace 是否完整、kernel stage 是否齐全,以及 profiling on/off 是否都存在相同的退出阶段 plog 噪声。 + +## 常见问题 + +### SSH 能连但脚本失败 + +确认两台机器都能免密互连,并且 `TILEXR_MULTIHOST_PEERS` 里的 `ssh_target` 可以从发起机器直接登录。 + +### `aclInit` 或 `ascend_hal` 相关失败 + +确认执行过: + +```bash +source /usr/local/Ascend/cann/set_env.sh +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-} +``` + +driver lib 需要排在 CANN stub lib 前面,避免误加载 stub `libascend_hal.so`。 + +### 找 plog + +可以在 profile 目录或仓库下使用: + +```bash +find $PWD -name "plog" +``` + +本脚本会设置: + +```bash +ASCEND_PROCESS_LOG_PATH="${profile_dir}/plog/rank${rank}" +``` + +### `--profile 0` 对照 + +当前 `run_collective_perf_multihost.sh` 面向 profiling run。`--profile 0` 不会生成 `rank/` profile 目录,脚本回收阶段可能失败。需要做 profiling off 对照时,建议临时手动启动两个 rank,或后续单独增强脚本兼容 profile-off 模式。 + +## 当前限制 + +- 当前已跑通并建议上交的是多机 multi-rank kernel-level profiling 链路,不是多 ACL stream 并发性能分析工具。 +- `profile-probe` 不代表跨机 allgather 数据面性能。 +- 跨机器 device cycle 不假设同步,因此聚合报告按 rank/launch 独立归一化时间线。 +- 多机真实 collective 数据面 profiling 需要继续把同一套 trace 数据面接入跨机 collective kernel。 diff --git a/docs/single-node-profiling-report.md b/docs/single-node-profiling-report.md new file mode 100644 index 0000000..6fabef5 --- /dev/null +++ b/docs/single-node-profiling-report.md @@ -0,0 +1,148 @@ +# TileXR 单机性能分析模块改动报告 + +## 背景与目标 + +本轮单机性能分析工作基于 `lhy-single-profiling` 分支,目标是在 Ascend950 单机多卡环境上跑通 standalone collectives 的算子内性能采集,并同时保留原有 HTML 报告与新增 `ui.perfetto.dev` 可打开的 trace JSON。 + +当前已验证链路覆盖: + +- 70 环境:Ascend950PR_9599,单机 allgather profiling 跑通。 +- 62 环境:Ascend950PR_9589,补充芯片名映射后单机 allgather profiling 跑通。 +- 本地报告样例:`C:\TileXR\run\prof\collectives-950\report.html`、`C:\TileXR\run\prof\collectives-950-62\report.html`。 +- Perfetto 样例:对应目录下的 `perfetto_trace.json` 可上传到 `https://ui.perfetto.dev` 查看。 + +## 代码改动概览 + +主要改动来自提交 `f80a607 feat: enable Ascend950 single-node profiling`,涉及 collectives kernel 编译、Ascend950 kernel 兼容、profile report 聚合和 Perfetto trace 导出。 + +当前工作区还有一个额外未提交补丁,用于 62 机器上的 `Ascend950PR_9589` 识别: + +- `src/comm/tilexr_internal.cpp`:增加 `Ascend950PR_9589 -> CHIP_950`。 +- `tests/comm/unit/test_tilexr_source_guards.cpp`:增加对应 source guard。 + +## Ascend950 编译与运行适配 + +Ascend950 单机 profiling 的关键编译适配在 `src/collectives/kernels/CMakeLists.txt`: + +- 新增 `TILEXR_COLLECTIVES_SOC_TYPE` CMake 变量。 +- 当 `TILEXR_COLLECTIVES_SOC_TYPE=Ascend950` 时,CCE AIV arch 使用 `dav-c310-vec`。 +- 其他平台默认仍使用原来的 `dav-c220-vec`。 + +相关 CCE 源文件也扩展了 `__DAV_C310_VEC__` 条件,使 Ascend950 编译时能生成 collectives kernel: + +- `src/collectives/kernels/tilexr_lccl_op.cpp` +- `src/collectives/kernels/lccl_op.h` +- `src/collectives/kernels/collectives.h` +- `src/collectives/kernels/kernels/collectives.cce` +- `src/collectives/kernels/datacopy_gm2gm.h` + +`datacopy_gm2gm.h` 还增加了 `TileXRAtomicTypeSupported`,避免 `dav-c310-vec` 下对不支持类型生成 atomic 指令。 + +## Profiling 采集链路 + +运行入口仍然是 `tests/collectives/run_collective_perf.sh` 和 `tests/collectives/tilexr-tests/tilexr_collective_perf.cpp`。 + +典型构建命令: + +```bash +source /usr/local/Ascend/cann/set_env.sh +cmake -S . -B build-profile-950 \ + -DTILEXR_BUILD_COLLECTIVES=ON \ + -DTILEXR_COLLECTIVES_ENABLE_PROFILING=ON \ + -DTILEXR_COLLECTIVES_SOC_TYPE=Ascend950 +cmake --build build-profile-950 --target tilexr_collective_perf -j"$(nproc)" +``` + +典型运行命令: + +```bash +cd tests/collectives +./run_collective_perf.sh 2 0 ../../build-profile-950/tests/collectives \ + --op allgather \ + --min-bytes 4096 \ + --max-bytes 4096 \ + --iters 2 \ + --warmup-iters 1 \ + --datatype int32 \ + --check 0 \ + --profile 1 \ + --profile-dir ../../run/prof/collectives-950 \ + --profile-sample-every 1 \ + --profile-ai-prompt 1 +``` + +采集流程: + +- 每个 measured launch 创建一个 `TileXRCollectivePerfSession`。 +- kernel launch 前通过 `PreparePerfTraceLaunch` 准备 device trace buffer。 +- kernel 内部写入 stage/core/rank 维度的统计信息。 +- launch 完成后调用 `TileXRCollectivePerfWriteReport` 输出单 launch 报告。 +- `run_collective_perf.sh` 等所有 rank 结束后调用 `tilexr_collective_profile_report.py` 生成聚合报告。 + +## 输出产物 + +每个 rank/launch 目录会生成: + +- `trace.json`:单次 launch 的结构化 trace,schema 为 `tilexr_perf_trace_report.v1`。 +- `summary.csv`:stage/core 维度统计表。 +- `analysis.md`:单 launch 文本分析。 +- `report.html`:单 launch HTML drilldown。 +- `ai_prompt.md`:可选,开启 `--profile-ai-prompt 1` 时生成。 + +profile 根目录会生成聚合产物: + +- `report.html`:保留原有 HTML 呈现,包含 bottleneck-first 摘要、timeline、drilldown 链接。 +- `trace_index.json`:聚合后的中间索引,schema 为 `tilexr_perf_trace_run.v1`。 +- `analysis.md`:跨 rank/launch 的文本摘要。 +- `ai_prompt.md`:可选聚合 prompt。 +- `perfetto_trace.json`:新增 Perfetto/Chrome trace event 格式,给 `ui.perfetto.dev` 使用。 + +聚合摘要现在额外包含 rank-level kernel summary: + +- `summary.rank_kernel`:按 rank 汇总 `kernel_total`,记录 launch 数、平均 kernel us、最大 kernel us、最慢 launch。 +- `summary.slowest_rank`:按平均 kernel us 排序选出的最慢 rank。 +- `report.html`:新增 `Rank-Level Summary` 表格,用于快速定位慢 rank 后再跳转到单 launch drilldown。 +- `analysis.md` / `ai_prompt.md`:同步输出 slowest rank 和 rank kernel totals。 + +## Perfetto Trace 支持 + +`tests/collectives/tilexr_collective_profile_report.py` 新增 `render_perfetto_trace()`: + +- 每个 rank 映射为 Perfetto process。 +- 每个 rank/core 映射为 Perfetto thread,thread 名为 `rankN/coreM`。 +- 每个 profiling stage 映射为 `ph: "X"` duration event,事件名为 `launchN/rankR/stage`。 +- 每个 rank 增加 `launch_windows` thread,并写入 `launchN/rankR/window` 对齐窗口。 +- measured launch 之间增加固定 gap,方便在 `ui.perfetto.dev` 中按 launch 展开和搜索。 +- 事件保留 `launch_id`、`launch_offset_us`、`normalized_ts`、`rank`、`core`、`stage`、`stage_id`、`sum_us`、`raw_cycles`、`max_cycles`、`message_bytes`、`rank_size` 和源 trace 路径。 + +注意:不同 NPU 的原始 cycle offset 不假设同步,因此 HTML 和 Perfetto 都按 rank/launch 内部归一化时间展示,更适合观察单 rank 内核阶段耗时、stage 分布和慢 core,而不是直接比较跨 NPU 的绝对开始时间。 + +## 测试覆盖 + +已补充/更新的测试包括: + +- `tests/collectives/unit/test_collective_profile_report.py` + - 校验聚合 HTML、`trace_index.json`、`analysis.md`。 + - 校验 rank-level summary、slowest rank、Perfetto launch window 和 launch/rank/stage 事件命名。 + - 校验 sparse launch、multi-size launch、missing trace diagnostics。 +- `tests/collectives/unit/test_tilexr_collectives_tools_sources.cpp` + - 校验 README 和 profile report helper 保留 rank summary / Perfetto marker 关键字段。 +- `tests/collectives/unit/test_tilexr_collectives_api.cpp` + - 校验 Ascend950 CCE arch 配置。 + - 校验 `__DAV_C310_VEC__` 相关源码路径。 +- `tests/comm/unit/test_tilexr_source_guards.cpp` + - 校验已观察到的 `Ascend950PR`、`Ascend950PR_9589`、`Ascend950PR_9599` variant 被映射为 `CHIP_950`。 + +## 当前限制 + +- 当前单机 profiling 路径依赖 standalone collectives kernel 内的 trace hook,主要用于分析 allgather/allreduce/reducescatter/broadcast/alltoall 这类已有 standalone 算子。 +- warmup launch 不写 profiling trace,聚合报告只展示 measured launch。 +- rank-level summary 已能直接看慢 rank,但更细的自动归因仍需要结合 stage/core drilldown 分析。 +- 多机 62+70 的 communicator bootstrap 可以成功,但 2 机 collective kernel 目前会在 kernel 阶段失败或超时;这属于跨节点数据面支持问题,不属于本报告覆盖的单机 profiling 范围。 + +## 建议后续 + +- 短期:继续用 70 的单机 allgather/profile 结果检查 rank-level summary 的可读性,并按真实数据调整排序和字段展示。 +- 短期:在 HTML 中补充 rank 间差值/百分比,例如 slowest rank 相比 fastest rank 慢多少。 +- 中期:为 Perfetto 增加更多 marker,例如 message size group、op group、rank delta marker,进一步降低定位慢 stage 的成本。 +- 中期:另起多机 profiling 任务,先解决 collective kernel 跨节点数据面,再复用当前 profile report 聚合链路。 diff --git a/src/collectives/host/collective_kernel.cpp b/src/collectives/host/collective_kernel.cpp index d5bb49d..834b153 100644 --- a/src/collectives/host/collective_kernel.cpp +++ b/src/collectives/host/collective_kernel.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -57,6 +58,7 @@ const TileXR::TileXRType kRegisteredCollectiveTypes[] = { TileXR::TileXRType::ALL_REDUCE, TileXR::TileXRType::REDUCE_SCATTER, TileXR::TileXRType::BROADCAST, + TileXR::TileXRType::PROFILE_PROBE, }; bool IsStandaloneCollectiveType(TileXR::TileXRType type) @@ -65,11 +67,17 @@ bool IsStandaloneCollectiveType(TileXR::TileXRType type) type == TileXR::TileXRType::ALL2ALL || type == TileXR::TileXRType::ALL_REDUCE || type == TileXR::TileXRType::REDUCE_SCATTER || - type == TileXR::TileXRType::BROADCAST; + type == TileXR::TileXRType::BROADCAST || + type == TileXR::TileXRType::PROFILE_PROBE; } int8_t *GetFunSig(TileXR::TileXRType type, TileXR::TileXRDataType dataType) { + if (type == TileXR::TileXRType::PROFILE_PROBE) { + const uint64_t sig = (static_cast(type) << FUNSIG_OFFSET_BITS << FUNSIG_OFFSET_BITS) + + FUNSIG_SKEW; + return reinterpret_cast(sig); + } const uint64_t sig = (static_cast(type) << FUNSIG_OFFSET_BITS << FUNSIG_OFFSET_BITS) + (static_cast(dataType) << FUNSIG_OFFSET_BITS) + FUNSIG_SKEW; return reinterpret_cast(sig); @@ -77,6 +85,9 @@ int8_t *GetFunSig(TileXR::TileXRType type, TileXR::TileXRDataType dataType) std::string KernelName(TileXR::TileXRType type, const DataTypeRegistration &dataType) { + if (type == TileXR::TileXRType::PROFILE_PROBE) { + return TileXR::TILEXR_TYPE2NAME.at(type); + } if (type == TileXR::TileXRType::BROADCAST) { return TileXR::TILEXR_TYPE2NAME.at(type); } @@ -102,16 +113,35 @@ int RegisterCollectivesKernelsLocked() void *binHandle = nullptr; rtError_t rtRet = rtDevBinaryRegister(&binary, &binHandle); if (rtRet != RT_ERROR_NONE) { + std::cerr << "TileXR collectives rtDevBinaryRegister failed, ret=" << rtRet + << ", binarySize=" << TileXRCollectivesKernelBinarySize << std::endl; g_registrationStatus = TileXR::TILEXR_ERROR_MKIRT; return g_registrationStatus; } for (const auto type : kRegisteredCollectiveTypes) { + if (type == TileXR::TileXRType::PROFILE_PROBE) { + const std::string name = KernelName(type, kDataTypes[0]); + rtRet = rtFunctionRegister(binHandle, GetFunSig(type, TileXR::TILEXR_DATA_TYPE_INT8), + name.c_str(), name.c_str(), 0); + if (rtRet != RT_ERROR_NONE) { + std::cerr << "TileXR collectives rtFunctionRegister failed, ret=" << rtRet + << ", kernel=" << name + << ", type=" << static_cast(type) << std::endl; + g_registrationStatus = TileXR::TILEXR_ERROR_MKIRT; + return g_registrationStatus; + } + continue; + } for (const auto &dataType : kDataTypes) { const std::string name = KernelName(type, dataType); rtRet = rtFunctionRegister(binHandle, GetFunSig(type, dataType.dataType), name.c_str(), name.c_str(), 0); if (rtRet != RT_ERROR_NONE) { + std::cerr << "TileXR collectives rtFunctionRegister failed, ret=" << rtRet + << ", kernel=" << name + << ", type=" << static_cast(type) + << ", dataType=" << static_cast(dataType.dataType) << std::endl; g_registrationStatus = TileXR::TILEXR_ERROR_MKIRT; return g_registrationStatus; } @@ -181,6 +211,14 @@ int LaunchCollectiveKernel(TileXRCommPtr comm, TileXR::TileXRType type, const Ho const rtError_t ret = rtKernelLaunchWithFlagV2(GetFunSig(type, dataType), blockDim, &argsInfo, nullptr, static_cast(stream), 0, &cfgInfo); + if (ret != RT_ERROR_NONE) { + std::cerr << "TileXR collectives rtKernelLaunchWithFlagV2 failed, ret=" << ret + << ", type=" << static_cast(type) + << ", dataType=" << static_cast(dataType) + << ", blockDim=" << blockDim + << ", kernelCount=" << kernelCount + << ", stream=" << stream << std::endl; + } return ret == RT_ERROR_NONE ? TileXR::TILEXR_SUCCESS : TileXR::TILEXR_ERROR_MKIRT; } diff --git a/src/collectives/host/collective_utils.cpp b/src/collectives/host/collective_utils.cpp index 1ff77e6..bef4646 100644 --- a/src/collectives/host/collective_utils.cpp +++ b/src/collectives/host/collective_utils.cpp @@ -201,5 +201,14 @@ uint32_t GetBroadcastBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize return rankSize; } +uint32_t GetProfileProbeBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize) +{ + if (commArgs.rankSize <= 0 || dataSize < 0) { + return 0; + } + constexpr uint32_t kDefaultProbeBlocks = 4; + return kDefaultProbeBlocks; +} + } // namespace Host } // namespace TileXRCollectives diff --git a/src/collectives/host/collective_utils.h b/src/collectives/host/collective_utils.h index f68c1f0..33d1aca 100644 --- a/src/collectives/host/collective_utils.h +++ b/src/collectives/host/collective_utils.h @@ -34,6 +34,8 @@ uint32_t GetReduceScatterBlockNum(const TileXR::CommArgs &commArgs, int64_t data uint32_t GetBroadcastBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize); +uint32_t GetProfileProbeBlockNum(const TileXR::CommArgs &commArgs, int64_t dataSize); + } // namespace Host } // namespace TileXRCollectives diff --git a/src/collectives/host/perf_trace_report.cpp b/src/collectives/host/perf_trace_report.cpp index 69ee408..3c5da8f 100644 --- a/src/collectives/host/perf_trace_report.cpp +++ b/src/collectives/host/perf_trace_report.cpp @@ -163,7 +163,7 @@ std::vector NonEmptyStats( { std::vector result; for (const auto &stat : stats) { - if (stat.count != 0 || stat.sumCycles != 0 || stat.maxCycles != 0) { + if (stat.count != 0) { result.push_back(stat); } } @@ -185,12 +185,17 @@ bool IsValidTraceHeader(const TileXR::TileXRPerfTraceHeader &header) } std::string BuildTraceJson(const TileXR::TileXRPerfTraceHeader &header, - const std::vector &stats) + const std::vector &stats, + const PerfReportOptions &options) { const auto nonEmptyStats = NonEmptyStats(stats); std::ostringstream out; out << "{\n"; out << " \"schema\": \"tilexr_perf_trace_report.v1\",\n"; + if (options.incomplete) { + out << " \"incomplete\": true,\n"; + out << " \"incomplete_reason\": \"" << EscapeJson(options.incompleteReason) << "\",\n"; + } out << " \"op_type\": " << header.opType << ",\n"; out << " \"op_name\": \"" << EscapeJson(OpName(header.opType)) << "\",\n"; out << " \"rank_size\": " << header.rankSize << ",\n"; @@ -214,6 +219,8 @@ std::string BuildTraceJson(const TileXR::TileXRPerfTraceHeader &header, << ", \"last_end_cycle\": " << stat.lastEndCycle << ", \"aux0\": " << stat.aux0 << ", \"aux1\": " << stat.aux1 + << ", \"aux2\": " << stat.aux2 + << ", \"aux3\": " << stat.aux3 << ", \"sum_us\": " << StatSumUs(header, stat) << "}"; if (i + 1 != nonEmptyStats.size()) { @@ -245,7 +252,8 @@ std::string BuildSummaryCsv(const std::vector &summaries) std::string BuildAnalysisMarkdown(const TileXR::TileXRPerfTraceHeader &header, const std::vector &summaries, - const std::vector &findings) + const std::vector &findings, + const PerfReportOptions &options) { std::ostringstream out; out << "# TileXR Collective Perf Analysis\n\n"; @@ -253,6 +261,11 @@ std::string BuildAnalysisMarkdown(const TileXR::TileXRPerfTraceHeader &header, out << "- Message bytes: " << header.messageBytes << "\n"; out << "- Rank size: " << header.rankSize << "\n"; out << "- Block dim: " << header.blockDim << "\n\n"; + if (options.incomplete) { + out << "## Trace Status\n\n"; + out << "- Incomplete trace: " << options.incompleteReason << "\n"; + out << "- Device-side stats were not copied back; metadata is preserved for run-level diagnostics.\n\n"; + } out << "## Findings\n\n"; for (const auto &finding : findings) { out << "- " << finding << "\n"; @@ -275,7 +288,8 @@ std::string BuildAnalysisMarkdown(const TileXR::TileXRPerfTraceHeader &header, std::string BuildHtmlReport(const TileXR::TileXRPerfTraceHeader &header, const std::vector &summaries, const std::vector &findings, - const std::vector &stats) + const std::vector &stats, + const PerfReportOptions &options) { const auto nonEmptyStats = NonEmptyStats(stats); std::ostringstream out; @@ -288,6 +302,11 @@ std::string BuildHtmlReport(const TileXR::TileXRPerfTraceHeader &header, out << "

Bottleneck First

\n"; out << "

Operation: " << EscapeHtml(OpName(header.opType)) << ", message bytes: " << header.messageBytes << "

\n"; + if (options.incomplete) { + out << "

Incomplete trace: " + << EscapeHtml(options.incompleteReason) + << ". Device-side stats were not copied back.

\n"; + } out << "
    \n"; for (const auto &finding : findings) { out << "
  • " << EscapeHtml(finding) << "
  • \n"; @@ -330,7 +349,8 @@ std::string BuildHtmlReport(const TileXR::TileXRPerfTraceHeader &header, std::string BuildAiPrompt(const TileXR::TileXRPerfTraceHeader &header, const std::vector &summaries, - const std::vector &findings) + const std::vector &findings, + const PerfReportOptions &options) { std::ostringstream out; out << "# TileXR collective profiling\n\n"; @@ -338,6 +358,10 @@ std::string BuildAiPrompt(const TileXR::TileXRPerfTraceHeader &header, out << "Operation: " << OpName(header.opType) << "\n"; out << "Message bytes: " << header.messageBytes << "\n"; out << "Cycle-to-us divisor: " << header.cycleToUsDivisor << "\n\n"; + if (options.incomplete) { + out << "Trace status: incomplete\n"; + out << "Incomplete reason: " << options.incompleteReason << "\n\n"; + } out << "Findings:\n"; for (const auto &finding : findings) { out << "- " << finding << "\n"; @@ -471,15 +495,18 @@ int WritePerfTraceReports( const auto summaries = SummarizePerfTrace(header, stats); const auto findings = AnalyzePerfTrace(header, summaries); - if (!WriteTextFile(JoinPath(options.outputDir, "trace.json"), BuildTraceJson(header, stats)) || + if (!WriteTextFile(JoinPath(options.outputDir, "trace.json"), BuildTraceJson(header, stats, options)) || !WriteTextFile(JoinPath(options.outputDir, "summary.csv"), BuildSummaryCsv(summaries)) || - !WriteTextFile(JoinPath(options.outputDir, "analysis.md"), BuildAnalysisMarkdown(header, summaries, findings)) || - !WriteTextFile(JoinPath(options.outputDir, "report.html"), BuildHtmlReport(header, summaries, findings, stats))) { + !WriteTextFile(JoinPath(options.outputDir, "analysis.md"), + BuildAnalysisMarkdown(header, summaries, findings, options)) || + !WriteTextFile(JoinPath(options.outputDir, "report.html"), + BuildHtmlReport(header, summaries, findings, stats, options))) { return TileXR::TILEXR_ERROR_INTERNAL; } if (options.emitAiPrompt && - !WriteTextFile(JoinPath(options.outputDir, "ai_prompt.md"), BuildAiPrompt(header, summaries, findings))) { + !WriteTextFile(JoinPath(options.outputDir, "ai_prompt.md"), + BuildAiPrompt(header, summaries, findings, options))) { return TileXR::TILEXR_ERROR_INTERNAL; } if (!options.emitAiPrompt && !RemoveIfExists(JoinPath(options.outputDir, "ai_prompt.md"))) { diff --git a/src/collectives/host/perf_trace_report.h b/src/collectives/host/perf_trace_report.h index 708ce8d..458b534 100644 --- a/src/collectives/host/perf_trace_report.h +++ b/src/collectives/host/perf_trace_report.h @@ -25,6 +25,8 @@ struct PerfStageSummary { struct PerfReportOptions { std::string outputDir; bool emitAiPrompt = false; + bool incomplete = false; + std::string incompleteReason; }; const char *PerfStageName(uint32_t stageId); diff --git a/src/collectives/host/perf_trace_session.cpp b/src/collectives/host/perf_trace_session.cpp index 0bf8f69..104b04f 100644 --- a/src/collectives/host/perf_trace_session.cpp +++ b/src/collectives/host/perf_trace_session.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -81,6 +82,13 @@ bool ComputeRequiredBytes(uint64_t statsOffset, uint64_t statsBytes, size_t *req return true; } +bool DisableKernelProfiling() +{ + const char *value = std::getenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING"); + return value != nullptr && + (std::string(value) == "1" || std::string(value) == "true" || std::string(value) == "yes"); +} + int64_t ComputeLaunchMessageBytes(TileXR::TileXRType opType, TileXR::TileXRDataType dataType, int64_t count, int rankSize) { @@ -201,7 +209,7 @@ int PreparePerfTraceLaunch(PerfTraceSession *session, const TileXR::CommArgs &co return TileXR::TILEXR_ERROR_INTERNAL; } - session->header.statsOffset = sizeof(TileXR::TileXRPerfTraceHeader); + session->header.statsOffset = TileXR::TILEXR_PERF_TRACE_STATS_OFFSET; size_t statsBytes = 0; if (!ComputeStatsBytes(session->hostStats.size(), &statsBytes)) { return TileXR::TILEXR_ERROR_INTERNAL; @@ -211,6 +219,11 @@ int PreparePerfTraceLaunch(PerfTraceSession *session, const TileXR::CommArgs &co if (!ComputeRequiredBytes(session->header.statsOffset, session->header.statsBytes, &requiredBytes)) { return TileXR::TILEXR_ERROR_INTERNAL; } + if (DisableKernelProfiling()) { + session->deviceTraceReady = false; + *deviceTrace = nullptr; + return TileXR::TILEXR_SUCCESS; + } if (session->deviceBuffer == nullptr || !session->ownsDeviceBuffer || session->deviceBufferBytes < requiredBytes) { void *newBuffer = nullptr; aclError allocRet = g_runtimeHooks->mallocDevice(&newBuffer, requiredBytes); @@ -342,3 +355,25 @@ extern "C" int TileXRCollectivePerfWriteReport(TileXRCollectivePerfSession sessi return TileXR::TILEXR_ERROR_INTERNAL; } } + +extern "C" int TileXRCollectivePerfWriteIncompleteReport(TileXRCollectivePerfSession session, const char *reason) +{ + if (session == nullptr) { + return TileXR::TILEXR_ERROR_PARA_CHECK_FAIL; + } + + try { + TileXRCollectives::Host::PerfTraceSession *impl = + static_cast(session); + TileXRCollectives::Host::PerfReportOptions options {}; + options.outputDir = impl->outputDir; + options.emitAiPrompt = impl->config.emitAiPrompt != 0; + options.incomplete = true; + options.incompleteReason = reason == nullptr || reason[0] == '\0' ? "kernel launch did not complete" : reason; + return TileXRCollectives::Host::WritePerfTraceReports(impl->header, impl->hostStats, options); + } catch (const std::exception &) { + return TileXR::TILEXR_ERROR_INTERNAL; + } catch (...) { + return TileXR::TILEXR_ERROR_INTERNAL; + } +} diff --git a/src/collectives/host/tilexr_collectives.cpp b/src/collectives/host/tilexr_collectives.cpp index 3da431e..97e9401 100644 --- a/src/collectives/host/tilexr_collectives.cpp +++ b/src/collectives/host/tilexr_collectives.cpp @@ -225,3 +225,27 @@ int TileXRBroadcast(void *buf, int64_t count, buf, buf, bytes, dataType, blockDim, stream, TileXRCollectives::Host::CollectiveLaunchAttrs { 0, root }); } + +int TileXRProfileProbe(void *sendBuf, void *recvBuf, int64_t count, + TileXR::TileXRDataType dataType, TileXRCommPtr comm, + aclrtStream stream) +{ + int ret = ValidateCommon(sendBuf, recvBuf, count, dataType, comm); + if (ret != TileXR::TILEXR_SUCCESS) { + return ret; + } + + TileXRCollectives::Host::HostLaunchContext context; + ret = TileXRCollectives::Host::PrepareHostLaunchContext(comm, context); + if (ret != TileXR::TILEXR_SUCCESS) { + return ret; + } + + const int64_t bytes = TileXRCollectives::Host::CountToBytes(count, dataType); + const uint32_t blockDim = TileXRCollectives::Host::GetProfileProbeBlockNum(*context.hostArgs, bytes); + if (blockDim == 0) { + return TileXR::TILEXR_ERROR_PARA_CHECK_FAIL; + } + return TileXRCollectives::Host::LaunchCollectiveKernel(comm, TileXR::TileXRType::PROFILE_PROBE, context, + sendBuf, recvBuf, bytes, TileXR::TILEXR_DATA_TYPE_INT8, blockDim, stream); +} diff --git a/src/collectives/kernels/CMakeLists.txt b/src/collectives/kernels/CMakeLists.txt index 4c9e53b..6275270 100644 --- a/src/collectives/kernels/CMakeLists.txt +++ b/src/collectives/kernels/CMakeLists.txt @@ -6,8 +6,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") enable_language(CCE) -set(TILEXR_COLLECTIVES_1OP_BIN_SIZE 5242880 CACHE STRING "Padded size for TileXR collectives CCE binary") +set(TILEXR_COLLECTIVES_1OP_BIN_SIZE 10485760 CACHE STRING "Padded size for TileXR collectives CCE binary") option(TILEXR_COLLECTIVES_ENABLE_PROFILING "Enable TileXR collectives kernel profiling helpers" OFF) +set(TILEXR_COLLECTIVES_SOC_TYPE "" CACHE STRING "SOC type used for TileXR collectives CCE kernels") set(CCE_COMPILE_OPTION -O2 @@ -21,7 +22,11 @@ set(CCE_COMPILE_OPTION "SHELL:-mllvm --cce-aicore-jump-expand=true" ) -set(AIV_ARCH dav-c220-vec) +if(TILEXR_COLLECTIVES_SOC_TYPE STREQUAL "Ascend950") + set(TILEXR_COLLECTIVES_AICORE_ARCH dav-c310-vec) +else() + set(TILEXR_COLLECTIVES_AICORE_ARCH dav-c220-vec) +endif() set_source_files_properties(tilexr_lccl_op.cpp PROPERTIES LANGUAGE CCE) include_directories( @@ -46,7 +51,7 @@ add_library(tilexr_collectives_op_tmp OBJECT target_compile_options(tilexr_collectives_op_tmp PRIVATE ${CCE_COMPILE_OPTION} - --cce-aicore-arch=${AIV_ARCH} + --cce-aicore-arch=${TILEXR_COLLECTIVES_AICORE_ARCH} ) if(TILEXR_COLLECTIVES_ENABLE_PROFILING) diff --git a/src/collectives/kernels/collectives.h b/src/collectives/kernels/collectives.h index 26f3be5..a6d4be5 100644 --- a/src/collectives/kernels/collectives.h +++ b/src/collectives/kernels/collectives.h @@ -441,7 +441,7 @@ class Collectives { { PipeBarrier(); if (op != -1) { -#ifdef __DAV_C220_VEC__ +#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__) SetAtomicOpType(op); #endif } diff --git a/src/collectives/kernels/datacopy_gm2gm.h b/src/collectives/kernels/datacopy_gm2gm.h index c49e7e0..fbbeb51 100644 --- a/src/collectives/kernels/datacopy_gm2gm.h +++ b/src/collectives/kernels/datacopy_gm2gm.h @@ -21,25 +21,38 @@ constexpr int32_t BUFFER_NUM = 1; constexpr int32_t TILE_NUM = 2; constexpr int32_t BLOCK_SIZE = UB_SINGLE_DMA_SIZE_MAX / TILE_NUM / BUFFER_NUM; +template +struct TileXRAtomicTypeSupported { +#if defined(__DAV_C310_VEC__) + static constexpr bool value = std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v; +#else + static constexpr bool value = true; +#endif +}; + template FORCE_INLINE_AICORE void SetAtomicOpType(int op) { - switch (op) { - case ADD: - AscendC::SetAtomicAdd(); - break; + if constexpr (TileXRAtomicTypeSupported::value) { + switch (op) { + case ADD: + AscendC::SetAtomicAdd(); + break; - case MUL: - break; - case MAX: - AscendC::SetAtomicMax(); - break; - case MIN: - AscendC::SetAtomicMin(); - break; - default: - AscendC::SetAtomicNone(); - ; + case MUL: + break; + case MAX: + AscendC::SetAtomicMax(); + break; + case MIN: + AscendC::SetAtomicMin(); + break; + default: + AscendC::SetAtomicNone(); + ; + } } } @@ -222,7 +235,7 @@ class DataCopyGM2GM { { PipeBarrier(); if (op != -1) { -#ifdef __DAV_C220_VEC__ +#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__) SetAtomicOpType(op); #endif } @@ -329,4 +342,4 @@ class DataCopyGM2GM { const __gm__ T* outputGm = nullptr; int op; }; -#endif // LCCL_DATACOPY_GM2GM_H \ No newline at end of file +#endif // LCCL_DATACOPY_GM2GM_H diff --git a/src/collectives/kernels/kernels/collectives.cce b/src/collectives/kernels/kernels/collectives.cce index ca9570e..1111a77 100644 --- a/src/collectives/kernels/kernels/collectives.cce +++ b/src/collectives/kernels/kernels/collectives.cce @@ -10,7 +10,7 @@ #ifndef TILEXR_KERNEL_COLLECTIVES_CCE #define TILEXR_KERNEL_COLLECTIVES_CCE -#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) #define __aicore__ #define __ubuf__ #define __gm__ @@ -574,7 +574,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessData(int64_t dataSi return; } AscendC::PipeBarrier(); - #ifdef __DAV_C220_VEC__ + #if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__) SetAtomicOpType(op); #endif AscendC::PipeBarrier(); @@ -617,7 +617,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dat } AscendC::PipeBarrier(); -#ifdef __DAV_C220_VEC__ +#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__) SetAtomicOpType(op); #endif AscendC::PipeBarrier(); @@ -659,7 +659,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNewNonBarrier(i AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); -#ifdef __DAV_C220_VEC__ +#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__) SetAtomicOpType(op); #endif AscendC::SetFlag(EVENT_ID0); diff --git a/src/collectives/kernels/kernels/lcal_profile_probe.cce b/src/collectives/kernels/kernels/lcal_profile_probe.cce new file mode 100644 index 0000000..96b8b9a --- /dev/null +++ b/src/collectives/kernels/kernels/lcal_profile_probe.cce @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2024-2026 TileXR Project + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. + */ +#include +#include "collectives.cce" + +constexpr int64_t TILEXR_PROFILE_PROBE_UB_OFFSET = 64; +constexpr int64_t TILEXR_PROFILE_PROBE_STATS_GUARD_BYTES = 512; +constexpr int64_t TILEXR_PROFILE_PROBE_COPY_UB_BYTES = + TileXR::TILEXR_PERF_TRACE_STATS_UB_OFFSET - TILEXR_PROFILE_PROBE_UB_OFFSET - + TILEXR_PROFILE_PROBE_STATS_GUARD_BYTES; + +__attribute__((always_inline)) inline __aicore__ void TileXRProfileProbeCopy( + GM_ADDR input, GM_ADDR output, int64_t len, __ubuf__ uint8_t *inputUB) +{ + if (input == nullptr || output == nullptr || len <= 0) { + return; + } + const int64_t blockNum = GetBlockNum(); + if (blockNum <= 0) { + return; + } + + const int64_t perCore = (len + blockNum - 1) / blockNum; + const int64_t start = static_cast(GetBlockIdx()) * perCore; + if (start >= len) { + return; + } + int64_t remain = len - start; + if (remain > perCore) { + remain = perCore; + } + if (remain > TILEXR_PROFILE_PROBE_COPY_UB_BYTES) { + remain = TILEXR_PROFILE_PROBE_COPY_UB_BYTES; + } + + CpGM2UB(inputUB, reinterpret_cast<__gm__ uint8_t *>(input) + start, static_cast(remain)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM(reinterpret_cast<__gm__ uint8_t *>(output) + start, inputUB, static_cast(remain)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); +} + +__attribute__((always_inline)) inline __aicore__ void TileXRProfileProbeKernel( + KERNELS_ARGS_FUN(), int rank, int rankSize, uint32_t extraFlag) +{ + (void)commArgs; + (void)magic; + (void)op; + (void)root; + (void)cycleCount; + (void)scale; + (void)scaleCount; + (void)offset; + (void)rankSize; + (void)extraFlag; + + const uint32_t perfCore = static_cast(GetBlockIdx()); + const uint32_t perfRank = static_cast(rank); + __ubuf__ uint8_t *inputUB = reinterpret_cast<__ubuf__ uint8_t *>(TILEXR_PROFILE_PROBE_UB_OFFSET); + + auto kernelTotal = TileXR::TileXRPerfStageBegin( + perfTrace, TileXR::PerfStageId::KERNEL_TOTAL, TileXR::PerfBarrierPolicy::NO_BARRIER); + auto chunkTotal = TileXR::TileXRPerfStageBegin( + perfTrace, TileXR::PerfStageId::CHUNK_TOTAL, TileXR::PerfBarrierPolicy::NO_BARRIER); + + auto postSync = TileXR::TileXRPerfStageBegin( + perfTrace, TileXR::PerfStageId::POST_SYNC, TileXR::PerfBarrierPolicy::BARRIERED); + AscendC::PipeBarrier(); + TileXR::TileXRPerfStageEnd( + perfTrace, perfRank, perfCore, TileXR::PerfStageId::POST_SYNC, + postSync, TileXR::PerfBarrierPolicy::END_BARRIER_ONLY); + + auto localInputToIpc = TileXR::TileXRPerfStageBegin( + perfTrace, TileXR::PerfStageId::LOCAL_INPUT_TO_IPC, TileXR::PerfBarrierPolicy::BARRIERED); + TileXRProfileProbeCopy(input, output, len, inputUB); + TileXR::TileXRPerfStageEnd( + perfTrace, perfRank, perfCore, TileXR::PerfStageId::LOCAL_INPUT_TO_IPC, + localInputToIpc, TileXR::PerfBarrierPolicy::END_BARRIER_ONLY); + + const uint64_t pollStart = static_cast(AscendC::GetSystemCycle()); + AscendC::PipeBarrier(); + const uint64_t pollEnd = static_cast(AscendC::GetSystemCycle()); + TileXR::TileXRPerfAccumulateDuration( + perfTrace, perfRank, perfCore, TileXR::PerfStageId::FLAG_POLL_WAIT, pollStart, pollEnd); + + auto peerIpcToOutput = TileXR::TileXRPerfStageBegin( + perfTrace, TileXR::PerfStageId::PEER_IPC_TO_OUTPUT, TileXR::PerfBarrierPolicy::BARRIERED); + TileXRProfileProbeCopy(input, output, len, inputUB); + TileXR::TileXRPerfStageEnd( + perfTrace, perfRank, perfCore, TileXR::PerfStageId::PEER_IPC_TO_OUTPUT, + peerIpcToOutput, TileXR::PerfBarrierPolicy::END_BARRIER_ONLY); + + auto chunkBarrier = TileXR::TileXRPerfStageBegin( + perfTrace, TileXR::PerfStageId::CHUNK_BARRIER, TileXR::PerfBarrierPolicy::NO_BARRIER); + AscendC::PipeBarrier(); + TileXR::TileXRPerfStageEnd( + perfTrace, perfRank, perfCore, TileXR::PerfStageId::CHUNK_BARRIER, + chunkBarrier, TileXR::PerfBarrierPolicy::NO_BARRIER); + TileXR::TileXRPerfStageEnd( + perfTrace, perfRank, perfCore, TileXR::PerfStageId::CHUNK_TOTAL, + chunkTotal, TileXR::PerfBarrierPolicy::NO_BARRIER); + TileXR::TileXRPerfStageEnd( + perfTrace, perfRank, perfCore, TileXR::PerfStageId::KERNEL_TOTAL, + kernelTotal, TileXR::PerfBarrierPolicy::NO_BARRIER); +} diff --git a/src/collectives/kernels/lccl_op.h b/src/collectives/kernels/lccl_op.h index 47a8178..555d999 100644 --- a/src/collectives/kernels/lccl_op.h +++ b/src/collectives/kernels/lccl_op.h @@ -10,7 +10,7 @@ #ifndef TILEXR_LCCL_OP_H #define TILEXR_LCCL_OP_H -#if defined(__DAV_C220_VEC__) || defined(__DAV_C220_CUBE__) +#if defined(__DAV_C220_VEC__) || defined(__DAV_C220_CUBE__) || defined(__DAV_C310_VEC__) #include "op_def.h" #include "allgather.h" @@ -49,6 +49,7 @@ #include "kernels/lcal_broadcast_write.cce" #include "kernels/lcal_broadcast_big_data.cce" #include "kernels/lcal_all2all_transpose.cce" +#include "kernels/lcal_profile_probe.cce" extern "C" __global__ __aicore__ __attribute__((section("Attr_Section_TileXR"))) void TileXRDescriptor() {} @@ -74,6 +75,15 @@ extern "C" __global__ __aicore__ void TileXRBroadcast##suffix(KERNELS_ARGS_FUN() } \ } +#define LCCL_PROFILE_PROBE_FUNC_AUTO_DEF() \ +extern "C" __global__ __aicore__ void TileXRProfileProbe(KERNELS_ARGS_FUN()) \ +{ \ + if ASCEND_IS_AIV { \ + GET_COMM_ARGS; \ + TileXRProfileProbeKernel(KERNELS_ARGS_CALL(), rank, rankSize, extraFlag); \ + } \ +} + #define LCCL_ALLGATHER_FUNC_AUTO_DEF(type, suffix) \ extern "C" __global__ __aicore__ void TileXRAllGather_##type##suffix(KERNELS_ARGS_FUN()) { \ if ASCEND_IS_AIV { \ diff --git a/src/collectives/kernels/perf_trace_kernel.h b/src/collectives/kernels/perf_trace_kernel.h index 3df62ca..0ce3d35 100644 --- a/src/collectives/kernels/perf_trace_kernel.h +++ b/src/collectives/kernels/perf_trace_kernel.h @@ -3,6 +3,7 @@ #include "comm_args.h" #include "kernel_operator.h" +#include "datacopy_gm2gm.h" #include "tilexr_perf_trace.h" namespace TileXR { @@ -11,6 +12,8 @@ struct TileXRPerfStageToken { uint64_t startCycle = 0; }; +constexpr int64_t TILEXR_PERF_TRACE_STATS_UB_OFFSET = 195616; + #if defined(TILEXR_COLLECTIVES_ENABLE_PROFILING) __attribute__((always_inline)) inline __aicore__ bool TileXRPerfTraceEnabled(GM_ADDR trace) @@ -25,15 +28,13 @@ __attribute__((always_inline)) inline __aicore__ __gm__ TileXRPerfCoreStageStats return nullptr; } - __gm__ TileXRPerfTraceHeader *header = reinterpret_cast<__gm__ TileXRPerfTraceHeader *>(trace); - const size_t slot = PerfTraceStatsOffset(rank, core, stage, header->maxCoreCount, header->stageCount); - return reinterpret_cast<__gm__ TileXRPerfCoreStageStats *>(trace + header->statsOffset) + slot; + const size_t slot = PerfTraceStatsOffset(rank, core, stage, GetBlockNum(), TILEXR_PERF_STAGE_COUNT); + return reinterpret_cast<__gm__ TileXRPerfCoreStageStats *>(trace + TILEXR_PERF_TRACE_STATS_OFFSET) + slot; } __attribute__((always_inline)) inline __aicore__ TileXRPerfStageToken TileXRPerfStageBegin( GM_ADDR trace, PerfStageId stage, PerfBarrierPolicy policy) { - (void)stage; TileXRPerfStageToken token {}; if (trace == nullptr) { return token; @@ -41,12 +42,14 @@ __attribute__((always_inline)) inline __aicore__ TileXRPerfStageToken TileXRPerf if (policy == PerfBarrierPolicy::BARRIERED) { AscendC::PipeBarrier(); } + (void)stage; token.startCycle = static_cast(AscendC::GetSystemCycle()); return token; } __attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDuration( - GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle) + GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle, + __ubuf__ TileXRPerfCoreStageStats *statsUB) { if (endCycle < startCycle) { return; @@ -58,35 +61,55 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDurati return; } + CpGM2UB(reinterpret_cast<__ubuf__ uint8_t *>(statsUB), + reinterpret_cast<__gm__ uint8_t *>(slot), sizeof(TileXRPerfCoreStageStats)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + const uint64_t duration = endCycle - startCycle; - slot->rank = rank; - slot->core = core; - slot->stageId = stageId; - if (slot->count == 0) { - slot->minCycles = duration; - slot->maxCycles = duration; - slot->firstStartCycle = startCycle; + statsUB->rank = rank; + statsUB->core = core; + statsUB->stageId = stageId; + statsUB->reserved = 0; + if (statsUB->count == 0) { + statsUB->minCycles = duration; + statsUB->maxCycles = duration; + statsUB->firstStartCycle = startCycle; } else { - if (duration < slot->minCycles) { - slot->minCycles = duration; + if (duration < statsUB->minCycles) { + statsUB->minCycles = duration; } - if (duration > slot->maxCycles) { - slot->maxCycles = duration; + if (duration > statsUB->maxCycles) { + statsUB->maxCycles = duration; } - if (startCycle < slot->firstStartCycle) { - slot->firstStartCycle = startCycle; + if (startCycle < statsUB->firstStartCycle) { + statsUB->firstStartCycle = startCycle; } } - slot->count += 1; - slot->sumCycles += duration; - if (endCycle > slot->lastEndCycle) { - slot->lastEndCycle = endCycle; + statsUB->count += 1; + statsUB->sumCycles += duration; + if (endCycle > statsUB->lastEndCycle) { + statsUB->lastEndCycle = endCycle; } + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM(reinterpret_cast<__gm__ uint8_t *>(slot), + reinterpret_cast<__ubuf__ uint8_t *>(statsUB), sizeof(TileXRPerfCoreStageStats)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); +} + +__attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDuration( + GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle) +{ + TileXRPerfAccumulateDuration( + trace, rank, core, stage, startCycle, endCycle, + reinterpret_cast<__ubuf__ TileXRPerfCoreStageStats *>(TILEXR_PERF_TRACE_STATS_UB_OFFSET)); } __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd( GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token, - PerfBarrierPolicy policy) + PerfBarrierPolicy policy, __ubuf__ TileXRPerfCoreStageStats *statsUB) { if (trace == nullptr) { return; @@ -95,7 +118,16 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd( AscendC::PipeBarrier(); } const uint64_t endCycle = static_cast(AscendC::GetSystemCycle()); - TileXRPerfAccumulateDuration(trace, rank, core, stage, token.startCycle, endCycle); + TileXRPerfAccumulateDuration(trace, rank, core, stage, token.startCycle, endCycle, statsUB); +} + +__attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd( + GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token, + PerfBarrierPolicy policy) +{ + TileXRPerfStageEnd( + trace, rank, core, stage, token, policy, + reinterpret_cast<__ubuf__ TileXRPerfCoreStageStats *>(TILEXR_PERF_TRACE_STATS_UB_OFFSET)); } #else @@ -136,6 +168,14 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDurati (void)endCycle; } +__attribute__((always_inline)) inline __aicore__ void TileXRPerfAccumulateDuration( + GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, uint64_t startCycle, uint64_t endCycle, + __ubuf__ TileXRPerfCoreStageStats *statsUB) +{ + (void)statsUB; + TileXRPerfAccumulateDuration(trace, rank, core, stage, startCycle, endCycle); +} + __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd( GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token, PerfBarrierPolicy policy) @@ -148,6 +188,14 @@ __attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd( (void)policy; } +__attribute__((always_inline)) inline __aicore__ void TileXRPerfStageEnd( + GM_ADDR trace, uint32_t rank, uint32_t core, PerfStageId stage, TileXRPerfStageToken token, + PerfBarrierPolicy policy, __ubuf__ TileXRPerfCoreStageStats *statsUB) +{ + (void)statsUB; + TileXRPerfStageEnd(trace, rank, core, stage, token, policy); +} + #endif } // namespace TileXR diff --git a/src/collectives/kernels/tilexr_lccl_op.cpp b/src/collectives/kernels/tilexr_lccl_op.cpp index 75c2272..10fa497 100644 --- a/src/collectives/kernels/tilexr_lccl_op.cpp +++ b/src/collectives/kernels/tilexr_lccl_op.cpp @@ -1,4 +1,4 @@ -#ifdef __DAV_C220_VEC__ +#if defined(__DAV_C220_VEC__) || defined(__DAV_C310_VEC__) #include "lccl_op.h" @@ -7,5 +7,6 @@ LCCL_TYPE_AIV_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF); LCCL_TYPE_AIV_FUNC(LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF); LCCL_TYPE_AIV_FUNC(LCCL_ALL2ALL_FUNC_AUTO_DEF); LCCL_BROADCAST_FUNC_AUTO_DEF(); +LCCL_PROFILE_PROBE_FUNC_AUTO_DEF(); #endif diff --git a/src/comm/tilexr_internal.cpp b/src/comm/tilexr_internal.cpp index 1e2bc34..cbc5146 100644 --- a/src/comm/tilexr_internal.cpp +++ b/src/comm/tilexr_internal.cpp @@ -39,7 +39,9 @@ const std::unordered_map CHIP_MAP = { {"Ascend950DT", ChipName::CHIP_950}, {"Ascend950DT_9581", ChipName::CHIP_950}, {"Ascend950DT_9584", ChipName::CHIP_950}, - {"Ascend950PR", ChipName::CHIP_950} + {"Ascend950PR", ChipName::CHIP_950}, + {"Ascend950PR_9589", ChipName::CHIP_950}, + {"Ascend950PR_9599", ChipName::CHIP_950} }; /** diff --git a/src/include/tilexr_collectives.h b/src/include/tilexr_collectives.h index 9faf687..50fa782 100644 --- a/src/include/tilexr_collectives.h +++ b/src/include/tilexr_collectives.h @@ -35,6 +35,9 @@ int TileXRReduceScatter(void *sendBuf, void *recvBuf, int64_t recvCount, int TileXRBroadcast(void *buf, int64_t count, TileXR::TileXRDataType dataType, int root, TileXRCommPtr comm, aclrtStream stream); +int TileXRProfileProbe(void *sendBuf, void *recvBuf, int64_t count, + TileXR::TileXRDataType dataType, TileXRCommPtr comm, + aclrtStream stream); } diff --git a/src/include/tilexr_collectives_perf.h b/src/include/tilexr_collectives_perf.h index 42773ed..ee648b6 100644 --- a/src/include/tilexr_collectives_perf.h +++ b/src/include/tilexr_collectives_perf.h @@ -20,6 +20,7 @@ int TileXRCollectivePerfSessionCreate(const TileXRCollectivePerfConfig *config, int TileXRCollectivePerfSessionDestroy(TileXRCollectivePerfSession session); int TileXRCollectivePerfSetActiveSession(TileXRCollectivePerfSession session); int TileXRCollectivePerfWriteReport(TileXRCollectivePerfSession session); +int TileXRCollectivePerfWriteIncompleteReport(TileXRCollectivePerfSession session, const char *reason); #ifdef __cplusplus } diff --git a/src/include/tilexr_perf_trace.h b/src/include/tilexr_perf_trace.h index c0b1e74..78d91fd 100644 --- a/src/include/tilexr_perf_trace.h +++ b/src/include/tilexr_perf_trace.h @@ -16,6 +16,7 @@ constexpr uint32_t TILEXR_PERF_TRACE_MAGIC = 0x54585054u; // TXPT constexpr uint32_t TILEXR_PERF_TRACE_VERSION = 1u; constexpr uint32_t TILEXR_PERF_MAX_STAGE_NAME = 32u; constexpr uint32_t TILEXR_PERF_STAGE_COUNT = 7u; +constexpr uint32_t TILEXR_PERF_TRACE_STATS_OFFSET = 128u; enum class PerfChipClass : uint32_t { GENERIC = 0, @@ -52,7 +53,7 @@ struct TileXRPerfTraceHeader { uint32_t headerSize = sizeof(TileXRPerfTraceHeader); // Stage descriptions are static schema metadata; this records their ABI size. uint32_t stageDescSize = sizeof(uint32_t) * 4 + TILEXR_PERF_MAX_STAGE_NAME; - uint32_t coreStageStatsSize = sizeof(uint32_t) * 4 + sizeof(uint64_t) * 8; + uint32_t coreStageStatsSize = sizeof(uint32_t) * 4 + sizeof(uint64_t) * 10; uint32_t flags = 0; uint32_t rank = 0; uint32_t rankSize = 0; @@ -89,6 +90,8 @@ struct TileXRPerfCoreStageStats { uint64_t lastEndCycle = 0; uint64_t aux0 = 0; uint64_t aux1 = 0; + uint64_t aux2 = 0; + uint64_t aux3 = 0; }; TILEXR_PERF_TRACE_INLINE size_t PerfTraceStatsOffset(uint32_t rank, uint32_t core, uint32_t stage, diff --git a/src/include/tilexr_types.h b/src/include/tilexr_types.h index 7fda01e..ac53458 100644 --- a/src/include/tilexr_types.h +++ b/src/include/tilexr_types.h @@ -102,6 +102,7 @@ enum class TileXRType { LOCAL_REDUCE = 8, SEND = 9, RECV = 10, + PROFILE_PROBE = 11, PURE_MATMUL = 101, MATMUL_ALL_REDUCE = 102, MATMUL_REDUCE_SCATTER = 103, @@ -144,6 +145,7 @@ const std::map TILEXR_TYPE2NAME = { { TileXRType::GATHER, "TileXRGather" }, { TileXRType::SEND, "TileXRSend" }, { TileXRType::RECV, "TileXRRecv" }, + { TileXRType::PROFILE_PROBE, "TileXRProfileProbe" }, { TileXRType::ALLTOALLV_ALLGATHER_MATMUL, "TileXRAllToAllVAllGatherMatmul" }, { TileXRType::MATMUL_REDUCESCATTER_ALLTOALLV, "TileXRMatmulReduceScatterAllToAllV" }, diff --git a/tests/collectives/CMakeLists.txt b/tests/collectives/CMakeLists.txt index 65aac8c..93e3b05 100644 --- a/tests/collectives/CMakeLists.txt +++ b/tests/collectives/CMakeLists.txt @@ -232,6 +232,13 @@ set(TILEXR_COLLECTIVES_TEST_LINK_DIRS ${ASCEND_DRIVER_PATH}/lib64/driver ) +function(TileXRAddAscendRpathLink target_name) + target_link_options(${target_name} PRIVATE + "LINKER:-rpath-link,${ASCEND_HOME_PATH}/${ARCH}-linux/lib64" + "LINKER:-rpath-link,${ASCEND_DRIVER_PATH}/lib64/driver" + ) +endfunction() + target_link_libraries(test_tilexr_collectives_header_compile ${TILEXR_COLLECTIVES_TEST_TARGET} ) @@ -286,6 +293,21 @@ target_link_libraries(test_prepare_host_launch_context ${TILEXR_COLLECTIVES_TEST_TARGET} ) +foreach(_tilexr_collectives_link_target + test_tilexr_collectives_header_compile + test_tilexr_perf_trace_layout + test_collective_perf_report + test_collective_perf_session + test_tilexr_collectives_correctness + tilexr_collective_perf + test_tilexr_collectives_stub_behavior + test_tilexr_collectives_uninitialized_comm + test_collective_host_utils + test_tilexr_comm_next_magic + test_prepare_host_launch_context) + TileXRAddAscendRpathLink(${_tilexr_collectives_link_target}) +endforeach() + add_test(NAME test_tilexr_collectives_api COMMAND test_tilexr_collectives_api) add_test(NAME test_tilexr_collectives_kernel_ownership COMMAND test_tilexr_collectives_kernel_ownership) add_test(NAME test_tilexr_collectives_tools_sources COMMAND test_tilexr_collectives_tools_sources) @@ -332,6 +354,7 @@ install(TARGETS install(PROGRAMS run_collectives_correctness.sh run_collective_perf.sh + run_collective_perf_multihost.sh tilexr_collective_profile_report.py DESTINATION ${CMAKE_INSTALL_BINDIR} ) diff --git a/tests/collectives/README.md b/tests/collectives/README.md index 42c7d9e..df6348d 100644 --- a/tests/collectives/README.md +++ b/tests/collectives/README.md @@ -58,7 +58,7 @@ cd tests/collectives --iters 20 --warmup-iters 5 --datatype int32 --check 1 ``` -Main options are `--op allgather|alltoall|allreduce|reducescatter|broadcast`, `--min-bytes`, `--max-bytes`, `--step-factor`, `--iters`, +Main options are `--op allgather|alltoall|allreduce|reducescatter|broadcast|profile-probe`, `--min-bytes`, `--max-bytes`, `--step-factor`, `--iters`, `--warmup-iters`, `--datatype int8|int16|int32|int64|fp16|fp32|bf16`, `--rank-size`, `--rank`, `--first-npu`, `--check 0|1`, and `--csv `. Optional thresholds `--min-algbw` and `--max-latency-us` are available but are not set by default. @@ -105,13 +105,15 @@ After all rank processes finish successfully, `run_collective_perf.sh` also writ run/prof/collectives/report.html run/prof/collectives/trace_index.json run/prof/collectives/analysis.md +run/prof/collectives/perfetto_trace.json ``` When prompt export is enabled, the aggregate prompt is written as `run/prof/collectives/ai_prompt.md`. -The root-level report.html keeps the bottleneck-first summary and adds a zoomable chronological timeline across -sampled measured iterations. Warmup execution is controlled by the existing `--warmup-iters` option and is reported -as metadata; warmup launches are not profiled by this report path. The per-launch `rank/launch/report.html` -files remain available for drilldown. +The root-level report.html keeps the bottleneck-first summary, adds a rank-level summary for spotting slow ranks, +and adds a zoomable chronological timeline across sampled measured iterations. `perfetto_trace.json` uses +launch/rank/stage event names plus per-rank launch windows so `ui.perfetto.dev` can quickly filter by launch or rank. +Warmup execution is controlled by the existing `--warmup-iters` option and is reported as metadata; warmup launches are not profiled by this report path. +The per-launch `rank/launch/report.html` files remain available for drilldown. To regenerate the aggregate report from an existing profile directory: @@ -120,6 +122,35 @@ python3 tilexr_collective_profile_report.py run/prof/collectives \ --warmup-iters 5 --iters 20 --profile-sample-every 1 --emit-ai-prompt ``` +### Multi-Host Profiling + +`tilexr_collective_perf` also supports a socket bootstrap mode for one process per host. The helper script below +starts each rank over SSH with `--comm-mode socket`, collects `rank/` profile directories back to the first host, +and then writes a single aggregate `report.html` plus `perfetto_trace.json` containing host/rank labels. + +Use `--op profile-probe` first as a multi-host profiling/report smoke mode. It initializes ACL and the socket +communicator on all hosts, launches a real AIV kernel on each rank, and records the same kernel-level stages as the +single-host collective report. The probe only performs local device-memory copies and does not touch cross-host +`peerMems[]`, so it is useful for two-host profiling/report validation but should not be reported as allgather +cross-host bandwidth. + +```bash +cd tests/collectives +TILEXR_MULTIHOST_PEERS="0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0" \ +TILEXR_COMM_ID=141.62.24.62:10067 \ +TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC=300 \ +bash ./run_collective_perf_multihost.sh /home/l00929943/TileXR/run/prof/collectives-2host-profile-probe-62-70 \ + /home/l00929943/TileXR/build-profile-950/tests/collectives \ + --op profile-probe --min-bytes 4096 --max-bytes 4096 \ + --iters 2 --warmup-iters 1 --datatype int32 --check 0 \ + --profile 1 --profile-sample-every 1 --profile-ai-prompt 1 +``` + +The current report normalizes each rank/launch independently because cross-host device cycles are not assumed to be +synchronized. Use the HTML rank-level summary to spot slow host/rank pairs, then drill into `rank/launch/`. +The helper labels hosts from the SSH target/IP in `TILEXR_MULTIHOST_PEERS`, so Perfetto event names look like +`launch0/rank1@141.62.24.70/kernel_total` in the example above. + ## Skip Behavior Manual scripts are strict by default. If `npu-smi info -l` or `TILEXR_AVAILABLE_NPUS` reports too few devices, diff --git a/tests/collectives/run_collective_perf_multihost.sh b/tests/collectives/run_collective_perf_multihost.sh new file mode 100644 index 0000000..b526ffb --- /dev/null +++ b/tests/collectives/run_collective_perf_multihost.sh @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat >&2 <<'EOF' +Usage: + TILEXR_MULTIHOST_PEERS="rank,host,ip,device;rank,host,ip,device" \ + run_collective_perf_multihost.sh profile_dir bin_dir [extra tilexr_collective_perf args...] + +Example: + TILEXR_MULTIHOST_PEERS="0,root@141.62.24.62,141.62.24.62,0;1,root@141.62.24.70,141.62.24.70,0" \ + TILEXR_COMM_ID=141.62.24.62:10067 \ + bash run_collective_perf_multihost.sh /home/l00929943/TileXR/run/prof/collectives-2host \ + /home/l00929943/TileXR/build-profile-950/tests/collectives \ + --op allgather --min-bytes 4096 --max-bytes 4096 --iters 2 --warmup-iters 1 \ + --datatype int32 --check 0 --profile 1 --profile-sample-every 1 --profile-ai-prompt 1 + +Each peer entry is rank,ssh_target,host_ip,device_id. The first entry is used as rank0/server. +EOF +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + usage + exit 0 +fi + +profile_dir="${1:?profile_dir required}" +bin_dir="${2:?bin_dir required}" +shift 2 + +peers_spec="${TILEXR_MULTIHOST_PEERS:?TILEXR_MULTIHOST_PEERS is required}" +comm_id="${TILEXR_COMM_ID:-141.62.24.62:10067}" +timeout_sec="${TILEXR_COLLECTIVES_RUN_TIMEOUT_SEC:-600}" +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +helper="${script_dir}/tilexr_collective_profile_report.py" +rank_size=0 +warmup_iters=5 +measured_iters=20 +profile_sample_every=1 +profile_ai_prompt=0 + +is_true_bool() { + [[ "${1:-}" == "1" || "${1:-}" == "true" || "${1:-}" == "yes" ]] +} + +parse_profile_args() { + local args=("$@") + local i + for ((i = 0; i < ${#args[@]}; i++)); do + case "${args[$i]}" in + --warmup-iters) + if (( i + 1 < ${#args[@]} )); then warmup_iters="${args[$((i + 1))]}"; fi + ;; + --iters) + if (( i + 1 < ${#args[@]} )); then measured_iters="${args[$((i + 1))]}"; fi + ;; + --profile-sample-every) + if (( i + 1 < ${#args[@]} )); then profile_sample_every="${args[$((i + 1))]}"; fi + ;; + --profile-ai-prompt) + if (( i + 1 < ${#args[@]} )); then profile_ai_prompt="${args[$((i + 1))]}"; fi + ;; + esac + done +} + +parse_profile_args "$@" + +IFS=';' read -r -a peers <<< "${peers_spec}" +rank_size="${#peers[@]}" +if (( rank_size < 2 )); then + echo "ERROR: TILEXR_MULTIHOST_PEERS must contain at least two peers" >&2 + exit 1 +fi + +mkdir -p "${profile_dir}" + +ssh_pids=() +logs=() +targets=() +ranks=() + +cleanup() { + local pid + for pid in "${ssh_pids[@]:-}"; do + kill "${pid}" 2>/dev/null || true + done +} +trap cleanup INT TERM + +copy_rank_profile() { + local target="$1" + local rank="$2" + local dest="${profile_dir}/rank${rank}" + mkdir -p "${dest}" + + if command -v rsync >/dev/null 2>&1 && + ssh -o BatchMode=yes "${target}" "command -v rsync >/dev/null 2>&1"; then + rsync -a -e "ssh -o BatchMode=yes" "${target}:${profile_dir}/rank${rank}/" "${dest}/" + return + fi + + ssh -o BatchMode=yes "${target}" bash -s -- "${profile_dir}" "${rank}" <<'REMOTE' | tar -xf - -C "${dest}" +set -euo pipefail +profile_dir="$1" +rank="$2" +cd "${profile_dir}/rank${rank}" +tar -cf - . +REMOTE +} + +launch_rank() { + local rank="$1" + local target="$2" + local host_ip="$3" + local device_id="$4" + local host_label="${target#*@}" + shift 4 + local log="${profile_dir}/multihost_rank${rank}.log" + logs+=("${log}") + targets+=("${target}") + ranks+=("${rank}") + if [[ -z "${host_label}" || "${host_label}" == "${target}" ]]; then + host_label="${host_ip}" + fi + + ssh -o BatchMode=yes "${target}" bash -s -- \ + "${rank_size}" "${rank}" "${device_id}" "${comm_id}" "${profile_dir}" "${bin_dir}" "${host_ip}" "${host_label}" "$@" >"${log}" 2>&1 <<'REMOTE' & +set -euo pipefail +rank_size="$1" +rank="$2" +device_id="$3" +comm_id="$4" +profile_dir="$5" +bin_dir="$6" +host_ip="$7" +host_label="$8" +shift 8 + +cd /home/l00929943/TileXR +set +u +source /root/anaconda3/etc/profile.d/conda.sh 2>/dev/null || true +conda activate pt311 2>/dev/null || true +source /usr/local/Ascend/cann/set_env.sh +set -u +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$(pwd)/build-profile-950/src/collectives:$(pwd)/build-profile-950/src/comm:${LD_LIBRARY_PATH:-} +export ASCEND_PROCESS_LOG_PATH="${profile_dir}/plog/rank${rank}" +export ASCEND_GLOBAL_LOG_LEVEL="${ASCEND_GLOBAL_LOG_LEVEL:-3}" +mkdir -p "${ASCEND_PROCESS_LOG_PATH}" +export TILEXR_COMM_ID="${comm_id}" +export TILEXR_PROFILE_HOST="${host_label}" +export TILEXR_PROFILE_HOST_IP="${host_ip}" + +"${bin_dir}/tilexr_collective_perf" \ + --rank-size "${rank_size}" \ + --rank "${rank}" \ + --device-id "${device_id}" \ + --comm-mode socket \ + --profile-dir "${profile_dir}" \ + "$@" +REMOTE + ssh_pids+=("$!") +} + +for peer in "${peers[@]}"; do + IFS=',' read -r rank target host_ip device_id <<< "${peer}" + if [[ -z "${rank:-}" || -z "${target:-}" || -z "${host_ip:-}" || -z "${device_id:-}" ]]; then + echo "ERROR: invalid peer entry '${peer}', expected rank,target,ip,device" >&2 + exit 1 + fi + launch_rank "${rank}" "${target}" "${host_ip}" "${device_id}" "$@" +done + +sleep "${timeout_sec}" >/dev/null 2>&1 & +watchdog_pid="$!" + +completed=0 +while (( completed < rank_size )); do + if wait -n; then + if ! kill -0 "${watchdog_pid}" 2>/dev/null; then + echo "ERROR: timed out after ${timeout_sec}s" >&2 + cleanup + exit 124 + fi + completed=$((completed + 1)) + else + rc="$?" + echo "ERROR: remote rank failed with ${rc}" >&2 + for log in "${logs[@]}"; do + echo "===== ${log} =====" >&2 + tail -n 120 "${log}" >&2 || true + done + cleanup + exit "${rc}" + fi +done + +kill "${watchdog_pid}" 2>/dev/null || true +wait "${watchdog_pid}" 2>/dev/null || true +trap - INT TERM + +for i in "${!targets[@]}"; do + target="${targets[$i]}" + rank="${ranks[$i]}" + copy_rank_profile "${target}" "${rank}" +done + +prompt_args=() +if is_true_bool "${profile_ai_prompt}"; then + prompt_args+=(--emit-ai-prompt) +fi + +python3 "${helper}" "${profile_dir}" \ + --warmup-iters "${warmup_iters}" \ + --iters "${measured_iters}" \ + --profile-sample-every "${profile_sample_every}" \ + "${prompt_args[@]}" diff --git a/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp b/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp index 0ee88bd..ff16b93 100644 --- a/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp +++ b/tests/collectives/tilexr-tests/tilexr_collective_perf.cpp @@ -22,6 +22,10 @@ #include #include +#include +#include +#include + #include "acl/acl.h" #include "tilexr_collectives.h" #include "tilexr_collectives_perf.h" @@ -37,6 +41,13 @@ enum class CollectiveOp { ALLREDUCE, REDUCESCATTER, BROADCAST, + NOOP, + PROFILE_PROBE, +}; + +enum class CommMode { + LOCAL, + SOCKET, }; struct DataTypeInfo { @@ -56,6 +67,8 @@ struct Options { int rankSize = 2; int rank = 0; int firstNpu = 0; + int deviceIdOverride = -1; + CommMode commMode = CommMode::LOCAL; bool check = true; std::string csvPath; double minAlgBw = -1.0; @@ -88,12 +101,14 @@ struct Row { }; std::string JoinPath(const std::string &base, const std::string &leaf); +bool CreateDirectories(const std::string &path); +bool WriteProfileHostInfo(const Options &options); std::string ResolveProfileOutputDir(const Options &options, uint64_t profileLaunchIndex); bool ProfileThisLaunch(const Options &options, uint64_t profileLaunchIndex); bool StartPerfSessionForLaunch(const Options &options, uint64_t profileLaunchIndex, TileXRCollectivePerfSession &perfSession); void FinishPerfSession(TileXRCollectivePerfSession &perfSession, const Options &options, aclrtStream stream, - int &totalErrors, bool skipWriteReport); + int &totalErrors, bool skipWriteReport, const std::string &skipReason); using TileXRCollectivesTest::CanUseCollisionFreeInt32Pattern; using TileXRCollectivesTest::ExpectedAllGatherValue; @@ -117,6 +132,21 @@ std::string OpName(CollectiveOp op) return "reducescatter"; case CollectiveOp::BROADCAST: return "broadcast"; + case CollectiveOp::NOOP: + return "noop"; + case CollectiveOp::PROFILE_PROBE: + return "profile-probe"; + } + return "unknown"; +} + +std::string CommModeName(CommMode mode) +{ + switch (mode) { + case CommMode::LOCAL: + return "local"; + case CommMode::SOCKET: + return "socket"; } return "unknown"; } @@ -161,13 +191,14 @@ void PrintUsage(const char *program) { std::cerr << "Usage: " << program << " [options]\n" - << " --op allgather|alltoall|allreduce|reducescatter|broadcast\n" + << " --op allgather|alltoall|allreduce|reducescatter|broadcast|profile-probe\n" << " Message-size semantics: allgather/allreduce/broadcast: count * dtype_size; " "alltoall/reducescatter: count * rank_size * dtype_size\n" << " --min-bytes N --max-bytes N --step-factor F\n" << " --iters N --warmup-iters N\n" << " --datatype int8|int16|int32|int64|fp16|fp32|bf16\n" << " --rank-size N --rank R --first-npu D\n" + << " [--device-id D] [--comm-mode local|socket]\n" << " --check 0|1 [--csv path]\n" << " [--min-algbw GB/s] [--max-latency-us us]\n" << " [--profile 0|1] [--profile-dir path]\n" @@ -209,6 +240,19 @@ bool ParseDataType(const std::string &value, DataTypeInfo &info) return true; } +bool ParseCommMode(const std::string &value, CommMode &mode) +{ + if (value == "local") { + mode = CommMode::LOCAL; + return true; + } + if (value == "socket") { + mode = CommMode::SOCKET; + return true; + } + return false; +} + bool ParseInt64(const std::string &text, int64_t &out) { char *end = nullptr; @@ -300,6 +344,10 @@ bool ComputeMessageSizes(const Options &options, int64_t count, int64_t &sendEle break; case CollectiveOp::BROADCAST: break; + case CollectiveOp::NOOP: + break; + case CollectiveOp::PROFILE_PROBE: + break; } return CheckedBytesForElements(sendElements, options.dtype.bytes, sendBytes) && CheckedBytesForElements(recvElements, options.dtype.bytes, recvBytes) && @@ -328,6 +376,7 @@ bool ValidateMaxMessageSize(const Options &options) return false; } if (options.check && options.dtype.name == "int32" && + options.op != CollectiveOp::NOOP && options.op != CollectiveOp::PROFILE_PROBE && !CanUseCollisionFreeInt32Pattern(options.rankSize, validationCount)) { std::cerr << "ERROR: message size is too large for collision-free INT32 validation" << std::endl; return false; @@ -399,9 +448,14 @@ bool ParseOptions(int argc, char **argv, Options &options) options.op = CollectiveOp::REDUCESCATTER; } else if (op == "broadcast") { options.op = CollectiveOp::BROADCAST; + } else if (op == "noop") { + options.op = CollectiveOp::NOOP; + } else if (op == "profile-probe") { + options.op = CollectiveOp::PROFILE_PROBE; } else { - std::cerr << "ERROR: --op must be allgather, alltoall, allreduce, reducescatter, or broadcast" - << std::endl; + std::cerr + << "ERROR: --op must be allgather, alltoall, allreduce, reducescatter, broadcast, or profile-probe" + << std::endl; return false; } } else if (arg == "--min-bytes") { @@ -482,6 +536,21 @@ bool ParseOptions(int argc, char **argv, Options &options) std::cerr << "ERROR: invalid --first-npu" << std::endl; return false; } + } else if (arg == "--device-id") { + const char *value = requireValue(arg); + if (value == nullptr) { + return false; + } + if (!ParseInt(value, options.deviceIdOverride)) { + std::cerr << "ERROR: invalid --device-id" << std::endl; + return false; + } + } else if (arg == "--comm-mode") { + const char *value = requireValue(arg); + if (value == nullptr || !ParseCommMode(value, options.commMode)) { + std::cerr << "ERROR: --comm-mode must be local or socket" << std::endl; + return false; + } } else if (arg == "--check") { const char *value = requireValue(arg); if (value == nullptr || !ParseBool(value, options.check)) { @@ -547,7 +616,8 @@ bool ParseOptions(int argc, char **argv, Options &options) if (options.minBytes <= 0 || options.maxBytes < options.minBytes || options.stepFactor <= 1.0 || options.iters <= 0 || options.warmupIters < 0 || options.rankSize <= 0 || - options.rank < 0 || options.rank >= options.rankSize || options.firstNpu < 0) { + options.rank < 0 || options.rank >= options.rankSize || options.firstNpu < 0 || + options.deviceIdOverride < -1) { std::cerr << "ERROR: invalid option value" << std::endl; return false; } @@ -604,6 +674,16 @@ bool CallCollective(const Options &options, void *sendBuf, void *recvBuf, int64_ case CollectiveOp::BROADCAST: return CheckTileXR(options.rank, "TileXRBroadcast", TileXRBroadcast(sendBuf, count, options.dtype.type, 0, comm, stream)); + case CollectiveOp::NOOP: + (void)sendBuf; + (void)recvBuf; + (void)count; + (void)comm; + (void)stream; + return true; + case CollectiveOp::PROFILE_PROBE: + return CheckTileXR(options.rank, "TileXRProfileProbe", + TileXRProfileProbe(sendBuf, recvBuf, count, options.dtype.type, comm, stream)); } return false; } @@ -638,6 +718,13 @@ bool FillPattern(const Options &options, int64_t count, std::vector &se StoreInt32(send, i, value); } break; + case CollectiveOp::PROFILE_PROBE: + for (int64_t i = 0; i < count; ++i) { + StoreInt32(send, i, ExpectedAllGatherValue(options.rankSize, options.rank, i)); + } + break; + case CollectiveOp::NOOP: + break; } std::fill(recv.begin(), recv.end(), 0xff); return true; @@ -666,6 +753,13 @@ bool FillPattern(const Options &options, int64_t count, std::vector &se send[i] = options.rank == 0 ? PatternByte(0, 0, static_cast(i)) : 0xff; } break; + case CollectiveOp::PROFILE_PROBE: + for (size_t i = 0; i < send.size(); ++i) { + send[i] = PatternByte(options.rank, 0, static_cast(i)); + } + break; + case CollectiveOp::NOOP: + break; } std::fill(recv.begin(), recv.end(), 0xff); return true; @@ -751,6 +845,22 @@ int ValidateInt32(const Options &options, int64_t count, const std::vector(elapsedMs) * 1000.0; - return ok; + if (!ok) { + return false; + } + return true; } bool Measure(const Options &options, void *devSend, void *devRecv, int64_t count, TileXRCommPtr comm, @@ -956,6 +1093,120 @@ std::string JoinPath(const std::string &base, const std::string &leaf) return base + "/" + leaf; } +bool CreateDirectories(const std::string &path) +{ + if (path.empty()) { + return false; + } + + std::string current; + size_t index = 0; + if (path[0] == '/') { + current = "/"; + index = 1; + } + + while (index <= path.size()) { + const size_t next = path.find('/', index); + const std::string part = path.substr(index, next == std::string::npos ? std::string::npos : next - index); + if (!part.empty()) { + if (!current.empty() && current[current.size() - 1] != '/') { + current += "/"; + } + current += part; + if (mkdir(current.c_str(), 0755) != 0 && errno != EEXIST) { + std::cerr << "ERROR: failed to create directory " << current << ": " << strerror(errno) << std::endl; + return false; + } + } + if (next == std::string::npos) { + break; + } + index = next + 1; + } + return true; +} + +std::string JsonEscape(const std::string &text) +{ + std::ostringstream out; + for (const char ch : text) { + switch (ch) { + case '\\': + out << "\\\\"; + break; + case '"': + out << "\\\""; + break; + case '\n': + out << "\\n"; + break; + case '\r': + out << "\\r"; + break; + case '\t': + out << "\\t"; + break; + default: + out << ch; + break; + } + } + return out.str(); +} + +std::string EnvOrEmpty(const char *name) +{ + const char *value = std::getenv(name); + return value == nullptr ? std::string() : std::string(value); +} + +std::string DefaultHostName() +{ + char hostname[256] = {}; + if (gethostname(hostname, sizeof(hostname) - 1) == 0 && hostname[0] != '\0') { + return hostname; + } + return "unknown"; +} + +bool WriteProfileHostInfo(const Options &options) +{ + if (!options.profile) { + return true; + } + const std::string root = options.profileDir.empty() ? "run/prof/collectives" : options.profileDir; + const std::string rankDir = JoinPath(root, "rank" + std::to_string(options.rank)); + if (!CreateDirectories(rankDir)) { + return false; + } + + std::string host = EnvOrEmpty("TILEXR_PROFILE_HOST"); + if (host.empty()) { + host = DefaultHostName(); + } + std::string hostIp = EnvOrEmpty("TILEXR_PROFILE_HOST_IP"); + if (hostIp.empty()) { + hostIp = EnvOrEmpty("TILEXR_NODE_IP"); + } + + const std::string path = JoinPath(rankDir, "host_info.json"); + std::ofstream out(path.c_str()); + if (!out.is_open()) { + std::cerr << "ERROR: failed to write " << path << std::endl; + return false; + } + out << "{\n" + << " \"schema\": \"tilexr_collective_profile_host.v1\",\n" + << " \"rank\": " << options.rank << ",\n" + << " \"rank_size\": " << options.rankSize << ",\n" + << " \"host\": \"" << JsonEscape(host) << "\",\n" + << " \"ip\": \"" << JsonEscape(hostIp) << "\",\n" + << " \"comm_mode\": \"" << CommModeName(options.commMode) << "\"\n" + << "}\n"; + return true; +} + std::string ResolveProfileOutputDir(const Options &options, uint64_t profileLaunchIndex) { const std::string root = options.profileDir.empty() ? "run/prof/collectives" : options.profileDir; @@ -969,6 +1220,13 @@ bool ProfileThisLaunch(const Options &options, uint64_t profileLaunchIndex) (profileLaunchIndex % static_cast(options.profileSampleEvery)) == 0; } +bool KernelProfilingDisabled() +{ + const char *value = std::getenv("TILEXR_COLLECTIVES_DISABLE_KERNEL_PROFILING"); + return value != nullptr && + (std::string(value) == "1" || std::string(value) == "true" || std::string(value) == "yes"); +} + bool StartPerfSessionForLaunch(const Options &options, uint64_t profileLaunchIndex, TileXRCollectivePerfSession &perfSession) { @@ -1006,18 +1264,22 @@ void Cleanup(TileXRCommPtr comm, aclrtStream stream, int deviceId, bool deviceSe } void FinishPerfSession(TileXRCollectivePerfSession &perfSession, const Options &options, aclrtStream stream, - int &totalErrors, bool skipWriteReport) + int &totalErrors, bool skipWriteReport, const std::string &skipReason) { if (perfSession == nullptr) { return; } + std::string reason = skipReason; bool streamSynced = true; - if (stream != nullptr) { + if (stream != nullptr && !KernelProfilingDisabled()) { streamSynced = CheckAcl(options.rank, "aclrtSynchronizeStream before perf report", aclrtSynchronizeStream(stream)); if (!streamSynced) { totalErrors += 1; skipWriteReport = true; + if (reason.empty()) { + reason = "aclrtSynchronizeStream before perf report failed"; + } } } if (TileXRCollectivePerfSetActiveSession(nullptr) != TileXR::TILEXR_SUCCESS) { @@ -1025,9 +1287,17 @@ void FinishPerfSession(TileXRCollectivePerfSession &perfSession, const Options & << std::endl; totalErrors += 1; } - if (!skipWriteReport && TileXRCollectivePerfWriteReport(perfSession) != TileXR::TILEXR_SUCCESS) { - std::cerr << "[rank " << options.rank << "] ERROR: TileXRCollectivePerfWriteReport failed" << std::endl; - totalErrors += 1; + if (skipWriteReport) { + if (TileXRCollectivePerfWriteIncompleteReport(perfSession, reason.c_str()) != TileXR::TILEXR_SUCCESS) { + std::cerr << "[rank " << options.rank + << "] ERROR: TileXRCollectivePerfWriteIncompleteReport failed" << std::endl; + totalErrors += 1; + } + } else { + if (TileXRCollectivePerfWriteReport(perfSession) != TileXR::TILEXR_SUCCESS) { + std::cerr << "[rank " << options.rank << "] ERROR: TileXRCollectivePerfWriteReport failed" << std::endl; + totalErrors += 1; + } } if (!streamSynced) { // Avoid freeing a trace buffer that may still be referenced by queued device work. @@ -1051,7 +1321,7 @@ int main(int argc, char **argv) return 2; } - const int deviceId = options.firstNpu + options.rank; + const int deviceId = options.deviceIdOverride >= 0 ? options.deviceIdOverride : options.firstNpu + options.rank; TileXRCommPtr comm = nullptr; aclrtStream stream = nullptr; bool deviceSet = false; @@ -1063,9 +1333,23 @@ int main(int argc, char **argv) return 1; } deviceSet = true; - if (!CheckAcl(options.rank, "aclrtCreateStream", aclrtCreateStream(&stream)) || - !CheckTileXR(options.rank, "TileXRCommInitRankLocal", - TileXRCommInitRankLocal(options.rankSize, options.rank, &comm))) { + if (!WriteProfileHostInfo(options) || + !CheckAcl(options.rank, "aclrtCreateStream", aclrtCreateStream(&stream))) { + Cleanup(comm, stream, deviceId, deviceSet); + return 1; + } + + bool commOk = false; + if (options.commMode == CommMode::SOCKET) { + TileXRUniqueId uniqueId {}; + commOk = CheckTileXR(options.rank, "TileXRGetUniqueId", TileXRGetUniqueId(&uniqueId, 0)) && + CheckTileXR(options.rank, "TileXRCommInitRank", + TileXRCommInitRank(uniqueId, options.rankSize, options.rank, &comm)); + } else { + commOk = CheckTileXR(options.rank, "TileXRCommInitRankLocal", + TileXRCommInitRankLocal(options.rankSize, options.rank, &comm)); + } + if (!commOk) { Cleanup(comm, stream, deviceId, deviceSet); return 1; } @@ -1105,7 +1389,7 @@ int main(int argc, char **argv) aclrtMemcpy(devSend, static_cast(sendBytes), hostSend.data(), static_cast(sendBytes), ACL_MEMCPY_HOST_TO_DEVICE)); if (ok) { - if (options.op == CollectiveOp::BROADCAST) { + if (options.op == CollectiveOp::BROADCAST || options.op == CollectiveOp::NOOP) { devRecv = devSend; } else { ok = CheckAcl(options.rank, "aclrtMalloc recv", @@ -1123,7 +1407,7 @@ int main(int argc, char **argv) } int errors = 0; - if (ok && options.check) { + if (ok && options.check && options.op != CollectiveOp::NOOP) { if (options.op != CollectiveOp::BROADCAST) { std::fill(hostRecv.begin(), hostRecv.end(), 0xff); ok = CheckAcl(options.rank, "aclrtMemcpy H2D devRecv sentinel", diff --git a/tests/collectives/tilexr_collective_profile_report.py b/tests/collectives/tilexr_collective_profile_report.py index f02699d..2ca1c05 100755 --- a/tests/collectives/tilexr_collective_profile_report.py +++ b/tests/collectives/tilexr_collective_profile_report.py @@ -10,7 +10,10 @@ TRACE_SCHEMA = "tilexr_perf_trace_report.v1" RUN_SCHEMA = "tilexr_perf_trace_run.v1" +HOST_SCHEMA = "tilexr_collective_profile_host.v1" RANK_LAUNCH_RE = re.compile(r"rank([0-9]+)[/\\]launch([0-9]+)[/\\]trace\.json$") +PERFETTO_LAUNCH_GAP_US = 50.0 +PERFETTO_LAUNCH_WINDOW_TID = 1000000 def parse_args(): @@ -34,6 +37,38 @@ def parse_rank_launch(path, root): return int(match.group(1)), int(match.group(2)) +def load_host_infos(root, diagnostics): + hosts = {} + for path in sorted(root.glob("rank*/host_info.json")): + source = relpath(path, root) + try: + info = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + diagnostics.append(f"invalid json in {source}: {exc}") + continue + if not isinstance(info, dict): + diagnostics.append(f"invalid top-level host info type in {source}: expected object") + continue + if info.get("schema") != HOST_SCHEMA: + diagnostics.append(f"invalid schema in {source}") + continue + rank = as_int(info.get("rank"), None) + if rank is None: + parsed = re.search(r"rank([0-9]+)[/\\]host_info\.json$", source) + rank = int(parsed.group(1)) if parsed else None + if rank is None: + diagnostics.append(f"invalid rank in {source}") + continue + hosts[rank] = { + "rank": rank, + "host": str(info.get("host") or f"rank{rank}"), + "ip": str(info.get("ip") or ""), + "comm_mode": str(info.get("comm_mode") or ""), + "source": source, + } + return hosts + + def load_traces(root): traces = [] diagnostics = [] @@ -68,6 +103,10 @@ def load_traces(root): def sanitize_trace(trace, source, diagnostics): trace = dict(trace) + if trace.get("incomplete"): + reason = str(trace.get("incomplete_reason") or "unknown") + diagnostics.append(f"incomplete trace in {source}: {reason}") + stats = trace.get("stats", []) if not isinstance(stats, list): diagnostics.append(f"invalid stats in {source}: expected list") @@ -79,6 +118,13 @@ def sanitize_trace(trace, source, diagnostics): if not isinstance(stat, dict): diagnostics.append(f"invalid stat entry in {source} stats[{index}]: expected object") continue + if as_int(stat.get("count")) <= 0: + raw_cycles = as_int(stat.get("raw_cycles")) + max_cycles = as_int(stat.get("max_cycles")) + last_end = as_int(stat.get("last_end_cycle")) + if raw_cycles != 0 or max_cycles != 0 or last_end != 0: + diagnostics.append(f"ignored count=0 stat in {source} stats[{index}]") + continue valid_stats.append(stat) trace["stats"] = valid_stats return trace @@ -90,6 +136,8 @@ def group_key(entry): trace.get("op_type"), trace.get("op_name", "Unknown"), trace.get("rank_size"), + trace.get("max_core_count"), + trace.get("block_dim"), trace.get("message_bytes"), trace.get("stage_count"), trace.get("cycle_to_us_divisor"), @@ -135,7 +183,29 @@ def cycles_to_us(cycles, divisor): return float(cycles) / float(divisor) -def normalized_bars(entries, root): +def rank_host_info(hosts, rank): + return hosts.get(rank, {"rank": rank, "host": f"rank{rank}", "ip": "", "comm_mode": "", "source": ""}) + + +def rank_label(hosts, rank): + info = rank_host_info(hosts, rank) + host = info.get("host") or f"rank{rank}" + if host == f"rank{rank}": + return f"rank{rank}" + return f"rank{rank}@{host}" + + +def format_hosts(hosts, rank_ids): + parts = [] + for rank in rank_ids: + info = rank_host_info(hosts, rank) + label = rank_label(hosts, rank) + ip = info.get("ip") or "unknown-ip" + parts.append(f"{label}({ip})") + return ", ".join(parts) if parts else "unknown" + + +def normalized_bars(entries, root, hosts): bars = [] entries_by_rank_launch = defaultdict(list) for entry in entries: @@ -169,6 +239,7 @@ def normalized_bars(entries, root): duration_cycles = max(0, last - first) core = as_int(stat.get("core")) stat_rank = as_int(stat.get("rank"), rank) + host_info = rank_host_info(hosts, stat_rank) raw_cycles = as_int(stat.get("raw_cycles")) bars.append({ @@ -187,9 +258,11 @@ def normalized_bars(entries, root): "raw_cycles": raw_cycles, "count": count, "max_cycles": as_int(stat.get("max_cycles")), + "host": host_info["host"], + "host_ip": host_info["ip"], "source": source, "drilldown": drilldown, - "lane": f"rank{stat_rank}/core{core}", + "lane": f"{rank_label(hosts, stat_rank)}/core{core}", }) return sorted(bars, key=lambda bar: ( @@ -206,11 +279,16 @@ def summarize_group(group): stage_totals = defaultdict(float) launch_kernel = defaultdict(float) rank_core_max = {"rank": 0, "core": 0, "stage": "", "duration_us": 0.0, "max_cycles": 0, "max_us": 0.0} + rank_launch_kernel = defaultdict(dict) for bar in group["bars"]: stage_totals[bar["stage"]] += bar["sum_us"] if bar["stage"] == "kernel_total": launch_kernel[bar["launch_id"]] = max(launch_kernel[bar["launch_id"]], bar["duration_us"]) + rank_launch_kernel[bar["rank"]][bar["launch_id"]] = max( + rank_launch_kernel[bar["rank"]].get(bar["launch_id"], 0.0), + bar["duration_us"], + ) max_us = cycles_to_us(bar["max_cycles"], group["cycle_to_us_divisor"]) if max_us > rank_core_max["max_us"]: rank_core_max = { @@ -224,16 +302,50 @@ def summarize_group(group): top_stage = max(stage_totals.items(), key=lambda item: item[1]) if stage_totals else ("none", 0.0) slowest_launch = max(launch_kernel.items(), key=lambda item: item[1]) if launch_kernel else (None, 0.0) + rank_kernel = summarize_rank_kernel(rank_launch_kernel) + slowest_rank = rank_kernel[0] if rank_kernel else { + "rank": None, + "launch_count": 0, + "avg_kernel_us": 0.0, + "max_kernel_us": 0.0, + "slowest_launch_id": None, + } return { "top_stage": {"stage": top_stage[0], "sum_us": top_stage[1]}, "slowest_launch": {"launch_id": slowest_launch[0], "kernel_us": slowest_launch[1]}, "rank_core_max": rank_core_max, + "rank_kernel": rank_kernel, + "slowest_rank": slowest_rank, } +def summarize_rank_kernel(rank_launch_kernel): + summaries = [] + for rank, launch_values in sorted(rank_launch_kernel.items()): + if not launch_values: + continue + values = list(launch_values.values()) + slowest_launch_id, max_kernel_us = max( + launch_values.items(), + key=lambda item: (item[1], -item[0]), + ) + summaries.append({ + "rank": rank, + "launch_count": len(values), + "avg_kernel_us": sum(values) / len(values), + "max_kernel_us": max_kernel_us, + "slowest_launch_id": slowest_launch_id, + }) + return sorted( + summaries, + key=lambda item: (-item["avg_kernel_us"], -item["max_kernel_us"], item["rank"]), + ) + + def build_index(root, args): traces, diagnostics = load_traces(root) + hosts = load_host_infos(root, diagnostics) grouped = defaultdict(list) for entry in traces: grouped[group_key(entry)].append(entry) @@ -249,11 +361,13 @@ def sort_key(item): as_int(key[3]), as_int(key[4]), as_int(key[5]), + as_int(key[6]), + as_int(key[7]), as_int(key[0]), ) for key, entries in sorted(grouped.items(), key=sort_key): - op_type, op_name, rank_size, message_bytes, stage_count, divisor = key + op_type, op_name, rank_size, max_core_count, block_dim, message_bytes, stage_count, divisor = key rank_ids = sorted({entry["rank"] for entry in entries}) launch_ids = sorted({entry["launch_id"] for entry in entries}) effective_rank_size = as_int(rank_size, len(rank_ids)) @@ -269,16 +383,19 @@ def sort_key(item): "op_type": op_type, "op_name": op_name, "rank_size": effective_rank_size, + "max_core_count": as_int(max_core_count), + "block_dim": as_int(block_dim), "message_bytes": as_int(message_bytes), "stage_count": as_int(stage_count), "cycle_to_us_divisor": as_int(divisor), "rank_ids": rank_ids, "launch_ids": launch_ids, + "trace_statuses": trace_statuses(entries, root), "sources": [ relpath(entry["path"], root) for entry in sorted(entries, key=lambda item: (item["launch_id"], item["rank"], relpath(item["path"], root))) ], - "bars": normalized_bars(entries, root), + "bars": normalized_bars(entries, root, hosts), } group["summary"] = summarize_group(group) groups.append(group) @@ -289,6 +406,7 @@ def sort_key(item): "warmup_iters": args.warmup_iters, "measured_iters": args.iters, "profile_sample_every": args.profile_sample_every, + "hosts": public_hosts(hosts), "groups": groups, "diagnostics": diagnostics, } @@ -311,6 +429,8 @@ def add_incompatible_group_diagnostics(grouped, root, diagnostics): as_int(item[3]), as_int(item[4]), as_int(item[5]), + as_int(item[6]), + as_int(item[7]), as_int(item[0]), )) ] @@ -320,18 +440,34 @@ def add_incompatible_group_diagnostics(grouped, root, diagnostics): def describe_group(key, entries, root): - op_type, op_name, rank_size, message_bytes, stage_count, divisor = key + op_type, op_name, rank_size, max_core_count, block_dim, message_bytes, stage_count, divisor = key sources = ", ".join( relpath(entry["path"], root) for entry in sorted(entries, key=lambda item: (item["launch_id"], item["rank"], relpath(item["path"], root))) ) return ( f"op_type={op_type} op_name={op_name} rank_size={rank_size} " + f"max_core_count={max_core_count} block_dim={block_dim} " f"message_bytes={message_bytes} stage_count={stage_count} " f"cycle_to_us_divisor={divisor} sources=[{sources}]" ) +def trace_statuses(entries, root): + statuses = [] + for entry in sorted(entries, key=lambda item: (item["launch_id"], item["rank"], relpath(item["path"], root))): + trace = entry["trace"] + statuses.append({ + "rank": entry["rank"], + "launch_id": entry["launch_id"], + "source": relpath(entry["path"], root), + "incomplete": bool(trace.get("incomplete")), + "reason": str(trace.get("incomplete_reason") or ""), + "synthetic": bool(trace.get("synthetic")), + }) + return statuses + + def format_launches(launch_ids): if not launch_ids: return "none" @@ -344,6 +480,10 @@ def format_slowest_launch(slowest): return f"launch{slowest['launch_id']} at {slowest['kernel_us']:.3f} us" +def public_hosts(hosts): + return {rank: dict(info) for rank, info in sorted(hosts.items())} + + def render_analysis(index): lines = [ "# TileXR Collective Profile Run Analysis", @@ -378,9 +518,12 @@ def render_analysis(index): ) lines.append(f"## Group {group_index}: {group['op_name']} message_bytes={group['message_bytes']}") lines.append(f"- Launches: {format_launches(group['launch_ids'])}") + lines.append(f"- Hosts: {format_hosts(index.get('hosts', {}), group['rank_ids'])}") lines.append(f"- Slowest launch: {format_slowest_launch(slowest)}") lines.append(f"- Top stage: {summary['top_stage']['stage']} at {summary['top_stage']['sum_us']:.3f} us") lines.append(f"- Stage totals: {stage_summary}") + lines.append(f"- Slowest rank: {format_slowest_rank(summary['slowest_rank'], index.get('hosts', {}))}") + lines.append(f"- Rank kernel totals: {format_rank_kernel(summary['rank_kernel'], index.get('hosts', {}))}") lines.append( f"- Rank/core max: rank{rank_core['rank']} core{rank_core['core']} " f"{rank_core['stage']} max {rank_core['max_us']:.3f} us" @@ -390,6 +533,28 @@ def render_analysis(index): return "\n".join(lines) +def format_slowest_rank(slowest_rank, hosts=None): + rank = slowest_rank.get("rank") + if rank is None: + return "unavailable" + launch_id = slowest_rank.get("slowest_launch_id") + launch_text = "unknown launch" if launch_id is None else f"launch{launch_id}" + label = rank_label(hosts or {}, rank) + return ( + f"{label} avg {slowest_rank.get('avg_kernel_us', 0.0):.3f} us " + f"max {slowest_rank.get('max_kernel_us', 0.0):.3f} us at {launch_text}" + ) + + +def format_rank_kernel(rank_kernel, hosts=None): + if not rank_kernel: + return "unavailable" + return "; ".join( + f"{rank_label(hosts or {}, item['rank'])} avg={item['avg_kernel_us']:.3f} us max={item['max_kernel_us']:.3f} us" + for item in rank_kernel + ) + + def render_ai_prompt(index): lines = [ "# TileXR collective profiling run", @@ -412,6 +577,9 @@ def render_ai_prompt(index): else: lines.append(f" slowest_launch=launch{slowest['launch_id']} kernel_us={slowest['kernel_us']:.3f}") lines.append(f" top_stage={summary['top_stage']['stage']} sum_us={summary['top_stage']['sum_us']:.3f}") + lines.append(f" hosts={format_hosts(index.get('hosts', {}), group['rank_ids'])}") + lines.append(f" slowest_rank={format_slowest_rank(summary['slowest_rank'], index.get('hosts', {}))}") + lines.append(f" rank_kernel_totals={format_rank_kernel(summary['rank_kernel'], index.get('hosts', {}))}") if index["diagnostics"]: lines.append("") @@ -421,6 +589,164 @@ def render_ai_prompt(index): return "\n".join(lines) + "\n" +def render_perfetto_trace(index): + events = [] + seen_ranks = set() + seen_threads = set() + seen_launch_threads = set() + launch_offsets = compute_perfetto_launch_offsets(index) + + for group in index["groups"]: + for rank in group["rank_ids"]: + if rank not in seen_ranks: + seen_ranks.add(rank) + events.append({ + "name": "process_name", + "ph": "M", + "pid": rank, + "args": {"name": rank_label(index.get("hosts", {}), rank)}, + }) + + for launch_id in group["launch_ids"]: + launch_rank_bars = defaultdict(list) + for bar in group["bars"]: + if bar["launch_id"] == launch_id: + launch_rank_bars[bar["rank"]].append(bar) + for rank, rank_bars in sorted(launch_rank_bars.items()): + if rank not in seen_launch_threads: + seen_launch_threads.add(rank) + events.append({ + "name": "thread_name", + "ph": "M", + "pid": rank, + "tid": PERFETTO_LAUNCH_WINDOW_TID, + "args": {"name": f"{rank_label(index.get('hosts', {}), rank)}/launch_windows"}, + }) + max_end_us = max((bar["end_us"] for bar in rank_bars), default=0.0) + if max_end_us <= 0: + continue + launch_offset_us = launch_offsets.get(launch_id, 0.0) + events.append({ + "name": f"launch{launch_id}/{rank_label(index.get('hosts', {}), rank)}/window", + "cat": "launch_window", + "ph": "X", + "pid": rank, + "tid": PERFETTO_LAUNCH_WINDOW_TID, + "ts": launch_offset_us, + "dur": max_end_us, + "args": { + "launch_id": launch_id, + "rank": rank, + "host": rank_host_info(index.get("hosts", {}), rank).get("host", ""), + "host_ip": rank_host_info(index.get("hosts", {}), rank).get("ip", ""), + "launch_offset_us": launch_offset_us, + }, + }) + + for status in group.get("trace_statuses", []): + if not status.get("incomplete"): + continue + rank = as_int(status.get("rank")) + launch_id = as_int(status.get("launch_id")) + if rank not in seen_launch_threads: + seen_launch_threads.add(rank) + events.append({ + "name": "thread_name", + "ph": "M", + "pid": rank, + "tid": PERFETTO_LAUNCH_WINDOW_TID, + "args": {"name": f"{rank_label(index.get('hosts', {}), rank)}/launch_windows"}, + }) + rank_status_label = rank_label(index.get("hosts", {}), rank) + events.append({ + "name": f"launch{launch_id}/{rank_status_label}/incomplete_trace", + "cat": "trace_status", + "ph": "X", + "pid": rank, + "tid": PERFETTO_LAUNCH_WINDOW_TID, + "ts": launch_offsets.get(launch_id, 0.0), + "dur": 1.0, + "args": { + "launch_id": launch_id, + "rank": rank, + "rank_label": rank_status_label, + "reason": str(status.get("reason") or ""), + "source": status.get("source", ""), + "op_name": group["op_name"], + "message_bytes": group["message_bytes"], + "rank_size": group["rank_size"], + "block_dim": group.get("block_dim", 0), + "max_core_count": group.get("max_core_count", 0), + }, + }) + + for bar in group["bars"]: + thread_key = (bar["rank"], bar["core"]) + if thread_key not in seen_threads: + seen_threads.add(thread_key) + events.append({ + "name": "thread_name", + "ph": "M", + "pid": bar["rank"], + "tid": bar["core"], + "args": {"name": f"{rank_label(index.get('hosts', {}), bar['rank'])}/core{bar['core']}"}, + }) + + if bar["duration_us"] <= 0: + continue + + launch_offset_us = launch_offsets.get(bar["launch_id"], 0.0) + bar_rank_label = rank_label(index.get("hosts", {}), bar["rank"]) + events.append({ + "name": f"launch{bar['launch_id']}/{bar_rank_label}/{bar['stage']}", + "cat": group["op_name"], + "ph": "X", + "pid": bar["rank"], + "tid": bar["core"], + "ts": launch_offset_us + bar["start_us"], + "dur": bar["duration_us"], + "args": { + "launch_id": bar["launch_id"], + "launch_offset_us": launch_offset_us, + "normalized_ts": bar["start_us"], + "rank": bar["rank"], + "rank_label": bar_rank_label, + "host": bar.get("host", ""), + "host_ip": bar.get("host_ip", ""), + "core": bar["core"], + "stage": bar["stage"], + "stage_id": bar["stage_id"], + "sum_us": bar["sum_us"], + "raw_cycles": bar["raw_cycles"], + "max_cycles": bar["max_cycles"], + "count": bar["count"], + "source": bar["source"], + "op_name": group["op_name"], + "message_bytes": group["message_bytes"], + "rank_size": group["rank_size"], + }, + }) + + return { + "displayTimeUnit": "us", + "traceEvents": events, + } + + +def compute_perfetto_launch_offsets(index): + max_end_by_launch = defaultdict(float) + for group in index["groups"]: + for bar in group["bars"]: + max_end_by_launch[bar["launch_id"]] = max(max_end_by_launch[bar["launch_id"]], bar["end_us"]) + + offsets = {} + cursor = 0.0 + for launch_id in sorted(max_end_by_launch): + offsets[launch_id] = cursor + cursor += max_end_by_launch[launch_id] + PERFETTO_LAUNCH_GAP_US + return offsets + + def json_for_script(value): return ( json.dumps(value, separators=(",", ":"), ensure_ascii=False) @@ -435,6 +761,8 @@ def json_for_script(value): def render_html(index): data = json_for_script(index) summary_items = [] + rank_summary_rows = [] + trace_status_rows = [] fallback_rows = [] for group in index["groups"]: @@ -445,17 +773,54 @@ def render_html(index): "
  • " f"{html.escape(str(group['op_name']))} bytes={group['message_bytes']}: " f"Slowest launch {html.escape(format_slowest_launch(slowest))}; " + f"Hosts {html.escape(format_hosts(index.get('hosts', {}), group['rank_ids']))}; " + f"Slowest rank {html.escape(format_slowest_rank(summary['slowest_rank'], index.get('hosts', {})))}; " f"Top stage {html.escape(str(summary['top_stage']['stage']))} {summary['top_stage']['sum_us']:.3f} us; " f"Rank/core max rank{rank_core['rank']} core{rank_core['core']} " f"{html.escape(str(rank_core['stage']))} max {rank_core['max_us']:.3f} us" "
  • " ) + for rank_item in summary["rank_kernel"]: + launch_id = rank_item["slowest_launch_id"] + drilldown = find_rank_launch_drilldown(group["bars"], rank_item["rank"], launch_id) + rank_summary_rows.append( + "" + f"{html.escape(str(group['op_name']))}" + f"{group['message_bytes']}" + f"{html.escape(rank_label(index.get('hosts', {}), rank_item['rank']))}" + f"{html.escape(rank_host_info(index.get('hosts', {}), rank_item['rank']).get('ip', ''))}" + f"{rank_item['launch_count']}" + f"{rank_item['avg_kernel_us']:.3f}" + f"{rank_item['max_kernel_us']:.3f}" + f"launch{launch_id}" + f"open launch report" + "" + ) + + for status in group.get("trace_statuses", []): + if not status.get("incomplete") and not status.get("synthetic"): + continue + status_text = "incomplete" if status.get("incomplete") else "synthetic" + trace_status_rows.append( + "" + f"{html.escape(str(group['op_name']))}" + f"{group['message_bytes']}" + f"launch{status.get('launch_id')}" + f"{html.escape(rank_label(index.get('hosts', {}), as_int(status.get('rank'))))}" + f"{html.escape(status_text)}" + f"{html.escape(str(status.get('reason') or ''))}" + f"{html.escape(str(status.get('source') or ''))}" + "" + ) + for bar in group["bars"]: fallback_rows.append( "" f"launch{bar['launch_id']}" - f"rank{bar['rank']}/core{bar['core']}" + f"{html.escape(rank_label(index.get('hosts', {}), bar['rank']))}" + f"{html.escape(str(bar.get('host_ip', '')))}" + f"core{bar['core']}" f"{html.escape(str(bar['stage']))}" f"{bar['duration_us']:.3f}" f"{bar['sum_us']:.3f}" @@ -468,7 +833,9 @@ def render_html(index): diagnostics = "".join(f"
  • {html.escape(item)}
  • " for item in index["diagnostics"]) summary_html = "".join(summary_items) + diagnostics - fallback_html = "".join(fallback_rows) or "No trace bars available." + rank_summary_html = "".join(rank_summary_rows) or "No rank kernel totals available." + trace_status_html = "".join(trace_status_rows) or "No incomplete or synthetic traces." + fallback_html = "".join(fallback_rows) or "No trace bars available." stage_filter_html = "\n".join( f"" @@ -500,6 +867,16 @@ def render_html(index):
      {summary_html}
    +

    Rank-Level Summary

    +

    Kernel totals are grouped by rank using the slowest core per rank/launch. This is the fastest view for spotting a slow rank before drilling into core-level stages.

    +{rank_summary_html}
    OpBytesRank@HostHost IPLaunchesAvg kernel usMax kernel usSlowest launchDrilldown
    +
    +
    +

    Trace Status

    +

    Incomplete traces keep the same single-launch report schema but indicate that kernel execution failed before device stats could be copied back.

    +{trace_status_html}
    OpBytesLaunchRank@HostStatusReasonSource
    +
    +

    Chronological Timeline

    Each rank/launch lane is normalized independently; cross-NPU raw cycle offsets are not assumed to be synchronized.

    @@ -514,7 +891,7 @@ def render_html(index):

    Fallback Table

    -{fallback_html}
    LaunchLaneStageDuration usSum usDrilldown
    +{fallback_html}
    LaunchRank@HostHost IPCoreStageDuration usSum usDrilldown