From 20ebb38b9ac2979dcb983265905407505cded5fa Mon Sep 17 00:00:00 2001 From: Shenghsun Cho Date: Wed, 13 May 2026 00:27:11 +0000 Subject: [PATCH 1/3] Enhancement: Add rocprofv2 trace support for AMD GPUs - runner.py: Add SB_ENABLE_ROCPROF/SB_ROCPROF_TRACE_DIR env vars to enable rocprofv2 profiling (--hip-trace --kernel-trace --plugin json) in local, torch.distributed, and mpi modes - pytorch_base.py: Extend GPU guard to support ROCm (torch.version.hip) so PyTorch profiler works on AMD GPUs --- .../model_benchmarks/pytorch_base.py | 4 +- superbench/runner/runner.py | 49 ++++++++++++++----- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index de06b35d0..f2428f2c6 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -591,8 +591,8 @@ def _benchmark(self): Run the benchmark then handle post-run model log save/compare. Set SB_ENABLE_PYTORCH_PROFILER='1' to enable profiling. """ - # Check if this is a Nvidia GPU - if not (torch.cuda.is_available() and torch.version.cuda is not None): + # Check if this is a Nvidia or AMD GPU + if not (torch.cuda.is_available() and (torch.version.cuda is not None or torch.version.hip is not None)): ok = super()._benchmark() self._post_run_model_log() return ok diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index a5ac13cbb..5c588bb96 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -135,12 +135,23 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): enable_nsys = os.environ.get('SB_ENABLE_NSYS', '') == '1' trace_dir = os.environ.get('SB_NSYS_TRACE_DIR', self._sb_output_dir) + # Enable rocprofv2 profiling based on environment variable + enable_rocprof = os.environ.get('SB_ENABLE_ROCPROF', '') == '1' + rocprof_trace_dir = os.environ.get('SB_ROCPROF_TRACE_DIR', self._sb_output_dir) + mode_command = exec_command if mode.name == 'local': - trace_command = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' - f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' - ) if enable_nsys and mode.proc_rank == 0 else '' + trace_command = '' + if enable_nsys and mode.proc_rank == 0: + trace_command = ( + f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' + ) + elif enable_rocprof and mode.proc_rank == 0: + trace_command = ( + f'rocprofv2 --hip-trace --kernel-trace --plugin json ' + f'-d {rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + ) # Build the command parts, only including trace if it's not empty command_parts = [] prefix = mode.prefix.format(proc_rank=mode.proc_rank, proc_num=mode.proc_num) @@ -159,10 +170,17 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' ) - nsys_prefix = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_traces ' - f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' - ) if enable_nsys else '' + nsys_prefix = '' + if enable_nsys: + nsys_prefix = ( + f'nsys profile --output {trace_dir}/{benchmark_name}_traces ' + f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' + ) + elif enable_rocprof: + nsys_prefix = ( + f'rocprofv2 --hip-trace --kernel-trace --plugin json ' + f'-d {rocprof_trace_dir}/{benchmark_name}_traces ' + ) mode_command = ( f'{nsys_prefix}' @@ -172,10 +190,17 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl' ) elif mode.name == 'mpi': - trace_command = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' - f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' - ) if enable_nsys else '' + trace_command = '' + if enable_nsys: + trace_command = ( + f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' + ) + elif enable_rocprof: + trace_command = ( + f'rocprofv2 --hip-trace --kernel-trace --plugin json ' + f'-d {rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + ) mode_command = ( '{trace} ' 'mpirun ' # use default OpenMPI in image From 9b1dd82a66b809da10d45e28e344571fa6dab083 Mon Sep 17 00:00:00 2001 From: Shenghsun Cho Date: Tue, 19 May 2026 18:46:06 +0000 Subject: [PATCH 2/3] Use shlex.quote() for trace output paths in runner.py Address PR review: wrap all interpolated path/name segments in shlex.quote() to prevent command injection or broken commands when paths contain whitespace or shell metacharacters. Applied to both nsys and rocprofv2 trace commands across all three execution modes (local, torch.distributed, mpi). --- superbench/runner/runner.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 5c588bb96..1432408cc 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -7,6 +7,7 @@ import sys import json import random +import shlex import signal from pathlib import Path from pprint import pformat @@ -143,14 +144,16 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): if mode.name == 'local': trace_command = '' if enable_nsys and mode.proc_rank == 0: + trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') trace_command = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + f'nsys profile --output {trace_output} ' f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' ) elif enable_rocprof and mode.proc_rank == 0: + trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') trace_command = ( f'rocprofv2 --hip-trace --kernel-trace --plugin json ' - f'-d {rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + f'-d {trace_output} ' ) # Build the command parts, only including trace if it's not empty command_parts = [] @@ -172,14 +175,16 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): nsys_prefix = '' if enable_nsys: + trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_traces') nsys_prefix = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_traces ' + f'nsys profile --output {trace_output} ' f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' ) elif enable_rocprof: + trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_traces') nsys_prefix = ( f'rocprofv2 --hip-trace --kernel-trace --plugin json ' - f'-d {rocprof_trace_dir}/{benchmark_name}_traces ' + f'-d {trace_output} ' ) mode_command = ( @@ -192,14 +197,16 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): elif mode.name == 'mpi': trace_command = '' if enable_nsys: + trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') trace_command = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + f'nsys profile --output {trace_output} ' f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' ) elif enable_rocprof: + trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') trace_command = ( f'rocprofv2 --hip-trace --kernel-trace --plugin json ' - f'-d {rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' + f'-d {trace_output} ' ) mode_command = ( '{trace} ' From a9485efb300ed3efdca7d2124bac3e47d0042d3e Mon Sep 17 00:00:00 2001 From: Shenghsun Cho Date: Tue, 19 May 2026 18:56:09 +0000 Subject: [PATCH 3/3] Rename nsys_prefix to trace_prefix for tool-agnostic naming Address PR review: the variable holds either an nsys or rocprofv2 prefix, so rename to trace_prefix to avoid implying Nsight-only behavior. --- superbench/runner/runner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 1432408cc..c0b2345e5 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -173,22 +173,22 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' ) - nsys_prefix = '' + trace_prefix = '' if enable_nsys: trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_traces') - nsys_prefix = ( + trace_prefix = ( f'nsys profile --output {trace_output} ' f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' ) elif enable_rocprof: trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_traces') - nsys_prefix = ( + trace_prefix = ( f'rocprofv2 --hip-trace --kernel-trace --plugin json ' f'-d {trace_output} ' ) mode_command = ( - f'{nsys_prefix}' + f'{trace_prefix}' f'torchrun' f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'