From 7f23c7535393333cf93c0adb95b37a028d3cfed1 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 18 Dec 2025 01:16:28 +0000
Subject: [PATCH 01/27] remove fixed gpu id & numa id assignment

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu | 56 ++++++++-----------
 .../gpu_stream/gpu_stream_utils.hpp           |  5 +-
 tests/data/gpu_stream.log                     | 50 ++++++-----------
 3 files changed, 41 insertions(+), 70 deletions(-)
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 617b8338a..105ea3028 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -235,15 +235,15 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
     cudaError_t cuda_err = cudaSuccess;
 
     if (args->check_data) {
-        // Generate data to copy
-        args->sub.data_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
+        // Generate data to copy - use local NUMA node for best CPU access
+        args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
 
         for (int j = 0; j < args->size / sizeof(T); j++) {
             args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
         }
 
-        // Allocate check buffer
-        args->sub.check_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
+        // Allocate check buffer on local NUMA node
+        args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
     }
 
     // Allocate buffers
@@ -583,9 +583,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
 
     // output formatted results to stdout
     // Tags are of format:
-    // STREAM_<Kernelname>_datatype_gpu_<gpu_id>_buffer_<buffer_size>_block_<block_size>
+    // STREAM_<Kernelname>_datatype_buffer_<buffer_size>_block_<block_size>
     for (int i = 0; i < args->sub.times_in_ms.size(); i++) {
-        std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_gpu_" + std::to_string(args->gpu_id) +
+        std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type +
                           "_buffer_" + std::to_string(args->size);
         for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
             // Calculate and display bandwidth
@@ -608,9 +608,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
 /**
  * @brief Runs the Stream benchmark.
  *
- * @details This function processes the input args, validates and composes the BenchArgs structure for the
- availavble
- * GPUs, and runs the benchmark.
+ * @details This function processes the input args, validates and composes the BenchArgs structure for
+ * the first visible GPU (CUDA device 0). When running under Superbench's default_local_mode,
+ * CUDA_VISIBLE_DEVICES is set per process, so device 0 maps to the assigned physical GPU.
  *
  * @return int The status code indicating success or failure of the benchmark execution.
  * */
@@ -631,23 +631,23 @@ int GpuStream::Run() {
         return ret;
     }
 
-    // find all GPUs and compose the Benchmarking data structure
-    for (int j = 0; j < gpu_count; j++) {
-        auto args = std::make_unique<BenchArgs<double>>();
-        args->numa_id = 0;
-        args->gpu_id = j;
-        cudaGetDeviceProperties(&args->gpu_device_prop, j);
+    if (gpu_count < 1) {
+        std::cerr << "Run::No GPU available" << std::endl;
+        return -1;
+    }
 
-        args->num_warm_up = opts_.num_warm_up;
-        args->num_loops = opts_.num_loops;
-        args->size = opts_.size;
-        args->check_data = opts_.check_data;
-        args->numa_id = 0;
-        args->gpu_id = j;
+    // Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES).
+    auto args = std::make_unique<BenchArgs<double>>();
+    args->gpu_id = 0;
+    cudaGetDeviceProperties(&args->gpu_device_prop, 0);
 
-        // add data to vector
-        bench_args_.emplace_back(std::move(args));
-    }
+    args->num_warm_up = opts_.num_warm_up;
+    args->num_loops = opts_.num_loops;
+    args->size = opts_.size;
+    args->check_data = opts_.check_data;
+
+    // add data to vector
+    bench_args_.emplace_back(std::move(args));
 
     bool has_error = false;
     // Run the benchmark for all the configured data
@@ -668,14 +668,6 @@ int GpuStream::Run() {
                 // Print device info with both the memory clock and peak bandwidth
                 PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);
 
-                // Set the NUMA node
-                ret = numa_run_on_node(curr_args->numa_id);
-                if (ret != 0) {
-                    std::cerr << "Run::numa_run_on_node error: " << errno << std::endl;
-                    has_error = true;
-                    return;
-                }
-
                 // Run the stream benchmark for the configured data, passing the peak bandwidth
                 if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
                     ret = RunStream<float>(curr_args, "float", peak_bw);
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
index 0c648514b..720810e5a 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
@@ -83,10 +83,7 @@ template <typename T> struct SubBenchArgs {
 // Arguments for each benchmark run.
 template <typename T> struct BenchArgs {
 
-    // NUMA node under which the benchmark is done.
-    uint64_t numa_id = 0;
-
-    // GPU ID for device.
+    // GPU ID for device (always 0 - actual GPU determined by CUDA_VISIBLE_DEVICES).
     int gpu_id = 0;
 
     // GPU device info
diff --git a/tests/data/gpu_stream.log b/tests/data/gpu_stream.log
index c3d6f2390..b26310390 100644
--- a/tests/data/gpu_stream.log
+++ b/tests/data/gpu_stream.log
@@ -5,37 +5,19 @@ Number of loops: 40
 Check data: No
 
 Device 0: "NVIDIA Graphics Device"  152 SMs(10.0)  Memory: 4000MHz x 8192-bit = 8192 GB/s PEAK   ECC is ON
-STREAM_COPY_double_gpu_0_buffer_4294967296_block_128    6711.67 81.93
-STREAM_COPY_double_gpu_0_buffer_4294967296_block_256    6549.50 79.95
-STREAM_COPY_double_gpu_0_buffer_4294967296_block_512    6195.43 75.63
-STREAM_COPY_double_gpu_0_buffer_4294967296_block_1024   5721.52 69.84
-STREAM_SCALE_double_gpu_0_buffer_4294967296_block_128   6680.42 81.55
-STREAM_SCALE_double_gpu_0_buffer_4294967296_block_256   6515.51 79.54
-STREAM_SCALE_double_gpu_0_buffer_4294967296_block_512   6106.69 74.54
-STREAM_SCALE_double_gpu_0_buffer_4294967296_block_1024  5626.68 68.69
-STREAM_ADD_double_gpu_0_buffer_4294967296_block_128     7379.25 90.08
-STREAM_ADD_double_gpu_0_buffer_4294967296_block_256     7407.27 90.42
-STREAM_ADD_double_gpu_0_buffer_4294967296_block_512     7309.59 89.23
-STREAM_ADD_double_gpu_0_buffer_4294967296_block_1024    6788.64 82.87
-STREAM_TRIAD_double_gpu_0_buffer_4294967296_block_128   7378.19 90.07
-STREAM_TRIAD_double_gpu_0_buffer_4294967296_block_256   7414.01 90.50
-STREAM_TRIAD_double_gpu_0_buffer_4294967296_block_512   7295.50 89.06
-STREAM_TRIAD_double_gpu_0_buffer_4294967296_block_1024  6730.42 82.16
-
-Device 1: "NVIDIA Graphics Device"  152 SMs(10.0)  Memory: 4000.00MHz x 8192-bit = 8192.00 GB/s PEAK   ECC is ON
-STREAM_COPY_double_gpu_1_buffer_4294967296_block_128    6708.74 81.89
-STREAM_COPY_double_gpu_1_buffer_4294967296_block_256    6549.47 79.95
-STREAM_COPY_double_gpu_1_buffer_4294967296_block_512    6195.39 75.63
-STREAM_COPY_double_gpu_1_buffer_4294967296_block_1024   5725.07 69.89
-STREAM_SCALE_double_gpu_1_buffer_4294967296_block_128   6678.56 81.53
-STREAM_SCALE_double_gpu_1_buffer_4294967296_block_256   6514.05 79.52
-STREAM_SCALE_double_gpu_1_buffer_4294967296_block_512   6103.80 74.51
-STREAM_SCALE_double_gpu_1_buffer_4294967296_block_1024  5630.41 68.73
-STREAM_ADD_double_gpu_1_buffer_4294967296_block_128     7377.74 90.06
-STREAM_ADD_double_gpu_1_buffer_4294967296_block_256     7410.97 90.47
-STREAM_ADD_double_gpu_1_buffer_4294967296_block_512     7310.80 89.24
-STREAM_ADD_double_gpu_1_buffer_4294967296_block_1024    6789.91 82.88
-STREAM_TRIAD_double_gpu_1_buffer_4294967296_block_128   7379.03 90.08
-STREAM_TRIAD_double_gpu_1_buffer_4294967296_block_256   7414.04 90.50
-STREAM_TRIAD_double_gpu_1_buffer_4294967296_block_512   7298.26 89.09
-STREAM_TRIAD_double_gpu_1_buffer_4294967296_block_1024  6732.15 82.18
\ No newline at end of file
+STREAM_COPY_double_buffer_4294967296_block_128	6711.67	81.93
+STREAM_COPY_double_buffer_4294967296_block_256	6549.50	79.95
+STREAM_COPY_double_buffer_4294967296_block_512	6195.43	75.63
+STREAM_COPY_double_buffer_4294967296_block_1024	5721.52	69.84
+STREAM_SCALE_double_buffer_4294967296_block_128	6680.42	81.55
+STREAM_SCALE_double_buffer_4294967296_block_256	6515.51	79.54
+STREAM_SCALE_double_buffer_4294967296_block_512	6106.69	74.54
+STREAM_SCALE_double_buffer_4294967296_block_1024	5626.68	68.69
+STREAM_ADD_double_buffer_4294967296_block_128	7379.25	90.08
+STREAM_ADD_double_buffer_4294967296_block_256	7407.27	90.42
+STREAM_ADD_double_buffer_4294967296_block_512	7309.59	89.23
+STREAM_ADD_double_buffer_4294967296_block_1024	6788.64	82.87
+STREAM_TRIAD_double_buffer_4294967296_block_128	7378.19	90.07
+STREAM_TRIAD_double_buffer_4294967296_block_256	7414.01	90.50
+STREAM_TRIAD_double_buffer_4294967296_block_512	7295.50	89.06
+STREAM_TRIAD_double_buffer_4294967296_block_1024	6730.42	82.16
\ No newline at end of file

From d63fe8c1f209429eb7b1f6709ea175c62d23d8e0 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 18 Dec 2025 18:49:10 +0000
Subject: [PATCH 02/27] use 128bit alignment, add float support, cleanup

---
 examples/benchmarks/gpu_stream.py             |   2 +-
 .../gpu_stream/CMakeLists.txt                 |   2 +-
 .../micro_benchmarks/gpu_stream/gpu_stream.cu |  64 +++---
 .../gpu_stream/gpu_stream.hpp                 |   2 +-
 .../gpu_stream/gpu_stream_kernels.cu          | 164 ++-------------
 .../gpu_stream/gpu_stream_kernels.hpp         | 195 +++++++++++++++++-
 ...pu_stream_test.cpp => gpu_stream_main.cpp} |   0
 .../gpu_stream/gpu_stream_utils.cpp           |  14 +-
 .../gpu_stream/gpu_stream_utils.hpp           |   4 +-
 tests/data/gpu_stream.log                     |   1 +
 10 files changed, 264 insertions(+), 184 deletions(-)
 rename superbench/benchmarks/micro_benchmarks/gpu_stream/{gpu_stream_test.cpp => gpu_stream_main.cpp} (100%)

diff --git a/examples/benchmarks/gpu_stream.py b/examples/benchmarks/gpu_stream.py
index 88c789efb..1aa67b15d 100644
--- a/examples/benchmarks/gpu_stream.py
+++ b/examples/benchmarks/gpu_stream.py
@@ -12,7 +12,7 @@
 
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
-        'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10'
+        'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10 --data_type double'
     )
     # For ROCm environment, please specify the benchmark name and the platform as the following.
     # context = BenchmarkRegistry.create_benchmark_context(
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
index 2c856f32a..ce15d10c7 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
@@ -15,7 +15,7 @@ find_package(CUDAToolkit QUIET)
 
 # Source files
 set(SOURCES
-    gpu_stream_test.cpp
+    gpu_stream_main.cpp
     gpu_stream_utils.cpp
     gpu_stream.cu
     gpu_stream_kernels.cu
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 105ea3028..2afc5153d 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -420,8 +420,10 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
     int size_factor = 2;
 
     // Validate data size
-    uint64_t num_elements_in_thread_block = kNumLoopUnroll * num_threads_per_block;
-    uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(T);
+    // Each thread processes 128 bits (16 bytes) for optimal memory bandwidth.
+    // For double: uses double2 (16 bytes). For float: would use float4 (16 bytes).
+    constexpr uint64_t kBytesPerThread = 16; // 128-bit aligned access
+    uint64_t num_bytes_in_thread_block = num_threads_per_block * kBytesPerThread;
     if (args->size % num_bytes_in_thread_block) {
         std::cerr << "RunCopy: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
         return -1;
@@ -448,30 +450,30 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
 
         switch (kernel) {
         case Kernel::kCopy:
-            CopyKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()));
+            CopyKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()));
             args->sub.kernel_name = "COPY";
             break;
         case Kernel::kScale:
-            ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
+            ScaleKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
             args->sub.kernel_name = "SCALE";
             break;
         case Kernel::kAdd:
-            AddKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()));
+            AddKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()));
             size_factor = 3;
             args->sub.kernel_name = "ADD";
             break;
         case Kernel::kTriad:
-            TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
+            TriadKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
             size_factor = 3;
             args->sub.kernel_name = "TRIAD";
             break;
@@ -637,17 +639,25 @@ int GpuStream::Run() {
     }
 
     // Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES).
-    auto args = std::make_unique<BenchArgs<double>>();
-    args->gpu_id = 0;
-    cudaGetDeviceProperties(&args->gpu_device_prop, 0);
-
-    args->num_warm_up = opts_.num_warm_up;
-    args->num_loops = opts_.num_loops;
-    args->size = opts_.size;
-    args->check_data = opts_.check_data;
-
-    // add data to vector
-    bench_args_.emplace_back(std::move(args));
+    if (opts_.data_type == "float") {
+        auto args = std::make_unique<BenchArgs<float>>();
+        args->gpu_id = 0;
+        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
+        args->num_warm_up = opts_.num_warm_up;
+        args->num_loops = opts_.num_loops;
+        args->size = opts_.size;
+        args->check_data = opts_.check_data;
+        bench_args_.emplace_back(std::move(args));
+    } else {
+        auto args = std::make_unique<BenchArgs<double>>();
+        args->gpu_id = 0;
+        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
+        args->num_warm_up = opts_.num_warm_up;
+        args->num_loops = opts_.num_loops;
+        args->size = opts_.size;
+        args->check_data = opts_.check_data;
+        bench_args_.emplace_back(std::move(args));
+    }
 
     bool has_error = false;
     // Run the benchmark for all the configured data
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
index 473a78839..754888339 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
@@ -34,7 +34,7 @@ class GpuStream {
     int Run();
 
   private:
-    using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<double>>>;
+    using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<float>>, std::unique_ptr<BenchArgs<double>>>;
     std::vector<BenchArgsVariant> bench_args_;
     Opts opts_;
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.cu
index 548fc8ba3..e40237b83 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.cu
@@ -1,155 +1,33 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include "gpu_stream_kernels.hpp"
-
-/**
- * @brief Fetches a value from source memory and writes it to a register.
- *
- * @details This inline device function fetches a value from the specified source memory
- * location and writes it to the provided register. The implementation references the following:
- * 1) NCCL:
- * https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L483
- * 2) RCCL:
- * https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L268
- *
- * @tparam T The type of the value to fetch.
- * @param[out] v The register to write the fetched value to.
- * @param[in] p The source memory location to fetch the value from.
- */
-template <typename T> inline __device__ void Fetch(T &v, const T *p) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    v = *p;
-#else
-    if constexpr (std::is_same<T, float>::value) {
-        asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(v) : "l"(p) : "memory");
-    } else if constexpr (std::is_same<T, double>::value) {
-        asm volatile("ld.volatile.global.f64 %0, [%1];" : "=d"(v) : "l"(p) : "memory");
-    }
-#endif
-}
-
 /**
- * @brief Stores a value from register and writes it to target memory.
+ * @file gpu_stream_kernels.cu
+ * @brief CUDA kernel compilation unit for GPU stream benchmark.
  *
- * @details This inline device function stores a value from the provided register
- * and writes it to the specified target memory location. The implementation references the following:
- * 1) NCCL:
- * https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L486
- * 2) RCCL:
- * https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L276
+ * All template kernel implementations (CopyKernel, ScaleKernel, AddKernel, TriadKernel)
+ * are defined in gpu_stream_kernels.hpp rather than here. This is required because:
  *
- * @tparam T The type of the value to store.
- * @param[out] p The target memory location to write the value to.
- * @param[in] v The register containing the value to be stored.
- */
-template <typename T> inline __device__ void Store(T *p, const T &v) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    *p = v;
-#else
-    if constexpr (std::is_same<T, float>::value) {
-        asm volatile("st.volatile.global.f32 [%0], %1;" ::"l"(p), "f"(v) : "memory");
-    } else if constexpr (std::is_same<T, double>::value) {
-        asm volatile("st.volatile.global.f64 [%0], %1;" ::"l"(p), "d"(v) : "memory");
-    }
-#endif
-}
-
-/**
- * @brief Performs COPY, a simple copy operation from source to target. b = a
+ * 1. **C++ Template Instantiation Model**: Templates are not compiled until they are
+ *    instantiated with concrete types. The compiler needs to see the full template
+ *    definition (not just declaration) at the point of instantiation.
  *
- * @details This CUDA kernel performs a simple copy operation, copying data from the source array
- * to the target array. This is used to measure transfer rates without any arithmetic operations.
+ * 2. **Separate Compilation Units**: When gpu_stream.cu calls `CopyKernel<double><<<...>>>`,
+ *    nvcc needs the full kernel implementation visible in that translation unit.
+ *    If implementations were only in this .cu file, gpu_stream.cu would only see
+ *    declarations, causing "undefined reference" linker errors.
  *
- * @param[out] tgt The target array where data will be copied to.
- * @param[in] src The source array from which data will be copied.
- */
-__global__ void CopyKernel(double *tgt, const double *src) {
-    uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
-    double val[kNumLoopUnrollAlias];
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++)
-        Fetch(val[i], src + index + i * blockDim.x);
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++)
-        Store(tgt + index + i * blockDim.x, val[i]);
-}
-
-/**
- * @brief Performs SCALE, a scaling operation on the source data. b = x * a
- *
- * @details This CUDA kernel performs a simple arithmetic operation by scaling the source data
- * with a given scalar value and storing the result in the target array.
- *
- * @param[out] tgt The target array where the scaled data will be stored.
- * @param[in] src The source array containing the data to be scaled.
- * @param[in] scalar The scalar value used to scale the source data.
- */
-__global__ void ScaleKernel(double *tgt, const double *src, const double scalar) {
-    uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
-    double val[kNumLoopUnrollAlias];
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++)
-        Fetch(val[i], src + index + i * blockDim.x);
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
-        val[i] *= scalar;
-        Store(tgt + index + i * blockDim.x, val[i]);
-    }
-}
-
-/**
- * @brief Performs ADD, an addition operation on two source arrays. c = a + b
- *
- * @details This CUDA kernel adds corresponding elements from two source arrays and stores the result
- * in the target array. This operation is used to measure transfer rates with a simple arithmetic addition.
- *
- * @param[out] tgt The target array where the result of the addition will be stored.
- * @param[in] src_a The first source array containing the first set of operands.
- * @param[in] src_b The second source array containing the second set of operands.
- */
-__global__ void AddKernel(double *tgt, const double *src_a, const double *src_b) {
-    uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
-    double val_a[kNumLoopUnrollAlias];
-    double val_b[kNumLoopUnrollAlias];
-
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
-        Fetch(val_a[i], src_a + index + i * blockDim.x);
-        Fetch(val_b[i], src_b + index + i * blockDim.x);
-    }
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
-        val_a[i] += val_b[i];
-        Store(tgt + index + i * blockDim.x, val_a[i]);
-    }
-}
-
-/**
- * @brief Performs TRIAD, fused multiply/add operations on source arrays. a = b + x * c
+ * 3. **CUDA-Specific Consideration**: Unlike regular C++ where explicit template
+ *    instantiation in a .cpp file can work, CUDA kernel launches require the kernel
+ *    code to be visible to nvcc when compiling the caller. This is because nvcc
+ *    generates device code at compile time, not link time.
  *
- * @details This CUDA kernel performs a fused multiply/add operation by multiplying elements from
- * the second source array with a scalar value, adding the result to corresponding elements from
- * the first source array, and storing the result in the target array.
+ * 4. **Header Guards for Mixed Compilation**: The header uses `#ifdef __CUDACC__` to
+ *    protect CUDA-specific code (blockIdx, threadIdx, __global__, etc.) from g++
+ *    when the header is indirectly included by .cpp files (e.g., via gpu_stream.hpp).
  *
- * @param[out] tgt The target array where the result of the fused multiply/add operation will be stored.
- * @param[in] src_a The first source array containing the first set of operands.
- * @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar.
- * @param[in] scalar The scalar value used in the multiply/add operation.
+ * This file remains as the compilation unit that ensures the header is processed
+ * by nvcc, and can host any future non-template helper functions if needed.
  */
-__global__ void TriadKernel(double *tgt, const double *src_a, const double *src_b, const double scalar) {
-    uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
-    double val_a[kNumLoopUnrollAlias];
-    double val_b[kNumLoopUnrollAlias];
 
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
-        Fetch(val_a[i], src_a + index + i * blockDim.x);
-        Fetch(val_b[i], src_b + index + i * blockDim.x);
-    }
-#pragma unroll
-    for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
-        val_b[i] += (val_a[i] * scalar);
-        Store(tgt + index + i * blockDim.x, val_b[i]);
-    }
-}
\ No newline at end of file
+#include "gpu_stream_kernels.hpp"
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
index cfe9f2052..6a3bdfda1 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
@@ -7,13 +7,192 @@
 #include <cuda_runtime.h>
 
 #include "gpu_stream_utils.hpp"
-constexpr auto kNumLoopUnrollAlias = stream_config::kNumLoopUnroll;
 
-// Function declarations
-template <typename T> inline __device__ void Fetch(T &v, const T *p);
-template <typename T> inline __device__ void Store(T *p, const T &v);
+/**
+ * @brief Type trait mapping scalar types to their 128-bit aligned vector types.
+ * 
+ * @details For optimal memory bandwidth, we use 128-bit (16 byte) vector loads/stores:
+ * - double -> double2 (2 x 64-bit = 128-bit)
+ * - float  -> float4  (4 x 32-bit = 128-bit)
+ */
+template <typename T> struct VectorType;
+template <> struct VectorType<double> { using type = double2; };
+template <> struct VectorType<float> { using type = float4; };
 
-__global__ void CopyKernel(double *, const double *);
-__global__ void ScaleKernel(double *, const double *, const double);
-__global__ void AddKernel(double *, const double *, const double *);
-__global__ void TriadKernel(double *, const double *, const double *, const double);
\ No newline at end of file
+template <typename T>
+using VecT = typename VectorType<T>::type;
+
+// Kernel declarations (visible to all compilers for function pointer usage)
+template <typename T> __global__ void CopyKernel(VecT<T> *tgt, const VecT<T> *src);
+template <typename T> __global__ void ScaleKernel(VecT<T> *tgt, const VecT<T> *src, const T scalar);
+template <typename T> __global__ void AddKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b);
+template <typename T> __global__ void TriadKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b, const T scalar);
+
+// Implementation section - only compiled by nvcc
+#ifdef __CUDACC__
+
+/**
+ * @brief Fetches a value from source memory and writes it to a register.
+ *
+ * @details This inline device function fetches a value from the specified source memory
+ * location and writes it to the provided register. The implementation references the following:
+ * 1) NCCL:
+ * https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L483
+ * 2) RCCL:
+ * https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L268
+ *
+ * @tparam T The type of the value to fetch.
+ * @param[out] v The register to write the fetched value to.
+ * @param[in] p The source memory location to fetch the value from.
+ */
+template <typename T> inline __device__ void Fetch(T &v, const T *p) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    v = *p;
+#else
+    if constexpr (std::is_same<T, float>::value) {
+        asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(v) : "l"(p) : "memory");
+    } else if constexpr (std::is_same<T, double>::value) {
+        asm volatile("ld.volatile.global.f64 %0, [%1];" : "=d"(v) : "l"(p) : "memory");
+    } else if constexpr (std::is_same<T, double2>::value) {
+        asm volatile("ld.volatile.global.v2.f64 {%0,%1}, [%2];" : "=d"(v.x), "=d"(v.y) : "l"(p) : "memory");
+    } else if constexpr (std::is_same<T, float4>::value) {
+        asm volatile("ld.volatile.global.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w) : "l"(p) : "memory");
+    }
+#endif
+}
+
+/**
+ * @brief Stores a value from register and writes it to target memory.
+ *
+ * @details This inline device function stores a value from the provided register
+ * and writes it to the specified target memory location. The implementation references the following:
+ * 1) NCCL:
+ * https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L486
+ * 2) RCCL:
+ * https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L276
+ *
+ * @tparam T The type of the value to store.
+ * @param[out] p The target memory location to write the value to.
+ * @param[in] v The register containing the value to be stored.
+ */
+template <typename T> inline __device__ void Store(T *p, const T &v) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    *p = v;
+#else
+    if constexpr (std::is_same<T, float>::value) {
+        asm volatile("st.volatile.global.f32 [%0], %1;" ::"l"(p), "f"(v) : "memory");
+    } else if constexpr (std::is_same<T, double>::value) {
+        asm volatile("st.volatile.global.f64 [%0], %1;" ::"l"(p), "d"(v) : "memory");
+    } else if constexpr (std::is_same<T, double2>::value) {
+        asm volatile("st.volatile.global.v2.f64 [%0], {%1,%2};" ::"l"(p), "d"(v.x), "d"(v.y) : "memory");
+    } else if constexpr (std::is_same<T, float4>::value) {
+        asm volatile("st.volatile.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(p), "f"(v.x), "f"(v.y), "f"(v.z), "f"(v.w) : "memory");
+    }
+#endif
+}
+
+/**
+ * @brief Performs COPY, a simple copy operation from source to target. b = a
+ *
+ * @details This CUDA kernel performs a simple copy operation, copying data from the source array
+ * to the target array. This is used to measure transfer rates without any arithmetic operations.
+ *
+ * @param[out] tgt The target array where data will be copied to (128-bit aligned).
+ * @param[in] src The source array from which data will be copied (128-bit aligned).
+ */
+template <typename T>
+__global__ void CopyKernel(VecT<T> *tgt, const VecT<T> *src) {
+    uint64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    VecT<T> val;
+    Fetch(val, src + index);
+    Store(tgt + index, val);
+}
+
+/**
+ * @brief Performs SCALE, a scaling operation on the source data. b = x * a
+ *
+ * @details This CUDA kernel performs a simple arithmetic operation by scaling the source data
+ * with a given scalar value and storing the result in the target array.
+ *
+ * @param[out] tgt The target array where the scaled data will be stored (128-bit aligned).
+ * @param[in] src The source array containing the data to be scaled (128-bit aligned).
+ * @param[in] scalar The scalar value used to scale the source data.
+ */
+template <typename T>
+__global__ void ScaleKernel(VecT<T> *tgt, const VecT<T> *src, const T scalar) {
+    uint64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    VecT<T> val;
+    Fetch(val, src + index);
+    if constexpr (std::is_same<T, double>::value) {
+        val.x *= scalar;
+        val.y *= scalar;
+    } else if constexpr (std::is_same<T, float>::value) {
+        val.x *= scalar;
+        val.y *= scalar;
+        val.z *= scalar;
+        val.w *= scalar;
+    }
+    Store(tgt + index, val);
+}
+
+/**
+ * @brief Performs ADD, an addition operation on two source arrays. c = a + b
+ *
+ * @details This CUDA kernel adds corresponding elements from two source arrays and stores the result
+ * in the target array. This operation is used to measure transfer rates with a simple arithmetic addition.
+ *
+ * @param[out] tgt The target array where the result of the addition will be stored (128-bit aligned).
+ * @param[in] src_a The first source array containing the first set of operands (128-bit aligned).
+ * @param[in] src_b The second source array containing the second set of operands (128-bit aligned).
+ */
+template <typename T>
+__global__ void AddKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b) {
+    uint64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    VecT<T> val_a;
+    VecT<T> val_b;
+    Fetch(val_a, src_a + index);
+    Fetch(val_b, src_b + index);
+    if constexpr (std::is_same<T, double>::value) {
+        val_a.x += val_b.x;
+        val_a.y += val_b.y;
+    } else if constexpr (std::is_same<T, float>::value) {
+        val_a.x += val_b.x;
+        val_a.y += val_b.y;
+        val_a.z += val_b.z;
+        val_a.w += val_b.w;
+    }
+    Store(tgt + index, val_a);
+}
+
+/**
+ * @brief Performs TRIAD, fused multiply/add operations on source arrays. a = b + x * c
+ *
+ * @details This CUDA kernel performs a fused multiply/add operation by multiplying elements from
+ * the second source array with a scalar value, adding the result to corresponding elements from
+ * the first source array, and storing the result in the target array.
+ *
+ * @param[out] tgt The target array where the result of the fused multiply/add operation will be stored (128-bit aligned).
+ * @param[in] src_a The first source array containing the first set of operands (128-bit aligned).
+ * @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar (128-bit aligned).
+ * @param[in] scalar The scalar value used in the multiply/add operation.
+ */
+template <typename T>
+__global__ void TriadKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b, const T scalar) {
+    uint64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+    VecT<T> val_a;
+    VecT<T> val_b;
+    Fetch(val_a, src_a + index);
+    Fetch(val_b, src_b + index);
+    if constexpr (std::is_same<T, double>::value) {
+        val_b.x += (val_a.x * scalar);
+        val_b.y += (val_a.y * scalar);
+    } else if constexpr (std::is_same<T, float>::value) {
+        val_b.x += (val_a.x * scalar);
+        val_b.y += (val_a.y * scalar);
+        val_b.z += (val_a.z * scalar);
+        val_b.w += (val_a.w * scalar);
+    }
+    Store(tgt + index, val_b);
+}
+
+#endif // __CUDACC__
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_test.cpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_main.cpp
similarity index 100%
rename from superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_test.cpp
rename to superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_main.cpp
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
index 6ced0fdd5..fd0dfb913 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
@@ -43,6 +43,7 @@ void PrintUsage() {
               << "--size <size in bytes> "
               << "--num_warm_up <num_warm_up> "
               << "--num_loops <num_loops> "
+              << "[--data_type <float|double>] "
               << "[--check_data]" << std::endl;
 }
 
@@ -60,6 +61,7 @@ void PrintInputInfo(Opts &opts) {
     std::cout << "Buffer size(bytes): " << opts.size << std::endl;
     std::cout << "Number of warm up runs: " << opts.num_warm_up << std::endl;
     std::cout << "Number of loops: " << opts.num_loops << std::endl;
+    std::cout << "Data type: " << opts.data_type << std::endl;
     std::cout << "Check data: " << (opts.check_data ? "Yes" : "No") << std::endl;
 }
 
@@ -75,11 +77,12 @@ void PrintInputInfo(Opts &opts) {
  * @return int The status code.
  * */
 int ParseOpts(int argc, char **argv, Opts *opts) {
-    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
+    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData, kDataType };
     const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
                                      {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
                                      {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
-                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)}};
+                                     {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
+                                     {"data_type", required_argument, nullptr, static_cast<int>(OptIdx::kDataType)}};
     int getopt_ret = 0;
     int opt_idx = 0;
     bool size_specified = true;
@@ -126,6 +129,13 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
         case static_cast<int>(OptIdx::kEnableCheckData):
             opts->check_data = true;
             break;
+        case static_cast<int>(OptIdx::kDataType):
+            opts->data_type = optarg;
+            if (opts->data_type != "float" && opts->data_type != "double") {
+                std::cerr << "Invalid data_type: " << optarg << ". Must be 'float' or 'double'." << std::endl;
+                parse_err = true;
+            }
+            break;
         default:
             parse_err = true;
         }
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
index 720810e5a..907d05ef2 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
@@ -31,7 +31,6 @@ unsigned long long getCurrentTimestampInMicroseconds();
 namespace stream_config {
 constexpr std::array<int, 4> kThreadsPerBlock = {128, 256, 512, 1024}; // Threads per block
 constexpr uint64_t kDefaultBufferSizeInBytes = 4294967296;             // Default buffer size 4GB
-constexpr int kNumLoopUnroll = 2;                                      // Unroll depth in SM copy kernel
 constexpr int kNumBuffers = 3;                                         // Number of buffers for triad, add kernel
 constexpr int kNumValidationBuffers = 4;                          // Number of validation buffers, one for each kernel
 constexpr int kUInt8Mod = 256;                                    // Modulo for unsigned long data type
@@ -118,6 +117,9 @@ struct Opts {
 
     // Whether check data after copy.
     bool check_data = false;
+
+    // Data type for the benchmark ("float" or "double").
+    std::string data_type = "double";
 };
 
 std::string KernelToString(int); // Function to convert enum to string
diff --git a/tests/data/gpu_stream.log b/tests/data/gpu_stream.log
index b26310390..a3dcf2b01 100644
--- a/tests/data/gpu_stream.log
+++ b/tests/data/gpu_stream.log
@@ -2,6 +2,7 @@ STREAM Benchmark
 Buffer size(bytes): 4294967296
 Number of warm up runs: 10
 Number of loops: 40
+Data type: double
 Check data: No
 
 Device 0: "NVIDIA Graphics Device"  152 SMs(10.0)  Memory: 4000MHz x 8192-bit = 8192 GB/s PEAK   ECC is ON

From 242714ecad7ab738b9cb784b13d94c9deaa6328d Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 19 Dec 2025 19:45:41 +0000
Subject: [PATCH 03/27] add data_type arg

---
 .../benchmarks/micro_benchmarks/gpu_stream.py       | 13 +++++++++++--
 .../benchmarks/micro_benchmarks/test_gpu_stream.py  |  4 +++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream.py b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
index ecc90951f..44908e6b6 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
@@ -51,6 +51,15 @@ def add_parser_arguments(self):
             help='Number of data buffer copies performed.',
         )
 
+        self._parser.add_argument(
+            '--data_type',
+            type=str,
+            default='double',
+            choices=['float', 'double'],
+            required=False,
+            help='Data type of the buffer elements.',
+        )
+
         self._parser.add_argument(
             '--check_data',
             action='store_true',
@@ -68,8 +77,8 @@ def _preprocess(self):
 
         self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
 
-        args = '--size %d --num_warm_up %d --num_loops %d ' % (
-            self._args.size, self._args.num_warm_up, self._args.num_loops
+        args = '--size %d --num_warm_up %d --num_loops %d --data_type %s' % (
+            self._args.size, self._args.num_warm_up, self._args.num_loops, self._args.data_type
         )
 
         if self._args.check_data:
diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
index e6dea64e8..7f58fe461 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
@@ -31,7 +31,7 @@ def _test_gpu_stream_command_generation(self, platform):
         num_loops = 10
         size = 25769803776
 
-        parameters = '--num_warm_up %d --num_loops %d --size %d ' \
+        parameters = '--num_warm_up %d --num_loops %d --size %d --data_type double ' \
             '--check_data' % \
             (num_warm_up, num_loops, size)
         benchmark = benchmark_class(benchmark_name, parameters=parameters)
@@ -49,6 +49,7 @@ def _test_gpu_stream_command_generation(self, platform):
         assert (benchmark._args.num_warm_up == num_warm_up)
         assert (benchmark._args.num_loops == num_loops)
         assert (benchmark._args.check_data)
+        assert (benchmark._args.data_type == 'double')
 
         # Check command
         assert (1 == len(benchmark._commands))
@@ -56,6 +57,7 @@ def _test_gpu_stream_command_generation(self, platform):
         assert ('--size %d' % size in benchmark._commands[0])
         assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
         assert ('--num_loops %d' % num_loops in benchmark._commands[0])
+        assert ('--data_type double' in benchmark._commands[0])
         assert ('--check_data' in benchmark._commands[0])
 
     @decorator.cuda_test

From e8d02829fe95d73315688f25c29929dcb66060d4 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 19 Dec 2025 23:31:48 +0000
Subject: [PATCH 04/27] fix lint

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu |  3 +-
 .../gpu_stream/gpu_stream_kernels.hpp         | 39 +++++++++++--------
 .../gpu_stream/gpu_stream_utils.cpp           |  8 +---
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 2afc5153d..5e5ac90e5 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -587,8 +587,7 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
     // Tags are of format:
     // STREAM_<Kernelname>_datatype_buffer_<buffer_size>_block_<block_size>
     for (int i = 0; i < args->sub.times_in_ms.size(); i++) {
-        std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type +
-                          "_buffer_" + std::to_string(args->size);
+        std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_buffer_" + std::to_string(args->size);
         for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
             // Calculate and display bandwidth
             double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
index 6a3bdfda1..6e9e99886 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
@@ -10,23 +10,27 @@
 
 /**
  * @brief Type trait mapping scalar types to their 128-bit aligned vector types.
- * 
+ *
  * @details For optimal memory bandwidth, we use 128-bit (16 byte) vector loads/stores:
  * - double -> double2 (2 x 64-bit = 128-bit)
  * - float  -> float4  (4 x 32-bit = 128-bit)
  */
 template <typename T> struct VectorType;
-template <> struct VectorType<double> { using type = double2; };
-template <> struct VectorType<float> { using type = float4; };
+template <> struct VectorType<double> {
+    using type = double2;
+};
+template <> struct VectorType<float> {
+    using type = float4;
+};
 
-template <typename T>
-using VecT = typename VectorType<T>::type;
+template <typename T> using VecT = typename VectorType<T>::type;
 
 // Kernel declarations (visible to all compilers for function pointer usage)
 template <typename T> __global__ void CopyKernel(VecT<T> *tgt, const VecT<T> *src);
 template <typename T> __global__ void ScaleKernel(VecT<T> *tgt, const VecT<T> *src, const T scalar);
 template <typename T> __global__ void AddKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b);
-template <typename T> __global__ void TriadKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b, const T scalar);
+template <typename T>
+__global__ void TriadKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b, const T scalar);
 
 // Implementation section - only compiled by nvcc
 #ifdef __CUDACC__
@@ -56,7 +60,10 @@ template <typename T> inline __device__ void Fetch(T &v, const T *p) {
     } else if constexpr (std::is_same<T, double2>::value) {
         asm volatile("ld.volatile.global.v2.f64 {%0,%1}, [%2];" : "=d"(v.x), "=d"(v.y) : "l"(p) : "memory");
     } else if constexpr (std::is_same<T, float4>::value) {
-        asm volatile("ld.volatile.global.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w) : "l"(p) : "memory");
+        asm volatile("ld.volatile.global.v4.f32 {%0,%1,%2,%3}, [%4];"
+                     : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w)
+                     : "l"(p)
+                     : "memory");
     }
 #endif
 }
@@ -86,7 +93,8 @@ template <typename T> inline __device__ void Store(T *p, const T &v) {
     } else if constexpr (std::is_same<T, double2>::value) {
         asm volatile("st.volatile.global.v2.f64 [%0], {%1,%2};" ::"l"(p), "d"(v.x), "d"(v.y) : "memory");
     } else if constexpr (std::is_same<T, float4>::value) {
-        asm volatile("st.volatile.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(p), "f"(v.x), "f"(v.y), "f"(v.z), "f"(v.w) : "memory");
+        asm volatile("st.volatile.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(p), "f"(v.x), "f"(v.y), "f"(v.z), "f"(v.w)
+                     : "memory");
     }
 #endif
 }
@@ -100,8 +108,7 @@ template <typename T> inline __device__ void Store(T *p, const T &v) {
  * @param[out] tgt The target array where data will be copied to (128-bit aligned).
  * @param[in] src The source array from which data will be copied (128-bit aligned).
  */
-template <typename T>
-__global__ void CopyKernel(VecT<T> *tgt, const VecT<T> *src) {
+template <typename T> __global__ void CopyKernel(VecT<T> *tgt, const VecT<T> *src) {
     uint64_t index = blockIdx.x * blockDim.x + threadIdx.x;
     VecT<T> val;
     Fetch(val, src + index);
@@ -118,8 +125,7 @@ __global__ void CopyKernel(VecT<T> *tgt, const VecT<T> *src) {
  * @param[in] src The source array containing the data to be scaled (128-bit aligned).
  * @param[in] scalar The scalar value used to scale the source data.
  */
-template <typename T>
-__global__ void ScaleKernel(VecT<T> *tgt, const VecT<T> *src, const T scalar) {
+template <typename T> __global__ void ScaleKernel(VecT<T> *tgt, const VecT<T> *src, const T scalar) {
     uint64_t index = blockIdx.x * blockDim.x + threadIdx.x;
     VecT<T> val;
     Fetch(val, src + index);
@@ -145,8 +151,7 @@ __global__ void ScaleKernel(VecT<T> *tgt, const VecT<T> *src, const T scalar) {
  * @param[in] src_a The first source array containing the first set of operands (128-bit aligned).
  * @param[in] src_b The second source array containing the second set of operands (128-bit aligned).
  */
-template <typename T>
-__global__ void AddKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b) {
+template <typename T> __global__ void AddKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src_b) {
     uint64_t index = blockIdx.x * blockDim.x + threadIdx.x;
     VecT<T> val_a;
     VecT<T> val_b;
@@ -171,9 +176,11 @@ __global__ void AddKernel(VecT<T> *tgt, const VecT<T> *src_a, const VecT<T> *src
  * the second source array with a scalar value, adding the result to corresponding elements from
  * the first source array, and storing the result in the target array.
  *
- * @param[out] tgt The target array where the result of the fused multiply/add operation will be stored (128-bit aligned).
+ * @param[out] tgt The target array where the result of the fused multiply/add operation will be stored (128-bit
+ * aligned).
  * @param[in] src_a The first source array containing the first set of operands (128-bit aligned).
- * @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar (128-bit aligned).
+ * @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar
+ * (128-bit aligned).
  * @param[in] scalar The scalar value used in the multiply/add operation.
  */
 template <typename T>
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
index fd0dfb913..a5b183bcc 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
@@ -39,12 +39,8 @@ std::string KernelToString(int kernel_idx) {
  * @return void.
  * */
 void PrintUsage() {
-    std::cout << "Usage: gpu_stream "
-              << "--size <size in bytes> "
-              << "--num_warm_up <num_warm_up> "
-              << "--num_loops <num_loops> "
-              << "[--data_type <float|double>] "
-              << "[--check_data]" << std::endl;
+    std::cout << "Usage: gpu_stream " << "--size <size in bytes> " << "--num_warm_up <num_warm_up> "
+              << "--num_loops <num_loops> " << "[--data_type <float|double>] " << "[--check_data]" << std::endl;
 }
 
 /**

From 5a189460f2bb728dfcc9a66380e2f9bd818cf526 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 19 Dec 2025 23:48:07 +0000
Subject: [PATCH 05/27] fix clang lint

---
 .../micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp    | 8 ++------
 .../micro_benchmarks/gpu_stream/gpu_stream_utils.cpp      | 8 ++++++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
index 6e9e99886..b5ba6a43f 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
@@ -16,12 +16,8 @@
  * - float  -> float4  (4 x 32-bit = 128-bit)
  */
 template <typename T> struct VectorType;
-template <> struct VectorType<double> {
-    using type = double2;
-};
-template <> struct VectorType<float> {
-    using type = float4;
-};
+template <> struct VectorType<double> { using type = double2; };
+template <> struct VectorType<float> { using type = float4; };
 
 template <typename T> using VecT = typename VectorType<T>::type;
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
index a5b183bcc..fd0dfb913 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
@@ -39,8 +39,12 @@ std::string KernelToString(int kernel_idx) {
  * @return void.
  * */
 void PrintUsage() {
-    std::cout << "Usage: gpu_stream " << "--size <size in bytes> " << "--num_warm_up <num_warm_up> "
-              << "--num_loops <num_loops> " << "[--data_type <float|double>] " << "[--check_data]" << std::endl;
+    std::cout << "Usage: gpu_stream "
+              << "--size <size in bytes> "
+              << "--num_warm_up <num_warm_up> "
+              << "--num_loops <num_loops> "
+              << "[--data_type <float|double>] "
+              << "[--check_data]" << std::endl;
 }
 
 /**

From fddf56e46151557671786c6bc008eb651a54da01 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Sat, 20 Dec 2025 00:09:51 +0000
Subject: [PATCH 06/27] update doc

---
 .../user-tutorial/benchmarks/micro-benchmarks.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index aa3aa965b..dbec712dc 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -273,14 +273,14 @@ Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark te
 
 | Metric Name                                                | Unit             | Description                                                                                                                             |
 |------------------------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
-| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw  | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size.                         |
-| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size.                         |
-| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw   | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size.                         |
-| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size.                         |
-| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio  | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size.                         |
-| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size.                         |
-| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio   | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size.                         |
-| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size.                         |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw  | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size.                         |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size.                         |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw   | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size.                         |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size.                         |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio  | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size.                         |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size.                         |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio   | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size.                         |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size.                         |
 
 ### `ib-loopback`
 

From f31933fa9e0bd37a646c1860574b228a0ecb0160 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 5 Feb 2026 16:20:18 -0800
Subject: [PATCH 07/27] fix alloc count & comment

---
 .../user-tutorial/benchmarks/micro-benchmarks.md | 16 ++++++++--------
 .../micro_benchmarks/gpu_stream/gpu_stream.cu    |  8 ++++----
 third_party/gpu-burn                             |  2 +-
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index dbec712dc..2cbbb8419 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -273,14 +273,14 @@ Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark te
 
 | Metric Name                                                | Unit             | Description                                                                                                                             |
 |------------------------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
-| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw  | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size.                         |
-| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size.                         |
-| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw   | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size.                         |
-| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size.                         |
-| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio  | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size.                         |
-| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size.                         |
-| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio   | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size.                         |
-| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size.                         |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw  | bandwidth (GB/s) | The memory bandwidth of the GPU for the copy operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The memory bandwidth of the GPU for the scale operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw   | bandwidth (GB/s) | The memory bandwidth of the GPU for the add operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The memory bandwidth of the GPU for the triad operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio  | Efficiency (%) | The memory bandwidth efficiency of the GPU for the copy operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The memory bandwidth efficiency of the GPU for the scale operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio   | Efficiency (%) | The memory bandwidth efficiency of the GPU for the add operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The memory bandwidth efficiency of the GPU for the triad operation with the selected data type (double for fp64, float for fp32), for the specified buffer size and block size.                         |
 
 ### `ib-loopback`
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 5e5ac90e5..66b891d65 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -236,14 +236,14 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
 
     if (args->check_data) {
         // Generate data to copy - use local NUMA node for best CPU access
-        args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
+        args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size));
 
         for (int j = 0; j < args->size / sizeof(T); j++) {
             args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
         }
 
         // Allocate check buffer on local NUMA node
-        args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
+        args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size));
     }
 
     // Allocate buffers
@@ -257,7 +257,7 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
     // Allocate buffers
     for (auto &buf_ptr : args->sub.gpu_buf_ptrs) {
         T *raw_ptr = nullptr;
-        cuda_err = GpuMallocDataBuf(&raw_ptr, args->size * sizeof(T));
+        cuda_err = GpuMallocDataBuf(&raw_ptr, args->size);
         if (cuda_err != cudaSuccess) {
             std::cerr << "PrepareBufAndStream::cudaMalloc error: " << cuda_err << std::endl;
             return -1;
@@ -425,7 +425,7 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
     constexpr uint64_t kBytesPerThread = 16; // 128-bit aligned access
     uint64_t num_bytes_in_thread_block = num_threads_per_block * kBytesPerThread;
     if (args->size % num_bytes_in_thread_block) {
-        std::cerr << "RunCopy: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
+        std::cerr << "RunStreamKernel: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
         return -1;
     }
     num_thread_blocks = args->size / num_bytes_in_thread_block;
diff --git a/third_party/gpu-burn b/third_party/gpu-burn
index 671f4be92..565e55b46 160000
--- a/third_party/gpu-burn
+++ b/third_party/gpu-burn
@@ -1 +1 @@
-Subproject commit 671f4be92477ce01cd9b536bc534a006dbee058f
+Subproject commit 565e55b46f9885688ba9737f1600b1f62d47a95e

From d8a91ab9a1f80456ffbf54e3af4aaa547ddb5c4e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 6 Feb 2026 11:04:43 -0800
Subject: [PATCH 08/27] fix: reset gpu-burn submodule to correct commit

---
 third_party/gpu-burn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/gpu-burn b/third_party/gpu-burn
index 565e55b46..671f4be92 160000
--- a/third_party/gpu-burn
+++ b/third_party/gpu-burn
@@ -1 +1 @@
-Subproject commit 565e55b46f9885688ba9737f1600b1f62d47a95e
+Subproject commit 671f4be92477ce01cd9b536bc534a006dbee058f

From 6dfdaa6932fc7c54855779a84cc241fa223aa669 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Apr 2026 14:27:57 -0700
Subject: [PATCH 09/27] resolve comments

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu | 126 +++++++++---------
 .../gpu_stream/gpu_stream.hpp                 |   3 +-
 .../gpu_stream/gpu_stream_kernels.hpp         |  30 ++---
 3 files changed, 77 insertions(+), 82 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 66b891d65..47cbc609a 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -10,6 +10,7 @@
 #include <cassert>
 #include <iostream>
 #include <nvml.h>
+#include <sched.h>
 
 /**
  * @brief Destroys the CUDA events used for benchmarking.
@@ -238,7 +239,7 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
         // Generate data to copy - use local NUMA node for best CPU access
         args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size));
 
-        for (int j = 0; j < args->size / sizeof(T); j++) {
+        for (uint64_t j = 0; j < args->size / sizeof(T); j++) {
             args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
         }
 
@@ -437,7 +438,7 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
     }
 
     // Launch jobs and collect running time
-    for (int i = 0; i < args->num_loops + args->num_warm_up; i++) {
+    for (uint64_t i = 0; i < args->num_loops + args->num_warm_up; i++) {
 
         // Record start event once warm up iterations are done
         if (i == args->num_warm_up) {
@@ -586,9 +587,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
     // output formatted results to stdout
     // Tags are of format:
     // STREAM_<Kernelname>_datatype_buffer_<buffer_size>_block_<block_size>
-    for (int i = 0; i < args->sub.times_in_ms.size(); i++) {
+    for (size_t i = 0; i < args->sub.times_in_ms.size(); i++) {
         std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_buffer_" + std::to_string(args->size);
-        for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
+        for (size_t j = 0; j < args->sub.times_in_ms[i].size(); j++) {
             // Calculate and display bandwidth
             double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
             std::cout << tag << "_block_" << kThreadsPerBlock[j] << "\t" << bw << "\t";
@@ -606,6 +607,22 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
     return ret;
 }
 
+/**
+ * @brief Creates and initializes a BenchArgs for the given type and adds it to bench_args_.
+ *
+ * @tparam T The data type (float or double) for the benchmark arguments.
+ */
+template <typename T> void GpuStream::CreateBenchArgs() {
+    auto args = std::make_unique<BenchArgs<T>>();
+    args->gpu_id = 0;
+    cudaGetDeviceProperties(&args->gpu_device_prop, 0);
+    args->num_warm_up = opts_.num_warm_up;
+    args->num_loops = opts_.num_loops;
+    args->size = opts_.size;
+    args->check_data = opts_.check_data;
+    bench_args_ = std::move(args);
+}
+
 /**
  * @brief Runs the Stream benchmark.
  *
@@ -638,65 +655,46 @@ int GpuStream::Run() {
     }
 
     // Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES).
-    if (opts_.data_type == "float") {
-        auto args = std::make_unique<BenchArgs<float>>();
-        args->gpu_id = 0;
-        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
-        args->num_warm_up = opts_.num_warm_up;
-        args->num_loops = opts_.num_loops;
-        args->size = opts_.size;
-        args->check_data = opts_.check_data;
-        bench_args_.emplace_back(std::move(args));
-    } else {
-        auto args = std::make_unique<BenchArgs<double>>();
-        args->gpu_id = 0;
-        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
-        args->num_warm_up = opts_.num_warm_up;
-        args->num_loops = opts_.num_loops;
-        args->size = opts_.size;
-        args->check_data = opts_.check_data;
-        bench_args_.emplace_back(std::move(args));
-    }
-
-    bool has_error = false;
-    // Run the benchmark for all the configured data
-    for (auto &variant_args : bench_args_) {
-        std::visit(
-            [&](auto &curr_args) {
-                // Get memory clock rate once for both bandwidth computation and display
-                float memory_clock_mhz = GetMemoryClockRate(curr_args->gpu_id, curr_args->gpu_device_prop);
-
-                // Compute theoretical bandwidth using the memory clock rate
-                float peak_bw = -1.0f;
-                if (memory_clock_mhz > 0.0f) {
-                    // Calculate theoretical bandwidth: memory_clock_mhz * bus_width_bytes * 2 (DDR) / 1000 (convert to
-                    // GB/s)
-                    peak_bw = memory_clock_mhz * (curr_args->gpu_device_prop.memoryBusWidth / 8) * 2 / 1000.0;
-                }
-
-                // Print device info with both the memory clock and peak bandwidth
-                PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);
-
-                // Run the stream benchmark for the configured data, passing the peak bandwidth
-                if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
-                    ret = RunStream<float>(curr_args, "float", peak_bw);
-                } else if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<double>>) {
-                    ret = RunStream<double>(curr_args, "double", peak_bw);
-                } else {
-                    std::cerr << "Run::Unknown type error" << std::endl;
-                    has_error = true;
-                    return;
-                }
-
-                if (ret != 0) {
-                    std::cerr << "Run::RunStream error: " << errno << std::endl;
-                    has_error = true;
-                }
-            },
-            variant_args);
-    }
-    if (has_error) {
-        return -1;
-    }
+    opts_.data_type == "float" ? CreateBenchArgs<float>() : CreateBenchArgs<double>();
+
+    // Pin the thread to its local NUMA node to prevent migration,
+    // ensuring numa_alloc_local buffers remain node-local.
+    int local_node = numa_node_of_cpu(sched_getcpu());
+    numa_run_on_node(local_node);
+
+    // Run the benchmark for the configured data
+    std::visit(
+        [&](auto &curr_args) {
+            // Get memory clock rate once for both bandwidth computation and display
+            float memory_clock_mhz = GetMemoryClockRate(curr_args->gpu_id, curr_args->gpu_device_prop);
+
+            // Compute theoretical bandwidth using the memory clock rate
+            float peak_bw = -1.0f;
+            if (memory_clock_mhz > 0.0f) {
+                // Calculate theoretical bandwidth: memory_clock_mhz * bus_width_bytes * 2 (DDR) / 1000 (convert to
+                // GB/s)
+                peak_bw = memory_clock_mhz * (curr_args->gpu_device_prop.memoryBusWidth / 8) * 2 / 1000.0;
+            }
+
+            // Print device info with both the memory clock and peak bandwidth
+            PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);
+
+            // Run the stream benchmark for the configured data, passing the peak bandwidth
+            if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
+                ret = RunStream<float>(curr_args, "float", peak_bw);
+            } else if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<double>>) {
+                ret = RunStream<double>(curr_args, "double", peak_bw);
+            } else {
+                std::cerr << "Run::Unknown type error" << std::endl;
+                ret = -1;
+                return;
+            }
+
+            if (ret != 0) {
+                std::cerr << "Run::RunStream error: " << errno << std::endl;
+            }
+        },
+        bench_args_);
+
     return ret;
 }
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
index 754888339..c4a472e81 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
@@ -35,7 +35,7 @@ class GpuStream {
 
   private:
     using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<float>>, std::unique_ptr<BenchArgs<double>>>;
-    std::vector<BenchArgsVariant> bench_args_;
+    BenchArgsVariant bench_args_;
     Opts opts_;
 
     // Memory management functions
@@ -56,6 +56,7 @@ class GpuStream {
     template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type, float peak_bw);
 
     // Helper functions
+    template <typename T> void CreateBenchArgs();
     int GetGpuCount(int *);
     int SetGpu(int gpu_id);
     float GetMemoryClockRate(int device_id, const cudaDeviceProp &prop);
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
index b5ba6a43f..cebaf6eac 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
@@ -16,8 +16,12 @@
  * - float  -> float4  (4 x 32-bit = 128-bit)
  */
 template <typename T> struct VectorType;
-template <> struct VectorType<double> { using type = double2; };
-template <> struct VectorType<float> { using type = float4; };
+template <> struct VectorType<double> {
+    using type = double2;
+};
+template <> struct VectorType<float> {
+    using type = float4;
+};
 
 template <typename T> using VecT = typename VectorType<T>::type;
 
@@ -49,11 +53,7 @@ template <typename T> inline __device__ void Fetch(T &v, const T *p) {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
     v = *p;
 #else
-    if constexpr (std::is_same<T, float>::value) {
-        asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(v) : "l"(p) : "memory");
-    } else if constexpr (std::is_same<T, double>::value) {
-        asm volatile("ld.volatile.global.f64 %0, [%1];" : "=d"(v) : "l"(p) : "memory");
-    } else if constexpr (std::is_same<T, double2>::value) {
+    if constexpr (std::is_same<T, double2>::value) {
         asm volatile("ld.volatile.global.v2.f64 {%0,%1}, [%2];" : "=d"(v.x), "=d"(v.y) : "l"(p) : "memory");
     } else if constexpr (std::is_same<T, float4>::value) {
         asm volatile("ld.volatile.global.v4.f32 {%0,%1,%2,%3}, [%4];"
@@ -82,11 +82,7 @@ template <typename T> inline __device__ void Store(T *p, const T &v) {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
     *p = v;
 #else
-    if constexpr (std::is_same<T, float>::value) {
-        asm volatile("st.volatile.global.f32 [%0], %1;" ::"l"(p), "f"(v) : "memory");
-    } else if constexpr (std::is_same<T, double>::value) {
-        asm volatile("st.volatile.global.f64 [%0], %1;" ::"l"(p), "d"(v) : "memory");
-    } else if constexpr (std::is_same<T, double2>::value) {
+    if constexpr (std::is_same<T, double2>::value) {
         asm volatile("st.volatile.global.v2.f64 [%0], {%1,%2};" ::"l"(p), "d"(v.x), "d"(v.y) : "memory");
     } else if constexpr (std::is_same<T, float4>::value) {
         asm volatile("st.volatile.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(p), "f"(v.x), "f"(v.y), "f"(v.z), "f"(v.w)
@@ -166,17 +162,17 @@ template <typename T> __global__ void AddKernel(VecT<T> *tgt, const VecT<T> *src
 }
 
 /**
- * @brief Performs TRIAD, fused multiply/add operations on source arrays. a = b + x * c
+ * @brief Performs TRIAD, fused multiply/add operations on source arrays. c = b + x * a
  *
  * @details This CUDA kernel performs a fused multiply/add operation by multiplying elements from
- * the second source array with a scalar value, adding the result to corresponding elements from
- * the first source array, and storing the result in the target array.
+ * the first source array with a scalar value, adding the result to corresponding elements from
+ * the second source array, and storing the result in the target array.
  *
  * @param[out] tgt The target array where the result of the fused multiply/add operation will be stored (128-bit
  * aligned).
- * @param[in] src_a The first source array containing the first set of operands (128-bit aligned).
- * @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar
+ * @param[in] src_a The first source array containing the first set of operands to be multiplied by the scalar
  * (128-bit aligned).
+ * @param[in] src_b The second source array containing the second set of operands (128-bit aligned).
  * @param[in] scalar The scalar value used in the multiply/add operation.
  */
 template <typename T>

From e3232f5b6f4d2a2376458b3507b746a4106d9574 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Apr 2026 14:59:18 -0700
Subject: [PATCH 10/27] fix lint

---
 .../micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp    | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
index cebaf6eac..626f96227 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
@@ -16,12 +16,8 @@
  * - float  -> float4  (4 x 32-bit = 128-bit)
  */
 template <typename T> struct VectorType;
-template <> struct VectorType<double> {
-    using type = double2;
-};
-template <> struct VectorType<float> {
-    using type = float4;
-};
+template <> struct VectorType<double> { using type = double2; };
+template <> struct VectorType<float> { using type = float4; };
 
 template <typename T> using VecT = typename VectorType<T>::type;
 

From 58fead34d097557a62cbf6356006cd03ebb3ca5f Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Apr 2026 15:20:55 -0700
Subject: [PATCH 11/27] resolve comment

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu | 16 +++++++++++++--
 .../micro_benchmarks/test_gpu_stream.py       | 20 ++++++++++++++-----
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 47cbc609a..2bb1b2534 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -659,8 +659,20 @@ int GpuStream::Run() {
 
     // Pin the thread to its local NUMA node to prevent migration,
     // ensuring numa_alloc_local buffers remain node-local.
-    int local_node = numa_node_of_cpu(sched_getcpu());
-    numa_run_on_node(local_node);
+    int cpu = sched_getcpu();
+    if (cpu < 0) {
+        std::cerr << "Run::sched_getcpu failed" << std::endl;
+        return -1;
+    }
+    int local_node = numa_node_of_cpu(cpu);
+    if (local_node < 0) {
+        std::cerr << "Run::numa_node_of_cpu failed for cpu " << cpu << std::endl;
+        return -1;
+    }
+    if (numa_run_on_node(local_node) != 0) {
+        std::cerr << "Run::numa_run_on_node failed for node " << local_node << std::endl;
+        return -1;
+    }
 
     // Run the benchmark for the configured data
     std::visit(
diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
index 7f58fe461..d4bb6f8d6 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
@@ -20,7 +20,7 @@ def setUpClass(cls):
         cls.createMockEnvs(cls)
         cls.createMockFiles(cls, ['bin/gpu_stream'])
 
-    def _test_gpu_stream_command_generation(self, platform):
+    def _test_gpu_stream_command_generation(self, platform, data_type='double'):
         """Test gpu-stream benchmark command generation."""
         benchmark_name = 'gpu-stream'
         (benchmark_class,
@@ -31,9 +31,9 @@ def _test_gpu_stream_command_generation(self, platform):
         num_loops = 10
         size = 25769803776
 
-        parameters = '--num_warm_up %d --num_loops %d --size %d --data_type double ' \
+        parameters = '--num_warm_up %d --num_loops %d --size %d --data_type %s ' \
             '--check_data' % \
-            (num_warm_up, num_loops, size)
+            (num_warm_up, num_loops, size, data_type)
         benchmark = benchmark_class(benchmark_name, parameters=parameters)
 
         # Check basic information
@@ -49,7 +49,7 @@ def _test_gpu_stream_command_generation(self, platform):
         assert (benchmark._args.num_warm_up == num_warm_up)
         assert (benchmark._args.num_loops == num_loops)
         assert (benchmark._args.check_data)
-        assert (benchmark._args.data_type == 'double')
+        assert (benchmark._args.data_type == data_type)
 
         # Check command
         assert (1 == len(benchmark._commands))
@@ -57,7 +57,7 @@ def _test_gpu_stream_command_generation(self, platform):
         assert ('--size %d' % size in benchmark._commands[0])
         assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
         assert ('--num_loops %d' % num_loops in benchmark._commands[0])
-        assert ('--data_type double' in benchmark._commands[0])
+        assert ('--data_type %s' % data_type in benchmark._commands[0])
         assert ('--check_data' in benchmark._commands[0])
 
     @decorator.cuda_test
@@ -65,11 +65,21 @@ def test_gpu_stream_command_generation_cuda(self):
         """Test gpu-stream benchmark command generation, CUDA case."""
         self._test_gpu_stream_command_generation(Platform.CUDA)
 
+    @decorator.cuda_test
+    def test_gpu_stream_command_generation_cuda_float(self):
+        """Test gpu-stream benchmark command generation with float, CUDA case."""
+        self._test_gpu_stream_command_generation(Platform.CUDA, data_type='float')
+
     @decorator.rocm_test
     def test_gpu_stream_command_generation_rocm(self):
         """Test gpu-stream benchmark command generation, ROCm case."""
         self._test_gpu_stream_command_generation(Platform.ROCM)
 
+    @decorator.rocm_test
+    def test_gpu_stream_command_generation_rocm_float(self):
+        """Test gpu-stream benchmark command generation with float, ROCm case."""
+        self._test_gpu_stream_command_generation(Platform.ROCM, data_type='float')
+
     @decorator.load_data('tests/data/gpu_stream.log')
     def _test_gpu_stream_result_parsing(self, platform, test_raw_output):
         """Test gpu-stream benchmark result parsing."""

From 01c7454bdf5f3f881e06118be3b30659bc25f9e7 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 20 May 2026 20:14:37 +0000
Subject: [PATCH 12/27] resolve comments

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu | 90 +++++++++++++++----
 .../gpu_stream/gpu_stream.hpp                 |  2 +-
 .../micro_benchmarks/test_gpu_stream.py       | 41 +++++++++
 tests/data/gpu_stream_float.log               | 24 +++++
 4 files changed, 137 insertions(+), 20 deletions(-)
 create mode 100644 tests/data/gpu_stream_float.log

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 2bb1b2534..a8791cc7b 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -238,6 +238,10 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
     if (args->check_data) {
         // Generate data to copy - use local NUMA node for best CPU access
         args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size));
+        if (args->sub.data_buf == nullptr) {
+            std::cerr << "PrepareBufAndStream::numa_alloc_local data_buf failed" << std::endl;
+            return -1;
+        }
 
         for (uint64_t j = 0; j < args->size / sizeof(T); j++) {
             args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
@@ -245,6 +249,12 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
 
         // Allocate check buffer on local NUMA node
         args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size));
+        if (args->sub.check_buf == nullptr) {
+            std::cerr << "PrepareBufAndStream::numa_alloc_local check_buf failed" << std::endl;
+            numa_free(args->sub.data_buf, args->size);
+            args->sub.data_buf = nullptr;
+            return -1;
+        }
     }
 
     // Allocate buffers
@@ -351,7 +361,12 @@ template <typename T> int GpuStream::CheckBuf(std::unique_ptr<BenchArgs<T>> &arg
         return -1;
     }
 
-    // Validate result by comparing the data buffer and check buffer
+    // Validate result by comparing the data buffer and check buffer.
+    // NOTE: memcmp is exact (byte-for-byte). This works because the current test values
+    // (j % 256, scalar = 11.0) are exactly representable in both float and double IEEE-754.
+    // If kUInt8Mod or scalar are changed to values that cause rounding differences between
+    // host (two separate ops) and GPU (FMA), this check will need a tolerance-based comparison
+    // for T = float.
     memcmp_result = memcmp(args->sub.validation_buf_ptrs[kernel_idx].data(), args->sub.check_buf, args->size);
     if (memcmp_result) {
         std::cerr << "CheckBuf::Memory check failed for kernel index " << kernel_idx << std::endl;
@@ -420,10 +435,16 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
     uint64_t num_thread_blocks;
     int size_factor = 2;
 
+    if (num_threads_per_block == 0) {
+        std::cerr << "RunStreamKernel::num_threads_per_block must be > 0" << std::endl;
+        return -1;
+    }
+
     // Validate data size
-    // Each thread processes 128 bits (16 bytes) for optimal memory bandwidth.
-    // For double: uses double2 (16 bytes). For float: would use float4 (16 bytes).
-    constexpr uint64_t kBytesPerThread = 16; // 128-bit aligned access
+    // Each thread processes one VecT<T> element (128 bits / 16 bytes) for optimal memory bandwidth.
+    // Derived from VecT<T> so any vector type change is caught at compile time.
+    constexpr uint64_t kBytesPerThread = sizeof(VecT<T>);
+    static_assert(kBytesPerThread == 16, "Vector type must be 128-bit aligned for current PTX");
     uint64_t num_bytes_in_thread_block = num_threads_per_block * kBytesPerThread;
     if (args->size % num_bytes_in_thread_block) {
         std::cerr << "RunStreamKernel: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
@@ -612,15 +633,20 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
  *
  * @tparam T The data type (float or double) for the benchmark arguments.
  */
-template <typename T> void GpuStream::CreateBenchArgs() {
+template <typename T> int GpuStream::CreateBenchArgs() {
     auto args = std::make_unique<BenchArgs<T>>();
     args->gpu_id = 0;
-    cudaGetDeviceProperties(&args->gpu_device_prop, 0);
+    cudaError_t cuda_err = cudaGetDeviceProperties(&args->gpu_device_prop, 0);
+    if (cuda_err != cudaSuccess) {
+        std::cerr << "CreateBenchArgs::cudaGetDeviceProperties error: " << cuda_err << std::endl;
+        return -1;
+    }
     args->num_warm_up = opts_.num_warm_up;
     args->num_loops = opts_.num_loops;
     args->size = opts_.size;
     args->check_data = opts_.check_data;
     bench_args_ = std::move(args);
+    return 0;
 }
 
 /**
@@ -655,22 +681,48 @@ int GpuStream::Run() {
     }
 
     // Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES).
-    opts_.data_type == "float" ? CreateBenchArgs<float>() : CreateBenchArgs<double>();
-
-    // Pin the thread to its local NUMA node to prevent migration,
-    // ensuring numa_alloc_local buffers remain node-local.
-    int cpu = sched_getcpu();
-    if (cpu < 0) {
-        std::cerr << "Run::sched_getcpu failed" << std::endl;
+    if (opts_.data_type == "float") {
+        ret = CreateBenchArgs<float>();
+    } else if (opts_.data_type == "double") {
+        ret = CreateBenchArgs<double>();
+    } else {
+        std::cerr << "Run::Invalid data_type: " << opts_.data_type << std::endl;
         return -1;
     }
-    int local_node = numa_node_of_cpu(cpu);
-    if (local_node < 0) {
-        std::cerr << "Run::numa_node_of_cpu failed for cpu " << cpu << std::endl;
-        return -1;
+    if (ret != 0) {
+        return ret;
+    }
+
+    // Pin the thread to the GPU's NUMA node for optimal host↔device bandwidth.
+    // Query GPU 0's preferred CPU NUMA node via NVML; fall back to the process's
+    // current node if the NVML query fails (e.g. NUMA disabled, older driver).
+    int target_node = -1;
+    {
+        nvmlDevice_t nvml_dev;
+        unsigned int gpu_numa_node = 0;
+        if (nvmlInit() == NVML_SUCCESS) {
+            if (nvmlDeviceGetHandleByIndex(0, &nvml_dev) == NVML_SUCCESS &&
+                nvmlDeviceGetNumaNodeId(nvml_dev, &gpu_numa_node) == NVML_SUCCESS) {
+                target_node = static_cast<int>(gpu_numa_node);
+            }
+            nvmlShutdown();
+        }
+    }
+    if (target_node < 0) {
+        // Fallback: use the node where this process is currently scheduled
+        int cpu = sched_getcpu();
+        if (cpu < 0) {
+            std::cerr << "Run::sched_getcpu failed" << std::endl;
+            return -1;
+        }
+        target_node = numa_node_of_cpu(cpu);
+        if (target_node < 0) {
+            std::cerr << "Run::numa_node_of_cpu failed for cpu " << cpu << std::endl;
+            return -1;
+        }
     }
-    if (numa_run_on_node(local_node) != 0) {
-        std::cerr << "Run::numa_run_on_node failed for node " << local_node << std::endl;
+    if (numa_run_on_node(target_node) != 0) {
+        std::cerr << "Run::numa_run_on_node failed for node " << target_node << std::endl;
         return -1;
     }
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
index c4a472e81..65502a243 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.hpp
@@ -56,7 +56,7 @@ class GpuStream {
     template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type, float peak_bw);
 
     // Helper functions
-    template <typename T> void CreateBenchArgs();
+    template <typename T> int CreateBenchArgs();
     int GetGpuCount(int *);
     int SetGpu(int gpu_id);
     float GetMemoryClockRate(int device_id, const cudaDeviceProp &prop);
diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
index 937cce941..bf37074e3 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
@@ -117,3 +117,44 @@ def _test_gpu_stream_result_parsing(self, platform, test_raw_output):
     def test_gpu_stream_result_parsing_cuda(self):
         """Test gpu-stream benchmark result parsing, CUDA case."""
         self._test_gpu_stream_result_parsing(Platform.CUDA)
+
+    @decorator.load_data('tests/data/gpu_stream_float.log')
+    def _test_gpu_stream_result_parsing_float(self, platform, test_raw_output):
+        """Test gpu-stream benchmark result parsing for float data type."""
+        benchmark_name = 'gpu-stream'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
+        assert (benchmark_class)
+        benchmark = benchmark_class(benchmark_name, parameters='--data_type float')
+        assert (benchmark)
+        ret = benchmark._preprocess()
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+
+        # Positive case - valid raw output with float tags.
+        assert (benchmark._process_raw_result(0, test_raw_output))
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+
+        assert (1 == len(benchmark.raw_data))
+        test_raw_output_dict = {
+            x.split()[0]: [float(x.split()[1]), float(x.split()[2])]
+            for x in test_raw_output.strip().splitlines() if x.startswith('STREAM_')
+        }
+        assert (len(test_raw_output_dict) * 2 + benchmark.default_metric_count == len(benchmark.result))
+        for output_key in benchmark.result:
+            if output_key == 'return_code':
+                assert (benchmark.result[output_key] == [0])
+            else:
+                assert (len(benchmark.result[output_key]) == 1)
+                assert (isinstance(benchmark.result[output_key][0], numbers.Number))
+                if output_key.endswith('_bw'):
+                    assert (output_key.strip('_bw') in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key.strip('_bw')][0] == benchmark.result[output_key][0])
+                else:
+                    assert (output_key.strip('_ratio') in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key.strip('_ratio')][1] == benchmark.result[output_key][0])
+
+    @decorator.cuda_test
+    def test_gpu_stream_result_parsing_cuda_float(self):
+        """Test gpu-stream benchmark result parsing for float, CUDA case."""
+        self._test_gpu_stream_result_parsing_float(Platform.CUDA)
diff --git a/tests/data/gpu_stream_float.log b/tests/data/gpu_stream_float.log
new file mode 100644
index 000000000..d1ab3ad1e
--- /dev/null
+++ b/tests/data/gpu_stream_float.log
@@ -0,0 +1,24 @@
+STREAM Benchmark
+Buffer size(bytes): 4294967296
+Number of warm up runs: 10
+Number of loops: 40
+Data type: float
+Check data: No
+
+Device 0: "NVIDIA Graphics Device"  152 SMs(10.0)  Memory: 4000MHz x 8192-bit = 8192 GB/s PEAK   ECC is ON
+STREAM_COPY_float_buffer_4294967296_block_128	6823.45	83.30
+STREAM_COPY_float_buffer_4294967296_block_256	6650.12	81.18
+STREAM_COPY_float_buffer_4294967296_block_512	6301.88	76.93
+STREAM_COPY_float_buffer_4294967296_block_1024	5812.34	70.95
+STREAM_SCALE_float_buffer_4294967296_block_128	6790.11	82.89
+STREAM_SCALE_float_buffer_4294967296_block_256	6620.33	80.81
+STREAM_SCALE_float_buffer_4294967296_block_512	6210.45	75.81
+STREAM_SCALE_float_buffer_4294967296_block_1024	5718.90	69.81
+STREAM_ADD_float_buffer_4294967296_block_128	7490.22	91.43
+STREAM_ADD_float_buffer_4294967296_block_256	7512.10	91.70
+STREAM_ADD_float_buffer_4294967296_block_512	7405.67	90.40
+STREAM_ADD_float_buffer_4294967296_block_1024	6890.33	84.11
+STREAM_TRIAD_float_buffer_4294967296_block_128	7485.55	91.38
+STREAM_TRIAD_float_buffer_4294967296_block_256	7520.88	91.81
+STREAM_TRIAD_float_buffer_4294967296_block_512	7390.12	90.21
+STREAM_TRIAD_float_buffer_4294967296_block_1024	6825.11	83.32

From 450a28dc1a39b514c3cf61de6a50604fecdf06c0 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 20 May 2026 20:20:05 +0000
Subject: [PATCH 13/27] refine doc

---
 docs/user-tutorial/benchmarks/micro-benchmarks.md    | 4 +++-
 superbench/benchmarks/micro_benchmarks/gpu_stream.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 2cbbb8419..7a259eb2b 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -267,7 +267,9 @@ For measurements of peer-to-peer communication performance between AMD GPUs, GPU
 
 #### Introduction
 
-Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark tests various memory operations including copy, scale, add, and triad for double datatype.
+Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark tests various memory operations including copy, scale, add, and triad for double and float datatypes.
+
+__Note__: When `--check_data` is enabled, each process allocates 2× `--size` bytes of host memory for validation buffers (e.g. 8 GiB with the default 4 GiB `--size`). Under `default_local_mode` with 8 GPUs this totals ~64 GiB of host RAM. Recommend using a small `--size` such as `1048576` (1 MiB) when `--check_data` is enabled.
 
 #### Metrics
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream.py b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
index aaf61191d..e8f48fa5a 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
@@ -63,7 +63,9 @@ def add_parser_arguments(self):
         self._parser.add_argument(
             '--check_data',
             action='store_true',
-            help='Enable data checking',
+            help='Enable data checking. Note: allocates 2x --size bytes of host memory per process '
+                 'for validation buffers (e.g. 8 GiB with default 4 GiB --size). '
+                 'Recommend using a small --size such as 1048576 (1 MiB) when this flag is enabled.',
         )
 
     def _preprocess(self):

From 9cffad8d3260002c8368a82f20222d238277bf8c Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 20 May 2026 17:17:49 -0700
Subject: [PATCH 14/27] fix lint

---
 superbench/benchmarks/micro_benchmarks/gpu_stream.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream.py b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
index e8f48fa5a..2a3d38421 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
@@ -64,8 +64,8 @@ def add_parser_arguments(self):
             '--check_data',
             action='store_true',
             help='Enable data checking. Note: allocates 2x --size bytes of host memory per process '
-                 'for validation buffers (e.g. 8 GiB with default 4 GiB --size). '
-                 'Recommend using a small --size such as 1048576 (1 MiB) when this flag is enabled.',
+            'for validation buffers (e.g. 8 GiB with default 4 GiB --size). '
+            'Recommend using a small --size such as 1048576 (1 MiB) when this flag is enabled.',
         )
 
     def _preprocess(self):

From fe00d1ae37cfac3a8e4b82282ca24da8516ffab2 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 20 May 2026 17:26:28 -0700
Subject: [PATCH 15/27] resolve comment

---
 superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt | 1 +
 .../micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp           | 1 +
 2 files changed, 2 insertions(+)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
index 0af80e4e1..342c11623 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
@@ -38,6 +38,7 @@ set(SOURCES
 include(../cuda_common.cmake)
 add_executable(gpu_stream ${SOURCES})
 set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
+target_compile_definitions(gpu_stream PRIVATE _GNU_SOURCE)
 target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
 target_link_libraries(gpu_stream numa ${NVML_LIBRARY})
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
index 626f96227..8702b1421 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_kernels.hpp
@@ -5,6 +5,7 @@
 
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <type_traits>
 
 #include "gpu_stream_utils.hpp"
 

From ea3fd8e95655db3ba193e8d66d92ee8caedcd455 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 21 May 2026 18:13:00 +0000
Subject: [PATCH 16/27] fix cuda11.1 build

---
 .../benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu       | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index a8791cc7b..684e1eb1e 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -697,6 +697,8 @@ int GpuStream::Run() {
     // Query GPU 0's preferred CPU NUMA node via NVML; fall back to the process's
     // current node if the NVML query fails (e.g. NUMA disabled, older driver).
     int target_node = -1;
+#if CUDA_VERSION >= 11050
+    // nvmlDeviceGetNumaNodeId is available in NVML shipped with CUDA 11.5+
     {
         nvmlDevice_t nvml_dev;
         unsigned int gpu_numa_node = 0;
@@ -708,6 +710,7 @@ int GpuStream::Run() {
             nvmlShutdown();
         }
     }
+#endif
     if (target_node < 0) {
         // Fallback: use the node where this process is currently scheduled
         int cpu = sched_getcpu();

From 2b6ea7ec73f3bdf43a7be5f5ab24b7ea9e24bba6 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 21 May 2026 18:20:47 +0000
Subject: [PATCH 17/27] fix doc

---
 docs/user-tutorial/benchmarks/micro-benchmarks.md    | 2 +-
 superbench/benchmarks/micro_benchmarks/gpu_stream.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 7a259eb2b..ae370c584 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -269,7 +269,7 @@ For measurements of peer-to-peer communication performance between AMD GPUs, GPU
 
 Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark tests various memory operations including copy, scale, add, and triad for double and float datatypes.
 
-__Note__: When `--check_data` is enabled, each process allocates 2× `--size` bytes of host memory for validation buffers (e.g. 8 GiB with the default 4 GiB `--size`). Under `default_local_mode` with 8 GPUs this totals ~64 GiB of host RAM. Recommend using a small `--size` such as `1048576` (1 MiB) when `--check_data` is enabled.
+__Note__: When `--check_data` is enabled, each process allocates 6× `--size` bytes of host memory (data\_buf + check\_buf + 4 validation buffers, e.g. 24 GiB with the default 4 GiB `--size`). Under `default_local_mode` with 8 GPUs this totals ~192 GiB of host RAM. Recommend using a small `--size` such as `1048576` (1 MiB) when `--check_data` is enabled.
 
 #### Metrics
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream.py b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
index 2a3d38421..985f16039 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
@@ -63,9 +63,9 @@ def add_parser_arguments(self):
         self._parser.add_argument(
             '--check_data',
             action='store_true',
-            help='Enable data checking. Note: allocates 2x --size bytes of host memory per process '
-            'for validation buffers (e.g. 8 GiB with default 4 GiB --size). '
-            'Recommend using a small --size such as 1048576 (1 MiB) when this flag is enabled.',
+            help='Enable data checking. Note: allocates 6x --size bytes of host memory per process '
+            '(data_buf + check_buf + 4 validation buffers, e.g. 24 GiB with default 4 GiB --size). '
+            'Recommend using a small --size such as 1048576 (1 MiB) when this flag is enabled.',,
         )
 
     def _preprocess(self):

From 0fd405c0013aac603668fc68e0bb9da047ec9d04 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 21 May 2026 18:23:57 +0000
Subject: [PATCH 18/27] resolve comments

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu    |  2 +-
 .../micro_benchmarks/test_gpu_stream.py          | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 684e1eb1e..74a3e2524 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -758,7 +758,7 @@ int GpuStream::Run() {
             }
 
             if (ret != 0) {
-                std::cerr << "Run::RunStream error: " << errno << std::endl;
+                std::cerr << "Run::RunStream error: " << ret << std::endl;
             }
         },
         bench_args_);
diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
index bf37074e3..eb359f6db 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
@@ -103,11 +103,11 @@ def _test_gpu_stream_result_parsing(self, platform, test_raw_output):
                 assert (len(benchmark.result[output_key]) == 1)
                 assert (isinstance(benchmark.result[output_key][0], numbers.Number))
                 if output_key.endswith('_bw'):
-                    assert (output_key.strip('_bw') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.strip('_bw')][0] == benchmark.result[output_key][0])
+                    assert (output_key.removesuffix('_bw') in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key.removesuffix('_bw')][0] == benchmark.result[output_key][0])
                 else:
-                    assert (output_key.strip('_ratio') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.strip('_ratio')][1] == benchmark.result[output_key][0])
+                    assert (output_key.removesuffix('_ratio') in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0])
 
         # Negative case - invalid raw output.
         assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
@@ -148,11 +148,11 @@ def _test_gpu_stream_result_parsing_float(self, platform, test_raw_output):
                 assert (len(benchmark.result[output_key]) == 1)
                 assert (isinstance(benchmark.result[output_key][0], numbers.Number))
                 if output_key.endswith('_bw'):
-                    assert (output_key.strip('_bw') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.strip('_bw')][0] == benchmark.result[output_key][0])
+                    assert (output_key.removesuffix('_bw') in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key.removesuffix('_bw')][0] == benchmark.result[output_key][0])
                 else:
-                    assert (output_key.strip('_ratio') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.strip('_ratio')][1] == benchmark.result[output_key][0])
+                    assert (output_key.removesuffix('_ratio') in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0])
 
     @decorator.cuda_test
     def test_gpu_stream_result_parsing_cuda_float(self):

From 417375972e528ce0ead563fbc2bc6f08bfb81115 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 21 May 2026 11:44:40 -0700
Subject: [PATCH 19/27] fix syntax

---
 superbench/benchmarks/micro_benchmarks/gpu_stream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream.py b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
index 985f16039..142acb5a9 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream.py
@@ -65,7 +65,7 @@ def add_parser_arguments(self):
             action='store_true',
             help='Enable data checking. Note: allocates 6x --size bytes of host memory per process '
             '(data_buf + check_buf + 4 validation buffers, e.g. 24 GiB with default 4 GiB --size). '
-            'Recommend using a small --size such as 1048576 (1 MiB) when this flag is enabled.',,
+            'Recommend using a small --size such as 1048576 (1 MiB) when this flag is enabled.',
         )
 
     def _preprocess(self):

From ee52086b8a8bcad0a56dc2b354e9ba2fc5f4cde0 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 21 May 2026 12:07:45 -0700
Subject: [PATCH 20/27] fix lint

---
 tests/benchmarks/micro_benchmarks/test_gpu_stream.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
index eb359f6db..6a98109f1 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
@@ -107,7 +107,9 @@ def _test_gpu_stream_result_parsing(self, platform, test_raw_output):
                     assert (test_raw_output_dict[output_key.removesuffix('_bw')][0] == benchmark.result[output_key][0])
                 else:
                     assert (output_key.removesuffix('_ratio') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0])
+                    assert (
+                        test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0]
+                    )
 
         # Negative case - invalid raw output.
         assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
@@ -152,7 +154,9 @@ def _test_gpu_stream_result_parsing_float(self, platform, test_raw_output):
                     assert (test_raw_output_dict[output_key.removesuffix('_bw')][0] == benchmark.result[output_key][0])
                 else:
                     assert (output_key.removesuffix('_ratio') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0])
+                    assert (
+                        test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0]
+                    )
 
     @decorator.cuda_test
     def test_gpu_stream_result_parsing_cuda_float(self):

From 80d5f0a18e2284e8bed207122346a82a5abfffa7 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 21 May 2026 12:58:50 -0700
Subject: [PATCH 21/27] resolve comment

---
 .../micro_benchmarks/test_gpu_stream.py       | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
index 6a98109f1..d9d836a92 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_stream.py
@@ -103,13 +103,12 @@ def _test_gpu_stream_result_parsing(self, platform, test_raw_output):
                 assert (len(benchmark.result[output_key]) == 1)
                 assert (isinstance(benchmark.result[output_key][0], numbers.Number))
                 if output_key.endswith('_bw'):
-                    assert (output_key.removesuffix('_bw') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.removesuffix('_bw')][0] == benchmark.result[output_key][0])
+                    assert (output_key[:-3] in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key[:-3]][0] == benchmark.result[output_key][0])
                 else:
-                    assert (output_key.removesuffix('_ratio') in test_raw_output_dict)
-                    assert (
-                        test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0]
-                    )
+                    assert (output_key.endswith('_ratio'))
+                    assert (output_key[:-6] in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key[:-6]][1] == benchmark.result[output_key][0])
 
         # Negative case - invalid raw output.
         assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
@@ -150,13 +149,12 @@ def _test_gpu_stream_result_parsing_float(self, platform, test_raw_output):
                 assert (len(benchmark.result[output_key]) == 1)
                 assert (isinstance(benchmark.result[output_key][0], numbers.Number))
                 if output_key.endswith('_bw'):
-                    assert (output_key.removesuffix('_bw') in test_raw_output_dict)
-                    assert (test_raw_output_dict[output_key.removesuffix('_bw')][0] == benchmark.result[output_key][0])
+                    assert (output_key[:-3] in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key[:-3]][0] == benchmark.result[output_key][0])
                 else:
-                    assert (output_key.removesuffix('_ratio') in test_raw_output_dict)
-                    assert (
-                        test_raw_output_dict[output_key.removesuffix('_ratio')][1] == benchmark.result[output_key][0]
-                    )
+                    assert (output_key.endswith('_ratio'))
+                    assert (output_key[:-6] in test_raw_output_dict)
+                    assert (test_raw_output_dict[output_key[:-6]][1] == benchmark.result[output_key][0])
 
     @decorator.cuda_test
     def test_gpu_stream_result_parsing_cuda_float(self):

From 5e887ff6c6b2ac4a1a9d6fef791b81da92900e5b Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 22 May 2026 17:13:23 +0000
Subject: [PATCH 22/27] fix nvmldevicegetnumanodeid error

---
 .../gpu_stream/CMakeLists.txt                 |  2 +-
 .../micro_benchmarks/gpu_stream/gpu_stream.cu | 25 +++++++++++--------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
index 342c11623..a587fc9c2 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/CMakeLists.txt
@@ -40,6 +40,6 @@ add_executable(gpu_stream ${SOURCES})
 set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
 target_compile_definitions(gpu_stream PRIVATE _GNU_SOURCE)
 target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
-target_link_libraries(gpu_stream numa ${NVML_LIBRARY})
+target_link_libraries(gpu_stream numa ${NVML_LIBRARY} ${CMAKE_DL_LIBS})
 
 install(TARGETS gpu_stream RUNTIME DESTINATION bin)
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 74a3e2524..5b5258aec 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -8,6 +8,7 @@
 
 #include "gpu_stream.hpp"
 #include <cassert>
+#include <dlfcn.h>
 #include <iostream>
 #include <nvml.h>
 #include <sched.h>
@@ -697,20 +698,24 @@ int GpuStream::Run() {
     // Query GPU 0's preferred CPU NUMA node via NVML; fall back to the process's
     // current node if the NVML query fails (e.g. NUMA disabled, older driver).
     int target_node = -1;
-#if CUDA_VERSION >= 11050
-    // nvmlDeviceGetNumaNodeId is available in NVML shipped with CUDA 11.5+
+    // Resolve nvmlDeviceGetNumaNodeId at runtime via dlsym so the binary
+    // remains loadable on systems whose NVML driver predates this symbol.
     {
-        nvmlDevice_t nvml_dev;
-        unsigned int gpu_numa_node = 0;
-        if (nvmlInit() == NVML_SUCCESS) {
-            if (nvmlDeviceGetHandleByIndex(0, &nvml_dev) == NVML_SUCCESS &&
-                nvmlDeviceGetNumaNodeId(nvml_dev, &gpu_numa_node) == NVML_SUCCESS) {
-                target_node = static_cast<int>(gpu_numa_node);
+        using NvmlGetNumaNodeId_t = nvmlReturn_t (*)(nvmlDevice_t, unsigned int *);
+        auto nvmlGetNumaNodeId = reinterpret_cast<NvmlGetNumaNodeId_t>(
+            dlsym(RTLD_DEFAULT, "nvmlDeviceGetNumaNodeId"));
+        if (nvmlGetNumaNodeId != nullptr) {
+            nvmlDevice_t nvml_dev;
+            unsigned int gpu_numa_node = 0;
+            if (nvmlInit() == NVML_SUCCESS) {
+                if (nvmlDeviceGetHandleByIndex(0, &nvml_dev) == NVML_SUCCESS &&
+                    nvmlGetNumaNodeId(nvml_dev, &gpu_numa_node) == NVML_SUCCESS) {
+                    target_node = static_cast<int>(gpu_numa_node);
+                }
+                nvmlShutdown();
             }
-            nvmlShutdown();
         }
     }
-#endif
     if (target_node < 0) {
         // Fallback: use the node where this process is currently scheduled
         int cpu = sched_getcpu();

From 18531e000f3e2375936b2fcf17bb74079a3edcb7 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 22 May 2026 17:27:18 +0000
Subject: [PATCH 23/27] fix nvml call indx

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu | 33 ++++++++++++++-----
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 5b5258aec..450c824b5 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -547,8 +547,17 @@ float GpuStream::GetActualMemoryClockRate(int gpu_id) {
         return -1.0f;
     }
 
-    // Get device handle
-    result = nvmlDeviceGetHandleByIndex(gpu_id, &device);
+    // Map CUDA device index to NVML device via PCI bus ID.
+    // NVML does not honour CUDA_VISIBLE_DEVICES, so nvmlDeviceGetHandleByIndex
+    // would return the wrong physical GPU when the env var is set.
+    char pci_bus_id[16];
+    cudaError_t cuda_err = cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), gpu_id);
+    if (cuda_err != cudaSuccess) {
+        std::cerr << "GetActualMemoryClockRate::cudaDeviceGetPCIBusId error: " << cuda_err << std::endl;
+        nvmlShutdown();
+        return -1.0f;
+    }
+    result = nvmlDeviceGetHandleByPciBusId(pci_bus_id, &device);
     if (result != NVML_SUCCESS) {
         std::cerr << "Failed to get device handle: " << nvmlErrorString(result) << std::endl;
         nvmlShutdown();
@@ -700,19 +709,25 @@ int GpuStream::Run() {
     int target_node = -1;
     // Resolve nvmlDeviceGetNumaNodeId at runtime via dlsym so the binary
     // remains loadable on systems whose NVML driver predates this symbol.
+    // Map CUDA device 0 to NVML via PCI bus ID because NVML does not
+    // honour CUDA_VISIBLE_DEVICES.
     {
         using NvmlGetNumaNodeId_t = nvmlReturn_t (*)(nvmlDevice_t, unsigned int *);
         auto nvmlGetNumaNodeId = reinterpret_cast<NvmlGetNumaNodeId_t>(
             dlsym(RTLD_DEFAULT, "nvmlDeviceGetNumaNodeId"));
         if (nvmlGetNumaNodeId != nullptr) {
-            nvmlDevice_t nvml_dev;
-            unsigned int gpu_numa_node = 0;
-            if (nvmlInit() == NVML_SUCCESS) {
-                if (nvmlDeviceGetHandleByIndex(0, &nvml_dev) == NVML_SUCCESS &&
-                    nvmlGetNumaNodeId(nvml_dev, &gpu_numa_node) == NVML_SUCCESS) {
-                    target_node = static_cast<int>(gpu_numa_node);
+            char pci_bus_id[16];
+            cudaError_t cuda_err = cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), 0);
+            if (cuda_err == cudaSuccess) {
+                nvmlDevice_t nvml_dev;
+                unsigned int gpu_numa_node = 0;
+                if (nvmlInit() == NVML_SUCCESS) {
+                    if (nvmlDeviceGetHandleByPciBusId(pci_bus_id, &nvml_dev) == NVML_SUCCESS &&
+                        nvmlGetNumaNodeId(nvml_dev, &gpu_numa_node) == NVML_SUCCESS) {
+                        target_node = static_cast<int>(gpu_numa_node);
+                    }
+                    nvmlShutdown();
                 }
-                nvmlShutdown();
             }
         }
     }

From b21284be759427e6e3af820110365a6f9346e9dd Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 22 May 2026 17:30:33 +0000
Subject: [PATCH 24/27] fix lint

---
 .../benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu       | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 450c824b5..b3bb5039f 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -713,8 +713,7 @@ int GpuStream::Run() {
     // honour CUDA_VISIBLE_DEVICES.
     {
         using NvmlGetNumaNodeId_t = nvmlReturn_t (*)(nvmlDevice_t, unsigned int *);
-        auto nvmlGetNumaNodeId = reinterpret_cast<NvmlGetNumaNodeId_t>(
-            dlsym(RTLD_DEFAULT, "nvmlDeviceGetNumaNodeId"));
+        auto nvmlGetNumaNodeId = reinterpret_cast<NvmlGetNumaNodeId_t>(dlsym(RTLD_DEFAULT, "nvmlDeviceGetNumaNodeId"));
         if (nvmlGetNumaNodeId != nullptr) {
             char pci_bus_id[16];
             cudaError_t cuda_err = cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), 0);

From a7f83d4ffa419541671ecd450501c6da7acfca6f Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 22 May 2026 17:40:35 +0000
Subject: [PATCH 25/27] fix shadow ret

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu       | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index b3bb5039f..c17952466 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -607,10 +607,17 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
         // run the stream benchmark over the stream kernels
         for (int i = 0; i < static_cast<int>(Kernel::kCount); ++i) {
             Kernel kernel = static_cast<Kernel>(i);
-            int ret = RunStreamKernel<T>(args, kernel, num_threads_in_block);
-            if (ret == 0 && args->check_data) {
-                // Compare buffer based on the kernel
+            ret = RunStreamKernel<T>(args, kernel, num_threads_in_block);
+            if (ret != 0) {
+                Destroy(args);
+                return ret;
+            }
+            if (args->check_data) {
                 ret = CheckBuf(args, i);
+                if (ret != 0) {
+                    Destroy(args);
+                    return ret;
+                }
             }
         }
     }

From 3cece56e57d494e64acd04ac37cfc39ac5900dad Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 26 May 2026 12:12:41 -0700
Subject: [PATCH 26/27] fix early return undefined behavior

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu | 45 +++++++++++--------
 .../gpu_stream/gpu_stream_utils.cpp           |  2 +-
 .../gpu_stream/gpu_stream_utils.hpp           |  6 +--
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index c17952466..68740f313 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -30,15 +30,21 @@ template <typename T> int GpuStream::DestroyEvent(std::unique_ptr<BenchArgs<T>>
     if (SetGpu(args->gpu_id)) {
         return -1;
     }
-    cuda_err = cudaEventDestroy(args->sub.start_event);
-    if (cuda_err != cudaSuccess) {
-        std::cerr << "DestroyEvent::cudaEventDestroy error: " << cuda_err << std::endl;
-        return -1;
+    if (args->sub.start_event != nullptr) {
+        cuda_err = cudaEventDestroy(args->sub.start_event);
+        if (cuda_err != cudaSuccess) {
+            std::cerr << "DestroyEvent::cudaEventDestroy error: " << cuda_err << std::endl;
+            return -1;
+        }
+        args->sub.start_event = nullptr;
     }
-    cuda_err = cudaEventDestroy(args->sub.end_event);
-    if (cuda_err != cudaSuccess) {
-        std::cerr << "DestroyEvent::cudaEventDestroy error: " << cuda_err << std::endl;
-        return -1;
+    if (args->sub.end_event != nullptr) {
+        cuda_err = cudaEventDestroy(args->sub.end_event);
+        if (cuda_err != cudaSuccess) {
+            std::cerr << "DestroyEvent::cudaEventDestroy error: " << cuda_err << std::endl;
+            return -1;
+        }
+        args->sub.end_event = nullptr;
     }
     return 0;
 }
@@ -100,10 +106,8 @@ int GpuStream::GetGpuCount(int *gpu_count) {
  */
 template <typename T> int GpuStream::Destroy(std::unique_ptr<BenchArgs<T>> &args) {
     int ret = DestroyBufAndStream(args);
-    if (ret == 0) {
-        ret = DestroyEvent(args);
-    }
-    return ret;
+    int event_ret = DestroyEvent(args);
+    return (ret != 0) ? ret : event_ret;
 }
 
 /**
@@ -407,10 +411,13 @@ template <typename T> int GpuStream::DestroyBufAndStream(std::unique_ptr<BenchAr
         return -1;
     }
 
-    cuda_err = cudaStreamDestroy(args->sub.stream);
-    if (cuda_err != cudaSuccess) {
-        std::cerr << "DestroyBufAndStream::cudaStreamDestroy error: " << cuda_err << std::endl;
-        return -1;
+    if (args->sub.stream != nullptr) {
+        cuda_err = cudaStreamDestroy(args->sub.stream);
+        if (cuda_err != cudaSuccess) {
+            std::cerr << "DestroyBufAndStream::cudaStreamDestroy error: " << cuda_err << std::endl;
+            return -1;
+        }
+        args->sub.stream = nullptr;
     }
 
     return ret;
@@ -599,7 +606,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
 
     ret = PrepareEvent(args);
     if (ret != 0) {
-        return DestroyEvent(args);
+        DestroyEvent(args);
+        DestroyBufAndStream(args);
+        return -1;
     }
 
     // benchmark over the kThreadsPerBlock array
@@ -629,7 +638,7 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
         std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_buffer_" + std::to_string(args->size);
         for (size_t j = 0; j < args->sub.times_in_ms[i].size(); j++) {
             // Calculate and display bandwidth
-            double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
+            double bw = static_cast<double>(args->size) * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
             std::cout << tag << "_block_" << kThreadsPerBlock[j] << "\t" << bw << "\t";
 
             if (peak_bw < 0) { // cannot get peak_bw -> prints -1 for efficiency
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
index fd0dfb913..668a1f0a1 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.cpp
@@ -85,7 +85,7 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
                                      {"data_type", required_argument, nullptr, static_cast<int>(OptIdx::kDataType)}};
     int getopt_ret = 0;
     int opt_idx = 0;
-    bool size_specified = true;
+    bool size_specified = false;
     bool num_warm_up_specified = false;
     bool num_loops_specified = false;
 
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
index 907d05ef2..20f6a5654 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream_utils.hpp
@@ -64,13 +64,13 @@ template <typename T> struct SubBenchArgs {
     std::vector<std::vector<T>> validation_buf_ptrs;
 
     // CUDA stream to be used.
-    cudaStream_t stream;
+    cudaStream_t stream = nullptr;
 
     // CUDA event to record start time.
-    cudaEvent_t start_event;
+    cudaEvent_t start_event = nullptr;
 
     // CUDA event to record end time.
-    cudaEvent_t end_event;
+    cudaEvent_t end_event = nullptr;
 
     // CUDA event to record end time.
     std::vector<std::vector<float>> times_in_ms;

From ea3a186bf739181d9275e57547cd8dfe0dfbf4e5 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 27 May 2026 10:32:54 -0700
Subject: [PATCH 27/27] idempotent cleanup

---
 .../micro_benchmarks/gpu_stream/gpu_stream.cu        | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
index 68740f313..64964a3df 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
@@ -401,11 +401,17 @@ template <typename T> int GpuStream::DestroyBufAndStream(std::unique_ptr<BenchAr
     // Destroy original data buffer and check buffer
     if (args->sub.data_buf != nullptr) {
         numa_free(args->sub.data_buf, args->size);
+        args->sub.data_buf = nullptr;
     }
     if (args->sub.check_buf != nullptr) {
         numa_free(args->sub.check_buf, args->size);
+        args->sub.check_buf = nullptr;
     }
 
+    // Release GPU buffers immediately to free device memory
+    args->sub.gpu_buf_ptrs.clear();
+    args->sub.validation_buf_ptrs.clear();
+
     // Set to buffer device for GPU buffer
     if (SetGpu(args->gpu_id)) {
         return -1;
@@ -601,13 +607,13 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
     ret = PrepareBufAndStream<T>(args);
 
     if (ret != 0) {
-        return DestroyBufAndStream(args);
+        Destroy(args);
+        return -1;
     }
 
     ret = PrepareEvent(args);
     if (ret != 0) {
-        DestroyEvent(args);
-        DestroyBufAndStream(args);
+        Destroy(args);
         return -1;
     }