Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
b9d87fc
Replace cpu-benchmark with similar stress-ng monte-carlo test
quantumsteve Feb 9, 2026
d3778c2
Add dependency psutil
quantumsteve Feb 10, 2026
827c189
terminate io process
quantumsteve Feb 10, 2026
e41861c
check for division by zero
quantumsteve Feb 10, 2026
901e87b
mute stress-ng
quantumsteve Feb 10, 2026
8c560de
Added a --quiet flag to another stress-ng invocation
henricasanova Feb 11, 2026
ffac645
Fixed the zombie problem
henricasanova Feb 11, 2026
e802679
Since the memsize argument documentation says MB (and not MiB), I changed
henricasanova Feb 12, 2026
45124d9
typo-- !!
henricasanova Feb 12, 2026
c92654e
try to workaround missing cpu_queue
quantumsteve Feb 12, 2026
510ca58
typos
quantumsteve Feb 13, 2026
8bb70bc
Rewrite/Re-engineering of wfbench so that the execution proceeds in
henricasanova Feb 20, 2026
ebe24ab
Made it so that even if wfbench is ^C-ed, it doesn't leave runaway
henricasanova Feb 20, 2026
85079cb
Minor fix
henricasanova Feb 20, 2026
8f46f0e
check container output
quantumsteve Mar 2, 2026
61fe80c
bug-- in bin/wfbench
henricasanova Mar 18, 2026
4dee4fd
bug-- in wfbench
henricasanova Mar 18, 2026
2feb997
Merge branch 'stress-ng_cpu_benchmark' of github.com:wfcommons/WfComm…
henricasanova Mar 18, 2026
50e42f9
Updated the create_benchmark() method to allow specifying the number of
henricasanova Mar 18, 2026
4da3f66
Merge branch 'main' into stress-ng_cpu_benchmark
henricasanova Mar 19, 2026
54c212e
Insane race-condition bug fix if wfbench.py (having to deal with killing
henricasanova Mar 19, 2026
68fb5db
cleanup
quantumsteve Mar 20, 2026
8bd49b5
cleanup
quantumsteve Mar 20, 2026
8639ee7
commented out code
quantumsteve Mar 20, 2026
90a6a49
Updated wfbench to make it callable as a module
henricasanova Mar 21, 2026
9a77ef8
Made the Swift/T translator create a README file with instructions
henricasanova Mar 21, 2026
f627804
Modified swift-t translator fork-exec wfbench (which is known to be
henricasanova Mar 21, 2026
03b058a
Made Swift/T translator use python_exec()
henricasanova Mar 21, 2026
50bc158
test re-enabling
henricasanova Mar 21, 2026
22efdf1
Merge branch 'stress-ng_cpu_benchmark' into stress-ng_cpu_benchmark-w…
henricasanova Mar 21, 2026
32963ae
test updates
henricasanova Mar 21, 2026
f08f54e
Removed all traces of cpu-benchmark.cpp
henricasanova Mar 22, 2026
3aa7021
added a sleep to give the redis server time to start in the swift/t container
henricasanova Mar 23, 2026
0a99665
small test fix/cleanup
henricasanova Mar 23, 2026
225b689
cleanup
quantumsteve Mar 23, 2026
a486c92
Update bin/wfbench
henricasanova Mar 27, 2026
6d58507
set type to integer
quantumsteve Apr 6, 2026
1325d1b
add hipified code
quantumsteve Apr 13, 2026
b633026
check CUDA/HIP return values
quantumsteve Apr 13, 2026
2b47fc3
missed file
quantumsteve Apr 14, 2026
45ff1a5
checkpoint
quantumsteve Apr 27, 2026
7410cfd
use int64_t
quantumsteve Apr 27, 2026
67dbc4a
increase max_work to int64_t
quantumsteve Apr 27, 2026
2aec3bb
use cub for reduction.
quantumsteve Apr 28, 2026
1ce738a
clang-format change
quantumsteve Apr 28, 2026
9058ab0
hipify
quantumsteve Apr 28, 2026
c9aa1dc
CUDA_CHECK to HIP_CHECK
quantumsteve Apr 28, 2026
4baaa7f
skip setup and teardown
quantumsteve Apr 28, 2026
79d01ed
fix warnings
quantumsteve Apr 29, 2026
a58e402
Merge branch 'main' into stress-ng_cpu_benchmark
henricasanova May 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions Makefile

This file was deleted.

89 changes: 0 additions & 89 deletions bin/cpu-benchmark.cpp

This file was deleted.

3 changes: 3 additions & 0 deletions bin/cuda/.clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
BasedOnStyle: LLVM
IndentWidth: 2
ColumnLimit: 120
5 changes: 5 additions & 0 deletions bin/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cmake_minimum_required(VERSION 3.21) # HIP language support requires 3.21
cmake_policy(VERSION 3.21.3...3.27)
project(MyProj LANGUAGES CUDA)
add_executable(gpu_benchmark gpu_benchmark.cu kernels.cu)

185 changes: 185 additions & 0 deletions bin/cuda/gpu_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
#include "gpu_benchmark.h"

#include "kernels.h"
#include <cub/cub.cuh>

#include <chrono>
#include <cstdlib>
#include <iostream>

// The macro wraps any CUDA API call and aborts the process on failure.
// The do { ... } while (0) wrapper makes the macro a single statement, so it
// is safe in unbraced if/else contexts; the original bare { } block followed
// by ';' would break `if (x) CUDA_CHECK(y); else ...`.
#define CUDA_CHECK(ans)                                                                                                \
  do {                                                                                                                 \
    gpuAssert((ans), __FILE__, __LINE__);                                                                              \
  } while (0)

// Report a CUDA error code with its source location. When `abort` is true
// (the default) the process exits immediately, so a sticky error does not
// cause every subsequent CUDA call to fail mysteriously.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
      exit(code);
  }
}

// Stop timing and return the seconds elapsed since `gpu_start` was recorded.
// NOTE: this records `gpu_stop` and blocks until it completes, so calling it
// has a synchronizing side effect on the default stream.
float getElapsedTime(const cudaEvent_t &gpu_start, cudaEvent_t &gpu_stop) {
  CUDA_CHECK(cudaEventRecord(gpu_stop, 0));
  CUDA_CHECK(cudaEventSynchronize(gpu_stop));
  float elapsed_ms = 0.0f;
  CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, gpu_start, gpu_stop));
  // cudaEventElapsedTime reports milliseconds; convert to seconds.
  return elapsed_ms / 1000.0f;
}

// Function to run the GPU benchmark with no time limit.
//
// Estimates pi by Monte Carlo: a 256-block x 256-thread grid draws `m`
// uniform points per thread in the unit square and counts hits inside the
// quarter circle; `max_work` scales the total sample count (n * m).
void runBenchmark(long max_work) {
  uint32_t n = 256 * 256; // total thread count (grid * block)
  // Widen before multiplying: `max_work * 16384` could overflow a 32-bit
  // `long` on LLP64/ILP32 platforms before the division by n.
  uint64_t m = (uint64_t)max_work * 16384 / n; // samples per thread

  // Guard m == 0 (max_work < 4): the kernel would do no work and the final
  // pi computation would divide by zero (n * m == 0).
  if (m == 0) {
    std::cerr << "max_work too small: no samples per thread; increase max_work." << std::endl;
    return;
  }

  // allocate memory: one partial count per block, one RNG state per thread
  unsigned long long int *d_count;
  curandState *d_state;
  CUDA_CHECK(cudaMalloc((void **)&d_count, 256 * sizeof(unsigned long long int)));
  CUDA_CHECK(cudaMalloc((void **)&d_state, n * sizeof(curandState)));
  CUDA_CHECK(cudaMemset(d_count, 0, 256 * sizeof(unsigned long long int)));

  // set up timing stuff
  cudaEvent_t gpu_start, gpu_stop;
  CUDA_CHECK(cudaEventCreate(&gpu_start));
  CUDA_CHECK(cudaEventCreate(&gpu_stop));

  // set kernel
  dim3 gridSize = 256;
  dim3 blockSize = 256;
  setup_kernel<<<gridSize, blockSize>>>(d_state);
  CUDA_CHECK(cudaGetLastError()); // kernel launches don't return errors; check explicitly

  // monte carlo kernel
  CUDA_CHECK(cudaEventRecord(gpu_start, 0));
  monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
  CUDA_CHECK(cudaGetLastError()); // catch launch-configuration errors
  CUDA_CHECK(cudaDeviceSynchronize());

  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
  CUDA_CHECK(cudaEventDestroy(gpu_start));
  CUDA_CHECK(cudaEventDestroy(gpu_stop));

  // Allocate device output array for the cross-block reduction result
  unsigned long long int *d_out = nullptr;
  CUDA_CHECK(cudaMalloc((void **)&d_out, sizeof(unsigned long long int)));

  // Request and allocate temporary storage (first CUB call only sizes it)
  void *d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
  CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));

  // Run the reduction over the 256 per-block partial counts
  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));

  // copy results back to the host (cudaMemcpy is blocking, so no extra sync needed)
  unsigned long long int h_count = 0;
  CUDA_CHECK(cudaMemcpy(&h_count, d_out, sizeof(unsigned long long int), cudaMemcpyDeviceToHost));

  // display results and timings for gpu: hits / samples approximates pi/4
  float pi = h_count * 4.0 / (n * m);
  std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n";
  std::cout << "Benchmark completed!" << std::endl;

  CUDA_CHECK(cudaFree(d_count));
  CUDA_CHECK(cudaFree(d_state));
  CUDA_CHECK(cudaFree(d_out));
  CUDA_CHECK(cudaFree(d_temp_storage));
}

// Function to run the GPU benchmark for a specified time.
//
// Repeats the Monte Carlo workload until `runtime_in_seconds` of GPU time has
// elapsed. Hit counts accumulate in d_count across iterations, so the final
// pi estimate divides by the iteration count.
void runBenchmarkTime(long max_work, int runtime_in_seconds) {

  uint32_t n = 256 * 256; // total thread count (grid * block)
  // Widen before multiplying: `max_work * 16384` could overflow a 32-bit
  // `long` on LLP64/ILP32 platforms before the division by n.
  uint64_t m = (uint64_t)max_work * 16384 / n; // samples per thread per iteration

  // Guard m == 0 (max_work < 4): the kernel would do no work and the final
  // pi computation would divide by zero (n * m == 0).
  if (m == 0) {
    std::cerr << "max_work too small: no samples per thread; increase max_work." << std::endl;
    return;
  }

  // allocate memory: one partial count per block, one RNG state per thread
  unsigned long long int *d_count;
  curandState *d_state;
  CUDA_CHECK(cudaMalloc((void **)&d_count, 256 * sizeof(unsigned long long int)));
  CUDA_CHECK(cudaMalloc((void **)&d_state, n * sizeof(curandState)));
  CUDA_CHECK(cudaMemset(d_count, 0, 256 * sizeof(unsigned long long int)));

  // set up timing stuff
  cudaEvent_t gpu_start, gpu_stop;
  CUDA_CHECK(cudaEventCreate(&gpu_start));
  CUDA_CHECK(cudaEventCreate(&gpu_stop));

  // set kernel
  dim3 gridSize = 256;
  dim3 blockSize = 256;

  setup_kernel<<<gridSize, blockSize>>>(d_state);
  CUDA_CHECK(cudaGetLastError()); // kernel launches don't return errors; check explicitly

  CUDA_CHECK(cudaEventRecord(gpu_start, 0));
  int iteration = 0;
  // Run the workload loop until the specified runtime is reached
  while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
    monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
    CUDA_CHECK(cudaGetLastError());      // catch launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize()); // Ensure the kernel has finished executing
    iteration++;
  }

  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
  CUDA_CHECK(cudaEventDestroy(gpu_start));
  CUDA_CHECK(cudaEventDestroy(gpu_stop));

  // Allocate device output array for the cross-block reduction result
  unsigned long long int *d_out = nullptr;
  CUDA_CHECK(cudaMalloc((void **)&d_out, sizeof(unsigned long long int)));

  // Request and allocate temporary storage (first CUB call only sizes it)
  void *d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
  CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));

  // Run the reduction over the 256 per-block partial counts
  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));

  // copy results back to the host (cudaMemcpy is blocking, so no extra sync needed)
  unsigned long long int h_count = 0;
  CUDA_CHECK(cudaMemcpy(&h_count, d_out, sizeof(unsigned long long int), cudaMemcpyDeviceToHost));

  // display results and timings for gpu; d_count accumulated over all
  // iterations, so divide by `iteration` to recover the per-run estimate.
  if (iteration == 0) {
    // Degenerate case (deadline elapsed before any kernel ran): avoid the
    // division by zero the original code performed here.
    std::cerr << "No iterations completed within the requested runtime." << std::endl;
  } else {
    float pi = h_count * 4.0 / (n * m) / iteration;
    std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n";
  }

  CUDA_CHECK(cudaFree(d_count));
  CUDA_CHECK(cudaFree(d_state));
  CUDA_CHECK(cudaFree(d_out));
  CUDA_CHECK(cudaFree(d_temp_storage));
}

// Entry point.
//
// Usage: gpu_benchmark <max_work> [runtime_in_seconds]
//   one argument:  run the fixed-work benchmark once
//   two arguments: repeat the workload until runtime_in_seconds elapses
// Returns 0 on success, 1 on invalid arguments.
int main(int argc, char *argv[]) {
  // Parse a strictly positive integer. Unlike std::atol/std::atoi (which
  // silently accept trailing garbage such as "12abc" and have undefined
  // behavior on overflow), std::strtol lets us reject malformed input.
  auto parse_positive = [](const char *text, long &out) -> bool {
    char *end = nullptr;
    long value = std::strtol(text, &end, 10);
    if (end == text || *end != '\0' || value <= 0)
      return false;
    out = value;
    return true;
  };

  // Check for the correct number of command line arguments
  if (argc == 2) {
    // Parse and validate the single max_work argument
    long max_work = 0;
    if (!parse_positive(argv[1], max_work)) {
      std::cerr << "max_work must be a positive integer." << std::endl;
      return 1;
    }

    runBenchmark(max_work);

  } else if (argc == 3) {
    // Parse and validate both arguments
    long max_work = 0;
    long runtime_in_seconds = 0;
    if (!parse_positive(argv[1], max_work) || !parse_positive(argv[2], runtime_in_seconds)) {
      std::cerr << "Both max_work and runtime_in_seconds must be positive integers." << std::endl;
      return 1;
    }

    runBenchmarkTime(max_work, static_cast<int>(runtime_in_seconds));

  } else {
    std::cerr << "Usage: " << argv[0] << " <max_work> [runtime_in_seconds]" << std::endl;
    return 1;
  }

  return 0;
}
3 changes: 1 addition & 2 deletions bin/gpu_benchmark.h → bin/cuda/gpu_benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
#define GPU_BENCHMARK_H

#include <cuda_runtime.h>
#include <curand_kernel.h>

void runBenchmark(int max_work);
void runBenchmarkTime(int max_work, int runtime_in_seconds);

#endif // GPU_BENCHMARK_H


35 changes: 35 additions & 0 deletions bin/cuda/kernels.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "kernels.h"

#include <cub/cub.cuh>

// Initialize one curand RNG state per thread. All states share the fixed
// seed 123456789 but use distinct sequence numbers (the thread's global id),
// so streams are independent and runs are reproducible.
// Launch layout: 1-D grid, 1-D blocks; `state` must hold one entry per thread.
__global__ void setup_kernel(curandState *state) {
  const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
  curand_init(123456789, tid, 0, &state[tid]);
}

// Monte Carlo pi kernel: each thread draws `m` uniform points in the unit
// square and counts those landing inside the quarter circle; the block-wide
// total is accumulated into count[blockIdx.x].
//
// Launch layout: 1-D grid, blockDim.x must be 256 (hard-coded in
// BlockReduceT); `count` must have one slot per block.
__global__ void monte_carlo_kernel(curandState *state, unsigned long long int *count, int64_t m) {
  unsigned int index = threadIdx.x + blockDim.x * blockIdx.x;

  unsigned long long int thread_data = 0;

  // The loop counter must be 64-bit: the previous `unsigned int` counter
  // wrapped around and never terminated whenever m exceeded UINT_MAX.
  for (int64_t i = 0; i < m; ++i) {
    float x = curand_uniform(&state[index]);
    float y = curand_uniform(&state[index]);
    float r = x * x + y * y;

    if (r <= 1) {
      thread_data++;
    }
  }

  // Block-wide sum of the per-thread hit counts.
  typedef cub::BlockReduce<unsigned long long int, 256> BlockReduceT;
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  unsigned long long int aggregate = BlockReduceT(temp_storage).Sum(thread_data);

  // update to our global variable count: per CUB's BlockReduce contract the
  // aggregate is only valid in thread 0, so only it publishes the result.
  if (threadIdx.x == 0) {
    count[blockIdx.x] += aggregate;
  }
}
3 changes: 1 addition & 2 deletions bin/kernels.cuh → bin/cuda/kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <curand_kernel.h>

__global__ void setup_kernel(curandState *state);
__global__ void monte_carlo_kernel(curandState *state, int *count, int m);
__global__ void monte_carlo_kernel(curandState *state, unsigned long long int *count, int64_t m);

#endif

Loading
Loading