Skip to content
Draft
122 changes: 122 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,16 @@ set(gpu_aware_mpi
${default_gpu_aware_mpi}
CACHE BOOL "Enable GPU-aware MPI")

set(team_policy
${default_team_policy}
CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels")
set(team_policy_tile_size
${default_team_policy_tile_size}
CACHE STRING "team_policy tile edge length in cells")
set(team_policy_tile_sizes
"4;6;8;10;12;14;16"
CACHE STRING "team_policy tile-size choices")

# -------------------------- Compilation settings -------------------------- #
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Expand Down Expand Up @@ -136,6 +146,62 @@ else()
set(DEVICE_ENABLED OFF)
endif()

# ------------------------------ team_policy wiring ------------------------ #
if(${team_policy})
list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx)
if(_tps_idx EQUAL -1)
message(FATAL_ERROR
"${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, "
"got '${team_policy_tile_size}'${ColorReset}")
endif()
add_compile_options("-D TEAM_POLICY")
add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")

# Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically
# when found; falls back to Kokkos::BinSort otherwise.
if("${Kokkos_DEVICES}" MATCHES "SYCL")
find_package(oneDPL QUIET)
if(oneDPL_FOUND)
message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
add_compile_options("-D ONEDPL_ENABLED")
set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
else()
message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
"for SYCL sort_by_key")
endif()
endif()

if("${Kokkos_DEVICES}" MATCHES "CUDA")
find_package(Thrust QUIET)
if(Thrust_FOUND)
message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
add_compile_options("-D THRUST_ENABLED")
else()
message(STATUS "team_policy: Thrust not found; using BinSort fallback "
"for CUDA sort_by_key")
endif()
endif()

if("${Kokkos_DEVICES}" MATCHES "HIP")
# rocThrust ships with ROCm and exposes the same thrust:: API. Using
# it lets the HIP backend build a single permutation via
# sort_by_key and gather all SoA members through one reused buffer,
# instead of the legacy per-member Kokkos::BinSort path which
# allocates a fresh `sorted_values` buffer for every member every
# step (the dominant source of allocator churn / fragmentation on
# ROCm).
find_package(rocthrust QUIET)
if(rocthrust_FOUND)
message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
add_compile_options("-D ROCTHRUST_ENABLED")
set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
else()
message(STATUS "team_policy: rocThrust not found; using BinSort "
"fallback for HIP sort_by_key")
endif()
endif()
endif()

# MPI
if(${mpi})
find_or_fetch_dependency(MPI FALSE REQUIRED)
Expand All @@ -145,6 +211,62 @@ if(${mpi})
if(${DEVICE_ENABLED})
if(${gpu_aware_mpi})
add_compile_options("-D GPU_AWARE_MPI")

# On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only
# handle device pointers if the GPU Transport Layer (GTL) library
# is linked. The Cray compiler wrappers (cc/CC) inject this
# automatically, but we build with hipcc/nvcc directly, so
# find_package(MPI) only finds base libmpi and the GTL is left
# out -> MPI_Sendrecv on a device pointer fails with
# "OFI ... Bad address". Add it explicitly here.
#
# Cray PE exports PE_MPICH_GTL_DIR_<accel> / PE_MPICH_GTL_LIBS_<accel>
# (e.g. amd_gfx90a -> -lmpi_gtl_hsa). Their absence means this is
# not a Cray MPICH build, in which case nothing extra is needed.
if("${Kokkos_DEVICES}" MATCHES "HIP")
set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906)
elseif("${Kokkos_DEVICES}" MATCHES "CUDA")
set(_gtl_accels nvidia90 nvidia80 nvidia70)
elseif("${Kokkos_DEVICES}" MATCHES "SYCL")
set(_gtl_accels ponteVecchio)
else()
set(_gtl_accels "")
endif()

set(_gtl_dir "")
set(_gtl_libflag "")
foreach(_accel ${_gtl_accels})
if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}}))
# strip the leading "-L" from the Cray-provided value
string(REGEX REPLACE "^-L" ""
_gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}")
string(REGEX REPLACE "^-l" ""
_gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}")
endif()
endforeach()

if(_gtl_dir AND _gtl_libflag)
find_library(MPI_GTL_LIBRARY
NAMES ${_gtl_libflag}
HINTS "${_gtl_dir}"
NO_DEFAULT_PATH)
if(MPI_GTL_LIBRARY)
message(STATUS
"GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}")
set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY})
else()
message(FATAL_ERROR
"${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL "
"library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. "
"GPU-aware MPI will crash at runtime without it. Make sure the "
"craype-accel module is loaded, or build with gpu_aware_mpi=OFF."
"${ColorReset}")
endif()
else()
message(STATUS
"GPU-aware MPI: no Cray GTL environment found; assuming the MPI "
"implementation is GPU-aware without an extra transport library.")
endif()
endif()
else()
set(gpu_aware_mpi
Expand Down
15 changes: 15 additions & 0 deletions cmake/defaults.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,18 @@ else()
endif()

set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL)

if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY})
set(default_team_policy
$ENV{Entity_ENABLE_TEAM_POLICY}
CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
else()
set(default_team_policy
OFF
CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
endif()
set_property(CACHE default_team_policy PROPERTY TYPE BOOL)

set(default_team_policy_tile_size
8
CACHE INTERNAL "Default tile edge length in cells for team_policy")
25 changes: 25 additions & 0 deletions cmake/report.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,26 @@ if(${mpi} AND ${DEVICE_ENABLED})
GPU_AWARE_MPI_REPORT
46)
endif()
printchoices(
"Team Policy"
"team_policy"
"${ON_OFF_VALUES}"
${team_policy}
OFF
"${Green}"
TEAM_POLICY_REPORT
46)
if(${team_policy})
printchoices(
"Team Tile Size"
"team_policy_tile_size"
"${team_policy_tile_sizes}"
${team_policy_tile_size}
${default_team_policy_tile_size}
"${Blue}"
TEAM_POLICY_TILE_SIZE_REPORT
46)
endif()
printchoices(
"Debug mode"
"DEBUG"
Expand Down Expand Up @@ -197,6 +217,11 @@ if(${mpi} AND ${DEVICE_ENABLED})
string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n")
endif()

string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n")
if(${team_policy})
string(APPEND REPORT_TEXT " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
endif()

string(
APPEND
REPORT_TEXT
Expand Down
3 changes: 3 additions & 0 deletions src/engines/reporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ namespace ntt {
"%s",
params.template get<std::string>("simulation.name").c_str());
reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string());
#if defined(TEAM_POLICY)
reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE);
#endif
reporter::AddParam(report, 4, "Metric", "%s", M.to_string());
#if SHAPE_ORDER == 0
reporter::AddParam(report, 4, "Deposit", "%s", "zigzag");
Expand Down
131 changes: 112 additions & 19 deletions src/engines/srpic/currents.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
* @file engines/srpic/currents.h
* @brief Current deposition and filtering routines for the SRPIC engine
* @implements
* - ntt::srpic::CallDepositKernel<> -> void
* - ntt::srpic::CallDepositKernel<> -> void (flat path)
* - ntt::srpic::CallDepositKernelTiled<> -> void (TEAM_POLICY)
* - ntt::srpic::CurrentsDeposit<> -> void
* - ntt::srpic::CurrentsFilter<> -> void
* @namespaces:
Expand Down Expand Up @@ -38,34 +39,125 @@ namespace ntt {
species.rangeActiveParticles(),
kernel::DepositCurrents_kernel<SimEngine::SRPIC, M, O>(
scatter_cur,
species.i1,
species.i2,
species.i3,
species.i1_prev,
species.i2_prev,
species.i3_prev,
species.dx1,
species.dx2,
species.dx3,
species.dx1_prev,
species.dx2_prev,
species.dx3_prev,
species.ux1,
species.ux2,
species.ux3,
species.phi,
species.weight,
species.tag,
species,
local_metric,
(real_t)(species.charge()),
dt));
}

/**
* @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
*
* Iterates over `tile_layout.ntiles_total` teams; each team accumulates
* its tile's particle contributions in SLM scratch and atomically
* flushes to the global J. Requires the species to have been sorted
* with `team_policy` enabled (`tile_layout` populated by
* `SortSpatially`).
*
* Falls back to the flat kernel if `tile_offsets` is empty — this
* happens on the first step before the first sort, or for very small
* species that exited early in `SortSpatially`. The fallback uses the
* passed-in `scatter_cur` so the caller still composes correctly.
*/
template <SRMetricClass M, unsigned short O>
void CallDepositKernelTiled(const Particles<M::Dim, M::CoordType>& species,
const M& local_metric,
const ndfield_t<M::Dim, 3>& cur,
real_t dt) {
static_assert(O <= 11u, "Shape order must be <= 11");
constexpr unsigned short T = static_cast<unsigned short>(
TEAM_POLICY_TILE_SIZE);
const auto& layout = species.tile_layout();
raise::ErrorIf(layout.ntiles_total == 0u,
"CallDepositKernelTiled: tile_layout has 0 tiles — call "
"SortSpatially before CurrentsDeposit",
HERE);
raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u,
"CallDepositKernelTiled: tile_offsets size inconsistent "
"with ntiles_total",
HERE);

auto deposit_kernel =
kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, M, O, T> {
cur, species, local_metric, (real_t)(species.charge()), dt, layout
};

Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
Kokkos::AUTO);
policy.set_scratch_size(
0,
Kokkos::PerTeam(decltype(deposit_kernel)::scratch_bytes()));
Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel);
}

template <SRMetricClass M>
void CurrentsDeposit(Domain<SimEngine::SRPIC, M>& domain,
const prm::Parameters& engine_params) {
const auto dt = engine_params.get<real_t>("dt");
Kokkos::deep_copy(domain.fields.cur, ZERO);

#if defined(TEAM_POLICY)

// First-step fallback: if any contributing species has not been
// sorted yet (tile_layout still empty), fall back to the flat
// scatter-view path for that step. Subsequent steps see populated
// layouts and use the tiled kernel.
bool any_unsorted = false;
for (auto& species : domain.species) {
if ((species.pusher() == ParticlePusher::NONE) or
(species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
continue;
}
if (species.tile_layout().ntiles_total == 0u or
species.tile_layout().tile_offsets.extent(0) == 0u) {
any_unsorted = true;
break;
}
}
if (any_unsorted) {
auto scatter_cur = Kokkos::Experimental::create_scatter_view(
domain.fields.cur);
for (auto& species : domain.species) {
if ((species.pusher() == ParticlePusher::NONE) or
(species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
continue;
}
logger::Checkpoint(
fmt::format(
"Launching currents deposit (flat fallback, no sort yet) "
"for %d [%s] : %lu %f",
species.index(),
species.label().c_str(),
species.npart(),
(double)species.charge()),
HERE);
CallDepositKernel<M, SHAPE_ORDER>(species,
domain.mesh.metric,
scatter_cur,
dt);
}
Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
} else {
for (auto& species : domain.species) {
if ((species.pusher() == ParticlePusher::NONE) or
(species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
continue;
}
logger::Checkpoint(
fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
species.index(),
species.label().c_str(),
species.npart(),
(double)species.charge()),
HERE);

CallDepositKernelTiled<M, SHAPE_ORDER>(species,
domain.mesh.metric,
domain.fields.cur,
dt);
}
}
#else
auto scatter_cur = Kokkos::Experimental::create_scatter_view(
domain.fields.cur);
for (auto& species : domain.species) {
Expand All @@ -84,6 +176,7 @@ namespace ntt {
CallDepositKernel<M, SHAPE_ORDER>(species, domain.mesh.metric, scatter_cur, dt);
}
Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
#endif
}

template <SRMetricClass M>
Expand Down
Loading
Loading