Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# No GPU hardware needed — compilation only.
# Pre-builds all benchmark cases with --case-optimization using --dry-run so
# binaries are cached before the GPU run job. No simulation is executed.
# Can run in two modes:
# 1. Direct (Frontier login nodes): pass cluster/device/interface as args
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
# 2. Inside SLURM (Phoenix/frontier_amd): uses $job_device/$job_interface
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e
Expand All @@ -22,14 +22,18 @@ case "$cluster" in
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

source .github/scripts/clean-build.sh
clean_build
# Phoenix starts fresh (no prior dep build); other clusters pre-build deps via
# build.sh first, so we must preserve them and remove only the hash-named MFC
# target dirs under build/staging and build/install.
if [ "$cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
else
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
fi

. ./mfc.sh load -c "$flag" -m g

# Set GPU build flags from interface — this is always a GPU build.
# Don't use gpu-opts.sh since $job_device may be "cpu" when submitted
# to a CPU SLURM partition (no GPU hardware needed for compilation).
case "$job_interface" in
acc) gpu_opts="--gpu acc" ;;
omp) gpu_opts="--gpu mp" ;;
Expand All @@ -38,5 +42,5 @@ esac

for case in benchmarks/*/case.py; do
echo "=== Pre-building: $case ==="
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
done
5 changes: 3 additions & 2 deletions .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,14 @@ benchmarks=(

# For Frontier: deps were fetched on the login node via --deps-only;
# build case-optimized binaries here on the compute node before running.
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
# For Phoenix and frontier_amd: prebuild-case-optimization.sh already built
# everything in a prior SLURM job (via --dry-run), so skip the build here.
#
# Clean stale MFC target staging before building. On self-hosted CI runners,
# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
# can persist and poison subsequent builds. Each case-opt config gets its own
# hash-named staging dir, but install dirs and other artifacts may be stale.
if [ "$job_cluster" != "phoenix" ]; then
if [ "$job_cluster" != "phoenix" ] && [ "$job_cluster" != "frontier_amd" ]; then
# Clean stale MFC target dirs (hash-named) from prior builds, but
# preserve dependency dirs (hipfort, fftw, etc.) since the compute
# node has no internet to re-fetch them.
Expand Down
8 changes: 6 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -659,12 +659,16 @@ jobs:
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Pre-Build (SLURM)
if: matrix.cluster == 'frontier_amd'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Build & Run Case-Optimization Tests
if: matrix.cluster != 'phoenix'
if: matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_amd'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Run Case-Optimization Tests
if: matrix.cluster == 'phoenix'
if: matrix.cluster == 'phoenix' || matrix.cluster == 'frontier_amd'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Cancel SLURM Jobs
Expand Down
13 changes: 10 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ exit 0
target_link_options(${a_target} PRIVATE -fopenmp)
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -fopenmp-target-fast -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
endif()
endif()

Expand Down Expand Up @@ -710,14 +710,15 @@ exit 0
PRIVATE -DFRONTIER_UNIFIED)
endif()

find_library(HIP_LIB amdhip64
find_library(HIP_LIB amdhip64
HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
find_library(HIPFORT_AMDGCN_LIB hipfort-amdgcn
HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
target_include_directories(${a_target} PRIVATE
"$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
target_link_libraries(${a_target} PRIVATE
${HIP_LIB} ${HIPFORT_AMDGCN_LIB} flang_rt.hostdevice)
${HIP_LIB} ${HIPFORT_AMDGCN_LIB})

endif()
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
Expand Down Expand Up @@ -790,6 +791,12 @@ if (MFC_POST_PROCESS)

# -O0 is in response to https://github.com/MFlowCode/MFC-develop/issues/95
target_compile_options(post_process PRIVATE -O0)

# flang-23/LLD defaults to PIE; SILO and LAPACK static libs on Frontier are
# non-PIC, producing R_X86_64_32 relocations that LLD rejects in PIE mode.
if (CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
target_link_options(post_process PRIVATE -no-pie)
endif()
endif()

if (MFC_SYSCHECK)
Expand Down
2 changes: 1 addition & 1 deletion examples/3D_performance_test/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
json.dumps(
{
# Logistics
"run_time_info": "T",
"run_time_info": "F",
# Computational Domain Parameters
"x_domain%beg": 0.0e00,
"x_domain%end": 4.0e-03 / 1.0e-03,
Expand Down
2 changes: 1 addition & 1 deletion src/common/m_chemistry.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ contains

$:GPU_UPDATE(device='[isc1, isc2, isc3]')

if (chemistry .or. dummy) then
if (chemistry) then
! Set offsets based on direction using array indexing
offsets = 0
offsets(idir) = 1
Expand Down
2 changes: 0 additions & 2 deletions src/post_process/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ module m_global_parameters
logical :: E_wrt
logical, dimension(num_fluids_max) :: alpha_rho_e_wrt
logical :: fft_wrt
logical :: dummy !< AMDFlang workaround for case-optimization + GPU-kernel bug
logical :: pres_wrt
logical, dimension(num_fluids_max) :: alpha_wrt
logical :: gamma_wrt
Expand Down Expand Up @@ -397,7 +396,6 @@ contains
file_per_process = .false.
E_wrt = .false.
fft_wrt = .false.
dummy = .false.
pres_wrt = .false.
alpha_wrt = .false.
gamma_wrt = .false.
Expand Down
2 changes: 0 additions & 2 deletions src/pre_process/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ module m_global_parameters
real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D)
integer :: buff_size !< Number of ghost cells for boundary condition storage
logical :: fft_wrt
logical :: dummy !< AMDFlang workaround for case-optimization + GPU-kernel bug

contains

Expand Down Expand Up @@ -303,7 +302,6 @@ contains
elliptic_smoothing = .false.

fft_wrt = .false.
dummy = .false.

simplex_perturb = .false.
simplex_params%perturb_vel(:) = .false.
Expand Down
18 changes: 10 additions & 8 deletions src/simulation/m_acoustic_src.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,14 +454,16 @@ contains
call s_mpi_abort('Fatal Error: Inconsistent allocation of source_spatials')
end if

$:GPU_UPDATE(device='[source_spatials(ai)%coord]')
$:GPU_UPDATE(device='[source_spatials(ai)%val]')
if (support(ai) >= 5) then
if (dim == 2) then
$:GPU_UPDATE(device='[source_spatials(ai)%angle]')
end if
if (dim == 3) then
$:GPU_UPDATE(device='[source_spatials(ai)%xyz_to_r_ratios]')
if (count > 0) then
$:GPU_UPDATE(device='[source_spatials(ai)%coord]')
$:GPU_UPDATE(device='[source_spatials(ai)%val]')
if (support(ai) >= 5) then
if (dim == 2) then
$:GPU_UPDATE(device='[source_spatials(ai)%angle]')
end if
if (dim == 3) then
$:GPU_UPDATE(device='[source_spatials(ai)%xyz_to_r_ratios]')
end if
end if
end if
end do
Expand Down
4 changes: 2 additions & 2 deletions src/simulation/m_cbc.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ contains
#:for CBC_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
if (cbc_dir == ${CBC_DIR}$ .and. recon_type == WENO_TYPE) then
! PI2 of flux_rs_vf and flux_src_rs_vf at j = 1/2
if (weno_order == 3 .or. dummy) then
if (weno_order == 3) then
call s_convert_primitive_to_flux_variables(q_prim_rs${XYZ}$_vf, F_rs${XYZ}$_vf, F_src_rs${XYZ}$_vf, is1, is2, &
& is3, idwbuff(2)%beg, idwbuff(3)%beg)

Expand Down Expand Up @@ -557,7 +557,7 @@ contains
end if

! PI4 of flux_rs_vf and flux_src_rs_vf at j = 1/2, 3/2
if (weno_order == 5 .or. dummy) then
if (weno_order == 5) then
call s_convert_primitive_to_flux_variables(q_prim_rs${XYZ}$_vf, F_rs${XYZ}$_vf, F_src_rs${XYZ}$_vf, is1, is2, &
& is3, idwbuff(2)%beg, idwbuff(3)%beg)

Expand Down
2 changes: 1 addition & 1 deletion src/simulation/m_data_output.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ contains
#:call GPU_PARALLEL(copyout='[icfl_max_loc]', copyin='[icfl_sf]')
icfl_max_loc = maxval(icfl_sf)
#:endcall GPU_PARALLEL
if (viscous .or. dummy) then
if (viscous) then
#:call GPU_PARALLEL(copyout='[vcfl_max_loc, Rc_min_loc]', copyin='[vcfl_sf,Rc_sf]')
vcfl_max_loc = maxval(vcfl_sf)
Rc_min_loc = minval(Rc_sf)
Expand Down
19 changes: 11 additions & 8 deletions src/simulation/m_fftw.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ module m_fftw

type(c_ptr) :: fwd_plan, bwd_plan
type(c_ptr) :: fftw_real_data, fftw_cmplx_data, fftw_fltr_cmplx_data
integer :: real_size, cmplx_size, x_size, batch_size, Nfq
integer :: real_size, cmplx_size, x_size, batch_size, Nfq, i2
real(c_double), pointer :: data_real(:) !< Real data
complex(c_double_complex), pointer :: data_cmplx(:) !< Complex data in Fourier space
complex(c_double_complex), pointer :: data_fltr_cmplx(:) !< Filtered complex data in Fourier space
#if defined(MFC_GPU)
$:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq]')
$:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')

real(dp), allocatable, target :: data_real_gpu(:)
complex(dp), allocatable, target :: data_cmplx_gpu(:)
Expand Down Expand Up @@ -76,8 +76,8 @@ contains
allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))

gpu_fft_size(1) = real_size
iembed(1) = 0
oembed(1) = 0
iembed(1) = real_size
oembed(1) = cmplx_size
$:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
$:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
#else
Expand Down Expand Up @@ -189,6 +189,9 @@ contains
$:END_GPU_PARALLEL_LOOP()

do i = 1, fourier_rings
i2 = i
$:GPU_UPDATE(device='[i2]')

$:GPU_PARALLEL_LOOP(collapse=3)
do k = 1, sys_size
do j = 0, m
Expand All @@ -199,11 +202,11 @@ contains
end do
$:END_GPU_PARALLEL_LOOP()

$:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
$:GPU_PARALLEL_LOOP(collapse=3)
do k = 1, sys_size
do j = 0, m
do l = 0, p
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i2, l)
end do
end do
end do
Expand Down Expand Up @@ -241,13 +244,13 @@ contains
#endif
#:endcall GPU_HOST_DATA

$:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
$:GPU_PARALLEL_LOOP(collapse=3)
do k = 1, sys_size
do j = 0, m
do l = 0, p
data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k &
& - 1)*real_size*x_size)/real(real_size, dp)
q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
q_cons_vf(k)%sf(j, i2, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
end do
end do
end do
Expand Down
2 changes: 0 additions & 2 deletions src/simulation/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,6 @@ module m_global_parameters
$:GPU_DECLARE(create='[Bx0]')
logical :: fft_wrt
logical :: dummy !< AMDFlang workaround for case-optimization + GPU-kernel bug
!> @name Continuum damage model parameters
!> @{!
real(wp) :: tau_star !< Stress threshold for continuum damage modeling
Expand Down Expand Up @@ -695,7 +694,6 @@ contains
#:endfor
fft_wrt = .false.
dummy = .false.
do j = 1, num_probes_max
acoustic(j)%pulse = dflt_int
Expand Down
2 changes: 1 addition & 1 deletion src/simulation/m_igr.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ contains

call s_populate_F_igr_buffers(bc_type, jac_sf)

if (igr_iter_solver == 1 .or. dummy) then ! Jacobi iteration
if (igr_iter_solver == 1) then ! Jacobi iteration
$:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3)
do l = idwbuff(3)%beg, idwbuff(3)%end
do k = idwbuff(2)%beg, idwbuff(2)%end
Expand Down
Loading
Loading