MFlowCode · sbryngelson · May 13, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
@@ -1,10 +1,10 @@
 #!/bin/bash
 
-# Pre-builds all benchmark cases with --case-optimization.
-# No GPU hardware needed — compilation only.
+# Pre-builds all benchmark cases with --case-optimization using --dry-run so
+# binaries are cached before the GPU run job. No simulation is executed.
 # Can run in two modes:
 #   1. Direct (Frontier login nodes): pass cluster/device/interface as args
-#   2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
+#   2. Inside SLURM (Phoenix, frontier_amd): uses $job_device/$job_interface from submit-slurm-job.sh
 # Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]
 
 set -e
@@ -22,14 +22,18 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
-source .github/scripts/clean-build.sh
-clean_build
+# Phoenix starts fresh (no prior dep build); other clusters pre-build deps via
+# build.sh, so keep those and remove only hash-named MFC dirs (staging + install).
+if [ "$cluster" = "phoenix" ]; then
+    source .github/scripts/clean-build.sh
+    clean_build
+else
+    find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
+    find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
+fi
 
 . ./mfc.sh load -c "$flag" -m g
 
-# Set GPU build flags from interface — this is always a GPU build.
-# Don't use gpu-opts.sh since $job_device may be "cpu" when submitted
-# to a CPU SLURM partition (no GPU hardware needed for compilation).
 case "$job_interface" in
     acc) gpu_opts="--gpu acc" ;;
     omp) gpu_opts="--gpu mp" ;;
@@ -38,5 +42,5 @@ esac
 
 for case in benchmarks/*/case.py; do
     echo "=== Pre-building: $case ==="
-    ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
+    ./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
 done
@@ -23,13 +23,14 @@ benchmarks=(
 
-# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
+# For Frontier: deps were fetched on the login node via --deps-only;
 # build case-optimized binaries here on the compute node before running.
-# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
+# For Phoenix and frontier_amd: prebuild-case-optimization.sh already built
+# everything in a prior SLURM job (via --dry-run), so skip the build here.
 #
 # Clean stale MFC target staging before building. On self-hosted CI runners,
 # corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
 # can persist and poison subsequent builds. Each case-opt config gets its own
 # hash-named staging dir, but install dirs and other artifacts may be stale.
-if [ "$job_cluster" != "phoenix" ]; then
+if [ "$job_cluster" != "phoenix" ] && [ "$job_cluster" != "frontier_amd" ]; then
     # Clean stale MFC target dirs (hash-named) from prior builds, but
     # preserve dependency dirs (hipfort, fftw, etc.) since the compute
     # node has no internet to re-fetch them.

@@ -659,12 +659,16 @@ jobs:
         if:   matrix.cluster == 'phoenix'
         run:  bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}
 
+      - name: Pre-Build (SLURM)
+        if:   matrix.cluster == 'frontier_amd'
+        run:  bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }}
+
       - name: Build & Run Case-Optimization Tests
-        if:   matrix.cluster != 'phoenix'
+        if:   matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_amd'
         run:  bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
 
       - name: Run Case-Optimization Tests
-        if:   matrix.cluster == 'phoenix'
+        if:   matrix.cluster == 'phoenix' || matrix.cluster == 'frontier_amd'
         run:  bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
 
       - name: Cancel SLURM Jobs

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -644,7 +644,7 @@ exit 0
                     target_link_options(${a_target} PRIVATE -fopenmp)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
                     target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -fopenmp-target-fast -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
-                    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
+                    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
                 endif()
             endif()
 
@@ -710,14 +710,15 @@ exit 0
                         PRIVATE -DFRONTIER_UNIFIED)
                 endif()
 
                 find_library(HIP_LIB amdhip64
                     HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
                 find_library(HIPFORT_AMDGCN_LIB hipfort-amdgcn
                     HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
                 target_include_directories(${a_target} PRIVATE
                     "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
                 target_link_libraries(${a_target} PRIVATE
-                    ${HIP_LIB} ${HIPFORT_AMDGCN_LIB} flang_rt.hostdevice)
+                    ${HIP_LIB} ${HIPFORT_AMDGCN_LIB})
+
             endif()
         elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
             target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
@@ -790,6 +791,12 @@ if (MFC_POST_PROCESS)
 
     # -O0 is in response to https://github.com/MFlowCode/MFC-develop/issues/95
     target_compile_options(post_process PRIVATE -O0)
+
+    # flang-23/LLD defaults to PIE; SILO and LAPACK static libs on Frontier are
+    # non-PIC, producing R_X86_64_32 relocations that LLD rejects in PIE mode.
+    if (CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+        target_link_options(post_process PRIVATE -no-pie)
+    endif()
 endif()
 
 if (MFC_SYSCHECK)

@@ -6,7 +6,7 @@
     json.dumps(
         {
             # Logistics
-            "run_time_info": "T",
+            "run_time_info": "F",
             # Computational Domain Parameters
             "x_domain%beg": 0.0e00,
             "x_domain%end": 4.0e-03 / 1.0e-03,

diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
@@ -193,7 +193,7 @@ contains
 
         $:GPU_UPDATE(device='[isc1, isc2, isc3]')
 
-        if (chemistry .or. dummy) then
+        if (chemistry) then
             ! Set offsets based on direction using array indexing
             offsets = 0
             offsets(idir) = 1

diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
@@ -186,7 +186,6 @@ module m_global_parameters
     logical                            :: E_wrt
     logical, dimension(num_fluids_max) :: alpha_rho_e_wrt
     logical                            :: fft_wrt
-    logical                            :: dummy  !< AMDFlang workaround for case-optimization + GPU-kernel bug
     logical                            :: pres_wrt
     logical, dimension(num_fluids_max) :: alpha_wrt
     logical                            :: gamma_wrt
@@ -397,7 +396,6 @@ contains
         file_per_process = .false.
         E_wrt = .false.
         fft_wrt = .false.
-        dummy = .false.
         pres_wrt = .false.
         alpha_wrt = .false.
         gamma_wrt = .false.

diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
@@ -186,7 +186,6 @@ module m_global_parameters
     real(wp)                               :: Bx0        !< Constant magnetic field in the x-direction (1D)
     integer                                :: buff_size  !< Number of ghost cells for boundary condition storage
     logical                                :: fft_wrt
-    logical                                :: dummy      !< AMDFlang workaround for case-optimization + GPU-kernel bug
 
 contains
 
@@ -303,7 +302,6 @@ contains
         elliptic_smoothing = .false.
 
         fft_wrt = .false.
-        dummy = .false.
 
         simplex_perturb = .false.
         simplex_params%perturb_vel(:) = .false.

diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
@@ -454,14 +454,16 @@ contains
                 call s_mpi_abort('Fatal Error: Inconsistent allocation of source_spatials')
             end if
 
-            $:GPU_UPDATE(device='[source_spatials(ai)%coord]')
-            $:GPU_UPDATE(device='[source_spatials(ai)%val]')
-            if (support(ai) >= 5) then
-                if (dim == 2) then
-                    $:GPU_UPDATE(device='[source_spatials(ai)%angle]')
-                end if
-                if (dim == 3) then
-                    $:GPU_UPDATE(device='[source_spatials(ai)%xyz_to_r_ratios]')
+            if (count > 0) then
+                $:GPU_UPDATE(device='[source_spatials(ai)%coord]')
+                $:GPU_UPDATE(device='[source_spatials(ai)%val]')
+                if (support(ai) >= 5) then
+                    if (dim == 2) then
+                        $:GPU_UPDATE(device='[source_spatials(ai)%angle]')
+                    end if
+                    if (dim == 3) then
+                        $:GPU_UPDATE(device='[source_spatials(ai)%xyz_to_r_ratios]')
+                    end if
                 end if
             end if
         end do

diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
@@ -529,7 +529,7 @@ contains
         #:for CBC_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (cbc_dir == ${CBC_DIR}$ .and. recon_type == WENO_TYPE) then
                 ! PI2 of flux_rs_vf and flux_src_rs_vf at j = 1/2
-                if (weno_order == 3 .or. dummy) then
+                if (weno_order == 3) then
                     call s_convert_primitive_to_flux_variables(q_prim_rs${XYZ}$_vf, F_rs${XYZ}$_vf, F_src_rs${XYZ}$_vf, is1, is2, &
                         & is3, idwbuff(2)%beg, idwbuff(3)%beg)
 
@@ -557,7 +557,7 @@ contains
                 end if
 
                 ! PI4 of flux_rs_vf and flux_src_rs_vf at j = 1/2, 3/2
-                if (weno_order == 5 .or. dummy) then
+                if (weno_order == 5) then
                     call s_convert_primitive_to_flux_variables(q_prim_rs${XYZ}$_vf, F_rs${XYZ}$_vf, F_src_rs${XYZ}$_vf, is1, is2, &
                         & is3, idwbuff(2)%beg, idwbuff(3)%beg)
 

diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
@@ -220,7 +220,7 @@ contains
         #:call GPU_PARALLEL(copyout='[icfl_max_loc]', copyin='[icfl_sf]')
             icfl_max_loc = maxval(icfl_sf)
         #:endcall GPU_PARALLEL
-        if (viscous .or. dummy) then
+        if (viscous) then
             #:call GPU_PARALLEL(copyout='[vcfl_max_loc, Rc_min_loc]', copyin='[vcfl_sf,Rc_sf]')
                 vcfl_max_loc = maxval(vcfl_sf)
                 Rc_min_loc = minval(Rc_sf)

diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
@@ -30,12 +30,12 @@ module m_fftw
 
     type(c_ptr)                        :: fwd_plan, bwd_plan
     type(c_ptr)                        :: fftw_real_data, fftw_cmplx_data, fftw_fltr_cmplx_data
-    integer                            :: real_size, cmplx_size, x_size, batch_size, Nfq
+    integer                            :: real_size, cmplx_size, x_size, batch_size, Nfq, i2
     real(c_double), pointer            :: data_real(:)        !< Real data
     complex(c_double_complex), pointer :: data_cmplx(:)       !< Complex data in Fourier space
     complex(c_double_complex), pointer :: data_fltr_cmplx(:)  !< Filtered complex data in Fourier space
 #if defined(MFC_GPU)
-    $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq]')
+    $:GPU_DECLARE(create='[real_size, cmplx_size, x_size, batch_size, Nfq, i2]')
 
     real(dp), allocatable, target    :: data_real_gpu(:)
     complex(dp), allocatable, target :: data_cmplx_gpu(:)
@@ -76,8 +76,8 @@ contains
         allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
 
         gpu_fft_size(1) = real_size
-        iembed(1) = 0
-        oembed(1) = 0
+        iembed(1) = real_size
+        oembed(1) = cmplx_size
         $:GPU_ENTER_DATA(copyin='[real_size, cmplx_size, x_size, sys_size, batch_size, Nfq]')
         $:GPU_UPDATE(device='[real_size, cmplx_size, x_size, sys_size, batch_size]')
 #else
@@ -189,6 +189,9 @@ contains
         $:END_GPU_PARALLEL_LOOP()
 
         do i = 1, fourier_rings
+            i2 = i
+            $:GPU_UPDATE(device='[i2]')
+
             $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
@@ -199,11 +202,11 @@ contains
             end do
             $:END_GPU_PARALLEL_LOOP()
 
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
-                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i2, l)
                     end do
                 end do
             end do
@@ -241,13 +244,13 @@ contains
 #endif
             #:endcall GPU_HOST_DATA
 
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+            $:GPU_PARALLEL_LOOP(collapse=3)
             do k = 1, sys_size
                 do j = 0, m
                     do l = 0, p
                         data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k &
                                       & - 1)*real_size*x_size)/real(real_size, dp)
-                        q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                        q_cons_vf(k)%sf(j, i2, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
                     end do
                 end do
             end do

diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
@@ -464,7 +464,6 @@ module m_global_parameters
     $:GPU_DECLARE(create='[Bx0]')
 
     logical :: fft_wrt
-    logical :: dummy  !< AMDFlang workaround for case-optimization + GPU-kernel bug
     !> @name Continuum damage model parameters
     !> @{!
     real(wp) :: tau_star       !< Stress threshold for continuum damage modeling
@@ -695,7 +694,6 @@ contains
         #:endfor
 
         fft_wrt = .false.
-        dummy = .false.
 
         do j = 1, num_probes_max
             acoustic(j)%pulse = dflt_int

diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
@@ -302,7 +302,7 @@ contains
 
             call s_populate_F_igr_buffers(bc_type, jac_sf)
 
-            if (igr_iter_solver == 1 .or. dummy) then  ! Jacobi iteration
+            if (igr_iter_solver == 1) then  ! Jacobi iteration
                 $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do k = idwbuff(2)%beg, idwbuff(2)%end