Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,27 @@ jobs:
- name: Run simulation_manager smoke test
run: bash .travis/test-simulation-manager.sh

integration-check:
  # Runs the integration sampler check as its own job; it starts only after
  # the `install` job has succeeded.
  needs: install
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        # Keep the version quoted: an unquoted 3.10 is read by YAML as the float 3.1.
        python-version: '3.10'
        cache: 'pip'
        cache-dependency-path: requirements.txt
    - name: Enable symlink
      # The command substitution is quoted so the resolved interpreter path is
      # handed to `ln` as exactly one argument (no word splitting or globbing).
      run: sudo ln -sf "$(command -v python3)" /usr/bin/python
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip --break-system-packages
        python -m pip install -r requirements.txt --break-system-packages
        python -m pip install coverage pytest pytest-cov --break-system-packages
        python -m pip install --editable . --break-system-packages
    - name: Run integration sampler check
      # Executed with bash (not sourced), so the script's own shell options
      # stay confined to its process.
      run: bash .travis/test-integrate.sh

asimov-integration:
needs: install
runs-on: ubuntu-latest
Expand Down Expand Up @@ -211,7 +232,6 @@ jobs:
- name: Run test scripts
run: |
. .travis/test-coord.sh
. .travis/test-integrate.sh
. .travis/test-posterior.sh
bash .travis/test-run.sh
bash .travis/test-run-alts.sh
Expand Down
54 changes: 53 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ default:
image: debian:bookworm

stages:
- containers
- system tests
- unit tests # TODO: write some
- docs
Expand Down Expand Up @@ -60,6 +61,23 @@ before_script:
# install this package (need editable for coverage)
- python -m pip install --editable . --break-system-packages

build_gpu_container:
  # Builds the Apptainer image from rift_container.def on a GPU-tagged runner
  # and then imports cupy inside the freshly built image as a smoke test.
  stage: containers
  tags:
    - gpu
  # Offered only as a manual job on pipelines started from the web UI;
  # every other pipeline source skips it.
  rules:
    - if: '$CI_PIPELINE_SOURCE == "web"'
      when: manual
      allow_failure: false
    - when: never
  variables:
    RIFT_CI_APPTAINER_IMAGE: "/home/richard.oshaughnessy/rift-ci-gpu.sif"
  # Empty list: this job runs no before_script commands.
  before_script: []
  script:
    - mkdir -p "$(dirname "$RIFT_CI_APPTAINER_IMAGE")"
    - apptainer build --force "$RIFT_CI_APPTAINER_IMAGE" rift_container.def
    # Folded scalar: the lines below are joined with single spaces into one
    # shell command; the quoted argument is a Python one-liner.
    - >-
      apptainer exec --nv "$RIFT_CI_APPTAINER_IMAGE" python -c
      'import cupy;
      x = cupy.arange(8, dtype=cupy.float64);
      print("cupy", cupy.__version__, "sum", float(cupy.asnumpy((x * x).sum())))'

help_check:
stage: system tests
script:
Expand All @@ -83,12 +101,46 @@ test_run:
stage: system tests
script:
- . .travis/test-coord.sh
- . .travis/test-integrate.sh
- bash .travis/test-integrate.sh
- . .travis/test-posterior.sh
- bash .travis/test-run.sh
- bash .travis/test-run-alts.sh
- bash .travis/test-build.sh

gpu_integration:
  # Runs .travis/test-integrate.sh inside the Apptainer image named by
  # RIFT_CI_APPTAINER_IMAGE, on a GPU-tagged runner.
  stage: system tests
  tags:
    - gpu
  # Manual on web-triggered pipelines, automatic on scheduled pipelines,
  # skipped for every other pipeline source.
  rules:
    - if: '$CI_PIPELINE_SOURCE == "web"'
      when: manual
      allow_failure: false
    - if: '$CI_PIPELINE_SOURCE == "schedule"'
      when: on_success
      allow_failure: false
    - when: never
  variables:
    CUDA_VISIBLE_DEVICES: '0'
    GW_SURROGATE: ''
    # Read by .travis/test-integrate.sh: "1" turns on its cupy/CUDA preflight.
    RIFT_CI_REQUIRE_GPU: '1'
    RIFT_CI_APPTAINER_IMAGE: '/home/richard.oshaughnessy/rift-ci-gpu.sif'
    RIFT_CI_APPTAINER_BINDPATH: '/cvmfs,/home'
  # Empty list: this job runs no before_script commands.
  before_script: []
  script:
    - nvidia-smi
    # Stop early when the image path is empty or the file is not readable.
    - test -n "$RIFT_CI_APPTAINER_IMAGE"
    - test -r "$RIFT_CI_APPTAINER_IMAGE"
    # Folded scalar: the lines below are joined with single spaces into one
    # command. The variables the test needs are passed explicitly via --env
    # because --cleanenv is in effect.
    - >
      apptainer exec
      --nv
      --cleanenv
      --bind "$CI_PROJECT_DIR:$CI_PROJECT_DIR"
      --bind "$RIFT_CI_APPTAINER_BINDPATH"
      --env CI_PROJECT_DIR="$CI_PROJECT_DIR"
      --env CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES"
      --env GW_SURROGATE="$GW_SURROGATE"
      --env RIFT_CI_REQUIRE_GPU="$RIFT_CI_REQUIRE_GPU"
      "$RIFT_CI_APPTAINER_IMAGE"
      bash -lc 'cd "$CI_PROJECT_DIR"
      && export PYTHONPATH="$CI_PROJECT_DIR/MonteCarloMarginalizeCode/Code${PYTHONPATH:+:$PYTHONPATH}"
      && bash .travis/test-integrate.sh'

pixi_swig_pre44:
extends: .pixi_template
variables:
Expand Down
34 changes: 33 additions & 1 deletion .travis/test-integrate.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,36 @@
#! /bin/bash
#! /usr/bin/env bash
# Integration sampler check.
#
# When RIFT_CI_REQUIRE_GPU=1, a Python preflight first confirms that cupy
# imports, that at least one CUDA device is reported, that a small device
# calculation gives the expected value, and that RIFT.integrators.mcsamplerGPU
# has its cupy_ok flag set. The extended ensemble-sampler test then runs.

# Abort on command failure, on unset variables, and on failures inside pipelines.
set -euo pipefail

# Run the cupy/CUDA preflight; the embedded Python raises SystemExit with an
# explanatory message on the first check that fails.
gpu_preflight() {
    python - <<'PY'
def fail(message, cause=None):
    """Terminate the preflight with `message`, chaining `cause` when given."""
    raise SystemExit(message) from cause


# 1. cupy must be importable.
try:
    import cupy
except Exception as err:
    fail(f"RIFT_CI_REQUIRE_GPU=1 but cupy could not be imported: {err}", err)

# 2. The CUDA runtime must answer and report at least one device.
try:
    device_count = cupy.cuda.runtime.getDeviceCount()
except Exception as err:
    fail(f"RIFT_CI_REQUIRE_GPU=1 but CUDA devices could not be queried: {err}", err)

if device_count < 1:
    fail("RIFT_CI_REQUIRE_GPU=1 but cupy reported zero CUDA devices")

# 3. A tiny device computation: 0^2 + 1^2 + ... + 7^2 == 140.
ramp = cupy.arange(8, dtype=cupy.float64)
sum_of_squares = float(cupy.asnumpy((ramp * ramp).sum()))
if sum_of_squares != 140.0:
    fail("RIFT_CI_REQUIRE_GPU=1 but a basic cupy device calculation failed")

# 4. The GPU sampler module must itself have enabled cupy.
from RIFT.integrators import mcsamplerGPU

if not getattr(mcsamplerGPU, "cupy_ok", False):
    fail("RIFT_CI_REQUIRE_GPU=1 but RIFT.integrators.mcsamplerGPU did not enable cupy")

print(f"GPU preflight OK: cupy={cupy.__version__}, cuda_devices={device_count}")
PY
}

# The preflight is opt-in: an unset variable is treated as "0".
if [ "${RIFT_CI_REQUIRE_GPU:-0}" = "1" ]; then
    gpu_preflight
fi

python MonteCarloMarginalizeCode/Code/test/test_mcsamplerEnsemble_extended.py --as-test --n-max 100000

Expand Down
11 changes: 9 additions & 2 deletions MonteCarloMarginalizeCode/Code/RIFT/misc/dag_utils_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3591,13 +3591,20 @@ def write_consolidate_distance_grids_sub(tag='consolidate_dgrid', exe=None,
exe = "util_ConsolidateDistanceGrids.py"

cmdname = tag + '.sh'
# IMPORTANT: do NOT 'cd' into search_dir relying on a $(macroiteration) in it.
# Condor macros are substituted only inside .sub files, never inside the shell
# script we write here, so a literal "$(macroiteration)" would reach bash as a
# command substitution and expand to nothing (-> 'iteration__ile', no such dir).
# Instead glob search_dir/input_glob directly: callers pass an iteration_*_ile
# wildcard for search_dir, which matches the final-iteration directory regardless
# of its number, with no macro substitution needed.
search_pattern = os.path.join(search_dir, input_glob)
with open(cmdname, 'w') as f:
f.write("#! /bin/bash\n")
f.write("set -e\n")
f.write("cd " + search_dir + "\n")
# --allow-empty keeps the post-extrinsic job from failing the DAG if a
# re-run already consumed the per-event files or none were produced.
f.write(exe + " --input-glob '" + input_glob + "'"
f.write(exe + " --input-glob '" + search_pattern + "'"
" --output " + file_output + " --allow-empty\n")
os.system("chmod a+x " + cmdname)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ from RIFT.misc.dag_utils_generic import mkdir
from RIFT.misc.dag_utils_generic import which


def add_batch_ILE_nodes_to_dag(my_dag,my_ile_job,my_parent_node, my_child_node, n_max, n_group, it, n_retries=3,it_start=0,convert_psd_node_list=[],node_list_dict={}):
def add_batch_ILE_nodes_to_dag(my_dag,my_ile_job,my_parent_node, my_child_node, n_max, n_group, it, n_retries=3,it_start=0,convert_psd_node_list=[],node_list_dict={},extra_parent_nodes=None):
if extra_parent_nodes is None:
extra_parent_nodes = []
for event in np.arange(n_max):
ile_node = pipeline.CondorDAGNode(my_ile_job)
ile_node.set_retry(n_retries)
Expand All @@ -84,6 +86,9 @@ def add_batch_ILE_nodes_to_dag(my_dag,my_ile_job,my_parent_node, my_child_node,
ile_node.add_macro("macroiteration", it)
if not(my_parent_node is None):
ile_node.add_parent(my_parent_node)
for node in extra_parent_nodes:
if not(node is None):
ile_node.add_parent(node)
if it == it_start:
for node in convert_psd_node_list: # for every PSD conversion job, make sure PSD is present before we run the first iteration!
ile_node.add_parent(node)
Expand Down Expand Up @@ -981,9 +986,15 @@ sed 1d ./tmp_converted.dat {extra_shuffle_command} >> ./extrinsic_posterior_sam
if opts.last_iteration_export_marginal_distance_grid:
dgrid_job, dgrid_job_name = dag_utils.write_consolidate_distance_grids_sub(
tag='consolidate_dgrid',
input_glob='EXTR_out.xml_*_.dgrid',
# Match the real per-event batched-extrinsic outputs
# EXTR_out-<event>.xml_<k>_.dgrid (note the leading-segment wildcard;
# 'EXTR_out.xml_*_.dgrid' has no '-<event>' infix and matches nothing).
input_glob='EXTR_out*.xml_*_.dgrid',
file_output=opts.working_directory + '/all_dgrid.dat',
search_dir=opts.working_directory + '/iteration_$(macroiteration)_ile',
# iteration_*_ile wildcard, NOT $(macroiteration): the consolidate driver
# is a shell script (no condor-macro substitution); the wildcard matches
# the final-iteration directory regardless of its number.
search_dir=opts.working_directory + '/iteration_*_ile',
log_dir=opts.working_directory + '/iteration_$(macroiteration)_ile/logs/',
universe=local_worker_universe, no_grid=no_worker_grid,
)
Expand All @@ -996,9 +1007,9 @@ sed 1d ./tmp_converted.dat {extra_shuffle_command} >> ./extrinsic_posterior_sam
if opts.last_iteration_export_distance_slices and opts.last_iteration_export_distance_slices > 0:
dslice_job, dslice_job_name = dag_utils.write_consolidate_distance_grids_sub(
tag='consolidate_dslice',
input_glob='EXTR_out.xml_*_.dslice',
input_glob='EXTR_out*.xml_*_.dslice',
file_output=opts.working_directory + '/all_dslice.dat',
search_dir=opts.working_directory + '/iteration_$(macroiteration)_ile',
search_dir=opts.working_directory + '/iteration_*_ile',
log_dir=opts.working_directory + '/iteration_$(macroiteration)_ile/logs/',
universe=local_worker_universe, no_grid=no_worker_grid,
)
Expand Down Expand Up @@ -1572,6 +1583,7 @@ if opts.comov_distance_reweighting:
#

parent_fit_node = None
last_puff_node = None
last_node=None

if opts.gridinit_args:
Expand Down Expand Up @@ -1708,7 +1720,7 @@ for it in np.arange(it_start,opts.n_iterations):
if puff_args and puff_cadence:
if it>it_start and it <= puff_max_it and (it-1)%puff_cadence ==0: # we made a puffball last iteration, so run it through ILE now
print(" ILE jobs for puffball on iteration ", it)
add_batch_ILE_nodes_to_dag(dag, ilePuff_job, parent_fit_node, con_node, indx_max, n_group_here, it, n_retries=opts.ile_retries,node_list_dict=ile_node_list_per_iteration)
add_batch_ILE_nodes_to_dag(dag, ilePuff_job, parent_fit_node, con_node, indx_max, n_group_here, it, n_retries=opts.ile_retries,node_list_dict=ile_node_list_per_iteration,extra_parent_nodes=[last_puff_node])
# for event in np.arange(indx_max):
# ile_node = pipeline.CondorDAGNode(ilePuff_job) # only difference is here: uses puffball, which by construction is the same size/ perturbed points
# ile_node.set_retry(opts.ile_retries)
Expand Down Expand Up @@ -1980,7 +1992,7 @@ for it in np.arange(it_start,opts.n_iterations):
puff_node.add_parent(parent_fit_node) # only fit if we have results from the previous iteration
dag.add_node(puff_node)

parent_fit_node = puff_node
last_puff_node = puff_node



Expand Down
Loading