diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1053c2a1..dc763968 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -134,6 +134,27 @@ jobs: - name: Run simulation_manager smoke test run: bash .travis/test-simulation-manager.sh + integration-check: + needs: install + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + cache-dependency-path: requirements.txt + - name: Enable symlink + run: sudo ln -sf $(which python3) /usr/bin/python + - name: Install dependencies + run: | + python -m pip install --upgrade pip --break-system-packages + python -m pip install -r requirements.txt --break-system-packages + python -m pip install coverage pytest pytest-cov --break-system-packages + python -m pip install --editable . --break-system-packages + - name: Run integration sampler check + run: bash .travis/test-integrate.sh + asimov-integration: needs: install runs-on: ubuntu-latest @@ -211,7 +232,6 @@ jobs: - name: Run test scripts run: | . .travis/test-coord.sh - . .travis/test-integrate.sh . .travis/test-posterior.sh bash .travis/test-run.sh bash .travis/test-run-alts.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b519cb6c..677a7d42 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,6 +10,7 @@ default: image: debian:bookworm stages: + - containers - system tests - unit tests # TODO: write some - docs @@ -60,6 +61,23 @@ before_script: # install this package (need editable for coverage) - python -m pip install --editable . --break-system-packages +build_gpu_container: + stage: containers + tags: + - gpu + variables: + RIFT_CI_APPTAINER_IMAGE: "/home/richard.oshaughnessy/rift-ci-gpu.sif" + before_script: [] + script: + - mkdir -p "$(dirname "$RIFT_CI_APPTAINER_IMAGE")" + - apptainer build --force "$RIFT_CI_APPTAINER_IMAGE" rift_container.def + - apptainer exec --nv "$RIFT_CI_APPTAINER_IMAGE" python -c 'import cupy; x = cupy.arange(8, dtype=cupy.float64); print("cupy", cupy.__version__, "sum", float(cupy.asnumpy((x * x).sum())))' + rules: + - if: '$CI_PIPELINE_SOURCE == "web"' + when: manual + allow_failure: false + - when: never + help_check: stage: system tests script: @@ -83,12 +101,46 @@ test_run: stage: system tests script: - . .travis/test-coord.sh - - . .travis/test-integrate.sh + - bash .travis/test-integrate.sh - . .travis/test-posterior.sh - bash .travis/test-run.sh - bash .travis/test-run-alts.sh - bash .travis/test-build.sh +gpu_integration: + stage: system tests + tags: + - gpu + variables: + CUDA_VISIBLE_DEVICES: "0" + GW_SURROGATE: "" + RIFT_CI_REQUIRE_GPU: "1" + RIFT_CI_APPTAINER_IMAGE: "/home/richard.oshaughnessy/rift-ci-gpu.sif" + RIFT_CI_APPTAINER_BINDPATH: "/cvmfs,/home" + before_script: [] + script: + - nvidia-smi + - test -n "$RIFT_CI_APPTAINER_IMAGE" + - test -r "$RIFT_CI_APPTAINER_IMAGE" + - > + apptainer exec --nv --cleanenv + --bind "$CI_PROJECT_DIR:$CI_PROJECT_DIR" + --bind "$RIFT_CI_APPTAINER_BINDPATH" + --env CI_PROJECT_DIR="$CI_PROJECT_DIR" + --env CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" + --env GW_SURROGATE="$GW_SURROGATE" + --env RIFT_CI_REQUIRE_GPU="$RIFT_CI_REQUIRE_GPU" + "$RIFT_CI_APPTAINER_IMAGE" + bash -lc 'cd "$CI_PROJECT_DIR" && export PYTHONPATH="$CI_PROJECT_DIR/MonteCarloMarginalizeCode/Code${PYTHONPATH:+:$PYTHONPATH}" && bash .travis/test-integrate.sh' + rules: + - if: '$CI_PIPELINE_SOURCE == "web"' + when: manual + allow_failure: false + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: on_success + allow_failure: false + - when: never + pixi_swig_pre44: extends: .pixi_template variables: diff --git a/.travis/test-integrate.sh b/.travis/test-integrate.sh index d777ccca..fded45cd 100755 --- a/.travis/test-integrate.sh +++ b/.travis/test-integrate.sh @@ -1,4 +1,36 @@ -#! /bin/bash +#! /usr/bin/env bash + +set -euo pipefail + +if [[ "${RIFT_CI_REQUIRE_GPU:-0}" == "1" ]]; then + python - <<'PY' +import sys + +try: + import cupy +except Exception as exc: + raise SystemExit(f"RIFT_CI_REQUIRE_GPU=1 but cupy could not be imported: {exc}") from exc + +try: + n_devices = cupy.cuda.runtime.getDeviceCount() +except Exception as exc: + raise SystemExit(f"RIFT_CI_REQUIRE_GPU=1 but CUDA devices could not be queried: {exc}") from exc + +if n_devices < 1: + raise SystemExit("RIFT_CI_REQUIRE_GPU=1 but cupy reported zero CUDA devices") + +x = cupy.arange(8, dtype=cupy.float64) +if float(cupy.asnumpy((x * x).sum())) != 140.0: + raise SystemExit("RIFT_CI_REQUIRE_GPU=1 but a basic cupy device calculation failed") + +from RIFT.integrators import mcsamplerGPU + +if not getattr(mcsamplerGPU, "cupy_ok", False): + raise SystemExit("RIFT_CI_REQUIRE_GPU=1 but RIFT.integrators.mcsamplerGPU did not enable cupy") + +print(f"GPU preflight OK: cupy={cupy.__version__}, cuda_devices={n_devices}") +PY +fi python MonteCarloMarginalizeCode/Code/test/test_mcsamplerEnsemble_extended.py --as-test --n-max 100000 diff --git a/MonteCarloMarginalizeCode/Code/RIFT/misc/dag_utils_generic.py b/MonteCarloMarginalizeCode/Code/RIFT/misc/dag_utils_generic.py index 64ca252a..5060c16d 100644 --- a/MonteCarloMarginalizeCode/Code/RIFT/misc/dag_utils_generic.py +++ b/MonteCarloMarginalizeCode/Code/RIFT/misc/dag_utils_generic.py @@ -3591,13 +3591,20 @@ def write_consolidate_distance_grids_sub(tag='consolidate_dgrid', exe=None, exe = "util_ConsolidateDistanceGrids.py" cmdname = tag + '.sh' + # IMPORTANT: do NOT 'cd' into search_dir relying on a $(macroiteration) in it. + # Condor macros are substituted only inside .sub files, never inside the shell + # script we write here, so a literal "$(macroiteration)" would reach bash as a + # command substitution and expand to nothing (-> 'iteration__ile', no such dir). + # Instead glob search_dir/input_glob directly: callers pass an iteration_*_ile + # wildcard for search_dir, which matches the final-iteration directory regardless + # of its number, with no macro substitution needed. + search_pattern = os.path.join(search_dir, input_glob) with open(cmdname, 'w') as f: f.write("#! /bin/bash\n") f.write("set -e\n") - f.write("cd " + search_dir + "\n") # --allow-empty keeps the post-extrinsic job from failing the DAG if a # re-run already consumed the per-event files or none were produced. - f.write(exe + " --input-glob '" + input_glob + "'" + f.write(exe + " --input-glob '" + search_pattern + "'" " --output " + file_output + " --allow-empty\n") os.system("chmod a+x " + cmdname) diff --git a/MonteCarloMarginalizeCode/Code/bin/create_event_parameter_pipeline_BasicIteration b/MonteCarloMarginalizeCode/Code/bin/create_event_parameter_pipeline_BasicIteration index 8e6ef3ed..d334de68 100755 --- a/MonteCarloMarginalizeCode/Code/bin/create_event_parameter_pipeline_BasicIteration +++ b/MonteCarloMarginalizeCode/Code/bin/create_event_parameter_pipeline_BasicIteration @@ -75,7 +75,9 @@ from RIFT.misc.dag_utils_generic import mkdir from RIFT.misc.dag_utils_generic import which -def add_batch_ILE_nodes_to_dag(my_dag,my_ile_job,my_parent_node, my_child_node, n_max, n_group, it, n_retries=3,it_start=0,convert_psd_node_list=[],node_list_dict={}): +def add_batch_ILE_nodes_to_dag(my_dag,my_ile_job,my_parent_node, my_child_node, n_max, n_group, it, n_retries=3,it_start=0,convert_psd_node_list=[],node_list_dict={},extra_parent_nodes=None): + if extra_parent_nodes is None: + extra_parent_nodes = [] for event in np.arange(n_max): ile_node = pipeline.CondorDAGNode(my_ile_job) ile_node.set_retry(n_retries) @@ -84,6 +86,9 @@ def add_batch_ILE_nodes_to_dag(my_dag,my_ile_job,my_parent_node, my_child_node, ile_node.add_macro("macroiteration", it) if not(my_parent_node is None): ile_node.add_parent(my_parent_node) + for node in extra_parent_nodes: + if not(node is None): + ile_node.add_parent(node) if it == it_start: for node in convert_psd_node_list: # for every PSD conversion job, make sure PSD is present before we run the first iteration! ile_node.add_parent(node) @@ -981,9 +986,15 @@ sed 1d ./tmp_converted.dat {extra_shuffle_command} >> ./extrinsic_posterior_sam if opts.last_iteration_export_marginal_distance_grid: dgrid_job, dgrid_job_name = dag_utils.write_consolidate_distance_grids_sub( tag='consolidate_dgrid', - input_glob='EXTR_out.xml_*_.dgrid', + # Match the real per-event batched-extrinsic outputs + # EXTR_out-.xml__.dgrid (note the leading-segment wildcard; + # 'EXTR_out.xml_*_.dgrid' has no '-' infix and matches nothing). + input_glob='EXTR_out*.xml_*_.dgrid', file_output=opts.working_directory + '/all_dgrid.dat', - search_dir=opts.working_directory + '/iteration_$(macroiteration)_ile', + # iteration_*_ile wildcard, NOT $(macroiteration): the consolidate driver + # is a shell script (no condor-macro substitution); the wildcard matches + # the final-iteration directory regardless of its number. + search_dir=opts.working_directory + '/iteration_*_ile', log_dir=opts.working_directory + '/iteration_$(macroiteration)_ile/logs/', universe=local_worker_universe, no_grid=no_worker_grid, ) @@ -996,9 +1007,9 @@ sed 1d ./tmp_converted.dat {extra_shuffle_command} >> ./extrinsic_posterior_sam if opts.last_iteration_export_distance_slices and opts.last_iteration_export_distance_slices > 0: dslice_job, dslice_job_name = dag_utils.write_consolidate_distance_grids_sub( tag='consolidate_dslice', - input_glob='EXTR_out.xml_*_.dslice', + input_glob='EXTR_out*.xml_*_.dslice', file_output=opts.working_directory + '/all_dslice.dat', - search_dir=opts.working_directory + '/iteration_$(macroiteration)_ile', + search_dir=opts.working_directory + '/iteration_*_ile', log_dir=opts.working_directory + '/iteration_$(macroiteration)_ile/logs/', universe=local_worker_universe, no_grid=no_worker_grid, ) @@ -1572,6 +1583,7 @@ if opts.comov_distance_reweighting: # parent_fit_node = None +last_puff_node = None last_node=None if opts.gridinit_args: @@ -1708,7 +1720,7 @@ for it in np.arange(it_start,opts.n_iterations): if puff_args and puff_cadence: if it>it_start and it <= puff_max_it and (it-1)%puff_cadence ==0: # we made a puffball last iteration, so run it through ILE now print(" ILE jobs for puffball on iteration ", it) - add_batch_ILE_nodes_to_dag(dag, ilePuff_job, parent_fit_node, con_node, indx_max, n_group_here, it, n_retries=opts.ile_retries,node_list_dict=ile_node_list_per_iteration) + add_batch_ILE_nodes_to_dag(dag, ilePuff_job, parent_fit_node, con_node, indx_max, n_group_here, it, n_retries=opts.ile_retries,node_list_dict=ile_node_list_per_iteration,extra_parent_nodes=[last_puff_node]) # for event in np.arange(indx_max): # ile_node = pipeline.CondorDAGNode(ilePuff_job) # only difference is here: uses puffball, which by construction is the same size/ perturbed points # ile_node.set_retry(opts.ile_retries) @@ -1980,7 +1992,7 @@ for it in np.arange(it_start,opts.n_iterations): puff_node.add_parent(parent_fit_node) # only fit if we have results from the previous iteration dag.add_node(puff_node) - parent_fit_node = puff_node + last_puff_node = puff_node