diff --git a/.buildkite/clima_gpu_pipeline.yml b/.buildkite/clima_gpu_pipeline.yml
index 8f2f78a1f..3ca8a752d 100644
--- a/.buildkite/clima_gpu_pipeline.yml
+++ b/.buildkite/clima_gpu_pipeline.yml
@@ -10,7 +10,6 @@ env:
   OMPI_MCA_opal_warn_on_missing_libcuda: 0
   SLURM_KILL_BAD_EXIT: 1
   SLURM_GRES_FLAGS: "allow-task-sharing"
-  JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/default"

 steps:
   - label: "init :computer:"
diff --git a/src/backends.jl b/src/backends.jl
index 186df9513..c8cd3060e 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -186,10 +186,36 @@ function module_load_string(::DerechoBackend)
 end

 function module_load_string(::GCPBackend)
-    return """export OPAL_PREFIX="/sw/openmpi-5.0.5"
-    export PATH="/sw/openmpi-5.0.5/bin:\$PATH"
-    export LD_LIBRARY_PATH="/sw/openmpi-5.0.5/lib:\$LD_LIBRARY_PATH"
-    export UCX_MEMTYPE_CACHE=y # UCX Memory optimization which toggles whether UCX library intercepts cu*alloc* calls
+    return """
+    unset CUDA_ROOT
+    unset NVHPC_CUDA_HOME
+    unset CUDA_INC_DIR
+    unset CPATH
+    unset NVHPC_ROOT
+
+    # NVHPC and HPC-X paths
+    export NVHPC="/sw/nvhpc/Linux_x86_64/24.5"
+    export HPCX_PATH="\${NVHPC}/comm_libs/12.4/hpcx/hpcx-2.19"
+
+    # CUDA environment
+    export CUDA_HOME="\${NVHPC}/cuda/12.4"
+    export CUDA_PATH="\${CUDA_HOME}"
+    export CUDA_ROOT="\${CUDA_HOME}"
+
+    # MPI via MPIwrapper
+    export MPITRAMPOLINE_LIB="/sw/mpiwrapper/lib/libmpiwrapper.so"
+    export OPAL_PREFIX="\${HPCX_PATH}/ompi"
+
+    # Library paths - CUDA first, then HPC-X
+    export LD_LIBRARY_PATH="\${CUDA_HOME}/lib64:\${HPCX_PATH}/ompi/lib\${LD_LIBRARY_PATH:+:\${LD_LIBRARY_PATH}}"
+
+    # Executable paths
+    export PATH="/sw/mpiwrapper/bin:\${CUDA_HOME}/bin:\${PATH}"
+    export PATH="\${NVHPC}/profilers/Nsight_Systems/target-linux-x64:\${PATH}"
+
+    # Julia
+    export PATH="/sw/julia/julia-1.11.5/bin:\${PATH}"
+    export JULIA_MPI_HAS_CUDA="true"
     """
 end

diff --git a/src/slurm.jl b/src/slurm.jl
index a6081113f..38517062f 100644
--- a/src/slurm.jl
+++ b/src/slurm.jl
@@ -212,8 +212,7 @@ function generate_sbatch_script(
     climacomms_device = gpus_per_task > 0 ? "CUDA" : "CPU"
     # TODO: Remove this exception for GCP
     mpiexec_string =
-        get_backend() == GCPBackend ?
-        "/sw/openmpi-5.0.5/bin/mpiexec -n $ntasks" :
+        get_backend() == GCPBackend ? "mpiexec -n $ntasks" :
         "srun --output=$member_log --open-mode=append"
     sbatch_contents = """
     #!/bin/bash
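
Not part of the patch: a minimal sketch, under the assumption that the GCP image exposes the toolchain paths hard-coded in module_load_string(::GCPBackend) above, for checking those paths from a plain Julia session before launching a pipeline run. It uses only Base Julia; the path list is copied verbatim from the hunk, and the check itself is illustrative rather than something this repository provides.

# Hedged sanity check (illustrative, not part of the diff): confirm the
# NVHPC/HPC-X/MPIwrapper/Julia paths referenced in module_load_string(::GCPBackend)
# exist on the GCP cluster image.
paths = [
    "/sw/nvhpc/Linux_x86_64/24.5/cuda/12.4",                            # CUDA_HOME
    "/sw/nvhpc/Linux_x86_64/24.5/comm_libs/12.4/hpcx/hpcx-2.19/ompi",   # OPAL_PREFIX
    "/sw/mpiwrapper/lib/libmpiwrapper.so",                              # MPITRAMPOLINE_LIB
    "/sw/julia/julia-1.11.5/bin/julia",                                 # Julia binary put on PATH
]
for p in paths
    println(rpad(p, 70), ispath(p) ? "ok" : "MISSING")
end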