Skip to content

Commit 6d21784

Browse files
authored
ICON4Py check: apps folder & uenv agnostic (#464)
1 parent a02a563 commit 6d21784

File tree

7 files changed: 236 additions and 128 deletions

checks/microbenchmarks/gpu/gpu_benchmarks/icon4py.py renamed to checks/apps/icon4py/icon4py_check.py

Lines changed: 41 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,31 @@
33
#
44
# SPDX-License-Identifier: BSD-3-Clause
55

6-
import os
7-
86
import reframe as rfm
97
import reframe.utility.sanity as sn
108

119

1210
@rfm.simple_test
1311
class ICON4PyBenchmarks(rfm.RunOnlyRegressionTest):
14-
descr = 'ICON4Py GPU benchmarks -Diffusion & DyCore Granules-'
12+
descr = 'ICON4Py Check -Diffusion & DyCore Granules-'
1513
maintainers = ['SSA']
1614
valid_systems = ['+uenv +amdgpu', '+uenv +nvgpu']
17-
valid_prog_environs = ['+uenv +prgenv +rocm', '+uenv +prgenv +cuda']
15+
valid_prog_environs = ['+uenv +rocm', '+uenv +cuda']
1816
tags = {'uenv', 'bencher'}
19-
time_limit = '30m'
17+
time_limit = '60m'
2018
build_locally = False
2119
env_vars = {
20+
'ICON4PY_PYTHON_VERSION': '3.11',
21+
'UV_NO_CACHE': '1',
2222
'UV_CACHE_DIR': '$SCRATCH/.cache/uv',
2323
'CC': '$(which gcc)',
2424
'MPICH_CC': '$(which gcc)',
2525
'CXX': '$(which g++)',
2626
'MPICH_CXX': '$(which g++)',
27+
'HUGETLB_ELFMAP': 'no',
28+
'HUGETLB_MORECORE': 'no',
29+
'GT4PY_UNSTRUCTURED_HORIZONTAL_HAS_UNIT_STRIDE': '1',
30+
'PYTHONOPTIMIZE': '2',
2731
# GT4Py cache does not work properly for dace backend yet
2832
# 'GT4PY_BUILD_CACHE_LIFETIME': 'persistent',
2933
# 'GT4PY_BUILD_CACHE_DIR': '...',
@@ -32,36 +36,33 @@ class ICON4PyBenchmarks(rfm.RunOnlyRegressionTest):
3236
executable_opts = ['2>&1']
3337

3438
@run_before('run')
35-
def install_deps(self):
36-
self.prerun_cmds = ['./_install.sh &> _install.sh.log 2>&1']
37-
38-
@run_before('run')
39-
def prepare_run(self):
40-
if 'rocm' in self.current_environ.features:
41-
gpu_arch = self.current_partition.select_devices('gpu')[0].arch
39+
def prepare_env(self):
40+
gpu_arch = self.current_partition.select_devices('gpu')[0].arch
41+
if 'gfx' in gpu_arch: # AMD GPU
4242
self.env_vars['CUPY_INSTALL_USE_HIP'] = '1'
4343
self.env_vars['HCC_AMDGPU_TARGET'] = gpu_arch
4444
self.env_vars['ROCM_HOME'] = '/user-environment/env/default'
45+
# elif 'sm_' in gpu_arch: # CUDA GPU
46+
47+
@run_before('run')
48+
def install_deps(self):
49+
self.prerun_cmds = ['./_install.sh &> _install.sh.log 2>&1']
4550

4651
@sanity_function
4752
def validate_test(self):
48-
rfm_stop = os.getenv('RFM_ICON4PY_STOP')
49-
if rfm_stop == 'Y':
50-
diffusion_granule = sn.assert_found('# INFO:', self.stdout)
51-
else:
52-
diffusion_granule = sn.assert_found(
53-
(r'^\s*model/atmosphere/diffusion/tests/'
54-
r'diffusion/integration_tests/'
55-
r'test_benchmark_diffusion\.py'
56-
r'::test_diffusion_benchmark\s*PASSED'
57-
), self.stdout)
58-
# dycore_granule = sn.assert_found(
59-
# (r'^\s*model/atmosphere/dycore/tests/'
60-
# r'dycore/integration_tests/test_benchmark_solve_nonhydro\.py'
61-
# r'::test_benchmark_solve_nonhydro\[True-False\]\s*PASSED'),
62-
# self.stdout)
53+
diffusion_granule = sn.assert_found(
54+
(r'^\s*model/atmosphere/diffusion/tests/'
55+
r'diffusion/integration_tests/'
56+
r'test_benchmark_diffusion\.py'
57+
r'::test_diffusion_benchmark\s*PASSED'
58+
), self.stdout)
59+
dycore_granule = sn.assert_found(
60+
(r'^\s*model/atmosphere/dycore/tests/'
61+
r'dycore/integration_tests/test_benchmark_solve_nonhydro\.py'
62+
r'::test_benchmark_solve_nonhydro\[True-False\]\s*PASSED'
63+
), self.stdout)
6364

64-
return diffusion_granule # and dycore_granule
65+
return diffusion_granule and dycore_granule
6566

6667
@run_before('performance')
6768
def set_perf_vars(self):
@@ -76,23 +77,23 @@ def set_perf_vars(self):
7677
diffusion_granule_mean = sn.extractsingle(
7778
diffusion_regex, self.stdout, 'mean', float)
7879

79-
# dycore_regex = (
80-
# r'^\s*test_benchmark_solve_nonhydro\[True-False\]\s+'
81-
# r'(?P<min>\d+(?:\.\d+)?)' # Min
82-
# r'(?:\s+\([^)]+\))?\s+' # optional '(...)'
83-
# r'(?P<max>\d+(?:\.\d+)?)' # Max
84-
# r'(?:\s+\([^)]+\))?\s+' # optional '(...)'
85-
# r'(?P<mean>\d+(?:\.\d+)?)' # Mean
86-
# )
87-
# dycore_granule_mean = sn.extractsingle(
88-
# dycore_regex, self.stdout, 'mean', float)
80+
dycore_regex = (
81+
r'^\s*test_benchmark_solve_nonhydro\[True-False\]\s+'
82+
r'(?P<min>\d+(?:\.\d+)?)' # Min
83+
r'(?:\s+\([^)]+\))?\s+' # optional '(...)'
84+
r'(?P<max>\d+(?:\.\d+)?)' # Max
85+
r'(?:\s+\([^)]+\))?\s+' # optional '(...)'
86+
r'(?P<mean>\d+(?:\.\d+)?)' # Mean
87+
)
88+
dycore_granule_mean = sn.extractsingle(
89+
dycore_regex, self.stdout, 'mean', float)
8990

9091
self.perf_variables = {
9192
'diffusion_granule':
9293
sn.make_performance_function(diffusion_granule_mean, 'ms'),
9394
#
94-
# 'dycore_granule':
95-
# sn.make_performance_function(dycore_granule_mean, 'ms'),
95+
'dycore_granule':
96+
sn.make_performance_function(dycore_granule_mean, 'ms'),
9697
}
9798

9899
# TODO: add ref. (https://github.com/eth-cscs/cscs-reframe-tests/pull/440)
checks/apps/icon4py/src/_install.sh

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/bin/bash
2+
3+
date
4+
echo "# SLURM_JOBID=$SLURM_JOBID"
5+
6+
unset PYTHONPATH
7+
8+
git clone https://github.com/C2SM/icon4py.git
9+
cd icon4py
10+
git checkout 5485bcacb1dbc7688b1e7d276d4e2e28362c5444 # Commit: Update to GT4Py v1.1.0 (#933)
11+
12+
# Install uv locally
13+
curl -LsSf https://astral.sh/uv/install.sh | UV_UNMANAGED_INSTALL="$PWD/bin" sh
14+
export PATH="$PWD/bin:$PATH"
15+
16+
# Install ICON4Py
17+
HOME="$PWD/_home" uv python install $ICON4PY_PYTHON_VERSION
18+
uv sync --extra all --python $PWD/_home/.local/bin/python$ICON4PY_PYTHON_VERSION
19+
20+
# Activate virtual environment
21+
source .venv/bin/activate
22+
23+
# Compatibility for both Daint & Beverin
24+
mpi4py_ver=$(uv pip show mpi4py | awk '/Version:/ {print $2}')
25+
uv pip uninstall mpi4py && uv pip install --no-binary mpi4py "mpi4py==$mpi4py_ver"
26+
uv pip install git+https://github.com/cupy/cupy.git
27+
28+
################################################################################
29+
# NVHPC runtime auto-discovery for serialbox (libnvhpcatm.so)
30+
################################################################################
31+
nvhpc_lib="$(find /user-environment -type f -name 'libnvhpcatm.so' 2>/dev/null | head -n1 || true)"
32+
if [ -n "$nvhpc_lib" ]; then
33+
nvhpc_dir="$(dirname "$nvhpc_lib")"
34+
35+
# Export for the current script run (dedupe-safe)
36+
case ":$LD_LIBRARY_PATH:" in
37+
*":$nvhpc_dir:"*) : ;;
38+
*) export LD_LIBRARY_PATH="${nvhpc_dir}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" ;;
39+
esac
40+
41+
echo "# Found libnvhpcatm.so at: $nvhpc_lib"
42+
echo "# Adding NVHPC lib dir to LD_LIBRARY_PATH: $nvhpc_dir"
43+
else
44+
echo "LD_LIBRARY_PATH not modified for NVHPC (libnvhpcatm.so not found). Is this expected?"
45+
fi
46+
47+
# Persist a dynamic NVHPC discovery into .venv/bin/activate so that every
48+
# future activation re-discovers libnvhpcatm.so (if present on that cluster).
49+
cat >> .venv/bin/activate <<'EOF'
50+
51+
# Added automatically to make serialbox work (needs libnvhpcatm.so)
52+
nvhpc_lib=$(find /user-environment -type f -name 'libnvhpcatm.so' 2>/dev/null | head -n1 || true)
53+
if [ -n "$nvhpc_lib" ]; then
54+
nvhpc_dir=$(dirname "$nvhpc_lib")
55+
if [ -d "$nvhpc_dir" ]; then
56+
case ":$LD_LIBRARY_PATH:" in
57+
*:"$nvhpc_dir":*) : ;;
58+
*) export LD_LIBRARY_PATH="$nvhpc_dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" ;;
59+
esac
60+
fi
61+
fi
62+
63+
EOF
64+
65+
################################################################################
66+
# Serialbox / libstdc++ auto-discovery
67+
################################################################################
68+
69+
# 1) Fix for the current script run: find libSerialboxC.so, ask ldd which
70+
# libstdc++.so it uses, and prepend that directory to LD_LIBRARY_PATH.
71+
serialbox_so="$(find "$PWD/.venv" -maxdepth 7 -type f -name 'libSerialboxC.so*' 2>/dev/null | head -n1 || true)"
72+
if [ -n "$serialbox_so" ] && [ -f "$serialbox_so" ]; then
73+
libstdcpp_path="$(ldd "$serialbox_so" 2>/dev/null | awk '/libstdc\+\+\.so/ {print $3; exit}')"
74+
if [ -n "$libstdcpp_path" ] && [ -f "$libstdcpp_path" ]; then
75+
libstdcpp_dir="$(dirname "$libstdcpp_path")"
76+
77+
case ":$LD_LIBRARY_PATH:" in
78+
*":$libstdcpp_dir:"*) : ;;
79+
*) export LD_LIBRARY_PATH="${libstdcpp_dir}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" ;;
80+
esac
81+
82+
echo "# Serialbox library : $serialbox_so"
83+
echo "# Serialbox libstdc++ : $libstdcpp_path"
84+
echo "# Adding libstdc++ dir to LD_LIBRARY_PATH: $libstdcpp_dir"
85+
else
86+
echo "# WARNING: Could not determine libstdc++.so used by $serialbox_so"
87+
fi
88+
else
89+
echo "# NOTE: libSerialboxC.so not found under .venv yet (is serialbox installed?)."
90+
fi
91+
92+
# 2) Persist this logic into .venv/bin/activate so every future activation
93+
# automatically discovers Serialbox and its libstdc++ and updates LD_LIBRARY_PATH.
94+
cat >> .venv/bin/activate <<'EOF'
95+
96+
# Added automatically so Serialbox can always find the right libstdc++
97+
if [ -n "$VIRTUAL_ENV" ]; then
98+
serialbox_so=$(find "$VIRTUAL_ENV" -maxdepth 7 -type f -name 'libSerialboxC.so*' 2>/dev/null | head -n 1)
99+
if [ -n "$serialbox_so" ] && [ -f "$serialbox_so" ]; then
100+
libstdcpp_path=$(ldd "$serialbox_so" 2>/dev/null | awk '/libstdc\+\+\.so/ {print $3; exit}')
101+
if [ -n "$libstdcpp_path" ] && [ -f "$libstdcpp_path" ]; then
102+
libstdcpp_dir=$(dirname "$libstdcpp_path")
103+
case ":$LD_LIBRARY_PATH:" in
104+
*:"$libstdcpp_dir":*) : ;;
105+
*) export LD_LIBRARY_PATH="$libstdcpp_dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" ;;
106+
esac
107+
fi
108+
fi
109+
fi
110+
111+
EOF
112+
113+
################################################################################
114+
115+
echo "# install done"
116+
date

checks/apps/icon4py/src/_run.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/bin/bash
2+
3+
date
4+
5+
unset PYTHONPATH
6+
7+
cd icon4py
8+
source .venv/bin/activate
9+
10+
pytest -v \
11+
-m continuous_benchmarking \
12+
--benchmark-only \
13+
--benchmark-warmup=on \
14+
--benchmark-warmup-iterations=30 \
15+
--benchmark-json=icon4py_benchmarks.json \
16+
--backend=dace_gpu \
17+
--grid=icon_benchmark_regional \
18+
--benchmark-time-unit=ms \
19+
model/atmosphere/diffusion/tests/diffusion/integration_tests/test_benchmark_diffusion.py::test_diffusion_benchmark \
20+
model/atmosphere/dycore/tests/dycore/integration_tests/test_benchmark_solve_nonhydro.py::test_benchmark_solve_nonhydro[True-False]
21+
echo
22+
23+
# Cleanup
24+
deactivate
25+
cd ..
26+
rm -rf icon4py
27+
28+
echo "# run done"
29+
date

checks/microbenchmarks/gpu/gpu_benchmarks/src/_install.sh

Lines changed: 0 additions & 37 deletions
This file was deleted.

checks/microbenchmarks/gpu/gpu_benchmarks/src/_run.sh

Lines changed: 0 additions & 44 deletions
This file was deleted.

ci/alps_uenv.yml

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,8 @@ reframe:
1010
extends: .f7t-baremetal-runner
1111
timeout: 8 hours
1212
# needs: ['setup and pull']
13-
rules:
14-
- if: '$BENCHER_API_TOKEN == null'
15-
variables:
16-
SLURM_TIMELIMIT: '02:00:00'
17-
- if: '$BENCHER_API_TOKEN'
18-
variables:
19-
SLURM_TIMELIMIT: '01:00:00'
13+
variables:
14+
SLURM_TIMELIMIT: '02:00:00'
2015
before_script:
2116
- echo "FIRECREST_SYSTEM=$FIRECREST_SYSTEM / F7T_URL=$F7T_URL / CLUSTER_NAME=$CLUSTER_NAME"
2217
- uname -a

0 commit comments

Comments
 (0)