4343 TF_VAR_cluster_name : slurmci-${{ matrix.os_version }}-${{ github.run_number }}
4444 CI_CLOUD : ${{ vars.CI_CLOUD }} # default from repo settings
4545 TF_VAR_os_version : ${{ matrix.os_version }}
46+ STACKHPC_TF_DIR : environments/.stackhpc/tofu
4647 steps :
47- - uses : actions/checkout@v2
48+
# Look up the tag of the most recent published release and export it as
# LATEST_RELEASE_TAG (via $GITHUB_ENV) for the steps that follow.
- name: Find the latest release
  shell: bash
  run: |
    set -euo pipefail
    # --fail makes curl exit non-zero on HTTP errors (e.g. API rate limiting)
    # instead of handing an error document to jq.
    tag=$(curl --fail --silent --show-error \
      https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest \
      | jq -r .tag_name)
    # jq -r prints the literal string "null" when the field is absent.
    if [ -z "$tag" ] || [ "$tag" = "null" ]; then
      echo "::error::Could not determine the latest release tag"
      exit 1
    fi
    echo "LATEST_RELEASE_TAG=$tag" >> "$GITHUB_ENV"
52+
# Check out the repository at the tag held in LATEST_RELEASE_TAG.
- name: Checkout latest release
  uses: actions/checkout@v4
  with:
    # 0 = fetch full history rather than a shallow clone; presumably needed
    # so a later `git checkout` of another branch can succeed - confirm.
    fetch-depth: 0
    ref: ${{ env.LATEST_RELEASE_TAG }}
4858
4959 - name : Override CI_CLOUD if PR label is present
5060 if : ${{ github.event_name == 'pull_request' }}
6070 fi
6171 done
6272
# Print the release tag and target cloud in use to the job log.
- name: Record debug info
  run: |
    echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
    echo CI_CLOUD: $CI_CLOUD
6677
6778 - name : Setup ssh
6879 run : |
7687 run : cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
7788 shell : bash
7889
# Run the repository's environment setup script from the checked-out tree.
- name: Install ansible, pip and galaxy requirements
  run: dev/setup-env.sh
8192
8293 - name : Install OpenTofu
8697
# Initialise OpenTofu in the directory named by STACKHPC_TF_DIR.
- name: Initialise tofu
  working-directory: ${{ env.STACKHPC_TF_DIR }}
  run: tofu init
90101
91102 - name : Write clouds.yaml
92103 run : |
@@ -103,42 +114,90 @@ jobs:
103114 env :
104115 DEMO_USER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
105116
# Create the cluster infrastructure with OpenTofu, using the tfvars file
# for the selected CI cloud. The step id is referenced by the cleanup step's
# `if:` condition.
- id: provision_servers
  name: Provision nodes using latest release image
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd $STACKHPC_TF_DIR
    tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
113124
# Cleanup path: runs only when the job is failing AND the failure came from
# the provision_servers step, destroying whatever was partially created.
- name: Delete infrastructure if provisioning failed
  if: failure() && steps.provision_servers.outcome == 'failure'
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd $STACKHPC_TF_DIR
    tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
121132
# Configure the freshly provisioned cluster from the checked-out tree:
# wait for every inventory host to be reachable, apply site.yml, then run
# the check_slurm.yml CI playbook.
- name: Configure cluster at latest release
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible all -m wait_for_connection
    ansible-playbook -v ansible/site.yml
    ansible-playbook -v ansible/ci/check_slurm.yml
129140
# Run the hpctests playbook restricted to tasks tagged `pingpong`.
- name: Run MPI-based tests at latest release
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible-playbook -vv ansible/adhoc/hpctests.yml --tags pingpong
135146
136147 # - name: Run EESSI tests
137148 # run: |
138149 # . venv/bin/activate
139150 # . environments/.stackhpc/activate
140151 # ansible-playbook -vv ansible/ci/check_eessi.yml
141152
# Switch the working tree from the release tag to the branch under test:
# the PR head branch when there is one, otherwise the ref that triggered
# the run.
# NOTE(review): assumes that branch exists in the fetched repository
# (i.e. the PR is not from a fork) - confirm.
- name: Checkout current branch
  env:
    # A branch name is user-controlled input. Passing it through the
    # environment, rather than expanding the expression inside the script,
    # prevents it from being interpreted as shell code.
    CURRENT_REF: ${{ github.head_ref || github.ref_name }}
  run: git checkout "$CURRENT_REF"
155+
# Re-run the environment setup script, now from the current branch's tree.
- name: Update ansible, pip and galaxy requirements
  run: dev/setup-env.sh
158+
# Re-initialise and re-apply OpenTofu from the current branch's tree with
# the same tfvars file as the initial provisioning.
# NOTE(review): the step id is not referenced anywhere in the visible part
# of the workflow - confirm whether it is still needed.
- id: reimage_non_compute
  name: Reimage login and control nodes to image in current branch
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd $STACKHPC_TF_DIR
    tofu init
    tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
167+
# Same sequence as the release-time configuration, run after the switch to
# the current branch: wait for hosts, apply site.yml, run check_slurm.yml.
- name: Configure cluster using current branch
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible all -m wait_for_connection
    ansible-playbook -v ansible/site.yml
    ansible-playbook -v ansible/ci/check_slurm.yml
175+
# Run the reboot_via_slurm.yml ad-hoc playbook, then re-run the
# check_slurm.yml CI playbook.
- name: Reimage compute nodes to image in current branch using slurm - tests compute-init
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
    ansible-playbook -v ansible/ci/check_slurm.yml
182+
# Run the check_sacct_hpctests.yml CI playbook after the reimage steps.
- name: Check sacct state survived reimage to current branch
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
188+
# Run the check_grafana.yml CI playbook.
- name: Check MPI-based tests are shown in Grafana
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible-playbook -vv ansible/ci/check_grafana.yml
194+
# Run the hpctests playbook with no tag filter (the release-time run was
# limited to `--tags pingpong`).
- name: Run MPI-based tests again in current branch
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible-playbook -vv ansible/adhoc/hpctests.yml
200+
142201 - name : Confirm Open Ondemand is up (via SOCKS proxy)
143202 run : |
144203 . venv/bin/activate
@@ -170,43 +229,10 @@ jobs:
170229 env :
171230 DEMO_USER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
172231
173- - name : Test reimage of login and control nodes (via rebuild adhoc)
174- run : |
175- . venv/bin/activate
176- . environments/.stackhpc/activate
177- ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
178- ansible-playbook -v ansible/site.yml
179- ansible-playbook -v ansible/ci/check_slurm.yml
180-
181- - name : Test compute node reboot and compute-init
182- run : |
183- . venv/bin/activate
184- . environments/.stackhpc/activate
185- ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
186- ansible-playbook -v ansible/ci/check_slurm.yml
187-
188- - name : Check sacct state survived reimage
189- run : |
190- . venv/bin/activate
191- . environments/.stackhpc/activate
192- ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
193-
194- - name : Check MPI-based tests are shown in Grafana
195- run : |
196- . venv/bin/activate
197- . environments/.stackhpc/activate
198- ansible-playbook -vv ansible/ci/check_grafana.yml
199-
# Tear down the CI cluster once the job has succeeded or been cancelled.
# NOTE(review): the condition excludes failed runs, so infrastructure stays
# in place when a step after provisioning fails - presumably to allow
# debugging; confirm this is intended.
- name: Delete infrastructure
  if: ${{ success() || cancelled() }}
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd $STACKHPC_TF_DIR
    # Destroy stays best-effort (the step itself does not fail), but a
    # failure is raised as a workflow warning annotation so leftover cloud
    # resources are not hidden in the log of a green step.
    tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "::warning::tofu destroy failed in $STACKHPC_TF_DIR; cloud resources may need manual cleanup"
207-
208- # - name: Delete images
209- # run: |
210- # . venv/bin/activate
211- # . environments/.stackhpc/activate
212- # ansible-playbook -vv ansible/ci/delete_images.yml
0 commit comments