1616 fail-fast : false # as want clouds to continue independently
1717 concurrency : ${{ github.ref }} # to branch/PR
1818 runs-on : ubuntu-20.04
# Job-level environment, defined once here instead of per step.
env:
  # Quoted so the value stays the literal string 'True' instead of being
  # parsed as a YAML boolean before it is exported to the environment.
  ANSIBLE_FORCE_COLOR: 'True'
  OS_CLOUD: openstack
  # Cluster name is derived from the workflow run id.
  TF_VAR_cluster_name: ci${{ github.run_id }}
1923 steps :
2024 - uses : actions/checkout@v2
2125
2630 echo "${${{ matrix.cloud }}_SSH_KEY}" > ~/.ssh/id_rsa
2731 chmod 0600 ~/.ssh/id_rsa
2832 env :
29- smslabs_SSH_KEY : ${{ secrets.SSH_KEY }}
3033 arcus_SSH_KEY : ${{ secrets.ARCUS_SSH_KEY }}
3134
3235 - name : Add bastion's ssh key to known_hosts
@@ -49,19 +52,14 @@ jobs:
4952 echo "${${{ matrix.cloud }}_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
5053 shell : bash
5154 env :
52- smslabs_CLOUDS_YAML : ${{ secrets.CLOUDS_YAML }}
5355 arcus_CLOUDS_YAML : ${{ secrets.ARCUS_CLOUDS_YAML }}
5456
# First terraform pass: TF_VAR_create_nodes=false is set for this single
# `terraform apply` invocation only. OS_CLOUD and TF_VAR_cluster_name are
# taken from the job-level env rather than a step-level env block.
# NOTE(review): APPLIANCES_ENVIRONMENT_ROOT is presumably exported by the
# sourced environment activate script - confirm.
- name: Provision ports, inventory and other infrastructure apart from nodes
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
    TF_VAR_create_nodes=false terraform apply -auto-approve
6563
6664 - name : Setup environment-specific inventory/terraform inputs
6765 run : |
7169 echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
7270 ansible-playbook ansible/adhoc/template-cloud-init.yml
7371 env :
74- ANSIBLE_FORCE_COLOR : True
7572 TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
7673
7774 - name : Provision servers
8178 . environments/${{ matrix.cloud }}/activate
8279 cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
8380 terraform apply -auto-approve
84- env :
85- OS_CLOUD : openstack
86- TF_VAR_cluster_name : ci${{ github.run_id }}
8781
8882 - name : Get server provisioning failure messages
8983 id : provision_failure
9488 TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
9589 echo TF failure messages: $TF_FAIL_MSGS
9690 echo "::set-output name=messages::${TF_FAIL_MSGS}"
97- env :
98- OS_CLOUD : openstack
99- TF_VAR_cluster_name : ci${{ github.run_id }}
10091 if : always() && steps.provision_servers.outcome == 'failure'
10192
10293 - name : Delete infrastructure if failed due to lack of hosts
@@ -105,29 +96,21 @@ jobs:
10596 . environments/${{ matrix.cloud }}/activate
10697 cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
10798 terraform destroy -auto-approve
108- env :
109- OS_CLOUD : openstack
110- TF_VAR_cluster_name : ci${{ github.run_id }}
# Runs only when server provisioning failed AND the failure messages published
# by the `provision_failure` step mention a lack of hosts. Step outputs must be
# read through `steps.<id>.outputs.<name>`; without `.outputs` the expression
# is empty, contains() is always false, and this cleanup never runs.
if: ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }}
112100
# Waits until every host answers, runs the site playbook, then runs the
# Slurm check playbook. Both playbooks run at -v verbosity.
- name: Directly configure cluster
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible all -m wait_for_connection
    ansible-playbook -v ansible/site.yml
    ansible-playbook -v ansible/ci/check_slurm.yml
122108
# Runs the hpctests ad-hoc playbook at -vv verbosity. No step-level env:
# ANSIBLE_FORCE_COLOR and OS_CLOUD come from the job-level env.
- name: Run MPI-based tests
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible-playbook -vv ansible/adhoc/hpctests.yml
131114
132115 - name : Confirm Open Ondemand is up (via SOCKS proxy)
133116 run : |
@@ -161,62 +144,62 @@ jobs:
161144 TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
162145
# Builds images with packer, then runs output_manifest.py on the resulting
# packer-manifest.json. The step id lets later steps read the image ids via
# steps.packer_build.outputs.*.
- id: packer_build
  name: Build packer images
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    cd packer/
    PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
    ../dev/output_manifest.py packer-manifest.json # Sets NEW_{COMPUTE,CONTROL,LOGIN}_IMAGE_ID outputs
175154
# Rebuilds only the `login` group with the login image id published by the
# packer_build step, waits for those hosts to come back, then re-runs the
# Slurm check playbook.
- name: Test reimage of login nodes (via rebuild adhoc)
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible-playbook -v --limit login ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_LOGIN_IMAGE_ID }}
    ansible login -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
    ansible-playbook -v ansible/ci/check_slurm.yml
162+
# Issues `scontrol reboot` from the login group with the new compute image id
# embedded in the reboot reason, waits for the compute group to come back,
# then re-runs the Slurm check playbook.
# NOTE(review): the node range `-compute-[0-3]` is hard-coded - confirm it
# matches the cluster size defined for CI.
- name: Test reimage of compute nodes (via slurm)
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
    ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
    ansible-playbook -v ansible/ci/check_slurm.yml
170+
# Rebuilds only the `control` group with the control image id published by
# the packer_build step, waits for it to come back, re-applies the openhpc
# and prometheus tagged configuration, then re-runs the Slurm check playbook.
- name: Test reimage of control node (via rebuild adhoc)
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible-playbook -v --limit control ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_CONTROL_IMAGE_ID }}
    ansible control -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
    ansible-playbook ansible/slurm.yml --tags openhpc # configures partitions
    ansible-playbook ansible/monitoring.yml --tags prometheus # configures scrapes
    ansible-playbook -v ansible/ci/check_slurm.yml
185180
# Runs the check_sacct_hpctests CI playbook at -vv verbosity; environment
# variables come from the job-level env.
- name: Check sacct state survived reimage
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
194186
# Runs the check_grafana CI playbook at -vv verbosity; environment variables
# come from the job-level env.
- name: Check MPI-based tests are shown in Grafana
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible-playbook -vv ansible/ci/check_grafana.yml
203192
# Tears down the terraform-managed infrastructure. The condition makes this
# run when the job has succeeded so far or was cancelled; it does not run
# after a failed step.
- name: Delete infrastructure
  if: ${{ success() || cancelled() }}
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
    terraform destroy -auto-approve
214200
# Runs the delete_images CI playbook at -vv verbosity. With no `if:` the
# step runs only when all previous steps succeeded.
- name: Delete images
  run: |
    . venv/bin/activate
    . environments/${{ matrix.cloud }}/activate
    ansible-playbook -vv ansible/ci/delete_images.yml
0 commit comments