Skip to content

Commit 149bfdf

Browse files
authored
Fix nightly ci test timeout (#2399)
1 parent 09b6b35 commit 149bfdf

File tree

2 files changed

+143
-50
lines changed

2 files changed

+143
-50
lines changed

.circleci/config.yml

Lines changed: 90 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -25,47 +25,6 @@ commands:
2525
name: Login to Quay
2626
command: docker login -u=$QUAY_USERNAME -p=$QUAY_PASSWORD quay.io
2727

28-
install-e2e-dependencies:
29-
description: Install E2E Tests Dependencies
30-
steps:
31-
- run:
32-
name: Install Dependencies
33-
command: |
34-
pip install boto3 pyyaml awscli
35-
pip install -e ./test/e2e
36-
pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
37-
38-
run-e2e-tests:
39-
description: Creates a temporary cluster and runs the cortex E2E tests
40-
parameters:
41-
config:
42-
description: cluster config file path
43-
type: string
44-
default: ./cluster.yaml
45-
slack_channel:
46-
description: "slack channel where failed builds will be posted (should start with #)"
47-
type: string
48-
default: "#builds"
49-
steps:
50-
- run:
51-
name: Create Cluster
52-
command: cortex cluster up << parameters.config >> --configure-env cortex -y
53-
- run:
54-
name: Run E2E Tests
55-
no_output_timeout: 30m
56-
command: |
57-
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia --arm-nodegroups arm --skip-autoscaling --skip-load --skip-long-running
58-
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_autoscaling
59-
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_load
60-
- run:
61-
name: Delete Cluster
62-
command: cortex cluster down --config << parameters.config >> -y
63-
when: always
64-
- slack/notify:
65-
event: fail
66-
channel: << parameters.slack_channel >>
67-
template: basic_fail_1
68-
6928
jobs:
7029
lint:
7130
docker:
@@ -187,15 +146,14 @@ jobs:
187146
command: make ci-amend-images
188147
no_output_timeout: 20m
189148

190-
e2e-tests:
149+
cluster-up:
191150
docker:
192151
- image: cimg/python:3.6
193-
environment:
194-
CORTEX_TEST_BATCH_S3_PATH: s3://cortex-nightly-artifacts/test/jobs
195152
steps:
196-
- setup_remote_docker
197-
- checkout
198-
- install-e2e-dependencies
153+
- run:
154+
name: Install Cortex CLI
155+
command: |
156+
pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
199157
- run:
200158
name: Initialize Credentials
201159
command: |
@@ -240,8 +198,68 @@ jobs:
240198
- run:
241199
name: Verify configuration of credentials
242200
command: aws sts get-caller-identity | jq ".Arn" | grep "dev-cortex-nightly-us-east-1"
243-
- run-e2e-tests:
244-
config: ./cluster.yaml
201+
- run:
202+
name: Create Cluster
203+
command: cortex cluster up cluster.yaml --configure-env cortex -y
204+
- slack/notify:
205+
event: fail
206+
channel: "#builds"
207+
template: basic_fail_1
208+
209+
e2e-tests:
210+
docker:
211+
- image: cimg/python:3.6
212+
environment:
213+
CORTEX_TEST_BATCH_S3_PATH: s3://cortex-nightly-artifacts/test/jobs
214+
steps:
215+
- checkout
216+
- run:
217+
name: Install Dependencies
218+
command: |
219+
pip install boto3 pyyaml awscli
220+
pip install -e ./test/e2e
221+
pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
222+
- run:
223+
name: Initialize Credentials
224+
command: |
225+
echo 'export AWS_ACCESS_KEY_ID=${NIGHTLY_AWS_ACCESS_KEY_ID}' >> $BASH_ENV
226+
echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV
227+
- run:
228+
name: Configure Cortex CLI
229+
command: cortex env configure cortex --operator-endpoint $(python dev/get_operator_url.py cortex-nightly us-east-1)
230+
- run:
231+
name: Run E2E Tests
232+
no_output_timeout: 30m
233+
command: |
234+
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia --arm-nodegroups arm --skip-autoscaling --skip-load --skip-long-running
235+
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_autoscaling
236+
pytest -v test/e2e/tests --env cortex --x86-nodegroups spot,cpu,gpu,inferentia -k test_load
237+
- slack/notify:
238+
event: fail
239+
channel: "#builds"
240+
template: basic_fail_1
241+
242+
cluster-down:
243+
docker:
244+
- image: cimg/python:3.6
245+
steps:
246+
- run:
247+
name: Install Cortex CLI
248+
command: |
249+
pip install https://s3-us-west-2.amazonaws.com/get-cortex/master/python/cortex-master.tar.gz
250+
- run:
251+
name: Initialize Credentials
252+
command: |
253+
echo 'export AWS_ACCESS_KEY_ID=${NIGHTLY_AWS_ACCESS_KEY_ID}' >> $BASH_ENV
254+
echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV
255+
- run:
256+
name: Delete Cluster
257+
command: cortex cluster down --name cortex-nightly --region us-east-1 -y
258+
when: always
259+
- slack/notify:
260+
event: fail
261+
channel: "#builds"
262+
template: basic_fail_1
245263

246264
workflows:
247265
build:
@@ -294,13 +312,35 @@ workflows:
294312
- master
295313
- /^[0-9]+\.[0-9]+$/
296314

297-
nightly:
315+
nightly-cluster-up:
298316
triggers:
299317
- schedule:
300318
cron: "0 0 * * *"
301319
filters:
302320
branches:
303321
only:
304322
- master
323+
jobs:
324+
- cluster-up
325+
326+
nightly-e2e-tests:
327+
triggers:
328+
- schedule:
329+
cron: "0 1 * * *"
330+
filters:
331+
branches:
332+
only:
333+
- master
305334
jobs:
306335
- e2e-tests
336+
337+
nightly-cluster-down:
338+
triggers:
339+
- schedule:
340+
cron: "0 2 * * *"
341+
filters:
342+
branches:
343+
only:
344+
- master
345+
jobs:
346+
- cluster-down

dev/get_operator_url.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright 2021 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import sys
16+
import boto3
17+
18+
19+
def main():
    """Print the HTTPS URL of a cluster's operator load balancer.

    Reads the cluster name from ``sys.argv[1]`` and the AWS region from
    ``sys.argv[2]``, resolves the load balancer's DNS name with
    ``get_operator_url`` and writes ``https://<dns-name>`` to stdout.

    Raises:
        SystemExit: with a message (printed to stderr, exit status 1) when
            fewer than two arguments are given or when no matching load
            balancer is found.
    """
    # Extra trailing arguments are tolerated, as before; only a missing
    # CLUSTER_NAME or REGION is an error.
    if len(sys.argv) < 3:
        sys.exit("usage: python get_operator_url.py CLUSTER_NAME REGION")

    cluster_name, region = sys.argv[1], sys.argv[2]
    operator_url = get_operator_url(cluster_name, region)

    # get_operator_url yields None when nothing matches; concatenating that
    # with a string would otherwise surface as an unhelpful TypeError.
    if operator_url is None:
        sys.exit(
            "error: no operator load balancer found for cluster "
            + repr(cluster_name)
            + " in region "
            + repr(region)
        )

    print("https://" + operator_url)
24+
25+
26+
def get_operator_url(cluster_name, region):
    """Return the DNS name of a cluster's operator load balancer.

    Pages through the ELBv2 load balancers in ``region`` and returns the
    ``DNSName`` of the first one tagged with both
    ``cortex.dev/cluster-name=<cluster_name>`` and
    ``cortex.dev/load-balancer=operator``.

    Args:
        cluster_name: value expected in the ``cortex.dev/cluster-name`` tag.
        region: AWS region name passed to the ELBv2 client.

    Returns:
        The load balancer's DNS name (without a scheme), or ``None`` when no
        load balancer carries both tags.
    """
    client_elbv2 = boto3.client("elbv2", region_name=region)
    wanted_tags = (
        ("cortex.dev/cluster-name", cluster_name),
        ("cortex.dev/load-balancer", "operator"),
    )

    paginator = client_elbv2.get_paginator("describe_load_balancers")
    # Pages are kept to 20 entries so that each page's ARNs fit in a single
    # describe_tags call (the API documents a limit of 20 resources per call).
    for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}):
        load_balancers = {
            load_balancer["LoadBalancerArn"]: load_balancer
            for load_balancer in load_balancer_page["LoadBalancers"]
        }
        # ResourceArns is a required parameter of describe_tags; don't call it
        # with an empty list when a page holds no load balancers.
        if not load_balancers:
            continue

        tag_descriptions = client_elbv2.describe_tags(ResourceArns=list(load_balancers))[
            "TagDescriptions"
        ]
        for tag_description in tag_descriptions:
            present_tags = {
                (tag["Key"], tag.get("Value")) for tag in tag_description["Tags"]
            }
            if all(wanted in present_tags for wanted in wanted_tags):
                return load_balancers[tag_description["ResourceArn"]]["DNSName"]

    return None
49+
50+
51+
# Usage: python get_operator_url.py CLUSTER_NAME REGION
# Prints the cluster operator's endpoint to stdout as an https:// URL.
52+
if __name__ == "__main__":
53+
main()

0 commit comments

Comments
 (0)