Skip to content

Commit c2033be

Browse files
authored
Wait for load balancers to be ready when creating cluster (#1431)
1 parent b1db1ee commit c2033be

File tree

6 files changed

+274
-62
lines changed

6 files changed

+274
-62
lines changed

manager/debug.sh

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
set +e
18+
1719
CORTEX_VERSION_MINOR=master
1820

1921
debug_out_path="$1"
@@ -31,21 +33,35 @@ echo -n "gathering cluster data"
3133

3234
mkdir -p /cortex-debug/k8s
3335
for resource in pods pods.metrics nodes nodes.metrics daemonsets deployments hpa services virtualservices gateways ingresses configmaps jobs replicasets events; do
34-
kubectl describe $resource --all-namespaces &>/dev/null > "/cortex-debug/k8s/${resource}"
35-
kubectl get $resource --all-namespaces &>/dev/null > "/cortex-debug/k8s/${resource}-list"
36+
kubectl describe $resource --all-namespaces > "/cortex-debug/k8s/${resource}" 2>&1
37+
kubectl get $resource --all-namespaces > "/cortex-debug/k8s/${resource}-list" 2>&1
3638
echo -n "."
3739
done
3840

3941
mkdir -p /cortex-debug/logs
40-
kubectl get pods --all-namespaces -o json | jq '.items[] | "kubectl logs -n \(.metadata.namespace) \(.metadata.name) --all-containers --timestamps --tail=10000 &>/dev/null > /cortex-debug/logs/\(.metadata.namespace).\(.metadata.name) && echo -n ."' | xargs -n 1 bash -c
42+
kubectl get pods --all-namespaces -o json | jq '.items[] | "kubectl logs -n \(.metadata.namespace) \(.metadata.name) --all-containers --timestamps --tail=10000 > /cortex-debug/logs/\(.metadata.namespace).\(.metadata.name) 2>&1 && echo -n ."' | xargs -n 1 bash -c
4143

42-
kubectl top pods --all-namespaces --containers=true &>/dev/null > "/cortex-debug/k8s/top_pods"
43-
kubectl top nodes &>/dev/null > "/cortex-debug/k8s/top_nodes"
44+
kubectl top pods --all-namespaces --containers=true > "/cortex-debug/k8s/top_pods" 2>&1
45+
kubectl top nodes > "/cortex-debug/k8s/top_nodes" 2>&1
4446

4547
mkdir -p /cortex-debug/aws
46-
aws --region=$CORTEX_REGION autoscaling describe-auto-scaling-groups &>/dev/null > "/cortex-debug/aws/asgs"
48+
aws --region=$CORTEX_REGION autoscaling describe-auto-scaling-groups > "/cortex-debug/aws/asgs" 2>&1
49+
echo -n "."
50+
aws --region=$CORTEX_REGION autoscaling describe-scaling-activities > "/cortex-debug/aws/asg-activities" 2>&1
51+
echo -n "."
52+
python get_operator_load_balancer_state.py > "/cortex-debug/aws/operator_load_balancer_state" 2>&1
53+
python get_api_load_balancer_state.py > "/cortex-debug/aws/api_load_balancer_state" 2>&1
54+
python get_operator_target_group_status.py > "/cortex-debug/aws/operator_load_balancer_target_group_status" 2>&1
4755
echo -n "."
48-
aws --region=$CORTEX_REGION autoscaling describe-scaling-activities &>/dev/null > "/cortex-debug/aws/asg-activities"
56+
57+
mkdir -p /cortex-debug/misc
58+
operator_endpoint=$(kubectl -n=istio-system get service ingressgateway-operator -o json 2>/dev/null | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/')
59+
echo "$operator_endpoint" > /cortex-debug/misc/operator_endpoint
60+
if [ "$operator_endpoint" == "" ]; then
61+
echo "unable to get operator endpoint" > /cortex-debug/misc/operator_curl
62+
else
63+
curl -sv --max-time 5 "${operator_endpoint}/verifycortex" > /cortex-debug/misc/operator_curl 2>&1
64+
fi
4965
echo -n "."
5066

5167
(cd / && tar -czf cortex-debug.tgz cortex-debug)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2020 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import boto3
16+
import os
17+
18+
from helpers import get_api_load_balancer
19+
20+
21+
def get_api_load_balancer_state():
    """Return the state code of this cluster's API load balancer.

    The cluster name and AWS region are read from the ``CORTEX_CLUSTER_NAME``
    and ``CORTEX_REGION`` environment variables (in that order).

    Returns:
        The value stored under ``["State"]["Code"]`` of the load balancer
        description returned by ``get_api_load_balancer``.

    Raises:
        KeyError: if either environment variable is not set.
    """
    env = os.environ
    cluster_name, region = env["CORTEX_CLUSTER_NAME"], env["CORTEX_REGION"]

    elbv2 = boto3.client("elbv2", region_name=region)
    api_load_balancer = get_api_load_balancer(cluster_name, elbv2)

    state = api_load_balancer["State"]
    return state["Code"]
29+
30+
31+
if __name__ == "__main__":
    # Script entry point: write the state code to stdout with no trailing
    # newline (end="").
    api_state = get_api_load_balancer_state()
    print(api_state, end="")
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2020 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import boto3
16+
import os
17+
18+
from helpers import get_operator_load_balancer
19+
20+
21+
def get_operator_load_balancer_state():
    """Return the state code of this cluster's operator load balancer.

    The cluster name and AWS region are read from the ``CORTEX_CLUSTER_NAME``
    and ``CORTEX_REGION`` environment variables (in that order).

    Returns:
        The value stored under ``["State"]["Code"]`` of the load balancer
        description returned by ``get_operator_load_balancer``.

    Raises:
        KeyError: if either environment variable is not set.
    """
    env = os.environ
    cluster_name, region = env["CORTEX_CLUSTER_NAME"], env["CORTEX_REGION"]

    elbv2 = boto3.client("elbv2", region_name=region)
    operator_load_balancer = get_operator_load_balancer(cluster_name, elbv2)

    state = operator_load_balancer["State"]
    return state["Code"]
29+
30+
31+
if __name__ == "__main__":
    # Script entry point: write the state code to stdout with no trailing
    # newline (end="").
    operator_state = get_operator_load_balancer_state()
    print(operator_state, end="")
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Copyright 2020 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import boto3
16+
import os
17+
import json
18+
19+
from helpers import get_operator_load_balancer
20+
21+
22+
def get_operator_target_group_status():
    """Report the health of the operator load balancer's HTTPS target group.

    Reads ``CORTEX_CLUSTER_NAME`` and ``CORTEX_REGION`` from the environment
    (in that order), looks up the operator load balancer, resolves the target
    group behind its port-443 listener, and returns that group's health.

    Returns:
        Whatever ``get_target_health`` returns for the resolved target group.

    Raises:
        KeyError: if either environment variable is not set.
    """
    env = os.environ
    cluster_name, region = env["CORTEX_CLUSTER_NAME"], env["CORTEX_REGION"]

    elbv2 = boto3.client("elbv2", region_name=region)

    operator_load_balancer = get_operator_load_balancer(cluster_name, elbv2)
    https_target_group_arn = get_load_balancer_https_target_group_arn(
        operator_load_balancer["LoadBalancerArn"], elbv2
    )
    return get_target_health(https_target_group_arn, elbv2)
31+
32+
33+
def get_load_balancer_https_target_group_arn(load_balancer_arn, client_elbv2):
    """Return the ARN of the target group behind the load balancer's 443 listener.

    Pages through ``describe_listeners`` for ``load_balancer_arn`` and, for the
    first listener on port 443 that has one, returns the ``TargetGroupArn`` of
    its first default action that carries that field.

    Args:
        load_balancer_arn: ARN of the load balancer whose listeners are listed.
        client_elbv2: client object exposing ``get_paginator``.

    Returns:
        The target group ARN string.

    Raises:
        Exception: if no port-443 listener with a target group is found.
    """
    paginator = client_elbv2.get_paginator("describe_listeners")
    for listener_page in paginator.paginate(LoadBalancerArn=load_balancer_arn):
        for listener in listener_page["Listeners"]:
            if listener.get("Port") != 443:
                continue
            # Not every default action carries a TargetGroupArn, so scan the
            # actions instead of blindly indexing the first one; a listener
            # with no usable action falls through to the descriptive error
            # below rather than raising a bare KeyError/IndexError.
            for action in listener.get("DefaultActions", []):
                target_group_arn = action.get("TargetGroupArn")
                if target_group_arn:
                    return target_group_arn

    raise Exception(
        f"unable to find https target group for operator load balancer ({load_balancer_arn})"
    )
43+
44+
45+
def get_target_health(target_group_arn, client_elbv2):
    """Summarize the health of the targets registered in a target group.

    Args:
        target_group_arn: ARN passed to ``describe_target_health``.
        client_elbv2: client object exposing ``describe_target_health``.

    Returns:
        The string ``"healthy"`` if at least one target reports the
        ``"healthy"`` state; otherwise the JSON-encoded list of target health
        descriptions, so the caller can see why nothing is healthy.
    """
    descriptions = client_elbv2.describe_target_health(TargetGroupArn=target_group_arn)[
        "TargetHealthDescriptions"
    ]

    # A single healthy target is enough to report the group as healthy.
    if any(entry["TargetHealth"]["State"] == "healthy" for entry in descriptions):
        return "healthy"

    return json.dumps(descriptions)
52+
53+
54+
if __name__ == "__main__":
    # Script entry point: write the status to stdout with no trailing
    # newline (end="").
    target_group_status = get_operator_target_group_status()
    print(target_group_status, end="")

manager/helpers.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright 2020 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def get_operator_load_balancer(cluster_name, client_elbv2):
    """Return the load balancer tagged as the ``operator`` one for ``cluster_name``.

    Thin wrapper around ``_get_load_balancer`` that fixes the load balancer
    tag value to ``"operator"``.
    """
    return _get_load_balancer(
        load_balancer_tag="operator",
        cluster_name=cluster_name,
        client_elbv2=client_elbv2,
    )
18+
19+
20+
def get_api_load_balancer(cluster_name, client_elbv2):
    """Return the load balancer tagged as the ``api`` one for ``cluster_name``.

    Thin wrapper around ``_get_load_balancer`` that fixes the load balancer
    tag value to ``"api"``.
    """
    return _get_load_balancer(
        load_balancer_tag="api",
        cluster_name=cluster_name,
        client_elbv2=client_elbv2,
    )
22+
23+
24+
def _get_load_balancer(load_balancer_tag, cluster_name, client_elbv2):
25+
paginator = client_elbv2.get_paginator("describe_load_balancers")
26+
for load_balancer_page in paginator.paginate(PaginationConfig={"PageSize": 20}):
27+
load_balancers = {
28+
load_balancer["LoadBalancerArn"]: load_balancer
29+
for load_balancer in load_balancer_page["LoadBalancers"]
30+
}
31+
tag_descriptions = client_elbv2.describe_tags(ResourceArns=list(load_balancers.keys()))[
32+
"TagDescriptions"
33+
]
34+
for tag_description in tag_descriptions:
35+
foundClusterNameTag = False
36+
foundLoadBalancerTag = False
37+
for tags in tag_description["Tags"]:
38+
if tags["Key"] == "cortex.dev/cluster-name" and tags["Value"] == cluster_name:
39+
foundClusterNameTag = True
40+
if tags["Key"] == "cortex.dev/load-balancer" and tags["Value"] == load_balancer_tag:
41+
foundLoadBalancerTag = True
42+
if foundClusterNameTag and foundLoadBalancerTag:
43+
return load_balancers[tag_description["ResourceArn"]]
44+
45+
raise Exception(f"unable to find {load_balancer_tag} load balancer")

0 commit comments

Comments
 (0)