Commit d46feed

add canary related supporting infra (#23)
* helper scripts for canary
* update acktest
* buildspec and path changes
* bug fix after rebase
1 parent d1bfe3f commit d46feed

File tree

9 files changed (+255, -4 lines)


CODEOWNERS

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 # https://github.com/blog/2392-introducing-code-owners

 # Amazon SageMaker CodeOwners
-* @akartsky @jkuruba @mbaijal @RedbackThomson @surajkota
+* @akartsky @mbaijal @surajkota

test/canary/Dockerfile.canary

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
FROM ubuntu:18.04

# Build time parameters
ARG SERVICE=sagemaker

RUN apt-get update && apt-get install -y curl \
    wget \
    git \
    python3.8 \
    python3-pip \
    python3.8-dev \
    vim \
    sudo \
    jq \
    unzip

# Install awscli
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
    && unzip -qq awscliv2.zip \
    && ./aws/install

# Add yq repository and install yq
RUN apt-get update && apt install -y software-properties-common \
    && sudo add-apt-repository ppa:rmescandon/yq \
    && apt update && apt install -y yq

# Install kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.6/bin/linux/amd64/kubectl \
    && chmod +x ./kubectl \
    && cp ./kubectl /bin

# Install eksctl
RUN curl --silent --location "https://github.com/weaveworks/eksctl/releases/download/latest_release/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && mv /tmp/eksctl /bin

# Install Helm
RUN curl -q -L "https://get.helm.sh/helm-v3.2.4-linux-amd64.tar.gz" | tar zxf - -C /usr/local/bin/ \
    && mv /usr/local/bin/linux-amd64/helm /usr/local/bin/helm \
    && rm -r /usr/local/bin/linux-amd64 \
    && chmod +x /usr/local/bin/helm

ENV SERVICE_REPO_PATH=/$SERVICE-controller
COPY ./test/e2e/requirements.txt requirements.txt

RUN ln -s /usr/bin/python3.8 /usr/bin/python \
    && python -m pip install --upgrade pip

RUN python -m pip install -r requirements.txt

WORKDIR /$SERVICE_REPO_PATH
CMD ["./test/canary/scripts/run_test.sh"]
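
For quick local iteration, a minimal sketch (not part of the commit; the ack-canary:local tag is an assumption) of building the image from the repository root and opening a shell instead of letting the default CMD run the canary:

# Hypothetical local tag; the default CMD runs test/canary/scripts/run_test.sh
docker build -f ./test/canary/Dockerfile.canary -t ack-canary:local --build-arg SERVICE=sagemaker .
docker run --rm -it --entrypoint /bin/bash ack-canary:local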

test/canary/canary.buildspec.yaml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
version: 0.2

phases:
  pre_build:
    commands:
      # Make all shell scripts executable. This is required when running code copied from S3
      - find ./ -type f -name "*.sh" -exec chmod +x {} \;

      # Get cached test image
      - aws ecr get-login-password --region $CLUSTER_REGION | docker login --username AWS --password-stdin $ECR_CACHE_URI || true
      - docker pull ${ECR_CACHE_URI}:latest --quiet || true

      # Login to dockerhub to avoid hitting throttle limit
      - docker login -u $DOCKER_CONFIG_USERNAME -p $DOCKER_CONFIG_PASSWORD

      # Build test image
      - >
        docker build -f ./test/canary/Dockerfile.canary . -t ${ECR_CACHE_URI}:latest
        --build-arg SERVICE="${SERVICE##*/}" --quiet
        || echo "Docker Build Failed" || true
  build:
    commands:
      # Run tests
      - docker run --name ack-canary $(env | cut -f1 -d= | sed 's/^/-e /') --mount type=bind,source="$(pwd)/",target="/${SERVICE}-controller/" ${ECR_CACHE_URI}:latest

      # Push test image to cache ECR repo
      - docker push ${ECR_CACHE_URI}:latest || true
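
As a rough sketch of reproducing this flow outside CodeBuild (the variable values below are placeholders, not part of the commit), the buildspec boils down to exporting the variables run_test.sh reads and re-running its docker commands from the repository root:

# Placeholder values; in CodeBuild these arrive as project environment variables
export SERVICE=sagemaker SERVICE_REGION=us-west-2 CLUSTER_REGION=us-west-2
export CLUSTER_NAME=my-canary-cluster NAMESPACE=ack-system
export ECR_CACHE_URI=123456789012.dkr.ecr.us-west-2.amazonaws.com/ack-canary-cache

docker build -f ./test/canary/Dockerfile.canary . -t ${ECR_CACHE_URI}:latest --build-arg SERVICE="${SERVICE##*/}"
# Forward every exported variable and bind-mount the repository, as the build phase above does
docker run --name ack-canary $(env | cut -f1 -d= | sed 's/^/-e /') \
    --mount type=bind,source="$(pwd)/",target="/${SERVICE}-controller/" ${ECR_CACHE_URI}:latest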
test/canary/scripts/install_controller_helm.sh

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Deploy ACK Helm Charts

function install_helm_chart() {
  local service="$1"
  local oidc_role_arn="$2"
  local region="$3"
  local namespace="$4"

  yq w -i helm/values.yaml "serviceAccount.annotations" ""
  yq w -i helm/values.yaml 'serviceAccount.annotations."eks.amazonaws.com/role-arn"' "$oidc_role_arn"
  yq w -i helm/values.yaml "aws.region" $region

  kubectl create namespace $namespace
  helm install -n $namespace ack-$service-controller helm
}
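
A hypothetical standalone invocation (the role ARN, region, and namespace are placeholders; in this commit the function is only called from run_test.sh), followed by reading back the edited fields with yq v3's read command, matching the PPA install in the Dockerfile:

source test/canary/scripts/install_controller_helm.sh
install_helm_chart "sagemaker" "arn:aws:iam::123456789012:role/ack-oidc-role-example" "us-west-2" "ack-system"

# Inspect the values written by the yq edits above (yq v3 syntax)
yq r helm/values.yaml 'serviceAccount.annotations'
yq r helm/values.yaml 'aws.region'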

test/canary/scripts/run_test.sh

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
#!/bin/bash

# cleanup on EXIT regardless of error

# Inputs to this file as environment variables
# SERVICE
# SERVICE_REGION
# CLUSTER_REGION
# CLUSTER_NAME
# SERVICE_REPO_PATH
# NAMESPACE

set -euo pipefail
export NAMESPACE=${NAMESPACE:-"ack-system"}
export AWS_DEFAULT_REGION=$SERVICE_REGION
export E2E_DIR=$SERVICE_REPO_PATH/test/e2e/
SCRIPTS_DIR=${SERVICE_REPO_PATH}/test/canary/scripts

source $SCRIPTS_DIR/setup_oidc.sh
source $SCRIPTS_DIR/install_controller_helm.sh

function print_controller_logs() {
  pod_id=$( kubectl get pods -n $NAMESPACE --field-selector="status.phase=Running" \
    --sort-by=.metadata.creationTimestamp \
    | grep ack-sagemaker-controller | awk '{print $1}' 2>/dev/null )

  kubectl -n $NAMESPACE logs "$pod_id"
}

function cleanup {
  echo "Cleaning up resources"
  set +e
  kubectl delete endpoints.sagemaker --all
  kubectl delete endpointconfigs --all
  kubectl delete models --all
  kubectl delete trainingjobs --all
  kubectl delete processingjobs --all
  kubectl delete transformjobs --all
  kubectl delete hyperparametertuningjobs --all
  kubectl delete dataqualityjobdefinitions --all
  kubectl delete modelbiasjobdefinitions --all
  kubectl delete modelexplainabilityjobdefinitions --all
  kubectl delete modelqualityjobdefinitions --all
  kubectl delete monitoringschedules --all
  kubectl delete adoptedresources --all

  print_controller_logs

  helm delete -n $NAMESPACE ack-$SERVICE-controller
  kubectl delete namespace $NAMESPACE

  cd $E2E_DIR
  export PYTHONPATH=..
  python service_cleanup.py
}
trap cleanup EXIT

# Update kubeconfig
aws --region $CLUSTER_REGION eks update-kubeconfig --name $CLUSTER_NAME

# Setup OIDC
create_oidc_role "$CLUSTER_NAME" "$CLUSTER_REGION" "$NAMESPACE"

# Install service helm chart
install_helm_chart $SERVICE $OIDC_ROLE_ARN $SERVICE_REGION $NAMESPACE

echo "Log helm charts are deployed properly"
kubectl -n $NAMESPACE get pods
kubectl get crds

pushd $E2E_DIR
export PYTHONPATH=..
# create resources for test
python service_bootstrap.py
sleep 5m

# run tests
echo "Run Tests"
pytest -n 10 --dist loadfile --log-cli-level INFO -m canary
popd

test/canary/scripts/setup_oidc.sh

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# OIDC Setup

# A function to get the OIDC_ID associated with an EKS cluster
function get_oidc_id() {
  local cluster_name="$1"
  local region="$2"
  eksctl utils associate-iam-oidc-provider --cluster $cluster_name --region $region --approve
  local oidc_url=$(aws eks describe-cluster --region $region --name $cluster_name --query "cluster.identity.oidc.issuer" --output text | cut -c9-)
  echo "${oidc_url}"
}


function generate_trust_policy() {
  local oidc_url="$1"
  local namespace="$2"
  local account_id=$(aws sts get-caller-identity --output text --query "Account")

  cat <<EOF > trust.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Federated": "arn:aws:iam::${account_id}:oidc-provider/${oidc_url}"
      },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringEquals": {
          "${oidc_url}:aud": "sts.amazonaws.com",
          "${oidc_url}:sub": ["system:serviceaccount:${namespace}:ack-sagemaker-controller"]
        }
      }
    }
  ]
}
EOF
}

function create_oidc_role() {
  local cluster_name="$1"
  local region="$2"
  local namespace="$3"
  local oidc_role_name=ack-oidc-role-$cluster_name-$namespace

  # Create role only if it does not exist
  set +e
  aws iam get-role --role-name ${oidc_role_name}
  exit_code=$?
  set -euo pipefail

  if [[ $exit_code -eq 0 ]]; then
    echo "A role for this cluster and namespace already exists in this account, assuming sagemaker access and proceeding."
  else
    echo "Creating new IAM role: $oidc_role_name"
    local oidc_url=$(get_oidc_id "$cluster_name" "$region")
    local trustfile="trust.json"
    generate_trust_policy "$oidc_url" "$namespace"
    aws iam create-role --role-name "$oidc_role_name" --assume-role-policy-document file://${trustfile}
    aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
    aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
    rm "${trustfile}"
  fi
  local oidc_role_arn=$(aws iam get-role --role-name $oidc_role_name --output text --query 'Role.Arn')
  export OIDC_ROLE_ARN=$oidc_role_arn
}
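
A hedged verification sketch (not part of the commit): after create_oidc_role exports OIDC_ROLE_ARN, the role and its attached policies can be checked with the AWS CLI, using the ack-oidc-role-<cluster>-<namespace> naming from the script:

create_oidc_role "$CLUSTER_NAME" "$CLUSTER_REGION" "$NAMESPACE"
echo "$OIDC_ROLE_ARN"

# Confirm the role exists and carries the SageMaker and S3 managed policies attached above
aws iam get-role --role-name "ack-oidc-role-${CLUSTER_NAME}-${NAMESPACE}" --query 'Role.Arn' --output text
aws iam list-attached-role-policies --role-name "ack-oidc-role-${CLUSTER_NAME}-${NAMESPACE}"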

test/e2e/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@955d7831ee374a212250179e95a5f3b75e555fd9
+acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@5ed60a505afa953096e53c9d3d6779830250915b
 black==20.8b1

test/e2e/service_bootstrap.py

Lines changed: 8 additions & 1 deletion
@@ -17,6 +17,7 @@
 import json
 import logging
 import time
+import subprocess

 from acktest import resources
 from acktest.aws.identity import get_region, get_account_id
@@ -88,7 +89,13 @@ def create_data_bucket() -> str:

     source_bucket = s3_resource.Bucket(SAGEMAKER_SOURCE_DATA_BUCKET)
     destination_bucket = s3_resource.Bucket(bucket_name)
-    duplicate_bucket_contents(source_bucket, destination_bucket)
+    temp_dir = "/tmp/ack_s3_data"
+    # duplicate_bucket_contents(source_bucket, destination_bucket)
+    # workaround to copy if buckets are across regions
+    # TODO: check if there is a better way and merge to test-infra
+    subprocess.call(['mkdir', f'{temp_dir}'])
+    subprocess.call(['aws', 's3', 'sync', f's3://{SAGEMAKER_SOURCE_DATA_BUCKET}', f'./{temp_dir}/', '--quiet'])
+    subprocess.call(['aws', 's3', 'sync', f'./{temp_dir}/', f's3://{bucket_name}', '--quiet'])

     logging.info(f"Synced data bucket")
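
The three subprocess calls above amount to a two-hop copy through the local filesystem, which works even when the source and destination buckets are in different regions; a shell equivalent (bucket names are placeholders) looks roughly like:

mkdir -p /tmp/ack_s3_data
aws s3 sync s3://<source-data-bucket> /tmp/ack_s3_data/ --quiet
aws s3 sync /tmp/ack_s3_data/ s3://<destination-bucket> --quiet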

test/e2e/tests/test_endpoint_config.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
     create_sagemaker_resource,
 )
 from e2e.replacement_values import REPLACEMENT_VALUES
-from e2e.common.config import config as cfg
+from e2e.common import config as cfg


 @pytest.fixture(scope="module")
