Commit d46feed

add canary related supporting infra (#23)
* helper scripts for canary
* update acktest
* buildspec and path changes
* bug fix after rebase
1 parent d1bfe3f commit d46feed

File tree

9 files changed (+255, -4 lines)


CODEOWNERS

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 # https://github.com/blog/2392-introducing-code-owners

 # Amazon SageMaker CodeOwners
-* @akartsky @jkuruba @mbaijal @RedbackThomson @surajkota
+* @akartsky @mbaijal @surajkota

test/canary/Dockerfile.canary

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
FROM ubuntu:18.04

# Build time parameters
ARG SERVICE=sagemaker

RUN apt-get update && apt-get install -y curl \
    wget \
    git \
    python3.8 \
    python3-pip \
    python3.8-dev \
    vim \
    sudo \
    jq \
    unzip

# Install awscli
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
    && unzip -qq awscliv2.zip \
    && ./aws/install

# Add yq repository and install yq
RUN apt-get update && apt install -y software-properties-common \
    && sudo add-apt-repository ppa:rmescandon/yq \
    && apt update && apt install -y yq

# Install kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.6/bin/linux/amd64/kubectl \
    && chmod +x ./kubectl \
    && cp ./kubectl /bin

# Install eksctl
RUN curl --silent --location "https://github.com/weaveworks/eksctl/releases/download/latest_release/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && mv /tmp/eksctl /bin

# Install Helm
RUN curl -q -L "https://get.helm.sh/helm-v3.2.4-linux-amd64.tar.gz" | tar zxf - -C /usr/local/bin/ \
    && mv /usr/local/bin/linux-amd64/helm /usr/local/bin/helm \
    && rm -r /usr/local/bin/linux-amd64 \
    && chmod +x /usr/local/bin/helm

ENV SERVICE_REPO_PATH=/$SERVICE-controller
COPY ./test/e2e/requirements.txt requirements.txt

RUN ln -s /usr/bin/python3.8 /usr/bin/python \
    && python -m pip install --upgrade pip

RUN python -m pip install -r requirements.txt

WORKDIR /$SERVICE_REPO_PATH
CMD ["./test/canary/scripts/run_test.sh"]
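
For quick local iteration, a minimal sketch (not part of the commit; the ack-canary:local tag is an assumption) of building the image from the repository root and opening a shell instead of letting the default CMD run the canary:

# Hypothetical local tag; the default CMD runs test/canary/scripts/run_test.sh
docker build -f ./test/canary/Dockerfile.canary -t ack-canary:local --build-arg SERVICE=sagemaker .
docker run --rm -it --entrypoint /bin/bash ack-canary:local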

test/canary/canary.buildspec.yaml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
version: 0.2

phases:
  pre_build:
    commands:
      # Make all shell scripts executable. This is required when running code copied from S3
      - find ./ -type f -name "*.sh" -exec chmod +x {} \;

      # Get cached test image
      - aws ecr get-login-password --region $CLUSTER_REGION | docker login --username AWS --password-stdin $ECR_CACHE_URI || true
      - docker pull ${ECR_CACHE_URI}:latest --quiet || true

      # Login to dockerhub to avoid hitting throttle limit
      - docker login -u $DOCKER_CONFIG_USERNAME -p $DOCKER_CONFIG_PASSWORD

      # Build test image
      - >
        docker build -f ./test/canary/Dockerfile.canary . -t ${ECR_CACHE_URI}:latest
        --build-arg SERVICE="${SERVICE##*/}" --quiet
        || echo "Docker Build Failed" || true
  build:
    commands:
      # Run tests
      - docker run --name ack-canary $(env | cut -f1 -d= | sed 's/^/-e /') --mount type=bind,source="$(pwd)/",target="/${SERVICE}-controller/" ${ECR_CACHE_URI}:latest

      # Push test image to cache ECR repo
      - docker push ${ECR_CACHE_URI}:latest || true
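
As a rough sketch of reproducing this flow outside CodeBuild (the variable values below are placeholders, not part of the commit), the buildspec boils down to exporting the variables run_test.sh reads and re-running its docker commands from the repository root:

# Placeholder values; in CodeBuild these arrive as project environment variables
export SERVICE=sagemaker SERVICE_REGION=us-west-2 CLUSTER_REGION=us-west-2
export CLUSTER_NAME=my-canary-cluster NAMESPACE=ack-system
export ECR_CACHE_URI=123456789012.dkr.ecr.us-west-2.amazonaws.com/ack-canary-cache

docker build -f ./test/canary/Dockerfile.canary . -t ${ECR_CACHE_URI}:latest --build-arg SERVICE="${SERVICE##*/}"
# Forward every exported variable and bind-mount the repository, as the build phase above does
docker run --name ack-canary $(env | cut -f1 -d= | sed 's/^/-e /') \
    --mount type=bind,source="$(pwd)/",target="/${SERVICE}-controller/" ${ECR_CACHE_URI}:latest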
test/canary/scripts/install_controller_helm.sh

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Deploy ACK Helm Charts

function install_helm_chart() {
  local service="$1"
  local oidc_role_arn="$2"
  local region="$3"
  local namespace="$4"

  yq w -i helm/values.yaml "serviceAccount.annotations" ""
  yq w -i helm/values.yaml 'serviceAccount.annotations."eks.amazonaws.com/role-arn"' "$oidc_role_arn"
  yq w -i helm/values.yaml "aws.region" $region

  kubectl create namespace $namespace
  helm install -n $namespace ack-$service-controller helm
}
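
A hypothetical standalone invocation (the role ARN, region, and namespace are placeholders; in this commit the function is only called from run_test.sh), followed by reading back the edited fields with yq v3's read command, matching the PPA install in the Dockerfile:

source test/canary/scripts/install_controller_helm.sh
install_helm_chart "sagemaker" "arn:aws:iam::123456789012:role/ack-oidc-role-example" "us-west-2" "ack-system"

# Inspect the values written by the yq edits above (yq v3 syntax)
yq r helm/values.yaml 'serviceAccount.annotations'
yq r helm/values.yaml 'aws.region'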

test/canary/scripts/run_test.sh

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
#!/bin/bash

# cleanup on EXIT regardless of error

# Inputs to this file as environment variables
# SERVICE
# SERVICE_REGION
# CLUSTER_REGION
# CLUSTER_NAME
# SERVICE_REPO_PATH
# NAMESPACE

set -euo pipefail
export NAMESPACE=${NAMESPACE:-"ack-system"}
export AWS_DEFAULT_REGION=$SERVICE_REGION
export E2E_DIR=$SERVICE_REPO_PATH/test/e2e/
SCRIPTS_DIR=${SERVICE_REPO_PATH}/test/canary/scripts

source $SCRIPTS_DIR/setup_oidc.sh
source $SCRIPTS_DIR/install_controller_helm.sh

function print_controller_logs() {
  pod_id=$( kubectl get pods -n $NAMESPACE --field-selector="status.phase=Running" \
    --sort-by=.metadata.creationTimestamp \
    | grep ack-sagemaker-controller | awk '{print $1}' 2>/dev/null )

  kubectl -n $NAMESPACE logs "$pod_id"
}

function cleanup {
  echo "Cleaning up resources"
  set +e
  kubectl delete endpoints.sagemaker --all
  kubectl delete endpointconfigs --all
  kubectl delete models --all
  kubectl delete trainingjobs --all
  kubectl delete processingjobs --all
  kubectl delete transformjobs --all
  kubectl delete hyperparametertuningjobs --all
  kubectl delete dataqualityjobdefinitions --all
  kubectl delete modelbiasjobdefinitions --all
  kubectl delete modelexplainabilityjobdefinitions --all
  kubectl delete modelqualityjobdefinitions --all
  kubectl delete monitoringschedules --all
  kubectl delete adoptedresources --all

  print_controller_logs

  helm delete -n $NAMESPACE ack-$SERVICE-controller
  kubectl delete namespace $NAMESPACE

  cd $E2E_DIR
  export PYTHONPATH=..
  python service_cleanup.py
}
trap cleanup EXIT

# Update kubeconfig
aws --region $CLUSTER_REGION eks update-kubeconfig --name $CLUSTER_NAME

# Setup OIDC
create_oidc_role "$CLUSTER_NAME" "$CLUSTER_REGION" "$NAMESPACE"

# Install service helm chart
install_helm_chart $SERVICE $OIDC_ROLE_ARN $SERVICE_REGION $NAMESPACE

echo "Log helm charts are deployed properly"
kubectl -n $NAMESPACE get pods
kubectl get crds

pushd $E2E_DIR
export PYTHONPATH=..
# create resources for test
python service_bootstrap.py
sleep 5m

# run tests
echo "Run Tests"
pytest -n 10 --dist loadfile --log-cli-level INFO -m canary
popd

test/canary/scripts/setup_oidc.sh

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# OIDC Setup

# A function to get the OIDC_ID associated with an EKS cluster
function get_oidc_id() {
  local cluster_name="$1"
  local region="$2"
  eksctl utils associate-iam-oidc-provider --cluster $cluster_name --region $region --approve
  local oidc_url=$(aws eks describe-cluster --region $region --name $cluster_name --query "cluster.identity.oidc.issuer" --output text | cut -c9-)
  echo "${oidc_url}"
}


function generate_trust_policy() {
  local oidc_url="$1"
  local namespace="$2"
  local account_id=$(aws sts get-caller-identity --output text --query "Account")

  cat <<EOF > trust.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Federated": "arn:aws:iam::${account_id}:oidc-provider/${oidc_url}"
      },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringEquals": {
          "${oidc_url}:aud": "sts.amazonaws.com",
          "${oidc_url}:sub": ["system:serviceaccount:${namespace}:ack-sagemaker-controller"]
        }
      }
    }
  ]
}
EOF
}

function create_oidc_role() {
  local cluster_name="$1"
  local region="$2"
  local namespace="$3"
  local oidc_role_name=ack-oidc-role-$cluster_name-$namespace

  # Create role only if it does not exist
  set +e
  aws iam get-role --role-name ${oidc_role_name}
  exit_code=$?
  set -euo pipefail

  if [[ $exit_code -eq 0 ]]; then
    echo "A role for this cluster and namespace already exists in this account, assuming sagemaker access and proceeding."
  else
    echo "Creating new IAM role: $oidc_role_name"
    local oidc_url=$(get_oidc_id "$cluster_name" "$region")
    local trustfile="trust.json"
    generate_trust_policy "$oidc_url" "$namespace"
    aws iam create-role --role-name "$oidc_role_name" --assume-role-policy-document file://${trustfile}
    aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
    aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
    rm "${trustfile}"
  fi
  local oidc_role_arn=$(aws iam get-role --role-name $oidc_role_name --output text --query 'Role.Arn')
  export OIDC_ROLE_ARN=$oidc_role_arn
}
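
A hedged verification sketch (not part of the commit): after create_oidc_role exports OIDC_ROLE_ARN, the role and its attached policies can be checked with the AWS CLI, using the ack-oidc-role-<cluster>-<namespace> naming from the script:

create_oidc_role "$CLUSTER_NAME" "$CLUSTER_REGION" "$NAMESPACE"
echo "$OIDC_ROLE_ARN"

# Confirm the role exists and carries the SageMaker and S3 managed policies attached above
aws iam get-role --role-name "ack-oidc-role-${CLUSTER_NAME}-${NAMESPACE}" --query 'Role.Arn' --output text
aws iam list-attached-role-policies --role-name "ack-oidc-role-${CLUSTER_NAME}-${NAMESPACE}"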

test/e2e/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@955d7831ee374a212250179e95a5f3b75e555fd9
+acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@5ed60a505afa953096e53c9d3d6779830250915b
 black==20.8b1

test/e2e/service_bootstrap.py

Lines changed: 8 additions & 1 deletion
@@ -17,6 +17,7 @@
 import json
 import logging
 import time
+import subprocess

 from acktest import resources
 from acktest.aws.identity import get_region, get_account_id
@@ -88,7 +89,13 @@ def create_data_bucket() -> str:

     source_bucket = s3_resource.Bucket(SAGEMAKER_SOURCE_DATA_BUCKET)
     destination_bucket = s3_resource.Bucket(bucket_name)
-    duplicate_bucket_contents(source_bucket, destination_bucket)
+    temp_dir = "/tmp/ack_s3_data"
+    # duplicate_bucket_contents(source_bucket, destination_bucket)
+    # workaround to copy if buckets are across regions
+    # TODO: check if there is a better way and merge to test-infra
+    subprocess.call(['mkdir', f'{temp_dir}'])
+    subprocess.call(['aws', 's3', 'sync', f's3://{SAGEMAKER_SOURCE_DATA_BUCKET}', f'./{temp_dir}/', '--quiet'])
+    subprocess.call(['aws', 's3', 'sync', f'./{temp_dir}/', f's3://{bucket_name}', '--quiet'])

     logging.info(f"Synced data bucket")
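
The three subprocess calls above amount to a two-hop copy through the local filesystem, which works even when the source and destination buckets are in different regions; a shell equivalent (bucket names are placeholders) looks roughly like:

mkdir -p /tmp/ack_s3_data
aws s3 sync s3://<source-data-bucket> /tmp/ack_s3_data/ --quiet
aws s3 sync /tmp/ack_s3_data/ s3://<destination-bucket> --quiet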

test/e2e/tests/test_endpoint_config.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
     create_sagemaker_resource,
 )
 from e2e.replacement_values import REPLACEMENT_VALUES
-from e2e.common.config import config as cfg
+from e2e.common import config as cfg


 @pytest.fixture(scope="module")
