Skip to content

Commit d1bfe3f

Browse files
authored
Job-style resource testing improvements (#19)
* Add tests to completion for job style resources
* Address review comments
* Debugger Image TYPO fix
* Fixes
* Fix Transform Test and Black Formatting updates
* Minor fixes for issues found on final test run
1 parent 8268506 commit d1bfe3f

18 files changed

+813
-302
lines changed

test/e2e/__init__.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,17 @@
2323
CRD_GROUP = "sagemaker.services.k8s.aws"
2424
CRD_VERSION = "v1alpha1"
2525

26-
ENDPOINT_CONFIG_RESOURCE_PLURAL = "endpointconfigs"
27-
MODEL_RESOURCE_PLURAL = "models"
28-
ENDPOINT_RESOURCE_PLURAL = "endpoints"
29-
DATA_QUALITY_JOB_DEFINITION_RESOURCE_PLURAL = "dataqualityjobdefinitions"
30-
3126
# PyTest marker for the current service
3227
service_marker = pytest.mark.service(arg=SERVICE_NAME)
3328

3429
bootstrap_directory = Path(__file__).parent
3530
resource_directory = Path(__file__).parent / "resources"
3631

3732

33+
def sagemaker_client():
    """Build and return a boto3 client for the SageMaker service.

    A fresh client is constructed on every call; no region, credentials or
    other options are passed explicitly to ``boto3.client``.
    """
    client = boto3.client("sagemaker")
    return client
35+
36+
3837
def create_sagemaker_resource(
3938
resource_plural, resource_name, spec_file, replacements, namespace="default"
4039
):

test/e2e/common/config.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You may
4+
# not use this file except in compliance with the License. A copy of the
5+
# License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is distributed
10+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11+
# express or implied. See the License for the specific language governing
12+
# permissions and limitations under the License.
13+
"""String Constants
14+
"""
15+
16+
# Plural resource names passed as ``resource_plural`` when creating
# SageMaker custom resources in the e2e tests.
ENDPOINT_CONFIG_RESOURCE_PLURAL: str = "endpointconfigs"
MODEL_RESOURCE_PLURAL: str = "models"
ENDPOINT_RESOURCE_PLURAL: str = "endpoints"
DATA_QUALITY_JOB_DEFINITION_RESOURCE_PLURAL: str = "dataqualityjobdefinitions"

# Job Type Resource Statuses
# Status strings grouped under the "stopped" label; note that "Completed"
# is included alongside "Stopped" and "Stopping".
LIST_JOB_STATUS_STOPPED = ("Stopped", "Stopping", "Completed")
JOB_STATUS_INPROGRESS: str = "InProgress"
JOB_STATUS_COMPLETED: str = "Completed"
# Status string used for debugger jobs in place of "Completed".
DEBUGGERJOB_STATUS_COMPLETED: str = "NoIssuesFound"

test/e2e/common/fixtures.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,14 @@
1616
import pytest
1717

1818
from e2e import (
19-
ENDPOINT_CONFIG_RESOURCE_PLURAL,
20-
MODEL_RESOURCE_PLURAL,
21-
ENDPOINT_RESOURCE_PLURAL,
22-
DATA_QUALITY_JOB_DEFINITION_RESOURCE_PLURAL,
2319
create_sagemaker_resource,
2420
wait_sagemaker_endpoint_status,
2521
)
2622

2723
from e2e.replacement_values import REPLACEMENT_VALUES
2824
from acktest.resources import random_suffix_name
2925
from acktest.k8s import resource as k8s
26+
from e2e.common import config as cfg
3027

3128

3229
@pytest.fixture(scope="module")
@@ -48,7 +45,7 @@ def xgboost_churn_endpoint(sagemaker_client):
4845
] = f"s3://{data_bucket}/sagemaker/model/xgb-churn-prediction-model.tar.gz"
4946

5047
model_reference, model_spec, model_resource = create_sagemaker_resource(
51-
resource_plural=MODEL_RESOURCE_PLURAL,
48+
resource_plural=cfg.MODEL_RESOURCE_PLURAL,
5249
resource_name=model_resource_name,
5350
spec_file="xgboost_model_with_model_location",
5451
replacements=replacements,
@@ -61,7 +58,7 @@ def xgboost_churn_endpoint(sagemaker_client):
6158
endpoint_config_spec,
6259
endpoint_config_resource,
6360
) = create_sagemaker_resource(
64-
resource_plural=ENDPOINT_CONFIG_RESOURCE_PLURAL,
61+
resource_plural=cfg.ENDPOINT_CONFIG_RESOURCE_PLURAL,
6562
resource_name=endpoint_config_resource_name,
6663
spec_file="endpoint_config_data_capture_single_variant",
6764
replacements=replacements,
@@ -70,7 +67,7 @@ def xgboost_churn_endpoint(sagemaker_client):
7067
assert k8s.get_resource_arn(endpoint_config_resource) is not None
7168

7269
endpoint_reference, endpoint_spec, endpoint_resource = create_sagemaker_resource(
73-
resource_plural=ENDPOINT_RESOURCE_PLURAL,
70+
resource_plural=cfg.ENDPOINT_RESOURCE_PLURAL,
7471
resource_name=endpoint_resource_name,
7572
spec_file="endpoint_base",
7673
replacements=replacements,
@@ -100,7 +97,7 @@ def xgboost_churn_data_quality_job_definition(xgboost_churn_endpoint):
10097
replacements["ENDPOINT_NAME"] = endpoint_name
10198

10299
job_definition_reference, _, resource = create_sagemaker_resource(
103-
resource_plural=DATA_QUALITY_JOB_DEFINITION_RESOURCE_PLURAL,
100+
resource_plural=cfg.DATA_QUALITY_JOB_DEFINITION_RESOURCE_PLURAL,
104101
resource_name=resource_name,
105102
spec_file="data_quality_job_definition_xgboost_churn",
106103
replacements=replacements,

test/e2e/replacement_values.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,54 @@
4242
"sa-east-1": "737474898029.dkr.ecr.sa-east-1.amazonaws.com",
4343
}
4444

45+
# ECR registry hosts, keyed by AWS region name, used as the prefix of the
# debugger rule evaluator image URI. China-partition regions use the
# ``.amazonaws.com.cn`` domain suffix.
DEBUGGER_IMAGE_URIS = {
    "us-west-1": "685455198987.dkr.ecr.us-west-1.amazonaws.com",
    "us-west-2": "895741380848.dkr.ecr.us-west-2.amazonaws.com",
    "us-east-1": "503895931360.dkr.ecr.us-east-1.amazonaws.com",
    "us-east-2": "915447279597.dkr.ecr.us-east-2.amazonaws.com",
    "ap-east-1": "199566480951.dkr.ecr.ap-east-1.amazonaws.com",
    "ap-northeast-1": "430734990657.dkr.ecr.ap-northeast-1.amazonaws.com",
    "ap-northeast-2": "578805364391.dkr.ecr.ap-northeast-2.amazonaws.com",
    "ap-south-1": "904829902805.dkr.ecr.ap-south-1.amazonaws.com",
    "ap-southeast-1": "972752614525.dkr.ecr.ap-southeast-1.amazonaws.com",
    "ap-southeast-2": "184798709955.dkr.ecr.ap-southeast-2.amazonaws.com",
    "ca-central-1": "519511493484.dkr.ecr.ca-central-1.amazonaws.com",
    "cn-north-1": "618459771430.dkr.ecr.cn-north-1.amazonaws.com.cn",
    "cn-northwest-1": "658757709296.dkr.ecr.cn-northwest-1.amazonaws.com.cn",
    "eu-central-1": "482524230118.dkr.ecr.eu-central-1.amazonaws.com",
    "eu-north-1": "314864569078.dkr.ecr.eu-north-1.amazonaws.com",
    "eu-west-1": "929884845733.dkr.ecr.eu-west-1.amazonaws.com",
    "eu-west-2": "250201462417.dkr.ecr.eu-west-2.amazonaws.com",
    "eu-west-3": "447278800020.dkr.ecr.eu-west-3.amazonaws.com",
    "me-south-1": "986000313247.dkr.ecr.me-south-1.amazonaws.com",
    "sa-east-1": "818342061345.dkr.ecr.sa-east-1.amazonaws.com",
}
67+
68+
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
69+
# ECR registry hosts, keyed by AWS region name, used as the prefix of the
# "xgboost:latest" image URI. China-partition regions (cn-*) must use the
# ``.amazonaws.com.cn`` domain suffix; the plain ``.amazonaws.com`` host does
# not exist for those regions.
XGBOOST_V1_IMAGE_URIS = {
    "us-west-1": "632365934929.dkr.ecr.us-west-1.amazonaws.com",
    "us-west-2": "433757028032.dkr.ecr.us-west-2.amazonaws.com",
    "us-east-1": "811284229777.dkr.ecr.us-east-1.amazonaws.com",
    "us-east-2": "825641698319.dkr.ecr.us-east-2.amazonaws.com",
    "ap-east-1": "286214385809.dkr.ecr.ap-east-1.amazonaws.com",
    "ap-northeast-1": "501404015308.dkr.ecr.ap-northeast-1.amazonaws.com",
    "ap-northeast-2": "306986355934.dkr.ecr.ap-northeast-2.amazonaws.com",
    "ap-south-1": "991648021394.dkr.ecr.ap-south-1.amazonaws.com",
    "ap-southeast-1": "475088953585.dkr.ecr.ap-southeast-1.amazonaws.com",
    "ap-southeast-2": "544295431143.dkr.ecr.ap-southeast-2.amazonaws.com",
    "ca-central-1": "469771592824.dkr.ecr.ca-central-1.amazonaws.com",
    "cn-north-1": "390948362332.dkr.ecr.cn-north-1.amazonaws.com.cn",
    "cn-northwest-1": "387376663083.dkr.ecr.cn-northwest-1.amazonaws.com.cn",
    "eu-central-1": "813361260812.dkr.ecr.eu-central-1.amazonaws.com",
    "eu-north-1": "669576153137.dkr.ecr.eu-north-1.amazonaws.com",
    "eu-west-1": "685385470294.dkr.ecr.eu-west-1.amazonaws.com",
    "eu-west-2": "644912444149.dkr.ecr.eu-west-2.amazonaws.com",
    "eu-west-3": "749696950732.dkr.ecr.eu-west-3.amazonaws.com",
    "me-south-1": "249704162688.dkr.ecr.me-south-1.amazonaws.com",
    "sa-east-1": "855470959533.dkr.ecr.sa-east-1.amazonaws.com",
}
91+
92+
4593
PYTORCH_TRAIN_IMAGE_URIS = {
4694
"us-east-1": "763104351884.dkr.ecr.us-east-1.amazonaws.com",
4795
"us-east-2": "763104351884.dkr.ecr.us-east-2.amazonaws.com",
@@ -121,6 +169,8 @@
121169
REPLACEMENT_VALUES = {
122170
"SAGEMAKER_DATA_BUCKET": get_bootstrap_resources().DataBucketName,
123171
"XGBOOST_IMAGE_URI": f"{XGBOOST_IMAGE_URIS[get_region()]}/sagemaker-xgboost:1.0-1-cpu-py3",
172+
"DEBUGGER_IMAGE_URI": f"{DEBUGGER_IMAGE_URIS[get_region()]}/sagemaker-debugger-rules:latest",
173+
"XGBOOST_V1_IMAGE_URI": f"{XGBOOST_V1_IMAGE_URIS[get_region()]}/xgboost:latest",
124174
"PYTORCH_TRAIN_IMAGE_URI": f"{PYTORCH_TRAIN_IMAGE_URIS[get_region()]}/pytorch-training:1.5.0-cpu-py36-ubuntu16.04",
125175
"SAGEMAKER_EXECUTION_ROLE_ARN": get_bootstrap_resources().ExecutionRoleARN,
126176
"MODEL_MONITOR_ANALYZER_IMAGE_URI": f"{MODEL_MONITOR_IMAGE_URIS[get_region()]}/sagemaker-model-monitor-analyzer",

test/e2e/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@955d7831ee374a212250179e95a5f3b75e555fd9
2-
black==20.*
2+
black==20.8b1

test/e2e/resources/kmeans_processingjob.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,6 @@ spec:
4747
s3UploadMode: EndOfJob
4848
- outputName: valid_data
4949
s3Output:
50-
s3URI: s3://{DATA_BUCKET}/sagemaker/processing/output/
50+
s3URI: s3://$SAGEMAKER_DATA_BUCKET/sagemaker/processing/output/
5151
localPath: /opt/ml/processing/output_valid/
5252
s3UploadMode: EndOfJob

test/e2e/resources/xgboost_model.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ spec:
77
primaryContainer:
88
containerHostname: xgboost
99
modelDataURL: s3://$SAGEMAKER_DATA_BUCKET/sagemaker/model/xgboost-mnist-model.tar.gz
10-
image: $XGBOOST_IMAGE_URI
10+
image: $XGBOOST_V1_IMAGE_URI
1111
environment:
1212
my_var: my_value
1313
my_var2: my_value2
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
apiVersion: sagemaker.services.k8s.aws/v1alpha1
2+
kind: TrainingJob
3+
metadata:
4+
name: $TRAINING_JOB_NAME
5+
spec:
6+
trainingJobName: $TRAINING_JOB_NAME
7+
roleARN: $SAGEMAKER_EXECUTION_ROLE_ARN
8+
hyperParameters:
9+
max_depth: "5"
10+
gamma: "4"
11+
eta: "0.2"
12+
min_child_weight: "6"
13+
silent: "0"
14+
objective: "reg:squarederror"
15+
subsample: "0.7"
16+
num_round: "51"
17+
algorithmSpecification:
18+
trainingImage: $XGBOOST_IMAGE_URI
19+
trainingInputMode: File
20+
outputDataConfig:
21+
s3OutputPath: s3://$SAGEMAKER_DATA_BUCKET/sagemaker/training/debugger/output
22+
resourceConfig:
23+
instanceCount: 1
24+
instanceType: ml.m4.xlarge
25+
volumeSizeInGB: 5
26+
stoppingCondition:
27+
maxRuntimeInSeconds: 86400
28+
inputDataConfig:
29+
- channelName: train
30+
dataSource:
31+
s3DataSource:
32+
s3DataType: S3Prefix
33+
s3URI: s3://$SAGEMAKER_DATA_BUCKET/sagemaker/training/train
34+
s3DataDistributionType: FullyReplicated
35+
contentType: text/csv
36+
compressionType: None
37+
- channelName: validation
38+
dataSource:
39+
s3DataSource:
40+
s3DataType: S3Prefix
41+
s3URI: s3://$SAGEMAKER_DATA_BUCKET/sagemaker/training/validation
42+
s3DataDistributionType: FullyReplicated
43+
contentType: text/csv
44+
compressionType: None
45+
debugHookConfig:
46+
s3OutputPath: s3://$SAGEMAKER_DATA_BUCKET/sagemaker/training/debugger/hookconfig
47+
collectionConfigurations:
48+
- collectionName: feature_importance
49+
collectionParameters:
50+
name: save_interval
51+
value: "5"
52+
- collectionName: losses
53+
collectionParameters:
54+
name: save_interval
55+
value: "500"
56+
- collectionName: average_shap
57+
collectionParameters:
58+
name: save_interval
59+
value: "5"
60+
- collectionName: metrics
61+
collectionParameters:
62+
name: save_interval
63+
value: "5"
64+
debugRuleConfigurations:
65+
- ruleConfigurationName: LossNotDecreasing
66+
ruleEvaluatorImage: $DEBUGGER_IMAGE_URI
67+
ruleParameters:
68+
collection_names: metrics
69+
num_steps: "10"
70+
rule_to_invoke: LossNotDecreasing

test/e2e/resources/xgboost_transformjob.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ metadata:
44
name: $TRANSFORM_JOB_NAME
55
spec:
66
transformJobName: $TRANSFORM_JOB_NAME
7-
roleARN: $SAGEMAKER_EXECUTION_ROLE_ARN
87
modelName: $MODEL_NAME
98
transformInput:
109
contentType: text/csv

test/e2e/tests/test_endpoint.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,12 @@
2525

2626
from e2e import (
2727
service_marker,
28-
ENDPOINT_CONFIG_RESOURCE_PLURAL,
29-
MODEL_RESOURCE_PLURAL,
30-
ENDPOINT_RESOURCE_PLURAL,
3128
create_sagemaker_resource,
3229
wait_sagemaker_endpoint_status,
3330
wait_resource_endpoint_status,
3431
)
3532
from e2e.replacement_values import REPLACEMENT_VALUES
33+
from e2e.common import config as cfg
3634

3735
FAIL_UPDATE_ERROR_MESSAGE = "unable to update endpoint. check FailureReason"
3836

@@ -49,7 +47,7 @@ def single_container_model(name_suffix):
4947
replacements["MODEL_NAME"] = model_resource_name
5048

5149
model_reference, model_spec, model_resource = create_sagemaker_resource(
52-
resource_plural=MODEL_RESOURCE_PLURAL,
50+
resource_plural=cfg.MODEL_RESOURCE_PLURAL,
5351
resource_name=model_resource_name,
5452
spec_file="xgboost_model",
5553
replacements=replacements,
@@ -74,7 +72,7 @@ def multi_variant_config(name_suffix, single_container_model):
7472
replacements["MODEL_NAME"] = model_resource_name
7573

7674
config_reference, config_spec, config_resource = create_sagemaker_resource(
77-
resource_plural=ENDPOINT_CONFIG_RESOURCE_PLURAL,
75+
resource_plural=cfg.ENDPOINT_CONFIG_RESOURCE_PLURAL,
7876
resource_name=config_resource_name,
7977
spec_file="endpoint_config_multi_variant",
8078
replacements=replacements,
@@ -99,7 +97,7 @@ def single_variant_config(name_suffix, single_container_model):
9997
replacements["MODEL_NAME"] = model_resource_name
10098

10199
config_reference, config_spec, config_resource = create_sagemaker_resource(
102-
resource_plural=ENDPOINT_CONFIG_RESOURCE_PLURAL,
100+
resource_plural=cfg.ENDPOINT_CONFIG_RESOURCE_PLURAL,
103101
resource_name=config_resource_name,
104102
spec_file="endpoint_config_single_variant",
105103
replacements=replacements,
@@ -124,7 +122,7 @@ def xgboost_endpoint(name_suffix, single_variant_config):
124122
replacements["ENDPOINT_CONFIG_NAME"] = config_resource_name
125123

126124
reference, spec, resource = create_sagemaker_resource(
127-
resource_plural=ENDPOINT_RESOURCE_PLURAL,
125+
resource_plural=cfg.ENDPOINT_RESOURCE_PLURAL,
128126
resource_name=endpoint_resource_name,
129127
spec_file="endpoint_base",
130128
replacements=replacements,
@@ -157,7 +155,7 @@ def faulty_config(name_suffix, single_container_model):
157155
replacements["MODEL_NAME"] = model_resource_name
158156
replacements["MODEL_LOCATION"] = f"s3://{model_bucket}/{model_destination_key}"
159157
model_reference, model_spec, model_resource = create_sagemaker_resource(
160-
resource_plural=MODEL_RESOURCE_PLURAL,
158+
resource_plural=cfg.MODEL_RESOURCE_PLURAL,
161159
resource_name=model_resource_name,
162160
spec_file="xgboost_model_with_model_location",
163161
replacements=replacements,
@@ -174,7 +172,7 @@ def faulty_config(name_suffix, single_container_model):
174172
replacements["ENDPOINT_CONFIG_NAME"] = config_resource_name
175173

176174
config_reference, config_spec, config_resource = create_sagemaker_resource(
177-
resource_plural=ENDPOINT_CONFIG_RESOURCE_PLURAL,
175+
resource_plural=cfg.ENDPOINT_CONFIG_RESOURCE_PLURAL,
178176
resource_name=config_resource_name,
179177
spec_file="endpoint_config_multi_variant",
180178
replacements=replacements,

0 commit comments

Comments
 (0)