Skip to content

Commit 354ded3

Browse files
authored
Add wrapper for LDA. (#56)
Update CHANGELOG and bump the version number.
1 parent 54b3830 commit 354ded3

File tree

13 files changed

+475
-18
lines changed

13 files changed

+475
-18
lines changed

CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,15 @@
22
CHANGELOG
33
=========
44

5+
1.0.3
6+
=====
7+
8+
* feature: Estimators: add support for Amazon LDA algorithm
9+
* feature: Hyperparameters: Add data_type to hyperparameters
10+
* feature: Documentation: Update TensorFlow examples following API change
11+
* feature: Session: Support multi-part uploads
12+
13+
514
1.0.2
615
=====
716

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def read(fname):
1111

1212

1313
setup(name="sagemaker",
14-
version="1.0.2",
14+
version="1.0.3",
1515
description="Open source library for training and deploying models on Amazon SageMaker.",
1616
packages=find_packages('src'),
1717
package_dir={'': 'src'},

src/sagemaker/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from sagemaker import estimator
1616
from sagemaker.amazon.kmeans import KMeans, KMeansModel, KMeansPredictor
1717
from sagemaker.amazon.pca import PCA, PCAModel, PCAPredictor
18+
from sagemaker.amazon.lda import LDA, LDAModel, LDAPredictor
1819
from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel, LinearLearnerPredictor
1920
from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesModel
2021
from sagemaker.amazon.factorization_machines import FactorizationMachinesPredictor
@@ -30,6 +31,7 @@
3031

3132
__all__ = [estimator, KMeans, KMeansModel, KMeansPredictor, PCA, PCAModel, PCAPredictor, LinearLearner,
3233
LinearLearnerModel, LinearLearnerPredictor,
34+
LDA, LDAModel, LDAPredictor,
3335
FactorizationMachines, FactorizationMachinesModel, FactorizationMachinesPredictor,
3436
Model, RealTimePredictor, Session,
3537
container_def, s3_input, production_variant, get_execution_role]

src/sagemaker/amazon/amazon_estimator.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ def __init__(self, role, train_instance_count, train_instance_type, data_locatio
4747
self.data_location = data_location
4848

4949
def train_image(self):
50-
return registry(self.sagemaker_session.boto_region_name) + "/" + type(self).repo
50+
repo = '{}:{}'.format(type(self).repo_name, type(self).repo_version)
51+
return '{}/{}'.format(registry(self.sagemaker_session.boto_region_name, type(self).repo_name), repo)
5152

5253
def hyperparameters(self):
5354
return hp.serialize_all(self)
@@ -200,12 +201,22 @@ def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=
200201
raise ex
201202

202203

203-
def registry(region_name):
204+
def registry(region_name, algorithm=None):
204205
"""Return docker registry for the given AWS region"""
205-
account_id = {
206-
"us-east-1": "382416733822",
207-
"us-east-2": "404615174143",
208-
"us-west-2": "174872318107",
209-
"eu-west-1": "438346466558"
210-
}[region_name]
206+
if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines"]:
207+
account_id = {
208+
"us-east-1": "382416733822",
209+
"us-east-2": "404615174143",
210+
"us-west-2": "174872318107",
211+
"eu-west-1": "438346466558"
212+
}[region_name]
213+
elif algorithm in ["lda"]:
214+
account_id = {
215+
"us-east-1": "766337827248",
216+
"us-east-2": "999911452149",
217+
"us-west-2": "266724342769",
218+
"eu-west-1": "999678624901"
219+
}[region_name]
220+
else:
221+
raise ValueError("Algorithm class:{} doesn't have mapping to account_id with images".format(algorithm))
211222
return "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region_name)

src/sagemaker/amazon/factorization_machines.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121

2222
class FactorizationMachines(AmazonAlgorithmEstimatorBase):
2323

24-
repo = 'factorization-machines:1'
24+
repo_name = 'factorization-machines'
25+
repo_version = 1
2526

2627
num_factors = hp('num_factors', gt(0), 'An integer greater than zero', int)
2728
predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'),
@@ -194,7 +195,8 @@ class FactorizationMachinesModel(Model):
194195

195196
def __init__(self, model_data, role, sagemaker_session=None):
196197
sagemaker_session = sagemaker_session or Session()
197-
image = registry(sagemaker_session.boto_session.region_name) + "/" + FactorizationMachines.repo
198+
repo = '{}:{}'.format(FactorizationMachines.repo_name, FactorizationMachines.repo_version)
199+
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name), repo)
198200
super(FactorizationMachinesModel, self).__init__(model_data,
199201
image,
200202
role,

src/sagemaker/amazon/kmeans.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121

2222
class KMeans(AmazonAlgorithmEstimatorBase):
2323

24-
repo = 'kmeans:1'
24+
repo_name = 'kmeans'
25+
repo_version = 1
2526

2627
k = hp('k', gt(1), 'An integer greater-than 1', int)
2728
init_method = hp('init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
@@ -132,6 +133,7 @@ class KMeansModel(Model):
132133

133134
def __init__(self, model_data, role, sagemaker_session=None):
134135
sagemaker_session = sagemaker_session or Session()
135-
image = registry(sagemaker_session.boto_session.region_name) + "/" + KMeans.repo
136+
repo = '{}:{}'.format(KMeans.repo_name, KMeans.repo_version)
137+
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name), repo)
136138
super(KMeansModel, self).__init__(model_data, image, role, predictor_cls=KMeansPredictor,
137139
sagemaker_session=sagemaker_session)

src/sagemaker/amazon/lda.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
14+
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
15+
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
16+
from sagemaker.amazon.validation import gt
17+
from sagemaker.predictor import RealTimePredictor
18+
from sagemaker.model import Model
19+
from sagemaker.session import Session
20+
21+
22+
class LDA(AmazonAlgorithmEstimatorBase):
23+
24+
repo_name = 'lda'
25+
repo_version = 1
26+
27+
num_topics = hp('num_topics', gt(0), 'An integer greater than zero', int)
28+
alpha0 = hp('alpha0', gt(0), 'A positive float', float)
29+
max_restarts = hp('max_restarts', gt(0), 'An integer greater than zero', int)
30+
max_iterations = hp('max_iterations', gt(0), 'An integer greater than zero', int)
31+
tol = hp('tol', gt(0), 'A positive float', float)
32+
33+
def __init__(self, role, train_instance_type, num_topics,
34+
alpha0=None, max_restarts=None, max_iterations=None, tol=None, **kwargs):
35+
"""Latent Dirichlet Allocation (LDA) is :class:`Estimator` used for unsupervised learning.
36+
37+
Amazon SageMaker Latent Dirichlet Allocation is an unsupervised learning algorithm that attempts to describe
38+
a set of observations as a mixture of distinct categories. LDA is most commonly used to discover
39+
a user-specified number of topics shared by documents within a text corpus.
40+
Here each observation is a document, the features are the presence (or occurrence count) of each word, and
41+
the categories are the topics.
42+
43+
This Estimator may be fit via calls to
44+
:meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
45+
:class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
46+
There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
47+
can be used to upload data to S3 and create a :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
48+
to the `fit` call.
49+
50+
To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
51+
consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
52+
53+
After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
54+
Endpoint by invoking :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as deploying an Endpoint,
55+
deploy returns a :class:`~sagemaker.amazon.lda.LDAPredictor` object that can be used
56+
for inference calls using the trained model hosted in the SageMaker Endpoint.
57+
58+
LDA Estimators can be configured by setting hyperparameters. The available hyperparameters for
59+
LDA are documented below.
60+
61+
For further information on the AWS LDA algorithm,
62+
please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/lda.html
63+
64+
Args:
65+
role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
66+
APIs that create Amazon SageMaker endpoints use this role to access
67+
training data and model artifacts. After the endpoint is created,
68+
the inference code might use the IAM role if it accesses AWS resources.
69+
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
70+
num_topics (int): The number of topics for LDA to find within the data.
71+
alpha0 (float): Optional. Initial guess for the concentration parameter.
72+
max_restarts (int): Optional. The number of restarts to perform during the Alternating Least Squares (ALS)
73+
spectral decomposition phase of the algorithm.
74+
max_iterations (int): Optional. The maximum number of iterations to perform during the
75+
ALS phase of the algorithm.
76+
tol (float): Optional. Target error tolerance for the ALS phase of the algorithm.
77+
**kwargs: base class keyword argument values.
78+
"""
79+
80+
# this algorithm only supports single instance training
81+
super(LDA, self).__init__(role, 1, train_instance_type, **kwargs)
82+
self.num_topics = num_topics
83+
self.alpha0 = alpha0
84+
self.max_restarts = max_restarts
85+
self.max_iterations = max_iterations
86+
self.tol = tol
87+
88+
def create_model(self):
89+
"""Return a :class:`~sagemaker.amazon.LDAModel` referencing the latest
90+
s3 model data produced by this Estimator."""
91+
92+
return LDAModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
93+
94+
def fit(self, records, mini_batch_size, **kwargs):
95+
# mini_batch_size is required, prevent explicit calls with None
96+
if mini_batch_size is None:
97+
raise ValueError("mini_batch_size must be set")
98+
super(LDA, self).fit(records, mini_batch_size, **kwargs)
99+
100+
101+
class LDAPredictor(RealTimePredictor):
102+
"""Transforms input vectors to lower-dimesional representations.
103+
104+
The implementation of :meth:`~sagemaker.predictor.RealTimePredictor.predict` in this
105+
`RealTimePredictor` requires a numpy ``ndarray`` as input. The array should contain the
106+
same number of columns as the feature-dimension of the data used to fit the model this
107+
Predictor performs inference on.
108+
109+
:meth:`predict()` returns a list of :class:`~sagemaker.amazon.record_pb2.Record` objects, one
110+
for each row in the input ``ndarray``. The lower dimension vector result is stored in the ``projection``
111+
key of the ``Record.label`` field."""
112+
113+
def __init__(self, endpoint, sagemaker_session=None):
114+
super(LDAPredictor, self).__init__(endpoint, sagemaker_session, serializer=numpy_to_record_serializer(),
115+
deserializer=record_deserializer())
116+
117+
118+
class LDAModel(Model):
119+
"""Reference LDA s3 model data. Calling :meth:`~sagemaker.model.Model.deploy` creates an Endpoint and return
120+
a Predictor that transforms vectors to a lower-dimensional representation."""
121+
122+
def __init__(self, model_data, role, sagemaker_session=None):
123+
sagemaker_session = sagemaker_session or Session()
124+
repo = '{}:{}'.format(LDA.repo_name, LDA.repo_version)
125+
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name, LDA.repo_name), repo)
126+
super(LDAModel, self).__init__(model_data, image, role, predictor_cls=LDAPredictor,
127+
sagemaker_session=sagemaker_session)

src/sagemaker/amazon/linear_learner.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121

2222
class LinearLearner(AmazonAlgorithmEstimatorBase):
2323

24-
repo = 'linear-learner:1'
24+
repo_name = 'linear-learner'
25+
repo_version = 1
2526

2627
DEFAULT_MINI_BATCH_SIZE = 1000
2728

@@ -226,7 +227,8 @@ class LinearLearnerModel(Model):
226227

227228
def __init__(self, model_data, role, sagemaker_session=None):
228229
sagemaker_session = sagemaker_session or Session()
229-
image = registry(sagemaker_session.boto_session.region_name) + "/" + LinearLearner.repo
230+
repo = '{}:{}'.format(LinearLearner.repo_name, LinearLearner.repo_version)
231+
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name), repo)
230232
super(LinearLearnerModel, self).__init__(model_data, image, role,
231233
predictor_cls=LinearLearnerPredictor,
232234
sagemaker_session=sagemaker_session)

src/sagemaker/amazon/pca.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020

2121
class PCA(AmazonAlgorithmEstimatorBase):
2222

23-
repo = 'pca:1'
23+
repo_name = 'pca'
24+
repo_version = 1
2425

2526
DEFAULT_MINI_BATCH_SIZE = 500
2627

@@ -118,6 +119,7 @@ class PCAModel(Model):
118119

119120
def __init__(self, model_data, role, sagemaker_session=None):
120121
sagemaker_session = sagemaker_session or Session()
121-
image = registry(sagemaker_session.boto_session.region_name) + "/" + PCA.repo
122+
repo = '{}:{}'.format(PCA.repo_name, PCA.repo_version)
123+
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name), repo)
122124
super(PCAModel, self).__init__(model_data, image, role, predictor_cls=PCAPredictor,
123125
sagemaker_session=sagemaker_session)

tests/data/lda/nips-train_1.pbr

1.01 MB
Binary file not shown.

0 commit comments

Comments
 (0)