feature: Decode application/x-npz content type (#60)

bveeramani · web-flow · commit f897549b0f5e · 2020-07-23T18:42:52.000-05:00
diff --git a/setup.py b/setup.py
@@ -29,7 +29,7 @@ def read_version():
 
 packages = setuptools.find_packages(where="src", exclude=("test",))
 
-required_packages = ["numpy", "six", "psutil", "retrying==1.3.3"]
+required_packages = ["numpy", "six", "psutil", "retrying==1.3.3", "scipy"]
 
 # enum is introduced in Python 3.4. Installing enum back port
 if sys.version_info < (3, 4):
diff --git a/src/sagemaker_inference/content_types.py b/src/sagemaker_inference/content_types.py
@@ -16,4 +16,5 @@
 OCTET_STREAM = "application/octet-stream"
 ANY = "*/*"
 NPY = "application/x-npy"
+NPZ = "application/x-npz"
 UTF8_TYPES = [JSON, CSV]
diff --git a/src/sagemaker_inference/decoder.py b/src/sagemaker_inference/decoder.py
@@ -17,6 +17,7 @@
 import json
 
 import numpy as np
+import scipy.sparse
 from six import BytesIO, StringIO
 
 from sagemaker_inference import content_types, errors
@@ -70,22 +71,36 @@ def _npy_to_numpy(npy_array):  # type: (object) -> np.array
     return np.load(stream, allow_pickle=True)
 
 
+def _npz_to_sparse(npz_bytes):  # type: (object) -> scipy.sparse.spmatrix
+    """Convert .npz-formatted data to a sparse matrix via ``scipy.sparse.load_npz``.
+
+    Args:
+        npz_bytes (object): Bytes encoding a sparse matrix in the .npz format.
+
+    Returns:
+        (scipy.sparse.spmatrix): The sparse matrix loaded from ``npz_bytes``.
+    """
+    buffer = BytesIO(npz_bytes)  # load_npz reads from a file-like object, not raw bytes
+    return scipy.sparse.load_npz(buffer)
+
+
 _decoder_map = {
     content_types.NPY: _npy_to_numpy,
     content_types.CSV: _csv_to_numpy,
     content_types.JSON: _json_to_numpy,
+    content_types.NPZ: _npz_to_sparse,  # yields a scipy.sparse matrix, not np.array
 }
 
 
 def decode(obj, content_type):
-    """Decode an object to one of the default content types to a numpy array.
+    """Decode an object that is encoded as one of the default content types.
 
     Args:
         obj (object): to be decoded.
         content_type (str): content type to be used.
 
     Returns:
-        np.array: decoded object.
+        object: decoded object for prediction.
     """
     try:
         decoder = _decoder_map[content_type]
diff --git a/test/unit/test_decoder.py b/test/unit/test_decoder.py
@@ -13,6 +13,7 @@
 from mock import Mock, patch
 import numpy as np
 import pytest
+import scipy.sparse
 from six import BytesIO
 
 from sagemaker_inference import content_types, decoder, errors
@@ -63,6 +64,26 @@ def test_csv_to_numpy(target, expected):
     np.testing.assert_equal(actual, expected)
 
 
+@pytest.mark.parametrize(
+    "target",  # one case each for CSC, CSR and COO input
+    [
+        scipy.sparse.csc_matrix(np.array([[0, 0, 3], [4, 0, 0]])),
+        scipy.sparse.csr_matrix(np.array([[1, 0], [0, 7]])),
+        scipy.sparse.coo_matrix(np.array([[6, 2], [5, 9]])),
+    ],
+)
+def test_npz_to_sparse(target):
+    buffer = BytesIO()
+    scipy.sparse.save_npz(buffer, target)  # serialize target into the in-memory buffer
+    data = buffer.getvalue()
+    matrix = decoder._npz_to_sparse(data)
+
+    actual = matrix.toarray()
+    expected = target.toarray()
+
+    np.testing.assert_equal(actual, expected)  # compares values only, not sparse format
+
+
 def test_decode_error():
     with pytest.raises(errors.UnsupportedFormatError):
         decoder.decode(42, content_types.OCTET_STREAM)