Improve builtin index_string (#127)

1vn · web-flow · commit e1b4a5e244b8 · 2019-06-05T15:17:57.000-04:00
diff --git a/docs/applications/resources/transformed-columns.md b/docs/applications/resources/transformed-columns.md
@@ -55,7 +55,7 @@ See <!-- CORTEX_VERSION_MINOR -->[`transformers.yaml`](https://github.com/cortex
     columns:
       col: class  # the name of a string column
     args:
-      index: ["t", "f"]  # a value to be used as the index
+      indexes: {"index": ["t", "f"], "reversed_index": {"t": 0, "f": 1}}  # a value to be used as the index
 
 - kind: transformed_column
   name: price_bucketized
diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -219,7 +219,7 @@ Add to `app.yaml`:
     columns:
       text: class
     args:
-      index: class_index
+      indexes: class_index
 ```
 
 You can simplify the configuration for aggregates and transformed columns using [templates](applications/advanced/templates.md).
diff --git a/examples/iris/implementations/models/dnn.py b/examples/iris/implementations/models/dnn.py
@@ -12,6 +12,6 @@ def create_estimator(run_config, model_config):
     return tf.estimator.DNNClassifier(
         feature_columns=feature_columns,
         hidden_units=model_config["hparams"]["hidden_units"],
-        n_classes=len(model_config["aggregates"]["class_index"]),
+        n_classes=len(model_config["aggregates"]["class_index"]["index"]),
         config=run_config,
     )
diff --git a/examples/iris/resources/transformed_columns.yaml b/examples/iris/resources/transformed_columns.yaml
@@ -45,4 +45,4 @@
     columns:
       text: class
     args:
-      index: class_index
+      indexes: class_index
diff --git a/examples/movie-ratings/resources/transformed_columns.yaml b/examples/movie-ratings/resources/transformed_columns.yaml
@@ -12,7 +12,7 @@
     columns:
       text: user_id
     args:
-      index: user_id_index
+      indexes: user_id_index
 
 - kind: aggregate
   name: movie_id_index
@@ -28,4 +28,4 @@
     columns:
       text: movie_id
     args:
-      index: movie_id_index
+      indexes: movie_id_index
diff --git a/examples/reviews/resources/columns.yaml b/examples/reviews/resources/columns.yaml
@@ -25,4 +25,4 @@
     columns:
       text: label
     args:
-      index: label_index
+      indexes: label_index
diff --git a/pkg/aggregators/aggregators.yaml b/pkg/aggregators/aggregators.yaml
@@ -317,11 +317,12 @@
 # Enumerates the unique values in a string column and places them in a list ordered by frequency,
 # with the most frequent value at the 0th index.
 # Works well in conjunction with transformers.index_string.
-# For example: An input column with the following values ['t', 'f', 't'] would return ['t', 'f'].
+# For example: An input column with the following values ['t', 'f', 't'] would return
+# {"index": ['t', 'f'], "reversed_index": {'t': 0, 'f': 1}}.
 - kind: aggregator
   name: index_string
   path: index_string.py
-  output_type: [STRING]
+  output_type: {"index": [STRING], "reversed_index": {STRING: INT}}
   inputs:
     columns:
       col: STRING_COLUMN
diff --git a/pkg/aggregators/index_string.py b/pkg/aggregators/index_string.py
@@ -17,4 +17,6 @@ def aggregate_spark(data, columns, args):
     from pyspark.ml.feature import StringIndexer
 
     indexer = StringIndexer(inputCol=columns["col"])
-    return indexer.fit(data).labels
+    index = indexer.fit(data).labels
+    reversed_index = {v: k for k, v in enumerate(index)}
+    return {"index": index, "reversed_index": reversed_index}
diff --git a/pkg/transformers/index_string.py b/pkg/transformers/index_string.py
@@ -18,7 +18,7 @@ def transform_spark(data, columns, args, transformed_column_name):
     import pyspark.sql.functions as F
 
     indexer = StringIndexerModel.from_labels(
-        args["index"], inputCol=columns["text"], outputCol=transformed_column_name
+        args["indexes"]["index"], inputCol=columns["text"], outputCol=transformed_column_name
     )
 
     return indexer.transform(data).withColumn(
@@ -27,12 +27,11 @@ def transform_spark(data, columns, args, transformed_column_name):
 
 
 def transform_python(sample, args):
-    for idx, label in enumerate(args["index"]):
-        if label == sample["text"]:
-            return idx
+    if sample["text"] in args["indexes"]["reversed_index"]:
+        return args["indexes"]["reversed_index"][sample["text"]]
 
     raise Exception("Could not find {} in index: {}".format(sample["text"], args))
 
 
 def reverse_transform_python(transformed_value, args):
-    return args["index"][transformed_value]
+    return args["indexes"]["index"][transformed_value]
diff --git a/pkg/transformers/transformers.yaml b/pkg/transformers/transformers.yaml
@@ -42,7 +42,7 @@
 
 # Given labels, map the string column to its index in the labels array.
 # Example:
-#   INPUT: labels = ['r', 'b', 'g'] column = ['r', 'b', 'g', 'g'],
+#   INPUT: indexes = {"index": ['r', 'b', 'g'], "reversed_index": {'r': 0, 'b': 1, 'g': 2}} column = ['r', 'b', 'g', 'g'],
 #   OUTPUT: [0, 1, 2, 2]
 - kind: transformer
   name: index_string
@@ -52,4 +52,4 @@
     columns:
       text: STRING_COLUMN
     args:
-      index: [STRING]
+      indexes: {"index": [STRING], "reversed_index": {STRING: INT}}
diff --git a/pkg/workloads/spark_job/test/integration/iris_context.py b/pkg/workloads/spark_job/test/integration/iris_context.py
@@ -46,7 +46,7 @@ def get(input_data_path):
             "workload_id": "jjd3l0fi4fhwqtgmpatg",
             "key": "apps/iris/data/2019-03-08-09-58-35-701834/3976c5679bcf7cb550453802f4c3a9333c5f193f6097f1f5642de48d2397554/aggregates/54ead5d565a57cad06972cc11d2f01f05c4e9e1dbfc525d1fa66b7999213722.msgpack",
             "tags": {},
-            "type": ["STRING"],
+            "type": {"index": ["STRING"], "reversed_index": {"STRING": "INT"}},
             "embed": None,
             "file_path": "resources/aggregates.yaml",
             "name": "class_index",
@@ -208,7 +208,10 @@ def get(input_data_path):
             "index": 2,
             "id_with_tags": "81bcee8795009e19f3378b2c3ea10fa6048741f2ad6ef841e5ed55c81319a0c",
             "resource_type": "transformer",
-            "inputs": {"columns": {"text": "STRING_COLUMN"}, "args": {"index": ["STRING"]}},
+            "inputs": {
+                "columns": {"text": "STRING_COLUMN"},
+                "args": {"indexes": {"index": ["STRING"], "reversed_index": {"STRING": "INT"}}},
+            },
         },
     },
     "python_packages": {},
@@ -441,7 +444,7 @@ def get(input_data_path):
             "index": 4,
             "id_with_tags": "f3b94376e20e64f67d0808c3589d8a4bb09196e38ff81ba775408be38148c1e",
             "resource_type": "transformed_column",
-            "inputs": {"columns": {"text": "class"}, "args": {"index": "class_index"}},
+            "inputs": {"columns": {"text": "class"}, "args": {"indexes": "class_index"}},
         },
     },
     "models": {
@@ -559,7 +562,7 @@ def get(input_data_path):
             "name": "index_string",
             "namespace": "cortex",
             "path": "",
-            "output_type": ["STRING"],
+            "output_type": {"index": ["STRING"], "reversed_index": {"STRING": "INT"}},
             "index": 29,
             "id_with_tags": "c32f21159377d5dc3ddc664fe5cabbe7b275eadc82b5f6ed711faa1a988deb4",
             "resource_type": "aggregator",

Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,6 @@ def create_estimator(run_config, model_config):`
`12`	`12`	`return tf.estimator.DNNClassifier(`
`13`	`13`	`feature_columns=feature_columns,`
`14`	`14`	`hidden_units=model_config["hparams"]["hidden_units"],`
`15`		`- n_classes=len(model_config["aggregates"]["class_index"]),`
	`15`	`+ n_classes=len(model_config["aggregates"]["class_index"]["index"]),`
`16`	`16`	`config=run_config,`
`17`	`17`	`)`