Commit 8ee32b8

Fix Spark file and cache warnings (#137)

* don't re-upload impls
* remove second cache call
* rename sample_df -> transformed_df

1 parent 4bb80ce commit 8ee32b8

3 files changed: 18 additions, 10 deletions

pkg/workloads/lib/context.py

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ def __init__(self, **kwargs):
         self._aggregator_impls = {}
         self._model_impls = {}
         self._metadatas = {}
+        self.spark_uploaded_impls = {}

         # This affects Tensorflow S3 access
         os.environ["AWS_REGION"] = self.cortex_config.get("region", "")
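The new spark_uploaded_impls dict acts as a set of implementation file paths that have already been shipped to the Spark cluster; the spark_util.py hunks below check it before calling addPyFile again.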

pkg/workloads/spark_job/spark_job.py

Lines changed: 7 additions & 7 deletions
@@ -236,8 +236,8 @@ def validate_transformers(spark, ctx, cols_to_transform, raw_df):
     TEST_DF_SIZE = 100

     logger.info("Sanity checking transformers against the first {} samples".format(TEST_DF_SIZE))
-    sample_df = raw_df.limit(TEST_DF_SIZE).cache()
-    test_df = raw_df.limit(TEST_DF_SIZE).cache()
+    transformed_df = raw_df.limit(TEST_DF_SIZE).cache()
+    test_df = raw_df.limit(TEST_DF_SIZE)

     resource_list = sorted([ctx.tf_id_map[f] for f in cols_to_transform], key=lambda r: r["name"])
     for transformed_column in resource_list:
@@ -257,17 +257,17 @@ def validate_transformers(spark, ctx, cols_to_transform, raw_df):
         logger.info("Transforming {} to {}".format(", ".join(input_cols), tf_name))

         spark_util.validate_transformer(tf_name, test_df, ctx, spark)
-        sample_df = spark_util.transform_column(
-            transformed_column["name"], sample_df, ctx, spark
+        transformed_df = spark_util.transform_column(
+            transformed_column["name"], transformed_df, ctx, spark
         )

-        sample_df.select(tf_name).collect()  # run the transformer
-        show_df(sample_df.select(*input_cols, tf_name), ctx, n=3, sort=False)
+        transformed_df.select(tf_name).collect()  # run the transformer
+        show_df(transformed_df.select(*input_cols, tf_name), ctx, n=3, sort=False)

         for alias in transformed_column["aliases"][1:]:
             logger.info("Transforming {} to {}".format(", ".join(input_cols), alias))

-            display_transform_df = sample_df.withColumn(alias, F.col(tf_name)).select(
+            display_transform_df = transformed_df.withColumn(alias, F.col(tf_name)).select(
                 *input_cols, alias
             )
             show_df(display_transform_df, ctx, n=3, sort=False)
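Note on the cache fix above: caching two DataFrames built from the same logical plan causes Spark's CacheManager to warn on the second call, which is why test_df no longer calls .cache(). A minimal sketch of the warning (assuming a local SparkSession; the demo names are illustrative, not from this commit):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("cache-warning-demo").getOrCreate()
raw_df = spark.range(1000).toDF("id")

first_df = raw_df.limit(100).cache()   # first cache of this plan: fine
second_df = raw_df.limit(100).cache()  # same logical plan: Spark logs a warning
                                       # like "Asked to cache already cached data."

second_df.count()  # materializes the cached data
spark.stop()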

pkg/workloads/spark_job/spark_util.py

Lines changed: 10 additions & 3 deletions
@@ -501,7 +501,11 @@ def extract_inputs(column_name, ctx):

 def execute_transform_spark(column_name, df, ctx, spark):
     trans_impl, trans_impl_path = ctx.get_transformer_impl(column_name)
-    spark.sparkContext.addPyFile(trans_impl_path)  # Executor pods need this because of the UDF
+
+    if trans_impl_path not in ctx.spark_uploaded_impls:
+        spark.sparkContext.addPyFile(trans_impl_path)  # Executor pods need this because of the UDF
+        ctx.spark_uploaded_impls[trans_impl_path] = True
+
     columns_input_config, impl_args = extract_inputs(column_name, ctx)
     try:
         return trans_impl.transform_spark(df, columns_input_config, impl_args, column_name)
@@ -513,8 +517,11 @@ def execute_transform_python(column_name, df, ctx, spark, validate=False):
     trans_impl, trans_impl_path = ctx.get_transformer_impl(column_name)
     columns_input_config, impl_args = extract_inputs(column_name, ctx)

-    spark.sparkContext.addPyFile(trans_impl_path)  # Executor pods need this because of the UDF
-    # not a dictionary because it is possible that one column may map to multiple input names
+    if trans_impl_path not in ctx.spark_uploaded_impls:
+        spark.sparkContext.addPyFile(trans_impl_path)  # Executor pods need this because of the UDF
+        # not a dictionary because it is possible that one column may map to multiple input names
+        ctx.spark_uploaded_impls[trans_impl_path] = True
+
     required_columns_sorted, columns_input_config_indexed = column_names_to_index(
         columns_input_config
     )
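Note on the guard above: SparkContext.addPyFile logs a warning if the same path is added twice (added paths cannot be overwritten), so ctx.spark_uploaded_impls memoizes which implementation files have already been shipped to the executors. The same pattern in isolation (a sketch; the helper name and file path are hypothetical, and sc is assumed to be a live SparkContext):

def add_py_file_once(sc, path, uploaded_impls):
    # Ship `path` to the executors only the first time it is seen; a repeat
    # call to sc.addPyFile for the same path would make Spark log a warning.
    if path not in uploaded_impls:
        sc.addPyFile(path)
        uploaded_impls[path] = True

uploaded_impls = {}  # plays the role of ctx.spark_uploaded_impls
add_py_file_once(sc, "/tmp/transformer_impl.py", uploaded_impls)
add_py_file_once(sc, "/tmp/transformer_impl.py", uploaded_impls)  # no-op, no warning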
