@@ -236,8 +236,8 @@ def validate_transformers(spark, ctx, cols_to_transform, raw_df):
     TEST_DF_SIZE = 100

     logger.info("Sanity checking transformers against the first {} samples".format(TEST_DF_SIZE))
-    sample_df = raw_df.limit(TEST_DF_SIZE).cache()
-    test_df = raw_df.limit(TEST_DF_SIZE).cache()
+    transformed_df = raw_df.limit(TEST_DF_SIZE).cache()
+    test_df = raw_df.limit(TEST_DF_SIZE)

     resource_list = sorted([ctx.tf_id_map[f] for f in cols_to_transform], key=lambda r: r["name"])
     for transformed_column in resource_list:
@@ -257,17 +257,17 @@ def validate_transformers(spark, ctx, cols_to_transform, raw_df):
         logger.info("Transforming {} to {}".format(", ".join(input_cols), tf_name))

         spark_util.validate_transformer(tf_name, test_df, ctx, spark)
-        sample_df = spark_util.transform_column(
-            transformed_column["name"], sample_df, ctx, spark
+        transformed_df = spark_util.transform_column(
+            transformed_column["name"], transformed_df, ctx, spark
         )

-        sample_df.select(tf_name).collect()  # run the transformer
-        show_df(sample_df.select(*input_cols, tf_name), ctx, n=3, sort=False)
+        transformed_df.select(tf_name).collect()  # run the transformer
+        show_df(transformed_df.select(*input_cols, tf_name), ctx, n=3, sort=False)

         for alias in transformed_column["aliases"][1:]:
             logger.info("Transforming {} to {}".format(", ".join(input_cols), alias))

-            display_transform_df = sample_df.withColumn(alias, F.col(tf_name)).select(
+            display_transform_df = transformed_df.withColumn(alias, F.col(tf_name)).select(
                 *input_cols, alias
             )
             show_df(display_transform_df, ctx, n=3, sort=False)
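The shape of the change: one cached DataFrame (transformed_df) accumulates the transformed columns across loop iterations, while test_df stays uncached because it is only handed to spark_util.validate_transformer for read-only checks. A minimal sketch of that pattern, using a throwaway local SparkSession and a hypothetical doubling transform standing in for the project's spark_util helpers:

    # Minimal illustration only -- the session, data, and transform here
    # are stand-ins, not part of the project's code.
    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    raw_df = spark.createDataFrame([(1,), (2,), (3,)], ["x"])

    TEST_DF_SIZE = 100

    # The accumulator is cached because each iteration both extends it and
    # triggers an action on it; the validation frame is read once per
    # transformer, so caching it buys nothing.
    transformed_df = raw_df.limit(TEST_DF_SIZE).cache()
    test_df = raw_df.limit(TEST_DF_SIZE)

    for tf_name in ["x_doubled"]:  # stand-in for the resource loop
        # stand-in for spark_util.transform_column(...)
        transformed_df = transformed_df.withColumn(tf_name, F.col("x") * 2)
        transformed_df.select(tf_name).collect()  # force the transform to run

    spark.stop()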