From 2156623ead8d44c09bcfeb86838539f593dd0659 Mon Sep 17 00:00:00 2001 From: vpostrigan Date: Thu, 20 Jan 2022 13:11:50 +0200 Subject: [PATCH 1/2] Fix filename mistype in comment --- .../lab210_schema_introspection/SchemaIntrospectionApp.java | 5 +++-- .../SchemaIntrospectionScalaApp.scala | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionApp.java b/src/main/java/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionApp.java index 3735f9a..b2180a4 100644 --- a/src/main/java/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionApp.java +++ b/src/main/java/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionApp.java @@ -39,8 +39,9 @@ private void start() { .master("local") .getOrCreate(); - // Reads a CSV file with header, called books.csv, stores it in a - // dataframe + // Reads a CSV file with header, called + // Restaurants_in_Wake_County_NC.csv, + // stores it in a dataframe Dataset df = spark.read().format("csv") .option("header", "true") .load("data/Restaurants_in_Wake_County_NC.csv"); diff --git a/src/main/scala/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionScalaApp.scala b/src/main/scala/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionScalaApp.scala index f7cba1b..65e7465 100644 --- a/src/main/scala/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionScalaApp.scala +++ b/src/main/scala/net/jgp/books/spark/ch03/lab210_schema_introspection/SchemaIntrospectionScalaApp.scala @@ -22,8 +22,9 @@ object SchemaIntrospectionScalaApp { val spark = SparkSession.builder.appName("Schema introspection for restaurants in Wake County, NC") .master("local").getOrCreate - // Reads a CSV file with header, called books.csv, stores it in a - // dataframe + // Reads a CSV file with header, called + // Restaurants_in_Wake_County_NC.csv, + // stores it in a dataframe var df = spark.read.format("csv").option("header", "true") .load("data/Restaurants_in_Wake_County_NC.csv") From 2e31805849df135d71557298101e9dffa5d7301d Mon Sep 17 00:00:00 2001 From: vpostrigan Date: Thu, 20 Jan 2022 13:21:10 +0200 Subject: [PATCH 2/2] Fix mapping between Lat/Long and X/Y --- .../JsonIngestionSchemaManipulationApp.java | 4 ++-- .../spark/ch03/lab230_dataframe_union/DataframeUnionApp.java | 4 ++-- .../jsonIngestionSchemaManipulationApp.py | 4 ++-- src/main/python/lab230_dataframe_union/util.py | 4 ++-- .../JsonIngestionSchemaManipulationScalaApp.scala | 4 ++-- .../ch03/lab230_dataframe_union/DataframeUnionScalaApp.scala | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main/java/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationApp.java b/src/main/java/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationApp.java index 188de74..e270fb9 100644 --- a/src/main/java/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationApp.java +++ b/src/main/java/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationApp.java @@ -60,8 +60,8 @@ private void start() { .withColumn("dateEnd", df.col("fields.closing_date")) .withColumn("type", split(df.col("fields.type_description"), " - ").getItem(1)) - .withColumn("geoX", df.col("fields.geolocation").getItem(0)) - .withColumn("geoY", df.col("fields.geolocation").getItem(1)); + .withColumn("geoX", df.col("fields.geolocation").getItem(1)) + .withColumn("geoY", df.col("fields.geolocation").getItem(0)); df = df.withColumn("id", concat(df.col("state"), lit("_"), df.col("county"), lit("_"), diff --git a/src/main/java/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionApp.java b/src/main/java/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionApp.java index 3ac17a0..599325e 100644 --- a/src/main/java/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionApp.java +++ b/src/main/java/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionApp.java @@ -121,8 +121,8 @@ private Dataset buildDurhamRestaurantsDataframe() { .withColumn("dateEnd", df.col("fields.closing_date")) .withColumn("type", split(df.col("fields.type_description"), " - ").getItem(1)) - .withColumn("geoX", df.col("fields.geolocation").getItem(0)) - .withColumn("geoY", df.col("fields.geolocation").getItem(1)) + .withColumn("geoX", df.col("fields.geolocation").getItem(1)) + .withColumn("geoY", df.col("fields.geolocation").getItem(0)) .drop(df.col("fields")) .drop(df.col("geometry")) .drop(df.col("record_timestamp")) diff --git a/src/main/python/lab220_json_ingestion_schema_manipulation/jsonIngestionSchemaManipulationApp.py b/src/main/python/lab220_json_ingestion_schema_manipulation/jsonIngestionSchemaManipulationApp.py index 4009536..74b39c0 100644 --- a/src/main/python/lab220_json_ingestion_schema_manipulation/jsonIngestionSchemaManipulationApp.py +++ b/src/main/python/lab220_json_ingestion_schema_manipulation/jsonIngestionSchemaManipulationApp.py @@ -41,8 +41,8 @@ def main(spark): .withColumn("dateStart", F.col("fields.opening_date")) \ .withColumn("dateEnd", F.col("fields.closing_date")) \ .withColumn("type", F.split(F.col("fields.type_description"), " - ").getItem(1)) \ - .withColumn("geoX", F.col("fields.geolocation").getItem(0)) \ - .withColumn("geoY", F.col("fields.geolocation").getItem(1)) + .withColumn("geoX", F.col("fields.geolocation").getItem(1)) \ + .withColumn("geoY", F.col("fields.geolocation").getItem(0)) df = df.withColumn("id", F.concat(F.col("state"), F.lit("_"), F.col("county"), F.lit("_"), diff --git a/src/main/python/lab230_dataframe_union/util.py b/src/main/python/lab230_dataframe_union/util.py index 6299b29..f0cfcf3 100644 --- a/src/main/python/lab230_dataframe_union/util.py +++ b/src/main/python/lab230_dataframe_union/util.py @@ -56,8 +56,8 @@ def build_durham_restaurants_dataframe(df): .withColumn("dateStart", F.col("fields.opening_date")) \ .withColumn("dateEnd", F.col("fields.closing_date")) \ .withColumn("type", F.split(F.col("fields.type_description"), " - ").getItem(1)) \ - .withColumn("geoX", F.col("fields.geolocation").getItem(0)) \ - .withColumn("geoY", F.col("fields.geolocation").getItem(1)) \ + .withColumn("geoX", F.col("fields.geolocation").getItem(1)) \ + .withColumn("geoY", F.col("fields.geolocation").getItem(0)) \ .drop(*drop_cols) df = df.withColumn("id", diff --git a/src/main/scala/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationScalaApp.scala b/src/main/scala/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationScalaApp.scala index 961a0e9..18ec599 100644 --- a/src/main/scala/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationScalaApp.scala +++ b/src/main/scala/net/jgp/books/spark/ch03/lab220_json_ingestion_schema_manipulation/JsonIngestionSchemaManipulationScalaApp.scala @@ -42,8 +42,8 @@ object JsonIngestionSchemaManipulationScalaApp { .withColumn("dateStart", col("fields.opening_date")) .withColumn("dateEnd", col("fields.closing_date")) .withColumn("type", split(col("fields.type_description"), " - ").getItem(1)) - .withColumn("geoX", col("fields.geolocation").getItem(0)) - .withColumn("geoY", col("fields.geolocation").getItem(1)) + .withColumn("geoX", col("fields.geolocation").getItem(1)) + .withColumn("geoY", col("fields.geolocation").getItem(0)) val cols_list = List(col("state"), lit("_"), col("county"), lit("_"), col("datasetId")) diff --git a/src/main/scala/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionScalaApp.scala b/src/main/scala/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionScalaApp.scala index 9225572..551b3fa 100644 --- a/src/main/scala/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionScalaApp.scala +++ b/src/main/scala/net/jgp/books/spark/ch03/lab230_dataframe_union/DataframeUnionScalaApp.scala @@ -95,8 +95,8 @@ object DataframeUnionScalaApp { .withColumn("dateStart", col("fields.opening_date")) .withColumn("dateEnd", col("fields.closing_date")) .withColumn("type", split(col("fields.type_description"), " - ").getItem(1)) - .withColumn("geoX", col("fields.geolocation").getItem(0)) - .withColumn("geoY", col("fields.geolocation").getItem(1)) + .withColumn("geoX", col("fields.geolocation").getItem(1)) + .withColumn("geoY", col("fields.geolocation").getItem(0)) .drop(drop_cols:_*) df1 = df1.withColumn("id",