From 3b9e388d70ece7f443829aabf5573d8bb3778cd8 Mon Sep 17 00:00:00 2001 From: Md Shahid Afridi P Date: Wed, 21 Feb 2024 13:50:40 +0530 Subject: [PATCH 1/3] Update pyspark-add-month.py Straight forward solution to add months --- pyspark-add-month.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pyspark-add-month.py b/pyspark-add-month.py index 3249579..bff2c17 100644 --- a/pyspark-add-month.py +++ b/pyspark-add-month.py @@ -5,9 +5,6 @@ from pyspark.sql import SparkSession spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() -from pyspark.sql.functions import col,expr +from pyspark.sql.functions import add_months, to_date data=[("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)] -spark.createDataFrame(data).toDF("date","increment") \ - .select(col("date"),col("increment"), \ - expr("add_months(to_date(date,'yyyy-MM-dd'),cast(increment as int))").alias("inc_date")) \ - .show() \ No newline at end of file +spark.createDataFrame(data,schema=["date","increment"]).select(['date','increment',add_months(to_date('date'),'increment').alias("inc_date")]).show() From a99994bcd4ecdb6abd5b87746aa127e76c452aef Mon Sep 17 00:00:00 2001 From: Md Shahid Afridi P Date: Fri, 23 Feb 2024 11:34:38 +0530 Subject: [PATCH 2/3] Update pyspark-dataframe-flatMap.py added the code for flatmap. --- pyspark-dataframe-flatMap.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pyspark-dataframe-flatMap.py b/pyspark-dataframe-flatMap.py index 3dd84df..c0f3b12 100644 --- a/pyspark-dataframe-flatMap.py +++ b/pyspark-dataframe-flatMap.py @@ -12,9 +12,18 @@ ("Michael,Rose,",["Spark","Java","C++"],"NJ"), \ ("Robert,,Williams",["CSharp","VB"],"NV")] -df = spark.createDataFrame(data=data,schema=columns) -df.printSchema() -df.show(truncate=False) +# Convert data to a DataFrame +rdd = spark.sparkContext.parallelize(data) +row_rdd = rdd.map(lambda x: Row(name=x[0], languagesAtSchool=x[1], currentState=x[2])) +df = spark.createDataFrame(row_rdd, columns) -#Flatmap +# Apply flatMap transformation +flat_mapped_df = df.rdd.flatMap(lambda x: [(x["name"], lang, x["currentState"]) for lang in x["languagesAtSchool"]]) + +# Convert result to DataFrame +result_columns = ["name", "language", "currentState"] +result_df = flat_mapped_df.toDF(result_columns) + +# Show the result +result_df.show() From 0e42a6485186a249cb18f045ac69551082a903c9 Mon Sep 17 00:00:00 2001 From: Md Shahid Afridi P Date: Fri, 23 Feb 2024 11:43:33 +0530 Subject: [PATCH 3/3] Update pyspark-dataframe-flatMap.py --- pyspark-dataframe-flatMap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyspark-dataframe-flatMap.py b/pyspark-dataframe-flatMap.py index c0f3b12..20df053 100644 --- a/pyspark-dataframe-flatMap.py +++ b/pyspark-dataframe-flatMap.py @@ -4,7 +4,7 @@ """ -from pyspark.sql import SparkSession +from pyspark.sql import SparkSession, Row spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() columns = ["name","languagesAtSchool","currentState"]