
Commit ddb01d4

Add 6 window functions from blog post
1 parent 0536879 commit ddb01d4

File tree

8 files changed: +407 −0 lines changed

README.md

Lines changed: 73 additions & 0 deletions
# Introduction

This repository contains the source code for a blog post about window functions in PySpark: [Blog Post](https://mitchellvanrijkom.com). In that post, I describe how to use the 6 most commonly used window functions in PySpark.

# Window functions

When we work with data in Spark, we commonly use the SQL module. With it, we can easily create DataFrames through the DataFrame API, which relies on Spark's optimizers to support a wide range of data sources and algorithms tuned for big data workloads.

In SQL, there is a particular type of operation called a window function. For each row, a window frame of related rows is determined, a function is calculated over the rows in that frame, and the result is returned as a value for that row.

Because Spark builds on SQL, we also have window functions at our disposal. When we combine the power of DataFrames with window functions, we can express some unique, optimized calculations!
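As a rough, minimal sketch (not one of the repository's files), this is what a window function looks like in PySpark: each row gets a running total computed over the rows of its own product, up to its own date. The column names here are only illustrative.

```python
from datetime import datetime

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Window Function Sketch").getOrCreate()

# A tiny illustrative dataset: two products with a few sales each
df = spark.createDataFrame(
    [
        ["A", datetime(2022, 1, 1), 10],
        ["A", datetime(2022, 1, 2), 20],
        ["B", datetime(2022, 1, 1), 5],
        ["B", datetime(2022, 1, 3), 15],
    ],
    ["product", "date", "amount"],
)

# The frame: rows of the same product, ordered by date, up to the current row
w = Window.partitionBy("product").orderBy("date")

# For every row, sum the amounts in its frame: a running total per product
df.withColumn("running_total", F.sum("amount").over(w)).show()
```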
# Repository

## Getting started

```bash
# Create virtualenv
python -m venv .venv

# Activate virtualenv
. .venv/bin/activate

# Install dependencies
pip install -r requirements.txt

# Run the code
python most_recent.py
```

The repository contains the following files:
## Aggregate Functions

### How to calculate a cumulative sum (running total) 📈

Very easy with a SQL window function! 👇🏻

[cumulative_sum.py](cumulative_sum.py)

### How to calculate a moving average 📈

Filter out the noise to determine the direction of a trend!

[moving_average.py](moving_average.py)

## Ranking Functions

### Select only the most recent records

An easy way to remove duplicate entries

[most_recent.py](most_recent.py)

### Break your dataset into equal groups

Rank each value in your dataset

[rank.py](rank.py)
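rank.py is not expanded in this diff. As a loose sketch of the idea only (not the repository's actual code), PySpark's ranking functions such as `F.ntile` and `F.rank` can split and order a dataset like this:

```python
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Rank Sketch").getOrCreate()

# Hypothetical scores, just to illustrate the ranking functions
df = spark.createDataFrame(
    [["a", 10], ["b", 35], ["c", 22], ["d", 47], ["e", 15], ["f", 29], ["g", 41], ["h", 8]],
    ["id", "score"],
)

w = Window.orderBy("score")

(
    df
    # ntile(4) splits the ordered rows into 4 roughly equal groups (quartiles)
    .withColumn("quartile", F.ntile(4).over(w))
    # rank() gives each value its position within the ordering
    .withColumn("rank", F.rank().over(w))
    .show()
)
```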
## Value/Analytical Functions

### Calculate the difference from preceding rows

Very easy to select preceding or following rows

[difference.py](difference.py)

### Get the first and last value of the month

Quickly analyze the start and end of each month

[first_last.py](first_last.py)

cumulative_sum.py

Lines changed: 78 additions & 0 deletions
```python
from datetime import datetime

import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Cumulative Sum").getOrCreate()

headers = ["date", "sales"]

data = [
    [datetime(2022, 1, 1), 100],
    [datetime(2022, 1, 2), 1543],
    [datetime(2022, 1, 3), 756],
    [datetime(2022, 1, 4), 2223],
    [datetime(2022, 1, 5), 765],
    [datetime(2022, 1, 6), 734],
    [datetime(2022, 1, 7), 762],
    [datetime(2022, 1, 8), 3422],
    [datetime(2022, 1, 9), 1500],
    [datetime(2022, 1, 10), 7332],
    [datetime(2022, 1, 11), 4200],
    [datetime(2022, 1, 12), 1121],
    [datetime(2022, 1, 13), 448],
    [datetime(2022, 1, 14), 1198],
    [datetime(2022, 1, 15), 1500],
    [datetime(2022, 1, 16), 4200],
    [datetime(2022, 1, 17), 1121],
    [datetime(2022, 1, 18), 448],
    [datetime(2022, 1, 19), 1198],
    [datetime(2022, 1, 20), 1198],
    [datetime(2022, 1, 21), 7653],
    [datetime(2022, 1, 22), 2345],
    [datetime(2022, 1, 23), 1246],
    [datetime(2022, 1, 24), 888],
    [datetime(2022, 1, 25), 2653],
    [datetime(2022, 1, 26), 8445],
    [datetime(2022, 1, 27), 1198],
    [datetime(2022, 1, 28), 3211],
    [datetime(2022, 1, 29), 2745],
    [datetime(2022, 1, 30), 1234],
    [datetime(2022, 1, 31), 6542],
]

# A window over the whole dataset (no partition key), ordered by date.
# Summing "sales" over this window gives a running total up to the current row.
df = spark.createDataFrame(data, headers).withColumn(
    "cumsum", F.sum("sales").over(Window.partitionBy().orderBy("date"))
)
df.show()
# +-------------------+-----+------+
# |               date|sales|cumsum|
# +-------------------+-----+------+
# |2022-01-01 00:00:00|  100|   100|
# |2022-01-02 00:00:00| 1543|  1643|
# |2022-01-03 00:00:00|  756|  2399|
# |2022-01-04 00:00:00| 2223|  4622|
# |2022-01-05 00:00:00|  765|  5387|
# |2022-01-06 00:00:00|  734|  6121|
# |2022-01-07 00:00:00|  762|  6883|
# |2022-01-08 00:00:00| 3422| 10305|
# |2022-01-09 00:00:00| 1500| 11805|
# |2022-01-10 00:00:00| 7332| 19137|
# |2022-01-11 00:00:00| 4200| 23337|
# |2022-01-12 00:00:00| 1121| 24458|
# |2022-01-13 00:00:00|  448| 24906|
# |2022-01-14 00:00:00| 1198| 26104|
# |2022-01-15 00:00:00| 1500| 27604|
# |2022-01-16 00:00:00| 4200| 31804|
# |2022-01-17 00:00:00| 1121| 32925|
# |2022-01-18 00:00:00|  448| 33373|
# |2022-01-19 00:00:00| 1198| 34571|
# |2022-01-20 00:00:00| 1198| 35769|
# +-------------------+-----+------+
# only showing top 20 rows

# Plot the daily sales next to the running total (uses pandas/matplotlib)
df.toPandas().plot.line(x="date", y=["sales", "cumsum"], rot=45)
plt.show()
```

difference.py

Lines changed: 42 additions & 0 deletions
```python
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Differences with lag").getOrCreate()

headers = ["date", "product", "price"]

data = [
    [datetime(2022, 1, 10), "Bose Revolve+", 330],
    [datetime(2022, 1, 11), "JBL Partybox", 299],
    [datetime(2022, 1, 12), "Bose Revolve+", 299],
    [datetime(2022, 1, 12), "Sonos Move", 399],
    [datetime(2022, 1, 13), "JBL Partybox", 275],
    [datetime(2022, 2, 10), "Bose Revolve+", 360],
    [datetime(2022, 2, 12), "Sonos Move", 359],
    [datetime(2022, 2, 13), "JBL Partybox", 269],
    [datetime(2022, 2, 16), "Bose Revolve+", 330],
]
df = spark.createDataFrame(data, headers)

# One frame per product, ordered by date, so lag() can look at the previous price
window_spec = Window.partitionBy("product").orderBy("date")

difference_df = (
    df.withColumn("previous_price", F.lag("price").over(window_spec))
    # The first row of each product has no previous price, so drop it
    .filter(F.col("previous_price").isNotNull())
    .withColumn("difference", F.col("price") - F.col("previous_price"))
)
difference_df.show()
# +-------------------+-------------+-----+--------------+----------+
# |               date|      product|price|previous_price|difference|
# +-------------------+-------------+-----+--------------+----------+
# |2022-01-12 00:00:00|Bose Revolve+|  299|           330|       -31|
# |2022-02-10 00:00:00|Bose Revolve+|  360|           299|        61|
# |2022-02-16 00:00:00|Bose Revolve+|  330|           360|       -30|
# |2022-01-13 00:00:00| JBL Partybox|  275|           299|       -24|
# |2022-02-13 00:00:00| JBL Partybox|  269|           275|        -6|
# |2022-02-12 00:00:00|   Sonos Move|  359|           399|       -40|
# +-------------------+-------------+-----+--------------+----------+
```

first_last.py

Lines changed: 48 additions & 0 deletions
```python
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("First Last").getOrCreate()

headers = ["date", "product", "price"]

data = [
    [datetime(2022, 1, 10), "Bose Revolve+", 330],
    [datetime(2022, 1, 11), "JBL Partybox", 299],
    [datetime(2022, 1, 12), "Bose Revolve+", 299],
    [datetime(2022, 1, 14), "Bose Revolve+", 399],
    [datetime(2022, 1, 18), "JBL Partybox", 300],
    [datetime(2022, 1, 29), "Bose Revolve+", 450],
    [datetime(2022, 1, 13), "JBL Partybox", 275],
    [datetime(2022, 2, 10), "Bose Revolve+", 360],
    [datetime(2022, 2, 13), "JBL Partybox", 269],
    [datetime(2022, 2, 10), "Bose Revolve+", 200],
    [datetime(2022, 2, 16), "Bose Revolve+", None],
]
df = spark.createDataFrame(data, headers)

# One frame per product per month, covering every row in that group.
# The "year" and "month" columns are added below, before the window is applied.
window_spec = (
    Window.partitionBy("year", "month", "product")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

first_last_df = (
    df.withColumn("year", F.year("date"))
    .withColumn("month", F.month("date"))
    # first/last non-null price within the month, ordered by date
    .withColumn("first_value", F.first("price", ignorenulls=True).over(window_spec))
    .withColumn("last_value", F.last("price", ignorenulls=True).over(window_spec))
    .select(["year", "month", "product", "first_value", "last_value"])
    .distinct()
)
first_last_df.show()
# +----+-----+-------------+-----------+----------+
# |year|month|      product|first_value|last_value|
# +----+-----+-------------+-----------+----------+
# |2022|    1|Bose Revolve+|        330|       450|
# |2022|    1| JBL Partybox|        299|       300|
# |2022|    2|Bose Revolve+|        360|       200|
# |2022|    2| JBL Partybox|        269|       269|
# +----+-----+-------------+-----------+----------+
```

most_recent.py

Lines changed: 42 additions & 0 deletions
```python
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Most Recent Record").getOrCreate()

headers = ["date", "product", "price"]

data = [
    [datetime(2022, 1, 10), "Bose Revolve+", 330],
    [datetime(2022, 1, 11), "JBL Partybox", 299],
    [datetime(2022, 1, 12), "Bose Revolve+", 299],
    [datetime(2022, 1, 12), "Sonos Move", 399],
    [datetime(2022, 1, 13), "JBL Partybox", 275],
    [datetime(2022, 2, 10), "Bose Revolve+", 360],
    [datetime(2022, 2, 12), "Sonos Move", 359],
    [datetime(2022, 2, 13), "JBL Partybox", 269],
    [datetime(2022, 2, 16), "Bose Revolve+", 330],
]
df = spark.createDataFrame(data, headers)


product_window = Window.partitionBy("product").orderBy(F.col("date").desc())

recent_df = (
    df.withColumn("row_num", F.row_number().over(product_window))
    # We now have a sequential numbering per product, with the newest date first
    # Simply filter the rows with row_num = 1 to select the most recent record
    .filter(F.col("row_num") == 1)
    # We do not need the row_number column anymore, so we drop it
    .drop("row_num")
)
recent_df.show()
# +-------------------+-------------+-----+
# |               date|      product|price|
# +-------------------+-------------+-----+
# |2022-02-16 00:00:00|Bose Revolve+|  330|
# |2022-02-13 00:00:00| JBL Partybox|  269|
# |2022-02-12 00:00:00|   Sonos Move|  359|
# +-------------------+-------------+-----+
```

moving_average.py

Lines changed: 81 additions & 0 deletions
```python
from datetime import datetime

import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Moving Average").getOrCreate()

headers = ["date", "sales"]

data = [
    [datetime(2022, 1, 1), 100],
    [datetime(2022, 1, 2), 1543],
    [datetime(2022, 1, 3), 756],
    [datetime(2022, 1, 4), 2223],
    [datetime(2022, 1, 5), 765],
    [datetime(2022, 1, 6), 734],
    [datetime(2022, 1, 7), 762],
    [datetime(2022, 1, 8), 3422],
    [datetime(2022, 1, 9), 1500],
    [datetime(2022, 1, 10), 7332],
    [datetime(2022, 1, 11), 4200],
    [datetime(2022, 1, 12), 1121],
    [datetime(2022, 1, 13), 448],
    [datetime(2022, 1, 14), 1198],
    [datetime(2022, 1, 15), 1500],
    [datetime(2022, 1, 16), 4200],
    [datetime(2022, 1, 17), 1121],
    [datetime(2022, 1, 18), 448],
    [datetime(2022, 1, 19), 1198],
    [datetime(2022, 1, 20), 1198],
    [datetime(2022, 1, 21), 7653],
    [datetime(2022, 1, 22), 2345],
    [datetime(2022, 1, 23), 1246],
    [datetime(2022, 1, 24), 888],
    [datetime(2022, 1, 25), 2653],
    [datetime(2022, 1, 26), 8445],
    [datetime(2022, 1, 27), 1198],
    [datetime(2022, 1, 28), 3211],
    [datetime(2022, 1, 29), 2745],
    [datetime(2022, 1, 30), 1234],
    [datetime(2022, 1, 31), 6542],
]


def days(i):
    # rangeBetween works on the numeric ordering value, so express days in seconds
    return i * 86400


# Frame: rows from 7 days before the current row up to and including the current row
moving_7_day_window = Window.orderBy(
    F.col("date").cast("timestamp").cast("long")
).rangeBetween(-days(7), Window.currentRow)

df = spark.createDataFrame(data, headers).withColumn(
    "mov_avg", F.avg("sales").over(moving_7_day_window)
)

df.show()
# +-------------------+-----+------------------+
# |               date|sales|           mov_avg|
# +-------------------+-----+------------------+
# |2022-01-01 00:00:00|  100|             100.0|
# |2022-01-02 00:00:00| 1543|             821.5|
# |2022-01-03 00:00:00|  756| 799.6666666666666|
# |2022-01-04 00:00:00| 2223|            1155.5|
# |2022-01-05 00:00:00|  765|            1077.4|
# |2022-01-06 00:00:00|  734|1020.1666666666666|
# |2022-01-07 00:00:00|  762| 983.2857142857143|
# |2022-01-08 00:00:00| 3422|          1288.125|
# |2022-01-09 00:00:00| 1500|          1463.125|
# |2022-01-10 00:00:00| 7332|           2186.75|
# |2022-01-11 00:00:00| 4200|           2617.25|
# |2022-01-12 00:00:00| 1121|            2479.5|
# |2022-01-13 00:00:00|  448|          2439.875|
# |2022-01-14 00:00:00| 1198|          2497.875|
# |2022-01-15 00:00:00| 1500|          2590.125|
# |2022-01-16 00:00:00| 4200|          2687.375|
# |2022-01-17 00:00:00| 1121|            2640.0|
# |2022-01-18 00:00:00|  448|            1779.5|
# |2022-01-19 00:00:00| 1198|           1404.25|
# |2022-01-20 00:00:00| 1198|          1413.875|
# +-------------------+-----+------------------+
# only showing top 20 rows

# Plot the raw sales next to the moving average (uses pandas/matplotlib)
df.toPandas().plot.line(x="date", y=["sales", "mov_avg"], rot=45)
plt.show()
```
