|
"""Compute a 7-day moving average of daily sales with a Spark range window."""

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from datetime import datetime

spark = SparkSession.builder.appName("Moving Average").getOrCreate()

headers = ["date", "sales"]

# One row per calendar day in January 2022; sales are arbitrary demo values.
data = [
    [datetime(2022, 1, 1), 100],
    [datetime(2022, 1, 2), 1543],
    [datetime(2022, 1, 3), 756],
    [datetime(2022, 1, 4), 2223],
    [datetime(2022, 1, 5), 765],
    [datetime(2022, 1, 6), 734],
    [datetime(2022, 1, 7), 762],
    [datetime(2022, 1, 8), 3422],
    [datetime(2022, 1, 9), 1500],
    [datetime(2022, 1, 10), 7332],
    [datetime(2022, 1, 11), 4200],
    [datetime(2022, 1, 12), 1121],
    [datetime(2022, 1, 13), 448],
    [datetime(2022, 1, 14), 1198],
    [datetime(2022, 1, 15), 1500],
    [datetime(2022, 1, 16), 4200],
    [datetime(2022, 1, 17), 1121],
    [datetime(2022, 1, 18), 448],
    [datetime(2022, 1, 19), 1198],
    [datetime(2022, 1, 20), 1198],
    [datetime(2022, 1, 21), 7653],
    [datetime(2022, 1, 22), 2345],
    [datetime(2022, 1, 23), 1246],
    [datetime(2022, 1, 24), 888],
    [datetime(2022, 1, 25), 2653],
    [datetime(2022, 1, 26), 8445],
    [datetime(2022, 1, 27), 1198],
    [datetime(2022, 1, 28), 3211],
    [datetime(2022, 1, 29), 2745],
    [datetime(2022, 1, 30), 1234],
    [datetime(2022, 1, 31), 6542],
]


def days(i):
    """Convert *i* days to seconds.

    The window below orders by the timestamp cast to a long (epoch
    seconds), so rangeBetween offsets must be expressed in seconds.
    """
    return i * 86400


# A 7-day moving average that includes the current row must look back
# only 6 days: the frame [ts - 6 days, ts] covers exactly 7 calendar
# days.  Using -days(7) here would silently average over 8 days --
# a classic off-by-one with inclusive range frames.
moving_7_day_window = Window.orderBy(
    F.col("date").cast("timestamp").cast("long")
).rangeBetween(-days(6), Window.currentRow)

df = spark.createDataFrame(data, headers).withColumn(
    "mov_avg", F.avg("sales").over(moving_7_day_window)
)

df.show()
# Leading rows have fewer than 7 days of history, so avg() simply
# averages over the rows that exist in the frame:
# +-------------------+-----+------------------+
# |               date|sales|           mov_avg|
# +-------------------+-----+------------------+
# |2022-01-01 00:00:00|  100|             100.0|
# |2022-01-02 00:00:00| 1543|             821.5|
# |2022-01-03 00:00:00|  756| 799.6666666666666|
# |2022-01-04 00:00:00| 2223|            1155.5|
# |2022-01-05 00:00:00|  765|            1077.4|
# |2022-01-06 00:00:00|  734|1020.1666666666666|
# |2022-01-07 00:00:00|  762| 983.2857142857143|
# |2022-01-08 00:00:00| 3422|  ~1457.86        |  <- (1543+...+3422)/7
# ...                                              (with -days(7) this
#                                                   row was 1288.125,
#                                                   an 8-day average)

df.toPandas().plot.line(x="date", y=["sales", "mov_avg"], rot=45)
0 commit comments