import org.apache.spark.sql.functions.when

/**
 * Demonstrates a data-skew problem in a shuffle join, then re-runs the same
 * join with Adaptive Query Execution (AQE) skew-join handling enabled so the
 * two job groups can be compared in the Spark UI.
 *
 * Launch with AQE and broadcast joins disabled so the first join exhibits the
 * skew (a single straggler task processing the hot key):
 *
 *   ./bin/spark-shell --master spark://spark-master:7077 \
 *     --driver-memory 3g --executor-memory 1024mb \
 *     --conf spark.sql.autoBroadcastJoinThreshold=-1 \
 *     --conf spark.sql.adaptive.enabled=false
 *
 * NOTE(review): relies on a project-local `SparkApp` trait to provide `spark`
 * (a SparkSession) — confirm it initializes the session before this body runs.
 */
object SkewJoinApp extends SparkApp {

  spark.sparkContext.setLogLevel("WARN")

  import spark.implicits._

  // --- Part 1: the skewed join, with AQE off (see launch flags above) -------
  spark.sparkContext.setJobGroup("skewed data", "skewed data")

  // 10M rows with unique keys: the "small" uniform side of the join.
  val uniformData = spark
    .range(0, 10000000) // 10M
    .withColumn("key", $"id")
    .withColumn("value", $"id")

  // 200M rows where only ids below 10M keep their own key; the remaining
  // ~190M rows all collapse onto the hot key 999 — a heavily skewed
  // distribution that lands on a single shuffle partition.
  val skewedData = spark
    .range(0, 200000000) // 200M
    .withColumn("key", when($"id" < 10000000, $"id").otherwise(999))
    .withColumn("value", $"id")

  // count() forces full execution of the join; with AQE disabled the task
  // handling key 999 becomes the visible straggler in the Spark UI.
  skewedData.join(uniformData, "key").count()

  spark.sparkContext.clearJobGroup()

  // --- Part 2: the same join, with AQE skew-join handling on ---------------
  spark.sparkContext.setJobGroup("adaptive query execution", "adaptive query execution")

  spark.conf.set("spark.sql.adaptive.enabled", "true")
  // Aggressive skew thresholds so the demo's skewed partition is detected:
  // a partition is "skewed" when larger than factor * median AND larger than
  // the byte threshold; skewed partitions are split into ~advisory-size chunks.
  spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", "1")
  spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "20MB")
  spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "15MB")

  val joinedAQE = skewedData.join(uniformData, "key")

  // Show the logical/physical plans; with AQE the final plan is adaptive and
  // should include skew-join splitting once executed.
  joinedAQE.explain(true)

  joinedAQE.count()

  spark.sparkContext.clearJobGroup()
}