Commit 191cbbf

Bullet spark 0.1.2 (#6)
1 parent 77f11da commit 191cbbf

File tree

6 files changed: 150 additions & 115 deletions

docs/quick-start/bullet-on-spark-with-rest.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -106,7 +106,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz
 
 ```bash
 cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
-curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
+curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
 ```
 
 #### Step 8: Launch the Bullet Spark Backend
@@ -117,7 +117,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
 $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
 --master local[10] \
 --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
---driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
+--jars $BULLET_SPARK/bullet-spark-example.jar \
 $BULLET_SPARK/bullet-spark.jar \
 --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt &
 
````
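
The second hunk swaps `--driver-class-path` for `--jars`, and the distinction matters: `--driver-class-path` takes colon-separated entries and only affects the driver's classpath, while `--jars` takes a comma-separated list that Spark ships to both the driver and the executors. The application jar (`bullet-spark.jar`) never needed to be listed since spark-submit adds it automatically, and the REST PubSub variant no longer needs `bullet-kafka.jar` at all. For reference, the full post-change launch command, assembled from the hunk above:

```bash
# REST PubSub quick start launch, after this change. Assumes $BULLET_SPARK
# is set up as in the quick-start docs; only the example jar is shipped via
# --jars, and bullet-spark.jar rides along as the application jar.
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
  --master local[10] \
  --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
  --jars $BULLET_SPARK/bullet-spark-example.jar \
  $BULLET_SPARK/bullet-spark.jar \
  --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt &
```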

docs/quick-start/bullet-on-spark.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -117,7 +117,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz
 
 ```bash
 cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
-curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
+curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
 ```
 
 #### Step 11: Launch the Bullet Spark Backend
@@ -128,7 +128,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
 $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
 --master local[10] \
 --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
---driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
+--jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \
 $BULLET_SPARK/bullet-spark.jar \
 --bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt &
 
````
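
The same swap, except the Kafka quick start keeps `bullet-kafka.jar` on the `--jars` list (note the comma separator, versus the colons that `--driver-class-path` used) so the Kafka PubSub classes reach the executors as well. Since a missing `--jars` entry only surfaces at submit or run time, a small pre-flight check can help; this is a sketch, not part of the docs:

```bash
# Hypothetical pre-flight: verify the jars named in --jars exist locally
# before submitting.
for jar in "$BULLET_HOME/pubsub/bullet-kafka.jar" "$BULLET_SPARK/bullet-spark-example.jar"; do
  [ -f "$jar" ] || echo "missing: $jar"
done
```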

examples/spark/pom.xml

Lines changed: 1 addition & 0 deletions

````diff
@@ -11,6 +11,7 @@
   <scala.dep.version>2.11</scala.dep.version>
   <spark.version>2.3.0</spark.version>
   <bullet.spark.version>0.1.2</bullet.spark.version>
+  <bullet.record.version>0.2.0</bullet.record.version>
 </properties>
 
 <repositories>
````
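
The new `bullet.record.version` property pins the bullet-record artifact for the example build. One way to confirm what actually resolves (a hedged sketch: it assumes the POM references this property from a `com.yahoo.bullet:bullet-record` dependency, which this hunk does not show):

```bash
# Hypothetical check from the repo root: print the resolved bullet-record
# version in the example module's dependency tree.
mvn -f examples/spark/pom.xml dependency:tree -Dincludes=com.yahoo.bullet:bullet-record
```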
Lines changed: 73 additions & 4 deletions

````diff
@@ -1,34 +1,103 @@
 ########################################################################################################################
-######################################### Bullet Spark Settings #####################################
+############################################### Bullet Spark defaults #################################################
 ########################################################################################################################
+# This is the name of the concrete implementation of Data Producer to use.
 bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer"
+
+# This is the batch interval of your Spark Streaming job. Find out more at
+# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval.
 bullet.spark.batch.duration.ms: 1000
+
+# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark.
 bullet.spark.receiver.query.block.size: 1
+
+# This is the maximum number of partitions that will be created by the Query Receiver.
 bullet.spark.receiver.query.coalesce.partitions: 10
+
+# This is the number of Data Producers.
 bullet.spark.data.producer.parallelism: 1
+
+# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path.
 bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint"
+
+# If true, Bullet Spark recovers context from checkpoint files when restarting.
+# Otherwise Bullet Spark creates a new context.
 bullet.spark.recover.from.checkpoint.enable: false
+
+# This is the Spark application name.
 bullet.spark.app.name: "BulletSparkStreamingJob"
+
+# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json).
 bullet.spark.metrics.enabled: false
+
+# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
+# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
+# the data, choosing instead to parallelize within each partition (fixed by the producer).
+# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering
+# operation concurrently.
 bullet.spark.filter.partition.parallel.mode.enabled: false
+
+# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
+# costs to manage a thread pool, one is only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10
 
+# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use these to control the frequency of the checkpointing operation. If this is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the query
+# receiver. If you need settings for your publisher in this mode that are different from the settings used in the
+# result publisher, override them here. This setting needs to be a Map if provided.
+# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+#   bullet.pubsub.custom.publisher.setting: 1
+#   bullet.pubsub.custom.nested.publisher.setting:
+#     foo: bar
+#     bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
 ########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
 ########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more information about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add configuration that changes infrequently here and submit more variable settings while submitting the job on the
+# command line.
 spark.serializer: "org.apache.spark.serializer.KryoSerializer"
 spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
 spark.streaming.stopGracefullyOnShutdown: "true"
 spark.streaming.receiver.writeAheadLog.enable: "false"
 spark.streaming.driver.writeAheadLog.allowBatching: "false"
 
 ########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
 ########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
 bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
 bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub"
+# Add settings specific to your PubSub.
 bullet.pubsub.kafka.bootstrap.servers: "localhost:9092"
 bullet.pubsub.kafka.request.topic.name: "bullet.requests"
 bullet.pubsub.kafka.response.topic.name: "bullet.responses"
+
+########################################################################################################################
+############################################### Bullet Core settings ###################################################
+########################################################################################################################
+## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
+## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
+########################################################################################################################
+########################################################################################################################
+# Factory class to get new BulletRecords.
+bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider"
````
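
Two of the newly documented settings lend themselves to a quick check. The checkpoint interval is batch duration times the multiplier, so the defaults above work out to 1000 ms x 10 = 10 s between checkpoints. And with `bullet.spark.metrics.enabled: true`, the comment points at Spark's `/metrics/json` endpoint; a sketch of how you might poll it, assuming a local driver on Spark's default UI port 4040:

```bash
# A sketch, not from the docs: poll the driver's metrics servlet.
# Port 4040 is Spark's default driver UI port; the /metrics/json path is
# taken from the comment on bullet.spark.metrics.enabled above.
curl -s http://localhost:4040/metrics/json | python -m json.tool
```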

examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml

Lines changed: 0 additions & 103 deletions
This file was deleted.
