Commit 191cbbf

Bullet spark 0.1.2 (#6)
1 parent 77f11da commit 191cbbf

File tree

6 files changed: 150 additions & 115 deletions

docs/quick-start/bullet-on-spark-with-rest.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -106,7 +106,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz
 
 ```bash
 cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
-curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
+curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
 ```
 
 #### Step 8: Launch the Bullet Spark Backend
@@ -117,7 +117,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
 $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
 --master local[10] \
 --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
---driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
+--jars $BULLET_SPARK/bullet-spark-example.jar \
 $BULLET_SPARK/bullet-spark.jar \
 --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt &
 
````
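
The second hunk swaps `--driver-class-path` for `--jars`, and the distinction matters: `--driver-class-path` takes colon-separated entries and only affects the driver's classpath, while `--jars` takes a comma-separated list that Spark ships to both the driver and the executors. The application jar (`bullet-spark.jar`) never needed to be listed since spark-submit adds it automatically, and the REST PubSub variant no longer needs `bullet-kafka.jar` at all. For reference, the full post-change launch command, assembled from the hunk above:

```bash
# REST PubSub quick start launch, after this change. Assumes $BULLET_SPARK
# is set up as in the quick-start docs; only the example jar is shipped via
# --jars, and bullet-spark.jar rides along as the application jar.
$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
  --master local[10] \
  --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
  --jars $BULLET_SPARK/bullet-spark-example.jar \
  $BULLET_SPARK/bullet-spark.jar \
  --bullet-spark-conf=$BULLET_SPARK/bullet_spark_rest_settings.yaml &> log.txt &
```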

docs/quick-start/bullet-on-spark.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -117,7 +117,7 @@ tar -xzf spark-2.2.1-bin-hadoop2.7.tgz
 
 ```bash
 cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK
-curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.1/bullet-spark-0.1.1-standalone.jar
+curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.1.2/bullet-spark-0.1.2-standalone.jar
 ```
 
 #### Step 11: Launch the Bullet Spark Backend
@@ -128,7 +128,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa
 $BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \
 --master local[10] \
 --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
---driver-class-path $BULLET_SPARK/bullet-spark.jar:$BULLET_HOME/pubsub/bullet-kafka.jar:$BULLET_SPARK/bullet-spark-example.jar \
+--jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \
 $BULLET_SPARK/bullet-spark.jar \
 --bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_settings.yaml &> log.txt &
 
````
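
The same swap, except the Kafka quick start keeps `bullet-kafka.jar` on the `--jars` list (note the comma separator, versus the colons that `--driver-class-path` used) so the Kafka PubSub classes reach the executors as well. Since a missing `--jars` entry only surfaces at submit or run time, a small pre-flight check can help; this is a sketch, not part of the docs:

```bash
# Hypothetical pre-flight: verify the jars named in --jars exist locally
# before submitting.
for jar in "$BULLET_HOME/pubsub/bullet-kafka.jar" "$BULLET_SPARK/bullet-spark-example.jar"; do
  [ -f "$jar" ] || echo "missing: $jar"
done
```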

examples/spark/pom.xml

Lines changed: 1 addition & 0 deletions

````diff
@@ -11,6 +11,7 @@
   <scala.dep.version>2.11</scala.dep.version>
   <spark.version>2.3.0</spark.version>
   <bullet.spark.version>0.1.2</bullet.spark.version>
+  <bullet.record.version>0.2.0</bullet.record.version>
 </properties>
 
 <repositories>
````
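
The new `bullet.record.version` property pins the bullet-record artifact for the example build. One way to confirm what actually resolves (a hedged sketch: it assumes the POM references this property from a `com.yahoo.bullet:bullet-record` dependency, which this hunk does not show):

```bash
# Hypothetical check from the repo root: print the resolved bullet-record
# version in the example module's dependency tree.
mvn -f examples/spark/pom.xml dependency:tree -Dincludes=com.yahoo.bullet:bullet-record
```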
Lines changed: 73 additions & 4 deletions

````diff
@@ -1,34 +1,103 @@
 ########################################################################################################################
-######################################### Bullet Spark Settings #####################################
+############################################### Bullet Spark defaults #################################################
 ########################################################################################################################
+# This is the name of the concrete implementation of Data Producer to use.
 bullet.spark.data.producer.class.name: "com.yahoo.bullet.spark.examples.RandomProducer"
+
+# This is the batch interval of your Spark Streaming job. Find out more at
+# https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval.
 bullet.spark.batch.duration.ms: 1000
+
+# This is the size of the buffer for accumulating queries in the Query Receiver before emitting to Spark.
 bullet.spark.receiver.query.block.size: 1
+
+# This is the maximum number of partitions that will be created by the Query Receiver.
 bullet.spark.receiver.query.coalesce.partitions: 10
+
+# This is the number of Data Producers.
 bullet.spark.data.producer.parallelism: 1
+
+# This is the checkpoint directory. If you are running your Spark on a cluster, the directory must be an HDFS path.
 bullet.spark.checkpoint.dir: "/tmp/spark-checkpoint"
+
+# If true, Bullet Spark recovers context from checkpoint files when restarting.
+# Otherwise Bullet Spark creates a new context.
 bullet.spark.recover.from.checkpoint.enable: false
+
+# This is the Spark application name.
 bullet.spark.app.name: "BulletSparkStreamingJob"
+
+# If true, Bullet Spark collects metrics which can be accessed via the Spark REST API (/metrics/json).
 bullet.spark.metrics.enabled: false
+
+# If true, enables parallel processing of queries in each partition of the Filter Streaming job. This is particularly
+# useful when using Producers that are Direct (e.g. DirectKafkaProducer) and you would like to avoid repartitioning
+# the data, choosing instead to parallelize within each partition (fixed by the producer).
+# It speeds up the processing within those partitions by partitioning queries to multiple threads to do the filtering
+# operation concurrently.
 bullet.spark.filter.partition.parallel.mode.enabled: false
+
+# This is the thread pool size to use when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.parallelism: 4
+
+# This is the minimum number of queries at which the parallel partition filtering is applied. Since there are fixed
+# costs to manage a thread pool, one is only created once the number of queries exceeds this threshold.
+# It is only used when bullet.spark.filter.partition.parallel.mode.enabled is true.
 bullet.spark.filter.partition.parallel.mode.min.query.threshold: 10
 
+# The following 2 settings are used to set the checkpoint intervals independently for each stateful transformation.
+# Checkpoint interval = Spark batch duration * checkpoint duration multiplier
+# Use these to control the frequency of the checkpointing operation. If this is set too high, there might be too much
+# data to checkpoint (RDD lineage graph).
+bullet.spark.query.union.checkpoint.duration.multiplier: 10
+bullet.spark.join.checkpoint.duration.multiplier: 10
+
+# The feedback publisher switches your PubSub into QUERY_SUBMISSION mode to loop back metadata messages to the query
+# receiver. If you need settings for your publisher in this mode that are different from the settings used in the
+# result publisher, override them here. This setting needs to be a Map if provided.
+# The example below pretends that your PubSub settings start with bullet.pubsub.custom. You will provide yours.
+# Example:
+#
+# bullet.spark.loop.pubsub.overrides:
+#   bullet.pubsub.custom.publisher.setting: 1
+#   bullet.pubsub.custom.nested.publisher.setting:
+#     foo: bar
+#     bar: baz
+bullet.spark.loop.pubsub.overrides: {}
+
 ########################################################################################################################
-######################################### Spark Streaming Settings #####################################
+############################################### Spark Streaming defaults ###############################################
 ########################################################################################################################
+# The following settings are passed to Spark directly. You can add more settings here.
+# Find out more information about configuring a Spark job at https://spark.apache.org/docs/latest/configuration.html.
+# Add configuration that changes infrequently here and submit more variable settings while submitting the job on the
+# command line.
 spark.serializer: "org.apache.spark.serializer.KryoSerializer"
 spark.closure.serializer: "org.apache.spark.serializer.KryoSerializer"
-spark.kryo.registrator: "com.yahoo.bullet.spark.utils.BulletKryoRegistrator"
 spark.streaming.stopGracefullyOnShutdown: "true"
 spark.streaming.receiver.writeAheadLog.enable: "false"
 spark.streaming.driver.writeAheadLog.allowBatching: "false"
 
 ########################################################################################################################
-######################################### Query PubSub Settings ########################################
+############################################### Query PubSub defaults ##################################################
 ########################################################################################################################
+# This is the type of PubSub context to use for the result publisher.
+# The feedback publisher uses QUERY_SUBMISSION since it submits messages.
 bullet.pubsub.context.name: "QUERY_PROCESSING"
+# This is the name of the concrete implementation of PubSub to use.
+# By default, it is the built-in REST in-memory PubSub.
 bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub"
+# Add settings specific to your PubSub.
 bullet.pubsub.kafka.bootstrap.servers: "localhost:9092"
 bullet.pubsub.kafka.request.topic.name: "bullet.requests"
 bullet.pubsub.kafka.response.topic.name: "bullet.responses"
+
+########################################################################################################################
+############################################### Bullet Core settings ###################################################
+########################################################################################################################
+## You can also configure the core Bullet settings here. For documentation and defaults for those settings, refer to:
+## https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml
+########################################################################################################################
+########################################################################################################################
+# Factory class to get new BulletRecords.
+bullet.record.provider.class.name: "com.yahoo.bullet.record.SimpleBulletRecordProvider"
````
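
Two of the newly documented settings lend themselves to a quick check. The checkpoint interval is batch duration times the multiplier, so the defaults above work out to 1000 ms x 10 = 10 s between checkpoints. And with `bullet.spark.metrics.enabled: true`, the comment points at Spark's `/metrics/json` endpoint; a sketch of how you might poll it, assuming a local driver on Spark's default UI port 4040:

```bash
# A sketch, not from the docs: poll the driver's metrics servlet.
# Port 4040 is Spark's default driver UI port; the /metrics/json path is
# taken from the comment on bullet.spark.metrics.enabled above.
curl -s http://localhost:4040/metrics/json | python -m json.tool
```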

examples/spark/src/main/resources/bullet_spark_kafka_settings_new.yaml

Lines changed: 0 additions & 103 deletions
This file was deleted.
