
Commit 077ffad (initial commit, 0 parents)

(feat!): first working version
File tree

7 files changed: +191 -0 lines


.editorconfig

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

[*]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = false
insert_final_newline = false

.tool-versions

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
python 3.11.9

README.md

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# Spark Dev Box

## Build image

Fork and clone the [Jupyter Docker Stacks](https://jupyter-docker-stacks.readthedocs.io/) repo:

```shell
gh repo clone https://github.com/jupyter/docker-stacks
cd docker-stacks
gh repo fork
```

## Build Spark Image

The following command builds the Spark 3.5 with Hadoop 3 image with Java 17:

```shell
task build_spark_image
```

## Start the env

```shell
docker compose up -d minio nessie
```

Access the MinIO console at <http://localhost:9001> (the S3 API listens on port 9000) and create a bucket named `warehouse`.
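
The bucket can also be created from code. A minimal sketch using `boto3` (not part of this repo), with the MinIO credentials defined in `docker-compose.yml`:

```python
import boto3

# MinIO serves the S3 API on port 9000; credentials match docker-compose.yml.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="admin",
    aws_secret_access_key="password",
    region_name="us-east-1",
)
s3.create_bucket(Bucket="warehouse")
```
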
### Start Spark

```shell
docker compose up spark
```

Use the URL printed in the Docker logs (e.g. `docker compose logs -f spark`) to connect to the Jupyter server.

Taskfile.yml

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# https://taskfile.dev

version: "3"

vars:
  JAVA_VERSION: 17
  SPARK_VERSION: 3.5.4
  HADOOP_VERSION: 3
  SPARK_NOTEBOOK_IMAGE: docker.io/kameshsampath/spark35notebook
  JUPYTER_DOCKER_STACK_REPO: git/jupyter/jupyter-docker-stacks

tasks:
  build_spark_image:
    silent: false
    desc: "Build Spark Notebook Image"
    cmds:
      - >-
        docker build --rm --force-rm --build-arg
        openjdk_version={{.JAVA_VERSION}} --build-arg
        spark_version={{.SPARK_VERSION}} --build-arg
        hadoop_version={{.HADOOP_VERSION}} --build-arg
        spark_download_url="https://archive.apache.org/dist/spark/" -t
        {{.SPARK_NOTEBOOK_IMAGE}}
        {{.HOME}}/{{.JUPYTER_DOCKER_STACK_REPO}}/images/pyspark-notebook

conf/ivy_settings.xml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
<ivysettings>
  <settings defaultResolver="m2-nexus-resolver"/>
  <resolvers>
    <chain name="m2-nexus-resolver">
      <ibiblio name="host-nexus" m2compatible="true"
               root="http://host.docker.internal:8081/repository/maven-public/"/>
    </chain>
  </resolvers>
</ivysettings>
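
This settings file routes Ivy's package resolution through the local Nexus mirror. Spark picks it up via the `spark.jars.ivySettings` property; a minimal sketch of that wiring, assuming the `/conf/ivy_settings.xml` mount path from `docker-compose.yml`:

```python
import pyspark

# Point Ivy at the mounted settings file so spark.jars.packages
# resolves through the local Nexus mirror instead of Maven Central.
conf = pyspark.SparkConf().set("spark.jars.ivySettings", "/conf/ivy_settings.xml")
```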

docker-compose.yml

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
services:
  ## Apache Spark + Notebook
  spark:
    image: docker.io/kameshsampath/spark35notebook:latest
    ports:
      - 8888:8888 # Notebook
    environment:
      - AWS_REGION=us-east-1
      - AWS_ACCESS_KEY_ID=admin # minio username
      - AWS_SECRET_ACCESS_KEY=password # minio password
    container_name: spark
    volumes:
      # make local copy of the notebooks
      - ./notebooks:/home/jovyan/work
      # use local nexus as maven mirror for ivy
      - ./conf/ivy_settings.xml:/conf/ivy_settings.xml
    networks:
      spark-iceberg:
  ## Minio Object Storage
  minio:
    image: minio/minio
    container_name: minio
    ports:
      - "9000:9000"
      - "9001:9001"
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=password
      - MINIO_DOMAIN=storage
      - MINIO_REGION_NAME=us-east-1
      - MINIO_REGION=us-east-1
    command: server /data --console-address ":9001"
    networks:
      spark-iceberg:
  ## Nessie Iceberg Catalog
  nessie:
    image: projectnessie/nessie
    container_name: nessie
    ports:
      - "19120:19120"
    networks:
      spark-iceberg:

networks:
  spark-iceberg:
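
After `docker compose up -d minio nessie`, a quick way to confirm both services are reachable before starting Spark; a minimal sketch using `requests` (not part of this commit):

```python
import requests

# MinIO liveness probe; the S3 API listens on port 9000.
assert requests.get("http://localhost:9000/minio/health/live").ok

# Nessie REST API v2 config endpoint; returns the default branch and settings.
print(requests.get("http://localhost:19120/api/v2/config").json())
```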

scripts/setup_check.ipynb

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "vscode": {
          "languageId": "plaintext"
        }
      },
      "outputs": [],
      "source": [
        "import pyspark\n",
        "from pyspark.sql import SparkSession\n",
        "import os\n",
        "\n",
        "## DEFINE SENSITIVE VARIABLES\n",
        "NESSIE_SERVER_URI = \"http://nessie:19120/api/v2\"\n",
        "WAREHOUSE_BUCKET = \"s3://warehouse\"\n",
        "# `minio` resolves on the compose network; use the server IP outside it\n",
        "MINIO_URI = \"http://minio:9000\"\n",
        "\n",
        "\n",
        "## Configurations for Spark Session\n",
        "conf = (\n",
        "    pyspark.SparkConf()\n",
        "    .setAppName('app_name')\n",
        "    # packages\n",
        "    .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.91.3,software.amazon.awssdk:bundle:2.20.131,software.amazon.awssdk:url-connection-client:2.20.131')\n",
        "    # SQL extensions\n",
        "    .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')\n",
        "    # catalog configuration\n",
        "    .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')\n",
        "    .set('spark.sql.catalog.nessie.uri', NESSIE_SERVER_URI)\n",
        "    .set('spark.sql.catalog.nessie.ref', 'main')\n",
        "    .set('spark.sql.catalog.nessie.authentication.type', 'NONE')\n",
        "    .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')\n",
        "    .set('spark.sql.catalog.nessie.s3.endpoint', MINIO_URI)\n",
        "    .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE_BUCKET)\n",
        "    .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')\n",
        "    # resolve packages via the ivy settings mounted by docker-compose.yml\n",
        "    .set('spark.jars.ivySettings', '/conf/ivy_settings.xml')\n",
        ")\n",
        "\n",
        "## Start Spark Session\n",
        "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
        "print(\"Spark Running\")\n",
        "\n",
        "\n",
        "## TEST QUERIES TO CHECK IT IS WORKING\n",
        "### Create table\n",
        "spark.sql(\"CREATE TABLE nessie.example (name STRING) USING iceberg;\").show()\n",
        "### Insert into table\n",
        "spark.sql(\"INSERT INTO nessie.example VALUES ('Jai Guru!');\").show()\n",
        "### Query table\n",
        "spark.sql(\"SELECT * FROM nessie.example;\").show()"
      ]
    }
  ],
  "metadata": {
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
