From b487cda774ff7d3cdf32971e6b319d795323c11e Mon Sep 17 00:00:00 2001
From: Ronan Stokes <42389040+ronanstokes-db@users.noreply.github.com>
Date: Tue, 3 Oct 2023 13:37:03 -0700
Subject: [PATCH 1/2] Create README.md

---
 examples/notebooks/README.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 examples/notebooks/README.md

diff --git a/examples/notebooks/README.md b/examples/notebooks/README.md
new file mode 100644
index 00000000..fea1ec21
--- /dev/null
+++ b/examples/notebooks/README.md
@@ -0,0 +1,6 @@
+These examples are exported Databricks notebooks in source format. 
+
+To use these, import the notebooks into a Databricks workspace, create a cluster and run the cells of the notebook, or
+alternatively, use `run all` to run all cells.
+
+They are meant as starting points and may require updates to storage paths, etc., to meet your needs.

From ec6d875a4e7626633d540241adf5d9e729603dcf Mon Sep 17 00:00:00 2001
From: Ronan Stokes <42389040+ronanstokes-db@users.noreply.github.com>
Date: Tue, 3 Oct 2023 13:37:47 -0700
Subject: [PATCH 2/2] Add files via upload

Added exported Databricks example notebook
---
 ...eratingBatchSyntheticDataWithDbldatagen.py | 171 ++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 examples/notebooks/GeneratingBatchSyntheticDataWithDbldatagen.py

diff --git a/examples/notebooks/GeneratingBatchSyntheticDataWithDbldatagen.py b/examples/notebooks/GeneratingBatchSyntheticDataWithDbldatagen.py
new file mode 100644
index 00000000..1cbed992
--- /dev/null
+++ b/examples/notebooks/GeneratingBatchSyntheticDataWithDbldatagen.py
@@ -0,0 +1,171 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC
+# MAGIC ## Overview
+# MAGIC
+# MAGIC This notebook shows you how to generate and use tabular synthetic data using the Databricks Labs Data Generator
+# MAGIC
+# MAGIC Further information:
+# MAGIC
+# MAGIC - Online docs: https://databrickslabs.github.io/dbldatagen/public_docs/index.html
+# MAGIC - Project GitHub home page: https://github.com/databrickslabs/dbldatagen
+# MAGIC - PyPI home page: https://pypi.org/project/dbldatagen/
+# MAGIC - Other Databricks Labs projects: https://www.databricks.com/learn/labs
+# MAGIC
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC ### Step 1: Install the Databricks Labs Data Generator using `pip install`
+# MAGIC
+# MAGIC This is a **Python** notebook so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` magic command. Python, Scala, SQL, and R are all supported.
+# MAGIC
+# MAGIC Generation of the data specification and the dataframe is only supported in Python. However, you can create a named view over the data frame or save it to a table and
+# MAGIC use it from other languages.
+
+# COMMAND ----------
+
+# DBTITLE 1,Install the Data Generator Library (`dbldatagen`) using `pip install`
+# MAGIC %pip install dbldatagen
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC ### Step 2: Define the data generation specification
+# MAGIC
+# MAGIC The Databricks Labs Data Generator (`dbldatagen`) uses a data generation specification to control how to generate the data. 
+# MAGIC
+# MAGIC Once the data generation specification is completed, you can invoke the `build` method to create a dataframe which can then be used or written to tables, etc.
+
+# COMMAND ----------
+
+# build data generation specification for IOT style device data
+
+from pyspark.sql.types import LongType, IntegerType, StringType
+
+import dbldatagen as dg
+
+device_population = 100000
+data_rows = 20 * 1000000
+partitions_requested = 20
+
+country_codes = [
+    "CN", "US", "FR", "CA", "IN", "JM", "IE", "PK", "GB", "IL", "AU", 
+    "SG", "ES", "GE", "MX", "ET", "SA", "LB", "NL",
+]
+country_weights = [
+    1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 
+    126, 109, 58, 8, 17,
+]
+
+manufacturers = [
+    "Delta corp", "Xyzzy Inc.", "Lakehouse Ltd", "Acme Corp", "Embanks Devices",
+]
+
+lines = ["delta", "xyzzy", "lakehouse", "gadget", "droid"]
+
+datagenSpec = (
+    dg.DataGenerator(spark, name="device_data_set", rows=data_rows, 
+                     partitions=partitions_requested)
+    .withIdOutput()
+    # we'll use hash of the base field to generate the ids to
+    # avoid a simple incrementing sequence
+    .withColumn("internal_device_id", "long", minValue=0x1000000000000, 
+                uniqueValues=device_population, omit=True, baseColumnType="hash",
+    )
+    # note: the format string uses "%x" (not a C-style "%lx") even though the
+    # underlying value is a long - Spark's printf-style formatting accepts longs for "%x"
+    .withColumn(
+        "device_id", "string", format="0x%013x", baseColumn="internal_device_id"
+    )
+    # the device / user attributes will be the same for the same device id
+    # so let's use the internal device id as the base column for these attributes
+    .withColumn("country", "string", values=country_codes, weights=country_weights, 
+                baseColumn="internal_device_id")
+    .withColumn("manufacturer", "string", values=manufacturers, 
+                baseColumn="internal_device_id", )
+    # tip: use omit=True (as on `internal_device_id` above) if you don't want a column to appear
+    # in the final output but just want to use it as part of generation of another column
+    .withColumn("line", "string", values=lines, baseColumn="manufacturer", 
+                baseColumnType="hash" )
+    .withColumn("event_type", "string", 
+                values=["activation", "deactivation", "plan change", "telecoms activity", 
+                        "internet activity", "device error", ],
+                random=True)
+    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", 
+                end="2020-12-31 23:59:00", 
+                interval="1 minute", random=True )
+)
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC ### Step 3: Generate the data
+# MAGIC
+# MAGIC Invoke `build` to generate a dataframe
+
+# COMMAND ----------
+
+dfGeneratedData = datagenSpec.build(withTempView=True)
+
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC ### Step 4: Browse the generated data
+# MAGIC
+
+# COMMAND ----------
+
+display(dfGeneratedData)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC ### Step 5: Querying the data from SQL
+# MAGIC
+# MAGIC Let's perform a number of quick checks:
+# MAGIC
+# MAGIC - what is the total number of rows generated and the number of distinct device ids?
+# MAGIC - let's make sure that no device has more than 1 country, manufacturer and line
+# MAGIC
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC select count(*) , count(distinct device_id) from device_data_set
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC select * from 
+# MAGIC (select device_id, count(distinct country) as num_countries, 
+# MAGIC                   count(distinct manufacturer) as num_manufacturers,
+# MAGIC                   count(distinct line) as num_lines
+# MAGIC from  device_data_set
+# MAGIC group by device_id)
+# MAGIC where num_countries > 1 or num_manufacturers > 1 or num_lines > 1
+# MAGIC
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC ### Step 6 (optional): Saving the generated data to the file system
+# MAGIC
+# MAGIC Let's save the data to the file system
+# MAGIC
+# MAGIC - what is total rows generated and number of distinct device ids
+# MAGIC - lets make sure that no device has more than 1 country, manufacturer and line
+
+# COMMAND ----------
+
+# MAGIC %md ### Step 7 (optional): Saving the data to a table
+# MAGIC
+# MAGIC Let's save the generated data to a table