|
| 1 | +# Databricks notebook source |
| 2 | +# MAGIC %md |
| 3 | +# MAGIC |
| 4 | +# MAGIC ## Overview |
| 5 | +# MAGIC |
| 6 | +# MAGIC This notebook shows you how to generate and use tabular synthetic data with the Databricks Labs Data Generator (`dbldatagen`). |
| 7 | +# MAGIC |
| 8 | +# MAGIC Further information: |
| 9 | +# MAGIC |
| 10 | +# MAGIC - Online docs: https://databrickslabs.github.io/dbldatagen/public_docs/index.html |
| 11 | +# MAGIC - Project GitHub home page: https://github.com/databrickslabs/dbldatagen |
| 12 | +# MAGIC - PyPI home page: https://pypi.org/project/dbldatagen/ |
| 13 | +# MAGIC - Other Databricks Labs projects : https://www.databricks.com/learn/labs |
| 14 | +# MAGIC |
| 15 | + |
| 16 | +# COMMAND ---------- |
| 17 | + |
| 18 | +# MAGIC %md |
| 19 | +# MAGIC |
| 20 | +# MAGIC ### Step 1: Import Databricks Labs Data Generator using `pip install` |
| 21 | +# MAGIC |
| 22 | +# MAGIC This is a **Python** notebook so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` magic command. Python, Scala, SQL, and R are all supported. |
| 23 | +# MAGIC |
| 24 | +# MAGIC Generation of the data specification and the dataframe is only supported in Python. However, you can create a named view over the dataframe or save it to a table and |
| 25 | +# MAGIC use it from other languages. |
| 26 | + |
| 27 | +# COMMAND ---------- |
| 28 | + |
| 29 | +# DBTITLE 1,Install the Data Generator Library (`dbldatagen`) using `pip install` |
| 30 | +# MAGIC %pip install dbldatagen |
| 31 | + |
| 32 | +# COMMAND ---------- |
| 33 | + |
| 34 | +# MAGIC %md |
| 35 | +# MAGIC |
| 36 | +# MAGIC ### Step 2: Define the data generation specification |
| 37 | +# MAGIC |
| 38 | +# MAGIC The Databricks Labs Data Generator (`dbldatagen`) uses a data generation specification to control how to generate the data. |
| 39 | +# MAGIC |
| 40 | +# MAGIC Once the data generation specification is completed, you can invoke the `build` method to create a dataframe which can then be used or written to tables, etc. |
| 41 | + |
| 42 | +# COMMAND ---------- |
| 43 | + |
| 44 | +# build data generation specification for IOT style device data |
| 45 | + |
| 46 | +from pyspark.sql.types import LongType, IntegerType, StringType |
| 47 | + |
| 48 | +import dbldatagen as dg |
| 49 | + |
# Sizing knobs for the synthetic data set.
device_population = 100000  # passed as `uniqueValues` for the internal device id
data_rows = 20 * 1000000  # passed as `rows`: total number of rows requested
partitions_requested = 20  # passed as `partitions` to the DataGenerator

# Each country code is kept next to its weight so that the two lists handed to
# the generator (`values=` / `weights=`) cannot drift out of step with each
# other when an entry is added or removed.
_country_weighting = [
    ("CN", 1300), ("US", 365), ("FR", 67), ("CA", 38), ("IN", 1300),
    ("JM", 3), ("IE", 7), ("PK", 212), ("GB", 67), ("IL", 9),
    ("AU", 25), ("SG", 6), ("ES", 47), ("GE", 83), ("MX", 126),
    ("ET", 109), ("SA", 58), ("LB", 8), ("NL", 17),
]
country_codes = [code for code, _ in _country_weighting]
country_weights = [weight for _, weight in _country_weighting]

manufacturers = [
    "Delta corp", "Xyzzy Inc.", "Lakehouse Ltd", "Acme Corp", "Embanks Devices",
]

lines = ["delta", "xyzzy", "lakehouse", "gadget", "droid"]

datagenSpec = (
    dg.DataGenerator(
        spark,
        name="device_data_set",
        rows=data_rows,
        partitions=partitions_requested,
    )
    # NOTE(review): presumably keeps the generator's base id column in the
    # output - confirm against the dbldatagen documentation.
    .withIdOutput()
    # We'll use a hash of the base field to generate the ids, to avoid a
    # simple incrementing sequence.
    # omit=True keeps this helper column out of the final output while still
    # letting it be used as part of the generation of other columns.
    .withColumn(
        "internal_device_id",
        "long",
        minValue=0x1000000000000,
        uniqueValues=device_population,
        omit=True,
        baseColumnType="hash",
    )
    # Render the internal id as a zero-padded, 13-digit hex string. The format
    # string uses "%x" (not "%lx") even though the underlying value is a long.
    # NOTE(review): presumably the format is applied with Java-style format
    # specifiers, where "%x" accepts long values - confirm.
    .withColumn(
        "device_id",
        "string",
        format="0x%013x",
        baseColumn="internal_device_id",
    )
    # The device / user attributes will be the same for the same device id,
    # so let's use the internal device id as the base column for these
    # attributes.
    .withColumn(
        "country",
        "string",
        values=country_codes,
        weights=country_weights,
        baseColumn="internal_device_id",
    )
    .withColumn(
        "manufacturer",
        "string",
        values=manufacturers,
        baseColumn="internal_device_id",
    )
    # The product line is derived from a hash of the manufacturer value.
    .withColumn(
        "line",
        "string",
        values=lines,
        baseColumn="manufacturer",
        baseColumnType="hash",
    )
    # Event attributes are generated with random=True rather than from the
    # device id, so they are not tied to a particular device.
    .withColumn(
        "event_type",
        "string",
        values=[
            "activation",
            "deactivation",
            "plan change",
            "telecoms activity",
            "internet activity",
            "device error",
        ],
        random=True,
    )
    .withColumn(
        "event_ts",
        "timestamp",
        begin="2020-01-01 01:00:00",
        end="2020-12-31 23:59:00",
        interval="1 minute",
        random=True,
    )
)
| 101 | + |
| 102 | + |
| 103 | +# COMMAND ---------- |
| 104 | + |
| 105 | +# MAGIC %md |
| 106 | +# MAGIC |
| 107 | +# MAGIC ### Step 3: Generate the data |
| 108 | +# MAGIC |
| 109 | +# MAGIC Invoke `build` to generate a dataframe |
| 110 | + |
| 111 | +# COMMAND ---------- |
| 112 | + |
# Build the dataframe from the data generation specification.
# NOTE(review): withTempView=True presumably also registers a temp view named after the
# generator (`device_data_set`), which the %sql cells later in this notebook query - confirm.
| 113 | +dfGeneratedData = datagenSpec.build(withTempView=True) |
| 114 | + |
| 115 | + |
| 116 | +# COMMAND ---------- |
| 117 | + |
| 118 | +# MAGIC %md |
| 119 | +# MAGIC |
| 120 | +# MAGIC ### Step 4: Browse the generated data |
| 121 | +# MAGIC |
| 122 | + |
| 123 | +# COMMAND ---------- |
| 124 | + |
# Show the generated dataframe in the notebook output.
# NOTE(review): `display` is not imported in this file - presumably the Databricks notebook built-in.
| 125 | +display(dfGeneratedData) |
| 126 | + |
| 127 | +# COMMAND ---------- |
| 128 | + |
| 129 | +# MAGIC %md |
| 130 | +# MAGIC |
| 131 | +# MAGIC ### Step 5: Querying the data from SQL |
| 132 | +# MAGIC |
| 133 | +# MAGIC Let's perform a number of quick checks: |
| 134 | +# MAGIC |
| 135 | +# MAGIC - what is the total number of rows generated, and the number of distinct device ids? |
| 136 | +# MAGIC - let's make sure that no device has more than one country, manufacturer or line |
| 137 | +# MAGIC |
| 138 | + |
| 139 | +# COMMAND ---------- |
| 140 | + |
| 141 | +# MAGIC %sql |
| 142 | +# MAGIC select count(*) , count(distinct device_id) from device_data_set |
| 143 | + |
| 144 | +# COMMAND ---------- |
| 145 | + |
| 146 | +# MAGIC %sql |
| 147 | +# MAGIC select * from |
| 148 | +# MAGIC (select device_id, count(distinct country) as num_countries, |
| 149 | +# MAGIC count(distinct manufacturer) as num_manufacturers, |
| 150 | +# MAGIC count(distinct line) as num_lines |
| 151 | +# MAGIC from device_data_set |
| 152 | +# MAGIC group by device_id) |
| 153 | +# MAGIC where num_countries > 1 or num_manufacturers > 1 or num_lines > 1 |
| 154 | +# MAGIC |
| 155 | + |
| 156 | +# COMMAND ---------- |
| 157 | + |
| 158 | +# MAGIC %md |
| 159 | +# MAGIC |
| 160 | +# MAGIC ### Step 6 (optional): Saving the generated data to the file system |
| 161 | +# MAGIC |
| 162 | +# MAGIC Let's save the data to the file system |
| 163 | +# MAGIC |
| 164 | +# MAGIC - what is total rows generated and number of distinct device ids |
| 165 | +# MAGIC - lets make sure that no device has more than 1 country, manufacturer and line |
| 166 | + |
| 167 | +# COMMAND ---------- |
| 168 | + |
| 169 | +# MAGIC %md ### Step 7 (optional): Saving the data to a table |
| 170 | +# MAGIC |
| 171 | +# MAGIC Let's save the generated data to a table |
0 commit comments