
Commit 542809f

Doc updates 032523 (#181)
* wip
* documentation updates
1 parent 1355bec commit 542809f

File tree: 6 files changed (+175 lines, -160 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 * Fixed template issues
 * Added use of prospector to build process to validate common code issues
 * Apply pandas optimizations when generating multiple columns using same `withColumn` or `withColumnSpec`
+* Document reformatting and updates
 
 ### Version 0.3.2

docs/source/extending_text_generation.rst

Lines changed: 4 additions & 2 deletions
@@ -38,7 +38,7 @@ extended syntax.
                  .withColumn("address", text=fakerText("address" ))
                  .withColumn("email", text=fakerText("ascii_company_email") )
                  .withColumn("ip_address", text=fakerText("ipv4_private" ))
-                 .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list) )
+                 .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list))
                  )
    dfFakerOnly = fakerDataspec.build()
 
@@ -91,7 +91,9 @@ The following code shows use of a custom Python function to generate text:
 
    pluginDataspec = (DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
                                    randomSeedMethod="hash_fieldname")
-                     .withColumn("text", text=PyfuncText(text_generator, initFn=initPluginContext))
+                     .withColumn("text",
+                                 text=PyfuncText(text_generator,
+                                                 initFn=initPluginContext))
                      )
 
    dfPlugin = pluginDataspec.build()
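
For context on the example being reformatted above, the following is a minimal, self-contained sketch of the Faker-based pattern these docs describe. It assumes dbldatagen and the faker package are installed, that fakerText is importable from dbldatagen as in the extending-text guide, and that spark is an active SparkSession; the row count, partition count, and word list below are illustrative placeholders, not values from the documentation.

    import dbldatagen as dg
    from dbldatagen import fakerText  # Faker-backed text generator used in these docs

    # illustrative sizing; the guide derives these from the target cluster
    data_rows = 100 * 1000
    partitions_requested = 4
    my_word_list = ["lakehouse", "delta", "spark"]  # placeholder word list for 'sentence'

    fakerDataspec = (
        dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
        .withColumn("address", text=fakerText("address"))
        .withColumn("email", text=fakerText("ascii_company_email"))
        .withColumn("ip_address", text=fakerText("ipv4_private"))
        # extra keyword arguments are passed through to the underlying Faker provider
        .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list))
    )
    dfFakerOnly = fakerDataspec.build()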

docs/source/generating_cdc_data.rst

Lines changed: 27 additions & 26 deletions
@@ -1,7 +1,7 @@
 .. Test Data Generator documentation master file, created by
-   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
 
 Generating Change Data Capture Data
 ===================================
@@ -47,28 +47,30 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh
 
    uniqueCustomers = 10 * 1000000
 
-   dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
-               .withColumn("customer_id","long", uniqueValues=uniqueCustomers)
-               .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
-               .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
-               .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
-                           'American Express', 'discover', 'branded visa', 'branded mastercard'],
-                           random=True, distribution="normal")
-               .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id",
-                           baseColumnType="hash", omit=True)
-               .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
-                           baseColumn="int_payment_instrument")
-               .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
-               .withColumn("email2", template=r'\\w.\\w@\\w.com')
-               .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
-               .withColumn("md5_payment_instrument",
-                           expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
-                           base_column=['payment_instrument_type', 'payment_instrument'])
-               .withColumn("customer_notes", text=dg.ILText(words=(1,8)))
-               .withColumn("created_ts", "timestamp", expr="now()")
-               .withColumn("modified_ts", "timestamp", expr="now()")
-               .withColumn("memo", expr="'original data'")
-               )
+   dataspec = (
+       dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
+       .withColumn("customer_id","long", uniqueValues=uniqueCustomers)
+       .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
+       .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
+       .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
+                   'American Express', 'discover', 'branded visa', 'branded mastercard'],
+                   random=True, distribution="normal")
+       .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
+                   baseColumn="customer_id", baseColumnType="hash", omit=True)
+       .withColumn("payment_instrument",
+                   expr="format_number(int_payment_instrument, '**** ****** *####')",
+                   baseColumn="int_payment_instrument")
+       .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
+       .withColumn("email2", template=r'\\w.\\w@\\w.com')
+       .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
+       .withColumn("md5_payment_instrument",
+                   expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
+                   base_column=['payment_instrument_type', 'payment_instrument'])
+       .withColumn("customer_notes", text=dg.ILText(words=(1,8)))
+       .withColumn("created_ts", "timestamp", expr="now()")
+       .withColumn("modified_ts", "timestamp", expr="now()")
+       .withColumn("memo", expr="'original data'")
+   )
    df1 = dataspec.build()
 
    # write table
@@ -168,7 +170,6 @@ values of the columns from the source table will be used.
     ])
 
    print(sqlStmt)
-
    spark.sql(sqlStmt)
 
 That's all that's required to perform merges with the data generation framework.
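
As a usage note, the reflowed dataspec hunk above ends just before the guide's `# write table` step. A minimal sketch of that step, assuming the generated base data is saved as a Delta table (the table name below is hypothetical, not the guide's actual target):

    # write the generated customer base data as a Delta table
    # (illustrative table name; the guide's actual target table may differ)
    df1 = dataspec.build()
    (df1.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable("customers_base"))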

docs/source/generating_json_data.rst

Lines changed: 46 additions & 45 deletions
@@ -195,51 +195,52 @@ functions such as `named_struct` and `to_json`.
 
    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']
 
-   testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
-                                    partitions=8,
-                                    randomSeedMethod='hash_fieldname')
-                   .withIdOutput()
-                   # we'll use hash of the base field to generate the ids to
-                   # avoid a simple incrementing sequence
-                   .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
-                               uniqueValues=device_population, omit=True, baseColumnType="hash")
-
-                   # note for format strings, we must use "%lx" not "%x" as the
-                   # underlying value is a long
-                   .withColumn("device_id", StringType(), format="0x%013x",
-                               baseColumn="internal_device_id")
-
-                   # the device / user attributes will be the same for the same device id
-                   # so lets use the internal device id as the base column for these attribute
-                   .withColumn("country", StringType(), values=country_codes,
-                               weights=country_weights,
-                               baseColumn="internal_device_id")
-
-                   .withColumn("manufacturer", StringType(), values=manufacturers,
-                               baseColumn="internal_device_id", omit=True)
-                   .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
-                               baseColumnType="hash", omit=True)
-                   .withColumn("manufacturer_info", "string",
-                               expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
-                               baseColumn=['manufacturer', 'line'])
-
-
-                   .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
-                               baseColumn="device_id",
-                               baseColumnType="hash", omit=True)
-
-                   .withColumn("event_type", StringType(),
-                               values=["activation", "deactivation", "plan change",
-                                       "telecoms activity", "internet activity", "device error"],
-                               random=True, omit=True)
-                   .withColumn("event_ts", "timestamp",
-                               begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
-                               interval="1 minute", random=True, omit=True)
-
-                   .withColumn("event_info", "string",
-                               expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
-                               baseColumn=['event_type', 'event_ts'])
-                   )
+   testDataSpec = (
+       dg.DataGenerator(spark, name="device_data_set", rows=1000000,
+                        partitions=8,
+                        randomSeedMethod='hash_fieldname')
+       .withIdOutput()
+       # we'll use hash of the base field to generate the ids to
+       # avoid a simple incrementing sequence
+       .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
+                   uniqueValues=device_population, omit=True, baseColumnType="hash")
+
+       # note for format strings, we must use "%lx" not "%x" as the
+       # underlying value is a long
+       .withColumn("device_id", StringType(), format="0x%013x",
+                   baseColumn="internal_device_id")
+
+       # the device / user attributes will be the same for the same device id
+       # so lets use the internal device id as the base column for these attribute
+       .withColumn("country", StringType(), values=country_codes,
+                   weights=country_weights,
+                   baseColumn="internal_device_id")
+
+       .withColumn("manufacturer", StringType(), values=manufacturers,
+                   baseColumn="internal_device_id", omit=True)
+       .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
+                   baseColumnType="hash", omit=True)
+       .withColumn("manufacturer_info", "string",
+                   expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
+                   baseColumn=['manufacturer', 'line'])
+
+
+       .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
+                   baseColumn="device_id",
+                   baseColumnType="hash", omit=True)
+
+       .withColumn("event_type", StringType(),
+                   values=["activation", "deactivation", "plan change",
+                           "telecoms activity", "internet activity", "device error"],
+                   random=True, omit=True)
+       .withColumn("event_ts", "timestamp",
+                   begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
+                   interval="1 minute", random=True, omit=True)
+
+       .withColumn("event_info", "string",
+                   expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
+                   baseColumn=['event_type', 'event_ts'])
+   )
 
    dfTestData = testDataSpec.build()
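
To isolate the technique this example documents, packing several generated attributes into a JSON-valued string column via named_struct and to_json, here is a minimal standalone sketch. It assumes dbldatagen is installed and spark is an active SparkSession, and it uses only calls that appear in the hunk above; the spec name, row count, and value lists are illustrative, not taken from the documentation.

    import dbldatagen as dg
    from pyspark.sql.types import StringType

    jsonSpec = (
        dg.DataGenerator(spark, name="json_info_example", rows=1000, partitions=4)
        .withColumn("line", StringType(), values=["delta", "xyzzy", "lakehouse"], random=True)
        .withColumn("manufacturer", StringType(), values=["Delta corp", "Xyzzy Inc."], random=True)
        # combine the two generated columns into a single JSON string column
        .withColumn("manufacturer_info", "string",
                    expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
                    baseColumn=['manufacturer', 'line'])
    )
    dfJsonInfo = jsonSpec.build()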

docs/source/multi_table_data.rst

Lines changed: 48 additions & 43 deletions
@@ -1,7 +1,7 @@
 .. Test Data Generator documentation master file, created by
-   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
 
 Generating and Using Data with Multiple Tables
 ==============================================
@@ -73,7 +73,9 @@ Here we use a simple sequence for our plan ids.
    import dbldatagen as dg
    import pyspark.sql.functions as F
 
-   spark.catalog.clearCache()  # clear cache so that if we run multiple times to check performance, we're not relying on cache
+   # clear cache so that if we run multiple times to check performance,
+   # we're not relying on cache
+   spark.catalog.clearCache()
 
    UNIQUE_PLANS = 20
    PLAN_MIN_VALUE = 100
@@ -87,36 +89,35 @@ Here we use a simple sequence for our plan ids.
    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000)
 
 
-   plan_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
-                    .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS)
-                    # use plan_id as root value
-                    .withColumn("plan_name", prefix="plan", baseColumn="plan_id")
-
-                    # note default step is 1 so you must specify a step for small number ranges,
-                    .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050,
-                                step=0.005, random=True)
-                    .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02,
-                                step=0.001, random=True)
-                    .withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01,
-                                step=0.001, random=True)
-
-                    # we're modelling long distance and international prices simplistically -
-                    # each is a multiplier thats applied to base rate
-                    .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05,
-                                random=True, distribution="normal", omit=True)
-                    .withColumn("ld_cost_per_minute", "decimal(5,3)",
-                                expr="cost_per_minute * ld_multiplier",
-                                baseColumns=['cost_per_minute', 'ld_multiplier'])
-                    .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05,
-                                random=True, distribution="normal", omit=True)
-                    .withColumn("intl_cost_per_minute", "decimal(5,3)",
-                                expr="cost_per_minute * intl_multiplier",
-                                baseColumns=['cost_per_minute', 'intl_multiplier'])
+   plan_dataspec = (
+       dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
+       .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS)
+       # use plan_id as root value
+       .withColumn("plan_name", prefix="plan", baseColumn="plan_id")
+
+       # note default step is 1 so you must specify a step for small number ranges,
+       .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050,
+                   step=0.005, random=True)
+       .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02,
+                   step=0.001, random=True)
+       .withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01,
+                   step=0.001, random=True)
+
+       # we're modelling long distance and international prices simplistically -
+       # each is a multiplier thats applied to base rate
+       .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05,
+                   random=True, distribution="normal", omit=True)
+       .withColumn("ld_cost_per_minute", "decimal(5,3)",
+                   expr="cost_per_minute * ld_multiplier",
+                   baseColumns=['cost_per_minute', 'ld_multiplier'])
+       .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05,
+                   random=True, distribution="normal", omit=True)
+       .withColumn("intl_cost_per_minute", "decimal(5,3)",
+                   expr="cost_per_minute * intl_multiplier",
+                   baseColumns=['cost_per_minute', 'intl_multiplier'])
    )
 
-   df_plans = (plan_dataspec.build()
-               .cache()
-               )
+   df_plans = plan_dataspec.build().cache()
 
    display(df_plans)
 
@@ -195,10 +196,11 @@ when using hashed values, the range of the hashes produced can be large.
 
    effective_customers = df_customers.count()
 
-   print(stripMargin(f"""revised customers : {df_customers.count()},
-         | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]},
-         | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]},
-         | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""")
+   print(stripMargin(
+       f"""revised customers : {df_customers.count()},
+         | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]},
+         | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]},
+         | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""")
         )
 
    display(df_customers)
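
The reflowed print call above relies on a stripMargin helper that the multi-table guide defines earlier and this diff does not show. A plausible sketch of such a helper, offered only as an assumption about its behavior rather than the guide's actual definition:

    def stripMargin(s):
        # hypothetical reimplementation: drop leading whitespace and an optional
        # '|' margin marker from each line, in the spirit of Scala's stripMargin
        return "\n".join(line.lstrip().lstrip("|") for line in s.split("\n"))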
@@ -247,7 +249,8 @@ A simple approach is simply to multiply the
    # use random seed method of 'hash_fieldname' for better spread - default in later builds
    events_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
                                        randomSeed=42, randomSeedMethod="hash_fieldname")
-                      # use same logic as per customers dataset to ensure matching keys - but make them random
+                      # use same logic as per customers dataset to ensure matching keys
+                      # but make them random
                       .withColumn("device_id_base","decimal(10)", minValue=CUSTOMER_MIN_VALUE,
                                   uniqueValues=UNIQUE_CUSTOMERS,
                                   random=True, omit=True)
@@ -260,12 +263,14 @@ A simple approach is simply to multiply the
                                  weights=[50, 50, 20, 10, 5 ], random=True)
 
                       # use Gamma distribution for skew towards short calls
-                      .withColumn("base_minutes","decimal(7,2)", minValue=1.0, maxValue=100.0, step=0.1,
+                      .withColumn("base_minutes","decimal(7,2)",
+                                  minValue=1.0, maxValue=100.0, step=0.1,
                                   distribution=dg.distributions.Gamma(shape=1.5, scale=2.0),
                                   random=True, omit=True)
 
                       # use Gamma distribution for skew towards short transfers
-                      .withColumn("base_bytes_transferred","decimal(12)", minValue=K_1, maxValue=MB_100,
+                      .withColumn("base_bytes_transferred","decimal(12)",
+                                  minValue=K_1, maxValue=MB_100,
                                   distribution=dg.distributions.Gamma(shape=0.75, scale=2.0),
                                   random=True, omit=True)
 
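
For reference, the two reflowed columns above skew generated values toward the low end of their range with dbldatagen's Gamma distribution. A minimal standalone sketch of that technique, assuming spark is an active SparkSession and using illustrative sizes and column names rather than the guide's:

    import dbldatagen as dg

    # skew generated call durations toward short calls, as base_minutes does above
    skewed_spec = (
        dg.DataGenerator(spark, rows=100000, partitions=4)
        .withColumn("minutes", "decimal(7,2)", minValue=1.0, maxValue=100.0, step=0.1,
                    distribution=dg.distributions.Gamma(shape=1.5, scale=2.0),
                    random=True)
    )
    df_minutes = skewed_spec.build()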
@@ -308,8 +313,7 @@ Let's compute the customers and associated plans
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
 
-   df_customer_pricing = df_customers.join(df_plans,
-                                           df_plans.plan_id == df_customers.plan)
+   df_customer_pricing = df_customers.join(df_plans, df_plans.plan_id == df_customers.plan)
 
    display(df_customer_pricing)
@@ -365,8 +369,9 @@ now let's compute the invoices
 
 .. code-block:: python
 
-   df_customer_summary = (df_customer_pricing.join(df_summary,
-                                                   df_customer_pricing.device_id == df_summary.device_id )
+   df_customer_summary = (
+       df_customer_pricing.join(df_summary,
+                                df_customer_pricing.device_id == df_summary.device_id )
        .createOrReplaceTempView("customer_summary"))
 
    df_invoices = spark.sql("""
