
Commit 542809f

Doc updates 032523 (#181)
* wip
* documentation updates
1 parent 1355bec commit 542809f

File tree: 6 files changed (+175 lines, -160 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 * Fixed template issues
 * Added use of prospector to build process to validate common code issues
 * Apply pandas optimizations when generating multiple columns using same `withColumn` or `withColumnSpec`
+* Document reformatting and updates
 
 ### Version 0.3.2

docs/source/extending_text_generation.rst

Lines changed: 4 additions & 2 deletions
@@ -38,7 +38,7 @@ extended syntax.
                  .withColumn("address", text=fakerText("address" ))
                  .withColumn("email", text=fakerText("ascii_company_email") )
                  .withColumn("ip_address", text=fakerText("ipv4_private" ))
-                 .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list) )
+                 .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list))
                  )
    dfFakerOnly = fakerDataspec.build()
 
@@ -91,7 +91,9 @@ The following code shows use of a custom Python function to generate text:
 
    pluginDataspec = (DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
                                    randomSeedMethod="hash_fieldname")
-                     .withColumn("text", text=PyfuncText(text_generator, initFn=initPluginContext))
+                     .withColumn("text",
+                                 text=PyfuncText(text_generator,
+                                                 initFn=initPluginContext))
                      )
 
    dfPlugin = pluginDataspec.build()
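
For context on the example being reformatted above, the following is a minimal, self-contained sketch of the Faker-based pattern these docs describe. It assumes dbldatagen and the faker package are installed, that fakerText is importable from dbldatagen as in the extending-text guide, and that spark is an active SparkSession; the row count, partition count, and word list below are illustrative placeholders, not values from the documentation.

    import dbldatagen as dg
    from dbldatagen import fakerText  # Faker-backed text generator used in these docs

    # illustrative sizing; the guide derives these from the target cluster
    data_rows = 100 * 1000
    partitions_requested = 4
    my_word_list = ["lakehouse", "delta", "spark"]  # placeholder word list for 'sentence'

    fakerDataspec = (
        dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
        .withColumn("address", text=fakerText("address"))
        .withColumn("email", text=fakerText("ascii_company_email"))
        .withColumn("ip_address", text=fakerText("ipv4_private"))
        # extra keyword arguments are passed through to the underlying Faker provider
        .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list))
    )
    dfFakerOnly = fakerDataspec.build()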

docs/source/generating_cdc_data.rst

Lines changed: 27 additions & 26 deletions
@@ -1,7 +1,7 @@
 .. Test Data Generator documentation master file, created by
-   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
 
 Generating Change Data Capture Data
 ===================================
@@ -47,28 +47,30 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh
 
    uniqueCustomers = 10 * 1000000
 
-   dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
-               .withColumn("customer_id","long", uniqueValues=uniqueCustomers)
-               .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
-               .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
-               .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
-                           'American Express', 'discover', 'branded visa', 'branded mastercard'],
-                           random=True, distribution="normal")
-               .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id",
-                           baseColumnType="hash", omit=True)
-               .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')",
-                           baseColumn="int_payment_instrument")
-               .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
-               .withColumn("email2", template=r'\\w.\\w@\\w.com')
-               .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
-               .withColumn("md5_payment_instrument",
-                           expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
-                           base_column=['payment_instrument_type', 'payment_instrument'])
-               .withColumn("customer_notes", text=dg.ILText(words=(1,8)))
-               .withColumn("created_ts", "timestamp", expr="now()")
-               .withColumn("modified_ts", "timestamp", expr="now()")
-               .withColumn("memo", expr="'original data'")
-               )
+   dataspec = (
+       dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
+       .withColumn("customer_id","long", uniqueValues=uniqueCustomers)
+       .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
+       .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
+       .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
+                   'American Express', 'discover', 'branded visa', 'branded mastercard'],
+                   random=True, distribution="normal")
+       .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
+                   baseColumn="customer_id", baseColumnType="hash", omit=True)
+       .withColumn("payment_instrument",
+                   expr="format_number(int_payment_instrument, '**** ****** *####')",
+                   baseColumn="int_payment_instrument")
+       .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
+       .withColumn("email2", template=r'\\w.\\w@\\w.com')
+       .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
+       .withColumn("md5_payment_instrument",
+                   expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
+                   base_column=['payment_instrument_type', 'payment_instrument'])
+       .withColumn("customer_notes", text=dg.ILText(words=(1,8)))
+       .withColumn("created_ts", "timestamp", expr="now()")
+       .withColumn("modified_ts", "timestamp", expr="now()")
+       .withColumn("memo", expr="'original data'")
+   )
    df1 = dataspec.build()
 
    # write table
@@ -168,7 +170,6 @@ values of the columns from the source table will be used.
     ])
 
    print(sqlStmt)
-
    spark.sql(sqlStmt)
 
 That's all that's required to perform merges with the data generation framework.
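
As a usage note, the reflowed dataspec hunk above ends just before the guide's `# write table` step. A minimal sketch of that step, assuming the generated base data is saved as a Delta table (the table name below is hypothetical, not the guide's actual target):

    # write the generated customer base data as a Delta table
    # (illustrative table name; the guide's actual target table may differ)
    df1 = dataspec.build()
    (df1.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable("customers_base"))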

docs/source/generating_json_data.rst

Lines changed: 46 additions & 45 deletions
@@ -195,51 +195,52 @@ functions such as `named_struct` and `to_json`.
 
    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']
 
-   testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
-                                    partitions=8,
-                                    randomSeedMethod='hash_fieldname')
-                   .withIdOutput()
-                   # we'll use hash of the base field to generate the ids to
-                   # avoid a simple incrementing sequence
-                   .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
-                               uniqueValues=device_population, omit=True, baseColumnType="hash")
-
-                   # note for format strings, we must use "%lx" not "%x" as the
-                   # underlying value is a long
-                   .withColumn("device_id", StringType(), format="0x%013x",
-                               baseColumn="internal_device_id")
-
-                   # the device / user attributes will be the same for the same device id
-                   # so lets use the internal device id as the base column for these attribute
-                   .withColumn("country", StringType(), values=country_codes,
-                               weights=country_weights,
-                               baseColumn="internal_device_id")
-
-                   .withColumn("manufacturer", StringType(), values=manufacturers,
-                               baseColumn="internal_device_id", omit=True)
-                   .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
-                               baseColumnType="hash", omit=True)
-                   .withColumn("manufacturer_info", "string",
-                               expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
-                               baseColumn=['manufacturer', 'line'])
-
-
-                   .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
-                               baseColumn="device_id",
-                               baseColumnType="hash", omit=True)
-
-                   .withColumn("event_type", StringType(),
-                               values=["activation", "deactivation", "plan change",
-                                       "telecoms activity", "internet activity", "device error"],
-                               random=True, omit=True)
-                   .withColumn("event_ts", "timestamp",
-                               begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
-                               interval="1 minute", random=True, omit=True)
-
-                   .withColumn("event_info", "string",
-                               expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
-                               baseColumn=['event_type', 'event_ts'])
-                   )
+   testDataSpec = (
+       dg.DataGenerator(spark, name="device_data_set", rows=1000000,
+                        partitions=8,
+                        randomSeedMethod='hash_fieldname')
+       .withIdOutput()
+       # we'll use hash of the base field to generate the ids to
+       # avoid a simple incrementing sequence
+       .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
+                   uniqueValues=device_population, omit=True, baseColumnType="hash")
+
+       # note for format strings, we must use "%lx" not "%x" as the
+       # underlying value is a long
+       .withColumn("device_id", StringType(), format="0x%013x",
+                   baseColumn="internal_device_id")
+
+       # the device / user attributes will be the same for the same device id
+       # so lets use the internal device id as the base column for these attribute
+       .withColumn("country", StringType(), values=country_codes,
+                   weights=country_weights,
+                   baseColumn="internal_device_id")
+
+       .withColumn("manufacturer", StringType(), values=manufacturers,
+                   baseColumn="internal_device_id", omit=True)
+       .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
+                   baseColumnType="hash", omit=True)
+       .withColumn("manufacturer_info", "string",
+                   expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
+                   baseColumn=['manufacturer', 'line'])
+
+
+       .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
+                   baseColumn="device_id",
+                   baseColumnType="hash", omit=True)
+
+       .withColumn("event_type", StringType(),
+                   values=["activation", "deactivation", "plan change",
+                           "telecoms activity", "internet activity", "device error"],
+                   random=True, omit=True)
+       .withColumn("event_ts", "timestamp",
+                   begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
+                   interval="1 minute", random=True, omit=True)
+
+       .withColumn("event_info", "string",
+                   expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
+                   baseColumn=['event_type', 'event_ts'])
+   )
 
    dfTestData = testDataSpec.build()
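
To isolate the technique this example documents, packing several generated attributes into a JSON-valued string column via named_struct and to_json, here is a minimal standalone sketch. It assumes dbldatagen is installed and spark is an active SparkSession, and it uses only calls that appear in the hunk above; the spec name, row count, and value lists are illustrative, not taken from the documentation.

    import dbldatagen as dg
    from pyspark.sql.types import StringType

    jsonSpec = (
        dg.DataGenerator(spark, name="json_info_example", rows=1000, partitions=4)
        .withColumn("line", StringType(), values=["delta", "xyzzy", "lakehouse"], random=True)
        .withColumn("manufacturer", StringType(), values=["Delta corp", "Xyzzy Inc."], random=True)
        # combine the two generated columns into a single JSON string column
        .withColumn("manufacturer_info", "string",
                    expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
                    baseColumn=['manufacturer', 'line'])
    )
    dfJsonInfo = jsonSpec.build()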

docs/source/multi_table_data.rst

Lines changed: 48 additions & 43 deletions
@@ -1,7 +1,7 @@
 .. Test Data Generator documentation master file, created by
-   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
 
 Generating and Using Data with Multiple Tables
 ==============================================
@@ -73,7 +73,9 @@ Here we use a simple sequence for our plan ids.
    import dbldatagen as dg
    import pyspark.sql.functions as F
 
-   spark.catalog.clearCache()  # clear cache so that if we run multiple times to check performance, we're not relying on cache
+   # clear cache so that if we run multiple times to check performance,
+   # we're not relying on cache
+   spark.catalog.clearCache()
 
    UNIQUE_PLANS = 20
    PLAN_MIN_VALUE = 100
@@ -87,36 +89,35 @@ Here we use a simple sequence for our plan ids.
    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000)
 
 
-   plan_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
-                    .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS)
-                    # use plan_id as root value
-                    .withColumn("plan_name", prefix="plan", baseColumn="plan_id")
-
-                    # note default step is 1 so you must specify a step for small number ranges,
-                    .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050,
-                                step=0.005, random=True)
-                    .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02,
-                                step=0.001, random=True)
-                    .withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01,
-                                step=0.001, random=True)
-
-                    # we're modelling long distance and international prices simplistically -
-                    # each is a multiplier thats applied to base rate
-                    .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05,
-                                random=True, distribution="normal", omit=True)
-                    .withColumn("ld_cost_per_minute", "decimal(5,3)",
-                                expr="cost_per_minute * ld_multiplier",
-                                baseColumns=['cost_per_minute', 'ld_multiplier'])
-                    .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05,
-                                random=True, distribution="normal", omit=True)
-                    .withColumn("intl_cost_per_minute", "decimal(5,3)",
-                                expr="cost_per_minute * intl_multiplier",
-                                baseColumns=['cost_per_minute', 'intl_multiplier'])
+   plan_dataspec = (
+       dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
+       .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS)
+       # use plan_id as root value
+       .withColumn("plan_name", prefix="plan", baseColumn="plan_id")
+
+       # note default step is 1 so you must specify a step for small number ranges,
+       .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050,
+                   step=0.005, random=True)
+       .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02,
+                   step=0.001, random=True)
+       .withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01,
+                   step=0.001, random=True)
+
+       # we're modelling long distance and international prices simplistically -
+       # each is a multiplier thats applied to base rate
+       .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05,
+                   random=True, distribution="normal", omit=True)
+       .withColumn("ld_cost_per_minute", "decimal(5,3)",
+                   expr="cost_per_minute * ld_multiplier",
+                   baseColumns=['cost_per_minute', 'ld_multiplier'])
+       .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05,
+                   random=True, distribution="normal", omit=True)
+       .withColumn("intl_cost_per_minute", "decimal(5,3)",
+                   expr="cost_per_minute * intl_multiplier",
+                   baseColumns=['cost_per_minute', 'intl_multiplier'])
    )
 
-   df_plans = (plan_dataspec.build()
-               .cache()
-               )
+   df_plans = plan_dataspec.build().cache()
 
    display(df_plans)
 
@@ -195,10 +196,11 @@ when using hashed values, the range of the hashes produced can be large.
 
    effective_customers = df_customers.count()
 
-   print(stripMargin(f"""revised customers : {df_customers.count()},
-         | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]},
-         | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]},
-         | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""")
+   print(stripMargin(
+       f"""revised customers : {df_customers.count()},
+         | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]},
+         | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]},
+         | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""")
         )
 
    display(df_customers)
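
The reflowed print call above relies on a stripMargin helper that the multi-table guide defines earlier and this diff does not show. A plausible sketch of such a helper, offered only as an assumption about its behavior rather than the guide's actual definition:

    def stripMargin(s):
        # hypothetical reimplementation: drop leading whitespace and an optional
        # '|' margin marker from each line, in the spirit of Scala's stripMargin
        return "\n".join(line.lstrip().lstrip("|") for line in s.split("\n"))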
@@ -247,7 +249,8 @@ A simple approach is simply to multiply the
    # use random seed method of 'hash_fieldname' for better spread - default in later builds
    events_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested,
                                        randomSeed=42, randomSeedMethod="hash_fieldname")
-                      # use same logic as per customers dataset to ensure matching keys - but make them random
+                      # use same logic as per customers dataset to ensure matching keys
+                      # but make them random
                       .withColumn("device_id_base","decimal(10)", minValue=CUSTOMER_MIN_VALUE,
                                   uniqueValues=UNIQUE_CUSTOMERS,
                                   random=True, omit=True)
@@ -260,12 +263,14 @@ A simple approach is simply to multiply the
                                  weights=[50, 50, 20, 10, 5 ], random=True)
 
                       # use Gamma distribution for skew towards short calls
-                      .withColumn("base_minutes","decimal(7,2)", minValue=1.0, maxValue=100.0, step=0.1,
+                      .withColumn("base_minutes","decimal(7,2)",
+                                  minValue=1.0, maxValue=100.0, step=0.1,
                                   distribution=dg.distributions.Gamma(shape=1.5, scale=2.0),
                                   random=True, omit=True)
 
                       # use Gamma distribution for skew towards short transfers
-                      .withColumn("base_bytes_transferred","decimal(12)", minValue=K_1, maxValue=MB_100,
+                      .withColumn("base_bytes_transferred","decimal(12)",
+                                  minValue=K_1, maxValue=MB_100,
                                   distribution=dg.distributions.Gamma(shape=0.75, scale=2.0),
                                   random=True, omit=True)
 
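
For reference, the two reflowed columns above skew generated values toward the low end of their range with dbldatagen's Gamma distribution. A minimal standalone sketch of that technique, assuming spark is an active SparkSession and using illustrative sizes and column names rather than the guide's:

    import dbldatagen as dg

    # skew generated call durations toward short calls, as base_minutes does above
    skewed_spec = (
        dg.DataGenerator(spark, rows=100000, partitions=4)
        .withColumn("minutes", "decimal(7,2)", minValue=1.0, maxValue=100.0, step=0.1,
                    distribution=dg.distributions.Gamma(shape=1.5, scale=2.0),
                    random=True)
    )
    df_minutes = skewed_spec.build()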
@@ -308,8 +313,7 @@ Let's compute the customers and associated plans
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
 
-   df_customer_pricing = df_customers.join(df_plans,
-                                           df_plans.plan_id == df_customers.plan)
+   df_customer_pricing = df_customers.join(df_plans, df_plans.plan_id == df_customers.plan)
 
    display(df_customer_pricing)
@@ -365,8 +369,9 @@ now let's compute the invoices
 
 .. code-block:: python
 
-   df_customer_summary = (df_customer_pricing.join(df_summary,
-                                                   df_customer_pricing.device_id == df_summary.device_id )
+   df_customer_summary = (
+       df_customer_pricing.join(df_summary,
+                                df_customer_pricing.device_id == df_summary.device_id )
        .createOrReplaceTempView("customer_summary"))
 
    df_invoices = spark.sql("""
