Feature build ordering improvements 2 (#189)

ronanstokes-db · web-flow · commit 3a2e3a8bfcc0 · 2023-04-04T10:45:47.000-07:00
* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* improved build ordering

* improved build ordering

* improved build ordering

* reverted unnecessary changes

* reverted unnecessary changes

* updated ColumnSpecOptions description
diff --git a/dbldatagen/column_spec_options.py b/dbldatagen/column_spec_options.py
@@ -36,6 +36,12 @@ class ColumnSpecOptions(object):
 
     :param step: Step to use for range of generated value. As an alternative, you may use the `dataRange` parameter
 
+    :param numColumns: generate `n` columns numbered from 0 .. n-1 with same definition
+
+    :param numFeatures: generate `n` columns numbered from 0 .. n-1 with same definition. Alias for `numColumns`
+
+    :param structType: If specified as "array" and used with numColumns / numFeatures, will combine columns as array
+
     :param random: If True, will generate random values for column value. Defaults to `False`
 
     :param baseColumn: Either the string name of the base column, or a list of columns to use to
diff --git a/dbldatagen/utils.py b/dbldatagen/utils.py
@@ -116,7 +116,13 @@ def topologicalSort(sources, initial_columns=None, flatten=True):
     :arg sources: list of ``(name, set(names of dependencies))`` pairs
     :arg initial_columns: force ``initial_columns`` to be computed first
     :arg flatten: if true, flatten output list
-    :returns: list of names in dependency order. If not flattened, result will be list of lists
+    :returns: list of names in dependency order separated into build phases
+
+    .. note::
+       The algorithm will give preference to retaining order of inbound sequence
+       over modifying order to produce a lower number of build phases.
+
+       Overall the effect is that the input build order should be retained unless there are forward references
     """
     # generate a copy so that we can modify in place
     pending = [(name, set(deps)) for name, deps in sources]
@@ -127,27 +133,36 @@ def topologicalSort(sources, initial_columns=None, flatten=True):
         next_pending = []
         gen = []
         value_emitted = False
+        defer_emitted = False
         gen_provided = []
         for entry in pending:
             name, deps = entry
             deps.difference_update(provided)
             if deps:
                 next_pending.append((name, set(deps)))
+
+                # if dependencies will be satisfied by item emitted in this round, defer output
+                if not deps.difference(gen_provided):
+                    defer_emitted = True
+            elif defer_emitted:
+                next_pending.append((name, set(deps)))
             elif name in provided:
-                value_emitted |= True
+                value_emitted = True
             else:
                 gen.append(name)
                 gen_provided.append(name)
-                value_emitted |= True
+                value_emitted = True
         provided.extend(gen_provided)
         build_orders.append(gen)
+
         if not value_emitted:
             raise ValueError(f"cyclic or missing dependency detected [{next_pending}]")
 
         pending = next_pending
 
     if flatten:
-        return [item for sublist in build_orders for item in sublist]
+        flattened_list = [item for sublist in build_orders for item in sublist]
+        return flattened_list
     else:
         return build_orders
 
diff --git a/tests/test_build_planning.py b/tests/test_build_planning.py
@@ -168,7 +168,7 @@ def sampleDataSpec(self):
                         .withColumnSpecs(patterns=".*_ID", match_types=StringType(), format="%010d",
                                          minValue=1, maxValue=123,
                                          step=1)
-                        .withColumnSpecs(patterns=".*_IDS", match_types=StringType(), format="%010d", minValue=1,
+                        .withColumnSpecs(patterns=".*_IDS", match_types="string", format="%010d", minValue=1,
                                          maxValue=100, step=1)
                         # .withColumnSpec("R3D3_CLUSTER_IDS", minValue=1, maxValue=100, step=1)
                         .withColumnSpec("XYYZ_IDS", minValue=1, maxValue=123, step=1,
@@ -294,6 +294,7 @@ def test_build_ordering_explicit_dependency(self):
                         baseColumn="city_id")
 
         build_order = gen1.build_order
+        logging.info(f"Build order {build_order}")
 
         assert self.builtBefore("city_id", "city_name", build_order)
         assert self.builtBefore("city", "city2", build_order)
@@ -308,8 +309,6 @@ def test_build_ordering_explicit_dependency(self):
         assert self.builtInSeparatePhase("city", "city_id", build_order)
         assert self.builtInSeparatePhase("city", "city_pop", build_order)
 
-        print(gen1.build_order)
-
     def test_build_ordering_explicit_dependency2(self):
         gen1 = dg.DataGenerator(sparkSession=spark, name="nested_schema", rows=1000, partitions=4,
                                 seedColumnName="_id") \
@@ -327,6 +326,7 @@ def test_build_ordering_explicit_dependency2(self):
                         baseColumn="city_id")
 
         build_order = gen1.build_order
+        logging.info(f"Build order {build_order}")
 
         assert self.builtBefore("city", "city_name", build_order)
         assert self.builtBefore("city", "city_id", build_order)
@@ -335,8 +335,6 @@ def test_build_ordering_explicit_dependency2(self):
         assert self.builtInSeparatePhase("city", "city_id", build_order)
         assert self.builtInSeparatePhase("city", "city_pop", build_order)
 
-        print(gen1.build_order)
-
     def test_build_ordering_implicit_dependency(self):
         gen1 = dg.DataGenerator(sparkSession=spark, name="nested_schema", rows=1000, partitions=4,
                                 seedColumnName="_id") \
@@ -348,7 +346,7 @@ def test_build_ordering_implicit_dependency(self):
                         expr="named_struct('name', city_name, 'id', city_id, 'population', city_pop)")
 
         build_order = gen1.build_order
-        print(gen1.build_order)
+        logging.info(f"Build order {build_order}")
 
         assert self.builtBefore("city", "city_name", build_order)
         assert self.builtBefore("city", "city_id", build_order)
@@ -357,6 +355,116 @@ def test_build_ordering_implicit_dependency(self):
         assert self.builtInSeparatePhase("city", "city_id", build_order), "fields should be built in separate phase"
         assert self.builtInSeparatePhase("city", "city_pop", build_order), "fields should be built in separate phase"
 
+    # TODO: build ordering should initially try and build in the order supplied but separate into phases
+
+    def test_build_ordering_implicit_dependency2(self):
+        DEVICE_STATES = ['RUNNING', 'IDLE', 'DOWN']
+        DEVICE_WEIGHTS = [10, 5, 1]
+        SITES = ['alpha', 'beta', 'gamma', 'delta', 'phi', 'mu', 'lambda']
+        AREAS = ['area 1', 'area 2', 'area 3', 'area 4', 'area 5']
+        LINES = ['line 1', 'line 2', 'line 3', 'line 4', 'line 5', 'line 6']
+        TAGS = ['statusCode', 'another notification 1', 'another notification 2', 'another notification 3']
+        NUM_LOCAL_DEVICES = 20
+
+        STARTING_DATETIME = "2022-06-01 01:00:00"
+        END_DATETIME = "2022-09-01 23:59:00"
+        EVENT_INTERVAL = "10 seconds"
+
+        gen1 = (
+            dg.DataGenerator(spark, rows=1000, partitions=4)
+            # can combine internal site id computation with value lookup but clearer to use a separate internal column
+            .withColumn("site", "string", values=SITES, random=True)
+            .withColumn("area", "string", values=AREAS, random=True, omit=True)
+            .withColumn("line", "string", values=LINES, random=True, omit=True)
+            .withColumn("local_device_id", "int", maxValue=NUM_LOCAL_DEVICES - 1, omit=True, random=True)
+
+            .withColumn("local_device", "string", prefix="device", baseColumn="local_device_id")
+
+            .withColumn("device_key", "string",
+                        expr="concat('/', site, '/', area, '/', line, '/', local_device)")
+
+            # used to compute the device id
+            .withColumn("internal_device_key", "long", expr="hash(site,  area,  line, local_device)",
+                        omit=True)
+
+            .withColumn("deviceId", "string", format="0x%013x",
+                        baseColumn="internal_device_key")
+
+            # tag name is name of device signal
+            .withColumn("tagName", "string", values=TAGS, random=True)
+
+            # tag value is state
+            .withColumn("tagValue", "string",
+                        values=DEVICE_STATES, weights=DEVICE_WEIGHTS,
+                        random=True)
+
+            .withColumn("tag_ts", "timestamp",
+                        begin=STARTING_DATETIME,
+                        end=END_DATETIME,
+                        interval=EVENT_INTERVAL,
+                        random=True)
+
+            .withColumn("event_date", "date", expr="to_date(tag_ts)")
+        )
+
+        build_order = gen1.build_order
+        logging.info(f"Build order {build_order}")
+
+        assert self.builtBefore("event_date", "tag_ts", build_order)
+        assert self.builtBefore("device_key", "site", build_order)
+        assert self.builtBefore("device_key", "area", build_order)
+        assert self.builtBefore("device_key", "line", build_order)
+        assert self.builtBefore("device_key", "local_device", build_order)
+        assert self.builtBefore("internal_device_key", "site", build_order)
+        assert self.builtBefore("internal_device_key", "area", build_order)
+        assert self.builtBefore("internal_device_key", "line", build_order)
+        assert self.builtBefore("internal_device_key", "local_device", build_order)
+        assert self.builtBefore("device_key", "site", build_order)
+        assert self.builtBefore("device_key", "area", build_order)
+        assert self.builtBefore("device_key", "line", build_order)
+        assert self.builtBefore("device_key", "local_device", build_order)
+
+        assert self.builtInSeparatePhase("tag_ts", "event_date", build_order)
+        assert self.builtInSeparatePhase("site", "device_key", build_order)
+        assert self.builtInSeparatePhase("area", "device_key", build_order)
+        assert self.builtInSeparatePhase("line", "device_key", build_order)
+        assert self.builtInSeparatePhase("local_device", "device_key", build_order)
+        assert self.builtInSeparatePhase("site", "internal_device_key", build_order)
+        assert self.builtInSeparatePhase("area", "internal_device_key", build_order)
+        assert self.builtInSeparatePhase("line", "internal_device_key", build_order)
+        assert self.builtInSeparatePhase("local_device", "internal_device_key", build_order)
+        assert self.builtInSeparatePhase("site", "device_key", build_order)
+        assert self.builtInSeparatePhase("area", "device_key", build_order)
+        assert self.builtInSeparatePhase("line", "device_key", build_order)
+        assert self.builtInSeparatePhase("local_device", "device_key", build_order)
+
+    def test_implicit_dependency3(self):
+        dataspec = (
+            dg.DataGenerator(spark, rows=1000, partitions=4)
+            .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
+            .withColumn("payment_instrument_type", values=['cash', 'cc', 'app'],
+                        random=True)
+            .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
+                        baseColumn="name",
+                        baseColumnType="hash", omit=True)
+            .withColumn("payment_instrument",
+                        expr="format_number(int_payment_instrument, '**** ****** *####')")
+            .withColumn("email", template=r'\\w.\\w@\\w.com')
+            .withColumn("md5_payment_instrument",
+                        expr="md5(concat(payment_instrument_type, ':', payment_instrument))")
+        )
+
+        build_order = dataspec.build_order
+        logging.info(f"Build order {build_order}")
+
+        assert self.builtBefore("payment_instrument", "int_payment_instrument", build_order)
+        assert self.builtBefore("md5_payment_instrument", "payment_instrument", build_order)
+        assert self.builtBefore("md5_payment_instrument", "payment_instrument_type", build_order)
+
+        assert self.builtInSeparatePhase("int_payment_instrument", "payment_instrument", build_order)
+        assert self.builtInSeparatePhase("md5_payment_instrument", "payment_instrument", build_order)
+        assert self.builtInSeparatePhase("md5_payment_instrument", "payment_instrument_type", build_order)
+
     def test_expr_attribute(self):
         sql_expr = "named_struct('name', city_name, 'id', city_id, 'population', city_pop)"
         gen1 = dg.DataGenerator(sparkSession=spark, name="nested_schema", rows=1000, partitions=4,
@@ -393,24 +501,19 @@ def test_build_ordering_duplicate_names1(self):
             .withColumn("id", "long", minValue=1000000, uniqueValues=10000, random=True) \
             .withColumn("city_name", "long", minValue=1000000, uniqueValues=10000, random=True) \
             .withColumn("city_name", "string", template=r"\w", random=True, omit=True) \
+            .withColumn("extra_field", "long", minValue=1000000, uniqueValues=10000, random=True) \
+            .withColumn("extra_field", "string", template=r"\w", random=True) \
             .withColumn("city_id", "long", minValue=1000000, uniqueValues=10000, random=True, omit=True) \
             .withColumn("city_pop", "long", minValue=1000000, uniqueValues=10000, random=True, omit=True) \
             .withColumn("city", "struct<name:string, id:long, population:long>",
                         expr="named_struct('name', city_name, 'id', city_id, 'population', city_pop)")
 
-        build_order = gen1.build_order
-        print(gen1.build_order)
+        logging.info(f"Build order {gen1.build_order}")
 
         df = gen1.build()
 
-        df.show()
-
-        # assert self.builtBefore("city", "city_name", build_order)
-        # assert self.builtBefore("city", "city_id", build_order)
-        # assert self.builtBefore("city", "city_pop", build_order)
-        # assert self.builtInSeparatePhase("city", "city_name", build_order), "fields should be built in separate phase"
-        # assert self.builtInSeparatePhase("city", "city_id", build_order), "fields should be built in separate phase"
-        # assert self.builtInSeparatePhase("city", "city_pop", build_order), "fields should be built in separate phase"
+        count = df.count()
+        assert count == 1000
 
     def test_build_ordering_forward_ref(self, caplog):
         # caplog fixture captures log content
@@ -425,8 +528,7 @@ def test_build_ordering_forward_ref(self, caplog):
                         expr="named_struct('name', city_name, 'id', city_id, 'population', city_pop)") \
             .withColumn("city_id", "long", minValue=1000000, uniqueValues=10000, random=True, omit=True)
 
-        build_order = gen1.build_order
-        print(gen1.build_order)
+        logging.info(f"Build order {gen1.build_order}")
 
         seed_column_warnings_and_errors = self.get_log_capture_warngings_and_errors(caplog, "forward references")
         assert seed_column_warnings_and_errors >= 1, "Should not have error messages about forward references"
@@ -443,16 +545,9 @@ def test_build_ordering_duplicate_names2(self):
                         expr="named_struct('name', city_name, 'id', city_id, 'population', city_pop)",
                         baseColumns=["city_name", "city_id", "city_pop"])
 
-        build_order = gen1.build_order
-        print(gen1.build_order)
+        logging.info(f"Build order {gen1.build_order}")
 
         df = gen1.build()
 
-        df.show()
-
-        # assert self.builtBefore("city", "city_name", build_order)
-        # assert self.builtBefore("city", "city_id", build_order)
-        # assert self.builtBefore("city", "city_pop", build_order)
-        # assert self.builtInSeparatePhase("city", "city_name", build_order), "fields should be built in separate phase"
-        # assert self.builtInSeparatePhase("city", "city_id", build_order), "fields should be built in separate phase"
-        # assert self.builtInSeparatePhase("city", "city_pop", build_order), "fields should be built in separate phase"
+        count = df.count()
+        assert count == 1000