Skip to content

Commit 1561d4e

Browse files
Feature issue 209 (#210)
* wip * wip * wip
1 parent 54d7948 commit 1561d4e

File tree

8 files changed

+126
-14
lines changed

8 files changed

+126
-14
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ details of use and many examples.
6363

6464
Release notes and details of the latest changes for this specific release
6565
can be found in the GitHub repository
66-
[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post1/CHANGELOG.md)
66+
[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post2/CHANGELOG.md)
6767

6868
# Installation
6969

dbldatagen/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def get_version(version):
3434
return version_info
3535

3636

37-
__version__ = "0.3.4post1" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
37+
__version__ = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
3838
__version_info__ = get_version(__version__)
3939

4040

dbldatagen/column_generation_spec.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class ColumnGenerationSpec(object):
8585
'short': 65536
8686
}
8787

88+
_ARRAY_STRUCT_TYPE = "array"
89+
8890
# set up logging
8991

9092
# restrict spurious messages from java gateway
@@ -1021,7 +1023,8 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
10211023
# rs: initialize the begin, end and interval if not initialized for date computations
10221024
# defaults are start of day, now, and 1 minute respectively
10231025

1024-
if not type(self.datatype) in [ArrayType, MapType, StructType]:
1026+
# for array, struct and map types, the value is provided either via `expr` or via `values`
1027+
if not type(self.datatype) in [ArrayType, MapType, StructType] or self.values is not None:
10251028
self._computeImpliedRangeIfNeeded(self.datatype)
10261029

10271030
# TODO: add full support for date value generation
@@ -1032,7 +1035,7 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
10321035
# record execution history
10331036
self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
10341037
self.executionHistory.append(f".. casting to `{self.datatype}`")
1035-
elif type(self.datatype) in [ArrayType, MapType, StructType]:
1038+
elif type(self.datatype) in [ArrayType, MapType, StructType] and self.values is None:
10361039
new_def = expr("NULL")
10371040
elif self._dataRange is not None and self._dataRange.isFullyPopulated():
10381041
self.executionHistory.append(f".. computing ranged value: {self._dataRange}")
@@ -1204,7 +1207,7 @@ def _getMultiColumnDetails(self, validate):
12041207

12051208
min_num_columns, max_num_columns = 1, 1
12061209

1207-
if validate and (min_num_columns != max_num_columns) and (struct_type != "array"):
1210+
if validate and (min_num_columns != max_num_columns) and (struct_type != self._ARRAY_STRUCT_TYPE):
12081211
self.logger.warning(
12091212
f"Varying number of features / columns specified for non-array column [{self.name}]")
12101213
self.logger.warning(
@@ -1228,7 +1231,7 @@ def makeGenerationExpressions(self):
12281231

12291232
self.executionHistory = []
12301233

1231-
if (min_num_columns == 1) and (max_num_columns == 1):
1234+
if (min_num_columns == 1) and (max_num_columns == 1) and struct_type != self._ARRAY_STRUCT_TYPE:
12321235
# record execution history for troubleshooting
12331236
self.executionHistory.append(f"generating single column - `{self.name}` having type `{self.datatype}`")
12341237

docs/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
author = 'Databricks Inc'
2929

3030
# The full version, including alpha/beta/rc tags
31-
release = "0.3.4post1" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
31+
release = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
3232

3333

3434
# -- General configuration ---------------------------------------------------

docs/source/generating_column_data.rst

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,13 @@ Complex column types are supported - that is a column may have its type specifie
9797
be specified in the datatype parameter to the `withColumn` method as a string such as "array<string>" or as a
9898
composite of datatype object instances.
9999

100-
If the column type is based on a struct, map or array, then the `expr` attribute must be specified to provide a
101-
value for the column.
100+
If the column type is based on a struct, map or array, then either the `expr` or the `values` attribute must be
101+
specified to provide a value or range of possible values for the column.
102102

103-
If the `expr` attribute is not specified, then the default column value will be `NULL`.
103+
If the `values` attribute is being used to specify a range of possible values, each of the values elements must be of
104+
the same type as the column.
105+
106+
If neither the `expr` nor the `values` attribute is specified, then the default column value will be `NULL`.
104107

105108
For array valued columns, where all of the elements of the array are to be generated with the same column
106109
specification, an alternative method is also supported.

python/.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.4post1
2+
current_version = 0.3.4post2
33
commit = False
44
tag = False
55
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+){0,1}(?P<release>\D*)(?P<build>\d*)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
setuptools.setup(
3333
name="dbldatagen",
34-
version="0.3.4post1",
34+
version="0.3.4post2",
3535
author="Ronan Stokes, Databricks",
3636
description="Databricks Labs - PySpark Synthetic Data Generator",
3737
long_description=long_description,

tests/test_complex_columns.py

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import logging
2-
import pytest
32

3+
import pytest
44
from pyspark.sql import functions as F
55
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, ArrayType, MapType, \
6-
BinaryType, LongType
6+
BinaryType, LongType
77

88
import dbldatagen as dg
99

@@ -245,6 +245,7 @@ def test_basic_arrays_with_existing_schema6(self, arraySchema, setupLogging):
245245
.withColumnSpec("arrayVal", expr="array(id+1)")
246246
)
247247
df = gen1.build()
248+
assert df is not None
248249
df.show()
249250

250251
def test_use_of_struct_in_schema1(self, setupLogging):
@@ -290,3 +291,108 @@ def test_varying_arrays(self, setupLogging):
290291

291292
df = df_spec.build()
292293
df.show()
294+
295+
def test_array_values(self):
296+
df_spec = dg.DataGenerator(spark, name="test-data", rows=2)
297+
df_spec = df_spec.withColumn(
298+
"test",
299+
ArrayType(StringType()),
300+
values=[
301+
F.array(F.lit("A")),
302+
F.array(F.lit("C")),
303+
F.array(F.lit("T")),
304+
F.array(F.lit("G")),
305+
],
306+
)
307+
test_df = df_spec.build()
308+
309+
rows = test_df.collect()
310+
311+
for r in rows:
312+
assert r['test'] is not None
313+
314+
def test_single_element_array(self):
315+
df_spec = dg.DataGenerator(spark, name="test-data", rows=2)
316+
df_spec = df_spec.withColumn(
317+
"test1",
318+
ArrayType(StringType()),
319+
values=[
320+
F.array(F.lit("A")),
321+
F.array(F.lit("C")),
322+
F.array(F.lit("T")),
323+
F.array(F.lit("G")),
324+
],
325+
)
326+
df_spec = df_spec.withColumn(
327+
"test2", "string", structType="array", numFeatures=1, values=["one", "two", "three"]
328+
)
329+
df_spec = df_spec.withColumn(
330+
"test3", "string", structType="array", numFeatures=(1, 1), values=["one", "two", "three"]
331+
)
332+
df_spec = df_spec.withColumn(
333+
"test4", "string", structType="array", values=["one", "two", "three"]
334+
)
335+
336+
test_df = df_spec.build()
337+
338+
for field in test_df.schema:
339+
assert isinstance(field.dataType, ArrayType)
340+
341+
def test_map_values(self):
342+
df_spec = dg.DataGenerator(spark, name="test-data", rows=50, random=True)
343+
df_spec = df_spec.withColumn(
344+
"v1",
345+
"array<string>",
346+
values=[
347+
F.array(F.lit("A")),
348+
F.array(F.lit("C")),
349+
F.array(F.lit("T")),
350+
F.array(F.lit("G")),
351+
],
352+
)
353+
df_spec = df_spec.withColumn(
354+
"v2",
355+
"array<string>",
356+
values=[
357+
F.array(F.lit("one")),
358+
F.array(F.lit("two")),
359+
F.array(F.lit("three")),
360+
F.array(F.lit("four")),
361+
],
362+
)
363+
df_spec = df_spec.withColumn(
364+
"v3",
365+
"array<string>",
366+
values=[
367+
F.array(F.lit("alpha")),
368+
F.array(F.lit("beta")),
369+
F.array(F.lit("delta")),
370+
F.array(F.lit("gamma")),
371+
],
372+
)
373+
df_spec = df_spec.withColumn(
374+
"v4",
375+
"string",
376+
values=["this", "is", "a", "test"],
377+
numFeatures=1,
378+
structType="array"
379+
)
380+
381+
df_spec = df_spec.withColumn(
382+
"test",
383+
"map<string,string>",
384+
values=[F.map_from_arrays(F.col("v1"), F.col("v2")),
385+
F.map_from_arrays(F.col("v1"), F.col("v3")),
386+
F.map_from_arrays(F.col("v2"), F.col("v3")),
387+
F.map_from_arrays(F.col("v1"), F.col("v4")),
388+
F.map_from_arrays(F.col("v2"), F.col("v4")),
389+
F.map_from_arrays(F.col("v3"), F.col("v4"))
390+
],
391+
baseColumns=["v1", "v2", "v3", "v4"]
392+
)
393+
test_df = df_spec.build()
394+
395+
rows = test_df.collect()
396+
397+
for r in rows:
398+
assert r['test'] is not None

0 commit comments

Comments
 (0)