Skip to content

Commit 1561d4e

Browse files
Feature issue 209 (#210)
* wip * wip * wip
1 parent 54d7948 commit 1561d4e

File tree

8 files changed

+126
-14
lines changed

8 files changed

+126
-14
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ details of use and many examples.
6363

6464
Release notes and details of the latest changes for this specific release
6565
can be found in the GitHub repository
66-
[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post1/CHANGELOG.md)
66+
[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post2/CHANGELOG.md)
6767

6868
# Installation
6969

dbldatagen/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def get_version(version):
3434
return version_info
3535

3636

37-
__version__ = "0.3.4post1" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
37+
__version__ = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
3838
__version_info__ = get_version(__version__)
3939

4040

dbldatagen/column_generation_spec.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class ColumnGenerationSpec(object):
8585
'short': 65536
8686
}
8787

88+
_ARRAY_STRUCT_TYPE = "array"
89+
8890
# set up logging
8991

9092
# restrict spurious messages from java gateway
@@ -1021,7 +1023,8 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
10211023
# rs: initialize the begin, end and interval if not initialized for date computations
10221024
# defaults are start of day, now, and 1 minute respectively
10231025

1024-
if not type(self.datatype) in [ArrayType, MapType, StructType]:
1026+
# for array, struct and map types, the value is provided either via `expr` or via `values`
1027+
if not type(self.datatype) in [ArrayType, MapType, StructType] or self.values is not None:
10251028
self._computeImpliedRangeIfNeeded(self.datatype)
10261029

10271030
# TODO: add full support for date value generation
@@ -1032,7 +1035,7 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
10321035
# record execution history
10331036
self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
10341037
self.executionHistory.append(f".. casting to `{self.datatype}`")
1035-
elif type(self.datatype) in [ArrayType, MapType, StructType]:
1038+
elif type(self.datatype) in [ArrayType, MapType, StructType] and self.values is None:
10361039
new_def = expr("NULL")
10371040
elif self._dataRange is not None and self._dataRange.isFullyPopulated():
10381041
self.executionHistory.append(f".. computing ranged value: {self._dataRange}")
@@ -1204,7 +1207,7 @@ def _getMultiColumnDetails(self, validate):
12041207

12051208
min_num_columns, max_num_columns = 1, 1
12061209

1207-
if validate and (min_num_columns != max_num_columns) and (struct_type != "array"):
1210+
if validate and (min_num_columns != max_num_columns) and (struct_type != self._ARRAY_STRUCT_TYPE):
12081211
self.logger.warning(
12091212
f"Varying number of features / columns specified for non-array column [{self.name}]")
12101213
self.logger.warning(
@@ -1228,7 +1231,7 @@ def makeGenerationExpressions(self):
12281231

12291232
self.executionHistory = []
12301233

1231-
if (min_num_columns == 1) and (max_num_columns == 1):
1234+
if (min_num_columns == 1) and (max_num_columns == 1) and struct_type != self._ARRAY_STRUCT_TYPE:
12321235
# record execution history for troubleshooting
12331236
self.executionHistory.append(f"generating single column - `{self.name}` having type `{self.datatype}`")
12341237

docs/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
author = 'Databricks Inc'
2929

3030
# The full version, including alpha/beta/rc tags
31-
release = "0.3.4post1" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
31+
release = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion
3232

3333

3434
# -- General configuration ---------------------------------------------------

docs/source/generating_column_data.rst

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,13 @@ Complex column types are supported - that is a column may have its type specifie
9797
be specified in the datatype parameter to the `withColumn` method as a string such as "array<string>" or as a
9898
composite of datatype object instances.
9999

100-
If the column type is based on a struct, map or array, then the `expr` attribute must be specified to provide a
101-
value for the column.
100+
If the column type is based on a struct, map or array, then either the `expr` or the `values` attribute must be
101+
specified to provide a value or range of possible values for the column.
102102

103-
If the `expr` attribute is not specified, then the default column value will be `NULL`.
103+
If the `values` attribute is being used to specify a range of possible values, each of the values elements must be of
104+
the same type as the column.
105+
106+
If neither the `expr` nor the `values` attribute is specified, then the default column value will be `NULL`.
104107

105108
For array valued columns, where all of the elements of the array are to be generated with the same column
106109
specification, an alternative method is also supported.

python/.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.4post1
2+
current_version = 0.3.4post2
33
commit = False
44
tag = False
55
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+){0,1}(?P<release>\D*)(?P<build>\d*)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
setuptools.setup(
3333
name="dbldatagen",
34-
version="0.3.4post1",
34+
version="0.3.4post2",
3535
author="Ronan Stokes, Databricks",
3636
description="Databricks Labs - PySpark Synthetic Data Generator",
3737
long_description=long_description,

tests/test_complex_columns.py

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import logging
2-
import pytest
32

3+
import pytest
44
from pyspark.sql import functions as F
55
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, ArrayType, MapType, \
6-
BinaryType, LongType
6+
BinaryType, LongType
77

88
import dbldatagen as dg
99

@@ -245,6 +245,7 @@ def test_basic_arrays_with_existing_schema6(self, arraySchema, setupLogging):
245245
.withColumnSpec("arrayVal", expr="array(id+1)")
246246
)
247247
df = gen1.build()
248+
assert df is not None
248249
df.show()
249250

250251
def test_use_of_struct_in_schema1(self, setupLogging):
@@ -290,3 +291,108 @@ def test_varying_arrays(self, setupLogging):
290291

291292
df = df_spec.build()
292293
df.show()
294+
295+
def test_array_values(self):
296+
df_spec = dg.DataGenerator(spark, name="test-data", rows=2)
297+
df_spec = df_spec.withColumn(
298+
"test",
299+
ArrayType(StringType()),
300+
values=[
301+
F.array(F.lit("A")),
302+
F.array(F.lit("C")),
303+
F.array(F.lit("T")),
304+
F.array(F.lit("G")),
305+
],
306+
)
307+
test_df = df_spec.build()
308+
309+
rows = test_df.collect()
310+
311+
for r in rows:
312+
assert r['test'] is not None
313+
314+
def test_single_element_array(self):
315+
df_spec = dg.DataGenerator(spark, name="test-data", rows=2)
316+
df_spec = df_spec.withColumn(
317+
"test1",
318+
ArrayType(StringType()),
319+
values=[
320+
F.array(F.lit("A")),
321+
F.array(F.lit("C")),
322+
F.array(F.lit("T")),
323+
F.array(F.lit("G")),
324+
],
325+
)
326+
df_spec = df_spec.withColumn(
327+
"test2", "string", structType="array", numFeatures=1, values=["one", "two", "three"]
328+
)
329+
df_spec = df_spec.withColumn(
330+
"test3", "string", structType="array", numFeatures=(1, 1), values=["one", "two", "three"]
331+
)
332+
df_spec = df_spec.withColumn(
333+
"test4", "string", structType="array", values=["one", "two", "three"]
334+
)
335+
336+
test_df = df_spec.build()
337+
338+
for field in test_df.schema:
339+
assert isinstance(field.dataType, ArrayType)
340+
341+
def test_map_values(self):
342+
df_spec = dg.DataGenerator(spark, name="test-data", rows=50, random=True)
343+
df_spec = df_spec.withColumn(
344+
"v1",
345+
"array<string>",
346+
values=[
347+
F.array(F.lit("A")),
348+
F.array(F.lit("C")),
349+
F.array(F.lit("T")),
350+
F.array(F.lit("G")),
351+
],
352+
)
353+
df_spec = df_spec.withColumn(
354+
"v2",
355+
"array<string>",
356+
values=[
357+
F.array(F.lit("one")),
358+
F.array(F.lit("two")),
359+
F.array(F.lit("three")),
360+
F.array(F.lit("four")),
361+
],
362+
)
363+
df_spec = df_spec.withColumn(
364+
"v3",
365+
"array<string>",
366+
values=[
367+
F.array(F.lit("alpha")),
368+
F.array(F.lit("beta")),
369+
F.array(F.lit("delta")),
370+
F.array(F.lit("gamma")),
371+
],
372+
)
373+
df_spec = df_spec.withColumn(
374+
"v4",
375+
"string",
376+
values=["this", "is", "a", "test"],
377+
numFeatures=1,
378+
structType="array"
379+
)
380+
381+
df_spec = df_spec.withColumn(
382+
"test",
383+
"map<string,string>",
384+
values=[F.map_from_arrays(F.col("v1"), F.col("v2")),
385+
F.map_from_arrays(F.col("v1"), F.col("v3")),
386+
F.map_from_arrays(F.col("v2"), F.col("v3")),
387+
F.map_from_arrays(F.col("v1"), F.col("v4")),
388+
F.map_from_arrays(F.col("v2"), F.col("v4")),
389+
F.map_from_arrays(F.col("v3"), F.col("v4"))
390+
],
391+
baseColumns=["v1", "v2", "v3", "v4"]
392+
)
393+
test_df = df_spec.build()
394+
395+
rows = test_df.collect()
396+
397+
for r in rows:
398+
assert r['test'] is not None

0 commit comments

Comments
 (0)