Skip to content

Commit 168b6f1

Browse files
Feature struct changes (#219)
* wip * wip * added support for inferred types * added support for inferred types * wip * wip * wip * updates and fixes to unit tests * wip * updated pipfile due to upstream changes in pipenv * additional tests * wip * wip * wip * wip * removed old code
1 parent bc80ef3 commit 168b6f1

File tree

7 files changed

+609
-29
lines changed

7 files changed

+609
-29
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ All notable changes to the Databricks Labs Data Generator will be documented in
77

88
#### Changed
99
* Added formatting of generated code as HTML for script methods
10+
* Allow use of inferred types on `withColumn` method when `expr` attribute is used
11+
* Added `withStructColumn` method to allow simplified generation of struct and JSON columns
1012
* Modified pipfile to use newer version of package specifications
1113

1214
### Version 0.3.4 Post 3

dbldatagen/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525

2626
from .data_generator import DataGenerator
2727
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
28-
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION
28+
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION, \
29+
INFER_DATATYPE
2930
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
3031
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
3132
json_value_from_path, system_time_millis

dbldatagen/column_generation_spec.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,20 @@
2020

2121
from .column_spec_options import ColumnSpecOptions
2222
from .datagen_constants import RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, RANDOM_SEED_RANDOM, \
23-
DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD
23+
DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, INFER_DATATYPE
2424

2525
from .daterange import DateRange
2626
from .distributions import Normal, DataDistribution
2727
from .nrange import NRange
2828
from .text_generators import TemplateGenerator
2929
from .utils import ensure, coalesce_values
30+
from .schema_parser import SchemaParser
3031

3132
HASH_COMPUTE_METHOD = "hash"
3233
VALUES_COMPUTE_METHOD = "values"
3334
RAW_VALUES_COMPUTE_METHOD = "raw_values"
3435
AUTO_COMPUTE_METHOD = "auto"
36+
EXPR_OPTION = "expr"
3537
COMPUTE_METHOD_VALID_VALUES = [HASH_COMPUTE_METHOD,
3638
AUTO_COMPUTE_METHOD,
3739
VALUES_COMPUTE_METHOD,
@@ -107,8 +109,18 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
107109
# set up default range and type for column
108110
self._dataRange = NRange(None, None, None) # by default range of values for column is unconstrained
109111

112+
self._inferDataType = False
110113
if colType is None: # default to integer field if none specified
111114
colType = IntegerType()
115+
elif colType == INFER_DATATYPE:
116+
colType = StringType() # default inferred data type to string until exact type is known
117+
self._inferDataType = True
118+
119+
if EXPR_OPTION not in kwargs:
120+
raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred")
121+
122+
elif type(colType) == str:
123+
colType = SchemaParser.columnTypeFromString(colType)
112124

113125
assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType"
114126

@@ -399,6 +411,12 @@ def textGenerator(self):
399411
""" Get the text generator for the column spec"""
400412
return self._textGenerator
401413

414+
@property
415+
def inferDatatype(self):
416+
""" If True indicates that datatype should be inferred to be result of computing SQL expression
417+
"""
418+
return self._inferDataType
419+
402420
@property
403421
def baseColumns(self):
404422
""" Return base columns as list of strings"""
@@ -1030,11 +1048,12 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
10301048
# TODO: add full support for date value generation
10311049
if self.expr is not None:
10321050
# note use of SQL expression ignores range specifications
1033-
new_def = expr(self.expr).astype(self.datatype)
1034-
1035-
# record execution history
1051+
new_def = expr(self.expr)
10361052
self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
1037-
self.executionHistory.append(f".. casting to `{self.datatype}`")
1053+
1054+
if not self._inferDataType:
1055+
new_def = new_def.astype(self.datatype)
1056+
self.executionHistory.append(f".. casting to `{self.datatype}`")
10381057
elif type(self.datatype) in [ArrayType, MapType, StructType] and self.values is None:
10391058
new_def = expr("NULL")
10401059
elif self._dataRange is not None and self._dataRange.isFullyPopulated():
@@ -1083,6 +1102,22 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
10831102
new_def = self._applyComputePercentNullsExpression(new_def, percent_nulls)
10841103
return new_def
10851104

1105+
def _onSelect(self, df):
1106+
"""
1107+
The _onSelect method is called when the column specification's expression as produced by the
1108+
method ``_makeSingleGenerationExpression`` is used in a select statement.
1109+
1110+
:param df: Dataframe in which expression is used
1111+
:return: nothing
1112+
1113+
.. note:: The purpose of this method is to allow for introspection of information such as datatype
1114+
which can only be determined when the column specification's expression is used.
1115+
"""
1116+
if self._inferDataType:
1117+
inferred_type = df.schema[self.name].dataType
1118+
self.logger.info("Inferred datatype for column %s as %s", self.name, str(inferred_type))
1119+
self._csOptions.options['type'] = inferred_type
1120+
10861121
def _applyTextFormatExpression(self, new_def, sformat):
10871122
# note :
10881123
# while it seems like this could use a shared instance, this does not work if initialized
@@ -1141,6 +1176,9 @@ def _applyFinalCastExpression(self, col_type, new_def):
11411176
# cast the result to the appropriate type. For dates, cast first to timestamp, then to date
11421177
if type(col_type) is DateType:
11431178
new_def = new_def.astype(TimestampType()).astype(col_type)
1179+
elif self._inferDataType:
1180+
# don't apply a cast when the column has an inferred data type
1181+
pass
11441182
else:
11451183
new_def = new_def.astype(col_type)
11461184

0 commit comments

Comments
 (0)