|
20 | 20 |
|
21 | 21 | from .column_spec_options import ColumnSpecOptions |
22 | 22 | from .datagen_constants import RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, RANDOM_SEED_RANDOM, \ |
23 | | - DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD |
| 23 | + DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, INFER_DATATYPE |
24 | 24 |
|
25 | 25 | from .daterange import DateRange |
26 | 26 | from .distributions import Normal, DataDistribution |
27 | 27 | from .nrange import NRange |
28 | 28 | from .text_generators import TemplateGenerator |
29 | 29 | from .utils import ensure, coalesce_values |
| 30 | +from .schema_parser import SchemaParser |
30 | 31 |
|
31 | 32 | HASH_COMPUTE_METHOD = "hash" |
32 | 33 | VALUES_COMPUTE_METHOD = "values" |
33 | 34 | RAW_VALUES_COMPUTE_METHOD = "raw_values" |
34 | 35 | AUTO_COMPUTE_METHOD = "auto" |
| 36 | +EXPR_OPTION = "expr" |
35 | 37 | COMPUTE_METHOD_VALID_VALUES = [HASH_COMPUTE_METHOD, |
36 | 38 | AUTO_COMPUTE_METHOD, |
37 | 39 | VALUES_COMPUTE_METHOD, |
@@ -107,8 +109,18 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix |
107 | 109 | # set up default range and type for column |
108 | 110 | self._dataRange = NRange(None, None, None) # by default range of values for column is unconstrained |
109 | 111 |
|
| 112 | + self._inferDataType = False |
110 | 113 | if colType is None: # default to integer field if none specified |
111 | 114 | colType = IntegerType() |
| 115 | + elif colType == INFER_DATATYPE: |
| 116 | + colType = StringType() # default inferred data type to string until exact type is known |
| 117 | + self._inferDataType = True |
| 118 | + |
| 119 | + if EXPR_OPTION not in kwargs: |
| 120 | + raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred") |
| 121 | + |
| 122 | + elif type(colType) == str: |
| 123 | + colType = SchemaParser.columnTypeFromString(colType) |
112 | 124 |
|
113 | 125 | assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType" |
114 | 126 |
|
@@ -399,6 +411,12 @@ def textGenerator(self): |
399 | 411 | """ Get the text generator for the column spec""" |
400 | 412 | return self._textGenerator |
401 | 413 |
|
| 414 | + @property |
| 415 | + def inferDatatype(self): |
| 416 | + """ If True, indicates that the datatype should be inferred from the result of computing the SQL expression |
| 417 | + """ |
| 418 | + return self._inferDataType |
| 419 | + |
402 | 420 | @property |
403 | 421 | def baseColumns(self): |
404 | 422 | """ Return base columns as list of strings""" |
@@ -1030,11 +1048,12 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T |
1030 | 1048 | # TODO: add full support for date value generation |
1031 | 1049 | if self.expr is not None: |
1032 | 1050 | # note use of SQL expression ignores range specifications |
1033 | | - new_def = expr(self.expr).astype(self.datatype) |
1034 | | - |
1035 | | - # record execution history |
| 1051 | + new_def = expr(self.expr) |
1036 | 1052 | self.executionHistory.append(f".. using SQL expression `{self.expr}` as base") |
1037 | | - self.executionHistory.append(f".. casting to `{self.datatype}`") |
| 1053 | + |
| 1054 | + if not self._inferDataType: |
| 1055 | + new_def = new_def.astype(self.datatype) |
| 1056 | + self.executionHistory.append(f".. casting to `{self.datatype}`") |
1038 | 1057 | elif type(self.datatype) in [ArrayType, MapType, StructType] and self.values is None: |
1039 | 1058 | new_def = expr("NULL") |
1040 | 1059 | elif self._dataRange is not None and self._dataRange.isFullyPopulated(): |
@@ -1083,6 +1102,22 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T |
1083 | 1102 | new_def = self._applyComputePercentNullsExpression(new_def, percent_nulls) |
1084 | 1103 | return new_def |
1085 | 1104 |
|
| 1105 | + def _onSelect(self, df): |
| 1106 | + """ |
| 1107 | + The _onSelect method is called when the column specification's expression, as produced by the |
| 1108 | + method ``_makeSingleGenerationExpression``, is used in a select statement. |
| 1109 | +
|
| 1110 | + :param df: Dataframe in which expression is used |
| 1111 | + :return: nothing |
| 1112 | +
|
| 1113 | + .. note:: The purpose of this method is to allow for introspection of information, such as the datatype, |
| 1114 | + which can only be determined when the column specification's expression is used. |
| 1115 | + """ |
| 1116 | + if self._inferDataType: |
| 1117 | + inferred_type = df.schema[self.name].dataType |
| 1118 | + self.logger.info("Inferred datatype for column %s as %s", self.name, str(inferred_type)) |
| 1119 | + self._csOptions.options['type'] = inferred_type |
| 1120 | + |
1086 | 1121 | def _applyTextFormatExpression(self, new_def, sformat): |
1087 | 1122 | # note : |
1088 | 1123 | # while it seems like this could use a shared instance, this does not work if initialized |
@@ -1141,6 +1176,9 @@ def _applyFinalCastExpression(self, col_type, new_def): |
1141 | 1176 | # cast the result to the appropriate type. For dates, cast first to timestamp, then to date |
1142 | 1177 | if type(col_type) is DateType: |
1143 | 1178 | new_def = new_def.astype(TimestampType()).astype(col_type) |
| 1179 | + elif self._inferDataType: |
| 1180 | + # don't apply cast when column has an inferred data type |
| 1181 | + pass |
1144 | 1182 | else: |
1145 | 1183 | new_def = new_def.astype(col_type) |
1146 | 1184 |
|
|
0 commit comments