diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 5c915304..9d23a1e8 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -9,7 +9,7 @@ on:
 jobs:
   tests:
     # Ubuntu latest no longer installs Python 3.9 by default so install it
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - name: Checkout
        uses: actions/checkout@v4
@@ -27,6 +27,12 @@ jobs:
 #          restore-keys: |
 #            ${{ runner.os }}-go-
 
+      - name: Set Java 8
+        run: |
+          sudo update-alternatives --set java /usr/lib/jvm/temurin-8-jdk-amd64/bin/java
+          java -version
+          sudo update-alternatives --config java
+
       - name: Set up Python 3.8
         uses: actions/setup-python@v5
         with:
@@ -50,7 +56,9 @@ jobs:
           pipenv run prospector --profile prospector.yaml
 
       - name: Run tests
-        run: make test
+        run: |
+          java -version
+          make test
 
       - name: Publish test coverage to coverage site
         uses: codecov/codecov-action@v4
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e7ec05de..e755fb61 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -19,6 +19,9 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Check Java version
+        run: java -version
+
       - name: Set up Python 3.8
         uses: actions/setup-python@v5
         with:
diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py
index 8330062f..a9df64ef 100644
--- a/dbldatagen/column_generation_spec.py
+++ b/dbldatagen/column_generation_spec.py
@@ -119,7 +119,7 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
             if EXPR_OPTION not in kwargs:
                 raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred")
 
-        elif type(colType) == str:
+        elif isinstance(colType, str):
             colType = SchemaParser.columnTypeFromString(colType)
 
         assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType"
@@ -1300,3 +1300,22 @@ def makeGenerationExpressions(self):
                 retval = F.slice(retval, F.lit(1), F.expr(expr_str))
 
         return retval
+
+    def toDict(self):
+        """ Creates a dictionary from a ColumnGenerationSpec.
+        :return: A dictionary representation of the ColumnGenerationSpec
+        """
+        return {
+            "colName": self.name,
+            "colType": self.datatype.simpleString(),
+            "minValue": self.min,
+            "maxValue": self.max,
+            "step": self.step,
+            "values": self.values,
+            "expr": self.expr,
+            "prefix": self.prefix,
+            "random": self.random,
+            "nullable": self.nullable,
+            "omit": self.omit,
+            "implicit": self.implicit
+        }
diff --git a/dbldatagen/constraints/chained_relation.py b/dbldatagen/constraints/chained_relation.py
index ea189506..185e3a09 100644
--- a/dbldatagen/constraints/chained_relation.py
+++ b/dbldatagen/constraints/chained_relation.py
@@ -57,3 +57,13 @@ def _generateFilterExpression(self):
 
         # ... and combine them using logical `and` operation
         return self.mkCombinedConstraintExpression(filters)
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": self.__class__.__name__,
+            "columns": self._columns,
+            "relation": self._relation
+        }
diff --git a/dbldatagen/constraints/constraint.py b/dbldatagen/constraints/constraint.py
index e9291098..73d3fc75 100644
--- a/dbldatagen/constraints/constraint.py
+++ b/dbldatagen/constraints/constraint.py
@@ -133,6 +133,26 @@ def filterExpression(self):
             self._calculatedFilterExpression = True
         return self._filterExpression
 
+    @classmethod
+    def fromDict(cls, constraint):
+        """ Creates a Constraint from a Python dictionary.
+        :param constraint: Constraint definition as a Python dictionary
+        :return: Constraint object
+        """
+        inner_obj = constraint.copy()
+        constraint_type = inner_obj.pop("type")
+        for c in cls.__subclasses__():
+            if c.__name__ == constraint_type:
+                return c(**inner_obj)
+        raise ValueError(f"Unknown constraint type: {constraint_type}")
+
+    @abstractmethod
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        raise NotImplementedError("Method toDict must be implemented in derived class")
+
 
 class NoFilterMixin:
     """ Mixin class to indicate that constraint has no filter expression
diff --git a/dbldatagen/constraints/literal_range_constraint.py b/dbldatagen/constraints/literal_range_constraint.py
index 3076cf6b..c08228f1 100644
--- a/dbldatagen/constraints/literal_range_constraint.py
+++ b/dbldatagen/constraints/literal_range_constraint.py
@@ -43,3 +43,15 @@ def _generateFilterExpression(self):
 
         # ... and combine them using logical `and` operation
         return self.mkCombinedConstraintExpression(filters)
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": self.__class__.__name__,
+            "columns": self._columns,
+            "lowValue": self._lowValue,
+            "highValue": self._highValue,
+            "strict": self._strict
+        }
diff --git a/dbldatagen/constraints/literal_relation_constraint.py b/dbldatagen/constraints/literal_relation_constraint.py
index 1ec629bd..b22ec9e9 100644
--- a/dbldatagen/constraints/literal_relation_constraint.py
+++ b/dbldatagen/constraints/literal_relation_constraint.py
@@ -35,3 +35,14 @@ def _generateFilterExpression(self):
         filters = [self._generate_relation_expression(col, self._relation, literalValue) for col in expressions]
 
         return self.mkCombinedConstraintExpression(filters)
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": self.__class__.__name__,
+            "columns": self._columns,
+            "relation": self._relation,
+            "value": self._value
+        }
diff --git a/dbldatagen/constraints/negative_values.py b/dbldatagen/constraints/negative_values.py
index 22d43ddb..f43f04d0 100644
--- a/dbldatagen/constraints/negative_values.py
+++ b/dbldatagen/constraints/negative_values.py
@@ -35,3 +35,13 @@ def _generateFilterExpression(self):
         filters = [col.isNotNull() & (col <= 0) for col in expressions]
 
         return self.mkCombinedConstraintExpression(filters)
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": "NegativeValues",
+            "columns": self._columns,
+            "strict": self._strict
+        }
diff --git a/dbldatagen/constraints/positive_values.py b/dbldatagen/constraints/positive_values.py
index 42aae7cb..dc6b67c1 100644
--- a/dbldatagen/constraints/positive_values.py
+++ b/dbldatagen/constraints/positive_values.py
@@ -36,3 +36,13 @@ def _generateFilterExpression(self):
         filters = [col.isNotNull() & (col >= 0) for col in expressions]
 
         return self.mkCombinedConstraintExpression(filters)
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": self.__class__.__name__,
+            "columns": self._columns,
+            "strict": self._strict
+        }
diff --git a/dbldatagen/constraints/ranged_values_constraint.py b/dbldatagen/constraints/ranged_values_constraint.py
index b2b9df49..8d2e901b 100644
--- a/dbldatagen/constraints/ranged_values_constraint.py
+++ b/dbldatagen/constraints/ranged_values_constraint.py
@@ -42,3 +42,15 @@ def _generateFilterExpression(self):
 
         # ... and combine them using logical `and` operation
         return self.mkCombinedConstraintExpression(filters)
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": "RangedValues",
+            "columns": self._columns,
+            "lowValue": self._lowValue,
+            "highValue": self._highValue,
+            "strict": self._strict
+        }
diff --git a/dbldatagen/constraints/sql_expr.py b/dbldatagen/constraints/sql_expr.py
index 91855330..f8e48c74 100644
--- a/dbldatagen/constraints/sql_expr.py
+++ b/dbldatagen/constraints/sql_expr.py
@@ -28,3 +28,12 @@ def __init__(self, expr: str):
     def _generateFilterExpression(self):
         """ Generate a SQL filter expression that may be used for filtering"""
         return F.expr(self._expr)
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": self.__class__.__name__,
+            "expr": self._expr
+        }
diff --git a/dbldatagen/constraints/unique_combinations.py b/dbldatagen/constraints/unique_combinations.py
index 3bea785d..1122e0b1 100644
--- a/dbldatagen/constraints/unique_combinations.py
+++ b/dbldatagen/constraints/unique_combinations.py
@@ -79,3 +79,12 @@ def transformDataframe(self, dataGenerator, dataFrame):
             results = dataFrame.dropDuplicates(columnsToEvaluate)
 
         return results
+
+    def toDict(self):
+        """ Returns a Python dictionary representation of a Constraint.
+        :return: Python dictionary representing the constraint
+        """
+        return {
+            "type": self.__class__.__name__,
+            "columns": self._columns
+        }
diff --git a/dbldatagen/data_generator.py b/dbldatagen/data_generator.py
index 12015438..936cbabe 100644
--- a/dbldatagen/data_generator.py
+++ b/dbldatagen/data_generator.py
@@ -6,9 +6,11 @@ This file defines the `DataGenError` and `DataGenerator` classes
 """
 
 import copy
+import json
 import logging
 import re
 
+import yaml
 from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType
 
 from ._version import _get_spark_version
@@ -869,6 +871,17 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
             self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable))
         return self
 
+    def withColumnDefinitions(self, columns):
+        """ Adds a set of columns to the synthetic generation specification.
+
+        :param columns: A list of column generation specifications as dictionaries
+        :returns: A modified in-place instance of a data generator allowing for chaining of calls
+                  following a builder pattern
+        """
+        for column in columns:
+            self.withColumn(**column)
+        return self
+
     def _mkSqlStructFromList(self, fields):
         """ Create a SQL struct expression from a list of fields
 
@@ -1206,6 +1219,12 @@ def _getColumnDataTypes(self, columns):
         """
         return [self._columnSpecsByName[colspec].datatype for colspec in columns]
 
+    def getColumnGenerationSpecs(self):
+        return self._allColumnSpecs
+
+    def getConstraints(self):
+        return self._constraints
+
     def withConstraint(self, constraint):
         """Add a constraint to control the data generation
 
@@ -1255,6 +1274,17 @@ def withSqlConstraint(self, sqlExpression: str):
         self.withConstraint(SqlExpr(sqlExpression))
         return self
 
+    def withConstraintDefinitions(self, constraints):
+        """ Adds a set of constraints to the synthetic generation specification.
+
+        :param constraints: A list of constraints as dictionaries
+        :returns: A modified in-place instance of a data generator allowing for chaining of calls
+                  following a builder pattern
+        """
+        for c in constraints:
+            self.withConstraint(Constraint.fromDict(c))
+        return self
+
     def computeBuildPlan(self):
         """ prepare for building by computing a pseudo build plan
 
@@ -1604,3 +1634,66 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
             result = HtmlUtils.formatCodeAsHtml(results)
 
         return result
+
+    @staticmethod
+    def fromDict(options):
+        """ Creates a data generator from a dictionary of options.
+        :param options: Dictionary with data generator options (e.g. "name", "rows")
+        :return: A data generator with the specified options
+        """
+        generator = options["generator"]
+        columns = options.get("columns", [])
+        constraints = options.get("constraints", [])
+        return (
+            DataGenerator(**generator)
+            .withColumnDefinitions(columns)
+            .withConstraintDefinitions(constraints)
+        )
+
+    def toDict(self):
+        """ Creates a dictionary from a DataGenerator.
+        :return: A dictionary representation of the DataGenerator
+        """
+        generator = {
+            "name": self.name,
+            "rows": self.rowCount,
+            "partitions": self.partitions,
+            "random": self.random,
+            "randomSeed": self.randomSeed,
+            "startingId": self.starting_id,
+        }
+        return {
+            "generator": generator,
+            "columns": [column.toDict() for column in self.getColumnGenerationSpecs()],
+            "constraints": [constraint.toDict() for constraint in self.getConstraints()]
+        }
+
+    @staticmethod
+    def fromJson(options):
+        """ Creates a data generator from a JSON string.
+        :param options: A JSON string containing data generation options
+        :return: A data generator with the specified options
+        """
+        options = json.loads(options)
+        return DataGenerator.fromDict(options)
+
+    def toJson(self):
+        """ Returns the JSON string representation of a data generator.
+        :return: A JSON string representation of the DataGenerator
+        """
+        return json.dumps(self.toDict())
+
+    @staticmethod
+    def fromYaml(options):
+        """ Creates a data generator from a YAML string.
+        :param options: A YAML string containing data generation options
+        :return: A data generator with the specified options
+        """
+        options = yaml.safe_load(options)
+        return DataGenerator.fromDict(options)
+
+    def toYaml(self):
+        """ Returns the YAML string representation of a data generator.
+        :return: A YAML string representation of the DataGenerator
+        """
+        return yaml.dump(self.toDict())
diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py
index 965350be..ae20a63e 100644
--- a/dbldatagen/text_generators.py
+++ b/dbldatagen/text_generators.py
@@ -77,7 +77,7 @@ def __str__(self):
         return f"TextGenerator(randomSeed={self._randomSeed})"
 
     def __eq__(self, other):
-        return type(self) == type(other) and self._randomSeed == other._randomSeed
+        return isinstance(self, type(other)) and self._randomSeed == other._randomSeed
 
     def withRandomSeed(self, seed):
         """ Set the random seed for the text generator
@@ -260,7 +260,7 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None):
             assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2"
             mapping_length, mappings = v
             assert isinstance(mapping_length, int), "mapping length must be of type int"
-            assert isinstance(mappings, (list, np.ndarray)),\
+            assert isinstance(mappings, (list, np.ndarray)), \
                 "mappings are lists or numpy arrays"
 
             assert mapping_length == 0 or len(mappings) == mapping_length, "mappings must match mapping_length"
@@ -277,7 +277,7 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None):
             assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2"
             mapping_length, mappings = v
             assert isinstance(mapping_length, int), "mapping length must be of type int"
-            assert mappings is None or isinstance(mappings, (list, np.ndarray)),\
+            assert mappings is None or isinstance(mappings, (list, np.ndarray)), \
                 "mappings are lists or numpy arrays"
 
             # for escaped mappings, the mapping can be None in which case the mapping is to the number itself
diff --git a/docs/source/generating_column_data.rst b/docs/source/generating_column_data.rst
index 7efe68a7..8033651e 100644
--- a/docs/source/generating_column_data.rst
+++ b/docs/source/generating_column_data.rst
@@ -182,3 +182,51 @@ This has several implications:
   SQL expression. To enforce the dependency, you must use the `baseColumn` attribute to indicate the
   dependency.
 
+Creating data generation specs from files
+-----------------------------------------
+
+``DataGenerator.fromFile("file_path")`` will return a ``DataGenerator`` with ``ColumnGenerationSpecs`` from definitions
+in a JSON or YAML file. Use the ``"generator"`` key to specify ``DataGenerator`` options and the ``"columns"`` key to
+specify ``ColumnGenerationSpec`` options.
+
+**JSON Example:**
+
+.. code-block:: JSON
+
+   {
+       "generator": {
+           "name": "test_data_generator",
+           "rows": 1000,
+           "partitions": 10
+       },
+       "columns": [
+           {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100},
+           {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0},
+           {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}
+       ]
+   }
+
+**YAML Example:**
+
+.. code-block:: YAML
+
+   generator:
+     name: test_data_generator
+     rows: 1000
+     partitions: 10
+   columns:
+     - colName: col1
+       colType: int
+       minValue: 0
+       maxValue: 1000
+     - colName: col2
+       colType: float
+       minValue: -10.0
+       maxValue: 10.0
+     - colName: col3
+       colType: string
+       values:
+         - a
+         - b
+         - c
+       random: true
diff --git a/docs/source/options_and_features.rst b/docs/source/options_and_features.rst
index 2727c273..af590c5a 100644
--- a/docs/source/options_and_features.rst
+++ b/docs/source/options_and_features.rst
@@ -128,6 +128,12 @@ representing the column - for example "email_0", "email_1" etc.
 
 If you specify the attribute ``structType="array"``, the multiple columns will be combined into a single
 array valued column.
 
+Generating columns from Python dictionaries
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can generate columns from Python dictionaries using ``withColumnDefinitions(columns)``. Each dictionary should contain
+keys which match the ``withColumn`` arguments (e.g. ``"colName"``, ``"colType"``).
+
 Generating random values
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/makefile b/makefile
index e76e0952..17e2bfc1 100644
--- a/makefile
+++ b/makefile
@@ -71,6 +71,10 @@ prep-doc-release:
 
 # Tests
 test: export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
+test: export SPARK_MASTER_HOST='localhost'
+
+test: export SPARK_LOCAL_IP=127.0.0.1
+
 
 dev-test: export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
 dev-test: export SPARK_MASTER_HOST='localhost'
diff --git a/tests/test_generation_from_config.py b/tests/test_generation_from_config.py
new file mode 100644
index 00000000..2be92afc
--- /dev/null
+++ b/tests/test_generation_from_config.py
@@ -0,0 +1,403 @@
+from contextlib import nullcontext as does_not_raise
+import json
+
+import pytest
+import yaml
+
+import dbldatagen as dg
+
+spark = dg.SparkSingleton.getLocalInstance("unit tests")
+
+
+class TestGenerationFromConfig:
+    @pytest.mark.parametrize("expectation, columns", [
+        (does_not_raise(), [
+            {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100},
+            {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0},
+            {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}
+        ]),
+        (does_not_raise(), [
+            {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True},
+            {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5},
+            {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}
+        ]),
+    ])
+    def test_column_definitions_from_dict(self, columns, expectation):
+        with expectation:
+            # Test the options set on the ColumnGenerationSpecs:
+            gen_from_dicts = dg.DataGenerator(rows=100, partitions=1).withColumnDefinitions(columns)
+            for column in columns:
+                column_spec = gen_from_dicts.getColumnSpec(column["colName"])
+                for key in column.keys():
+                    assert column_spec.toDict()[key] == column[key]
+
+            # Test the data generated after building the DataFrame:
+            df_from_dicts = gen_from_dicts.build()
+            assert df_from_dicts.columns == ["col1", "col2", "col3"]
+
+    @pytest.mark.parametrize("expectation, constraints", [
+        (does_not_raise(), [
+            {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True},
+            {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}
+        ]),
+        (does_not_raise(), [
+            {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": False},
+            {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True},
+            {"type": "SqlExpr", "expr": "col1 > 0"},
+            {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": "0"}
+        ]),
+        (pytest.raises(ValueError), [  # Testing an invalid "relation" value
+            {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True},
+            {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True},
+            {"type": "SqlExpr", "expr": "col1 > 0"},
+            {"type": "LiteralRelation", "columns": ["col2"], "relation": "+", "value": "0"}
+        ]),
+        (pytest.raises(ValueError), [  # Testing an invalid "type" value
+            {"type": "LiteralRange",
"columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": False}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": "0"} + ]), + (does_not_raise(), [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "NegativeValues", "columns": ["col1", "col2"], "strict": False}, + {"type": "ChainedRelation", "columns": ["col1", "col2"], "relation": ">"}, + {"type": "RangedValues", "columns": ["col2"], "lowValue": 0, "highValue": 100, "strict": True}, + {"type": "UniqueCombinations", "columns": ["col1", "col2"]} + ]), + ]) + def test_constraint_definitions_from_dict(self, constraints, expectation): + with expectation: + # Test the options set on the ColumnGenerationSpecs: + columns = [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ] + gen_from_dicts = dg.DataGenerator(rows=100, partitions=1) \ + .withColumnDefinitions(columns) \ + .withConstraintDefinitions(constraints) + + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + @pytest.mark.parametrize("expectation, options", [ + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 1000}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": "0"}] + }), + (pytest.raises(KeyError), # Testing a dictionary missing a "generator" object + {"columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (pytest.raises(ValueError), # Testing an invalid "type" value + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, 
"step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": 0}] + }), + ]) + def test_generator_from_dict(self, options, expectation): + with expectation: + # Test the options set on the DataGenerator: + gen_from_dicts = dg.DataGenerator.fromDict(options) + generator = options.get("generator") + for key in generator: + assert gen_from_dicts.toDict()["generator"][key] == generator[key] + + # Test the options set on the ColumnGenerationSpecs: + columns = options.get("columns", []) + for column in columns: + column_spec = gen_from_dicts.getColumnSpec(column["colName"]) + for key in column.keys(): + assert column_spec.toDict()[key] == column[key] + + # Test the options set on the Constraints: + constraints = options.get("constraints", []) + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + # Test the data generated after building the DataFrame: + df_from_dicts = gen_from_dicts.build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] + + @pytest.mark.parametrize("expectation, json_options", [ + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 1000}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": 0}] + }'''), + (pytest.raises(KeyError), # Testing a JSON object missing the "generator" key + '''{"columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (pytest.raises(ValueError), # Testing an invalid "type" value + '''{"generator": {"name": "test_generator", "rows": 
10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": 0}] + }'''), + ]) + def test_generator_from_json(self, json_options, expectation): + options = json.loads(json_options) + with expectation: + # Test the options set on the DataGenerator: + gen_from_dicts = dg.DataGenerator.fromJson(json_options) + generator = options.get("generator") + for key in generator: + assert gen_from_dicts.toDict()["generator"][key] == generator[key] + + # Test the options set on the ColumnGenerationSpecs: + columns = options.get("columns", []) + for column in columns: + column_spec = gen_from_dicts.getColumnSpec(column["colName"]) + for key in column.keys(): + assert column_spec.toDict()[key] == column[key] + + # Test the options set on the Constraints: + constraints = options.get("constraints", []) + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + # Test the data generated after building the DataFrame: + df_from_dicts = gen_from_dicts.build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] + + @pytest.mark.parametrize("expectation, yaml_options", [ + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 1000 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true + constraints: + - type: LiteralRange + columns: + - col1 + lowValue: -1000 + highValue: 1000 + strict: true + - type: PositiveValues + columns: + - col1 + - col2 + strict: true + - type: SqlExpr + expr: col1 > 0 + - type: LiteralRelation + columns: + - col2 + relation: "<>" + value: 0'''), + (pytest.raises(KeyError), # Testing a YAML object missing the "generator" key + '''--- + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (pytest.raises(ValueError), # Testing an invalid "type" value + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + 
maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true + constraints: + - type: LiteralRange + columns: + - col1 + lowValue: -1000 + highValue: 1000 + strict: true + - type: PositiveValues + columns: + - col1 + - col2 + strict: true + - type: SqlExpr + expr: col1 > 0 + - type: Equivalent + columns: + - col2 + value: 0''') + ]) + def test_generator_from_yaml(self, yaml_options, expectation): + options = yaml.safe_load(yaml_options) + with expectation: + # Test the options set on the DataGenerator: + gen_from_dicts = dg.DataGenerator.fromYaml(yaml_options) + generator = options.get("generator") + for key in generator: + assert gen_from_dicts.toDict()["generator"][key] == generator[key] + + # Test the options set on the ColumnGenerationSpecs: + columns = options.get("columns", []) + for column in columns: + column_spec = gen_from_dicts.getColumnSpec(column["colName"]) + for key in column.keys(): + assert column_spec.toDict()[key] == column[key] + + # Test the options set on the Constraints: + constraints = options.get("constraints", []) + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + # Test the data generated after building the DataFrame: + df_from_dicts = gen_from_dicts.build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] diff --git a/tests/test_quick_tests.py b/tests/test_quick_tests.py index de83daa3..f5ed9c67 100644 --- a/tests/test_quick_tests.py +++ b/tests/test_quick_tests.py @@ -1,6 +1,7 @@ from datetime import timedelta, datetime - +import json import pytest +import yaml from pyspark.sql.types import ( StructType, StructField, IntegerType, StringType, FloatType, DateType, DecimalType, DoubleType, ByteType, ShortType, LongType @@ -10,6 +11,7 @@ import dbldatagen as dg from dbldatagen import DataGenerator from dbldatagen import NRange, DateRange +from dbldatagen.constraints import PositiveValues schema = StructType([ StructField("site_id", IntegerType(), True), @@ -754,3 +756,102 @@ def test_random_generation_without_range_values(self, columnSpecOptions): def test_version_info(self): # test access to version info without explicit import print("Data generator version", dg.__version__) + + def test_multi_column_generation(self): + column_specs = [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ] + df_from_dicts = dg.DataGenerator(rows=100, partitions=1).withColumnDefinitions(column_specs).build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] + + def test_generation_from_dictionary(self): + dg_spec = { + "name": "test_data_generator", + "rows": 1000, + "partitions": 10, + "randomSeedMethod": "fixed", + "randomSeed": 42, + "random": True + } + gen_from_dict = DataGenerator.fromDict({"generator": dg_spec}) + assert gen_from_dict.name == dg_spec.get("name") + assert gen_from_dict.rowCount == dg_spec.get("rows") + assert gen_from_dict.partitions == dg_spec.get("partitions") + assert gen_from_dict.random == dg_spec.get("random") + assert gen_from_dict.randomSeed == dg_spec.get("randomSeed") + + def test_to_dict(self): + gen = ( + dg.DataGenerator(rows=1000, name="test_data_generator", partitions=1) + .withColumn("val1", 
"int", minValue=0, maxValue=100, step=1, random=True) + .withConstraint(PositiveValues(columns=["val1"], strict=True)) + ) + gen_dict = gen.toDict() + assert gen_dict["generator"]["rows"] == 1000 + assert gen_dict["generator"]["name"] == "test_data_generator" + assert gen_dict["generator"]["partitions"] == 1 + + column = gen_dict["columns"][1] + assert column["colName"] == "val1" + assert column["colType"] == "int" + assert column["minValue"] == 0 + assert column["maxValue"] == 100 + assert column["step"] == 1 + assert column["random"] + + constraint = gen_dict["constraints"][0] + assert constraint["type"] == "PositiveValues" + assert constraint["columns"] == ["val1"] + assert constraint["strict"] + + def test_to_json(self): + gen = ( + dg.DataGenerator(rows=1000, name="test_data_generator", partitions=1) + .withColumn("val1", "int", minValue=0, maxValue=100, step=1, random=True) + .withConstraint(PositiveValues(columns=["val1"], strict=True)) + ) + gen_json = gen.toJson() + gen_dict = json.loads(gen_json) + assert gen_dict["generator"]["rows"] == 1000 + assert gen_dict["generator"]["name"] == "test_data_generator" + assert gen_dict["generator"]["partitions"] == 1 + + column = gen_dict["columns"][1] + assert column["colName"] == "val1" + assert column["colType"] == "int" + assert column["minValue"] == 0 + assert column["maxValue"] == 100 + assert column["step"] == 1 + assert column["random"] + + constraint = gen_dict["constraints"][0] + assert constraint["type"] == "PositiveValues" + assert constraint["columns"] == ["val1"] + assert constraint["strict"] + + def test_to_yaml(self): + gen = ( + dg.DataGenerator(rows=1000, name="test_data_generator", partitions=1) + .withColumn("val1", "int", minValue=0, maxValue=100, step=1, random=True) + .withConstraint(PositiveValues(columns=["val1"], strict=True)) + ) + gen_yaml = gen.toYaml() + gen_dict = yaml.safe_load(gen_yaml) + assert gen_dict["generator"]["rows"] == 1000 + assert gen_dict["generator"]["name"] == "test_data_generator" + assert gen_dict["generator"]["partitions"] == 1 + + column = gen_dict["columns"][1] + assert column["colName"] == "val1" + assert column["colType"] == "int" + assert column["minValue"] == 0 + assert column["maxValue"] == 100 + assert column["step"] == 1 + assert column["random"] + + constraint = gen_dict["constraints"][0] + assert constraint["type"] == "PositiveValues" + assert constraint["columns"] == ["val1"] + assert constraint["strict"]