# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This module defines the Constraint class
"""
import types
from abc import ABC, abstractmethod

from pyspark.sql import Column


class Constraint(ABC):
    """ Constraint object - base class for predefined and custom constraints

    This class is meant for internal use only.
    """
    SUPPORTED_OPERATORS = ["<", ">", ">=", "!=", "==", "=", "<=", "<>"]

    def __init__(self, supportsStreaming=False):
        """ Initialize the constraint object

        :param supportsStreaming: if True, the constraint supports streaming dataframes
        """
        self._filterExpression = None
        self._calculatedFilterExpression = False
        self._supportsStreaming = supportsStreaming

    @staticmethod
    def _columnsFromListOrString(columns):
        """ Get columns as a list of column names from a string or list-like of column names

        :param columns: string or list of strings representing column names
        :return: list of column names
        """
        if isinstance(columns, str):
            return [columns]
        elif isinstance(columns, (list, set, tuple, types.GeneratorType)):
            return list(columns)
        else:
            raise ValueError("Columns must be a string or a list-like of strings")

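    # Illustrative usage sketch (comment only, not part of the module): this helper
    # normalizes the caller's input into a plain list of column names, e.g.:
    #
    #     Constraint._columnsFromListOrString("id")            # -> ["id"]
    #     Constraint._columnsFromListOrString(["id", "code"])  # -> ["id", "code"]
    #     Constraint._columnsFromListOrString(("id", "code"))  # -> ["id", "code"]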
    @staticmethod
    def _generate_relation_expression(column, relation, valueExpression):
        """ Generate comparison expression

        :param column: Column to generate comparison against
        :param relation: relation to implement (one of the SUPPORTED_OPERATORS)
        :param valueExpression: expression to compare to
        :return: relation expression as a Pyspark SQL Column
        """
        if relation == ">":
            return column > valueExpression
        elif relation == ">=":
            return column >= valueExpression
        elif relation == "<":
            return column < valueExpression
        elif relation == "<=":
            return column <= valueExpression
        elif relation in ["!=", "<>"]:
            return column != valueExpression
        elif relation in ["=", "=="]:
            return column == valueExpression
        else:
            raise ValueError(f"Unsupported relation type '{relation}'")

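    # Illustrative usage sketch (comment only, not part of the module): given a Pyspark
    # Column and one of the SUPPORTED_OPERATORS, this builds the matching comparison, e.g.:
    #
    #     import pyspark.sql.functions as F
    #     expr = Constraint._generate_relation_expression(F.col("id"), ">=", F.lit(10))
    #     # `expr` is equivalent to the expression `F.col("id") >= F.lit(10)`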
    @staticmethod
    def mkCombinedConstraintExpression(constraintExpressions):
        """ Generate a SQL expression that combines multiple constraints using AND

        :param constraintExpressions: list of Pyspark SQL Column constraint expression objects
        :return: combined constraint expression as Pyspark SQL Column object (or None if no valid expressions)
        """
        assert constraintExpressions is not None and isinstance(constraintExpressions, list), \
            "Constraints must be a list of Pyspark SQL Column instances"

        assert all(expr is None or isinstance(expr, Column) for expr in constraintExpressions), \
            "Constraint expressions must be Pyspark SQL columns or None"

        valid_constraint_expressions = [expr for expr in constraintExpressions if expr is not None]

        if len(valid_constraint_expressions) > 0:
            combined_constraint_expression = valid_constraint_expressions[0]

            for additional_constraint in valid_constraint_expressions[1:]:
                combined_constraint_expression = combined_constraint_expression & additional_constraint

            return combined_constraint_expression
        else:
            return None

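    # Illustrative usage sketch (comment only, not part of the module): `None` entries are
    # dropped and the remaining expressions are combined with `&` (logical AND), e.g.:
    #
    #     import pyspark.sql.functions as F
    #     combined = Constraint.mkCombinedConstraintExpression(
    #         [F.col("id") > 0, None, F.col("code") < 10])
    #     # `combined` is equivalent to `(F.col("id") > 0) & (F.col("code") < 10)`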
    @abstractmethod
    def prepareDataGenerator(self, dataGenerator):
        """ Prepare the data generator to generate data that matches the constraint

        This method may modify the data generation rules to meet the constraint

        :param dataGenerator: Data generation object that will generate the dataframe
        :return: modified or unmodified data generator
        """
        raise NotImplementedError("Method prepareDataGenerator must be implemented in derived class")

    @abstractmethod
    def transformDataframe(self, dataGenerator, dataFrame):
        """ Transform the dataframe to make data conform to constraint if possible

        This method should not modify the dataGenerator but may modify the dataframe

        :param dataGenerator: Data generation object that generated the dataframe
        :param dataFrame: generated dataframe
        :return: modified or unmodified Spark dataframe
        """
        raise NotImplementedError("Method transformDataframe must be implemented in derived class")

    @abstractmethod
    def _generateFilterExpression(self):
        """ Generate a Pyspark SQL expression that may be used for filtering"""
        raise NotImplementedError("Method _generateFilterExpression must be implemented in derived class")

    @property
    def supportsStreaming(self):
        """ Return True if the constraint supports streaming dataframes"""
        return self._supportsStreaming

    @property
    def filterExpression(self):
        """ Return the filter expression (as an instance of type Column that evaluates to true or false
            for each row), computing and caching it on first access"""
        if not self._calculatedFilterExpression:
            self._filterExpression = self._generateFilterExpression()
            self._calculatedFilterExpression = True
        return self._filterExpression

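    # Illustrative usage sketch (comment only, not part of the module): the filter expression
    # is computed once, cached, and can then be applied to a generated dataframe, e.g.:
    #
    #     filtered_df = df.where(constraint.filterExpression)   # `constraint` is an instance
    #                                                           # of a concrete subclass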

class NoFilterMixin:
    """ Mixin class to indicate that the constraint has no filter expression

    Intended to be used in the implementation of concrete constraint classes.

    Use of the mixin class is optional, but when used with the Constraint class and multiple inheritance,
    it will provide a default implementation of the _generateFilterExpression method that satisfies
    the abstract method requirement of the Constraint class.

    When using mixins, place the mixin class first in the list of base classes.
    """
    def _generateFilterExpression(self):
        """ Generate a Pyspark SQL expression that may be used for filtering"""
        return None

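# Illustrative sketch (comment only, not part of the module): a hypothetical constraint that
# transforms the generated dataframe but contributes no filter expression could be written as:
#
#     class HypotheticalTransformOnlyConstraint(NoFilterMixin, Constraint):
#         def prepareDataGenerator(self, dataGenerator):
#             return dataGenerator
#
#         def transformDataframe(self, dataGenerator, dataFrame):
#             return dataFrame.dropDuplicates()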

class NoPrepareTransformMixin:
    """ Mixin class to indicate that the constraint needs no preparation or transformation

    Intended to be used in the implementation of concrete constraint classes.

    Use of the mixin class is optional, but when used with the Constraint class and multiple inheritance,
    it will provide default implementations of the `prepareDataGenerator` and `transformDataframe` methods
    that satisfy the abstract method requirements of the Constraint class.

    When using mixins, place the mixin class first in the list of base classes.
    """
    def prepareDataGenerator(self, dataGenerator):
        """ Prepare the data generator to generate data that matches the constraint

        This method may modify the data generation rules to meet the constraint

        :param dataGenerator: Data generation object that will generate the dataframe
        :return: modified or unmodified data generator
        """
        return dataGenerator

    def transformDataframe(self, dataGenerator, dataFrame):
        """ Transform the dataframe to make data conform to constraint if possible

        This method should not modify the dataGenerator but may modify the dataframe

        :param dataGenerator: Data generation object that generated the dataframe
        :param dataFrame: generated dataframe
        :return: modified or unmodified Spark dataframe

        The default transformation returns the dataframe unmodified
        """
        return dataFrame
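# Illustrative sketch (comment only, not part of the module): a hypothetical filter-only
# constraint can combine this mixin with the Constraint base class, e.g.:
#
#     import pyspark.sql.functions as F
#
#     class HypotheticalPositiveValuesConstraint(NoPrepareTransformMixin, Constraint):
#         def __init__(self, column):
#             super().__init__(supportsStreaming=True)
#             self._column = column
#
#         def _generateFilterExpression(self):
#             return F.col(self._column) > 0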