From 91a6f5604ffe11b610ab94a09b54ee0111b8228c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 18:30:47 +0000 Subject: [PATCH 1/4] Initial plan From 5b0232562bb9ee4d0d0926b7f1ec1d35187be356 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 18:41:53 +0000 Subject: [PATCH 2/4] Implement validation_rules for Array column with inner validation support Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 31 ++++++++++++++++++++---------- tests/column_types/test_array.py | 33 +++++++++++++++++++++++++++----- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index 2ff1b06..8cef317 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -11,6 +11,7 @@ import polars as pl from dataframely._compat import pa, sa, sa_TypeEngine +from dataframely._polars import PolarsDataType from dataframely.random import Generator from ._base import Check, Column @@ -42,7 +43,7 @@ def __init__( ): """ Args: - inner: The inner column type. No validation rules on the inner type are supported yet. + inner: The inner column type. shape: The shape of the array. nullable: Whether this column may contain null values. primary_key: Whether this column is part of the primary key of the schema. @@ -72,15 +73,6 @@ def __init__( "`primary_key=True` is not yet supported for inner types of the Array type." ) - # We disallow validation rules on the inner type since Polars arrays currently don't support .eval(). Converting - # to a list and calling .list.eval() is possible, however, since the shape can have multiple axes, the recursive - # conversion could have significant performance impact. Hence, we simply disallow inner validation rules. - # Another option would be to allow validation rules only for sampling, but not enforce them. - if inner.validation_rules(pl.lit(None)): - raise ValueError( - "Validation rules on the inner type of Array are not yet supported." - ) - super().__init__( nullable=nullable, primary_key=False, @@ -95,6 +87,25 @@ def __init__( def dtype(self) -> pl.DataType: return pl.Array(self.inner.dtype, self.shape) + def validate_dtype(self, dtype: PolarsDataType) -> bool: + if not isinstance(dtype, pl.Array): + return False + # Compare the constructed dtype directly - this handles both flat and nested cases + return self.dtype == dtype + + def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]: + inner_rules = { + f"inner_{rule_name}": expr.arr.eval(inner_expr).arr.all() + for rule_name, inner_expr in self.inner.validation_rules( + pl.element() + ).items() + } + + return { + **super().validation_rules(expr), + **inner_rules, + } + def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine: # NOTE: We might want to add support for PostgreSQL's ARRAY type or use JSON in the future. raise NotImplementedError("SQL column cannot have 'Array' type.") diff --git a/tests/column_types/test_array.py b/tests/column_types/test_array.py index 4b53911..b426df7 100644 --- a/tests/column_types/test_array.py +++ b/tests/column_types/test_array.py @@ -6,7 +6,7 @@ import dataframely as dy from dataframely.columns._base import Column -from dataframely.testing import create_schema +from dataframely.testing import create_schema, validation_mask @pytest.mark.parametrize( @@ -142,10 +142,33 @@ def test_array_with_inner_pk() -> None: def test_array_with_rules() -> None: - with pytest.raises(ValueError): - create_schema( - "test", {"a": dy.Array(dy.String(min_length=2, nullable=False), 1)} - ) + schema = create_schema( + "test", {"a": dy.Array(dy.String(min_length=2, nullable=False), 1)} + ) + df = pl.DataFrame( + {"a": [["ab"], ["a"], [None]]}, + schema={"a": pl.Array(pl.String, 1)}, + ) + _, failures = schema.filter(df) + assert validation_mask(df, failures).to_list() == [True, False, False] + assert failures.counts() == {"a|inner_nullability": 1, "a|inner_min_length": 1} + + +def test_nested_array_with_rules() -> None: + schema = create_schema( + "test", {"a": dy.Array(dy.Array(dy.String(min_length=2, nullable=False), 1), 1)} + ) + df = pl.DataFrame( + {"a": [[["ab"]], [["a"]], [[None]]]}, + schema={"a": pl.Array(pl.String, (1, 1))}, + ) + _, failures = schema.filter(df) + # NOTE: `validation_mask` currently fails for multiply nested arrays + assert failures.invalid().to_dict(as_series=False) == {"a": [[["a"]], [[None]]]} + assert failures.counts() == { + "a|inner_inner_nullability": 1, + "a|inner_inner_min_length": 1, + } def test_outer_nullability() -> None: From c56a038579bf4be27a6ba1a96d6e2a1d8cb44681 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 18:47:23 +0000 Subject: [PATCH 3/4] Add docstrings to validate_dtype and validation_rules methods Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> --- dataframely/columns/array.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index 8cef317..ceef3e2 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -88,12 +88,39 @@ def dtype(self) -> pl.DataType: return pl.Array(self.inner.dtype, self.shape) def validate_dtype(self, dtype: PolarsDataType) -> bool: + """Validate if the polars data type satisfies the column definition. + + Args: + dtype: The dtype to validate. + + Returns: + Whether the dtype is valid. + + Note: + This method handles both flat and nested array representations. + For example, ``Array(Array(String, 1), 1)`` and ``Array(String, (1, 1))`` + both produce the same polars dtype and are considered equivalent. + """ if not isinstance(dtype, pl.Array): return False # Compare the constructed dtype directly - this handles both flat and nested cases return self.dtype == dtype def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]: + """Return validation rules for this array column. + + This method validates both the array itself and its inner elements. + Inner validation rules are applied using ``arr.eval()`` and are prefixed + with ``inner_`` to distinguish them from outer array validations. + + Args: + expr: An expression referencing the column of the data frame. + + Returns: + A mapping from validation rule names to expressions that provide exactly + one boolean value per column item indicating whether validation with respect + to the rule is successful. + """ inner_rules = { f"inner_{rule_name}": expr.arr.eval(inner_expr).arr.all() for rule_name, inner_expr in self.inner.validation_rules( From 0872a2c1529c1f26bb80de23221bbb00c28c09ab Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Wed, 12 Nov 2025 18:25:16 +0100 Subject: [PATCH 4/4] Fix --- dataframely/columns/array.py | 49 ++++---------------------- dataframely/columns/list.py | 59 +++++++++++++++++++------------- tests/column_types/test_array.py | 25 ++++---------- tests/columns/test_pyarrow.py | 5 +-- 4 files changed, 49 insertions(+), 89 deletions(-) diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py index ceef3e2..f7fb593 100644 --- a/dataframely/columns/array.py +++ b/dataframely/columns/array.py @@ -11,12 +11,11 @@ import polars as pl from dataframely._compat import pa, sa, sa_TypeEngine -from dataframely._polars import PolarsDataType from dataframely.random import Generator from ._base import Check, Column from ._registry import column_from_dict, register -from .struct import Struct +from .list import _list_primary_key_check if sys.version_info >= (3, 11): from typing import Self @@ -65,14 +64,6 @@ def __init__( names, the specified alias is the only valid name. metadata: A dictionary of metadata to attach to the column. """ - if inner.primary_key or ( - isinstance(inner, Struct) - and any(col.primary_key for col in inner.inner.values()) - ): - raise ValueError( - "`primary_key=True` is not yet supported for inner types of the Array type." - ) - super().__init__( nullable=nullable, primary_key=False, @@ -87,40 +78,7 @@ def __init__( def dtype(self) -> pl.DataType: return pl.Array(self.inner.dtype, self.shape) - def validate_dtype(self, dtype: PolarsDataType) -> bool: - """Validate if the polars data type satisfies the column definition. - - Args: - dtype: The dtype to validate. - - Returns: - Whether the dtype is valid. - - Note: - This method handles both flat and nested array representations. - For example, ``Array(Array(String, 1), 1)`` and ``Array(String, (1, 1))`` - both produce the same polars dtype and are considered equivalent. - """ - if not isinstance(dtype, pl.Array): - return False - # Compare the constructed dtype directly - this handles both flat and nested cases - return self.dtype == dtype - def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]: - """Return validation rules for this array column. - - This method validates both the array itself and its inner elements. - Inner validation rules are applied using ``arr.eval()`` and are prefixed - with ``inner_`` to distinguish them from outer array validations. - - Args: - expr: An expression referencing the column of the data frame. - - Returns: - A mapping from validation rule names to expressions that provide exactly - one boolean value per column item indicating whether validation with respect - to the rule is successful. - """ inner_rules = { f"inner_{rule_name}": expr.arr.eval(inner_expr).arr.all() for rule_name, inner_expr in self.inner.validation_rules( @@ -128,8 +86,13 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]: ).items() } + array_rules: dict[str, pl.Expr] = {} + if (rule := _list_primary_key_check(expr.arr, self.inner)) is not None: + array_rules["primary_key"] = rule + return { **super().validation_rules(expr), + **array_rules, **inner_rules, } diff --git a/dataframely/columns/list.py b/dataframely/columns/list.py index 0804cf1..6d3338e 100644 --- a/dataframely/columns/list.py +++ b/dataframely/columns/list.py @@ -8,6 +8,8 @@ from typing import Any, cast import polars as pl +from polars.expr.array import ExprArrayNameSpace +from polars.expr.list import ExprListNameSpace from dataframely._compat import pa, sa, sa_TypeEngine from dataframely._polars import PolarsDataType @@ -97,29 +99,8 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]: } list_rules: dict[str, pl.Expr] = {} - if self.inner.primary_key: - list_rules["primary_key"] = ~expr.list.eval( - pl.element().is_duplicated() - ).list.any() - elif isinstance(self.inner, Struct) and any( - col.primary_key for col in self.inner.inner.values() - ): - primary_key_columns = [ - name for name, col in self.inner.inner.items() if col.primary_key - ] - # NOTE: We optimize for a single primary key column here as it is much - # faster to run duplication checks for non-struct types in polars 1.22. - if len(primary_key_columns) == 1: - list_rules["primary_key"] = ~expr.list.eval( - pl.element().struct.field(primary_key_columns[0]).is_duplicated() - ).list.any() - else: - list_rules["primary_key"] = ~expr.list.eval( - pl.struct( - pl.element().struct.field(primary_key_columns) - ).is_duplicated() - ).list.any() - + if (rule := _list_primary_key_check(expr.list, self.inner)) is not None: + list_rules["primary_key"] = rule if self.min_length is not None: list_rules["min_length"] = ( pl.when(expr.is_null()) @@ -187,3 +168,35 @@ def as_dict(self, expr: pl.Expr) -> dict[str, Any]: def from_dict(cls, data: dict[str, Any]) -> Self: data["inner"] = column_from_dict(data["inner"]) return super().from_dict(data) + + +def _list_primary_key_check( + list_expr: ExprListNameSpace | ExprArrayNameSpace, inner: Column +) -> pl.Expr | None: + def list_any(expr: pl.Expr) -> pl.Expr: + if isinstance(list_expr, ExprListNameSpace): + return expr.list.any() + return expr.arr.any() + + if inner.primary_key: + return ~list_expr.eval(pl.element().is_duplicated()).pipe(list_any) + elif isinstance(inner, Struct) and any( + col.primary_key for col in inner.inner.values() + ): + primary_key_columns = [ + name for name, col in inner.inner.items() if col.primary_key + ] + # NOTE: We optimize for a single primary key column here as it is much + # faster to run duplication checks for non-struct types in polars 1.22. + if len(primary_key_columns) == 1: + return ~list_expr.eval( + pl.element().struct.field(primary_key_columns[0]).is_duplicated() + ).pipe(list_any) + else: + return ~list_expr.eval( + pl.struct( + pl.element().struct.field(primary_key_columns) + ).is_duplicated() + ).pipe(list_any) + + return None diff --git a/tests/column_types/test_array.py b/tests/column_types/test_array.py index b426df7..fdb1c3f 100644 --- a/tests/column_types/test_array.py +++ b/tests/column_types/test_array.py @@ -132,15 +132,6 @@ def test_nested_array() -> None: ) -def test_array_with_inner_pk() -> None: - with pytest.raises(ValueError): - column = dy.Array(dy.String(primary_key=True), 2) - create_schema( - "test", - {"a": column}, - ) - - def test_array_with_rules() -> None: schema = create_schema( "test", {"a": dy.Array(dy.String(min_length=2, nullable=False), 1)} @@ -154,21 +145,17 @@ def test_array_with_rules() -> None: assert failures.counts() == {"a|inner_nullability": 1, "a|inner_min_length": 1} -def test_nested_array_with_rules() -> None: +def test_array_with_primary_key_rule() -> None: schema = create_schema( - "test", {"a": dy.Array(dy.Array(dy.String(min_length=2, nullable=False), 1), 1)} + "test", {"a": dy.Array(dy.String(min_length=2, primary_key=True), 2)} ) df = pl.DataFrame( - {"a": [[["ab"]], [["a"]], [[None]]]}, - schema={"a": pl.Array(pl.String, (1, 1))}, + {"a": [["ab", "ab"], ["cd", "de"], ["def", "ghi"]]}, + schema={"a": pl.Array(pl.String, 2)}, ) _, failures = schema.filter(df) - # NOTE: `validation_mask` currently fails for multiply nested arrays - assert failures.invalid().to_dict(as_series=False) == {"a": [[["a"]], [[None]]]} - assert failures.counts() == { - "a|inner_inner_nullability": 1, - "a|inner_inner_min_length": 1, - } + assert validation_mask(df, failures).to_list() == [False, True, True] + assert failures.counts() == {"a|primary_key": 1} def test_outer_nullability() -> None: diff --git a/tests/columns/test_pyarrow.py b/tests/columns/test_pyarrow.py index 5ae35ea..8a288b5 100644 --- a/tests/columns/test_pyarrow.py +++ b/tests/columns/test_pyarrow.py @@ -61,10 +61,7 @@ def test_equal_polars_schema_enum(categories: list[str]) -> None: "inner", [c() for c in ALL_COLUMN_TYPES] + [dy.List(t()) for t in ALL_COLUMN_TYPES] - + [ - dy.Array(t() if t == dy.Any else t(nullable=True), 1) - for t in NO_VALIDATION_COLUMN_TYPES - ] + + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES] + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES], ) def test_equal_polars_schema_list(inner: Column) -> None: