From 91a6f5604ffe11b610ab94a09b54ee0111b8228c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 7 Nov 2025 18:30:47 +0000
Subject: [PATCH 1/4] Initial plan


From 5b0232562bb9ee4d0d0926b7f1ec1d35187be356 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 7 Nov 2025 18:41:53 +0000
Subject: [PATCH 2/4] Implement validation_rules for Array column with inner
 validation support

Co-authored-by: borchero <22455425+borchero@users.noreply.github.com>
---
 dataframely/columns/array.py     | 31 ++++++++++++++++++++----------
 tests/column_types/test_array.py | 33 +++++++++++++++++++++++++++-----
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py
index 2ff1b06..8cef317 100644
--- a/dataframely/columns/array.py
+++ b/dataframely/columns/array.py
@@ -11,6 +11,7 @@
 import polars as pl
 
 from dataframely._compat import pa, sa, sa_TypeEngine
+from dataframely._polars import PolarsDataType
 from dataframely.random import Generator
 
 from ._base import Check, Column
@@ -42,7 +43,7 @@ def __init__(
     ):
         """
         Args:
-            inner: The inner column type. No validation rules on the inner type are supported yet.
+            inner: The inner column type.
             shape: The shape of the array.
             nullable: Whether this column may contain null values.
             primary_key: Whether this column is part of the primary key of the schema.
@@ -72,15 +73,6 @@ def __init__(
                 "`primary_key=True` is not yet supported for inner types of the Array type."
             )
 
-        # We disallow validation rules on the inner type since Polars arrays currently don't support .eval(). Converting
-        # to a list and calling .list.eval() is possible, however, since the shape can have multiple axes, the recursive
-        # conversion could have significant performance impact. Hence, we simply disallow inner validation rules.
-        # Another option would be to allow validation rules only for sampling, but not enforce them.
-        if inner.validation_rules(pl.lit(None)):
-            raise ValueError(
-                "Validation rules on the inner type of Array are not yet supported."
-            )
-
         super().__init__(
             nullable=nullable,
             primary_key=False,
@@ -95,6 +87,25 @@ def __init__(
     def dtype(self) -> pl.DataType:
         return pl.Array(self.inner.dtype, self.shape)
 
+    def validate_dtype(self, dtype: PolarsDataType) -> bool:
+        if not isinstance(dtype, pl.Array):
+            return False
+        # Compare the constructed dtype directly - this handles both flat and nested cases
+        return self.dtype == dtype
+
+    def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
+        inner_rules = {
+            f"inner_{rule_name}": expr.arr.eval(inner_expr).arr.all()
+            for rule_name, inner_expr in self.inner.validation_rules(
+                pl.element()
+            ).items()
+        }
+
+        return {
+            **super().validation_rules(expr),
+            **inner_rules,
+        }
+
     def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
         # NOTE: We might want to add support for PostgreSQL's ARRAY type or use JSON in the future.
         raise NotImplementedError("SQL column cannot have 'Array' type.")
diff --git a/tests/column_types/test_array.py b/tests/column_types/test_array.py
index 4b53911..b426df7 100644
--- a/tests/column_types/test_array.py
+++ b/tests/column_types/test_array.py
@@ -6,7 +6,7 @@
 
 import dataframely as dy
 from dataframely.columns._base import Column
-from dataframely.testing import create_schema
+from dataframely.testing import create_schema, validation_mask
 
 
 @pytest.mark.parametrize(
@@ -142,10 +142,33 @@ def test_array_with_inner_pk() -> None:
 
 
 def test_array_with_rules() -> None:
-    with pytest.raises(ValueError):
-        create_schema(
-            "test", {"a": dy.Array(dy.String(min_length=2, nullable=False), 1)}
-        )
+    schema = create_schema(
+        "test", {"a": dy.Array(dy.String(min_length=2, nullable=False), 1)}
+    )
+    df = pl.DataFrame(
+        {"a": [["ab"], ["a"], [None]]},
+        schema={"a": pl.Array(pl.String, 1)},
+    )
+    _, failures = schema.filter(df)
+    assert validation_mask(df, failures).to_list() == [True, False, False]
+    assert failures.counts() == {"a|inner_nullability": 1, "a|inner_min_length": 1}
+
+
+def test_nested_array_with_rules() -> None:
+    schema = create_schema(
+        "test", {"a": dy.Array(dy.Array(dy.String(min_length=2, nullable=False), 1), 1)}
+    )
+    df = pl.DataFrame(
+        {"a": [[["ab"]], [["a"]], [[None]]]},
+        schema={"a": pl.Array(pl.String, (1, 1))},
+    )
+    _, failures = schema.filter(df)
+    # NOTE: `validation_mask` currently fails for multiply nested arrays
+    assert failures.invalid().to_dict(as_series=False) == {"a": [[["a"]], [[None]]]}
+    assert failures.counts() == {
+        "a|inner_inner_nullability": 1,
+        "a|inner_inner_min_length": 1,
+    }
 
 
 def test_outer_nullability() -> None:

From c56a038579bf4be27a6ba1a96d6e2a1d8cb44681 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 7 Nov 2025 18:47:23 +0000
Subject: [PATCH 3/4] Add docstrings to validate_dtype and validation_rules
 methods

Co-authored-by: borchero <22455425+borchero@users.noreply.github.com>
---
 dataframely/columns/array.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py
index 8cef317..ceef3e2 100644
--- a/dataframely/columns/array.py
+++ b/dataframely/columns/array.py
@@ -88,12 +88,39 @@ def dtype(self) -> pl.DataType:
         return pl.Array(self.inner.dtype, self.shape)
 
     def validate_dtype(self, dtype: PolarsDataType) -> bool:
+        """Validate if the polars data type satisfies the column definition.
+
+        Args:
+            dtype: The dtype to validate.
+
+        Returns:
+            Whether the dtype is valid.
+
+        Note:
+            This method handles both flat and nested array representations.
+            For example, ``Array(Array(String, 1), 1)`` and ``Array(String, (1, 1))``
+            both produce the same polars dtype and are considered equivalent.
+        """
         if not isinstance(dtype, pl.Array):
             return False
         # Compare the constructed dtype directly - this handles both flat and nested cases
         return self.dtype == dtype
 
     def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
+        """Return validation rules for this array column.
+
+        This method validates both the array itself and its inner elements.
+        Inner validation rules are applied using ``arr.eval()`` and are prefixed
+        with ``inner_`` to distinguish them from outer array validations.
+
+        Args:
+            expr: An expression referencing the column of the data frame.
+
+        Returns:
+            A mapping from validation rule names to expressions that provide exactly
+            one boolean value per column item indicating whether validation with respect
+            to the rule is successful.
+        """
         inner_rules = {
             f"inner_{rule_name}": expr.arr.eval(inner_expr).arr.all()
             for rule_name, inner_expr in self.inner.validation_rules(

From 0872a2c1529c1f26bb80de23221bbb00c28c09ab Mon Sep 17 00:00:00 2001
From: Oliver Borchert <oliver.borchert@quantco.com>
Date: Wed, 12 Nov 2025 18:25:16 +0100
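Subject: [PATCH 4/4] Support inner primary keys for Array columns

Extract the duplicate check used by List into `_list_primary_key_check` and
reuse it for Array, so that `primary_key=True` on the inner type of an Array
is validated instead of being rejected. Also drop the `validate_dtype`
override and the method docstrings from array.py, and update the tests.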

---
 dataframely/columns/array.py     | 49 ++++----------------------
 dataframely/columns/list.py      | 59 +++++++++++++++++++-------------
 tests/column_types/test_array.py | 25 ++++----------
 tests/columns/test_pyarrow.py    |  5 +--
 4 files changed, 49 insertions(+), 89 deletions(-)

diff --git a/dataframely/columns/array.py b/dataframely/columns/array.py
index ceef3e2..f7fb593 100644
--- a/dataframely/columns/array.py
+++ b/dataframely/columns/array.py
@@ -11,12 +11,11 @@
 import polars as pl
 
 from dataframely._compat import pa, sa, sa_TypeEngine
-from dataframely._polars import PolarsDataType
 from dataframely.random import Generator
 
 from ._base import Check, Column
 from ._registry import column_from_dict, register
-from .struct import Struct
+from .list import _list_primary_key_check
 
 if sys.version_info >= (3, 11):
     from typing import Self
@@ -65,14 +64,6 @@ def __init__(
                 names, the specified alias is the only valid name.
             metadata: A dictionary of metadata to attach to the column.
         """
-        if inner.primary_key or (
-            isinstance(inner, Struct)
-            and any(col.primary_key for col in inner.inner.values())
-        ):
-            raise ValueError(
-                "`primary_key=True` is not yet supported for inner types of the Array type."
-            )
-
         super().__init__(
             nullable=nullable,
             primary_key=False,
@@ -87,40 +78,7 @@ def __init__(
     def dtype(self) -> pl.DataType:
         return pl.Array(self.inner.dtype, self.shape)
 
-    def validate_dtype(self, dtype: PolarsDataType) -> bool:
-        """Validate if the polars data type satisfies the column definition.
-
-        Args:
-            dtype: The dtype to validate.
-
-        Returns:
-            Whether the dtype is valid.
-
-        Note:
-            This method handles both flat and nested array representations.
-            For example, ``Array(Array(String, 1), 1)`` and ``Array(String, (1, 1))``
-            both produce the same polars dtype and are considered equivalent.
-        """
-        if not isinstance(dtype, pl.Array):
-            return False
-        # Compare the constructed dtype directly - this handles both flat and nested cases
-        return self.dtype == dtype
-
     def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
-        """Return validation rules for this array column.
-
-        This method validates both the array itself and its inner elements.
-        Inner validation rules are applied using ``arr.eval()`` and are prefixed
-        with ``inner_`` to distinguish them from outer array validations.
-
-        Args:
-            expr: An expression referencing the column of the data frame.
-
-        Returns:
-            A mapping from validation rule names to expressions that provide exactly
-            one boolean value per column item indicating whether validation with respect
-            to the rule is successful.
-        """
         inner_rules = {
             f"inner_{rule_name}": expr.arr.eval(inner_expr).arr.all()
             for rule_name, inner_expr in self.inner.validation_rules(
@@ -128,8 +86,13 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
             ).items()
         }
 
+        array_rules: dict[str, pl.Expr] = {}
+        if (rule := _list_primary_key_check(expr.arr, self.inner)) is not None:
+            array_rules["primary_key"] = rule
+
         return {
             **super().validation_rules(expr),
+            **array_rules,
             **inner_rules,
         }
 
diff --git a/dataframely/columns/list.py b/dataframely/columns/list.py
index 0804cf1..6d3338e 100644
--- a/dataframely/columns/list.py
+++ b/dataframely/columns/list.py
@@ -8,6 +8,8 @@
 from typing import Any, cast
 
 import polars as pl
+from polars.expr.array import ExprArrayNameSpace
+from polars.expr.list import ExprListNameSpace
 
 from dataframely._compat import pa, sa, sa_TypeEngine
 from dataframely._polars import PolarsDataType
@@ -97,29 +99,8 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
         }
 
         list_rules: dict[str, pl.Expr] = {}
-        if self.inner.primary_key:
-            list_rules["primary_key"] = ~expr.list.eval(
-                pl.element().is_duplicated()
-            ).list.any()
-        elif isinstance(self.inner, Struct) and any(
-            col.primary_key for col in self.inner.inner.values()
-        ):
-            primary_key_columns = [
-                name for name, col in self.inner.inner.items() if col.primary_key
-            ]
-            # NOTE: We optimize for a single primary key column here as it is much
-            #  faster to run duplication checks for non-struct types in polars 1.22.
-            if len(primary_key_columns) == 1:
-                list_rules["primary_key"] = ~expr.list.eval(
-                    pl.element().struct.field(primary_key_columns[0]).is_duplicated()
-                ).list.any()
-            else:
-                list_rules["primary_key"] = ~expr.list.eval(
-                    pl.struct(
-                        pl.element().struct.field(primary_key_columns)
-                    ).is_duplicated()
-                ).list.any()
-
+        if (rule := _list_primary_key_check(expr.list, self.inner)) is not None:
+            list_rules["primary_key"] = rule
         if self.min_length is not None:
             list_rules["min_length"] = (
                 pl.when(expr.is_null())
@@ -187,3 +168,35 @@ def as_dict(self, expr: pl.Expr) -> dict[str, Any]:
     def from_dict(cls, data: dict[str, Any]) -> Self:
         data["inner"] = column_from_dict(data["inner"])
         return super().from_dict(data)
+
+
+def _list_primary_key_check(
+    list_expr: ExprListNameSpace | ExprArrayNameSpace, inner: Column
+) -> pl.Expr | None:
+    def list_any(expr: pl.Expr) -> pl.Expr:
+        if isinstance(list_expr, ExprListNameSpace):
+            return expr.list.any()
+        return expr.arr.any()
+
+    if inner.primary_key:
+        return ~list_expr.eval(pl.element().is_duplicated()).pipe(list_any)
+    elif isinstance(inner, Struct) and any(
+        col.primary_key for col in inner.inner.values()
+    ):
+        primary_key_columns = [
+            name for name, col in inner.inner.items() if col.primary_key
+        ]
+        # NOTE: We optimize for a single primary key column here as it is much
+        #  faster to run duplication checks for non-struct types in polars 1.22.
+        if len(primary_key_columns) == 1:
+            return ~list_expr.eval(
+                pl.element().struct.field(primary_key_columns[0]).is_duplicated()
+            ).pipe(list_any)
+        else:
+            return ~list_expr.eval(
+                pl.struct(
+                    pl.element().struct.field(primary_key_columns)
+                ).is_duplicated()
+            ).pipe(list_any)
+
+    return None
diff --git a/tests/column_types/test_array.py b/tests/column_types/test_array.py
index b426df7..fdb1c3f 100644
--- a/tests/column_types/test_array.py
+++ b/tests/column_types/test_array.py
@@ -132,15 +132,6 @@ def test_nested_array() -> None:
     )
 
 
-def test_array_with_inner_pk() -> None:
-    with pytest.raises(ValueError):
-        column = dy.Array(dy.String(primary_key=True), 2)
-        create_schema(
-            "test",
-            {"a": column},
-        )
-
-
 def test_array_with_rules() -> None:
     schema = create_schema(
         "test", {"a": dy.Array(dy.String(min_length=2, nullable=False), 1)}
@@ -154,21 +145,17 @@ def test_array_with_rules() -> None:
     assert failures.counts() == {"a|inner_nullability": 1, "a|inner_min_length": 1}
 
 
-def test_nested_array_with_rules() -> None:
+def test_array_with_primary_key_rule() -> None:
     schema = create_schema(
-        "test", {"a": dy.Array(dy.Array(dy.String(min_length=2, nullable=False), 1), 1)}
+        "test", {"a": dy.Array(dy.String(min_length=2, primary_key=True), 2)}
     )
     df = pl.DataFrame(
-        {"a": [[["ab"]], [["a"]], [[None]]]},
-        schema={"a": pl.Array(pl.String, (1, 1))},
+        {"a": [["ab", "ab"], ["cd", "de"], ["def", "ghi"]]},
+        schema={"a": pl.Array(pl.String, 2)},
     )
     _, failures = schema.filter(df)
-    # NOTE: `validation_mask` currently fails for multiply nested arrays
-    assert failures.invalid().to_dict(as_series=False) == {"a": [[["a"]], [[None]]]}
-    assert failures.counts() == {
-        "a|inner_inner_nullability": 1,
-        "a|inner_inner_min_length": 1,
-    }
+    assert validation_mask(df, failures).to_list() == [False, True, True]
+    assert failures.counts() == {"a|primary_key": 1}
 
 
 def test_outer_nullability() -> None:
diff --git a/tests/columns/test_pyarrow.py b/tests/columns/test_pyarrow.py
index 5ae35ea..8a288b5 100644
--- a/tests/columns/test_pyarrow.py
+++ b/tests/columns/test_pyarrow.py
@@ -61,10 +61,7 @@ def test_equal_polars_schema_enum(categories: list[str]) -> None:
     "inner",
     [c() for c in ALL_COLUMN_TYPES]
     + [dy.List(t()) for t in ALL_COLUMN_TYPES]
-    + [
-        dy.Array(t() if t == dy.Any else t(nullable=True), 1)
-        for t in NO_VALIDATION_COLUMN_TYPES
-    ]
+    + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES]
     + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES],
 )
 def test_equal_polars_schema_list(inner: Column) -> None: