apache · timsaucer · Nov 3, 2025 · Nov 7, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -141,6 +141,7 @@ dev = [
     "maturin>=1.8.1",
     "numpy>1.25.0;python_version<'3.14'",
     "numpy>=2.3.2;python_version>='3.14'",
+    "pyarrow>=19.0.0",
     "pre-commit>=4.3.0",
     "pyyaml>=6.0.3",
     "pytest>=7.4.4",

diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py
@@ -31,7 +31,7 @@
 
 if TYPE_CHECKING:
     _R = TypeVar("_R", bound=pa.DataType)
-    from collections.abc import Callable
+    from collections.abc import Callable, Sequence
 
 
 class Volatility(Enum):
@@ -78,6 +78,27 @@ def __str__(self) -> str:
         return self.name.lower()
 
 
+def data_type_or_field_to_field(value: pa.DataType | pa.Field, name: str) -> pa.Field:
+    """Helper function to return a Field from either a Field or DataType."""
+    if isinstance(value, pa.Field):
+        return value
+    return pa.field(name, type=value)
+
+
+def data_types_or_fields_to_field_list(
+    inputs: Sequence[pa.Field | pa.DataType] | pa.Field | pa.DataType,
+) -> list[pa.Field]:
+    """Helper function to return a list of Fields."""
+    if isinstance(inputs, pa.DataType):
+        return [pa.field("value", type=inputs)]
+    if isinstance(inputs, pa.Field):
+        return [inputs]
+
+    return [
+        data_type_or_field_to_field(v, f"value_{idx}") for (idx, v) in enumerate(inputs)
+    ]
+
+
 class ScalarUDFExportable(Protocol):
     """Type hint for object that has __datafusion_scalar_udf__ PyCapsule."""
 
@@ -95,7 +116,7 @@ def __init__(
         self,
         name: str,
         func: Callable[..., _R],
-        input_types: pa.DataType | list[pa.DataType],
+        input_types: list[pa.Field],
         return_type: _R,
         volatility: Volatility | str,
     ) -> None:
@@ -128,8 +149,8 @@ def __call__(self, *args: Expr) -> Expr:
     @overload
     @staticmethod
     def udf(
-        input_types: list[pa.DataType],
-        return_type: _R,
+        input_types: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field,
+        return_type: pa.DataType | pa.Field,
         volatility: Volatility | str,
         name: str | None = None,
     ) -> Callable[..., ScalarUDF]: ...
@@ -138,8 +159,8 @@ def udf(
     @staticmethod
     def udf(
         func: Callable[..., _R],
-        input_types: list[pa.DataType],
-        return_type: _R,
+        input_types: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field,
+        return_type: pa.DataType | pa.Field,
         volatility: Volatility | str,
         name: str | None = None,
     ) -> ScalarUDF: ...
@@ -192,8 +213,8 @@ def double_udf(x):
 
         def _function(
             func: Callable[..., _R],
-            input_types: list[pa.DataType],
-            return_type: _R,
+            input_types: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field,
+            return_type: pa.DataType | pa.Field,
             volatility: Volatility | str,
             name: str | None = None,
         ) -> ScalarUDF:
@@ -205,6 +226,8 @@ def _function(
                     name = func.__qualname__.lower()
                 else:
                     name = func.__class__.__name__.lower()
+            input_types = data_types_or_fields_to_field_list(input_types)
+            return_type = data_type_or_field_to_field(return_type, "value")
             return ScalarUDF(
                 name=name,
                 func=func,

diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py
@@ -18,6 +18,7 @@
 import pyarrow as pa
 import pytest
 from datafusion import column, udf
+from datafusion import functions as f
 
 
 @pytest.fixture
@@ -124,3 +125,26 @@ def udf_with_param(values: pa.Array) -> pa.Array:
     result = df2.collect()[0].column(0)
 
     assert result == pa.array([False, True, True])
+
+
+def test_udf_with_metadata(ctx) -> None:
+    from uuid import UUID
+
+    @udf([pa.string()], pa.uuid(), "stable")
+    def uuid_from_string(uuid_string):
+        return pa.array((UUID(s).bytes for s in uuid_string.to_pylist()), pa.uuid())
+
+    @udf([pa.uuid()], pa.int64(), "stable")
+    def uuid_version(uuid):
+        return pa.array(s.version for s in uuid.to_pylist())
+
+    batch = pa.record_batch({"idx": pa.array(range(5))})
+    results = (
+        ctx.create_dataframe([[batch]])
+        .with_column("uuid_string", f.uuid())
+        .with_column("uuid", uuid_from_string(column("uuid_string")))
+        .select(uuid_version(column("uuid").alias("uuid_version")))
+        .collect()
+    )
+
+    assert results[0][0].to_pylist() == [4, 4, 4, 4, 4]
diff --git a/src/array.rs b/src/array.rs
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::errors::PyDataFusionResult;
+use crate::utils::validate_pycapsule;
+use arrow::array::{Array, ArrayRef};
+use arrow::datatypes::{Field, FieldRef};
+use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
+use arrow::pyarrow::ToPyArrow;
+use pyo3::prelude::{PyAnyMethods, PyCapsuleMethods};
+use pyo3::types::PyCapsule;
+use pyo3::{pyclass, pymethods, Bound, PyObject, PyResult, Python};
+use std::sync::Arc;
+
+/// A Python object which implements the Arrow PyCapsule for importing
+/// into other libraries.
+///
+/// Pairs an Arrow array with a field describing it so both can be handed
+/// out together through `__arrow_c_array__`.
+#[pyclass(name = "ArrowArrayExportable", module = "datafusion", frozen)]
+#[derive(Clone)]
+pub struct PyArrowArrayExportable {
+    /// Array whose data is exported as the `arrow_array` capsule.
+    array: ArrayRef,
+    /// Field used for the exported schema when no `requested_schema` is supplied.
+    field: FieldRef,
+}
+
+#[pymethods]
+impl PyArrowArrayExportable {
+    /// Export the wrapped array as a pair of `(arrow_schema, arrow_array)` capsules.
+    ///
+    /// `requested_schema` is treated as a best-effort hint: it is used for the
+    /// exported schema only when its data type matches the wrapped array.
+    /// No cast is performed here, so honouring a request for a different type
+    /// would hand the consumer a schema that does not describe the exported
+    /// buffers. In that case the stored field is exported instead.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the requested capsule fails validation, when the
+    /// requested schema cannot be converted into a `Field`, or when either
+    /// capsule cannot be created.
+    #[pyo3(signature = (requested_schema=None))]
+    fn __arrow_c_array__<'py>(
+        &'py self,
+        py: Python<'py>,
+        requested_schema: Option<Bound<'py, PyCapsule>>,
+    ) -> PyDataFusionResult<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)> {
+        let field = match requested_schema {
+            Some(schema_capsule) => {
+                validate_pycapsule(&schema_capsule, "arrow_schema")?;
+
+                // SAFETY: relies on `validate_pycapsule` having confirmed the
+                // capsule carries the `arrow_schema` name, which the Arrow
+                // PyCapsule interface reserves for `ArrowSchema` structs.
+                let schema_ptr = unsafe { schema_capsule.reference::<FFI_ArrowSchema>() };
+                let desired_field = Field::try_from(schema_ptr)?;
+
+                if desired_field.data_type() == self.array.data_type() {
+                    Arc::new(desired_field)
+                } else {
+                    // Incompatible request: ignore it rather than mislabel the data.
+                    Arc::clone(&self.field)
+                }
+            }
+            None => Arc::clone(&self.field),
+        };
+
+        let ffi_schema = FFI_ArrowSchema::try_from(&field)?;
+        let schema_capsule = PyCapsule::new(py, ffi_schema, Some(cr"arrow_schema".into()))?;
+
+        let ffi_array = FFI_ArrowArray::new(&self.array.to_data());
+        let array_capsule = PyCapsule::new(py, ffi_array, Some(cr"arrow_array".into()))?;
+
+        Ok((schema_capsule, array_capsule))
+    }
+}
+
+impl ToPyArrow for PyArrowArrayExportable {
+    /// Build a Python object by passing a clone of `self` to `pyarrow.array`.
+    ///
+    /// Fails if `pyarrow` cannot be imported, if it has no `array` attribute,
+    /// or if the call itself raises.
+    fn to_pyarrow(&self, py: Python) -> PyResult<PyObject> {
+        let exported = py
+            .import("pyarrow")?
+            .getattr("array")?
+            .call((self.clone(),), None)?;
+        Ok(exported.unbind())
+    }
+}
+
+impl PyArrowArrayExportable {
+    /// Wrap `array` together with the `field` that describes it.
+    ///
+    /// No check is made here that `field` actually matches `array`.
+    pub fn new(array: ArrayRef, field: FieldRef) -> Self {
+        PyArrowArrayExportable { array, field }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -55,6 +55,7 @@ pub mod store;
 pub mod table;
 pub mod unparser;
 
+mod array;
 #[cfg(feature = "substrait")]
 pub mod substrait;
 #[allow(clippy::borrow_deref_ref)]