Skip to content

Commit 1baa2b7

Browse files
committed
Teach _wrap_extension_value to handle empty arrays
Return a wrapped empty extension array for chunked storage arrays with no chunks, preserving extension metadata. Expand UUID UDF regression to support chunked inputs, test empty chunked returns, and ensure UUID extension type remains intact through UDF chaining.
1 parent 153b5f1 commit 1baa2b7

File tree

2 files changed

+40
-2
lines changed

2 files changed

+40
-2
lines changed

python/datafusion/user_defined.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,10 @@ def _wrap_extension_value(value: PyArrowArrayT, data_type: pa.DataType) -> PyArr
126126
return wrap_array(value)
127127
if isinstance(value, pa.ChunkedArray) and value.type.equals(storage_type):
128128
wrapped_chunks = [wrap_array(chunk) for chunk in value.chunks]
129-
return pa.chunked_array(wrapped_chunks)
129+
if not wrapped_chunks:
130+
empty_storage = pa.array([], type=storage_type)
131+
return wrap_array(empty_storage)
132+
return pa.chunked_array(wrapped_chunks, type=data_type)
130133
return value
131134

132135

python/tests/test_udf.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,12 @@ def test_uuid_extension_chain(ctx) -> None:
147147
name="uuid_identity",
148148
)
149149

150-
def ensure_extension(values: pa.Array) -> pa.Array:
150+
def ensure_extension(values: pa.Array | pa.ChunkedArray) -> pa.Array:
151+
if isinstance(values, pa.ChunkedArray):
152+
assert values.type.equals(uuid_type)
153+
return values.combine_chunks()
151154
assert isinstance(values, pa.ExtensionArray)
155+
assert values.type.equals(uuid_type)
152156
return values
153157

154158
second = udf(
@@ -191,3 +195,34 @@ def ensure_extension(values: pa.Array) -> pa.Array:
191195
assert result.type.equals(uuid_type)
192196

193197
assert result.equals(expected)
198+
199+
empty_storage = pa.array([], type=uuid_type.storage_type)
200+
empty_batch = pa.RecordBatch.from_arrays(
201+
[uuid_type.wrap_array(empty_storage)],
202+
names=["uuid_col"],
203+
)
204+
205+
empty_first = udf(
206+
lambda values: pa.chunked_array([], type=uuid_type.storage_type),
207+
[uuid_field],
208+
uuid_field,
209+
volatility="immutable",
210+
name="uuid_empty_chunk",
211+
)
212+
213+
empty_df = ctx.create_dataframe([[empty_batch]])
214+
empty_result = (
215+
empty_df.select(second(empty_first(column("uuid_col"))))
216+
.collect()[0]
217+
.column(0)
218+
)
219+
220+
expected_empty = uuid_type.wrap_array(empty_storage)
221+
222+
if isinstance(empty_result, pa.ChunkedArray):
223+
assert empty_result.type.equals(uuid_type)
224+
assert empty_result.combine_chunks().equals(expected_empty)
225+
else:
226+
assert isinstance(empty_result, pa.ExtensionArray)
227+
assert empty_result.type.equals(uuid_type)
228+
assert empty_result.equals(expected_empty)

0 commit comments

Comments
 (0)