Skip to content

Commit b63e601

Browse files
ENH: add basic DataFrame.from_arrow class method for importing through Arrow PyCapsule interface
1 parent db1b8ab commit b63e601

File tree

2 files changed

+75
-0
lines changed

2 files changed

+75
-0
lines changed

pandas/core/frame.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,6 +1746,41 @@ def __rmatmul__(self, other) -> DataFrame:
17461746
# ----------------------------------------------------------------------
17471747
# IO methods (to / from other formats)
17481748

1749+
@classmethod
def from_arrow(cls, data):
    """
    Construct a DataFrame from a tabular Arrow object.

    This function accepts any tabular Arrow object implementing
    the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
    or ``__arrow_c_stream__`` method).

    This function currently relies on ``pyarrow`` to convert the tabular
    object in Arrow format to pandas.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    .. versionadded:: 3.0

    Parameters
    ----------
    data : pyarrow.Table or Arrow-compatible table
        Any tabular object implementing the Arrow PyCapsule Protocol
        (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
        method).

    Returns
    -------
    DataFrame

    Raises
    ------
    ImportError
        If ``pyarrow`` (version 14.0.0 or newer) cannot be imported.
    TypeError
        If `data` is not a ``pyarrow.Table`` and exposes neither an
        ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` method.
    """
    pa = import_optional_dependency("pyarrow", min_version="14.0.0")
    if not isinstance(data, pa.Table):
        # Check the protocol explicitly: ``pa.table(...)`` would otherwise
        # also accept dicts, pandas objects, etc., which this constructor
        # is documented not to handle.
        if not (
            hasattr(data, "__arrow_c_array__")
            or hasattr(data, "__arrow_c_stream__")
        ):
            raise TypeError(
                "Expected an Arrow-compatible tabular object (i.e. having an "
                "'__arrow_c_array__' or '__arrow_c_stream__' method), got "
                f"'{type(data).__name__}' instead."
            )
        data = pa.table(data)

    # The frame is built entirely by pyarrow; ``cls`` is not used here.
    df = data.to_pandas()
    return df
1783+
17491784
@classmethod
17501785
def from_dict(
17511786
cls,

pandas/tests/frame/test_arrow_interface.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pandas.util._test_decorators as td
88

99
import pandas as pd
10+
import pandas._testing as tm
1011

1112
pa = pytest.importorskip("pyarrow")
1213

@@ -47,3 +48,42 @@ def test_dataframe_to_arrow():
4748
table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
4849
expected = expected.cast(schema)
4950
assert table.equals(expected)
51+
52+
53+
class ArrowArrayWrapper:
    """
    Minimal non-pyarrow object exposing only ``__arrow_c_array__``.

    It stores the object passed to the constructor on ``self.array`` and
    forwards the protocol call to it unchanged.
    """

    def __init__(self, batch):
        # Keep the wrapped object; nothing else is exposed on purpose.
        self.array = batch

    def __arrow_c_array__(self, requested_schema=None):
        """Delegate to the wrapped object's ``__arrow_c_array__``."""
        exporter = self.array
        return exporter.__arrow_c_array__(requested_schema)
59+
60+
61+
class ArrowStreamWrapper:
    """
    Minimal non-pyarrow object exposing only ``__arrow_c_stream__``.

    It stores the object passed to the constructor on ``self.stream`` and
    forwards the protocol call to it unchanged.
    """

    def __init__(self, table):
        # Keep the wrapped object; nothing else is exposed on purpose.
        self.stream = table

    def __arrow_c_stream__(self, requested_schema=None):
        """Delegate to the wrapped object's ``__arrow_c_stream__``."""
        exporter = self.stream
        return exporter.__arrow_c_stream__(requested_schema)
67+
68+
69+
@td.skip_if_no("pyarrow", min_version="14.0")
def test_dataframe_from_arrow():
    """
    Check ``DataFrame.from_arrow`` for stream- and array-protocol inputs.

    Covers a ``pyarrow.Table``, a ``pyarrow.RecordBatch``, and thin wrapper
    objects that expose only one of the two PyCapsule protocol methods.
    """
    # objects with __arrow_c_stream__
    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})

    result = pd.DataFrame.from_arrow(table)
    expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    tm.assert_frame_equal(result, expected)

    # not only pyarrow objects are supported
    result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table))
    tm.assert_frame_equal(result, expected)

    # objects with __arrow_c_array__
    batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"])

    # Pass the record batch itself; passing ``table`` here would only
    # repeat the stream-protocol check above.
    result = pd.DataFrame.from_arrow(batch)
    tm.assert_frame_equal(result, expected)

    result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch))
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)