diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 0396d1704b579..ea0139191a310 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -232,7 +232,9 @@ Other enhancements
 - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
 - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
 - Switched wheel upload to **PyPI Trusted Publishing** (OIDC) for release-tag pushes in ``wheels.yml``. (:issue:`61718`)
--
+- Added a new :meth:`DataFrame.from_arrow` method to import any Arrow-compatible
+  tabular data object into a pandas :class:`DataFrame` through the
+  `Arrow PyCapsule Protocol <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`__ (:issue:`59631`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.notable_bug_fixes:
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 23598bd2bc517..d072e472537ae 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -533,4 +533,44 @@ def closed(self) -> bool:
 
 SliceType: TypeAlias = Hashable | None
 
+
+# Arrow PyCapsule Interface
+# from https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#protocol-typehints
+
+
+class ArrowArrayExportable(Protocol):
+    """
+    An object with an ``__arrow_c_array__`` method.
+
+    This method indicates the object is an Arrow-compatible object implementing
+    the `Arrow PyCapsule Protocol`_ (exposing the `Arrow C Data Interface`_ in
+    Python), enabling zero-copy Arrow data interchange across libraries.
+
+    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+    .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html
+
+    """
+
+    def __arrow_c_array__(
+        self, requested_schema: object | None = None
+    ) -> tuple[object, object]: ...
+
+
+class ArrowStreamExportable(Protocol):
+    """
+    An object with an ``__arrow_c_stream__`` method.
+
+    This method indicates the object is an Arrow-compatible object implementing
+    the `Arrow PyCapsule Protocol`_ (exposing the `Arrow C Stream Interface`_
+    in Python), enabling zero-copy Arrow data interchange across
+    libraries.
+
+    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+    .. _Arrow C Stream Interface: https://arrow.apache.org/docs/format/CStreamInterface.html
+
+    """
+
+    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ...
+
+
 __all__ = ["type_t"]
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 0f36ea031d3d4..fe9cdf17610a6 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -215,6 +215,8 @@
         AnyAll,
         AnyArrayLike,
         ArrayLike,
+        ArrowArrayExportable,
+        ArrowStreamExportable,
         Axes,
         Axis,
         AxisInt,
@@ -1832,6 +1834,60 @@ def __rmatmul__(self, other) -> DataFrame:
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)
 
+    @classmethod
+    def from_arrow(
+        cls, data: ArrowArrayExportable | ArrowStreamExportable
+    ) -> DataFrame:
+        """
+        Construct a DataFrame from a tabular Arrow object.
+
+        This function accepts any Arrow-compatible tabular object implementing
+        the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
+        or ``__arrow_c_stream__`` method).
+
+        This function currently relies on ``pyarrow`` to convert the tabular
+        object in Arrow format to pandas.
+
+        .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+
+        .. versionadded:: 3.0
+
+        Parameters
+        ----------
+        data : pyarrow.Table or Arrow-compatible table
+            Any tabular object implementing the Arrow PyCapsule Protocol
+            (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
+            method).
+
+        Returns
+        -------
+        DataFrame
+            The result of calling ``to_pandas()`` on the Arrow table.
+
+        Raises
+        ------
+        TypeError
+            If ``data`` is not a ``pyarrow.Table`` and has neither an
+            ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` method.
+
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if not isinstance(data, pa.Table):
+            if not (
+                hasattr(data, "__arrow_c_array__")
+                or hasattr(data, "__arrow_c_stream__")
+            ):
+                # explicitly test this, because otherwise we would accept various other
+                # input types through the pa.table(..) call
+                raise TypeError(
+                    "Expected an Arrow-compatible tabular object (i.e. having an "
+                    "'__arrow_c_array__' or '__arrow_c_stream__' method), got "
+                    f"'{type(data).__name__}' instead."
+                )
+            data = pa.table(data)
+
+        return data.to_pandas()
+
     @classmethod
     def from_dict(
         cls,
diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py
index b36b6b5ffe0cc..68b0244f930cf 100644
--- a/pandas/tests/frame/test_arrow_interface.py
+++ b/pandas/tests/frame/test_arrow_interface.py
@@ -5,6 +5,7 @@
 import pandas.util._test_decorators as td
 
 import pandas as pd
+import pandas._testing as tm
 
 pa = pytest.importorskip("pyarrow")
 
@@ -45,3 +46,48 @@ def test_dataframe_to_arrow(using_infer_string):
     table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
     expected = expected.cast(schema)
     assert table.equals(expected)
+
+
+# Non-pyarrow object that forwards only ``__arrow_c_array__`` to the wrapped batch
+class ArrowArrayWrapper:
+    def __init__(self, batch):
+        self.array = batch
+
+    def __arrow_c_array__(self, requested_schema=None):
+        return self.array.__arrow_c_array__(requested_schema)
+
+
+# Non-pyarrow object that forwards only ``__arrow_c_stream__`` to the wrapped table
+class ArrowStreamWrapper:
+    def __init__(self, table):
+        self.stream = table
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        return self.stream.__arrow_c_stream__(requested_schema)
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_from_arrow():
+    # objects with __arrow_c_stream__
+    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    result = pd.DataFrame.from_arrow(table)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    tm.assert_frame_equal(result, expected)
+
+    # not only pyarrow objects are supported
+    result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table))
+    tm.assert_frame_equal(result, expected)
+
+    # objects with __arrow_c_array__
+    batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"])
+
+    result = pd.DataFrame.from_arrow(batch)
+    tm.assert_frame_equal(result, expected)
+
+    result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch))
+    tm.assert_frame_equal(result, expected)
+
+    # only accept actual Arrow objects
+    with pytest.raises(TypeError, match="Expected an Arrow-compatible tabular object"):
+        pd.DataFrame.from_arrow({"a": [1, 2, 3], "b": ["a", "b", "c"]})