ENH: add basic DataFrame.from_arrow class method for importing through Arrow PyCapsule interface #59696
Open · jorisvandenbossche wants to merge 7 commits into pandas-dev:main from jorisvandenbossche:arrow-capsule-import
Commits (7, all by jorisvandenbossche):

- b63e601 ENH: add basic DataFrame.from_arrow class method for importing throug…
- 6901e6d add validation
- 6af237c add return type
- fad6bb1 add type hints and protocol definitions
- d3b8927 Merge remote-tracking branch 'upstream/main' into arrow-capsule-import
- 5cccaab update link
- fa4eb11 add whatsnew note
Diff (excerpt from pandas/core/frame.py):

```diff
@@ -215,6 +215,8 @@
     AnyAll,
     AnyArrayLike,
     ArrayLike,
+    ArrowArrayExportable,
+    ArrowStreamExportable,
     Axes,
     Axis,
     AxisInt,
@@ -1832,6 +1834,54 @@ def __rmatmul__(self, other) -> DataFrame:
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)

+    @classmethod
+    def from_arrow(
+        cls, data: ArrowArrayExportable | ArrowStreamExportable
+    ) -> DataFrame:
+        """
+        Construct a DataFrame from a tabular Arrow object.
+
+        This function accepts any Arrow-compatible tabular object implementing
+        the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
+        or ``__arrow_c_stream__`` method).
+
+        This function currently relies on ``pyarrow`` to convert the tabular
+        object in Arrow format to pandas.
+
+        .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+
+        .. versionadded:: 3.0
+
+        Parameters
+        ----------
+        data : pyarrow.Table or Arrow-compatible table
+            Any tabular object implementing the Arrow PyCapsule Protocol
+            (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
+            method).
+
+        Returns
+        -------
+        DataFrame
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if not isinstance(data, pa.Table):
+            if not (
+                hasattr(data, "__arrow_c_array__")
+                or hasattr(data, "__arrow_c_stream__")
+            ):
+                # explicitly test this, because otherwise we would accept
+                # various other input types through the pa.table(..) call
+                raise TypeError(
+                    "Expected an Arrow-compatible tabular object (i.e. having an "
+                    "'__arrow_c_array__' or '__arrow_c_stream__' method), got "
+                    f"'{type(data).__name__}' instead."
+                )
+            data = pa.table(data)
+
+        df = data.to_pandas()
+        return df
+
     @classmethod
     def from_dict(
         cls,
```
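For reference, a minimal usage sketch of the method this diff adds (it requires pyarrow >= 14.0.0 per the `import_optional_dependency` check, and `pd.DataFrame.from_arrow` assumes a pandas build that includes this PR):

```python
import pandas as pd
import pyarrow as pa

# A pyarrow Table is passed through to .to_pandas() directly ...
table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
df = pd.DataFrame.from_arrow(table)

# ... while anything else exposing __arrow_c_array__ or __arrow_c_stream__
# is first converted with pa.table(...). A RecordBatch exposes both.
df2 = pd.DataFrame.from_arrow(table.to_batches()[0])
```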
Does this actually work for things that only expose `__arrow_c_array__`?
Exposing `__arrow_c_array__` is necessary but not sufficient. Both `Array` and `RecordBatch` expose the same `__arrow_c_array__` interface. It's overloaded so that a `RecordBatch` can be interpreted the same as an `Array` of type `Struct`.
And to be fair, `RecordBatch` has both `__arrow_c_array__` and `__arrow_c_stream__` dunder methods, so just testing with a `RecordBatch` does not actually prove that `pa.table(..)` works with objects that only implement the array version. But because in the tests I wrap the record batch in a dummy object that only exposes `__arrow_c_array__`, the tests should cover this and assert that `DataFrame.from_arrow()` works with both dunder methods.
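A minimal sketch of such a dummy wrapper (the class name is hypothetical; the actual test helper may look different):

```python
import pyarrow as pa


class ArrayOnlyTable:
    """Wrap a RecordBatch, exposing *only* __arrow_c_array__."""

    def __init__(self, batch: pa.RecordBatch) -> None:
        self._batch = batch

    def __arrow_c_array__(self, requested_schema=None):
        # delegate to the wrapped batch's capsule export
        return self._batch.__arrow_c_array__(requested_schema)


wrapped = ArrayOnlyTable(pa.record_batch({"a": [1, 2]}))
assert not hasattr(wrapped, "__arrow_c_stream__")
# pa.table(wrapped) -- and hence DataFrame.from_arrow(wrapped) -- must go
# through the array protocol here, which is what the test asserts.
```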
Ah OK, that's good to know. So essentially it's up to the producer to determine if this makes sense, right?

I think there is still a consistency problem with how we as a consumer then work. A `RecordBatch` can be read through both the array and the stream interface, but a `Table` can only be read through the latter (unless it is forced to consolidate chunks and produce an `Array`).

I'm sure PyArrow has that covered well, but unless something gets clarified in the spec about how the array interface is expected to work, that might push libraries into making the (assumedly poor) decision that their streams should also produce consolidated array data.
I'd say it's up to the consumer to decide if the input makes sense. The producer just says "here's my data".

But I think the key added part is user intention. A struct array can represent either one array or a full `RecordBatch`, and we need a hint from the user for which is which. This is why I couldn't add PyCapsule Interface support to `polars.from_arrow`: it's missing the user intention of "this object is a series" versus "this object is a DataFrame".

I'm not sure I follow the rest of your comment @WillAyd. A stream never needs to concatenate data before starting the stream.
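A small illustration of that ambiguity, assuming a recent pyarrow that has `RecordBatch.to_struct_array`/`from_struct_array`:

```python
import pyarrow as pa

batch = pa.record_batch({"a": [1, 2], "b": [3.0, 4.0]})
struct_arr = batch.to_struct_array()  # same data as an Array of type struct

# Both objects export the same struct schema through __arrow_c_array__;
# the protocol itself carries no hint of "one column" vs "a table".
assert hasattr(batch, "__arrow_c_array__")
assert hasattr(struct_arr, "__arrow_c_array__")

# Round trip: the two views are interchangeable, so only the API the user
# calls (e.g. DataFrame.from_arrow) supplies the tabular intent.
assert pa.RecordBatch.from_struct_array(struct_arr).equals(batch)
```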
A theoretical example is a library that produces Arrow data and thinks it needs to implement `__arrow_c_array__` for its "Table" equivalent because it did so for its `RecordBatch` equivalent. If the Table contained multiple chunks of data, I assume it would need to combine all of the chunks to pass the data on through the `__arrow_c_array__` interface.
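A sketch of why that would be wasteful: a multi-chunk Table can be streamed chunk by chunk with zero copy, but flattening it to a single array requires consolidating first:

```python
import pyarrow as pa

# A Table built from two batches: each column has two chunks.
t = pa.Table.from_batches(
    [pa.record_batch({"a": [1, 2]}), pa.record_batch({"a": [3, 4]})]
)
assert t.column("a").num_chunks == 2

# __arrow_c_stream__ can hand the chunks over as-is (zero copy), whereas a
# single __arrow_c_array__ export would first need something like this:
consolidated = t.combine_chunks()  # copies the data into one chunk
assert consolidated.column("a").num_chunks == 1
```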
Maybe the spec should be more explicit about when to implement which interface. I think it's implicit that a `RecordBatch` can implement both, because both are zero copy, but a `Table` should only implement the stream interface, because only the stream interface is always zero copy.

I raised an issue a while ago to discuss the consumer implications, if you haven't seen it: apache/arrow#40648
Ah OK, great, thanks for sharing. I'll track that issue upstream.