Commit 74f01a7

use numpy dtype mapping
1 parent: 22e4129

File tree

doc/source/whatsnew/v3.0.0.rst
pandas/io/parsers/arrow_parser_wrapper.py
pandas/tests/io/parser/test_preserve_leading_zeros.py

3 files changed: +16 -53 lines

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion

@@ -987,7 +987,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
-- Bug in :meth:`read_csv`` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
+- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 11 additions & 7 deletions

@@ -6,6 +6,7 @@
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
+    DtypeWarning,
     Pandas4Warning,
     ParserError,
     ParserWarning,
@@ -15,7 +16,6 @@
 )

 from pandas.core.dtypes.common import (
-    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.inference import is_integer
@@ -146,21 +146,25 @@ def handle_warning(invalid_row) -> str:
         if isinstance(self.dtype, dict):
             column_types = {}
             for col, col_dtype in self.dtype.items():
-                if is_string_dtype(col_dtype):
-                    column_types[col] = pa.string()
-                else:
+                try:
+                    numpy_dtype = pandas_dtype(col_dtype).type
+                    pyarrow_dtype = pa.from_numpy_dtype(numpy_dtype)
+                    column_types[col] = pyarrow_dtype
+                except (TypeError, ValueError, pa.ArrowNotImplementedError):
                     warnings.warn(
                         f"Column '{col}' has dtype '{col_dtype}', "
                         "which may not be handled correctly by the pyarrow engine.",
-                        ParserWarning,
+                        DtypeWarning,
                         stacklevel=find_stack_level(),
                     )
+
             if column_types:
                 self.convert_options["column_types"] = column_types
         else:
             warnings.warn(
-                "The pyarrow engine expects a dict mapping columns to types.",
-                ParserWarning,
+                f"Global dtype '{self.dtype}' not supported with pyarrow engine. "
+                "Use dtype dictionary instead.",
+                DtypeWarning,
                 stacklevel=find_stack_level(),
             )
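For context, the heart of the change is the per-column conversion above: each entry of the dtype dict is normalised through pandas_dtype, its NumPy scalar type is taken, and pa.from_numpy_dtype turns that into an Arrow type; anything without a NumPy/Arrow equivalent falls through to the warning branch. Below is a minimal standalone sketch of that mapping; the helper name to_pyarrow_column_types and the example output are illustrative only and not part of pandas.

import pyarrow as pa
from pandas.api.types import pandas_dtype


def to_pyarrow_column_types(dtype_spec: dict) -> dict:
    # Best-effort translation of a read_csv-style dtype dict into pyarrow
    # column_types, mirroring the try/except added in this commit.
    column_types = {}
    for col, col_dtype in dtype_spec.items():
        try:
            numpy_type = pandas_dtype(col_dtype).type  # e.g. numpy.int64
            column_types[col] = pa.from_numpy_dtype(numpy_type)
        except (TypeError, ValueError, pa.ArrowNotImplementedError):
            # No direct NumPy/Arrow equivalent: skip the column here;
            # the real parser emits a DtypeWarning instead of failing.
            pass
    return column_types


print(to_pyarrow_column_types({"a": "int64", "b": "float64"}))
# roughly: {'a': DataType(int64), 'b': DataType(double)}

Unlike the old is_string_dtype special case, which only mapped string columns and warned for everything else, this routes every column through the same NumPy-to-Arrow conversion, so integer and float entries in a mixed dict reach pyarrow as real column types.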

pandas/tests/io/parser/test_preserve_leading_zeros.py

Lines changed: 4 additions & 45 deletions

@@ -2,7 +2,7 @@

 import pytest

-from pandas.errors import ParserWarning
+from pandas.errors import DtypeWarning

 import pandas._testing as tm

@@ -22,7 +22,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):

     if engine_name == "pyarrow":
         with tm.assert_produces_warning(
-            ParserWarning, match="pyarrow engine expects a dict mapping"
+            DtypeWarning, match="not supported with pyarrow engine"
         ):
             result = parser.read_csv(
                 StringIO(data),
@@ -53,7 +53,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):
         raise


-def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers):
+def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
     # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
     # GH#61618: further discussion on ensuring string dtype preservation across engines

@@ -67,7 +67,7 @@ def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers):

     result = parser.read_csv(
         StringIO(data),
-        dtype={"col2": str, "col4": str},
+        dtype={"col2": str, "col3": int, "col4": str},
     )

     assert result.shape == (4, 4)
@@ -82,44 +82,3 @@ def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers):
     assert result.loc[1, "col3"] == 200
     assert result.loc[2, "col3"] == 201
     assert result.loc[3, "col3"] == 202
-
-
-def test_leading_zeros_preserved_with_heterogeneous_dtypes(all_parsers):
-    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
-    # GH#61618: further discussion on ensuring string dtype preservation across engines
-
-    parser = all_parsers
-    engine_name = getattr(parser, "engine", "unknown")
-
-    data = """col1,col2,col3,col4
-AB,000388907,199,0150
-CD,101044572,200,0150
-EF,000023607,201,0205
-GH,100102040,202,0205"""
-
-    if engine_name == "pyarrow":
-        with tm.assert_produces_warning(
-            ParserWarning, match="may not be handled correctly by the pyarrow engine"
-        ):
-            result = parser.read_csv(
-                StringIO(data),
-                dtype={"col2": str, "col3": int, "col4": str},
-            )
-    else:
-        result = parser.read_csv(
-            StringIO(data),
-            dtype={"col2": str, "col3": int, "col4": str},
-        )
-
-    assert result.shape == (4, 4)
-    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
-
-    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
-    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
-    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
-    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
-
-    assert result.loc[0, "col3"] == 199
-    assert result.loc[1, "col3"] == 200
-    assert result.loc[2, "col3"] == 201
-    assert result.loc[3, "col3"] == 202
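As a usage sketch of the behaviour the remaining test pins down (GH#57666): with a per-column dtype dict, zero-padded fields come back as strings with their leading zeros intact while numeric columns still parse as integers, for the pyarrow engine as well as the default ones. The CSV below is the test's own data; the expected values are the ones the test asserts, assuming a pandas build that includes this change.

from io import StringIO

import pandas as pd

data = """col1,col2,col3,col4
AB,000388907,199,0150
CD,101044572,200,0150
EF,000023607,201,0205
GH,100102040,202,0205"""

df = pd.read_csv(
    StringIO(data),
    dtype={"col2": str, "col3": int, "col4": str},
    engine="pyarrow",  # requires pyarrow; omit to use the default engine
)
print(df.loc[0, "col2"], df.loc[0, "col4"])  # expected: 000388907 0150
print(df["col3"].tolist())                   # expected: [199, 200, 201, 202]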
