Commit 74f01a7

use numpy dtype mapping
1 parent: 22e4129

File tree

doc/source/whatsnew/v3.0.0.rst
pandas/io/parsers/arrow_parser_wrapper.py
pandas/tests/io/parser/test_preserve_leading_zeros.py

3 files changed: +16 -53 lines

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion

@@ -987,7 +987,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
-- Bug in :meth:`read_csv`` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
+- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 11 additions & 7 deletions

@@ -6,6 +6,7 @@
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
+    DtypeWarning,
     Pandas4Warning,
     ParserError,
     ParserWarning,
@@ -15,7 +16,6 @@
 )

 from pandas.core.dtypes.common import (
-    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.inference import is_integer
@@ -146,21 +146,25 @@ def handle_warning(invalid_row) -> str:
         if isinstance(self.dtype, dict):
             column_types = {}
             for col, col_dtype in self.dtype.items():
-                if is_string_dtype(col_dtype):
-                    column_types[col] = pa.string()
-                else:
+                try:
+                    numpy_dtype = pandas_dtype(col_dtype).type
+                    pyarrow_dtype = pa.from_numpy_dtype(numpy_dtype)
+                    column_types[col] = pyarrow_dtype
+                except (TypeError, ValueError, pa.ArrowNotImplementedError):
                     warnings.warn(
                         f"Column '{col}' has dtype '{col_dtype}', "
                         "which may not be handled correctly by the pyarrow engine.",
-                        ParserWarning,
+                        DtypeWarning,
                         stacklevel=find_stack_level(),
                     )
+
             if column_types:
                 self.convert_options["column_types"] = column_types
         else:
             warnings.warn(
-                "The pyarrow engine expects a dict mapping columns to types.",
-                ParserWarning,
+                f"Global dtype '{self.dtype}' not supported with pyarrow engine. "
+                "Use dtype dictionary instead.",
+                DtypeWarning,
                 stacklevel=find_stack_level(),
             )
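For context, the heart of the change is the per-column conversion above: each entry of the dtype dict is normalised through pandas_dtype, its NumPy scalar type is taken, and pa.from_numpy_dtype turns that into an Arrow type; anything without a NumPy/Arrow equivalent falls through to the warning branch. Below is a minimal standalone sketch of that mapping; the helper name to_pyarrow_column_types and the example output are illustrative only and not part of pandas.

import pyarrow as pa
from pandas.api.types import pandas_dtype


def to_pyarrow_column_types(dtype_spec: dict) -> dict:
    # Best-effort translation of a read_csv-style dtype dict into pyarrow
    # column_types, mirroring the try/except added in this commit.
    column_types = {}
    for col, col_dtype in dtype_spec.items():
        try:
            numpy_type = pandas_dtype(col_dtype).type  # e.g. numpy.int64
            column_types[col] = pa.from_numpy_dtype(numpy_type)
        except (TypeError, ValueError, pa.ArrowNotImplementedError):
            # No direct NumPy/Arrow equivalent: skip the column here;
            # the real parser emits a DtypeWarning instead of failing.
            pass
    return column_types


print(to_pyarrow_column_types({"a": "int64", "b": "float64"}))
# roughly: {'a': DataType(int64), 'b': DataType(double)}

Unlike the old is_string_dtype special case, which only mapped string columns and warned for everything else, this routes every column through the same NumPy-to-Arrow conversion, so integer and float entries in a mixed dict reach pyarrow as real column types.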

pandas/tests/io/parser/test_preserve_leading_zeros.py

Lines changed: 4 additions & 45 deletions

@@ -2,7 +2,7 @@

 import pytest

-from pandas.errors import ParserWarning
+from pandas.errors import DtypeWarning

 import pandas._testing as tm

@@ -22,7 +22,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):

     if engine_name == "pyarrow":
         with tm.assert_produces_warning(
-            ParserWarning, match="pyarrow engine expects a dict mapping"
+            DtypeWarning, match="not supported with pyarrow engine"
         ):
             result = parser.read_csv(
                 StringIO(data),
@@ -53,7 +53,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):
         raise


-def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers):
+def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
     # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
     # GH#61618: further discussion on ensuring string dtype preservation across engines

@@ -67,7 +67,7 @@ def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers):

     result = parser.read_csv(
         StringIO(data),
-        dtype={"col2": str, "col4": str},
+        dtype={"col2": str, "col3": int, "col4": str},
     )

     assert result.shape == (4, 4)
@@ -82,44 +82,3 @@ def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers):
     assert result.loc[1, "col3"] == 200
     assert result.loc[2, "col3"] == 201
     assert result.loc[3, "col3"] == 202
-
-
-def test_leading_zeros_preserved_with_heterogeneous_dtypes(all_parsers):
-    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
-    # GH#61618: further discussion on ensuring string dtype preservation across engines
-
-    parser = all_parsers
-    engine_name = getattr(parser, "engine", "unknown")
-
-    data = """col1,col2,col3,col4
-AB,000388907,199,0150
-CD,101044572,200,0150
-EF,000023607,201,0205
-GH,100102040,202,0205"""
-
-    if engine_name == "pyarrow":
-        with tm.assert_produces_warning(
-            ParserWarning, match="may not be handled correctly by the pyarrow engine"
-        ):
-            result = parser.read_csv(
-                StringIO(data),
-                dtype={"col2": str, "col3": int, "col4": str},
-            )
-    else:
-        result = parser.read_csv(
-            StringIO(data),
-            dtype={"col2": str, "col3": int, "col4": str},
-        )
-
-    assert result.shape == (4, 4)
-    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
-
-    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
-    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
-    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
-    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
-
-    assert result.loc[0, "col3"] == 199
-    assert result.loc[1, "col3"] == 200
-    assert result.loc[2, "col3"] == 201
-    assert result.loc[3, "col3"] == 202
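As a usage sketch of the behaviour the remaining test pins down (GH#57666): with a per-column dtype dict, zero-padded fields come back as strings with their leading zeros intact while numeric columns still parse as integers, for the pyarrow engine as well as the default ones. The CSV below is the test's own data; the expected values are the ones the test asserts, assuming a pandas build that includes this change.

from io import StringIO

import pandas as pd

data = """col1,col2,col3,col4
AB,000388907,199,0150
CD,101044572,200,0150
EF,000023607,201,0205
GH,100102040,202,0205"""

df = pd.read_csv(
    StringIO(data),
    dtype={"col2": str, "col3": int, "col4": str},
    engine="pyarrow",  # requires pyarrow; omit to use the default engine
)
print(df.loc[0, "col2"], df.loc[0, "col4"])  # expected: 000388907 0150
print(df["col3"].tolist())                   # expected: [199, 200, 201, 202]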
