update tests

dxdc · dxdc · commit 0ebca38c26be · 2025-09-03T15:15:04.000-05:00
diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py
@@ -2,6 +2,9 @@
 
 import pytest
 
+import pandas._testing as tm
+from pandas.errors import ParserWarning
+
 
 def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):
     # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
@@ -16,10 +19,19 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):
 EF,000023607,ghi,0205
 GH,100102040,jkl,0205"""
 
-    result = parser.read_csv(
-        StringIO(data),
-        dtype=str,
-    )
+    if engine_name == "pyarrow":
+        with tm.assert_produces_warning(
+            ParserWarning, match="pyarrow engine expects a dict mapping"
+        ):
+            result = parser.read_csv(
+                StringIO(data),
+                dtype=str,
+            )
+    else:
+        result = parser.read_csv(
+            StringIO(data),
+            dtype=str,
+        )
 
     try:
         assert result.shape == (4, 4)
@@ -40,7 +52,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):
         raise
 
 
-def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
+def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers):
     # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
     # GH#61618: further discussion on ensuring string dtype preservation across engines
 
@@ -69,3 +81,44 @@ def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
     assert result.loc[1, "col3"] == 200
     assert result.loc[2, "col3"] == 201
     assert result.loc[3, "col3"] == 202
+
+
+def test_leading_zeros_preserved_with_heterogeneous_dtypes(all_parsers):
+    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
+    # GH#61618: further discussion on ensuring string dtype preservation across engines
+
+    parser = all_parsers
+    engine_name = getattr(parser, "engine", "unknown")
+
+    data = """col1,col2,col3,col4
+AB,000388907,199,0150
+CD,101044572,200,0150
+EF,000023607,201,0205
+GH,100102040,202,0205"""
+
+    if engine_name == "pyarrow":
+        with tm.assert_produces_warning(
+            ParserWarning, match="may not be handled correctly by the pyarrow engine"
+        ):
+            result = parser.read_csv(
+                StringIO(data),
+                dtype={"col2": str, "col3": int, "col4": str},
+            )
+    else:
+        result = parser.read_csv(
+            StringIO(data),
+            dtype={"col2": str, "col3": int, "col4": str},
+        )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+    assert result.loc[0, "col3"] == 199
+    assert result.loc[1, "col3"] == 200
+    assert result.loc[2, "col3"] == 201
+    assert result.loc[3, "col3"] == 202