@@ -636,3 +636,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
636636 ).index
637637 expected = pd .Index ([0 , 1 ], dtype = np .uint32 , name = "bin_id" )
638638 tm .assert_index_equal (result , expected )
639+
640+
def test_leading_zeros_preserved_with_dtype_str(all_parsers):
    # GH#61618: values read with dtype=str must keep their leading zeros.
    # The pyarrow engine is skipped here; the skip message notes that it is
    # exercised by a separate xfail test.
    if getattr(all_parsers, "engine", "unknown") == "pyarrow":
        pytest.skip("pyarrow engine tested separately with xfail")

    csv_text = (
        "col1,col2,col3,col4\n"
        "AB,000388907,abc,0150\n"
        "CD,101044572,def,0150\n"
        "EF,000023607,ghi,0205\n"
        "GH,100102040,jkl,0205"
    )
    frame = all_parsers.read_csv(StringIO(csv_text), dtype=str)

    assert frame.shape == (4, 4)
    assert list(frame.columns) == ["col1", "col2", "col3", "col4"]

    # (row label, column, expected text, failure message)
    zero_padded_cells = [
        (0, "col2", "000388907", "lost zeros in col2 row 0"),
        (2, "col2", "000023607", "lost zeros in col2 row 2"),
        (0, "col4", "0150", "lost zeros in col4 row 0"),
        (2, "col4", "0205", "lost zeros in col4 row 2"),
    ]
    for row, column, expected_text, message in zero_padded_cells:
        assert frame.loc[row, column] == expected_text, message
667+
668+
@pytest.mark.xfail(
    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)",
    strict=False,
)
def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
    # GH#57666: with dtype=str the pyarrow engine is reported to drop leading
    # zeros, so this test is marked as a non-strict expected failure until the
    # engine is fixed.
    csv_text = (
        "col1,col2,col3,col4\n"
        "AB,000388907,abc,0150\n"
        "CD,101044572,def,0150\n"
        "EF,000023607,ghi,0205\n"
        "GH,100102040,jkl,0205"
    )
    frame = pyarrow_parser_only.read_csv(StringIO(csv_text), dtype=str)

    assert frame.shape == (4, 4)
    assert list(frame.columns) == ["col1", "col2", "col3", "col4"]

    # (row label, column, expected text, failure message)
    zero_padded_cells = [
        (0, "col2", "000388907", "lost zeros in col2 row 0"),
        (2, "col2", "000023607", "lost zeros in col2 row 2"),
        (0, "col4", "0150", "lost zeros in col4 row 0"),
        (2, "col4", "0205", "lost zeros in col4 row 2"),
    ]
    for row, column, expected_text, message in zero_padded_cells:
        assert frame.loc[row, column] == expected_text, message
694+
695+
def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
    # GH#57666, GH#61618: columns mapped to str through a per-column dtype
    # dict must keep their leading zeros, while col3 is read as int.
    csv_text = (
        "col1,col2,col3,col4\n"
        "AB,000388907,199,0150\n"
        "CD,101044572,200,0150\n"
        "EF,000023607,201,0205\n"
        "GH,100102040,202,0205"
    )
    column_dtypes = {"col2": str, "col3": int, "col4": str}
    frame = all_parsers.read_csv(StringIO(csv_text), dtype=column_dtypes)

    assert frame.shape == (4, 4)
    assert list(frame.columns) == ["col1", "col2", "col3", "col4"]

    # (row label, column, expected text, failure message)
    zero_padded_cells = [
        (0, "col2", "000388907", "lost zeros in col2 row 0"),
        (2, "col2", "000023607", "lost zeros in col2 row 2"),
        (0, "col4", "0150", "lost zeros in col4 row 0"),
        (2, "col4", "0205", "lost zeros in col4 row 2"),
    ]
    for row, column, expected_text, message in zero_padded_cells:
        assert frame.loc[row, column] == expected_text, message

    # col3 was requested as int, so each row compares equal to its number.
    for row, expected_number in enumerate((199, 200, 201, 202)):
        assert frame.loc[row, "col3"] == expected_number
0 commit comments