1717 pa_version_under13p0 ,
1818 pa_version_under15p0 ,
1919 pa_version_under17p0 ,
20+ pa_version_under19p0 ,
2021)
2122
2223import pandas as pd
@@ -254,8 +255,10 @@ def test_invalid_engine(df_compat):
254255 check_round_trip (df_compat , "foo" , "bar" )
255256
256257
257- def test_options_py (df_compat , pa ):
258+ def test_options_py (df_compat , pa , using_infer_string ):
258259 # use the set option
260+ if using_infer_string and not pa_version_under19p0 :
261+ df_compat .columns = df_compat .columns .astype ("str" )
259262
260263 with pd .option_context ("io.parquet.engine" , "pyarrow" ):
261264 check_round_trip (df_compat )
@@ -784,18 +787,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):
784787
785788 def test_categorical (self , pa ):
786789 # supported in >= 0.7.0
787- df = pd .DataFrame ()
788- df ["a" ] = pd .Categorical (list ("abcdef" ))
789-
790- # test for null, out-of-order values, and unobserved category
791- df ["b" ] = pd .Categorical (
792- ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
793- dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
794- )
795-
796- # test for ordered flag
797- df ["c" ] = pd .Categorical (
798- ["a" , "b" , "c" , "a" , "c" , "b" ], categories = ["b" , "c" , "d" ], ordered = True
790+ df = pd .DataFrame (
791+ {
792+ "a" : pd .Categorical (list ("abcdef" )),
793+ # test for null, out-of-order values, and unobserved category
794+ "b" : pd .Categorical (
795+ ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
796+ dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
797+ ),
798+ # test for ordered flag
799+ "c" : pd .Categorical (
800+ ["a" , "b" , "c" , "a" , "c" , "b" ],
801+ categories = ["b" , "c" , "d" ],
802+ ordered = True ,
803+ ),
804+ }
799805 )
800806
801807 check_round_trip (df , pa )
@@ -858,11 +864,13 @@ def test_s3_roundtrip_for_dir(
858864 repeat = 1 ,
859865 )
860866
861- def test_read_file_like_obj_support (self , df_compat ):
867+ def test_read_file_like_obj_support (self , df_compat , using_infer_string ):
862868 pytest .importorskip ("pyarrow" )
863869 buffer = BytesIO ()
864870 df_compat .to_parquet (buffer )
865871 df_from_buf = read_parquet (buffer )
872+ if using_infer_string and not pa_version_under19p0 :
873+ df_compat .columns = df_compat .columns .astype ("str" )
866874 tm .assert_frame_equal (df_compat , df_from_buf )
867875
868876 def test_expand_user (self , df_compat , monkeypatch ):
@@ -929,7 +937,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
929937 "c" : pd .Series (["a" , None , "c" ], dtype = "string" ),
930938 }
931939 )
932- if using_infer_string :
940+ if using_infer_string and pa_version_under19p0 :
933941 check_round_trip (df , pa , expected = df .astype ({"c" : "str" }))
934942 else :
935943 check_round_trip (df , pa )
@@ -943,7 +951,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
943951 df = pd .DataFrame ({"a" : pd .Series (["a" , None , "c" ], dtype = "string[pyarrow]" )})
944952 with pd .option_context ("string_storage" , string_storage ):
945953 if using_infer_string :
946- expected = df .astype ("str" )
954+ if pa_version_under19p0 :
955+ expected = df .astype ("str" )
956+ else :
957+ expected = df .astype (f"string[{ string_storage } ]" )
947958 expected .columns = expected .columns .astype ("str" )
948959 else :
949960 expected = df .astype (f"string[{ string_storage } ]" )
@@ -1099,17 +1110,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
10991110 new_df = read_parquet (path , engine = pa )
11001111 assert new_df .attrs == df .attrs
11011112
1102- def test_string_inference (self , tmp_path , pa ):
1113+ def test_string_inference (self , tmp_path , pa , using_infer_string ):
11031114 # GH#54431
11041115 path = tmp_path / "test_string_inference.p"
11051116 df = pd .DataFrame (data = {"a" : ["x" , "y" ]}, index = ["a" , "b" ])
1106- df .to_parquet (path , engine = "pyarrow" )
1117+ df .to_parquet (path , engine = pa )
11071118 with pd .option_context ("future.infer_string" , True ):
1108- result = read_parquet (path , engine = "pyarrow" )
1119+ result = read_parquet (path , engine = pa )
1120+ dtype = pd .StringDtype (na_value = np .nan )
11091121 expected = pd .DataFrame (
11101122 data = {"a" : ["x" , "y" ]},
1111- dtype = pd .StringDtype (na_value = np .nan ),
1112- index = pd .Index (["a" , "b" ], dtype = pd .StringDtype (na_value = np .nan )),
1123+ dtype = dtype ,
1124+ index = pd .Index (["a" , "b" ], dtype = dtype ),
1125+ columns = pd .Index (
1126+ ["a" ],
1127+ dtype = object
1128+ if pa_version_under19p0 and not using_infer_string
1129+ else dtype ,
1130+ ),
11131131 )
11141132 tm .assert_frame_equal (result , expected )
11151133
@@ -1122,7 +1140,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
11221140 df = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "string[pyarrow]" )
11231141 df .to_parquet (path , schema = pa .schema ([("a" , pa .decimal128 (5 ))]))
11241142 result = read_parquet (path )
1125- expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1143+ if pa_version_under19p0 :
1144+ expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1145+ else :
1146+ expected = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "object" )
11261147 tm .assert_frame_equal (result , expected )
11271148
11281149 def test_infer_string_large_string_type (self , tmp_path , pa ):
0 commit comments