diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 5b94f45490da4..d1ea4092efe40 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -2126,12 +2126,18 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int:
 
     n = len(values)
     onan = np.nan
+    bool_set = {True, False}
 
     for i in range(n):
         val = values[i]
         if val in na_values:
             values[i] = onan
             na_count += 1
+        elif val in bool_set:
+            # GH60088: Skip memoization
+            # since 1 == 1.0 == True == np.True_
+            # and 0 == 0.0 == False == np.False_
+            values[i] = val
         elif val in memo:
             values[i] = memo[val]
         else:
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 71fb8f490e114..024fc71bbf17b 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -168,6 +168,42 @@ def test_read_excel_type_check(self, col, tmp_excel, read_ext):
         df2 = pd.read_excel(tmp_excel, dtype={"bool_column": "boolean"})
         tm.assert_frame_equal(df, df2)
 
+    def test_read_excel_int_bool_mix_type_check(self, tmp_excel, read_ext):
+        # GH 60088
+        if read_ext in (".xlsb", ".xls"):
+            pytest.skip(f"No engine for filetype: '{read_ext}'")
+
+        df1 = DataFrame(
+            {
+                "a": [True, True],
+                "b": [1, True],
+                "c": [True, 1],
+                "d": [False, 0],
+                "e": [0, False],
+                "f": [False, False],
+            },
+            dtype=object,
+        )
+        df1.to_excel(tmp_excel, index=False)
+
+        df2 = pd.read_excel(tmp_excel, dtype=object)
+
+        tm.assert_frame_equal(df1, df2)
+
+        for idx, row in df2.iterrows():
+            for col in df2.columns:
+                val = row[col]
+                exp_val = df1.iloc[idx][col]
+                # Check if values match
+                assert val == exp_val, (
+                    f"Mismatch at Row {idx} Column {col}: {val} != {exp_val}"
+                )
+                # Check if types match
+                assert type(val) == type(exp_val), (
+                    f"Type mismatch at Row {idx} Column {col}: "
+                    f"{type(val)} != {type(exp_val)}"
+                )
+
     def test_pass_none_type(self, datapath):
         # GH 58159
         f_path = datapath("io", "data", "excel", "test_none_type.xlsx")
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 3680273f5e98a..96da78f997962 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -15,6 +15,7 @@
 
 from pandas._config import using_string_dtype
 
+from pandas._libs import parsers as libparsers
 from pandas.compat import HAS_PYARROW
 from pandas.errors import (
     EmptyDataError,
@@ -830,3 +831,15 @@ def test_read_seek(all_parsers):
     actual = parser.read_csv(file)
     expected = parser.read_csv(StringIO(content))
     tm.assert_frame_equal(actual, expected)
+
+
+def test_dtype_conversion_in_sanitization():
+    # GH60088
+    values = np.array([1, True, 0, False, 1.0, 0.0, np.True_, np.False_], dtype=object)
+    expected = np.array(
+        [1, True, 0, False, 1.0, 0.0, np.True_, np.False_], dtype=object
+    )
+    libparsers.sanitize_objects(values, na_values=set())
+    for v, e in zip(values, expected, strict=True):
+        assert v == e
+        assert type(v) == type(e)