From 604758413c6128c9a60d2ce398137fd3dc9b88ee Mon Sep 17 00:00:00 2001 From: vignesh14052002 Date: Sun, 31 Aug 2025 17:51:10 +0530 Subject: [PATCH 1/7] fix : remove memo usage --- pandas/_libs/parsers.pyx | 5 ----- pandas/tests/io/parser/common/test_common_basic.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5b94f45490da4..d315a5bb5e86e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2122,7 +2122,6 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int: Py_ssize_t i, n object val, onan Py_ssize_t na_count = 0 - dict memo = {} n = len(values) onan = np.nan @@ -2132,9 +2131,5 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int: if val in na_values: values[i] = onan na_count += 1 - elif val in memo: - values[i] = memo[val] - else: - memo[val] = val return na_count diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 3680273f5e98a..3c518e93c9035 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -28,6 +28,7 @@ compat, ) import pandas._testing as tm +from pandas._libs import parsers as libparsers pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" @@ -830,3 +831,12 @@ def test_read_seek(all_parsers): actual = parser.read_csv(file) expected = parser.read_csv(StringIO(content)) tm.assert_frame_equal(actual, expected) + +def test_dtype_conversion_in_sanitization(): + # GH60088 + values = np.array([1,True],dtype=object) + expected = np.array([1,True],dtype=object) + libparsers.sanitize_objects(values,na_values=set()) + for v,e in zip(values,expected): + assert v==e + assert type(v)==type(e) From 6eb738a0dcddb93609275eb7a74e1d1c8c3dbbb1 Mon Sep 17 00:00:00 2001 From: vignesh14052002 Date: Sun, 31 Aug 2025 19:26:33 +0530 Subject: [PATCH 2/7] fix linting --- .../tests/io/parser/common/test_common_basic.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 3c518e93c9035..d90e01ca4a40f 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype +from pandas._libs import parsers as libparsers from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, @@ -28,7 +29,6 @@ compat, ) import pandas._testing as tm -from pandas._libs import parsers as libparsers pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" @@ -832,11 +832,12 @@ def test_read_seek(all_parsers): expected = parser.read_csv(StringIO(content)) tm.assert_frame_equal(actual, expected) + def test_dtype_conversion_in_sanitization(): # GH60088 - values = np.array([1,True],dtype=object) - expected = np.array([1,True],dtype=object) - libparsers.sanitize_objects(values,na_values=set()) - for v,e in zip(values,expected): - assert v==e - assert type(v)==type(e) + values = np.array([1, True], dtype=object) + expected = np.array([1, True], dtype=object) + libparsers.sanitize_objects(values, na_values=set()) + for v, e in zip(values, expected): + assert v == e + assert type(v) == type(e) From 30788075170774820721d550413bb7362c09f1cf Mon Sep 17 00:00:00 2001 From: vignesh14052002 Date: Wed, 3 Sep 2025 14:04:08 +0530 Subject: [PATCH 3/7] include type in memo key to handle 0,1,True and False conflict --- pandas/_libs/parsers.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d315a5bb5e86e..8b1d72ff63339 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2122,14 +2122,20 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int: Py_ssize_t i, n object val, onan Py_ssize_t na_count = 0 + dict memo = {} n = len(values) onan = np.nan for i in range(n): val = values[i] + memo_key = (val, type(val)) if val in na_values: values[i] = onan na_count += 1 + elif memo_key in memo: + values[i] = memo[memo_key] + else: + memo[memo_key] = val return na_count From 2aa4a018fd9168979110a2473f91c0ecba232e3c Mon Sep 17 00:00:00 2001 From: vignesh14052002 Date: Thu, 11 Sep 2025 16:55:21 +0530 Subject: [PATCH 4/7] skip memoization for 0,1,True,False --- pandas/_libs/parsers.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 8b1d72ff63339..e07e0088536e2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2129,13 +2129,15 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int: for i in range(n): val = values[i] - memo_key = (val, type(val)) if val in na_values: values[i] = onan na_count += 1 - elif memo_key in memo: - values[i] = memo[memo_key] + elif val in [0, 1, True, False]: + # Skip memoization, since 1==True and 0==False + values[i] = val + elif val in memo: + values[i] = memo[val] else: - memo[memo_key] = val + memo[val] = val return na_count From c01d4d51af470f5d44cf537c74f4ad687172906b Mon Sep 17 00:00:00 2001 From: vignesh14052002 Date: Tue, 4 Nov 2025 14:31:54 +0530 Subject: [PATCH 5/7] use bool set --- pandas/_libs/parsers.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e07e0088536e2..d1ea4092efe40 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2126,14 +2126,17 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int: n = len(values) onan = np.nan + bool_set = {True, False} for i in range(n): val = values[i] if val in na_values: values[i] = onan na_count += 1 - elif val in [0, 1, True, False]: - # Skip memoization, since 1==True and 0==False + elif val in bool_set: + # GH60088: Skip memoization + # since 1 == 1.0 == True == np.True_ + # and 0 == 0.0 == False == np.False_ values[i] = val elif val in memo: values[i] = memo[val] From 4cffc67c3aa94c023da014595a55370973ab4792 Mon Sep 17 00:00:00 2001 From: vignesh14052002 Date: Tue, 4 Nov 2025 14:36:37 +0530 Subject: [PATCH 6/7] update tests --- pandas/tests/io/excel/test_readers.py | 36 +++++++++++++++++++ .../io/parser/common/test_common_basic.py | 6 ++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 71fb8f490e114..024fc71bbf17b 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -168,6 +168,42 @@ def test_read_excel_type_check(self, col, tmp_excel, read_ext): df2 = pd.read_excel(tmp_excel, dtype={"bool_column": "boolean"}) tm.assert_frame_equal(df, df2) + def test_read_excel_int_bool_mix_type_check(self, tmp_excel, read_ext): + # GH 60088 + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") + + df1 = DataFrame( + { + "a": [True, True], + "b": [1, True], + "c": [True, 1], + "d": [False, 0], + "e": [0, False], + "f": [False, False], + }, + dtype=object, + ) + df1.to_excel(tmp_excel, index=False) + + df2 = pd.read_excel(tmp_excel, dtype=object) + + tm.assert_frame_equal(df1, df2) + + for idx, row in df2.iterrows(): + for col in df2.columns: + val = row[col] + exp_val = df1.iloc[idx][col] + # Check if values match + assert val == exp_val, ( + f"Mismatch at Row {idx} Column {col}: {val} != {exp_val}" + ) + # Check if types match + assert type(val) == type(exp_val), ( + f"Type mismatch at Row {idx} Column {col}: " + f"{type(val)} != {type(exp_val)}" + ) + def test_pass_none_type(self, datapath): # GH 58159 f_path = datapath("io", "data", "excel", "test_none_type.xlsx") diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index d90e01ca4a40f..27a36d708a472 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -835,8 +835,10 @@ def test_read_seek(all_parsers): def test_dtype_conversion_in_sanitization(): # GH60088 - values = np.array([1, True], dtype=object) - expected = np.array([1, True], dtype=object) + values = np.array([1, True, 0, False, 1.0, 0.0, np.True_, np.False_], dtype=object) + expected = np.array( + [1, True, 0, False, 1.0, 0.0, np.True_, np.False_], dtype=object + ) libparsers.sanitize_objects(values, na_values=set()) for v, e in zip(values, expected): assert v == e From b863616a95fc76a336a32bacf580f367ad3af3bf Mon Sep 17 00:00:00 2001 From: vignesh14052002 Date: Tue, 4 Nov 2025 14:41:57 +0530 Subject: [PATCH 7/7] use strict = True while asserting --- pandas/tests/io/parser/common/test_common_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 27a36d708a472..96da78f997962 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -840,6 +840,6 @@ def test_dtype_conversion_in_sanitization(): [1, True, 0, False, 1.0, 0.0, np.True_, np.False_], dtype=object ) libparsers.sanitize_objects(values, na_values=set()) - for v, e in zip(values, expected): + for v, e in zip(values, expected, strict=True): assert v == e assert type(v) == type(e)