diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 38755aef32b85..f1be0ef104125 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -1097,6 +1097,7 @@ I/O
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
 - Fix bug in ``on_bad_lines`` callable when returning too many fields: now emits ``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`)
+- Bug in :func:`read_csv` where empty entries in :class:`MultiIndex` columns were filled with ``"Unnamed: x_level_y"`` instead of remaining empty (:issue:`59560`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
 - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index a6a5a7c23b506..02eb705aebab2 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -238,6 +238,11 @@ def extract(r):
             return tuple(r[i] for i in range(field_count) if i not in sic)
 
         columns = list(zip(*(extract(r) for r in header), strict=True))
+
+        # Clean unnamed placeholders for CSV parsers only (GH#59560)
+        if getattr(self, "_clean_csv_unnamed_columns", False):
+            columns = self._clean_column_levels(columns)
+
         names = columns.copy()
         for single_ic in sorted(ic):
             names.insert(single_ic, single_ic)
@@ -704,6 +709,40 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis
 
         return index_names, columns, index_col
 
+    @final
+    def _clean_column_levels(
+        self, columns: list[tuple[Hashable, ...]]
+    ) -> list[tuple[Hashable, ...]]:
+        """
+        Clean MultiIndex column level values by normalizing empty strings
+        and automatically generated 'Unnamed: x_level_y' placeholders
+        from header rows.
+        """
+
+        def _is_generated_unnamed(level: str | None) -> bool:
+            # Return True if the value matches the pandas auto-generated
+            # placeholder pattern, e.g. 'Unnamed: 2_level_1'
+            if not (isinstance(level, str) and level.startswith("Unnamed: ")):
+                return False
+            if "_level_" not in level:
+                return False
+            tail = level.split("Unnamed: ")[1]
+            return tail.replace("_level_", "").replace("_", "").isdigit()
+
+        return [
+            tuple(
+                ""
+                if (
+                    level is None
+                    or (isinstance(level, str) and level.strip() == "")
+                    or (isinstance(level, str) and _is_generated_unnamed(level))
+                )
+                else level
+                for level in col
+            )
+            for col in columns
+        ]
+
     @final
     def _get_empty_meta(
         self, columns: Sequence[HashableT], dtype: DtypeArg | None = None
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index e517647d071d6..1d33baf9e9614 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -64,6 +64,7 @@ class CParserWrapper(ParserBase):
 
     def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
         super().__init__(kwds)
+        self._clean_csv_unnamed_columns = True
         self.kwds = kwds
         kwds = kwds.copy()
 
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index dc7a21c859a33..b29cd78a7b337 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -101,7 +101,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
         Workhorse function for processing nested list into DataFrame
         """
         super().__init__(kwds)
-
+        # Only enable cleaning for CSV (file/buffer), not Excel (list)
+        if not isinstance(f, list) and "has_index_names" not in kwds:
+            self._clean_csv_unnamed_columns = True
         self.data: Iterator[list[str]] | list[list[Scalar]] = []
         self.buf: list = []
         self.pos = 0
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
index d333aef723de2..abc0be3588fe2 100644
--- a/pandas/tests/io/parser/test_header.py
+++ b/pandas/tests/io/parser/test_header.py
@@ -11,6 +11,7 @@
 
 from pandas.errors import ParserError
 
+import pandas as pd
 from pandas import (
     DataFrame,
     Index,
@@ -566,11 +567,8 @@ def test_multi_index_unnamed(all_parsers, index_col, columns):
 
     if columns is None:
         columns = ["", "", ""]
-    for i, col in enumerate(columns):
-        if not col:  # Unnamed.
-            col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
-
-        exp_columns.append(col)
+    # After GH#59560: keep empty cells as "", do not auto-fill "Unnamed: ..."
+    exp_columns = [col or "" for col in (columns or [])]
 
     columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
     expected = DataFrame([[2, 3], [4, 5]], columns=columns)
@@ -725,3 +723,40 @@ def test_usecols_no_header_pyarrow(pyarrow_parser_only):
     )
     expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("engine", ["c", "python"])
+def test_multiindex_empty_vals_cleaned(tmp_path, engine):
+    # GH#59560 - ensure empty values in MultiIndex columns are preserved
+    path = tmp_path / "file.csv"
+    df = DataFrame(
+        np.arange(6, dtype=np.int64).reshape((2, 3)),
+        columns=MultiIndex.from_tuples([("a", ""), ("b", ""), ("b", "b2")]),
+        index=MultiIndex.from_tuples([("i1", ""), ("i2", "")]),
+    )
+
+    df.to_csv(path)
+    result = pd.read_csv(
+        path,
+        header=[0, 1],
+        index_col=[0, 1],
+        engine=engine,
+        keep_default_na=False,
+    )
+
+    tm.assert_frame_equal(result, df)
+
+
+def test_multiindex_real_unnamed_label_preserved(tmp_path):
+    # GH#59560 follow-up: genuine "Unnamed:" labels should not be cleaned
+    path = tmp_path / "file.csv"
+    df = DataFrame(
+        np.arange(4, dtype=np.int64).reshape((2, 2)),
+        columns=MultiIndex.from_tuples([("a", "Unnamed: revenue"), ("a", "sales")]),
+    )
+
+    df.to_csv(path)
+    result = pd.read_csv(path, header=[0, 1], index_col=0)
+
+    expected = MultiIndex.from_tuples([("a", "Unnamed: revenue"), ("a", "sales")])
+    tm.assert_index_equal(result.columns, expected)
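
Reviewer note (not part of the diff): the sketch below is a minimal stand-alone reproduction of the behavior described in the whatsnew entry and the new tests. It swaps the tests' ``tmp_path`` file for an in-memory ``StringIO`` buffer, and the placeholder string quoted in the comment is illustrative, not an exact value asserted by the patch.

```python
import io

import numpy as np
import pandas as pd

# MultiIndex columns with empty second-level entries, mirroring the new test.
df = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    columns=pd.MultiIndex.from_tuples([("a", ""), ("b", ""), ("b", "b2")]),
)

# Round-trip through CSV using an in-memory buffer instead of a temp file.
buf = io.StringIO()
df.to_csv(buf)
buf.seek(0)

result = pd.read_csv(buf, header=[0, 1], index_col=0)

# Previously the empty header cells came back as auto-generated placeholders
# (e.g. ("a", "Unnamed: 1_level_1")); with this patch they stay ("a", "").
print(result.columns.tolist())
```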