BUG: Fix inconsistency when read_csv reads MultiIndex with empty values (#59560)

allamlobna · allamlobna · commit aa3940eea9da · 2025-10-10T18:31:03.000Z
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1061,6 +1061,7 @@ I/O
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
 - Fix bug in ``on_bad_lines`` callable when returning too many fields: now emits
   ``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`)
+- Bug in :func:`read_csv` where empty entries in :class:`MultiIndex` columns were filled with ``"Unnamed: x_level_y"`` instead of remaining empty (:issue:`59560`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
 - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -237,7 +237,10 @@ def _extract_multi_indexer_columns(
         def extract(r):
             return tuple(r[i] for i in range(field_count) if i not in sic)
 
-        columns = list(zip(*(extract(r) for r in header), strict=True))
+        columns = list(zip(*(extract(r) for r in header)))
+        # Clean the columns by removing placeholders.
+        columns = self._clean_column_levels(columns)
+
         names = columns.copy()
         for single_ic in sorted(ic):
             names.insert(single_ic, single_ic)
@@ -700,6 +703,40 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis
 
         return index_names, columns, index_col
 
+    @final
+    def _clean_column_levels(
+        self, columns: list[tuple[Hashable, ...]]
+    ) -> list[tuple[Hashable, ...]]:
+        """
+        Replace blank and auto-generated MultiIndex column labels with empty strings.
+
+        Each tuple in ``columns`` holds one column's label per header level.
+        A label is replaced when it is ``None``, is blank once stringified
+        and stripped, or is exactly ``Unnamed: <digits>_level_<digits>``;
+        every other label is returned unchanged.
+        """
+
+        def _is_generated_unnamed(level: Hashable) -> bool:
+            # Require the exact 'Unnamed: {col}_level_{lvl}' shape so that
+            # near-misses like 'Unnamed: _level_5' or 'Unnamed: 1_2_level_3'
+            # are kept as genuine labels instead of being blanked.
+            if not (isinstance(level, str) and level.startswith("Unnamed: ")):
+                return False
+            pos, sep, lvl = level.removeprefix("Unnamed: ").partition("_level_")
+            return bool(sep) and pos.isdigit() and lvl.isdigit()
+
+        return [
+            tuple(
+                ""
+                if label is None
+                or str(label).strip() == ""
+                or _is_generated_unnamed(label)
+                else label
+                for label in col
+            )
+            for col in columns
+        ]
+
     @final
     def _get_empty_meta(
         self, columns: Sequence[HashableT], dtype: DtypeArg | None = None
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
@@ -11,6 +11,7 @@
 
 from pandas.errors import ParserError
 
+import pandas as pd
 from pandas import (
     DataFrame,
     Index,
@@ -566,11 +567,8 @@ def test_multi_index_unnamed(all_parsers, index_col, columns):
     if columns is None:
         columns = ["", "", ""]
 
-    for i, col in enumerate(columns):
-        if not col:  # Unnamed.
-            col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
-
-        exp_columns.append(col)
+    # After GH#59560: keep empty cells as "", do not auto-fill "Unnamed: ..."
+    exp_columns = [col or "" for col in (columns or [])]
 
     columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
     expected = DataFrame([[2, 3], [4, 5]], columns=columns)
@@ -725,3 +723,40 @@ def test_usecols_no_header_pyarrow(pyarrow_parser_only):
     )
     expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("engine", ["c", "python"])
+def test_multiindex_empty_vals_cleaned(tmp_path, engine):
+    # GH#59560: "" labels in MultiIndex columns and index must round-trip
+    # through to_csv -> read_csv instead of coming back as placeholders.
+    col_labels = [("a", ""), ("b", ""), ("b", "b2")]
+    row_labels = [("i1", ""), ("i2", "")]
+    expected = DataFrame(
+        np.arange(6).reshape((2, 3)),
+        columns=MultiIndex.from_tuples(col_labels),
+        index=MultiIndex.from_tuples(row_labels),
+    )
+    csv_file = tmp_path / "file.csv"
+    expected.to_csv(csv_file)
+
+    # keep_default_na=False: presumably stops "" cells being read as NaN (confirm)
+    result = pd.read_csv(
+        csv_file, header=[0, 1], index_col=[0, 1], engine=engine, keep_default_na=False
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_multiindex_real_unnamed_label_preserved(tmp_path):
+    # GH#59560 follow-up: a user-supplied label that merely starts with
+    # "Unnamed: " must survive the to_csv -> read_csv round trip untouched.
+    labels = [("a", "Unnamed: revenue"), ("a", "sales")]
+    csv_file = tmp_path / "file.csv"
+    frame = DataFrame(
+        np.arange(4).reshape((2, 2)), columns=MultiIndex.from_tuples(labels)
+    )
+    frame.to_csv(csv_file)
+
+    roundtrip = pd.read_csv(csv_file, header=[0, 1], index_col=0)
+
+    tm.assert_index_equal(roundtrip.columns, MultiIndex.from_tuples(labels))