Skip to content

Commit aa3940e

Browse files
committed
BUG: Fix inconsistency when read_csv reads MultiIndex with empty values (#59560)
1 parent 1863adb commit aa3940e

File tree

3 files changed

+79
-6
lines changed

3 files changed

+79
-6
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,7 @@ I/O
10611061
- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
10621062
- Fix bug in ``on_bad_lines`` callable when returning too many fields: now emits
10631063
``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`)
1064+
- Bug in :func:`read_csv` where empty entries in :class:`MultiIndex` columns were filled with ``"Unnamed: x_level_y"`` instead of remaining empty (:issue:`59560`)
10641065
- Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.columns` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
10651066
- Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
10661067
- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)

pandas/io/parsers/base_parser.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,10 @@ def _extract_multi_indexer_columns(
237237
def extract(r):
238238
return tuple(r[i] for i in range(field_count) if i not in sic)
239239

240-
columns = list(zip(*(extract(r) for r in header), strict=True))
240+
columns = list(zip(*(extract(r) for r in header)))
241+
# Clean the columns by removing placeholders.
242+
columns = self._clean_column_levels(columns)
243+
241244
names = columns.copy()
242245
for single_ic in sorted(ic):
243246
names.insert(single_ic, single_ic)
@@ -700,6 +703,40 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis
700703

701704
return index_names, columns, index_col
702705

706+
@final
def _clean_column_levels(
    self, columns: list[tuple[Hashable, ...]]
) -> list[tuple[Hashable, ...]]:
    """
    Blank out empty and auto-generated placeholder labels in MultiIndex columns.

    Each level value of each column tuple is replaced by ``""`` when it is
    ``None``, when its string form is empty or whitespace-only, or when it is
    an auto-generated placeholder of the exact form ``'Unnamed: x_level_y'``
    (``x`` and ``y`` being non-negative integers, e.g. ``'Unnamed: 2_level_1'``).
    Every other value is passed through unchanged.

    Parameters
    ----------
    columns : list of tuple of Hashable
        One tuple per column, holding that column's label at each header level.

    Returns
    -------
    list of tuple of Hashable
        New list of new tuples with the same shape as ``columns``; the input
        is not modified.
    """
    # Local import: the module-level import block is outside this block, so
    # keep the dependency self-contained.
    import re

    # Match the placeholder in full. A looser check (strip "_level_" and "_",
    # then isdigit()) also swallows genuine user labels such as
    # "Unnamed: _level_1" or "Unnamed: 1_2_level_3".
    generated_unnamed = re.compile(r"Unnamed: [0-9]+_level_[0-9]+")

    def _is_blank(level: Hashable) -> bool:
        # True when the level value should be rendered as an empty label.
        if level is None or str(level).strip() == "":
            return True
        return (
            isinstance(level, str)
            and generated_unnamed.fullmatch(level) is not None
        )

    return [
        tuple("" if _is_blank(level) else level for level in col)
        for col in columns
    ]
739+
703740
@final
704741
def _get_empty_meta(
705742
self, columns: Sequence[HashableT], dtype: DtypeArg | None = None

pandas/tests/io/parser/test_header.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from pandas.errors import ParserError
1313

14+
import pandas as pd
1415
from pandas import (
1516
DataFrame,
1617
Index,
@@ -566,11 +567,8 @@ def test_multi_index_unnamed(all_parsers, index_col, columns):
566567
if columns is None:
567568
columns = ["", "", ""]
568569

569-
for i, col in enumerate(columns):
570-
if not col: # Unnamed.
571-
col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
572-
573-
exp_columns.append(col)
570+
# After GH#59560: keep empty cells as "", do not auto-fill "Unnamed: ..."
571+
exp_columns = [col or "" for col in (columns or [])]
574572

575573
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
576574
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
@@ -725,3 +723,40 @@ def test_usecols_no_header_pyarrow(pyarrow_parser_only):
725723
)
726724
expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
727725
tm.assert_frame_equal(result, expected)
726+
727+
728+
@pytest.mark.parametrize("engine", ["c", "python"])
def test_multiindex_empty_vals_cleaned(tmp_path, engine):
    # GH#59560 - a frame whose MultiIndex columns/index contain empty labels
    # must survive a to_csv -> read_csv round trip unchanged.
    csv_file = tmp_path / "file.csv"

    col_labels = MultiIndex.from_tuples([("a", ""), ("b", ""), ("b", "b2")])
    row_labels = MultiIndex.from_tuples([("i1", ""), ("i2", "")])
    expected = DataFrame(
        np.arange(6).reshape((2, 3)),
        columns=col_labels,
        index=row_labels,
    )
    expected.to_csv(csv_file)

    # keep_default_na=False so that "" is read back as "" rather than NaN.
    read_kwargs = {
        "header": [0, 1],
        "index_col": [0, 1],
        "engine": engine,
        "keep_default_na": False,
    }
    result = pd.read_csv(csv_file, **read_kwargs)

    tm.assert_frame_equal(result, expected)
748+
749+
750+
def test_multiindex_real_unnamed_label_preserved(tmp_path):
    # GH#59560 follow-up: a user label that merely starts with "Unnamed:"
    # is real data and must come back from read_csv untouched.
    csv_file = tmp_path / "file.csv"
    label_tuples = [("a", "Unnamed: revenue"), ("a", "sales")]

    frame = DataFrame(
        np.arange(4).reshape((2, 2)),
        columns=MultiIndex.from_tuples(label_tuples),
    )
    frame.to_csv(csv_file)

    round_tripped = pd.read_csv(csv_file, header=[0, 1], index_col=0)

    expected = MultiIndex.from_tuples(label_tuples)
    tm.assert_index_equal(round_tripped.columns, expected)

0 commit comments

Comments
 (0)