From a8bcf6851feff078a9520f3f8abe3e090c0c24a0 Mon Sep 17 00:00:00 2001 From: parthava-adabala Date: Sat, 8 Nov 2025 20:54:06 -0500 Subject: [PATCH 1/4] BUG: Fix MultiIndex construction in pd.concat() with Int64Dtype NA --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/concat.py | 5 +++-- .../tests/indexes/multi/test_constructors.py | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 65982ecdb810c..6beec6560bd3c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1140,6 +1140,7 @@ Indexing - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) - Bug in :meth:`Index.equals` when comparing between :class:`Series` with string dtype :class:`Index` (:issue:`61099`) - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) +- Bug in :func:`pandas.concat` incorrectly constructing the :class:`MultiIndex` when an inner level contained :obj:`pandas.NA` with :class:`pandas.Int64Dtype`, causing a :exc:`KeyError` on lookup (:issue:`62903`) - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) - Bug in :meth:`Series.__setitem__` when assigning boolean series with boolean indexer will raise ``LossySetitemError`` (:issue:`57338`) - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 7d5d6bac9db41..2083cd9877b0b 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -990,8 +990,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde new_levels.extend(new_index.levels) new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes) else: - new_levels.append(new_index.unique()) - single_codes = new_index.unique().get_indexer(new_index) + levels_for_index = new_index.unique().dropna() + new_levels.append(levels_for_index) + single_codes = levels_for_index.get_indexer(new_index) new_codes.append(np.tile(single_codes, kpieces)) if len(new_names) < len(new_levels): diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 6b461fcf3920d..c23b02c840816 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -343,6 +343,24 @@ def test_from_arrays_respects_none_names(): tm.assert_index_equal(result, expected) +def test_concat_int64dtype_na_multiindex_lookup(): + levels1 = ['a', 'b'] + levels2 = pd.Series([1, 2, pd.NA], dtype=pd.Int64Dtype()) + index1 = MultiIndex.from_product([levels1, levels2], names=['one', 'two']) + series1 = pd.Series([f'{i1}-{i2}' for i1, i2 in index1], index=index1) + series2 = pd.concat( + [series1.loc[i1] for i1 in levels1], + keys=levels1, + names=['one'] + ) + lookup_key = ('a', pd.NA) + result = series2.at[lookup_key] + assert result == 'a-' + level_two = series2.index.levels[1] + codes_two = series2.index.codes[1] + assert level_two.hasnans is False + assert codes_two[2] == -1 + assert codes_two[5] == -1 # ---------------------------------------------------------------------------- # from_tuples From 9b28888b9cea5b70dbfbe45dd79e9637c3b925ce Mon Sep 17 00:00:00 2001 From: parthava-adabala Date: Sat, 8 Nov 2025 20:58:23 -0500 Subject: [PATCH 2/4] BUG: Fix MultiIndex construction in pd.concat() with Int64Dtype NA --- doc/source/whatsnew/v3.0.0.rst | 2 +- .../tests/indexes/multi/test_constructors.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6beec6560bd3c..af4e2d0ec3f94 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1132,6 +1132,7 @@ Interval Indexing ^^^^^^^^ +- Bug in :func:`pandas.concat` incorrectly constructing the :class:`MultiIndex` when an inner level contained :obj:`pandas.NA` with :class:`pandas.Int64Dtype`, causing a :exc:`KeyError` on lookup (:issue:`62903`) - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`) - Bug in :meth:`DataFrame.__setitem__` on an empty :class:`DataFrame` with a tuple corrupting the frame (:issue:`54385`) @@ -1140,7 +1141,6 @@ Indexing - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) - Bug in :meth:`Index.equals` when comparing between :class:`Series` with string dtype :class:`Index` (:issue:`61099`) - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) -- Bug in :func:`pandas.concat` incorrectly constructing the :class:`MultiIndex` when an inner level contained :obj:`pandas.NA` with :class:`pandas.Int64Dtype`, causing a :exc:`KeyError` on lookup (:issue:`62903`) - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) - Bug in :meth:`Series.__setitem__` when assigning boolean series with boolean indexer will raise ``LossySetitemError`` (:issue:`57338`) - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index c23b02c840816..936d6c304b6fa 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -343,25 +343,25 @@ def test_from_arrays_respects_none_names(): tm.assert_index_equal(result, expected) + def test_concat_int64dtype_na_multiindex_lookup(): - levels1 = ['a', 'b'] - levels2 = pd.Series([1, 2, pd.NA], dtype=pd.Int64Dtype()) - index1 = MultiIndex.from_product([levels1, levels2], names=['one', 'two']) - series1 = pd.Series([f'{i1}-{i2}' for i1, i2 in index1], index=index1) + levels1 = ["a", "b"] + levels2 = Series([1, 2, pd.NA], dtype=pd.Int64Dtype()) + index1 = MultiIndex.from_product([levels1, levels2], names=["one", "two"]) + series1 = Series([f"{i1}-{i2}" for i1, i2 in index1], index=index1) series2 = pd.concat( - [series1.loc[i1] for i1 in levels1], - keys=levels1, - names=['one'] + [series1.loc[i1] for i1 in levels1], keys=levels1, names=["one"] ) - lookup_key = ('a', pd.NA) + lookup_key = ("a", pd.NA) result = series2.at[lookup_key] - assert result == 'a-' + assert result == "a-" level_two = series2.index.levels[1] codes_two = series2.index.codes[1] assert level_two.hasnans is False assert codes_two[2] == -1 assert codes_two[5] == -1 + # ---------------------------------------------------------------------------- # from_tuples # ---------------------------------------------------------------------------- From 142c6405e16e075aee4d9475c98893d6d70348b8 Mon Sep 17 00:00:00 2001 From: parthava-adabala Date: Sun, 9 Nov 2025 15:34:06 -0500 Subject: [PATCH 3/4] revert previous changes --- doc/source/whatsnew/v3.0.0.rst | 1 - pandas/core/reshape/concat.py | 5 ++--- .../tests/indexes/multi/test_constructors.py | 18 ------------------ 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index af4e2d0ec3f94..65982ecdb810c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1132,7 +1132,6 @@ Interval Indexing ^^^^^^^^ -- Bug in :func:`pandas.concat` incorrectly constructing the :class:`MultiIndex` when an inner level contained :obj:`pandas.NA` with :class:`pandas.Int64Dtype`, causing a :exc:`KeyError` on lookup (:issue:`62903`) - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`) - Bug in :meth:`DataFrame.__setitem__` on an empty :class:`DataFrame` with a tuple corrupting the frame (:issue:`54385`) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 2083cd9877b0b..7d5d6bac9db41 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -990,9 +990,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde new_levels.extend(new_index.levels) new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes) else: - levels_for_index = new_index.unique().dropna() - new_levels.append(levels_for_index) - single_codes = levels_for_index.get_indexer(new_index) + new_levels.append(new_index.unique()) + single_codes = new_index.unique().get_indexer(new_index) new_codes.append(np.tile(single_codes, kpieces)) if len(new_names) < len(new_levels): diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 936d6c304b6fa..6b461fcf3920d 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -344,24 +344,6 @@ def test_from_arrays_respects_none_names(): tm.assert_index_equal(result, expected) -def test_concat_int64dtype_na_multiindex_lookup(): - levels1 = ["a", "b"] - levels2 = Series([1, 2, pd.NA], dtype=pd.Int64Dtype()) - index1 = MultiIndex.from_product([levels1, levels2], names=["one", "two"]) - series1 = Series([f"{i1}-{i2}" for i1, i2 in index1], index=index1) - series2 = pd.concat( - [series1.loc[i1] for i1 in levels1], keys=levels1, names=["one"] - ) - lookup_key = ("a", pd.NA) - result = series2.at[lookup_key] - assert result == "a-" - level_two = series2.index.levels[1] - codes_two = series2.index.codes[1] - assert level_two.hasnans is False - assert codes_two[2] == -1 - assert codes_two[5] == -1 - - # ---------------------------------------------------------------------------- # from_tuples # ---------------------------------------------------------------------------- From 5d8009d84e1c03d4cd952ddb159f5ffc54b1eaec Mon Sep 17 00:00:00 2001 From: parthava-adabala Date: Sun, 9 Nov 2025 16:37:55 -0500 Subject: [PATCH 4/4] BUG: Fix MultiIndex.at lookup with pd.NA in Int64Dtype level --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/multi.py | 6 ++++++ pandas/tests/indexes/multi/test_indexing.py | 8 ++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 65982ecdb810c..af4e2d0ec3f94 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1132,6 +1132,7 @@ Interval Indexing ^^^^^^^^ +- Bug in :func:`pandas.concat` incorrectly constructing the :class:`MultiIndex` when an inner level contained :obj:`pandas.NA` with :class:`pandas.Int64Dtype`, causing a :exc:`KeyError` on lookup (:issue:`62903`) - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`) - Bug in :meth:`DataFrame.__setitem__` on an empty :class:`DataFrame` with a tuple corrupting the frame (:issue:`54385`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 43e6469e078f0..a818fb8bce65c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3325,6 +3325,12 @@ def _maybe_to_slice(loc): try: return self._engine.get_loc(key) except KeyError as err: + if any(isna(k) for k in key): + loc, _ = self.get_loc_level( + key, range(self.nlevels), drop_level=False + ) + if lib.is_integer(loc): + return loc raise KeyError(key) from err except TypeError: # e.g. test_partial_slicing_with_multiindex partial string slicing diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index f098690be2afa..45ab01165350a 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -710,6 +710,14 @@ def test_get_loc_nan(self, level, nulls_fixture): idx = MultiIndex.from_product(levels) assert idx.get_loc(tuple(key)) == 3 + def test_multiindex_at_lookup_with_na_key(self): + index = MultiIndex(levels=[[1, 2], [2, pd.NA]], codes=[[0, 1], [0, 1]]) + df = DataFrame({"a": [1, 2]}, index=index) + result = df.at[(2, pd.NA), "a"] + assert result == 2 + loc_result = df.loc[(2, pd.NA), "a"] + assert loc_result == 2 + def test_get_loc_missing_nan(self): # GH 8569 idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])