From 742a268b6b868d44bbf39da86566ab3de3cd233e Mon Sep 17 00:00:00 2001 From: Augustus Date: Thu, 4 Sep 2025 10:08:35 -0400 Subject: [PATCH 01/17] Activated test for metadata of merge operation. --- pandas/core/reshape/merge.py | 17 ++++++++++- pandas/tests/generic/test_finalize.py | 41 +++++++++++++++++++++------ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8786ce361c900..88f92a907c616 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1129,6 +1129,9 @@ def _reindex_and_concat( return result def get_result(self) -> DataFrame: + """ + Execute the merge. + """ if self.indicator: self.left, self.right = self._indicator_pre_merge(self.left, self.right) @@ -1148,7 +1151,7 @@ def get_result(self) -> DataFrame: self._maybe_restore_index_levels(result) return result.__finalize__( - types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge" + self.left, method="merge" ) @final @@ -1167,6 +1170,12 @@ def _indicator_name(self) -> str | None: def _indicator_pre_merge( self, left: DataFrame, right: DataFrame ) -> tuple[DataFrame, DataFrame]: + """ + Add one indicator column to each of the left and right inputs to a merge operation. + + These columns are used to produce another column in the output of the merge, indicating + for each row of the output whether it was produced using the left, right or both inputs. + """ columns = left.columns.union(right.columns) for i in ["_left_indicator", "_right_indicator"]: @@ -1193,6 +1202,12 @@ def _indicator_pre_merge( @final def _indicator_post_merge(self, result: DataFrame) -> DataFrame: + """ + Add an indicator column to the merge result. + + This column indicates for each row of the output whether it was produced using the left, + right or both inputs. 
+ """ result["_left_indicator"] = result["_left_indicator"].fillna(0) result["_right_indicator"] = result["_right_indicator"].fillna(0) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4b841b54c488b..f08b6c438dcea 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -148,14 +148,7 @@ operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]), ), (pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)), - pytest.param( - ( - pd.DataFrame, - frame_data, - operator.methodcaller("merge", pd.DataFrame({"A": [1]})), - ), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_data, operator.methodcaller("merge", pd.DataFrame({"A": [1]}))), (pd.DataFrame, frame_data, operator.methodcaller("round", 2)), (pd.DataFrame, frame_data, operator.methodcaller("corr")), pytest.param( @@ -675,3 +668,35 @@ def test_finalize_frame_series_name(): df = pd.DataFrame({"name": [1, 2]}) result = pd.Series([1, 2]).__finalize__(df) assert result.name is None + +# ---------------------------------------------------------------------------- +# Merge tests + +@pytest.mark.parametrize(["allow_duplication_on_left", "allow_duplication_on_right"], +[ + (False, False), + (False, True), + (True, False), + (True, True) +]) +def test_merge_sets_duplication_allowance_flag(allow_duplication_on_left, allow_duplication_on_right): + """ + Check that pandas.merge correctly sets the allow_duplicate_labels flag + on its result. + + If one or both of the arguments to merge has its flag set to False, + then the result of merge should have its flag set to False. + Otherwise, the result should have its flag set to True. 
+ """ + # Arrange + left = pd.DataFrame({"test": [1]}) + left.set_flags(allows_duplicate_labels=allow_duplication_on_left) + right = pd.DataFrame({"test": [1]}) + right.set_flags(allows_duplicate_labels=allow_duplication_on_right) + + # Act + result = left.merge(right, how="inner", on="test") + + # Assert + expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right + assert result.flags.allows_duplicate_labels == expected_duplication_allowance \ No newline at end of file From a12bdbdf8e4ec1c4670f1477d9359569e734e0a7 Mon Sep 17 00:00:00 2001 From: Augustus Date: Thu, 4 Sep 2025 12:58:21 -0400 Subject: [PATCH 02/17] Fixed error in test for merge result flags. --- pandas/tests/generic/test_finalize.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index f08b6c438dcea..fa909d42ac13a 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -670,7 +670,7 @@ def test_finalize_frame_series_name(): assert result.name is None # ---------------------------------------------------------------------------- -# Merge tests +# Tests for merge @pytest.mark.parametrize(["allow_duplication_on_left", "allow_duplication_on_right"], [ @@ -689,14 +689,29 @@ def test_merge_sets_duplication_allowance_flag(allow_duplication_on_left, allow_ Otherwise, the result should have its flag set to True. 
""" # Arrange + left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_left) + right = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_right) + + # Act + result = left.merge(right, how="inner", on="test") + + # Assert + expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right + assert result.flags.allows_duplicate_labels == expected_duplication_allowance + +def test_merge_collects_metadata_from_both_inputs(): + """ + Check that pandas.merge sets the metadata of its result by merging the metadata from both + of its inputs. + """ + # Arrange left = pd.DataFrame({"test": [1]}) - left.set_flags(allows_duplicate_labels=allow_duplication_on_left) + left.attrs = {"a": 2} right = pd.DataFrame({"test": [1]}) - right.set_flags(allows_duplicate_labels=allow_duplication_on_right) + right.attrs = {"b": 3} # Act result = left.merge(right, how="inner", on="test") # Assert - expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right - assert result.flags.allows_duplicate_labels == expected_duplication_allowance \ No newline at end of file + assert result.attrs == {"a": 2, "b": 3} \ No newline at end of file From d22b90afb849f70f285946e0cae4d3301c1c1a69 Mon Sep 17 00:00:00 2001 From: Augustus Date: Thu, 4 Sep 2025 13:07:47 -0400 Subject: [PATCH 03/17] Tested that merge collects metadata from only its left argument. 
--- pandas/tests/generic/test_finalize.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index fa909d42ac13a..351a00aef258c 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -699,14 +699,16 @@ def test_merge_sets_duplication_allowance_flag(allow_duplication_on_left, allow_ expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance -def test_merge_collects_metadata_from_both_inputs(): +def test_merge_collects_metadata_from_only_its_left_input(): """ - Check that pandas.merge sets the metadata of its result by merging the metadata from both - of its inputs. + Check that pandas.merge sets the metadata of its result to a copy of the metadata from its + left input. """ # Arrange left = pd.DataFrame({"test": [1]}) - left.attrs = {"a": 2} + metadata = {"a": 2} + left.attrs = metadata + right = pd.DataFrame({"test": [1]}) right.attrs = {"b": 3} @@ -714,4 +716,7 @@ def test_merge_collects_metadata_from_both_inputs(): result = left.merge(right, how="inner", on="test") # Assert - assert result.attrs == {"a": 2, "b": 3} \ No newline at end of file + assert result.attrs == metadata + # Check that the metadata from the left argument is copied, rather than shared. + left.attrs = {"c": 4} + assert result.attrs == metadata \ No newline at end of file From 41b3571f6dc49f9a886944c199141565fc678140 Mon Sep 17 00:00:00 2001 From: Augustus Date: Thu, 4 Sep 2025 13:48:13 -0400 Subject: [PATCH 04/17] Added test to check whether merge_asof correctly sets the allow duplicates flag. 
--- pandas/tests/generic/test_finalize.py | 49 +++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 351a00aef258c..4cb2f6016381a 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -679,21 +679,58 @@ def test_finalize_frame_series_name(): (True, False), (True, True) ]) -def test_merge_sets_duplication_allowance_flag(allow_duplication_on_left, allow_duplication_on_right): +@pytest.mark.parametrize(["how"], [ + ("left",), + ("right",), + ("inner",), + ("outer",), + ("left_anti",), + ("right_anti",), + ("cross",), +]) +def test_merge_sets_duplication_allowance_flag(how, allow_duplication_on_left, allow_duplication_on_right): """ - Check that pandas.merge correctly sets the allow_duplicate_labels flag + Check that DataFrame.merge correctly sets the allow_duplicate_labels flag on its result. - If one or both of the arguments to merge has its flag set to False, - then the result of merge should have its flag set to False. - Otherwise, the result should have its flag set to True. + The flag on the result should be set to true if and only if both arguments to merge + have their flags set to True. 
""" # Arrange left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_left) right = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_right) # Act - result = left.merge(right, how="inner", on="test") + if not how == "cross": + result = left.merge(right, how=how, on="test") + else: + result = left.merge(right, how=how) + + # Assert + expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right + assert result.flags.allows_duplicate_labels == expected_duplication_allowance + +@pytest.mark.parametrize(["allow_duplication_on_left", "allow_duplication_on_right"], +[ + (False, False), + (False, True), + (True, False), + (True, True) +]) +def test_merge_asof_sets_duplication_allowance_flag(allow_duplication_on_left, allow_duplication_on_right): + """ + Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag + on its result. + + The flag on the result should be set to true if and only if both arguments to merge_asof + have their flags set to True. + """ + # Arrange + left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_left) + right = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_right) + + # Act + result = pd.merge_asof(left, right) # Assert expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right From 58b3c2add804981ae019a0c7523d492bf2693912 Mon Sep 17 00:00:00 2001 From: Augustus Date: Fri, 5 Sep 2025 13:14:40 -0400 Subject: [PATCH 05/17] Added bug fix description to documentation. 
--- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ffa65032e6aae..1631dc72197fb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1060,6 +1060,7 @@ Reshaping - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) - Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`) - Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) +- Bug in :meth:`DataFrame.merge` where the result of a merge does not contain any metadata or flag information from the inputs to the merge. (:issue:`28283`) Sparse ^^^^^^ From 11ebac865d52e149bdf343078cb303edea234e21 Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Tue, 9 Sep 2025 12:08:40 -0400 Subject: [PATCH 06/17] Added test to check metadata handling for pandas.merge. --- pandas/core/generic.py | 5 ++- pandas/core/reshape/merge.py | 3 +- pandas/tests/generic/test_finalize.py | 61 +++++++++++++++++++++++---- pandas/tests/generic/test_frame.py | 3 +- 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6557388d88f20..f9f7ded86ab5d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6096,7 +6096,10 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: Parameters ---------- other : the object from which to get the attributes that we are going - to propagate + to propagate. 
If ``other`` has an ``input_objs`` attribute, then this attribute + must contain an iterable of objects, each with an ``attrs`` attribute, in which + case, each such ``attrs`` instance must be a dictionary that is equal to all of + the others. method : str, optional A passed method name providing context on where ``__finalize__`` was called. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 88f92a907c616..0fa51f01e7f6e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1151,7 +1151,8 @@ def get_result(self) -> DataFrame: self._maybe_restore_index_levels(result) return result.__finalize__( - self.left, method="merge" + types.SimpleNamespace(input_objs=[self.left, self.right]), + method="merge" ) @final diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4cb2f6016381a..7d56bbd4578da 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -148,7 +148,6 @@ operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]), ), (pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)), - (pd.DataFrame, frame_data, operator.methodcaller("merge", pd.DataFrame({"A": [1]}))), (pd.DataFrame, frame_data, operator.methodcaller("round", 2)), (pd.DataFrame, frame_data, operator.methodcaller("corr")), pytest.param( @@ -736,16 +735,34 @@ def test_merge_asof_sets_duplication_allowance_flag(allow_duplication_on_left, a expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance -def test_merge_collects_metadata_from_only_its_left_input(): +def test_merge_propagates_metadata_from_equal_input_metadata(): """ - Check that pandas.merge sets the metadata of its result to a copy of the metadata from its - left input. 
+ Check that pandas.merge sets the metadata of its result to a deep copy of the metadata from + its left input, if the metadata from both inputs are equal. """ # Arrange - left = pd.DataFrame({"test": [1]}) metadata = {"a": 2} + left = pd.DataFrame({"test": [1]}) left.attrs = metadata + right = pd.DataFrame({"test": [1]}) + right.attrs = metadata.copy() + # Act + result = left.merge(right, how="inner", on="test") + + # Assert + assert result.attrs == metadata + left.attrs = {"b": 3} + assert result.attrs == metadata + +def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): + """ + Check that the metadata for the result of pandas.merge is empty if the metadata + for both inputs to pandas.merge are not equal. + """ + # Arrange + left = pd.DataFrame({"test": [1]}) + left.attrs = {"a": 2} right = pd.DataFrame({"test": [1]}) right.attrs = {"b": 3} @@ -753,7 +770,33 @@ def test_merge_collects_metadata_from_only_its_left_input(): result = left.merge(right, how="inner", on="test") # Assert - assert result.attrs == metadata - # Check that the metadata from the left argument is copied, rather than shared. - left.attrs = {"c": 4} - assert result.attrs == metadata \ No newline at end of file + assert result.attrs == {} + +no_metadata = pd.DataFrame({"test": [1]}) + +metadata = {"a": 2} +has_metadata = pd.DataFrame({"test": [1]}) +has_metadata.attrs = metadata + +@pytest.mark.parametrize(["left", "right", "expected"], + [(no_metadata, has_metadata, metadata), + (has_metadata, no_metadata, metadata), + (no_metadata, no_metadata, {})]) +def test_merge_propagates_metadata_if_one_input_has_no_metadata(left: pd.DataFrame, right: pd.DataFrame, expected: dict): + """ + Check that if the metadata for one input to pandas.merge is empty, the result + of merge has the same metadata as the other input. 
+ + (empty) (A) (A) (empty) (empty) (empty) + | | | | | | + --> merge <-- --> merge <-- --> merge <-- + | | | + (A) (A) (empty) + """ + # Arrange + + # Act + result = left.merge(right, how="inner", on="test") + + # Assert + assert result.attrs == expected diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index e927c17eceb76..01e8addacfe4e 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -1,5 +1,6 @@ from copy import deepcopy from operator import methodcaller +from typing import Literal import numpy as np import pytest @@ -77,7 +78,7 @@ def test_metadata_propagation_indiv(self, monkeypatch): # merging with override # GH 6923 - def finalize(self, other, method=None, **kwargs): + def finalize(self: DataFrame, other: DataFrame, method: Literal["merge", "concat"] | None = None, **kwargs): for name in self._metadata: if method == "merge": left, right = other.input_objs From 438136481088552920a36bef22d520f13b2ab5e9 Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:38:20 -0400 Subject: [PATCH 07/17] Modified tests for __finalize__ to respect documented merge behavior. --- pandas/core/generic.py | 3 + pandas/core/reshape/merge.py | 4 + pandas/tests/generic/test_finalize.py | 101 ++++++++++++++++---------- 3 files changed, 70 insertions(+), 38 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1a7f413b8d176..ace5a27c43f31 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6093,6 +6093,9 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: """ Propagate metadata from other to self. + This is the default implementation. Subclasses may override this method to + implement their own metadata handling. 
+ Parameters ---------- other : the object from which to get the attributes that we are going diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0fa51f01e7f6e..61d0b5d3cea92 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1138,6 +1138,8 @@ def get_result(self) -> DataFrame: join_index, left_indexer, right_indexer = self._get_join_info() result = self._reindex_and_concat(join_index, left_indexer, right_indexer) + + # Is this call to __finalize__ really necessary? result = result.__finalize__( types.SimpleNamespace(input_objs=[self.left, self.right]), method=self._merge_type, @@ -1150,6 +1152,8 @@ def get_result(self) -> DataFrame: self._maybe_restore_index_levels(result) + # __finalize is responsible for copying the metadata from the inputs to merge + # to the result. return result.__finalize__( types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge" diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 65933607341c6..3faf3310e2034 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -671,13 +671,15 @@ def test_finalize_frame_series_name(): # ---------------------------------------------------------------------------- # Tests for merge -@pytest.mark.parametrize(["allow_duplication_on_left", "allow_duplication_on_right"], -[ - (False, False), - (False, True), - (True, False), - (True, True) -]) +@pytest.mark.parametrize( + ["allow_on_left", "allow_on_right"], + [ + (False, False), + (False, True), + (True, False), + (True, True) + ] +) @pytest.mark.parametrize(["how"], [ ("left",), ("right",), @@ -687,17 +689,25 @@ def test_finalize_frame_series_name(): ("right_anti",), ("cross",), ]) -def test_merge_sets_duplication_allowance_flag(how, allow_duplication_on_left, allow_duplication_on_right): +def test_merge_sets_duplication_allowance_flag( + how, + allow_on_left, + allow_on_right +): """ Check that 
DataFrame.merge correctly sets the allow_duplicate_labels flag on its result. - The flag on the result should be set to true if and only if both arguments to merge - have their flags set to True. + The flag on the result should be set to true if and only if both arguments + to merge have their flags set to True. """ # Arrange - left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_left) - right = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_right) + left = pd.DataFrame({"test": [1]}).set_flags( + allows_duplicate_labels=allow_on_left + ) + right = pd.DataFrame({"test": [1]}).set_flags( + allows_duplicate_labels=allow_on_right + ) # Act if not how == "cross": @@ -706,39 +716,48 @@ def test_merge_sets_duplication_allowance_flag(how, allow_duplication_on_left, a result = left.merge(right, how=how) # Assert - expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right + expected_duplication_allowance = allow_on_left and allow_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance -@pytest.mark.parametrize(["allow_duplication_on_left", "allow_duplication_on_right"], -[ - (False, False), - (False, True), - (True, False), - (True, True) -]) -def test_merge_asof_sets_duplication_allowance_flag(allow_duplication_on_left, allow_duplication_on_right): +@pytest.mark.parametrize( + ["allow_on_left", "allow_on_right"], + [ + (False, False), + (False, True), + (True, False), + (True, True) + ] +) +def test_merge_asof_sets_duplication_allowance_flag( + allow_on_left, + allow_on_right +): """ Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag on its result. - The flag on the result should be set to true if and only if both arguments to merge_asof - have their flags set to True. + The flag on the result should be set to true if and only if both arguments + to merge_asof have their flags set to True. 
""" # Arrange - left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_left) - right = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_duplication_on_right) + left = pd.DataFrame({"test": [1]}).set_flags( + allows_duplicate_labels=allow_on_left + ) + right = pd.DataFrame({"test": [1]}).set_flags( + allows_duplicate_labels=allow_on_right + ) # Act result = pd.merge_asof(left, right) # Assert - expected_duplication_allowance = allow_duplication_on_left and allow_duplication_on_right + expected_duplication_allowance = allow_on_left and allow_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance def test_merge_propagates_metadata_from_equal_input_metadata(): """ - Check that pandas.merge sets the metadata of its result to a deep copy of the metadata from - its left input, if the metadata from both inputs are equal. + Check that pandas.merge sets the metadata of its result to a deep copy of + the metadata from its left input, if the metadata from both inputs are equal. """ # Arrange metadata = {"a": 2} @@ -757,8 +776,8 @@ def test_merge_propagates_metadata_from_equal_input_metadata(): def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): """ - Check that the metadata for the result of pandas.merge is empty if the metadata - for both inputs to pandas.merge are not equal. + Check that the metadata for the result of pandas.merge is empty if the + metadata for both inputs to pandas.merge are not equal. 
""" # Arrange left = pd.DataFrame({"test": [1]}) @@ -774,15 +793,21 @@ def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): no_metadata = pd.DataFrame({"test": [1]}) -metadata = {"a": 2} has_metadata = pd.DataFrame({"test": [1]}) -has_metadata.attrs = metadata +has_metadata.attrs = {"a": 2} -@pytest.mark.parametrize(["left", "right", "expected"], - [(no_metadata, has_metadata, metadata), - (has_metadata, no_metadata, metadata), - (no_metadata, no_metadata, {})], ids=["left-empty", "right-empty", "both-empty"]) -def test_merge_propagates_metadata_if_one_input_has_no_metadata(left: pd.DataFrame, right: pd.DataFrame, expected: dict): +@pytest.mark.parametrize( + ["left", "right", "expected"], + [(no_metadata, has_metadata, {}), + (has_metadata, no_metadata, {}), + (no_metadata, no_metadata, {})], + ids=["left-empty", "right-empty", "both-empty"] +) +def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata( + left: pd.DataFrame, + right: pd.DataFrame, + expected: dict +): """ Check that if the metadata for one input to pandas.merge is empty, the result of merge has the same metadata as the other input. @@ -791,7 +816,7 @@ def test_merge_propagates_metadata_if_one_input_has_no_metadata(left: pd.DataFra | | | | | | --> merge <-- --> merge <-- --> merge <-- | | | - (A) (A) (empty) + (empty) (empty) (empty) """ # Arrange From bba9a1307e1086fa0c387b0e6d5fa5bb5575b828 Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Wed, 17 Sep 2025 10:51:58 -0400 Subject: [PATCH 08/17] Added type annotations to test method parameters. 
--- pandas/tests/generic/test_finalize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 3faf3310e2034..aabbc82973124 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -690,9 +690,9 @@ def test_finalize_frame_series_name(): ("cross",), ]) def test_merge_sets_duplication_allowance_flag( - how, - allow_on_left, - allow_on_right + how: str, + allow_on_left: bool, + allow_on_right: bool ): """ Check that DataFrame.merge correctly sets the allow_duplicate_labels flag @@ -729,8 +729,8 @@ def test_merge_sets_duplication_allowance_flag( ] ) def test_merge_asof_sets_duplication_allowance_flag( - allow_on_left, - allow_on_right + allow_on_left: bool, + allow_on_right: bool ): """ Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag From 0d52fffef184b4f00b8d04256adbf60740004077 Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:55:38 -0400 Subject: [PATCH 09/17] Fixed type issue for test_finalize. Added a little documentation. --- pandas/core/generic.py | 8 ++++---- pandas/core/reshape/merge.py | 8 +++++--- pandas/tests/generic/test_finalize.py | 4 +++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 29a070ff5108a..a1c3bbde3a2c4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6102,10 +6102,10 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: Parameters ---------- other : the object from which to get the attributes that we are going - to propagate. If ``other`` has an ``input_objs`` attribute, then this attribute - must contain an iterable of objects, each with an ``attrs`` attribute, in which - case, each such ``attrs`` instance must be a dictionary that is equal to all of - the others. + to propagate. 
If ``other`` has an ``input_objs`` attribute, then + this attribute must contain an iterable of objects, each with an + ``attrs`` attribute, in which case, each such ``attrs`` instance + must be a dictionary that is equal to all of the others. method : str, optional A passed method name providing context on where ``__finalize__`` was called. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index da621c05be442..816fa5310b25e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1176,10 +1176,12 @@ def _indicator_pre_merge( self, left: DataFrame, right: DataFrame ) -> tuple[DataFrame, DataFrame]: """ - Add one indicator column to each of the left and right inputs to a merge operation. + Add one indicator column to each of the left and right inputs to a + merge operation. - These columns are used to produce another column in the output of the merge, indicating - for each row of the output whether it was produced using the left, right or both inputs. + These columns are used to produce another column in the output of the + merge, indicating for each row of the output whether it was produced + using the left, right or both inputs. """ columns = left.columns.union(right.columns) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index aabbc82973124..efc07dcdf5b27 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -10,6 +10,8 @@ import pandas as pd +from pandas._typing import MergeHow + # TODO: # * Binary methods (mul, div, etc.) # * Binary outputs (align, etc.) 
@@ -690,7 +692,7 @@ def test_finalize_frame_series_name(): ("cross",), ]) def test_merge_sets_duplication_allowance_flag( - how: str, + how: MergeHow, allow_on_left: bool, allow_on_right: bool ): From 9f6813416544faffb22e498ab4fc7161691eedaa Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:54:53 -0400 Subject: [PATCH 10/17] Ran ruff formatter to correct some issues. --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/reshape/merge.py | 7 +- pandas/tests/generic/test_finalize.py | 101 +++++++++++--------------- pandas/tests/generic/test_frame.py | 7 +- 4 files changed, 53 insertions(+), 64 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 154777ede5ca9..495847978e871 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1093,8 +1093,8 @@ Reshaping - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) - Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`) -- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) - Bug in :meth:`DataFrame.merge` where the result of a merge does not contain any metadata or flag information from the inputs to the merge. (:issue:`28283`) +- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. 
(:issue:`61402`) - Bug in :meth:`DataFrame.merge` with :class:`CategoricalDtype` columns incorrectly raising ``RecursionError`` (:issue:`56376`) - Bug in :meth:`DataFrame.merge` with a ``float32`` index incorrectly casting the index to ``float64`` (:issue:`41626`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 816fa5310b25e..324a2bb810981 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1155,8 +1155,7 @@ def get_result(self) -> DataFrame: # __finalize is responsible for copying the metadata from the inputs to merge # to the result. return result.__finalize__( - types.SimpleNamespace(input_objs=[self.left, self.right]), - method="merge" + types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge" ) @final @@ -1212,8 +1211,8 @@ def _indicator_post_merge(self, result: DataFrame) -> DataFrame: """ Add an indicator column to the merge result. - This column indicates for each row of the output whether it was produced using the left, - right or both inputs. + This column indicates for each row of the output whether it was produced using + the left, right or both inputs. """ result["_left_indicator"] = result["_left_indicator"].fillna(0) result["_right_indicator"] = result["_right_indicator"].fillna(0) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index efc07dcdf5b27..a51e9a468f0d8 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -1,5 +1,4 @@ -""" -An exhaustive list of pandas methods exercising NDFrame.__finalize__. +"""An exhaustive list of pandas methods exercising NDFrame.__finalize__. 
""" import operator @@ -9,7 +8,6 @@ import pytest import pandas as pd - from pandas._typing import MergeHow # TODO: @@ -365,8 +363,7 @@ def idfn(x): m = xpr.search(str(x)) if m: return m.group(1) - else: - return str(x) + return str(x) @pytest.mark.parametrize("ndframe_method", _all_methods, ids=lambda x: idfn(x[-1])) @@ -580,7 +577,7 @@ def test_datetime_property(attr): @pytest.mark.parametrize( - "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] + "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"], ) def test_timedelta_property(attr): s = pd.Series(pd.timedelta_range("2000", periods=4)) @@ -624,7 +621,7 @@ def test_categorical_accessor(method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -643,7 +640,7 @@ def test_groupby_finalize(obj, method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -670,45 +667,40 @@ def test_finalize_frame_series_name(): result = pd.Series([1, 2]).__finalize__(df) assert result.name is None + # ---------------------------------------------------------------------------- # Tests for merge + @pytest.mark.parametrize( ["allow_on_left", "allow_on_right"], + [(False, False), (False, True), (True, False), (True, True)], +) +@pytest.mark.parametrize( + "how", [ - (False, False), - (False, True), - (True, False), - (True, True) - ] + "left", + "right", + "inner", + "outer", + "left_anti", + "right_anti", + "cross", + ], ) -@pytest.mark.parametrize(["how"], [ - ("left",), - ("right",), - ("inner",), - ("outer",), - ("left_anti",), - ("right_anti",), - ("cross",), -]) def test_merge_sets_duplication_allowance_flag( - how: MergeHow, - allow_on_left: bool, - allow_on_right: bool + 
how: MergeHow, allow_on_left: bool, allow_on_right: bool, ): - """ - Check that DataFrame.merge correctly sets the allow_duplicate_labels flag + """Check that DataFrame.merge correctly sets the allow_duplicate_labels flag on its result. The flag on the result should be set to true if and only if both arguments to merge have their flags set to True. """ # Arrange - left = pd.DataFrame({"test": [1]}).set_flags( - allows_duplicate_labels=allow_on_left - ) + left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left) right = pd.DataFrame({"test": [1]}).set_flags( - allows_duplicate_labels=allow_on_right + allows_duplicate_labels=allow_on_right, ) # Act @@ -721,32 +713,24 @@ def test_merge_sets_duplication_allowance_flag( expected_duplication_allowance = allow_on_left and allow_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance + @pytest.mark.parametrize( ["allow_on_left", "allow_on_right"], - [ - (False, False), - (False, True), - (True, False), - (True, True) - ] + [(False, False), (False, True), (True, False), (True, True)], ) def test_merge_asof_sets_duplication_allowance_flag( - allow_on_left: bool, - allow_on_right: bool + allow_on_left: bool, allow_on_right: bool, ): - """ - Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag + """Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag on its result. The flag on the result should be set to true if and only if both arguments to merge_asof have their flags set to True. 
""" # Arrange - left = pd.DataFrame({"test": [1]}).set_flags( - allows_duplicate_labels=allow_on_left - ) + left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left) right = pd.DataFrame({"test": [1]}).set_flags( - allows_duplicate_labels=allow_on_right + allows_duplicate_labels=allow_on_right, ) # Act @@ -756,9 +740,9 @@ def test_merge_asof_sets_duplication_allowance_flag( expected_duplication_allowance = allow_on_left and allow_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance + def test_merge_propagates_metadata_from_equal_input_metadata(): - """ - Check that pandas.merge sets the metadata of its result to a deep copy of + """Check that pandas.merge sets the metadata of its result to a deep copy of the metadata from its left input, if the metadata from both inputs are equal. """ # Arrange @@ -776,9 +760,9 @@ def test_merge_propagates_metadata_from_equal_input_metadata(): left.attrs = {"b": 3} assert result.attrs == metadata + def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): - """ - Check that the metadata for the result of pandas.merge is empty if the + """Check that the metadata for the result of pandas.merge is empty if the metadata for both inputs to pandas.merge are not equal. 
""" # Arrange @@ -793,25 +777,26 @@ def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): # Assert assert result.attrs == {} + no_metadata = pd.DataFrame({"test": [1]}) has_metadata = pd.DataFrame({"test": [1]}) has_metadata.attrs = {"a": 2} + @pytest.mark.parametrize( - ["left", "right", "expected"], - [(no_metadata, has_metadata, {}), + ["left", "right", "expected"], + [ + (no_metadata, has_metadata, {}), (has_metadata, no_metadata, {}), - (no_metadata, no_metadata, {})], - ids=["left-empty", "right-empty", "both-empty"] + (no_metadata, no_metadata, {}), + ], + ids=["left-empty", "right-empty", "both-empty"], ) def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata( - left: pd.DataFrame, - right: pd.DataFrame, - expected: dict + left: pd.DataFrame, right: pd.DataFrame, expected: dict, ): - """ - Check that if the metadata for one input to pandas.merge is empty, the result + """Check that if the metadata for one input to pandas.merge is empty, the result of merge has the same metadata as the other input. (empty) (A) (A) (empty) (empty) (empty) diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 01e8addacfe4e..c2d24cceeab0c 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -78,7 +78,12 @@ def test_metadata_propagation_indiv(self, monkeypatch): # merging with override # GH 6923 - def finalize(self: DataFrame, other: DataFrame, method: Literal["merge", "concat"] | None = None, **kwargs): + def finalize( + self: DataFrame, + other: DataFrame, + method: Literal["merge", "concat"] | None = None, + **kwargs, + ): for name in self._metadata: if method == "merge": left, right = other.input_objs From ff1aba5cdea9844f7220febcc654a444600f5e3e Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Fri, 19 Sep 2025 12:05:52 -0400 Subject: [PATCH 11/17] Fixed some cosmetic issues with pre-commit hooks. 
--- pandas/tests/generic/test_finalize.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index a51e9a468f0d8..f4a1646c13806 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -1,5 +1,4 @@ -"""An exhaustive list of pandas methods exercising NDFrame.__finalize__. -""" +"""An exhaustive list of pandas methods exercising NDFrame.__finalize__.""" import operator import re @@ -7,9 +6,10 @@ import numpy as np import pytest -import pandas as pd from pandas._typing import MergeHow +import pandas as pd + # TODO: # * Binary methods (mul, div, etc.) # * Binary outputs (align, etc.) @@ -577,7 +577,8 @@ def test_datetime_property(attr): @pytest.mark.parametrize( - "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"], + "attr", + ["days", "seconds", "microseconds", "nanoseconds", "components"], ) def test_timedelta_property(attr): s = pd.Series(pd.timedelta_range("2000", periods=4)) @@ -621,7 +622,8 @@ def test_categorical_accessor(method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", + [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -640,7 +642,8 @@ def test_groupby_finalize(obj, method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", + [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -689,7 +692,9 @@ def test_finalize_frame_series_name(): ], ) def test_merge_sets_duplication_allowance_flag( - how: MergeHow, allow_on_left: bool, allow_on_right: bool, + how: MergeHow, + allow_on_left: bool, + allow_on_right: bool, ): """Check that DataFrame.merge correctly sets the allow_duplicate_labels flag on its result. 
@@ -719,7 +724,8 @@ def test_merge_sets_duplication_allowance_flag( [(False, False), (False, True), (True, False), (True, True)], ) def test_merge_asof_sets_duplication_allowance_flag( - allow_on_left: bool, allow_on_right: bool, + allow_on_left: bool, + allow_on_right: bool, ): """Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag on its result. @@ -794,7 +800,9 @@ def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): ids=["left-empty", "right-empty", "both-empty"], ) def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata( - left: pd.DataFrame, right: pd.DataFrame, expected: dict, + left: pd.DataFrame, + right: pd.DataFrame, + expected: dict, ): """Check that if the metadata for one input to pandas.merge is empty, the result of merge has the same metadata as the other input. From 9b51d3e298290fe10f09c4e1a8c206bb5b51503a Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Mon, 6 Oct 2025 14:39:31 -0400 Subject: [PATCH 12/17] Removed docstrings on tests. 
--- doc/source/whatsnew/v3.0.0.rst | 1 - pandas/core/reshape/merge.py | 11 +------ pandas/tests/generic/test_finalize.py | 45 ++------------------------- 3 files changed, 4 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a3145aaa049bb..293f1cb6f5e79 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1156,7 +1156,6 @@ Reshaping - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) - Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`) -- Bug in :meth:`DataFrame.merge` where the result of a merge does not contain any metadata or flag information from the inputs to the merge. (:issue:`28283`) - Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) - Bug in :meth:`DataFrame.merge` with :class:`CategoricalDtype` columns incorrectly raising ``RecursionError`` (:issue:`56376`) - Bug in :meth:`DataFrame.merge` with a ``float32`` index incorrectly casting the index to ``float64`` (:issue:`41626`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f5a6f20bc04e1..604181214ad44 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1139,12 +1139,6 @@ def get_result(self) -> DataFrame: result = self._reindex_and_concat(join_index, left_indexer, right_indexer) - # Is this call to __finalize__ really necessary? 
- result = result.__finalize__( - types.SimpleNamespace(input_objs=[self.left, self.right]), - method=self._merge_type, - ) - if self.indicator: result = self._indicator_post_merge(result) @@ -1152,8 +1146,6 @@ def get_result(self) -> DataFrame: self._maybe_restore_index_levels(result) - # __finalize is responsible for copying the metadata from the inputs to merge - # to the result. return result.__finalize__( types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge" ) @@ -1175,8 +1167,7 @@ def _indicator_pre_merge( self, left: DataFrame, right: DataFrame ) -> tuple[DataFrame, DataFrame]: """ - Add one indicator column to each of the left and right inputs to a - merge operation. + Add one indicator column to each of the left and right inputs. These columns are used to produce another column in the output of the merge, indicating for each row of the output whether it was produced diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index f4a1646c13806..1e0f78b9d64c6 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -691,30 +691,24 @@ def test_finalize_frame_series_name(): "cross", ], ) -def test_merge_sets_duplication_allowance_flag( +def test_merge_correctly_sets_duplication_allowance_flag( how: MergeHow, allow_on_left: bool, allow_on_right: bool, ): """Check that DataFrame.merge correctly sets the allow_duplicate_labels flag on its result. - - The flag on the result should be set to true if and only if both arguments - to merge have their flags set to True. 
""" - # Arrange left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left) right = pd.DataFrame({"test": [1]}).set_flags( allows_duplicate_labels=allow_on_right, ) - # Act if not how == "cross": result = left.merge(right, how=how, on="test") else: result = left.merge(right, how=how) - # Assert expected_duplication_allowance = allow_on_left and allow_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance @@ -723,64 +717,44 @@ def test_merge_sets_duplication_allowance_flag( ["allow_on_left", "allow_on_right"], [(False, False), (False, True), (True, False), (True, True)], ) -def test_merge_asof_sets_duplication_allowance_flag( +def test_merge_asof_correctly_sets_duplication_allowance_flag( allow_on_left: bool, allow_on_right: bool, ): - """Check that pandas.merge_asof correctly sets the allow_duplicate_labels flag - on its result. - - The flag on the result should be set to true if and only if both arguments - to merge_asof have their flags set to True. - """ - # Arrange left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left) right = pd.DataFrame({"test": [1]}).set_flags( allows_duplicate_labels=allow_on_right, ) - # Act result = pd.merge_asof(left, right) - # Assert expected_duplication_allowance = allow_on_left and allow_on_right assert result.flags.allows_duplicate_labels == expected_duplication_allowance def test_merge_propagates_metadata_from_equal_input_metadata(): - """Check that pandas.merge sets the metadata of its result to a deep copy of - the metadata from its left input, if the metadata from both inputs are equal. - """ - # Arrange metadata = {"a": 2} left = pd.DataFrame({"test": [1]}) left.attrs = metadata right = pd.DataFrame({"test": [1]}) right.attrs = metadata.copy() - # Act result = left.merge(right, how="inner", on="test") - # Assert assert result.attrs == metadata + # Verify that merge deep-copies the attr dictionary. 
left.attrs = {"b": 3} assert result.attrs == metadata def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): - """Check that the metadata for the result of pandas.merge is empty if the - metadata for both inputs to pandas.merge are not equal. - """ - # Arrange left = pd.DataFrame({"test": [1]}) left.attrs = {"a": 2} right = pd.DataFrame({"test": [1]}) right.attrs = {"b": 3} - # Act result = left.merge(right, how="inner", on="test") - # Assert assert result.attrs == {} @@ -804,19 +778,6 @@ def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata( right: pd.DataFrame, expected: dict, ): - """Check that if the metadata for one input to pandas.merge is empty, the result - of merge has the same metadata as the other input. - - (empty) (A) (A) (empty) (empty) (empty) - | | | | | | - --> merge <-- --> merge <-- --> merge <-- - | | | - (empty) (empty) (empty) - """ - # Arrange - - # Act result = left.merge(right, how="inner", on="test") - # Assert assert result.attrs == expected From f2abf1f0f57e6110b4230b718d2a6b8b42b2f115 Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Tue, 7 Oct 2025 12:16:20 -0400 Subject: [PATCH 13/17] Resolved several issues in test_finalize. --- pandas/tests/generic/test_finalize.py | 55 ++++++++++++++++----------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 1e0f78b9d64c6..65642c558adc6 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -1,5 +1,8 @@ -"""An exhaustive list of pandas methods exercising NDFrame.__finalize__.""" +""" +An exhaustive list of pandas methods exercising NDFrame.__finalize__. 
+""" +from copy import deepcopy import operator import re @@ -363,7 +366,8 @@ def idfn(x): m = xpr.search(str(x)) if m: return m.group(1) - return str(x) + else: + return str(x) @pytest.mark.parametrize("ndframe_method", _all_methods, ids=lambda x: idfn(x[-1])) @@ -672,7 +676,7 @@ def test_finalize_frame_series_name(): # ---------------------------------------------------------------------------- -# Tests for merge +# Merge @pytest.mark.parametrize( @@ -696,9 +700,6 @@ def test_merge_correctly_sets_duplication_allowance_flag( allow_on_left: bool, allow_on_right: bool, ): - """Check that DataFrame.merge correctly sets the allow_duplicate_labels flag - on its result. - """ left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left) right = pd.DataFrame({"test": [1]}).set_flags( allows_duplicate_labels=allow_on_right, @@ -733,18 +734,21 @@ def test_merge_asof_correctly_sets_duplication_allowance_flag( def test_merge_propagates_metadata_from_equal_input_metadata(): - metadata = {"a": 2} + metadata = {"a": [1, 2]} left = pd.DataFrame({"test": [1]}) left.attrs = metadata right = pd.DataFrame({"test": [1]}) - right.attrs = metadata.copy() + right.attrs = deepcopy(metadata) result = left.merge(right, how="inner", on="test") assert result.attrs == metadata + # Verify that merge deep-copies the attr dictionary. 
- left.attrs = {"b": 3} - assert result.attrs == metadata + assert result.attrs is not left.attrs + assert result.attrs is not right.attrs + assert result.attrs["a"] is not left.attrs["a"] + assert result.attrs["a"] is not right.attrs["a"] def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): @@ -758,26 +762,33 @@ def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): assert result.attrs == {} -no_metadata = pd.DataFrame({"test": [1]}) - -has_metadata = pd.DataFrame({"test": [1]}) -has_metadata.attrs = {"a": 2} - - @pytest.mark.parametrize( - ["left", "right", "expected"], + ["left_has_metadata", "right_has_metadata", "expected"], [ - (no_metadata, has_metadata, {}), - (has_metadata, no_metadata, {}), - (no_metadata, no_metadata, {}), + (False, True, {}), + (True, False, {}), + (False, False, {}), ], ids=["left-empty", "right-empty", "both-empty"], ) def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata( - left: pd.DataFrame, - right: pd.DataFrame, + left_has_metadata: bool, + right_has_metadata: bool, expected: dict, ): + left = pd.DataFrame({"test": [1]}) + right = pd.DataFrame({"test": [1]}) + + if left_has_metadata: + left.attrs = {"a": [1, 2]} + else: + left.attrs = {} + + if right_has_metadata: + right.attrs = {"a": [1, 2]} + else: + right.attrs = {} + result = left.merge(right, how="inner", on="test") assert result.attrs == expected From dddc031be9bc3ee79ce5443008d6695e6bd8e8e7 Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Tue, 7 Oct 2025 12:29:12 -0400 Subject: [PATCH 14/17] Fixed a couple nitpicks. 
--- pandas/tests/generic/test_finalize.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 65642c558adc6..caaeb72c4b70c 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -581,8 +581,7 @@ def test_datetime_property(attr): @pytest.mark.parametrize( - "attr", - ["days", "seconds", "microseconds", "nanoseconds", "components"], + "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"], ) def test_timedelta_property(attr): s = pd.Series(pd.timedelta_range("2000", periods=4)) @@ -626,8 +625,7 @@ def test_categorical_accessor(method): @pytest.mark.parametrize( - "obj", - [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -646,8 +644,7 @@ def test_groupby_finalize(obj, method): @pytest.mark.parametrize( - "obj", - [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", From 15adcd7c2e4af2bfe864926e8712110c6d9d385a Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:41:21 -0400 Subject: [PATCH 15/17] Reformatted argument lists as required by ruff. 
--- pandas/tests/generic/test_finalize.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index caaeb72c4b70c..65642c558adc6 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -581,7 +581,8 @@ def test_datetime_property(attr): @pytest.mark.parametrize( - "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"], + "attr", + ["days", "seconds", "microseconds", "nanoseconds", "components"], ) def test_timedelta_property(attr): s = pd.Series(pd.timedelta_range("2000", periods=4)) @@ -625,7 +626,8 @@ def test_categorical_accessor(method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", + [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -644,7 +646,8 @@ def test_groupby_finalize(obj, method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", + [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", From 1a8602dbbb3514d2c0749b9952c7014f18d076b6 Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:55:06 -0400 Subject: [PATCH 16/17] Added note and removed potentially confusing docs from __finalize__. --- pandas/core/generic.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 08a4fbb27330d..50fc7c739d36f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6110,8 +6110,7 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: other : the object from which to get the attributes that we are going to propagate.
If ``other`` has an ``input_objs`` attribute, then this attribute must contain an iterable of objects, each with an - ``attrs`` attribute, in which case, each such ``attrs`` instance - must be a dictionary that is equal to all of the others. + ``attrs`` attribute. method : str, optional A passed method name providing context on where ``__finalize__`` was called. @@ -6120,6 +6119,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: The value passed as `method` are not currently considered stable across pandas releases. + + Notes + ----- + In case ``other`` has an ``input_objs`` attribute, this method only + propagates its metadata if each object in ``input_objs`` has the exact + same metadata as the others. """ if isinstance(other, NDFrame): if other.attrs: From d9b52f0516a4c110b72cd54cb4becff9c51abd2d Mon Sep 17 00:00:00 2001 From: Augustus <22328646+aijams@users.noreply.github.com> Date: Thu, 9 Oct 2025 11:22:31 -0400 Subject: [PATCH 17/17] Removed trailing commas. 
--- pandas/tests/generic/test_finalize.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 65642c558adc6..641d9518adb9a 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -581,8 +581,7 @@ def test_datetime_property(attr): @pytest.mark.parametrize( - "attr", - ["days", "seconds", "microseconds", "nanoseconds", "components"], + "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] ) def test_timedelta_property(attr): s = pd.Series(pd.timedelta_range("2000", periods=4)) @@ -626,8 +625,7 @@ def test_categorical_accessor(method): @pytest.mark.parametrize( - "obj", - [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] ) @pytest.mark.parametrize( "method", @@ -646,8 +644,7 @@ def test_groupby_finalize(obj, method): @pytest.mark.parametrize( - "obj", - [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] ) @pytest.mark.parametrize( "method",