diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 604181214ad44..a66f52d45b761 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1954,42 +1954,67 @@ def _validate_left_right_on(self, left_on, right_on):
     def _validate_validate_kwd(self, validate: str) -> None:
         # Check uniqueness of each
         if self.left_index:
-            left_unique = self.orig_left.index.is_unique
+            left_keys = self.orig_left.index
         else:
-            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
+            left_keys = MultiIndex.from_arrays(self.left_join_keys)
 
         if self.right_index:
-            right_unique = self.orig_right.index.is_unique
+            right_keys = self.orig_right.index
         else:
-            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
+            right_keys = MultiIndex.from_arrays(self.right_join_keys)
+
+        left_unique = left_keys.is_unique
+        right_unique = right_keys.is_unique
+
+        def sample_duplicates(keys, limit=10):
+            """Return up to ``limit`` distinct key values that are duplicated."""
+            if keys.nlevels == 1:
+                # Report scalars rather than 1-tuples for single-key merges.
+                keys = keys.get_level_values(0)
+            return list(keys[keys.duplicated()].unique()[:limit])
 
         # Check data integrity
         if validate in ["one_to_one", "1:1"]:
             if not left_unique and not right_unique:
+                left_sample = sample_duplicates(left_keys, limit=5)
+                right_sample = sample_duplicates(right_keys, limit=5)
                 raise MergeError(
                     "Merge keys are not unique in either left "
-                    "or right dataset; not a one-to-one merge"
+                    "or right dataset; not a one-to-one merge. "
+                    f"Offending keys in left dataset (sample): {left_sample}. "
+                    f"Offending keys in right dataset (sample): {right_sample}"
                 )
             if not left_unique:
+                sample = sample_duplicates(left_keys)
                 raise MergeError(
-                    "Merge keys are not unique in left dataset; not a one-to-one merge"
+                    "Merge keys are not unique in left dataset; "
+                    "not a one-to-one merge. "
+                    f"Offending keys (sample): {sample}"
                 )
             if not right_unique:
+                sample = sample_duplicates(right_keys)
                 raise MergeError(
-                    "Merge keys are not unique in right dataset; not a one-to-one merge"
+                    "Merge keys are not unique in right dataset; "
+                    "not a one-to-one merge. "
+                    f"Offending keys (sample): {sample}"
                 )
 
         elif validate in ["one_to_many", "1:m"]:
             if not left_unique:
+                sample = sample_duplicates(left_keys)
                 raise MergeError(
-                    "Merge keys are not unique in left dataset; not a one-to-many merge"
+                    "Merge keys are not unique in left dataset; "
+                    "not a one-to-many merge. "
+                    f"Offending keys (sample): {sample}"
                 )
 
         elif validate in ["many_to_one", "m:1"]:
             if not right_unique:
+                sample = sample_duplicates(right_keys)
                 raise MergeError(
                     "Merge keys are not unique in right dataset; "
-                    "not a many-to-one merge"
+                    "not a many-to-one merge. "
+                    f"Offending keys (sample): {sample}"
                 )
 
         elif validate in ["many_to_many", "m:m"]:
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index c38ee32cb7226..e940aba9f4018 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -3121,3 +3121,23 @@ def test_merge_pyarrow_datetime_duplicates():
     )
     expected = expected.convert_dtypes(dtype_backend="pyarrow")
     tm.assert_frame_equal(result, expected)
+
+
+def test_merge_validate_one_to_one_offending_keys():
+    df = DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]})
+    msg = (
+        r"Offending keys in left dataset \(sample\): \[1\]\. "
+        r"Offending keys in right dataset \(sample\): \[1\]$"
+    )
+    with pytest.raises(pd.errors.MergeError, match=msg):
+        df.merge(df, on="a", validate="one_to_one")
+
+
+def test_merge_validate_one_sided_offending_keys():
+    dup = DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]})
+    uniq = DataFrame({"a": [1, 3], "c": [7, 8]})
+    msg = r"Offending keys \(sample\): \[1\]$"
+    with pytest.raises(pd.errors.MergeError, match=msg):
+        dup.merge(uniq, on="a", validate="one_to_many")
+    with pytest.raises(pd.errors.MergeError, match=msg):
+        uniq.merge(dup, on="a", validate="many_to_one")