Skip to content

Commit 7408b85

Browse files
committed
improve merge validation error messages
1 parent d597079 commit 7408b85

File tree

2 files changed

+32
-4
lines changed

2 files changed

+32
-4
lines changed

pandas/core/reshape/merge.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1954,42 +1954,65 @@ def _validate_left_right_on(self, left_on, right_on):
19541954
def _validate_validate_kwd(self, validate: str) -> None:
19551955
# Check uniqueness of each
19561956
if self.left_index:
1957-
left_unique = self.orig_left.index.is_unique
1957+
left_keys = self.orig_left.index
19581958
else:
1959-
left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
1959+
left_keys = MultiIndex.from_arrays(self.left_join_keys)
19601960

19611961
if self.right_index:
1962-
right_unique = self.orig_right.index.is_unique
1962+
right_keys = self.orig_right.index
19631963
else:
1964-
right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
1964+
right_keys = MultiIndex.from_arrays(self.right_join_keys)
1965+
1966+
left_unique = left_keys.is_unique
1967+
right_unique = right_keys.is_unique
1968+
1969+
def sample_duplicates(keys, limit=10):
    """
    Return up to ``limit`` distinct key values that occur more than once.

    Parameters
    ----------
    keys : Index, MultiIndex or list-like
        Merge keys to inspect; coerced with ``Index(keys)``.
    limit : int, default 10
        Maximum number of distinct duplicated keys to return.

    Returns
    -------
    list
        Duplicated keys, each listed once. Scalars for single-key input,
        tuples for multi-key input; empty when every key is unique.
    """
    keys = Index(keys)
    # Join keys built via MultiIndex.from_arrays form a one-level MultiIndex
    # for single-column merges; flatten it so the error message shows ``1``
    # rather than the 1-tuple ``(1,)``.
    if keys.nlevels == 1:
        keys = keys.get_level_values(0)
    # duplicated() flags every occurrence after the first; unique() then
    # collapses keys that repeat more than twice. An empty selection simply
    # yields [], so no separate emptiness check is needed.
    return keys[keys.duplicated()].unique()[:limit].tolist()
19651976

19661977
# Check data integrity
19671978
if validate in ["one_to_one", "1:1"]:
19681979
if not left_unique and not right_unique:
1980+
combined_keys = list(left_keys.append(right_keys))
1981+
sample = sample_duplicates(combined_keys, limit=10)
19691982
raise MergeError(
19701983
"Merge keys are not unique in either left "
19711984
"or right dataset; not a one-to-one merge"
1985+
f". Offending keys (sample): {sample}"
19721986
)
19731987
if not left_unique:
1988+
sample = sample_duplicates(left_keys)
19741989
raise MergeError(
19751990
"Merge keys are not unique in left dataset; not a one-to-one merge"
1991+
f". Offending keys (sample): {sample}"
19761992
)
19771993
if not right_unique:
1994+
sample = sample_duplicates(right_keys)
19781995
raise MergeError(
19791996
"Merge keys are not unique in right dataset; not a one-to-one merge"
1997+
f". Offending keys (sample): {sample}"
19801998
)
19811999

19822000
elif validate in ["one_to_many", "1:m"]:
19832001
if not left_unique:
2002+
sample = sample_duplicates(left_keys)
19842003
raise MergeError(
19852004
"Merge keys are not unique in left dataset; not a one-to-many merge"
2005+
f". Offending keys (sample): {sample}"
2006+
19862007
)
19872008

19882009
elif validate in ["many_to_one", "m:1"]:
19892010
if not right_unique:
2011+
sample = sample_duplicates(right_keys)
19902012
raise MergeError(
19912013
"Merge keys are not unique in right dataset; "
19922014
"not a many-to-one merge"
2015+
f". Offending keys (sample): {sample}"
19932016
)
19942017

19952018
elif validate in ["many_to_many", "m:m"]:

pandas/tests/reshape/merge/test_merge.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3121,3 +3121,8 @@ def test_merge_pyarrow_datetime_duplicates():
31213121
)
31223122
expected = expected.convert_dtypes(dtype_backend="pyarrow")
31233123
tm.assert_frame_equal(result, expected)
3124+
3125+
def test_merge_validate_one_to_one_offending_keys():
    """MergeError from a non-unique one-to-one merge mentions 'Offending keys'."""
    # Column "a" repeats the value 1, so the frame is not unique on the key.
    frame = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]})
    expect_merge_error = pytest.raises(pd.errors.MergeError, match="Offending keys")
    with expect_merge_error:
        frame.merge(frame, on="a", validate="one_to_one")

0 commit comments

Comments
 (0)