Skip to content

Commit 94c7e88

Browse files
authored
PERF: Avoid deep copies when casting dtypes in merge (#62902)
1 parent 235e6ff commit 94c7e88

File tree

3 files changed: 28 additions and 4 deletions

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -939,6 +939,7 @@ Performance improvements
939939
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
940940
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
941941
- Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`)
942+
- Performance improvement in :func:`merge` when join keys have different dtypes and need to be upcast (:issue:`62902`)
942943
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
943944
- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
944945
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)

pandas/core/reshape/merge.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1192,8 +1192,8 @@ def _indicator_pre_merge(
11921192
"Cannot use name of an existing column for indicator column"
11931193
)
11941194

1195-
left = left.copy()
1196-
right = right.copy()
1195+
left = left.copy(deep=False)
1196+
right = right.copy(deep=False)
11971197

11981198
left["_left_indicator"] = 1
11991199
left["_left_indicator"] = left["_left_indicator"].astype("int8")
@@ -1871,11 +1871,11 @@ def _maybe_coerce_merge_keys(self) -> None:
18711871
# incompatible dtypes. See GH 16900.
18721872
if name in self.left.columns:
18731873
typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
1874-
self.left = self.left.copy()
1874+
self.left = self.left.copy(deep=False)
18751875
self.left[name] = self.left[name].astype(typ)
18761876
if name in self.right.columns:
18771877
typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
1878-
self.right = self.right.copy()
1878+
self.right = self.right.copy(deep=False)
18791879
self.right[name] = self.right[name].astype(typ)
18801880

18811881
def _validate_left_right_on(self, left_on, right_on):

pandas/tests/copy_view/test_functions.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -243,6 +243,29 @@ def test_merge_copy_keyword():
243243
assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
244244

245245

246+
def test_merge_upcasting_no_copy():
247+
left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
248+
left_copy = left.copy()
249+
right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]}, dtype=object)
250+
result = merge(left, right, on="a")
251+
assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
252+
assert not np.shares_memory(get_array(result, "a"), get_array(left, "a"))
253+
tm.assert_frame_equal(left, left_copy)
254+
255+
result = merge(right, left, on="a")
256+
assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
257+
assert not np.shares_memory(get_array(result, "a"), get_array(left, "a"))
258+
tm.assert_frame_equal(left, left_copy)
259+
260+
261+
def test_merge_indicator_no_deep_copy():
262+
left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
263+
right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]})
264+
result = merge(left, right, on="a", indicator=True)
265+
assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
266+
assert np.shares_memory(get_array(result, "c"), get_array(right, "c"))
267+
268+
246269
@pytest.mark.parametrize("dtype", [object, "str"])
247270
def test_join_on_key(dtype):
248271
df_index = Index(["a", "b", "c"], name="key", dtype=dtype)

0 commit comments

Comments
 (0)