Skip to content

Commit d08eb28

Browse files
authored
Merge branch 'main' into shiny-new-feature
2 parents 4b2bb50 + 94c7e88 commit d08eb28

File tree

20 files changed

+237
-166
lines changed

20 files changed

+237
-166
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,7 @@ Performance improvements
940940
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of an :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
941941
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of an :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
942942
- Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`)
943+
- Performance improvement in :func:`merge` when join keys have different dtypes and need to be upcast (:issue:`62902`)
943944
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
944945
- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
945946
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
@@ -1177,6 +1178,7 @@ Groupby/resample/rolling
11771178

11781179
Reshaping
11791180
^^^^^^^^^
1181+
- Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`)
11801182
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
11811183
- Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
11821184
- Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`)

pandas/core/dtypes/concat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,10 @@ def _get_result_dtype(
161161
# coerce to object
162162
target_dtype = np.dtype(object)
163163
kinds = {"o"}
164+
elif "b" in kinds and len(kinds) > 1:
165+
# GH#21108, GH#45101
166+
target_dtype = np.dtype(object)
167+
kinds = {"o"}
164168
else:
165169
# error: Argument 1 to "np_find_common_type" has incompatible type
166170
# "*Set[Union[ExtensionDtype, Any]]"; expected "dtype[Any]"

pandas/core/generic.py

Lines changed: 86 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8156,7 +8156,6 @@ def asof(self, where, subset=None):
81568156
# ----------------------------------------------------------------------
81578157
# Action Methods
81588158

8159-
@doc(klass=_shared_doc_kwargs["klass"])
81608159
def isna(self) -> Self:
81618160
"""
81628161
Detect missing values.
@@ -8169,15 +8168,18 @@ def isna(self) -> Self:
81698168
81708169
Returns
81718170
-------
8172-
{klass}
8173-
Mask of bool values for each element in {klass} that
8174-
indicates whether an element is an NA value.
8171+
Series/DataFrame
8172+
Mask of bool values for each element in Series/DataFrame
8173+
that indicates whether an element is an NA value.
81758174
81768175
See Also
81778176
--------
8178-
{klass}.isnull : Alias of isna.
8179-
{klass}.notna : Boolean inverse of isna.
8180-
{klass}.dropna : Omit axes labels with missing values.
8177+
Series.isnull : Alias of isna.
8178+
DataFrame.isnull : Alias of isna.
8179+
Series.notna : Boolean inverse of isna.
8180+
DataFrame.notna : Boolean inverse of isna.
8181+
Series.dropna : Omit axes labels with missing values.
8182+
DataFrame.dropna : Omit axes labels with missing values.
81818183
isna : Top-level isna.
81828184
81838185
Examples
@@ -8225,11 +8227,77 @@ def isna(self) -> Self:
82258227
"""
82268228
return isna(self).__finalize__(self, method="isna")
82278229

8228-
@doc(isna, klass=_shared_doc_kwargs["klass"])
82298230
def isnull(self) -> Self:
8231+
"""
8232+
Detect missing values.
8233+
8234+
Return a boolean same-sized object indicating if the values are NA.
8235+
NA values, such as None or :attr:`numpy.NaN`, get mapped to True
8236+
values.
8237+
Everything else gets mapped to False values. Characters such as empty
8238+
strings ``''`` or :attr:`numpy.inf` are not considered NA values.
8239+
8240+
Returns
8241+
-------
8242+
Series/DataFrame
8243+
Mask of bool values for each element in Series/DataFrame
8244+
that indicates whether an element is an NA value.
8245+
8246+
See Also
8247+
--------
8248+
Series.isna : Alias of isnull.
8249+
DataFrame.isna : Alias of isnull.
8250+
Series.notna : Boolean inverse of isnull.
8251+
DataFrame.notna : Boolean inverse of isnull.
8252+
Series.dropna : Omit axes labels with missing values.
8253+
DataFrame.dropna : Omit axes labels with missing values.
8254+
isna : Top-level isna.
8255+
8256+
Examples
8257+
--------
8258+
Show which entries in a DataFrame are NA.
8259+
8260+
>>> df = pd.DataFrame(
8261+
... dict(
8262+
... age=[5, 6, np.nan],
8263+
... born=[
8264+
... pd.NaT,
8265+
... pd.Timestamp("1939-05-27"),
8266+
... pd.Timestamp("1940-04-25"),
8267+
... ],
8268+
... name=["Alfred", "Batman", ""],
8269+
... toy=[None, "Batmobile", "Joker"],
8270+
... )
8271+
... )
8272+
>>> df
8273+
age born name toy
8274+
0 5.0 NaT Alfred NaN
8275+
1 6.0 1939-05-27 Batman Batmobile
8276+
2 NaN 1940-04-25 Joker
8277+
8278+
>>> df.isna()
8279+
age born name toy
8280+
0 False True False True
8281+
1 False False False False
8282+
2 True False False False
8283+
8284+
Show which entries in a Series are NA.
8285+
8286+
>>> ser = pd.Series([5, 6, np.nan])
8287+
>>> ser
8288+
0 5.0
8289+
1 6.0
8290+
2 NaN
8291+
dtype: float64
8292+
8293+
>>> ser.isna()
8294+
0 False
8295+
1 False
8296+
2 True
8297+
dtype: bool
8298+
"""
82308299
return isna(self).__finalize__(self, method="isnull")
82318300

8232-
@doc(klass=_shared_doc_kwargs["klass"])
82338301
def notna(self) -> Self:
82348302
"""
82358303
Detect existing (non-missing) values.
@@ -8242,15 +8310,18 @@ def notna(self) -> Self:
82428310
82438311
Returns
82448312
-------
8245-
{klass}
8246-
Mask of bool values for each element in {klass} that
8247-
indicates whether an element is not an NA value.
8313+
Series/DataFrame
8314+
Mask of bool values for each element in Series/DataFrame
8315+
that indicates whether an element is not an NA value.
82488316
82498317
See Also
82508318
--------
8251-
{klass}.notnull : Alias of notna.
8252-
{klass}.isna : Boolean inverse of notna.
8253-
{klass}.dropna : Omit axes labels with missing values.
8319+
Series.notnull : Alias of notna.
8320+
DataFrame.notnull : Alias of notna.
8321+
Series.isna : Boolean inverse of notna.
8322+
DataFrame.isna : Boolean inverse of notna.
8323+
Series.dropna : Omit axes labels with missing values.
8324+
DataFrame.dropna : Omit axes labels with missing values.
82548325
notna : Top-level notna.
82558326
82568327
Examples

pandas/core/reshape/merge.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,8 +1192,8 @@ def _indicator_pre_merge(
11921192
"Cannot use name of an existing column for indicator column"
11931193
)
11941194

1195-
left = left.copy()
1196-
right = right.copy()
1195+
left = left.copy(deep=False)
1196+
right = right.copy(deep=False)
11971197

11981198
left["_left_indicator"] = 1
11991199
left["_left_indicator"] = left["_left_indicator"].astype("int8")
@@ -1871,11 +1871,11 @@ def _maybe_coerce_merge_keys(self) -> None:
18711871
# incompatible dtypes. See GH 16900.
18721872
if name in self.left.columns:
18731873
typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
1874-
self.left = self.left.copy()
1874+
self.left = self.left.copy(deep=False)
18751875
self.left[name] = self.left[name].astype(typ)
18761876
if name in self.right.columns:
18771877
typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
1878-
self.right = self.right.copy()
1878+
self.right = self.right.copy(deep=False)
18791879
self.right[name] = self.right[name].astype(typ)
18801880

18811881
def _validate_left_right_on(self, left_on, right_on):

pandas/core/series.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6183,7 +6183,7 @@ def isna(self) -> Series:
61836183
return NDFrame.isna(self)
61846184

61856185
# error: Cannot determine type of 'isna'
6186-
@doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
6186+
@doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
61876187
def isnull(self) -> Series:
61886188
"""
61896189
Series.isnull is an alias for Series.isna.
@@ -6260,7 +6260,7 @@ def notna(self) -> Series:
62606260
return super().notna()
62616261

62626262
# error: Cannot determine type of 'notna'
6263-
@doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
6263+
@doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
62646264
def notnull(self) -> Series:
62656265
"""
62666266
Series.notnull is an alias for Series.notna.

pandas/plotting/_matplotlib/timeseries.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def maybe_resample(series: Series, ax: Axes, kwargs: dict[str, Any]):
7575

7676
if ax_freq is not None and freq != ax_freq:
7777
if is_superperiod(freq, ax_freq): # upsample input
78-
series = series.copy()
78+
series = series.copy(deep=False)
7979
# error: "Index" has no attribute "asfreq"
8080
series.index = series.index.asfreq( # type: ignore[attr-defined]
8181
ax_freq, how="s"
@@ -142,7 +142,7 @@ def _replot_ax(ax: Axes, freq: BaseOffset):
142142
labels = []
143143
if data is not None:
144144
for series, plotf, kwds in data:
145-
series = series.copy()
145+
series = series.copy(deep=False)
146146
idx = series.index.asfreq(freq, how="S")
147147
series.index = idx
148148
# TODO #54485

pandas/tests/copy_view/test_functions.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,29 @@ def test_merge_copy_keyword():
243243
assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
244244

245245

246+
def test_merge_upcasting_no_copy():
247+
left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
248+
left_copy = left.copy()
249+
right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]}, dtype=object)
250+
result = merge(left, right, on="a")
251+
assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
252+
assert not np.shares_memory(get_array(result, "a"), get_array(left, "a"))
253+
tm.assert_frame_equal(left, left_copy)
254+
255+
result = merge(right, left, on="a")
256+
assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
257+
assert not np.shares_memory(get_array(result, "a"), get_array(left, "a"))
258+
tm.assert_frame_equal(left, left_copy)
259+
260+
261+
def test_merge_indicator_no_deep_copy():
262+
left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
263+
right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]})
264+
result = merge(left, right, on="a", indicator=True)
265+
assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
266+
assert np.shares_memory(get_array(result, "c"), get_array(right, "c"))
267+
268+
246269
@pytest.mark.parametrize("dtype", [object, "str"])
247270
def test_join_on_key(dtype):
248271
df_index = Index(["a", "b", "c"], name="key", dtype=dtype)

pandas/tests/frame/methods/test_to_csv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -836,9 +836,9 @@ def test_to_csv_dups_cols2(self, temp_file):
836836
result = result.rename(columns={"a.1": "a"})
837837
tm.assert_frame_equal(result, df)
838838

839-
@pytest.mark.parametrize("chunksize", [10000, 50000, 100000])
839+
@pytest.mark.parametrize("chunksize", [1, 5, 10])
840840
def test_to_csv_chunking(self, chunksize, temp_file):
841-
aa = DataFrame({"A": range(100000)})
841+
aa = DataFrame({"A": range(10)})
842842
aa["B"] = aa.A + 1.0
843843
aa["C"] = aa.A + 2.0
844844
aa["D"] = aa.A + 3.0

pandas/tests/groupby/test_groupby_dropna.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -394,8 +394,20 @@ def test_groupby_drop_nan_with_multi_index():
394394
tm.assert_frame_equal(result, expected)
395395

396396

397-
# sequence_index enumerates all strings made up of x, y, z of length 4
398-
@pytest.mark.parametrize("sequence_index", range(3**4))
397+
# In each sequence, y > x, and z stands for the missing value
398+
@pytest.mark.parametrize(
399+
"sequence",
400+
[
401+
"xyzy",
402+
"xxyz",
403+
"yzxz",
404+
"zzzz",
405+
"zyzx",
406+
"yyyy",
407+
"zzxy",
408+
"xyxy",
409+
],
410+
)
399411
@pytest.mark.parametrize(
400412
"dtype",
401413
[
@@ -419,15 +431,9 @@ def test_groupby_drop_nan_with_multi_index():
419431
],
420432
)
421433
@pytest.mark.parametrize("test_series", [True, False])
422-
def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index):
434+
def test_no_sort_keep_na(sequence, dtype, test_series, as_index):
423435
# GH#46584, GH#48794
424436

425-
# Convert sequence_index into a string sequence, e.g. 5 becomes "xxyz"
426-
# This sequence is used for the grouper.
427-
sequence = "".join(
428-
[{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
429-
)
430-
431437
# Unique values to use for grouper, depends on dtype
432438
if dtype in ("string", "string[pyarrow]"):
433439
uniques = {"x": "x", "y": "y", "z": pd.NA}

pandas/tests/indexes/datetimes/methods/test_tz_localize.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,23 +149,23 @@ def test_dti_tz_localize_pass_dates_to_utc(self, tzstr):
149149
@pytest.mark.parametrize("prefix", ["", "dateutil/"])
150150
def test_dti_tz_localize(self, prefix):
151151
tzstr = prefix + "US/Eastern"
152-
dti = date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="ms")
152+
dti = date_range(start="1/1/2005", end="1/1/2005 0:00:02.256", freq="ms")
153153
dti2 = dti.tz_localize(tzstr)
154154

155155
dti_utc = date_range(
156-
start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="ms", tz="utc"
156+
start="1/1/2005 05:00", end="1/1/2005 5:00:02.256", freq="ms", tz="utc"
157157
)
158158

159159
tm.assert_numpy_array_equal(dti2.values, dti_utc.values)
160160

161161
dti3 = dti2.tz_convert(prefix + "US/Pacific")
162162
tm.assert_numpy_array_equal(dti3.values, dti_utc.values)
163163

164-
dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms")
164+
dti = date_range(start="11/6/2011 1:59:59", end="11/6/2011 2:00", freq="ms")
165165
with pytest.raises(ValueError, match="Cannot infer dst time"):
166166
dti.tz_localize(tzstr)
167167

168-
dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms")
168+
dti = date_range(start="3/13/2011 1:59:59", end="3/13/2011 2:00", freq="ms")
169169
with pytest.raises(ValueError, match="2011-03-13 02:00:00"):
170170
dti.tz_localize(tzstr)
171171

0 commit comments

Comments
 (0)