Skip to content

Commit 0aa81ec

Browse files
authored
Merge branch 'main' into shiny-new-feature
2 parents fd93622 + ea75dd7 commit 0aa81ec

File tree

15 files changed

+1872
-563
lines changed

15 files changed

+1872
-563
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ Other enhancements
220220
- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
221221
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
222222
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
223+
- Improve error reporting through outputting the first few duplicates when :func:`merge` validation fails (:issue:`62742`)
223224
- Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`)
224225
- Improved deprecation message for offset aliases (:issue:`60820`)
225226
- Many type aliases are now exposed in the new submodule :py:mod:`pandas.api.typing.aliases` (:issue:`55231`)
@@ -955,6 +956,7 @@ Bug fixes
955956

956957
Categorical
957958
^^^^^^^^^^^
959+
- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
958960
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
959961
- Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
960962
- Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)
@@ -1014,6 +1016,7 @@ Numeric
10141016
^^^^^^^
10151017
- Bug in :func:`api.types.infer_dtype` returning "mixed" for complex and ``pd.NA`` mix (:issue:`61976`)
10161018
- Bug in :func:`api.types.infer_dtype` returning "mixed-integer-float" for float and ``pd.NA`` mix (:issue:`61621`)
1019+
- Bug in :meth:`DataFrame.combine_first` where Int64 and UInt64 integers with absolute value greater than ``2**53`` would lose precision after the operation (:issue:`60128`)
10171020
- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
10181021
- Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
10191022
- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
@@ -1042,6 +1045,7 @@ Interval
10421045
- :meth:`Index.is_monotonic_decreasing`, :meth:`Index.is_monotonic_increasing`, and :meth:`Index.is_unique` could incorrectly be ``False`` for an ``Index`` created from a slice of another ``Index``. (:issue:`57911`)
10431046
- Bug in :class:`Index`, :class:`Series`, :class:`DataFrame` constructors when given a sequence of :class:`Interval` subclass objects casting them to :class:`Interval` (:issue:`46945`)
10441047
- Bug in :func:`interval_range` where start and end numeric types were always cast to 64 bit (:issue:`57268`)
1048+
- Bug in :meth:`IntervalIndex.get_indexer` and :meth:`IntervalIndex.drop` when one of the sides of the index is non-unique (:issue:`52245`)
10451049

10461050
Indexing
10471051
^^^^^^^^

pandas/_libs/index.pyx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,9 @@ cdef class IndexEngine:
321321
if is_strict_monotonic:
322322
self.unique = 1
323323
self.need_unique_check = 0
324+
elif self.monotonic_inc == 1 or self.monotonic_dec == 1:
325+
self.unique = 0
326+
self.need_unique_check = 0
324327

325328
cdef _call_monotonic(self, values):
326329
return algos.is_monotonic(values, timelike=False)

pandas/core/arrays/categorical.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,10 @@ def __init__(
460460
codes = arr.indices.to_numpy()
461461
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
462462
else:
463+
preserve_object = False
464+
if isinstance(values, (ABCIndex, ABCSeries)) and values.dtype == object:
465+
# GH#61778
466+
preserve_object = True
463467
if not isinstance(values, ABCIndex):
464468
# in particular RangeIndex xref test_index_equal_range_categories
465469
values = sanitize_array(values, None)
@@ -476,7 +480,14 @@ def __init__(
476480
"by passing in a categories argument."
477481
) from err
478482

479-
# we're inferring from values
483+
if preserve_object:
484+
# GH#61778 wrap categories in an Index to prevent dtype
485+
# inference in the CategoricalDtype constructor
486+
from pandas import Index
487+
488+
categories = Index(categories, dtype=object, copy=False)
489+
490+
# if not preserve_object, we're inferring from values
480491
dtype = CategoricalDtype(categories, dtype.ordered)
481492

482493
elif isinstance(values.dtype, CategoricalDtype):

pandas/core/frame.py

Lines changed: 67 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3293,28 +3293,71 @@ def to_html(
32933293
Examples
32943294
--------
32953295
>>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
3296-
>>> html_string = '''<table border="1" class="dataframe">
3297-
... <thead>
3298-
... <tr style="text-align: right;">
3299-
... <th></th>
3300-
... <th>col1</th>
3301-
... <th>col2</th>
3302-
... </tr>
3303-
... </thead>
3304-
... <tbody>
3305-
... <tr>
3306-
... <th>0</th>
3307-
... <td>1</td>
3308-
... <td>4</td>
3309-
... </tr>
3310-
... <tr>
3311-
... <th>1</th>
3312-
... <td>2</td>
3313-
... <td>3</td>
3314-
... </tr>
3315-
... </tbody>
3316-
... </table>'''
3317-
>>> assert html_string == df.to_html()
3296+
>>> html_string = df.to_html()
3297+
>>> print(html_string)
3298+
<table border="1" class="dataframe">
3299+
<thead>
3300+
<tr style="text-align: right;">
3301+
<th></th>
3302+
<th>col1</th>
3303+
<th>col2</th>
3304+
</tr>
3305+
</thead>
3306+
<tbody>
3307+
<tr>
3308+
<th>0</th>
3309+
<td>1</td>
3310+
<td>4</td>
3311+
</tr>
3312+
<tr>
3313+
<th>1</th>
3314+
<td>2</td>
3315+
<td>3</td>
3316+
</tr>
3317+
</tbody>
3318+
</table>
3319+
3320+
HTML output
3321+
3322+
+----+-----+-----+
3323+
| |col1 |col2 |
3324+
+====+=====+=====+
3325+
|0 |1 |4 |
3326+
+----+-----+-----+
3327+
|1 |2 |3 |
3328+
+----+-----+-----+
3329+
3330+
>>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
3331+
>>> html_string = df.to_html(index=False)
3332+
>>> print(html_string)
3333+
<table border="1" class="dataframe">
3334+
<thead>
3335+
<tr style="text-align: right;">
3336+
<th>col1</th>
3337+
<th>col2</th>
3338+
</tr>
3339+
</thead>
3340+
<tbody>
3341+
<tr>
3342+
<td>1</td>
3343+
<td>4</td>
3344+
</tr>
3345+
<tr>
3346+
<td>2</td>
3347+
<td>3</td>
3348+
</tr>
3349+
</tbody>
3350+
</table>
3351+
3352+
HTML output
3353+
3354+
+-----+-----+
3355+
|col1 |col2 |
3356+
+=====+=====+
3357+
|1 |4 |
3358+
+-----+-----+
3359+
|2 |3 |
3360+
+-----+-----+
33183361
"""
33193362
if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS:
33203363
raise ValueError("Invalid value for justify parameter")
@@ -9165,20 +9208,10 @@ def combine_first(self, other: DataFrame) -> DataFrame:
91659208
1 0.0 3.0 1.0
91669209
2 NaN 3.0 1.0
91679210
"""
9168-
from pandas.core.computation import expressions
91699211

91709212
def combiner(x: Series, y: Series):
9171-
mask = x.isna()._values
9172-
9173-
x_values = x._values
9174-
y_values = y._values
9175-
9176-
# If the column y in other DataFrame is not in first DataFrame,
9177-
# just return y_values.
9178-
if y.name not in self.columns:
9179-
return y_values
9180-
9181-
return expressions.where(mask, y_values, x_values)
9213+
# GH#60128 The combiner is supposed to preserve EA Dtypes.
9214+
return y if y.name not in self.columns else y.where(x.isna(), x)
91829215

91839216
if len(other) == 0:
91849217
combined = self.reindex(

pandas/core/indexes/interval.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,10 @@ def _get_indexer(
712712
# -> at most one match per interval in target
713713
# want exact matches -> need both left/right to match, so defer to
714714
# left/right get_indexer, compare elementwise, equality -> match
715-
indexer = self._get_indexer_unique_sides(target)
715+
if self.left.is_unique and self.right.is_unique:
716+
indexer = self._get_indexer_unique_sides(target)
717+
else:
718+
indexer = self._get_indexer_pointwise(target)[0]
716719

717720
elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)):
718721
# homogeneous scalar index: use IntervalTree

pandas/core/reshape/merge.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1954,42 +1954,62 @@ def _validate_left_right_on(self, left_on, right_on):
19541954
def _validate_validate_kwd(self, validate: str) -> None:
19551955
# Check uniqueness of each
19561956
if self.left_index:
1957-
left_unique = self.orig_left.index.is_unique
1957+
left_join_index = self.orig_left.index
1958+
left_unique = left_join_index.is_unique
19581959
else:
1959-
left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
1960+
left_join_index = MultiIndex.from_arrays(self.left_join_keys)
1961+
left_unique = left_join_index.is_unique
19601962

19611963
if self.right_index:
1964+
right_join_index = self.orig_right.index
19621965
right_unique = self.orig_right.index.is_unique
19631966
else:
1964-
right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
1967+
right_join_index = MultiIndex.from_arrays(self.right_join_keys)
1968+
right_unique = right_join_index.is_unique
1969+
1970+
def left_error_msg(x: Index) -> str:
1971+
name = self.left_on if not self.left_index else lib.no_default
1972+
msg = x[x.duplicated()][:5].to_frame(name=name).to_string(index=False)
1973+
return f"\nDuplicates in left:\n {msg} ..."
1974+
1975+
def right_error_msg(x: Index) -> str:
1976+
name = self.right_on if not self.right_index else lib.no_default
1977+
msg = x[x.duplicated()][:5].to_frame(name=name).to_string(index=False)
1978+
return f"\nDuplicates in right:\n {msg} ..."
19651979

19661980
# Check data integrity
19671981
if validate in ["one_to_one", "1:1"]:
19681982
if not left_unique and not right_unique:
19691983
raise MergeError(
19701984
"Merge keys are not unique in either left "
1971-
"or right dataset; not a one-to-one merge"
1985+
"or right dataset; not a one-to-one merge."
1986+
f"{left_error_msg(left_join_index)}"
1987+
f"{right_error_msg(right_join_index)}"
19721988
)
19731989
if not left_unique:
19741990
raise MergeError(
19751991
"Merge keys are not unique in left dataset; not a one-to-one merge"
1992+
f"{left_error_msg(left_join_index)}"
19761993
)
19771994
if not right_unique:
19781995
raise MergeError(
19791996
"Merge keys are not unique in right dataset; not a one-to-one merge"
1997+
f"{right_error_msg(right_join_index)}"
19801998
)
19811999

19822000
elif validate in ["one_to_many", "1:m"]:
19832001
if not left_unique:
19842002
raise MergeError(
19852003
"Merge keys are not unique in left dataset; not a one-to-many merge"
2004+
f"{left_error_msg(left_join_index)}"
19862005
)
19872006

19882007
elif validate in ["many_to_one", "m:1"]:
19892008
if not right_unique:
19902009
raise MergeError(
19912010
"Merge keys are not unique in right dataset; "
1992-
"not a many-to-one merge"
2011+
"not a many-to-one merge\n"
2012+
f"{right_error_msg(right_join_index)}"
19932013
)
19942014

19952015
elif validate in ["many_to_many", "m:m"]:

0 commit comments

Comments
 (0)