pandas-dev
diff --git a/‎.github/workflows/unit-tests.yml‎
Lines changed: 1 addition & 2 deletions b/‎.github/workflows/unit-tests.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎.github/workflows/wheels.yml‎
Lines changed: 3 additions & 4 deletions b/‎.github/workflows/wheels.yml‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎asv_bench/benchmarks/ctors.py‎
Lines changed: 1 addition & 1 deletion b/‎asv_bench/benchmarks/ctors.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎asv_bench/benchmarks/series_methods.py‎
Lines changed: 4 additions & 2 deletions b/‎asv_bench/benchmarks/series_methods.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎doc/source/whatsnew/v3.0.0.rst‎
Lines changed: 21 additions & 0 deletions b/‎doc/source/whatsnew/v3.0.0.rst‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎pandas/_config/config.py‎
Lines changed: 8 additions & 0 deletions b/‎pandas/_config/config.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎pandas/_libs/index.pyx‎
Lines changed: 3 additions & 0 deletions b/‎pandas/_libs/index.pyx‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎pandas/_libs/lib.pyx‎
Lines changed: 9 additions & 0 deletions b/‎pandas/_libs/lib.pyx‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎pandas/_libs/parsers.pyx‎
Lines changed: 29 additions & 73 deletions b/‎pandas/_libs/parsers.pyx‎
Lines changed: 29 additions & 73 deletions
@@ -181,8 +181,7 @@ jobs:
     timeout-minutes: 90
     strategy:
       matrix:
-        # Note: Don't use macOS latest since macos 14 appears to be arm64 only
-        os: [macos-13, macos-14, windows-2025]
+        os: [macos-15-intel, macos-15, windows-2025]
         env_file: [actions-311.yaml, actions-312.yaml, actions-313.yaml]
       fail-fast: false
     runs-on: ${{ matrix.os }}
 
@@ -98,10 +98,9 @@ jobs:
         - [ubuntu-24.04, musllinux_x86_64]
         - [ubuntu-24.04-arm, manylinux_aarch64]
         - [ubuntu-24.04-arm, musllinux_aarch64]
-        - [macos-13, macosx_x86_64]
-        # Note: M1 images on Github Actions start from macOS 14
-        - [macos-14, macosx_arm64]
-        - [windows-2022, win_amd64]
+        - [macos-15-intel, macosx_x86_64]
+        - [macos-15, macosx_arm64]
+        - [windows-2025, win_amd64]
         - [windows-11-arm, win_arm64]
         python: [["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"], ["cp314", "3.14"], ["cp314t", "3.14"]]
         include:
 
@@ -23,7 +23,7 @@ def gen_of_str(arr):
 
 
 def arr_dict(arr):
-    return dict(zip(range(len(arr)), arr))
+    return dict(zip(range(len(arr)), arr, strict=True))
 
 
 def list_of_tuples(arr):
 
@@ -16,7 +16,7 @@ def setup(self):
         self.idx = date_range(
             start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s"
         )
-        self.data = dict(zip(self.idx, range(len(self.idx))))
+        self.data = dict(zip(self.idx, range(len(self.idx)), strict=True))
         self.array = np.array([1, 2, 3])
         self.idx2 = Index(["a", "b", "c"])
 
@@ -407,7 +407,9 @@ def setup(self, num_to_replace):
         self.to_replace_list = np.random.choice(self.arr, num_to_replace)
         self.values_list = np.random.choice(self.arr1, num_to_replace)
 
-        self.replace_dict = dict(zip(self.to_replace_list, self.values_list))
+        self.replace_dict = dict(
+            zip(self.to_replace_list, self.values_list, strict=True)
+        )
 
     def time_replace_dict(self, num_to_replace):
         self.ser.replace(self.replace_dict)
 
@@ -219,6 +219,7 @@ Other enhancements
 - Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
 - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
 - Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
+- Improve error reporting through outputting the first few duplicates when :func:`merge` validation fails (:issue:`62742`)
 - Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`)
 - Improved deprecation message for offset aliases (:issue:`60820`)
 - Many type aliases are now exposed in the new submodule :py:mod:`pandas.api.typing.aliases` (:issue:`55231`)
@@ -515,6 +516,22 @@ If we had passed ``pd.Int64Dtype()`` or ``"int64[pyarrow]"`` for the dtype in th
 
 With ``"mode.nan_is_na"`` set to ``False``, ``ser.to_numpy()`` (and ``frame.values`` and ``np.asarray(obj)``) will convert to ``object`` dtype if :class:`NA` entries are present, where before they would coerce to ``NaN``.  To retain a float numpy dtype, explicitly pass ``na_value=np.nan`` to :meth:`Series.to_numpy`.
 
+The ``__module__`` attribute now points to public modules
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``__module__`` attribute on functions and classes in the public API has been
+updated to refer to the preferred public module from which to access the object,
+rather than the module in which the object happens to be defined (:issue:`55178`).
+
+This produces more informative displays in the Python console for classes, e.g.,
+instead of ``<class 'pandas.core.frame.DataFrame'>`` you now see
+``<class 'pandas.DataFrame'>``, and in interactive tools such as IPython, e.g.,
+instead of ``<function pandas.io.parsers.readers.read_csv(...)>`` you now see
+``<function pandas.read_csv(...)>``.
+
+This may break code that relies on the previous ``__module__`` values (e.g.
+doctests inspecting the ``type()`` of a DataFrame object).
+
 .. _whatsnew_300.api_breaking.deps:
 
 Increased minimum version for Python
@@ -938,6 +955,7 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
+- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
 - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
 - Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
 - Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)
@@ -997,6 +1015,7 @@ Numeric
 ^^^^^^^
 - Bug in :func:`api.types.infer_dtype` returning "mixed" for complex and ``pd.NA`` mix (:issue:`61976`)
 - Bug in :func:`api.types.infer_dtype` returning "mixed-integer-float" for float and ``pd.NA`` mix (:issue:`61621`)
+- Bug in :meth:`DataFrame.combine_first` where Int64 and UInt64 integers with absolute value greater than ``2**53`` would lose precision after the operation. (:issue:`60128`)
 - Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
 - Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
 - Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
@@ -1025,6 +1044,7 @@ Interval
 - :meth:`Index.is_monotonic_decreasing`, :meth:`Index.is_monotonic_increasing`, and :meth:`Index.is_unique` could incorrectly be ``False`` for an ``Index`` created from a slice of another ``Index``. (:issue:`57911`)
 - Bug in :class:`Index`, :class:`Series`, :class:`DataFrame` constructors when given a sequence of :class:`Interval` subclass objects casting them to :class:`Interval` (:issue:`46945`)
 - Bug in :func:`interval_range` where start and end numeric types were always cast to 64 bit (:issue:`57268`)
+- Bug in :meth:`IntervalIndex.get_indexer` and :meth:`IntervalIndex.drop` when one of the sides of the index is non-unique (:issue:`52245`)
 
 Indexing
 ^^^^^^^^
@@ -1088,6 +1108,7 @@ I/O
 - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`)
 - Bug in :meth:`python_parser` where :class:`MyDialect` did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`)
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
+- Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` causes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
 
@@ -944,3 +944,11 @@ def is_callable(obj: object) -> bool:
     if not callable(obj):
         raise ValueError("Value must be a callable")
     return True
+
+
+# Importing set_module here would cause a circular import
+get_option.__module__ = "pandas"
+set_option.__module__ = "pandas"
+describe_option.__module__ = "pandas"
+reset_option.__module__ = "pandas"
+option_context.__module__ = "pandas"
@@ -321,6 +321,9 @@ cdef class IndexEngine:
             if is_strict_monotonic:
                 self.unique = 1
                 self.need_unique_check = 0
+            elif self.monotonic_inc == 1 or self.monotonic_dec == 1:
+                self.unique = 0
+                self.need_unique_check = 0
 
     cdef _call_monotonic(self, values):
         return algos.is_monotonic(values, timelike=False)
 
@@ -41,6 +41,7 @@ from cython cimport (
 from pandas._config import using_string_dtype
 
 from pandas._libs.missing import check_na_tuples_nonequal
+from pandas.util._decorators import set_module
 
 import_datetime()
 
@@ -154,6 +155,7 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t:
 # ----------------------------------------------------------------------
 
 
+@set_module("pandas.api.types")
 def is_scalar(val: object) -> bool:
     """
     Return True if given object is scalar.
@@ -255,6 +257,7 @@ cdef int64_t get_itemsize(object val):
         return -1
 
 
+@set_module("pandas.api.types")
 def is_iterator(obj: object) -> bool:
     """
     Check if the object is an iterator.
@@ -1095,6 +1098,7 @@ def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list ke
 
 # core.common import for fast inference checks
 
+@set_module("pandas.api.types")
 def is_float(obj: object) -> bool:
     """
     Return True if given object is float.
@@ -1128,6 +1132,7 @@ def is_float(obj: object) -> bool:
     return util.is_float_object(obj)
 
 
+@set_module("pandas.api.types")
 def is_integer(obj: object) -> bool:
     """
     Return True if given object is integer.
@@ -1172,6 +1177,7 @@ def is_int_or_none(obj) -> bool:
     return obj is None or util.is_integer_object(obj)
 
 
+@set_module("pandas.api.types")
 def is_bool(obj: object) -> bool:
     """
     Return True if given object is boolean.
@@ -1202,6 +1208,7 @@ def is_bool(obj: object) -> bool:
     return util.is_bool_object(obj)
 
 
+@set_module("pandas.api.types")
 def is_complex(obj: object) -> bool:
     """
     Return True if given object is complex.
@@ -1237,6 +1244,7 @@ cpdef bint is_decimal(object obj):
     return isinstance(obj, Decimal)
 
 
+@set_module("pandas.api.types")
 def is_list_like(obj: object, allow_sets: bool = True) -> bool:
     """
     Check if the object is list-like.
@@ -1520,6 +1528,7 @@ cdef object _try_infer_map(object dtype):
     return None
 
 
+@set_module("pandas.api.types")
 def infer_dtype(value: object, skipna: bool = True) -> str:
     """
     Return a string label of the type of the elements in a list-like input.
 
@@ -144,7 +144,7 @@ cdef extern from "pandas/parser/tokenizer.h":
         SKIP_LINE
         FINISHED
 
-    enum: ERROR_OVERFLOW
+    enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS
 
     ctypedef enum BadLineHandleMethod:
         ERROR,
@@ -1051,7 +1051,7 @@ cdef class TextReader:
         if col_dtype is not None:
             col_res, na_count = self._convert_with_dtype(
                 col_dtype, i, start, end, na_filter,
-                1, na_hashset, na_fset)
+                1, na_hashset, na_fset, False)
 
             # Fallback on the parse (e.g. we requested int dtype,
             # but it's actually a float).
@@ -1062,30 +1062,34 @@ cdef class TextReader:
             return self._string_convert(i, start, end, na_filter, na_hashset)
         else:
             col_res = None
+            maybe_int = True
             for dt in self.dtype_cast_order:
-                if (dt.kind in "iu" and
-                        self._column_has_float(i, start, end, na_filter, na_hashset)):
+                if not maybe_int and dt.kind in "iu":
                     continue
 
                 try:
                     col_res, na_count = self._convert_with_dtype(
-                        dt, i, start, end, na_filter, 0, na_hashset, na_fset)
-                except ValueError:
-                    # This error is raised from trying to convert to uint64,
-                    # and we discover that we cannot convert to any numerical
-                    # dtype successfully. As a result, we leave the data
-                    # column AS IS with object dtype.
-                    col_res, na_count = self._convert_with_dtype(
-                        np.dtype("object"), i, start, end, 0,
-                        0, na_hashset, na_fset)
+                        dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
+                except ValueError as e:
+                    if str(e) == "Number is not int":
+                        maybe_int = False
+                        continue
+                    else:
+                        # This error is raised from trying to convert to uint64,
+                        # and we discover that we cannot convert to any numerical
+                        # dtype successfully. As a result, we leave the data
+                        # column AS IS with object dtype.
+                        col_res, na_count = self._convert_with_dtype(
+                            np.dtype("object"), i, start, end, 0,
+                            0, na_hashset, na_fset, False)
                 except OverflowError:
                     try:
                         col_res, na_count = _try_pylong(self.parser, i, start,
                                                         end, na_filter, na_hashset)
                     except ValueError:
                         col_res, na_count = self._convert_with_dtype(
                             np.dtype("object"), i, start, end, 0,
-                            0, na_hashset, na_fset)
+                            0, na_hashset, na_fset, False)
 
                 if col_res is not None:
                     break
@@ -1133,7 +1137,7 @@ cdef class TextReader:
                              bint na_filter,
                              bint user_dtype,
                              kh_str_starts_t *na_hashset,
-                             set na_fset):
+                             set na_fset, bint raise_on_invalid):
         if isinstance(dtype, CategoricalDtype):
             # TODO: I suspect that _categorical_convert could be
             # optimized when dtype is an instance of CategoricalDtype
@@ -1174,14 +1178,14 @@ cdef class TextReader:
 
         elif dtype.kind in "iu":
             try:
-                result, na_count = _try_int64(self.parser, i, start,
-                                              end, na_filter, na_hashset)
+                result, na_count = _try_int64(self.parser, i, start, end,
+                                              na_filter, na_hashset, raise_on_invalid)
                 if user_dtype and na_count is not None:
                     if na_count > 0:
                         raise ValueError(f"Integer column has NA values in column {i}")
             except OverflowError:
                 result = _try_uint64(self.parser, i, start, end,
-                                     na_filter, na_hashset)
+                                     na_filter, na_hashset, raise_on_invalid)
                 na_count = 0
 
             if result is not None and dtype != "int64":
@@ -1344,59 +1348,6 @@ cdef class TextReader:
             else:
                 return None
 
-    cdef bint _column_has_float(self, Py_ssize_t col,
-                                int64_t start, int64_t end,
-                                bint na_filter, kh_str_starts_t *na_hashset):
-        """Check if the column contains any float number."""
-        cdef:
-            Py_ssize_t i, j, lines = end - start
-            coliter_t it
-            const char *word = NULL
-            const char *ignored_chars = " +-"
-            const char *digits = "0123456789"
-            const char *float_indicating_chars = "eE"
-            char null_byte = 0
-
-        coliter_setup(&it, self.parser, col, start)
-
-        for i in range(lines):
-            COLITER_NEXT(it, word)
-
-            if na_filter and kh_get_str_starts_item(na_hashset, word):
-                continue
-
-            found_first_digit = False
-            j = 0
-            while word[j] != null_byte:
-                if word[j] == self.parser.decimal:
-                    return True
-                elif not found_first_digit and word[j] in ignored_chars:
-                    # no-op
-                    pass
-                elif not found_first_digit and word[j] not in digits:
-                    # word isn't numeric
-                    return False
-                elif not found_first_digit and word[j] in digits:
-                    found_first_digit = True
-                elif word[j] in float_indicating_chars:
-                    # preceding chars indicates numeric and
-                    # current char indicates float
-                    return True
-                elif word[j] not in digits:
-                    # previous characters indicates numeric
-                    # current character shows otherwise
-                    return False
-                elif word[j] in digits:
-                    # no-op
-                    pass
-                else:
-                    raise AssertionError(
-                            f"Unhandled case {word[j]=} {found_first_digit=}"
-                            )
-                j += 1
-
-        return False
-
 # Factor out code common to TextReader.__dealloc__ and TextReader.close
 # It cannot be a class method, since calling self.close() in __dealloc__
 # which causes a class attribute lookup and violates best practices
@@ -1793,7 +1744,8 @@ cdef int _try_double_nogil(parser_t *parser,
 
 cdef _try_uint64(parser_t *parser, int64_t col,
                  int64_t line_start, int64_t line_end,
-                 bint na_filter, kh_str_starts_t *na_hashset):
+                 bint na_filter, kh_str_starts_t *na_hashset,
+                 bint raise_on_invalid):
     cdef:
         int error
         Py_ssize_t lines
@@ -1815,6 +1767,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
+        elif raise_on_invalid and error == ERROR_INVALID_CHARS:
+            raise ValueError("Number is not int")
         return None
 
     if uint64_conflict(&state):
@@ -1863,7 +1817,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
 
 cdef _try_int64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
-                bint na_filter, kh_str_starts_t *na_hashset):
+                bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_invalid):
     cdef:
         int error, na_count = 0
         Py_ssize_t lines
@@ -1883,6 +1837,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
+        elif raise_on_invalid and error == ERROR_INVALID_CHARS:
+            raise ValueError("Number is not int")
         return None, None
 
     return result, na_count