Skip to content

Commit 017d4eb

Browse files
authored
Merge branch 'main' into 62739-csv-double-quotes
2 parents 6a3e5ad + ea75dd7 commit 017d4eb

File tree

47 files changed

+2392
-858
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+2392
-858
lines changed

.github/workflows/unit-tests.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,7 @@ jobs:
181181
timeout-minutes: 90
182182
strategy:
183183
matrix:
184-
# Note: Don't use macOS latest since macos 14 appears to be arm64 only
185-
os: [macos-13, macos-14, windows-2025]
184+
os: [macos-15-intel, macos-15, windows-2025]
186185
env_file: [actions-311.yaml, actions-312.yaml, actions-313.yaml]
187186
fail-fast: false
188187
runs-on: ${{ matrix.os }}

.github/workflows/wheels.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,9 @@ jobs:
9898
- [ubuntu-24.04, musllinux_x86_64]
9999
- [ubuntu-24.04-arm, manylinux_aarch64]
100100
- [ubuntu-24.04-arm, musllinux_aarch64]
101-
- [macos-13, macosx_x86_64]
102-
# Note: M1 images on Github Actions start from macOS 14
103-
- [macos-14, macosx_arm64]
104-
- [windows-2022, win_amd64]
101+
- [macos-15-intel, macosx_x86_64]
102+
- [macos-15, macosx_arm64]
103+
- [windows-2025, win_amd64]
105104
- [windows-11-arm, win_arm64]
106105
python: [["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"], ["cp314", "3.14"], ["cp314t", "3.14"]]
107106
include:

asv_bench/benchmarks/ctors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def gen_of_str(arr):
2323

2424

2525
def arr_dict(arr):
26-
return dict(zip(range(len(arr)), arr))
26+
return dict(zip(range(len(arr)), arr, strict=True))
2727

2828

2929
def list_of_tuples(arr):

asv_bench/benchmarks/series_methods.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def setup(self):
1616
self.idx = date_range(
1717
start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s"
1818
)
19-
self.data = dict(zip(self.idx, range(len(self.idx))))
19+
self.data = dict(zip(self.idx, range(len(self.idx)), strict=True))
2020
self.array = np.array([1, 2, 3])
2121
self.idx2 = Index(["a", "b", "c"])
2222

@@ -407,7 +407,9 @@ def setup(self, num_to_replace):
407407
self.to_replace_list = np.random.choice(self.arr, num_to_replace)
408408
self.values_list = np.random.choice(self.arr1, num_to_replace)
409409

410-
self.replace_dict = dict(zip(self.to_replace_list, self.values_list))
410+
self.replace_dict = dict(
411+
zip(self.to_replace_list, self.values_list, strict=True)
412+
)
411413

412414
def time_replace_dict(self, num_to_replace):
413415
self.ser.replace(self.replace_dict)

doc/source/whatsnew/v3.0.0.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ Other enhancements
219219
- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
220220
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
221221
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
222+
- Improve error reporting through outputting the first few duplicates when :func:`merge` validation fails (:issue:`62742`)
222223
- Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`)
223224
- Improved deprecation message for offset aliases (:issue:`60820`)
224225
- Many type aliases are now exposed in the new submodule :py:mod:`pandas.api.typing.aliases` (:issue:`55231`)
@@ -515,6 +516,22 @@ If we had passed ``pd.Int64Dtype()`` or ``"int64[pyarrow]"`` for the dtype in th
515516

516517
With ``"mode.nan_is_na"`` set to ``False``, ``ser.to_numpy()`` (and ``frame.values`` and ``np.asarray(obj)``) will convert to ``object`` dtype if :class:`NA` entries are present, where before they would coerce to ``NaN``. To retain a float numpy dtype, explicitly pass ``na_value=np.nan`` to :meth:`Series.to_numpy`.
517518

519+
The ``__module__`` attribute now points to public modules
520+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
521+
522+
The ``__module__`` attribute on functions and classes in the public API has been
523+
updated to refer to the preferred public module from which to access the object,
524+
rather than the module in which the object happens to be defined (:issue:`55178`).
525+
526+
This produces more informative displays in the Python console for classes, e.g.,
527+
instead of ``<class 'pandas.core.frame.DataFrame'>`` you now see
528+
``<class 'pandas.DataFrame'>``, and in interactive tools such as IPython, e.g.,
529+
instead of ``<function pandas.io.parsers.readers.read_csv(...)>`` you now see
530+
``<function pandas.read_csv(...)>``.
531+
532+
This may break code that relies on the previous ``__module__`` values (e.g.
533+
doctests inspecting the ``type()`` of a DataFrame object).
534+
518535
.. _whatsnew_300.api_breaking.deps:
519536

520537
Increased minimum version for Python
@@ -938,6 +955,7 @@ Bug fixes
938955

939956
Categorical
940957
^^^^^^^^^^^
958+
- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
941959
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
942960
- Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
943961
- Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)
@@ -997,6 +1015,7 @@ Numeric
9971015
^^^^^^^
9981016
- Bug in :func:`api.types.infer_dtype` returning "mixed" for complex and ``pd.NA`` mix (:issue:`61976`)
9991017
- Bug in :func:`api.types.infer_dtype` returning "mixed-integer-float" for float and ``pd.NA`` mix (:issue:`61621`)
1018+
- Bug in :meth:`DataFrame.combine_first` where Int64 and UInt64 integers with absolute value greater than ``2**53`` would lose precision after the operation. (:issue:`60128`)
10001019
- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
10011020
- Bug in :meth:`DataFrame.cov` where potentially incorrect results or other errors could be produced; it now raises a ``TypeError`` instead (:issue:`53115`)
10021021
- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
@@ -1025,6 +1044,7 @@ Interval
10251044
- :meth:`Index.is_monotonic_decreasing`, :meth:`Index.is_monotonic_increasing`, and :meth:`Index.is_unique` could incorrectly be ``False`` for an ``Index`` created from a slice of another ``Index``. (:issue:`57911`)
10261045
- Bug in :class:`Index`, :class:`Series`, :class:`DataFrame` constructors when given a sequence of :class:`Interval` subclass objects casting them to :class:`Interval` (:issue:`46945`)
10271046
- Bug in :func:`interval_range` where start and end numeric types were always cast to 64 bit (:issue:`57268`)
1047+
- Bug in :meth:`IntervalIndex.get_indexer` and :meth:`IntervalIndex.drop` when one of the sides of the index is non-unique (:issue:`52245`)
10281048

10291049
Indexing
10301050
^^^^^^^^
@@ -1088,6 +1108,7 @@ I/O
10881108
- Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`)
10891109
- Bug in :meth:`python_parser` where :class:`MyDialect` did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`)
10901110
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
1111+
- Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`)
10911112
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
10921113
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
10931114
- Bug in :meth:`read_csv` where the order of the ``na_values`` caused inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)

pandas/_config/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -944,3 +944,11 @@ def is_callable(obj: object) -> bool:
944944
if not callable(obj):
945945
raise ValueError("Value must be a callable")
946946
return True
947+
948+
949+
# Importing set_module here would cause a circular import, so assign __module__ directly
950+
get_option.__module__ = "pandas"
951+
set_option.__module__ = "pandas"
952+
describe_option.__module__ = "pandas"
953+
reset_option.__module__ = "pandas"
954+
option_context.__module__ = "pandas"

pandas/_libs/index.pyx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,9 @@ cdef class IndexEngine:
321321
if is_strict_monotonic:
322322
self.unique = 1
323323
self.need_unique_check = 0
324+
elif self.monotonic_inc == 1 or self.monotonic_dec == 1:
325+
self.unique = 0
326+
self.need_unique_check = 0
324327

325328
cdef _call_monotonic(self, values):
326329
return algos.is_monotonic(values, timelike=False)

pandas/_libs/lib.pyx

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ from cython cimport (
4141
from pandas._config import using_string_dtype
4242

4343
from pandas._libs.missing import check_na_tuples_nonequal
44+
from pandas.util._decorators import set_module
4445

4546
import_datetime()
4647

@@ -154,6 +155,7 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t:
154155
# ----------------------------------------------------------------------
155156

156157

158+
@set_module("pandas.api.types")
157159
def is_scalar(val: object) -> bool:
158160
"""
159161
Return True if given object is scalar.
@@ -255,6 +257,7 @@ cdef int64_t get_itemsize(object val):
255257
return -1
256258

257259

260+
@set_module("pandas.api.types")
258261
def is_iterator(obj: object) -> bool:
259262
"""
260263
Check if the object is an iterator.
@@ -1095,6 +1098,7 @@ def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list ke
10951098

10961099
# core.common import for fast inference checks
10971100

1101+
@set_module("pandas.api.types")
10981102
def is_float(obj: object) -> bool:
10991103
"""
11001104
Return True if given object is float.
@@ -1128,6 +1132,7 @@ def is_float(obj: object) -> bool:
11281132
return util.is_float_object(obj)
11291133

11301134

1135+
@set_module("pandas.api.types")
11311136
def is_integer(obj: object) -> bool:
11321137
"""
11331138
Return True if given object is integer.
@@ -1172,6 +1177,7 @@ def is_int_or_none(obj) -> bool:
11721177
return obj is None or util.is_integer_object(obj)
11731178

11741179

1180+
@set_module("pandas.api.types")
11751181
def is_bool(obj: object) -> bool:
11761182
"""
11771183
Return True if given object is boolean.
@@ -1202,6 +1208,7 @@ def is_bool(obj: object) -> bool:
12021208
return util.is_bool_object(obj)
12031209

12041210

1211+
@set_module("pandas.api.types")
12051212
def is_complex(obj: object) -> bool:
12061213
"""
12071214
Return True if given object is complex.
@@ -1237,6 +1244,7 @@ cpdef bint is_decimal(object obj):
12371244
return isinstance(obj, Decimal)
12381245

12391246

1247+
@set_module("pandas.api.types")
12401248
def is_list_like(obj: object, allow_sets: bool = True) -> bool:
12411249
"""
12421250
Check if the object is list-like.
@@ -1520,6 +1528,7 @@ cdef object _try_infer_map(object dtype):
15201528
return None
15211529

15221530

1531+
@set_module("pandas.api.types")
15231532
def infer_dtype(value: object, skipna: bool = True) -> str:
15241533
"""
15251534
Return a string label of the type of the elements in a list-like input.

pandas/_libs/parsers.pyx

Lines changed: 29 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ cdef extern from "pandas/parser/tokenizer.h":
144144
SKIP_LINE
145145
FINISHED
146146

147-
enum: ERROR_OVERFLOW
147+
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS
148148

149149
ctypedef enum BadLineHandleMethod:
150150
ERROR,
@@ -1051,7 +1051,7 @@ cdef class TextReader:
10511051
if col_dtype is not None:
10521052
col_res, na_count = self._convert_with_dtype(
10531053
col_dtype, i, start, end, na_filter,
1054-
1, na_hashset, na_fset)
1054+
1, na_hashset, na_fset, False)
10551055

10561056
# Fallback on the parse (e.g. we requested int dtype,
10571057
# but its actually a float).
@@ -1062,30 +1062,34 @@ cdef class TextReader:
10621062
return self._string_convert(i, start, end, na_filter, na_hashset)
10631063
else:
10641064
col_res = None
1065+
maybe_int = True
10651066
for dt in self.dtype_cast_order:
1066-
if (dt.kind in "iu" and
1067-
self._column_has_float(i, start, end, na_filter, na_hashset)):
1067+
if not maybe_int and dt.kind in "iu":
10681068
continue
10691069

10701070
try:
10711071
col_res, na_count = self._convert_with_dtype(
1072-
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
1073-
except ValueError:
1074-
# This error is raised from trying to convert to uint64,
1075-
# and we discover that we cannot convert to any numerical
1076-
# dtype successfully. As a result, we leave the data
1077-
# column AS IS with object dtype.
1078-
col_res, na_count = self._convert_with_dtype(
1079-
np.dtype("object"), i, start, end, 0,
1080-
0, na_hashset, na_fset)
1072+
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
1073+
except ValueError as e:
1074+
if str(e) == "Number is not int":
1075+
maybe_int = False
1076+
continue
1077+
else:
1078+
# This error is raised from trying to convert to uint64,
1079+
# and we discover that we cannot convert to any numerical
1080+
# dtype successfully. As a result, we leave the data
1081+
# column AS IS with object dtype.
1082+
col_res, na_count = self._convert_with_dtype(
1083+
np.dtype("object"), i, start, end, 0,
1084+
0, na_hashset, na_fset, False)
10811085
except OverflowError:
10821086
try:
10831087
col_res, na_count = _try_pylong(self.parser, i, start,
10841088
end, na_filter, na_hashset)
10851089
except ValueError:
10861090
col_res, na_count = self._convert_with_dtype(
10871091
np.dtype("object"), i, start, end, 0,
1088-
0, na_hashset, na_fset)
1092+
0, na_hashset, na_fset, False)
10891093

10901094
if col_res is not None:
10911095
break
@@ -1133,7 +1137,7 @@ cdef class TextReader:
11331137
bint na_filter,
11341138
bint user_dtype,
11351139
kh_str_starts_t *na_hashset,
1136-
set na_fset):
1140+
set na_fset, bint raise_on_invalid):
11371141
if isinstance(dtype, CategoricalDtype):
11381142
# TODO: I suspect that _categorical_convert could be
11391143
# optimized when dtype is an instance of CategoricalDtype
@@ -1174,14 +1178,14 @@ cdef class TextReader:
11741178

11751179
elif dtype.kind in "iu":
11761180
try:
1177-
result, na_count = _try_int64(self.parser, i, start,
1178-
end, na_filter, na_hashset)
1181+
result, na_count = _try_int64(self.parser, i, start, end,
1182+
na_filter, na_hashset, raise_on_invalid)
11791183
if user_dtype and na_count is not None:
11801184
if na_count > 0:
11811185
raise ValueError(f"Integer column has NA values in column {i}")
11821186
except OverflowError:
11831187
result = _try_uint64(self.parser, i, start, end,
1184-
na_filter, na_hashset)
1188+
na_filter, na_hashset, raise_on_invalid)
11851189
na_count = 0
11861190

11871191
if result is not None and dtype != "int64":
@@ -1344,59 +1348,6 @@ cdef class TextReader:
13441348
else:
13451349
return None
13461350

1347-
cdef bint _column_has_float(self, Py_ssize_t col,
1348-
int64_t start, int64_t end,
1349-
bint na_filter, kh_str_starts_t *na_hashset):
1350-
"""Check if the column contains any float number."""
1351-
cdef:
1352-
Py_ssize_t i, j, lines = end - start
1353-
coliter_t it
1354-
const char *word = NULL
1355-
const char *ignored_chars = " +-"
1356-
const char *digits = "0123456789"
1357-
const char *float_indicating_chars = "eE"
1358-
char null_byte = 0
1359-
1360-
coliter_setup(&it, self.parser, col, start)
1361-
1362-
for i in range(lines):
1363-
COLITER_NEXT(it, word)
1364-
1365-
if na_filter and kh_get_str_starts_item(na_hashset, word):
1366-
continue
1367-
1368-
found_first_digit = False
1369-
j = 0
1370-
while word[j] != null_byte:
1371-
if word[j] == self.parser.decimal:
1372-
return True
1373-
elif not found_first_digit and word[j] in ignored_chars:
1374-
# no-op
1375-
pass
1376-
elif not found_first_digit and word[j] not in digits:
1377-
# word isn't numeric
1378-
return False
1379-
elif not found_first_digit and word[j] in digits:
1380-
found_first_digit = True
1381-
elif word[j] in float_indicating_chars:
1382-
# preceding chars indicates numeric and
1383-
# current char indicates float
1384-
return True
1385-
elif word[j] not in digits:
1386-
# previous characters indicates numeric
1387-
# current character shows otherwise
1388-
return False
1389-
elif word[j] in digits:
1390-
# no-op
1391-
pass
1392-
else:
1393-
raise AssertionError(
1394-
f"Unhandled case {word[j]=} {found_first_digit=}"
1395-
)
1396-
j += 1
1397-
1398-
return False
1399-
14001351
# Factor out code common to TextReader.__dealloc__ and TextReader.close
14011352
# It cannot be a class method, since calling self.close() in __dealloc__
14021353
# which causes a class attribute lookup and violates best practices
@@ -1793,7 +1744,8 @@ cdef int _try_double_nogil(parser_t *parser,
17931744

17941745
cdef _try_uint64(parser_t *parser, int64_t col,
17951746
int64_t line_start, int64_t line_end,
1796-
bint na_filter, kh_str_starts_t *na_hashset):
1747+
bint na_filter, kh_str_starts_t *na_hashset,
1748+
bint raise_on_invalid):
17971749
cdef:
17981750
int error
17991751
Py_ssize_t lines
@@ -1815,6 +1767,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
18151767
if error == ERROR_OVERFLOW:
18161768
# Can't get the word variable
18171769
raise OverflowError("Overflow")
1770+
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1771+
raise ValueError("Number is not int")
18181772
return None
18191773

18201774
if uint64_conflict(&state):
@@ -1863,7 +1817,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
18631817

18641818
cdef _try_int64(parser_t *parser, int64_t col,
18651819
int64_t line_start, int64_t line_end,
1866-
bint na_filter, kh_str_starts_t *na_hashset):
1820+
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_invalid):
18671821
cdef:
18681822
int error, na_count = 0
18691823
Py_ssize_t lines
@@ -1883,6 +1837,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
18831837
if error == ERROR_OVERFLOW:
18841838
# Can't get the word variable
18851839
raise OverflowError("Overflow")
1840+
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
1841+
raise ValueError("Number is not int")
18861842
return None, None
18871843

18881844
return result, na_count

0 commit comments

Comments
 (0)