Skip to content

Commit e0ff0d0

Browse files
committed
Merge remote-tracking branch 'upstream/main' into aijams-take-function-invalid-dtype
2 parents b6e45ac + cbda666 commit e0ff0d0

File tree

19 files changed

+128
-42
lines changed

19 files changed

+128
-42
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ ci:
1919
skip: [pyright, mypy]
2020
repos:
2121
- repo: https://github.com/astral-sh/ruff-pre-commit
22-
rev: v0.12.11
22+
rev: v0.13.3
2323
hooks:
2424
- id: ruff
2525
args: [--exit-non-zero-on-fix]
@@ -46,7 +46,7 @@ repos:
4646
- id: codespell
4747
types_or: [python, rst, markdown, cython, c]
4848
- repo: https://github.com/MarcoGorelli/cython-lint
49-
rev: v0.16.7
49+
rev: v0.17.0
5050
hooks:
5151
- id: cython-lint
5252
- id: double-quote-cython-strings
@@ -67,7 +67,7 @@ repos:
6767
- id: trailing-whitespace
6868
args: [--markdown-linebreak-ext=md]
6969
- repo: https://github.com/PyCQA/isort
70-
rev: 6.0.1
70+
rev: 6.1.0
7171
hooks:
7272
- id: isort
7373
- repo: https://github.com/asottile/pyupgrade
@@ -92,14 +92,14 @@ repos:
9292
- id: sphinx-lint
9393
args: ["--enable", "all", "--disable", "line-too-long"]
9494
- repo: https://github.com/pre-commit/mirrors-clang-format
95-
rev: v21.1.0
95+
rev: v21.1.2
9696
hooks:
9797
- id: clang-format
9898
files: ^pandas/_libs/src|^pandas/_libs/include
9999
args: [-i]
100100
types_or: [c, c++]
101101
- repo: https://github.com/trim21/pre-commit-mirror-meson
102-
rev: v1.9.0
102+
rev: v1.9.1
103103
hooks:
104104
- id: meson-fmt
105105
args: ['--inplace']

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,7 @@ I/O
10801080
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
10811081
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
10821082
- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
1083+
- Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`)
10831084
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
10841085
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
10851086
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)

pandas/_libs/parsers.pyx

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,10 @@ cdef class TextReader:
10701070
else:
10711071
col_res = None
10721072
for dt in self.dtype_cast_order:
1073+
if (dt.kind in "iu" and
1074+
self._column_has_float(i, start, end, na_filter, na_hashset)):
1075+
continue
1076+
10731077
try:
10741078
col_res, na_count = self._convert_with_dtype(
10751079
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
@@ -1347,6 +1351,58 @@ cdef class TextReader:
13471351
else:
13481352
return None
13491353

1354+
cdef bint _column_has_float(self, Py_ssize_t col,
1355+
int64_t start, int64_t end,
1356+
bint na_filter, kh_str_starts_t *na_hashset):
1357+
"""Check if the column contains any float number."""
1358+
cdef:
1359+
Py_ssize_t i, j, lines = end - start
1360+
coliter_t it
1361+
const char *word = NULL
1362+
const char *ignored_chars = " +-"
1363+
const char *digits = "0123456789"
1364+
const char *float_indicating_chars = "eE"
1365+
char null_byte = 0
1366+
1367+
coliter_setup(&it, self.parser, col, start)
1368+
1369+
for i in range(lines):
1370+
COLITER_NEXT(it, word)
1371+
1372+
if na_filter and kh_get_str_starts_item(na_hashset, word):
1373+
continue
1374+
1375+
found_first_digit = False
1376+
j = 0
1377+
while word[j] != null_byte:
1378+
if word[j] == self.parser.decimal:
1379+
return True
1380+
elif not found_first_digit and word[j] in ignored_chars:
1381+
# no-op
1382+
pass
1383+
elif not found_first_digit and word[j] not in digits:
1384+
# word isn't numeric
1385+
return False
1386+
elif not found_first_digit and word[j] in digits:
1387+
found_first_digit = True
1388+
elif word[j] in float_indicating_chars:
1389+
# preceding chars indicate numeric and
1390+
# current char indicates float
1391+
return True
1392+
elif word[j] not in digits:
1393+
# previous characters indicate numeric
1394+
# current character shows otherwise
1395+
return False
1396+
elif word[j] in digits:
1397+
# no-op
1398+
pass
1399+
else:
1400+
raise AssertionError(
1401+
f"Unhandled case {word[j]=} {found_first_digit=}"
1402+
)
1403+
j += 1
1404+
1405+
return False
13501406

13511407
# Factor out code common to TextReader.__dealloc__ and TextReader.close
13521408
# It cannot be a class method, since calling self.close() in __dealloc__

pandas/core/arrays/arrow/array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -888,7 +888,7 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray:
888888
boxed = self._box_pa(other)
889889
except pa.lib.ArrowInvalid:
890890
# e.g. GH#60228 [1, "b"] we have to operate pointwise
891-
res_values = [op(x, y) for x, y in zip(self, other)]
891+
res_values = [op(x, y) for x, y in zip(self, other, strict=True)]
892892
result = pa.array(res_values, type=pa.bool_(), from_pandas=True)
893893
else:
894894
rtype = boxed.type
@@ -2713,7 +2713,7 @@ def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
27132713
if expand:
27142714
return {
27152715
col: self._from_pyarrow_array(pc.struct_field(result, [i]))
2716-
for col, i in zip(groups, range(result.type.num_fields))
2716+
for col, i in zip(groups, range(result.type.num_fields), strict=True)
27172717
}
27182718
else:
27192719
return type(self)(pc.struct_field(result, [0]))

pandas/core/arrays/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2869,7 +2869,7 @@ def convert_values(param):
28692869

28702870
# If the operator is not defined for the underlying objects,
28712871
# a TypeError should be raised
2872-
res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]
2872+
res = [op(a, b) for (a, b) in zip(lvalues, rvalues, strict=True)]
28732873

28742874
def _maybe_convert(arr):
28752875
if coerce_to_dtype:
@@ -2885,7 +2885,7 @@ def _maybe_convert(arr):
28852885
return res
28862886

28872887
if op.__name__ in {"divmod", "rdivmod"}:
2888-
a, b = zip(*res)
2888+
a, b = zip(*res, strict=True)
28892889
return _maybe_convert(a), _maybe_convert(b)
28902890

28912891
return _maybe_convert(res)

pandas/core/arrays/categorical.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from csv import QUOTE_NONNUMERIC
44
from functools import partial
5+
import itertools
56
import operator
67
from shutil import get_terminal_size
78
from typing import (
@@ -2429,8 +2430,8 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
24292430
ensure_platform_int(self.codes), categories.size
24302431
)
24312432
counts = ensure_int64(counts).cumsum()
2432-
_result = (r[start:end] for start, end in zip(counts, counts[1:]))
2433-
return dict(zip(categories, _result))
2433+
_result = (r[start:end] for start, end in itertools.pairwise(counts))
2434+
return dict(zip(categories, _result, strict=True))
24342435

24352436
# ------------------------------------------------------------------
24362437
# Reductions
@@ -3165,5 +3166,8 @@ def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
31653166
# For consistency, it should return two empty lists.
31663167
return [], []
31673168

3168-
codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
3169+
codes, categories = zip(
3170+
*(factorize_from_iterable(it) for it in iterables),
3171+
strict=True,
3172+
)
31693173
return list(codes), list(categories)

pandas/core/arrays/datetimelike.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2374,7 +2374,7 @@ def _concat_same_type(
23742374
to_concat = [x for x in to_concat if len(x)]
23752375

23762376
if obj.freq is not None and all(x.freq == obj.freq for x in to_concat):
2377-
pairs = zip(to_concat[:-1], to_concat[1:])
2377+
pairs = zip(to_concat[:-1], to_concat[1:], strict=True)
23782378
if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs):
23792379
new_freq = obj.freq
23802380
new_obj._freq = new_freq

pandas/core/arrays/interval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1893,7 +1893,7 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray:
18931893
>>> idx.to_tuples()
18941894
Index([(0, 1), (1, 2)], dtype='object')
18951895
"""
1896-
tuples = com.asarray_tuplesafe(zip(self._left, self._right))
1896+
tuples = com.asarray_tuplesafe(zip(self._left, self._right, strict=True))
18971897
if not na_tuple:
18981898
# GH 18756
18991899
tuples = np.where(~self.isna(), tuples, np.nan)

pandas/core/arrays/masked.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def __iter__(self) -> Iterator:
344344
yield val
345345
else:
346346
na_value = self.dtype.na_value
347-
for isna_, val in zip(self._mask, self._data):
347+
for isna_, val in zip(self._mask, self._data, strict=True):
348348
if isna_:
349349
yield na_value
350350
else:

pandas/core/arrays/period.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1445,7 +1445,7 @@ def _range_from_fields(
14451445

14461446
freqstr = freq.freqstr
14471447
year, quarter = _make_field_arrays(year, quarter)
1448-
for y, q in zip(year, quarter):
1448+
for y, q in zip(year, quarter, strict=True):
14491449
calendar_year, calendar_month = parsing.quarter_to_myear(y, q, freqstr)
14501450
val = libperiod.period_ordinal(
14511451
calendar_year, calendar_month, 1, 1, 1, 1, 0, 0, base
@@ -1455,7 +1455,7 @@ def _range_from_fields(
14551455
freq = to_offset(freq, is_period=True)
14561456
base = libperiod.freq_to_dtype_code(freq)
14571457
arrays = _make_field_arrays(year, month, day, hour, minute, second)
1458-
for y, mth, d, h, mn, s in zip(*arrays):
1458+
for y, mth, d, h, mn, s in zip(*arrays, strict=True):
14591459
ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base))
14601460

14611461
return np.array(ordinals, dtype=np.int64), freq

0 commit comments

Comments
 (0)