Merge remote-tracking branch 'upstream/main' into aijams-take-function-invalid-dtype

aijams · aijams · commit e0ff0d006361 · 2025-10-07T12:34:03.000-04:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.11
+    rev: v0.13.3
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -46,7 +46,7 @@ repos:
     -   id: codespell
         types_or: [python, rst, markdown, cython, c]
 -   repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.7
+    rev: v0.17.0
     hooks:
     -   id: cython-lint
     -   id: double-quote-cython-strings
@@ -67,7 +67,7 @@ repos:
     -   id: trailing-whitespace
         args: [--markdown-linebreak-ext=md]
 -   repo: https://github.com/PyCQA/isort
-    rev: 6.0.1
+    rev: 6.1.0
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
@@ -92,14 +92,14 @@ repos:
     - id: sphinx-lint
       args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v21.1.0
+    rev: v21.1.2
     hooks:
     - id: clang-format
       files: ^pandas/_libs/src|^pandas/_libs/include
       args: [-i]
       types_or: [c, c++]
 -   repo: https://github.com/trim21/pre-commit-mirror-meson
-    rev: v1.9.0
+    rev: v1.9.1
     hooks:
     - id: meson-fmt
       args: ['--inplace']
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1080,6 +1080,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
+- Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1070,6 +1070,10 @@ cdef class TextReader:
         else:
             col_res = None
             for dt in self.dtype_cast_order:
+                if (dt.kind in "iu" and
+                        self._column_has_float(i, start, end, na_filter, na_hashset)):
+                    continue
+
                 try:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_fset)
@@ -1347,6 +1351,58 @@ cdef class TextReader:
             else:
                 return None
 
+    cdef bint _column_has_float(self, Py_ssize_t col,
+                                int64_t start, int64_t end,
+                                bint na_filter, kh_str_starts_t *na_hashset):
+        """Check if the column contains any float number."""
+        cdef:
+            Py_ssize_t i, j, lines = end - start
+            coliter_t it
+            const char *word = NULL
+            const char *ignored_chars = " +-"
+            const char *digits = "0123456789"
+            const char *float_indicating_chars = "eE"
+            char null_byte = 0
+
+        coliter_setup(&it, self.parser, col, start)
+
+        for i in range(lines):
+            COLITER_NEXT(it, word)
+
+            if na_filter and kh_get_str_starts_item(na_hashset, word):
+                continue
+
+            found_first_digit = False
+            j = 0
+            while word[j] != null_byte:
+                if word[j] == self.parser.decimal:
+                    return True
+                elif not found_first_digit and word[j] in ignored_chars:
+                    # no-op
+                    pass
+                elif not found_first_digit and word[j] not in digits:
+                    # word isn't numeric
+                    return False
+                elif not found_first_digit and word[j] in digits:
+                    found_first_digit = True
+                elif word[j] in float_indicating_chars:
+                    # preceding chars indicates numeric and
+                    # current char indicates float
+                    return True
+                elif word[j] not in digits:
+                    # previous characters indicates numeric
+                    # current character shows otherwise
+                    return False
+                elif word[j] in digits:
+                    # no-op
+                    pass
+                else:
+                    raise AssertionError(
+                            f"Unhandled case {word[j]=} {found_first_digit=}"
+                            )
+                j += 1
+
+        return False
 
 # Factor out code common to TextReader.__dealloc__ and TextReader.close
 # It cannot be a class method, since calling self.close() in __dealloc__
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -888,7 +888,7 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray:
                 boxed = self._box_pa(other)
             except pa.lib.ArrowInvalid:
                 # e.g. GH#60228 [1, "b"] we have to operate pointwise
-                res_values = [op(x, y) for x, y in zip(self, other)]
+                res_values = [op(x, y) for x, y in zip(self, other, strict=True)]
                 result = pa.array(res_values, type=pa.bool_(), from_pandas=True)
             else:
                 rtype = boxed.type
@@ -2713,7 +2713,7 @@ def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
         if expand:
             return {
                 col: self._from_pyarrow_array(pc.struct_field(result, [i]))
-                for col, i in zip(groups, range(result.type.num_fields))
+                for col, i in zip(groups, range(result.type.num_fields), strict=True)
             }
         else:
             return type(self)(pc.struct_field(result, [0]))
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -2869,7 +2869,7 @@ def convert_values(param):
 
             # If the operator is not defined for the underlying objects,
             # a TypeError should be raised
-            res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]
+            res = [op(a, b) for (a, b) in zip(lvalues, rvalues, strict=True)]
 
             def _maybe_convert(arr):
                 if coerce_to_dtype:
@@ -2885,7 +2885,7 @@ def _maybe_convert(arr):
                 return res
 
             if op.__name__ in {"divmod", "rdivmod"}:
-                a, b = zip(*res)
+                a, b = zip(*res, strict=True)
                 return _maybe_convert(a), _maybe_convert(b)
 
             return _maybe_convert(res)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2,6 +2,7 @@
 
 from csv import QUOTE_NONNUMERIC
 from functools import partial
+import itertools
 import operator
 from shutil import get_terminal_size
 from typing import (
@@ -2429,8 +2430,8 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
             ensure_platform_int(self.codes), categories.size
         )
         counts = ensure_int64(counts).cumsum()
-        _result = (r[start:end] for start, end in zip(counts, counts[1:]))
-        return dict(zip(categories, _result))
+        _result = (r[start:end] for start, end in itertools.pairwise(counts))
+        return dict(zip(categories, _result, strict=True))
 
     # ------------------------------------------------------------------
     # Reductions
@@ -3165,5 +3166,8 @@ def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
         # For consistency, it should return two empty lists.
         return [], []
 
-    codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
+    codes, categories = zip(
+        *(factorize_from_iterable(it) for it in iterables),
+        strict=True,
+    )
     return list(codes), list(categories)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -2374,7 +2374,7 @@ def _concat_same_type(
             to_concat = [x for x in to_concat if len(x)]
 
             if obj.freq is not None and all(x.freq == obj.freq for x in to_concat):
-                pairs = zip(to_concat[:-1], to_concat[1:])
+                pairs = zip(to_concat[:-1], to_concat[1:], strict=True)
                 if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs):
                     new_freq = obj.freq
                     new_obj._freq = new_freq
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -1893,7 +1893,7 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray:
         >>> idx.to_tuples()
         Index([(0, 1), (1, 2)], dtype='object')
         """
-        tuples = com.asarray_tuplesafe(zip(self._left, self._right))
+        tuples = com.asarray_tuplesafe(zip(self._left, self._right, strict=True))
         if not na_tuple:
             # GH 18756
             tuples = np.where(~self.isna(), tuples, np.nan)
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -344,7 +344,7 @@ def __iter__(self) -> Iterator:
                     yield val
             else:
                 na_value = self.dtype.na_value
-                for isna_, val in zip(self._mask, self._data):
+                for isna_, val in zip(self._mask, self._data, strict=True):
                     if isna_:
                         yield na_value
                     else:
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -1445,7 +1445,7 @@ def _range_from_fields(
 
         freqstr = freq.freqstr
         year, quarter = _make_field_arrays(year, quarter)
-        for y, q in zip(year, quarter):
+        for y, q in zip(year, quarter, strict=True):
             calendar_year, calendar_month = parsing.quarter_to_myear(y, q, freqstr)
             val = libperiod.period_ordinal(
                 calendar_year, calendar_month, 1, 1, 1, 1, 0, 0, base
@@ -1455,7 +1455,7 @@ def _range_from_fields(
         freq = to_offset(freq, is_period=True)
         base = libperiod.freq_to_dtype_code(freq)
         arrays = _make_field_arrays(year, month, day, hour, minute, second)
-        for y, mth, d, h, mn, s in zip(*arrays):
+        for y, mth, d, h, mn, s in zip(*arrays, strict=True):
             ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base))
 
     return np.array(ordinals, dtype=np.int64), freq
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -1753,7 +1753,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
                     self._simple_new(
                         sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                     )
-                    for sp_value, fv in zip(sp_values, fill_value)
+                    for sp_value, fv in zip(sp_values, fill_value, strict=True)
                 )
                 return arrays
             elif method == "reduce":
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -621,7 +621,9 @@ def __truediv__(self, other):
         if is_object_dtype(other.dtype):
             other = np.asarray(other)
             if self.ndim > 1:
-                res_cols = [left / right for left, right in zip(self, other)]
+                res_cols = [
+                    left / right for left, right in zip(self, other, strict=True)
+                ]
                 res_cols2 = [x.reshape(1, -1) for x in res_cols]
                 result = np.concatenate(res_cols2, axis=0)
             else:
@@ -670,7 +672,9 @@ def __floordiv__(self, other):
         elif is_object_dtype(other.dtype):
             other = np.asarray(other)
             if self.ndim > 1:
-                res_cols = [left // right for left, right in zip(self, other)]
+                res_cols = [
+                    left // right for left, right in zip(self, other, strict=True)
+                ]
                 res_cols2 = [x.reshape(1, -1) for x in res_cols]
                 result = np.concatenate(res_cols2, axis=0)
             else:
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -4325,7 +4325,7 @@ def nth(self) -> GroupByNthSelector:
     def _nth(
         self,
         n: PositionalIndexer | tuple,
-        dropna: Literal["any", "all", None] = None,
+        dropna: Literal["any", "all"] | None = None,
     ) -> NDFrameT:
         if not dropna:
             mask = self._make_mask_from_positional_indexer(n)
diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py
@@ -296,7 +296,7 @@ def __init__(self, groupby_object: groupby.GroupBy) -> None:
     def __call__(
         self,
         n: PositionalIndexer | tuple,
-        dropna: Literal["any", "all", None] = None,
+        dropna: Literal["any", "all"] | None = None,
     ) -> DataFrame | Series:
         return self.groupby_object._nth(n, dropna)
 
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -334,7 +334,7 @@ def _wrap_result(
                 )
                 result = {
                     label: ArrowExtensionArray(pa.array(res))
-                    for label, res in zip(name, result.T)
+                    for label, res in zip(name, result.T, strict=True)
                 }
             elif is_object_dtype(result):
 
@@ -684,7 +684,8 @@ def cat(
         elif na_rep is not None and union_mask.any():
             # fill NaNs with na_rep in case there are actually any NaNs
             all_cols = [
-                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
+                np.where(nm, na_rep, col)
+                for nm, col in zip(na_masks, all_cols, strict=True)
             ]
             result = cat_safe(all_cols, sep)
         else:
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -223,7 +223,7 @@ def __init__(
         attrs: dict[str, str] | None,
         encoding: str,
         displayed_only: bool,
-        extract_links: Literal[None, "header", "footer", "body", "all"],
+        extract_links: Literal["header", "footer", "body", "all"] | None,
         storage_options: StorageOptions = None,
     ) -> None:
         self.io = io
@@ -1046,7 +1046,7 @@ def read_html(
     na_values: Iterable[object] | None = None,
     keep_default_na: bool = True,
     displayed_only: bool = True,
-    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
+    extract_links: Literal["header", "footer", "body", "all"] | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
     storage_options: StorageOptions = None,
 ) -> list[DataFrame]:
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
@@ -77,3 +77,30 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
         expected = DataFrame({"data": [f"10E{exp}"]})
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "value, expected_value",
+    [
+        ("32.0", 32.0),
+        ("32e0", 32.0),
+        ("3.2e1", 32.0),
+        ("3.2e80", 3.2e80),
+        ("3.2e-80", 3.2e-80),
+        ("18446744073709551616.0", float(1 << 64)),  # loses precision
+        ("18446744073709551616.5", float(1 << 64)),  # loses precision
+        ("36893488147419103232.3", float(1 << 65)),  # loses precision
+    ],
+)
+def test_small_int_followed_by_float(
+    all_parsers_all_precisions, value, expected_value, request
+):
+    # GH#51295
+    parser, precision = all_parsers_all_precisions
+    data = f"""data
+    42
+    {value}"""
+    result = parser.read_csv(StringIO(data), float_precision=precision)
+    expected = DataFrame({"data": [42.0, expected_value]})
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
@@ -11,7 +11,6 @@
     HDFStore,
     Index,
     MultiIndex,
-    _testing as tm,
     date_range,
     read_hdf,
 )
@@ -182,16 +181,16 @@ def test_append_with_diff_col_name_types_raises_value_error(setup_path):
                 store.append(name, d)
 
 
-def test_invalid_complib(setup_path):
+def test_invalid_complib(tmp_path, setup_path):
     df = DataFrame(
         np.random.default_rng(2).random((4, 5)),
         index=list("abcd"),
         columns=list("ABCDE"),
     )
-    with tm.ensure_clean(setup_path) as path:
-        msg = r"complib only supports \[.*\] compression."
-        with pytest.raises(ValueError, match=msg):
-            df.to_hdf(path, key="df", complib="foolib")
+    path = tmp_path / setup_path
+    msg = r"complib only supports \[.*\] compression."
+    with pytest.raises(ValueError, match=msg):
+        df.to_hdf(path, key="df", complib="foolib")
 
 
 @pytest.mark.parametrize(
diff --git a/pyproject.toml b/pyproject.toml