Skip to content

Commit c8c6e76

Browse files
committed
fix: allow read_csv with python engine to read large integers as int
1 parent 81f8d5d commit c8c6e76

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

pandas/_libs/lib.pyx

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1386,6 +1386,7 @@ cdef class Seen:
13861386
bint nan_ # seen_np.nan
13871387
bint uint_ # seen_uint (unsigned integer)
13881388
bint sint_ # seen_sint (signed integer)
1389+
bint overflow_ # seen_overflow
13891390
bint float_ # seen_float
13901391
bint object_ # seen_object
13911392
bint complex_ # seen_complex
@@ -1414,6 +1415,7 @@ cdef class Seen:
14141415
self.nan_ = False
14151416
self.uint_ = False
14161417
self.sint_ = False
1418+
self.overflow_ = False
14171419
self.float_ = False
14181420
self.object_ = False
14191421
self.complex_ = False
@@ -2379,6 +2381,9 @@ def maybe_convert_numeric(
23792381
ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(
23802382
1, values.shape, cnp.NPY_UINT64, 0
23812383
)
2384+
ndarray[object, ndim=1] pyints = cnp.PyArray_EMPTY(
2385+
1, values.shape, cnp.NPY_OBJECT, 0
2386+
)
23822387
ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(
23832388
1, values.shape, cnp.NPY_UINT8, 0
23842389
)
@@ -2476,6 +2481,7 @@ def maybe_convert_numeric(
24762481

24772482
if maybe_int:
24782483
as_int = int(val)
2484+
pyints[i] = as_int
24792485

24802486
if as_int in na_values:
24812487
mask[i] = 1
@@ -2490,7 +2496,7 @@ def maybe_convert_numeric(
24902496
if seen.coerce_numeric:
24912497
seen.float_ = True
24922498
else:
2493-
raise ValueError("Integer out of range.")
2499+
seen.overflow_ = True
24942500
else:
24952501
if as_int >= 0:
24962502
uints[i] = as_int
@@ -2529,11 +2535,15 @@ def maybe_convert_numeric(
25292535
return (floats, None)
25302536
elif seen.int_:
25312537
if seen.null_ and convert_to_masked_nullable:
2532-
if seen.uint_:
2538+
if seen.overflow_:
2539+
return (pyints, mask.view(np.bool_))
2540+
elif seen.uint_:
25332541
return (uints, mask.view(np.bool_))
25342542
else:
25352543
return (ints, mask.view(np.bool_))
2536-
if seen.uint_:
2544+
if seen.overflow_:
2545+
return (pyints, None)
2546+
elif seen.uint_:
25372547
return (uints, None)
25382548
else:
25392549
return (ints, None)

pandas/io/parsers/base_parser.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -521,7 +521,11 @@ def _infer_types(
521521
if values.dtype == np.object_:
522522
na_count = parsers.sanitize_objects(values, na_values)
523523

524-
if result.dtype == np.object_ and try_num_bool:
524+
if (
525+
result.dtype == np.object_
526+
and try_num_bool
527+
and (len(result) == 0 or not isinstance(result[0], int))
528+
):
525529
result, bool_mask = libops.maybe_convert_bool(
526530
np.asarray(values),
527531
true_values=self.true_values,

pandas/tests/io/parser/common/test_ints.py

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -144,11 +144,6 @@ def test_int64_overflow(all_parsers, conv, request):
144144
if parser.engine == "pyarrow":
145145
mark = pytest.mark.xfail(reason="parses to float64")
146146
request.applymarker(mark)
147-
elif parser.engine == "python":
148-
mark = pytest.mark.xfail(
149-
reason="TODO: Python engine reads bigint as string"
150-
)
151-
request.applymarker(mark)
152147

153148
result = parser.read_csv(StringIO(data))
154149
expected = DataFrame(
@@ -206,9 +201,6 @@ def test_outside_int64_uint64_range(all_parsers, val, request):
206201
# These numbers fall just outside the int64-uint64
207202
# range, so they should be parsed as object.
208203
parser = all_parsers
209-
if parser.engine == "python":
210-
mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
211-
request.applymarker(mark)
212204

213205
result = parser.read_csv(StringIO(str(val)), header=None)
214206

0 commit comments

Comments
 (0)