Skip to content

Commit 4bc51be

Browse files
committed
Fix integer overflow in CSV parser causing segfault
Fixes #63089.

When parsing scientific notation in CSV files, extremely large exponent values (e.g., '4e492493924924') caused integer overflow in the exponent accumulation loop, leading to undefined behavior and segmentation faults.

The issue occurred in xstrtod() at pandas/_libs/src/parser/tokenizer.c, where exponent digits were accumulated without bounds checking:

    int n = 0;
    while (isdigit_ascii(*p)) {
      n = n * 10 + (*p - '0');  // Overflow here with large exponents
      ...
    }

Solution:
- Add a maximum exponent digits cap (MAX_EXPONENT_DIGITS = 4) to prevent overflow while still allowing valid scientific notation.
- Continue consuming the remaining digits to maintain the correct parsing position.
- The capped value (up to 9999) is sufficient, since the subsequent range check (DBL_MIN_EXP to DBL_MAX_EXP) will catch invalid exponents.

This fix prevents the overflow while maintaining correct parsing behavior for both valid and invalid exponent values.

Signed-off-by: Samaresh Kumar Singh <ssam3003@gmail.com>
1 parent 415830f commit 4bc51be

File tree

3 files changed

+55
-1
lines changed

3 files changed

+55
-1
lines changed

pandas/_libs/src/parser/tokenizer.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1510,8 +1510,14 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
15101510
// Process string of digits.
15111511
num_digits = 0;
15121512
int n = 0;
1513+
// Prevent integer overflow by capping exponent value
1514+
// DBL_MAX_EXP is typically 1024, so we use a safe upper bound
1515+
const int MAX_EXPONENT_DIGITS = 4; // Allows up to 9999
15131516
while (isdigit_ascii(*p)) {
1514-
n = n * 10 + (*p - '0');
1517+
if (num_digits < MAX_EXPONENT_DIGITS) {
1518+
n = n * 10 + (*p - '0');
1519+
}
1520+
// Continue consuming digits even after cap to maintain correct parsing position
15151521
num_digits++;
15161522
p++;
15171523
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""
2+
Test for issue #63089 - read_csv segfault with large exponent
3+
"""
4+
import io
5+
import pandas as pd
6+
import pytest
7+
8+
9+
class TestIssue63089:
    """Regression tests for issue #63089 (``read_csv`` and huge exponents).

    The inputs below carry exponents with far more digits than fit in a C
    ``int``; the tests check that ``pd.read_csv`` returns a one-row frame
    for them instead of crashing.
    """

    def test_large_exponent_no_segfault(self):
        """Test that extremely large exponents don't cause segfault."""
        import numpy as np

        # This previously caused SIGSEGV due to integer overflow
        # when parsing the exponent
        result = pd.read_csv(io.StringIO("h\n4e492493924924"))

        # Should parse as infinity or large float, not crash
        assert len(result) == 1
        assert "h" in result.columns

        # The value should be infinity since the exponent is way too large
        value = result["h"].iloc[0]
        assert np.isinf(value) or value > 1e308

    def test_various_large_exponents(self):
        """Test various edge cases with large exponents."""
        test_cases = (
            "1e999999999",  # Very large positive exponent
            "1e-999999999",  # Very large negative exponent
            "2.5e123456789",  # Large exponent with decimal
        )

        for test_val in test_cases:
            result = pd.read_csv(io.StringIO(f"col\n{test_val}"))

            # Reaching this point already shows the parser did not crash.
            # The input is passed as the assertion message so a failure
            # names the offending case.
            assert len(result) == 1, test_val
            # The cell must come back as a value (inf, 0, or another valid
            # result), never as a missing entry.
            assert not pd.isna(result["col"].iloc[0]), test_val

reproduce_issue_63089.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import io
2+
import pandas as pd
3+
4+
# Reproduction for issue #63089: feed read_csv a single-column CSV whose only
# value has an enormous exponent, then report what came back. Any Python-level
# exception raised along the way is printed rather than propagated.
print("Testing issue #63089...")
csv_text = "h\n4e492493924924"
try:
    frame = pd.read_csv(io.StringIO(csv_text))
    print("Success! Result:")
    print(frame)
except Exception as exc:
    print(f"Exception occurred: {type(exc).__name__}: {exc}")

0 commit comments

Comments
 (0)